初始化
This commit is contained in:
Vendored
+4
@@ -0,0 +1,4 @@
|
||||
{
|
||||
"python-envs.defaultEnvManager": "ms-python.python:conda",
|
||||
"python-envs.defaultPackageManager": "ms-python.python:conda"
|
||||
}
|
||||
Binary file not shown.
+190
@@ -0,0 +1,190 @@
|
||||
<!DOCTYPE html>
|
||||
<html lang="zh-CN">
|
||||
<head>
|
||||
<meta charset="UTF-8">
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
||||
<title>VLM 虚拟主播</title>
|
||||
<style>
|
||||
body { font-family: sans-serif; display: flex; flex-direction: column; align-items: center; background-color: #f0f2f5; margin-top: 50px; }
|
||||
.container { display: flex; gap: 20px; }
|
||||
.video-box { background: #000; border-radius: 10px; overflow: hidden; width: 320px; height: 240px; position: relative; }
|
||||
video { width: 100%; height: 100%; object-fit: cover; }
|
||||
.avatar-box { width: 320px; height: 240px; background: #fff; border-radius: 10px; display: flex; align-items: center; justify-content: center; box-shadow: 0 4px 12px rgba(0,0,0,0.1); flex-direction: column; }
|
||||
.avatar-box img { width: 100px; height: 100px; border-radius: 50%; margin-bottom: 10px; }
|
||||
.controls { margin-top: 30px; }
|
||||
button { padding: 15px 30px; font-size: 18px; border: none; border-radius: 25px; cursor: pointer; background-color: #007bff; color: white; transition: background 0.2s; }
|
||||
button:active { background-color: #0056b3; }
|
||||
button:disabled { background-color: #ccc; cursor: not-allowed; }
|
||||
#status { margin-top: 15px; color: #555; font-weight: bold; }
|
||||
#transcript { margin-top: 20px; width: 600px; text-align: center; color: #333; }
|
||||
</style>
|
||||
</head>
|
||||
<body>
|
||||
|
||||
<h2>🤖 多模态虚拟主播 (VLM)</h2>
|
||||
|
||||
<div class="container">
|
||||
<div class="video-box">
|
||||
<video id="userVideo" autoplay muted playsinline></video>
|
||||
</div>
|
||||
|
||||
<div class="avatar-box" id="avatarBox">
|
||||
<img src="https://api.dicebear.com/7.x/bottts/svg?seed=Felix" alt="Avatar" id="avatarImg">
|
||||
<span id="avatarStatus">等待互动...</span>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<canvas id="canvas" style="display:none;"></canvas>
|
||||
|
||||
<div class="controls">
|
||||
<button id="talkBtn" disabled>连接中...</button>
|
||||
</div>
|
||||
|
||||
<div id="status">正在准备摄像头和麦克风...</div>
|
||||
<div id="transcript"></div>
|
||||
|
||||
<script>
|
||||
// Cached DOM references used by the rest of this script.
const byId = (id) => document.getElementById(id);

const videoElement = byId('userVideo');
const canvasElement = byId('canvas');
const talkBtn = byId('talkBtn');
const statusText = byId('status');
const transcriptText = byId('transcript');
const avatarStatus = byId('avatarStatus');

// Mutable session state shared by the functions below.
let ws; // WebSocket to the backend
let mediaRecorder; // MediaRecorder used for push-to-talk
let audioChunks = []; // data chunks collected during the current recording
let isRecording = false;
|
||||
|
||||
// 1. 初始化摄像头和麦克风
|
||||
/**
 * Requests camera and microphone access, shows the camera preview, prepares
 * the push-to-talk recorder and then opens the WebSocket connection.
 *
 * On failure (e.g. permission denied) the error is logged and a message is
 * shown in the status line; the function itself never rejects.
 *
 * @returns {Promise<void>}
 */
async function initMedia() {
  try {
    const stream = await navigator.mediaDevices.getUserMedia({ video: true, audio: true });
    videoElement.srcObject = stream;

    // Record the audio tracks only. Recording the combined stream would embed
    // the camera video in every clip even though the payload is sent as audio.
    const audioStream = new MediaStream(stream.getAudioTracks());
    mediaRecorder = new MediaRecorder(audioStream);

    mediaRecorder.ondataavailable = (event) => {
      if (event.data.size > 0) audioChunks.push(event.data);
    };

    mediaRecorder.onstop = async () => {
      const audioBlob = new Blob(audioChunks, { type: 'audio/webm' });
      audioChunks = []; // reset for the next recording

      // A very short press can yield no audio data at all; do not send it.
      if (audioBlob.size === 0) {
        statusText.innerText = '⚠️ 没有录到声音,请按住按钮再说话。';
        return;
      }

      try {
        // Grab the current camera frame at the moment the recording ends.
        const imageBase64 = captureFrame();
        const audioBase64 = await blobToBase64(audioBlob);
        sendToServer(audioBase64, imageBase64);
      } catch (err) {
        // Without this, a failure here would be an unhandled promise rejection.
        statusText.innerText = '⚠️ 处理录音失败,请重试。';
        console.error(err);
      }
    };

    statusText.innerText = "设备已就绪,正在连接服务器...";
    initWebSocket();
  } catch (err) {
    statusText.innerText = "获取摄像头/麦克风失败,请允许权限!";
    console.error(err);
  }
}
|
||||
|
||||
// 2. 初始化 WebSocket 连接
|
||||
/**
 * Opens the WebSocket to the backend and wires up its event handlers.
 *
 * - open:    enables the talk button.
 * - message: appends `text` frames to the transcript, plays `audio` frames.
 * - error:   logs the event.
 * - close:   disables the talk button.
 */
function initWebSocket() {
  // The backend is assumed to listen on port 8000 of the local machine.
  ws = new WebSocket('ws://localhost:8000/ws');

  ws.onopen = () => {
    statusText.innerText = "✅ 已连接到大脑!长按按钮说话。";
    talkBtn.innerText = "按住说话 🎙️";
    talkBtn.disabled = false;
  };

  ws.onmessage = (event) => {
    let response;
    try {
      response = JSON.parse(event.data);
    } catch (err) {
      // A malformed frame must not throw out of the handler.
      console.error('Ignoring malformed server message:', err);
      return;
    }

    if (response?.type === 'text') {
      // NOTE(review): content is inserted as HTML because the backend sends
      // markup (<b>, <i>, <br>); the backend must escape any untrusted text.
      transcriptText.innerHTML += response.content;
    } else if (response?.type === 'audio') {
      // TTS audio delivered as a data URL.
      playAudio(response.content);
    }
  };

  ws.onerror = (event) => {
    console.error('WebSocket error:', event);
  };

  ws.onclose = () => {
    statusText.innerText = "❌ 与服务器断开连接";
    talkBtn.disabled = true;
  };
}
|
||||
|
||||
// 3. 截取视频帧转为 Base64
|
||||
/**
 * Draws the current camera frame onto the hidden canvas and returns it as a
 * JPEG data URL.
 *
 * The frame is scaled down to 320x240 to keep the payload small.
 *
 * @returns {string} JPEG data URL encoded at quality 0.8.
 */
function captureFrame() {
  const frameWidth = 320;
  const frameHeight = 240;
  const ctx = canvasElement.getContext('2d');

  canvasElement.width = frameWidth;
  canvasElement.height = frameHeight;
  ctx.drawImage(videoElement, 0, 0, frameWidth, frameHeight);

  const jpegDataUrl = canvasElement.toDataURL('image/jpeg', 0.8);
  return jpegDataUrl;
}
|
||||
|
||||
// 4. 将 Blob 转为 Base64 的辅助函数
|
||||
/**
 * Reads a Blob and returns its contents as a base64 data URL.
 *
 * @param {Blob} blob - The data to encode.
 * @returns {Promise<string>} Resolves with the data URL; rejects with the
 *   reader's error if the read fails (instead of resolving with `null`).
 */
function blobToBase64(blob) {
  return new Promise((resolve, reject) => {
    const reader = new FileReader();
    reader.onload = () => resolve(reader.result);
    reader.onerror = () => reject(reader.error ?? new Error('Failed to read blob'));
    reader.readAsDataURL(blob);
  });
}
|
||||
|
||||
// 5. 将图文数据发送给后端
|
||||
/**
 * Sends one user turn (recorded audio plus a camera frame) to the backend.
 *
 * If the socket is not open nothing is sent and the UI is put back into its
 * idle state, rather than throwing or leaving the UI on "thinking".
 *
 * @param {string} audioB64 - Recorded audio as a base64 data URL.
 * @param {string} imageB64 - Camera frame as a base64 data URL.
 */
function sendToServer(audioB64, imageB64) {
  if (!ws || ws.readyState !== WebSocket.OPEN) {
    statusText.innerText = "❌ 与服务器断开连接";
    avatarStatus.innerText = "等待互动...";
    return;
  }

  statusText.innerText = "🧠 正在思考...";
  avatarStatus.innerText = "思考中 🤔";
  transcriptText.innerHTML = "<strong>AI: </strong>"; // start a fresh reply area

  const payload = {
    type: "user_input",
    audio: audioB64,
    image: imageB64,
  };
  ws.send(JSON.stringify(payload));
}
|
||||
|
||||
// 6. 播放后端返回的音频
|
||||
/**
 * Plays TTS audio returned by the backend and keeps the avatar status in sync.
 *
 * The UI returns to idle when playback ends, and also when playback cannot
 * start or fails, so it is never left stuck on "speaking".
 *
 * @param {string} base64Audio - Audio source as a base64 data URL.
 */
function playAudio(base64Audio) {
  const resetToIdle = (statusMessage) => {
    avatarStatus.innerText = "等待互动...";
    statusText.innerText = statusMessage;
  };

  avatarStatus.innerText = "说话中 🗣️";
  const audio = new Audio(base64Audio);

  // Attach handlers before starting playback so no event can be missed.
  audio.onended = () => resetToIdle("✅ 播放完毕,可继续对话。");
  audio.onerror = () => resetToIdle("⚠️ 音频播放失败");

  // play() returns a promise that rejects when playback cannot start.
  Promise.resolve(audio.play()).catch((err) => {
    console.error(err);
    resetToIdle("⚠️ 音频播放失败");
  });
}
|
||||
|
||||
// 7. 绑定按钮事件 (鼠标按下/松开 模拟对讲机)
|
||||
/**
 * Starts a push-to-talk recording. Does nothing if a recording is already in
 * progress or the recorder is not ready.
 */
function startRecording() {
  if (isRecording || !mediaRecorder || mediaRecorder.state !== 'inactive') return;
  mediaRecorder.start();
  isRecording = true;
  talkBtn.innerText = "松开发送 ⬆️";
  statusText.innerText = "🎙️ 正在录音...并准备抓取画面...";
}

/**
 * Stops the current recording; the recorder's stop handler then sends the
 * clip. Does nothing if no recording is in progress.
 */
function stopRecording() {
  if (!isRecording) return;
  if (mediaRecorder.state !== 'inactive') mediaRecorder.stop();
  isRecording = false;
  talkBtn.innerText = "按住说话 🎙️";
}

// Walkie-talkie style control: hold to record, release to send.
talkBtn.addEventListener('mousedown', startRecording);
talkBtn.addEventListener('mouseup', stopRecording);
// Releasing the mouse after dragging off the button fires no mouseup on the
// button, so also stop when the pointer leaves it.
talkBtn.addEventListener('mouseleave', stopRecording);

// Touch support. preventDefault() suppresses the emulated mouse events that
// would otherwise start/stop a second time.
talkBtn.addEventListener('touchstart', (event) => {
  event.preventDefault();
  startRecording();
}, { passive: false });
talkBtn.addEventListener('touchend', (event) => {
  event.preventDefault();
  stopRecording();
}, { passive: false });
talkBtn.addEventListener('touchcancel', stopRecording);

// Start the application.
initMedia();
|
||||
</script>
|
||||
</body>
|
||||
</html>
|
||||
@@ -0,0 +1,8 @@
|
||||
import uvicorn
|
||||
|
||||
from server.app import app
|
||||
from server.config import SERVER_HOST, SERVER_PORT
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Serve the application on the host/port imported from server.config.
    uvicorn.run(
        app,
        host=SERVER_HOST,
        port=SERVER_PORT,
    )
|
||||
@@ -0,0 +1,3 @@
|
||||
from .app import app
|
||||
|
||||
__all__ = ["app"]
|
||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,56 @@
|
||||
import asyncio
|
||||
|
||||
from autogen_agentchat.agents import AssistantAgent
|
||||
from autogen_agentchat.messages import MultiModalMessage, TextMessage
|
||||
from autogen_core import Image
|
||||
from autogen_core.models import ModelFamily
|
||||
from autogen_ext.models.ollama import OllamaChatCompletionClient
|
||||
|
||||
from . import config
|
||||
from .mcp_tools import load_mcp_tools
|
||||
|
||||
|
||||
class AvatarAgentService:
    """Wraps a single, lazily created ``AssistantAgent`` backed by an Ollama model."""

    def __init__(self) -> None:
        """Build the model client; the agent itself is created on first use."""
        capabilities = {
            "vision": True,
            "function_calling": True,
            "json_output": True,
            "family": ModelFamily.UNKNOWN,
            "structured_output": True,
        }
        self._model_client = OllamaChatCompletionClient(
            model=config.OLLAMA_MODEL,
            model_info=capabilities,
        )
        self._agent: AssistantAgent | None = None
        # Serialises agent creation between concurrent callers.
        self._agent_lock = asyncio.Lock()

    async def _create_agent(self) -> AssistantAgent:
        """Load the MCP tools and construct the agent with them (if any)."""
        mcp_tools = await load_mcp_tools()
        has_tools = bool(mcp_tools)
        return AssistantAgent(
            name="avatar",
            model_client=self._model_client,
            system_message=config.SYSTEM_MESSAGE,
            tools=mcp_tools or None,
            reflect_on_tool_use=has_tools,
        )

    async def _get_agent(self) -> AssistantAgent:
        """Return the shared agent, creating it the first time it is needed."""
        if self._agent is None:
            async with self._agent_lock:
                # Re-check under the lock: another task may have created it
                # while this one was waiting.
                if self._agent is None:
                    self._agent = await self._create_agent()
        return self._agent

    async def reply(self, user_text: str, image_b64: str) -> str:
        """Run the agent on the user's text plus image and return its answer.

        Args:
            user_text: What the user said.
            image_b64: Base64-encoded image passed to ``Image.from_base64``.

        Returns:
            The content of the last ``TextMessage`` whose source is
            ``"avatar"``, or ``""`` if the stream contains none.
        """
        agent = await self._get_agent()
        task = MultiModalMessage(
            source="user",
            content=[user_text, Image.from_base64(image_b64)],
        )

        final_text = ""
        async for event in agent.run_stream(task=task):
            if not isinstance(event, TextMessage):
                continue
            if event.source != "avatar":
                continue
            final_text = event.content
        return final_text
|
||||
@@ -0,0 +1,6 @@
|
||||
from fastapi import FastAPI
|
||||
|
||||
from .ws import router as ws_router
|
||||
|
||||
# Application instance; the WebSocket routes are attached from the ws module.
app = FastAPI()
app.include_router(router=ws_router)
|
||||
@@ -0,0 +1,44 @@
|
||||
import os
|
||||
import shlex
|
||||
|
||||
|
||||
def _env_bool(name: str, default: bool) -> bool:
|
||||
value = os.getenv(name)
|
||||
if value is None:
|
||||
return default
|
||||
return value.strip().lower() in {"1", "true", "yes", "on"}
|
||||
|
||||
|
||||
def _env_args(name: str, default: str = "") -> list[str]:
|
||||
value = os.getenv(name, default)
|
||||
if not value.strip():
|
||||
return []
|
||||
return shlex.split(value)
|
||||
|
||||
|
||||
# --- Agent prompt -----------------------------------------------------------
SYSTEM_MESSAGE = (
    "你是一个友好、幽默的AI虚拟主播。你可以看到用户摄像头传来的画面,也能听到他们的话。"
    "请用简短、自然、热情的中文口语回答,每次回答控制在两三句话以内,不要输出任何 Markdown 格式。"
    "当用户询问实时天气、最新新闻或网页信息时,优先使用可用工具先查询再回答。"
)

# --- Speech recognition (Whisper) -------------------------------------------
WHISPER_MODEL_NAME = "base"
WHISPER_DEVICE = "cpu"
WHISPER_COMPUTE_TYPE = "int8"
WHISPER_LANGUAGE = "zh"
WHISPER_BEAM_SIZE = 5

# --- Speech synthesis and language model ------------------------------------
TTS_VOICE = "zh-CN-XiaoxiaoNeural"
OLLAMA_MODEL = "qwen3-vl:latest"

# --- HTTP server --------------------------------------------------------------
SERVER_HOST = "0.0.0.0"
SERVER_PORT = 8000

# --- MCP tool servers (all overridable through environment variables) -------
ENABLE_MCP_TOOLS = _env_bool("ENABLE_MCP_TOOLS", True)
MCP_SERVER_READ_TIMEOUT_SECONDS = float(os.getenv("MCP_SERVER_READ_TIMEOUT_SECONDS", "30"))

# An empty command means the corresponding server is not configured.
MCP_WEATHER_SERVER_COMMAND = os.getenv("MCP_WEATHER_SERVER_COMMAND", "")
MCP_WEATHER_SERVER_ARGS = _env_args("MCP_WEATHER_SERVER_ARGS")

MCP_WEBSEARCH_SERVER_COMMAND = os.getenv("MCP_WEBSEARCH_SERVER_COMMAND", "")
MCP_WEBSEARCH_SERVER_ARGS = _env_args("MCP_WEBSEARCH_SERVER_ARGS")
|
||||
@@ -0,0 +1,73 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from dataclasses import dataclass
|
||||
from typing import Any
|
||||
|
||||
from autogen_ext.tools.mcp import StdioServerParams, mcp_server_tools
|
||||
|
||||
from . import config
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class MCPServerConfig:
    """Immutable description of one MCP server to launch."""

    # Label used in log output.
    name: str
    # Executable that starts the server.
    command: str
    # Arguments passed to the executable.
    args: list[str]
|
||||
|
||||
|
||||
def _configured_servers() -> list[MCPServerConfig]:
|
||||
if not config.ENABLE_MCP_TOOLS:
|
||||
return []
|
||||
|
||||
servers: list[MCPServerConfig] = []
|
||||
if config.MCP_WEATHER_SERVER_COMMAND:
|
||||
servers.append(
|
||||
MCPServerConfig(
|
||||
name="weather",
|
||||
command=config.MCP_WEATHER_SERVER_COMMAND,
|
||||
args=config.MCP_WEATHER_SERVER_ARGS,
|
||||
)
|
||||
)
|
||||
if config.MCP_WEBSEARCH_SERVER_COMMAND:
|
||||
servers.append(
|
||||
MCPServerConfig(
|
||||
name="websearch",
|
||||
command=config.MCP_WEBSEARCH_SERVER_COMMAND,
|
||||
args=config.MCP_WEBSEARCH_SERVER_ARGS,
|
||||
)
|
||||
)
|
||||
return servers
|
||||
|
||||
|
||||
async def load_mcp_tools() -> list[Any]:
    """Load the tools exposed by every configured MCP server.

    A server that fails to load is reported and skipped. When two tools share
    a name, the first one loaded wins and the later one is reported and skipped.

    Returns:
        The loaded tools in load order; empty when nothing is configured or
        nothing could be loaded.
    """
    servers = _configured_servers()
    if not servers:
        print("ℹ️ MCP 工具未配置,跳过加载。")
        return []

    # Insertion-ordered mapping: doubles as the duplicate-name check.
    tools_by_name: dict[str, Any] = {}

    for server in servers:
        params = StdioServerParams(
            command=server.command,
            args=server.args,
            read_timeout_seconds=config.MCP_SERVER_READ_TIMEOUT_SECONDS,
        )
        try:
            discovered = await mcp_server_tools(params)
            for tool in discovered:
                if tool.name in tools_by_name:
                    print(f"⚠️ MCP 工具重名,已跳过: {tool.name}")
                    continue
                tools_by_name[tool.name] = tool
            print(f"✅ MCP 服务已加载: {server.name} ({len(discovered)} tools)")
        except Exception as exc:
            # Best effort: one broken server must not prevent the others.
            print(f"⚠️ MCP 服务加载失败: {server.name}, error={exc}")

    collected = list(tools_by_name.values())
    if collected:
        print(f"✅ MCP 工具总数: {len(collected)}")
    else:
        print("ℹ️ 未加载到任何 MCP 工具。")
    return collected
|
||||
@@ -0,0 +1,35 @@
|
||||
import base64
|
||||
|
||||
import edge_tts
|
||||
from faster_whisper import WhisperModel
|
||||
|
||||
from . import config
|
||||
|
||||
|
||||
class SpeechService:
    """Speech-to-text via a local Whisper model and text-to-speech via edge-tts."""

    def __init__(self) -> None:
        """Load the Whisper model described by the ``config.WHISPER_*`` settings."""
        print("⏳ 正在加载本地语音识别模型 (首次启动可能需要下载)...")
        self._whisper_model = WhisperModel(
            config.WHISPER_MODEL_NAME,
            device=config.WHISPER_DEVICE,
            compute_type=config.WHISPER_COMPUTE_TYPE,
        )
        print("✅ 本地语音模型加载完毕!")

    def transcribe(self, audio_path: str) -> str:
        """Transcribe an audio file to text.

        This call is synchronous and blocks until all segments are joined.

        Args:
            audio_path: Path of the audio file to transcribe.

        Returns:
            The text of all recognised segments concatenated without separators.
        """
        segments, _ = self._whisper_model.transcribe(
            audio_path,
            beam_size=config.WHISPER_BEAM_SIZE,
            language=config.WHISPER_LANGUAGE,
        )
        return "".join(segment.text for segment in segments)

    async def synthesize_audio_data_url(self, text: str) -> str:
        """Synthesize speech for ``text`` and return it as a base64 data URL.

        Args:
            text: The text to speak, using the voice ``config.TTS_VOICE``.

        Returns:
            A ``data:audio/mp3;base64,...`` URL holding the audio bytes.
        """
        communicate = edge_tts.Communicate(text, config.TTS_VOICE)

        # Collect the pieces and join once; repeated ``bytes +=`` re-copies the
        # whole buffer for every chunk.
        audio_parts = [
            chunk["data"]
            async for chunk in communicate.stream()
            if chunk["type"] == "audio"
        ]

        audio_b64 = base64.b64encode(b"".join(audio_parts)).decode("utf-8")
        return f"data:audio/mp3;base64,{audio_b64}"
|
||||
@@ -0,0 +1,67 @@
|
||||
import base64
|
||||
import json
|
||||
import os
|
||||
import tempfile
|
||||
|
||||
from fastapi import APIRouter, WebSocket, WebSocketDisconnect
|
||||
|
||||
from .agent_service import AvatarAgentService
|
||||
from .speech import SpeechService
|
||||
from .ws_messages import send_audio_message, send_text_message
|
||||
|
||||
# Router exposing the WebSocket endpoint defined in this module.
router = APIRouter()

# Module-level singletons shared by every connection; created at import time.
speech_service = SpeechService()
agent_service = AvatarAgentService()
|
||||
|
||||
|
||||
def _save_audio_to_temp_file(audio_b64: str) -> str:
|
||||
temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".webm")
|
||||
try:
|
||||
temp_file.write(base64.b64decode(audio_b64))
|
||||
return temp_file.name
|
||||
finally:
|
||||
temp_file.close()
|
||||
|
||||
|
||||
@router.websocket("/ws")
async def websocket_endpoint(websocket: WebSocket) -> None:
    """Serve one client: speech in, text plus synthesized speech out.

    For every ``user_input`` message the handler transcribes the audio, asks
    the agent for a reply based on the text and the camera frame, and streams
    progress text, the reply and finally the TTS audio back to the client.
    Messages that are not valid JSON, have another ``type``, or lack string
    ``audio``/``image`` fields are ignored.

    Args:
        websocket: The client connection.
    """
    # Function-scope imports keep this handler self-contained.
    import asyncio
    import html

    await websocket.accept()
    print("✅ WebSocket 连接成功!准备就绪。")

    try:
        while True:
            message_text = await websocket.receive_text()
            try:
                data = json.loads(message_text)
            except json.JSONDecodeError:
                # One malformed frame should not end the session.
                continue

            if not isinstance(data, dict) or data.get("type") != "user_input":
                continue

            audio_field = data.get("audio")
            image_field = data.get("image")
            if not isinstance(audio_field, str) or not isinstance(image_field, str):
                continue

            # Strip the "data:...;base64," prefix of a data URL, if present.
            audio_b64 = audio_field.split(",")[-1]
            image_b64 = image_field.split(",")[-1]

            audio_path = _save_audio_to_temp_file(audio_b64)
            try:
                await send_text_message(websocket, "<i>[👂 正在辨识语音...]</i><br>")
                # Transcription is synchronous; run it in a worker thread so
                # the event loop stays responsive.
                user_text = await asyncio.to_thread(speech_service.transcribe, audio_path)
            finally:
                if os.path.exists(audio_path):
                    os.remove(audio_path)

            if not user_text.strip():
                await send_text_message(websocket, "<i>[没听清你说什么...]</i><br>")
                continue

            # These messages are rendered as HTML by the client, so text that
            # comes from the user or the model is escaped before embedding.
            await send_text_message(websocket, f"<b>你说:</b>{html.escape(user_text)}<br>")
            await send_text_message(websocket, "<i>[🧠 正在看图思考...]</i><br>")

            ai_response = await agent_service.reply(user_text, image_b64)
            await send_text_message(websocket, f"<b>AI主播:</b>{html.escape(ai_response)}<br><br>")

            await send_text_message(websocket, "<i>[🗣️ 正在生成语音...]</i><br>")
            # TTS receives the raw reply, not the HTML-escaped one.
            audio_data_url = await speech_service.synthesize_audio_data_url(ai_response)
            await send_audio_message(websocket, audio_data_url)

    except WebSocketDisconnect:
        print("❌ 前端页面已关闭或断开连接")
    except Exception as exc:
        print(f"⚠️ 发生错误: {exc}")
|
||||
@@ -0,0 +1,18 @@
|
||||
import json
|
||||
|
||||
from fastapi import WebSocket
|
||||
|
||||
|
||||
async def send_text_message(websocket: WebSocket, content: str) -> None:
    """Send ``content`` to the client as a JSON frame of type ``"text"``.

    Args:
        websocket: Connection to send on.
        content: Text placed in the frame's ``content`` field.
    """
    frame = {"type": "text", "content": content}
    encoded = json.dumps(frame)
    await websocket.send_text(encoded)
|
||||
|
||||
|
||||
async def send_audio_message(websocket: WebSocket, audio_data_url: str) -> None:
    """Send audio to the client as a JSON frame of type ``"audio"``.

    Args:
        websocket: Connection to send on.
        audio_data_url: Value placed in the frame's ``content`` field.
    """
    frame = {"type": "audio", "content": audio_data_url}
    encoded = json.dumps(frame)
    await websocket.send_text(encoded)
|
||||
Binary file not shown.
Binary file not shown.
Reference in New Issue
Block a user