初始化
This commit is contained in:
4
.vscode/settings.json
vendored
Normal file
4
.vscode/settings.json
vendored
Normal file
@@ -0,0 +1,4 @@
|
|||||||
|
{
|
||||||
|
"python-envs.defaultEnvManager": "ms-python.python:conda",
|
||||||
|
"python-envs.defaultPackageManager": "ms-python.python:conda"
|
||||||
|
}
|
||||||
BIN
__pycache__/main.cpython-312.pyc
Normal file
BIN
__pycache__/main.cpython-312.pyc
Normal file
Binary file not shown.
190
index.html
Normal file
190
index.html
Normal file
@@ -0,0 +1,190 @@
|
|||||||
|
<!DOCTYPE html>
|
||||||
|
<html lang="zh-CN">
|
||||||
|
<head>
|
||||||
|
<meta charset="UTF-8">
|
||||||
|
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
||||||
|
<title>VLM 虚拟主播</title>
|
||||||
|
<style>
|
||||||
|
body { font-family: sans-serif; display: flex; flex-direction: column; align-items: center; background-color: #f0f2f5; margin-top: 50px; }
|
||||||
|
.container { display: flex; gap: 20px; }
|
||||||
|
.video-box { background: #000; border-radius: 10px; overflow: hidden; width: 320px; height: 240px; position: relative; }
|
||||||
|
video { width: 100%; height: 100%; object-fit: cover; }
|
||||||
|
.avatar-box { width: 320px; height: 240px; background: #fff; border-radius: 10px; display: flex; align-items: center; justify-content: center; box-shadow: 0 4px 12px rgba(0,0,0,0.1); flex-direction: column; }
|
||||||
|
.avatar-box img { width: 100px; height: 100px; border-radius: 50%; margin-bottom: 10px; }
|
||||||
|
.controls { margin-top: 30px; }
|
||||||
|
button { padding: 15px 30px; font-size: 18px; border: none; border-radius: 25px; cursor: pointer; background-color: #007bff; color: white; transition: background 0.2s; }
|
||||||
|
button:active { background-color: #0056b3; }
|
||||||
|
button:disabled { background-color: #ccc; cursor: not-allowed; }
|
||||||
|
#status { margin-top: 15px; color: #555; font-weight: bold; }
|
||||||
|
#transcript { margin-top: 20px; width: 600px; text-align: center; color: #333; }
|
||||||
|
</style>
|
||||||
|
</head>
|
||||||
|
<body>
|
||||||
|
|
||||||
|
<h2>🤖 多模态虚拟主播 (VLM)</h2>
|
||||||
|
|
||||||
|
<div class="container">
|
||||||
|
<div class="video-box">
|
||||||
|
<video id="userVideo" autoplay muted playsinline></video>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<div class="avatar-box" id="avatarBox">
|
||||||
|
<img src="https://api.dicebear.com/7.x/bottts/svg?seed=Felix" alt="Avatar" id="avatarImg">
|
||||||
|
<span id="avatarStatus">等待互动...</span>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<canvas id="canvas" style="display:none;"></canvas>
|
||||||
|
|
||||||
|
<div class="controls">
|
||||||
|
<button id="talkBtn" disabled>连接中...</button>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<div id="status">正在准备摄像头和麦克风...</div>
|
||||||
|
<div id="transcript"></div>
|
||||||
|
|
||||||
|
<script>
|
||||||
|
// Cached references to the DOM nodes this script reads or updates.
const videoElement = document.querySelector('#userVideo');
const canvasElement = document.querySelector('#canvas');
const talkBtn = document.querySelector('#talkBtn');
const statusText = document.querySelector('#status');
const transcriptText = document.querySelector('#transcript');
const avatarStatus = document.querySelector('#avatarStatus');

// Mutable session state shared by the functions below.
let ws; // WebSocket connection, assigned by initWebSocket()
let mediaRecorder; // MediaRecorder instance, assigned by initMedia()
let audioChunks = []; // Blob parts collected during the current recording
let isRecording = false; // true between button press and release
|
||||||
|
|
||||||
|
// 1. 初始化摄像头和麦克风
|
||||||
|
/**
 * Requests camera and microphone access, shows the camera preview, sets up
 * the audio recorder and then opens the WebSocket connection.
 *
 * Any failure while acquiring the devices is logged and reported in the
 * status line; nothing is rethrown.
 *
 * @returns {Promise<void>}
 */
async function initMedia() {
  try {
    const stream = await navigator.mediaDevices.getUserMedia({ video: true, audio: true });
    videoElement.srcObject = stream;

    // Record the microphone only. A recorder built on the combined stream
    // would embed the camera video in every "audio" upload; the picture is
    // sent separately as the single frame captured in onstop.
    const audioOnlyStream = new MediaStream(stream.getAudioTracks());
    mediaRecorder = new MediaRecorder(audioOnlyStream);

    mediaRecorder.ondataavailable = (event) => {
      if (event.data.size > 0) {
        audioChunks.push(event.data);
      }
    };

    mediaRecorder.onstop = async () => {
      const audioBlob = new Blob(audioChunks, { type: 'audio/webm' });
      audioChunks = []; // reset for the next recording

      try {
        // Grab the camera frame at the moment the recording ends.
        const imageBase64 = captureFrame();
        const audioBase64 = await blobToBase64(audioBlob);
        sendToServer(audioBase64, imageBase64);
      } catch (err) {
        // An async event handler has no caller to propagate to, so report
        // the failure here instead of leaving an unhandled rejection.
        statusText.innerText = "❌ 发送失败,请重试。";
        console.error(err);
      }
    };

    statusText.innerText = "设备已就绪,正在连接服务器...";
    initWebSocket();
  } catch (err) {
    statusText.innerText = "获取摄像头/麦克风失败,请允许权限!";
    console.error(err);
  }
}
|
||||||
|
|
||||||
|
// 2. 初始化 WebSocket 连接
|
||||||
|
/**
 * Opens the WebSocket to the backend and installs its event handlers.
 *
 * - open:    enables the talk button.
 * - message: appends `text` frames to the transcript and plays `audio` frames.
 * - error:   logs the event.
 * - close:   disables the talk button.
 *
 * The connection is stored in the shared `ws` variable.
 */
function initWebSocket() {
  // The backend is assumed to be the FastAPI server listening on port 8000.
  ws = new WebSocket('ws://localhost:8000/ws');

  ws.onopen = () => {
    statusText.innerText = "✅ 已连接到大脑!长按按钮说话。";
    talkBtn.innerText = "按住说话 🎙️";
    talkBtn.disabled = false;
  };

  ws.onmessage = (event) => {
    let response;
    try {
      response = JSON.parse(event.data);
    } catch (err) {
      // A malformed frame must not throw out of the handler.
      console.error('Ignoring malformed server message', err);
      return;
    }
    if (response === null || typeof response !== 'object') {
      return;
    }

    if (response.type === 'text') {
      // Text frames are small HTML fragments, so they are appended as markup.
      // NOTE(review): this trusts the server to escape any user- or
      // model-generated text it embeds in those fragments.
      transcriptText.innerHTML += response.content;
    } else if (response.type === 'audio') {
      // TTS audio delivered as a data URL.
      playAudio(response.content);
    }
  };

  ws.onerror = (event) => {
    console.error('WebSocket error', event);
  };

  ws.onclose = () => {
    statusText.innerText = "❌ 与服务器断开连接";
    talkBtn.disabled = true;
  };
}
|
||||||
|
|
||||||
|
// 3. 截取视频帧转为 Base64
|
||||||
|
/**
 * Draws the current camera frame onto the hidden canvas, downscaled to
 * 320x240 to keep the upload small, and returns it as a JPEG data URL
 * encoded at quality 0.8.
 *
 * @returns {string} `data:image/jpeg;base64,...` URL of the frame.
 */
function captureFrame() {
  const FRAME_WIDTH = 320;
  const FRAME_HEIGHT = 240;

  const ctx = canvasElement.getContext('2d');
  canvasElement.width = FRAME_WIDTH;
  canvasElement.height = FRAME_HEIGHT;
  ctx.drawImage(videoElement, 0, 0, FRAME_WIDTH, FRAME_HEIGHT);

  const jpegDataUrl = canvasElement.toDataURL('image/jpeg', 0.8);
  return jpegDataUrl;
}
|
||||||
|
|
||||||
|
// 4. 将 Blob 转为 Base64 的辅助函数
|
||||||
|
/**
 * Reads a Blob and returns its contents as a base64 data URL.
 *
 * @param {Blob} blob - Data to encode.
 * @returns {Promise<string>} Resolves with the `data:...;base64,...` URL.
 *   Rejects if the read fails or is aborted, instead of resolving with
 *   `null` as a plain `onloadend` handler would.
 */
function blobToBase64(blob) {
  return new Promise((resolve, reject) => {
    const reader = new FileReader();
    reader.onload = () => resolve(reader.result);
    reader.onerror = () => reject(reader.error || new Error('Failed to read blob'));
    reader.onabort = () => reject(new Error('Blob read was aborted'));
    reader.readAsDataURL(blob);
  });
}
|
||||||
|
|
||||||
|
// 5. 将图文数据发送给后端
|
||||||
|
/**
 * Sends one user turn (recorded audio plus a camera frame) to the backend
 * and switches the UI to its "thinking" state.
 *
 * If the socket is not open, nothing is sent and the status line reports the
 * lost connection instead of staying on "thinking" indefinitely.
 *
 * @param {string} audioB64 - Recorded audio as a base64 data URL.
 * @param {string} imageB64 - Camera frame as a base64 data URL.
 */
function sendToServer(audioB64, imageB64) {
  // readyState 1 is OPEN. send() throws while the socket is still connecting
  // and discards the data once it is closing or closed.
  if (!ws || ws.readyState !== 1) {
    statusText.innerText = "❌ 与服务器断开连接";
    avatarStatus.innerText = "等待互动...";
    return;
  }

  statusText.innerText = "🧠 正在思考...";
  avatarStatus.innerText = "思考中 🤔";
  transcriptText.innerHTML = "<strong>AI: </strong>"; // reset transcript for the new reply

  const payload = {
    type: "user_input",
    audio: audioB64,
    image: imageB64,
  };
  ws.send(JSON.stringify(payload));
}
|
||||||
|
|
||||||
|
// 6. 播放后端返回的音频
|
||||||
|
/**
 * Plays the TTS audio returned by the backend and keeps the avatar status in
 * sync with playback.
 *
 * The avatar returns to its idle state when playback ends, and also when
 * playback fails (undecodable audio, or the browser refusing to start it),
 * so the UI never stays stuck on "speaking".
 *
 * @param {string} base64Audio - Audio as a data URL.
 */
function playAudio(base64Audio) {
  avatarStatus.innerText = "说话中 🗣️";
  const audio = new Audio(base64Audio);

  const backToIdle = (message) => {
    avatarStatus.innerText = "等待互动...";
    statusText.innerText = message;
  };

  // Install handlers before starting playback so no event can be missed.
  audio.onended = () => backToIdle("✅ 播放完毕,可继续对话。");
  audio.onerror = () => backToIdle("⚠️ 音频播放失败,可继续对话。");

  // play() returns a promise in current browsers; it rejects when playback
  // cannot start.
  const playback = audio.play();
  if (playback && typeof playback.catch === 'function') {
    playback.catch((err) => {
      console.error(err);
      backToIdle("⚠️ 音频播放失败,可继续对话。");
    });
  }
}
|
||||||
|
|
||||||
|
// 7. 绑定按钮事件 (鼠标按下/松开 模拟对讲机)
|
||||||
|
/**
 * Starts a recording when the talk button is pressed.
 * Does nothing if a recording is already running or the recorder has not
 * been created yet.
 */
function startRecording() {
  if (isRecording || !mediaRecorder) {
    return;
  }
  mediaRecorder.start();
  isRecording = true;
  talkBtn.innerText = "松开发送 ⬆️";
  statusText.innerText = "🎙️ 正在录音...并准备抓取画面...";
}

/**
 * Stops the running recording; the recorder's onstop handler then uploads
 * the turn. Safe to call when no recording is active.
 */
function stopRecording() {
  if (!isRecording) {
    return;
  }
  mediaRecorder.stop();
  isRecording = false;
  talkBtn.innerText = "按住说话 🎙️";
}

// Push-to-talk. Pointer events cover mouse, touch and pen. The cancel/leave
// events make sure a press that ends outside the button still stops the
// recording instead of leaving it running.
talkBtn.addEventListener('pointerdown', startRecording);
['pointerup', 'pointercancel', 'pointerleave'].forEach((type) => {
  talkBtn.addEventListener(type, stopRecording);
});

// Start the app.
initMedia();
|
||||||
|
</script>
|
||||||
|
</body>
|
||||||
|
</html>
|
||||||
8
main.py
Normal file
8
main.py
Normal file
@@ -0,0 +1,8 @@
|
|||||||
|
import uvicorn
|
||||||
|
|
||||||
|
from server.app import app
|
||||||
|
from server.config import SERVER_HOST, SERVER_PORT
|
||||||
|
|
||||||
|
|
||||||
|
def main() -> None:
    """Serve the FastAPI app with uvicorn on the configured host and port."""
    uvicorn.run(app, host=SERVER_HOST, port=SERVER_PORT)


if __name__ == "__main__":
    main()
|
||||||
3
server/__init__.py
Normal file
3
server/__init__.py
Normal file
@@ -0,0 +1,3 @@
|
|||||||
|
from .app import app
|
||||||
|
|
||||||
|
# Public names of the package: only `app`, imported from .app above, is exported.
__all__ = ["app"]
|
||||||
BIN
server/__pycache__/__init__.cpython-312.pyc
Normal file
BIN
server/__pycache__/__init__.cpython-312.pyc
Normal file
Binary file not shown.
BIN
server/__pycache__/agent_service.cpython-312.pyc
Normal file
BIN
server/__pycache__/agent_service.cpython-312.pyc
Normal file
Binary file not shown.
BIN
server/__pycache__/app.cpython-312.pyc
Normal file
BIN
server/__pycache__/app.cpython-312.pyc
Normal file
Binary file not shown.
BIN
server/__pycache__/config.cpython-312.pyc
Normal file
BIN
server/__pycache__/config.cpython-312.pyc
Normal file
Binary file not shown.
BIN
server/__pycache__/mcp_tools.cpython-312.pyc
Normal file
BIN
server/__pycache__/mcp_tools.cpython-312.pyc
Normal file
Binary file not shown.
BIN
server/__pycache__/speech.cpython-312.pyc
Normal file
BIN
server/__pycache__/speech.cpython-312.pyc
Normal file
Binary file not shown.
BIN
server/__pycache__/ws.cpython-312.pyc
Normal file
BIN
server/__pycache__/ws.cpython-312.pyc
Normal file
Binary file not shown.
BIN
server/__pycache__/ws_messages.cpython-312.pyc
Normal file
BIN
server/__pycache__/ws_messages.cpython-312.pyc
Normal file
Binary file not shown.
56
server/agent_service.py
Normal file
56
server/agent_service.py
Normal file
@@ -0,0 +1,56 @@
|
|||||||
|
import asyncio
|
||||||
|
|
||||||
|
from autogen_agentchat.agents import AssistantAgent
|
||||||
|
from autogen_agentchat.messages import MultiModalMessage, TextMessage
|
||||||
|
from autogen_core import Image
|
||||||
|
from autogen_core.models import ModelFamily
|
||||||
|
from autogen_ext.models.ollama import OllamaChatCompletionClient
|
||||||
|
|
||||||
|
from . import config
|
||||||
|
from .mcp_tools import load_mcp_tools
|
||||||
|
|
||||||
|
|
||||||
|
class AvatarAgentService:
    """Owns the Ollama model client and one lazily created ``AssistantAgent``.

    The agent is built on first use (loading MCP tools is asynchronous) and
    then reused for every call to :meth:`reply`.

    NOTE(review): because a single agent instance is reused, every caller of
    this service shares whatever conversation state the agent keeps between
    runs. Confirm that this is intended when several clients connect.
    """

    def __init__(self) -> None:
        """Create the model client; the agent itself is created lazily."""
        self._model_client = OllamaChatCompletionClient(
            model=config.OLLAMA_MODEL,
            model_info={
                "vision": True,
                "function_calling": True,
                "json_output": True,
                "family": ModelFamily.UNKNOWN,
                "structured_output": True,
            },
        )
        self._agent: AssistantAgent | None = None
        # Guards the one-time creation of the agent.
        self._agent_lock = asyncio.Lock()
        # Serializes agent runs: the shared agent must not process two tasks
        # at the same time.
        self._reply_lock = asyncio.Lock()

    async def _create_agent(self) -> "AssistantAgent":
        """Build the agent, attaching MCP tools when any were loaded."""
        tools = await load_mcp_tools()
        return AssistantAgent(
            name="avatar",
            model_client=self._model_client,
            system_message=config.SYSTEM_MESSAGE,
            # An empty tool list is passed as None; reflection on tool output
            # is only enabled when tools exist.
            tools=tools or None,
            reflect_on_tool_use=bool(tools),
        )

    async def _get_agent(self) -> "AssistantAgent":
        """Return the cached agent, creating it on first use."""
        if self._agent is not None:
            return self._agent
        async with self._agent_lock:
            # Re-check inside the lock: another coroutine may have created
            # the agent while this one was waiting.
            if self._agent is None:
                self._agent = await self._create_agent()
        return self._agent

    async def reply(self, user_text: str, image_b64: str) -> str:
        """Answer one user turn consisting of text plus an image.

        Args:
            user_text: What the user said.
            image_b64: Base64-encoded image, without a data-URL prefix.

        Returns:
            The content of the last ``TextMessage`` emitted by the "avatar"
            agent, or an empty string if it produced none.
        """
        agent = await self._get_agent()
        user_image = Image.from_base64(image_b64)
        multimodal_task = MultiModalMessage(source="user", content=[user_text, user_image])

        ai_response = ""
        # Hold the lock for the whole stream so concurrent callers run one
        # after another instead of interleaving on the shared agent.
        async with self._reply_lock:
            async for message in agent.run_stream(task=multimodal_task):
                if isinstance(message, TextMessage) and message.source == "avatar":
                    ai_response = message.content

        return ai_response
|
||||||
6
server/app.py
Normal file
6
server/app.py
Normal file
@@ -0,0 +1,6 @@
|
|||||||
|
from fastapi import FastAPI
|
||||||
|
|
||||||
|
from .ws import router as ws_router
|
||||||
|
|
||||||
|
# The FastAPI application object exported by this module.
app = FastAPI()
|
||||||
|
# Register the routes of the router imported from .ws above.
app.include_router(ws_router)
|
||||||
44
server/config.py
Normal file
44
server/config.py
Normal file
@@ -0,0 +1,44 @@
|
|||||||
|
import os
|
||||||
|
import shlex
|
||||||
|
|
||||||
|
|
||||||
|
def _env_bool(name: str, default: bool) -> bool:
    """Read environment variable ``name`` as a boolean flag.

    Returns ``default`` when the variable is unset. Otherwise the value is
    stripped and lower-cased, and the result is True only for "1", "true",
    "yes" or "on"; every other value, including the empty string, is False.
    """
    raw = os.environ.get(name)
    if raw is not None:
        return raw.strip().lower() in ("1", "true", "yes", "on")
    return default
|
||||||
|
|
||||||
|
|
||||||
|
def _env_args(name: str, default: str = "") -> list[str]:
    """Read environment variable ``name`` as a shell-style argument list.

    Falls back to ``default`` when the variable is unset. A blank value gives
    an empty list; anything else is tokenized with ``shlex.split``, so quoted
    arguments stay together.
    """
    raw = os.getenv(name, default)
    return shlex.split(raw) if raw.strip() else []
|
||||||
|
|
||||||
|
|
||||||
|
# System prompt for the avatar agent: persona, answer style and when to use tools.
SYSTEM_MESSAGE = "".join(
    (
        "你是一个友好、幽默的AI虚拟主播。你可以看到用户摄像头传来的画面,也能听到他们的话。",
        "请用简短、自然、热情的中文口语回答,每次回答控制在两三句话以内,不要输出任何 Markdown 格式。",
        "当用户询问实时天气、最新新闻或网页信息时,优先使用可用工具先查询再回答。",
    )
)

# Whisper speech-recognition settings.
WHISPER_MODEL_NAME = "base"
WHISPER_DEVICE = "cpu"
WHISPER_COMPUTE_TYPE = "int8"
WHISPER_LANGUAGE = "zh"
WHISPER_BEAM_SIZE = 5

# Text-to-speech voice and the Ollama model tag.
TTS_VOICE = "zh-CN-XiaoxiaoNeural"
OLLAMA_MODEL = "qwen3-vl:latest"

# Host and port for the HTTP/WebSocket server.
SERVER_HOST = "0.0.0.0"
SERVER_PORT = 8000

# MCP tool integration; each value below can be overridden through the
# environment variable of the same name.
ENABLE_MCP_TOOLS = _env_bool("ENABLE_MCP_TOOLS", True)
MCP_SERVER_READ_TIMEOUT_SECONDS = float(os.environ.get("MCP_SERVER_READ_TIMEOUT_SECONDS", "30"))

# Command and arguments for the weather MCP server (empty command = not configured).
MCP_WEATHER_SERVER_COMMAND = os.environ.get("MCP_WEATHER_SERVER_COMMAND", "")
MCP_WEATHER_SERVER_ARGS = _env_args("MCP_WEATHER_SERVER_ARGS")

# Command and arguments for the web-search MCP server (empty command = not configured).
MCP_WEBSEARCH_SERVER_COMMAND = os.environ.get("MCP_WEBSEARCH_SERVER_COMMAND", "")
MCP_WEBSEARCH_SERVER_ARGS = _env_args("MCP_WEBSEARCH_SERVER_ARGS")
|
||||||
73
server/mcp_tools.py
Normal file
73
server/mcp_tools.py
Normal file
@@ -0,0 +1,73 @@
|
|||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from dataclasses import dataclass
|
||||||
|
from typing import Any
|
||||||
|
|
||||||
|
from autogen_ext.tools.mcp import StdioServerParams, mcp_server_tools
|
||||||
|
|
||||||
|
from . import config
|
||||||
|
|
||||||
|
|
||||||
|
# Settings for one stdio MCP server: a label plus the command and arguments
# that load_mcp_tools() passes to StdioServerParams. frozen=True prevents
# reassigning the fields after construction.
@dataclass(frozen=True)
|
||||||
|
class MCPServerConfig:
|
||||||
|
# Short label; used in the log lines printed while tools are loaded.
name: str
|
||||||
|
# Executable passed as StdioServerParams(command=...).
command: str
|
||||||
|
# Argument list passed as StdioServerParams(args=...).
args: list[str]
|
||||||
|
|
||||||
|
|
||||||
|
def _configured_servers() -> list[MCPServerConfig]:
    """List the MCP servers that are enabled by configuration.

    Returns an empty list when ``config.ENABLE_MCP_TOOLS`` is false. Otherwise
    a server is included only if its command setting is non-empty; the order
    is weather first, then websearch.
    """
    if not config.ENABLE_MCP_TOOLS:
        return []

    candidates = (
        ("weather", config.MCP_WEATHER_SERVER_COMMAND, config.MCP_WEATHER_SERVER_ARGS),
        ("websearch", config.MCP_WEBSEARCH_SERVER_COMMAND, config.MCP_WEBSEARCH_SERVER_ARGS),
    )
    return [
        MCPServerConfig(name=label, command=command, args=argv)
        for label, command, argv in candidates
        if command
    ]
|
||||||
|
|
||||||
|
|
||||||
|
async def load_mcp_tools() -> list[Any]:
    """Load the tools exposed by every configured MCP server.

    Each server is queried in turn. A server that fails to load is reported
    and skipped, so one broken server does not block the others. When two
    tools share a name, the first one loaded wins and the later one is
    reported and dropped.

    Returns:
        The loaded tools in load order; empty when nothing is configured or
        nothing could be loaded.
    """
    servers = _configured_servers()
    if not servers:
        print("ℹ️ MCP 工具未配置,跳过加载。")
        return []

    tools: list[Any] = []
    seen_names: set[str] = set()

    for server in servers:
        params = StdioServerParams(
            command=server.command,
            args=server.args,
            read_timeout_seconds=config.MCP_SERVER_READ_TIMEOUT_SECONDS,
        )
        try:
            discovered = await mcp_server_tools(params)
            for tool in discovered:
                if tool.name not in seen_names:
                    tools.append(tool)
                    seen_names.add(tool.name)
                else:
                    print(f"⚠️ MCP 工具重名,已跳过: {tool.name}")
            print(f"✅ MCP 服务已加载: {server.name} ({len(discovered)} tools)")
        except Exception as exc:
            # Best effort: report the failure and continue with the next server.
            print(f"⚠️ MCP 服务加载失败: {server.name}, error={exc}")

    print(f"✅ MCP 工具总数: {len(tools)}" if tools else "ℹ️ 未加载到任何 MCP 工具。")
    return tools
|
||||||
35
server/speech.py
Normal file
35
server/speech.py
Normal file
@@ -0,0 +1,35 @@
|
|||||||
|
import base64
|
||||||
|
|
||||||
|
import edge_tts
|
||||||
|
from faster_whisper import WhisperModel
|
||||||
|
|
||||||
|
from . import config
|
||||||
|
|
||||||
|
|
||||||
|
class SpeechService:
    """Speech helpers: Whisper transcription and edge-tts synthesis."""

    def __init__(self) -> None:
        """Load the Whisper model described by the ``config`` settings."""
        print("⏳ 正在加载本地语音识别模型 (首次启动可能需要下载)...")
        self._whisper_model = WhisperModel(
            config.WHISPER_MODEL_NAME,
            device=config.WHISPER_DEVICE,
            compute_type=config.WHISPER_COMPUTE_TYPE,
        )
        print("✅ 本地语音模型加载完毕!")

    def transcribe(self, audio_path: str) -> str:
        """Transcribe the audio file at ``audio_path``.

        Returns:
            The text of all recognized segments concatenated without a
            separator; an empty string when there are no segments.
        """
        segments, _ = self._whisper_model.transcribe(
            audio_path,
            beam_size=config.WHISPER_BEAM_SIZE,
            language=config.WHISPER_LANGUAGE,
        )
        return "".join(segment.text for segment in segments)

    async def synthesize_audio_data_url(self, text: str) -> str:
        """Synthesize ``text`` with the configured voice.

        Returns:
            The audio as a ``data:audio/mp3;base64,...`` URL.
        """
        communicate = edge_tts.Communicate(text, config.TTS_VOICE)
        # Collect the audio chunks and join once; the stream also yields
        # non-audio chunks, which are skipped. Joining avoids re-copying the
        # growing buffer on every chunk.
        audio_chunks = [
            chunk["data"]
            async for chunk in communicate.stream()
            if chunk["type"] == "audio"
        ]
        audio_b64 = base64.b64encode(b"".join(audio_chunks)).decode("utf-8")
        return f"data:audio/mp3;base64,{audio_b64}"
|
||||||
67
server/ws.py
Normal file
67
server/ws.py
Normal file
@@ -0,0 +1,67 @@
|
|||||||
|
import base64
|
||||||
|
import json
|
||||||
|
import os
|
||||||
|
import tempfile
|
||||||
|
|
||||||
|
from fastapi import APIRouter, WebSocket, WebSocketDisconnect
|
||||||
|
|
||||||
|
from .agent_service import AvatarAgentService
|
||||||
|
from .speech import SpeechService
|
||||||
|
from .ws_messages import send_audio_message, send_text_message
|
||||||
|
|
||||||
|
# Router that carries the /ws WebSocket endpoint defined in this module.
router = APIRouter()
|
||||||
|
# Created once when this module is imported and shared by every connection.
# SpeechService.__init__ loads the Whisper model, so the import itself does that work.
speech_service = SpeechService()
|
||||||
|
# Also a module-level singleton: all connections use the same agent service.
agent_service = AvatarAgentService()
|
||||||
|
|
||||||
|
|
||||||
|
def _save_audio_to_temp_file(audio_b64: str) -> str:
    """Decode base64 audio and write it to a new ``.webm`` temporary file.

    Args:
        audio_b64: Base64 payload, without a data-URL prefix.

    Returns:
        Path of the written file. The file is created with ``delete=False``,
        so the caller is responsible for removing it.

    Raises:
        binascii.Error: If ``audio_b64`` is not valid base64. No file is
            left behind in that case.
    """
    # Decode first: creating the file before a failing decode would leave an
    # orphaned temp file that nobody removes.
    audio_bytes = base64.b64decode(audio_b64)

    temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".webm")
    try:
        with temp_file:
            temp_file.write(audio_bytes)
    except BaseException:
        # The caller never learns the path if writing fails, so clean up here.
        if os.path.exists(temp_file.name):
            os.remove(temp_file.name)
        raise
    return temp_file.name
|
||||||
|
|
||||||
|
|
||||||
|
async def _process_user_turn(websocket: WebSocket, audio_b64: str, image_b64: str) -> None:
    """Handle one user turn: transcribe, ask the agent, send text, send speech."""
    # Function-scope imports: the module's import block does not include them.
    import asyncio
    import html

    audio_path = _save_audio_to_temp_file(audio_b64)
    try:
        await send_text_message(websocket, "<i>[👂 正在辨识语音...]</i><br>")
        # transcribe() is synchronous; run it in a worker thread so the event
        # loop keeps serving other connections meanwhile.
        user_text = await asyncio.to_thread(speech_service.transcribe, audio_path)
    finally:
        if os.path.exists(audio_path):
            os.remove(audio_path)

    if not user_text.strip():
        await send_text_message(websocket, "<i>[没听清你说什么...]</i><br>")
        return

    # These messages are appended to the page as HTML by the client, so text
    # from the transcript or the model is escaped before being embedded.
    await send_text_message(websocket, f"<b>你说:</b>{html.escape(user_text)}<br>")
    await send_text_message(websocket, "<i>[🧠 正在看图思考...]</i><br>")

    ai_response = await agent_service.reply(user_text, image_b64)
    await send_text_message(websocket, f"<b>AI主播:</b>{html.escape(ai_response)}<br><br>")

    if not ai_response.strip():
        # Nothing to speak; skip synthesis instead of sending empty text to TTS.
        return

    await send_text_message(websocket, "<i>[🗣️ 正在生成语音...]</i><br>")
    audio_data_url = await speech_service.synthesize_audio_data_url(ai_response)
    await send_audio_message(websocket, audio_data_url)


@router.websocket("/ws")
async def websocket_endpoint(websocket: WebSocket) -> None:
    """Serve one client connection on ``/ws``.

    Expects JSON text frames shaped like
    ``{"type": "user_input", "audio": <data URL>, "image": <data URL>}``.
    Frames of any other type are ignored. For each valid frame the audio is
    transcribed, the agent is asked for a reply, and the reply is sent back
    as text and as synthesized speech.

    A malformed frame or a failure while processing one turn is reported and
    the loop continues with the next frame; only a disconnect (or a failure
    to report the error to the client) ends the handler.
    """
    await websocket.accept()
    print("✅ WebSocket 连接成功!准备就绪。")

    try:
        while True:
            message_text = await websocket.receive_text()
            try:
                data = json.loads(message_text)
            except json.JSONDecodeError as exc:
                print(f"⚠️ 发生错误: {exc}")
                continue

            if not isinstance(data, dict) or data.get("type") != "user_input":
                continue

            audio_field = data.get("audio")
            image_field = data.get("image")
            if not isinstance(audio_field, str) or not isinstance(image_field, str):
                await send_text_message(websocket, "<i>[没听清你说什么...]</i><br>")
                continue

            # Drop the "data:<mime>;base64," prefix of the data URLs.
            audio_b64 = audio_field.split(",")[-1]
            image_b64 = image_field.split(",")[-1]

            try:
                await _process_user_turn(websocket, audio_b64, image_b64)
            except WebSocketDisconnect:
                raise
            except Exception as exc:
                # One failed turn should not end the whole conversation.
                print(f"⚠️ 发生错误: {exc}")
                await send_text_message(websocket, "<i>[⚠️ 处理出错,请重试]</i><br>")

    except WebSocketDisconnect:
        print("❌ 前端页面已关闭或断开连接")
    except Exception as exc:
        print(f"⚠️ 发生错误: {exc}")
|
||||||
18
server/ws_messages.py
Normal file
18
server/ws_messages.py
Normal file
@@ -0,0 +1,18 @@
|
|||||||
|
import json
|
||||||
|
|
||||||
|
from fastapi import WebSocket
|
||||||
|
|
||||||
|
|
||||||
|
async def send_text_message(websocket: WebSocket, content: str) -> None:
    """Send ``content`` to the client as a ``{"type": "text", "content": ...}`` JSON frame."""
    payload = {"type": "text", "content": content}
    await websocket.send_text(json.dumps(payload))
|
||||||
|
|
||||||
|
|
||||||
|
async def send_audio_message(websocket: WebSocket, audio_data_url: str) -> None:
    """Send ``audio_data_url`` to the client as a ``{"type": "audio", "content": ...}`` JSON frame."""
    payload = {"type": "audio", "content": audio_data_url}
    frame = json.dumps(payload)
    await websocket.send_text(frame)
|
||||||
BIN
temp_audio.webm
Normal file
BIN
temp_audio.webm
Normal file
Binary file not shown.
BIN
test_audio.webm
Normal file
BIN
test_audio.webm
Normal file
Binary file not shown.
Reference in New Issue
Block a user