import base64
import json
import os
import tempfile
from fastapi import APIRouter, WebSocket, WebSocketDisconnect

from .agent_service import AvatarAgentService
from .speech import SpeechService
from .ws_messages import send_audio_message, send_text_message

router = APIRouter()
speech_service = SpeechService()
agent_service = AvatarAgentService()


def _save_audio_to_temp_file(audio_b64: str) -> str:
    """Decode a base64 audio payload into a temporary .webm file and return its path."""
    temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".webm")
    try:
        temp_file.write(base64.b64decode(audio_b64))
        return temp_file.name
    finally:
        temp_file.close()


@router.websocket("/ws")
async def websocket_endpoint(websocket: WebSocket) -> None:
    await websocket.accept()
    print("✅ WebSocket connected and ready.")
    try:
        while True:
            message_text = await websocket.receive_text()
            data = json.loads(message_text)
            if data.get("type") != "user_input":
                continue

            # Strip any "data:...;base64," prefix so only the raw base64 payload remains.
            audio_b64 = data["audio"].split(",")[-1]
            image_b64 = data["image"].split(",")[-1]

            audio_path = _save_audio_to_temp_file(audio_b64)
            try:
                await send_text_message(websocket, "[👂 Transcribing speech...]\n")
                user_text = speech_service.transcribe(audio_path)
            finally:
                # Remove the temporary audio file even if transcription fails.
                if os.path.exists(audio_path):
                    os.remove(audio_path)

            if not user_text.strip():
                await send_text_message(websocket, "[Didn't catch what you said...]\n")
                continue

            await send_text_message(websocket, f"You said: {user_text}\n")

            await send_text_message(websocket, "[🧠 Looking at the image and thinking...]\n")
            ai_response = await agent_service.reply(user_text, image_b64)
            await send_text_message(websocket, f"AI streamer: {ai_response}\n")

            await send_text_message(websocket, "[🗣️ Generating speech...]\n")
            audio_data_url = await speech_service.synthesize_audio_data_url(ai_response)
            await send_audio_message(websocket, audio_data_url)
    except WebSocketDisconnect:
        print("❌ Frontend page closed or connection dropped")
    except Exception as exc:
        print(f"⚠️ Error: {exc}")