import base64
import json
import os
import tempfile

from fastapi import APIRouter, WebSocket, WebSocketDisconnect

from .agent_service import AvatarAgentService
from .speech import SpeechService
from .ws_messages import send_audio_message, send_text_message

router = APIRouter()
speech_service = SpeechService()
agent_service = AvatarAgentService()


def _save_audio_to_temp_file(audio_b64: str) -> str:
    """Decode base64-encoded audio and write it to a temporary .webm file.

    Returns the path of the temp file; the caller is responsible for
    deleting it once transcription is done.
    """
    temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".webm")
    try:
        temp_file.write(base64.b64decode(audio_b64))
        return temp_file.name
    finally:
        temp_file.close()


@router.websocket("/ws")
async def websocket_endpoint(websocket: WebSocket) -> None:
    """Handle one client connection.

    Expects JSON messages of the form
    {"type": "user_input", "audio": <data URL or base64>, "image": <data URL or base64>}
    and replies with interleaved text and audio messages.
    """
    await websocket.accept()
    print("✅ WebSocket connected. Ready to go.")
    try:
        while True:
            message_text = await websocket.receive_text()
            data = json.loads(message_text)
            if data.get("type") != "user_input":
                continue

            # Strip any data-URL prefix (e.g. "data:audio/webm;base64,")
            # so only the raw base64 payload remains.
            audio_b64 = data["audio"].split(",")[-1]
            image_b64 = data["image"].split(",")[-1]

            audio_path = _save_audio_to_temp_file(audio_b64)
            try:
                await send_text_message(websocket, "[👂 Transcribing speech...]\n")
                user_text = speech_service.transcribe(audio_path)
            finally:
                # Remove the temp file whether or not transcription succeeded.
                if os.path.exists(audio_path):
                    os.remove(audio_path)

            if not user_text.strip():
                await send_text_message(websocket, "[Didn't catch what you said...]\n")
                continue

            await send_text_message(websocket, f"You said: {user_text}\n")

            await send_text_message(websocket, "[🧠 Looking at the image and thinking...]\n")
            ai_response = await agent_service.reply(user_text, image_b64)
            await send_text_message(websocket, f"AI streamer: {ai_response}\n\n")

            await send_text_message(websocket, "[🗣️ Generating speech...]\n")
            audio_data_url = await speech_service.synthesize_audio_data_url(ai_response)
            await send_audio_message(websocket, audio_data_url)
    except WebSocketDisconnect:
        print("❌ Client page closed or connection dropped")
    except Exception as exc:
        print(f"⚠️ Error: {exc}")