diff --git a/main.py b/main.py
index e227053..654eee7 100644
--- a/main.py
+++ b/main.py
@@ -1,5 +1,6 @@
 import asyncio
 import json
+import os
 import sqlite3
 import sys
 from pathlib import Path
@@ -12,9 +13,28 @@ from autogen_ext.models.openai import OpenAIChatCompletionClient
 from autogen_ext.models.openai import _openai_client as openai_client_module
 from autogen_ext.tools.mcp import StdioServerParams, mcp_server_tools
 
+try:
+    import speech_recognition as sr
+except ImportError:
+    sr = None
+
+try:
+    import pyttsx3
+except ImportError:
+    pyttsx3 = None
+
 BASE_DIR = Path(__file__).resolve().parent
 USER_DB_PATH = BASE_DIR / "users.db"
 MODEL_CALL_TIMEOUT_SECONDS = 45
+ASR_LANGUAGE = "zh-CN"
+MODEL_NAME = os.getenv("VLM_MODEL", "Qwen/Qwen3-VL-8B-Instruct")
+# NOTE(review): the fallback below is a fixed remote address over plain HTTP;
+# set VLM_BASE_URL explicitly, or consider defaulting to localhost.
+MODEL_BASE_URL = os.getenv("VLM_BASE_URL", "http://220.248.114.28:8000/v1")
+MODEL_API_KEY = os.getenv("VLM_API_KEY", "EMPTY")
+
+# Lazily created pyttsx3 engine shared by all TTS calls (see _init_tts_engine).
+_TTS_ENGINE = None
 
 
 # --- 第一部分:本地工具(面部 + 语音,以后接硬件)---
@@ -46,6 +66,115 @@ async def _async_console_input(prompt: str) -> str:
     return await asyncio.to_thread(input, prompt)
 
 
+def _init_tts_engine():
+    """Return the shared offline pyttsx3 engine, creating it on first use.
+
+    Returns None when pyttsx3 is not installed or its speech driver cannot
+    be initialised, so callers can fall back to text-only output.
+    """
+    global _TTS_ENGINE
+    if _TTS_ENGINE is not None:
+        return _TTS_ENGINE
+    if pyttsx3 is None:
+        return None
+
+    try:
+        engine = pyttsx3.init()
+        # Prefer a Chinese voice; voice ids differ per OS, so match loosely.
+        for voice in engine.getProperty("voices"):
+            voice_blob = f"{voice.id} {voice.name}".lower()
+            if "zh" in voice_blob or "chinese" in voice_blob or "mandarin" in voice_blob:
+                engine.setProperty("voice", voice.id)
+                break
+        engine.setProperty("rate", 190)
+    except Exception as e:
+        # A missing or broken speech driver must not crash the chat loop.
+        print(f">>>>>> ⚠️ TTS 初始化失败:{e} <<<<<<\n")
+        return None
+    _TTS_ENGINE = engine
+    return _TTS_ENGINE
+
+
+def _speak_blocking(text: str) -> bool:
+    """Speak *text* synchronously; return True only if playback completed."""
+    if not text:
+        return False
+    engine = _init_tts_engine()
+    if engine is None:
+        return False
+    try:
+        engine.say(text)
+        engine.runAndWait()
+    except Exception as e:
+        # Driver errors degrade to text-only output instead of propagating.
+        print(f">>>>>> ⚠️ 语音播报失败:{e} <<<<<<\n")
+        return False
+    return True
+
+
+async def _async_speak(text: str) -> bool:
+    """Run _speak_blocking in a worker thread so the event loop stays free."""
+    return await asyncio.to_thread(_speak_blocking, text)
+
+
+def _listen_once_blocking(
+    language: str = ASR_LANGUAGE,
+    timeout: int = 8,
+    phrase_time_limit: int = 20,
+) -> str:
+    """Record one utterance from the microphone and return the recognised text.
+
+    Raises RuntimeError when speech_recognition is not installed; errors
+    raised by the recogniser itself are left for the caller to handle.
+    """
+    if sr is None:
+        raise RuntimeError("缺少 speech_recognition 依赖")
+
+    recognizer = sr.Recognizer()
+    with sr.Microphone(sample_rate=16000) as source:
+        print(">>>>>> 🎤 请说话... <<<<<<")
+        recognizer.adjust_for_ambient_noise(source, duration=0.4)
+        audio = recognizer.listen(
+            source,
+            timeout=timeout,
+            phrase_time_limit=phrase_time_limit,
+        )
+    return recognizer.recognize_google(audio, language=language).strip()
+
+
+async def _async_listen_once() -> str:
+    """Run speech recognition in a worker thread to keep the event loop free."""
+    return await asyncio.to_thread(_listen_once_blocking)
+
+
+async def _get_user_input(io_mode: str) -> str:
+    """Read one user turn.
+
+    - text: plain console input.
+    - voice: an empty line (Enter) starts microphone capture; typed text is
+      used as-is.
+
+    Returns "" when speech recognition fails.
+    """
+    if io_mode == "text":
+        return (await _async_console_input("你说: ")).strip()
+
+    typed = (await _async_console_input("你说(回车=语音, 直接输入=文本): ")).strip()
+    if typed:
+        return typed
+
+    try:
+        spoken = await _async_listen_once()
+    except Exception as e:
+        # Some recogniser errors carry no message; show the type instead.
+        print(f">>>>>> ⚠️ 语音识别失败:{str(e) or type(e).__name__} <<<<<<\n")
+        return ""
+
+    if spoken:
+        print(f"[语音识别]: {spoken}")
+    return spoken
+
+
 async def set_expression(
     expression: Annotated[str, "机器人要展示的表情,如:开心、疑惑、难过、待机"],
     intensity: Annotated[int, "表情强度 1-10"] = 5
@@ -96,9 +225,9 @@ async def start_simulated_head():
     mcp_tools = [t for t in all_mcp_tools if getattr(t, "name", "") != "get_user_profile"]
 
     model_client = OpenAIChatCompletionClient(
-        model="Qwen/Qwen3-VL-8B-Instruct",
-        base_url="http://localhost:8000/v1",
-        api_key="EMPTY",
+        model=MODEL_NAME,
+        base_url=MODEL_BASE_URL,
+        api_key=MODEL_API_KEY,
         model_info={
             "vision": True,
             "function_calling": True,
@@ -129,6 +258,8 @@ async def start_simulated_head():
     # --- 第四部分:交互循环 ---
     print("=" * 50)
     print(" 机器人已上线!输入 'quit' 退出")
+    print(f" 模型: {MODEL_NAME}")
+    print(f" 服务: {MODEL_BASE_URL}")
     print("=" * 50)
 
     try:
@@ -136,6 +267,39 @@ async def start_simulated_head():
     except (EOFError, KeyboardInterrupt):
         print("\n机器人下线,再见!")
         return
+
+    has_asr = sr is not None
+    has_tts = pyttsx3 is not None
+    if has_asr and has_tts:
+        mode_tip = "voice"
+    else:
+        mode_tip = "text"
+    try:
+        io_mode = (
+            await _async_console_input(
+                f"输入模式 voice/text(默认 {mode_tip}): "
+            )
+        ).strip().lower() or mode_tip
+    except (EOFError, KeyboardInterrupt):
+        print("\n机器人下线,再见!")
+        return
+    if io_mode not in ("voice", "text"):
+        io_mode = mode_tip
+
+    if io_mode == "voice" and not has_asr:
+        print(">>>>>> ⚠️ 未安装 speech_recognition,已降级为文本输入。 <<<<<<")
+        io_mode = "text"
+    if io_mode == "voice" and not has_tts:
+        print(">>>>>> ⚠️ 未安装 pyttsx3,将仅文本输出,不播报语音。 <<<<<<")
+
+    print(
+        "\n[语音依赖状态] "
+        f"ASR={'ok' if has_asr else 'missing'}, "
+        f"TTS={'ok' if has_tts else 'missing'}"
+    )
+    if not has_asr or not has_tts:
+        print("可安装: pip install SpeechRecognition pyaudio pyttsx3")
+
     visual_context = "视觉输入:用户坐在电脑前,表情平静,看着屏幕。"
     print(f"\n[当前视觉状态]: {visual_context}")
 
@@ -146,7 +310,7 @@ async def start_simulated_head():
     try:
         while True:
             try:
-                user_input = (await _async_console_input("你说: ")).strip()
+                user_input = await _get_user_input(io_mode)
             except (EOFError, KeyboardInterrupt):
                 print("\n机器人下线,再见!")
                 break
@@ -193,6 +357,10 @@ async def start_simulated_head():
             speech = response.chat_message.content
             if speech and isinstance(speech, str):
                 print(f">>>>>> 🔊 机器人说: {speech} <<<<<<\n")
+                if io_mode == "voice":
+                    spoken_ok = await _async_speak(speech)
+                    if not spoken_ok:
+                        print(">>>>>> ⚠️ TTS 不可用,当前仅文本输出。 <<<<<<\n")
 
             # 只把最终回复加入历史,inner_messages 是事件对象不能序列化回模型
             history.append(response.chat_message)
diff --git a/start_vllm.sh b/start_vllm.sh
index d400932..7b4154b 100644
--- a/start_vllm.sh
+++ b/start_vllm.sh
@@ -5,6 +5,7 @@
 python -m vllm.entrypoints.openai.api_server \
     --model Qwen/Qwen3-VL-8B-Instruct \
     --trust-remote-code \
+    --host 0.0.0.0 \
     --port 8000 \
     --gpu-memory-utilization 0.85 \
     --max-model-len 32000 \