refactor: 转移vlm到远程服务器上

2026-03-04 14:41:56 +08:00
parent cb94029ec5
commit a78e984695
2 changed files with 147 additions and 4 deletions
--- a/main.py
+++ b/main.py
@@ -1,5 +1,6 @@
 import asyncio
 import json
+import os
 import sqlite3
 import sys
 from pathlib import Path
@@ -12,9 +13,25 @@ from autogen_ext.models.openai import OpenAIChatCompletionClient
 from autogen_ext.models.openai import _openai_client as openai_client_module
 from autogen_ext.tools.mcp import StdioServerParams, mcp_server_tools

+try:
+    import speech_recognition as sr
+except ImportError:
+    sr = None
+
+try:
+    import pyttsx3
+except ImportError:
+    pyttsx3 = None
+
 BASE_DIR = Path(__file__).resolve().parent
 USER_DB_PATH = BASE_DIR / "users.db"
 MODEL_CALL_TIMEOUT_SECONDS = 45
+# Default language tag used by _listen_once_blocking() for speech recognition.
+ASR_LANGUAGE = "zh-CN"
+# Model endpoint settings. Each value is read from an environment variable and
+# falls back to the literal shown here when that variable is unset.
+MODEL_NAME = os.getenv("VLM_MODEL", "Qwen/Qwen3-VL-8B-Instruct")
+# NOTE(review): the fallback is a fixed IP address reached over plain HTTP with
+# the placeholder API key "EMPTY" below — confirm this is acceptable outside a
+# trusted network, or require VLM_BASE_URL / VLM_API_KEY to be set explicitly.
+MODEL_BASE_URL = os.getenv("VLM_BASE_URL", "http://220.248.114.28:8000/v1")
+MODEL_API_KEY = os.getenv("VLM_API_KEY", "EMPTY")
+
+# Lazily created pyttsx3 engine; populated and reused by _init_tts_engine().
+_TTS_ENGINE = None

 # --- 第一部分：本地工具（面部 + 语音，以后接硬件）---

@@ -46,6 +63,92 @@ async def _async_console_input(prompt: str) -> str:
    return await asyncio.to_thread(input, prompt)


+def _init_tts_engine():
+    """Return the shared offline TTS engine, creating it on first use.
+
+    The engine is cached in the module-level ``_TTS_ENGINE``. When the
+    optional ``pyttsx3`` module is not importable, nothing is created and
+    ``None`` is returned.
+    """
+    global _TTS_ENGINE
+    if _TTS_ENGINE is None and pyttsx3 is not None:
+        tts = pyttsx3.init()
+        # Voice ids differ between platforms, so look for a Chinese voice by
+        # fuzzy-matching a few keywords against "<id> <name>".
+        keywords = ("zh", "chinese", "mandarin")
+        chinese_voice = next(
+            (
+                candidate
+                for candidate in tts.getProperty("voices")
+                if any(
+                    word in f"{candidate.id} {candidate.name}".lower()
+                    for word in keywords
+                )
+            ),
+            None,
+        )
+        if chinese_voice is not None:
+            tts.setProperty("voice", chinese_voice.id)
+        tts.setProperty("rate", 190)
+        _TTS_ENGINE = tts
+    return _TTS_ENGINE
+
+
+def _speak_blocking(text: str) -> bool:
+    """Speak ``text`` synchronously through the shared TTS engine.
+
+    Returns:
+        True when the text was queued and ``runAndWait`` finished.
+        False when ``text`` is empty, when no engine is available, or when
+        engine creation or playback raises. Speech is an optional output
+        channel, so a driver failure is reported and turned into False
+        instead of propagating into the caller.
+    """
+    if not text:
+        return False
+    try:
+        engine = _init_tts_engine()
+        if engine is None:
+            return False
+        engine.say(text)
+        engine.runAndWait()
+    except Exception as e:
+        # NOTE(review): _async_speak runs this function via asyncio.to_thread,
+        # so the cached engine may be used from different worker threads —
+        # confirm the platform's pyttsx3 driver tolerates that.
+        print(f">>>>>> ⚠️ 语音播报失败：{e} <<<<<<\n")
+        return False
+    return True
+
+
+async def _async_speak(text: str) -> bool:
+    """Speak ``text`` on a worker thread and return ``_speak_blocking``'s result."""
+    spoken_ok = await asyncio.to_thread(_speak_blocking, text)
+    return spoken_ok
+
+
+def _listen_once_blocking(
+    language: str = ASR_LANGUAGE,
+    timeout: int = 8,
+    phrase_time_limit: int = 20,
+) -> str:
+    """Record one phrase from the microphone and return the recognized text.
+
+    Args:
+        language: Language tag forwarded to ``recognize_google``.
+        timeout: Forwarded to ``Recognizer.listen`` as ``timeout``.
+        phrase_time_limit: Forwarded to ``Recognizer.listen`` as
+            ``phrase_time_limit``.
+
+    Returns:
+        The recognized text with surrounding whitespace stripped.
+
+    Raises:
+        RuntimeError: If the optional ``speech_recognition`` module is missing.
+        Exception: Any error raised by the microphone or the recognizer is
+            propagated unchanged.
+    """
+    if sr is None:
+        raise RuntimeError("缺少 speech_recognition 依赖")
+
+    recognizer = sr.Recognizer()
+    with sr.Microphone(sample_rate=16000) as source:
+        # Calibrate against background noise *before* prompting, so the user
+        # is not already talking while the energy threshold is measured.
+        recognizer.adjust_for_ambient_noise(source, duration=0.4)
+        print(">>>>>> 🎤 请说话... <<<<<<")
+        audio = recognizer.listen(
+            source,
+            timeout=timeout,
+            phrase_time_limit=phrase_time_limit,
+        )
+    # NOTE(review): recognize_google is presumably an online recognizer and so
+    # needs network access — confirm against the SpeechRecognition docs.
+    return recognizer.recognize_google(audio, language=language).strip()
+
+
+async def _async_listen_once() -> str:
+    """Run one blocking microphone recognition on a worker thread.
+
+    Keeps the event loop responsive while ``_listen_once_blocking`` records
+    and recognizes; its default arguments are used.
+    """
+    recognized = await asyncio.to_thread(_listen_once_blocking)
+    return recognized
+
+
+async def _get_user_input(io_mode: str) -> str:
+    """Read one user utterance from the console or the microphone.
+
+    Args:
+        io_mode: ``"text"`` reads a typed line only. Any other value is
+            treated as voice mode: a typed line is used as-is, while an
+            empty line (just Enter) triggers one microphone recognition.
+
+    Returns:
+        The stripped input text. An empty string is returned when speech
+        recognition fails or yields nothing.
+    """
+    if io_mode == "text":
+        return (await _async_console_input("你说: ")).strip()
+
+    typed = (await _async_console_input("你说(回车=语音, 直接输入=文本): ")).strip()
+    if typed:
+        return typed
+
+    try:
+        spoken = await _async_listen_once()
+    except Exception as e:
+        # Some exceptions are raised without a message; fall back to the class
+        # name so the printed reason is never blank.
+        detail = str(e) or type(e).__name__
+        print(f">>>>>> ⚠️ 语音识别失败：{detail} <<<<<<\n")
+        return ""
+
+    if spoken:
+        print(f"[语音识别]: {spoken}")
+    return spoken
+
+
 async def set_expression(
    expression: Annotated[str, "机器人要展示的表情，如：开心、疑惑、难过、待机"],
    intensity: Annotated[int, "表情强度 1-10"] = 5
@@ -96,9 +199,9 @@ async def start_simulated_head():
    mcp_tools = [t for t in all_mcp_tools if getattr(t, "name", "") != "get_user_profile"]

    model_client = OpenAIChatCompletionClient(
-        model="Qwen/Qwen3-VL-8B-Instruct",
-        base_url="http://localhost:8000/v1",
-        api_key="EMPTY",
+        model=MODEL_NAME,
+        base_url=MODEL_BASE_URL,
+        api_key=MODEL_API_KEY,
        model_info={
            "vision": True,
            "function_calling": True,
@@ -129,6 +232,8 @@ async def start_simulated_head():
    # --- 第四部分：交互循环 ---
    print("=" * 50)
    print("  机器人已上线！输入 'quit' 退出")
+    print(f"  模型: {MODEL_NAME}")
+    print(f"  服务: {MODEL_BASE_URL}")
    print("=" * 50)

    try:
@@ -136,6 +241,39 @@ async def start_simulated_head():
    except (EOFError, KeyboardInterrupt):
        print("\n机器人下线，再见！")
        return
+
+    has_asr = sr is not None
+    has_tts = pyttsx3 is not None
+    if has_asr and has_tts:
+        mode_tip = "voice"
+    else:
+        mode_tip = "text"
+    try:
+        io_mode = (
+            await _async_console_input(
+                f"输入模式 voice/text（默认 {mode_tip}）: "
+            )
+        ).strip().lower() or mode_tip
+    except (EOFError, KeyboardInterrupt):
+        print("\n机器人下线，再见！")
+        return
+    if io_mode not in ("voice", "text"):
+        io_mode = mode_tip
+
+    if io_mode == "voice" and not has_asr:
+        print(">>>>>> ⚠️ 未安装 speech_recognition，已降级为文本输入。 <<<<<<")
+        io_mode = "text"
+    if io_mode == "voice" and not has_tts:
+        print(">>>>>> ⚠️ 未安装 pyttsx3，将仅文本输出，不播报语音。 <<<<<<")
+
+    print(
+        "\n[语音依赖状态] "
+        f"ASR={'ok' if has_asr else 'missing'}, "
+        f"TTS={'ok' if has_tts else 'missing'}"
+    )
+    if not has_asr or not has_tts:
+        print("可安装: pip install SpeechRecognition pyaudio pyttsx3")
+
    visual_context = "视觉输入：用户坐在电脑前，表情平静，看着屏幕。"

    print(f"\n[当前视觉状态]: {visual_context}")
@@ -146,7 +284,7 @@ async def start_simulated_head():
    try:
        while True:
            try:
-                user_input = (await _async_console_input("你说: ")).strip()
+                user_input = await _get_user_input(io_mode)
            except (EOFError, KeyboardInterrupt):
                print("\n机器人下线，再见！")
                break
@@ -193,6 +331,10 @@ async def start_simulated_head():
            speech = response.chat_message.content
            if speech and isinstance(speech, str):
                print(f">>>>>> 🔊 机器人说: {speech} <<<<<<\n")
+                if io_mode == "voice":
+                    spoken_ok = await _async_speak(speech)
+                    if not spoken_ok:
+                        print(">>>>>> ⚠️ TTS 不可用，当前仅文本输出。 <<<<<<\n")

            # 只把最终回复加入历史，inner_messages 是事件对象不能序列化回模型
            history.append(response.chat_message)
--- a/start_vllm.sh
+++ b/start_vllm.sh
@@ -5,6 +5,7 @@
 python -m vllm.entrypoints.openai.api_server \
    --model Qwen/Qwen3-VL-8B-Instruct \
    --trust-remote-code \
+    --host 0.0.0.0 \
    --port 8000 \
    --gpu-memory-utilization 0.85 \
    --max-model-len 32000 \