face_agent/main.py

import asyncio
import json
import os
import sqlite3
import sys
from pathlib import Path
from typing import Annotated

from autogen_agentchat.agents import AssistantAgent
from autogen_agentchat.messages import TextMessage
from autogen_core import CancellationToken
from autogen_ext.models.openai import OpenAIChatCompletionClient
from autogen_ext.models.openai import _openai_client as openai_client_module
from autogen_ext.tools.mcp import StdioServerParams, mcp_server_tools

try:
    import speech_recognition as sr
except ImportError:
    sr = None

try:
    import pyttsx3
except ImportError:
    pyttsx3 = None

BASE_DIR = Path(__file__).resolve().parent
USER_DB_PATH = BASE_DIR / "users.db"
MODEL_CALL_TIMEOUT_SECONDS = 45
ASR_LANGUAGE = "zh-CN"
MODEL_NAME = os.getenv("VLM_MODEL", "Qwen/Qwen3-VL-8B-Instruct")
MODEL_BASE_URL = os.getenv("VLM_BASE_URL", "http://220.248.114.28:8000/v1")
MODEL_API_KEY = os.getenv("VLM_API_KEY", "EMPTY")

_TTS_ENGINE = None

# --- 第一部分：本地工具（面部 + 语音，以后接硬件）---


def _patch_autogen_tool_schema_for_vllm() -> None:
    """
    vLLM 目前会对 OpenAI 工具定义中的 `strict` 字段告警（即便 strict=False）。
    这里做最小补丁：保留工具定义，移除该字段，避免无意义警告。
    """
    if getattr(openai_client_module.convert_tools, "_strict_removed_patch", False):
        return

    original_convert_tools = openai_client_module.convert_tools

    def convert_tools_without_strict(tools):
        converted = original_convert_tools(tools)
        for tool in converted:
            fn = tool.get("function")
            if isinstance(fn, dict):
                fn.pop("strict", None)
        return converted

    convert_tools_without_strict._strict_removed_patch = True
    openai_client_module.convert_tools = convert_tools_without_strict


async def _async_console_input(prompt: str) -> str:
    """在线程中执行阻塞 input，避免阻塞事件循环。"""
    return await asyncio.to_thread(input, prompt)


def _init_tts_engine():
    """初始化离线 TTS（pyttsx3）。"""
    global _TTS_ENGINE
    if _TTS_ENGINE is not None:
        return _TTS_ENGINE
    if pyttsx3 is None:
        return None

    engine = pyttsx3.init()
    # 优先选择中文语音（不同系统 voice id 不同，这里做模糊匹配）
    for voice in engine.getProperty("voices"):
        voice_blob = f"{voice.id} {voice.name}".lower()
        if "zh" in voice_blob or "chinese" in voice_blob or "mandarin" in voice_blob:
            engine.setProperty("voice", voice.id)
            break
    engine.setProperty("rate", 190)
    _TTS_ENGINE = engine
    return _TTS_ENGINE


def _speak_blocking(text: str) -> bool:
    """阻塞式语音播报。成功返回 True。"""
    if not text:
        return False
    engine = _init_tts_engine()
    if engine is None:
        return False
    engine.say(text)
    engine.runAndWait()
    return True


async def _async_speak(text: str) -> bool:
    return await asyncio.to_thread(_speak_blocking, text)


def _listen_once_blocking(
    language: str = ASR_LANGUAGE,
    timeout: int = 8,
    phrase_time_limit: int = 20,
) -> str:
    """阻塞式麦克风识别，返回识别文本。"""
    if sr is None:
        raise RuntimeError("缺少 speech_recognition 依赖")

    recognizer = sr.Recognizer()
    with sr.Microphone(sample_rate=16000) as source:
        print(">>>>>> 🎤 请说话... <<<<<<")
        recognizer.adjust_for_ambient_noise(source, duration=0.4)
        audio = recognizer.listen(
            source,
            timeout=timeout,
            phrase_time_limit=phrase_time_limit,
        )
    return recognizer.recognize_google(audio, language=language).strip()


async def _async_listen_once() -> str:
    """在线程中执行语音识别，避免阻塞事件循环。"""
    return await asyncio.to_thread(_listen_once_blocking)


async def _get_user_input(io_mode: str) -> str:
    """
    统一用户输入入口：
    - text: 纯文本输入
    - voice: 回车后语音输入，也允许直接键入文字
    """
    if io_mode == "text":
        return (await _async_console_input("你说: ")).strip()

    typed = (await _async_console_input("你说(回车=语音, 直接输入=文本): ")).strip()
    if typed:
        return typed

    try:
        spoken = await _async_listen_once()
    except Exception as e:
        print(f">>>>>> ⚠️ 语音识别失败：{e} <<<<<<\n")
        return ""

    if spoken:
        print(f"[语音识别]: {spoken}")
    return spoken


async def set_expression(
    expression: Annotated[str, "机器人要展示的表情，如：开心、疑惑、难过、待机"],
    intensity: Annotated[int, "表情强度 1-10"] = 5
) -> str:
    """[模拟面部] 控制机器人头部的表情展示。"""
    print(f"\n>>>>>> 🤖 表情更新: 【{expression}】 (强度: {intensity}/10) <<<<<<")
    return f"已切换到【{expression}】表情。"

# --- 第二部分：直接读取用户档案（不经过 MCP，避免多轮工具调用）---

def _load_user_profile(user_name: str, db_path: str | Path = USER_DB_PATH) -> str:
    """在 Python 层直接读档案，注入到消息上下文，模型无需主动调用 get_user_profile。"""
    try:
        with sqlite3.connect(db_path) as conn:
            conn.row_factory = sqlite3.Row
            user = conn.execute(
                "SELECT * FROM users WHERE name = ?", (user_name,)
            ).fetchone()
            if not user:
                return f"用户 {user_name} 尚无历史记录，这是第一次见面。"
            prefs = conn.execute(
                "SELECT category, content FROM preferences WHERE user_name = ?",
                (user_name,)
            ).fetchall()
            conn.execute(
                "UPDATE users SET last_seen = datetime('now') WHERE name = ?",
                (user_name,)
            )
            return json.dumps({
                "基本信息": {"姓名": user["name"], "年龄": user["age"], "上次见面": user["last_seen"]},
                "偏好习惯": {p["category"]: p["content"] for p in prefs},
            }, ensure_ascii=False)
    except Exception as e:
        return f"档案读取失败（{e}），当作第一次见面。"

# --- 第三部分：启动大脑 ---

async def start_simulated_head():
    _patch_autogen_tool_schema_for_vllm()

    # 连接 MCP Server，只取写入类工具（读档案由 Python 层直接处理）
    server_params = StdioServerParams(
        command=sys.executable,
        args=["robot_mcp_server.py"],
    )
    all_mcp_tools = await mcp_server_tools(server_params)
    # 过滤掉 get_user_profile，模型无需主动调用它
    mcp_tools = [t for t in all_mcp_tools if getattr(t, "name", "") != "get_user_profile"]

    model_client = OpenAIChatCompletionClient(
        model=MODEL_NAME,
        base_url=MODEL_BASE_URL,
        api_key=MODEL_API_KEY,
        model_info={
            "vision": True,
            "function_calling": True,
            "json_output": True,
            # 关闭 structured_output，避免向 vLLM 发送 strict 字段告警。
            "structured_output": False,
            "family": "unknown",
            "context_length": 32000,
        }
    )

    brain = AssistantAgent(
        name="RobotBrain",
        model_client=model_client,
        tools=[set_expression] + mcp_tools,
        reflect_on_tool_use=True,
        system_message="""你是一个有情感的机器人伙伴。每条消息包含：
[用户档案] 历史记录（已由系统预加载）
[视觉状态] 当前摄像头观察
[用户说]   用户这句话

回应规则：
1. 同一轮内调用：set_expression（表情）+ 所有需要的查询工具（get_weather/get_location/web_search）+ 需要的用户信息工具（upsert_user/set_preference）
2. 工具执行完毕后，用简短、温暖、自然的语言直接回答用户——这段文字就是你的语音输出。
3. 不要说"我去查一下"之类的过渡语，直接完成任务并给出结果。""",
    )

    # --- 第四部分：交互循环 ---
    print("=" * 50)
    print("  机器人已上线！输入 'quit' 退出")
    print(f"  模型: {MODEL_NAME}")
    print(f"  服务: {MODEL_BASE_URL}")
    print("=" * 50)

    try:
        user_name = (await _async_console_input("请输入你的名字: ")).strip() or "用户"
    except (EOFError, KeyboardInterrupt):
        print("\n机器人下线，再见！")
        return

    has_asr = sr is not None
    has_tts = pyttsx3 is not None
    if has_asr and has_tts:
        mode_tip = "voice"
    else:
        mode_tip = "text"
    try:
        io_mode = (
            await _async_console_input(
                f"输入模式 voice/text（默认 {mode_tip}）: "
            )
        ).strip().lower() or mode_tip
    except (EOFError, KeyboardInterrupt):
        print("\n机器人下线，再见！")
        return
    if io_mode not in ("voice", "text"):
        io_mode = mode_tip

    if io_mode == "voice" and not has_asr:
        print(">>>>>> ⚠️ 未安装 speech_recognition，已降级为文本输入。 <<<<<<")
        io_mode = "text"
    if io_mode == "voice" and not has_tts:
        print(">>>>>> ⚠️ 未安装 pyttsx3，将仅文本输出，不播报语音。 <<<<<<")

    print(
        "\n[语音依赖状态] "
        f"ASR={'ok' if has_asr else 'missing'}, "
        f"TTS={'ok' if has_tts else 'missing'}"
    )
    if not has_asr or not has_tts:
        print("可安装: pip install SpeechRecognition pyaudio pyttsx3")

    visual_context = "视觉输入：用户坐在电脑前，表情平静，看着屏幕。"

    print(f"\n[当前视觉状态]: {visual_context}")
    print("提示：输入 'v <描述>' 可以更新视觉状态，例如: v 用户在笑\n")

    history = []

    try:
        while True:
            try:
                user_input = await _get_user_input(io_mode)
            except (EOFError, KeyboardInterrupt):
                print("\n机器人下线，再见！")
                break

            if not user_input:
                continue

            if user_input.lower() in ("quit", "exit", "退出"):
                print("机器人下线，再见！")
                break

            if user_input.lower().startswith("v "):
                visual_context = f"视觉输入：{user_input[2:].strip()}。"
                print(f"[视觉状态已更新]: {visual_context}\n")
                continue

            # Python 层直接读取档案并注入消息，模型无需发起额外工具调用
            profile = _load_user_profile(user_name)
            combined_input = (
                f"[用户档案]\n{profile}\n\n"
                f"[视觉状态] {visual_context}\n"
                f"[用户说] {user_input}"
            )
            history.append(TextMessage(content=combined_input, source="user"))

            # 只保留最近 6 条消息（3轮对话），防止超出 token 上限
            # 用户档案每轮从数据库重新注入，不依赖长历史
            if len(history) > 6:
                history = history[-6:]

            try:
                response = await asyncio.wait_for(
                    brain.on_messages(history, CancellationToken()),
                    timeout=MODEL_CALL_TIMEOUT_SECONDS,
                )
            except asyncio.TimeoutError:
                print(">>>>>> ⚠️ 请求超时，请稍后重试或简化问题。 <<<<<<\n")
                continue
            except Exception as e:
                print(f">>>>>> ⚠️ 本轮处理失败：{e} <<<<<<\n")
                continue

            # 模型的文字回复就是语音输出（reflect_on_tool_use=True 保证这里是 TextMessage）
            speech = response.chat_message.content
            if speech and isinstance(speech, str):
                print(f">>>>>> 🔊 机器人说: {speech} <<<<<<\n")
                if io_mode == "voice":
                    spoken_ok = await _async_speak(speech)
                    if not spoken_ok:
                        print(">>>>>> ⚠️ TTS 不可用，当前仅文本输出。 <<<<<<\n")

            # 只把最终回复加入历史，inner_messages 是事件对象不能序列化回模型
            history.append(response.chat_message)
    finally:
        model_client.close()

if __name__ == "__main__":
    asyncio.run(start_simulated_head())