refactor: 转移vlm到远程服务器上
This commit is contained in:
150
main.py
150
main.py
@@ -1,5 +1,6 @@
|
|||||||
import asyncio
|
import asyncio
|
||||||
import json
|
import json
|
||||||
|
import os
|
||||||
import sqlite3
|
import sqlite3
|
||||||
import sys
|
import sys
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
@@ -12,9 +13,25 @@ from autogen_ext.models.openai import OpenAIChatCompletionClient
|
|||||||
from autogen_ext.models.openai import _openai_client as openai_client_module
|
from autogen_ext.models.openai import _openai_client as openai_client_module
|
||||||
from autogen_ext.tools.mcp import StdioServerParams, mcp_server_tools
|
from autogen_ext.tools.mcp import StdioServerParams, mcp_server_tools
|
||||||
|
|
||||||
|
try:
|
||||||
|
import speech_recognition as sr
|
||||||
|
except ImportError:
|
||||||
|
sr = None
|
||||||
|
|
||||||
|
try:
|
||||||
|
import pyttsx3
|
||||||
|
except ImportError:
|
||||||
|
pyttsx3 = None
|
||||||
|
|
||||||
BASE_DIR = Path(__file__).resolve().parent
|
BASE_DIR = Path(__file__).resolve().parent
|
||||||
USER_DB_PATH = BASE_DIR / "users.db"
|
USER_DB_PATH = BASE_DIR / "users.db"
|
||||||
MODEL_CALL_TIMEOUT_SECONDS = 45
|
MODEL_CALL_TIMEOUT_SECONDS = 45
|
||||||
|
ASR_LANGUAGE = "zh-CN"
|
||||||
|
MODEL_NAME = os.getenv("VLM_MODEL", "Qwen/Qwen3-VL-8B-Instruct")
|
||||||
|
MODEL_BASE_URL = os.getenv("VLM_BASE_URL", "http://220.248.114.28:8000/v1")
|
||||||
|
MODEL_API_KEY = os.getenv("VLM_API_KEY", "EMPTY")
|
||||||
|
|
||||||
|
_TTS_ENGINE = None
|
||||||
|
|
||||||
# --- 第一部分:本地工具(面部 + 语音,以后接硬件)---
|
# --- 第一部分:本地工具(面部 + 语音,以后接硬件)---
|
||||||
|
|
||||||
@@ -46,6 +63,92 @@ async def _async_console_input(prompt: str) -> str:
|
|||||||
return await asyncio.to_thread(input, prompt)
|
return await asyncio.to_thread(input, prompt)
|
||||||
|
|
||||||
|
|
||||||
|
def _init_tts_engine():
    """Return the shared offline TTS engine (pyttsx3), creating it on first use.

    The engine is cached in the module-level ``_TTS_ENGINE`` so later calls
    reuse the same instance.

    Returns:
        The configured pyttsx3 engine, or ``None`` when pyttsx3 is not
        installed or its driver cannot be initialized.
    """
    global _TTS_ENGINE
    if _TTS_ENGINE is not None:
        return _TTS_ENGINE
    if pyttsx3 is None:
        return None

    try:
        engine = pyttsx3.init()
    except (ImportError, RuntimeError, OSError) as e:
        # pyttsx3 documents ImportError (driver not found) and RuntimeError
        # (driver failed to initialize). OSError is included on the assumption
        # that a missing native speech library surfaces that way -- TODO confirm.
        # Callers already treat None as "TTS unavailable", so degrade to that
        # path instead of letting the error escape.
        print(f">>>>>> ⚠️ TTS 初始化失败:{e} <<<<<<\n")
        return None

    # Prefer a Chinese voice. Voice ids differ between systems, so match
    # loosely on the id and name.
    chinese_markers = ("zh", "chinese", "mandarin")
    for voice in engine.getProperty("voices"):
        voice_blob = f"{voice.id} {voice.name}".lower()
        if any(marker in voice_blob for marker in chinese_markers):
            engine.setProperty("voice", voice.id)
            break
    engine.setProperty("rate", 190)

    _TTS_ENGINE = engine
    return _TTS_ENGINE
|
||||||
|
|
||||||
|
|
||||||
|
def _speak_blocking(text: str) -> bool:
|
||||||
|
"""阻塞式语音播报。成功返回 True。"""
|
||||||
|
if not text:
|
||||||
|
return False
|
||||||
|
engine = _init_tts_engine()
|
||||||
|
if engine is None:
|
||||||
|
return False
|
||||||
|
engine.say(text)
|
||||||
|
engine.runAndWait()
|
||||||
|
return True
|
||||||
|
|
||||||
|
|
||||||
|
async def _async_speak(text: str) -> bool:
    """Run ``_speak_blocking`` in a worker thread and return its result."""
    spoken_ok = await asyncio.to_thread(_speak_blocking, text)
    return spoken_ok
|
||||||
|
|
||||||
|
|
||||||
|
def _listen_once_blocking(
    language: str = ASR_LANGUAGE,
    timeout: int = 8,
    phrase_time_limit: int = 20,
) -> str:
    """Capture one utterance from the microphone and return its transcription.

    Args:
        language: Language tag forwarded to ``recognize_google``.
        timeout: Forwarded to ``Recognizer.listen`` as ``timeout``.
        phrase_time_limit: Forwarded to ``Recognizer.listen`` as
            ``phrase_time_limit``.

    Returns:
        The recognized text with surrounding whitespace stripped.

    Raises:
        RuntimeError: If the speech_recognition package is not installed.
    """
    if sr is None:
        raise RuntimeError("缺少 speech_recognition 依赖")

    asr = sr.Recognizer()
    listen_options = {
        "timeout": timeout,
        "phrase_time_limit": phrase_time_limit,
    }
    with sr.Microphone(sample_rate=16000) as source:
        print(">>>>>> 🎤 请说话... <<<<<<")
        # Sample the background noise briefly before listening.
        asr.adjust_for_ambient_noise(source, duration=0.4)
        audio = asr.listen(source, **listen_options)

    transcript = asr.recognize_google(audio, language=language)
    return transcript.strip()
|
||||||
|
|
||||||
|
|
||||||
|
async def _async_listen_once() -> str:
    """Run speech recognition in a worker thread so the event loop is not blocked."""
    recognized = await asyncio.to_thread(_listen_once_blocking)
    return recognized
|
||||||
|
|
||||||
|
|
||||||
|
async def _get_user_input(io_mode: str) -> str:
    """Read one user utterance from the console or the microphone.

    Args:
        io_mode: ``"text"`` reads a single console line. Any other value is
            handled as voice mode: the user is prompted first, a non-empty
            typed line is returned as-is, and an empty line (just Enter)
            triggers one microphone recognition.

    Returns:
        The stripped typed text, the recognized speech, or ``""`` when
        recognition fails or produces nothing.
    """
    if io_mode == "text":
        return (await _async_console_input("你说: ")).strip()

    typed = (await _async_console_input("你说(回车=语音, 直接输入=文本): ")).strip()
    if typed:
        return typed

    try:
        spoken = await _async_listen_once()
    except Exception as e:
        # Boundary with microphone / recognition I/O: report the failure and
        # hand back an empty string instead of raising to the caller.
        # Exceptions raised without a message stringify to "", so fall back
        # to the exception class name to keep the warning informative.
        reason = str(e) or type(e).__name__
        print(f">>>>>> ⚠️ 语音识别失败:{reason} <<<<<<\n")
        return ""

    if spoken:
        print(f"[语音识别]: {spoken}")
    # Always return a str, even when recognition yielded nothing.
    return spoken or ""
|
||||||
|
|
||||||
|
|
||||||
async def set_expression(
|
async def set_expression(
|
||||||
expression: Annotated[str, "机器人要展示的表情,如:开心、疑惑、难过、待机"],
|
expression: Annotated[str, "机器人要展示的表情,如:开心、疑惑、难过、待机"],
|
||||||
intensity: Annotated[int, "表情强度 1-10"] = 5
|
intensity: Annotated[int, "表情强度 1-10"] = 5
|
||||||
@@ -96,9 +199,9 @@ async def start_simulated_head():
|
|||||||
mcp_tools = [t for t in all_mcp_tools if getattr(t, "name", "") != "get_user_profile"]
|
mcp_tools = [t for t in all_mcp_tools if getattr(t, "name", "") != "get_user_profile"]
|
||||||
|
|
||||||
model_client = OpenAIChatCompletionClient(
|
model_client = OpenAIChatCompletionClient(
|
||||||
model="Qwen/Qwen3-VL-8B-Instruct",
|
model=MODEL_NAME,
|
||||||
base_url="http://localhost:8000/v1",
|
base_url=MODEL_BASE_URL,
|
||||||
api_key="EMPTY",
|
api_key=MODEL_API_KEY,
|
||||||
model_info={
|
model_info={
|
||||||
"vision": True,
|
"vision": True,
|
||||||
"function_calling": True,
|
"function_calling": True,
|
||||||
@@ -129,6 +232,8 @@ async def start_simulated_head():
|
|||||||
# --- 第四部分:交互循环 ---
|
# --- 第四部分:交互循环 ---
|
||||||
print("=" * 50)
|
print("=" * 50)
|
||||||
print(" 机器人已上线!输入 'quit' 退出")
|
print(" 机器人已上线!输入 'quit' 退出")
|
||||||
|
print(f" 模型: {MODEL_NAME}")
|
||||||
|
print(f" 服务: {MODEL_BASE_URL}")
|
||||||
print("=" * 50)
|
print("=" * 50)
|
||||||
|
|
||||||
try:
|
try:
|
||||||
@@ -136,6 +241,39 @@ async def start_simulated_head():
|
|||||||
except (EOFError, KeyboardInterrupt):
|
except (EOFError, KeyboardInterrupt):
|
||||||
print("\n机器人下线,再见!")
|
print("\n机器人下线,再见!")
|
||||||
return
|
return
|
||||||
|
|
||||||
|
has_asr = sr is not None
|
||||||
|
has_tts = pyttsx3 is not None
|
||||||
|
if has_asr and has_tts:
|
||||||
|
mode_tip = "voice"
|
||||||
|
else:
|
||||||
|
mode_tip = "text"
|
||||||
|
try:
|
||||||
|
io_mode = (
|
||||||
|
await _async_console_input(
|
||||||
|
f"输入模式 voice/text(默认 {mode_tip}): "
|
||||||
|
)
|
||||||
|
).strip().lower() or mode_tip
|
||||||
|
except (EOFError, KeyboardInterrupt):
|
||||||
|
print("\n机器人下线,再见!")
|
||||||
|
return
|
||||||
|
if io_mode not in ("voice", "text"):
|
||||||
|
io_mode = mode_tip
|
||||||
|
|
||||||
|
if io_mode == "voice" and not has_asr:
|
||||||
|
print(">>>>>> ⚠️ 未安装 speech_recognition,已降级为文本输入。 <<<<<<")
|
||||||
|
io_mode = "text"
|
||||||
|
if io_mode == "voice" and not has_tts:
|
||||||
|
print(">>>>>> ⚠️ 未安装 pyttsx3,将仅文本输出,不播报语音。 <<<<<<")
|
||||||
|
|
||||||
|
print(
|
||||||
|
"\n[语音依赖状态] "
|
||||||
|
f"ASR={'ok' if has_asr else 'missing'}, "
|
||||||
|
f"TTS={'ok' if has_tts else 'missing'}"
|
||||||
|
)
|
||||||
|
if not has_asr or not has_tts:
|
||||||
|
print("可安装: pip install SpeechRecognition pyaudio pyttsx3")
|
||||||
|
|
||||||
visual_context = "视觉输入:用户坐在电脑前,表情平静,看着屏幕。"
|
visual_context = "视觉输入:用户坐在电脑前,表情平静,看着屏幕。"
|
||||||
|
|
||||||
print(f"\n[当前视觉状态]: {visual_context}")
|
print(f"\n[当前视觉状态]: {visual_context}")
|
||||||
@@ -146,7 +284,7 @@ async def start_simulated_head():
|
|||||||
try:
|
try:
|
||||||
while True:
|
while True:
|
||||||
try:
|
try:
|
||||||
user_input = (await _async_console_input("你说: ")).strip()
|
user_input = await _get_user_input(io_mode)
|
||||||
except (EOFError, KeyboardInterrupt):
|
except (EOFError, KeyboardInterrupt):
|
||||||
print("\n机器人下线,再见!")
|
print("\n机器人下线,再见!")
|
||||||
break
|
break
|
||||||
@@ -193,6 +331,10 @@ async def start_simulated_head():
|
|||||||
speech = response.chat_message.content
|
speech = response.chat_message.content
|
||||||
if speech and isinstance(speech, str):
|
if speech and isinstance(speech, str):
|
||||||
print(f">>>>>> 🔊 机器人说: {speech} <<<<<<\n")
|
print(f">>>>>> 🔊 机器人说: {speech} <<<<<<\n")
|
||||||
|
if io_mode == "voice":
|
||||||
|
spoken_ok = await _async_speak(speech)
|
||||||
|
if not spoken_ok:
|
||||||
|
print(">>>>>> ⚠️ TTS 不可用,当前仅文本输出。 <<<<<<\n")
|
||||||
|
|
||||||
# 只把最终回复加入历史,inner_messages 是事件对象不能序列化回模型
|
# 只把最终回复加入历史,inner_messages 是事件对象不能序列化回模型
|
||||||
history.append(response.chat_message)
|
history.append(response.chat_message)
|
||||||
|
|||||||
@@ -5,6 +5,7 @@
|
|||||||
python -m vllm.entrypoints.openai.api_server \
|
python -m vllm.entrypoints.openai.api_server \
|
||||||
--model Qwen/Qwen3-VL-8B-Instruct \
|
--model Qwen/Qwen3-VL-8B-Instruct \
|
||||||
--trust-remote-code \
|
--trust-remote-code \
|
||||||
|
--host 0.0.0.0 \
|
||||||
--port 8000 \
|
--port 8000 \
|
||||||
--gpu-memory-utilization 0.85 \
|
--gpu-memory-utilization 0.85 \
|
||||||
--max-model-len 32000 \
|
--max-model-len 32000 \
|
||||||
|
|||||||
Reference in New Issue
Block a user