feat: 添加edge-tts

2026-03-04 15:29:18 +08:00
parent d879aa1b2e
commit c97ff111fa
2 changed files with 60 additions and 72 deletions
--- a/main.py
+++ b/main.py
@@ -1,8 +1,11 @@
 import asyncio
 import json
 import os
+import shutil
 import sqlite3
+import subprocess
 import sys
+import tempfile
 from pathlib import Path
 from typing import Annotated

@@ -19,9 +22,9 @@ except ImportError:
    sr = None

 try:
-    import pyttsx3
+    import edge_tts
 except ImportError:
-    pyttsx3 = None
+    edge_tts = None

 BASE_DIR = Path(__file__).resolve().parent
 USER_DB_PATH = BASE_DIR / "users.db"
@@ -30,8 +33,7 @@ ASR_LANGUAGE = "zh-CN"
 MODEL_NAME = os.getenv("VLM_MODEL", "Qwen/Qwen3-VL-8B-Instruct")
 MODEL_BASE_URL = os.getenv("VLM_BASE_URL", "http://220.248.114.28:8000/v1")
 MODEL_API_KEY = os.getenv("VLM_API_KEY", "EMPTY")
-
-_TTS_ENGINE = None
+TTS_VOICE = os.getenv("TTS_VOICE", "zh-CN-YunxiNeural")

 # --- 第一部分：本地工具（面部 + 语音，以后接硬件）---

@@ -63,40 +65,53 @@ async def _async_console_input(prompt: str) -> str:
    return await asyncio.to_thread(input, prompt)


-def _init_tts_engine():
-    """初始化离线 TTS（pyttsx3）。"""
-    global _TTS_ENGINE
-    if _TTS_ENGINE is not None:
-        return _TTS_ENGINE
-    if pyttsx3 is None:
-        return None
-
-    engine = pyttsx3.init()
-    # 优先选择中文语音（不同系统 voice id 不同，这里做模糊匹配）
-    for voice in engine.getProperty("voices"):
-        voice_blob = f"{voice.id} {voice.name}".lower()
-        if "zh" in voice_blob or "chinese" in voice_blob or "mandarin" in voice_blob:
-            engine.setProperty("voice", voice.id)
-            break
-    engine.setProperty("rate", 190)
-    _TTS_ENGINE = engine
-    return _TTS_ENGINE
+def _find_audio_player() -> list[str] | None:
+    """Return the command prefix of the first player found on PATH (ffplay preferred), or None."""
+    if shutil.which("ffplay"):  # first choice
+        return ["ffplay", "-nodisp", "-autoexit", "-loglevel", "error"]
+    if shutil.which("mpg123"):  # second choice
+        return ["mpg123", "-q"]
+    if shutil.which("afplay"):  # third choice
+        return ["afplay"]
+    return None  # no supported player executable was found


-def _speak_blocking(text: str) -> bool:
-    """阻塞式语音播报。成功返回 True。"""
-    if not text:
+def _play_audio_file_blocking(audio_path: str, player_cmd: list[str]) -> bool:
+    """Run player_cmd on audio_path and wait for it to exit; True on success, False on any error."""
+    try:
+        subprocess.run(
+            [*player_cmd, audio_path],  # the audio path is appended as the last argument
+            check=True,  # a non-zero exit status raises and is reported as False below
+            stdout=subprocess.DEVNULL,
+            stderr=subprocess.DEVNULL,
+        )
+        return True
+    except Exception:
        return False
-    engine = _init_tts_engine()
-    if engine is None:
-        return False
-    engine.say(text)
-    engine.runAndWait()
-    return True


 async def _async_speak(text: str) -> bool:
-    return await asyncio.to_thread(_speak_blocking, text)
+    """Synthesize text with edge-tts (voice TTS_VOICE) and play it; True only if playback succeeded."""
+    if not text or edge_tts is None:
+        return False
+
+    player_cmd = _find_audio_player()
+    if player_cmd is None:
+        return False
+
+    with tempfile.NamedTemporaryFile(suffix=".mp3", delete=False) as fp:  # only reserves a path
+        audio_path = fp.name
+    try:
+        communicate = edge_tts.Communicate(text=text, voice=TTS_VOICE)
+        await communicate.save(audio_path)
+        return await asyncio.to_thread(_play_audio_file_blocking, audio_path, player_cmd)  # off-loop
+    except Exception:
+        return False
+    finally:
+        try:
+            Path(audio_path).unlink(missing_ok=True)  # always remove the temporary audio file
+        except Exception:
+            pass


 def _listen_once_blocking(
@@ -243,7 +258,7 @@ async def start_simulated_head():
        return

    has_asr = sr is not None
-    has_tts = pyttsx3 is not None
+    has_tts = edge_tts is not None
    if has_asr and has_tts:
        mode_tip = "voice"
    else:
@@ -264,7 +279,9 @@ async def start_simulated_head():
        print(">>>>>> ⚠️ 未安装 speech_recognition，已降级为文本输入。 <<<<<<")
        io_mode = "text"
    if io_mode == "voice" and not has_tts:
-        print(">>>>>> ⚠️ 未安装 pyttsx3，将仅文本输出，不播报语音。 <<<<<<")
+        print(">>>>>> ⚠️ 未安装 edge-tts，将仅文本输出，不播报语音。 <<<<<<")
+    if io_mode == "voice" and has_tts and _find_audio_player() is None:
+        print(">>>>>> ⚠️ 未检测到播放器(ffplay/mpg123/afplay)，将仅文本输出。 <<<<<<")

    print(
        "\n[语音依赖状态] "
@@ -272,7 +289,7 @@ async def start_simulated_head():
        f"TTS={'ok' if has_tts else 'missing'}"
    )
    if not has_asr or not has_tts:
-        print("可安装: pip install SpeechRecognition pyaudio pyttsx3")
+        print("可安装: pip install SpeechRecognition pyaudio edge-tts")

    visual_context = "视觉输入：用户坐在电脑前，表情平静，看着屏幕。"

--- a/requirements.txt
+++ b/requirements.txt
@@ -1,37 +1,8 @@
-archspec @ file:///home/conda/feedstock_root/build_artifacts/archspec_1737352602016/work
-boltons @ file:///home/conda/feedstock_root/build_artifacts/boltons_1749686179973/work
-Brotli @ file:///home/conda/feedstock_root/build_artifacts/brotli-split_1764016952863/work
-certifi @ file:///home/conda/feedstock_root/build_artifacts/certifi_1767500808759/work/certifi
-cffi @ file:///home/conda/feedstock_root/build_artifacts/cffi_1761202850602/work
-charset-normalizer @ file:///home/conda/feedstock_root/build_artifacts/charset-normalizer_1760437218288/work
-colorama @ file:///home/conda/feedstock_root/build_artifacts/colorama_1733218098505/work
-conda @ file:///home/conda/feedstock_root/build_artifacts/conda_1770031335390/work/conda-src
-conda-libmamba-solver @ file:///home/conda/feedstock_root/build_artifacts/conda-libmamba-solver_1764081326783/work/src
-conda-package-handling @ file:///home/conda/feedstock_root/build_artifacts/conda-package-handling_1736345463896/work
-conda_package_streaming @ file:///home/conda/feedstock_root/build_artifacts/conda-package-streaming_1751548120229/work
-distro @ file:///home/conda/feedstock_root/build_artifacts/distro_1734729835256/work
-frozendict @ file:///home/conda/feedstock_root/build_artifacts/frozendict_1763082802787/work
-h2 @ file:///home/conda/feedstock_root/build_artifacts/bld/rattler-build_h2_1756364871/work
-hpack @ file:///home/conda/feedstock_root/build_artifacts/hpack_1737618293087/work
-hyperframe @ file:///home/conda/feedstock_root/build_artifacts/hyperframe_1737618333194/work
-idna @ file:///home/conda/feedstock_root/build_artifacts/idna_1760286409563/work
-jsonpatch @ file:///home/conda/feedstock_root/build_artifacts/jsonpatch_1733814567314/work
-jsonpointer @ file:///home/conda/feedstock_root/build_artifacts/jsonpointer_1756754132407/work
-libmambapy @ file:///home/conda/feedstock_root/build_artifacts/bld/rattler-build_libmambapy_1764158555/work/libmambapy
-menuinst @ file:///home/conda/feedstock_root/build_artifacts/menuinst_1761299738838/work
-msgpack @ file:///home/conda/feedstock_root/build_artifacts/msgpack-python_1762503974934/work
-packaging @ file:///home/conda/feedstock_root/build_artifacts/bld/rattler-build_packaging_1745345660/work
-platformdirs @ file:///home/conda/feedstock_root/build_artifacts/bld/rattler-build_platformdirs_1759953252/work
-pluggy @ file:///home/conda/feedstock_root/build_artifacts/bld/rattler-build_pluggy_1764896838/work
-pycosat @ file:///home/conda/feedstock_root/build_artifacts/pycosat_1757744639790/work
-pycparser @ file:///home/conda/feedstock_root/build_artifacts/bld/rattler-build_pycparser_1733195786/work
-PySocks @ file:///home/conda/feedstock_root/build_artifacts/pysocks_1733217236728/work
-requests @ file:///home/conda/feedstock_root/build_artifacts/requests_1755614211359/work
-ruamel.yaml @ file:///home/conda/feedstock_root/build_artifacts/ruamel.yaml_1761160588389/work
-ruamel.yaml.clib @ file:///home/conda/feedstock_root/build_artifacts/ruamel.yaml.clib_1760564169582/work
-setuptools==80.9.0
-tqdm @ file:///home/conda/feedstock_root/build_artifacts/tqdm_1735661334605/work
-truststore @ file:///home/conda/feedstock_root/build_artifacts/bld/rattler-build_truststore_1753886790/work
-urllib3 @ file:///home/conda/feedstock_root/build_artifacts/urllib3_1750271362675/work
-wheel==0.45.1
-zstandard==0.25.0
+autogen-agentchat>=0.4
+autogen-core>=0.4
+autogen-ext[openai,mcp]>=0.4
+mcp>=1.0
+requests>=2.31
+SpeechRecognition>=3.10
+PyAudio>=0.2.14
+edge-tts>=6.1