From c97ff111fa80612ee163dd10e4ecdbf318c456b9 Mon Sep 17 00:00:00 2001 From: JiajunLI Date: Wed, 4 Mar 2026 15:29:18 +0800 Subject: [PATCH] =?UTF-8?q?feat:=20=E6=B7=BB=E5=8A=A0edge-tts?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- main.py | 87 +++++++++++++++++++++++++++++------------------- requirements.txt | 45 +++++-------------------- 2 files changed, 60 insertions(+), 72 deletions(-) diff --git a/main.py b/main.py index 654eee7..a23bd3d 100644 --- a/main.py +++ b/main.py @@ -1,8 +1,11 @@ import asyncio import json import os +import shutil import sqlite3 +import subprocess import sys +import tempfile from pathlib import Path from typing import Annotated @@ -19,9 +22,9 @@ except ImportError: sr = None try: - import pyttsx3 + import edge_tts except ImportError: - pyttsx3 = None + edge_tts = None BASE_DIR = Path(__file__).resolve().parent USER_DB_PATH = BASE_DIR / "users.db" @@ -30,8 +33,7 @@ ASR_LANGUAGE = "zh-CN" MODEL_NAME = os.getenv("VLM_MODEL", "Qwen/Qwen3-VL-8B-Instruct") MODEL_BASE_URL = os.getenv("VLM_BASE_URL", "http://220.248.114.28:8000/v1") MODEL_API_KEY = os.getenv("VLM_API_KEY", "EMPTY") - -_TTS_ENGINE = None +TTS_VOICE = os.getenv("TTS_VOICE", "zh-CN-YunxiNeural") # --- 第一部分:本地工具(面部 + 语音,以后接硬件)--- @@ -63,40 +65,53 @@ async def _async_console_input(prompt: str) -> str: return await asyncio.to_thread(input, prompt) -def _init_tts_engine(): - """初始化离线 TTS(pyttsx3)。""" - global _TTS_ENGINE - if _TTS_ENGINE is not None: - return _TTS_ENGINE - if pyttsx3 is None: - return None - - engine = pyttsx3.init() - # 优先选择中文语音(不同系统 voice id 不同,这里做模糊匹配) - for voice in engine.getProperty("voices"): - voice_blob = f"{voice.id} {voice.name}".lower() - if "zh" in voice_blob or "chinese" in voice_blob or "mandarin" in voice_blob: - engine.setProperty("voice", voice.id) - break - engine.setProperty("rate", 190) - _TTS_ENGINE = engine - return _TTS_ENGINE +def _find_audio_player() -> list[str] | None: + """查找可用播放器,优先 ffplay。""" + if shutil.which("ffplay"): + return ["ffplay", "-nodisp", "-autoexit", "-loglevel", "error"] + if shutil.which("mpg123"): + return ["mpg123", "-q"] + if shutil.which("afplay"): + return ["afplay"] + return None -def _speak_blocking(text: str) -> bool: - """阻塞式语音播报。成功返回 True。""" - if not text: +def _play_audio_file_blocking(audio_path: str, player_cmd: list[str]) -> bool: + """阻塞播放音频文件。""" + try: + subprocess.run( + [*player_cmd, audio_path], + check=True, + stdout=subprocess.DEVNULL, + stderr=subprocess.DEVNULL, + ) + return True + except Exception: return False - engine = _init_tts_engine() - if engine is None: - return False - engine.say(text) - engine.runAndWait() - return True async def _async_speak(text: str) -> bool: - return await asyncio.to_thread(_speak_blocking, text) + """使用 edge-tts 生成 Yunxi 语音并播放。""" + if not text or edge_tts is None: + return False + + player_cmd = _find_audio_player() + if player_cmd is None: + return False + + with tempfile.NamedTemporaryFile(suffix=".mp3", delete=False) as fp: + audio_path = fp.name + try: + communicate = edge_tts.Communicate(text=text, voice=TTS_VOICE) + await communicate.save(audio_path) + return await asyncio.to_thread(_play_audio_file_blocking, audio_path, player_cmd) + except Exception: + return False + finally: + try: + Path(audio_path).unlink(missing_ok=True) + except Exception: + pass def _listen_once_blocking( @@ -243,7 +258,7 @@ async def start_simulated_head(): return has_asr = sr is not None - has_tts = pyttsx3 is not None + has_tts = edge_tts is not None if has_asr and has_tts: mode_tip = "voice" else: @@ -264,7 +279,9 @@ async def start_simulated_head(): print(">>>>>> ⚠️ 未安装 speech_recognition,已降级为文本输入。 <<<<<<") io_mode = "text" if io_mode == "voice" and not has_tts: - print(">>>>>> ⚠️ 未安装 pyttsx3,将仅文本输出,不播报语音。 <<<<<<") + print(">>>>>> ⚠️ 未安装 edge-tts,将仅文本输出,不播报语音。 <<<<<<") + if io_mode == "voice" and has_tts and _find_audio_player() is None: + print(">>>>>> ⚠️ 未检测到播放器(ffplay/mpg123/afplay),将仅文本输出。 <<<<<<") print( "\n[语音依赖状态] " @@ -272,7 +289,7 @@ async def start_simulated_head(): f"TTS={'ok' if has_tts else 'missing'}" ) if not has_asr or not has_tts: - print("可安装: pip install SpeechRecognition pyaudio pyttsx3") + print("可安装: pip install SpeechRecognition pyaudio edge-tts") visual_context = "视觉输入:用户坐在电脑前,表情平静,看着屏幕。" diff --git a/requirements.txt b/requirements.txt index b5b3f11..070cbfd 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,37 +1,8 @@ -archspec @ file:///home/conda/feedstock_root/build_artifacts/archspec_1737352602016/work -boltons @ file:///home/conda/feedstock_root/build_artifacts/boltons_1749686179973/work -Brotli @ file:///home/conda/feedstock_root/build_artifacts/brotli-split_1764016952863/work -certifi @ file:///home/conda/feedstock_root/build_artifacts/certifi_1767500808759/work/certifi -cffi @ file:///home/conda/feedstock_root/build_artifacts/cffi_1761202850602/work -charset-normalizer @ file:///home/conda/feedstock_root/build_artifacts/charset-normalizer_1760437218288/work -colorama @ file:///home/conda/feedstock_root/build_artifacts/colorama_1733218098505/work -conda @ file:///home/conda/feedstock_root/build_artifacts/conda_1770031335390/work/conda-src -conda-libmamba-solver @ file:///home/conda/feedstock_root/build_artifacts/conda-libmamba-solver_1764081326783/work/src -conda-package-handling @ file:///home/conda/feedstock_root/build_artifacts/conda-package-handling_1736345463896/work -conda_package_streaming @ file:///home/conda/feedstock_root/build_artifacts/conda-package-streaming_1751548120229/work -distro @ file:///home/conda/feedstock_root/build_artifacts/distro_1734729835256/work -frozendict @ file:///home/conda/feedstock_root/build_artifacts/frozendict_1763082802787/work -h2 @ file:///home/conda/feedstock_root/build_artifacts/bld/rattler-build_h2_1756364871/work -hpack @ file:///home/conda/feedstock_root/build_artifacts/hpack_1737618293087/work -hyperframe @ file:///home/conda/feedstock_root/build_artifacts/hyperframe_1737618333194/work -idna @ file:///home/conda/feedstock_root/build_artifacts/idna_1760286409563/work -jsonpatch @ file:///home/conda/feedstock_root/build_artifacts/jsonpatch_1733814567314/work -jsonpointer @ file:///home/conda/feedstock_root/build_artifacts/jsonpointer_1756754132407/work -libmambapy @ file:///home/conda/feedstock_root/build_artifacts/bld/rattler-build_libmambapy_1764158555/work/libmambapy -menuinst @ file:///home/conda/feedstock_root/build_artifacts/menuinst_1761299738838/work -msgpack @ file:///home/conda/feedstock_root/build_artifacts/msgpack-python_1762503974934/work -packaging @ file:///home/conda/feedstock_root/build_artifacts/bld/rattler-build_packaging_1745345660/work -platformdirs @ file:///home/conda/feedstock_root/build_artifacts/bld/rattler-build_platformdirs_1759953252/work -pluggy @ file:///home/conda/feedstock_root/build_artifacts/bld/rattler-build_pluggy_1764896838/work -pycosat @ file:///home/conda/feedstock_root/build_artifacts/pycosat_1757744639790/work -pycparser @ file:///home/conda/feedstock_root/build_artifacts/bld/rattler-build_pycparser_1733195786/work -PySocks @ file:///home/conda/feedstock_root/build_artifacts/pysocks_1733217236728/work -requests @ file:///home/conda/feedstock_root/build_artifacts/requests_1755614211359/work -ruamel.yaml @ file:///home/conda/feedstock_root/build_artifacts/ruamel.yaml_1761160588389/work -ruamel.yaml.clib @ file:///home/conda/feedstock_root/build_artifacts/ruamel.yaml.clib_1760564169582/work -setuptools==80.9.0 -tqdm @ file:///home/conda/feedstock_root/build_artifacts/tqdm_1735661334605/work -truststore @ file:///home/conda/feedstock_root/build_artifacts/bld/rattler-build_truststore_1753886790/work -urllib3 @ file:///home/conda/feedstock_root/build_artifacts/urllib3_1750271362675/work -wheel==0.45.1 -zstandard==0.25.0 +autogen-agentchat>=0.4 +autogen-core>=0.4 +autogen-ext[openai,mcp]>=0.4 +mcp>=1.0 +requests>=2.31 +SpeechRecognition>=3.10 +PyAudio>=0.2.14 +edge-tts>=6.1