feat: 添加edge-tts

This commit is contained in:
JiajunLI
2026-03-04 15:29:18 +08:00
parent d879aa1b2e
commit c97ff111fa
2 changed files with 60 additions and 72 deletions

87
main.py
View File

@@ -1,8 +1,11 @@
import asyncio
import json
import os
import shutil
import sqlite3
import subprocess
import sys
import tempfile
from pathlib import Path
from typing import Annotated
@@ -19,9 +22,9 @@ except ImportError:
sr = None
try:
import pyttsx3
import edge_tts
except ImportError:
pyttsx3 = None
edge_tts = None
# Directory that contains this source file (symlinks resolved).
BASE_DIR = Path(__file__).resolve().parent
# Location of the users.db file, kept next to this source file.
USER_DB_PATH = BASE_DIR / "users.db"
@@ -30,8 +33,7 @@ ASR_LANGUAGE = "zh-CN"
# Model identifier; override with the VLM_MODEL environment variable.
MODEL_NAME = os.getenv("VLM_MODEL", "Qwen/Qwen3-VL-8B-Instruct")
# Base URL of the model endpoint; override with VLM_BASE_URL.
MODEL_BASE_URL = os.getenv("VLM_BASE_URL", "http://220.248.114.28:8000/v1")
# API key sent to the model endpoint; override with VLM_API_KEY.
MODEL_API_KEY = os.getenv("VLM_API_KEY", "EMPTY")
# Cached pyttsx3 engine; populated on first use by _init_tts_engine().
_TTS_ENGINE = None
# Voice name passed to edge_tts.Communicate; override with TTS_VOICE.
TTS_VOICE = os.getenv("TTS_VOICE", "zh-CN-YunxiNeural")
# --- Part 1: local tools (face + voice; to be wired to hardware later) ---
@@ -63,40 +65,53 @@ async def _async_console_input(prompt: str) -> str:
return await asyncio.to_thread(input, prompt)
def _init_tts_engine():
    """Return the shared offline TTS engine (pyttsx3), creating it on first use.

    The engine is built once and cached in the module-level ``_TTS_ENGINE``;
    later calls return the cached instance.

    Returns:
        The pyttsx3 engine, or ``None`` when pyttsx3 could not be imported.
    """
    global _TTS_ENGINE
    if _TTS_ENGINE is None and pyttsx3 is not None:
        tts = pyttsx3.init()
        # Prefer a Chinese voice. Voice ids differ between systems, so match
        # loosely against both the id and the display name.
        chinese_markers = ("zh", "chinese", "mandarin")
        for candidate in tts.getProperty("voices"):
            label = f"{candidate.id} {candidate.name}".lower()
            if any(marker in label for marker in chinese_markers):
                tts.setProperty("voice", candidate.id)
                break
        tts.setProperty("rate", 190)
        _TTS_ENGINE = tts
    return _TTS_ENGINE
def _find_audio_player() -> list[str] | None:
"""查找可用播放器,优先 ffplay"""
if shutil.which("ffplay"):
return ["ffplay", "-nodisp", "-autoexit", "-loglevel", "error"]
if shutil.which("mpg123"):
return ["mpg123", "-q"]
if shutil.which("afplay"):
return ["afplay"]
return None
def _speak_blocking(text: str) -> bool:
    """Speak *text* through the offline pyttsx3 engine, blocking until done.

    Args:
        text: Text to speak.

    Returns:
        True once the engine has finished speaking; False when *text* is
        empty or no engine is available.
    """
    if not text:
        return False
    engine = _init_tts_engine()
    if engine is None:
        return False
    engine.say(text)
    engine.runAndWait()
    return True


def _play_audio_file_blocking(audio_path: str, player_cmd: list[str]) -> bool:
    """Play an audio file with an external player, blocking until it exits.

    Args:
        audio_path: Path of the audio file; appended as the last argument.
        player_cmd: Argv prefix of the player, e.g. the value returned by
            ``_find_audio_player()``.

    Returns:
        True when the player exits with status 0; False when it cannot be
        started, exits with a non-zero status, or fails in any other way.
    """
    try:
        subprocess.run(
            [*player_cmd, audio_path],
            check=True,  # a non-zero exit status raises and is reported as False
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
        )
    except Exception:
        # Playback is best-effort: callers only need a success flag.
        return False
    return True
async def _async_speak(text: str) -> bool:
    """Synthesize *text* with edge-tts and play it through a local player.

    The audio is written to a temporary ``.mp3`` file, played in a worker
    thread so the event loop is not blocked, and the file is removed
    afterwards whether or not playback succeeded.

    Args:
        text: Text to speak.

    Returns:
        True when the audio was generated and the player exited
        successfully; False when *text* is blank, edge-tts is not installed,
        no audio player is found, or synthesis/playback fails.
    """
    # Blank text has nothing to speak.
    if not text or not text.strip() or edge_tts is None:
        return False

    player_cmd = _find_audio_player()
    if player_cmd is None:
        return False

    # Only reserve a unique path here; the handle is closed straight away so
    # that edge-tts and the external player can open the file by name.
    with tempfile.NamedTemporaryFile(suffix=".mp3", delete=False) as fp:
        audio_path = fp.name

    try:
        communicate = edge_tts.Communicate(text=text, voice=TTS_VOICE)
        await communicate.save(audio_path)
        return await asyncio.to_thread(_play_audio_file_blocking, audio_path, player_cmd)
    except Exception:
        # Speech output is best-effort: report failure instead of raising.
        return False
    finally:
        try:
            Path(audio_path).unlink(missing_ok=True)
        except OSError:
            # A temp file that cannot be removed must not mask the result.
            pass
def _listen_once_blocking(
@@ -243,7 +258,7 @@ async def start_simulated_head():
return
has_asr = sr is not None
has_tts = pyttsx3 is not None
has_tts = edge_tts is not None
if has_asr and has_tts:
mode_tip = "voice"
else:
@@ -264,7 +279,9 @@ async def start_simulated_head():
print(">>>>>> ⚠️ 未安装 speech_recognition已降级为文本输入。 <<<<<<")
io_mode = "text"
if io_mode == "voice" and not has_tts:
print(">>>>>> ⚠️ 未安装 pyttsx3,将仅文本输出,不播报语音。 <<<<<<")
print(">>>>>> ⚠️ 未安装 edge-tts将仅文本输出不播报语音。 <<<<<<")
if io_mode == "voice" and has_tts and _find_audio_player() is None:
print(">>>>>> ⚠️ 未检测到播放器(ffplay/mpg123/afplay),将仅文本输出。 <<<<<<")
print(
"\n[语音依赖状态] "
@@ -272,7 +289,7 @@ async def start_simulated_head():
f"TTS={'ok' if has_tts else 'missing'}"
)
if not has_asr or not has_tts:
print("可安装: pip install SpeechRecognition pyaudio pyttsx3")
print("可安装: pip install SpeechRecognition pyaudio edge-tts")
visual_context = "视觉输入:用户坐在电脑前,表情平静,看着屏幕。"