import base64

import edge_tts
from faster_whisper import WhisperModel

from . import config


class SpeechService:
    """Speech-to-text with a local Whisper model and text-to-speech with edge-tts.

    All tunables (model name, device, compute type, beam size, language and
    TTS voice) are read from the package-level ``config`` module.
    """

    def __init__(self) -> None:
        """Load the Whisper model, printing progress messages before and after.

        The progress message warns the user that the first start may need to
        download the model.
        """
        print("⏳ 正在加载本地语音识别模型 (首次启动可能需要下载)...")
        self._whisper_model = WhisperModel(
            config.WHISPER_MODEL_NAME,
            device=config.WHISPER_DEVICE,
            compute_type=config.WHISPER_COMPUTE_TYPE,
        )
        print("✅ 本地语音模型加载完毕!")

    def transcribe(self, audio_path: str) -> str:
        """Transcribe the audio file at ``audio_path`` and return its text.

        Args:
            audio_path: Path of the audio file handed to the Whisper model.

        Returns:
            The text of every recognised segment, concatenated in order with
            no separator.
        """
        # The second element returned by the model is not needed here.
        segments, _ = self._whisper_model.transcribe(
            audio_path,
            beam_size=config.WHISPER_BEAM_SIZE,
            language=config.WHISPER_LANGUAGE,
        )
        return "".join(segment.text for segment in segments)

    async def synthesize_audio_data_url(self, text: str) -> str:
        """Synthesize ``text`` to speech and return it as a base64 data URL.

        Args:
            text: The text to speak, using the voice in ``config.TTS_VOICE``.

        Returns:
            A string of the form ``data:audio/mp3;base64,<payload>`` whose
            payload is the base64 encoding of all streamed audio bytes.
        """
        communicate = edge_tts.Communicate(text, config.TTS_VOICE)
        # Collect the audio chunks and join them once: repeated ``bytes +=``
        # copies the whole buffer on each chunk. Non-audio chunks are skipped.
        audio_chunks = [
            chunk["data"]
            async for chunk in communicate.stream()
            if chunk["type"] == "audio"
        ]
        audio_b64 = base64.b64encode(b"".join(audio_chunks)).decode("utf-8")
        return f"data:audio/mp3;base64,{audio_b64}"