36 lines
1.2 KiB
Python
36 lines
1.2 KiB
Python
import base64
|
|
|
|
import edge_tts
|
|
from faster_whisper import WhisperModel
|
|
|
|
from . import config
|
|
|
|
|
|
class SpeechService:
    """Speech-to-text and text-to-speech helper.

    Speech recognition uses a ``faster_whisper.WhisperModel`` that is loaded
    once in ``__init__``; speech synthesis uses ``edge_tts.Communicate``.
    All tunables (model name, device, compute type, beam size, language and
    TTS voice) are read from the package-level ``config`` module.
    """

    def __init__(self) -> None:
        """Load the Whisper model described by ``config``.

        Progress messages are printed before and after the load.
        """
        print("⏳ 正在加载本地语音识别模型 (首次启动可能需要下载)...")
        self._whisper_model = WhisperModel(
            config.WHISPER_MODEL_NAME,
            device=config.WHISPER_DEVICE,
            compute_type=config.WHISPER_COMPUTE_TYPE,
        )
        print("✅ 本地语音模型加载完毕!")

    def transcribe(self, audio_path: str) -> str:
        """Transcribe an audio file and return the recognised text.

        Args:
            audio_path: Audio input handed unchanged to
                ``WhisperModel.transcribe``.

        Returns:
            The ``text`` of every returned segment, concatenated in order
            with no separator.
        """
        segments, _ = self._whisper_model.transcribe(
            audio_path,
            beam_size=config.WHISPER_BEAM_SIZE,
            language=config.WHISPER_LANGUAGE,
        )
        # NOTE(review): faster-whisper documents ``segments`` as a lazy
        # generator, so the decoding work presumably happens while this
        # join consumes it -- confirm against the installed version.
        return "".join(segment.text for segment in segments)

    async def synthesize_audio_data_url(self, text: str) -> str:
        """Synthesize ``text`` to speech and return it as a base64 data URL.

        Args:
            text: The text to speak, using the voice ``config.TTS_VOICE``.

        Returns:
            A string of the form ``data:audio/mp3;base64,<payload>`` where
            the payload is the base64 encoding of every ``"audio"`` chunk
            produced by the stream, in arrival order.
        """
        communicate = edge_tts.Communicate(text, config.TTS_VOICE)

        # Gather the audio chunks and join them once. ``bytes`` objects are
        # immutable, so growing one with ``+=`` inside the loop re-copies
        # everything accumulated so far on every chunk (quadratic).
        # Non-audio chunks emitted by the stream are skipped.
        audio_chunks = [
            chunk["data"]
            async for chunk in communicate.stream()
            if chunk["type"] == "audio"
        ]
        audio_data = b"".join(audio_chunks)

        audio_b64 = base64.b64encode(audio_data).decode("utf-8")
        # NOTE(review): the registered MIME type for MP3 is "audio/mpeg";
        # "audio/mp3" is kept because consumers may match on this exact
        # prefix.
        return f"data:audio/mp3;base64,{audio_b64}"
|