# vtb/server/speech.py

import base64

import edge_tts
from faster_whisper import WhisperModel

from . import config


class SpeechService:
    """Speech helpers: local Whisper transcription and edge-tts synthesis.

    All tunables (model name, device, compute type, beam size, language and
    TTS voice) are read from the package's ``config`` module.
    """

    def __init__(self) -> None:
        """Instantiate the Whisper model configured in ``config``.

        Prints a progress message before and after the model is constructed.
        """
        print("⏳ 正在加载本地语音识别模型 (首次启动可能需要下载)...")
        self._whisper_model = WhisperModel(
            config.WHISPER_MODEL_NAME,
            device=config.WHISPER_DEVICE,
            compute_type=config.WHISPER_COMPUTE_TYPE,
        )
        print("✅ 本地语音模型加载完毕！")

    def transcribe(self, audio_path: str) -> str:
        """Transcribe the audio file at *audio_path* and return its text.

        The text of every segment returned by the model is concatenated
        without a separator. This method is synchronous.

        Args:
            audio_path: Path of the audio file handed to the Whisper model.

        Returns:
            The concatenated segment texts (empty string if there are no
            segments).
        """
        # The second element of the returned pair is not needed here.
        segments, _ = self._whisper_model.transcribe(
            audio_path,
            beam_size=config.WHISPER_BEAM_SIZE,
            language=config.WHISPER_LANGUAGE,
        )
        return "".join(segment.text for segment in segments)

    async def synthesize_audio_data_url(self, text: str) -> str:
        """Synthesize *text* to speech and return it as a base64 data URL.

        Args:
            text: The text to speak, using the voice ``config.TTS_VOICE``.

        Returns:
            A string of the form ``data:audio/mp3;base64,<payload>`` where
            the payload is the base64 encoding of all streamed audio chunks.
        """
        communicate = edge_tts.Communicate(text, config.TTS_VOICE)

        # Collect chunks and join once: repeated ``bytes +=`` re-copies the
        # accumulated buffer on every chunk.
        audio_chunks = [
            chunk["data"]
            async for chunk in communicate.stream()
            # The stream also yields non-audio chunk types; only audio
            # payloads belong in the output.
            if chunk["type"] == "audio"
        ]

        audio_b64 = base64.b64encode(b"".join(audio_chunks)).decode("utf-8")
        return f"data:audio/mp3;base64,{audio_b64}"