52 lines
1.6 KiB
Python
52 lines
1.6 KiB
Python
import asyncio
|
|
|
|
from google import genai
|
|
from google.genai import types as genai_types
|
|
|
|
from ...storage import pcm_to_wav
|
|
|
|
# Voice name looked up per target-language code by
# GeminiClient.get_voice_by_language; languages missing here are rejected there.
VOICE_BY_LANGUAGE: dict[str, str] = {
    "fr": "Kore",
    "es": "Charon",
    "it": "Aoede",
    "de": "Fenrir",
    "en": "Kore",
}
|
|
|
|
|
|
class GeminiClient:
    """Communicate with Google's Gemini LLM.

    Exposes text-to-speech generation and the lookup of the voice
    configured for a target language.
    """

    def __init__(self, api_key: str) -> None:
        """Store the API key used to build an SDK client for each request."""
        self._api_key = api_key

    def get_voice_by_language(self, target_language: str) -> str:
        """Return the voice name configured for ``target_language``.

        Args:
            target_language: Key into ``VOICE_BY_LANGUAGE`` (e.g. ``"fr"``).

        Returns:
            The voice name mapped to that language.

        Raises:
            ValueError: If no voice is configured for the language.
        """
        possible_voice = VOICE_BY_LANGUAGE.get(target_language)
        if not possible_voice:
            raise ValueError(f"No voice found for language: {target_language}")
        return possible_voice

    @staticmethod
    def _extract_pcm(response: "genai_types.GenerateContentResponse") -> bytes:
        """Return the audio payload of the first part of the first candidate.

        Each level of the response is checked explicitly so that an empty or
        audio-less response fails with a descriptive error rather than an
        opaque ``TypeError``/``AttributeError``/``IndexError`` from chained
        attribute access.

        Raises:
            RuntimeError: If the response carries no candidates, no content
                parts, or no inline audio data.
        """
        candidates = response.candidates
        if not candidates:
            raise RuntimeError("Gemini TTS response contained no candidates")

        content = candidates[0].content
        parts = content.parts if content is not None else None
        if not parts:
            raise RuntimeError("Gemini TTS response contained no content parts")

        inline_data = parts[0].inline_data
        if inline_data is None or inline_data.data is None:
            raise RuntimeError("Gemini TTS response contained no audio data")

        return inline_data.data

    async def generate_audio(self, text: str, voice: str) -> bytes:
        """Generate TTS audio and return WAV bytes.

        The SDK call is synchronous, so it is executed in a worker thread via
        ``asyncio.to_thread`` to keep the event loop free while it runs.

        Args:
            text: Content sent to the model to be spoken.
            voice: Prebuilt voice name passed to the speech configuration.

        Returns:
            The result of ``pcm_to_wav`` applied to the audio payload
            returned by the model.

        Raises:
            RuntimeError: If the response does not contain audio data.
        """

        def _call() -> bytes:
            client = genai.Client(api_key=self._api_key)
            response = client.models.generate_content(
                model="gemini-2.5-flash-preview-tts",
                contents=text,
                config=genai_types.GenerateContentConfig(
                    response_modalities=["AUDIO"],
                    speech_config=genai_types.SpeechConfig(
                        voice_config=genai_types.VoiceConfig(
                            prebuilt_voice_config=genai_types.PrebuiltVoiceConfig(
                                voice_name=voice,
                            )
                        )
                    ),
                ),
            )
            return pcm_to_wav(self._extract_pcm(response))

        return await asyncio.to_thread(_call)
|