107 lines
4 KiB
Python
107 lines
4 KiB
Python
import logging
|
|
import re
|
|
import uuid
|
|
|
|
from opentelemetry.trace import get_tracer
|
|
from sqlalchemy.ext.asyncio import AsyncSession
|
|
|
|
from app.outbound.postgres.repositories.article_repository import ArticleRepository
|
|
|
|
from ...languages import SUPPORTED_LANGUAGES
|
|
from ...outbound.anthropic.anthropic_client import AnthropicClient
|
|
from ...outbound.deepgram.deepgram_client import LocalDeepgramClient
|
|
from ...outbound.deepl.deepl_client import DeepLClient
|
|
from ...outbound.gemini.gemini_client import GeminiClient
|
|
from ...outbound.spacy.spacy_client import SpacyClient
|
|
from ...outbound.storage_client import get_storage_client
|
|
|
|
# Module-level logger, named after this module via __name__.
logger = logging.getLogger(__name__)
|
|
|
|
# OpenTelemetry tracer for this module; used to open the spans in
# SummariseService.summarise_article.
tracer = get_tracer(__name__)
|
|
|
|
|
|
class SummariseService:
    """Generate a summarised article and persist its derived artefacts.

    ``summarise_article`` asks the Anthropic client for a summary, splits it
    into title and body, stores both, stores part-of-speech data produced by
    the spaCy client, and finally stores the key of an audio rendering
    produced by the Gemini client and uploaded through the storage client.

    The Deepgram and DeepL clients are accepted and kept on the instance but
    are not used by any method visible in this class.
    """

    def __init__(
        self,
        anthropic_client: AnthropicClient,
        deepgram_client: LocalDeepgramClient,
        deepl_client: DeepLClient,
        gemini_client: GeminiClient,
        spacy_client: SpacyClient,
        article_repository: ArticleRepository,
    ) -> None:
        """Store the collaborating clients and the article repository."""
        self.anthropic_client = anthropic_client
        self.deepgram_client = deepgram_client
        self.deepl_client = deepl_client
        self.gemini_client = gemini_client
        self.spacy_client = spacy_client
        self.article_repository = article_repository

    def _split_title_and_body(self, text: str) -> tuple[str, str]:
        """Split generated text into ``(title, body)``.

        The first non-blank line is the title; leading ``#`` characters
        (Markdown heading markers) and surrounding whitespace are removed
        from it. Everything after that line, stripped of surrounding
        whitespace, is the body.

        Args:
            text: The full generated text.

        Returns:
            A ``(title, body)`` tuple. Both are empty strings when ``text``
            is empty or contains only whitespace.
        """
        # Strip first so leading blank lines do not yield an empty title and
        # push the real heading into the body.
        lines = text.strip().splitlines()
        if not lines:
            return "", ""

        title = lines[0].lstrip("#").strip()
        body = "\n".join(lines[1:]).strip()
        return title, body

    async def summarise_article(
        self,
        article_id: uuid.UUID,
        target_language: str,
        complexity_level: str,
        input_text: str,
    ) -> None:
        """Summarise ``input_text`` and persist the results for ``article_id``.

        Runs three traced stages in order, each writing to the repository:
        title and text, linguistic data, then audio key.

        Args:
            article_id: Identifier of the article row to update.
            target_language: Key into ``SUPPORTED_LANGUAGES``; also passed to
                the spaCy and Gemini clients.
            complexity_level: Forwarded unchanged to the Anthropic client.
            input_text: The content to summarise.

        Raises:
            RuntimeError: If the Anthropic client returns ``None``.
            Exception: Any other failure (for example a failed
                ``SUPPORTED_LANGUAGES`` lookup or a client error) is logged
                and re-raised unchanged.
        """
        logger.info(
            "Summarising article %s with target language %s and complexity level %s...",
            article_id,
            target_language,
            complexity_level,
        )
        with tracer.start_as_current_span("summarise_article"):
            try:
                with tracer.start_as_current_span("generate_title_and_text"):
                    language_name = SUPPORTED_LANGUAGES[target_language]

                    generated_text = await AnthropicClient.retry(
                        self.anthropic_client.create_summary_article,
                        content_to_summarise=input_text,
                        complexity_level=complexity_level,
                        to_language=language_name,
                        length_preference="200-400 words",
                    )

                    if generated_text is None:
                        # A bare ``raise`` here has no active exception and
                        # surfaces as an opaque RuntimeError; raise the same
                        # type explicitly with a useful message instead.
                        raise RuntimeError(
                            f"No text was generated for article {article_id}"
                        )

                    generated_title, generated_text_without_title = (
                        self._split_title_and_body(generated_text)
                    )

                    await self.article_repository.update_title_and_text(
                        article_id, generated_title, generated_text_without_title
                    )

                with tracer.start_as_current_span("generate_linguistic_data"):
                    text_linguistic_data = self.spacy_client.get_parts_of_speech(
                        generated_text_without_title, target_language
                    )

                    await self.article_repository.update_linguistic_data(
                        article_id, text_linguistic_data
                    )

                with tracer.start_as_current_span("generate_voice"):
                    voice = self.gemini_client.get_voice_by_language(target_language)
                    # NOTE(review): the full generated text, including the
                    # title line, is sent for audio generation - confirm this
                    # is intended rather than the body only.
                    wav_bytes = await self.gemini_client.generate_audio(
                        generated_text, voice
                    )
                    audio_key = f"audio/{article_id}.wav"
                    get_storage_client().upload(audio_key, wav_bytes)

                    await self.article_repository.update_audio_key(
                        article_id, audio_key
                    )

            except Exception:
                # Log with traceback, then re-raise the original exception.
                logger.exception("Failed to summarise article %s", article_id)
                raise
|