feat: Separate job orchestration from translated article content
This commit is contained in:
parent
c0539bcf59
commit
8252b6fcf0
16 changed files with 431 additions and 341 deletions
@@ -0,0 +1,74 @@
+"""separate job orchestration from article content
+
+Revision ID: 0006
+Revises: 0005
+Create Date: 2026-03-29
+
+"""
+from typing import Sequence, Union
+
+import sqlalchemy as sa
+from alembic import op
+from sqlalchemy.dialects import postgresql
+
+revision: str = "0006"
+down_revision: Union[str, None] = "0005"
+branch_labels: Union[str, Sequence[str], None] = None
+depends_on: Union[str, Sequence[str], None] = None
+
+
+def upgrade() -> None:
+    # Make article content fields nullable — they are now filled in step-by-step
+    op.alter_column("translated_articles", "source_title", nullable=True)
+    op.alter_column("translated_articles", "source_body", nullable=True)
+    op.alter_column("translated_articles", "target_title", nullable=True)
+    op.alter_column("translated_articles", "target_body", nullable=True)
+
+    # Add source_body_pos to translated_articles
+    op.add_column(
+        "translated_articles",
+        sa.Column("source_body_pos", postgresql.JSONB(), nullable=True),
+    )
+
+    # Add FK from jobs to the article they produce
+    op.add_column(
+        "jobs",
+        sa.Column(
+            "translated_article_id",
+            postgresql.UUID(as_uuid=True),
+            sa.ForeignKey("translated_articles.id"),
+            nullable=True,
+        ),
+    )
+
+    # Drop content columns that now live on translated_articles
+    op.drop_column("jobs", "source_language")
+    op.drop_column("jobs", "target_language")
+    op.drop_column("jobs", "complexity_level")
+    op.drop_column("jobs", "input_summary")
+    op.drop_column("jobs", "generated_text")
+    op.drop_column("jobs", "translated_text")
+    op.drop_column("jobs", "audio_url")
+    op.drop_column("jobs", "source_pos_data")
+    op.drop_column("jobs", "target_pos_data")
+    op.drop_column("jobs", "audio_transcript")
+
+
+def downgrade() -> None:
+    op.add_column("jobs", sa.Column("audio_transcript", postgresql.JSONB(), nullable=True))
+    op.add_column("jobs", sa.Column("target_pos_data", postgresql.JSONB(), nullable=True))
+    op.add_column("jobs", sa.Column("source_pos_data", postgresql.JSONB(), nullable=True))
+    op.add_column("jobs", sa.Column("audio_url", sa.Text(), nullable=True))
+    op.add_column("jobs", sa.Column("translated_text", sa.Text(), nullable=True))
+    op.add_column("jobs", sa.Column("generated_text", sa.Text(), nullable=True))
+    op.add_column("jobs", sa.Column("input_summary", sa.Text(), nullable=True))
+    op.add_column("jobs", sa.Column("complexity_level", sa.String(5), nullable=False, server_default="B1"))
+    op.add_column("jobs", sa.Column("target_language", sa.String(10), nullable=False, server_default="fr"))
+    op.add_column("jobs", sa.Column("source_language", sa.String(10), nullable=False, server_default="en"))
+
+    op.drop_column("jobs", "translated_article_id")
+
+    op.alter_column("translated_articles", "target_body", nullable=False)
+    op.alter_column("translated_articles", "target_title", nullable=False)
+    op.alter_column("translated_articles", "source_body", nullable=False)
+    op.alter_column("translated_articles", "source_title", nullable=False)
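To apply or roll back this revision, the usual Alembic flow applies. A minimal sketch using Alembic's Python command API (the alembic.ini path is an assumption about this repo's layout; the CLI equivalents are `alembic upgrade head` and `alembic downgrade 0005`):

from alembic import command
from alembic.config import Config

cfg = Config("alembic.ini")     # assumed location of the project's Alembic config
command.upgrade(cfg, "0006")    # apply this revision
command.downgrade(cfg, "0005")  # revert to the previous revision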
@@ -7,17 +7,8 @@ class SummariseJob:
     id: str
     user_id: str
     status: str
-    source_language: str
-    target_language: str
-    complexity_level: str
-    input_summary: str
-    generated_text: str
-    translated_text: str
-    error_message: str
-    audio_url: str
-    source_pos_data: dict | None
-    target_pos_data: dict | None
-    audio_transcript: dict | None
+    translated_article_id: str | None
+    error_message: str | None
     created_at: datetime
     started_at: datetime | None = None
     completed_at: datetime | None = None
@@ -7,12 +7,14 @@ class TranslatedArticle:
     id: str
     published_at: datetime
     source_language: str
-    source_title: str
-    source_body: str
     target_language: str
     target_complexities: list[str]
-    target_title: str
-    target_body: str
+    # Content fields — filled in step-by-step during generation
+    source_title: str | None
+    source_body: str | None
+    source_body_pos: dict | None
+    target_title: str | None
+    target_body: str | None
     audio_url: str | None
     target_body_pos: dict | None
     target_body_transcript: dict | None
@@ -1,19 +0,0 @@
-import re
-from sqlalchemy.ext.asyncio import AsyncSession
-
-from ..models.summarise_job import SummariseJob
-from ..models.translated_article import TranslatedArticle
-from ...outbound.postgres.repositories.translated_article_repository import TranslatedArticleRepository
-
-def first_heading(md: str) -> str | None:
-    m = re.search(r'^#{1,2}\s+(.+)', md, re.MULTILINE)
-    return m.group(1).strip() if m else None
-
-class ArticleService:
-    def __init__(self, db: AsyncSession) -> None:
-        self.translated_articles_repository = TranslatedArticleRepository(db)
-
-    async def get_all_articles(self, target_language: str) -> list[TranslatedArticle]:
-        """Fetch all translated articles"""
-        articles = await self.translated_articles_repository.list_all(target_language)
-        return articles
149 api/app/domain/services/summarise_service.py Normal file
@@ -0,0 +1,149 @@
+import asyncio
+import random
+import re
+import uuid
+from typing import Any, Callable, Coroutine
+
+import anthropic
+from sqlalchemy.ext.asyncio import AsyncSession
+
+from ...outbound.postgres.repositories import summarise_job_repository
+from ...outbound.postgres.repositories.translated_article_repository import TranslatedArticleRepository
+from ...outbound.anthropic.anthropic_client import AnthropicClient
+from ...outbound.deepgram.deepgram_client import LocalDeepgramClient
+from ...outbound.deepl.deepl_client import DeepLClient
+from ...outbound.gemini.gemini_client import GeminiClient
+from ...outbound.spacy.spacy_client import SpacyClient
+from ...storage import upload_audio
+from ...languages import SUPPORTED_LANGUAGES
+
+
+_ANTHROPIC_RETRYABLE = (
+    anthropic.RateLimitError,
+    anthropic.InternalServerError,
+    anthropic.APITimeoutError,
+    anthropic.APIConnectionError,
+)
+_MAX_RETRIES = 4
+_BASE_DELAY = 1.0
+_MAX_DELAY = 60.0
+
+
+async def _anthropic_with_backoff(
+    coro_fn: Callable[..., Coroutine[Any, Any, Any]],
+    *args: Any,
+    **kwargs: Any,
+) -> Any:
+    for attempt in range(_MAX_RETRIES + 1):
+        try:
+            return await coro_fn(*args, **kwargs)
+        except _ANTHROPIC_RETRYABLE as exc:
+            if attempt == _MAX_RETRIES:
+                raise
+            retry_after: float | None = None
+            if isinstance(exc, anthropic.RateLimitError):
+                raw = exc.response.headers.get("retry-after")
+                if raw is not None:
+                    retry_after = float(raw)
+            if retry_after is None:
+                retry_after = min(_BASE_DELAY * (2 ** attempt), _MAX_DELAY)
+            jittered = retry_after * (0.8 + random.random() * 0.4)
+            await asyncio.sleep(jittered)
+
+
+class SummariseService:
+    def __init__(
+        self,
+        anthropic_client: AnthropicClient,
+        deepgram_client: LocalDeepgramClient,
+        deepl_client: DeepLClient,
+        gemini_client: GeminiClient,
+        spacy_client: SpacyClient,
+    ) -> None:
+        self.anthropic_client = anthropic_client
+        self.deepgram_client = deepgram_client
+        self.deepl_client = deepl_client
+        self.gemini_client = gemini_client
+        self.spacy_client = spacy_client
+
+    def _first_heading(self, md: str) -> str | None:
+        m = re.search(r'^#{1,2}\s+(.+)', md, re.MULTILINE)
+        return m.group(1).strip() if m else None
+
+    def _split_title_and_body(self, text: str) -> tuple[str, str]:
+        """Splits the text into a title (first heading) and body (the rest)."""
+        title = self._first_heading(text) or ""
+        # Strip the whole heading line, not just len(title) characters, so the
+        # "# " prefix does not leak into the body (assumes the heading is line 1).
+        if title:
+            body = text.split("\n", 1)[1].lstrip() if "\n" in text else ""
+        else:
+            body = text
+        if title == "":
+            title = "Untitled Article"
+
+        return title, body
+
+    async def run(
+        self,
+        db: AsyncSession,
+        job_id: uuid.UUID,
+        article_id: uuid.UUID,
+        source_language: str,
+        target_language: str,
+        complexity_level: str,
+        input_texts: list[str],
+    ) -> None:
+        article_repo = TranslatedArticleRepository(db)
+        job = await summarise_job_repository.get_by_id(db, job_id)
+        await summarise_job_repository.mark_processing(db, job)
+
+        try:
+            language_name = SUPPORTED_LANGUAGES[target_language]
+            source_material = "\n\n".join(input_texts[:3])
+
+            generated_text = await _anthropic_with_backoff(
+                self.anthropic_client.generate_summary_text,
+                content_to_summarise=source_material,
+                complexity_level=complexity_level,
+                from_language=language_name,
+                to_language=language_name,
+                length_preference="200-400 words",
+            )
+
+            generated_title, generated_text_without_title = self._split_title_and_body(generated_text)
+
+            await article_repo.update_content(
+                article_id,
+                target_title=generated_title,
+                target_body=generated_text_without_title,
+                source_title="",
+                source_body="",
+            )
+
+            translated_text = await self.deepl_client.translate(generated_text, source_language)
+
+            translated_title, translated_text_without_title = self._split_title_and_body(translated_text)
+
+            await article_repo.update_content(
+                article_id,
+                target_title=generated_title,
+                target_body=generated_text_without_title,
+                source_title=translated_title,
+                source_body=translated_text_without_title,
+            )
+
+            target_pos_data = self.spacy_client.get_parts_of_speech(generated_text_without_title, target_language)
+            source_pos_data = self.spacy_client.get_parts_of_speech(translated_text_without_title, source_language)
+
+            await article_repo.update_pos(article_id, target_pos_data, source_pos_data)
+
+            voice = self.gemini_client.get_voice_by_language(target_language)
+            wav_bytes = await self.gemini_client.generate_audio(generated_text, voice)
+            audio_key = f"audio/{job_id}.wav"
+            upload_audio(audio_key, wav_bytes)
+
+            transcript = await self.deepgram_client.transcribe_bytes(wav_bytes, target_language)
+
+            await article_repo.update_audio(article_id, audio_key, transcript)
+
+            await summarise_job_repository.mark_succeeded(db, job)
+
+        except Exception as exc:
+            await summarise_job_repository.mark_failed(db, job, str(exc))
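A quick usage sketch of the backoff helper above: pass the coroutine function itself, not an awaited call, so each retry invokes the Anthropic client afresh. The argument values below are illustrative, not from the repo:

# Illustrative only; assumes `service` is a constructed SummariseService.
text = await _anthropic_with_backoff(
    service.anthropic_client.generate_summary_text,  # passed uncalled
    content_to_summarise="Some source text",
    complexity_level="B1",
    from_language="French",
    to_language="French",
    length_preference="200-400 words",
)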
@ -19,15 +19,15 @@ class AnthropicClient():
|
||||||
) -> str:
|
) -> str:
|
||||||
return (
|
return (
|
||||||
f"You are a language learning content creator.\n"
|
f"You are a language learning content creator.\n"
|
||||||
f"You generate markdown summaries of user-provided content.\n"
|
f"You generate original, level-appropriate content from a source.\n"
|
||||||
|
f"The content will be spoken aloud in {to_language}, write it accordingly.\n"
|
||||||
f"You will provide content in {to_language} at {complexity_level} proficiency level on the CEFR scale.\n"
|
f"You will provide content in {to_language} at {complexity_level} proficiency level on the CEFR scale.\n"
|
||||||
f"The text you generate will:\n"
|
f"The text you generate will:\n"
|
||||||
f"- Contain ONLY the generated summary text in {to_language}.\n"
|
f"- Contain ONLY the generated summary text in {to_language}.\n"
|
||||||
f"- Never contain inappropriate (hateful, sexual, violent) content. It is preferable to return no text than to generate such content.\n"
|
|
||||||
f"- Speak directly to the reader/listener, adopting the tone and style of a semi-formal news reporter or podcaster.\n"
|
f"- Speak directly to the reader/listener, adopting the tone and style of a semi-formal news reporter or podcaster.\n"
|
||||||
f"- Where appropriate (fluency level, content), use a small number of idiomatic expressions.\n"
|
f"- Occasionally, where natural, include idiomatic expressions appropriate to {complexity_level} level.\n"
|
||||||
f"- Where appropriate use at least one additional tense, beyond the default of the content.\n"
|
f"- Vary tense usage naturally — do not restrict the piece to a single tense.\n"
|
||||||
f"- Be formatted in markdown. Contain a single level 1 header (#) at the top, followed by paragraphs and line breaks.\n"
|
f"- Contain only plain text. The piece should start with a title prefaced like a level-1 markdown title (#), but all other text should be plain. \n"
|
||||||
f"- Be around {length_preference} long.\n"
|
f"- Be around {length_preference} long.\n"
|
||||||
f"- Be inspired by the content, but not the tone, of the source material."
|
f"- Be inspired by the content, but not the tone, of the source material."
|
||||||
)
|
)
|
||||||
|
|
|
||||||
|
|
@@ -1,4 +1,5 @@
-import asyncio
+import json
 
 from deepgram import (
     AsyncDeepgramClient,
 )

@@ -15,7 +16,7 @@ class LocalDeepgramClient:
             utterances=True,
             smart_format=True,
         )
-        return response.results.json()
+        return json.loads(response.results.json())
 
     async def transcribe_local_file(self, local_file_path: str, language_code: str) -> dict:
         with open(local_file_path, "rb") as audio_file:
@@ -3,10 +3,11 @@ from datetime import datetime, timezone
 
 from sqlalchemy import String, Text, DateTime, ForeignKey
 from sqlalchemy.orm import Mapped, mapped_column
-from sqlalchemy.dialects.postgresql import UUID, JSONB
+from sqlalchemy.dialects.postgresql import UUID
 
 from ..database import Base
 
 
 class SummariseJobEntity(Base):
     __tablename__ = "jobs"

@@ -17,17 +18,10 @@ class SummariseJobEntity(Base):
         UUID(as_uuid=True), ForeignKey("users.id"), nullable=True, index=True
     )
     status: Mapped[str] = mapped_column(String(20), nullable=False, default="pending")
-    source_language: Mapped[str] = mapped_column(String(10), nullable=False, default="en")
-    target_language: Mapped[str] = mapped_column(String(10), nullable=False)
-    complexity_level: Mapped[str] = mapped_column(String(5), nullable=False)
-    input_summary: Mapped[str | None] = mapped_column(Text, nullable=True)
-    generated_text: Mapped[str | None] = mapped_column(Text, nullable=True)
-    translated_text: Mapped[str | None] = mapped_column(Text, nullable=True)
+    translated_article_id: Mapped[uuid.UUID | None] = mapped_column(
+        UUID(as_uuid=True), ForeignKey("translated_articles.id"), nullable=True
+    )
     error_message: Mapped[str | None] = mapped_column(Text, nullable=True)
-    audio_url: Mapped[str | None] = mapped_column(Text, nullable=True)
-    source_pos_data: Mapped[dict | None] = mapped_column(JSONB, nullable=True)
-    target_pos_data: Mapped[dict | None] = mapped_column(JSONB, nullable=True)
-    audio_transcript: Mapped[dict | None] = mapped_column(JSONB, nullable=True)
     created_at: Mapped[datetime] = mapped_column(
         DateTime(timezone=True),
         default=lambda: datetime.now(timezone.utc),
@@ -19,12 +19,14 @@ class TranslatedArticleEntity(Base):
         default=lambda: datetime.now(timezone.utc),
     )
     source_language: Mapped[str] = mapped_column(String(10), nullable=False)
-    source_title: Mapped[str] = mapped_column(Text, nullable=False)
-    source_body: Mapped[str] = mapped_column(Text, nullable=False)
     target_language: Mapped[str] = mapped_column(String(10), nullable=False)
     target_complexities: Mapped[list[str]] = mapped_column(ARRAY(String(5)), nullable=False)
-    target_title: Mapped[str] = mapped_column(Text, nullable=False)
-    target_body: Mapped[str] = mapped_column(Text, nullable=False)
+    # Content fields — nullable, filled in step-by-step during generation
+    source_title: Mapped[str | None] = mapped_column(Text, nullable=True)
+    source_body: Mapped[str | None] = mapped_column(Text, nullable=True)
+    source_body_pos: Mapped[dict | None] = mapped_column(JSONB, nullable=True)
+    target_title: Mapped[str | None] = mapped_column(Text, nullable=True)
+    target_body: Mapped[str | None] = mapped_column(Text, nullable=True)
     audio_url: Mapped[str | None] = mapped_column(Text, nullable=True)
     target_body_pos: Mapped[dict | None] = mapped_column(JSONB, nullable=True)
     target_body_transcript: Mapped[dict | None] = mapped_column(JSONB, nullable=True)
@@ -7,69 +7,32 @@ from sqlalchemy.ext.asyncio import AsyncSession
 from ..entities.summarise_job_entity import SummariseJobEntity
 from ....domain.models.summarise_job import SummariseJob
 
-class PostgresSummariseJobRepository:
-    def __init__(self, db: AsyncSession):
-        self.db = db
-
-    async def list_all(self) -> list[SummariseJob]:
-        result = self.db.execute(
-            select(SummariseJobEntity).order_by(SummariseJobEntity.created_at.desc())
-        )
-
-        return list(result.scalars().all()).map(self.entity_to_model)
-
-    async def get_by_audio_url(
-        self,
-        audio_url: str
-    ) -> SummariseJob | None:
-        result = await self.db.execute(
-            select(SummariseJobEntity).where(
-                SummariseJobEntity.audio_url == audio_url
-            )
-        )
-
-        return self.entity_to_model(result.scalar_one_or_none())
-
-    def entity_to_model(self, entity: SummariseJobEntity | None) -> SummariseJob:
-        if entity is None:
-            return None
-
-        return SummariseJob(
-            id=str(entity.id),
-            user_id=str(entity.user_id),
-            status=entity.status,
-            source_language=entity.source_language,
-            target_language=entity.target_language,
-            complexity_level=entity.complexity_level,
-            input_summary=entity.input_summary,
-            generated_text=entity.generated_text,
-            translated_text=entity.translated_text,
-            error_message=entity.error_message,
-            audio_url=entity.audio_url,
-            source_pos_data=entity.source_pos_data,
-            target_pos_data=entity.target_pos_data,
-            audio_transcript=entity.audio_transcript,
-            created_at=entity.created_at,
-            started_at=entity.started_at,
-            completed_at=entity.completed_at,
-        )
-
-async def update(db: AsyncSession, job: SummariseJobEntity) -> None:
+def _to_model(entity: SummariseJobEntity) -> SummariseJob:
+    return SummariseJob(
+        id=str(entity.id),
+        user_id=str(entity.user_id),
+        status=entity.status,
+        translated_article_id=str(entity.translated_article_id) if entity.translated_article_id else None,
+        error_message=entity.error_message,
+        created_at=entity.created_at,
+        started_at=entity.started_at,
+        completed_at=entity.completed_at,
+    )
+
+
+async def _commit(db: AsyncSession) -> None:
     await db.commit()
 
 
 async def create(
     db: AsyncSession,
     user_id: uuid.UUID,
-    source_language: str,
-    target_language: str,
-    complexity_level: str,
+    translated_article_id: uuid.UUID,
 ) -> SummariseJobEntity:
     job = SummariseJobEntity(
         user_id=user_id,
-        source_language=source_language,
-        target_language=target_language,
-        complexity_level=complexity_level,
+        translated_article_id=translated_article_id,
     )
     db.add(job)
     await db.commit()
@@ -92,58 +55,17 @@ async def mark_processing(db: AsyncSession, job: SummariseJobEntity) -> None:
     job.status = "processing"
     job.started_at = datetime.now(timezone.utc)
     job.error_message = None
-    await update(db, job)
-
-
-async def save_generated_text(
-    db: AsyncSession,
-    job: SummariseJobEntity,
-    generated_text: str,
-    input_summary: str,
-) -> None:
-    job.generated_text = generated_text
-    job.input_summary = input_summary
-    await update(db, job)
-
-
-async def save_translated_text(
-    db: AsyncSession,
-    job: SummariseJobEntity,
-    translated_text: str,
-) -> None:
-    job.translated_text = translated_text
-    await update(db, job)
-
-
-async def save_pos_data(
-    db: AsyncSession,
-    job: SummariseJobEntity,
-    source_pos_data: dict,
-    target_pos_data: dict,
-) -> None:
-    job.source_pos_data = source_pos_data
-    job.target_pos_data = target_pos_data
-    await update(db, job)
-
-
-async def save_audio_transcript(
-    db: AsyncSession,
-    job: SummariseJobEntity,
-    audio_transcript: dict,
-) -> None:
-    job.audio_transcript = audio_transcript
-    await update(db, job)
-
-
-async def mark_succeeded(db: AsyncSession, job: SummariseJobEntity, audio_url: str) -> None:
+    await _commit(db)
+
+
+async def mark_succeeded(db: AsyncSession, job: SummariseJobEntity) -> None:
     job.status = "succeeded"
-    job.audio_url = audio_url
     job.completed_at = datetime.now(timezone.utc)
-    await update(db, job)
+    await _commit(db)
 
 
 async def mark_failed(db: AsyncSession, job: SummariseJobEntity, error: str) -> None:
     job.status = "failed"
     job.error_message = error
     job.completed_at = datetime.now(timezone.utc)
-    await update(db, job)
+    await _commit(db)
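Taken together, these module-level helpers now cover only orchestration state; content persistence happens on the article. A sketch of the intended lifecycle (names from this diff, control flow illustrative):

job = await create(db, user_id=user_id, translated_article_id=article_id)
await mark_processing(db, job)
try:
    # ... generation steps write to translated_articles, not to the job ...
    await mark_succeeded(db, job)
except Exception as exc:
    await mark_failed(db, job, str(exc))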
@@ -5,6 +5,7 @@ from sqlalchemy import select
 from sqlalchemy.ext.asyncio import AsyncSession
 
 from ..entities.translated_article_entity import TranslatedArticleEntity
+from ..entities.summarise_job_entity import SummariseJobEntity
 from ....domain.models.translated_article import TranslatedArticle
@@ -17,10 +18,11 @@ class TranslatedArticleRepository:
             id=str(entity.id),
             published_at=entity.published_at,
             source_language=entity.source_language,
-            source_title=entity.source_title,
-            source_body=entity.source_body,
             target_language=entity.target_language,
             target_complexities=list(entity.target_complexities),
+            source_title=entity.source_title,
+            source_body=entity.source_body,
+            source_body_pos=entity.source_body_pos,
             target_title=entity.target_title,
             target_body=entity.target_body,
             audio_url=entity.audio_url,
@@ -31,42 +33,93 @@ class TranslatedArticleRepository:
     async def create(
         self,
         source_language: str,
-        source_title: str,
-        source_body: str,
         target_language: str,
         target_complexities: list[str],
-        target_title: str,
-        target_body: str,
-        audio_url: str | None,
-        target_body_pos: dict | None,
-        target_body_transcript: dict | None,
     ) -> TranslatedArticle:
         entity = TranslatedArticleEntity(
             published_at=datetime.now(timezone.utc),
             source_language=source_language,
-            source_title=source_title,
-            source_body=source_body,
             target_language=target_language,
             target_complexities=target_complexities,
-            target_title=target_title,
-            target_body=target_body,
-            audio_url=audio_url,
-            target_body_pos=target_body_pos,
-            target_body_transcript=target_body_transcript,
         )
         self.db.add(entity)
         await self.db.commit()
         await self.db.refresh(entity)
         return self._to_model(entity)
 
-    async def list_all(self, target_language: str) -> list[TranslatedArticle]:
+    async def update_content(
+        self,
+        article_id: uuid.UUID,
+        target_title: str,
+        target_body: str,
+        source_title: str,
+        source_body: str,
+    ) -> None:
+        entity = await self.db.get(TranslatedArticleEntity, article_id)
+        entity.target_title = target_title
+        entity.target_body = target_body
+        entity.source_title = source_title
+        entity.source_body = source_body
+        await self.db.commit()
+
+    async def update_pos(self, article_id: uuid.UUID, target_body_pos: dict, source_body_pos: dict) -> None:
+        entity = await self.db.get(TranslatedArticleEntity, article_id)
+        entity.target_body_pos = target_body_pos
+        entity.source_body_pos = source_body_pos
+        await self.db.commit()
+
+    async def update_audio(
+        self,
+        article_id: uuid.UUID,
+        audio_url: str,
+        target_body_transcript: dict,
+    ) -> None:
+        entity = await self.db.get(TranslatedArticleEntity, article_id)
+        entity.audio_url = audio_url
+        entity.target_body_transcript = target_body_transcript
+        await self.db.commit()
+
+    async def get_by_audio_url(self, audio_url: str) -> TranslatedArticle | None:
+        result = await self.db.execute(
+            select(TranslatedArticleEntity).where(
+                TranslatedArticleEntity.audio_url == audio_url
+            )
+        )
+        entity = result.scalar_one_or_none()
+        return self._to_model(entity) if entity else None
+
+    async def list_complete(self, target_language: str) -> list[TranslatedArticle]:
+        """Return articles that are fully generated (all content fields set, job succeeded)."""
         result = await self.db.execute(
             select(TranslatedArticleEntity)
-            .where(TranslatedArticleEntity.target_language == target_language)
+            .join(
+                SummariseJobEntity,
+                SummariseJobEntity.translated_article_id == TranslatedArticleEntity.id,
+            )
+            .where(
+                TranslatedArticleEntity.target_language == target_language,
+                SummariseJobEntity.status == "succeeded",
+                TranslatedArticleEntity.target_body.is_not(None),
+                TranslatedArticleEntity.source_body.is_not(None),
+            )
             .order_by(TranslatedArticleEntity.published_at.desc())
         )
         return [self._to_model(e) for e in result.scalars().all()]
 
-    async def get_by_id(self, article_id: uuid.UUID) -> TranslatedArticle | None:
-        entity = await self.db.get(TranslatedArticleEntity, article_id)
+    async def get_complete_by_id(self, article_id: uuid.UUID) -> TranslatedArticle | None:
+        """Return the article only if fully generated (all content fields set, job succeeded)."""
+        result = await self.db.execute(
+            select(TranslatedArticleEntity)
+            .join(
+                SummariseJobEntity,
+                SummariseJobEntity.translated_article_id == TranslatedArticleEntity.id,
+            )
+            .where(
+                TranslatedArticleEntity.id == article_id,
+                SummariseJobEntity.status == "succeeded",
+                TranslatedArticleEntity.target_body.is_not(None),
+                TranslatedArticleEntity.source_body.is_not(None),
+            )
+        )
+        entity = result.scalar_one_or_none()
         return self._to_model(entity) if entity else None
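With completeness checks moved into the repository, callers no longer see half-built articles. A hedged usage sketch (identifiers illustrative):

repo = TranslatedArticleRepository(db)
articles = await repo.list_complete(target_language="fr")  # only fully generated articles
article = await repo.get_complete_by_id(article_id)        # None until the owning job succeeds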
@@ -29,18 +29,31 @@ class SpacyClient:
         return self._cache[language]
 
     def get_parts_of_speech(self, text: str, language: str) -> dict:
+        """Use SpaCy to get parts of speech for the given text and language,
+        broken down by sentences and then by tokens."""
         nlp = self._get_nlp(language)
         doc = nlp(text)
-        tokens = [
+        sentences = [
+            {
+                "text": sent.text,
+                "tokens": [
                     {
                         "text": token.text,
                         "lemma": token.lemma_,
+                        "type": token.ent_type_ if token.ent_type_ else None,
                         "pos": token.pos_,
                         "tag": token.tag_,
                         "dep": token.dep_,
                         "is_stop": token.is_stop,
+                        "is_punct": token.is_punct,
+                        "is_alpha": token.is_alpha,
                     }
-            for token in doc
-            if not token.is_space
+                    for token in sent
+                    if not token.is_space
+                ],
+            }
+            for sent in doc.sents
         ]
-        return {"language": language, "tokens": tokens}
+        return {"language": language, "sentences": sentences}
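The return value changes from a flat token list to per-sentence groups. An illustrative (hypothetical) result for a two-sentence French input:

# pos = SpacyClient().get_parts_of_speech("Le chat dort. Il rêve.", "fr")
# {
#     "language": "fr",
#     "sentences": [
#         {"text": "Le chat dort.", "tokens": [
#             {"text": "Le", "lemma": "le", "type": None, "pos": "DET", "tag": "DET",
#              "dep": "det", "is_stop": True, "is_punct": False, "is_alpha": True},
#             ...
#         ]},
#         {"text": "Il rêve.", "tokens": [...]},
#     ],
# }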
@@ -1,64 +1,26 @@
-import asyncio
-import random
 import uuid
 from functools import partial
-from typing import Any, Callable, Coroutine
 
-import anthropic
 from fastapi import APIRouter, Depends, HTTPException
 from pydantic import BaseModel
 from sqlalchemy.ext.asyncio import AsyncSession
 
 from ...languages import SUPPORTED_LANGUAGES, SUPPORTED_LEVELS
 from ...auth import require_admin
-from ...storage import upload_audio
 from ...outbound.postgres.database import get_db, AsyncSessionLocal
 from ...outbound.postgres.repositories import summarise_job_repository
 from ...outbound.postgres.repositories.translated_article_repository import TranslatedArticleRepository
-from ...domain.services.article_service import first_heading
 from ...outbound.anthropic.anthropic_client import AnthropicClient
 from ...outbound.deepgram.deepgram_client import LocalDeepgramClient
 from ...outbound.deepl.deepl_client import DeepLClient
 from ...outbound.gemini.gemini_client import GeminiClient
 from ...outbound.spacy.spacy_client import SpacyClient
+from ...domain.services.summarise_service import SummariseService
 from ...config import settings
 from ... import worker
 
 router = APIRouter(prefix="/generate", tags=["api"])
 
-_ANTHROPIC_RETRYABLE = (
-    anthropic.RateLimitError,
-    anthropic.InternalServerError,
-    anthropic.APITimeoutError,
-    anthropic.APIConnectionError,
-)
-_MAX_RETRIES = 4
-_BASE_DELAY = 1.0  # seconds
-_MAX_DELAY = 60.0  # seconds
-
-
-async def _anthropic_with_backoff(
-    coro_fn: Callable[..., Coroutine[Any, Any, Any]],
-    *args: Any,
-    **kwargs: Any,
-) -> Any:
-    for attempt in range(_MAX_RETRIES + 1):
-        try:
-            return await coro_fn(*args, **kwargs)
-        except _ANTHROPIC_RETRYABLE as exc:
-            if attempt == _MAX_RETRIES:
-                raise
-            retry_after: float | None = None
-            if isinstance(exc, anthropic.RateLimitError):
-                raw = exc.response.headers.get("retry-after")
-                if raw is not None:
-                    retry_after = float(raw)
-            if retry_after is None:
-                retry_after = min(_BASE_DELAY * (2 ** attempt), _MAX_DELAY)
-            # ±20 % jitter to spread out concurrent retries
-            jittered = retry_after * (0.8 + random.random() * 0.4)
-            await asyncio.sleep(jittered)
-
-
 class GenerationRequest(BaseModel):
     target_language: str
@@ -71,70 +33,29 @@ class GenerationResponse(BaseModel):
     job_id: str
 
 
-async def _run_generation(job_id: uuid.UUID, request: GenerationRequest) -> None:
-    anthropic_client = AnthropicClient.new(settings.anthropic_api_key)
-    deepgram_client = LocalDeepgramClient(settings.deepgram_api_key)
-    deepl_client = DeepLClient(settings.deepl_api_key)
-    gemini_client = GeminiClient(settings.gemini_api_key)
-    spacy_client = SpacyClient()
+async def _run_generation(
+    job_id: uuid.UUID,
+    article_id: uuid.UUID,
+    request: GenerationRequest,
+) -> None:
+    service = SummariseService(
+        anthropic_client=AnthropicClient.new(settings.anthropic_api_key),
+        deepgram_client=LocalDeepgramClient(settings.deepgram_api_key),
+        deepl_client=DeepLClient(settings.deepl_api_key),
+        gemini_client=GeminiClient(settings.gemini_api_key),
+        spacy_client=SpacyClient(),
+    )
     async with AsyncSessionLocal() as db:
-        job = await summarise_job_repository.get_by_id(db, job_id)
-        await summarise_job_repository.mark_processing(db, job)
-
-        try:
-            language_name = SUPPORTED_LANGUAGES[request.target_language]
-
-            source_material = "\n\n".join(request.input_texts[:3])
-
-            generated_text = await _anthropic_with_backoff(
-                anthropic_client.generate_summary_text,
-                content_to_summarise=source_material,
-                complexity_level=request.complexity_level,
-                from_language=language_name,
-                to_language=language_name,
-                length_preference="200-400 words",
-            )
-
-            await summarise_job_repository.save_generated_text(
-                db, job, generated_text, source_material[:500]
-            )
-
-            translated_text = await deepl_client.translate(generated_text, request.source_language)
-
-            await summarise_job_repository.save_translated_text(db, job, translated_text)
-
-            target_pos_data = spacy_client.get_parts_of_speech(generated_text, request.target_language)
-            source_pos_data = spacy_client.get_parts_of_speech(translated_text, request.source_language)
-
-            await summarise_job_repository.save_pos_data(db, job, source_pos_data, target_pos_data)
-
-            voice = gemini_client.get_voice_by_language(request.target_language)
-            wav_bytes = await gemini_client.generate_audio(generated_text, voice)
-            audio_key = f"audio/{job_id}.wav"
-            upload_audio(audio_key, wav_bytes)
-
-            transcript = await deepgram_client.transcribe_bytes(wav_bytes, request.target_language)
-            await summarise_job_repository.save_audio_transcript(db, job, transcript)
-
-            await summarise_job_repository.mark_succeeded(db, job, audio_key)
-
-            await TranslatedArticleRepository(db).create(
-                source_language=request.source_language,
-                source_title=first_heading(translated_text) or "",
-                source_body=translated_text,
-                target_language=request.target_language,
-                target_complexities=[request.complexity_level],
-                target_title=first_heading(generated_text) or "",
-                target_body=generated_text,
-                audio_url=audio_key,
-                target_body_pos=target_pos_data,
-                target_body_transcript=transcript,
-            )
-
-        except Exception as exc:
-            await summarise_job_repository.mark_failed(db, job, str(exc))
+        await service.run(
+            db=db,
+            job_id=job_id,
+            article_id=article_id,
+            source_language=request.source_language,
+            target_language=request.target_language,
+            complexity_level=request.complexity_level,
+            input_texts=request.input_texts,
+        )
 
 
 @router.post("", response_model=GenerationResponse, status_code=202)
 async def create_generation_job(

@@ -155,14 +76,18 @@ async def create_generation_job(
             f"Supported: {sorted(SUPPORTED_LEVELS)}",
         )
 
+    article = await TranslatedArticleRepository(db).create(
+        source_language=request.source_language,
+        target_language=request.target_language,
+        target_complexities=[request.complexity_level],
+    )
+
     job = await summarise_job_repository.create(
         db,
         user_id=uuid.UUID(token_data["sub"]),
-        source_language=request.source_language,
-        target_language=request.target_language,
-        complexity_level=request.complexity_level,
+        translated_article_id=uuid.UUID(article.id),
     )
 
-    await worker.enqueue(partial(_run_generation, job.id, request))
+    await worker.enqueue(partial(_run_generation, job.id, uuid.UUID(article.id), request))
 
     return GenerationResponse(job_id=str(job.id))
@@ -1,5 +1,5 @@
 import uuid
-from datetime import datetime, timezone
+from datetime import datetime
 from functools import partial
 
 from fastapi import APIRouter, Depends, HTTPException
@@ -9,6 +9,8 @@ from sqlalchemy.ext.asyncio import AsyncSession
 from ...auth import require_admin
 from ...outbound.postgres.database import get_db, AsyncSessionLocal
 from ...outbound.postgres.repositories import summarise_job_repository
+from ...outbound.postgres.repositories.translated_article_repository import TranslatedArticleRepository
+from ...outbound.postgres.entities.translated_article_entity import TranslatedArticleEntity
 from ...outbound.gemini.gemini_client import GeminiClient
 from ...storage import upload_audio
 from ...config import settings
@@ -20,21 +22,10 @@ router = APIRouter(prefix="/jobs", dependencies=[Depends(require_admin)])
 class JobResponse(BaseModel):
     id: uuid.UUID
     status: str
-    source_language: str
-    target_language: str
-    complexity_level: str
+    translated_article_id: uuid.UUID | None = None
     created_at: datetime
     started_at: datetime | None = None
     completed_at: datetime | None = None
-    # only present on success
-    generated_text: str | None = None
-    generated_text_pos: dict | None = None
-    translated_text: str | None = None
-    translated_text_pos: dict | None = None
-    input_summary: str | None = None
-    audio_url: str | None = None
-    audio_transcript: dict | None = None
-    # only present on failure
     error_message: str | None = None
     model_config = {"from_attributes": True}
@@ -45,6 +36,7 @@ class JobSummary(BaseModel):
     created_at: datetime
     completed_at: datetime | None = None
     error_message: str | None = None
+    model_config = {"from_attributes": True}
 
 
 class JobListResponse(BaseModel):
@@ -77,44 +69,37 @@ async def get_job(
     if job is None:
         raise HTTPException(status_code=404, detail="Job not found")
 
-    response = JobResponse(
-        id=str(job.id),
+    return JobResponse(
+        id=job.id,
         status=job.status,
-        source_language=job.source_language,
-        target_language=job.target_language,
-        complexity_level=job.complexity_level,
+        translated_article_id=job.translated_article_id,
         created_at=job.created_at,
         started_at=job.started_at,
         completed_at=job.completed_at,
+        error_message=job.error_message,
     )
-
-    if job.status == "succeeded":
-        response.generated_text = job.generated_text
-        response.generated_text_pos = job.target_pos_data
-        response.translated_text = job.translated_text
-        response.translated_text_pos = job.source_pos_data
-        response.input_summary = job.input_summary
-        response.audio_url = job.audio_url
-        response.audio_transcript = job.audio_transcript
-    elif job.status == "failed":
-        response.error_message = job.error_message
-
-    return response
 
 
 async def _run_regenerate_audio(job_id: uuid.UUID) -> None:
     gemini_client = GeminiClient(settings.gemini_api_key)
     async with AsyncSessionLocal() as db:
         job = await summarise_job_repository.get_by_id(db, job_id)
+        article_repo = TranslatedArticleRepository(db)
+        article_entity = await db.get(TranslatedArticleEntity, job.translated_article_id)
         await summarise_job_repository.mark_processing(db, job)
 
         try:
-            voice = gemini_client.get_voice_by_language(job.target_language)
-            wav_bytes = await gemini_client.generate_audio(job.generated_text, voice)
+            voice = gemini_client.get_voice_by_language(article_entity.target_language)
+            wav_bytes = await gemini_client.generate_audio(article_entity.target_body, voice)
             audio_key = f"audio/{job_id}.wav"
             upload_audio(audio_key, wav_bytes)
 
-            await summarise_job_repository.mark_succeeded(db, job, audio_key)
+            await article_repo.update_audio(
+                article_entity.id,
+                audio_url=audio_key,
+                target_body_transcript=article_entity.target_body_transcript,
+            )
+            await summarise_job_repository.mark_succeeded(db, job)
 
         except Exception as exc:
             await summarise_job_repository.mark_failed(db, job, str(exc))
@@ -136,19 +121,21 @@ async def regenerate_audio(
         raise HTTPException(status_code=404, detail="Job not found")
 
     if str(job.user_id) != token_data["sub"]:
-        raise HTTPException(
-            status_code=403, detail="Not authorized to modify this job")
+        raise HTTPException(status_code=403, detail="Not authorized to modify this job")
 
-    if not job.generated_text:
-        raise HTTPException(
-            status_code=400, detail="Job has no generated text to synthesize")
+    if job.translated_article_id is None:
+        raise HTTPException(status_code=400, detail="Job has no associated article")
 
-    if job.audio_url:
+    article_entity = await db.get(TranslatedArticleEntity, job.translated_article_id)
+
+    if not article_entity or not article_entity.target_body:
+        raise HTTPException(status_code=400, detail="Job has no generated text to synthesize")
+
+    if article_entity.audio_url:
         raise HTTPException(status_code=409, detail="Job already has audio")
 
     if job.status == "processing":
-        raise HTTPException(
-            status_code=409, detail="Job is already processing")
+        raise HTTPException(status_code=409, detail="Job is already processing")
 
     await worker.enqueue(partial(_run_regenerate_audio, uid))
     return {"job_id": job_id}
@@ -7,7 +7,6 @@ from sqlalchemy.ext.asyncio import AsyncSession
 
 from ...auth import verify_token
 from ...config import settings
-from ...domain.services.article_service import ArticleService
 from ...outbound.postgres.database import get_db
 from ...outbound.postgres.repositories.translated_article_repository import TranslatedArticleRepository
@@ -34,12 +33,13 @@ class ArticleDetail(BaseModel):
     source_language: str
     source_title: str
     source_body: str
+    source_body_pos: dict
     target_language: str
     target_complexities: list[str]
     target_title: str
     target_body: str
     target_audio_url: str | None
-    target_body_pos: dict | None
+    target_body_pos: dict
     target_body_transcript: dict | None
@@ -55,8 +55,7 @@ async def list_articles(
     db: AsyncSession = Depends(get_db),
     _: dict = Depends(verify_token),
 ) -> ArticleListResponse:
-    service = ArticleService(TranslatedArticleRepository(db))
-    articles = await service.get_all_articles(target_language=target_language)
+    articles = await TranslatedArticleRepository(db).list_complete(target_language=target_language)
     return ArticleListResponse(
         articles=[
             ArticleItem(
@@ -84,7 +83,7 @@ async def get_article(
     except ValueError:
         raise HTTPException(status_code=400, detail="Invalid article ID")
 
-    article = await TranslatedArticleRepository(db).get_by_id(uid)
+    article = await TranslatedArticleRepository(db).get_complete_by_id(uid)
     if article is None:
         raise HTTPException(status_code=404, detail="Article not found")
@@ -94,6 +93,7 @@ async def get_article(
         source_language=article.source_language,
         source_title=article.source_title,
         source_body=article.source_body,
+        source_body_pos=article.source_body_pos,
         target_language=article.target_language,
         target_complexities=article.target_complexities,
         target_title=article.target_title,
@@ -1,13 +1,10 @@
-import uuid
-
 from fastapi import APIRouter, Depends, HTTPException
 from fastapi.responses import Response
 from sqlalchemy.ext.asyncio import AsyncSession
 from botocore.exceptions import ClientError
 
-from ..auth import verify_token
 from ..outbound.postgres.database import get_db
-from ..outbound.postgres.repositories.summarise_job_repository import PostgresSummariseJobRepository
+from ..outbound.postgres.repositories.translated_article_repository import TranslatedArticleRepository
 from ..storage import download_audio
 
 router = APIRouter(prefix="/media", tags=["media"])
@@ -18,10 +15,9 @@ async def get_media_file(
     filename: str,
     db: AsyncSession = Depends(get_db),
 ) -> Response:
-    repository = PostgresSummariseJobRepository(db)
-    job = await repository.get_by_audio_url(filename)
+    article = await TranslatedArticleRepository(db).get_by_audio_url(filename)
 
-    if job is None:
+    if article is None:
         raise HTTPException(status_code=404, detail="File not found")
 
     try: