Compare commits

..

2 commits

Author SHA1 Message Date
9b9bdc3a39 feat: Change out storage from local to Bunny (via env param)
Some checks failed
/ test (push) Has been cancelled
2026-05-18 21:18:19 +01:00
293a8ab3f9 fix: Commit various features relating to CYOA 2026-05-17 13:36:21 +01:00
24 changed files with 858 additions and 123 deletions

View file

@ -15,10 +15,16 @@ class Settings(BaseSettings):
scaleway_tem_project_id: str = "" scaleway_tem_project_id: str = ""
scaleway_tem_from_address: str = "" scaleway_tem_from_address: str = ""
scaleway_tem_region: str = "fr-par" scaleway_tem_region: str = "fr-par"
storage_endpoint_url: str storage_provider: str = "local" # or 'bunny'
storage_access_key: str storage_endpoint_url: str = ""
storage_secret_key: str storage_access_key: str = ""
storage_secret_key: str = ""
storage_bucket: str = "langlearn" storage_bucket: str = "langlearn"
bunny_zone: str = "languagelearningapp"
bunny_api_key: str = ""
bunny_cdn_base_url: str = ""
bunny_token_auth_key: str = ""
bunny_storage_endpoint: str = "https://storage.bunnycdn.com"
stub_generation: bool = False stub_generation: bool = False
model_config = {"env_file": ".env"} model_config = {"env_file": ".env"}

View file

@ -24,7 +24,7 @@ from ...outbound.postgres.repositories.adventure_repository import (
PostgresAdventureRepository, PostgresAdventureRepository,
) )
from ...outbound.spacy.spacy_client import SpacyClient from ...outbound.spacy.spacy_client import SpacyClient
from ...storage import upload_audio from ...outbound.storage_client import get_storage_client
from ..models.adventure import ( from ..models.adventure import (
Adventure, Adventure,
AdventureEntry, AdventureEntry,
@ -251,7 +251,7 @@ class AdventureService:
for sent_idx, target_sent in enumerate(target_nlp["sentences"]): for sent_idx, target_sent in enumerate(target_nlp["sentences"]):
t0 = time.monotonic() t0 = time.monotonic()
translated_sentence = await self.deepl_client.translate( translated_sentence = await self.deepl_client.translate(
target_sent["text"], adventure.source_language target_sent["text"], adventure.source_language, paragraph_text
) )
timing_translations += time.monotonic() - t0 timing_translations += time.monotonic() - t0
@ -315,7 +315,7 @@ class AdventureService:
# ── File upload ─────────────────────────────────────────────────── # ── File upload ───────────────────────────────────────────────────
t0 = time.monotonic() t0 = time.monotonic()
audio_key = f"adventure-audio/{entry_id}.wav" audio_key = f"adventure-audio/{entry_id}.wav"
upload_audio(audio_key, wav_bytes) get_storage_client().upload(audio_key, wav_bytes)
timing_file_uploading = time.monotonic() - t0 timing_file_uploading = time.monotonic() - t0
await self.audio_repo.create( await self.audio_repo.create(

View file

@ -14,7 +14,7 @@ from ...outbound.deepgram.deepgram_client import LocalDeepgramClient
from ...outbound.deepl.deepl_client import DeepLClient from ...outbound.deepl.deepl_client import DeepLClient
from ...outbound.gemini.gemini_client import GeminiClient from ...outbound.gemini.gemini_client import GeminiClient
from ...outbound.spacy.spacy_client import SpacyClient from ...outbound.spacy.spacy_client import SpacyClient
from ...storage import upload_audio from ...outbound.storage_client import get_storage_client
from ...languages import SUPPORTED_LANGUAGES from ...languages import SUPPORTED_LANGUAGES
@ -137,7 +137,7 @@ class SummariseService:
voice = self.gemini_client.get_voice_by_language(target_language) voice = self.gemini_client.get_voice_by_language(target_language)
wav_bytes = await self.gemini_client.generate_audio(generated_text, voice) wav_bytes = await self.gemini_client.generate_audio(generated_text, voice)
audio_key = f"audio/{job_id}.wav" audio_key = f"audio/{job_id}.wav"
upload_audio(audio_key, wav_bytes) get_storage_client().upload(audio_key, wav_bytes)
transcript = await self.deepgram_client.transcribe_bytes(wav_bytes, target_language) transcript = await self.deepgram_client.transcribe_bytes(wav_bytes, target_language)

View file

@ -8,13 +8,13 @@ from .routers.api import jobs
from .routers import media as media_router from .routers import media as media_router
from .routers.api.main import api_router from .routers.api.main import api_router
from .routers.bff.main import bff_router from .routers.bff.main import bff_router
from .storage import ensure_bucket_exists from .outbound.storage_factory import init_storage
from . import worker from . import worker
@asynccontextmanager @asynccontextmanager
async def lifespan(app: FastAPI): async def lifespan(app: FastAPI):
ensure_bucket_exists() init_storage()
worker_task = asyncio.create_task(worker.worker_loop()) worker_task = asyncio.create_task(worker.worker_loop())
yield yield
worker_task.cancel() worker_task.cancel()

View file

View file

@ -0,0 +1,77 @@
import base64
import hashlib
import time
import urllib.error
import urllib.request
_SIGNED_URL_EXPIRY_SECONDS = 3600
# Socket timeout for storage API calls, so a stalled connection cannot block
# the caller indefinitely (urlopen waits forever when no timeout is given).
_DEFAULT_TIMEOUT_SECONDS = 30.0


class BunnyClient:
    """Storage adapter for Bunny.

    Uploads and deletes go to the storage HTTP endpoint, authenticated with
    the ``AccessKey`` header. Reads are served by the CDN through URLs built
    by :meth:`get_url` (signed, time-limited) or :meth:`get_public_url`
    (unsigned).
    """

    def __init__(
        self,
        zone: str,
        api_key: str,
        cdn_base_url: str,
        token_auth_key: str,
        storage_endpoint: str = "https://storage.bunnycdn.com",
        *,
        timeout: float = _DEFAULT_TIMEOUT_SECONDS,
    ) -> None:
        """Store connection settings.

        Args:
            zone: Storage zone name; first path segment of every storage URL.
            api_key: Value sent in the ``AccessKey`` header.
            cdn_base_url: Base URL for delivery links (trailing ``/`` removed).
            token_auth_key: Secret mixed into the signed-URL token.
            storage_endpoint: Base URL of the storage API (trailing ``/`` removed).
            timeout: Socket timeout in seconds for upload/delete requests.
        """
        self._zone = zone
        self._api_key = api_key
        self._cdn_base_url = cdn_base_url.rstrip("/")
        self._token_auth_key = token_auth_key
        self._storage_endpoint = storage_endpoint.rstrip("/")
        self._timeout = timeout

    def _storage_url(self, path: str) -> str:
        """Return ``<storage_endpoint>/<zone>/<path>`` with leading slashes of ``path`` dropped."""
        return f"{self._storage_endpoint}/{self._zone}/{path.lstrip('/')}"

    def upload(self, path: str, data: bytes, content_type: str = "audio/wav") -> bool:
        """PUT ``data`` to ``path`` in the storage zone.

        Returns:
            True when the server answers 201; False for any other status or
            an HTTP error response.

        Network-level failures (``urllib.error.URLError``, timeouts) are not
        caught here and propagate to the caller.
        """
        req = urllib.request.Request(
            self._storage_url(path),
            data=data,
            method="PUT",
            headers={
                "AccessKey": self._api_key,
                "Content-Type": content_type,
            },
        )
        try:
            with urllib.request.urlopen(req, timeout=self._timeout) as resp:
                return resp.status == 201
        except urllib.error.HTTPError:
            return False

    def get_url(self, path: str) -> str:
        """Return a signed CDN URL for ``path`` valid for ``_SIGNED_URL_EXPIRY_SECONDS``.

        The token is the SHA-256 digest of ``token_auth_key + "/" + path +
        expiry`` encoded as base64 with ``+``/``/`` mapped to ``-``/``_`` and
        ``=`` padding removed.
        """
        # NOTE(review): confirm this token layout against the token
        # authentication mode configured on the Bunny pull zone.
        url_path = f"/{path.lstrip('/')}"
        expiration = int(time.time()) + _SIGNED_URL_EXPIRY_SECONDS
        digest = hashlib.sha256(
            (self._token_auth_key + url_path + str(expiration)).encode()
        ).digest()
        token = (
            base64.b64encode(digest)
            .decode()
            .replace("+", "-")
            .replace("/", "_")
            .replace("=", "")
        )
        return f"{self._cdn_base_url}{url_path}?token={token}&expires={expiration}"

    def get_public_url(self, path: str) -> str:
        """Return the unsigned CDN URL for ``path``."""
        return f"{self._cdn_base_url}/{path.lstrip('/')}"

    def delete(self, path: str) -> bool:
        """DELETE ``path`` from the storage zone.

        Returns:
            True when the server answers 200; False for any other status or
            an HTTP error response. Network-level failures propagate.
        """
        req = urllib.request.Request(
            self._storage_url(path),
            method="DELETE",
            headers={"AccessKey": self._api_key},
        )
        try:
            with urllib.request.urlopen(req, timeout=self._timeout) as resp:
                return resp.status == 200
        except urllib.error.HTTPError:
            return False

    def download(self, path: str) -> tuple[bytes, str]:
        """Unsupported for this backend; always raises ``NotImplementedError``."""
        raise NotImplementedError(
            "Direct download not available with Bunny — use get_url() to obtain a signed CDN URL"
        )

View file

@ -1,9 +1,19 @@
import asyncio import asyncio
import io
import wave
from google import genai from google import genai
from google.genai import types as genai_types from google.genai import types as genai_types
from ...storage import pcm_to_wav
def _pcm_to_wav(pcm_data: bytes, sample_rate: int = 24000) -> bytes:
    """Wrap raw PCM bytes in a WAV container (1 channel, 2-byte samples)."""
    container = io.BytesIO()
    writer = wave.open(container, "wb")
    try:
        writer.setnchannels(1)
        writer.setsampwidth(2)
        writer.setframerate(sample_rate)
        writer.writeframes(pcm_data)
    finally:
        # Closing finalises the header; the in-memory buffer stays readable.
        writer.close()
    return container.getvalue()
VOICE_BY_LANGUAGE: dict[str, str] = { VOICE_BY_LANGUAGE: dict[str, str] = {
"fr": "Kore", "fr": "Kore",
@ -47,6 +57,6 @@ class GeminiClient():
), ),
) )
pcm_data = response.candidates[0].content.parts[0].inline_data.data pcm_data = response.candidates[0].content.parts[0].inline_data.data
return pcm_to_wav(pcm_data) return _pcm_to_wav(pcm_data)
return await asyncio.to_thread(_call) return await asyncio.to_thread(_call)

View file

View file

@ -0,0 +1,70 @@
import boto3
from botocore.exceptions import ClientError
class MinioClient:
    """S3-compatible storage adapter built on boto3.

    All objects live in a single bucket. URLs returned by :meth:`get_url` and
    :meth:`get_public_url` point at the API's own ``/media/`` routes, not at
    the object store itself.
    """

    def __init__(
        self,
        endpoint_url: str,
        access_key: str,
        secret_key: str,
        bucket: str,
        api_base_url: str,
    ) -> None:
        """Store connection settings; no network call is made here."""
        self._endpoint_url = endpoint_url
        self._access_key = access_key
        self._secret_key = secret_key
        self._bucket = bucket
        self._api_base_url = api_base_url.rstrip("/")
        # Created on first use by _s3() and reused afterwards, instead of
        # building a fresh boto3 client for every operation.
        self._client = None

    def _s3(self):
        """Return the boto3 S3 client, creating it on first call."""
        if self._client is None:
            self._client = boto3.client(
                "s3",
                endpoint_url=self._endpoint_url,
                aws_access_key_id=self._access_key,
                aws_secret_access_key=self._secret_key,
            )
        return self._client

    def ensure_bucket_exists(self) -> None:
        """Create the bucket if ``head_bucket`` reports it missing.

        Raises:
            ClientError: for any error other than "404" / "NoSuchBucket".
        """
        client = self._s3()
        try:
            client.head_bucket(Bucket=self._bucket)
        except ClientError as e:
            if e.response["Error"]["Code"] in ("404", "NoSuchBucket"):
                client.create_bucket(Bucket=self._bucket)
            else:
                raise

    def upload(self, path: str, data: bytes, content_type: str = "audio/wav") -> bool:
        """Store ``data`` under key ``path``; False if boto3 raises ``ClientError``."""
        try:
            self._s3().put_object(
                Bucket=self._bucket,
                Key=path,
                Body=data,
                ContentType=content_type,
            )
            return True
        except ClientError:
            return False

    def get_url(self, path: str) -> str:
        """Return the API media-route URL for ``path``."""
        return f"{self._api_base_url}/media/{path}"

    def get_public_url(self, path: str) -> str:
        """Return the API media-route URL for ``path`` (same as :meth:`get_url`)."""
        return f"{self._api_base_url}/media/{path}"

    def delete(self, path: str) -> bool:
        """Delete key ``path``; False if boto3 raises ``ClientError``."""
        try:
            self._s3().delete_object(Bucket=self._bucket, Key=path)
            return True
        except ClientError:
            return False

    def download(self, path: str) -> tuple[bytes, str]:
        """Return ``(body_bytes, content_type)`` for key ``path``.

        The content type falls back to ``"audio/wav"`` when the response
        carries none.

        Raises:
            FileNotFoundError: the key does not exist ("NoSuchKey" / "404").
            ClientError: any other storage error.
        """
        try:
            response = self._s3().get_object(Bucket=self._bucket, Key=path)
            return response["Body"].read(), response.get("ContentType", "audio/wav")
        except ClientError as e:
            if e.response["Error"]["Code"] in ("NoSuchKey", "404"):
                # Keep the boto3 error as the cause for debugging.
                raise FileNotFoundError(path) from e
            raise

View file

@ -0,0 +1,21 @@
from typing import Protocol
_client: "StorageClient | None" = None
class StorageClient(Protocol):
    """Structural interface shared by the storage backends.

    Paths are plain object keys supplied by the caller (for example
    ``"audio/<id>.wav"``).
    """

    # Store `data` at `path`; the bool reports whether the backend accepted it.
    def upload(self, path: str, data: bytes) -> bool: ...
    # URL a client should use to fetch `path`.
    def get_url(self, path: str) -> str: ...
    # URL for `path` intended for public content.
    def get_public_url(self, path: str) -> str: ...
    # Remove `path`; the bool reports whether the backend confirmed it.
    def delete(self, path: str) -> bool: ...
    # Return (content bytes, content type). Backends in this change may raise
    # FileNotFoundError (missing key) or NotImplementedError (unsupported).
    def download(self, path: str) -> tuple[bytes, str]: ...
def get_storage_client() -> "StorageClient":
    """Return the module-level storage client.

    Raises:
        RuntimeError: no client has been registered yet.
    """
    if _client is None:
        # Explicit raise instead of ``assert`` so the check is still enforced
        # when Python runs with -O (which strips assert statements).
        raise RuntimeError("Storage client not initialised — call init_storage() at startup")
    return _client
def _set_storage_client(c: "StorageClient") -> None:
    """Register ``c`` as the module-level client returned by ``get_storage_client``."""
    global _client
    _client = c

View file

@ -0,0 +1,27 @@
from ..config import settings
from .storage_client import StorageClient, _set_storage_client
from .minio.minio_client import MinioClient
from .bunny.bunny_client import BunnyClient
def init_storage() -> None:
    """Build the storage client selected by ``settings.storage_provider`` and register it.

    ``"bunny"`` selects :class:`BunnyClient`. Any other value selects
    :class:`MinioClient`, whose bucket is created if it does not exist.

    Raises:
        RuntimeError: the provider is ``"bunny"`` but a required Bunny
            setting is empty.
    """
    client: StorageClient
    if settings.storage_provider == "bunny":
        # Every Bunny setting defaults to "", so a missing env var would
        # otherwise produce a client that builds unusable URLs and whose
        # uploads fail. Fail at startup instead.
        required = {
            "BUNNY_ZONE": settings.bunny_zone,
            "BUNNY_API_KEY": settings.bunny_api_key,
            "BUNNY_CDN_BASE_URL": settings.bunny_cdn_base_url,
            "BUNNY_TOKEN_AUTH_KEY": settings.bunny_token_auth_key,
        }
        missing = sorted(name for name, value in required.items() if not value)
        if missing:
            raise RuntimeError(
                "STORAGE_PROVIDER is 'bunny' but these settings are empty: "
                + ", ".join(missing)
            )
        client = BunnyClient(
            zone=settings.bunny_zone,
            api_key=settings.bunny_api_key,
            cdn_base_url=settings.bunny_cdn_base_url,
            token_auth_key=settings.bunny_token_auth_key,
            storage_endpoint=settings.bunny_storage_endpoint,
        )
    else:
        minio = MinioClient(
            endpoint_url=settings.storage_endpoint_url,
            access_key=settings.storage_access_key,
            secret_key=settings.storage_secret_key,
            bucket=settings.storage_bucket,
            api_base_url=settings.api_base_url,
        )
        minio.ensure_bucket_exists()
        client = minio
    _set_storage_client(client)

View file

@ -12,7 +12,7 @@ from ...outbound.postgres.repositories import summarise_job_repository
from ...outbound.postgres.repositories.translated_article_repository import TranslatedArticleRepository from ...outbound.postgres.repositories.translated_article_repository import TranslatedArticleRepository
from ...outbound.postgres.entities.translated_article_entity import TranslatedArticleEntity from ...outbound.postgres.entities.translated_article_entity import TranslatedArticleEntity
from ...outbound.gemini.gemini_client import GeminiClient from ...outbound.gemini.gemini_client import GeminiClient
from ...storage import upload_audio from ...outbound.storage_client import get_storage_client
from ...config import settings from ...config import settings
from ... import worker from ... import worker
@ -92,7 +92,7 @@ async def _run_regenerate_audio(job_id: uuid.UUID) -> None:
voice = gemini_client.get_voice_by_language(article_entity.target_language) voice = gemini_client.get_voice_by_language(article_entity.target_language)
wav_bytes = await gemini_client.generate_audio(article_entity.target_body, voice) wav_bytes = await gemini_client.generate_audio(article_entity.target_body, voice)
audio_key = f"audio/{job_id}.wav" audio_key = f"audio/{job_id}.wav"
upload_audio(audio_key, wav_bytes) get_storage_client().upload(audio_key, wav_bytes)
await article_repo.update_audio( await article_repo.update_audio(
article_entity.id, article_entity.id,

View file

@ -5,8 +5,8 @@ from pydantic import BaseModel
from sqlalchemy.ext.asyncio import AsyncSession from sqlalchemy.ext.asyncio import AsyncSession
from ...auth import verify_token from ...auth import verify_token
from ...config import settings
from ...outbound.postgres.database import get_db from ...outbound.postgres.database import get_db
from ...outbound.storage_client import get_storage_client
from ...outbound.postgres.repositories.adventure_repository import ( from ...outbound.postgres.repositories.adventure_repository import (
PostgresAdventureEntryAudioRepository, PostgresAdventureEntryAudioRepository,
PostgresAdventureEntryChoiceRepository, PostgresAdventureEntryChoiceRepository,
@ -61,7 +61,7 @@ class AdventureDetailResponse(BaseModel):
def _audio_url(key: str | None) -> str | None: def _audio_url(key: str | None) -> str | None:
if key is None: if key is None:
return None return None
return f"{settings.api_base_url}/media/{key}" return get_storage_client().get_url(key)
@router.get("/{adventure_id}", response_model=AdventureDetailResponse, status_code=200) @router.get("/{adventure_id}", response_model=AdventureDetailResponse, status_code=200)

View file

@ -6,8 +6,8 @@ from pydantic import BaseModel
from sqlalchemy.ext.asyncio import AsyncSession from sqlalchemy.ext.asyncio import AsyncSession
from ...auth import verify_token from ...auth import verify_token
from ...config import settings
from ...outbound.postgres.database import get_db from ...outbound.postgres.database import get_db
from ...outbound.storage_client import get_storage_client
from ...outbound.postgres.repositories.translated_article_repository import TranslatedArticleRepository from ...outbound.postgres.repositories.translated_article_repository import TranslatedArticleRepository
router = APIRouter(prefix="/articles", tags=["bff", "articles"]) router = APIRouter(prefix="/articles", tags=["bff", "articles"])
@ -46,7 +46,7 @@ class ArticleDetail(BaseModel):
def _audio_url(key: str | None) -> str | None: def _audio_url(key: str | None) -> str | None:
if key is None: if key is None:
return None return None
return f"{settings.api_base_url}/media/{key}" return get_storage_client().get_url(key)
@router.get("", response_model=ArticleListResponse, status_code=200) @router.get("", response_model=ArticleListResponse, status_code=200)

View file

@ -3,12 +3,11 @@ import uuid
from fastapi import APIRouter, Depends, HTTPException from fastapi import APIRouter, Depends, HTTPException
from fastapi.responses import Response from fastapi.responses import Response
from sqlalchemy.ext.asyncio import AsyncSession from sqlalchemy.ext.asyncio import AsyncSession
from botocore.exceptions import ClientError
from ..outbound.postgres.database import get_db from ..outbound.postgres.database import get_db
from ..outbound.postgres.repositories.translated_article_repository import TranslatedArticleRepository from ..outbound.postgres.repositories.translated_article_repository import TranslatedArticleRepository
from ..outbound.postgres.repositories.adventure_repository import PostgresAdventureEntryAudioRepository from ..outbound.postgres.repositories.adventure_repository import PostgresAdventureEntryAudioRepository
from ..storage import download_audio from ..outbound.storage_client import get_storage_client
router = APIRouter(prefix="/media", tags=["media"]) router = APIRouter(prefix="/media", tags=["media"])
@ -23,21 +22,23 @@ async def get_adventure_audio_file(
except ValueError: except ValueError:
raise HTTPException(status_code=400, detail="Invalid file ID") raise HTTPException(status_code=400, detail="Invalid file ID")
print(f"Looking for adventure audio with entry ID: {eid}")
adventure_audio = await PostgresAdventureEntryAudioRepository(db).get_for_entry(entry_id=eid, component_type="story_text") adventure_audio = await PostgresAdventureEntryAudioRepository(db).get_for_entry(entry_id=eid, component_type="story_text")
if adventure_audio is None: if adventure_audio is None:
raise HTTPException(status_code=404, detail="File not found") raise HTTPException(status_code=404, detail="File not found")
try: try:
audio_bytes, content_type = download_audio("adventure-audio/" + filename) audio_bytes, content_type = get_storage_client().download("adventure-audio/" + filename)
except ClientError as e: except FileNotFoundError:
if e.response["Error"]["Code"] in ("NoSuchKey", "404"):
raise HTTPException(status_code=404, detail="File not found") raise HTTPException(status_code=404, detail="File not found")
except NotImplementedError:
raise HTTPException(status_code=501, detail="Media proxy not available with current storage provider")
except Exception:
raise HTTPException(status_code=500, detail="Storage error") raise HTTPException(status_code=500, detail="Storage error")
return Response(content=audio_bytes, media_type=content_type) return Response(content=audio_bytes, media_type=content_type)
@router.get("/{filename:path}") @router.get("/{filename:path}")
async def get_media_file( async def get_media_file(
filename: str, filename: str,
@ -49,11 +50,12 @@ async def get_media_file(
raise HTTPException(status_code=404, detail="File not found") raise HTTPException(status_code=404, detail="File not found")
try: try:
audio_bytes, content_type = download_audio(filename) audio_bytes, content_type = get_storage_client().download(filename)
except ClientError as e: except FileNotFoundError:
if e.response["Error"]["Code"] in ("NoSuchKey", "404"):
raise HTTPException(status_code=404, detail="File not found") raise HTTPException(status_code=404, detail="File not found")
except NotImplementedError:
raise HTTPException(status_code=501, detail="Media proxy not available with current storage provider")
except Exception:
raise HTTPException(status_code=500, detail="Storage error") raise HTTPException(status_code=500, detail="Storage error")
return Response(content=audio_bytes, media_type=content_type) return Response(content=audio_bytes, media_type=content_type)

View file

@ -1,56 +0,0 @@
import io
import wave
import boto3
from botocore.exceptions import ClientError
from .config import settings
def get_s3_client():
    """Create a boto3 S3 client from the storage settings."""
    connection = {
        "endpoint_url": settings.storage_endpoint_url,
        "aws_access_key_id": settings.storage_access_key,
        "aws_secret_access_key": settings.storage_secret_key,
    }
    return boto3.client("s3", **connection)
def ensure_bucket_exists() -> None:
    """Create the configured bucket when ``head_bucket`` reports it missing."""
    s3 = get_s3_client()
    bucket_name = settings.storage_bucket
    try:
        s3.head_bucket(Bucket=bucket_name)
    except ClientError as err:
        error_code = err.response["Error"]["Code"]
        # Anything other than "bucket not found" is re-raised unchanged.
        if error_code not in ("404", "NoSuchBucket"):
            raise
        s3.create_bucket(Bucket=bucket_name)
def pcm_to_wav(pcm_data: bytes, sample_rate: int = 24000) -> bytes:
    """Return ``pcm_data`` (raw 16-bit mono PCM) wrapped in a WAV container."""
    channels = 1
    bytes_per_sample = 2  # 16-bit
    out = io.BytesIO()
    with wave.open(out, "wb") as wav_file:
        wav_file.setframerate(sample_rate)
        wav_file.setsampwidth(bytes_per_sample)
        wav_file.setnchannels(channels)
        wav_file.writeframes(pcm_data)
    return out.getvalue()
def upload_audio(object_key: str, audio_bytes: bytes, content_type: str = "audio/wav") -> None:
    """Write ``audio_bytes`` to the configured bucket under ``object_key``."""
    put_kwargs = {
        "Bucket": settings.storage_bucket,
        "Key": object_key,
        "Body": audio_bytes,
        "ContentType": content_type,
    }
    get_s3_client().put_object(**put_kwargs)
def download_audio(object_key: str) -> tuple[bytes, str]:
    """Fetch ``object_key`` from the configured bucket.

    Returns:
        ``(file_bytes, content_type)``; the content type defaults to
        ``"audio/wav"`` when the response has none.
    """
    s3 = get_s3_client()
    obj = s3.get_object(Bucket=settings.storage_bucket, Key=object_key)
    mime = obj.get("ContentType", "audio/wav")
    payload = obj["Body"].read()
    return payload, mime

View file

@ -0,0 +1,82 @@
# Design Document: Object Storage with Bunny CDN
This is a technical design document for implementing object (e.g. audio file) storage with Bunny CDN. This directory (`api/docs`) contains other similar files, notably `architecture.md` and `domain.md`. When you have worked through the change described here, please update `architecture.md`.
## The problem
Language Learning App has audio as a core component, which requires files to be delivered to the end user. When developing locally, these files have been stored in a MinIO service, mimicking an S3-like storage bucket.
Using this approach on a deployed instance (e.g. on a VPS using Docker) would result in high bandwidth and therefore a high cost. Using a dedicated, EU-based service like Bunny allows us to offload the delivery of content to a third party at reduced cost (great!).
## The current implementation
Object storage was one of the first features built into this software, while it was still in its MVP state; as such, it does not fit within the current architecture.
Right now `api/app/storage.py` contains some helper functions, notably the `upload_audio` and `download_audio` functions.
Users (through the web client) retrieve the media through two URLs (detailed in `api/app/routers/media.py`):
- `GET /media/adventure-audio/{filename:path}` for the choose-your-own-adventure file names
- `GET /media/{filename:path}`, used for the summary transcriptions
## The solution
We are going to use Bunny (bunny.net) as the CDN for all objects in deployed environments (right now, just production — in the future preprod or staging may exist).
Locally, for development purposes, we retain the use of MinIO. To decide which backend to use, we introduce an environment variable `STORAGE_PROVIDER` with a default value of `local` and an accepted alternative of `bunny`.
In situations where we use `local`, the existing `/media/..` proxy endpoints are returned when constructing audio URLs (e.g. in `api/app/routers/bff/articles.py` and `api/app/routers/bff/adventure.py`). When we use `bunny`, the Bunny CDN URL is returned directly so the request is never proxied through our service.
### Client interface
We will create a `BunnyClient` in `api/app/outbound/bunny/bunny_client.py` and extract the current MinIO logic into a `MinioClient` in `api/app/outbound/minio/minio_client.py`. Both implement a shared `StorageClient` protocol.
The interface is **generic** — the clients are storage adapters and must not encode domain concepts. Path construction (which directory, which filename) is the responsibility of the caller (the service layer), not the client.
```python
class StorageClient(Protocol):
def upload(self, path: str, data: bytes) -> bool: ...
def get_url(self, path: str) -> str: ...
def delete(self, path: str) -> bool: ...
```
Services construct paths using hardcoded directory prefixes (e.g. `"adventure-audio/"`, `"audio/"`). These are constants, not environment variables — they are not environment-specific and do not belong in config.
### Factory and instantiation
A factory function reads `STORAGE_PROVIDER` and returns the appropriate `StorageClient` implementation. The client is instantiated **once at app startup** (e.g. in `main.py`) as a module-level singleton — not per-request. This is consistent with how other outbound clients (`AnthropicClient`, `GeminiClient`, etc.) are handled.
### Bunny configuration
Bunny requires the following environment variables:
- `BUNNY_ZONE` — the storage zone name (the zone `languagelearningapp` has been created in the Bunny UI). No "DEFAULT" suffix; there is one zone.
- `BUNNY_API_KEY` — the Bunny API key for upload/delete operations.
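- `BUNNY_CDN_BASE_URL` — the public CDN hostname used to construct delivery URLs.
- `BUNNY_TOKEN_AUTH_KEY` — the security key used to sign CDN URLs (see "Signed vs. public URLs" below).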
### Signed vs. public URLs
Because audio files are user-specific (i.e. one user should not be able to use another user's audio URL), Bunny signed URLs are required. Public CDN URLs, by contrast, can be used by anyone who has the link.
Bunny's own documentation recommends its `token.py` helper for this:
```py
from token import sign_url
url = sign_url(
"https://myzone.b-cdn.net/videos/stream1/playlist.m3u8",
"your-security-key",
expiration_time=3600,
is_directory=True,
path_allowed="/videos/stream1/",
countries_allowed="GB",
)
```
`get_url(path)` on the `BunnyClient` must generate a time-limited (pick a sensible default for audio content here) signed URL using the Bunny Token Authentication feature. The MinIO implementation would use pre-signed S3 URLs for consistency.
Create a sibling method, `get_public_url`, that explicitly creates public URLs for any future public content.
### Misc
`pcm_to_wav()` currently lives in `api/app/storage.py` but is a Gemini output concern. Move it to the Gemini client module (`api/app/outbound/gemini/`) when carrying out this refactor.

View file

@ -146,7 +146,6 @@ def parse_llm_response(text: str) -> tuple[str, list[tuple[str, str]], str]:
app/domain/models/adventure.py app/domain/models/adventure.py
app/domain/services/adventure_service.py app/domain/services/adventure_service.py
app/routers/api/adventures.py app/routers/api/adventures.py
app/routers/bff/adventures.py
app/outbound/postgres/entities/adventure_entities.py app/outbound/postgres/entities/adventure_entities.py
app/outbound/postgres/repositories/adventure_repository.py app/outbound/postgres/repositories/adventure_repository.py
alembic/versions/20260503_0016_add_choose_your_own_adventure.py alembic/versions/20260503_0016_add_choose_your_own_adventure.py
@ -158,7 +157,6 @@ Modified files:
``` ```
app/outbound/anthropic/anthropic_client.py (add 2 methods) app/outbound/anthropic/anthropic_client.py (add 2 methods)
app/routers/api/main.py (register router) app/routers/api/main.py (register router)
app/routers/bff/main.py (register router)
``` ```
--- ---

80
docker-compose-dev.yml Normal file
View file

@ -0,0 +1,80 @@
services:
# PostgreSQL 16 for local development, published on host port 5432.
db:
  image: postgres:16-alpine
  environment:
    # User and database name fall back to "langlearn"; the password has no fallback.
    POSTGRES_USER: ${POSTGRES_USER:-langlearn}
    POSTGRES_PASSWORD: ${POSTGRES_PASSWORD}
    POSTGRES_DB: ${POSTGRES_DB:-langlearn}
  volumes:
    # Named volume, declared under the top-level `volumes:` key.
    - pgdata:/var/lib/postgresql/data
  ports:
    - "5432:5432"
  healthcheck:
    # The api service waits on this check (depends_on: service_healthy).
    test: ["CMD-SHELL", "pg_isready -U ${POSTGRES_USER:-langlearn}"]
    interval: 5s
    timeout: 5s
    retries: 10
# MinIO object store for local development (S3 API on 9000, console on 9001).
storage:
  image: minio/minio:latest
  command: server /data --console-address ":9001"
  environment:
    # Root credentials reuse the STORAGE_* values the api service connects with.
    MINIO_ROOT_USER: ${STORAGE_ACCESS_KEY:-langlearn}
    MINIO_ROOT_PASSWORD: ${STORAGE_SECRET_KEY}
  ports:
    - "9000:9000"
    - "9001:9001"
  volumes:
    - storagedata:/data
  healthcheck:
    # Use the bundled MinIO client rather than curl: current minio/minio
    # images do not ship curl, so a curl-based check never reports healthy
    # and the api service (depends_on: service_healthy) would not start.
    test: ["CMD", "mc", "ready", "local"]
    interval: 5s
    timeout: 5s
    retries: 10
# Backend API for local development: source is bind-mounted and uvicorn runs with --reload.
api:
  build: ./api
  volumes:
    - ./api:/app:z
  ports:
    - "${API_PORT:-8000}:8000"
  command: uvicorn app.main:app --host 0.0.0.0 --port 8000 --reload
  environment:
    DATABASE_URL: postgresql+asyncpg://${POSTGRES_USER:-langlearn}:${POSTGRES_PASSWORD}@db:5432/${POSTGRES_DB:-langlearn}
    ADMIN_USER_EMAILS: ${ADMIN_USER_EMAILS:-wilson@thomaswilson.xyz}
    API_BASE_URL: ${API_BASE_URL:-http://localhost:8000}
    JWT_SECRET: ${JWT_SECRET}
    ANTHROPIC_API_KEY: ${ANTHROPIC_API_KEY}
    DEEPL_API_KEY: ${DEEPL_API_KEY}
    DEEPGRAM_API_KEY: ${DEEPGRAM_API_KEY}
    GEMINI_API_KEY: ${GEMINI_API_KEY}
    PYTHONPATH: /app
    # "local" selects the MinIO-backed storage client; the endpoint is the
    # `storage` service in this file.
    STORAGE_PROVIDER: local
    STORAGE_ENDPOINT_URL: http://storage:9000
    STORAGE_ACCESS_KEY: ${STORAGE_ACCESS_KEY:-langlearn}
    STORAGE_SECRET_KEY: ${STORAGE_SECRET_KEY}
    STORAGE_BUCKET: ${STORAGE_BUCKET:-langlearn}
    TRANSACTIONAL_EMAIL_PROVIDER: ${TRANSACTIONAL_EMAIL_PROVIDER:-stub}
  depends_on:
    # Start only after both backing services pass their healthchecks.
    db:
      condition: service_healthy
    storage:
      condition: service_healthy
  restart: unless-stopped
# Web frontend; the API base URL is passed in as a build argument.
frontend:
  build:
    context: ./frontend
    args:
      PUBLIC_API_BASE_URL: ${PUBLIC_API_BASE_URL:-http://localhost:8000}
  ports:
    - "${FRONTEND_PORT:-3000}:3000"
  environment:
    ORIGIN: ${ORIGIN:-http://localhost:3000}
  depends_on:
    # Start-order only; no health condition is attached here.
    - api
  restart: unless-stopped
volumes:
pgdata:
storagedata:

83
docker-compose-prod.yml Normal file
View file

@ -0,0 +1,83 @@
services:
# PostgreSQL 16 for production. No host port is published in this file.
db:
  image: postgres:16-alpine
  environment:
    # User and database name fall back to "langlearn"; the password has no fallback.
    POSTGRES_USER: ${POSTGRES_USER:-langlearn}
    POSTGRES_PASSWORD: ${POSTGRES_PASSWORD}
    POSTGRES_DB: ${POSTGRES_DB:-langlearn}
  volumes:
    - pgdata:/var/lib/postgresql/data
  healthcheck:
    # The api service waits on this check (depends_on: service_healthy).
    test: ["CMD-SHELL", "pg_isready -U ${POSTGRES_USER:-langlearn}"]
    interval: 10s
    timeout: 5s
    retries: 10
  restart: unless-stopped
  deploy:
    resources:
      limits:
        cpus: '1'
        memory: 1G
# Backend API for production: two uvicorn workers, no source bind-mount, no --reload.
api:
  build: ./api
  ports:
    - "${API_PORT:-8000}:8000"
  command: uvicorn app.main:app --host 0.0.0.0 --port 8000 --workers 2
  environment:
    DATABASE_URL: postgresql+asyncpg://${POSTGRES_USER:-langlearn}:${POSTGRES_PASSWORD}@db:5432/${POSTGRES_DB:-langlearn}
    # Unlike the dev file, these values have no fallbacks and must come from the environment.
    ADMIN_USER_EMAILS: ${ADMIN_USER_EMAILS}
    API_BASE_URL: ${API_BASE_URL}
    JWT_SECRET: ${JWT_SECRET}
    ANTHROPIC_API_KEY: ${ANTHROPIC_API_KEY}
    DEEPL_API_KEY: ${DEEPL_API_KEY}
    DEEPGRAM_API_KEY: ${DEEPGRAM_API_KEY}
    GEMINI_API_KEY: ${GEMINI_API_KEY}
    PYTHONPATH: /app
    # "bunny" selects the Bunny storage client; this file defines no MinIO service.
    STORAGE_PROVIDER: bunny
    BUNNY_ZONE: ${BUNNY_ZONE}
    BUNNY_API_KEY: ${BUNNY_API_KEY}
    BUNNY_CDN_BASE_URL: ${BUNNY_CDN_BASE_URL}
    BUNNY_TOKEN_AUTH_KEY: ${BUNNY_TOKEN_AUTH_KEY}
    TRANSACTIONAL_EMAIL_PROVIDER: ${TRANSACTIONAL_EMAIL_PROVIDER}
  healthcheck:
    # NOTE(review): assumes curl is installed in the api image and that the
    # app serves GET /health — neither is visible in this file; confirm.
    test: ["CMD-SHELL", "curl -sf http://localhost:8000/health || exit 1"]
    interval: 10s
    timeout: 5s
    retries: 10
    start_period: 20s
  depends_on:
    db:
      condition: service_healthy
  restart: unless-stopped
  deploy:
    resources:
      limits:
        cpus: '1'
        memory: 1G
# Web frontend for production; the API base URL is passed in as a build argument.
frontend:
  build:
    context: ./frontend
    args:
      PUBLIC_API_BASE_URL: ${PUBLIC_API_BASE_URL}
  ports:
    - "${FRONTEND_PORT:-3000}:3000"
  environment:
    ORIGIN: ${ORIGIN}
  depends_on:
    # Waits for the api healthcheck to pass before starting.
    api:
      condition: service_healthy
  restart: unless-stopped
  deploy:
    resources:
      limits:
        cpus: '0.5'
        memory: 256M
volumes:
pgdata:
networks:
default:
name: langlearn

View file

@ -299,6 +299,7 @@
<LatestEntry <LatestEntry
sourceText={latestEntry?.story_text} sourceText={latestEntry?.story_text}
translationText={latestEntry?.translation} translationText={latestEntry?.translation}
storyTextLinguisticData={latestEntry?.story_text_linguistic_data}
audioUrl={latestEntry?.audio_url} audioUrl={latestEntry?.audio_url}
onSelectNextStep={handleNextStepSelect} onSelectNextStep={handleNextStepSelect}
isWaitingForGeneration={$adventureState.ui.isWaitingForGeneration} isWaitingForGeneration={$adventureState.ui.isWaitingForGeneration}

View file

@ -6,6 +6,7 @@
type Props = { type Props = {
sourceText: string | null | undefined; sourceText: string | null | undefined;
translationText: string | null | undefined; translationText: string | null | undefined;
storyTextLinguisticData: Record<string, unknown> | null | undefined;
audioUrl: string | null | undefined; audioUrl: string | null | undefined;
onSelectNextStep: (optionId: string) => Promise<void>; onSelectNextStep: (optionId: string) => Promise<void>;
@ -24,6 +25,7 @@
const { const {
sourceText, sourceText,
translationText, translationText,
storyTextLinguisticData,
audioUrl, audioUrl,
onSelectNextStep, onSelectNextStep,
@ -33,10 +35,222 @@
errorMessage errorMessage
}: Props = $props(); }: Props = $props();
const sourceParagraphs = $derived.by(() => toParagraphs(sourceText)); type LinguisticToken = {
const translationParagraphs = $derived.by(() => toParagraphs(translationText)); text: string;
lemma: string | null;
pos: string | null;
};
let lastClickedParagraphIndex: number | null = $state(null); type TextSegment = {
kind: 'text';
text: string;
};
type WordSegment = {
kind: 'word';
text: string;
lemma: string | null;
pos: string | null;
};
type SentenceSegments = {
key: string;
sourceText: string;
targetText: string;
sourceSegments: Array<TextSegment | WordSegment>;
targetSegments: Array<TextSegment | WordSegment>;
};
type LinguisticParagraph = {
key: string;
sourceText: string;
targetText: string;
sentences: SentenceSegments[];
};
/** Type guard: true for any non-null value whose `typeof` is 'object' (arrays included). */
function isRecord(value: unknown): value is Record<string, unknown> {
	if (value === null) {
		return false;
	}
	return typeof value === 'object';
}
/** Returns `value` unchanged when it is a string, otherwise `null`. */
function asString(value: unknown): string | null {
	if (typeof value === 'string') {
		return value;
	}
	return null;
}
/**
 * Converts an untyped value into a list of tokens.
 *
 * Non-array input yields an empty list. Entries that are not objects, or
 * whose `text` is missing, not a string, or empty, are dropped. `lemma` and
 * `pos` become `null` when they are not strings.
 */
function parseTokens(value: unknown): LinguisticToken[] {
	if (!Array.isArray(value)) {
		return [];
	}

	const parsed: LinguisticToken[] = [];
	for (const candidate of value) {
		if (!isRecord(candidate)) {
			continue;
		}

		const text = asString(candidate.text);
		if (!text) {
			continue;
		}

		parsed.push({
			text,
			lemma: asString(candidate.lemma),
			pos: asString(candidate.pos)
		});
	}
	return parsed;
}
/**
 * Splits `text` into alternating plain-text and word segments.
 *
 * - No tokens: the whole text as one text segment (or nothing if empty).
 * - No text: the tokens as word segments joined by single-space text segments.
 * - Otherwise: each token is located in `text` from the current position
 *   onward; tokens that cannot be found are skipped, and the text between
 *   and after matches is kept as text segments.
 */
function buildSegments(
	text: string,
	tokens: LinguisticToken[]
): Array<TextSegment | WordSegment> {
	const asText = (value: string): TextSegment => ({ kind: 'text', text: value });
	const asWord = (token: LinguisticToken): WordSegment => ({
		kind: 'word',
		text: token.text,
		lemma: token.lemma,
		pos: token.pos
	});

	if (tokens.length === 0) {
		if (text) {
			return [asText(text)];
		}
		return [];
	}

	const result: Array<TextSegment | WordSegment> = [];

	if (!text) {
		tokens.forEach((token, position) => {
			// A single space separates consecutive words; none after the last.
			if (position > 0) {
				result.push(asText(' '));
			}
			result.push(asWord(token));
		});
		return result;
	}

	let consumed = 0;
	for (const token of tokens) {
		const start = text.indexOf(token.text, consumed);
		if (start === -1) {
			continue;
		}

		if (start > consumed) {
			result.push(asText(text.slice(consumed, start)));
		}
		result.push(asWord(token));
		consumed = start + token.text.length;
	}

	if (consumed < text.length) {
		result.push(asText(text.slice(consumed)));
	}

	if (result.length === 0) {
		return [asText(text)];
	}
	return result;
}
/**
 * Turns the raw linguistic payload into render-ready paragraphs.
 *
 * Reads `value.paragraphs`; each object entry may carry `source_text`,
 * `target_text` and a `sentences` array whose entries in turn carry
 * `source_text`, `target_text`, `source_tokens` and `target_tokens`.
 * Non-object paragraphs and sentences are skipped, but indices used in keys
 * still refer to positions in the raw arrays.
 *
 * @param value Raw payload, or null/undefined when none is available.
 * @returns Parsed paragraphs; empty when the payload is absent or has no `paragraphs` array.
 */
function parseLinguisticParagraphs(
	value: Record<string, unknown> | null | undefined
): LinguisticParagraph[] {
	if (!value) {
		return [];
	}

	const rawParagraphs = value.paragraphs;
	if (!Array.isArray(rawParagraphs)) {
		return [];
	}

	// Joins the non-empty sentence texts into a paragraph-level fallback.
	const joinNonEmpty = (parts: string[]): string => parts.filter(Boolean).join(' ');

	const result: LinguisticParagraph[] = [];

	rawParagraphs.forEach((rawParagraph, paragraphIndex) => {
		if (!isRecord(rawParagraph)) {
			return;
		}

		const rawSentences = rawParagraph.sentences;
		const sentences: SentenceSegments[] = [];

		if (Array.isArray(rawSentences)) {
			rawSentences.forEach((rawSentence, sentenceIndex) => {
				if (!isRecord(rawSentence)) {
					return;
				}

				const sourceSentence = asString(rawSentence.source_text) ?? '';
				const targetSentence = asString(rawSentence.target_text) ?? '';
				const sourceTokens = parseTokens(rawSentence.source_tokens);
				const targetTokens = parseTokens(rawSentence.target_tokens);

				sentences.push({
					key: `${paragraphIndex}-${sentenceIndex}`,
					sourceText: sourceSentence,
					targetText: targetSentence,
					sourceSegments: buildSegments(sourceSentence, sourceTokens),
					targetSegments: buildSegments(targetSentence, targetTokens)
				});
			});
		}

		// Paragraph-level text wins when it is a string; otherwise rebuild it from the sentences.
		const sourceText =
			asString(rawParagraph.source_text) ??
			joinNonEmpty(sentences.map((sentence) => sentence.sourceText));
		const targetText =
			asString(rawParagraph.target_text) ??
			joinNonEmpty(sentences.map((sentence) => sentence.targetText));

		if (sentences.length === 0) {
			if (!sourceText && !targetText) {
				// Nothing to show for this paragraph at all.
				return;
			}

			// Text but no sentence breakdown: expose it as one untokenised sentence.
			sentences.push({
				key: `${paragraphIndex}-0`,
				sourceText,
				targetText,
				sourceSegments: sourceText ? [{ kind: 'text', text: sourceText }] : [],
				targetSegments: targetText ? [{ kind: 'text', text: targetText }] : []
			});
		}

		result.push({
			key: `p-${paragraphIndex}`,
			sourceText,
			targetText,
			sentences
		});
	});

	return result;
}
/**
 * Reports whether the text contains at least one Unicode letter or number,
 * i.e. it is more than pure punctuation or whitespace.
 */
function isWordLike(text: string): boolean {
	const letterOrNumber = /[\p{L}\p{N}]/u;
	return text.search(letterOrNumber) !== -1;
}
// Paragraphs parsed from the structured linguistic payload; an empty list when
// the payload is absent or has no `paragraphs` array.
const linguisticParagraphs = $derived.by(() =>
parseLinguisticParagraphs(storyTextLinguisticData)
);
// Paragraph strings for the source pane. When linguistic data exists they are taken
// from each paragraph's `targetText`; otherwise the plain `sourceText` prop is split.
// NOTE(review): the payload's "target" side feeds the source pane (and "source" feeds
// the translation pane) — presumably "target" means the language being learned; confirm.
const sourceParagraphs = $derived.by(() =>
linguisticParagraphs.length > 0
? linguisticParagraphs.map((paragraph) => paragraph.targetText).filter(Boolean)
: toParagraphs(sourceText)
);
// Paragraph strings for the translation pane, using each paragraph's `sourceText`
// when linguistic data exists and the plain `translationText` prop otherwise.
const translationParagraphs = $derived.by(() =>
linguisticParagraphs.length > 0
? linguisticParagraphs.map((paragraph) => paragraph.sourceText).filter(Boolean)
: toParagraphs(translationText)
);
// The word last passed to handleWordClicked, identified by its sentence key and
// surface text; starts as null.
let selectedWord: { sentenceKey: string; text: string } | null = $state(null);
let sourcePane = $state<HTMLDivElement | undefined>(); let sourcePane = $state<HTMLDivElement | undefined>();
let translationPane = $state<HTMLDivElement | undefined>(); let translationPane = $state<HTMLDivElement | undefined>();
let suppressSourceScroll = $state(false); let suppressSourceScroll = $state(false);
@ -101,8 +315,9 @@
}, 20000); }, 20000);
} }
function handleParagraphClicked(paragraphIndex: number) { function handleWordClicked(sentenceKey: string, text: string) {
lastClickedParagraphIndex = paragraphIndex; selectedWord = { sentenceKey, text };
showTranslation();
} }
async function handleNextStepSelect(optionId: string) { async function handleNextStepSelect(optionId: string) {
@ -146,18 +361,45 @@
<div class="pane source-pane"> <div class="pane source-pane">
<div class="latest-entry__pane-body" bind:this={sourcePane} onscroll={handleSourceScroll}> <div class="latest-entry__pane-body" bind:this={sourcePane} onscroll={handleSourceScroll}>
{#if sourceParagraphs.length > 0} {#if sourceParagraphs.length > 0}
{#each sourceParagraphs as paragraph, index (index)} {#if linguisticParagraphs.length > 0}
{#each linguisticParagraphs as paragraph (paragraph.key)}
<p class="paragraph paragraph--text" data-language="source">
{#each paragraph.sentences as sentence (sentence.key)}
<span
class="sentence-chunk"
class:active-sentence={selectedWord?.sentenceKey === sentence.key}
>
{#each sentence.targetSegments as segment, segmentIndex (`${sentence.key}-target-${segmentIndex}`)}
{#if segment.kind === 'word' && isWordLike(segment.text)}
<button <button
type="button" type="button"
class="paragraph" class="word-token"
class:active={lastClickedParagraphIndex === index} class:active={selectedWord?.sentenceKey === sentence.key &&
selectedWord?.text === segment.text}
title={segment.lemma ? `Lemma: ${segment.lemma}` : undefined}
onclick={() => handleWordClicked(sentence.key, segment.text)}
>
{segment.text}
</button>
{:else}
<span>{segment.text}</span>
{/if}
{/each}
</span>
{/each}
</p>
{/each}
{:else}
{#each sourceParagraphs as paragraph, index (index)}
<p
class="paragraph paragraph--text"
data-paragraph-index={index} data-paragraph-index={index}
data-language="source" data-language="source"
onclick={() => handleParagraphClicked(index)}
> >
{paragraph} {paragraph}
</button> </p>
{/each} {/each}
{/if}
{:else} {:else}
<div class="loading-block" role="status" aria-live="polite"> <div class="loading-block" role="status" aria-live="polite">
<p class="loading-block__label">{statusMessage || 'Writing your next entry...'}</p> <p class="loading-block__label">{statusMessage || 'Writing your next entry...'}</p>
@ -177,6 +419,12 @@
</button> </button>
</header> </header>
{#if selectedWord}
<p class="translation-selected-word" role="status" aria-live="polite">
Selected word: <strong>{selectedWord.text}</strong>
</p>
{/if}
{#if translationVisible} {#if translationVisible}
<div <div
class="latest-entry__pane-body" class="latest-entry__pane-body"
@ -184,18 +432,42 @@
onscroll={handleTranslationScroll} onscroll={handleTranslationScroll}
> >
{#if translationParagraphs.length > 0} {#if translationParagraphs.length > 0}
{#if linguisticParagraphs.length > 0}
{#each linguisticParagraphs as paragraph (paragraph.key)}
<p class="paragraph paragraph--text" data-language="translation">
{#each paragraph.sentences as sentence (sentence.key)}
<span
class="sentence-chunk"
class:active-sentence={selectedWord?.sentenceKey === sentence.key}
>
{#each sentence.sourceSegments as segment, segmentIndex (`${sentence.key}-source-${segmentIndex}`)}
{#if segment.kind === 'word'}
<span
class="word-token word-token--passive"
class:active={selectedWord?.sentenceKey === sentence.key &&
selectedWord?.text === segment.text}
>
{segment.text}
</span>
{:else}
<span>{segment.text}</span>
{/if}
{/each}
</span>
{/each}
</p>
{/each}
{:else}
{#each translationParagraphs as paragraph, index (index)} {#each translationParagraphs as paragraph, index (index)}
<button <p
type="button" class="paragraph paragraph--text"
class="paragraph"
class:active={lastClickedParagraphIndex === index}
data-paragraph-index={index} data-paragraph-index={index}
data-language="translation" data-language="translation"
onclick={() => handleParagraphClicked(index)}
> >
{paragraph} {paragraph}
</button> </p>
{/each} {/each}
{/if}
{:else} {:else}
<div class="loading-block" role="status" aria-live="polite"> <div class="loading-block" role="status" aria-live="polite">
<p class="loading-block__label"> <p class="loading-block__label">
@ -511,6 +783,18 @@
color: color-mix(in srgb, var(--color-on-surface) 72%, transparent); color: color-mix(in srgb, var(--color-on-surface) 72%, transparent);
} }
.translation-selected-word {
margin: 0;
padding: 0 var(--latest-entry-pane-padding) var(--space-2);
font-size: var(--text-label-md);
color: color-mix(in srgb, var(--color-on-surface) 84%, transparent);
}
.translation-selected-word strong {
font-weight: var(--weight-semibold);
color: var(--color-primary);
}
.latest-entry__pane-body::-webkit-scrollbar { .latest-entry__pane-body::-webkit-scrollbar {
width: 0.75rem; width: 0.75rem;
} }
@ -542,9 +826,58 @@
transition: background-color var(--duration-fast) var(--ease-standard); transition: background-color var(--duration-fast) var(--ease-standard);
} }
.paragraph.active { .paragraph--text {
background-color: color-mix(in srgb, var(--color-primary-container) 56%, transparent); margin: 0;
border-radius: var(--radius-md); padding: 0;
border: none;
background: transparent;
cursor: default;
white-space: pre-wrap;
}
.sentence-chunk {
display: inline;
border-radius: var(--radius-sm);
transition: background-color var(--duration-fast) var(--ease-standard);
&::after {
content: '';
display: inline-block;
width: 0.85ch;
}
}
.sentence-chunk.active-sentence {
background-color: color-mix(in srgb, var(--color-primary-container) 32%, transparent);
}
.word-token {
display: inline;
padding: 0;
margin: 0;
border: none;
background: transparent;
color: inherit;
font: inherit;
line-height: inherit;
cursor: pointer;
}
.word-token:hover {
text-decoration: underline;
}
.word-token.active {
color: var(--color-primary);
text-decoration: underline;
}
.word-token--passive {
cursor: default;
}
.word-token--passive:hover {
text-decoration: none;
} }
.paragraph:focus-visible { .paragraph:focus-visible {

View file

@ -4,6 +4,7 @@ export type AdventureEntry = {
id: string; id: string;
story_text: string | null; story_text: string | null;
translation: string | null; translation: string | null;
story_text_linguistic_data: Record<string, unknown> | null;
audio_url: string | null; audio_url: string | null;
generated_from_choice_id: string | null; generated_from_choice_id: string | null;
possible_choices: { id: string; text: string }[] | null; possible_choices: { id: string; text: string }[] | null;