Compare commits
2 commits
941396fc60
...
9b9bdc3a39
| Author | SHA1 | Date | |
|---|---|---|---|
| 9b9bdc3a39 | |||
| 293a8ab3f9 |
24 changed files with 858 additions and 123 deletions
|
|
@ -15,10 +15,16 @@ class Settings(BaseSettings):
|
||||||
scaleway_tem_project_id: str = ""
|
scaleway_tem_project_id: str = ""
|
||||||
scaleway_tem_from_address: str = ""
|
scaleway_tem_from_address: str = ""
|
||||||
scaleway_tem_region: str = "fr-par"
|
scaleway_tem_region: str = "fr-par"
|
||||||
storage_endpoint_url: str
|
storage_provider: str = "local" # or 'bunny'
|
||||||
storage_access_key: str
|
storage_endpoint_url: str = ""
|
||||||
storage_secret_key: str
|
storage_access_key: str = ""
|
||||||
|
storage_secret_key: str = ""
|
||||||
storage_bucket: str = "langlearn"
|
storage_bucket: str = "langlearn"
|
||||||
|
bunny_zone: str = "languagelearningapp"
|
||||||
|
bunny_api_key: str = ""
|
||||||
|
bunny_cdn_base_url: str = ""
|
||||||
|
bunny_token_auth_key: str = ""
|
||||||
|
bunny_storage_endpoint: str = "https://storage.bunnycdn.com"
|
||||||
stub_generation: bool = False
|
stub_generation: bool = False
|
||||||
|
|
||||||
model_config = {"env_file": ".env"}
|
model_config = {"env_file": ".env"}
|
||||||
|
|
|
||||||
|
|
@ -24,7 +24,7 @@ from ...outbound.postgres.repositories.adventure_repository import (
|
||||||
PostgresAdventureRepository,
|
PostgresAdventureRepository,
|
||||||
)
|
)
|
||||||
from ...outbound.spacy.spacy_client import SpacyClient
|
from ...outbound.spacy.spacy_client import SpacyClient
|
||||||
from ...storage import upload_audio
|
from ...outbound.storage_client import get_storage_client
|
||||||
from ..models.adventure import (
|
from ..models.adventure import (
|
||||||
Adventure,
|
Adventure,
|
||||||
AdventureEntry,
|
AdventureEntry,
|
||||||
|
|
@ -251,7 +251,7 @@ class AdventureService:
|
||||||
for sent_idx, target_sent in enumerate(target_nlp["sentences"]):
|
for sent_idx, target_sent in enumerate(target_nlp["sentences"]):
|
||||||
t0 = time.monotonic()
|
t0 = time.monotonic()
|
||||||
translated_sentence = await self.deepl_client.translate(
|
translated_sentence = await self.deepl_client.translate(
|
||||||
target_sent["text"], adventure.source_language
|
target_sent["text"], adventure.source_language, paragraph_text
|
||||||
)
|
)
|
||||||
timing_translations += time.monotonic() - t0
|
timing_translations += time.monotonic() - t0
|
||||||
|
|
||||||
|
|
@ -315,7 +315,7 @@ class AdventureService:
|
||||||
# ── File upload ───────────────────────────────────────────────────
|
# ── File upload ───────────────────────────────────────────────────
|
||||||
t0 = time.monotonic()
|
t0 = time.monotonic()
|
||||||
audio_key = f"adventure-audio/{entry_id}.wav"
|
audio_key = f"adventure-audio/{entry_id}.wav"
|
||||||
upload_audio(audio_key, wav_bytes)
|
get_storage_client().upload(audio_key, wav_bytes)
|
||||||
timing_file_uploading = time.monotonic() - t0
|
timing_file_uploading = time.monotonic() - t0
|
||||||
|
|
||||||
await self.audio_repo.create(
|
await self.audio_repo.create(
|
||||||
|
|
|
||||||
|
|
@ -14,7 +14,7 @@ from ...outbound.deepgram.deepgram_client import LocalDeepgramClient
|
||||||
from ...outbound.deepl.deepl_client import DeepLClient
|
from ...outbound.deepl.deepl_client import DeepLClient
|
||||||
from ...outbound.gemini.gemini_client import GeminiClient
|
from ...outbound.gemini.gemini_client import GeminiClient
|
||||||
from ...outbound.spacy.spacy_client import SpacyClient
|
from ...outbound.spacy.spacy_client import SpacyClient
|
||||||
from ...storage import upload_audio
|
from ...outbound.storage_client import get_storage_client
|
||||||
from ...languages import SUPPORTED_LANGUAGES
|
from ...languages import SUPPORTED_LANGUAGES
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -137,7 +137,7 @@ class SummariseService:
|
||||||
voice = self.gemini_client.get_voice_by_language(target_language)
|
voice = self.gemini_client.get_voice_by_language(target_language)
|
||||||
wav_bytes = await self.gemini_client.generate_audio(generated_text, voice)
|
wav_bytes = await self.gemini_client.generate_audio(generated_text, voice)
|
||||||
audio_key = f"audio/{job_id}.wav"
|
audio_key = f"audio/{job_id}.wav"
|
||||||
upload_audio(audio_key, wav_bytes)
|
get_storage_client().upload(audio_key, wav_bytes)
|
||||||
|
|
||||||
transcript = await self.deepgram_client.transcribe_bytes(wav_bytes, target_language)
|
transcript = await self.deepgram_client.transcribe_bytes(wav_bytes, target_language)
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -8,13 +8,13 @@ from .routers.api import jobs
|
||||||
from .routers import media as media_router
|
from .routers import media as media_router
|
||||||
from .routers.api.main import api_router
|
from .routers.api.main import api_router
|
||||||
from .routers.bff.main import bff_router
|
from .routers.bff.main import bff_router
|
||||||
from .storage import ensure_bucket_exists
|
from .outbound.storage_factory import init_storage
|
||||||
from . import worker
|
from . import worker
|
||||||
|
|
||||||
|
|
||||||
@asynccontextmanager
|
@asynccontextmanager
|
||||||
async def lifespan(app: FastAPI):
|
async def lifespan(app: FastAPI):
|
||||||
ensure_bucket_exists()
|
init_storage()
|
||||||
worker_task = asyncio.create_task(worker.worker_loop())
|
worker_task = asyncio.create_task(worker.worker_loop())
|
||||||
yield
|
yield
|
||||||
worker_task.cancel()
|
worker_task.cancel()
|
||||||
|
|
|
||||||
0
api/app/outbound/bunny/__init__.py
Normal file
0
api/app/outbound/bunny/__init__.py
Normal file
77
api/app/outbound/bunny/bunny_client.py
Normal file
77
api/app/outbound/bunny/bunny_client.py
Normal file
|
|
@ -0,0 +1,77 @@
|
||||||
|
import base64
|
||||||
|
import hashlib
|
||||||
|
import time
|
||||||
|
import urllib.error
|
||||||
|
import urllib.request
|
||||||
|
|
||||||
|
_SIGNED_URL_EXPIRY_SECONDS = 3600
# Socket timeout (seconds) applied to every Storage API request. urlopen()
# falls back to the global default socket timeout when none is given, which
# is normally "no timeout" and lets a stalled connection block forever.
_REQUEST_TIMEOUT_SECONDS = 30.0


class BunnyClient:
    """Storage adapter for Bunny.

    Writes and deletes go to the Bunny Storage HTTP API; reads are never
    proxied — callers hand out CDN URLs produced by ``get_url`` (signed,
    time-limited) or ``get_public_url`` (unsigned).

    The client is generic: ``path`` arguments are raw (not percent-encoded)
    object keys such as ``"audio/<id>.wav"`` chosen by the caller.
    """

    def __init__(
        self,
        zone: str,
        api_key: str,
        cdn_base_url: str,
        token_auth_key: str,
        storage_endpoint: str = "https://storage.bunnycdn.com",
        timeout: float = _REQUEST_TIMEOUT_SECONDS,
    ) -> None:
        """Store connection settings.

        Args:
            zone: Storage zone name, used as the first URL segment.
            api_key: Value sent in the ``AccessKey`` header.
            cdn_base_url: Base URL used to build delivery URLs; a trailing
                slash is stripped.
            token_auth_key: Secret used to sign URLs in ``get_url``.
            storage_endpoint: Storage API base URL; a trailing slash is
                stripped.
            timeout: Socket timeout in seconds for Storage API requests.
        """
        self._zone = zone
        self._api_key = api_key
        self._cdn_base_url = cdn_base_url.rstrip("/")
        self._token_auth_key = token_auth_key
        self._storage_endpoint = storage_endpoint.rstrip("/")
        self._timeout = timeout

    def _storage_url(self, path: str) -> str:
        """Return the Storage API URL for ``path`` inside this zone.

        The key is percent-encoded (``/`` is kept) so that spaces or
        non-ASCII characters produce a valid request URL. Keys made only of
        URL-safe characters are unchanged.
        """
        from urllib.parse import quote

        return f"{self._storage_endpoint}/{self._zone}/{quote(path.lstrip('/'))}"

    def _send(self, req: urllib.request.Request, expected_status: int) -> bool:
        """Send ``req`` and report whether the response has ``expected_status``.

        HTTP error responses are reported as ``False``. Connection-level
        failures (including timeouts) are not caught and propagate to the
        caller, as they did before.
        """
        try:
            with urllib.request.urlopen(req, timeout=self._timeout) as resp:
                return resp.status == expected_status
        except urllib.error.HTTPError:
            return False

    def upload(self, path: str, data: bytes) -> bool:
        """PUT ``data`` to ``path``; return True only on a 201 response."""
        req = urllib.request.Request(
            self._storage_url(path),
            data=data,
            method="PUT",
            headers={
                "AccessKey": self._api_key,
                "Content-Type": "audio/wav",
            },
        )
        return self._send(req, 201)

    def get_url(self, path: str) -> str:
        """Return a signed CDN URL for ``path`` valid for one hour.

        The token is the SHA-256 digest of ``key + "/path" + expiry``,
        Base64-encoded with the URL-safe alphabet and without ``=`` padding.
        """
        url_path = f"/{path.lstrip('/')}"
        expiration = int(time.time()) + _SIGNED_URL_EXPIRY_SECONDS
        digest = hashlib.sha256(
            (self._token_auth_key + url_path + str(expiration)).encode()
        ).digest()
        # urlsafe_b64encode maps "+" -> "-" and "/" -> "_"; "=" only ever
        # appears as trailing padding, so rstrip removes every occurrence.
        token = base64.urlsafe_b64encode(digest).decode().rstrip("=")
        return f"{self._cdn_base_url}{url_path}?token={token}&expires={expiration}"

    def get_public_url(self, path: str) -> str:
        """Return the unsigned CDN URL for ``path``."""
        return f"{self._cdn_base_url}/{path.lstrip('/')}"

    def delete(self, path: str) -> bool:
        """DELETE ``path``; return True only on a 200 response."""
        req = urllib.request.Request(
            self._storage_url(path),
            method="DELETE",
            headers={"AccessKey": self._api_key},
        )
        return self._send(req, 200)

    def download(self, path: str) -> tuple[bytes, str]:
        """Always raise: content is served from the CDN, not through this client.

        Raises:
            NotImplementedError: on every call.
        """
        raise NotImplementedError(
            "Direct download not available with Bunny — use get_url() to obtain a signed CDN URL"
        )
|
||||||
|
|
@ -1,9 +1,19 @@
|
||||||
import asyncio
|
import asyncio
|
||||||
|
import io
|
||||||
|
import wave
|
||||||
|
|
||||||
from google import genai
|
from google import genai
|
||||||
from google.genai import types as genai_types
|
from google.genai import types as genai_types
|
||||||
|
|
||||||
from ...storage import pcm_to_wav
|
|
||||||
|
def _pcm_to_wav(pcm_data: bytes, sample_rate: int = 24000) -> bytes:
    """Wrap raw 16-bit mono PCM data in a WAV container.

    Args:
        pcm_data: Raw PCM frames, written to the container unchanged.
        sample_rate: Frame rate recorded in the WAV header.

    Returns:
        The complete WAV file as bytes.
    """
    container = io.BytesIO()
    writer = wave.open(container, "wb")
    try:
        writer.setnchannels(1)  # mono
        writer.setsampwidth(2)  # 2 bytes per sample = 16-bit
        writer.setframerate(sample_rate)
        writer.writeframes(pcm_data)
    finally:
        # Closing finalises the header; the BytesIO itself stays readable.
        writer.close()
    return container.getvalue()
|
||||||
|
|
||||||
VOICE_BY_LANGUAGE: dict[str, str] = {
|
VOICE_BY_LANGUAGE: dict[str, str] = {
|
||||||
"fr": "Kore",
|
"fr": "Kore",
|
||||||
|
|
@ -47,6 +57,6 @@ class GeminiClient():
|
||||||
),
|
),
|
||||||
)
|
)
|
||||||
pcm_data = response.candidates[0].content.parts[0].inline_data.data
|
pcm_data = response.candidates[0].content.parts[0].inline_data.data
|
||||||
return pcm_to_wav(pcm_data)
|
return _pcm_to_wav(pcm_data)
|
||||||
|
|
||||||
return await asyncio.to_thread(_call)
|
return await asyncio.to_thread(_call)
|
||||||
|
|
|
||||||
0
api/app/outbound/minio/__init__.py
Normal file
0
api/app/outbound/minio/__init__.py
Normal file
70
api/app/outbound/minio/minio_client.py
Normal file
70
api/app/outbound/minio/minio_client.py
Normal file
|
|
@ -0,0 +1,70 @@
|
||||||
|
import boto3
|
||||||
|
from botocore.exceptions import ClientError
|
||||||
|
|
||||||
|
|
||||||
|
class MinioClient:
    """Storage adapter for an S3-compatible endpoint (MinIO in local dev).

    Objects are stored in a single bucket. URLs returned by ``get_url`` and
    ``get_public_url`` point at this API's ``/media/`` route rather than at
    the storage endpoint itself.
    """

    def __init__(
        self,
        endpoint_url: str,
        access_key: str,
        secret_key: str,
        bucket: str,
        api_base_url: str,
    ) -> None:
        """Store connection settings; no network call is made here.

        Args:
            endpoint_url: S3 endpoint URL passed to boto3.
            access_key: Access key ID passed to boto3.
            secret_key: Secret access key passed to boto3.
            bucket: Bucket that holds every object.
            api_base_url: Base URL of this API, used to build ``/media/``
                URLs; a trailing slash is stripped.
        """
        self._endpoint_url = endpoint_url
        self._access_key = access_key
        self._secret_key = secret_key
        self._bucket = bucket
        self._api_base_url = api_base_url.rstrip("/")
        # Created on first use and then reused: building a boto3 client for
        # every upload/download repeats avoidable setup work.
        self._client = None

    def _s3(self):
        """Return the shared boto3 S3 client, creating it on first use."""
        if self._client is None:
            self._client = boto3.client(
                "s3",
                endpoint_url=self._endpoint_url,
                aws_access_key_id=self._access_key,
                aws_secret_access_key=self._secret_key,
            )
        return self._client

    def ensure_bucket_exists(self) -> None:
        """Create the bucket if a HEAD request reports it missing.

        Raises:
            ClientError: for any HEAD failure other than "404"/"NoSuchBucket".
        """
        client = self._s3()
        try:
            client.head_bucket(Bucket=self._bucket)
        except ClientError as e:
            if e.response["Error"]["Code"] in ("404", "NoSuchBucket"):
                client.create_bucket(Bucket=self._bucket)
            else:
                raise

    def upload(self, path: str, data: bytes) -> bool:
        """Store ``data`` under key ``path``; return False on a ClientError."""
        try:
            self._s3().put_object(
                Bucket=self._bucket,
                Key=path,
                Body=data,
                ContentType="audio/wav",
            )
            return True
        except ClientError:
            return False

    def get_url(self, path: str) -> str:
        """Return the ``/media/`` proxy URL for ``path``."""
        return f"{self._api_base_url}/media/{path}"

    def get_public_url(self, path: str) -> str:
        """Return the ``/media/`` proxy URL for ``path`` (same as ``get_url``)."""
        return f"{self._api_base_url}/media/{path}"

    def delete(self, path: str) -> bool:
        """Delete key ``path``; return False on a ClientError."""
        try:
            self._s3().delete_object(Bucket=self._bucket, Key=path)
            return True
        except ClientError:
            return False

    def download(self, path: str) -> tuple[bytes, str]:
        """Return ``(object_bytes, content_type)`` for key ``path``.

        The content type falls back to ``"audio/wav"`` when the response
        carries none.

        Raises:
            FileNotFoundError: when the error code is "NoSuchKey" or "404".
            ClientError: for any other storage error.
        """
        try:
            response = self._s3().get_object(Bucket=self._bucket, Key=path)
            return response["Body"].read(), response.get("ContentType", "audio/wav")
        except ClientError as e:
            if e.response["Error"]["Code"] in ("NoSuchKey", "404"):
                # Chain the cause so the original S3 error stays in the traceback.
                raise FileNotFoundError(path) from e
            raise
|
||||||
21
api/app/outbound/storage_client.py
Normal file
21
api/app/outbound/storage_client.py
Normal file
|
|
@ -0,0 +1,21 @@
|
||||||
|
from typing import Protocol
|
||||||
|
|
||||||
|
# Process-wide storage client, installed once by _set_storage_client().
_client: "StorageClient | None" = None


class StorageClient(Protocol):
    """Structural interface shared by the storage adapters.

    ``path`` is an object key chosen by the caller; adapters do not encode
    domain concepts such as directory prefixes.
    """

    def upload(self, path: str, data: bytes) -> bool:
        """Store ``data`` at ``path``; return whether the write succeeded."""
        ...

    def get_url(self, path: str) -> str:
        """Return the URL clients should use to fetch ``path``."""
        ...

    def get_public_url(self, path: str) -> str:
        """Return an unsigned URL for ``path``."""
        ...

    def delete(self, path: str) -> bool:
        """Remove ``path``; return whether the delete succeeded."""
        ...

    def download(self, path: str) -> tuple[bytes, str]:
        """Return ``(file_bytes, content_type)`` for ``path``."""
        ...


def get_storage_client() -> "StorageClient":
    """Return the storage client installed at startup.

    Raises:
        RuntimeError: if no client has been installed yet. An explicit raise
            is used instead of ``assert`` because assertions are removed
            under ``python -O``, which would let ``None`` leak to callers.
    """
    if _client is None:
        raise RuntimeError("Storage client not initialised — call init_storage() at startup")
    return _client


def _set_storage_client(c: "StorageClient") -> None:
    """Install ``c`` as the process-wide storage client."""
    global _client
    _client = c
|
||||||
27
api/app/outbound/storage_factory.py
Normal file
27
api/app/outbound/storage_factory.py
Normal file
|
|
@ -0,0 +1,27 @@
|
||||||
|
from ..config import settings
|
||||||
|
from .storage_client import StorageClient, _set_storage_client
|
||||||
|
from .minio.minio_client import MinioClient
|
||||||
|
from .bunny.bunny_client import BunnyClient
|
||||||
|
|
||||||
|
|
||||||
|
def init_storage() -> None:
    """Build the configured storage client and install it as the singleton.

    ``settings.storage_provider == "bunny"`` selects ``BunnyClient``. Any
    other value selects ``MinioClient``, whose bucket is created if missing
    before the client is installed.
    """
    if settings.storage_provider == "bunny":
        _set_storage_client(
            BunnyClient(
                zone=settings.bunny_zone,
                api_key=settings.bunny_api_key,
                cdn_base_url=settings.bunny_cdn_base_url,
                token_auth_key=settings.bunny_token_auth_key,
                storage_endpoint=settings.bunny_storage_endpoint,
            )
        )
        return

    local_client = MinioClient(
        endpoint_url=settings.storage_endpoint_url,
        access_key=settings.storage_access_key,
        secret_key=settings.storage_secret_key,
        bucket=settings.storage_bucket,
        api_base_url=settings.api_base_url,
    )
    # Bucket check happens before installation, so a failure here leaves
    # the singleton unset.
    local_client.ensure_bucket_exists()
    _set_storage_client(local_client)
|
||||||
|
|
@ -12,7 +12,7 @@ from ...outbound.postgres.repositories import summarise_job_repository
|
||||||
from ...outbound.postgres.repositories.translated_article_repository import TranslatedArticleRepository
|
from ...outbound.postgres.repositories.translated_article_repository import TranslatedArticleRepository
|
||||||
from ...outbound.postgres.entities.translated_article_entity import TranslatedArticleEntity
|
from ...outbound.postgres.entities.translated_article_entity import TranslatedArticleEntity
|
||||||
from ...outbound.gemini.gemini_client import GeminiClient
|
from ...outbound.gemini.gemini_client import GeminiClient
|
||||||
from ...storage import upload_audio
|
from ...outbound.storage_client import get_storage_client
|
||||||
from ...config import settings
|
from ...config import settings
|
||||||
from ... import worker
|
from ... import worker
|
||||||
|
|
||||||
|
|
@ -92,7 +92,7 @@ async def _run_regenerate_audio(job_id: uuid.UUID) -> None:
|
||||||
voice = gemini_client.get_voice_by_language(article_entity.target_language)
|
voice = gemini_client.get_voice_by_language(article_entity.target_language)
|
||||||
wav_bytes = await gemini_client.generate_audio(article_entity.target_body, voice)
|
wav_bytes = await gemini_client.generate_audio(article_entity.target_body, voice)
|
||||||
audio_key = f"audio/{job_id}.wav"
|
audio_key = f"audio/{job_id}.wav"
|
||||||
upload_audio(audio_key, wav_bytes)
|
get_storage_client().upload(audio_key, wav_bytes)
|
||||||
|
|
||||||
await article_repo.update_audio(
|
await article_repo.update_audio(
|
||||||
article_entity.id,
|
article_entity.id,
|
||||||
|
|
|
||||||
|
|
@ -5,8 +5,8 @@ from pydantic import BaseModel
|
||||||
from sqlalchemy.ext.asyncio import AsyncSession
|
from sqlalchemy.ext.asyncio import AsyncSession
|
||||||
|
|
||||||
from ...auth import verify_token
|
from ...auth import verify_token
|
||||||
from ...config import settings
|
|
||||||
from ...outbound.postgres.database import get_db
|
from ...outbound.postgres.database import get_db
|
||||||
|
from ...outbound.storage_client import get_storage_client
|
||||||
from ...outbound.postgres.repositories.adventure_repository import (
|
from ...outbound.postgres.repositories.adventure_repository import (
|
||||||
PostgresAdventureEntryAudioRepository,
|
PostgresAdventureEntryAudioRepository,
|
||||||
PostgresAdventureEntryChoiceRepository,
|
PostgresAdventureEntryChoiceRepository,
|
||||||
|
|
@ -61,7 +61,7 @@ class AdventureDetailResponse(BaseModel):
|
||||||
def _audio_url(key: str | None) -> str | None:
|
def _audio_url(key: str | None) -> str | None:
|
||||||
if key is None:
|
if key is None:
|
||||||
return None
|
return None
|
||||||
return f"{settings.api_base_url}/media/{key}"
|
return get_storage_client().get_url(key)
|
||||||
|
|
||||||
|
|
||||||
@router.get("/{adventure_id}", response_model=AdventureDetailResponse, status_code=200)
|
@router.get("/{adventure_id}", response_model=AdventureDetailResponse, status_code=200)
|
||||||
|
|
|
||||||
|
|
@ -6,8 +6,8 @@ from pydantic import BaseModel
|
||||||
from sqlalchemy.ext.asyncio import AsyncSession
|
from sqlalchemy.ext.asyncio import AsyncSession
|
||||||
|
|
||||||
from ...auth import verify_token
|
from ...auth import verify_token
|
||||||
from ...config import settings
|
|
||||||
from ...outbound.postgres.database import get_db
|
from ...outbound.postgres.database import get_db
|
||||||
|
from ...outbound.storage_client import get_storage_client
|
||||||
from ...outbound.postgres.repositories.translated_article_repository import TranslatedArticleRepository
|
from ...outbound.postgres.repositories.translated_article_repository import TranslatedArticleRepository
|
||||||
|
|
||||||
router = APIRouter(prefix="/articles", tags=["bff", "articles"])
|
router = APIRouter(prefix="/articles", tags=["bff", "articles"])
|
||||||
|
|
@ -46,7 +46,7 @@ class ArticleDetail(BaseModel):
|
||||||
def _audio_url(key: str | None) -> str | None:
|
def _audio_url(key: str | None) -> str | None:
|
||||||
if key is None:
|
if key is None:
|
||||||
return None
|
return None
|
||||||
return f"{settings.api_base_url}/media/{key}"
|
return get_storage_client().get_url(key)
|
||||||
|
|
||||||
|
|
||||||
@router.get("", response_model=ArticleListResponse, status_code=200)
|
@router.get("", response_model=ArticleListResponse, status_code=200)
|
||||||
|
|
|
||||||
|
|
@ -3,12 +3,11 @@ import uuid
|
||||||
from fastapi import APIRouter, Depends, HTTPException
|
from fastapi import APIRouter, Depends, HTTPException
|
||||||
from fastapi.responses import Response
|
from fastapi.responses import Response
|
||||||
from sqlalchemy.ext.asyncio import AsyncSession
|
from sqlalchemy.ext.asyncio import AsyncSession
|
||||||
from botocore.exceptions import ClientError
|
|
||||||
|
|
||||||
from ..outbound.postgres.database import get_db
|
from ..outbound.postgres.database import get_db
|
||||||
from ..outbound.postgres.repositories.translated_article_repository import TranslatedArticleRepository
|
from ..outbound.postgres.repositories.translated_article_repository import TranslatedArticleRepository
|
||||||
from ..outbound.postgres.repositories.adventure_repository import PostgresAdventureEntryAudioRepository
|
from ..outbound.postgres.repositories.adventure_repository import PostgresAdventureEntryAudioRepository
|
||||||
from ..storage import download_audio
|
from ..outbound.storage_client import get_storage_client
|
||||||
|
|
||||||
router = APIRouter(prefix="/media", tags=["media"])
|
router = APIRouter(prefix="/media", tags=["media"])
|
||||||
|
|
||||||
|
|
@ -23,21 +22,23 @@ async def get_adventure_audio_file(
|
||||||
except ValueError:
|
except ValueError:
|
||||||
raise HTTPException(status_code=400, detail="Invalid file ID")
|
raise HTTPException(status_code=400, detail="Invalid file ID")
|
||||||
|
|
||||||
print(f"Looking for adventure audio with entry ID: {eid}")
|
|
||||||
|
|
||||||
adventure_audio = await PostgresAdventureEntryAudioRepository(db).get_for_entry(entry_id=eid, component_type="story_text")
|
adventure_audio = await PostgresAdventureEntryAudioRepository(db).get_for_entry(entry_id=eid, component_type="story_text")
|
||||||
|
|
||||||
if adventure_audio is None:
|
if adventure_audio is None:
|
||||||
raise HTTPException(status_code=404, detail="File not found")
|
raise HTTPException(status_code=404, detail="File not found")
|
||||||
|
|
||||||
try:
|
try:
|
||||||
audio_bytes, content_type = download_audio("adventure-audio/" + filename)
|
audio_bytes, content_type = get_storage_client().download("adventure-audio/" + filename)
|
||||||
except ClientError as e:
|
except FileNotFoundError:
|
||||||
if e.response["Error"]["Code"] in ("NoSuchKey", "404"):
|
raise HTTPException(status_code=404, detail="File not found")
|
||||||
raise HTTPException(status_code=404, detail="File not found")
|
except NotImplementedError:
|
||||||
|
raise HTTPException(status_code=501, detail="Media proxy not available with current storage provider")
|
||||||
|
except Exception:
|
||||||
raise HTTPException(status_code=500, detail="Storage error")
|
raise HTTPException(status_code=500, detail="Storage error")
|
||||||
|
|
||||||
return Response(content=audio_bytes, media_type=content_type)
|
return Response(content=audio_bytes, media_type=content_type)
|
||||||
|
|
||||||
|
|
||||||
@router.get("/{filename:path}")
|
@router.get("/{filename:path}")
|
||||||
async def get_media_file(
|
async def get_media_file(
|
||||||
filename: str,
|
filename: str,
|
||||||
|
|
@ -49,11 +50,12 @@ async def get_media_file(
|
||||||
raise HTTPException(status_code=404, detail="File not found")
|
raise HTTPException(status_code=404, detail="File not found")
|
||||||
|
|
||||||
try:
|
try:
|
||||||
audio_bytes, content_type = download_audio(filename)
|
audio_bytes, content_type = get_storage_client().download(filename)
|
||||||
except ClientError as e:
|
except FileNotFoundError:
|
||||||
if e.response["Error"]["Code"] in ("NoSuchKey", "404"):
|
raise HTTPException(status_code=404, detail="File not found")
|
||||||
raise HTTPException(status_code=404, detail="File not found")
|
except NotImplementedError:
|
||||||
|
raise HTTPException(status_code=501, detail="Media proxy not available with current storage provider")
|
||||||
|
except Exception:
|
||||||
raise HTTPException(status_code=500, detail="Storage error")
|
raise HTTPException(status_code=500, detail="Storage error")
|
||||||
|
|
||||||
return Response(content=audio_bytes, media_type=content_type)
|
return Response(content=audio_bytes, media_type=content_type)
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -1,56 +0,0 @@
|
||||||
import io
|
|
||||||
import wave
|
|
||||||
|
|
||||||
import boto3
|
|
||||||
from botocore.exceptions import ClientError
|
|
||||||
|
|
||||||
from .config import settings
|
|
||||||
|
|
||||||
|
|
||||||
def get_s3_client():
|
|
||||||
return boto3.client(
|
|
||||||
"s3",
|
|
||||||
endpoint_url=settings.storage_endpoint_url,
|
|
||||||
aws_access_key_id=settings.storage_access_key,
|
|
||||||
aws_secret_access_key=settings.storage_secret_key,
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
def ensure_bucket_exists() -> None:
|
|
||||||
client = get_s3_client()
|
|
||||||
try:
|
|
||||||
client.head_bucket(Bucket=settings.storage_bucket)
|
|
||||||
except ClientError as e:
|
|
||||||
if e.response["Error"]["Code"] in ("404", "NoSuchBucket"):
|
|
||||||
client.create_bucket(Bucket=settings.storage_bucket)
|
|
||||||
else:
|
|
||||||
raise
|
|
||||||
|
|
||||||
|
|
||||||
def pcm_to_wav(pcm_data: bytes, sample_rate: int = 24000) -> bytes:
|
|
||||||
"""Wrap raw 16-bit mono PCM data in a WAV container."""
|
|
||||||
buf = io.BytesIO()
|
|
||||||
with wave.open(buf, "wb") as wf:
|
|
||||||
wf.setnchannels(1)
|
|
||||||
wf.setsampwidth(2) # 16-bit
|
|
||||||
wf.setframerate(sample_rate)
|
|
||||||
wf.writeframes(pcm_data)
|
|
||||||
return buf.getvalue()
|
|
||||||
|
|
||||||
|
|
||||||
def upload_audio(object_key: str, audio_bytes: bytes, content_type: str = "audio/wav") -> None:
|
|
||||||
client = get_s3_client()
|
|
||||||
client.put_object(
|
|
||||||
Bucket=settings.storage_bucket,
|
|
||||||
Key=object_key,
|
|
||||||
Body=audio_bytes,
|
|
||||||
ContentType=content_type,
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
def download_audio(object_key: str) -> tuple[bytes, str]:
|
|
||||||
"""Return (file_bytes, content_type)."""
|
|
||||||
client = get_s3_client()
|
|
||||||
response = client.get_object(Bucket=settings.storage_bucket, Key=object_key)
|
|
||||||
content_type = response.get("ContentType", "audio/wav")
|
|
||||||
return response["Body"].read(), content_type
|
|
||||||
82
api/docs/design-doc-object-storage.md
Normal file
82
api/docs/design-doc-object-storage.md
Normal file
|
|
@ -0,0 +1,82 @@
|
||||||
|
# Design Document: Object Storage with Bunny CDN
|
||||||
|
|
||||||
|
This is a technical design document for implementing object (e.g. audio file) storage with Bunny CDN. This directory (`api/docs`) contains other similar files, notably `architecture.md` and `domain.md`. When you have worked through the change described here, please update `architecture.md`.
|
||||||
|
|
||||||
|
## The problem
|
||||||
|
|
||||||
|
Language Learning App has audio as a core component, which requires files to be delivered to the end user. When developing locally, these files have been stored in a min.io service, mimicking an S3-like storage bucket.
|
||||||
|
|
||||||
|
Using this approach on a deployed instance (e.g. on a VPS using Docker) would result in high bandwidth usage and therefore a high cost. Using a dedicated, EU-based service like Bunny allows us to offload the delivery of content to a third party at reduced cost (great!).
|
||||||
|
|
||||||
|
## The current implementation
|
||||||
|
|
||||||
|
Object storage was one of the first features built into this software in its MVP state; as such, it does not fit within the current architecture.
|
||||||
|
|
||||||
|
Right now `api/app/storage.py` contains some helper functions, notably the `upload_audio` and `download_audio` functions.
|
||||||
|
|
||||||
|
Users (through the web client) retrieve the media through two URLs (detailed in `api/app/routers/media.py`):
|
||||||
|
|
||||||
|
- `GET /media/adventure-audio/{filename:path}` for the choose-your-own-adventure file names
|
||||||
|
- `GET /media/{filename:path}`, used for the summary transcriptions
|
||||||
|
|
||||||
|
## The solution
|
||||||
|
|
||||||
|
We are going to use Bunny (bunny.net) as the CDN for all objects in deployed environments (right now, just production — in the future preprod or staging may exist).
|
||||||
|
|
||||||
|
Locally, for development purposes, we retain the use of MinIO. To decide which backend to use, we introduce an environment variable `STORAGE_PROVIDER` with a default value of `local` and an accepted alternative of `bunny`.
|
||||||
|
|
||||||
|
In situations where we use `local`, the existing `/media/..` proxy endpoints are returned when constructing audio URLs (e.g. in `api/app/routers/bff/articles.py` and `api/app/routers/bff/adventure.py`). When we use `bunny`, the Bunny CDN URL is returned directly so the request is never proxied through our service.
|
||||||
|
|
||||||
|
### Client interface
|
||||||
|
|
||||||
|
We will create a `BunnyClient` in `api/app/outbound/bunny/bunny_client.py` and extract the current MinIO logic into a `MinioClient` in `api/app/outbound/minio/minio_client.py`. Both implement a shared `StorageClient` protocol.
|
||||||
|
|
||||||
|
The interface is **generic** — the clients are storage adapters and must not encode domain concepts. Path construction (which directory, which filename) is the responsibility of the caller (the service layer), not the client.
|
||||||
|
|
||||||
|
```python
|
||||||
|
class StorageClient(Protocol):
|
||||||
|
def upload(self, path: str, data: bytes) -> bool: ...
|
||||||
|
def get_url(self, path: str) -> str: ...
|
||||||
|
def delete(self, path: str) -> bool: ...
|
||||||
|
```
|
||||||
|
|
||||||
|
Services construct paths using hardcoded directory prefixes (e.g. `"adventure-audio/"`, `"audio/"`). These are constants, not environment variables — they are not environment-specific and do not belong in config.
|
||||||
|
|
||||||
|
### Factory and instantiation
|
||||||
|
|
||||||
|
A factory function reads `STORAGE_PROVIDER` and returns the appropriate `StorageClient` implementation. The client is instantiated **once at app startup** (e.g. in `main.py`) as a module-level singleton — not per-request. This is consistent with how other outbound clients (`AnthropicClient`, `GeminiClient`, etc.) are handled.
|
||||||
|
|
||||||
|
### Bunny configuration
|
||||||
|
|
||||||
|
Bunny requires the following environment variables:
|
||||||
|
|
||||||
|
- `BUNNY_ZONE` — the storage zone name (the zone `languagelearningapp` has been created in the Bunny UI). No "DEFAULT" suffix; there is one zone.
|
||||||
|
- `BUNNY_API_KEY` — the Bunny API key for upload/delete operations.
|
||||||
|
- `BUNNY_CDN_BASE_URL` — the public CDN hostname used to construct delivery URLs.
|
||||||
|
|
||||||
|
### Signed vs. public URLs
|
||||||
|
|
||||||
|
Because audio files are user-specific (i.e. one user should not be able to use another user's audio URL), Bunny signed URLs are required. Public CDN URLs, by contrast, are shareable by anyone who has the link.
|
||||||
|
|
||||||
|
Bunny's own documentation recommends the `token.py` package:
|
||||||
|
|
||||||
|
```py
|
||||||
|
from token import sign_url
|
||||||
|
|
||||||
|
url = sign_url(
|
||||||
|
"https://myzone.b-cdn.net/videos/stream1/playlist.m3u8",
|
||||||
|
"your-security-key",
|
||||||
|
expiration_time=3600,
|
||||||
|
is_directory=True,
|
||||||
|
path_allowed="/videos/stream1/",
|
||||||
|
countries_allowed="GB",
|
||||||
|
)
|
||||||
|
```
|
||||||
|
|
||||||
|
`get_url(path)` on the `BunnyClient` must generate a time-limited (pick a sensible default for audio content here) signed URL using the Bunny Token Authentication feature. The MinIO implementation would use pre-signed S3 URLs for consistency.
|
||||||
|
|
||||||
|
Create a sibling method, `get_public_url`, that explicitly creates public URLs for any future public content.
|
||||||
|
|
||||||
|
### Misc
|
||||||
|
|
||||||
|
`pcm_to_wav()` currently lives in `api/app/storage.py` but is a Gemini output concern. Move it to the Gemini client module (`api/app/outbound/gemini/`) when carrying out this refactor.
|
||||||
|
|
@ -146,7 +146,6 @@ def parse_llm_response(text: str) -> tuple[str, list[tuple[str, str]], str]:
|
||||||
app/domain/models/adventure.py
|
app/domain/models/adventure.py
|
||||||
app/domain/services/adventure_service.py
|
app/domain/services/adventure_service.py
|
||||||
app/routers/api/adventures.py
|
app/routers/api/adventures.py
|
||||||
app/routers/bff/adventures.py
|
|
||||||
app/outbound/postgres/entities/adventure_entities.py
|
app/outbound/postgres/entities/adventure_entities.py
|
||||||
app/outbound/postgres/repositories/adventure_repository.py
|
app/outbound/postgres/repositories/adventure_repository.py
|
||||||
alembic/versions/20260503_0016_add_choose_your_own_adventure.py
|
alembic/versions/20260503_0016_add_choose_your_own_adventure.py
|
||||||
|
|
@ -158,7 +157,6 @@ Modified files:
|
||||||
```
|
```
|
||||||
app/outbound/anthropic/anthropic_client.py (add 2 methods)
|
app/outbound/anthropic/anthropic_client.py (add 2 methods)
|
||||||
app/routers/api/main.py (register router)
|
app/routers/api/main.py (register router)
|
||||||
app/routers/bff/main.py (register router)
|
|
||||||
```
|
```
|
||||||
|
|
||||||
---
|
---
|
||||||
|
|
|
||||||
0
content/choose-your-own-adventure/README.md
Normal file
0
content/choose-your-own-adventure/README.md
Normal file
80
docker-compose-dev.yml
Normal file
80
docker-compose-dev.yml
Normal file
|
|
@ -0,0 +1,80 @@
|
||||||
|
services:
|
||||||
|
db:
|
||||||
|
image: postgres:16-alpine
|
||||||
|
environment:
|
||||||
|
POSTGRES_USER: ${POSTGRES_USER:-langlearn}
|
||||||
|
POSTGRES_PASSWORD: ${POSTGRES_PASSWORD}
|
||||||
|
POSTGRES_DB: ${POSTGRES_DB:-langlearn}
|
||||||
|
volumes:
|
||||||
|
- pgdata:/var/lib/postgresql/data
|
||||||
|
ports:
|
||||||
|
- "5432:5432"
|
||||||
|
healthcheck:
|
||||||
|
test: ["CMD-SHELL", "pg_isready -U ${POSTGRES_USER:-langlearn}"]
|
||||||
|
interval: 5s
|
||||||
|
timeout: 5s
|
||||||
|
retries: 10
|
||||||
|
|
||||||
|
storage:
|
||||||
|
image: minio/minio:latest
|
||||||
|
command: server /data --console-address ":9001"
|
||||||
|
environment:
|
||||||
|
MINIO_ROOT_USER: ${STORAGE_ACCESS_KEY:-langlearn}
|
||||||
|
MINIO_ROOT_PASSWORD: ${STORAGE_SECRET_KEY}
|
||||||
|
ports:
|
||||||
|
- "9000:9000"
|
||||||
|
- "9001:9001"
|
||||||
|
volumes:
|
||||||
|
- storagedata:/data
|
||||||
|
healthcheck:
|
||||||
|
test: ["CMD-SHELL", "curl -sf http://localhost:9000/minio/health/live || exit 1"]
|
||||||
|
interval: 5s
|
||||||
|
timeout: 5s
|
||||||
|
retries: 10
|
||||||
|
|
||||||
|
api:
|
||||||
|
build: ./api
|
||||||
|
volumes:
|
||||||
|
- ./api:/app:z
|
||||||
|
ports:
|
||||||
|
- "${API_PORT:-8000}:8000"
|
||||||
|
command: uvicorn app.main:app --host 0.0.0.0 --port 8000 --reload
|
||||||
|
environment:
|
||||||
|
DATABASE_URL: postgresql+asyncpg://${POSTGRES_USER:-langlearn}:${POSTGRES_PASSWORD}@db:5432/${POSTGRES_DB:-langlearn}
|
||||||
|
ADMIN_USER_EMAILS: ${ADMIN_USER_EMAILS:-wilson@thomaswilson.xyz}
|
||||||
|
API_BASE_URL: ${API_BASE_URL:-http://localhost:8000}
|
||||||
|
JWT_SECRET: ${JWT_SECRET}
|
||||||
|
ANTHROPIC_API_KEY: ${ANTHROPIC_API_KEY}
|
||||||
|
DEEPL_API_KEY: ${DEEPL_API_KEY}
|
||||||
|
DEEPGRAM_API_KEY: ${DEEPGRAM_API_KEY}
|
||||||
|
GEMINI_API_KEY: ${GEMINI_API_KEY}
|
||||||
|
PYTHONPATH: /app
|
||||||
|
STORAGE_PROVIDER: local
|
||||||
|
STORAGE_ENDPOINT_URL: http://storage:9000
|
||||||
|
STORAGE_ACCESS_KEY: ${STORAGE_ACCESS_KEY:-langlearn}
|
||||||
|
STORAGE_SECRET_KEY: ${STORAGE_SECRET_KEY}
|
||||||
|
STORAGE_BUCKET: ${STORAGE_BUCKET:-langlearn}
|
||||||
|
TRANSACTIONAL_EMAIL_PROVIDER: ${TRANSACTIONAL_EMAIL_PROVIDER:-stub}
|
||||||
|
depends_on:
|
||||||
|
db:
|
||||||
|
condition: service_healthy
|
||||||
|
storage:
|
||||||
|
condition: service_healthy
|
||||||
|
restart: unless-stopped
|
||||||
|
|
||||||
|
frontend:
|
||||||
|
build:
|
||||||
|
context: ./frontend
|
||||||
|
args:
|
||||||
|
PUBLIC_API_BASE_URL: ${PUBLIC_API_BASE_URL:-http://localhost:8000}
|
||||||
|
ports:
|
||||||
|
- "${FRONTEND_PORT:-3000}:3000"
|
||||||
|
environment:
|
||||||
|
ORIGIN: ${ORIGIN:-http://localhost:3000}
|
||||||
|
depends_on:
|
||||||
|
- api
|
||||||
|
restart: unless-stopped
|
||||||
|
|
||||||
|
volumes:
|
||||||
|
pgdata:
|
||||||
|
storagedata:
|
||||||
83
docker-compose-prod.yml
Normal file
83
docker-compose-prod.yml
Normal file
|
|
@ -0,0 +1,83 @@
|
||||||
|
services:
|
||||||
|
db:
|
||||||
|
image: postgres:16-alpine
|
||||||
|
environment:
|
||||||
|
POSTGRES_USER: ${POSTGRES_USER:-langlearn}
|
||||||
|
POSTGRES_PASSWORD: ${POSTGRES_PASSWORD}
|
||||||
|
POSTGRES_DB: ${POSTGRES_DB:-langlearn}
|
||||||
|
volumes:
|
||||||
|
- pgdata:/var/lib/postgresql/data
|
||||||
|
healthcheck:
|
||||||
|
test: ["CMD-SHELL", "pg_isready -U ${POSTGRES_USER:-langlearn}"]
|
||||||
|
interval: 10s
|
||||||
|
timeout: 5s
|
||||||
|
retries: 10
|
||||||
|
restart: unless-stopped
|
||||||
|
deploy:
|
||||||
|
resources:
|
||||||
|
limits:
|
||||||
|
cpus: '1'
|
||||||
|
memory: 1G
|
||||||
|
|
||||||
|
api:
|
||||||
|
build: ./api
|
||||||
|
ports:
|
||||||
|
- "${API_PORT:-8000}:8000"
|
||||||
|
command: uvicorn app.main:app --host 0.0.0.0 --port 8000 --workers 2
|
||||||
|
environment:
|
||||||
|
DATABASE_URL: postgresql+asyncpg://${POSTGRES_USER:-langlearn}:${POSTGRES_PASSWORD}@db:5432/${POSTGRES_DB:-langlearn}
|
||||||
|
ADMIN_USER_EMAILS: ${ADMIN_USER_EMAILS}
|
||||||
|
API_BASE_URL: ${API_BASE_URL}
|
||||||
|
JWT_SECRET: ${JWT_SECRET}
|
||||||
|
ANTHROPIC_API_KEY: ${ANTHROPIC_API_KEY}
|
||||||
|
DEEPL_API_KEY: ${DEEPL_API_KEY}
|
||||||
|
DEEPGRAM_API_KEY: ${DEEPGRAM_API_KEY}
|
||||||
|
GEMINI_API_KEY: ${GEMINI_API_KEY}
|
||||||
|
PYTHONPATH: /app
|
||||||
|
STORAGE_PROVIDER: bunny
|
||||||
|
BUNNY_ZONE: ${BUNNY_ZONE}
|
||||||
|
BUNNY_API_KEY: ${BUNNY_API_KEY}
|
||||||
|
BUNNY_CDN_BASE_URL: ${BUNNY_CDN_BASE_URL}
|
||||||
|
BUNNY_TOKEN_AUTH_KEY: ${BUNNY_TOKEN_AUTH_KEY}
|
||||||
|
TRANSACTIONAL_EMAIL_PROVIDER: ${TRANSACTIONAL_EMAIL_PROVIDER}
|
||||||
|
healthcheck:
|
||||||
|
test: ["CMD-SHELL", "curl -sf http://localhost:8000/health || exit 1"]
|
||||||
|
interval: 10s
|
||||||
|
timeout: 5s
|
||||||
|
retries: 10
|
||||||
|
start_period: 20s
|
||||||
|
depends_on:
|
||||||
|
db:
|
||||||
|
condition: service_healthy
|
||||||
|
restart: unless-stopped
|
||||||
|
deploy:
|
||||||
|
resources:
|
||||||
|
limits:
|
||||||
|
cpus: '1'
|
||||||
|
memory: 1G
|
||||||
|
|
||||||
|
frontend:
|
||||||
|
build:
|
||||||
|
context: ./frontend
|
||||||
|
args:
|
||||||
|
PUBLIC_API_BASE_URL: ${PUBLIC_API_BASE_URL}
|
||||||
|
ports:
|
||||||
|
- "${FRONTEND_PORT:-3000}:3000"
|
||||||
|
environment:
|
||||||
|
ORIGIN: ${ORIGIN}
|
||||||
|
depends_on:
|
||||||
|
api:
|
||||||
|
condition: service_healthy
|
||||||
|
restart: unless-stopped
|
||||||
|
deploy:
|
||||||
|
resources:
|
||||||
|
limits:
|
||||||
|
cpus: '0.5'
|
||||||
|
memory: 256M
|
||||||
|
|
||||||
|
volumes:
|
||||||
|
pgdata:
|
||||||
|
|
||||||
|
networks:
|
||||||
|
default:
|
||||||
|
name: langlearn
|
||||||
|
|
@ -299,6 +299,7 @@
|
||||||
<LatestEntry
|
<LatestEntry
|
||||||
sourceText={latestEntry?.story_text}
|
sourceText={latestEntry?.story_text}
|
||||||
translationText={latestEntry?.translation}
|
translationText={latestEntry?.translation}
|
||||||
|
storyTextLinguisticData={latestEntry?.story_text_linguistic_data}
|
||||||
audioUrl={latestEntry?.audio_url}
|
audioUrl={latestEntry?.audio_url}
|
||||||
onSelectNextStep={handleNextStepSelect}
|
onSelectNextStep={handleNextStepSelect}
|
||||||
isWaitingForGeneration={$adventureState.ui.isWaitingForGeneration}
|
isWaitingForGeneration={$adventureState.ui.isWaitingForGeneration}
|
||||||
|
|
|
||||||
|
|
@ -6,6 +6,7 @@
|
||||||
type Props = {
|
type Props = {
|
||||||
sourceText: string | null | undefined;
|
sourceText: string | null | undefined;
|
||||||
translationText: string | null | undefined;
|
translationText: string | null | undefined;
|
||||||
|
storyTextLinguisticData: Record<string, unknown> | null | undefined;
|
||||||
audioUrl: string | null | undefined;
|
audioUrl: string | null | undefined;
|
||||||
|
|
||||||
onSelectNextStep: (optionId: string) => Promise<void>;
|
onSelectNextStep: (optionId: string) => Promise<void>;
|
||||||
|
|
@ -24,6 +25,7 @@
|
||||||
const {
|
const {
|
||||||
sourceText,
|
sourceText,
|
||||||
translationText,
|
translationText,
|
||||||
|
storyTextLinguisticData,
|
||||||
audioUrl,
|
audioUrl,
|
||||||
|
|
||||||
onSelectNextStep,
|
onSelectNextStep,
|
||||||
|
|
@ -33,10 +35,222 @@
|
||||||
errorMessage
|
errorMessage
|
||||||
}: Props = $props();
|
}: Props = $props();
|
||||||
|
|
||||||
const sourceParagraphs = $derived.by(() => toParagraphs(sourceText));
|
type LinguisticToken = {
|
||||||
const translationParagraphs = $derived.by(() => toParagraphs(translationText));
|
text: string;
|
||||||
|
lemma: string | null;
|
||||||
|
pos: string | null;
|
||||||
|
};
|
||||||
|
|
||||||
let lastClickedParagraphIndex: number | null = $state(null);
|
type TextSegment = {
|
||||||
|
kind: 'text';
|
||||||
|
text: string;
|
||||||
|
};
|
||||||
|
|
||||||
|
type WordSegment = {
|
||||||
|
kind: 'word';
|
||||||
|
text: string;
|
||||||
|
lemma: string | null;
|
||||||
|
pos: string | null;
|
||||||
|
};
|
||||||
|
|
||||||
|
type SentenceSegments = {
|
||||||
|
key: string;
|
||||||
|
sourceText: string;
|
||||||
|
targetText: string;
|
||||||
|
sourceSegments: Array<TextSegment | WordSegment>;
|
||||||
|
targetSegments: Array<TextSegment | WordSegment>;
|
||||||
|
};
|
||||||
|
|
||||||
|
type LinguisticParagraph = {
|
||||||
|
key: string;
|
||||||
|
sourceText: string;
|
||||||
|
targetText: string;
|
||||||
|
sentences: SentenceSegments[];
|
||||||
|
};
|
||||||
|
|
||||||
|
/**
 * Type guard: reports whether `value` is a non-null value whose `typeof` is
 * 'object', so its properties can be read by key.
 *
 * Note that arrays also satisfy this check, because `typeof []` is 'object'.
 */
function isRecord(value: unknown): value is Record<string, unknown> {
	if (value === null) {
		return false;
	}

	return typeof value === 'object';
}
|
||||||
|
|
||||||
|
/**
 * Returns `value` unchanged when it is a string (including the empty string),
 * and `null` for every other kind of value.
 */
function asString(value: unknown): string | null {
	if (typeof value !== 'string') {
		return null;
	}

	return value;
}
|
||||||
|
|
||||||
|
/**
 * Converts an untyped token list into `LinguisticToken` objects.
 *
 * Returns an empty array when `value` is not an array. Entries that are not
 * objects, or whose `text` is missing, not a string, or empty, are dropped.
 * `lemma` and `pos` are kept only when they are strings; otherwise they are
 * `null`.
 */
function parseTokens(value: unknown): LinguisticToken[] {
	const parsed: LinguisticToken[] = [];

	if (!Array.isArray(value)) {
		return parsed;
	}

	for (const candidate of value) {
		if (!isRecord(candidate)) {
			continue;
		}

		const text = asString(candidate.text);
		// A token without visible text cannot be rendered or matched, so skip it.
		if (!text) {
			continue;
		}

		parsed.push({
			text,
			lemma: asString(candidate.lemma),
			pos: asString(candidate.pos)
		});
	}

	return parsed;
}
|
||||||
|
|
||||||
|
/**
 * Splits a sentence into an ordered list of word and plain-text segments.
 *
 * - No tokens: the whole text becomes one text segment (or nothing if empty).
 * - No text: the tokens become word segments separated by single spaces.
 * - Otherwise each token is located in `text`, searching forward from the end
 *   of the previous match. Any characters between matches are emitted as text
 *   segments, and tokens that cannot be found are skipped.
 */
function buildSegments(
	text: string,
	tokens: LinguisticToken[]
): Array<TextSegment | WordSegment> {
	const toWord = (token: LinguisticToken): WordSegment => ({
		kind: 'word',
		text: token.text,
		lemma: token.lemma,
		pos: token.pos
	});

	if (tokens.length === 0) {
		if (text) {
			return [{ kind: 'text', text }];
		}
		return [];
	}

	if (!text) {
		// Without the original sentence there is no spacing information, so
		// join the tokens with a single space between neighbours.
		const spaced: Array<TextSegment | WordSegment> = [];
		tokens.forEach((token, position) => {
			if (position > 0) {
				spaced.push({ kind: 'text', text: ' ' });
			}
			spaced.push(toWord(token));
		});
		return spaced;
	}

	const pieces: Array<TextSegment | WordSegment> = [];
	let consumed = 0;

	tokens.forEach((token) => {
		const foundAt = text.indexOf(token.text, consumed);

		// Token text does not occur in the remaining sentence: leave it out.
		if (foundAt < 0) {
			return;
		}

		// Preserve whatever sits between the previous match and this one
		// (whitespace, punctuation that was not tokenised, ...).
		if (foundAt > consumed) {
			pieces.push({ kind: 'text', text: text.slice(consumed, foundAt) });
		}

		pieces.push(toWord(token));
		consumed = foundAt + token.text.length;
	});

	if (consumed < text.length) {
		pieces.push({ kind: 'text', text: text.slice(consumed) });
	}

	if (pieces.length === 0) {
		return [{ kind: 'text', text }];
	}

	return pieces;
}
|
||||||
|
|
||||||
|
/**
 * Converts the untyped `story_text_linguistic_data` payload into renderable
 * paragraphs.
 *
 * Reads `value.paragraphs`; each paragraph may carry `source_text`,
 * `target_text` and a `sentences` array whose entries may carry `source_text`,
 * `target_text`, `source_tokens` and `target_tokens`. Malformed entries
 * (non-objects) are dropped rather than throwing.
 *
 * Paragraph-level text falls back to the space-joined sentence text when the
 * paragraph field is missing, not a string, OR an empty string. The previous
 * `??` fallback kept an empty string even when the sentences had text, which
 * left the paragraph with no text of its own.
 *
 * @param value Raw linguistic data, or null/undefined when not yet available.
 * @returns Parsed paragraphs; an empty array when nothing usable is present.
 */
function parseLinguisticParagraphs(
	value: Record<string, unknown> | null | undefined
): LinguisticParagraph[] {
	if (!value) {
		return [];
	}

	const paragraphs = value.paragraphs;
	if (!Array.isArray(paragraphs)) {
		return [];
	}

	return paragraphs
		.map((paragraphValue, paragraphIndex): LinguisticParagraph | null => {
			if (!isRecord(paragraphValue)) {
				return null;
			}

			const sentencesRaw = paragraphValue.sentences;
			const sentenceValues = Array.isArray(sentencesRaw) ? sentencesRaw : [];

			const sentences = sentenceValues
				.map((sentenceValue, sentenceIndex): SentenceSegments | null => {
					if (!isRecord(sentenceValue)) {
						return null;
					}

					const sourceSentence = asString(sentenceValue.source_text) ?? '';
					const targetSentence = asString(sentenceValue.target_text) ?? '';
					const sourceTokens = parseTokens(sentenceValue.source_tokens);
					const targetTokens = parseTokens(sentenceValue.target_tokens);

					return {
						// Key is derived from the original array positions, so it
						// stays stable even if earlier malformed entries are dropped.
						key: `${paragraphIndex}-${sentenceIndex}`,
						sourceText: sourceSentence,
						targetText: targetSentence,
						sourceSegments: buildSegments(sourceSentence, sourceTokens),
						targetSegments: buildSegments(targetSentence, targetTokens)
					};
				})
				.filter((sentence): sentence is SentenceSegments => sentence !== null);

			// Joins the non-empty sentence texts of one language with spaces.
			const joinSentenceText = (pick: (sentence: SentenceSegments) => string): string =>
				sentences.map(pick).filter(Boolean).join(' ');

			// `||` (not `??`) so that an empty paragraph-level string also falls
			// back to the text carried by the sentences.
			const sourceText =
				asString(paragraphValue.source_text) ||
				joinSentenceText((sentence) => sentence.sourceText);
			const targetText =
				asString(paragraphValue.target_text) ||
				joinSentenceText((sentence) => sentence.targetText);

			// Paragraph has text but no sentence breakdown: synthesise a single
			// untokenised sentence so the paragraph still renders.
			if (sentences.length === 0 && (sourceText || targetText)) {
				sentences.push({
					key: `${paragraphIndex}-0`,
					sourceText,
					targetText,
					sourceSegments: sourceText ? [{ kind: 'text', text: sourceText }] : [],
					targetSegments: targetText ? [{ kind: 'text', text: targetText }] : []
				});
			}

			// Nothing to show at all for this paragraph.
			if (!sourceText && !targetText && sentences.length === 0) {
				return null;
			}

			return {
				key: `p-${paragraphIndex}`,
				sourceText,
				targetText,
				sentences
			};
		})
		.filter((paragraph): paragraph is LinguisticParagraph => paragraph !== null);
}
|
||||||
|
|
||||||
|
/**
 * Reports whether `text` contains at least one Unicode letter or number.
 * Strings made up only of punctuation, whitespace or symbols (and the empty
 * string) return false.
 */
function isWordLike(text: string): boolean {
	const letterOrNumber = /[\p{L}\p{N}]/u;

	return letterOrNumber.test(text);
}
|
||||||
|
|
||||||
|
const linguisticParagraphs = $derived.by(() =>
|
||||||
|
parseLinguisticParagraphs(storyTextLinguisticData)
|
||||||
|
);
|
||||||
|
const sourceParagraphs = $derived.by(() =>
|
||||||
|
linguisticParagraphs.length > 0
|
||||||
|
? linguisticParagraphs.map((paragraph) => paragraph.targetText).filter(Boolean)
|
||||||
|
: toParagraphs(sourceText)
|
||||||
|
);
|
||||||
|
const translationParagraphs = $derived.by(() =>
|
||||||
|
linguisticParagraphs.length > 0
|
||||||
|
? linguisticParagraphs.map((paragraph) => paragraph.sourceText).filter(Boolean)
|
||||||
|
: toParagraphs(translationText)
|
||||||
|
);
|
||||||
|
|
||||||
|
let selectedWord: { sentenceKey: string; text: string } | null = $state(null);
|
||||||
let sourcePane = $state<HTMLDivElement | undefined>();
|
let sourcePane = $state<HTMLDivElement | undefined>();
|
||||||
let translationPane = $state<HTMLDivElement | undefined>();
|
let translationPane = $state<HTMLDivElement | undefined>();
|
||||||
let suppressSourceScroll = $state(false);
|
let suppressSourceScroll = $state(false);
|
||||||
|
|
@ -101,8 +315,9 @@
|
||||||
}, 20000);
|
}, 20000);
|
||||||
}
|
}
|
||||||
|
|
||||||
function handleParagraphClicked(paragraphIndex: number) {
|
function handleWordClicked(sentenceKey: string, text: string) {
|
||||||
lastClickedParagraphIndex = paragraphIndex;
|
selectedWord = { sentenceKey, text };
|
||||||
|
showTranslation();
|
||||||
}
|
}
|
||||||
|
|
||||||
async function handleNextStepSelect(optionId: string) {
|
async function handleNextStepSelect(optionId: string) {
|
||||||
|
|
@ -146,18 +361,45 @@
|
||||||
<div class="pane source-pane">
|
<div class="pane source-pane">
|
||||||
<div class="latest-entry__pane-body" bind:this={sourcePane} onscroll={handleSourceScroll}>
|
<div class="latest-entry__pane-body" bind:this={sourcePane} onscroll={handleSourceScroll}>
|
||||||
{#if sourceParagraphs.length > 0}
|
{#if sourceParagraphs.length > 0}
|
||||||
{#each sourceParagraphs as paragraph, index (index)}
|
{#if linguisticParagraphs.length > 0}
|
||||||
<button
|
{#each linguisticParagraphs as paragraph (paragraph.key)}
|
||||||
type="button"
|
<p class="paragraph paragraph--text" data-language="source">
|
||||||
class="paragraph"
|
{#each paragraph.sentences as sentence (sentence.key)}
|
||||||
class:active={lastClickedParagraphIndex === index}
|
<span
|
||||||
data-paragraph-index={index}
|
class="sentence-chunk"
|
||||||
data-language="source"
|
class:active-sentence={selectedWord?.sentenceKey === sentence.key}
|
||||||
onclick={() => handleParagraphClicked(index)}
|
>
|
||||||
>
|
{#each sentence.targetSegments as segment, segmentIndex (`${sentence.key}-target-${segmentIndex}`)}
|
||||||
{paragraph}
|
{#if segment.kind === 'word' && isWordLike(segment.text)}
|
||||||
</button>
|
<button
|
||||||
{/each}
|
type="button"
|
||||||
|
class="word-token"
|
||||||
|
class:active={selectedWord?.sentenceKey === sentence.key &&
|
||||||
|
selectedWord?.text === segment.text}
|
||||||
|
title={segment.lemma ? `Lemma: ${segment.lemma}` : undefined}
|
||||||
|
onclick={() => handleWordClicked(sentence.key, segment.text)}
|
||||||
|
>
|
||||||
|
{segment.text}
|
||||||
|
</button>
|
||||||
|
{:else}
|
||||||
|
<span>{segment.text}</span>
|
||||||
|
{/if}
|
||||||
|
{/each}
|
||||||
|
</span>
|
||||||
|
{/each}
|
||||||
|
</p>
|
||||||
|
{/each}
|
||||||
|
{:else}
|
||||||
|
{#each sourceParagraphs as paragraph, index (index)}
|
||||||
|
<p
|
||||||
|
class="paragraph paragraph--text"
|
||||||
|
data-paragraph-index={index}
|
||||||
|
data-language="source"
|
||||||
|
>
|
||||||
|
{paragraph}
|
||||||
|
</p>
|
||||||
|
{/each}
|
||||||
|
{/if}
|
||||||
{:else}
|
{:else}
|
||||||
<div class="loading-block" role="status" aria-live="polite">
|
<div class="loading-block" role="status" aria-live="polite">
|
||||||
<p class="loading-block__label">{statusMessage || 'Writing your next entry...'}</p>
|
<p class="loading-block__label">{statusMessage || 'Writing your next entry...'}</p>
|
||||||
|
|
@ -177,6 +419,12 @@
|
||||||
</button>
|
</button>
|
||||||
</header>
|
</header>
|
||||||
|
|
||||||
|
{#if selectedWord}
|
||||||
|
<p class="translation-selected-word" role="status" aria-live="polite">
|
||||||
|
Selected word: <strong>{selectedWord.text}</strong>
|
||||||
|
</p>
|
||||||
|
{/if}
|
||||||
|
|
||||||
{#if translationVisible}
|
{#if translationVisible}
|
||||||
<div
|
<div
|
||||||
class="latest-entry__pane-body"
|
class="latest-entry__pane-body"
|
||||||
|
|
@ -184,18 +432,42 @@
|
||||||
onscroll={handleTranslationScroll}
|
onscroll={handleTranslationScroll}
|
||||||
>
|
>
|
||||||
{#if translationParagraphs.length > 0}
|
{#if translationParagraphs.length > 0}
|
||||||
{#each translationParagraphs as paragraph, index (index)}
|
{#if linguisticParagraphs.length > 0}
|
||||||
<button
|
{#each linguisticParagraphs as paragraph (paragraph.key)}
|
||||||
type="button"
|
<p class="paragraph paragraph--text" data-language="translation">
|
||||||
class="paragraph"
|
{#each paragraph.sentences as sentence (sentence.key)}
|
||||||
class:active={lastClickedParagraphIndex === index}
|
<span
|
||||||
data-paragraph-index={index}
|
class="sentence-chunk"
|
||||||
data-language="translation"
|
class:active-sentence={selectedWord?.sentenceKey === sentence.key}
|
||||||
onclick={() => handleParagraphClicked(index)}
|
>
|
||||||
>
|
{#each sentence.sourceSegments as segment, segmentIndex (`${sentence.key}-source-${segmentIndex}`)}
|
||||||
{paragraph}
|
{#if segment.kind === 'word'}
|
||||||
</button>
|
<span
|
||||||
{/each}
|
class="word-token word-token--passive"
|
||||||
|
class:active={selectedWord?.sentenceKey === sentence.key &&
|
||||||
|
selectedWord?.text === segment.text}
|
||||||
|
>
|
||||||
|
{segment.text}
|
||||||
|
</span>
|
||||||
|
{:else}
|
||||||
|
<span>{segment.text}</span>
|
||||||
|
{/if}
|
||||||
|
{/each}
|
||||||
|
</span>
|
||||||
|
{/each}
|
||||||
|
</p>
|
||||||
|
{/each}
|
||||||
|
{:else}
|
||||||
|
{#each translationParagraphs as paragraph, index (index)}
|
||||||
|
<p
|
||||||
|
class="paragraph paragraph--text"
|
||||||
|
data-paragraph-index={index}
|
||||||
|
data-language="translation"
|
||||||
|
>
|
||||||
|
{paragraph}
|
||||||
|
</p>
|
||||||
|
{/each}
|
||||||
|
{/if}
|
||||||
{:else}
|
{:else}
|
||||||
<div class="loading-block" role="status" aria-live="polite">
|
<div class="loading-block" role="status" aria-live="polite">
|
||||||
<p class="loading-block__label">
|
<p class="loading-block__label">
|
||||||
|
|
@ -511,6 +783,18 @@
|
||||||
color: color-mix(in srgb, var(--color-on-surface) 72%, transparent);
|
color: color-mix(in srgb, var(--color-on-surface) 72%, transparent);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
.translation-selected-word {
|
||||||
|
margin: 0;
|
||||||
|
padding: 0 var(--latest-entry-pane-padding) var(--space-2);
|
||||||
|
font-size: var(--text-label-md);
|
||||||
|
color: color-mix(in srgb, var(--color-on-surface) 84%, transparent);
|
||||||
|
}
|
||||||
|
|
||||||
|
.translation-selected-word strong {
|
||||||
|
font-weight: var(--weight-semibold);
|
||||||
|
color: var(--color-primary);
|
||||||
|
}
|
||||||
|
|
||||||
.latest-entry__pane-body::-webkit-scrollbar {
|
.latest-entry__pane-body::-webkit-scrollbar {
|
||||||
width: 0.75rem;
|
width: 0.75rem;
|
||||||
}
|
}
|
||||||
|
|
@ -542,9 +826,58 @@
|
||||||
transition: background-color var(--duration-fast) var(--ease-standard);
|
transition: background-color var(--duration-fast) var(--ease-standard);
|
||||||
}
|
}
|
||||||
|
|
||||||
.paragraph.active {
|
.paragraph--text {
|
||||||
background-color: color-mix(in srgb, var(--color-primary-container) 56%, transparent);
|
margin: 0;
|
||||||
border-radius: var(--radius-md);
|
padding: 0;
|
||||||
|
border: none;
|
||||||
|
background: transparent;
|
||||||
|
cursor: default;
|
||||||
|
white-space: pre-wrap;
|
||||||
|
}
|
||||||
|
|
||||||
|
.sentence-chunk {
|
||||||
|
display: inline;
|
||||||
|
border-radius: var(--radius-sm);
|
||||||
|
transition: background-color var(--duration-fast) var(--ease-standard);
|
||||||
|
|
||||||
|
&::after {
|
||||||
|
content: '';
|
||||||
|
display: inline-block;
|
||||||
|
width: 0.85ch;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
.sentence-chunk.active-sentence {
|
||||||
|
background-color: color-mix(in srgb, var(--color-primary-container) 32%, transparent);
|
||||||
|
}
|
||||||
|
|
||||||
|
.word-token {
|
||||||
|
display: inline;
|
||||||
|
padding: 0;
|
||||||
|
margin: 0;
|
||||||
|
border: none;
|
||||||
|
background: transparent;
|
||||||
|
color: inherit;
|
||||||
|
font: inherit;
|
||||||
|
line-height: inherit;
|
||||||
|
cursor: pointer;
|
||||||
|
}
|
||||||
|
|
||||||
|
.word-token:hover {
|
||||||
|
text-decoration: underline;
|
||||||
|
}
|
||||||
|
|
||||||
|
.word-token.active {
|
||||||
|
color: var(--color-primary);
|
||||||
|
text-decoration: underline;
|
||||||
|
}
|
||||||
|
|
||||||
|
.word-token--passive {
|
||||||
|
cursor: default;
|
||||||
|
}
|
||||||
|
|
||||||
|
.word-token--passive:hover {
|
||||||
|
text-decoration: none;
|
||||||
}
|
}
|
||||||
|
|
||||||
.paragraph:focus-visible {
|
.paragraph:focus-visible {
|
||||||
|
|
|
||||||
|
|
@ -4,6 +4,7 @@ export type AdventureEntry = {
|
||||||
id: string;
|
id: string;
|
||||||
story_text: string | null;
|
story_text: string | null;
|
||||||
translation: string | null;
|
translation: string | null;
|
||||||
|
story_text_linguistic_data: Record<string, unknown> | null;
|
||||||
audio_url: string | null;
|
audio_url: string | null;
|
||||||
generated_from_choice_id: string | null;
|
generated_from_choice_id: string | null;
|
||||||
possible_choices: { id: string; text: string }[] | null;
|
possible_choices: { id: string; text: string }[] | null;
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue