Compare commits
No commits in common. "9b9bdc3a3994b8c9c2e9a646d69326b171f96b99" and "941396fc6076e9bd6c8e0927b406b53aba7fbfed" have entirely different histories.
9b9bdc3a39
...
941396fc60
24 changed files with 123 additions and 858 deletions
|
|
@ -15,16 +15,10 @@ class Settings(BaseSettings):
|
||||||
scaleway_tem_project_id: str = ""
|
scaleway_tem_project_id: str = ""
|
||||||
scaleway_tem_from_address: str = ""
|
scaleway_tem_from_address: str = ""
|
||||||
scaleway_tem_region: str = "fr-par"
|
scaleway_tem_region: str = "fr-par"
|
||||||
storage_provider: str = "local" # or 'bunny'
|
storage_endpoint_url: str
|
||||||
storage_endpoint_url: str = ""
|
storage_access_key: str
|
||||||
storage_access_key: str = ""
|
storage_secret_key: str
|
||||||
storage_secret_key: str = ""
|
|
||||||
storage_bucket: str = "langlearn"
|
storage_bucket: str = "langlearn"
|
||||||
bunny_zone: str = "languagelearningapp"
|
|
||||||
bunny_api_key: str = ""
|
|
||||||
bunny_cdn_base_url: str = ""
|
|
||||||
bunny_token_auth_key: str = ""
|
|
||||||
bunny_storage_endpoint: str = "https://storage.bunnycdn.com"
|
|
||||||
stub_generation: bool = False
|
stub_generation: bool = False
|
||||||
|
|
||||||
model_config = {"env_file": ".env"}
|
model_config = {"env_file": ".env"}
|
||||||
|
|
|
||||||
|
|
@ -24,7 +24,7 @@ from ...outbound.postgres.repositories.adventure_repository import (
|
||||||
PostgresAdventureRepository,
|
PostgresAdventureRepository,
|
||||||
)
|
)
|
||||||
from ...outbound.spacy.spacy_client import SpacyClient
|
from ...outbound.spacy.spacy_client import SpacyClient
|
||||||
from ...outbound.storage_client import get_storage_client
|
from ...storage import upload_audio
|
||||||
from ..models.adventure import (
|
from ..models.adventure import (
|
||||||
Adventure,
|
Adventure,
|
||||||
AdventureEntry,
|
AdventureEntry,
|
||||||
|
|
@ -251,7 +251,7 @@ class AdventureService:
|
||||||
for sent_idx, target_sent in enumerate(target_nlp["sentences"]):
|
for sent_idx, target_sent in enumerate(target_nlp["sentences"]):
|
||||||
t0 = time.monotonic()
|
t0 = time.monotonic()
|
||||||
translated_sentence = await self.deepl_client.translate(
|
translated_sentence = await self.deepl_client.translate(
|
||||||
target_sent["text"], adventure.source_language, paragraph_text
|
target_sent["text"], adventure.source_language
|
||||||
)
|
)
|
||||||
timing_translations += time.monotonic() - t0
|
timing_translations += time.monotonic() - t0
|
||||||
|
|
||||||
|
|
@ -315,7 +315,7 @@ class AdventureService:
|
||||||
# ── File upload ───────────────────────────────────────────────────
|
# ── File upload ───────────────────────────────────────────────────
|
||||||
t0 = time.monotonic()
|
t0 = time.monotonic()
|
||||||
audio_key = f"adventure-audio/{entry_id}.wav"
|
audio_key = f"adventure-audio/{entry_id}.wav"
|
||||||
get_storage_client().upload(audio_key, wav_bytes)
|
upload_audio(audio_key, wav_bytes)
|
||||||
timing_file_uploading = time.monotonic() - t0
|
timing_file_uploading = time.monotonic() - t0
|
||||||
|
|
||||||
await self.audio_repo.create(
|
await self.audio_repo.create(
|
||||||
|
|
|
||||||
|
|
@ -14,7 +14,7 @@ from ...outbound.deepgram.deepgram_client import LocalDeepgramClient
|
||||||
from ...outbound.deepl.deepl_client import DeepLClient
|
from ...outbound.deepl.deepl_client import DeepLClient
|
||||||
from ...outbound.gemini.gemini_client import GeminiClient
|
from ...outbound.gemini.gemini_client import GeminiClient
|
||||||
from ...outbound.spacy.spacy_client import SpacyClient
|
from ...outbound.spacy.spacy_client import SpacyClient
|
||||||
from ...outbound.storage_client import get_storage_client
|
from ...storage import upload_audio
|
||||||
from ...languages import SUPPORTED_LANGUAGES
|
from ...languages import SUPPORTED_LANGUAGES
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -137,7 +137,7 @@ class SummariseService:
|
||||||
voice = self.gemini_client.get_voice_by_language(target_language)
|
voice = self.gemini_client.get_voice_by_language(target_language)
|
||||||
wav_bytes = await self.gemini_client.generate_audio(generated_text, voice)
|
wav_bytes = await self.gemini_client.generate_audio(generated_text, voice)
|
||||||
audio_key = f"audio/{job_id}.wav"
|
audio_key = f"audio/{job_id}.wav"
|
||||||
get_storage_client().upload(audio_key, wav_bytes)
|
upload_audio(audio_key, wav_bytes)
|
||||||
|
|
||||||
transcript = await self.deepgram_client.transcribe_bytes(wav_bytes, target_language)
|
transcript = await self.deepgram_client.transcribe_bytes(wav_bytes, target_language)
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -8,13 +8,13 @@ from .routers.api import jobs
|
||||||
from .routers import media as media_router
|
from .routers import media as media_router
|
||||||
from .routers.api.main import api_router
|
from .routers.api.main import api_router
|
||||||
from .routers.bff.main import bff_router
|
from .routers.bff.main import bff_router
|
||||||
from .outbound.storage_factory import init_storage
|
from .storage import ensure_bucket_exists
|
||||||
from . import worker
|
from . import worker
|
||||||
|
|
||||||
|
|
||||||
@asynccontextmanager
|
@asynccontextmanager
|
||||||
async def lifespan(app: FastAPI):
|
async def lifespan(app: FastAPI):
|
||||||
init_storage()
|
ensure_bucket_exists()
|
||||||
worker_task = asyncio.create_task(worker.worker_loop())
|
worker_task = asyncio.create_task(worker.worker_loop())
|
||||||
yield
|
yield
|
||||||
worker_task.cancel()
|
worker_task.cancel()
|
||||||
|
|
|
||||||
|
|
@ -1,77 +0,0 @@
|
||||||
import base64
|
|
||||||
import hashlib
|
|
||||||
import time
|
|
||||||
import urllib.error
|
|
||||||
import urllib.request
|
|
||||||
|
|
||||||
_SIGNED_URL_EXPIRY_SECONDS = 3600
|
|
||||||
|
|
||||||
|
|
||||||
class BunnyClient:
|
|
||||||
def __init__(
|
|
||||||
self,
|
|
||||||
zone: str,
|
|
||||||
api_key: str,
|
|
||||||
cdn_base_url: str,
|
|
||||||
token_auth_key: str,
|
|
||||||
storage_endpoint: str = "https://storage.bunnycdn.com",
|
|
||||||
) -> None:
|
|
||||||
self._zone = zone
|
|
||||||
self._api_key = api_key
|
|
||||||
self._cdn_base_url = cdn_base_url.rstrip("/")
|
|
||||||
self._token_auth_key = token_auth_key
|
|
||||||
self._storage_endpoint = storage_endpoint.rstrip("/")
|
|
||||||
|
|
||||||
def _storage_url(self, path: str) -> str:
|
|
||||||
return f"{self._storage_endpoint}/{self._zone}/{path.lstrip('/')}"
|
|
||||||
|
|
||||||
def upload(self, path: str, data: bytes) -> bool:
|
|
||||||
req = urllib.request.Request(
|
|
||||||
self._storage_url(path),
|
|
||||||
data=data,
|
|
||||||
method="PUT",
|
|
||||||
headers={
|
|
||||||
"AccessKey": self._api_key,
|
|
||||||
"Content-Type": "audio/wav",
|
|
||||||
},
|
|
||||||
)
|
|
||||||
try:
|
|
||||||
with urllib.request.urlopen(req) as resp:
|
|
||||||
return resp.status == 201
|
|
||||||
except urllib.error.HTTPError:
|
|
||||||
return False
|
|
||||||
|
|
||||||
def get_url(self, path: str) -> str:
|
|
||||||
url_path = f"/{path.lstrip('/')}"
|
|
||||||
expiration = int(time.time()) + _SIGNED_URL_EXPIRY_SECONDS
|
|
||||||
digest = hashlib.sha256(
|
|
||||||
(self._token_auth_key + url_path + str(expiration)).encode()
|
|
||||||
).digest()
|
|
||||||
token = (
|
|
||||||
base64.b64encode(digest)
|
|
||||||
.decode()
|
|
||||||
.replace("+", "-")
|
|
||||||
.replace("/", "_")
|
|
||||||
.replace("=", "")
|
|
||||||
)
|
|
||||||
return f"{self._cdn_base_url}{url_path}?token={token}&expires={expiration}"
|
|
||||||
|
|
||||||
def get_public_url(self, path: str) -> str:
|
|
||||||
return f"{self._cdn_base_url}/{path.lstrip('/')}"
|
|
||||||
|
|
||||||
def delete(self, path: str) -> bool:
|
|
||||||
req = urllib.request.Request(
|
|
||||||
self._storage_url(path),
|
|
||||||
method="DELETE",
|
|
||||||
headers={"AccessKey": self._api_key},
|
|
||||||
)
|
|
||||||
try:
|
|
||||||
with urllib.request.urlopen(req) as resp:
|
|
||||||
return resp.status == 200
|
|
||||||
except urllib.error.HTTPError:
|
|
||||||
return False
|
|
||||||
|
|
||||||
def download(self, path: str) -> tuple[bytes, str]:
|
|
||||||
raise NotImplementedError(
|
|
||||||
"Direct download not available with Bunny — use get_url() to obtain a signed CDN URL"
|
|
||||||
)
|
|
||||||
|
|
@ -1,19 +1,9 @@
|
||||||
import asyncio
|
import asyncio
|
||||||
import io
|
|
||||||
import wave
|
|
||||||
|
|
||||||
from google import genai
|
from google import genai
|
||||||
from google.genai import types as genai_types
|
from google.genai import types as genai_types
|
||||||
|
|
||||||
|
from ...storage import pcm_to_wav
|
||||||
def _pcm_to_wav(pcm_data: bytes, sample_rate: int = 24000) -> bytes:
|
|
||||||
buf = io.BytesIO()
|
|
||||||
with wave.open(buf, "wb") as wf:
|
|
||||||
wf.setnchannels(1)
|
|
||||||
wf.setsampwidth(2)
|
|
||||||
wf.setframerate(sample_rate)
|
|
||||||
wf.writeframes(pcm_data)
|
|
||||||
return buf.getvalue()
|
|
||||||
|
|
||||||
VOICE_BY_LANGUAGE: dict[str, str] = {
|
VOICE_BY_LANGUAGE: dict[str, str] = {
|
||||||
"fr": "Kore",
|
"fr": "Kore",
|
||||||
|
|
@ -57,6 +47,6 @@ class GeminiClient():
|
||||||
),
|
),
|
||||||
)
|
)
|
||||||
pcm_data = response.candidates[0].content.parts[0].inline_data.data
|
pcm_data = response.candidates[0].content.parts[0].inline_data.data
|
||||||
return _pcm_to_wav(pcm_data)
|
return pcm_to_wav(pcm_data)
|
||||||
|
|
||||||
return await asyncio.to_thread(_call)
|
return await asyncio.to_thread(_call)
|
||||||
|
|
|
||||||
|
|
@ -1,70 +0,0 @@
|
||||||
import boto3
|
|
||||||
from botocore.exceptions import ClientError
|
|
||||||
|
|
||||||
|
|
||||||
class MinioClient:
|
|
||||||
def __init__(
|
|
||||||
self,
|
|
||||||
endpoint_url: str,
|
|
||||||
access_key: str,
|
|
||||||
secret_key: str,
|
|
||||||
bucket: str,
|
|
||||||
api_base_url: str,
|
|
||||||
) -> None:
|
|
||||||
self._endpoint_url = endpoint_url
|
|
||||||
self._access_key = access_key
|
|
||||||
self._secret_key = secret_key
|
|
||||||
self._bucket = bucket
|
|
||||||
self._api_base_url = api_base_url.rstrip("/")
|
|
||||||
|
|
||||||
def _s3(self):
|
|
||||||
return boto3.client(
|
|
||||||
"s3",
|
|
||||||
endpoint_url=self._endpoint_url,
|
|
||||||
aws_access_key_id=self._access_key,
|
|
||||||
aws_secret_access_key=self._secret_key,
|
|
||||||
)
|
|
||||||
|
|
||||||
def ensure_bucket_exists(self) -> None:
|
|
||||||
client = self._s3()
|
|
||||||
try:
|
|
||||||
client.head_bucket(Bucket=self._bucket)
|
|
||||||
except ClientError as e:
|
|
||||||
if e.response["Error"]["Code"] in ("404", "NoSuchBucket"):
|
|
||||||
client.create_bucket(Bucket=self._bucket)
|
|
||||||
else:
|
|
||||||
raise
|
|
||||||
|
|
||||||
def upload(self, path: str, data: bytes) -> bool:
|
|
||||||
try:
|
|
||||||
self._s3().put_object(
|
|
||||||
Bucket=self._bucket,
|
|
||||||
Key=path,
|
|
||||||
Body=data,
|
|
||||||
ContentType="audio/wav",
|
|
||||||
)
|
|
||||||
return True
|
|
||||||
except ClientError:
|
|
||||||
return False
|
|
||||||
|
|
||||||
def get_url(self, path: str) -> str:
|
|
||||||
return f"{self._api_base_url}/media/{path}"
|
|
||||||
|
|
||||||
def get_public_url(self, path: str) -> str:
|
|
||||||
return f"{self._api_base_url}/media/{path}"
|
|
||||||
|
|
||||||
def delete(self, path: str) -> bool:
|
|
||||||
try:
|
|
||||||
self._s3().delete_object(Bucket=self._bucket, Key=path)
|
|
||||||
return True
|
|
||||||
except ClientError:
|
|
||||||
return False
|
|
||||||
|
|
||||||
def download(self, path: str) -> tuple[bytes, str]:
|
|
||||||
try:
|
|
||||||
response = self._s3().get_object(Bucket=self._bucket, Key=path)
|
|
||||||
return response["Body"].read(), response.get("ContentType", "audio/wav")
|
|
||||||
except ClientError as e:
|
|
||||||
if e.response["Error"]["Code"] in ("NoSuchKey", "404"):
|
|
||||||
raise FileNotFoundError(path)
|
|
||||||
raise
|
|
||||||
|
|
@ -1,21 +0,0 @@
|
||||||
from typing import Protocol
|
|
||||||
|
|
||||||
_client: "StorageClient | None" = None
|
|
||||||
|
|
||||||
|
|
||||||
class StorageClient(Protocol):
|
|
||||||
def upload(self, path: str, data: bytes) -> bool: ...
|
|
||||||
def get_url(self, path: str) -> str: ...
|
|
||||||
def get_public_url(self, path: str) -> str: ...
|
|
||||||
def delete(self, path: str) -> bool: ...
|
|
||||||
def download(self, path: str) -> tuple[bytes, str]: ...
|
|
||||||
|
|
||||||
|
|
||||||
def get_storage_client() -> "StorageClient":
|
|
||||||
assert _client is not None, "Storage client not initialised — call init_storage() at startup"
|
|
||||||
return _client
|
|
||||||
|
|
||||||
|
|
||||||
def _set_storage_client(c: "StorageClient") -> None:
|
|
||||||
global _client
|
|
||||||
_client = c
|
|
||||||
|
|
@ -1,27 +0,0 @@
|
||||||
from ..config import settings
|
|
||||||
from .storage_client import StorageClient, _set_storage_client
|
|
||||||
from .minio.minio_client import MinioClient
|
|
||||||
from .bunny.bunny_client import BunnyClient
|
|
||||||
|
|
||||||
|
|
||||||
def init_storage() -> None:
|
|
||||||
client: StorageClient
|
|
||||||
if settings.storage_provider == "bunny":
|
|
||||||
client = BunnyClient(
|
|
||||||
zone=settings.bunny_zone,
|
|
||||||
api_key=settings.bunny_api_key,
|
|
||||||
cdn_base_url=settings.bunny_cdn_base_url,
|
|
||||||
token_auth_key=settings.bunny_token_auth_key,
|
|
||||||
storage_endpoint=settings.bunny_storage_endpoint,
|
|
||||||
)
|
|
||||||
else:
|
|
||||||
minio = MinioClient(
|
|
||||||
endpoint_url=settings.storage_endpoint_url,
|
|
||||||
access_key=settings.storage_access_key,
|
|
||||||
secret_key=settings.storage_secret_key,
|
|
||||||
bucket=settings.storage_bucket,
|
|
||||||
api_base_url=settings.api_base_url,
|
|
||||||
)
|
|
||||||
minio.ensure_bucket_exists()
|
|
||||||
client = minio
|
|
||||||
_set_storage_client(client)
|
|
||||||
|
|
@ -12,7 +12,7 @@ from ...outbound.postgres.repositories import summarise_job_repository
|
||||||
from ...outbound.postgres.repositories.translated_article_repository import TranslatedArticleRepository
|
from ...outbound.postgres.repositories.translated_article_repository import TranslatedArticleRepository
|
||||||
from ...outbound.postgres.entities.translated_article_entity import TranslatedArticleEntity
|
from ...outbound.postgres.entities.translated_article_entity import TranslatedArticleEntity
|
||||||
from ...outbound.gemini.gemini_client import GeminiClient
|
from ...outbound.gemini.gemini_client import GeminiClient
|
||||||
from ...outbound.storage_client import get_storage_client
|
from ...storage import upload_audio
|
||||||
from ...config import settings
|
from ...config import settings
|
||||||
from ... import worker
|
from ... import worker
|
||||||
|
|
||||||
|
|
@ -92,7 +92,7 @@ async def _run_regenerate_audio(job_id: uuid.UUID) -> None:
|
||||||
voice = gemini_client.get_voice_by_language(article_entity.target_language)
|
voice = gemini_client.get_voice_by_language(article_entity.target_language)
|
||||||
wav_bytes = await gemini_client.generate_audio(article_entity.target_body, voice)
|
wav_bytes = await gemini_client.generate_audio(article_entity.target_body, voice)
|
||||||
audio_key = f"audio/{job_id}.wav"
|
audio_key = f"audio/{job_id}.wav"
|
||||||
get_storage_client().upload(audio_key, wav_bytes)
|
upload_audio(audio_key, wav_bytes)
|
||||||
|
|
||||||
await article_repo.update_audio(
|
await article_repo.update_audio(
|
||||||
article_entity.id,
|
article_entity.id,
|
||||||
|
|
|
||||||
|
|
@ -5,8 +5,8 @@ from pydantic import BaseModel
|
||||||
from sqlalchemy.ext.asyncio import AsyncSession
|
from sqlalchemy.ext.asyncio import AsyncSession
|
||||||
|
|
||||||
from ...auth import verify_token
|
from ...auth import verify_token
|
||||||
|
from ...config import settings
|
||||||
from ...outbound.postgres.database import get_db
|
from ...outbound.postgres.database import get_db
|
||||||
from ...outbound.storage_client import get_storage_client
|
|
||||||
from ...outbound.postgres.repositories.adventure_repository import (
|
from ...outbound.postgres.repositories.adventure_repository import (
|
||||||
PostgresAdventureEntryAudioRepository,
|
PostgresAdventureEntryAudioRepository,
|
||||||
PostgresAdventureEntryChoiceRepository,
|
PostgresAdventureEntryChoiceRepository,
|
||||||
|
|
@ -61,7 +61,7 @@ class AdventureDetailResponse(BaseModel):
|
||||||
def _audio_url(key: str | None) -> str | None:
|
def _audio_url(key: str | None) -> str | None:
|
||||||
if key is None:
|
if key is None:
|
||||||
return None
|
return None
|
||||||
return get_storage_client().get_url(key)
|
return f"{settings.api_base_url}/media/{key}"
|
||||||
|
|
||||||
|
|
||||||
@router.get("/{adventure_id}", response_model=AdventureDetailResponse, status_code=200)
|
@router.get("/{adventure_id}", response_model=AdventureDetailResponse, status_code=200)
|
||||||
|
|
|
||||||
|
|
@ -6,8 +6,8 @@ from pydantic import BaseModel
|
||||||
from sqlalchemy.ext.asyncio import AsyncSession
|
from sqlalchemy.ext.asyncio import AsyncSession
|
||||||
|
|
||||||
from ...auth import verify_token
|
from ...auth import verify_token
|
||||||
|
from ...config import settings
|
||||||
from ...outbound.postgres.database import get_db
|
from ...outbound.postgres.database import get_db
|
||||||
from ...outbound.storage_client import get_storage_client
|
|
||||||
from ...outbound.postgres.repositories.translated_article_repository import TranslatedArticleRepository
|
from ...outbound.postgres.repositories.translated_article_repository import TranslatedArticleRepository
|
||||||
|
|
||||||
router = APIRouter(prefix="/articles", tags=["bff", "articles"])
|
router = APIRouter(prefix="/articles", tags=["bff", "articles"])
|
||||||
|
|
@ -46,7 +46,7 @@ class ArticleDetail(BaseModel):
|
||||||
def _audio_url(key: str | None) -> str | None:
|
def _audio_url(key: str | None) -> str | None:
|
||||||
if key is None:
|
if key is None:
|
||||||
return None
|
return None
|
||||||
return get_storage_client().get_url(key)
|
return f"{settings.api_base_url}/media/{key}"
|
||||||
|
|
||||||
|
|
||||||
@router.get("", response_model=ArticleListResponse, status_code=200)
|
@router.get("", response_model=ArticleListResponse, status_code=200)
|
||||||
|
|
|
||||||
|
|
@ -3,11 +3,12 @@ import uuid
|
||||||
from fastapi import APIRouter, Depends, HTTPException
|
from fastapi import APIRouter, Depends, HTTPException
|
||||||
from fastapi.responses import Response
|
from fastapi.responses import Response
|
||||||
from sqlalchemy.ext.asyncio import AsyncSession
|
from sqlalchemy.ext.asyncio import AsyncSession
|
||||||
|
from botocore.exceptions import ClientError
|
||||||
|
|
||||||
from ..outbound.postgres.database import get_db
|
from ..outbound.postgres.database import get_db
|
||||||
from ..outbound.postgres.repositories.translated_article_repository import TranslatedArticleRepository
|
from ..outbound.postgres.repositories.translated_article_repository import TranslatedArticleRepository
|
||||||
from ..outbound.postgres.repositories.adventure_repository import PostgresAdventureEntryAudioRepository
|
from ..outbound.postgres.repositories.adventure_repository import PostgresAdventureEntryAudioRepository
|
||||||
from ..outbound.storage_client import get_storage_client
|
from ..storage import download_audio
|
||||||
|
|
||||||
router = APIRouter(prefix="/media", tags=["media"])
|
router = APIRouter(prefix="/media", tags=["media"])
|
||||||
|
|
||||||
|
|
@ -22,23 +23,21 @@ async def get_adventure_audio_file(
|
||||||
except ValueError:
|
except ValueError:
|
||||||
raise HTTPException(status_code=400, detail="Invalid file ID")
|
raise HTTPException(status_code=400, detail="Invalid file ID")
|
||||||
|
|
||||||
|
print(f"Looking for adventure audio with entry ID: {eid}")
|
||||||
|
|
||||||
adventure_audio = await PostgresAdventureEntryAudioRepository(db).get_for_entry(entry_id=eid, component_type="story_text")
|
adventure_audio = await PostgresAdventureEntryAudioRepository(db).get_for_entry(entry_id=eid, component_type="story_text")
|
||||||
|
|
||||||
if adventure_audio is None:
|
if adventure_audio is None:
|
||||||
raise HTTPException(status_code=404, detail="File not found")
|
raise HTTPException(status_code=404, detail="File not found")
|
||||||
|
|
||||||
try:
|
try:
|
||||||
audio_bytes, content_type = get_storage_client().download("adventure-audio/" + filename)
|
audio_bytes, content_type = download_audio("adventure-audio/" + filename)
|
||||||
except FileNotFoundError:
|
except ClientError as e:
|
||||||
raise HTTPException(status_code=404, detail="File not found")
|
if e.response["Error"]["Code"] in ("NoSuchKey", "404"):
|
||||||
except NotImplementedError:
|
raise HTTPException(status_code=404, detail="File not found")
|
||||||
raise HTTPException(status_code=501, detail="Media proxy not available with current storage provider")
|
|
||||||
except Exception:
|
|
||||||
raise HTTPException(status_code=500, detail="Storage error")
|
raise HTTPException(status_code=500, detail="Storage error")
|
||||||
|
|
||||||
return Response(content=audio_bytes, media_type=content_type)
|
return Response(content=audio_bytes, media_type=content_type)
|
||||||
|
|
||||||
|
|
||||||
@router.get("/{filename:path}")
|
@router.get("/{filename:path}")
|
||||||
async def get_media_file(
|
async def get_media_file(
|
||||||
filename: str,
|
filename: str,
|
||||||
|
|
@ -50,12 +49,11 @@ async def get_media_file(
|
||||||
raise HTTPException(status_code=404, detail="File not found")
|
raise HTTPException(status_code=404, detail="File not found")
|
||||||
|
|
||||||
try:
|
try:
|
||||||
audio_bytes, content_type = get_storage_client().download(filename)
|
audio_bytes, content_type = download_audio(filename)
|
||||||
except FileNotFoundError:
|
except ClientError as e:
|
||||||
raise HTTPException(status_code=404, detail="File not found")
|
if e.response["Error"]["Code"] in ("NoSuchKey", "404"):
|
||||||
except NotImplementedError:
|
raise HTTPException(status_code=404, detail="File not found")
|
||||||
raise HTTPException(status_code=501, detail="Media proxy not available with current storage provider")
|
|
||||||
except Exception:
|
|
||||||
raise HTTPException(status_code=500, detail="Storage error")
|
raise HTTPException(status_code=500, detail="Storage error")
|
||||||
|
|
||||||
return Response(content=audio_bytes, media_type=content_type)
|
return Response(content=audio_bytes, media_type=content_type)
|
||||||
|
|
||||||
|
|
|
||||||
56
api/app/storage.py
Normal file
56
api/app/storage.py
Normal file
|
|
@ -0,0 +1,56 @@
|
||||||
|
import io
|
||||||
|
import wave
|
||||||
|
|
||||||
|
import boto3
|
||||||
|
from botocore.exceptions import ClientError
|
||||||
|
|
||||||
|
from .config import settings
|
||||||
|
|
||||||
|
|
||||||
|
def get_s3_client():
|
||||||
|
return boto3.client(
|
||||||
|
"s3",
|
||||||
|
endpoint_url=settings.storage_endpoint_url,
|
||||||
|
aws_access_key_id=settings.storage_access_key,
|
||||||
|
aws_secret_access_key=settings.storage_secret_key,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def ensure_bucket_exists() -> None:
|
||||||
|
client = get_s3_client()
|
||||||
|
try:
|
||||||
|
client.head_bucket(Bucket=settings.storage_bucket)
|
||||||
|
except ClientError as e:
|
||||||
|
if e.response["Error"]["Code"] in ("404", "NoSuchBucket"):
|
||||||
|
client.create_bucket(Bucket=settings.storage_bucket)
|
||||||
|
else:
|
||||||
|
raise
|
||||||
|
|
||||||
|
|
||||||
|
def pcm_to_wav(pcm_data: bytes, sample_rate: int = 24000) -> bytes:
|
||||||
|
"""Wrap raw 16-bit mono PCM data in a WAV container."""
|
||||||
|
buf = io.BytesIO()
|
||||||
|
with wave.open(buf, "wb") as wf:
|
||||||
|
wf.setnchannels(1)
|
||||||
|
wf.setsampwidth(2) # 16-bit
|
||||||
|
wf.setframerate(sample_rate)
|
||||||
|
wf.writeframes(pcm_data)
|
||||||
|
return buf.getvalue()
|
||||||
|
|
||||||
|
|
||||||
|
def upload_audio(object_key: str, audio_bytes: bytes, content_type: str = "audio/wav") -> None:
|
||||||
|
client = get_s3_client()
|
||||||
|
client.put_object(
|
||||||
|
Bucket=settings.storage_bucket,
|
||||||
|
Key=object_key,
|
||||||
|
Body=audio_bytes,
|
||||||
|
ContentType=content_type,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def download_audio(object_key: str) -> tuple[bytes, str]:
|
||||||
|
"""Return (file_bytes, content_type)."""
|
||||||
|
client = get_s3_client()
|
||||||
|
response = client.get_object(Bucket=settings.storage_bucket, Key=object_key)
|
||||||
|
content_type = response.get("ContentType", "audio/wav")
|
||||||
|
return response["Body"].read(), content_type
|
||||||
|
|
@ -1,82 +0,0 @@
|
||||||
# Design Document: Object Storage with Bunny CDN
|
|
||||||
|
|
||||||
This is a technical design document for implementing object (e.g. audio file) storage with Bunny CDN. This directory (`api/docs`) contains other similar files, notably `architecture.md` and `domain.md`. When you have worked through the change described here, please update `architecture.md`.
|
|
||||||
|
|
||||||
## The problem
|
|
||||||
|
|
||||||
Language Learning App has audio as a core component, which requires files to be delivered to the end user. When developing locally, these files have been stored in a min.io service, mimicking an S3-like storage bucket.
|
|
||||||
|
|
||||||
Using this approach on a deployed instance (e.g. on a VPS using Docker) would result in high bandwidth and therefore a high cost. Using a dedicated, EU-based service like Bunny allows us to offload the delivery of content to a third party at reduced cost (great!).
|
|
||||||
|
|
||||||
## The current implementation
|
|
||||||
|
|
||||||
Object storage was one of the first features built into this software in its MVP state; as such, it does not fit within the current architecture.
|
|
||||||
|
|
||||||
Right now `api/app/storage.py` contains some helper functions, notably the `upload_audio` and `download_audio` functions.
|
|
||||||
|
|
||||||
Users (through the web client) retrieve the media through two URLs (detailed in `api/app/routers/media.py`):
|
|
||||||
|
|
||||||
- `GET /media/adventure-audio/{filename:path}` for the choose-your-own-adventure file names
|
|
||||||
- `GET /media/{filename:path}`, used for the summary transcriptions
|
|
||||||
|
|
||||||
## The solution
|
|
||||||
|
|
||||||
We are going to use Bunny (bunny.net) as the CDN for all objects in deployed environments (right now, just production — in the future preprod or staging may exist).
|
|
||||||
|
|
||||||
Locally, for development purposes, we retain the use of MinIO. To decide which backend to use, we introduce an environment variable `STORAGE_PROVIDER` with a default value of `local` and an accepted alternative of `bunny`.
|
|
||||||
|
|
||||||
When the provider is `local`, audio URLs (e.g. those constructed in `api/app/routers/bff/articles.py` and `api/app/routers/bff/adventure.py`) point at the existing `/media/...` proxy endpoints. When the provider is `bunny`, the Bunny CDN URL is returned directly so the request is never proxied through our service.
|
|
||||||
|
|
||||||
### Client interface
|
|
||||||
|
|
||||||
We will create a `BunnyClient` in `api/app/outbound/bunny/bunny_client.py` and extract the current MinIO logic into a `MinioClient` in `api/app/outbound/minio/minio_client.py`. Both implement a shared `StorageClient` protocol.
|
|
||||||
|
|
||||||
The interface is **generic** — the clients are storage adapters and must not encode domain concepts. Path construction (which directory, which filename) is the responsibility of the caller (the service layer), not the client.
|
|
||||||
|
|
||||||
```python
|
|
||||||
class StorageClient(Protocol):
|
|
||||||
def upload(self, path: str, data: bytes) -> bool: ...
|
|
||||||
def get_url(self, path: str) -> str: ...
|
|
||||||
def delete(self, path: str) -> bool: ...
|
|
||||||
```
|
|
||||||
|
|
||||||
Services construct paths using hardcoded directory prefixes (e.g. `"adventure-audio/"`, `"audio/"`). These are constants, not environment variables — they are not environment-specific and do not belong in config.
|
|
||||||
|
|
||||||
### Factory and instantiation
|
|
||||||
|
|
||||||
A factory function reads `STORAGE_PROVIDER` and returns the appropriate `StorageClient` implementation. The client is instantiated **once at app startup** (e.g. in `main.py`) as a module-level singleton — not per-request. This is consistent with how other outbound clients (`AnthropicClient`, `GeminiClient`, etc.) are handled.
|
|
||||||
|
|
||||||
### Bunny configuration
|
|
||||||
|
|
||||||
Bunny requires the following environment variables:
|
|
||||||
|
|
||||||
- `BUNNY_ZONE` — the storage zone name (the zone `languagelearningapp` has been created in the Bunny UI). No "DEFAULT" suffix; there is one zone.
|
|
||||||
- `BUNNY_API_KEY` — the Bunny API key for upload/delete operations.
|
|
||||||
- `BUNNY_CDN_BASE_URL` — the public CDN hostname used to construct delivery URLs.
|
|
||||||
|
|
||||||
### Signed vs. public URLs
|
|
||||||
|
|
||||||
Audio files are user-specific (i.e. one user should not be able to use another user's audio URL), so Bunny signed URLs are required. Public CDN URLs, by contrast, are shareable by anyone who has the link.
|
|
||||||
|
|
||||||
Bunny's own documentation recommends the `token.py` package:
|
|
||||||
|
|
||||||
```py
|
|
||||||
from token import sign_url
|
|
||||||
|
|
||||||
url = sign_url(
|
|
||||||
"https://myzone.b-cdn.net/videos/stream1/playlist.m3u8",
|
|
||||||
"your-security-key",
|
|
||||||
expiration_time=3600,
|
|
||||||
is_directory=True,
|
|
||||||
path_allowed="/videos/stream1/",
|
|
||||||
countries_allowed="GB",
|
|
||||||
)
|
|
||||||
```
|
|
||||||
|
|
||||||
`get_url(path)` on the `BunnyClient` must generate a time-limited (pick a sensible default for audio content here) signed URL using the Bunny Token Authentication feature. The MinIO implementation would use pre-signed S3 URLs for consistency.
|
|
||||||
|
|
||||||
Create a sibling method that explicitly creates public URLs for any future public content; call this `get_public_url`.
|
|
||||||
|
|
||||||
### Misc
|
|
||||||
|
|
||||||
`pcm_to_wav()` currently lives in `api/app/storage.py` but is a Gemini output concern. Move it to the Gemini client module (`api/app/outbound/gemini/`) when carrying out this refactor.
|
|
||||||
|
|
@ -146,6 +146,7 @@ def parse_llm_response(text: str) -> tuple[str, list[tuple[str, str]], str]:
|
||||||
app/domain/models/adventure.py
|
app/domain/models/adventure.py
|
||||||
app/domain/services/adventure_service.py
|
app/domain/services/adventure_service.py
|
||||||
app/routers/api/adventures.py
|
app/routers/api/adventures.py
|
||||||
|
app/routers/bff/adventures.py
|
||||||
app/outbound/postgres/entities/adventure_entities.py
|
app/outbound/postgres/entities/adventure_entities.py
|
||||||
app/outbound/postgres/repositories/adventure_repository.py
|
app/outbound/postgres/repositories/adventure_repository.py
|
||||||
alembic/versions/20260503_0016_add_choose_your_own_adventure.py
|
alembic/versions/20260503_0016_add_choose_your_own_adventure.py
|
||||||
|
|
@ -157,6 +158,7 @@ Modified files:
|
||||||
```
|
```
|
||||||
app/outbound/anthropic/anthropic_client.py (add 2 methods)
|
app/outbound/anthropic/anthropic_client.py (add 2 methods)
|
||||||
app/routers/api/main.py (register router)
|
app/routers/api/main.py (register router)
|
||||||
|
app/routers/bff/main.py (register router)
|
||||||
```
|
```
|
||||||
|
|
||||||
---
|
---
|
||||||
|
|
|
||||||
|
|
@ -1,80 +0,0 @@
|
||||||
services:
|
|
||||||
db:
|
|
||||||
image: postgres:16-alpine
|
|
||||||
environment:
|
|
||||||
POSTGRES_USER: ${POSTGRES_USER:-langlearn}
|
|
||||||
POSTGRES_PASSWORD: ${POSTGRES_PASSWORD}
|
|
||||||
POSTGRES_DB: ${POSTGRES_DB:-langlearn}
|
|
||||||
volumes:
|
|
||||||
- pgdata:/var/lib/postgresql/data
|
|
||||||
ports:
|
|
||||||
- "5432:5432"
|
|
||||||
healthcheck:
|
|
||||||
test: ["CMD-SHELL", "pg_isready -U ${POSTGRES_USER:-langlearn}"]
|
|
||||||
interval: 5s
|
|
||||||
timeout: 5s
|
|
||||||
retries: 10
|
|
||||||
|
|
||||||
storage:
|
|
||||||
image: minio/minio:latest
|
|
||||||
command: server /data --console-address ":9001"
|
|
||||||
environment:
|
|
||||||
MINIO_ROOT_USER: ${STORAGE_ACCESS_KEY:-langlearn}
|
|
||||||
MINIO_ROOT_PASSWORD: ${STORAGE_SECRET_KEY}
|
|
||||||
ports:
|
|
||||||
- "9000:9000"
|
|
||||||
- "9001:9001"
|
|
||||||
volumes:
|
|
||||||
- storagedata:/data
|
|
||||||
healthcheck:
|
|
||||||
test: ["CMD-SHELL", "curl -sf http://localhost:9000/minio/health/live || exit 1"]
|
|
||||||
interval: 5s
|
|
||||||
timeout: 5s
|
|
||||||
retries: 10
|
|
||||||
|
|
||||||
api:
|
|
||||||
build: ./api
|
|
||||||
volumes:
|
|
||||||
- ./api:/app:z
|
|
||||||
ports:
|
|
||||||
- "${API_PORT:-8000}:8000"
|
|
||||||
command: uvicorn app.main:app --host 0.0.0.0 --port 8000 --reload
|
|
||||||
environment:
|
|
||||||
DATABASE_URL: postgresql+asyncpg://${POSTGRES_USER:-langlearn}:${POSTGRES_PASSWORD}@db:5432/${POSTGRES_DB:-langlearn}
|
|
||||||
ADMIN_USER_EMAILS: ${ADMIN_USER_EMAILS:-wilson@thomaswilson.xyz}
|
|
||||||
API_BASE_URL: ${API_BASE_URL:-http://localhost:8000}
|
|
||||||
JWT_SECRET: ${JWT_SECRET}
|
|
||||||
ANTHROPIC_API_KEY: ${ANTHROPIC_API_KEY}
|
|
||||||
DEEPL_API_KEY: ${DEEPL_API_KEY}
|
|
||||||
DEEPGRAM_API_KEY: ${DEEPGRAM_API_KEY}
|
|
||||||
GEMINI_API_KEY: ${GEMINI_API_KEY}
|
|
||||||
PYTHONPATH: /app
|
|
||||||
STORAGE_PROVIDER: local
|
|
||||||
STORAGE_ENDPOINT_URL: http://storage:9000
|
|
||||||
STORAGE_ACCESS_KEY: ${STORAGE_ACCESS_KEY:-langlearn}
|
|
||||||
STORAGE_SECRET_KEY: ${STORAGE_SECRET_KEY}
|
|
||||||
STORAGE_BUCKET: ${STORAGE_BUCKET:-langlearn}
|
|
||||||
TRANSACTIONAL_EMAIL_PROVIDER: ${TRANSACTIONAL_EMAIL_PROVIDER:-stub}
|
|
||||||
depends_on:
|
|
||||||
db:
|
|
||||||
condition: service_healthy
|
|
||||||
storage:
|
|
||||||
condition: service_healthy
|
|
||||||
restart: unless-stopped
|
|
||||||
|
|
||||||
frontend:
|
|
||||||
build:
|
|
||||||
context: ./frontend
|
|
||||||
args:
|
|
||||||
PUBLIC_API_BASE_URL: ${PUBLIC_API_BASE_URL:-http://localhost:8000}
|
|
||||||
ports:
|
|
||||||
- "${FRONTEND_PORT:-3000}:3000"
|
|
||||||
environment:
|
|
||||||
ORIGIN: ${ORIGIN:-http://localhost:3000}
|
|
||||||
depends_on:
|
|
||||||
- api
|
|
||||||
restart: unless-stopped
|
|
||||||
|
|
||||||
volumes:
|
|
||||||
pgdata:
|
|
||||||
storagedata:
|
|
||||||
|
|
@ -1,83 +0,0 @@
|
||||||
services:
|
|
||||||
db:
|
|
||||||
image: postgres:16-alpine
|
|
||||||
environment:
|
|
||||||
POSTGRES_USER: ${POSTGRES_USER:-langlearn}
|
|
||||||
POSTGRES_PASSWORD: ${POSTGRES_PASSWORD}
|
|
||||||
POSTGRES_DB: ${POSTGRES_DB:-langlearn}
|
|
||||||
volumes:
|
|
||||||
- pgdata:/var/lib/postgresql/data
|
|
||||||
healthcheck:
|
|
||||||
test: ["CMD-SHELL", "pg_isready -U ${POSTGRES_USER:-langlearn}"]
|
|
||||||
interval: 10s
|
|
||||||
timeout: 5s
|
|
||||||
retries: 10
|
|
||||||
restart: unless-stopped
|
|
||||||
deploy:
|
|
||||||
resources:
|
|
||||||
limits:
|
|
||||||
cpus: '1'
|
|
||||||
memory: 1G
|
|
||||||
|
|
||||||
api:
|
|
||||||
build: ./api
|
|
||||||
ports:
|
|
||||||
- "${API_PORT:-8000}:8000"
|
|
||||||
command: uvicorn app.main:app --host 0.0.0.0 --port 8000 --workers 2
|
|
||||||
environment:
|
|
||||||
DATABASE_URL: postgresql+asyncpg://${POSTGRES_USER:-langlearn}:${POSTGRES_PASSWORD}@db:5432/${POSTGRES_DB:-langlearn}
|
|
||||||
ADMIN_USER_EMAILS: ${ADMIN_USER_EMAILS}
|
|
||||||
API_BASE_URL: ${API_BASE_URL}
|
|
||||||
JWT_SECRET: ${JWT_SECRET}
|
|
||||||
ANTHROPIC_API_KEY: ${ANTHROPIC_API_KEY}
|
|
||||||
DEEPL_API_KEY: ${DEEPL_API_KEY}
|
|
||||||
DEEPGRAM_API_KEY: ${DEEPGRAM_API_KEY}
|
|
||||||
GEMINI_API_KEY: ${GEMINI_API_KEY}
|
|
||||||
PYTHONPATH: /app
|
|
||||||
STORAGE_PROVIDER: bunny
|
|
||||||
BUNNY_ZONE: ${BUNNY_ZONE}
|
|
||||||
BUNNY_API_KEY: ${BUNNY_API_KEY}
|
|
||||||
BUNNY_CDN_BASE_URL: ${BUNNY_CDN_BASE_URL}
|
|
||||||
BUNNY_TOKEN_AUTH_KEY: ${BUNNY_TOKEN_AUTH_KEY}
|
|
||||||
TRANSACTIONAL_EMAIL_PROVIDER: ${TRANSACTIONAL_EMAIL_PROVIDER}
|
|
||||||
healthcheck:
|
|
||||||
test: ["CMD-SHELL", "curl -sf http://localhost:8000/health || exit 1"]
|
|
||||||
interval: 10s
|
|
||||||
timeout: 5s
|
|
||||||
retries: 10
|
|
||||||
start_period: 20s
|
|
||||||
depends_on:
|
|
||||||
db:
|
|
||||||
condition: service_healthy
|
|
||||||
restart: unless-stopped
|
|
||||||
deploy:
|
|
||||||
resources:
|
|
||||||
limits:
|
|
||||||
cpus: '1'
|
|
||||||
memory: 1G
|
|
||||||
|
|
||||||
frontend:
|
|
||||||
build:
|
|
||||||
context: ./frontend
|
|
||||||
args:
|
|
||||||
PUBLIC_API_BASE_URL: ${PUBLIC_API_BASE_URL}
|
|
||||||
ports:
|
|
||||||
- "${FRONTEND_PORT:-3000}:3000"
|
|
||||||
environment:
|
|
||||||
ORIGIN: ${ORIGIN}
|
|
||||||
depends_on:
|
|
||||||
api:
|
|
||||||
condition: service_healthy
|
|
||||||
restart: unless-stopped
|
|
||||||
deploy:
|
|
||||||
resources:
|
|
||||||
limits:
|
|
||||||
cpus: '0.5'
|
|
||||||
memory: 256M
|
|
||||||
|
|
||||||
volumes:
|
|
||||||
pgdata:
|
|
||||||
|
|
||||||
networks:
|
|
||||||
default:
|
|
||||||
name: langlearn
|
|
||||||
|
|
@ -299,7 +299,6 @@
|
||||||
<LatestEntry
|
<LatestEntry
|
||||||
sourceText={latestEntry?.story_text}
|
sourceText={latestEntry?.story_text}
|
||||||
translationText={latestEntry?.translation}
|
translationText={latestEntry?.translation}
|
||||||
storyTextLinguisticData={latestEntry?.story_text_linguistic_data}
|
|
||||||
audioUrl={latestEntry?.audio_url}
|
audioUrl={latestEntry?.audio_url}
|
||||||
onSelectNextStep={handleNextStepSelect}
|
onSelectNextStep={handleNextStepSelect}
|
||||||
isWaitingForGeneration={$adventureState.ui.isWaitingForGeneration}
|
isWaitingForGeneration={$adventureState.ui.isWaitingForGeneration}
|
||||||
|
|
|
||||||
|
|
@ -6,7 +6,6 @@
|
||||||
type Props = {
|
type Props = {
|
||||||
sourceText: string | null | undefined;
|
sourceText: string | null | undefined;
|
||||||
translationText: string | null | undefined;
|
translationText: string | null | undefined;
|
||||||
storyTextLinguisticData: Record<string, unknown> | null | undefined;
|
|
||||||
audioUrl: string | null | undefined;
|
audioUrl: string | null | undefined;
|
||||||
|
|
||||||
onSelectNextStep: (optionId: string) => Promise<void>;
|
onSelectNextStep: (optionId: string) => Promise<void>;
|
||||||
|
|
@ -25,7 +24,6 @@
|
||||||
const {
|
const {
|
||||||
sourceText,
|
sourceText,
|
||||||
translationText,
|
translationText,
|
||||||
storyTextLinguisticData,
|
|
||||||
audioUrl,
|
audioUrl,
|
||||||
|
|
||||||
onSelectNextStep,
|
onSelectNextStep,
|
||||||
|
|
@ -35,222 +33,10 @@
|
||||||
errorMessage
|
errorMessage
|
||||||
}: Props = $props();
|
}: Props = $props();
|
||||||
|
|
||||||
type LinguisticToken = {
|
const sourceParagraphs = $derived.by(() => toParagraphs(sourceText));
|
||||||
text: string;
|
const translationParagraphs = $derived.by(() => toParagraphs(translationText));
|
||||||
lemma: string | null;
|
|
||||||
pos: string | null;
|
|
||||||
};
|
|
||||||
|
|
||||||
type TextSegment = {
|
let lastClickedParagraphIndex: number | null = $state(null);
|
||||||
kind: 'text';
|
|
||||||
text: string;
|
|
||||||
};
|
|
||||||
|
|
||||||
type WordSegment = {
|
|
||||||
kind: 'word';
|
|
||||||
text: string;
|
|
||||||
lemma: string | null;
|
|
||||||
pos: string | null;
|
|
||||||
};
|
|
||||||
|
|
||||||
type SentenceSegments = {
|
|
||||||
key: string;
|
|
||||||
sourceText: string;
|
|
||||||
targetText: string;
|
|
||||||
sourceSegments: Array<TextSegment | WordSegment>;
|
|
||||||
targetSegments: Array<TextSegment | WordSegment>;
|
|
||||||
};
|
|
||||||
|
|
||||||
type LinguisticParagraph = {
|
|
||||||
key: string;
|
|
||||||
sourceText: string;
|
|
||||||
targetText: string;
|
|
||||||
sentences: SentenceSegments[];
|
|
||||||
};
|
|
||||||
|
|
||||||
function isRecord(value: unknown): value is Record<string, unknown> {
|
|
||||||
return typeof value === 'object' && value !== null;
|
|
||||||
}
|
|
||||||
|
|
||||||
function asString(value: unknown): string | null {
|
|
||||||
return typeof value === 'string' ? value : null;
|
|
||||||
}
|
|
||||||
|
|
||||||
function parseTokens(value: unknown): LinguisticToken[] {
|
|
||||||
if (!Array.isArray(value)) {
|
|
||||||
return [];
|
|
||||||
}
|
|
||||||
|
|
||||||
return value
|
|
||||||
.map((token): LinguisticToken | null => {
|
|
||||||
if (!isRecord(token)) {
|
|
||||||
return null;
|
|
||||||
}
|
|
||||||
|
|
||||||
const text = asString(token.text);
|
|
||||||
if (!text) {
|
|
||||||
return null;
|
|
||||||
}
|
|
||||||
|
|
||||||
return {
|
|
||||||
text,
|
|
||||||
lemma: asString(token.lemma),
|
|
||||||
pos: asString(token.pos)
|
|
||||||
};
|
|
||||||
})
|
|
||||||
.filter((token): token is LinguisticToken => token !== null);
|
|
||||||
}
|
|
||||||
|
|
||||||
function buildSegments(
|
|
||||||
text: string,
|
|
||||||
tokens: LinguisticToken[]
|
|
||||||
): Array<TextSegment | WordSegment> {
|
|
||||||
if (tokens.length === 0) {
|
|
||||||
return text ? [{ kind: 'text', text }] : [];
|
|
||||||
}
|
|
||||||
|
|
||||||
if (!text) {
|
|
||||||
return tokens.flatMap((token, index) => [
|
|
||||||
{ kind: 'word', text: token.text, lemma: token.lemma, pos: token.pos } as WordSegment,
|
|
||||||
...(index < tokens.length - 1 ? ([{ kind: 'text', text: ' ' }] as TextSegment[]) : [])
|
|
||||||
]);
|
|
||||||
}
|
|
||||||
|
|
||||||
const segments: Array<TextSegment | WordSegment> = [];
|
|
||||||
let cursor = 0;
|
|
||||||
|
|
||||||
for (const token of tokens) {
|
|
||||||
const tokenIndex = text.indexOf(token.text, cursor);
|
|
||||||
|
|
||||||
if (tokenIndex === -1) {
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (tokenIndex > cursor) {
|
|
||||||
segments.push({
|
|
||||||
kind: 'text',
|
|
||||||
text: text.slice(cursor, tokenIndex)
|
|
||||||
});
|
|
||||||
}
|
|
||||||
|
|
||||||
segments.push({
|
|
||||||
kind: 'word',
|
|
||||||
text: token.text,
|
|
||||||
lemma: token.lemma,
|
|
||||||
pos: token.pos
|
|
||||||
});
|
|
||||||
|
|
||||||
cursor = tokenIndex + token.text.length;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (cursor < text.length) {
|
|
||||||
segments.push({
|
|
||||||
kind: 'text',
|
|
||||||
text: text.slice(cursor)
|
|
||||||
});
|
|
||||||
}
|
|
||||||
|
|
||||||
return segments.length > 0 ? segments : [{ kind: 'text', text }];
|
|
||||||
}
|
|
||||||
|
|
||||||
function parseLinguisticParagraphs(
|
|
||||||
value: Record<string, unknown> | null | undefined
|
|
||||||
): LinguisticParagraph[] {
|
|
||||||
if (!value) {
|
|
||||||
return [];
|
|
||||||
}
|
|
||||||
|
|
||||||
const paragraphs = value.paragraphs;
|
|
||||||
if (!Array.isArray(paragraphs)) {
|
|
||||||
return [];
|
|
||||||
}
|
|
||||||
|
|
||||||
return paragraphs
|
|
||||||
.map((paragraphValue, paragraphIndex): LinguisticParagraph | null => {
|
|
||||||
if (!isRecord(paragraphValue)) {
|
|
||||||
return null;
|
|
||||||
}
|
|
||||||
|
|
||||||
const sentencesRaw = paragraphValue.sentences;
|
|
||||||
const sentenceValues = Array.isArray(sentencesRaw) ? sentencesRaw : [];
|
|
||||||
|
|
||||||
const sentences = sentenceValues
|
|
||||||
.map((sentenceValue, sentenceIndex): SentenceSegments | null => {
|
|
||||||
if (!isRecord(sentenceValue)) {
|
|
||||||
return null;
|
|
||||||
}
|
|
||||||
|
|
||||||
const sourceSentence = asString(sentenceValue.source_text) ?? '';
|
|
||||||
const targetSentence = asString(sentenceValue.target_text) ?? '';
|
|
||||||
const sourceTokens = parseTokens(sentenceValue.source_tokens);
|
|
||||||
const targetTokens = parseTokens(sentenceValue.target_tokens);
|
|
||||||
|
|
||||||
return {
|
|
||||||
key: `${paragraphIndex}-${sentenceIndex}`,
|
|
||||||
sourceText: sourceSentence,
|
|
||||||
targetText: targetSentence,
|
|
||||||
sourceSegments: buildSegments(sourceSentence, sourceTokens),
|
|
||||||
targetSegments: buildSegments(targetSentence, targetTokens)
|
|
||||||
};
|
|
||||||
})
|
|
||||||
.filter((sentence): sentence is SentenceSegments => sentence !== null);
|
|
||||||
|
|
||||||
const sourceText =
|
|
||||||
asString(paragraphValue.source_text) ??
|
|
||||||
sentences
|
|
||||||
.map((sentence) => sentence.sourceText)
|
|
||||||
.filter(Boolean)
|
|
||||||
.join(' ');
|
|
||||||
const targetText =
|
|
||||||
asString(paragraphValue.target_text) ??
|
|
||||||
sentences
|
|
||||||
.map((sentence) => sentence.targetText)
|
|
||||||
.filter(Boolean)
|
|
||||||
.join(' ');
|
|
||||||
|
|
||||||
if (sentences.length === 0 && (sourceText || targetText)) {
|
|
||||||
sentences.push({
|
|
||||||
key: `${paragraphIndex}-0`,
|
|
||||||
sourceText,
|
|
||||||
targetText,
|
|
||||||
sourceSegments: sourceText ? [{ kind: 'text', text: sourceText }] : [],
|
|
||||||
targetSegments: targetText ? [{ kind: 'text', text: targetText }] : []
|
|
||||||
});
|
|
||||||
}
|
|
||||||
|
|
||||||
if (!sourceText && !targetText && sentences.length === 0) {
|
|
||||||
return null;
|
|
||||||
}
|
|
||||||
|
|
||||||
return {
|
|
||||||
key: `p-${paragraphIndex}`,
|
|
||||||
sourceText,
|
|
||||||
targetText,
|
|
||||||
sentences
|
|
||||||
};
|
|
||||||
})
|
|
||||||
.filter((paragraph): paragraph is LinguisticParagraph => paragraph !== null);
|
|
||||||
}
|
|
||||||
|
|
||||||
function isWordLike(text: string): boolean {
|
|
||||||
return /[\p{L}\p{N}]/u.test(text);
|
|
||||||
}
|
|
||||||
|
|
||||||
const linguisticParagraphs = $derived.by(() =>
|
|
||||||
parseLinguisticParagraphs(storyTextLinguisticData)
|
|
||||||
);
|
|
||||||
const sourceParagraphs = $derived.by(() =>
|
|
||||||
linguisticParagraphs.length > 0
|
|
||||||
? linguisticParagraphs.map((paragraph) => paragraph.targetText).filter(Boolean)
|
|
||||||
: toParagraphs(sourceText)
|
|
||||||
);
|
|
||||||
const translationParagraphs = $derived.by(() =>
|
|
||||||
linguisticParagraphs.length > 0
|
|
||||||
? linguisticParagraphs.map((paragraph) => paragraph.sourceText).filter(Boolean)
|
|
||||||
: toParagraphs(translationText)
|
|
||||||
);
|
|
||||||
|
|
||||||
let selectedWord: { sentenceKey: string; text: string } | null = $state(null);
|
|
||||||
let sourcePane = $state<HTMLDivElement | undefined>();
|
let sourcePane = $state<HTMLDivElement | undefined>();
|
||||||
let translationPane = $state<HTMLDivElement | undefined>();
|
let translationPane = $state<HTMLDivElement | undefined>();
|
||||||
let suppressSourceScroll = $state(false);
|
let suppressSourceScroll = $state(false);
|
||||||
|
|
@ -315,9 +101,8 @@
|
||||||
}, 20000);
|
}, 20000);
|
||||||
}
|
}
|
||||||
|
|
||||||
function handleWordClicked(sentenceKey: string, text: string) {
|
function handleParagraphClicked(paragraphIndex: number) {
|
||||||
selectedWord = { sentenceKey, text };
|
lastClickedParagraphIndex = paragraphIndex;
|
||||||
showTranslation();
|
|
||||||
}
|
}
|
||||||
|
|
||||||
async function handleNextStepSelect(optionId: string) {
|
async function handleNextStepSelect(optionId: string) {
|
||||||
|
|
@ -361,45 +146,18 @@
|
||||||
<div class="pane source-pane">
|
<div class="pane source-pane">
|
||||||
<div class="latest-entry__pane-body" bind:this={sourcePane} onscroll={handleSourceScroll}>
|
<div class="latest-entry__pane-body" bind:this={sourcePane} onscroll={handleSourceScroll}>
|
||||||
{#if sourceParagraphs.length > 0}
|
{#if sourceParagraphs.length > 0}
|
||||||
{#if linguisticParagraphs.length > 0}
|
{#each sourceParagraphs as paragraph, index (index)}
|
||||||
{#each linguisticParagraphs as paragraph (paragraph.key)}
|
<button
|
||||||
<p class="paragraph paragraph--text" data-language="source">
|
type="button"
|
||||||
{#each paragraph.sentences as sentence (sentence.key)}
|
class="paragraph"
|
||||||
<span
|
class:active={lastClickedParagraphIndex === index}
|
||||||
class="sentence-chunk"
|
data-paragraph-index={index}
|
||||||
class:active-sentence={selectedWord?.sentenceKey === sentence.key}
|
data-language="source"
|
||||||
>
|
onclick={() => handleParagraphClicked(index)}
|
||||||
{#each sentence.targetSegments as segment, segmentIndex (`${sentence.key}-target-${segmentIndex}`)}
|
>
|
||||||
{#if segment.kind === 'word' && isWordLike(segment.text)}
|
{paragraph}
|
||||||
<button
|
</button>
|
||||||
type="button"
|
{/each}
|
||||||
class="word-token"
|
|
||||||
class:active={selectedWord?.sentenceKey === sentence.key &&
|
|
||||||
selectedWord?.text === segment.text}
|
|
||||||
title={segment.lemma ? `Lemma: ${segment.lemma}` : undefined}
|
|
||||||
onclick={() => handleWordClicked(sentence.key, segment.text)}
|
|
||||||
>
|
|
||||||
{segment.text}
|
|
||||||
</button>
|
|
||||||
{:else}
|
|
||||||
<span>{segment.text}</span>
|
|
||||||
{/if}
|
|
||||||
{/each}
|
|
||||||
</span>
|
|
||||||
{/each}
|
|
||||||
</p>
|
|
||||||
{/each}
|
|
||||||
{:else}
|
|
||||||
{#each sourceParagraphs as paragraph, index (index)}
|
|
||||||
<p
|
|
||||||
class="paragraph paragraph--text"
|
|
||||||
data-paragraph-index={index}
|
|
||||||
data-language="source"
|
|
||||||
>
|
|
||||||
{paragraph}
|
|
||||||
</p>
|
|
||||||
{/each}
|
|
||||||
{/if}
|
|
||||||
{:else}
|
{:else}
|
||||||
<div class="loading-block" role="status" aria-live="polite">
|
<div class="loading-block" role="status" aria-live="polite">
|
||||||
<p class="loading-block__label">{statusMessage || 'Writing your next entry...'}</p>
|
<p class="loading-block__label">{statusMessage || 'Writing your next entry...'}</p>
|
||||||
|
|
@ -419,12 +177,6 @@
|
||||||
</button>
|
</button>
|
||||||
</header>
|
</header>
|
||||||
|
|
||||||
{#if selectedWord}
|
|
||||||
<p class="translation-selected-word" role="status" aria-live="polite">
|
|
||||||
Selected word: <strong>{selectedWord.text}</strong>
|
|
||||||
</p>
|
|
||||||
{/if}
|
|
||||||
|
|
||||||
{#if translationVisible}
|
{#if translationVisible}
|
||||||
<div
|
<div
|
||||||
class="latest-entry__pane-body"
|
class="latest-entry__pane-body"
|
||||||
|
|
@ -432,42 +184,18 @@
|
||||||
onscroll={handleTranslationScroll}
|
onscroll={handleTranslationScroll}
|
||||||
>
|
>
|
||||||
{#if translationParagraphs.length > 0}
|
{#if translationParagraphs.length > 0}
|
||||||
{#if linguisticParagraphs.length > 0}
|
{#each translationParagraphs as paragraph, index (index)}
|
||||||
{#each linguisticParagraphs as paragraph (paragraph.key)}
|
<button
|
||||||
<p class="paragraph paragraph--text" data-language="translation">
|
type="button"
|
||||||
{#each paragraph.sentences as sentence (sentence.key)}
|
class="paragraph"
|
||||||
<span
|
class:active={lastClickedParagraphIndex === index}
|
||||||
class="sentence-chunk"
|
data-paragraph-index={index}
|
||||||
class:active-sentence={selectedWord?.sentenceKey === sentence.key}
|
data-language="translation"
|
||||||
>
|
onclick={() => handleParagraphClicked(index)}
|
||||||
{#each sentence.sourceSegments as segment, segmentIndex (`${sentence.key}-source-${segmentIndex}`)}
|
>
|
||||||
{#if segment.kind === 'word'}
|
{paragraph}
|
||||||
<span
|
</button>
|
||||||
class="word-token word-token--passive"
|
{/each}
|
||||||
class:active={selectedWord?.sentenceKey === sentence.key &&
|
|
||||||
selectedWord?.text === segment.text}
|
|
||||||
>
|
|
||||||
{segment.text}
|
|
||||||
</span>
|
|
||||||
{:else}
|
|
||||||
<span>{segment.text}</span>
|
|
||||||
{/if}
|
|
||||||
{/each}
|
|
||||||
</span>
|
|
||||||
{/each}
|
|
||||||
</p>
|
|
||||||
{/each}
|
|
||||||
{:else}
|
|
||||||
{#each translationParagraphs as paragraph, index (index)}
|
|
||||||
<p
|
|
||||||
class="paragraph paragraph--text"
|
|
||||||
data-paragraph-index={index}
|
|
||||||
data-language="translation"
|
|
||||||
>
|
|
||||||
{paragraph}
|
|
||||||
</p>
|
|
||||||
{/each}
|
|
||||||
{/if}
|
|
||||||
{:else}
|
{:else}
|
||||||
<div class="loading-block" role="status" aria-live="polite">
|
<div class="loading-block" role="status" aria-live="polite">
|
||||||
<p class="loading-block__label">
|
<p class="loading-block__label">
|
||||||
|
|
@ -783,18 +511,6 @@
|
||||||
color: color-mix(in srgb, var(--color-on-surface) 72%, transparent);
|
color: color-mix(in srgb, var(--color-on-surface) 72%, transparent);
|
||||||
}
|
}
|
||||||
|
|
||||||
.translation-selected-word {
|
|
||||||
margin: 0;
|
|
||||||
padding: 0 var(--latest-entry-pane-padding) var(--space-2);
|
|
||||||
font-size: var(--text-label-md);
|
|
||||||
color: color-mix(in srgb, var(--color-on-surface) 84%, transparent);
|
|
||||||
}
|
|
||||||
|
|
||||||
.translation-selected-word strong {
|
|
||||||
font-weight: var(--weight-semibold);
|
|
||||||
color: var(--color-primary);
|
|
||||||
}
|
|
||||||
|
|
||||||
.latest-entry__pane-body::-webkit-scrollbar {
|
.latest-entry__pane-body::-webkit-scrollbar {
|
||||||
width: 0.75rem;
|
width: 0.75rem;
|
||||||
}
|
}
|
||||||
|
|
@ -826,58 +542,9 @@
|
||||||
transition: background-color var(--duration-fast) var(--ease-standard);
|
transition: background-color var(--duration-fast) var(--ease-standard);
|
||||||
}
|
}
|
||||||
|
|
||||||
.paragraph--text {
|
.paragraph.active {
|
||||||
margin: 0;
|
background-color: color-mix(in srgb, var(--color-primary-container) 56%, transparent);
|
||||||
padding: 0;
|
border-radius: var(--radius-md);
|
||||||
border: none;
|
|
||||||
background: transparent;
|
|
||||||
cursor: default;
|
|
||||||
white-space: pre-wrap;
|
|
||||||
}
|
|
||||||
|
|
||||||
.sentence-chunk {
|
|
||||||
display: inline;
|
|
||||||
border-radius: var(--radius-sm);
|
|
||||||
transition: background-color var(--duration-fast) var(--ease-standard);
|
|
||||||
|
|
||||||
&::after {
|
|
||||||
content: '';
|
|
||||||
display: inline-block;
|
|
||||||
width: 0.85ch;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
.sentence-chunk.active-sentence {
|
|
||||||
background-color: color-mix(in srgb, var(--color-primary-container) 32%, transparent);
|
|
||||||
}
|
|
||||||
|
|
||||||
.word-token {
|
|
||||||
display: inline;
|
|
||||||
padding: 0;
|
|
||||||
margin: 0;
|
|
||||||
border: none;
|
|
||||||
background: transparent;
|
|
||||||
color: inherit;
|
|
||||||
font: inherit;
|
|
||||||
line-height: inherit;
|
|
||||||
cursor: pointer;
|
|
||||||
}
|
|
||||||
|
|
||||||
.word-token:hover {
|
|
||||||
text-decoration: underline;
|
|
||||||
}
|
|
||||||
|
|
||||||
.word-token.active {
|
|
||||||
color: var(--color-primary);
|
|
||||||
text-decoration: underline;
|
|
||||||
}
|
|
||||||
|
|
||||||
.word-token--passive {
|
|
||||||
cursor: default;
|
|
||||||
}
|
|
||||||
|
|
||||||
.word-token--passive:hover {
|
|
||||||
text-decoration: none;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
.paragraph:focus-visible {
|
.paragraph:focus-visible {
|
||||||
|
|
|
||||||
|
|
@ -4,7 +4,6 @@ export type AdventureEntry = {
|
||||||
id: string;
|
id: string;
|
||||||
story_text: string | null;
|
story_text: string | null;
|
||||||
translation: string | null;
|
translation: string | null;
|
||||||
story_text_linguistic_data: Record<string, unknown> | null;
|
|
||||||
audio_url: string | null;
|
audio_url: string | null;
|
||||||
generated_from_choice_id: string | null;
|
generated_from_choice_id: string | null;
|
||||||
possible_choices: { id: string; text: string }[] | null;
|
possible_choices: { id: string; text: string }[] | null;
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue