From 689e10d1bc64b4e4db59baab5c9782c2d528d6df Mon Sep 17 00:00:00 2001 From: wilson Date: Wed, 8 Apr 2026 20:37:00 +0100 Subject: [PATCH] feat: vocab endpoints --- .../versions/20260408_0008_add_vocab_bank.py | 96 ++++++++++ api/app/domain/models/vocab.py | 25 +++ api/app/domain/services/vocab_service.py | 135 ++++++++++++++ .../postgres/entities/vocab_entities.py | 64 +++++++ .../postgres/repositories/vocab_repository.py | 166 ++++++++++++++++++ api/app/routers/api/main.py | 2 + api/app/routers/api/vocab.py | 143 +++++++++++++++ 7 files changed, 631 insertions(+) create mode 100644 api/alembic/versions/20260408_0008_add_vocab_bank.py create mode 100644 api/app/domain/models/vocab.py create mode 100644 api/app/domain/services/vocab_service.py create mode 100644 api/app/outbound/postgres/entities/vocab_entities.py create mode 100644 api/app/outbound/postgres/repositories/vocab_repository.py create mode 100644 api/app/routers/api/vocab.py diff --git a/api/alembic/versions/20260408_0008_add_vocab_bank.py b/api/alembic/versions/20260408_0008_add_vocab_bank.py new file mode 100644 index 0000000..7d44e87 --- /dev/null +++ b/api/alembic/versions/20260408_0008_add_vocab_bank.py @@ -0,0 +1,96 @@ +"""add vocab bank tables + +Revision ID: 0008 +Revises: 0007 +Create Date: 2026-04-08 + +""" +from typing import Sequence, Union + +import sqlalchemy as sa +from alembic import op +from sqlalchemy.dialects import postgresql + +revision: str = "0008" +down_revision: Union[str, None] = "0007" +branch_labels: Union[str, Sequence[str], None] = None +depends_on: Union[str, Sequence[str], None] = None + + +def upgrade() -> None: + op.create_table( + "user_language_pair", + sa.Column("id", postgresql.UUID(as_uuid=True), primary_key=True), + sa.Column( + "user_id", + postgresql.UUID(as_uuid=True), + sa.ForeignKey("users.id", ondelete="CASCADE"), + nullable=False, + ), + sa.Column("source_lang", sa.String(2), nullable=False), + sa.Column("target_lang", sa.String(2), nullable=False), + sa.UniqueConstraint("user_id", "source_lang", "target_lang", name="uq_user_language_pair"), + ) + op.create_index("ix_user_language_pair_user_id", "user_language_pair", ["user_id"]) + + op.create_table( + "learnable_word_bank_entry", + sa.Column("id", postgresql.UUID(as_uuid=True), primary_key=True), + sa.Column( + "user_id", + postgresql.UUID(as_uuid=True), + sa.ForeignKey("users.id", ondelete="CASCADE"), + nullable=False, + ), + sa.Column( + "language_pair_id", + postgresql.UUID(as_uuid=True), + sa.ForeignKey("user_language_pair.id", ondelete="CASCADE"), + nullable=False, + ), + sa.Column( + "sense_id", + postgresql.UUID(as_uuid=True), + sa.ForeignKey("dictionary_sense.id", ondelete="SET NULL"), + nullable=True, + ), + sa.Column( + "wordform_id", + postgresql.UUID(as_uuid=True), + sa.ForeignKey("dictionary_wordform.id", ondelete="SET NULL"), + nullable=True, + ), + sa.Column("surface_text", sa.Text(), nullable=False), + sa.Column("is_phrase", sa.Boolean(), nullable=False, server_default="false"), + sa.Column("entry_pathway", sa.Text(), nullable=False), + sa.Column("source_article_id", postgresql.UUID(as_uuid=True), nullable=True), + sa.Column("disambiguation_status", sa.Text(), nullable=False, server_default="pending"), + sa.Column( + "created_at", + sa.DateTime(timezone=True), + nullable=False, + server_default=sa.func.now(), + ), + ) + op.create_index( + "ix_learnable_word_bank_entry_user_id", "learnable_word_bank_entry", ["user_id"] + ) + op.create_index( + "ix_learnable_word_bank_entry_language_pair_id", + "learnable_word_bank_entry", + ["language_pair_id"], + ) + op.create_index( + "ix_learnable_word_bank_entry_sense_id", "learnable_word_bank_entry", ["sense_id"] + ) + + +def downgrade() -> None: + op.drop_index("ix_learnable_word_bank_entry_sense_id", table_name="learnable_word_bank_entry") + op.drop_index( + "ix_learnable_word_bank_entry_language_pair_id", table_name="learnable_word_bank_entry" + ) + op.drop_index("ix_learnable_word_bank_entry_user_id", table_name="learnable_word_bank_entry") + op.drop_table("learnable_word_bank_entry") + op.drop_index("ix_user_language_pair_user_id", table_name="user_language_pair") + op.drop_table("user_language_pair") diff --git a/api/app/domain/models/vocab.py b/api/app/domain/models/vocab.py new file mode 100644 index 0000000..108d5f9 --- /dev/null +++ b/api/app/domain/models/vocab.py @@ -0,0 +1,25 @@ +from dataclasses import dataclass +from datetime import datetime + + +@dataclass +class UserLanguagePair: + id: str + user_id: str + source_lang: str + target_lang: str + + +@dataclass +class LearnableWordBankEntry: + id: str + user_id: str + language_pair_id: str + sense_id: str | None + wordform_id: str | None + surface_text: str + is_phrase: bool + entry_pathway: str + source_article_id: str | None + disambiguation_status: str + created_at: datetime diff --git a/api/app/domain/services/vocab_service.py b/api/app/domain/services/vocab_service.py new file mode 100644 index 0000000..3d5e083 --- /dev/null +++ b/api/app/domain/services/vocab_service.py @@ -0,0 +1,135 @@ +import uuid + +from ..models.vocab import LearnableWordBankEntry +from ...outbound.postgres.repositories.vocab_repository import VocabRepository +from ...outbound.postgres.repositories.dictionary_repository import DictionaryRepository + + +class VocabService: + """Manages a user's learnable word bank — adding words from various sources and + resolving which dictionary sense a word belongs to. + + Usage: + service = VocabService( + vocab_repo=PostgresVocabRepository(db), + dict_repo=PostgresDictionaryRepository(db), + ) + entry = await service.add_word_to_bank( + user_id=user.id, + surface_text="banque", + language_pair_id=pair.id, + pathway="highlight", + ) + # entry.disambiguation_status is "auto_resolved" if "banque" has exactly one + # dictionary sense, or "pending" if the user needs to pick from multiple senses. + """ + + def __init__(self, vocab_repo: VocabRepository, dict_repo: DictionaryRepository) -> None: + self.vocab_repo = vocab_repo + self.dict_repo = dict_repo + + async def add_word_to_bank( + self, + user_id: uuid.UUID, + surface_text: str, + language_pair_id: uuid.UUID, + pathway: str, + is_phrase: bool = False, + source_article_id: uuid.UUID | None = None, + ) -> LearnableWordBankEntry: + """Add a word or phrase to the user's vocab bank, automatically linking it to a + dictionary sense when exactly one match exists, or flagging it as pending + disambiguation when zero or multiple senses are found. + + Phrases (``is_phrase=True``) bypass dictionary lookup entirely and are always + created with ``disambiguation_status="pending"`` since they cannot be resolved + to a single headword. + + Usage:: + + # Word with a single sense — auto-resolved immediately + entry = await service.add_word_to_bank( + user_id=user_id, + surface_text="bisque", + language_pair_id=fr_en_pair_id, + pathway="highlight", + ) + assert entry.disambiguation_status == "auto_resolved" + + # Common word with many senses — user must pick one + entry = await service.add_word_to_bank( + user_id=user_id, + surface_text="avoir", + language_pair_id=fr_en_pair_id, + pathway="manual", + ) + assert entry.disambiguation_status == "pending" + + # Multi-word expression — skips lookup, always pending + entry = await service.add_word_to_bank( + user_id=user_id, + surface_text="avoir l'air", + language_pair_id=fr_en_pair_id, + pathway="manual", + is_phrase=True, + ) + """ + pair = await self.vocab_repo.get_language_pair(language_pair_id) + if pair is None: + raise ValueError(f"Language pair {language_pair_id} not found") + + if is_phrase: + return await self.vocab_repo.add_entry( + user_id=user_id, + language_pair_id=language_pair_id, + surface_text=surface_text, + entry_pathway=pathway, + is_phrase=True, + source_article_id=source_article_id, + disambiguation_status="pending", + ) + + senses = await self.dict_repo.get_senses_for_headword(surface_text, pair.target_lang) + + if len(senses) == 1: + sense_id = uuid.UUID(senses[0].id) + status = "auto_resolved" + elif len(senses) > 1: + sense_id = None + status = "pending" + else: + sense_id = None + status = "pending" + + return await self.vocab_repo.add_entry( + user_id=user_id, + language_pair_id=language_pair_id, + surface_text=surface_text, + entry_pathway=pathway, + is_phrase=False, + sense_id=sense_id, + source_article_id=source_article_id, + disambiguation_status=status, + ) + + async def resolve_disambiguation( + self, entry_id: uuid.UUID, sense_id: uuid.UUID + ) -> LearnableWordBankEntry: + """Attach a specific dictionary sense to a pending vocab bank entry, marking it + as ``resolved`` so it can be used for flashcard generation. + + This is called after the user selects the correct sense from the list presented + during disambiguation — for example, choosing "bank (finance)" over + "bank (river)" for the French word "banque". + + Usage:: + + # User has been shown the sense list and picked sense_id for "bank (finance)" + resolved_entry = await service.resolve_disambiguation( + entry_id=pending_entry.id, + sense_id=finance_sense_id, + ) + assert resolved_entry.disambiguation_status == "resolved" + assert resolved_entry.sense_id == str(finance_sense_id) + """ + return await self.vocab_repo.set_sense(entry_id, sense_id) diff --git a/api/app/outbound/postgres/entities/vocab_entities.py b/api/app/outbound/postgres/entities/vocab_entities.py new file mode 100644 index 0000000..9b70aa0 --- /dev/null +++ b/api/app/outbound/postgres/entities/vocab_entities.py @@ -0,0 +1,64 @@ +import uuid +from datetime import datetime, timezone + +from sqlalchemy import Boolean, ForeignKey, String, Text, UniqueConstraint, DateTime +from sqlalchemy.orm import Mapped, mapped_column +from sqlalchemy.dialects.postgresql import UUID + +from ..database import Base + + +class UserLanguagePairEntity(Base): + __tablename__ = "user_language_pair" + __table_args__ = (UniqueConstraint("user_id", "source_lang", "target_lang"),) + + id: Mapped[uuid.UUID] = mapped_column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4) + user_id: Mapped[uuid.UUID] = mapped_column( + UUID(as_uuid=True), + ForeignKey("users.id", ondelete="CASCADE"), + nullable=False, + index=True, + ) + source_lang: Mapped[str] = mapped_column(String(2), nullable=False) + target_lang: Mapped[str] = mapped_column(String(2), nullable=False) + + +class LearnableWordBankEntryEntity(Base): + __tablename__ = "learnable_word_bank_entry" + + id: Mapped[uuid.UUID] = mapped_column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4) + user_id: Mapped[uuid.UUID] = mapped_column( + UUID(as_uuid=True), + ForeignKey("users.id", ondelete="CASCADE"), + nullable=False, + index=True, + ) + language_pair_id: Mapped[uuid.UUID] = mapped_column( + UUID(as_uuid=True), + ForeignKey("user_language_pair.id", ondelete="CASCADE"), + nullable=False, + index=True, + ) + sense_id: Mapped[uuid.UUID | None] = mapped_column( + UUID(as_uuid=True), + ForeignKey("dictionary_sense.id", ondelete="SET NULL"), + nullable=True, + index=True, + ) + wordform_id: Mapped[uuid.UUID | None] = mapped_column( + UUID(as_uuid=True), + ForeignKey("dictionary_wordform.id", ondelete="SET NULL"), + nullable=True, + ) + surface_text: Mapped[str] = mapped_column(Text, nullable=False) + is_phrase: Mapped[bool] = mapped_column(Boolean, nullable=False, default=False) + entry_pathway: Mapped[str] = mapped_column(Text, nullable=False) + source_article_id: Mapped[uuid.UUID | None] = mapped_column( + UUID(as_uuid=True), nullable=True + ) + disambiguation_status: Mapped[str] = mapped_column(Text, nullable=False, default="pending") + created_at: Mapped[datetime] = mapped_column( + DateTime(timezone=True), + nullable=False, + default=lambda: datetime.now(timezone.utc), + ) diff --git a/api/app/outbound/postgres/repositories/vocab_repository.py b/api/app/outbound/postgres/repositories/vocab_repository.py new file mode 100644 index 0000000..00f38e3 --- /dev/null +++ b/api/app/outbound/postgres/repositories/vocab_repository.py @@ -0,0 +1,166 @@ +import uuid +from datetime import datetime, timezone +from typing import Protocol + +from sqlalchemy import select +from sqlalchemy.ext.asyncio import AsyncSession + +from ..entities.vocab_entities import LearnableWordBankEntryEntity, UserLanguagePairEntity +from ....domain.models.vocab import LearnableWordBankEntry, UserLanguagePair + + +class VocabRepository(Protocol): + async def get_or_create_language_pair( + self, user_id: uuid.UUID, source_lang: str, target_lang: str + ) -> UserLanguagePair: ... + + async def get_language_pair(self, language_pair_id: uuid.UUID) -> UserLanguagePair | None: ... + + async def add_entry( + self, + user_id: uuid.UUID, + language_pair_id: uuid.UUID, + surface_text: str, + entry_pathway: str, + is_phrase: bool = False, + sense_id: uuid.UUID | None = None, + wordform_id: uuid.UUID | None = None, + source_article_id: uuid.UUID | None = None, + disambiguation_status: str = "pending", + ) -> LearnableWordBankEntry: ... + + async def get_entries_for_user( + self, user_id: uuid.UUID, language_pair_id: uuid.UUID + ) -> list[LearnableWordBankEntry]: ... + + async def set_sense( + self, entry_id: uuid.UUID, sense_id: uuid.UUID + ) -> LearnableWordBankEntry: ... + + async def get_pending_disambiguation(self, user_id: uuid.UUID) -> list[LearnableWordBankEntry]: ... + + +def _pair_to_model(entity: UserLanguagePairEntity) -> UserLanguagePair: + return UserLanguagePair( + id=str(entity.id), + user_id=str(entity.user_id), + source_lang=entity.source_lang, + target_lang=entity.target_lang, + ) + + +def _entry_to_model(entity: LearnableWordBankEntryEntity) -> LearnableWordBankEntry: + return LearnableWordBankEntry( + id=str(entity.id), + user_id=str(entity.user_id), + language_pair_id=str(entity.language_pair_id), + sense_id=str(entity.sense_id) if entity.sense_id else None, + wordform_id=str(entity.wordform_id) if entity.wordform_id else None, + surface_text=entity.surface_text, + is_phrase=entity.is_phrase, + entry_pathway=entity.entry_pathway, + source_article_id=str(entity.source_article_id) if entity.source_article_id else None, + disambiguation_status=entity.disambiguation_status, + created_at=entity.created_at, + ) + + +class PostgresVocabRepository: + def __init__(self, db: AsyncSession) -> None: + self.db = db + + async def get_or_create_language_pair( + self, user_id: uuid.UUID, source_lang: str, target_lang: str + ) -> UserLanguagePair: + result = await self.db.execute( + select(UserLanguagePairEntity).where( + UserLanguagePairEntity.user_id == user_id, + UserLanguagePairEntity.source_lang == source_lang, + UserLanguagePairEntity.target_lang == target_lang, + ) + ) + entity = result.scalar_one_or_none() + if entity is None: + entity = UserLanguagePairEntity( + user_id=user_id, + source_lang=source_lang, + target_lang=target_lang, + ) + self.db.add(entity) + await self.db.flush() + return _pair_to_model(entity) + + async def get_language_pair(self, language_pair_id: uuid.UUID) -> UserLanguagePair | None: + result = await self.db.execute( + select(UserLanguagePairEntity).where(UserLanguagePairEntity.id == language_pair_id) + ) + entity = result.scalar_one_or_none() + return _pair_to_model(entity) if entity else None + + async def add_entry( + self, + user_id: uuid.UUID, + language_pair_id: uuid.UUID, + surface_text: str, + entry_pathway: str, + is_phrase: bool = False, + sense_id: uuid.UUID | None = None, + wordform_id: uuid.UUID | None = None, + source_article_id: uuid.UUID | None = None, + disambiguation_status: str = "pending", + ) -> LearnableWordBankEntry: + entity = LearnableWordBankEntryEntity( + user_id=user_id, + language_pair_id=language_pair_id, + surface_text=surface_text, + entry_pathway=entry_pathway, + is_phrase=is_phrase, + sense_id=sense_id, + wordform_id=wordform_id, + source_article_id=source_article_id, + disambiguation_status=disambiguation_status, + created_at=datetime.now(timezone.utc), + ) + self.db.add(entity) + await self.db.commit() + await self.db.refresh(entity) + return _entry_to_model(entity) + + async def get_entries_for_user( + self, user_id: uuid.UUID, language_pair_id: uuid.UUID + ) -> list[LearnableWordBankEntry]: + result = await self.db.execute( + select(LearnableWordBankEntryEntity) + .where( + LearnableWordBankEntryEntity.user_id == user_id, + LearnableWordBankEntryEntity.language_pair_id == language_pair_id, + ) + .order_by(LearnableWordBankEntryEntity.created_at.desc()) + ) + return [_entry_to_model(e) for e in result.scalars().all()] + + async def set_sense( + self, entry_id: uuid.UUID, sense_id: uuid.UUID + ) -> LearnableWordBankEntry: + result = await self.db.execute( + select(LearnableWordBankEntryEntity).where( + LearnableWordBankEntryEntity.id == entry_id + ) + ) + entity = result.scalar_one() + entity.sense_id = sense_id + entity.disambiguation_status = "resolved" + await self.db.commit() + await self.db.refresh(entity) + return _entry_to_model(entity) + + async def get_pending_disambiguation(self, user_id: uuid.UUID) -> list[LearnableWordBankEntry]: + result = await self.db.execute( + select(LearnableWordBankEntryEntity) + .where( + LearnableWordBankEntryEntity.user_id == user_id, + LearnableWordBankEntryEntity.disambiguation_status == "pending", + ) + .order_by(LearnableWordBankEntryEntity.created_at.desc()) + ) + return [_entry_to_model(e) for e in result.scalars().all()] diff --git a/api/app/routers/api/main.py b/api/app/routers/api/main.py index bef4d48..f671211 100644 --- a/api/app/routers/api/main.py +++ b/api/app/routers/api/main.py @@ -3,6 +3,7 @@ from .translate import router as translate_router from .generation import router as generation_router from .jobs import router as jobs_router from .learnable_languages import router as learnable_languages_router +from .vocab import router as vocab_router from fastapi import APIRouter @@ -13,3 +14,4 @@ api_router.include_router(translate_router) api_router.include_router(generation_router) api_router.include_router(jobs_router) api_router.include_router(learnable_languages_router) +api_router.include_router(vocab_router) diff --git a/api/app/routers/api/vocab.py b/api/app/routers/api/vocab.py new file mode 100644 index 0000000..40bedde --- /dev/null +++ b/api/app/routers/api/vocab.py @@ -0,0 +1,143 @@ +import uuid + +from fastapi import APIRouter, Depends, HTTPException +from pydantic import BaseModel +from sqlalchemy.ext.asyncio import AsyncSession + +from ...auth import verify_token +from ...domain.services.vocab_service import VocabService +from ...outbound.postgres.database import get_db +from ...outbound.postgres.repositories.dictionary_repository import PostgresDictionaryRepository +from ...outbound.postgres.repositories.vocab_repository import PostgresVocabRepository + +router = APIRouter(prefix="/vocab", tags=["vocab"]) + + +class AddWordRequest(BaseModel): + language_pair_id: str + surface_text: str + entry_pathway: str = "manual" + is_phrase: bool = False + source_article_id: str | None = None + + +class SetSenseRequest(BaseModel): + sense_id: str + + +class WordBankEntryResponse(BaseModel): + id: str + user_id: str + language_pair_id: str + sense_id: str | None + wordform_id: str | None + surface_text: str + is_phrase: bool + entry_pathway: str + source_article_id: str | None + disambiguation_status: str + created_at: str + + +def _service(db: AsyncSession) -> VocabService: + return VocabService( + vocab_repo=PostgresVocabRepository(db), + dict_repo=PostgresDictionaryRepository(db), + ) + + +@router.post("", response_model=WordBankEntryResponse, status_code=201) +async def add_word( + request: AddWordRequest, + db: AsyncSession = Depends(get_db), + token_data: dict = Depends(verify_token), +) -> WordBankEntryResponse: + user_id = uuid.UUID(token_data["sub"]) + try: + language_pair_id = uuid.UUID(request.language_pair_id) + except ValueError: + raise HTTPException(status_code=400, detail="Invalid language_pair_id") + + source_article_id = None + if request.source_article_id: + try: + source_article_id = uuid.UUID(request.source_article_id) + except ValueError: + raise HTTPException(status_code=400, detail="Invalid source_article_id") + + try: + entry = await _service(db).add_word_to_bank( + user_id=user_id, + surface_text=request.surface_text.strip(), + language_pair_id=language_pair_id, + pathway=request.entry_pathway, + is_phrase=request.is_phrase, + source_article_id=source_article_id, + ) + except ValueError as exc: + raise HTTPException(status_code=404, detail=str(exc)) + + return _to_response(entry) + + +@router.get("", response_model=list[WordBankEntryResponse]) +async def list_entries( + language_pair_id: str, + db: AsyncSession = Depends(get_db), + token_data: dict = Depends(verify_token), +) -> list[WordBankEntryResponse]: + user_id = uuid.UUID(token_data["sub"]) + try: + pair_id = uuid.UUID(language_pair_id) + except ValueError: + raise HTTPException(status_code=400, detail="Invalid language_pair_id") + + entries = await PostgresVocabRepository(db).get_entries_for_user(user_id, pair_id) + return [_to_response(e) for e in entries] + + +@router.get("/pending-disambiguation", response_model=list[WordBankEntryResponse]) +async def pending_disambiguation( + db: AsyncSession = Depends(get_db), + token_data: dict = Depends(verify_token), +) -> list[WordBankEntryResponse]: + user_id = uuid.UUID(token_data["sub"]) + entries = await PostgresVocabRepository(db).get_pending_disambiguation(user_id) + return [_to_response(e) for e in entries] + + +@router.patch("/{entry_id}/sense", response_model=WordBankEntryResponse) +async def resolve_sense( + entry_id: str, + request: SetSenseRequest, + db: AsyncSession = Depends(get_db), + token_data: dict = Depends(verify_token), +) -> WordBankEntryResponse: + try: + eid = uuid.UUID(entry_id) + sid = uuid.UUID(request.sense_id) + except ValueError: + raise HTTPException(status_code=400, detail="Invalid UUID") + + try: + entry = await _service(db).resolve_disambiguation(eid, sid) + except Exception: + raise HTTPException(status_code=404, detail="Entry not found") + + return _to_response(entry) + + +def _to_response(entry) -> WordBankEntryResponse: + return WordBankEntryResponse( + id=entry.id, + user_id=entry.user_id, + language_pair_id=entry.language_pair_id, + sense_id=entry.sense_id, + wordform_id=entry.wordform_id, + surface_text=entry.surface_text, + is_phrase=entry.is_phrase, + entry_pathway=entry.entry_pathway, + source_article_id=entry.source_article_id, + disambiguation_status=entry.disambiguation_status, + created_at=entry.created_at.isoformat(), + )