feat: vocab endpoints

This commit is contained in:
wilson 2026-04-08 20:37:00 +01:00
parent 486e0bf3d5
commit 689e10d1bc
7 changed files with 631 additions and 0 deletions

View file

@ -0,0 +1,96 @@
"""add vocab bank tables
Revision ID: 0008
Revises: 0007
Create Date: 2026-04-08
"""
from typing import Sequence, Union
import sqlalchemy as sa
from alembic import op
from sqlalchemy.dialects import postgresql
# Alembic revision identifiers: this migration is "0008" and applies on top of "0007".
revision: str = "0008"
down_revision: Union[str, None] = "0007"
# No branch labels or cross-revision dependencies are declared for this migration.
branch_labels: Union[str, Sequence[str], None] = None
depends_on: Union[str, Sequence[str], None] = None
def upgrade() -> None:
    """Create the ``user_language_pair`` and ``learnable_word_bank_entry`` tables.

    Each table's indexes are created right after the table itself.
    """

    def uuid_column(name, *extras, **options):
        # All identifiers in these tables use the native Postgres UUID type.
        return sa.Column(name, postgresql.UUID(as_uuid=True), *extras, **options)

    def uuid_reference(name, target, ondelete, nullable):
        # UUID column carrying a foreign key with the given ON DELETE rule.
        return uuid_column(
            name,
            sa.ForeignKey(target, ondelete=ondelete),
            nullable=nullable,
        )

    pair_table = "user_language_pair"
    entry_table = "learnable_word_bank_entry"

    # --- user_language_pair -------------------------------------------------
    op.create_table(
        pair_table,
        uuid_column("id", primary_key=True),
        uuid_reference("user_id", "users.id", "CASCADE", False),
        sa.Column("source_lang", sa.String(2), nullable=False),
        sa.Column("target_lang", sa.String(2), nullable=False),
        sa.UniqueConstraint(
            "user_id",
            "source_lang",
            "target_lang",
            name="uq_user_language_pair",
        ),
    )
    op.create_index("ix_user_language_pair_user_id", pair_table, ["user_id"])

    # --- learnable_word_bank_entry ------------------------------------------
    op.create_table(
        entry_table,
        uuid_column("id", primary_key=True),
        # Rows disappear together with their owning user / language pair.
        uuid_reference("user_id", "users.id", "CASCADE", False),
        uuid_reference("language_pair_id", "user_language_pair.id", "CASCADE", False),
        # Dictionary links are optional and are nulled out if the dictionary row goes away.
        uuid_reference("sense_id", "dictionary_sense.id", "SET NULL", True),
        uuid_reference("wordform_id", "dictionary_wordform.id", "SET NULL", True),
        sa.Column("surface_text", sa.Text(), nullable=False),
        sa.Column("is_phrase", sa.Boolean(), nullable=False, server_default="false"),
        sa.Column("entry_pathway", sa.Text(), nullable=False),
        # Plain UUID: no foreign key is declared for the source article.
        uuid_column("source_article_id", nullable=True),
        sa.Column(
            "disambiguation_status",
            sa.Text(),
            nullable=False,
            server_default="pending",
        ),
        sa.Column(
            "created_at",
            sa.DateTime(timezone=True),
            nullable=False,
            server_default=sa.func.now(),
        ),
    )

    entry_indexes = (
        ("ix_learnable_word_bank_entry_user_id", "user_id"),
        ("ix_learnable_word_bank_entry_language_pair_id", "language_pair_id"),
        ("ix_learnable_word_bank_entry_sense_id", "sense_id"),
    )
    for index_name, column_name in entry_indexes:
        op.create_index(index_name, entry_table, [column_name])
def downgrade() -> None:
    """Drop the vocab bank tables and their indexes.

    The entry table goes first because it references ``user_language_pair``;
    for each table the indexes are dropped before the table itself.
    """
    teardown_plan = (
        (
            "learnable_word_bank_entry",
            (
                "ix_learnable_word_bank_entry_sense_id",
                "ix_learnable_word_bank_entry_language_pair_id",
                "ix_learnable_word_bank_entry_user_id",
            ),
        ),
        (
            "user_language_pair",
            ("ix_user_language_pair_user_id",),
        ),
    )
    for table_name, index_names in teardown_plan:
        for index_name in index_names:
            op.drop_index(index_name, table_name=table_name)
        op.drop_table(table_name)

View file

@ -0,0 +1,25 @@
from dataclasses import dataclass
from datetime import datetime
@dataclass
class UserLanguagePair:
    """Domain model for a source/target language combination registered for a user.

    Identifiers are carried as plain strings in this model.
    """

    id: str
    user_id: str
    source_lang: str
    target_lang: str
@dataclass
class LearnableWordBankEntry:
    """Domain model for one word or phrase in a user's learnable word bank.

    Identifiers are carried as plain strings; the optional ones are ``None``
    when no corresponding record is linked.
    """

    id: str
    user_id: str
    language_pair_id: str
    # Dictionary links; both may be absent.
    sense_id: str | None
    wordform_id: str | None
    # The text exactly as it was added to the bank.
    surface_text: str
    is_phrase: bool
    entry_pathway: str
    source_article_id: str | None
    disambiguation_status: str
    created_at: datetime

View file

@ -0,0 +1,135 @@
import uuid
from ..models.vocab import LearnableWordBankEntry
from ...outbound.postgres.repositories.vocab_repository import VocabRepository
from ...outbound.postgres.repositories.dictionary_repository import DictionaryRepository
class VocabService:
    """Manages a user's learnable word bank — adding words from various sources and
    resolving which dictionary sense a word belongs to.

    Usage:
        service = VocabService(
            vocab_repo=PostgresVocabRepository(db),
            dict_repo=PostgresDictionaryRepository(db),
        )
        entry = await service.add_word_to_bank(
            user_id=user.id,
            surface_text="banque",
            language_pair_id=pair.id,
            pathway="highlight",
        )
        # entry.disambiguation_status is "auto_resolved" if "banque" has exactly one
        # dictionary sense, or "pending" if the user needs to pick from multiple senses.
    """

    def __init__(self, vocab_repo: VocabRepository, dict_repo: DictionaryRepository) -> None:
        self.vocab_repo = vocab_repo
        self.dict_repo = dict_repo

    async def add_word_to_bank(
        self,
        user_id: uuid.UUID,
        surface_text: str,
        language_pair_id: uuid.UUID,
        pathway: str,
        is_phrase: bool = False,
        source_article_id: uuid.UUID | None = None,
    ) -> LearnableWordBankEntry:
        """Add a word or phrase to the user's vocab bank.

        A single word is looked up in the dictionary using the language pair's
        ``target_lang``. When exactly one sense matches, the entry is linked to
        it and stored as ``"auto_resolved"``; with zero or several matches it is
        stored as ``"pending"`` without a sense.

        Phrases (``is_phrase=True``) bypass the dictionary lookup entirely and
        are always stored as ``"pending"``.

        Args:
            user_id: Owner of the new entry; must also own the language pair.
            surface_text: The word or phrase as the user encountered it.
            language_pair_id: Language pair the entry is filed under.
            pathway: How the entry reached the bank (stored as ``entry_pathway``).
            is_phrase: Whether ``surface_text`` is a multi-word expression.
            source_article_id: Optional article the text was taken from.

        Returns:
            The entry as persisted by the vocab repository.

        Raises:
            ValueError: If the language pair does not exist or belongs to a
                different user. Both cases produce the same message so callers
                cannot probe for other users' pairs.

        Usage::

            # Word with a single sense — auto-resolved immediately
            entry = await service.add_word_to_bank(
                user_id=user_id,
                surface_text="bisque",
                language_pair_id=fr_en_pair_id,
                pathway="highlight",
            )
            assert entry.disambiguation_status == "auto_resolved"

            # Multi-word expression — skips lookup, always pending
            entry = await service.add_word_to_bank(
                user_id=user_id,
                surface_text="avoir l'air",
                language_pair_id=fr_en_pair_id,
                pathway="manual",
                is_phrase=True,
            )
            assert entry.disambiguation_status == "pending"
        """
        pair = await self.vocab_repo.get_language_pair(language_pair_id)
        # Compare as strings: the domain model carries ids as str while callers
        # pass uuid.UUID. A pair owned by someone else is reported as missing.
        if pair is None or str(pair.user_id) != str(user_id):
            raise ValueError(f"Language pair {language_pair_id} not found")

        # Default outcome: nothing linked, the user has to disambiguate later.
        sense_id: uuid.UUID | None = None
        status = "pending"

        if not is_phrase:
            senses = await self.dict_repo.get_senses_for_headword(surface_text, pair.target_lang)
            if len(senses) == 1:
                # str() first so both "str" and "uuid.UUID" ids are accepted.
                sense_id = uuid.UUID(str(senses[0].id))
                status = "auto_resolved"

        return await self.vocab_repo.add_entry(
            user_id=user_id,
            language_pair_id=language_pair_id,
            surface_text=surface_text,
            entry_pathway=pathway,
            is_phrase=is_phrase,
            sense_id=sense_id,
            source_article_id=source_article_id,
            disambiguation_status=status,
        )

    async def resolve_disambiguation(
        self, entry_id: uuid.UUID, sense_id: uuid.UUID
    ) -> LearnableWordBankEntry:
        """Attach a specific dictionary sense to a vocab bank entry.

        This is called after the user selects the correct sense from the list
        presented during disambiguation — for example, choosing
        "bank (finance)" over "bank (river)" for the French word "banque".

        The call is delegated unchanged to ``vocab_repo.set_sense``; this method
        performs no ownership or status checks of its own, so callers must
        authorise the request first.

        Args:
            entry_id: Entry to update.
            sense_id: Dictionary sense chosen by the user.

        Returns:
            The updated entry as returned by the repository.

        Usage::

            resolved_entry = await service.resolve_disambiguation(
                entry_id=pending_entry.id,
                sense_id=finance_sense_id,
            )
            assert resolved_entry.disambiguation_status == "resolved"
            assert resolved_entry.sense_id == str(finance_sense_id)
        """
        return await self.vocab_repo.set_sense(entry_id, sense_id)

View file

@ -0,0 +1,64 @@
import uuid
from datetime import datetime, timezone
from sqlalchemy import Boolean, ForeignKey, String, Text, UniqueConstraint, DateTime
from sqlalchemy.orm import Mapped, mapped_column
from sqlalchemy.dialects.postgresql import UUID
from ..database import Base
class UserLanguagePairEntity(Base):
    """ORM mapping for the ``user_language_pair`` table.

    One row per (user, source language, target language) combination; the
    combination is unique per user.
    """

    __tablename__ = "user_language_pair"
    # The constraint is named explicitly so the ORM metadata agrees with the
    # "uq_user_language_pair" constraint the Alembic migration creates; an
    # unnamed constraint here would not match the database object by name.
    __table_args__ = (
        UniqueConstraint(
            "user_id",
            "source_lang",
            "target_lang",
            name="uq_user_language_pair",
        ),
    )

    # Generated client-side; the table defines no server default for ids.
    id: Mapped[uuid.UUID] = mapped_column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)
    # Pairs are removed together with their owning user.
    user_id: Mapped[uuid.UUID] = mapped_column(
        UUID(as_uuid=True),
        ForeignKey("users.id", ondelete="CASCADE"),
        nullable=False,
        index=True,
    )
    # Language codes, limited to two characters by the column type.
    source_lang: Mapped[str] = mapped_column(String(2), nullable=False)
    target_lang: Mapped[str] = mapped_column(String(2), nullable=False)
class LearnableWordBankEntryEntity(Base):
    """ORM mapping for the ``learnable_word_bank_entry`` table."""

    __tablename__ = "learnable_word_bank_entry"

    # Primary key, generated client-side.
    id: Mapped[uuid.UUID] = mapped_column(
        UUID(as_uuid=True),
        default=uuid.uuid4,
        primary_key=True,
    )

    # Ownership: rows are deleted together with the user or the language pair.
    user_id: Mapped[uuid.UUID] = mapped_column(
        UUID(as_uuid=True),
        ForeignKey("users.id", ondelete="CASCADE"),
        index=True,
        nullable=False,
    )
    language_pair_id: Mapped[uuid.UUID] = mapped_column(
        UUID(as_uuid=True),
        ForeignKey("user_language_pair.id", ondelete="CASCADE"),
        index=True,
        nullable=False,
    )

    # Optional dictionary links, nulled out when the dictionary row is deleted.
    sense_id: Mapped[uuid.UUID | None] = mapped_column(
        UUID(as_uuid=True),
        ForeignKey("dictionary_sense.id", ondelete="SET NULL"),
        index=True,
        nullable=True,
    )
    wordform_id: Mapped[uuid.UUID | None] = mapped_column(
        UUID(as_uuid=True),
        ForeignKey("dictionary_wordform.id", ondelete="SET NULL"),
        nullable=True,
    )

    # What was added and how.
    surface_text: Mapped[str] = mapped_column(Text, nullable=False)
    is_phrase: Mapped[bool] = mapped_column(Boolean, default=False, nullable=False)
    entry_pathway: Mapped[str] = mapped_column(Text, nullable=False)
    # Plain UUID column: no foreign key is declared for the source article.
    source_article_id: Mapped[uuid.UUID | None] = mapped_column(
        UUID(as_uuid=True),
        nullable=True,
    )

    # New rows start out as "pending" unless the caller says otherwise.
    disambiguation_status: Mapped[str] = mapped_column(
        Text,
        default="pending",
        nullable=False,
    )
    # Timezone-aware creation timestamp, defaulting to the current UTC time.
    created_at: Mapped[datetime] = mapped_column(
        DateTime(timezone=True),
        default=lambda: datetime.now(timezone.utc),
        nullable=False,
    )

View file

@ -0,0 +1,166 @@
import uuid
from datetime import datetime, timezone
from typing import Protocol
from sqlalchemy import select
from sqlalchemy.ext.asyncio import AsyncSession
from ..entities.vocab_entities import LearnableWordBankEntryEntity, UserLanguagePairEntity
from ....domain.models.vocab import LearnableWordBankEntry, UserLanguagePair
class VocabRepository(Protocol):
    """Structural interface for persistence of language pairs and word bank entries."""

    async def get_or_create_language_pair(
        self,
        user_id: uuid.UUID,
        source_lang: str,
        target_lang: str,
    ) -> UserLanguagePair:
        """Return the user's pair for the two languages, creating it if absent."""
        ...

    async def get_language_pair(self, language_pair_id: uuid.UUID) -> UserLanguagePair | None:
        """Return the pair with this id, or ``None`` when there is none."""
        ...

    async def add_entry(
        self,
        user_id: uuid.UUID,
        language_pair_id: uuid.UUID,
        surface_text: str,
        entry_pathway: str,
        is_phrase: bool = False,
        sense_id: uuid.UUID | None = None,
        wordform_id: uuid.UUID | None = None,
        source_article_id: uuid.UUID | None = None,
        disambiguation_status: str = "pending",
    ) -> LearnableWordBankEntry:
        """Persist a new word bank entry and return it."""
        ...

    async def get_entries_for_user(
        self,
        user_id: uuid.UUID,
        language_pair_id: uuid.UUID,
    ) -> list[LearnableWordBankEntry]:
        """Return the user's entries for one language pair."""
        ...

    async def set_sense(
        self,
        entry_id: uuid.UUID,
        sense_id: uuid.UUID,
    ) -> LearnableWordBankEntry:
        """Link an entry to a dictionary sense and return the updated entry."""
        ...

    async def get_pending_disambiguation(
        self, user_id: uuid.UUID
    ) -> list[LearnableWordBankEntry]:
        """Return the user's entries that are pending disambiguation."""
        ...
def _pair_to_model(entity: UserLanguagePairEntity) -> UserLanguagePair:
    """Convert a language-pair ORM row into its domain model, stringifying the ids."""
    pair_id = str(entity.id)
    owner_id = str(entity.user_id)
    return UserLanguagePair(
        id=pair_id,
        user_id=owner_id,
        source_lang=entity.source_lang,
        target_lang=entity.target_lang,
    )
def _entry_to_model(entity: LearnableWordBankEntryEntity) -> LearnableWordBankEntry:
    """Convert a word-bank ORM row into its domain model.

    Identifiers are stringified; optional identifiers that are unset stay ``None``.
    """

    def optional_id(value):
        # Falsy (unset) values map to None, anything else is stringified.
        return str(value) if value else None

    return LearnableWordBankEntry(
        id=str(entity.id),
        user_id=str(entity.user_id),
        language_pair_id=str(entity.language_pair_id),
        sense_id=optional_id(entity.sense_id),
        wordform_id=optional_id(entity.wordform_id),
        surface_text=entity.surface_text,
        is_phrase=entity.is_phrase,
        entry_pathway=entity.entry_pathway,
        source_article_id=optional_id(entity.source_article_id),
        disambiguation_status=entity.disambiguation_status,
        created_at=entity.created_at,
    )
class PostgresVocabRepository:
    """SQLAlchemy implementation of the vocab repository on a caller-supplied async session."""

    def __init__(self, db: AsyncSession) -> None:
        self.db = db

    async def get_or_create_language_pair(
        self, user_id: uuid.UUID, source_lang: str, target_lang: str
    ) -> UserLanguagePair:
        """Return the user's pair for these languages, inserting one if none exists.

        A newly created pair is flushed to the session but not committed here.
        """
        lookup = select(UserLanguagePairEntity).where(
            UserLanguagePairEntity.user_id == user_id,
            UserLanguagePairEntity.source_lang == source_lang,
            UserLanguagePairEntity.target_lang == target_lang,
        )
        existing = (await self.db.execute(lookup)).scalar_one_or_none()
        if existing is not None:
            return _pair_to_model(existing)

        created = UserLanguagePairEntity(
            user_id=user_id,
            source_lang=source_lang,
            target_lang=target_lang,
        )
        self.db.add(created)
        # Flush so the new row is sent to the database before it is converted.
        await self.db.flush()
        return _pair_to_model(created)

    async def get_language_pair(self, language_pair_id: uuid.UUID) -> UserLanguagePair | None:
        """Return the pair with the given id, or ``None`` if no such row exists."""
        lookup = select(UserLanguagePairEntity).where(
            UserLanguagePairEntity.id == language_pair_id
        )
        row = (await self.db.execute(lookup)).scalar_one_or_none()
        if row is None:
            return None
        return _pair_to_model(row)

    async def add_entry(
        self,
        user_id: uuid.UUID,
        language_pair_id: uuid.UUID,
        surface_text: str,
        entry_pathway: str,
        is_phrase: bool = False,
        sense_id: uuid.UUID | None = None,
        wordform_id: uuid.UUID | None = None,
        source_article_id: uuid.UUID | None = None,
        disambiguation_status: str = "pending",
    ) -> LearnableWordBankEntry:
        """Insert a word bank entry, commit, and return the refreshed row as a model."""
        column_values = dict(
            user_id=user_id,
            language_pair_id=language_pair_id,
            surface_text=surface_text,
            entry_pathway=entry_pathway,
            is_phrase=is_phrase,
            sense_id=sense_id,
            wordform_id=wordform_id,
            source_article_id=source_article_id,
            disambiguation_status=disambiguation_status,
            # Stamped explicitly with the current UTC time.
            created_at=datetime.now(timezone.utc),
        )
        new_row = LearnableWordBankEntryEntity(**column_values)
        self.db.add(new_row)
        await self.db.commit()
        await self.db.refresh(new_row)
        return _entry_to_model(new_row)

    async def get_entries_for_user(
        self, user_id: uuid.UUID, language_pair_id: uuid.UUID
    ) -> list[LearnableWordBankEntry]:
        """Return the user's entries for one language pair, newest first."""
        return await self._list_entries(
            LearnableWordBankEntryEntity.user_id == user_id,
            LearnableWordBankEntryEntity.language_pair_id == language_pair_id,
        )

    async def set_sense(
        self, entry_id: uuid.UUID, sense_id: uuid.UUID
    ) -> LearnableWordBankEntry:
        """Attach ``sense_id`` to the entry, mark it ``"resolved"``, and commit.

        The lookup uses ``scalar_one()``, so a missing entry surfaces as the
        exception that call raises rather than as a ``None`` return.
        """
        lookup = select(LearnableWordBankEntryEntity).where(
            LearnableWordBankEntryEntity.id == entry_id
        )
        row = (await self.db.execute(lookup)).scalar_one()
        row.sense_id = sense_id
        row.disambiguation_status = "resolved"
        await self.db.commit()
        await self.db.refresh(row)
        return _entry_to_model(row)

    async def get_pending_disambiguation(self, user_id: uuid.UUID) -> list[LearnableWordBankEntry]:
        """Return the user's entries whose status is ``"pending"``, newest first."""
        return await self._list_entries(
            LearnableWordBankEntryEntity.user_id == user_id,
            LearnableWordBankEntryEntity.disambiguation_status == "pending",
        )

    async def _list_entries(self, *conditions) -> list[LearnableWordBankEntry]:
        """Select entries matching all ``conditions``, ordered by ``created_at`` descending."""
        query = (
            select(LearnableWordBankEntryEntity)
            .where(*conditions)
            .order_by(LearnableWordBankEntryEntity.created_at.desc())
        )
        rows = (await self.db.execute(query)).scalars().all()
        return [_entry_to_model(row) for row in rows]

View file

@ -3,6 +3,7 @@ from .translate import router as translate_router
from .generation import router as generation_router
from .jobs import router as jobs_router
from .learnable_languages import router as learnable_languages_router
from .vocab import router as vocab_router
from fastapi import APIRouter
@ -13,3 +14,4 @@ api_router.include_router(translate_router)
api_router.include_router(generation_router)
api_router.include_router(jobs_router)
api_router.include_router(learnable_languages_router)
api_router.include_router(vocab_router)

View file

@ -0,0 +1,143 @@
import uuid
from fastapi import APIRouter, Depends, HTTPException
from pydantic import BaseModel
from sqlalchemy.ext.asyncio import AsyncSession
from ...auth import verify_token
from ...domain.services.vocab_service import VocabService
from ...outbound.postgres.database import get_db
from ...outbound.postgres.repositories.dictionary_repository import PostgresDictionaryRepository
from ...outbound.postgres.repositories.vocab_repository import PostgresVocabRepository
# All vocab endpoints are mounted under the "/vocab" prefix and tagged "vocab".
router = APIRouter(prefix="/vocab", tags=["vocab"])
# Request body for adding a word or phrase to the bank.
# Documented with comments rather than a docstring so the generated API schema is unchanged.
class AddWordRequest(BaseModel):
    # Received as a string; the add_word handler parses it into a UUID.
    language_pair_id: str
    surface_text: str
    # Defaults to "manual" when the client does not say how the word was added.
    entry_pathway: str = "manual"
    is_phrase: bool = False
    # Optional; parsed into a UUID by the add_word handler when present.
    source_article_id: str | None = None
# Request body for attaching a dictionary sense to an entry.
class SetSenseRequest(BaseModel):
    # Received as a string; the resolve_sense handler parses it into a UUID.
    sense_id: str
# Response shape for a single word bank entry; all identifiers are plain strings.
class WordBankEntryResponse(BaseModel):
    id: str
    user_id: str
    language_pair_id: str
    # Dictionary links; None when nothing is linked.
    sense_id: str | None
    wordform_id: str | None
    surface_text: str
    is_phrase: bool
    entry_pathway: str
    source_article_id: str | None
    disambiguation_status: str
    # ISO 8601 string, produced by _to_response via datetime.isoformat().
    created_at: str
def _service(db: AsyncSession) -> VocabService:
    """Build a ``VocabService`` whose repositories both use the given session."""
    vocab_repo = PostgresVocabRepository(db)
    dict_repo = PostgresDictionaryRepository(db)
    return VocabService(vocab_repo=vocab_repo, dict_repo=dict_repo)
@router.post("", response_model=WordBankEntryResponse, status_code=201)
async def add_word(
    request: AddWordRequest,
    db: AsyncSession = Depends(get_db),
    token_data: dict = Depends(verify_token),
) -> WordBankEntryResponse:
    # Adds a word or phrase to the authenticated user's bank.
    #
    # Responses:
    #   201 - entry created
    #   400 - malformed language_pair_id / source_article_id, or blank surface_text
    #   404 - the service raised ValueError (e.g. unknown language pair)
    #
    # Kept as comments on purpose: FastAPI publishes handler docstrings in the
    # OpenAPI description.
    user_id = uuid.UUID(token_data["sub"])

    try:
        language_pair_id = uuid.UUID(request.language_pair_id)
    except ValueError:
        # "from None": the parsing traceback adds nothing to the HTTP error.
        raise HTTPException(status_code=400, detail="Invalid language_pair_id") from None

    source_article_id = None
    if request.source_article_id:
        try:
            source_article_id = uuid.UUID(request.source_article_id)
        except ValueError:
            raise HTTPException(status_code=400, detail="Invalid source_article_id") from None

    # Reject whitespace-only input instead of storing an empty entry.
    surface_text = request.surface_text.strip()
    if not surface_text:
        raise HTTPException(status_code=400, detail="surface_text must not be empty")

    try:
        entry = await _service(db).add_word_to_bank(
            user_id=user_id,
            surface_text=surface_text,
            language_pair_id=language_pair_id,
            pathway=request.entry_pathway,
            is_phrase=request.is_phrase,
            source_article_id=source_article_id,
        )
    except ValueError as exc:
        # Keep the original error as the cause for server-side diagnostics.
        raise HTTPException(status_code=404, detail=str(exc)) from exc
    return _to_response(entry)
@router.get("", response_model=list[WordBankEntryResponse])
async def list_entries(
    language_pair_id: str,
    db: AsyncSession = Depends(get_db),
    token_data: dict = Depends(verify_token),
) -> list[WordBankEntryResponse]:
    # Lists the authenticated user's entries for one language pair.
    # A language_pair_id that is not a valid UUID yields a 400.
    owner_id = uuid.UUID(token_data["sub"])
    try:
        parsed_pair_id = uuid.UUID(language_pair_id)
    except ValueError:
        raise HTTPException(status_code=400, detail="Invalid language_pair_id")

    repository = PostgresVocabRepository(db)
    found = await repository.get_entries_for_user(owner_id, parsed_pair_id)
    return list(map(_to_response, found))
@router.get("/pending-disambiguation", response_model=list[WordBankEntryResponse])
async def pending_disambiguation(
    db: AsyncSession = Depends(get_db),
    token_data: dict = Depends(verify_token),
) -> list[WordBankEntryResponse]:
    # Lists the authenticated user's entries that still need a sense chosen.
    owner_id = uuid.UUID(token_data["sub"])
    repository = PostgresVocabRepository(db)
    waiting = await repository.get_pending_disambiguation(owner_id)
    return list(map(_to_response, waiting))
@router.patch("/{entry_id}/sense", response_model=WordBankEntryResponse)
async def resolve_sense(
    entry_id: str,
    request: SetSenseRequest,
    db: AsyncSession = Depends(get_db),
    token_data: dict = Depends(verify_token),
) -> WordBankEntryResponse:
    # Attaches the chosen dictionary sense to one of the caller's pending entries.
    #
    # Responses:
    #   200 - entry updated
    #   400 - entry_id or sense_id is not a valid UUID
    #   404 - entry is not a pending entry of the caller, or the sense is unknown
    #
    # Kept as comments on purpose: FastAPI publishes handler docstrings in the
    # OpenAPI description.
    from sqlalchemy.exc import IntegrityError, NoResultFound

    user_id = uuid.UUID(token_data["sub"])
    try:
        eid = uuid.UUID(entry_id)
        sid = uuid.UUID(request.sense_id)
    except ValueError:
        raise HTTPException(status_code=400, detail="Invalid UUID") from None

    # Authorisation: the repository's set_sense() looks entries up by id only,
    # so without this check any authenticated user could modify anyone's entry.
    # Entries of other users are reported as missing rather than forbidden so
    # their existence is not disclosed.
    # TODO(review): this limits the endpoint to entries still in "pending";
    # an owner-scoped lookup by id in the repository would lift that limit.
    pending = await PostgresVocabRepository(db).get_pending_disambiguation(user_id)
    if str(eid) not in {candidate.id for candidate in pending}:
        raise HTTPException(status_code=404, detail="Entry not found")

    try:
        entry = await _service(db).resolve_disambiguation(eid, sid)
    except NoResultFound as exc:
        # The entry vanished between the ownership check and the update.
        raise HTTPException(status_code=404, detail="Entry not found") from exc
    except IntegrityError as exc:
        # sense_id does not reference an existing dictionary sense; the failed
        # transaction must be rolled back before the session is used again.
        await db.rollback()
        raise HTTPException(status_code=404, detail="Sense not found") from exc
    # Any other exception is a genuine server error and is left to propagate
    # instead of being disguised as a 404.
    return _to_response(entry)
def _to_response(entry) -> WordBankEntryResponse:
    """Map a word bank entry onto the API response schema.

    Every field is copied as-is except ``created_at``, which is rendered with
    ``isoformat()``.
    """
    copied_fields = (
        "id",
        "user_id",
        "language_pair_id",
        "sense_id",
        "wordform_id",
        "surface_text",
        "is_phrase",
        "entry_pathway",
        "source_article_id",
        "disambiguation_status",
    )
    payload = {field: getattr(entry, field) for field in copied_fields}
    payload["created_at"] = entry.created_at.isoformat()
    return WordBankEntryResponse(**payload)