Compare commits

...

6 commits

Author SHA1 Message Date
aa4987981d feat: Create the Dictionary Lookup Service; methods for finding
Some checks are pending
/ test (push) Waiting to run
vocabulary and words
2026-04-10 07:11:57 +01:00
27f7a7c3f3 feat: Build the flashcards model, routes, etc. 2026-04-09 20:40:11 +01:00
0281caef7c feat: Endpoints to manage your account. 2026-04-08 20:50:26 +01:00
689e10d1bc feat: vocab endpoints 2026-04-08 20:37:00 +01:00
486e0bf3d5 docs: Update the pyproject.toml to make it compatible with spaCy; update
architecture.md
2026-04-08 20:26:57 +01:00
873ebacd4d feat: Build the bilingual dictionary data 2026-04-08 20:26:26 +01:00
30 changed files with 2454 additions and 18 deletions

View file

@ -1,4 +1,4 @@
.PHONY: down build up logs shell lock migrate migration
.PHONY: down build up logs shell lock migrate migration import-dictionary
build:
docker compose build
@ -28,3 +28,10 @@ lock:
cd api && uv pip compile pyproject.toml -o requirements.txt
rebuild: down build up
# Import a kaikki dictionary JSONL into Postgres.
# Requires the DB to be running with its port exposed on localhost (docker compose up).
# DATABASE_URL defaults to the docker-compose dev credentials.
# Usage: make import-dictionary lang=fr
import-dictionary:
cd api && python scripts/import_dictionary.py --lang $(lang)

View file

@ -10,6 +10,7 @@ from app.outbound.postgres.database import Base
import app.outbound.postgres.entities.summarise_job_entity
import app.outbound.postgres.entities.user_entity
import app.outbound.postgres.entities.dictionary_entities
config = context.config
config.set_main_option("sqlalchemy.url", settings.database_url)

View file

@ -0,0 +1,89 @@
"""add dictionary tables
Revision ID: 0007
Revises: 0006
Create Date: 2026-04-07
"""
from typing import Sequence, Union
import sqlalchemy as sa
from alembic import op
from sqlalchemy.dialects import postgresql
revision: str = "0007"
down_revision: Union[str, None] = "0006"
branch_labels: Union[str, Sequence[str], None] = None
depends_on: Union[str, Sequence[str], None] = None
def upgrade() -> None:
    """Create the dictionary tables: lemma, sense, wordform, and raw source rows."""
    # One row per imported dictionary entry: headword + part of speech + language.
    op.create_table(
        "dictionary_lemma",
        sa.Column("id", postgresql.UUID(as_uuid=True), primary_key=True),
        sa.Column("headword", sa.Text(), nullable=False),
        sa.Column("language", sa.String(2), nullable=False),
        sa.Column("pos_raw", sa.Text(), nullable=False),
        sa.Column("pos_normalised", sa.Text(), nullable=True),
        sa.Column("gender", sa.Text(), nullable=True),
        sa.Column("tags", postgresql.ARRAY(sa.Text()), nullable=False, server_default="{}"),
    )
    # Composite index supports headword lookups scoped to a single language.
    op.create_index("ix_dictionary_lemma_headword_language", "dictionary_lemma", ["headword", "language"])
    op.create_index("ix_dictionary_lemma_language", "dictionary_lemma", ["language"])
    # One row per meaning of a lemma; sense_index presumably preserves the
    # source ordering of senses within a lemma — confirm against the importer.
    op.create_table(
        "dictionary_sense",
        sa.Column("id", postgresql.UUID(as_uuid=True), primary_key=True),
        sa.Column(
            "lemma_id",
            postgresql.UUID(as_uuid=True),
            sa.ForeignKey("dictionary_lemma.id", ondelete="CASCADE"),
            nullable=False,
        ),
        sa.Column("sense_index", sa.Integer(), nullable=False),
        sa.Column("gloss", sa.Text(), nullable=False, server_default=""),
        sa.Column("topics", postgresql.ARRAY(sa.Text()), nullable=False, server_default="{}"),
        sa.Column("tags", postgresql.ARRAY(sa.Text()), nullable=False, server_default="{}"),
    )
    op.create_index("ix_dictionary_sense_lemma_id", "dictionary_sense", ["lemma_id"])
    # Inflected surface forms, each linked back to its lemma.
    op.create_table(
        "dictionary_wordform",
        sa.Column("id", postgresql.UUID(as_uuid=True), primary_key=True),
        sa.Column(
            "lemma_id",
            postgresql.UUID(as_uuid=True),
            sa.ForeignKey("dictionary_lemma.id", ondelete="CASCADE"),
            nullable=False,
        ),
        sa.Column("form", sa.Text(), nullable=False),
        sa.Column("tags", postgresql.ARRAY(sa.Text()), nullable=False, server_default="{}"),
    )
    op.create_index("ix_dictionary_wordform_lemma_id", "dictionary_wordform", ["lemma_id"])
    # Index on the surface form itself: the primary lookup path for tokens.
    op.create_index("ix_dictionary_wordform_form", "dictionary_wordform", ["form"])
    # Original source record stored as JSONB; unique FK gives at most one raw
    # row per lemma.
    op.create_table(
        "dictionary_lemma_raw",
        sa.Column("id", postgresql.UUID(as_uuid=True), primary_key=True),
        sa.Column(
            "lemma_id",
            postgresql.UUID(as_uuid=True),
            sa.ForeignKey("dictionary_lemma.id", ondelete="CASCADE"),
            nullable=False,
            unique=True,
        ),
        sa.Column("language", sa.String(2), nullable=False),
        sa.Column("raw", postgresql.JSONB(), nullable=False),
    )
def downgrade() -> None:
    """Drop the dictionary tables and their indexes in reverse dependency order."""
    op.drop_table("dictionary_lemma_raw")
    op.drop_index("ix_dictionary_wordform_form", table_name="dictionary_wordform")
    op.drop_index("ix_dictionary_wordform_lemma_id", table_name="dictionary_wordform")
    op.drop_table("dictionary_wordform")
    op.drop_index("ix_dictionary_sense_lemma_id", table_name="dictionary_sense")
    op.drop_table("dictionary_sense")
    op.drop_index("ix_dictionary_lemma_language", table_name="dictionary_lemma")
    op.drop_index("ix_dictionary_lemma_headword_language", table_name="dictionary_lemma")
    op.drop_table("dictionary_lemma")

View file

@ -0,0 +1,96 @@
"""add vocab bank tables
Revision ID: 0008
Revises: 0007
Create Date: 2026-04-08
"""
from typing import Sequence, Union
import sqlalchemy as sa
from alembic import op
from sqlalchemy.dialects import postgresql
revision: str = "0008"
down_revision: Union[str, None] = "0007"
branch_labels: Union[str, Sequence[str], None] = None
depends_on: Union[str, Sequence[str], None] = None
def upgrade() -> None:
    """Create the vocab bank tables: user_language_pair and learnable_word_bank_entry."""
    # A (source, target) language combination a user studies; the unique
    # constraint prevents duplicate pairs for the same user.
    op.create_table(
        "user_language_pair",
        sa.Column("id", postgresql.UUID(as_uuid=True), primary_key=True),
        sa.Column(
            "user_id",
            postgresql.UUID(as_uuid=True),
            sa.ForeignKey("users.id", ondelete="CASCADE"),
            nullable=False,
        ),
        sa.Column("source_lang", sa.String(2), nullable=False),
        sa.Column("target_lang", sa.String(2), nullable=False),
        sa.UniqueConstraint("user_id", "source_lang", "target_lang", name="uq_user_language_pair"),
    )
    op.create_index("ix_user_language_pair_user_id", "user_language_pair", ["user_id"])
    # A word/phrase the user wants to learn. Dictionary links use SET NULL so
    # re-importing or pruning dictionary data never destroys user entries.
    op.create_table(
        "learnable_word_bank_entry",
        sa.Column("id", postgresql.UUID(as_uuid=True), primary_key=True),
        sa.Column(
            "user_id",
            postgresql.UUID(as_uuid=True),
            sa.ForeignKey("users.id", ondelete="CASCADE"),
            nullable=False,
        ),
        sa.Column(
            "language_pair_id",
            postgresql.UUID(as_uuid=True),
            sa.ForeignKey("user_language_pair.id", ondelete="CASCADE"),
            nullable=False,
        ),
        sa.Column(
            "sense_id",
            postgresql.UUID(as_uuid=True),
            sa.ForeignKey("dictionary_sense.id", ondelete="SET NULL"),
            nullable=True,
        ),
        sa.Column(
            "wordform_id",
            postgresql.UUID(as_uuid=True),
            sa.ForeignKey("dictionary_wordform.id", ondelete="SET NULL"),
            nullable=True,
        ),
        sa.Column("surface_text", sa.Text(), nullable=False),
        sa.Column("is_phrase", sa.Boolean(), nullable=False, server_default="false"),
        sa.Column("entry_pathway", sa.Text(), nullable=False),
        sa.Column("source_article_id", postgresql.UUID(as_uuid=True), nullable=True),
        sa.Column("disambiguation_status", sa.Text(), nullable=False, server_default="pending"),
        sa.Column(
            "created_at",
            sa.DateTime(timezone=True),
            nullable=False,
            server_default=sa.func.now(),
        ),
    )
    op.create_index(
        "ix_learnable_word_bank_entry_user_id", "learnable_word_bank_entry", ["user_id"]
    )
    op.create_index(
        "ix_learnable_word_bank_entry_language_pair_id",
        "learnable_word_bank_entry",
        ["language_pair_id"],
    )
    op.create_index(
        "ix_learnable_word_bank_entry_sense_id", "learnable_word_bank_entry", ["sense_id"]
    )
def downgrade() -> None:
    """Drop the vocab bank tables and their indexes in reverse creation order."""
    op.drop_index("ix_learnable_word_bank_entry_sense_id", table_name="learnable_word_bank_entry")
    op.drop_index(
        "ix_learnable_word_bank_entry_language_pair_id", table_name="learnable_word_bank_entry"
    )
    op.drop_index("ix_learnable_word_bank_entry_user_id", table_name="learnable_word_bank_entry")
    op.drop_table("learnable_word_bank_entry")
    op.drop_index("ix_user_language_pair_user_id", table_name="user_language_pair")
    op.drop_table("user_language_pair")

View file

@ -0,0 +1,88 @@
"""add flashcard tables
Revision ID: 0009
Revises: 0008
Create Date: 2026-04-08
"""
from typing import Sequence, Union
import sqlalchemy as sa
from alembic import op
from sqlalchemy.dialects import postgresql
revision: str = "0009"
down_revision: Union[str, None] = "0008"
branch_labels: Union[str, Sequence[str], None] = None
depends_on: Union[str, Sequence[str], None] = None
def upgrade() -> None:
    """Create the flashcard tables: flashcard and flashcard_event."""
    # A generated study card; CASCADE from the bank entry so cards disappear
    # when the underlying vocab entry is deleted.
    op.create_table(
        "flashcard",
        sa.Column("id", postgresql.UUID(as_uuid=True), primary_key=True),
        sa.Column(
            "user_id",
            postgresql.UUID(as_uuid=True),
            sa.ForeignKey("users.id", ondelete="CASCADE"),
            nullable=False,
        ),
        sa.Column(
            "bank_entry_id",
            postgresql.UUID(as_uuid=True),
            sa.ForeignKey("learnable_word_bank_entry.id", ondelete="CASCADE"),
            nullable=False,
        ),
        sa.Column("source_lang", sa.Text(), nullable=False),
        sa.Column("target_lang", sa.Text(), nullable=False),
        sa.Column("prompt_text", sa.Text(), nullable=False),
        sa.Column("answer_text", sa.Text(), nullable=False),
        sa.Column("prompt_context_text", sa.Text(), nullable=True),
        sa.Column("answer_context_text", sa.Text(), nullable=True),
        sa.Column("card_direction", sa.Text(), nullable=False),
        sa.Column("prompt_modality", sa.Text(), nullable=False, server_default="text"),
        sa.Column(
            "created_at",
            sa.DateTime(timezone=True),
            nullable=False,
            server_default=sa.func.now(),
        ),
    )
    op.create_index("ix_flashcard_user_id", "flashcard", ["user_id"])
    op.create_index("ix_flashcard_bank_entry_id", "flashcard", ["bank_entry_id"])
    # Append-only study event log (shown / answered / skipped).
    op.create_table(
        "flashcard_event",
        sa.Column("id", postgresql.UUID(as_uuid=True), primary_key=True),
        sa.Column(
            "flashcard_id",
            postgresql.UUID(as_uuid=True),
            sa.ForeignKey("flashcard.id", ondelete="CASCADE"),
            nullable=False,
        ),
        sa.Column(
            "user_id",
            postgresql.UUID(as_uuid=True),
            sa.ForeignKey("users.id", ondelete="CASCADE"),
            nullable=False,
        ),
        sa.Column("event_type", sa.Text(), nullable=False),
        sa.Column("user_response", sa.Text(), nullable=True),
        sa.Column(
            "created_at",
            sa.DateTime(timezone=True),
            nullable=False,
            server_default=sa.func.now(),
        ),
    )
    op.create_index("ix_flashcard_event_flashcard_id", "flashcard_event", ["flashcard_id"])
    op.create_index("ix_flashcard_event_user_id", "flashcard_event", ["user_id"])
def downgrade() -> None:
    """Drop the flashcard tables and their indexes in reverse creation order."""
    op.drop_index("ix_flashcard_event_user_id", table_name="flashcard_event")
    op.drop_index("ix_flashcard_event_flashcard_id", table_name="flashcard_event")
    op.drop_table("flashcard_event")
    op.drop_index("ix_flashcard_bank_entry_id", table_name="flashcard")
    op.drop_index("ix_flashcard_user_id", table_name="flashcard")
    op.drop_table("flashcard")

View file

@ -0,0 +1,14 @@
from dataclasses import dataclass, field
from datetime import datetime
from .learnable_language import LearnableLanguage
@dataclass
class Account:
    """A user's account profile as exposed by the service layer."""
    # User UUID serialised as a string.
    id: str
    email: str
    is_active: bool
    is_email_verified: bool
    created_at: datetime
    # Language pairs the user is studying; empty for a freshly created account.
    learnable_languages: list[LearnableLanguage] = field(default_factory=list)

View file

@ -0,0 +1,32 @@
from dataclasses import dataclass
@dataclass
class Wordform:
id: str
lemma_id: str
form: str
tags: list[str]
@dataclass
class Sense:
id: str
lemma_id: str
sense_index: int
gloss: str
topics: list[str]
tags: list[str]
@dataclass
class Lemma:
id: str
headword: str
language: str
pos_raw: str
pos_normalised: str | None
gender: str | None
tags: list[str]
senses: list[Sense] = field(default_factory=list)
wordforms: list[Wordform] = field(default_factory=list)

View file

@ -0,0 +1,28 @@
from dataclasses import dataclass
from datetime import datetime
@dataclass
class Flashcard:
    """A study card generated from a vocab bank entry."""
    id: str
    user_id: str
    # The learnable_word_bank_entry this card was generated from.
    bank_entry_id: str
    source_lang: str
    target_lang: str
    prompt_text: str
    answer_text: str
    # Presumably example-sentence context for prompt/answer — confirm with caller.
    prompt_context_text: str | None
    answer_context_text: str | None
    # "target_to_en" or "en_to_target" (see FlashcardService.VALID_DIRECTIONS).
    card_direction: str
    # Defaults to "text" at the persistence layer.
    prompt_modality: str
    created_at: datetime
@dataclass
class FlashcardEvent:
    """A recorded study interaction with a flashcard."""
    id: str
    flashcard_id: str
    user_id: str
    # One of "shown" | "answered" | "skipped" (see FlashcardService.VALID_EVENT_TYPES).
    event_type: str
    # Free-text answer; only meaningful for "answered" events, stored ungraded.
    user_response: str | None
    created_at: datetime

View file

@ -0,0 +1,25 @@
from dataclasses import dataclass
from datetime import datetime
@dataclass
class UserLanguagePair:
    """A (source, target) language combination a user is studying."""
    id: str
    user_id: str
    source_lang: str
    target_lang: str
@dataclass
class LearnableWordBankEntry:
    """A word or phrase a user has saved to their vocab bank."""
    id: str
    user_id: str
    language_pair_id: str
    # Resolved dictionary sense; None while disambiguation is pending.
    sense_id: str | None
    # Exact inflected form matched at capture time, when known.
    wordform_id: str | None
    # The text as the user encountered it (may differ from the lemma headword).
    surface_text: str
    is_phrase: bool
    # How the entry was added, e.g. "highlight", "manual", "nlp_extraction".
    entry_pathway: str
    source_article_id: str | None
    # "pending" | "auto_resolved" | "resolved" (set by VocabService).
    disambiguation_status: str
    created_at: datetime

View file

@ -0,0 +1,150 @@
import uuid
from sqlalchemy import select
from sqlalchemy.exc import IntegrityError
from sqlalchemy.ext.asyncio import AsyncSession
from ..models.account import Account
from ..models.learnable_language import LearnableLanguage
from ...auth import hash_password
from ...outbound.postgres.entities.user_entity import User as UserEntity
from ...outbound.postgres.repositories import learnable_language_repository, user_repository
class AccountService:
    """Handles account-level operations: registration, profile retrieval, and managing
    the set of languages a user is learning.
    All methods operate on behalf of a single authenticated user (or, for
    ``create_account``, the user being created).
    Usage::
        service = AccountService(db)
        # Registration
        account = await service.create_account("alice@example.com", "s3cr3t")
        # Profile retrieval
        account = await service.get_account(user_id)
        print(account.learnable_languages)  # [LearnableLanguage(...), ...]
        # Add French (B1) to the account
        lang = await service.add_learnable_language(
            user_id, source_language="en", target_language="fr", proficiencies=["B1"]
        )
        # Remove it again
        await service.remove_learnable_language(user_id, lang.id)
    """
    def __init__(self, db: AsyncSession) -> None:
        self.db = db
    async def create_account(self, email: str, password: str) -> Account:
        """Create a new user account, hashing the plain-text password before storage.
        Raises ``ValueError`` if the email address is already registered, so the
        caller does not need to catch SQLAlchemy exceptions directly.
        Usage::
            try:
                account = await service.create_account("alice@example.com", "s3cr3t")
            except ValueError:
                # email already taken
                ...
        """
        try:
            user = await user_repository.create(
                self.db,
                email=email,
                hashed_password=hash_password(password),
            )
        except IntegrityError as err:
            await self.db.rollback()
            # Chain the IntegrityError so the database context survives in
            # tracebacks while callers still catch a plain ValueError.
            raise ValueError("Email already registered") from err
        return Account(
            id=str(user.id),
            email=user.email,
            is_active=user.is_active,
            is_email_verified=user.is_email_verified,
            created_at=user.created_at,
        )
    async def get_account(self, user_id: uuid.UUID) -> Account:
        """Retrieve a user's account profile including all their learnable languages.
        Raises ``ValueError`` if no user exists for the given ``user_id``.
        Usage::
            account = await service.get_account(user_id)
            for lang in account.learnable_languages:
                print(lang.target_language, lang.proficiencies)
        """
        # user_repository only exposes get_by_email; query by id directly
        result = await self.db.execute(
            select(UserEntity).where(UserEntity.id == user_id)
        )
        user = result.scalar_one_or_none()
        if user is None:
            raise ValueError(f"User {user_id} not found")
        languages = await learnable_language_repository.list_for_user(self.db, user_id)
        return Account(
            id=str(user.id),
            email=user.email,
            is_active=user.is_active,
            is_email_verified=user.is_email_verified,
            created_at=user.created_at,
            learnable_languages=languages,
        )
    async def add_learnable_language(
        self,
        user_id: uuid.UUID,
        source_language: str,
        target_language: str,
        proficiencies: list[str],
    ) -> LearnableLanguage:
        """Add a language pair to the user's account, or update proficiency levels if
        the pair already exists (upsert semantics).
        Usage::
            lang = await service.add_learnable_language(
                user_id,
                source_language="en",
                target_language="fr",
                proficiencies=["B1", "B2"],
            )
            print(lang.id)  # UUID of the learnable_language row
        """
        return await learnable_language_repository.upsert(
            self.db,
            user_id=user_id,
            source_language=source_language,
            target_language=target_language,
            proficiencies=proficiencies,
        )
    async def remove_learnable_language(
        self, user_id: uuid.UUID, language_id: uuid.UUID
    ) -> None:
        """Remove a learnable language from the user's account by its row ID.
        Raises ``ValueError`` if the language entry does not exist or does not belong
        to ``user_id``.
        Usage::
            await service.remove_learnable_language(user_id, lang.id)
        """
        deleted = await learnable_language_repository.delete(
            self.db, user_id=user_id, language_id=language_id
        )
        if not deleted:
            raise ValueError(f"Learnable language {language_id} not found for this user")

View file

@ -0,0 +1,108 @@
import uuid
from dataclasses import dataclass, field
from ..models.dictionary import Sense, Wordform
from ...outbound.postgres.repositories.dictionary_repository import DictionaryRepository
@dataclass
class TokenLookupResult:
    """The result of resolving a spaCy token against the dictionary.
    ``senses`` is the ranked list of candidate senses for disambiguation.
    ``wordform_id`` is set when the surface form was found in ``dictionary_wordform``,
    allowing the vocab bank entry to be pre-linked to the exact inflected form.
    ``matched_via`` describes which lookup strategy succeeded.
    """
    senses: list[Sense]
    # Only populated when all stage-1 matches point at a single lemma.
    wordform_id: str | None
    matched_via: str  # "wordform" | "lemma_pos" | "lemma" | "none"
    # Raw wordform rows matched in stage 1; empty for lemma-based matches.
    matched_wordforms: list[Wordform] = field(default_factory=list)
class DictionaryLookupService:
    """Resolve a spaCy token (surface form + UD POS + lemma) to candidate dictionary
    senses using a three-stage fallback strategy.

    Stage 1 — wordform lookup (most precise): exact match of the inflected surface
    form against ``dictionary_wordform`` in the target language (e.g. the "allons"
    wordform row pointing at the lemma "aller"). When every match belongs to one
    lemma, ``wordform_id`` is pre-populated on the result.

    Stage 2 — lemma + POS fallback: the spaCy-provided lemma string against
    ``dictionary_lemma.headword`` filtered by ``pos_normalised`` (UD tag), which
    reduces false matches for homographs with different parts of speech.

    Stage 3 — lemma-only fallback: drops the POS filter as a last resort and
    returns all senses for the headword regardless of POS.

    Usage::

        service = DictionaryLookupService(PostgresDictionaryRepository(db))
        result = await service.lookup_token(
            surface="allons",
            spacy_lemma="aller",
            pos_ud="VERB",
            language="fr",
        )
        # result.senses — candidate Sense rows for disambiguation
        # result.wordform_id — pre-resolved wordform UUID string, or None
        # result.matched_via — "wordform" | "lemma_pos" | "lemma" | "none"
    """

    def __init__(self, dict_repo: DictionaryRepository) -> None:
        self.dict_repo = dict_repo

    async def lookup_token(
        self,
        surface: str,
        spacy_lemma: str,
        pos_ud: str,
        language: str,
    ) -> TokenLookupResult:
        """Resolve a spaCy token to candidate senses via the three-stage fallback.

        ``surface`` is the raw token text (e.g. ``"allons"``); ``spacy_lemma`` is
        spaCy's lemmatisation of it (e.g. ``"aller"``); ``pos_ud`` is the Universal
        Dependencies POS tag (e.g. ``"VERB"``); ``language`` is the target language
        code (e.g. ``"fr"``). Returns a :class:`TokenLookupResult` carrying the
        candidate senses and, for stage-1 hits on a single lemma, a ``wordform_id``
        usable for precise inflection tracking on the vocab bank entry.
        """
        repo = self.dict_repo

        # Stage 1: exact inflected-form hit in the wordform table.
        matched_forms = await repo.get_wordforms_by_form(surface, language)
        if matched_forms:
            lemma_ids = list(dict.fromkeys(wf.lemma_id for wf in matched_forms))
            candidates: list[Sense] = []
            for lemma_id in lemma_ids:
                candidates.extend(await repo.get_senses_for_lemma(uuid.UUID(lemma_id)))
            # Pre-assign a wordform_id only when every match points at the same
            # lemma; ambiguity across lemmas must be resolved by the user, so we
            # cannot confidently pick one.
            resolved_wordform = matched_forms[0].id if len(lemma_ids) == 1 else None
            return TokenLookupResult(
                senses=candidates,
                wordform_id=resolved_wordform,
                matched_via="wordform",
                matched_wordforms=matched_forms,
            )

        # Stage 2: spaCy lemma constrained by the UD POS tag.
        pos_senses = await repo.get_senses_for_headword_and_pos(
            spacy_lemma, language, pos_ud
        )
        if pos_senses:
            return TokenLookupResult(senses=pos_senses, wordform_id=None, matched_via="lemma_pos")

        # Stage 3: spaCy lemma alone — no POS filter.
        bare_senses = await repo.get_senses_for_headword(spacy_lemma, language)
        if bare_senses:
            return TokenLookupResult(senses=bare_senses, wordform_id=None, matched_via="lemma")

        return TokenLookupResult(senses=[], wordform_id=None, matched_via="none")

View file

@ -0,0 +1,152 @@
import uuid
from ..models.flashcard import Flashcard, FlashcardEvent
from ...outbound.postgres.repositories.flashcard_repository import FlashcardRepository
from ...outbound.postgres.repositories.vocab_repository import VocabRepository
from ...outbound.postgres.repositories.dictionary_repository import DictionaryRepository
# Closed vocabularies validated by FlashcardService before touching the DB.
VALID_DIRECTIONS = {"target_to_en", "en_to_target"}
VALID_EVENT_TYPES = {"shown", "answered", "skipped"}
class FlashcardService:
    """Generates flashcards from resolved vocab bank entries and records study events.
    Flashcard text is derived directly from the dictionary: the lemma headword is the
    target-language side and the sense gloss is the English side. Both directions are
    created by default.
    Usage::
        service = FlashcardService(
            flashcard_repo=PostgresFlashcardRepository(db),
            vocab_repo=PostgresVocabRepository(db),
            dict_repo=PostgresDictionaryRepository(db),
        )
        # Generate both directions for a resolved bank entry
        cards = await service.generate_flashcard_from_entry(entry_id)
        # Record that the user answered correctly
        event = await service.record_flashcard_event(
            flashcard_id=cards[0].id,
            user_id=user_id,
            event_type="answered",
            response="banque",
        )
    """
    def __init__(
        self,
        flashcard_repo: FlashcardRepository,
        vocab_repo: VocabRepository,
        dict_repo: DictionaryRepository,
    ) -> None:
        self.flashcard_repo = flashcard_repo
        self.vocab_repo = vocab_repo
        self.dict_repo = dict_repo
    async def generate_flashcard_from_entry(
        self,
        entry_id: uuid.UUID,
        direction: str | None = None,
    ) -> list[Flashcard]:
        """Create flashcard(s) from a vocab bank entry that has a resolved sense.
        Looks up the sense gloss (English meaning) and lemma headword (target-language
        word) and creates one card per direction. Pass ``direction`` to generate only
        ``"target_to_en"`` or ``"en_to_target"``; omit it to create both.
        Raises ``ValueError`` if the entry does not exist, has no resolved sense, or
        if the underlying sense or lemma rows cannot be found in the dictionary.
        Usage::
            # Both directions — typical case
            cards = await service.generate_flashcard_from_entry(entry_id)
            assert len(cards) == 2
            # One direction only
            cards = await service.generate_flashcard_from_entry(
                entry_id, direction="target_to_en"
            )
        """
        if direction is not None and direction not in VALID_DIRECTIONS:
            raise ValueError(f"Invalid direction '{direction}'. Must be one of {VALID_DIRECTIONS}")
        entry = await self.vocab_repo.get_entry(entry_id)
        if entry is None:
            raise ValueError(f"Bank entry {entry_id} not found")
        if entry.sense_id is None:
            raise ValueError(
                "Entry has no resolved sense; disambiguate before generating flashcards"
            )
        # Card text comes straight from the dictionary rows the entry resolves to;
        # a missing sense or lemma means dangling references, surfaced as ValueError.
        sense = await self.dict_repo.get_sense(uuid.UUID(entry.sense_id))
        if sense is None:
            raise ValueError(f"Sense {entry.sense_id} not found in dictionary")
        lemma = await self.dict_repo.get_lemma(uuid.UUID(sense.lemma_id))
        if lemma is None:
            raise ValueError(f"Lemma for sense {entry.sense_id} not found in dictionary")
        pair = await self.vocab_repo.get_language_pair(uuid.UUID(entry.language_pair_id))
        if pair is None:
            raise ValueError(f"Language pair {entry.language_pair_id} not found")
        user_id = uuid.UUID(entry.user_id)
        directions = [direction] if direction else ["target_to_en", "en_to_target"]
        flashcards: list[Flashcard] = []
        for d in directions:
            # The headword is always the target-language side; the gloss is the
            # English side — direction only decides which one is the prompt.
            if d == "target_to_en":
                prompt, answer = lemma.headword, sense.gloss
            else:
                prompt, answer = sense.gloss, lemma.headword
            card = await self.flashcard_repo.create_flashcard(
                user_id=user_id,
                bank_entry_id=entry_id,
                source_lang=pair.source_lang,
                target_lang=pair.target_lang,
                prompt_text=prompt,
                answer_text=answer,
                card_direction=d,
            )
            flashcards.append(card)
        return flashcards
    async def record_flashcard_event(
        self,
        flashcard_id: uuid.UUID,
        user_id: uuid.UUID,
        event_type: str,
        response: str | None = None,
    ) -> FlashcardEvent:
        """Record a study event against a flashcard — shown, answered, or skipped.
        ``response`` is the user's free-text answer and is only meaningful for
        ``event_type="answered"``; it is stored as-is without grading.
        Raises ``ValueError`` for unrecognised event types.
        Usage::
            event = await service.record_flashcard_event(
                flashcard_id=card.id,
                user_id=user_id,
                event_type="answered",
                response="banque",
            )
        """
        if event_type not in VALID_EVENT_TYPES:
            raise ValueError(
                f"Invalid event_type '{event_type}'. Must be one of {VALID_EVENT_TYPES}"
            )
        # NOTE(review): no ownership check that flashcard_id belongs to user_id —
        # presumably enforced at the route layer; confirm.
        return await self.flashcard_repo.record_event(
            flashcard_id=flashcard_id,
            user_id=user_id,
            event_type=event_type,
            user_response=response,
        )

View file

@ -0,0 +1,190 @@
import uuid
from ..models.dictionary import Sense
from ..models.vocab import LearnableWordBankEntry
from ...outbound.postgres.repositories.vocab_repository import VocabRepository
from ...outbound.postgres.repositories.dictionary_repository import DictionaryRepository
class VocabService:
    """Manages a user's learnable word bank — adding words from various sources and
    resolving which dictionary sense a word belongs to.
    Usage::
        service = VocabService(
            vocab_repo=PostgresVocabRepository(db),
            dict_repo=PostgresDictionaryRepository(db),
        )
        entry = await service.add_word_to_bank(
            user_id=user.id,
            surface_text="banque",
            language_pair_id=pair.id,
            pathway="highlight",
        )
        # entry.disambiguation_status is "auto_resolved" if "banque" has exactly one
        # dictionary sense, or "pending" if the user needs to pick from multiple senses.
    """
    def __init__(self, vocab_repo: VocabRepository, dict_repo: DictionaryRepository) -> None:
        self.vocab_repo = vocab_repo
        self.dict_repo = dict_repo
    async def add_word_to_bank(
        self,
        user_id: uuid.UUID,
        surface_text: str,
        language_pair_id: uuid.UUID,
        pathway: str,
        is_phrase: bool = False,
        wordform_id: uuid.UUID | None = None,
        source_article_id: uuid.UUID | None = None,
    ) -> LearnableWordBankEntry:
        """Add a word or phrase to the user's vocab bank, automatically linking it to a
        dictionary sense when exactly one match exists, or flagging it as pending
        disambiguation when zero or multiple senses are found.
        Phrases (``is_phrase=True``) bypass dictionary lookup entirely and are always
        created with ``disambiguation_status="pending"`` since they cannot be resolved
        to a single headword.
        Raises ``ValueError`` if ``language_pair_id`` does not refer to an existing
        language pair.
        Usage::
            # Word with a single sense — auto-resolved immediately
            entry = await service.add_word_to_bank(
                user_id=user_id,
                surface_text="bisque",
                language_pair_id=fr_en_pair_id,
                pathway="highlight",
            )
            assert entry.disambiguation_status == "auto_resolved"
            # Common word with many senses — user must pick one
            entry = await service.add_word_to_bank(
                user_id=user_id,
                surface_text="avoir",
                language_pair_id=fr_en_pair_id,
                pathway="manual",
            )
            assert entry.disambiguation_status == "pending"
            # Multi-word expression — skips lookup, always pending
            entry = await service.add_word_to_bank(
                user_id=user_id,
                surface_text="avoir l'air",
                language_pair_id=fr_en_pair_id,
                pathway="manual",
                is_phrase=True,
            )
        """
        pair = await self.vocab_repo.get_language_pair(language_pair_id)
        if pair is None:
            raise ValueError(f"Language pair {language_pair_id} not found")
        if is_phrase:
            return await self.vocab_repo.add_entry(
                user_id=user_id,
                language_pair_id=language_pair_id,
                surface_text=surface_text,
                entry_pathway=pathway,
                is_phrase=True,
                source_article_id=source_article_id,
                disambiguation_status="pending",
            )
        senses = await self.dict_repo.get_senses_for_headword(surface_text, pair.target_lang)
        # Exactly one candidate sense can be linked automatically; zero and multiple
        # candidates are handled identically — the entry stays pending user input.
        # (Previously written as identical elif/else branches.)
        if len(senses) == 1:
            sense_id: uuid.UUID | None = uuid.UUID(senses[0].id)
            status = "auto_resolved"
        else:
            sense_id = None
            status = "pending"
        return await self.vocab_repo.add_entry(
            user_id=user_id,
            language_pair_id=language_pair_id,
            surface_text=surface_text,
            entry_pathway=pathway,
            is_phrase=False,
            sense_id=sense_id,
            wordform_id=wordform_id,
            source_article_id=source_article_id,
            disambiguation_status=status,
        )
    async def add_token_to_bank(
        self,
        user_id: uuid.UUID,
        surface_text: str,
        language_pair_id: uuid.UUID,
        senses: list[Sense],
        wordform_id: uuid.UUID | None,
        source_article_id: uuid.UUID | None = None,
    ) -> LearnableWordBankEntry:
        """Add a token from the NLP pipeline to the vocab bank using pre-resolved lookup
        results, skipping the redundant dictionary query that ``add_word_to_bank`` would
        otherwise perform.
        ``senses`` and ``wordform_id`` come from :class:`DictionaryLookupService` and
        are stored directly on the bank entry. Auto-resolution still applies: exactly
        one sense means ``auto_resolved``; anything else means ``pending``.
        Raises ``ValueError`` if ``language_pair_id`` does not refer to an existing
        language pair.
        Usage::
            result = await lookup_service.lookup_token("allons", "aller", "VERB", "fr")
            wf_id = uuid.UUID(result.wordform_id) if result.wordform_id else None
            entry = await vocab_service.add_token_to_bank(
                user_id=user_id,
                surface_text="allons",
                language_pair_id=pair_id,
                senses=result.senses,
                wordform_id=wf_id,
            )
            # entry.wordform_id == result.wordform_id (pre-linked to "allons" wordform)
        """
        pair = await self.vocab_repo.get_language_pair(language_pair_id)
        if pair is None:
            raise ValueError(f"Language pair {language_pair_id} not found")
        if len(senses) == 1:
            sense_id: uuid.UUID | None = uuid.UUID(senses[0].id)
            status = "auto_resolved"
        else:
            sense_id = None
            status = "pending"
        return await self.vocab_repo.add_entry(
            user_id=user_id,
            language_pair_id=language_pair_id,
            surface_text=surface_text,
            entry_pathway="nlp_extraction",
            wordform_id=wordform_id,
            sense_id=sense_id,
            source_article_id=source_article_id,
            disambiguation_status=status,
        )
    async def resolve_disambiguation(
        self, entry_id: uuid.UUID, sense_id: uuid.UUID
    ) -> LearnableWordBankEntry:
        """Attach a specific dictionary sense to a pending vocab bank entry, marking it
        as ``resolved`` so it can be used for flashcard generation.
        This is called after the user selects the correct sense from the list presented
        during disambiguation — for example, choosing "bank (finance)" over
        "bank (river)" for the French word "banque".
        Usage::
            # User has been shown the sense list and picked sense_id for "bank (finance)"
            resolved_entry = await service.resolve_disambiguation(
                entry_id=pending_entry.id,
                sense_id=finance_sense_id,
            )
            assert resolved_entry.disambiguation_status == "resolved"
            assert resolved_entry.sense_id == str(finance_sense_id)
        """
        return await self.vocab_repo.set_sense(entry_id, sense_id)

View file

@ -0,0 +1,63 @@
import uuid
from sqlalchemy import String, Text, ForeignKey, Integer
from sqlalchemy.orm import Mapped, mapped_column
from sqlalchemy.dialects.postgresql import UUID, ARRAY, JSONB
from ..database import Base
class DictionaryLemmaEntity(Base):
__tablename__ = "dictionary_lemma"
id: Mapped[uuid.UUID] = mapped_column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)
headword: Mapped[str] = mapped_column(Text, nullable=False)
language: Mapped[str] = mapped_column(String(2), nullable=False, index=True)
pos_raw: Mapped[str] = mapped_column(Text, nullable=False)
pos_normalised: Mapped[str | None] = mapped_column(Text, nullable=True)
gender: Mapped[str | None] = mapped_column(Text, nullable=True)
tags: Mapped[list[str]] = mapped_column(ARRAY(Text), nullable=False, server_default="{}")
class DictionarySenseEntity(Base):
__tablename__ = "dictionary_sense"
id: Mapped[uuid.UUID] = mapped_column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)
lemma_id: Mapped[uuid.UUID] = mapped_column(
UUID(as_uuid=True),
ForeignKey("dictionary_lemma.id", ondelete="CASCADE"),
nullable=False,
index=True,
)
sense_index: Mapped[int] = mapped_column(Integer, nullable=False)
gloss: Mapped[str] = mapped_column(Text, nullable=False, server_default="")
topics: Mapped[list[str]] = mapped_column(ARRAY(Text), nullable=False, server_default="{}")
tags: Mapped[list[str]] = mapped_column(ARRAY(Text), nullable=False, server_default="{}")
class DictionaryWordformEntity(Base):
__tablename__ = "dictionary_wordform"
id: Mapped[uuid.UUID] = mapped_column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)
lemma_id: Mapped[uuid.UUID] = mapped_column(
UUID(as_uuid=True),
ForeignKey("dictionary_lemma.id", ondelete="CASCADE"),
nullable=False,
index=True,
)
form: Mapped[str] = mapped_column(Text, nullable=False, index=True)
tags: Mapped[list[str]] = mapped_column(ARRAY(Text), nullable=False, server_default="{}")
class DictionaryLemmaRawEntity(Base):
    """The original imported JSON record for a lemma, kept for reprocessing."""

    __tablename__ = "dictionary_lemma_raw"

    id: Mapped[uuid.UUID] = mapped_column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)
    # One raw record per lemma (unique), dropped when the lemma is deleted.
    lemma_id: Mapped[uuid.UUID] = mapped_column(
        UUID(as_uuid=True),
        ForeignKey("dictionary_lemma.id", ondelete="CASCADE"),
        nullable=False,
        unique=True,
    )
    language: Mapped[str] = mapped_column(String(2), nullable=False)
    # The unmodified source dictionary entry as JSONB.
    raw: Mapped[dict] = mapped_column(JSONB, nullable=False)

View file

@ -0,0 +1,64 @@
import uuid
from datetime import datetime, timezone
from sqlalchemy import DateTime, ForeignKey, Text
from sqlalchemy.orm import Mapped, mapped_column
from sqlalchemy.dialects.postgresql import UUID
from ..database import Base
class FlashcardEntity(Base):
    """A generated flashcard tied to one of the user's word-bank entries."""

    __tablename__ = "flashcard"

    id: Mapped[uuid.UUID] = mapped_column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)
    # Owner; rows cascade away when the user is deleted.
    user_id: Mapped[uuid.UUID] = mapped_column(
        UUID(as_uuid=True),
        ForeignKey("users.id", ondelete="CASCADE"),
        nullable=False,
        index=True,
    )
    # The word-bank entry this card was generated from.
    bank_entry_id: Mapped[uuid.UUID] = mapped_column(
        UUID(as_uuid=True),
        ForeignKey("learnable_word_bank_entry.id", ondelete="CASCADE"),
        nullable=False,
        index=True,
    )
    source_lang: Mapped[str] = mapped_column(Text, nullable=False)
    target_lang: Mapped[str] = mapped_column(Text, nullable=False)
    # Front / back of the card.
    prompt_text: Mapped[str] = mapped_column(Text, nullable=False)
    answer_text: Mapped[str] = mapped_column(Text, nullable=False)
    # Optional surrounding context shown with the prompt / answer.
    prompt_context_text: Mapped[str | None] = mapped_column(Text, nullable=True)
    answer_context_text: Mapped[str | None] = mapped_column(Text, nullable=True)
    card_direction: Mapped[str] = mapped_column(Text, nullable=False)
    # How the prompt is presented; defaults to plain text.
    prompt_modality: Mapped[str] = mapped_column(Text, nullable=False, default="text")
    # Timezone-aware creation timestamp (UTC), set client-side.
    created_at: Mapped[datetime] = mapped_column(
        DateTime(timezone=True),
        nullable=False,
        default=lambda: datetime.now(timezone.utc),
    )
class FlashcardEventEntity(Base):
    """A review/interaction event recorded against a flashcard."""

    __tablename__ = "flashcard_event"

    id: Mapped[uuid.UUID] = mapped_column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)
    # The card the event belongs to; removed with the card.
    flashcard_id: Mapped[uuid.UUID] = mapped_column(
        UUID(as_uuid=True),
        ForeignKey("flashcard.id", ondelete="CASCADE"),
        nullable=False,
        index=True,
    )
    # Denormalised owner id so events can be queried per user directly.
    user_id: Mapped[uuid.UUID] = mapped_column(
        UUID(as_uuid=True),
        ForeignKey("users.id", ondelete="CASCADE"),
        nullable=False,
        index=True,
    )
    event_type: Mapped[str] = mapped_column(Text, nullable=False)
    # Free-text answer the user gave, when the event carries one.
    user_response: Mapped[str | None] = mapped_column(Text, nullable=True)
    # Timezone-aware creation timestamp (UTC), set client-side.
    created_at: Mapped[datetime] = mapped_column(
        DateTime(timezone=True),
        nullable=False,
        default=lambda: datetime.now(timezone.utc),
    )

View file

@ -0,0 +1,64 @@
import uuid
from datetime import datetime, timezone
from sqlalchemy import Boolean, ForeignKey, String, Text, UniqueConstraint, DateTime
from sqlalchemy.orm import Mapped, mapped_column
from sqlalchemy.dialects.postgresql import UUID
from ..database import Base
class UserLanguagePairEntity(Base):
    """A (source → target) language pair a user is learning.

    At most one row per (user, source, target) combination.
    """

    __tablename__ = "user_language_pair"
    __table_args__ = (UniqueConstraint("user_id", "source_lang", "target_lang"),)

    id: Mapped[uuid.UUID] = mapped_column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)
    # Owner; rows cascade away when the user is deleted.
    user_id: Mapped[uuid.UUID] = mapped_column(
        UUID(as_uuid=True),
        ForeignKey("users.id", ondelete="CASCADE"),
        nullable=False,
        index=True,
    )
    # Two-letter language codes.
    source_lang: Mapped[str] = mapped_column(String(2), nullable=False)
    target_lang: Mapped[str] = mapped_column(String(2), nullable=False)
class LearnableWordBankEntryEntity(Base):
    """A word or phrase a user has saved to learn for a language pair."""

    __tablename__ = "learnable_word_bank_entry"

    id: Mapped[uuid.UUID] = mapped_column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)
    user_id: Mapped[uuid.UUID] = mapped_column(
        UUID(as_uuid=True),
        ForeignKey("users.id", ondelete="CASCADE"),
        nullable=False,
        index=True,
    )
    language_pair_id: Mapped[uuid.UUID] = mapped_column(
        UUID(as_uuid=True),
        ForeignKey("user_language_pair.id", ondelete="CASCADE"),
        nullable=False,
        index=True,
    )
    # Resolved dictionary sense; kept (NULLed) if the sense row disappears.
    sense_id: Mapped[uuid.UUID | None] = mapped_column(
        UUID(as_uuid=True),
        ForeignKey("dictionary_sense.id", ondelete="SET NULL"),
        nullable=True,
        index=True,
    )
    # Matched inflected form, if the entry came from a token lookup.
    wordform_id: Mapped[uuid.UUID | None] = mapped_column(
        UUID(as_uuid=True),
        ForeignKey("dictionary_wordform.id", ondelete="SET NULL"),
        nullable=True,
    )
    # The text exactly as the user saw/entered it.
    surface_text: Mapped[str] = mapped_column(Text, nullable=False)
    is_phrase: Mapped[bool] = mapped_column(Boolean, nullable=False, default=False)
    # How the entry was created (e.g. "manual" — see the vocab routes).
    entry_pathway: Mapped[str] = mapped_column(Text, nullable=False)
    # Plain UUID column (no FK declared) pointing at the originating article.
    source_article_id: Mapped[uuid.UUID | None] = mapped_column(
        UUID(as_uuid=True), nullable=True
    )
    # "pending" until a sense is chosen; set to "resolved" by set_sense.
    disambiguation_status: Mapped[str] = mapped_column(Text, nullable=False, default="pending")
    # Timezone-aware creation timestamp (UTC), set client-side.
    created_at: Mapped[datetime] = mapped_column(
        DateTime(timezone=True),
        nullable=False,
        default=lambda: datetime.now(timezone.utc),
    )

View file

@ -0,0 +1,145 @@
import uuid
from typing import Protocol
from sqlalchemy import select
from sqlalchemy.ext.asyncio import AsyncSession
from ..entities.dictionary_entities import (
DictionaryLemmaEntity,
DictionarySenseEntity,
DictionaryWordformEntity,
)
from ....domain.models.dictionary import Lemma, Sense, Wordform
class DictionaryRepository(Protocol):
    """Read-side port for dictionary data (lemmas, senses, wordforms)."""

    # Target-language → EN: senses for an exact headword (optionally POS-filtered).
    async def get_senses_for_headword(self, headword: str, language: str) -> list[Sense]: ...
    async def get_senses_for_headword_and_pos(self, headword: str, language: str, pos_normalised: str) -> list[Sense]: ...
    async def get_senses_for_lemma(self, lemma_id: uuid.UUID) -> list[Sense]: ...
    # EN → target-language: match senses by their English gloss text.
    async def find_senses_by_english_gloss(self, text: str, target_lang: str) -> list[Sense]: ...
    async def get_sense(self, sense_id: uuid.UUID) -> Sense | None: ...
    async def get_lemma(self, lemma_id: uuid.UUID) -> Lemma | None: ...
    # Inflected-form lookups.
    async def get_wordforms_by_form(self, form: str, language: str) -> list[Wordform]: ...
    async def get_wordforms_for_lemma(self, lemma_id: uuid.UUID) -> list[Wordform]: ...
def _sense_to_model(entity: DictionarySenseEntity) -> Sense:
    """Convert a persisted sense row into the domain ``Sense`` model."""
    # Array columns may come back as None on some drivers; normalise to [].
    topics = entity.topics or []
    tags = entity.tags or []
    return Sense(
        id=str(entity.id),
        lemma_id=str(entity.lemma_id),
        sense_index=entity.sense_index,
        gloss=entity.gloss,
        topics=topics,
        tags=tags,
    )
def _lemma_to_model(entity: DictionaryLemmaEntity) -> Lemma:
    """Convert a persisted lemma row into the domain ``Lemma`` model."""
    fields = {
        "id": str(entity.id),
        "headword": entity.headword,
        "language": entity.language,
        "pos_raw": entity.pos_raw,
        "pos_normalised": entity.pos_normalised,
        "gender": entity.gender,
        # Normalise a possibly-NULL array column to an empty list.
        "tags": entity.tags or [],
    }
    return Lemma(**fields)
def _wordform_to_model(entity: DictionaryWordformEntity) -> Wordform:
    """Convert a persisted wordform row into the domain ``Wordform`` model."""
    fields = {
        "id": str(entity.id),
        "lemma_id": str(entity.lemma_id),
        "form": entity.form,
        # Normalise a possibly-NULL array column to an empty list.
        "tags": entity.tags or [],
    }
    return Wordform(**fields)
class PostgresDictionaryRepository:
    """SQLAlchemy (async) implementation of the DictionaryRepository port.

    Read-only: every method issues a SELECT and maps rows to domain models.
    """

    def __init__(self, db: AsyncSession) -> None:
        self.db = db
    async def get_senses_for_headword(self, headword: str, language: str) -> list[Sense]:
        """All senses whose lemma exactly matches ``headword`` in ``language``,
        ordered by sense_index."""
        result = await self.db.execute(
            select(DictionarySenseEntity)
            .join(DictionaryLemmaEntity, DictionarySenseEntity.lemma_id == DictionaryLemmaEntity.id)
            .where(
                DictionaryLemmaEntity.headword == headword,
                DictionaryLemmaEntity.language == language,
            )
            .order_by(DictionarySenseEntity.sense_index)
        )
        return [_sense_to_model(e) for e in result.scalars().all()]
    async def find_senses_by_english_gloss(self, text: str, target_lang: str) -> list[Sense]:
        """EN→target direction: find senses whose gloss matches the given English text.
        Uses a case-insensitive exact match on the gloss column, filtered to the
        target language via the joined lemma row.
        """
        # NOTE: ilike() without wildcards is a case-insensitive *equality* test
        # unless the caller supplies % / _ in ``text``.
        result = await self.db.execute(
            select(DictionarySenseEntity)
            .join(DictionaryLemmaEntity, DictionarySenseEntity.lemma_id == DictionaryLemmaEntity.id)
            .where(
                DictionarySenseEntity.gloss.ilike(text),
                DictionaryLemmaEntity.language == target_lang,
            )
            .order_by(DictionarySenseEntity.sense_index)
        )
        return [_sense_to_model(e) for e in result.scalars().all()]
    async def get_sense(self, sense_id: uuid.UUID) -> Sense | None:
        """Fetch a single sense by id, or None when it does not exist."""
        result = await self.db.execute(
            select(DictionarySenseEntity).where(DictionarySenseEntity.id == sense_id)
        )
        entity = result.scalar_one_or_none()
        return _sense_to_model(entity) if entity else None
    async def get_lemma(self, lemma_id: uuid.UUID) -> Lemma | None:
        """Fetch a single lemma by id, or None when it does not exist."""
        result = await self.db.execute(
            select(DictionaryLemmaEntity).where(DictionaryLemmaEntity.id == lemma_id)
        )
        entity = result.scalar_one_or_none()
        return _lemma_to_model(entity) if entity else None
    async def get_senses_for_headword_and_pos(
        self, headword: str, language: str, pos_normalised: str
    ) -> list[Sense]:
        """Like get_senses_for_headword, additionally filtered by normalised POS."""
        result = await self.db.execute(
            select(DictionarySenseEntity)
            .join(DictionaryLemmaEntity, DictionarySenseEntity.lemma_id == DictionaryLemmaEntity.id)
            .where(
                DictionaryLemmaEntity.headword == headword,
                DictionaryLemmaEntity.language == language,
                DictionaryLemmaEntity.pos_normalised == pos_normalised,
            )
            .order_by(DictionarySenseEntity.sense_index)
        )
        return [_sense_to_model(e) for e in result.scalars().all()]
    async def get_senses_for_lemma(self, lemma_id: uuid.UUID) -> list[Sense]:
        """All senses of one lemma, ordered by sense_index."""
        result = await self.db.execute(
            select(DictionarySenseEntity)
            .where(DictionarySenseEntity.lemma_id == lemma_id)
            .order_by(DictionarySenseEntity.sense_index)
        )
        return [_sense_to_model(e) for e in result.scalars().all()]
    async def get_wordforms_by_form(self, form: str, language: str) -> list[Wordform]:
        """All wordforms matching ``form`` exactly (case-sensitive) in ``language``."""
        result = await self.db.execute(
            select(DictionaryWordformEntity)
            .join(DictionaryLemmaEntity, DictionaryWordformEntity.lemma_id == DictionaryLemmaEntity.id)
            .where(
                DictionaryWordformEntity.form == form,
                DictionaryLemmaEntity.language == language,
            )
        )
        return [_wordform_to_model(e) for e in result.scalars().all()]
    async def get_wordforms_for_lemma(self, lemma_id: uuid.UUID) -> list[Wordform]:
        """All inflected forms recorded for one lemma (unordered)."""
        result = await self.db.execute(
            select(DictionaryWordformEntity).where(
                DictionaryWordformEntity.lemma_id == lemma_id
            )
        )
        return [_wordform_to_model(e) for e in result.scalars().all()]

View file

@ -0,0 +1,136 @@
import uuid
from datetime import datetime, timezone
from typing import Protocol
from sqlalchemy import select
from sqlalchemy.ext.asyncio import AsyncSession
from ..entities.flashcard_entities import FlashcardEntity, FlashcardEventEntity
from ....domain.models.flashcard import Flashcard, FlashcardEvent
class FlashcardRepository(Protocol):
    """Persistence port for flashcards and their review events."""

    # Persist a new card; prompt/answer context and modality are optional.
    async def create_flashcard(
        self,
        user_id: uuid.UUID,
        bank_entry_id: uuid.UUID,
        source_lang: str,
        target_lang: str,
        prompt_text: str,
        answer_text: str,
        card_direction: str,
        prompt_modality: str = "text",
        prompt_context_text: str | None = None,
        answer_context_text: str | None = None,
    ) -> Flashcard: ...
    async def get_flashcards_for_user(self, user_id: uuid.UUID) -> list[Flashcard]: ...
    async def get_flashcards_for_entry(self, bank_entry_id: uuid.UUID) -> list[Flashcard]: ...
    # Append an interaction event (e.g. a review result) for a card.
    async def record_event(
        self,
        flashcard_id: uuid.UUID,
        user_id: uuid.UUID,
        event_type: str,
        user_response: str | None = None,
    ) -> FlashcardEvent: ...
def _flashcard_to_model(entity: FlashcardEntity) -> Flashcard:
    """Translate a flashcard row into the domain ``Flashcard`` model."""
    fields = {
        "id": str(entity.id),
        "user_id": str(entity.user_id),
        "bank_entry_id": str(entity.bank_entry_id),
        "source_lang": entity.source_lang,
        "target_lang": entity.target_lang,
        "prompt_text": entity.prompt_text,
        "answer_text": entity.answer_text,
        "prompt_context_text": entity.prompt_context_text,
        "answer_context_text": entity.answer_context_text,
        "card_direction": entity.card_direction,
        "prompt_modality": entity.prompt_modality,
        "created_at": entity.created_at,
    }
    return Flashcard(**fields)
def _event_to_model(entity: FlashcardEventEntity) -> FlashcardEvent:
    """Translate a flashcard-event row into the domain ``FlashcardEvent`` model."""
    fields = {
        "id": str(entity.id),
        "flashcard_id": str(entity.flashcard_id),
        "user_id": str(entity.user_id),
        "event_type": entity.event_type,
        "user_response": entity.user_response,
        "created_at": entity.created_at,
    }
    return FlashcardEvent(**fields)
class PostgresFlashcardRepository:
    """SQLAlchemy (async) implementation of the FlashcardRepository port.

    Write methods commit immediately and refresh the entity so generated
    values (primary key) are populated before mapping to the domain model.
    """

    def __init__(self, db: AsyncSession) -> None:
        self.db = db
    async def create_flashcard(
        self,
        user_id: uuid.UUID,
        bank_entry_id: uuid.UUID,
        source_lang: str,
        target_lang: str,
        prompt_text: str,
        answer_text: str,
        card_direction: str,
        prompt_modality: str = "text",
        prompt_context_text: str | None = None,
        answer_context_text: str | None = None,
    ) -> Flashcard:
        """Insert and commit a new flashcard row; returns the domain model."""
        entity = FlashcardEntity(
            user_id=user_id,
            bank_entry_id=bank_entry_id,
            source_lang=source_lang,
            target_lang=target_lang,
            prompt_text=prompt_text,
            answer_text=answer_text,
            prompt_context_text=prompt_context_text,
            answer_context_text=answer_context_text,
            card_direction=card_direction,
            prompt_modality=prompt_modality,
            created_at=datetime.now(timezone.utc),
        )
        self.db.add(entity)
        await self.db.commit()
        await self.db.refresh(entity)
        return _flashcard_to_model(entity)
    async def get_flashcards_for_user(self, user_id: uuid.UUID) -> list[Flashcard]:
        """All of a user's flashcards, newest first."""
        result = await self.db.execute(
            select(FlashcardEntity)
            .where(FlashcardEntity.user_id == user_id)
            .order_by(FlashcardEntity.created_at.desc())
        )
        return [_flashcard_to_model(e) for e in result.scalars().all()]
    async def get_flashcards_for_entry(self, bank_entry_id: uuid.UUID) -> list[Flashcard]:
        """All flashcards generated from one word-bank entry, newest first."""
        result = await self.db.execute(
            select(FlashcardEntity)
            .where(FlashcardEntity.bank_entry_id == bank_entry_id)
            .order_by(FlashcardEntity.created_at.desc())
        )
        return [_flashcard_to_model(e) for e in result.scalars().all()]
    async def record_event(
        self,
        flashcard_id: uuid.UUID,
        user_id: uuid.UUID,
        event_type: str,
        user_response: str | None = None,
    ) -> FlashcardEvent:
        """Insert and commit an interaction event for a flashcard."""
        entity = FlashcardEventEntity(
            flashcard_id=flashcard_id,
            user_id=user_id,
            event_type=event_type,
            user_response=user_response,
            created_at=datetime.now(timezone.utc),
        )
        self.db.add(entity)
        await self.db.commit()
        await self.db.refresh(entity)
        return _event_to_model(entity)

View file

@ -7,6 +7,26 @@ from ..entities.learnable_language_entity import LearnableLanguageEntity
from ....domain.models.learnable_language import LearnableLanguage
async def delete(db: AsyncSession, user_id: uuid.UUID, language_id: uuid.UUID) -> bool:
    """Delete the learnable-language row ``language_id`` if owned by ``user_id``.

    Returns ``True`` when a row was removed, ``False`` when nothing matched.
    Filtering on ``user_id`` stops one user deleting another user's data.
    """
    lookup = select(LearnableLanguageEntity).where(
        LearnableLanguageEntity.id == language_id,
        LearnableLanguageEntity.user_id == user_id,
    )
    row = (await db.execute(lookup)).scalar_one_or_none()
    if row is None:
        return False
    await db.delete(row)
    await db.commit()
    return True
def _to_model(entity: LearnableLanguageEntity) -> LearnableLanguage:
return LearnableLanguage(
id=str(entity.id),

View file

@ -0,0 +1,177 @@
import uuid
from datetime import datetime, timezone
from typing import Protocol
from sqlalchemy import select
from sqlalchemy.ext.asyncio import AsyncSession
from ..entities.vocab_entities import LearnableWordBankEntryEntity, UserLanguagePairEntity
from ....domain.models.vocab import LearnableWordBankEntry, UserLanguagePair
class VocabRepository(Protocol):
    """Persistence port for language pairs and word-bank entries."""

    # Idempotent: returns the existing pair or creates it.
    async def get_or_create_language_pair(
        self, user_id: uuid.UUID, source_lang: str, target_lang: str
    ) -> UserLanguagePair: ...
    async def get_language_pair(self, language_pair_id: uuid.UUID) -> UserLanguagePair | None: ...
    # Insert a new word-bank entry; sense/wordform links are optional.
    async def add_entry(
        self,
        user_id: uuid.UUID,
        language_pair_id: uuid.UUID,
        surface_text: str,
        entry_pathway: str,
        is_phrase: bool = False,
        sense_id: uuid.UUID | None = None,
        wordform_id: uuid.UUID | None = None,
        source_article_id: uuid.UUID | None = None,
        disambiguation_status: str = "pending",
    ) -> LearnableWordBankEntry: ...
    async def get_entries_for_user(
        self, user_id: uuid.UUID, language_pair_id: uuid.UUID
    ) -> list[LearnableWordBankEntry]: ...
    # Attach a chosen sense to an entry, marking it resolved.
    async def set_sense(
        self, entry_id: uuid.UUID, sense_id: uuid.UUID
    ) -> LearnableWordBankEntry: ...
    async def get_entry(self, entry_id: uuid.UUID) -> LearnableWordBankEntry | None: ...
    async def get_pending_disambiguation(self, user_id: uuid.UUID) -> list[LearnableWordBankEntry]: ...
def _pair_to_model(entity: UserLanguagePairEntity) -> UserLanguagePair:
    """Translate a language-pair row into the domain ``UserLanguagePair`` model."""
    fields = {
        "id": str(entity.id),
        "user_id": str(entity.user_id),
        "source_lang": entity.source_lang,
        "target_lang": entity.target_lang,
    }
    return UserLanguagePair(**fields)
def _entry_to_model(entity: LearnableWordBankEntryEntity) -> LearnableWordBankEntry:
    """Translate a word-bank row into its domain model, stringifying UUIDs."""
    # Optional foreign keys become None or their string form.
    sense = str(entity.sense_id) if entity.sense_id else None
    wordform = str(entity.wordform_id) if entity.wordform_id else None
    article = str(entity.source_article_id) if entity.source_article_id else None
    return LearnableWordBankEntry(
        id=str(entity.id),
        user_id=str(entity.user_id),
        language_pair_id=str(entity.language_pair_id),
        sense_id=sense,
        wordform_id=wordform,
        surface_text=entity.surface_text,
        is_phrase=entity.is_phrase,
        entry_pathway=entity.entry_pathway,
        source_article_id=article,
        disambiguation_status=entity.disambiguation_status,
        created_at=entity.created_at,
    )
class PostgresVocabRepository:
    """SQLAlchemy (async) implementation of the VocabRepository port."""

    def __init__(self, db: AsyncSession) -> None:
        self.db = db
    async def get_or_create_language_pair(
        self, user_id: uuid.UUID, source_lang: str, target_lang: str
    ) -> UserLanguagePair:
        """Return the user's pair for (source, target), creating it if absent.

        NOTE(review): this flushes but does not commit, unlike the other write
        methods here — confirm the caller owns the transaction boundary. A
        concurrent create can also hit the unique constraint; verify handling.
        """
        result = await self.db.execute(
            select(UserLanguagePairEntity).where(
                UserLanguagePairEntity.user_id == user_id,
                UserLanguagePairEntity.source_lang == source_lang,
                UserLanguagePairEntity.target_lang == target_lang,
            )
        )
        entity = result.scalar_one_or_none()
        if entity is None:
            entity = UserLanguagePairEntity(
                user_id=user_id,
                source_lang=source_lang,
                target_lang=target_lang,
            )
            self.db.add(entity)
            # Flush so the generated id is available without committing.
            await self.db.flush()
        return _pair_to_model(entity)
    async def get_language_pair(self, language_pair_id: uuid.UUID) -> UserLanguagePair | None:
        """Fetch a language pair by id, or None when it does not exist."""
        result = await self.db.execute(
            select(UserLanguagePairEntity).where(UserLanguagePairEntity.id == language_pair_id)
        )
        entity = result.scalar_one_or_none()
        return _pair_to_model(entity) if entity else None
    async def add_entry(
        self,
        user_id: uuid.UUID,
        language_pair_id: uuid.UUID,
        surface_text: str,
        entry_pathway: str,
        is_phrase: bool = False,
        sense_id: uuid.UUID | None = None,
        wordform_id: uuid.UUID | None = None,
        source_article_id: uuid.UUID | None = None,
        disambiguation_status: str = "pending",
    ) -> LearnableWordBankEntry:
        """Insert and commit a new word-bank entry; returns the domain model."""
        entity = LearnableWordBankEntryEntity(
            user_id=user_id,
            language_pair_id=language_pair_id,
            surface_text=surface_text,
            entry_pathway=entry_pathway,
            is_phrase=is_phrase,
            sense_id=sense_id,
            wordform_id=wordform_id,
            source_article_id=source_article_id,
            disambiguation_status=disambiguation_status,
            created_at=datetime.now(timezone.utc),
        )
        self.db.add(entity)
        await self.db.commit()
        await self.db.refresh(entity)
        return _entry_to_model(entity)
    async def get_entries_for_user(
        self, user_id: uuid.UUID, language_pair_id: uuid.UUID
    ) -> list[LearnableWordBankEntry]:
        """A user's entries for one language pair, newest first."""
        result = await self.db.execute(
            select(LearnableWordBankEntryEntity)
            .where(
                LearnableWordBankEntryEntity.user_id == user_id,
                LearnableWordBankEntryEntity.language_pair_id == language_pair_id,
            )
            .order_by(LearnableWordBankEntryEntity.created_at.desc())
        )
        return [_entry_to_model(e) for e in result.scalars().all()]
    async def set_sense(
        self, entry_id: uuid.UUID, sense_id: uuid.UUID
    ) -> LearnableWordBankEntry:
        """Attach ``sense_id`` to an entry and mark it resolved.

        Raises sqlalchemy's NoResultFound (via scalar_one) when the entry
        does not exist.
        """
        result = await self.db.execute(
            select(LearnableWordBankEntryEntity).where(
                LearnableWordBankEntryEntity.id == entry_id
            )
        )
        entity = result.scalar_one()
        entity.sense_id = sense_id
        entity.disambiguation_status = "resolved"
        await self.db.commit()
        await self.db.refresh(entity)
        return _entry_to_model(entity)
    async def get_entry(self, entry_id: uuid.UUID) -> LearnableWordBankEntry | None:
        """Fetch a single word-bank entry by id, or None when absent."""
        result = await self.db.execute(
            select(LearnableWordBankEntryEntity).where(
                LearnableWordBankEntryEntity.id == entry_id
            )
        )
        entity = result.scalar_one_or_none()
        return _entry_to_model(entity) if entity else None
    async def get_pending_disambiguation(self, user_id: uuid.UUID) -> list[LearnableWordBankEntry]:
        """A user's entries still awaiting sense disambiguation, newest first."""
        result = await self.db.execute(
            select(LearnableWordBankEntryEntity)
            .where(
                LearnableWordBankEntryEntity.user_id == user_id,
                LearnableWordBankEntryEntity.disambiguation_status == "pending",
            )
            .order_by(LearnableWordBankEntryEntity.created_at.desc())
        )
        return [_entry_to_model(e) for e in result.scalars().all()]

View file

@ -0,0 +1,97 @@
import uuid
from fastapi import APIRouter, Depends, HTTPException, status
from pydantic import BaseModel, field_validator
from sqlalchemy.ext.asyncio import AsyncSession
from ...auth import verify_token
from ...domain.services.account_service import AccountService
from ...languages import SUPPORTED_LANGUAGES, SUPPORTED_LEVELS
from ...outbound.postgres.database import get_db
router = APIRouter(prefix="/account", tags=["account"])
class AddLearnableLanguageRequest(BaseModel):
    """Payload for registering a new learnable language pair."""

    source_language: str
    target_language: str
    proficiencies: list[str]

    @field_validator("proficiencies")
    @classmethod
    def validate_proficiencies(cls, v: list[str]) -> list[str]:
        """Require one or two levels, each drawn from SUPPORTED_LEVELS."""
        if len(v) < 1 or len(v) > 2:
            raise ValueError("proficiencies must contain 1 or 2 levels")
        invalid = [p for p in v if p not in SUPPORTED_LEVELS]
        if invalid:
            raise ValueError(f"Invalid proficiency levels: {invalid}. Supported: {sorted(SUPPORTED_LEVELS)}")
        return v
class LearnableLanguageResponse(BaseModel):
    """API representation of a stored learnable language pair."""

    id: str
    source_language: str
    target_language: str
    proficiencies: list[str]
def _require_supported_language(code: str, role: str) -> None:
    """Raise 400 when ``code`` is not a supported language. ``role`` is
    'source' or 'target' and only affects the error message."""
    if code not in SUPPORTED_LANGUAGES:
        raise HTTPException(
            status_code=status.HTTP_400_BAD_REQUEST,
            detail=f"Unsupported {role} language '{code}'. Supported: {list(SUPPORTED_LANGUAGES)}",
        )


@router.post(
    "/learnable-languages",
    response_model=LearnableLanguageResponse,
    status_code=status.HTTP_201_CREATED,
)
async def add_learnable_language(
    body: AddLearnableLanguageRequest,
    db: AsyncSession = Depends(get_db),
    token_data: dict = Depends(verify_token),
) -> LearnableLanguageResponse:
    """Register a new source→target learnable language for the caller.

    Returns 400 when either language is unsupported or they are identical.
    The user id is taken from the verified token's ``sub`` claim.
    """
    # Same validation for both fields — deduplicated into the helper above.
    _require_supported_language(body.source_language, "source")
    _require_supported_language(body.target_language, "target")
    if body.source_language == body.target_language:
        raise HTTPException(
            status_code=status.HTTP_400_BAD_REQUEST,
            detail="source_language and target_language must differ",
        )
    user_id = uuid.UUID(token_data["sub"])
    lang = await AccountService(db).add_learnable_language(
        user_id=user_id,
        source_language=body.source_language,
        target_language=body.target_language,
        proficiencies=body.proficiencies,
    )
    return LearnableLanguageResponse(
        id=lang.id,
        source_language=lang.source_language,
        target_language=lang.target_language,
        proficiencies=lang.proficiencies,
    )
@router.delete(
    "/learnable-languages/{language_id}",
    status_code=status.HTTP_204_NO_CONTENT,
)
async def remove_learnable_language(
    language_id: str,
    db: AsyncSession = Depends(get_db),
    token_data: dict = Depends(verify_token),
) -> None:
    """Remove one of the caller's learnable languages.

    Returns 400 for a malformed id and 404 when the service reports the
    language does not exist (or is not owned by the caller).
    """
    try:
        lid = uuid.UUID(language_id)
    except ValueError:
        # Suppress the chain: the parse failure adds nothing for the client.
        raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail="Invalid language_id") from None
    user_id = uuid.UUID(token_data["sub"])
    try:
        await AccountService(db).remove_learnable_language(user_id=user_id, language_id=lid)
    except ValueError as exc:
        # Chain the cause so server-side logs keep the original error.
        raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail=str(exc)) from exc

View file

@ -0,0 +1,143 @@
import uuid
from fastapi import APIRouter, Depends, HTTPException, status
from pydantic import BaseModel
from sqlalchemy.ext.asyncio import AsyncSession
from ...auth import verify_token
from ...domain.services.flashcard_service import FlashcardService
from ...outbound.postgres.database import get_db
from ...outbound.postgres.repositories.dictionary_repository import PostgresDictionaryRepository
from ...outbound.postgres.repositories.flashcard_repository import PostgresFlashcardRepository
from ...outbound.postgres.repositories.vocab_repository import PostgresVocabRepository
router = APIRouter(tags=["flashcards"])
class FlashcardResponse(BaseModel):
    """API representation of a flashcard; created_at is an ISO-8601 string."""

    id: str
    user_id: str
    bank_entry_id: str
    source_lang: str
    target_lang: str
    prompt_text: str
    answer_text: str
    prompt_context_text: str | None
    answer_context_text: str | None
    card_direction: str
    prompt_modality: str
    created_at: str
class FlashcardEventResponse(BaseModel):
    """API representation of a recorded flashcard event."""

    id: str
    flashcard_id: str
    user_id: str
    event_type: str
    user_response: str | None
    created_at: str
class GenerateFlashcardsRequest(BaseModel):
    """Optional card direction for generation; None lets the service decide."""

    direction: str | None = None
class RecordEventRequest(BaseModel):
    """A flashcard interaction to record; user_response is optional."""

    event_type: str
    user_response: str | None = None
def _service(db: AsyncSession) -> FlashcardService:
    """Wire a FlashcardService onto Postgres-backed repositories."""
    flashcards = PostgresFlashcardRepository(db)
    vocab = PostgresVocabRepository(db)
    dictionary = PostgresDictionaryRepository(db)
    return FlashcardService(
        flashcard_repo=flashcards,
        vocab_repo=vocab,
        dict_repo=dictionary,
    )
@router.post(
    "/vocab/{entry_id}/flashcards",
    response_model=list[FlashcardResponse],
    status_code=status.HTTP_201_CREATED,
)
async def generate_flashcards(
    entry_id: str,
    body: GenerateFlashcardsRequest,
    db: AsyncSession = Depends(get_db),
    token_data: dict = Depends(verify_token),
) -> list[FlashcardResponse]:
    """Generate flashcards from a word-bank entry.

    Returns 400 for a malformed entry_id and 422 when the service
    rejects the entry with a ValueError.
    """
    try:
        eid = uuid.UUID(entry_id)
    except ValueError:
        # Suppress the chain: the parse failure adds nothing for the client.
        raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail="Invalid entry_id") from None
    try:
        cards = await _service(db).generate_flashcard_from_entry(eid, direction=body.direction)
    except ValueError as exc:
        # Chain the cause so server-side logs keep the original error.
        raise HTTPException(status_code=status.HTTP_422_UNPROCESSABLE_ENTITY, detail=str(exc)) from exc
    return [_flashcard_response(c) for c in cards]
@router.get("/flashcards", response_model=list[FlashcardResponse])
async def list_flashcards(
    db: AsyncSession = Depends(get_db),
    token_data: dict = Depends(verify_token),
) -> list[FlashcardResponse]:
    """Return every flashcard owned by the authenticated user."""
    owner = uuid.UUID(token_data["sub"])
    repo = PostgresFlashcardRepository(db)
    cards = await repo.get_flashcards_for_user(owner)
    return [_flashcard_response(card) for card in cards]
@router.post(
    "/flashcards/{flashcard_id}/events",
    response_model=FlashcardEventResponse,
    status_code=status.HTTP_201_CREATED,
)
async def record_event(
    flashcard_id: str,
    body: RecordEventRequest,
    db: AsyncSession = Depends(get_db),
    token_data: dict = Depends(verify_token),
) -> FlashcardEventResponse:
    """Record an interaction event against one of the caller's flashcards.

    Returns 400 for a malformed flashcard_id or when the service rejects
    the event with a ValueError.
    """
    try:
        fid = uuid.UUID(flashcard_id)
    except ValueError:
        # Suppress the chain: the parse failure adds nothing for the client.
        raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail="Invalid flashcard_id") from None
    user_id = uuid.UUID(token_data["sub"])
    try:
        event = await _service(db).record_flashcard_event(
            flashcard_id=fid,
            user_id=user_id,
            event_type=body.event_type,
            response=body.user_response,
        )
    except ValueError as exc:
        # Chain the cause so server-side logs keep the original error.
        raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail=str(exc)) from exc
    return FlashcardEventResponse(
        id=event.id,
        flashcard_id=event.flashcard_id,
        user_id=event.user_id,
        event_type=event.event_type,
        user_response=event.user_response,
        created_at=event.created_at.isoformat(),
    )
def _flashcard_response(card) -> FlashcardResponse:
    """Build the API response model for one domain flashcard."""
    payload = {
        "id": card.id,
        "user_id": card.user_id,
        "bank_entry_id": card.bank_entry_id,
        "source_lang": card.source_lang,
        "target_lang": card.target_lang,
        "prompt_text": card.prompt_text,
        "answer_text": card.answer_text,
        "prompt_context_text": card.prompt_context_text,
        "answer_context_text": card.answer_context_text,
        "card_direction": card.card_direction,
        "prompt_modality": card.prompt_modality,
        # Timestamps are serialised as ISO-8601 strings.
        "created_at": card.created_at.isoformat(),
    }
    return FlashcardResponse(**payload)

View file

@ -1,15 +1,21 @@
# Aggregate every feature router under a single /api prefix.
from .account import router as account_router
from .flashcards import router as flashcards_router
from .pos import router as pos_router
from .translate import router as translate_router
from .generation import router as generation_router
from .jobs import router as jobs_router
from .learnable_languages import router as learnable_languages_router
from .vocab import router as vocab_router
from fastapi import APIRouter

# Parent router mounted by the application; sub-routers keep their own tags.
api_router = APIRouter(prefix="/api", tags=["api"])
api_router.include_router(account_router)
api_router.include_router(flashcards_router)
api_router.include_router(pos_router)
api_router.include_router(translate_router)
api_router.include_router(generation_router)
api_router.include_router(jobs_router)
api_router.include_router(learnable_languages_router)
api_router.include_router(vocab_router)

View file

@ -0,0 +1,218 @@
import uuid

from fastapi import APIRouter, Depends, HTTPException
from pydantic import BaseModel
from sqlalchemy.exc import NoResultFound
from sqlalchemy.ext.asyncio import AsyncSession

from ...auth import verify_token
from ...domain.services.dictionary_lookup_service import DictionaryLookupService, TokenLookupResult
from ...domain.services.vocab_service import VocabService
from ...outbound.postgres.database import get_db
from ...outbound.postgres.repositories.dictionary_repository import PostgresDictionaryRepository
from ...outbound.postgres.repositories.vocab_repository import PostgresVocabRepository
router = APIRouter(prefix="/vocab", tags=["vocab"])
class AddWordRequest(BaseModel):
    """Payload for manually adding a word or phrase to the word bank."""

    language_pair_id: str
    surface_text: str
    entry_pathway: str = "manual"
    is_phrase: bool = False
    source_article_id: str | None = None
class AddFromTokenRequest(BaseModel):
    """Payload for adding a word from a spaCy-analysed token in an article."""

    language_pair_id: str
    surface: str
    spacy_lemma: str
    pos_ud: str
    language: str
    source_article_id: str | None = None
class SenseCandidateResponse(BaseModel):
    """One dictionary sense the user may pick during disambiguation."""

    id: str
    gloss: str
    topics: list[str]
    tags: list[str]
class FromTokenResponse(BaseModel):
    """Created entry plus candidate senses and how the token was matched."""

    entry: "WordBankEntryResponse"
    sense_candidates: list[SenseCandidateResponse]
    matched_via: str
class SetSenseRequest(BaseModel):
    """Payload resolving an entry's disambiguation to one sense."""

    sense_id: str
class WordBankEntryResponse(BaseModel):
    """API representation of a word-bank entry; created_at is ISO-8601."""

    id: str
    user_id: str
    language_pair_id: str
    sense_id: str | None
    wordform_id: str | None
    surface_text: str
    is_phrase: bool
    entry_pathway: str
    source_article_id: str | None
    disambiguation_status: str
    created_at: str
def _service(db: AsyncSession) -> VocabService:
    """Wire a VocabService onto Postgres-backed repositories."""
    vocab = PostgresVocabRepository(db)
    dictionary = PostgresDictionaryRepository(db)
    return VocabService(vocab_repo=vocab, dict_repo=dictionary)
@router.post("", response_model=WordBankEntryResponse, status_code=201)
async def add_word(
    request: AddWordRequest,
    db: AsyncSession = Depends(get_db),
    token_data: dict = Depends(verify_token),
) -> WordBankEntryResponse:
    """Manually add a word/phrase to the caller's word bank.

    Returns 400 for malformed UUIDs and 404 when the service reports the
    language pair does not exist (ValueError).
    """
    user_id = uuid.UUID(token_data["sub"])
    try:
        language_pair_id = uuid.UUID(request.language_pair_id)
    except ValueError:
        # Suppress the chain: the parse failure adds nothing for the client.
        raise HTTPException(status_code=400, detail="Invalid language_pair_id") from None
    source_article_id = None
    if request.source_article_id:
        try:
            source_article_id = uuid.UUID(request.source_article_id)
        except ValueError:
            raise HTTPException(status_code=400, detail="Invalid source_article_id") from None
    try:
        entry = await _service(db).add_word_to_bank(
            user_id=user_id,
            surface_text=request.surface_text.strip(),
            language_pair_id=language_pair_id,
            pathway=request.entry_pathway,
            is_phrase=request.is_phrase,
            source_article_id=source_article_id,
        )
    except ValueError as exc:
        # Chain the cause so server-side logs keep the original error.
        raise HTTPException(status_code=404, detail=str(exc)) from exc
    return _to_response(entry)
@router.post("/from-token", response_model=FromTokenResponse, status_code=201)
async def add_from_token(
    request: AddFromTokenRequest,
    db: AsyncSession = Depends(get_db),
    token_data: dict = Depends(verify_token),
) -> FromTokenResponse:
    """Add a word-bank entry from an analysed token, returning sense candidates.

    The dictionary lookup runs first; its senses and matched wordform are
    stored with the entry so the user can disambiguate later. Returns 400
    for malformed UUIDs, 404 when the service raises ValueError.
    """
    user_id = uuid.UUID(token_data["sub"])
    try:
        language_pair_id = uuid.UUID(request.language_pair_id)
    except ValueError:
        # Suppress the chain: the parse failure adds nothing for the client.
        raise HTTPException(status_code=400, detail="Invalid language_pair_id") from None
    source_article_id = None
    if request.source_article_id:
        try:
            source_article_id = uuid.UUID(request.source_article_id)
        except ValueError:
            raise HTTPException(status_code=400, detail="Invalid source_article_id") from None
    lookup_service = DictionaryLookupService(PostgresDictionaryRepository(db))
    result: TokenLookupResult = await lookup_service.lookup_token(
        surface=request.surface,
        spacy_lemma=request.spacy_lemma,
        pos_ud=request.pos_ud,
        language=request.language,
    )
    wordform_id = uuid.UUID(result.wordform_id) if result.wordform_id else None
    try:
        entry = await _service(db).add_token_to_bank(
            user_id=user_id,
            surface_text=request.surface,
            language_pair_id=language_pair_id,
            senses=result.senses,
            wordform_id=wordform_id,
            source_article_id=source_article_id,
        )
    except ValueError as exc:
        # Chain the cause so server-side logs keep the original error.
        raise HTTPException(status_code=404, detail=str(exc)) from exc
    candidates = [
        SenseCandidateResponse(id=s.id, gloss=s.gloss, topics=s.topics, tags=s.tags)
        for s in result.senses
    ]
    return FromTokenResponse(
        entry=_to_response(entry),
        sense_candidates=candidates,
        matched_via=result.matched_via,
    )
@router.get("", response_model=list[WordBankEntryResponse])
async def list_entries(
    language_pair_id: str,
    db: AsyncSession = Depends(get_db),
    token_data: dict = Depends(verify_token),
) -> list[WordBankEntryResponse]:
    """List the caller's word-bank entries for one language pair.

    Returns 400 when ``language_pair_id`` is not a valid UUID.
    """
    user_id = uuid.UUID(token_data["sub"])
    try:
        pair_id = uuid.UUID(language_pair_id)
    except ValueError:
        # Suppress the chain: the parse failure adds nothing for the client.
        raise HTTPException(status_code=400, detail="Invalid language_pair_id") from None
    entries = await PostgresVocabRepository(db).get_entries_for_user(user_id, pair_id)
    return [_to_response(e) for e in entries]
@router.get("/pending-disambiguation", response_model=list[WordBankEntryResponse])
async def pending_disambiguation(
    db: AsyncSession = Depends(get_db),
    token_data: dict = Depends(verify_token),
) -> list[WordBankEntryResponse]:
    """List the caller's entries still awaiting sense disambiguation."""
    owner = uuid.UUID(token_data["sub"])
    repo = PostgresVocabRepository(db)
    pending = await repo.get_pending_disambiguation(owner)
    return [_to_response(item) for item in pending]
@router.patch("/{entry_id}/sense", response_model=WordBankEntryResponse)
async def resolve_sense(
    entry_id: str,
    request: SetSenseRequest,
    db: AsyncSession = Depends(get_db),
    token_data: dict = Depends(verify_token),
) -> WordBankEntryResponse:
    """Attach the chosen dictionary sense to a word-bank entry.

    Raises:
        HTTPException 400: path or body id is not a valid UUID.
        HTTPException 404: the entry/sense could not be resolved.
    """
    try:
        eid = uuid.UUID(entry_id)
        sid = uuid.UUID(request.sense_id)
    except ValueError:
        raise HTTPException(status_code=400, detail="Invalid UUID")
    # Catch only the domain error. The previous bare `except Exception`
    # turned *any* failure (DB outage, programming bug) into a 404,
    # hiding real faults from clients and logs.
    # NOTE(review): assumes the service signals not-found with ValueError,
    # matching add_token_to_bank above — confirm in the service layer.
    try:
        entry = await _service(db).resolve_disambiguation(eid, sid)
    except ValueError:
        raise HTTPException(status_code=404, detail="Entry not found")
    return _to_response(entry)
def _to_response(entry) -> WordBankEntryResponse:
    """Map a domain word-bank entry onto its API response schema."""
    payload = {
        "id": entry.id,
        "user_id": entry.user_id,
        "language_pair_id": entry.language_pair_id,
        "sense_id": entry.sense_id,
        "wordform_id": entry.wordform_id,
        "surface_text": entry.surface_text,
        "is_phrase": entry.is_phrase,
        "entry_pathway": entry.entry_pathway,
        "source_article_id": entry.source_article_id,
        "disambiguation_status": entry.disambiguation_status,
        # Serialise the timestamp so the schema stays JSON-friendly.
        "created_at": entry.created_at.isoformat(),
    }
    return WordBankEntryResponse(**payload)

View file

@ -1,9 +1,9 @@
from fastapi import APIRouter, Depends, HTTPException, status
from pydantic import BaseModel, EmailStr
from sqlalchemy.exc import IntegrityError
from sqlalchemy.ext.asyncio import AsyncSession
from ..auth import create_access_token, hash_password, verify_password
from ..auth import create_access_token, verify_password
from ..domain.services.account_service import AccountService
from ..outbound.postgres.database import get_db
from ..outbound.postgres.repositories import user_repository
@ -27,24 +27,15 @@ class TokenResponse(BaseModel):
@router.post("/register", status_code=status.HTTP_201_CREATED)
async def register(body: RegisterRequest, db: AsyncSession = Depends(get_db)):
try:
user = await user_repository.create(
db,
email=body.email,
hashed_password=hash_password(body.password),
)
except IntegrityError:
await db.rollback()
raise HTTPException(
status_code=status.HTTP_409_CONFLICT,
detail="Email already registered",
)
# TODO(email-verification): send verification email here once transactional
# email is implemented. Set is_email_verified=False on the User model and
# require verification before allowing login.
try:
account = await AccountService(db).create_account(body.email, body.password)
except ValueError as exc:
raise HTTPException(status_code=status.HTTP_409_CONFLICT, detail=str(exc))
return {"id": str(user.id), "email": user.email}
return {"id": account.id, "email": account.email}
@router.post("/login", response_model=TokenResponse)

View file

@ -46,3 +46,9 @@ Example Api Clients in their own modules are:
- `AnthropicClient` to communicate with Anthropic's LLM, i.e. Claude, to generate text and synthesis.
- `GeminiClient` to communicate with Google's Gemini for text-to-speech generation
- `DeepgramClient` for timestamped speech-to-text transcription
## Deployment
The application has not been deployed yet, but local development should mimic the deployed environment as much as possible.
It will be deployed on a VPS using containerisation technologies (docker, podman). At the root of the project there is a `docker-compose.yaml` file which will describe each dependency (e.g. database, queue, storage).

View file

@ -1,7 +1,7 @@
[project]
name = "language-learning-api"
version = "0.1.0"
requires-python = ">=3.11"
requires-python = "==3.13.*"
dependencies = [
"fastapi>=0.115.0",
"uvicorn[standard]>=0.30.0",

View file

@ -0,0 +1,322 @@
#!/usr/bin/env python
"""
CLI import script for kaikki/wiktextract JSONL dictionary data.
Usage (from api/ directory):
uv run ./scripts/import_dictionary.py --lang fr
# or via Make from the repo root:
make import-dictionary lang=fr
DATABASE_URL defaults to postgresql+asyncpg://langlearn:langlearn@localhost:5432/langlearn
which matches the docker-compose dev credentials when the DB port is exposed on the host.
"""
import argparse
import asyncio
import json
import os
import sys
import uuid
from pathlib import Path
import sqlalchemy as sa
from sqlalchemy.dialects.postgresql import ARRAY, JSONB
from sqlalchemy.dialects.postgresql import UUID as PG_UUID
from sqlalchemy.ext.asyncio import (
    AsyncConnection,
    AsyncSession,
    async_sessionmaker,
    create_async_engine,
)
# Resolve paths relative to this file so the script works from any CWD
# (the Makefile target runs it from api/).
_API_DIR = Path(__file__).parent.parent
_REPO_ROOT = _API_DIR.parent
_DICT_DIR = _REPO_ROOT / "dictionaries" / "kaikki"

# Language code -> JSONL filename under dictionaries/kaikki/.
_LANG_FILE_MAP: dict[str, str] = {
    "fr": "french.jsonl",
}

# kaikki part-of-speech label (lowercased) -> Universal Dependencies tag.
# Labels not listed here leave pos_normalised as None (NULL in the DB).
_POS_MAP: dict[str, str] = {
    "noun": "NOUN",
    "verb": "VERB",
    "adj": "ADJ",
    "adv": "ADV",
    "det": "DET",
    "article": "DET",
    "pron": "PRON",
    "prep": "ADP",
    "adp": "ADP",
    "conj": "CCONJ",
    "cconj": "CCONJ",
    "sconj": "SCONJ",
    "intj": "INTJ",
    "num": "NUM",
    "numeral": "NUM",
    "part": "PART",
    "particle": "PART",
    "name": "PROPN",
    "propn": "PROPN",
    "proper noun": "PROPN",
    "punct": "PUNCT",
    "sym": "SYM",
}

# Gender tag spellings seen in kaikki data -> canonical gender value.
_GENDER_MAP: dict[str, str] = {
    "masculine": "masculine",
    "masc": "masculine",
    "feminine": "feminine",
    "fem": "feminine",
    "neuter": "neuter",
    "common": "common",
}
# ---------------------------------------------------------------------------
# Standalone table definitions — no app imports, no Settings() call
# ---------------------------------------------------------------------------
_meta = sa.MetaData()

# One row per (headword, POS) lemma entry.
# NOTE(review): these column lists must stay in sync with the
# dictionary_* migration/entities in the app — confirm on schema changes.
_lemma_table = sa.Table(
    "dictionary_lemma",
    _meta,
    sa.Column("id", PG_UUID(as_uuid=True), primary_key=True),
    sa.Column("headword", sa.Text(), nullable=False),
    sa.Column("language", sa.String(2), nullable=False),
    sa.Column("pos_raw", sa.Text(), nullable=False),
    sa.Column("pos_normalised", sa.Text(), nullable=True),
    sa.Column("gender", sa.Text(), nullable=True),
    sa.Column("tags", ARRAY(sa.Text()), nullable=False),
)

# One row per gloss (sense) belonging to a lemma, ordered by sense_index.
_sense_table = sa.Table(
    "dictionary_sense",
    _meta,
    sa.Column("id", PG_UUID(as_uuid=True), primary_key=True),
    sa.Column("lemma_id", PG_UUID(as_uuid=True), nullable=False),
    sa.Column("sense_index", sa.Integer(), nullable=False),
    sa.Column("gloss", sa.Text(), nullable=False),
    sa.Column("topics", ARRAY(sa.Text()), nullable=False),
    sa.Column("tags", ARRAY(sa.Text()), nullable=False),
)

# Inflected forms of a lemma (excluding the headword itself).
_wordform_table = sa.Table(
    "dictionary_wordform",
    _meta,
    sa.Column("id", PG_UUID(as_uuid=True), primary_key=True),
    sa.Column("lemma_id", PG_UUID(as_uuid=True), nullable=False),
    sa.Column("form", sa.Text(), nullable=False),
    sa.Column("tags", ARRAY(sa.Text()), nullable=False),
)

# The untouched source JSON record, kept for re-processing/debugging.
_raw_table = sa.Table(
    "dictionary_lemma_raw",
    _meta,
    sa.Column("id", PG_UUID(as_uuid=True), primary_key=True),
    sa.Column("lemma_id", PG_UUID(as_uuid=True), nullable=False),
    sa.Column("language", sa.String(2), nullable=False),
    sa.Column("raw", JSONB(), nullable=False),
)
# ---------------------------------------------------------------------------
# Normalisation helpers
# ---------------------------------------------------------------------------
def _normalise_pos(pos_raw: str) -> str | None:
    """Map a raw kaikki POS label to its Universal Dependencies tag, or None."""
    key = pos_raw.strip().lower()
    return _POS_MAP.get(key)
def _normalise_gender(tags: list) -> str | None:
    """Return the first recognised gender among *tags*, or None if absent."""
    mapped = (_GENDER_MAP.get(tag) for tag in tags)
    return next((gender for gender in mapped if gender), None)
# ---------------------------------------------------------------------------
# Parsing
# ---------------------------------------------------------------------------
def _parse_entry(record: dict, lang_code: str) -> dict | None:
    """Parse one kaikki JSONL record into insertion-ready row dicts.

    Returns None when the record belongs to another language or has no
    headword; otherwise a dict with "lemma", "senses", "wordforms" and
    "raw" keys matching the four dictionary tables.
    """
    if record.get("lang_code") != lang_code:
        return None

    headword = (record.get("word") or "").strip()
    if not headword:
        return None

    raw_pos = (record.get("pos") or "").strip()
    lemma_tags = record.get("tags") or []
    lemma_id = uuid.uuid4()

    # One row per sense, preserving the record's sense order.
    sense_rows = [
        {
            "id": uuid.uuid4(),
            "lemma_id": lemma_id,
            "sense_index": idx,
            "gloss": (sense.get("glosses") or [""])[0],
            "topics": sense.get("topics") or [],
            "tags": sense.get("tags") or [],
        }
        for idx, sense in enumerate(record.get("senses") or [])
    ]

    # Inflected forms; skip empties and the headword itself.
    form_rows = [
        {
            "id": uuid.uuid4(),
            "lemma_id": lemma_id,
            "form": text,
            "tags": form_rec.get("tags") or [],
        }
        for form_rec in record.get("forms") or []
        if (text := (form_rec.get("form") or "").strip()) and text != headword
    ]

    return {
        "lemma": {
            "id": lemma_id,
            "headword": headword,
            "language": lang_code,
            "pos_raw": raw_pos,
            "pos_normalised": _normalise_pos(raw_pos),
            "gender": _normalise_gender(lemma_tags),
            "tags": lemma_tags,
        },
        "senses": sense_rows,
        "wordforms": form_rows,
        "raw": {
            "id": uuid.uuid4(),
            "lemma_id": lemma_id,
            "language": lang_code,
            "raw": record,
        },
    }
# ---------------------------------------------------------------------------
# DB operations
# ---------------------------------------------------------------------------
async def _flush_batch(conn: AsyncConnection, batch: list[dict]) -> None:
    """Insert one batch of parsed entries into all four tables and commit.

    The annotation uses the directly imported ``AsyncConnection``: the old
    ``sa.ext.asyncio.AsyncConnection`` only resolved because another
    ``from sqlalchemy.ext.asyncio import ...`` happened to set the submodule
    attribute on the ``sa`` package — a fragile import side effect.
    """
    lemma_rows = [e["lemma"] for e in batch]
    sense_rows = [s for e in batch for s in e["senses"]]
    wordform_rows = [w for e in batch for w in e["wordforms"]]
    raw_rows = [e["raw"] for e in batch]
    # Parents before children: lemmas first so FK references resolve.
    if lemma_rows:
        await conn.execute(_lemma_table.insert(), lemma_rows)
    if sense_rows:
        await conn.execute(_sense_table.insert(), sense_rows)
    if wordform_rows:
        await conn.execute(_wordform_table.insert(), wordform_rows)
    if raw_rows:
        await conn.execute(_raw_table.insert(), raw_rows)
    await conn.commit()
async def run_import(lang_code: str, batch_size: int = 1000) -> None:
    """Stream a kaikki JSONL file into Postgres in committed batches.

    Deletes any previously imported rows for *lang_code*, then parses the
    JSONL line by line, flushing every *batch_size* lemmas. Exits the
    process with status 1 when the language mapping or file is missing.
    """
    lang_file = _LANG_FILE_MAP.get(lang_code)
    if not lang_file:
        print(
            f"No file mapping for lang_code={lang_code!r}. Known: {list(_LANG_FILE_MAP)}",
            file=sys.stderr,
        )
        sys.exit(1)
    jsonl_path = _DICT_DIR / lang_file
    if not jsonl_path.exists():
        print(f"JSONL file not found: {jsonl_path}", file=sys.stderr)
        sys.exit(1)
    # NOTE(review): the module docstring advertises a default password of
    # "langlearn" but this fallback uses "changeme" — confirm which matches
    # the docker-compose dev credentials and align the two.
    database_url = os.environ.get(
        "DATABASE_URL",
        "postgresql+asyncpg://langlearn:changeme@localhost:5432/langlearn",
    )
    engine = create_async_engine(database_url, echo=False)
    try:
        async with engine.connect() as conn:
            # NOTE(review): deleting only from dictionary_lemma assumes the
            # sense/wordform/raw tables cascade via FK ON DELETE — confirm
            # in the 0007 migration.
            print(f"Deleting existing entries for language={lang_code!r}...")
            await conn.execute(
                _lemma_table.delete().where(_lemma_table.c.language == lang_code)
            )
            await conn.commit()
            print(f"Importing {jsonl_path} ...")
            batch: list[dict] = []
            total_lemmas = 0  # lemmas committed so far
            skipped = 0  # blank/unparseable/foreign-language lines
            with open(jsonl_path, encoding="utf-8") as f:
                for line_num, line in enumerate(f, 1):
                    line = line.strip()
                    if not line:
                        continue
                    # A bad line is reported and skipped, not fatal.
                    try:
                        record = json.loads(line)
                    except json.JSONDecodeError as exc:
                        print(
                            f" Line {line_num}: JSON parse error: {exc}",
                            file=sys.stderr,
                        )
                        skipped += 1
                        continue
                    parsed = _parse_entry(record, lang_code)
                    if parsed is None:
                        skipped += 1
                        continue
                    batch.append(parsed)
                    # Flush in batches to bound memory and give progress output.
                    if len(batch) >= batch_size:
                        await _flush_batch(conn, batch)
                        total_lemmas += len(batch)
                        print(f" Committed {total_lemmas} lemmas...")
                        batch = []
            # Flush whatever is left after the final partial batch.
            if batch:
                await _flush_batch(conn, batch)
                total_lemmas += len(batch)
            print(f"Done. Imported {total_lemmas} lemmas, skipped {skipped} lines.")
    finally:
        await engine.dispose()
def main() -> None:
    """CLI entry point: parse arguments and run the async import."""
    cli = argparse.ArgumentParser(
        description="Import kaikki dictionary JSONL into Postgres."
    )
    cli.add_argument("--lang", required=True, help="Language code to import (e.g. fr)")
    cli.add_argument(
        "--batch-size", type=int, default=1000, help="Rows per commit (default: 1000)"
    )
    options = cli.parse_args()
    asyncio.run(run_import(options.lang, options.batch_size))


if __name__ == "__main__":
    main()

1
dictionaries/.gitignore vendored Normal file
View file

@ -0,0 +1 @@
*.jsonl

3
dictionaries/README.md Normal file
View file

@ -0,0 +1,3 @@
# Dictionaries
This module contains dictionaries of words, namely from the [Kaikki](https://kaikki.org/dictionary/index.html) project. It is responsible for generating lexical information about words, for both the system and the user, to help describe the language they are using.