Compare commits

..

No commits in common. "7f0977d8e53850670603c3f6b37fff1da5051397" and "aa4987981d06e3fc2643d03560f4d4f76c094272" have entirely different histories.

5 changed files with 10 additions and 428 deletions

View file

@@ -1,4 +1,4 @@
FROM python:3.13-slim FROM python:3.11-slim
WORKDIR /app WORKDIR /app

View file

@@ -1,41 +0,0 @@
"""add dictionary_sense_link table
Revision ID: 0010
Revises: 0009
Create Date: 2026-04-10
"""
from typing import Sequence, Union
import sqlalchemy as sa
from alembic import op
from sqlalchemy.dialects import postgresql
revision: str = "0010"
down_revision: Union[str, None] = "0009"
branch_labels: Union[str, Sequence[str], None] = None
depends_on: Union[str, Sequence[str], None] = None
def upgrade() -> None:
    """Create the ``dictionary_sense_link`` table and its two lookup indexes.

    Each row records one cross-reference link found on a dictionary sense:
    the literal link text, the raw link target string, and (optionally) the
    resolved lemma the target points at.
    """
    op.create_table(
        "dictionary_sense_link",
        sa.Column("id", postgresql.UUID(as_uuid=True), primary_key=True),
        sa.Column(
            "sense_id",
            postgresql.UUID(as_uuid=True),
            # Links are owned by their sense: deleting the sense removes them.
            sa.ForeignKey("dictionary_sense.id", ondelete="CASCADE"),
            nullable=False,
        ),
        sa.Column("link_text", sa.Text(), nullable=False),
        sa.Column("link_target", sa.Text(), nullable=False),
        # Nullable: resolved to a dictionary_lemma row in a later pass, if at all.
        sa.Column("target_lemma_id", postgresql.UUID(as_uuid=True), nullable=True),
    )
    op.create_index("ix_dictionary_sense_link_sense_id", "dictionary_sense_link", ["sense_id"])
    op.create_index("ix_dictionary_sense_link_target_lemma_id", "dictionary_sense_link", ["target_lemma_id"])
def downgrade() -> None:
    """Reverse :func:`upgrade`: drop both indexes, then the table itself."""
    op.drop_index("ix_dictionary_sense_link_target_lemma_id", table_name="dictionary_sense_link")
    op.drop_index("ix_dictionary_sense_link_sense_id", table_name="dictionary_sense_link")
    op.drop_table("dictionary_sense_link")

View file

@@ -1,276 +0,0 @@
# Domain
This document explains the core domain concepts of the API — how it models second language acquisition and the processes a learner goes through. It is written for both human developers and LLMs working on this codebase.
---
## What this system does
A user is learning a foreign language (currently French, with Spanish, Italian, and German planned). They encounter words in reading material, add those words to a personal vocabulary bank, resolve any ambiguity about which specific meaning they encountered, and then practise those words via spaced-repetition flashcards.
The system models that full cycle: from a raw word in context, through dictionary lookup and disambiguation, to a durable flashcard that can be studied repeatedly.
---
## Linguistic concepts
Before reading the entity descriptions, these distinctions are essential. Conflating them is the most common source of modelling mistakes in this codebase.
### Lemma, wordform, and token
- A **lemma** is the canonical dictionary form of a word: *aller*, *banque*, *bon*.
- A **wordform** is an inflected surface form: *allons*, *allais*, *banques*, *bonne*. Wordforms are derived from a lemma by applying grammatical rules.
- A **token** is what spaCy returns when it processes a sentence — it is a wordform in context. spaCy provides both the raw token text and its lemma.
Dictionary entries are keyed by lemma. Wordforms point back to their lemma. These are different things and must not be conflated — a user might encounter *allons* in an article, but what they are learning is the lemma *aller*.
### Senses
A single lemma can have multiple **senses**: *bank (finance)*, *bank (river)*, *bank (verb, to lean)*. Each sense is a distinct row in `dictionary_sense` with its own gloss (definition/translation). The user learns a specific sense, not a bare headword.
When a user adds a word with multiple senses, the system cannot know which meaning they encountered. It creates a bank entry with `disambiguation_status = "pending"` and waits for the user to select the correct sense. This process is called **disambiguation**.
### Part-of-speech normalisation
The dictionary source (kaikki/Wiktextract) uses its own POS labels: "noun", "verb", "past participle", "proverb", "phrase". spaCy uses Universal Dependencies tags: NOUN, VERB, ADJ, ADV. These do not map 1-to-1.
Both are stored: `pos_raw` holds the kaikki string exactly as it appears in the source data; `pos_normalised` holds the UD-compatible tag computed at import time. The `pos_normalised` field is what enables joining spaCy output against dictionary rows.
### Gender
French, Spanish, Italian, and German nouns have grammatical gender. Learners must know the gender — *le banc* not just *banc*. Gender is extracted from the kaikki `tags` array at import time and stored as a first-class column (`gender: text`) on `dictionary_lemma`. Possible values are `"masculine"`, `"feminine"`, `"neuter"`, `"common"`, or `null` for parts of speech that do not inflect by gender.
### The bilingual mapping
This system uses the English-language Wiktionary (via kaikki). An important structural fact: **the gloss on a sense IS the English translation**. There is no separate translations table. Because Wiktionary describes foreign words in English, the headword is the target-language word and the gloss is its English meaning:
- `dictionary_lemma.headword = "bisque"` (French)
- `dictionary_sense.gloss = "advantage"` (English meaning)
This means:
- **FR → EN** (recognition): look up lemma by French headword → sense → gloss is the English meaning.
- **EN → FR** (production): full-text search on `dictionary_sense.gloss` for the English term → linked lemma headword is the French word.
---
## The bilingual dictionary
The dictionary is a read-only reference dataset, populated once by an import script (`scripts/import_dictionary.py`) from kaikki JSONL dumps. It is never written to by the application at runtime.
### `dictionary_lemma`
One row per lemma+POS combination. The `(headword, language)` pair is indexed but not unique — *bank* has multiple lemma rows because it is both a noun and a verb.
Key fields: `headword`, `language` (ISO 639-1 code, e.g. `"fr"`), `pos_raw`, `pos_normalised`, `gender`, `tags`.
### `dictionary_sense`
One row per meaning of a lemma. The `gloss` is a short English definition that serves as both the disambiguation label and the translation. `sense_index` preserves the ordering from the source data (Wiktionary's first sense is usually the most common).
Key fields: `lemma_id` (FK → `dictionary_lemma`), `sense_index`, `gloss`, `topics`, `tags`.
### `dictionary_wordform`
One row per inflected form. Populated from the `forms` array in the kaikki JSONL. Enables the NLP pipeline to resolve an inflected token back to its lemma without relying on spaCy's lemmatisation being perfect.
Key fields: `lemma_id` (FK → `dictionary_lemma`), `form`, `tags` (e.g. `["plural"]`, `["first person plural", "present indicative"]`).
### `dictionary_lemma_raw`
Stores the full original kaikki JSON record for each lemma, one row per lemma, separate from the main lemma table to avoid bloating lookup queries. Used for reprocessing if the import logic changes.
---
## The user account
### User
Standard authentication entity: email, hashed password, `is_active`, `is_email_verified`. There is no `User` domain model — the ORM entity (`User` in `user_entity.py`) is used directly by `AccountService` and `user_repository`. This is the only entity in the codebase that does not follow the entity→domain-model pattern, reflecting its purely infrastructural role.
### `LearnableLanguage`
Records which language pair a user is studying and their self-reported proficiency levels. A user can study multiple language pairs simultaneously (e.g. EN→FR at B1 and EN→ES at A2). Proficiencies follow the CEFR scale (A1, A2, B1, B2, C1, C2).
Key fields: `user_id`, `source_language`, `target_language`, `proficiencies: list[str]`.
This entity lives in `learnable_languages` and is managed by `AccountService.add_learnable_language` / `remove_learnable_language`.
### `UserLanguagePair`
A lightweight pairing of source and target language, used to scope vocab bank entries. Where `LearnableLanguage` is a profile concept (what am I learning, at what level), `UserLanguagePair` is an operational concept (which direction does this vocabulary entry belong to).
Key fields: `user_id`, `source_lang`, `target_lang`. Unique per user per direction.
---
## The vocab bank
The vocab bank is the central concept of the system. It is the user's personal list of words they are actively learning.
### `LearnableWordBankEntry`
One row per word or phrase that a user has added to their bank. This is the bridge between the reference dictionary and the user's personal study material.
Key fields:
| Field | Description |
|---|---|
| `surface_text` | The exact text the user encountered or typed (e.g. `"allons"`, `"avoir l'air"`). Always stored, even if dictionary lookup fails. |
| `sense_id` | FK → `dictionary_sense`. Null until disambiguation is resolved. The specific meaning the user is learning. |
| `wordform_id` | FK → `dictionary_wordform`. Set when the entry originated from the NLP pipeline and the inflected form was found in the wordform table. Null for manually-entered headwords. |
| `is_phrase` | True for multi-word expressions. Phrase entries bypass dictionary lookup and never resolve to a single sense. |
| `entry_pathway` | How the word entered the bank: `"manual"`, `"highlight"`, `"nlp_extraction"`, or `"pack"`. |
| `disambiguation_status` | See below. |
| `language_pair_id` | FK → `user_language_pair`. Which direction this entry belongs to. |
### Disambiguation status lifecycle
```
┌─────────────┐
(0 or >1 sense) │ pending │ ◄── always starts here for phrases
┌──────────└─────────────┘──────────┐
│ │ │
user picks (1 sense found user skips
a sense at add time)
│ │ │
▼ ▼ ▼
┌──────────┐ ┌───────────────┐ ┌─────────┐
│ resolved │ │ auto_resolved │ │ skipped │
└──────────┘ └───────────────┘ └─────────┘
```
- **`pending`**: No sense assigned. Occurs when zero or multiple dictionary senses were found, or when the entry is a phrase. The user must visit the disambiguation UI.
- **`auto_resolved`**: Exactly one sense was found at add time; it was assigned automatically without user interaction.
- **`resolved`**: The user was presented with multiple candidates and chose one.
- **`skipped`**: The user chose not to disambiguate. The entry persists in the bank but cannot generate flashcards.
Only entries with `disambiguation_status` of `"auto_resolved"` or `"resolved"` have a `sense_id` and can generate flashcards.
---
## Flashcards
A flashcard is a study card derived from a resolved vocab bank entry. It carries pre-computed prompt and answer text so the study session does not need to re-query the dictionary.
### `Flashcard`
Two cards are typically generated per bank entry — one in each direction:
- **`target_to_en`** (recognition): prompt = `lemma.headword` (e.g. `"bisque"`), answer = `sense.gloss` (e.g. `"advantage"`). The learner sees the French word and must produce the English meaning.
- **`en_to_target`** (production): prompt = `sense.gloss` (e.g. `"advantage"`), answer = `lemma.headword` (e.g. `"bisque"`). The learner sees the English meaning and must produce the French word.
Key fields: `bank_entry_id`, `user_id`, `source_lang`, `target_lang`, `prompt_text`, `answer_text`, `prompt_context_text` (optional sentence context), `answer_context_text`, `card_direction`, `prompt_modality` (`"text"` or `"audio"`).
### `FlashcardEvent`
An immutable record of something that happened during a study session. Events are append-only — they are never updated, only inserted.
Event types:
- **`shown`**: The card was displayed to the user.
- **`answered`**: The user submitted a response. `user_response` holds the free-text answer as typed; no automatic grading is done at this layer.
- **`skipped`**: The user swiped past the card without answering.
The spaced-repetition scheduling algorithm (not yet implemented) will consume these events to determine when each card should next be shown.
---
## NLP pipeline integration
When a user highlights a word in an article, the client sends a spaCy token payload to `POST /api/vocab/from-token`. The `DictionaryLookupService` resolves the token to dictionary candidates using a three-stage fallback:
**Stage 1 — wordform table (most precise)**
The inflected surface form (e.g. `"allons"`) is looked up in `dictionary_wordform`. If found, the linked lemma's senses are returned. Because the lookup was via the wordform table, `wordform_id` is pre-populated on the resulting bank entry, preserving the link between what the user actually saw and the dictionary lemma it belongs to.
**Stage 2 — lemma + UD POS**
If no wordform row exists, the spaCy-provided lemma (e.g. `"aller"`) is looked up against `dictionary_lemma.headword`, filtered by `pos_normalised` (the UD POS tag from spaCy). The POS filter reduces false matches for homographs that share a headword but differ in part of speech.
**Stage 3 — lemma only**
Drops the POS filter as a last resort. Returns all senses for the headword regardless of part of speech.
The endpoint response includes both the created bank entry and the full list of sense candidates, so the client can immediately render the disambiguation UI if `disambiguation_status == "pending"`.
---
## The full learner journey
```
1. Account setup
User registers → adds a LearnableLanguage (e.g. EN→FR, B1)
A UserLanguagePair is created to scope their vocab entries.
2. Word discovery
User reads an article and encounters an unfamiliar word.
Option A — manual entry:
POST /api/vocab { surface_text: "banque", language_pair_id: ... }
VocabService looks up senses for "banque" in dictionary_lemma.
Option B — article highlight (NLP):
spaCy processes the article and returns a token payload.
POST /api/vocab/from-token { surface: "allons", spacy_lemma: "aller", pos_ud: "VERB", ... }
DictionaryLookupService: wordform "allons" → lemma "aller" → senses.
3. Disambiguation
If exactly 1 sense → status = auto_resolved, sense_id set immediately.
If 0 or >1 senses → status = pending.
GET /api/vocab/pending-disambiguation
User sees list of candidate senses with glosses.
PATCH /api/vocab/{entry_id}/sense { sense_id: "..." }
Status → resolved.
4. Flashcard generation
POST /api/vocab/{entry_id}/flashcards
FlashcardService reads sense.gloss + lemma.headword.
Creates 2 flashcards: target_to_en and en_to_target.
5. Study session
GET /api/flashcards — fetch cards to study.
POST /api/flashcards/{id}/events { event_type: "shown" }
POST /api/flashcards/{id}/events { event_type: "answered", user_response: "bank" }
Events accumulate for future SRS scheduling.
```
---
## Entity relationships
```
users
└── learnable_languages (what languages, at what proficiency)
└── user_language_pair (operational scope for vocab entries)
└── learnable_word_bank_entry
├── dictionary_sense (nullable — the specific meaning being learned)
│ └── dictionary_lemma
│ └── dictionary_wordform
├── dictionary_wordform (nullable — the exact inflected form encountered)
└── flashcard
└── flashcard_event
dictionary_lemma
├── dictionary_sense (one or many meanings)
├── dictionary_wordform (inflected forms)
└── dictionary_lemma_raw (original kaikki JSON, for reprocessing)
```
---
## Key enumerations
### `disambiguation_status`
`"pending"` | `"auto_resolved"` | `"resolved"` | `"skipped"`
### `entry_pathway`
`"manual"` | `"highlight"` | `"nlp_extraction"` | `"pack"`
### `card_direction`
`"target_to_en"` | `"en_to_target"`
### `prompt_modality`
`"text"` | `"audio"`
### `event_type`
`"shown"` | `"answered"` | `"skipped"`
### `pos_normalised` (UD tags used in this codebase)
`NOUN` | `VERB` | `ADJ` | `ADV` | `DET` | `PRON` | `ADP` | `CCONJ` | `SCONJ` | `INTJ` | `NUM` | `PART` | `PROPN` | `PUNCT` | `SYM`

View file

@@ -59,8 +59,6 @@ _POS_MAP: dict[str, str] = {
} }
_GENDER_MAP: dict[str, str] = { _GENDER_MAP: dict[str, str] = {
"f": "feminine",
"m": "masculine",
"masculine": "masculine", "masculine": "masculine",
"masc": "masculine", "masc": "masculine",
"feminine": "feminine", "feminine": "feminine",
@@ -116,16 +114,6 @@ _raw_table = sa.Table(
sa.Column("raw", JSONB(), nullable=False), sa.Column("raw", JSONB(), nullable=False),
) )
_sense_link_table = sa.Table(
"dictionary_sense_link",
_meta,
sa.Column("id", PG_UUID(as_uuid=True), primary_key=True),
sa.Column("sense_id", PG_UUID(as_uuid=True), nullable=False),
sa.Column("link_text", sa.Text(), nullable=False),
sa.Column("link_target", sa.Text(), nullable=False),
sa.Column("target_lemma_id", PG_UUID(as_uuid=True), nullable=True),
)
# --------------------------------------------------------------------------- # ---------------------------------------------------------------------------
# Normalisation helpers # Normalisation helpers
# --------------------------------------------------------------------------- # ---------------------------------------------------------------------------
@@ -135,10 +123,12 @@ def _normalise_pos(pos_raw: str) -> str | None:
return _POS_MAP.get(pos_raw.lower().strip()) return _POS_MAP.get(pos_raw.lower().strip())
def _normalise_gender(value: str | None) -> str | None: def _normalise_gender(tags: list) -> str | None:
if value is None: for tag in tags:
mapped = _GENDER_MAP.get(tag)
if mapped:
return mapped
return None return None
return _GENDER_MAP.get(value)
# --------------------------------------------------------------------------- # ---------------------------------------------------------------------------
@@ -159,13 +149,11 @@ def _parse_entry(record: dict, lang_code: str) -> dict | None:
return None return None
pos_raw = (record.get("pos") or "").strip() pos_raw = (record.get("pos") or "").strip()
top_tags = record.get("tags") or []
lemma_id = uuid.uuid4() lemma_id = uuid.uuid4()
_GENDER_TAGS = {"masculine", "feminine", "neuter"}
gender: str | None = None
senses = [] senses = []
sense_links = []
for i, sense_record in enumerate(record.get("senses") or []): for i, sense_record in enumerate(record.get("senses") or []):
sense_id = uuid.uuid4() sense_id = uuid.uuid4()
glosses = sense_record.get("glosses") or [] glosses = sense_record.get("glosses") or []
@@ -173,12 +161,6 @@ def _parse_entry(record: dict, lang_code: str) -> dict | None:
topics = sense_record.get("topics") or [] topics = sense_record.get("topics") or []
sense_tags = sense_record.get("tags") or [] sense_tags = sense_record.get("tags") or []
if gender is None:
for tag in sense_tags:
if tag in _GENDER_TAGS:
gender = tag
break
senses.append( senses.append(
{ {
"id": sense_id, "id": sense_id,
@@ -190,18 +172,6 @@ def _parse_entry(record: dict, lang_code: str) -> dict | None:
} }
) )
for link_pair in (sense_record.get("links") or []):
if isinstance(link_pair, list) and len(link_pair) == 2:
sense_links.append(
{
"id": uuid.uuid4(),
"sense_id": sense_id,
"link_text": link_pair[0],
"link_target": link_pair[1],
"target_lemma_id": None,
}
)
wordforms = [] wordforms = []
for f in record.get("forms") or []: for f in record.get("forms") or []:
form_text = (f.get("form") or "").strip() form_text = (f.get("form") or "").strip()
@@ -224,11 +194,10 @@ def _parse_entry(record: dict, lang_code: str) -> dict | None:
"language": lang_code, "language": lang_code,
"pos_raw": pos_raw, "pos_raw": pos_raw,
"pos_normalised": _normalise_pos(pos_raw), "pos_normalised": _normalise_pos(pos_raw),
"gender": gender, "gender": _normalise_gender(top_tags),
"tags": record.get("tags") or [], "tags": top_tags,
}, },
"senses": senses, "senses": senses,
"sense_links": sense_links,
"wordforms": wordforms, "wordforms": wordforms,
"raw": { "raw": {
"id": uuid.uuid4(), "id": uuid.uuid4(),
@@ -247,7 +216,6 @@ def _parse_entry(record: dict, lang_code: str) -> dict | None:
async def _flush_batch(conn: sa.ext.asyncio.AsyncConnection, batch: list[dict]) -> None: async def _flush_batch(conn: sa.ext.asyncio.AsyncConnection, batch: list[dict]) -> None:
lemma_rows = [e["lemma"] for e in batch] lemma_rows = [e["lemma"] for e in batch]
sense_rows = [s for e in batch for s in e["senses"]] sense_rows = [s for e in batch for s in e["senses"]]
sense_link_rows = [lnk for e in batch for lnk in e["sense_links"]]
wordform_rows = [w for e in batch for w in e["wordforms"]] wordform_rows = [w for e in batch for w in e["wordforms"]]
raw_rows = [e["raw"] for e in batch] raw_rows = [e["raw"] for e in batch]
@@ -255,8 +223,6 @@ async def _flush_batch(conn: sa.ext.asyncio.AsyncConnection, batch: list[dict])
await conn.execute(_lemma_table.insert(), lemma_rows) await conn.execute(_lemma_table.insert(), lemma_rows)
if sense_rows: if sense_rows:
await conn.execute(_sense_table.insert(), sense_rows) await conn.execute(_sense_table.insert(), sense_rows)
if sense_link_rows:
await conn.execute(_sense_link_table.insert(), sense_link_rows)
if wordform_rows: if wordform_rows:
await conn.execute(_wordform_table.insert(), wordform_rows) await conn.execute(_wordform_table.insert(), wordform_rows)
if raw_rows: if raw_rows:
@@ -265,68 +231,6 @@ async def _flush_batch(conn: sa.ext.asyncio.AsyncConnection, batch: list[dict])
await conn.commit() await conn.commit()
_LANG_SECTION_MAP: dict[str, str] = {
"fr": "French",
"de": "German",
"es": "Spanish",
"it": "Italian",
"pt": "Portuguese",
}
async def _resolve_links(conn: sa.ext.asyncio.AsyncConnection, lang_code: str) -> int:
"""Resolve target_lemma_id for sense links whose target matches lang_code.
Links in kaikki data look like ``["maboul", "maboul#French"]``. After all
lemmas have been imported we can attempt to match the target word to a row
in dictionary_lemma and store the foreign key.
"""
section = _LANG_SECTION_MAP.get(lang_code)
if not section:
return 0
suffix = f"#{section}"
result = await conn.execute(
sa.select(
_sense_link_table.c.id,
_sense_link_table.c.link_target,
).where(_sense_link_table.c.target_lemma_id.is_(None))
)
all_links = result.fetchall()
# Filter to links that point at this language and extract the target word.
candidates: list[tuple[uuid.UUID, str]] = []
for row in all_links:
if row.link_target.endswith(suffix):
word = row.link_target[: -len(suffix)]
candidates.append((row.id, word))
if not candidates:
return 0
target_words = list({w for _, w in candidates})
lemma_result = await conn.execute(
sa.select(_lemma_table.c.id, _lemma_table.c.headword)
.where(_lemma_table.c.language == lang_code)
.where(_lemma_table.c.headword.in_(target_words))
)
lemma_map: dict[str, uuid.UUID] = {r.headword: r.id for r in lemma_result}
resolved = 0
for link_id, word in candidates:
if word in lemma_map:
await conn.execute(
_sense_link_table.update()
.where(_sense_link_table.c.id == link_id)
.values(target_lemma_id=lemma_map[word])
)
resolved += 1
await conn.commit()
return resolved
async def run_import(lang_code: str, batch_size: int = 1000) -> None: async def run_import(lang_code: str, batch_size: int = 1000) -> None:
lang_file = _LANG_FILE_MAP.get(lang_code) lang_file = _LANG_FILE_MAP.get(lang_code)
if not lang_file: if not lang_file:
@@ -395,11 +299,6 @@ async def run_import(lang_code: str, batch_size: int = 1000) -> None:
total_lemmas += len(batch) total_lemmas += len(batch)
print(f"Done. Imported {total_lemmas} lemmas, skipped {skipped} lines.") print(f"Done. Imported {total_lemmas} lemmas, skipped {skipped} lines.")
async with engine.connect() as conn:
print("Resolving sense links...")
resolved = await _resolve_links(conn, lang_code)
print(f"Resolved {resolved} sense links.")
finally: finally:
await engine.dispose() await engine.dispose()