feat: Build the bilingual dictionary data
This commit is contained in:
parent
2cae5d9445
commit
873ebacd4d
9 changed files with 599 additions and 1 deletions
9
Makefile
9
Makefile
|
|
@ -1,4 +1,4 @@
|
|||
# Targets that do not correspond to real files on disk.
.PHONY: down build up logs shell lock migrate migration import-dictionary

build:
	docker compose build

lock:
	cd api && uv pip compile pyproject.toml -o requirements.txt

# Full cycle: stop, rebuild images, start again.
rebuild: down build up

# Import a kaikki dictionary JSONL into Postgres.
# Requires the DB to be running with its port exposed on localhost (docker compose up).
# DATABASE_URL defaults to the docker-compose dev credentials.
# Usage: make import-dictionary lang=fr
import-dictionary:
	cd api && python scripts/import_dictionary.py --lang $(lang)
|
|
|
|||
|
|
@ -10,6 +10,7 @@ from app.outbound.postgres.database import Base
|
|||
|
||||
# Imported for their side effect: each entity module registers its tables on
# Base.metadata so Alembic autogenerate can see them.
import app.outbound.postgres.entities.summarise_job_entity
import app.outbound.postgres.entities.user_entity
import app.outbound.postgres.entities.dictionary_entities

config = context.config
# Point Alembic at the application's configured database URL.
config.set_main_option("sqlalchemy.url", settings.database_url)
|
||||
|
|
|
|||
89
api/alembic/versions/20260407_0007_add_dictionary_tables.py
Normal file
89
api/alembic/versions/20260407_0007_add_dictionary_tables.py
Normal file
|
|
@ -0,0 +1,89 @@
|
|||
"""add dictionary tables
|
||||
|
||||
Revision ID: 0007
|
||||
Revises: 0006
|
||||
Create Date: 2026-04-07
|
||||
|
||||
"""
|
||||
from typing import Sequence, Union
|
||||
|
||||
import sqlalchemy as sa
|
||||
from alembic import op
|
||||
from sqlalchemy.dialects import postgresql
|
||||
|
||||
# Alembic revision identifiers: this migration (0007) follows 0006.
revision: str = "0007"
down_revision: Union[str, None] = "0006"
branch_labels: Union[str, Sequence[str], None] = None
depends_on: Union[str, Sequence[str], None] = None
|
||||
|
||||
|
||||
def upgrade() -> None:
    """Create the bilingual dictionary tables.

    Four tables: lemma (headwords), sense (numbered glosses per lemma),
    wordform (inflected forms per lemma), and lemma_raw (the original
    source JSON for each lemma). Child tables cascade on lemma delete.
    """
    # One row per dictionary entry (headword + part of speech).
    op.create_table(
        "dictionary_lemma",
        sa.Column("id", postgresql.UUID(as_uuid=True), primary_key=True),
        sa.Column("headword", sa.Text(), nullable=False),
        # Two-letter language code.
        sa.Column("language", sa.String(2), nullable=False),
        sa.Column("pos_raw", sa.Text(), nullable=False),
        sa.Column("pos_normalised", sa.Text(), nullable=True),
        sa.Column("gender", sa.Text(), nullable=True),
        sa.Column("tags", postgresql.ARRAY(sa.Text()), nullable=False, server_default="{}"),
    )
    # Composite index backs the (headword, language) lookup path.
    op.create_index("ix_dictionary_lemma_headword_language", "dictionary_lemma", ["headword", "language"])
    op.create_index("ix_dictionary_lemma_language", "dictionary_lemma", ["language"])

    # Numbered senses (glosses) belonging to a lemma.
    op.create_table(
        "dictionary_sense",
        sa.Column("id", postgresql.UUID(as_uuid=True), primary_key=True),
        sa.Column(
            "lemma_id",
            postgresql.UUID(as_uuid=True),
            sa.ForeignKey("dictionary_lemma.id", ondelete="CASCADE"),
            nullable=False,
        ),
        sa.Column("sense_index", sa.Integer(), nullable=False),
        sa.Column("gloss", sa.Text(), nullable=False, server_default=""),
        sa.Column("topics", postgresql.ARRAY(sa.Text()), nullable=False, server_default="{}"),
        sa.Column("tags", postgresql.ARRAY(sa.Text()), nullable=False, server_default="{}"),
    )
    op.create_index("ix_dictionary_sense_lemma_id", "dictionary_sense", ["lemma_id"])

    # Inflected forms of a lemma; indexed on the surface form for reverse lookup.
    op.create_table(
        "dictionary_wordform",
        sa.Column("id", postgresql.UUID(as_uuid=True), primary_key=True),
        sa.Column(
            "lemma_id",
            postgresql.UUID(as_uuid=True),
            sa.ForeignKey("dictionary_lemma.id", ondelete="CASCADE"),
            nullable=False,
        ),
        sa.Column("form", sa.Text(), nullable=False),
        sa.Column("tags", postgresql.ARRAY(sa.Text()), nullable=False, server_default="{}"),
    )
    op.create_index("ix_dictionary_wordform_lemma_id", "dictionary_wordform", ["lemma_id"])
    op.create_index("ix_dictionary_wordform_form", "dictionary_wordform", ["form"])

    # Original source JSON per lemma (one-to-one via unique lemma_id).
    op.create_table(
        "dictionary_lemma_raw",
        sa.Column("id", postgresql.UUID(as_uuid=True), primary_key=True),
        sa.Column(
            "lemma_id",
            postgresql.UUID(as_uuid=True),
            sa.ForeignKey("dictionary_lemma.id", ondelete="CASCADE"),
            nullable=False,
            unique=True,
        ),
        sa.Column("language", sa.String(2), nullable=False),
        sa.Column("raw", postgresql.JSONB(), nullable=False),
    )
|
||||
|
||||
|
||||
def downgrade() -> None:
    """Drop the dictionary tables in reverse dependency order (children first)."""
    op.drop_table("dictionary_lemma_raw")
    op.drop_index("ix_dictionary_wordform_form", table_name="dictionary_wordform")
    op.drop_index("ix_dictionary_wordform_lemma_id", table_name="dictionary_wordform")
    op.drop_table("dictionary_wordform")
    op.drop_index("ix_dictionary_sense_lemma_id", table_name="dictionary_sense")
    op.drop_table("dictionary_sense")
    op.drop_index("ix_dictionary_lemma_language", table_name="dictionary_lemma")
    op.drop_index("ix_dictionary_lemma_headword_language", table_name="dictionary_lemma")
    op.drop_table("dictionary_lemma")
|
||||
32
api/app/domain/models/dictionary.py
Normal file
32
api/app/domain/models/dictionary.py
Normal file
|
|
@ -0,0 +1,32 @@
|
|||
from dataclasses import dataclass, field
|
||||
|
||||
|
||||
@dataclass
class Wordform:
    """One inflected surface form belonging to a lemma."""

    id: str
    # Id of the owning Lemma.
    lemma_id: str
    # The inflected surface text.
    form: str
    # Grammatical tags carried over from the source data.
    tags: list[str]
|
||||
|
||||
|
||||
@dataclass
class Sense:
    """One numbered meaning (gloss) of a lemma."""

    id: str
    # Id of the owning Lemma.
    lemma_id: str
    # Position of this sense within the lemma's sense list.
    sense_index: int
    # The definition text (may be empty).
    gloss: str
    # Topic labels from the source data.
    topics: list[str]
    # Grammatical/usage tags from the source data.
    tags: list[str]
|
||||
|
||||
|
||||
@dataclass
class Lemma:
    """A dictionary headword together with its senses and inflected forms."""

    id: str
    headword: str
    # Two-letter language code.
    language: str
    # Part of speech exactly as given by the source data.
    pos_raw: str
    # Normalised POS tag, when the raw value could be mapped.
    pos_normalised: str | None
    gender: str | None
    tags: list[str]
    # Child collections default to empty; populated by repository queries.
    senses: list[Sense] = field(default_factory=list)
    wordforms: list[Wordform] = field(default_factory=list)
|
||||
63
api/app/outbound/postgres/entities/dictionary_entities.py
Normal file
63
api/app/outbound/postgres/entities/dictionary_entities.py
Normal file
|
|
@ -0,0 +1,63 @@
|
|||
import uuid
|
||||
|
||||
from sqlalchemy import String, Text, ForeignKey, Integer
|
||||
from sqlalchemy.orm import Mapped, mapped_column
|
||||
from sqlalchemy.dialects.postgresql import UUID, ARRAY, JSONB
|
||||
|
||||
from ..database import Base
|
||||
|
||||
|
||||
class DictionaryLemmaEntity(Base):
    """ORM row for one dictionary entry (headword + part of speech)."""

    __tablename__ = "dictionary_lemma"

    id: Mapped[uuid.UUID] = mapped_column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)
    headword: Mapped[str] = mapped_column(Text, nullable=False)
    # Two-letter language code.
    language: Mapped[str] = mapped_column(String(2), nullable=False, index=True)
    # Part of speech exactly as given by the source data.
    pos_raw: Mapped[str] = mapped_column(Text, nullable=False)
    # Normalised POS tag; None when the raw value could not be mapped.
    pos_normalised: Mapped[str | None] = mapped_column(Text, nullable=True)
    gender: Mapped[str | None] = mapped_column(Text, nullable=True)
    tags: Mapped[list[str]] = mapped_column(ARRAY(Text), nullable=False, server_default="{}")
|
||||
|
||||
|
||||
class DictionarySenseEntity(Base):
    """ORM row for one numbered sense (gloss) of a lemma."""

    __tablename__ = "dictionary_sense"

    id: Mapped[uuid.UUID] = mapped_column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)
    # Owning lemma; rows are removed with their lemma (ON DELETE CASCADE).
    lemma_id: Mapped[uuid.UUID] = mapped_column(
        UUID(as_uuid=True),
        ForeignKey("dictionary_lemma.id", ondelete="CASCADE"),
        nullable=False,
        index=True,
    )
    # Position of this sense within the lemma's sense list.
    sense_index: Mapped[int] = mapped_column(Integer, nullable=False)
    gloss: Mapped[str] = mapped_column(Text, nullable=False, server_default="")
    topics: Mapped[list[str]] = mapped_column(ARRAY(Text), nullable=False, server_default="{}")
    tags: Mapped[list[str]] = mapped_column(ARRAY(Text), nullable=False, server_default="{}")
|
||||
|
||||
|
||||
class DictionaryWordformEntity(Base):
    """ORM row for one inflected surface form of a lemma."""

    __tablename__ = "dictionary_wordform"

    id: Mapped[uuid.UUID] = mapped_column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)
    # Owning lemma; rows are removed with their lemma (ON DELETE CASCADE).
    lemma_id: Mapped[uuid.UUID] = mapped_column(
        UUID(as_uuid=True),
        ForeignKey("dictionary_lemma.id", ondelete="CASCADE"),
        nullable=False,
        index=True,
    )
    # Indexed to support looking a form up without knowing its lemma.
    form: Mapped[str] = mapped_column(Text, nullable=False, index=True)
    tags: Mapped[list[str]] = mapped_column(ARRAY(Text), nullable=False, server_default="{}")
|
||||
|
||||
|
||||
class DictionaryLemmaRawEntity(Base):
    """ORM row holding the original source JSON record for a lemma."""

    __tablename__ = "dictionary_lemma_raw"

    id: Mapped[uuid.UUID] = mapped_column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)
    # One raw record per lemma (unique), removed with it (ON DELETE CASCADE).
    lemma_id: Mapped[uuid.UUID] = mapped_column(
        UUID(as_uuid=True),
        ForeignKey("dictionary_lemma.id", ondelete="CASCADE"),
        nullable=False,
        unique=True,
    )
    # Two-letter language code, duplicated here for direct filtering.
    language: Mapped[str] = mapped_column(String(2), nullable=False)
    # The unmodified source record as JSONB.
    raw: Mapped[dict] = mapped_column(JSONB, nullable=False)
|
||||
|
|
@ -0,0 +1,80 @@
|
|||
import uuid
|
||||
from typing import Protocol
|
||||
|
||||
from sqlalchemy import select
|
||||
from sqlalchemy.ext.asyncio import AsyncSession
|
||||
|
||||
from ..entities.dictionary_entities import (
|
||||
DictionaryLemmaEntity,
|
||||
DictionarySenseEntity,
|
||||
DictionaryWordformEntity,
|
||||
)
|
||||
from ....domain.models.dictionary import Lemma, Sense, Wordform
|
||||
|
||||
|
||||
class DictionaryRepository(Protocol):
    """Structural (duck-typed) read-side port for dictionary lookups."""

    # Target-language headword -> its senses.
    async def get_senses_for_headword(self, headword: str, language: str) -> list[Sense]: ...
    # English gloss text -> matching senses in the target language.
    async def find_senses_by_english_gloss(self, text: str, target_lang: str) -> list[Sense]: ...
    # All stored inflected forms of one lemma.
    async def get_wordforms_for_lemma(self, lemma_id: uuid.UUID) -> list[Wordform]: ...
|
||||
|
||||
|
||||
def _sense_to_model(entity: DictionarySenseEntity) -> Sense:
    """Translate a persistence-layer sense row into the domain Sense model."""
    topics = entity.topics or []
    tags = entity.tags or []
    return Sense(
        id=str(entity.id),
        lemma_id=str(entity.lemma_id),
        sense_index=entity.sense_index,
        gloss=entity.gloss,
        topics=topics,
        tags=tags,
    )
|
||||
|
||||
|
||||
def _wordform_to_model(entity: DictionaryWordformEntity) -> Wordform:
    """Translate a persistence-layer wordform row into the domain Wordform model."""
    tags = entity.tags or []
    return Wordform(
        id=str(entity.id),
        lemma_id=str(entity.lemma_id),
        form=entity.form,
        tags=tags,
    )
|
||||
|
||||
|
||||
class PostgresDictionaryRepository:
    """Postgres-backed implementation of the DictionaryRepository port."""

    def __init__(self, db: AsyncSession) -> None:
        # The caller owns the session's lifecycle; this class only runs queries.
        self.db = db

    async def get_senses_for_headword(self, headword: str, language: str) -> list[Sense]:
        """Return every sense of *headword* in *language*, ordered by sense index."""
        query = (
            select(DictionarySenseEntity)
            .join(DictionaryLemmaEntity, DictionarySenseEntity.lemma_id == DictionaryLemmaEntity.id)
            .where(
                DictionaryLemmaEntity.headword == headword,
                DictionaryLemmaEntity.language == language,
            )
            .order_by(DictionarySenseEntity.sense_index)
        )
        result = await self.db.execute(query)
        return [_sense_to_model(row) for row in result.scalars().all()]

    async def find_senses_by_english_gloss(self, text: str, target_lang: str) -> list[Sense]:
        """EN→target direction: return senses whose gloss matches *text*.

        Matching is a case-insensitive exact comparison (ILIKE) on the gloss
        column, restricted to *target_lang* through the joined lemma row.
        """
        query = (
            select(DictionarySenseEntity)
            .join(DictionaryLemmaEntity, DictionarySenseEntity.lemma_id == DictionaryLemmaEntity.id)
            .where(
                DictionarySenseEntity.gloss.ilike(text),
                DictionaryLemmaEntity.language == target_lang,
            )
            .order_by(DictionarySenseEntity.sense_index)
        )
        result = await self.db.execute(query)
        return [_sense_to_model(row) for row in result.scalars().all()]

    async def get_wordforms_for_lemma(self, lemma_id: uuid.UUID) -> list[Wordform]:
        """Return all stored inflected forms of the lemma with *lemma_id*."""
        query = select(DictionaryWordformEntity).where(
            DictionaryWordformEntity.lemma_id == lemma_id
        )
        result = await self.db.execute(query)
        return [_wordform_to_model(row) for row in result.scalars().all()]
|
||||
322
api/scripts/import_dictionary.py
Normal file
322
api/scripts/import_dictionary.py
Normal file
|
|
@ -0,0 +1,322 @@
|
|||
#!/usr/bin/env python
|
||||
"""
|
||||
CLI import script for kaikki/wiktextract JSONL dictionary data.
|
||||
|
||||
Usage (from api/ directory):
|
||||
uv run ./scripts/import_dictionary.py --lang fr
|
||||
|
||||
# or via Make from the repo root:
|
||||
make import-dictionary lang=fr
|
||||
|
||||
DATABASE_URL defaults to postgresql+asyncpg://langlearn:changeme@localhost:5432/langlearn
|
||||
which matches the docker-compose dev credentials when the DB port is exposed on the host.
|
||||
"""
|
||||
|
||||
import argparse
import asyncio
import json
import os
import sys
import uuid
from pathlib import Path

import sqlalchemy as sa
from sqlalchemy.dialects.postgresql import ARRAY, JSONB
from sqlalchemy.dialects.postgresql import UUID as PG_UUID
from sqlalchemy.ext.asyncio import (
    AsyncConnection,
    AsyncSession,
    async_sessionmaker,
    create_async_engine,
)
|
||||
|
||||
# Resolve data paths relative to this script so it works from any CWD.
_API_DIR = Path(__file__).parent.parent
_REPO_ROOT = _API_DIR.parent
_DICT_DIR = _REPO_ROOT / "dictionaries" / "kaikki"

# Supported language codes mapped to their kaikki JSONL filenames in _DICT_DIR.
_LANG_FILE_MAP: dict[str, str] = {
    "fr": "french.jsonl",
}

# Raw kaikki part-of-speech strings -> normalised uppercase POS tags.
# Several raw spellings collapse to the same tag (e.g. "prep"/"adp" -> "ADP").
_POS_MAP: dict[str, str] = {
    "noun": "NOUN",
    "verb": "VERB",
    "adj": "ADJ",
    "adv": "ADV",
    "det": "DET",
    "article": "DET",
    "pron": "PRON",
    "prep": "ADP",
    "adp": "ADP",
    "conj": "CCONJ",
    "cconj": "CCONJ",
    "sconj": "SCONJ",
    "intj": "INTJ",
    "num": "NUM",
    "numeral": "NUM",
    "part": "PART",
    "particle": "PART",
    "name": "PROPN",
    "propn": "PROPN",
    "proper noun": "PROPN",
    "punct": "PUNCT",
    "sym": "SYM",
}

# Raw kaikki gender tags -> canonical gender labels (abbreviations expanded).
_GENDER_MAP: dict[str, str] = {
    "masculine": "masculine",
    "masc": "masculine",
    "feminine": "feminine",
    "fem": "feminine",
    "neuter": "neuter",
    "common": "common",
}
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Standalone table definitions — no app imports, no Settings() call
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
_meta = sa.MetaData()

# Mirrors the "dictionary_lemma" table created by the Alembic migration.
_lemma_table = sa.Table(
    "dictionary_lemma",
    _meta,
    sa.Column("id", PG_UUID(as_uuid=True), primary_key=True),
    sa.Column("headword", sa.Text(), nullable=False),
    sa.Column("language", sa.String(2), nullable=False),
    sa.Column("pos_raw", sa.Text(), nullable=False),
    sa.Column("pos_normalised", sa.Text(), nullable=True),
    sa.Column("gender", sa.Text(), nullable=True),
    sa.Column("tags", ARRAY(sa.Text()), nullable=False),
)

# Mirrors "dictionary_sense"; FK constraints are omitted — inserts only.
_sense_table = sa.Table(
    "dictionary_sense",
    _meta,
    sa.Column("id", PG_UUID(as_uuid=True), primary_key=True),
    sa.Column("lemma_id", PG_UUID(as_uuid=True), nullable=False),
    sa.Column("sense_index", sa.Integer(), nullable=False),
    sa.Column("gloss", sa.Text(), nullable=False),
    sa.Column("topics", ARRAY(sa.Text()), nullable=False),
    sa.Column("tags", ARRAY(sa.Text()), nullable=False),
)

# Mirrors "dictionary_wordform".
_wordform_table = sa.Table(
    "dictionary_wordform",
    _meta,
    sa.Column("id", PG_UUID(as_uuid=True), primary_key=True),
    sa.Column("lemma_id", PG_UUID(as_uuid=True), nullable=False),
    sa.Column("form", sa.Text(), nullable=False),
    sa.Column("tags", ARRAY(sa.Text()), nullable=False),
)

# Mirrors "dictionary_lemma_raw".
_raw_table = sa.Table(
    "dictionary_lemma_raw",
    _meta,
    sa.Column("id", PG_UUID(as_uuid=True), primary_key=True),
    sa.Column("lemma_id", PG_UUID(as_uuid=True), nullable=False),
    sa.Column("language", sa.String(2), nullable=False),
    sa.Column("raw", JSONB(), nullable=False),
)
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Normalisation helpers
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def _normalise_pos(pos_raw: str) -> str | None:
    """Map a raw kaikki POS string to its normalised tag, or None if unknown."""
    key = pos_raw.strip().lower()
    return _POS_MAP.get(key)
|
||||
|
||||
|
||||
def _normalise_gender(tags: list) -> str | None:
    """Return the canonical gender for the first recognised tag, or None."""
    return next((_GENDER_MAP[tag] for tag in tags if tag in _GENDER_MAP), None)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Parsing
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def _parse_entry(record: dict, lang_code: str) -> dict | None:
    """Parse one kaikki JSONL record into insertion-ready row dicts.

    Returns a dict with "lemma", "senses", "wordforms" and "raw" row data,
    or None when the record belongs to another language or has no headword.
    """
    if record.get("lang_code") != lang_code:
        return None

    word = (record.get("word") or "").strip()
    if not word:
        return None

    pos_raw = (record.get("pos") or "").strip()
    top_tags = record.get("tags") or []
    lemma_id = uuid.uuid4()

    # One sense row per gloss entry, numbered in source order.
    senses = [
        {
            "id": uuid.uuid4(),
            "lemma_id": lemma_id,
            "sense_index": index,
            "gloss": (sense_record.get("glosses") or [""])[0],
            "topics": sense_record.get("topics") or [],
            "tags": sense_record.get("tags") or [],
        }
        for index, sense_record in enumerate(record.get("senses") or [])
    ]

    wordforms = []
    for form_record in record.get("forms") or []:
        surface = (form_record.get("form") or "").strip()
        # Skip empty forms and the headword itself.
        if not surface or surface == word:
            continue
        wordforms.append(
            {
                "id": uuid.uuid4(),
                "lemma_id": lemma_id,
                "form": surface,
                "tags": form_record.get("tags") or [],
            }
        )

    return {
        "lemma": {
            "id": lemma_id,
            "headword": word,
            "language": lang_code,
            "pos_raw": pos_raw,
            "pos_normalised": _normalise_pos(pos_raw),
            "gender": _normalise_gender(top_tags),
            "tags": top_tags,
        },
        "senses": senses,
        "wordforms": wordforms,
        "raw": {
            "id": uuid.uuid4(),
            "lemma_id": lemma_id,
            "language": lang_code,
            "raw": record,
        },
    }
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# DB operations
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
async def _flush_batch(conn: AsyncConnection, batch: list[dict]) -> None:
    """Bulk-insert one batch of parsed entries and commit.

    Issues one executemany-style insert per table (lemmas, senses, wordforms,
    raw records), then commits so progress is durable between batches.

    Note: the annotation previously used `sa.ext.asyncio.AsyncConnection`,
    which only resolved because the module's `from sqlalchemy.ext.asyncio
    import ...` happens to register the submodule on `sa` as a side effect;
    the explicitly imported `AsyncConnection` removes that fragility.
    """
    lemma_rows = [entry["lemma"] for entry in batch]
    sense_rows = [s for entry in batch for s in entry["senses"]]
    wordform_rows = [w for entry in batch for w in entry["wordforms"]]
    raw_rows = [entry["raw"] for entry in batch]

    if lemma_rows:
        await conn.execute(_lemma_table.insert(), lemma_rows)
    if sense_rows:
        await conn.execute(_sense_table.insert(), sense_rows)
    if wordform_rows:
        await conn.execute(_wordform_table.insert(), wordform_rows)
    if raw_rows:
        await conn.execute(_raw_table.insert(), raw_rows)

    await conn.commit()
|
||||
|
||||
|
||||
async def run_import(lang_code: str, batch_size: int = 1000) -> None:
    """Stream a kaikki JSONL file for *lang_code* into Postgres in batches.

    First deletes any previously imported lemmas for the language (child
    rows follow via the ON DELETE CASCADE foreign keys), making the import
    idempotent. Exits the process with status 1 when the language has no
    file mapping or the JSONL file is missing.
    """
    lang_file = _LANG_FILE_MAP.get(lang_code)
    if not lang_file:
        print(
            f"No file mapping for lang_code={lang_code!r}. Known: {list(_LANG_FILE_MAP)}",
            file=sys.stderr,
        )
        sys.exit(1)

    jsonl_path = _DICT_DIR / lang_file
    if not jsonl_path.exists():
        print(f"JSONL file not found: {jsonl_path}", file=sys.stderr)
        sys.exit(1)

    # Default matches the docker-compose dev credentials with the DB port
    # exposed on localhost.
    database_url = os.environ.get(
        "DATABASE_URL",
        "postgresql+asyncpg://langlearn:changeme@localhost:5432/langlearn",
    )

    engine = create_async_engine(database_url, echo=False)

    try:
        async with engine.connect() as conn:
            # Clear any earlier import of this language before re-importing.
            print(f"Deleting existing entries for language={lang_code!r}...")
            await conn.execute(
                _lemma_table.delete().where(_lemma_table.c.language == lang_code)
            )
            await conn.commit()

            print(f"Importing {jsonl_path} ...")
            batch: list[dict] = []
            total_lemmas = 0
            skipped = 0

            with open(jsonl_path, encoding="utf-8") as f:
                for line_num, line in enumerate(f, 1):
                    line = line.strip()
                    if not line:
                        # Blank lines are silently ignored (not counted as skipped).
                        continue

                    try:
                        record = json.loads(line)
                    except json.JSONDecodeError as exc:
                        # Malformed lines are reported and skipped, never fatal.
                        print(
                            f"  Line {line_num}: JSON parse error: {exc}",
                            file=sys.stderr,
                        )
                        skipped += 1
                        continue

                    parsed = _parse_entry(record, lang_code)
                    if parsed is None:
                        skipped += 1
                        continue

                    batch.append(parsed)

                    # Flush periodically to bound memory and persist progress.
                    if len(batch) >= batch_size:
                        await _flush_batch(conn, batch)
                        total_lemmas += len(batch)
                        print(f"  Committed {total_lemmas} lemmas...")
                        batch = []

            # Flush the final partial batch, if any.
            if batch:
                await _flush_batch(conn, batch)
                total_lemmas += len(batch)

            print(f"Done. Imported {total_lemmas} lemmas, skipped {skipped} lines.")
    finally:
        await engine.dispose()
|
||||
|
||||
|
||||
def main() -> None:
    """Parse CLI arguments and kick off the async import."""
    parser = argparse.ArgumentParser(description="Import kaikki dictionary JSONL into Postgres.")
    parser.add_argument("--lang", required=True, help="Language code to import (e.g. fr)")
    parser.add_argument("--batch-size", type=int, default=1000, help="Rows per commit (default: 1000)")
    args = parser.parse_args()

    asyncio.run(run_import(args.lang, args.batch_size))


if __name__ == "__main__":
    main()
|
||||
1
dictionaries/.gitignore
vendored
Normal file
1
dictionaries/.gitignore
vendored
Normal file
|
|
@ -0,0 +1 @@
|
|||
*.jsonl
|
||||
3
dictionaries/README.md
Normal file
3
dictionaries/README.md
Normal file
|
|
@ -0,0 +1,3 @@
|
|||
# Dictionaries
|
||||
|
||||
This module contains dictionaries of words, namely from the [Kaikki](https://kaikki.org/dictionary/index.html) project. It is responsible for generating lexical information about words, for both the system and the user, to help describe the language they are using.
|
||||
Loading…
Reference in a new issue