feat: Build the bilingual dictionary data

This commit is contained in:
wilson 2026-04-08 20:26:26 +01:00
parent 2cae5d9445
commit 873ebacd4d
9 changed files with 599 additions and 1 deletions

View file

@ -1,4 +1,4 @@
.PHONY: down build up logs shell lock migrate migration .PHONY: down build up logs shell lock migrate migration import-dictionary
build: build:
docker compose build docker compose build
@ -28,3 +28,10 @@ lock:
cd api && uv pip compile pyproject.toml -o requirements.txt cd api && uv pip compile pyproject.toml -o requirements.txt
rebuild: down build up rebuild: down build up
# Import a kaikki dictionary JSONL into Postgres.
# Requires the DB to be running with its port exposed on localhost (docker compose up).
# DATABASE_URL defaults to the docker-compose dev credentials.
# Usage: make import-dictionary lang=fr
import-dictionary:
cd api && python scripts/import_dictionary.py --lang $(lang)

View file

@ -10,6 +10,7 @@ from app.outbound.postgres.database import Base
import app.outbound.postgres.entities.summarise_job_entity import app.outbound.postgres.entities.summarise_job_entity
import app.outbound.postgres.entities.user_entity import app.outbound.postgres.entities.user_entity
import app.outbound.postgres.entities.dictionary_entities
config = context.config config = context.config
config.set_main_option("sqlalchemy.url", settings.database_url) config.set_main_option("sqlalchemy.url", settings.database_url)

View file

@ -0,0 +1,89 @@
"""add dictionary tables
Revision ID: 0007
Revises: 0006
Create Date: 2026-04-07
"""
from typing import Sequence, Union
import sqlalchemy as sa
from alembic import op
from sqlalchemy.dialects import postgresql
# Alembic revision identifiers used to wire this migration into the graph.
revision: str = "0007"
down_revision: Union[str, None] = "0006"
branch_labels: Union[str, Sequence[str], None] = None
depends_on: Union[str, Sequence[str], None] = None
def upgrade() -> None:
    """Create the four dictionary tables and their lookup indexes.

    Order matters: dictionary_lemma is created first because the other
    three tables carry an ON DELETE CASCADE foreign key onto it.
    """
    op.create_table(
        "dictionary_lemma",
        sa.Column("id", postgresql.UUID(as_uuid=True), primary_key=True),
        sa.Column("headword", sa.Text(), nullable=False),
        sa.Column("language", sa.String(2), nullable=False),
        sa.Column("pos_raw", sa.Text(), nullable=False),
        sa.Column("pos_normalised", sa.Text(), nullable=True),
        sa.Column("gender", sa.Text(), nullable=True),
        sa.Column("tags", postgresql.ARRAY(sa.Text()), nullable=False, server_default="{}"),
    )
    # Composite index serves exact (headword, language) lookups.
    op.create_index("ix_dictionary_lemma_headword_language", "dictionary_lemma", ["headword", "language"])
    op.create_index("ix_dictionary_lemma_language", "dictionary_lemma", ["language"])
    op.create_table(
        "dictionary_sense",
        sa.Column("id", postgresql.UUID(as_uuid=True), primary_key=True),
        sa.Column(
            "lemma_id",
            postgresql.UUID(as_uuid=True),
            sa.ForeignKey("dictionary_lemma.id", ondelete="CASCADE"),
            nullable=False,
        ),
        sa.Column("sense_index", sa.Integer(), nullable=False),
        sa.Column("gloss", sa.Text(), nullable=False, server_default=""),
        sa.Column("topics", postgresql.ARRAY(sa.Text()), nullable=False, server_default="{}"),
        sa.Column("tags", postgresql.ARRAY(sa.Text()), nullable=False, server_default="{}"),
    )
    op.create_index("ix_dictionary_sense_lemma_id", "dictionary_sense", ["lemma_id"])
    op.create_table(
        "dictionary_wordform",
        sa.Column("id", postgresql.UUID(as_uuid=True), primary_key=True),
        sa.Column(
            "lemma_id",
            postgresql.UUID(as_uuid=True),
            sa.ForeignKey("dictionary_lemma.id", ondelete="CASCADE"),
            nullable=False,
        ),
        sa.Column("form", sa.Text(), nullable=False),
        sa.Column("tags", postgresql.ARRAY(sa.Text()), nullable=False, server_default="{}"),
    )
    op.create_index("ix_dictionary_wordform_lemma_id", "dictionary_wordform", ["lemma_id"])
    # Forms are looked up directly (inflection -> lemma), so index the form text.
    op.create_index("ix_dictionary_wordform_form", "dictionary_wordform", ["form"])
    op.create_table(
        "dictionary_lemma_raw",
        sa.Column("id", postgresql.UUID(as_uuid=True), primary_key=True),
        sa.Column(
            "lemma_id",
            postgresql.UUID(as_uuid=True),
            sa.ForeignKey("dictionary_lemma.id", ondelete="CASCADE"),
            nullable=False,
            unique=True,  # at most one raw record per lemma
        ),
        sa.Column("language", sa.String(2), nullable=False),
        sa.Column("raw", postgresql.JSONB(), nullable=False),
    )
def downgrade() -> None:
    """Drop the dictionary tables in reverse dependency order (children first)."""
    op.drop_table("dictionary_lemma_raw")
    op.drop_index("ix_dictionary_wordform_form", table_name="dictionary_wordform")
    op.drop_index("ix_dictionary_wordform_lemma_id", table_name="dictionary_wordform")
    op.drop_table("dictionary_wordform")
    op.drop_index("ix_dictionary_sense_lemma_id", table_name="dictionary_sense")
    op.drop_table("dictionary_sense")
    op.drop_index("ix_dictionary_lemma_language", table_name="dictionary_lemma")
    op.drop_index("ix_dictionary_lemma_headword_language", table_name="dictionary_lemma")
    op.drop_table("dictionary_lemma")

View file

@ -0,0 +1,32 @@
from dataclasses import dataclass
@dataclass
class Wordform:
id: str
lemma_id: str
form: str
tags: list[str]
@dataclass
class Sense:
id: str
lemma_id: str
sense_index: int
gloss: str
topics: list[str]
tags: list[str]
@dataclass
class Lemma:
id: str
headword: str
language: str
pos_raw: str
pos_normalised: str | None
gender: str | None
tags: list[str]
senses: list[Sense] = field(default_factory=list)
wordforms: list[Wordform] = field(default_factory=list)

View file

@ -0,0 +1,63 @@
import uuid
from sqlalchemy import String, Text, ForeignKey, Integer
from sqlalchemy.orm import Mapped, mapped_column
from sqlalchemy.dialects.postgresql import UUID, ARRAY, JSONB
from ..database import Base
class DictionaryLemmaEntity(Base):
    """ORM row for a dictionary headword (one per word/POS entry)."""

    __tablename__ = "dictionary_lemma"

    id: Mapped[uuid.UUID] = mapped_column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)
    headword: Mapped[str] = mapped_column(Text, nullable=False)
    # ISO 639-1 two-letter code — column is String(2); confirm against importer input.
    language: Mapped[str] = mapped_column(String(2), nullable=False, index=True)
    pos_raw: Mapped[str] = mapped_column(Text, nullable=False)
    # Normalised POS tag; NULL when the raw value could not be mapped.
    pos_normalised: Mapped[str | None] = mapped_column(Text, nullable=True)
    gender: Mapped[str | None] = mapped_column(Text, nullable=True)
    tags: Mapped[list[str]] = mapped_column(ARRAY(Text), nullable=False, server_default="{}")
class DictionarySenseEntity(Base):
    """ORM row for one sense (meaning) of a lemma."""

    __tablename__ = "dictionary_sense"

    id: Mapped[uuid.UUID] = mapped_column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)
    # Deleting the lemma cascades to its senses.
    lemma_id: Mapped[uuid.UUID] = mapped_column(
        UUID(as_uuid=True),
        ForeignKey("dictionary_lemma.id", ondelete="CASCADE"),
        nullable=False,
        index=True,
    )
    # Position of the sense within the lemma's sense list.
    sense_index: Mapped[int] = mapped_column(Integer, nullable=False)
    gloss: Mapped[str] = mapped_column(Text, nullable=False, server_default="")
    topics: Mapped[list[str]] = mapped_column(ARRAY(Text), nullable=False, server_default="{}")
    tags: Mapped[list[str]] = mapped_column(ARRAY(Text), nullable=False, server_default="{}")
class DictionaryWordformEntity(Base):
    """ORM row for an inflected form belonging to a lemma."""

    __tablename__ = "dictionary_wordform"

    id: Mapped[uuid.UUID] = mapped_column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)
    lemma_id: Mapped[uuid.UUID] = mapped_column(
        UUID(as_uuid=True),
        ForeignKey("dictionary_lemma.id", ondelete="CASCADE"),
        nullable=False,
        index=True,
    )
    # Indexed: forms are looked up directly (inflection -> lemma).
    form: Mapped[str] = mapped_column(Text, nullable=False, index=True)
    tags: Mapped[list[str]] = mapped_column(ARRAY(Text), nullable=False, server_default="{}")
class DictionaryLemmaRawEntity(Base):
    """ORM row holding the full original source record for a lemma as JSONB."""

    __tablename__ = "dictionary_lemma_raw"

    id: Mapped[uuid.UUID] = mapped_column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)
    # unique=True: at most one raw record per lemma.
    lemma_id: Mapped[uuid.UUID] = mapped_column(
        UUID(as_uuid=True),
        ForeignKey("dictionary_lemma.id", ondelete="CASCADE"),
        nullable=False,
        unique=True,
    )
    language: Mapped[str] = mapped_column(String(2), nullable=False)
    # Unmodified upstream record, kept for reprocessing/debugging.
    raw: Mapped[dict] = mapped_column(JSONB, nullable=False)

View file

@ -0,0 +1,80 @@
import uuid
from typing import Protocol
from sqlalchemy import select
from sqlalchemy.ext.asyncio import AsyncSession
from ..entities.dictionary_entities import (
DictionaryLemmaEntity,
DictionarySenseEntity,
DictionaryWordformEntity,
)
from ....domain.models.dictionary import Lemma, Sense, Wordform
class DictionaryRepository(Protocol):
    """Structural (duck-typed) read interface over the dictionary tables."""

    # Target-language word -> its senses (target -> EN direction).
    async def get_senses_for_headword(self, headword: str, language: str) -> list[Sense]: ...

    # English gloss text -> matching senses (EN -> target direction).
    async def find_senses_by_english_gloss(self, text: str, target_lang: str) -> list[Sense]: ...

    # All inflected forms recorded for a lemma.
    async def get_wordforms_for_lemma(self, lemma_id: uuid.UUID) -> list[Wordform]: ...
def _sense_to_model(entity: DictionarySenseEntity) -> Sense:
    """Map an ORM sense row onto the domain-level Sense model."""
    # Array columns may come back as None; normalise to empty lists.
    topics = entity.topics or []
    tags = entity.tags or []
    return Sense(
        id=str(entity.id),
        lemma_id=str(entity.lemma_id),
        sense_index=entity.sense_index,
        gloss=entity.gloss,
        topics=topics,
        tags=tags,
    )
def _wordform_to_model(entity: DictionaryWordformEntity) -> Wordform:
    """Map an ORM wordform row onto the domain-level Wordform model."""
    # Array column may come back as None; normalise to an empty list.
    tags = entity.tags or []
    return Wordform(
        id=str(entity.id),
        lemma_id=str(entity.lemma_id),
        form=entity.form,
        tags=tags,
    )
class PostgresDictionaryRepository:
    """Postgres implementation of DictionaryRepository over an AsyncSession."""

    def __init__(self, db: AsyncSession) -> None:
        self.db = db

    async def get_senses_for_headword(self, headword: str, language: str) -> list[Sense]:
        """Target→EN direction: senses for an exact headword in *language*,
        ordered by sense_index."""
        result = await self.db.execute(
            select(DictionarySenseEntity)
            .join(DictionaryLemmaEntity, DictionarySenseEntity.lemma_id == DictionaryLemmaEntity.id)
            .where(
                DictionaryLemmaEntity.headword == headword,
                DictionaryLemmaEntity.language == language,
            )
            .order_by(DictionarySenseEntity.sense_index)
        )
        return [_sense_to_model(e) for e in result.scalars().all()]

    async def find_senses_by_english_gloss(self, text: str, target_lang: str) -> list[Sense]:
        """EN→target direction: find senses whose gloss matches the given English text.

        Uses a case-insensitive exact match on the gloss column, filtered to the
        target language via the joined lemma row. LIKE wildcards appearing in
        *text* are escaped so they match literally.
        """
        # ILIKE treats % and _ as wildcards; previously raw user text was
        # passed through, so e.g. "a%b" matched far more than intended.
        # Escape the escape character itself first, then the wildcards.
        pattern = (
            text.replace("\\", "\\\\")
            .replace("%", "\\%")
            .replace("_", "\\_")
        )
        result = await self.db.execute(
            select(DictionarySenseEntity)
            .join(DictionaryLemmaEntity, DictionarySenseEntity.lemma_id == DictionaryLemmaEntity.id)
            .where(
                DictionarySenseEntity.gloss.ilike(pattern, escape="\\"),
                DictionaryLemmaEntity.language == target_lang,
            )
            .order_by(DictionarySenseEntity.sense_index)
        )
        return [_sense_to_model(e) for e in result.scalars().all()]

    async def get_wordforms_for_lemma(self, lemma_id: uuid.UUID) -> list[Wordform]:
        """All inflected forms stored for the given lemma id (unordered)."""
        result = await self.db.execute(
            select(DictionaryWordformEntity).where(
                DictionaryWordformEntity.lemma_id == lemma_id
            )
        )
        return [_wordform_to_model(e) for e in result.scalars().all()]

View file

@ -0,0 +1,322 @@
#!/usr/bin/env python
"""
CLI import script for kaikki/wiktextract JSONL dictionary data.
Usage (from api/ directory):
uv run ./scripts/import_dictionary.py --lang fr
# or via Make from the repo root:
make import-dictionary lang=fr
DATABASE_URL defaults to postgresql+asyncpg://langlearn:langlearn@localhost:5432/langlearn
which matches the docker-compose dev credentials when the DB port is exposed on the host.
"""
import argparse
import asyncio
import json
import os
import sys
import uuid
from pathlib import Path

import sqlalchemy as sa
from sqlalchemy.dialects.postgresql import ARRAY, JSONB
from sqlalchemy.dialects.postgresql import UUID as PG_UUID
from sqlalchemy.ext.asyncio import (
    AsyncConnection,
    AsyncSession,
    async_sessionmaker,
    create_async_engine,
)
# Resolve paths relative to this script: api/scripts/ -> api/ -> repo root.
_API_DIR = Path(__file__).parent.parent
_REPO_ROOT = _API_DIR.parent
_DICT_DIR = _REPO_ROOT / "dictionaries" / "kaikki"

# Language code -> JSONL filename under dictionaries/kaikki/.
_LANG_FILE_MAP: dict[str, str] = {
    "fr": "french.jsonl",
}

# Lower-cased raw kaikki POS label -> normalised POS tag.
# NOTE(review): the targets look like Universal Dependencies UPOS tags — confirm.
_POS_MAP: dict[str, str] = {
    "noun": "NOUN",
    "verb": "VERB",
    "adj": "ADJ",
    "adv": "ADV",
    "det": "DET",
    "article": "DET",
    "pron": "PRON",
    "prep": "ADP",
    "adp": "ADP",
    "conj": "CCONJ",
    "cconj": "CCONJ",
    "sconj": "SCONJ",
    "intj": "INTJ",
    "num": "NUM",
    "numeral": "NUM",
    "part": "PART",
    "particle": "PART",
    "name": "PROPN",
    "propn": "PROPN",
    "proper noun": "PROPN",
    "punct": "PUNCT",
    "sym": "SYM",
}

# Gender tag spelling variants -> canonical gender value stored on the lemma.
_GENDER_MAP: dict[str, str] = {
    "masculine": "masculine",
    "masc": "masculine",
    "feminine": "feminine",
    "fem": "feminine",
    "neuter": "neuter",
    "common": "common",
}
# ---------------------------------------------------------------------------
# Standalone table definitions — no app imports, no Settings() call
# ---------------------------------------------------------------------------
# These Core tables mirror the columns created by migration 0007 so the
# script can insert rows without importing the app's ORM entities.
_meta = sa.MetaData()

_lemma_table = sa.Table(
    "dictionary_lemma",
    _meta,
    sa.Column("id", PG_UUID(as_uuid=True), primary_key=True),
    sa.Column("headword", sa.Text(), nullable=False),
    sa.Column("language", sa.String(2), nullable=False),
    sa.Column("pos_raw", sa.Text(), nullable=False),
    sa.Column("pos_normalised", sa.Text(), nullable=True),
    sa.Column("gender", sa.Text(), nullable=True),
    sa.Column("tags", ARRAY(sa.Text()), nullable=False),
)

_sense_table = sa.Table(
    "dictionary_sense",
    _meta,
    sa.Column("id", PG_UUID(as_uuid=True), primary_key=True),
    sa.Column("lemma_id", PG_UUID(as_uuid=True), nullable=False),
    sa.Column("sense_index", sa.Integer(), nullable=False),
    sa.Column("gloss", sa.Text(), nullable=False),
    sa.Column("topics", ARRAY(sa.Text()), nullable=False),
    sa.Column("tags", ARRAY(sa.Text()), nullable=False),
)

_wordform_table = sa.Table(
    "dictionary_wordform",
    _meta,
    sa.Column("id", PG_UUID(as_uuid=True), primary_key=True),
    sa.Column("lemma_id", PG_UUID(as_uuid=True), nullable=False),
    sa.Column("form", sa.Text(), nullable=False),
    sa.Column("tags", ARRAY(sa.Text()), nullable=False),
)

_raw_table = sa.Table(
    "dictionary_lemma_raw",
    _meta,
    sa.Column("id", PG_UUID(as_uuid=True), primary_key=True),
    sa.Column("lemma_id", PG_UUID(as_uuid=True), nullable=False),
    sa.Column("language", sa.String(2), nullable=False),
    sa.Column("raw", JSONB(), nullable=False),
)
# ---------------------------------------------------------------------------
# Normalisation helpers
# ---------------------------------------------------------------------------
def _normalise_pos(pos_raw: str) -> str | None:
    """Return the normalised POS tag for a raw kaikki label, or None if unmapped."""
    key = pos_raw.strip().lower()
    return _POS_MAP.get(key)
def _normalise_gender(tags: list) -> str | None:
    """Return the canonical gender for the first recognised tag, else None."""
    return next(
        (_GENDER_MAP[tag] for tag in tags if tag in _GENDER_MAP),
        None,
    )
# ---------------------------------------------------------------------------
# Parsing
# ---------------------------------------------------------------------------
def _parse_entry(record: dict, lang_code: str) -> dict | None:
    """Parse one kaikki JSONL record into insertion-ready row dicts.

    Returns None if the entry should be skipped (wrong language, or no
    headword). The returned dict carries "lemma", "senses", "wordforms" and
    "raw" keys whose values match the importer's table columns.
    """
    if record.get("lang_code") != lang_code:
        return None

    headword = (record.get("word") or "").strip()
    if not headword:
        return None

    pos_raw = (record.get("pos") or "").strip()
    lemma_tags = record.get("tags") or []
    lemma_id = uuid.uuid4()

    # One row per sense, preserving the record's sense order; only the first
    # gloss string is kept (empty string when the sense has none).
    sense_rows = [
        {
            "id": uuid.uuid4(),
            "lemma_id": lemma_id,
            "sense_index": idx,
            "gloss": (sense.get("glosses") or [""])[0],
            "topics": sense.get("topics") or [],
            "tags": sense.get("tags") or [],
        }
        for idx, sense in enumerate(record.get("senses") or [])
    ]

    # Inflected forms; blank forms and the headword itself are skipped.
    wordform_rows = []
    for form_rec in record.get("forms") or []:
        form_text = (form_rec.get("form") or "").strip()
        if not form_text or form_text == headword:
            continue
        wordform_rows.append(
            {
                "id": uuid.uuid4(),
                "lemma_id": lemma_id,
                "form": form_text,
                "tags": form_rec.get("tags") or [],
            }
        )

    return {
        "lemma": {
            "id": lemma_id,
            "headword": headword,
            "language": lang_code,
            "pos_raw": pos_raw,
            "pos_normalised": _normalise_pos(pos_raw),
            "gender": _normalise_gender(lemma_tags),
            "tags": lemma_tags,
        },
        "senses": sense_rows,
        "wordforms": wordform_rows,
        # Full upstream record preserved for reprocessing/debugging.
        "raw": {
            "id": uuid.uuid4(),
            "lemma_id": lemma_id,
            "language": lang_code,
            "raw": record,
        },
    }
# ---------------------------------------------------------------------------
# DB operations
# ---------------------------------------------------------------------------
async def _flush_batch(conn: AsyncConnection, batch: list[dict]) -> None:
    """Bulk-insert one batch of parsed entries and commit.

    Lemmas are inserted before senses/wordforms/raw rows so the child
    foreign keys resolve. The original annotation `sa.ext.asyncio.AsyncConnection`
    only resolved because a sibling `from sqlalchemy.ext.asyncio import ...`
    happened to bind the submodule; AsyncConnection is now imported explicitly.
    """
    lemma_rows = [e["lemma"] for e in batch]
    sense_rows = [s for e in batch for s in e["senses"]]
    wordform_rows = [w for e in batch for w in e["wordforms"]]
    raw_rows = [e["raw"] for e in batch]
    # executemany-style inserts; skip empty lists to avoid zero-row statements.
    if lemma_rows:
        await conn.execute(_lemma_table.insert(), lemma_rows)
    if sense_rows:
        await conn.execute(_sense_table.insert(), sense_rows)
    if wordform_rows:
        await conn.execute(_wordform_table.insert(), wordform_rows)
    if raw_rows:
        await conn.execute(_raw_table.insert(), raw_rows)
    await conn.commit()
async def run_import(lang_code: str, batch_size: int = 1000) -> None:
    """Stream a kaikki JSONL file into Postgres in committed batches.

    Existing rows for *lang_code* are deleted first (child tables are cleared
    via ON DELETE CASCADE), then lemma/sense/wordform/raw rows are inserted
    `batch_size` lemmas at a time. Exits with status 1 on an unknown language
    code or missing file; malformed JSON lines are counted and skipped.
    """
    lang_file = _LANG_FILE_MAP.get(lang_code)
    if not lang_file:
        print(
            f"No file mapping for lang_code={lang_code!r}. Known: {list(_LANG_FILE_MAP)}",
            file=sys.stderr,
        )
        sys.exit(1)

    jsonl_path = _DICT_DIR / lang_file
    if not jsonl_path.exists():
        print(f"JSONL file not found: {jsonl_path}", file=sys.stderr)
        sys.exit(1)

    # Default matches the docker-compose dev credentials, as documented in the
    # module docstring and the Makefile; the previous fallback used a
    # non-matching "changeme" password, so running without DATABASE_URL failed.
    database_url = os.environ.get(
        "DATABASE_URL",
        "postgresql+asyncpg://langlearn:langlearn@localhost:5432/langlearn",
    )
    engine = create_async_engine(database_url, echo=False)
    try:
        async with engine.connect() as conn:
            print(f"Deleting existing entries for language={lang_code!r}...")
            await conn.execute(
                _lemma_table.delete().where(_lemma_table.c.language == lang_code)
            )
            await conn.commit()

            print(f"Importing {jsonl_path} ...")
            batch: list[dict] = []
            total_lemmas = 0
            skipped = 0
            with open(jsonl_path, encoding="utf-8") as f:
                for line_num, line in enumerate(f, 1):
                    line = line.strip()
                    if not line:
                        continue
                    try:
                        record = json.loads(line)
                    except json.JSONDecodeError as exc:
                        print(
                            f"  Line {line_num}: JSON parse error: {exc}",
                            file=sys.stderr,
                        )
                        skipped += 1
                        continue
                    parsed = _parse_entry(record, lang_code)
                    if parsed is None:
                        # Other-language entries and headword-less records.
                        skipped += 1
                        continue
                    batch.append(parsed)
                    if len(batch) >= batch_size:
                        await _flush_batch(conn, batch)
                        total_lemmas += len(batch)
                        print(f"  Committed {total_lemmas} lemmas...")
                        batch = []
            # Flush the final partial batch.
            if batch:
                await _flush_batch(conn, batch)
                total_lemmas += len(batch)
            print(f"Done. Imported {total_lemmas} lemmas, skipped {skipped} lines.")
    finally:
        await engine.dispose()
def main() -> None:
    """CLI entry point: parse arguments and run the async import."""
    cli = argparse.ArgumentParser(
        description="Import kaikki dictionary JSONL into Postgres."
    )
    cli.add_argument(
        "--lang", required=True, help="Language code to import (e.g. fr)"
    )
    cli.add_argument(
        "--batch-size", type=int, default=1000, help="Rows per commit (default: 1000)"
    )
    options = cli.parse_args()
    asyncio.run(run_import(options.lang, options.batch_size))


if __name__ == "__main__":
    main()

1
dictionaries/.gitignore vendored Normal file
View file

@ -0,0 +1 @@
*.jsonl

3
dictionaries/README.md Normal file
View file

@ -0,0 +1,3 @@
# Dictionaries
This module contains dictionaries of words, namely from the [Kaikki](https://kaikki.org/dictionary/index.html) project. It is responsible for generating lexical information about words, for both the system and the user, to help describe the language they are using.