feat: Build the bilingual dictionary data

This commit is contained in:
wilson 2026-04-08 20:26:26 +01:00
parent 2cae5d9445
commit 873ebacd4d
9 changed files with 599 additions and 1 deletions

View file

@ -1,4 +1,4 @@
.PHONY: down build up logs shell lock migrate migration .PHONY: down build up logs shell lock migrate migration import-dictionary
build: build:
docker compose build docker compose build
@ -28,3 +28,10 @@ lock:
cd api && uv pip compile pyproject.toml -o requirements.txt cd api && uv pip compile pyproject.toml -o requirements.txt
rebuild: down build up rebuild: down build up
# Import a kaikki dictionary JSONL into Postgres.
# Requires the DB to be running with its port exposed on localhost (docker compose up).
# DATABASE_URL defaults to the docker-compose dev credentials.
# Usage: make import-dictionary lang=fr
import-dictionary:
cd api && python scripts/import_dictionary.py --lang $(lang)

View file

@ -10,6 +10,7 @@ from app.outbound.postgres.database import Base
import app.outbound.postgres.entities.summarise_job_entity import app.outbound.postgres.entities.summarise_job_entity
import app.outbound.postgres.entities.user_entity import app.outbound.postgres.entities.user_entity
import app.outbound.postgres.entities.dictionary_entities
config = context.config config = context.config
config.set_main_option("sqlalchemy.url", settings.database_url) config.set_main_option("sqlalchemy.url", settings.database_url)

View file

@ -0,0 +1,89 @@
"""add dictionary tables
Revision ID: 0007
Revises: 0006
Create Date: 2026-04-07
"""
from typing import Sequence, Union
import sqlalchemy as sa
from alembic import op
from sqlalchemy.dialects import postgresql
# Alembic revision identifiers used to wire this migration into the graph.
revision: str = "0007"
down_revision: Union[str, None] = "0006"
branch_labels: Union[str, Sequence[str], None] = None
depends_on: Union[str, Sequence[str], None] = None
def upgrade() -> None:
    """Create the four dictionary tables and their lookup indexes.

    Order matters: dictionary_lemma is created first because the other
    three tables carry an ON DELETE CASCADE foreign key onto it.
    """
    op.create_table(
        "dictionary_lemma",
        sa.Column("id", postgresql.UUID(as_uuid=True), primary_key=True),
        sa.Column("headword", sa.Text(), nullable=False),
        sa.Column("language", sa.String(2), nullable=False),
        sa.Column("pos_raw", sa.Text(), nullable=False),
        sa.Column("pos_normalised", sa.Text(), nullable=True),
        sa.Column("gender", sa.Text(), nullable=True),
        sa.Column("tags", postgresql.ARRAY(sa.Text()), nullable=False, server_default="{}"),
    )
    # Composite index serves exact (headword, language) lookups.
    op.create_index("ix_dictionary_lemma_headword_language", "dictionary_lemma", ["headword", "language"])
    op.create_index("ix_dictionary_lemma_language", "dictionary_lemma", ["language"])
    op.create_table(
        "dictionary_sense",
        sa.Column("id", postgresql.UUID(as_uuid=True), primary_key=True),
        sa.Column(
            "lemma_id",
            postgresql.UUID(as_uuid=True),
            sa.ForeignKey("dictionary_lemma.id", ondelete="CASCADE"),
            nullable=False,
        ),
        sa.Column("sense_index", sa.Integer(), nullable=False),
        sa.Column("gloss", sa.Text(), nullable=False, server_default=""),
        sa.Column("topics", postgresql.ARRAY(sa.Text()), nullable=False, server_default="{}"),
        sa.Column("tags", postgresql.ARRAY(sa.Text()), nullable=False, server_default="{}"),
    )
    op.create_index("ix_dictionary_sense_lemma_id", "dictionary_sense", ["lemma_id"])
    op.create_table(
        "dictionary_wordform",
        sa.Column("id", postgresql.UUID(as_uuid=True), primary_key=True),
        sa.Column(
            "lemma_id",
            postgresql.UUID(as_uuid=True),
            sa.ForeignKey("dictionary_lemma.id", ondelete="CASCADE"),
            nullable=False,
        ),
        sa.Column("form", sa.Text(), nullable=False),
        sa.Column("tags", postgresql.ARRAY(sa.Text()), nullable=False, server_default="{}"),
    )
    op.create_index("ix_dictionary_wordform_lemma_id", "dictionary_wordform", ["lemma_id"])
    # Forms are looked up directly (inflection -> lemma), so index the form text.
    op.create_index("ix_dictionary_wordform_form", "dictionary_wordform", ["form"])
    op.create_table(
        "dictionary_lemma_raw",
        sa.Column("id", postgresql.UUID(as_uuid=True), primary_key=True),
        sa.Column(
            "lemma_id",
            postgresql.UUID(as_uuid=True),
            sa.ForeignKey("dictionary_lemma.id", ondelete="CASCADE"),
            nullable=False,
            unique=True,  # at most one raw record per lemma
        ),
        sa.Column("language", sa.String(2), nullable=False),
        sa.Column("raw", postgresql.JSONB(), nullable=False),
    )
def downgrade() -> None:
    """Drop the dictionary tables in reverse dependency order (children first)."""
    op.drop_table("dictionary_lemma_raw")
    op.drop_index("ix_dictionary_wordform_form", table_name="dictionary_wordform")
    op.drop_index("ix_dictionary_wordform_lemma_id", table_name="dictionary_wordform")
    op.drop_table("dictionary_wordform")
    op.drop_index("ix_dictionary_sense_lemma_id", table_name="dictionary_sense")
    op.drop_table("dictionary_sense")
    op.drop_index("ix_dictionary_lemma_language", table_name="dictionary_lemma")
    op.drop_index("ix_dictionary_lemma_headword_language", table_name="dictionary_lemma")
    op.drop_table("dictionary_lemma")

View file

@ -0,0 +1,32 @@
from dataclasses import dataclass
@dataclass
class Wordform:
id: str
lemma_id: str
form: str
tags: list[str]
@dataclass
class Sense:
id: str
lemma_id: str
sense_index: int
gloss: str
topics: list[str]
tags: list[str]
@dataclass
class Lemma:
id: str
headword: str
language: str
pos_raw: str
pos_normalised: str | None
gender: str | None
tags: list[str]
senses: list[Sense] = field(default_factory=list)
wordforms: list[Wordform] = field(default_factory=list)

View file

@ -0,0 +1,63 @@
import uuid
from sqlalchemy import String, Text, ForeignKey, Integer
from sqlalchemy.orm import Mapped, mapped_column
from sqlalchemy.dialects.postgresql import UUID, ARRAY, JSONB
from ..database import Base
class DictionaryLemmaEntity(Base):
    """ORM row for a dictionary headword (one per word/POS entry)."""

    __tablename__ = "dictionary_lemma"

    id: Mapped[uuid.UUID] = mapped_column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)
    headword: Mapped[str] = mapped_column(Text, nullable=False)
    # ISO 639-1 two-letter code — column is String(2); confirm against importer input.
    language: Mapped[str] = mapped_column(String(2), nullable=False, index=True)
    pos_raw: Mapped[str] = mapped_column(Text, nullable=False)
    # Normalised POS tag; NULL when the raw value could not be mapped.
    pos_normalised: Mapped[str | None] = mapped_column(Text, nullable=True)
    gender: Mapped[str | None] = mapped_column(Text, nullable=True)
    tags: Mapped[list[str]] = mapped_column(ARRAY(Text), nullable=False, server_default="{}")
class DictionarySenseEntity(Base):
    """ORM row for one sense (meaning) of a lemma."""

    __tablename__ = "dictionary_sense"

    id: Mapped[uuid.UUID] = mapped_column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)
    # Deleting the lemma cascades to its senses.
    lemma_id: Mapped[uuid.UUID] = mapped_column(
        UUID(as_uuid=True),
        ForeignKey("dictionary_lemma.id", ondelete="CASCADE"),
        nullable=False,
        index=True,
    )
    # Position of the sense within the lemma's sense list.
    sense_index: Mapped[int] = mapped_column(Integer, nullable=False)
    gloss: Mapped[str] = mapped_column(Text, nullable=False, server_default="")
    topics: Mapped[list[str]] = mapped_column(ARRAY(Text), nullable=False, server_default="{}")
    tags: Mapped[list[str]] = mapped_column(ARRAY(Text), nullable=False, server_default="{}")
class DictionaryWordformEntity(Base):
    """ORM row for an inflected form belonging to a lemma."""

    __tablename__ = "dictionary_wordform"

    id: Mapped[uuid.UUID] = mapped_column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)
    lemma_id: Mapped[uuid.UUID] = mapped_column(
        UUID(as_uuid=True),
        ForeignKey("dictionary_lemma.id", ondelete="CASCADE"),
        nullable=False,
        index=True,
    )
    # Indexed: forms are looked up directly (inflection -> lemma).
    form: Mapped[str] = mapped_column(Text, nullable=False, index=True)
    tags: Mapped[list[str]] = mapped_column(ARRAY(Text), nullable=False, server_default="{}")
class DictionaryLemmaRawEntity(Base):
    """ORM row holding the full original source record for a lemma as JSONB."""

    __tablename__ = "dictionary_lemma_raw"

    id: Mapped[uuid.UUID] = mapped_column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)
    # unique=True: at most one raw record per lemma.
    lemma_id: Mapped[uuid.UUID] = mapped_column(
        UUID(as_uuid=True),
        ForeignKey("dictionary_lemma.id", ondelete="CASCADE"),
        nullable=False,
        unique=True,
    )
    language: Mapped[str] = mapped_column(String(2), nullable=False)
    # Unmodified upstream record, kept for reprocessing/debugging.
    raw: Mapped[dict] = mapped_column(JSONB, nullable=False)

View file

@ -0,0 +1,80 @@
import uuid
from typing import Protocol
from sqlalchemy import select
from sqlalchemy.ext.asyncio import AsyncSession
from ..entities.dictionary_entities import (
DictionaryLemmaEntity,
DictionarySenseEntity,
DictionaryWordformEntity,
)
from ....domain.models.dictionary import Lemma, Sense, Wordform
class DictionaryRepository(Protocol):
    """Structural (duck-typed) read interface over the dictionary tables."""

    # Target-language word -> its senses (target -> EN direction).
    async def get_senses_for_headword(self, headword: str, language: str) -> list[Sense]: ...

    # English gloss text -> matching senses (EN -> target direction).
    async def find_senses_by_english_gloss(self, text: str, target_lang: str) -> list[Sense]: ...

    # All inflected forms recorded for a lemma.
    async def get_wordforms_for_lemma(self, lemma_id: uuid.UUID) -> list[Wordform]: ...
def _sense_to_model(entity: DictionarySenseEntity) -> Sense:
    """Map an ORM sense row onto the domain-level Sense model."""
    # Array columns may come back as None; normalise to empty lists.
    topics = entity.topics or []
    tags = entity.tags or []
    return Sense(
        id=str(entity.id),
        lemma_id=str(entity.lemma_id),
        sense_index=entity.sense_index,
        gloss=entity.gloss,
        topics=topics,
        tags=tags,
    )
def _wordform_to_model(entity: DictionaryWordformEntity) -> Wordform:
    """Map an ORM wordform row onto the domain-level Wordform model."""
    # Array column may come back as None; normalise to an empty list.
    tags = entity.tags or []
    return Wordform(
        id=str(entity.id),
        lemma_id=str(entity.lemma_id),
        form=entity.form,
        tags=tags,
    )
class PostgresDictionaryRepository:
    """Postgres implementation of DictionaryRepository over an AsyncSession."""

    def __init__(self, db: AsyncSession) -> None:
        self.db = db

    async def get_senses_for_headword(self, headword: str, language: str) -> list[Sense]:
        """Target→EN direction: senses for an exact headword in *language*,
        ordered by sense_index."""
        result = await self.db.execute(
            select(DictionarySenseEntity)
            .join(DictionaryLemmaEntity, DictionarySenseEntity.lemma_id == DictionaryLemmaEntity.id)
            .where(
                DictionaryLemmaEntity.headword == headword,
                DictionaryLemmaEntity.language == language,
            )
            .order_by(DictionarySenseEntity.sense_index)
        )
        return [_sense_to_model(e) for e in result.scalars().all()]

    async def find_senses_by_english_gloss(self, text: str, target_lang: str) -> list[Sense]:
        """EN→target direction: find senses whose gloss matches the given English text.

        Uses a case-insensitive exact match on the gloss column, filtered to the
        target language via the joined lemma row. LIKE wildcards appearing in
        *text* are escaped so they match literally.
        """
        # ILIKE treats % and _ as wildcards; previously raw user text was
        # passed through, so e.g. "a%b" matched far more than intended.
        # Escape the escape character itself first, then the wildcards.
        pattern = (
            text.replace("\\", "\\\\")
            .replace("%", "\\%")
            .replace("_", "\\_")
        )
        result = await self.db.execute(
            select(DictionarySenseEntity)
            .join(DictionaryLemmaEntity, DictionarySenseEntity.lemma_id == DictionaryLemmaEntity.id)
            .where(
                DictionarySenseEntity.gloss.ilike(pattern, escape="\\"),
                DictionaryLemmaEntity.language == target_lang,
            )
            .order_by(DictionarySenseEntity.sense_index)
        )
        return [_sense_to_model(e) for e in result.scalars().all()]

    async def get_wordforms_for_lemma(self, lemma_id: uuid.UUID) -> list[Wordform]:
        """All inflected forms stored for the given lemma id (unordered)."""
        result = await self.db.execute(
            select(DictionaryWordformEntity).where(
                DictionaryWordformEntity.lemma_id == lemma_id
            )
        )
        return [_wordform_to_model(e) for e in result.scalars().all()]

View file

@ -0,0 +1,322 @@
#!/usr/bin/env python
"""
CLI import script for kaikki/wiktextract JSONL dictionary data.
Usage (from api/ directory):
uv run ./scripts/import_dictionary.py --lang fr
# or via Make from the repo root:
make import-dictionary lang=fr
DATABASE_URL defaults to postgresql+asyncpg://langlearn:langlearn@localhost:5432/langlearn
which matches the docker-compose dev credentials when the DB port is exposed on the host.
"""
import argparse
import asyncio
import json
import os
import sys
import uuid
from pathlib import Path

import sqlalchemy as sa
from sqlalchemy.dialects.postgresql import ARRAY, JSONB
from sqlalchemy.dialects.postgresql import UUID as PG_UUID
from sqlalchemy.ext.asyncio import (
    AsyncConnection,
    AsyncSession,
    async_sessionmaker,
    create_async_engine,
)
# Resolve paths relative to this script: api/scripts/ -> api/ -> repo root.
_API_DIR = Path(__file__).parent.parent
_REPO_ROOT = _API_DIR.parent
_DICT_DIR = _REPO_ROOT / "dictionaries" / "kaikki"

# Language code -> JSONL filename under dictionaries/kaikki/.
_LANG_FILE_MAP: dict[str, str] = {
    "fr": "french.jsonl",
}

# Lower-cased raw kaikki POS label -> normalised POS tag.
# NOTE(review): the targets look like Universal Dependencies UPOS tags — confirm.
_POS_MAP: dict[str, str] = {
    "noun": "NOUN",
    "verb": "VERB",
    "adj": "ADJ",
    "adv": "ADV",
    "det": "DET",
    "article": "DET",
    "pron": "PRON",
    "prep": "ADP",
    "adp": "ADP",
    "conj": "CCONJ",
    "cconj": "CCONJ",
    "sconj": "SCONJ",
    "intj": "INTJ",
    "num": "NUM",
    "numeral": "NUM",
    "part": "PART",
    "particle": "PART",
    "name": "PROPN",
    "propn": "PROPN",
    "proper noun": "PROPN",
    "punct": "PUNCT",
    "sym": "SYM",
}

# Gender tag spelling variants -> canonical gender value stored on the lemma.
_GENDER_MAP: dict[str, str] = {
    "masculine": "masculine",
    "masc": "masculine",
    "feminine": "feminine",
    "fem": "feminine",
    "neuter": "neuter",
    "common": "common",
}
# ---------------------------------------------------------------------------
# Standalone table definitions — no app imports, no Settings() call
# ---------------------------------------------------------------------------
# These Core tables mirror the columns created by migration 0007 so the
# script can insert rows without importing the app's ORM entities.
_meta = sa.MetaData()

_lemma_table = sa.Table(
    "dictionary_lemma",
    _meta,
    sa.Column("id", PG_UUID(as_uuid=True), primary_key=True),
    sa.Column("headword", sa.Text(), nullable=False),
    sa.Column("language", sa.String(2), nullable=False),
    sa.Column("pos_raw", sa.Text(), nullable=False),
    sa.Column("pos_normalised", sa.Text(), nullable=True),
    sa.Column("gender", sa.Text(), nullable=True),
    sa.Column("tags", ARRAY(sa.Text()), nullable=False),
)

_sense_table = sa.Table(
    "dictionary_sense",
    _meta,
    sa.Column("id", PG_UUID(as_uuid=True), primary_key=True),
    sa.Column("lemma_id", PG_UUID(as_uuid=True), nullable=False),
    sa.Column("sense_index", sa.Integer(), nullable=False),
    sa.Column("gloss", sa.Text(), nullable=False),
    sa.Column("topics", ARRAY(sa.Text()), nullable=False),
    sa.Column("tags", ARRAY(sa.Text()), nullable=False),
)

_wordform_table = sa.Table(
    "dictionary_wordform",
    _meta,
    sa.Column("id", PG_UUID(as_uuid=True), primary_key=True),
    sa.Column("lemma_id", PG_UUID(as_uuid=True), nullable=False),
    sa.Column("form", sa.Text(), nullable=False),
    sa.Column("tags", ARRAY(sa.Text()), nullable=False),
)

_raw_table = sa.Table(
    "dictionary_lemma_raw",
    _meta,
    sa.Column("id", PG_UUID(as_uuid=True), primary_key=True),
    sa.Column("lemma_id", PG_UUID(as_uuid=True), nullable=False),
    sa.Column("language", sa.String(2), nullable=False),
    sa.Column("raw", JSONB(), nullable=False),
)
# ---------------------------------------------------------------------------
# Normalisation helpers
# ---------------------------------------------------------------------------
def _normalise_pos(pos_raw: str) -> str | None:
    """Return the normalised POS tag for a raw kaikki label, or None if unmapped."""
    key = pos_raw.strip().lower()
    return _POS_MAP.get(key)
def _normalise_gender(tags: list) -> str | None:
    """Return the canonical gender for the first recognised tag, else None."""
    return next(
        (_GENDER_MAP[tag] for tag in tags if tag in _GENDER_MAP),
        None,
    )
# ---------------------------------------------------------------------------
# Parsing
# ---------------------------------------------------------------------------
def _parse_entry(record: dict, lang_code: str) -> dict | None:
    """Parse one kaikki JSONL record into insertion-ready row dicts.

    Returns None if the entry should be skipped (wrong language, or no
    headword). The returned dict carries "lemma", "senses", "wordforms" and
    "raw" keys whose values match the importer's table columns.
    """
    if record.get("lang_code") != lang_code:
        return None

    headword = (record.get("word") or "").strip()
    if not headword:
        return None

    pos_raw = (record.get("pos") or "").strip()
    lemma_tags = record.get("tags") or []
    lemma_id = uuid.uuid4()

    # One row per sense, preserving the record's sense order; only the first
    # gloss string is kept (empty string when the sense has none).
    sense_rows = [
        {
            "id": uuid.uuid4(),
            "lemma_id": lemma_id,
            "sense_index": idx,
            "gloss": (sense.get("glosses") or [""])[0],
            "topics": sense.get("topics") or [],
            "tags": sense.get("tags") or [],
        }
        for idx, sense in enumerate(record.get("senses") or [])
    ]

    # Inflected forms; blank forms and the headword itself are skipped.
    wordform_rows = []
    for form_rec in record.get("forms") or []:
        form_text = (form_rec.get("form") or "").strip()
        if not form_text or form_text == headword:
            continue
        wordform_rows.append(
            {
                "id": uuid.uuid4(),
                "lemma_id": lemma_id,
                "form": form_text,
                "tags": form_rec.get("tags") or [],
            }
        )

    return {
        "lemma": {
            "id": lemma_id,
            "headword": headword,
            "language": lang_code,
            "pos_raw": pos_raw,
            "pos_normalised": _normalise_pos(pos_raw),
            "gender": _normalise_gender(lemma_tags),
            "tags": lemma_tags,
        },
        "senses": sense_rows,
        "wordforms": wordform_rows,
        # Full upstream record preserved for reprocessing/debugging.
        "raw": {
            "id": uuid.uuid4(),
            "lemma_id": lemma_id,
            "language": lang_code,
            "raw": record,
        },
    }
# ---------------------------------------------------------------------------
# DB operations
# ---------------------------------------------------------------------------
async def _flush_batch(conn: AsyncConnection, batch: list[dict]) -> None:
    """Bulk-insert one batch of parsed entries and commit.

    Lemmas are inserted before senses/wordforms/raw rows so the child
    foreign keys resolve. The original annotation `sa.ext.asyncio.AsyncConnection`
    only resolved because a sibling `from sqlalchemy.ext.asyncio import ...`
    happened to bind the submodule; AsyncConnection is now imported explicitly.
    """
    lemma_rows = [e["lemma"] for e in batch]
    sense_rows = [s for e in batch for s in e["senses"]]
    wordform_rows = [w for e in batch for w in e["wordforms"]]
    raw_rows = [e["raw"] for e in batch]
    # executemany-style inserts; skip empty lists to avoid zero-row statements.
    if lemma_rows:
        await conn.execute(_lemma_table.insert(), lemma_rows)
    if sense_rows:
        await conn.execute(_sense_table.insert(), sense_rows)
    if wordform_rows:
        await conn.execute(_wordform_table.insert(), wordform_rows)
    if raw_rows:
        await conn.execute(_raw_table.insert(), raw_rows)
    await conn.commit()
async def run_import(lang_code: str, batch_size: int = 1000) -> None:
    """Stream a kaikki JSONL file into Postgres in committed batches.

    Existing rows for *lang_code* are deleted first (child tables are cleared
    via ON DELETE CASCADE), then lemma/sense/wordform/raw rows are inserted
    `batch_size` lemmas at a time. Exits with status 1 on an unknown language
    code or missing file; malformed JSON lines are counted and skipped.
    """
    lang_file = _LANG_FILE_MAP.get(lang_code)
    if not lang_file:
        print(
            f"No file mapping for lang_code={lang_code!r}. Known: {list(_LANG_FILE_MAP)}",
            file=sys.stderr,
        )
        sys.exit(1)

    jsonl_path = _DICT_DIR / lang_file
    if not jsonl_path.exists():
        print(f"JSONL file not found: {jsonl_path}", file=sys.stderr)
        sys.exit(1)

    # Default matches the docker-compose dev credentials, as documented in the
    # module docstring and the Makefile; the previous fallback used a
    # non-matching "changeme" password, so running without DATABASE_URL failed.
    database_url = os.environ.get(
        "DATABASE_URL",
        "postgresql+asyncpg://langlearn:langlearn@localhost:5432/langlearn",
    )
    engine = create_async_engine(database_url, echo=False)
    try:
        async with engine.connect() as conn:
            print(f"Deleting existing entries for language={lang_code!r}...")
            await conn.execute(
                _lemma_table.delete().where(_lemma_table.c.language == lang_code)
            )
            await conn.commit()

            print(f"Importing {jsonl_path} ...")
            batch: list[dict] = []
            total_lemmas = 0
            skipped = 0
            with open(jsonl_path, encoding="utf-8") as f:
                for line_num, line in enumerate(f, 1):
                    line = line.strip()
                    if not line:
                        continue
                    try:
                        record = json.loads(line)
                    except json.JSONDecodeError as exc:
                        print(
                            f"  Line {line_num}: JSON parse error: {exc}",
                            file=sys.stderr,
                        )
                        skipped += 1
                        continue
                    parsed = _parse_entry(record, lang_code)
                    if parsed is None:
                        # Other-language entries and headword-less records.
                        skipped += 1
                        continue
                    batch.append(parsed)
                    if len(batch) >= batch_size:
                        await _flush_batch(conn, batch)
                        total_lemmas += len(batch)
                        print(f"  Committed {total_lemmas} lemmas...")
                        batch = []
            # Flush the final partial batch.
            if batch:
                await _flush_batch(conn, batch)
                total_lemmas += len(batch)
            print(f"Done. Imported {total_lemmas} lemmas, skipped {skipped} lines.")
    finally:
        await engine.dispose()
def main() -> None:
    """CLI entry point: parse arguments and run the async import."""
    cli = argparse.ArgumentParser(
        description="Import kaikki dictionary JSONL into Postgres."
    )
    cli.add_argument(
        "--lang", required=True, help="Language code to import (e.g. fr)"
    )
    cli.add_argument(
        "--batch-size", type=int, default=1000, help="Rows per commit (default: 1000)"
    )
    options = cli.parse_args()
    asyncio.run(run_import(options.lang, options.batch_size))


if __name__ == "__main__":
    main()

1
dictionaries/.gitignore vendored Normal file
View file

@ -0,0 +1 @@
*.jsonl

3
dictionaries/README.md Normal file
View file

@ -0,0 +1,3 @@
# Dictionaries
This module contains dictionaries of words, namely from the [Kaikki](https://kaikki.org/dictionary/index.html) project. It is responsible for generating lexical information about words, for both the system and the user, to help describe the language they are using.