#!/usr/bin/env python
"""
CLI import script for kaikki/wiktextract JSONL dictionary data.
Usage (from api/ directory):
uv run ./scripts/import_dictionary.py --lang fr
# or via Make from the repo root:
make import-dictionary lang=fr
DATABASE_URL defaults to postgresql+asyncpg://langlearn:changeme@localhost:5432/langlearn;
override it via the DATABASE_URL env var to match your docker-compose dev credentials
when the DB port is exposed on the host.
"""
import argparse
import asyncio
import json
import os
import sys
import uuid
from pathlib import Path
import sqlalchemy as sa
from sqlalchemy.dialects.postgresql import ARRAY, JSONB
from sqlalchemy.dialects.postgresql import UUID as PG_UUID
from sqlalchemy.dialects.postgresql import insert as pg_insert
from sqlalchemy.ext.asyncio import create_async_engine
# Paths are derived from this script's own location so the importer works
# regardless of the current working directory.
_API_DIR = Path(__file__).parent.parent
_REPO_ROOT = _API_DIR.parent
_DICT_DIR = _REPO_ROOT / "dictionaries" / "kaikki"
# --lang code → kaikki JSONL filename expected under _DICT_DIR.
_LANG_FILE_MAP: dict[str, str] = {
    "fr": "french.jsonl",
}
# Raw kaikki part-of-speech strings → normalised (UPOS-style) tags.
# Values not listed here normalise to None via _normalise_pos.
_POS_MAP: dict[str, str] = {
    "noun": "NOUN",
    "verb": "VERB",
    "adj": "ADJ",
    "adv": "ADV",
    "det": "DET",
    "article": "DET",
    "pron": "PRON",
    "prep": "ADP",
    "adp": "ADP",
    "conj": "CCONJ",
    "cconj": "CCONJ",
    "sconj": "SCONJ",
    "intj": "INTJ",
    "num": "NUM",
    "numeral": "NUM",
    "part": "PART",
    "particle": "PART",
    "name": "PROPN",
    "propn": "PROPN",
    "proper noun": "PROPN",
    "punct": "PUNCT",
    "sym": "SYM",
}
_GENDER_MAP: dict[str, str] = {
2026-04-10 20:12:40 +00:00
"f": "feminine",
"m": "masculine",
"masculine": "masculine",
"masc": "masculine",
"feminine": "feminine",
"fem": "feminine",
"neuter": "neuter",
"common": "common",
}
# ---------------------------------------------------------------------------
# Deterministic UUID namespace
#
# All dictionary entity IDs are derived via uuid5(namespace, natural_key) so
# that re-importing the same kaikki data always produces the same UUIDs. This
# means:
# • Re-imports update rows in place (upsert) without changing PKs, so
# learnable_word_bank_entry / word_bank_pack_entry FK references are never
# nullified by a re-import.
# • WordPacks developed in one environment can be transferred to another
# environment that imported from the same kaikki dataset, because sense UUIDs
# will be identical in both.
# ---------------------------------------------------------------------------
_KAIKKI_UUID_NS = uuid.UUID("c7d8e9f0-1234-5678-abcd-ef0123456789")
def _lemma_uuid(lang_code: str, word: str, pos: str, etymology_number: int, sense_ids: list[str]) -> uuid.UUID:
# Include sorted sense IDs so that two kaikki entries with the same
# (word, pos, etymology_number) but different senses get distinct UUIDs.
sense_key = ":".join(sorted(sense_ids))
return uuid.uuid5(_KAIKKI_UUID_NS, f"kaikki:lemma:{lang_code}:{word}:{pos}:{etymology_number}:{sense_key}")
def _sense_uuid(kaikki_sense_id: str) -> uuid.UUID:
return uuid.uuid5(_KAIKKI_UUID_NS, f"kaikki:sense:{kaikki_sense_id}")
def _wordform_uuid(lemma_id: uuid.UUID, form: str, tags: list[str]) -> uuid.UUID:
tags_key = ",".join(sorted(tags))
return uuid.uuid5(_KAIKKI_UUID_NS, f"kaikki:wordform:{lemma_id}:{form}:{tags_key}")
# ---------------------------------------------------------------------------
# Standalone table definitions — no app imports, no Settings() call
# ---------------------------------------------------------------------------
_meta = sa.MetaData()

# One row per lemma entry: headword plus raw/normalised POS and gender.
_lemma_table = sa.Table(
    "dictionary_lemma",
    _meta,
    sa.Column("id", PG_UUID(as_uuid=True), primary_key=True),
    sa.Column("headword", sa.Text(), nullable=False),
    sa.Column("language", sa.String(2), nullable=False),
    sa.Column("pos_raw", sa.Text(), nullable=False),
    sa.Column("pos_normalised", sa.Text(), nullable=True),
    sa.Column("gender", sa.Text(), nullable=True),
    sa.Column("tags", ARRAY(sa.Text()), nullable=False),
)

# One row per sense (gloss) of a lemma, ordered by sense_index.
_sense_table = sa.Table(
    "dictionary_sense",
    _meta,
    sa.Column("id", PG_UUID(as_uuid=True), primary_key=True),
    sa.Column("lemma_id", PG_UUID(as_uuid=True), nullable=False),
    sa.Column("sense_index", sa.Integer(), nullable=False),
    sa.Column("gloss", sa.Text(), nullable=False),
    sa.Column("topics", ARRAY(sa.Text()), nullable=False),
    sa.Column("tags", ARRAY(sa.Text()), nullable=False),
)

# Inflected forms (conjugations, plurals, ...) attached to a lemma.
_wordform_table = sa.Table(
    "dictionary_wordform",
    _meta,
    sa.Column("id", PG_UUID(as_uuid=True), primary_key=True),
    sa.Column("lemma_id", PG_UUID(as_uuid=True), nullable=False),
    sa.Column("form", sa.Text(), nullable=False),
    sa.Column("tags", ARRAY(sa.Text()), nullable=False),
)

# The full original kaikki JSON record for each lemma, stored as JSONB.
_raw_table = sa.Table(
    "dictionary_lemma_raw",
    _meta,
    sa.Column("id", PG_UUID(as_uuid=True), primary_key=True),
    sa.Column("lemma_id", PG_UUID(as_uuid=True), nullable=False),
    sa.Column("language", sa.String(2), nullable=False),
    sa.Column("raw", JSONB(), nullable=False),
)
# Cross-reference links found in a sense's "links" pairs; target_lemma_id
# starts NULL and is filled in by _resolve_links once all lemmas exist.
_sense_link_table = sa.Table(
    "dictionary_sense_link",
    _meta,
    sa.Column("id", PG_UUID(as_uuid=True), primary_key=True),
    sa.Column("sense_id", PG_UUID(as_uuid=True), nullable=False),
    sa.Column("link_text", sa.Text(), nullable=False),
    sa.Column("link_target", sa.Text(), nullable=False),
    sa.Column("target_lemma_id", PG_UUID(as_uuid=True), nullable=True),
)
# ---------------------------------------------------------------------------
# Normalisation helpers
# ---------------------------------------------------------------------------
def _normalise_pos(pos_raw: str) -> str | None:
    """Map a raw kaikki POS string to its normalised tag, or None if unmapped."""
    return _POS_MAP.get(pos_raw.lower().strip())
def _normalise_gender(value: str | None) -> str | None:
    """Map a raw kaikki gender tag to its canonical name (None if absent/unknown)."""
    if value is None:
        return None
    return _GENDER_MAP.get(value)
# ---------------------------------------------------------------------------
# Parsing
# ---------------------------------------------------------------------------
def _parse_entry(record: dict, lang_code: str) -> dict | None:
    """Parse one kaikki JSONL record into insertion-ready row dicts.

    Returns a dict with "lemma", "senses", "sense_links", "wordforms" and
    "raw" row payloads, or None if the entry should be skipped (wrong
    language, empty headword, or an inflected-form entry).
    """
    if record.get("lang_code") != lang_code:
        return None
    word = (record.get("word") or "").strip()
    if not word:
        return None
    # Skip entries that are inflected forms of another lemma (e.g. conjugations,
    # plurals). These appear as top-level JSONL records but are already captured
    # as wordforms via the parent lemma's `forms` array.
    for sense in record.get("senses") or []:
        if sense.get("form_of"):
            return None
    pos_raw = (record.get("pos") or "").strip()
    etymology_number = record.get("etymology_number", 0)
    raw_senses = record.get("senses") or []
    # Collect kaikki sense IDs up front so the lemma UUID can incorporate them.
    # This disambiguates entries that share (word, pos, etymology_number) but
    # have genuinely different senses — kaikki has ~349 such cases in French.
    kaikki_sense_ids = [
        s.get("id") or f"{lang_code}:{word}:{pos_raw}:{etymology_number}:{i}"
        for i, s in enumerate(raw_senses)
    ]
    lemma_id = _lemma_uuid(lang_code, word, pos_raw, etymology_number, kaikki_sense_ids)

    _GENDER_TAGS = {"masculine", "feminine", "neuter"}
    gender: str | None = None
    senses = []
    sense_links = []
    for i, sense_record in enumerate(raw_senses):
        kaikki_sense_id = kaikki_sense_ids[i]
        sense_id = _sense_uuid(kaikki_sense_id)
        glosses = sense_record.get("glosses") or []
        gloss = glosses[0] if glosses else ""
        topics = sense_record.get("topics") or []
        sense_tags = sense_record.get("tags") or []

        # Lemma-level gender is taken from the first sense carrying a gender tag.
        if gender is None:
            for tag in sense_tags:
                if tag in _GENDER_TAGS:
                    gender = tag
                    break
        senses.append(
            {
                "id": sense_id,
                "lemma_id": lemma_id,
                "sense_index": i,
                "gloss": gloss,
                "topics": topics,
                "tags": sense_tags,
            }
        )

        # Link pairs look like ["maboul", "maboul#French"]; target_lemma_id is
        # left NULL here and resolved later by _resolve_links.
        for link_pair in (sense_record.get("links") or []):
            if isinstance(link_pair, list) and len(link_pair) == 2:
                link_text, link_target = link_pair[0], link_pair[1]
                link_id = uuid.uuid5(
                    _KAIKKI_UUID_NS,
                    f"kaikki:link:{sense_id}:{link_text}:{link_target}",
                )
                sense_links.append(
                    {
                        "id": link_id,
                        "sense_id": sense_id,
                        "link_text": link_text,
                        "link_target": link_target,
                        "target_lemma_id": None,
                    }
                )

    # Forms carrying these tags are inflection-table metadata, not real
    # wordforms, so they are dropped.
    _METADATA_FORM_TAGS = {"table-tags", "inflection-template"}
    wordforms = []
    for f in record.get("forms") or []:
        form_text = (f.get("form") or "").strip()
        if not form_text or form_text == word:
            continue
        form_tags = f.get("tags") or []
        if _METADATA_FORM_TAGS.intersection(form_tags):
            continue
        wordforms.append(
            {
                "id": _wordform_uuid(lemma_id, form_text, form_tags),
                "lemma_id": lemma_id,
                "form": form_text,
                "tags": form_tags,
            }
        )
    return {
        "lemma": {
            "id": lemma_id,
            "headword": word,
            "language": lang_code,
            "pos_raw": pos_raw,
            "pos_normalised": _normalise_pos(pos_raw),
            "gender": gender,
            "tags": record.get("tags") or [],
        },
        "senses": senses,
        "sense_links": sense_links,
        "wordforms": wordforms,
        "raw": {
            "id": uuid.uuid5(_KAIKKI_UUID_NS, f"kaikki:raw:{lemma_id}"),
            "lemma_id": lemma_id,
            "language": lang_code,
            "raw": record,
        },
    }
# ---------------------------------------------------------------------------
# DB operations
# ---------------------------------------------------------------------------
async def _flush_batch(conn: sa.ext.asyncio.AsyncConnection, batch: list[dict]) -> None:
    """Upsert one batch of parsed entries and commit.

    Lemmas, senses, wordforms and raw records use ON CONFLICT (id) upserts so
    re-imports update rows in place without changing primary keys; sense links
    use insert-or-ignore because their target FK is resolved in a later pass.
    """
    lemma_rows = [e["lemma"] for e in batch]
    sense_rows = [s for e in batch for s in e["senses"]]
    sense_link_rows = [lnk for e in batch for lnk in e["sense_links"]]
    wordform_rows = [w for e in batch for w in e["wordforms"]]
    raw_rows = [e["raw"] for e in batch]

    # asyncpg caps query parameters at 32767. Split each row list into chunks
    # sized so that rows × columns stays comfortably under that limit.
    def _chunks(rows: list[dict], n_cols: int) -> list[list[dict]]:
        size = max(1, 32767 // n_cols)
        return [rows[i : i + size] for i in range(0, len(rows), size)]

    # Deduplicate by id: safety net for truly identical rows (should be rare
    # now that the lemma UUID incorporates sense IDs).
    def _dedup(rows: list[dict]) -> list[dict]:
        seen: dict = {}
        for row in rows:
            seen[row["id"]] = row
        return list(seen.values())

    lemma_rows = _dedup(lemma_rows)
    sense_rows = _dedup(sense_rows)
    wordform_rows = _dedup(wordform_rows)
    raw_rows = _dedup(raw_rows)
    sense_link_rows = _dedup(sense_link_rows)

    for chunk in _chunks(lemma_rows, len(_lemma_table.columns)):
        stmt = pg_insert(_lemma_table).values(chunk)
        await conn.execute(stmt.on_conflict_do_update(
            index_elements=["id"],
            set_={
                "headword": stmt.excluded.headword,
                "pos_raw": stmt.excluded.pos_raw,
                "pos_normalised": stmt.excluded.pos_normalised,
                "gender": stmt.excluded.gender,
                "tags": stmt.excluded.tags,
            },
        ))
    for chunk in _chunks(sense_rows, len(_sense_table.columns)):
        stmt = pg_insert(_sense_table).values(chunk)
        await conn.execute(stmt.on_conflict_do_update(
            index_elements=["id"],
            set_={
                "sense_index": stmt.excluded.sense_index,
                "gloss": stmt.excluded.gloss,
                "topics": stmt.excluded.topics,
                "tags": stmt.excluded.tags,
            },
        ))
    for chunk in _chunks(wordform_rows, len(_wordform_table.columns)):
        stmt = pg_insert(_wordform_table).values(chunk)
        await conn.execute(stmt.on_conflict_do_update(
            index_elements=["id"],
            set_={"tags": stmt.excluded.tags},
        ))
    for chunk in _chunks(raw_rows, len(_raw_table.columns)):
        stmt = pg_insert(_raw_table).values(chunk)
        await conn.execute(stmt.on_conflict_do_update(
            index_elements=["id"],
            set_={"raw": stmt.excluded.raw},
        ))
    for chunk in _chunks(sense_link_rows, len(_sense_link_table.columns)):
        await conn.execute(
            pg_insert(_sense_link_table).values(chunk).on_conflict_do_nothing()
        )
    await conn.commit()
# Language code → Wiktionary section name used as the "#Section" suffix of
# link targets (e.g. "maboul#French"); consumed by _resolve_links.
_LANG_SECTION_MAP: dict[str, str] = {
    "fr": "French",
    "de": "German",
    "es": "Spanish",
    "it": "Italian",
    "pt": "Portuguese",
}
async def _resolve_links(conn: sa.ext.asyncio.AsyncConnection, lang_code: str) -> int:
    """Resolve target_lemma_id for sense links whose target matches lang_code.

    Links in kaikki data look like ``["maboul", "maboul#French"]``. After all
    lemmas have been imported we can attempt to match the target word to a row
    in dictionary_lemma and store the foreign key.

    Returns the number of links resolved.
    """
    section = _LANG_SECTION_MAP.get(lang_code)
    if not section:
        return 0
    suffix = f"#{section}"
    result = await conn.execute(
        sa.select(
            _sense_link_table.c.id,
            _sense_link_table.c.link_target,
        ).where(_sense_link_table.c.target_lemma_id.is_(None))
    )
    # Filter to links that point at this language and extract the target word.
    candidates: list[tuple[uuid.UUID, str]] = [
        (row.id, row.link_target[: -len(suffix)])
        for row in result.fetchall()
        if row.link_target.endswith(suffix)
    ]
    if not candidates:
        return 0
    target_words = list({w for _, w in candidates})
    lemma_result = await conn.execute(
        sa.select(_lemma_table.c.id, _lemma_table.c.headword)
        .where(_lemma_table.c.language == lang_code)
        .where(_lemma_table.c.headword.in_(target_words))
    )
    lemma_map: dict[str, uuid.UUID] = {r.headword: r.id for r in lemma_result}
    # Group resolvable links by their target lemma so each lemma needs a
    # single UPDATE ... WHERE id IN (...) instead of one round trip per link.
    links_by_lemma: dict[uuid.UUID, list[uuid.UUID]] = {}
    for link_id, word in candidates:
        target_id = lemma_map.get(word)
        if target_id is not None:
            links_by_lemma.setdefault(target_id, []).append(link_id)
    resolved = 0
    for target_id, link_ids in links_by_lemma.items():
        await conn.execute(
            _sense_link_table.update()
            .where(_sense_link_table.c.id.in_(link_ids))
            .values(target_lemma_id=target_id)
        )
        resolved += len(link_ids)
    await conn.commit()
    return resolved
async def run_import(lang_code: str, batch_size: int = 1000) -> None:
    """Stream the kaikki JSONL file for *lang_code* into Postgres in batches.

    Exits the process with status 1 when the language has no file mapping or
    the JSONL file is missing. After the import completes, a second pass
    resolves sense links to lemma foreign keys.
    """
    lang_file = _LANG_FILE_MAP.get(lang_code)
    if not lang_file:
        print(
            f"No file mapping for lang_code={lang_code!r}. Known: {list(_LANG_FILE_MAP)}",
            file=sys.stderr,
        )
        sys.exit(1)
    jsonl_path = _DICT_DIR / lang_file
    if not jsonl_path.exists():
        print(f"JSONL file not found: {jsonl_path}", file=sys.stderr)
        sys.exit(1)
    database_url = os.environ.get(
        "DATABASE_URL",
        "postgresql+asyncpg://langlearn:changeme@localhost:5432/langlearn",
    )
    engine = create_async_engine(database_url, echo=False)
    try:
        async with engine.connect() as conn:
            # No upfront delete — rows are upserted so existing FK references
            # (word bank entries, pack entries) are preserved across re-imports.
            # To fully wipe and start fresh, run clear_dictionary.py first.
            print(f"Importing {jsonl_path} ...")
            batch: list[dict] = []
            total_lemmas = 0
            skipped = 0
            with open(jsonl_path, encoding="utf-8") as f:
                for line_num, line in enumerate(f, 1):
                    line = line.strip()
                    if not line:
                        continue
                    try:
                        record = json.loads(line)
                    except json.JSONDecodeError as exc:
                        print(
                            f" Line {line_num}: JSON parse error: {exc}",
                            file=sys.stderr,
                        )
                        skipped += 1
                        continue
                    parsed = _parse_entry(record, lang_code)
                    if parsed is None:
                        skipped += 1
                        continue
                    batch.append(parsed)
                    if len(batch) >= batch_size:
                        await _flush_batch(conn, batch)
                        total_lemmas += len(batch)
                        print(f" Committed {total_lemmas} lemmas...")
                        batch = []
            if batch:
                await _flush_batch(conn, batch)
                total_lemmas += len(batch)
            print(f"Done. Imported {total_lemmas} lemmas, skipped {skipped} lines.")

        # Second pass on a fresh connection: resolve link targets now that
        # every lemma row exists.
        async with engine.connect() as conn:
            print("Resolving sense links...")
            resolved = await _resolve_links(conn, lang_code)
            print(f"Resolved {resolved} sense links.")
    finally:
        await engine.dispose()
def main() -> None:
    """CLI entry point: parse arguments and kick off the async import."""
    parser = argparse.ArgumentParser(
        description="Import kaikki dictionary JSONL into Postgres."
    )
    parser.add_argument(
        "--lang", required=True, help="Language code to import (e.g. fr)"
    )
    parser.add_argument(
        "--batch-size", type=int, default=1000, help="Rows per commit (default: 1000)"
    )
    args = parser.parse_args()
    asyncio.run(run_import(args.lang, args.batch_size))


if __name__ == "__main__":
    main()