From f4e97f2f294086eda673ecb252fc99eca55b1f52 Mon Sep 17 00:00:00 2001 From: wilson Date: Wed, 15 Apr 2026 21:01:52 +0100 Subject: [PATCH] scripts: improve the import/clear scripts for the dictionary --- api/scripts/clear_dictionary.py | 99 +++++++++++++++++++ api/scripts/import_dictionary.py | 163 ++++++++++++++++++++++++++----- 2 files changed, 236 insertions(+), 26 deletions(-) create mode 100644 api/scripts/clear_dictionary.py diff --git a/api/scripts/clear_dictionary.py b/api/scripts/clear_dictionary.py new file mode 100644 index 0000000..f5cd490 --- /dev/null +++ b/api/scripts/clear_dictionary.py @@ -0,0 +1,99 @@ +#!/usr/bin/env python +""" +Clear all rows from dictionary tables and re-import from source JSONL files. + +Usage (from api/ directory): + uv run ./scripts/clear_dictionary.py + + # Clear tables only (destructive!); skip the re-import step + uv run ./scripts/clear_dictionary.py --no-import + +DATABASE_URL defaults to postgresql+asyncpg://langlearn:changeme@localhost:5432/langlearn +(override it via the environment if your docker-compose dev credentials differ). +""" + +import argparse +import asyncio +import os +import sys +from pathlib import Path + +import sqlalchemy as sa +from sqlalchemy.ext.asyncio import create_async_engine + +# Re-use table definitions and run_import from the sibling script so there is +# no duplication of schema knowledge. 
+sys.path.insert(0, str(Path(__file__).parent)) +from import_dictionary import ( # noqa: E402 + _LANG_FILE_MAP, + _lemma_table, + _raw_table, + _sense_link_table, + _sense_table, + _wordform_table, + run_import, +) + +# Delete order respects foreign-key dependencies: +# sense_link → sense +# sense → lemma +# wordform → lemma +# raw → lemma +# lemma (parent) +_DELETE_ORDER = [ + _sense_link_table, + _sense_table, + _wordform_table, + _raw_table, + _lemma_table, +] + + +async def clear_all(database_url: str) -> None: + engine = create_async_engine(database_url, echo=False) + try: + async with engine.connect() as conn: + print("Clearing all dictionary tables...") + for table in _DELETE_ORDER: + result = await conn.execute(sa.delete(table)) + print(f" Deleted {result.rowcount} rows from {table.name}") + await conn.commit() + print("All dictionary tables cleared.") + finally: + await engine.dispose() + + +async def main(run_reimport: bool, batch_size: int) -> None: + database_url = os.environ.get( + "DATABASE_URL", + "postgresql+asyncpg://langlearn:changeme@localhost:5432/langlearn", + ) + + await clear_all(database_url) + + if not run_reimport: + return + + for lang_code in _LANG_FILE_MAP: + print(f"\nRe-importing language={lang_code!r}...") + await run_import(lang_code, batch_size) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + description="Clear all dictionary tables and optionally re-import." 
+ ) + parser.add_argument( + "--no-import", + action="store_true", + help="Clear tables only; skip re-import.", + ) + parser.add_argument( + "--batch-size", + type=int, + default=1000, + help="Rows per commit during re-import (default: 1000)", + ) + args = parser.parse_args() + + asyncio.run(main(run_reimport=not args.no_import, batch_size=args.batch_size)) diff --git a/api/scripts/import_dictionary.py b/api/scripts/import_dictionary.py index 916465e..f9a0e06 100644 --- a/api/scripts/import_dictionary.py +++ b/api/scripts/import_dictionary.py @@ -23,7 +23,8 @@ from pathlib import Path import sqlalchemy as sa from sqlalchemy.dialects.postgresql import ARRAY, JSONB from sqlalchemy.dialects.postgresql import UUID as PG_UUID -from sqlalchemy.ext.asyncio import AsyncSession, async_sessionmaker, create_async_engine +from sqlalchemy.dialects.postgresql import insert as pg_insert +from sqlalchemy.ext.asyncio import create_async_engine _API_DIR = Path(__file__).parent.parent _REPO_ROOT = _API_DIR.parent @@ -69,6 +70,39 @@ _GENDER_MAP: dict[str, str] = { "common": "common", } +# --------------------------------------------------------------------------- +# Deterministic UUID namespace +# +# All dictionary entity IDs are derived via uuid5(namespace, natural_key) so +# that re-importing the same kaikki data always produces the same UUIDs. This +# means: +# • Re-imports update rows in place (upsert) without changing PKs, so +# learnable_word_bank_entry / word_bank_pack_entry FK references are never +# nullified by a re-import. +# • WordPacks developed in one environment can be transferred to another +# environment that imported from the same kaikki dataset, because sense UUIDs +# will be identical in both. 
+# --------------------------------------------------------------------------- + +_KAIKKI_UUID_NS = uuid.UUID("c7d8e9f0-1234-5678-abcd-ef0123456789") + + +def _lemma_uuid(lang_code: str, word: str, pos: str, etymology_number: int, sense_ids: list[str]) -> uuid.UUID: + # Include sorted sense IDs so that two kaikki entries with the same + # (word, pos, etymology_number) but different senses get distinct UUIDs. + sense_key = ":".join(sorted(sense_ids)) + return uuid.uuid5(_KAIKKI_UUID_NS, f"kaikki:lemma:{lang_code}:{word}:{pos}:{etymology_number}:{sense_key}") + + +def _sense_uuid(kaikki_sense_id: str) -> uuid.UUID: + return uuid.uuid5(_KAIKKI_UUID_NS, f"kaikki:sense:{kaikki_sense_id}") + + +def _wordform_uuid(lemma_id: uuid.UUID, form: str, tags: list[str]) -> uuid.UUID: + tags_key = ",".join(sorted(tags)) + return uuid.uuid5(_KAIKKI_UUID_NS, f"kaikki:wordform:{lemma_id}:{form}:{tags_key}") + + # --------------------------------------------------------------------------- # Standalone table definitions — no app imports, no Settings() call # --------------------------------------------------------------------------- @@ -158,16 +192,34 @@ def _parse_entry(record: dict, lang_code: str) -> dict | None: if not word: return None - pos_raw = (record.get("pos") or "").strip() + # Skip entries that are inflected forms of another lemma (e.g. conjugations, + # plurals). These appear as top-level JSONL records but are already captured + # as wordforms via the parent lemma's `forms` array. + for sense in record.get("senses") or []: + if sense.get("form_of"): + return None - lemma_id = uuid.uuid4() + pos_raw = (record.get("pos") or "").strip() + etymology_number = record.get("etymology_number", 0) + raw_senses = record.get("senses") or [] + + # Collect kaikki sense IDs up front so the lemma UUID can incorporate them. + # This disambiguates entries that share (word, pos, etymology_number) but + # have genuinely different senses — kaikki has ~349 such cases in French. 
+ kaikki_sense_ids = [ + s.get("id") or f"{lang_code}:{word}:{pos_raw}:{etymology_number}:{i}" + for i, s in enumerate(raw_senses) + ] + + lemma_id = _lemma_uuid(lang_code, word, pos_raw, etymology_number, kaikki_sense_ids) _GENDER_TAGS = {"masculine", "feminine", "neuter"} gender: str | None = None senses = [] sense_links = [] - for i, sense_record in enumerate(record.get("senses") or []): - sense_id = uuid.uuid4() + for i, sense_record in enumerate(raw_senses): + kaikki_sense_id = kaikki_sense_ids[i] + sense_id = _sense_uuid(kaikki_sense_id) glosses = sense_record.get("glosses") or [] gloss = glosses[0] if glosses else "" topics = sense_record.get("topics") or [] @@ -192,25 +244,34 @@ def _parse_entry(record: dict, lang_code: str) -> dict | None: for link_pair in (sense_record.get("links") or []): if isinstance(link_pair, list) and len(link_pair) == 2: + link_text, link_target = link_pair[0], link_pair[1] + link_id = uuid.uuid5( + _KAIKKI_UUID_NS, + f"kaikki:link:{sense_id}:{link_text}:{link_target}", + ) sense_links.append( { - "id": uuid.uuid4(), + "id": link_id, "sense_id": sense_id, - "link_text": link_pair[0], - "link_target": link_pair[1], + "link_text": link_text, + "link_target": link_target, "target_lemma_id": None, } ) + _METADATA_FORM_TAGS = {"table-tags", "inflection-template"} + wordforms = [] for f in record.get("forms") or []: form_text = (f.get("form") or "").strip() if not form_text or form_text == word: continue form_tags = f.get("tags") or [] + if _METADATA_FORM_TAGS.intersection(form_tags): + continue wordforms.append( { - "id": uuid.uuid4(), + "id": _wordform_uuid(lemma_id, form_text, form_tags), "lemma_id": lemma_id, "form": form_text, "tags": form_tags, @@ -231,7 +292,7 @@ def _parse_entry(record: dict, lang_code: str) -> dict | None: "sense_links": sense_links, "wordforms": wordforms, "raw": { - "id": uuid.uuid4(), + "id": uuid.uuid5(_KAIKKI_UUID_NS, f"kaikki:raw:{lemma_id}"), "lemma_id": lemma_id, "language": lang_code, "raw": record, @@ 
-251,16 +312,69 @@ async def _flush_batch(conn: sa.ext.asyncio.AsyncConnection, batch: list[dict]) wordform_rows = [w for e in batch for w in e["wordforms"]] raw_rows = [e["raw"] for e in batch] - if lemma_rows: - await conn.execute(_lemma_table.insert(), lemma_rows) - if sense_rows: - await conn.execute(_sense_table.insert(), sense_rows) - if sense_link_rows: - await conn.execute(_sense_link_table.insert(), sense_link_rows) - if wordform_rows: - await conn.execute(_wordform_table.insert(), wordform_rows) - if raw_rows: - await conn.execute(_raw_table.insert(), raw_rows) + # asyncpg caps query parameters at 32767. Split each row list into chunks + # sized so that rows × columns stays comfortably under that limit. + def _chunks(rows: list[dict], n_cols: int) -> list[list[dict]]: + size = max(1, 32767 // n_cols) + return [rows[i : i + size] for i in range(0, len(rows), size)] + + # Deduplicate by id: safety net for truly identical rows (should be rare + # now that the lemma UUID incorporates sense IDs). 
+ def _dedup(rows: list[dict]) -> list[dict]: + seen: dict = {} + for row in rows: + seen[row["id"]] = row + return list(seen.values()) + + lemma_rows = _dedup(lemma_rows) + sense_rows = _dedup(sense_rows) + wordform_rows = _dedup(wordform_rows) + raw_rows = _dedup(raw_rows) + sense_link_rows = _dedup(sense_link_rows) + + for chunk in _chunks(lemma_rows, len(_lemma_table.columns)): + stmt = pg_insert(_lemma_table).values(chunk) + await conn.execute(stmt.on_conflict_do_update( + index_elements=["id"], + set_={ + "headword": stmt.excluded.headword, + "pos_raw": stmt.excluded.pos_raw, + "pos_normalised": stmt.excluded.pos_normalised, + "gender": stmt.excluded.gender, + "tags": stmt.excluded.tags, + }, + )) + + for chunk in _chunks(sense_rows, len(_sense_table.columns)): + stmt = pg_insert(_sense_table).values(chunk) + await conn.execute(stmt.on_conflict_do_update( + index_elements=["id"], + set_={ + "sense_index": stmt.excluded.sense_index, + "gloss": stmt.excluded.gloss, + "topics": stmt.excluded.topics, + "tags": stmt.excluded.tags, + }, + )) + + for chunk in _chunks(wordform_rows, len(_wordform_table.columns)): + stmt = pg_insert(_wordform_table).values(chunk) + await conn.execute(stmt.on_conflict_do_update( + index_elements=["id"], + set_={"tags": stmt.excluded.tags}, + )) + + for chunk in _chunks(raw_rows, len(_raw_table.columns)): + stmt = pg_insert(_raw_table).values(chunk) + await conn.execute(stmt.on_conflict_do_update( + index_elements=["id"], + set_={"raw": stmt.excluded.raw}, + )) + + for chunk in _chunks(sense_link_rows, len(_sense_link_table.columns)): + await conn.execute( + pg_insert(_sense_link_table).values(chunk).on_conflict_do_nothing() + ) await conn.commit() @@ -350,12 +464,9 @@ async def run_import(lang_code: str, batch_size: int = 1000) -> None: try: async with engine.connect() as conn: - print(f"Deleting existing entries for language={lang_code!r}...") - await conn.execute( - _lemma_table.delete().where(_lemma_table.c.language == lang_code) - 
) - await conn.commit() - + # No upfront delete — rows are upserted so existing FK references + # (word bank entries, pack entries) are preserved across re-imports. + # To fully wipe and start fresh, run clear_dictionary.py first. print(f"Importing {jsonl_path} ...") batch: list[dict] = [] total_lemmas = 0