From f4e97f2f294086eda673ecb252fc99eca55b1f52 Mon Sep 17 00:00:00 2001 From: wilson Date: Wed, 15 Apr 2026 21:01:52 +0100 Subject: [PATCH] scripts: improve the import/clear scripts for the dictionary --- api/scripts/clear_dictionary.py | 99 +++++++++++++++++++ api/scripts/import_dictionary.py | 163 ++++++++++++++++++++++++++----- 2 files changed, 236 insertions(+), 26 deletions(-) create mode 100644 api/scripts/clear_dictionary.py diff --git a/api/scripts/clear_dictionary.py b/api/scripts/clear_dictionary.py new file mode 100644 index 0000000..f5cd490 --- /dev/null +++ b/api/scripts/clear_dictionary.py @@ -0,0 +1,99 @@ +#!/usr/bin/env python +""" +Clear all rows from dictionary tables and re-import from source JSONL files. + +Usage (from api/ directory): + uv run ./scripts/clear_dictionary.py + + # Clear tables only (destructive!); skip the re-import step + uv run ./scripts/clear_dictionary.py --no-import + +DATABASE_URL defaults to postgresql+asyncpg://langlearn:changeme@localhost:5432/langlearn +(override it via the environment if your docker-compose dev credentials differ). +""" + +import argparse +import asyncio +import os +import sys +from pathlib import Path + +import sqlalchemy as sa +from sqlalchemy.ext.asyncio import create_async_engine + +# Re-use table definitions and run_import from the sibling script so there is +# no duplication of schema knowledge. 
+sys.path.insert(0, str(Path(__file__).parent)) +from import_dictionary import ( # noqa: E402 + _LANG_FILE_MAP, + _lemma_table, + _raw_table, + _sense_link_table, + _sense_table, + _wordform_table, + run_import, +) + +# Delete order respects foreign-key dependencies: +# sense_link → sense +# sense → lemma +# wordform → lemma +# raw → lemma +# lemma (parent) +_DELETE_ORDER = [ + _sense_link_table, + _sense_table, + _wordform_table, + _raw_table, + _lemma_table, +] + + +async def clear_all(database_url: str) -> None: + engine = create_async_engine(database_url, echo=False) + try: + async with engine.connect() as conn: + print("Clearing all dictionary tables...") + for table in _DELETE_ORDER: + result = await conn.execute(sa.delete(table)) + print(f" Deleted {result.rowcount} rows from {table.name}") + await conn.commit() + print("All dictionary tables cleared.") + finally: + await engine.dispose() + + +async def main(run_reimport: bool, batch_size: int) -> None: + database_url = os.environ.get( + "DATABASE_URL", + "postgresql+asyncpg://langlearn:changeme@localhost:5432/langlearn", + ) + + await clear_all(database_url) + + if not run_reimport: + return + + for lang_code in _LANG_FILE_MAP: + print(f"\nRe-importing language={lang_code!r}...") + await run_import(lang_code, batch_size) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + description="Clear all dictionary tables and optionally re-import." 
+ ) + parser.add_argument( + "--no-import", + action="store_true", + help="Clear tables only; skip re-import.", + ) + parser.add_argument( + "--batch-size", + type=int, + default=1000, + help="Rows per commit during re-import (default: 1000)", + ) + args = parser.parse_args() + + asyncio.run(main(run_reimport=not args.no_import, batch_size=args.batch_size)) diff --git a/api/scripts/import_dictionary.py b/api/scripts/import_dictionary.py index 916465e..f9a0e06 100644 --- a/api/scripts/import_dictionary.py +++ b/api/scripts/import_dictionary.py @@ -23,7 +23,8 @@ from pathlib import Path import sqlalchemy as sa from sqlalchemy.dialects.postgresql import ARRAY, JSONB from sqlalchemy.dialects.postgresql import UUID as PG_UUID -from sqlalchemy.ext.asyncio import AsyncSession, async_sessionmaker, create_async_engine +from sqlalchemy.dialects.postgresql import insert as pg_insert +from sqlalchemy.ext.asyncio import create_async_engine _API_DIR = Path(__file__).parent.parent _REPO_ROOT = _API_DIR.parent @@ -69,6 +70,39 @@ _GENDER_MAP: dict[str, str] = { "common": "common", } +# --------------------------------------------------------------------------- +# Deterministic UUID namespace +# +# All dictionary entity IDs are derived via uuid5(namespace, natural_key) so +# that re-importing the same kaikki data always produces the same UUIDs. This +# means: +# • Re-imports update rows in place (upsert) without changing PKs, so +# learnable_word_bank_entry / word_bank_pack_entry FK references are never +# nullified by a re-import. +# • WordPacks developed in one environment can be transferred to another +# environment that imported from the same kaikki dataset, because sense UUIDs +# will be identical in both. 
+# --------------------------------------------------------------------------- + +_KAIKKI_UUID_NS = uuid.UUID("c7d8e9f0-1234-5678-abcd-ef0123456789") + + +def _lemma_uuid(lang_code: str, word: str, pos: str, etymology_number: int, sense_ids: list[str]) -> uuid.UUID: + # Include sorted sense IDs so that two kaikki entries with the same + # (word, pos, etymology_number) but different senses get distinct UUIDs. + sense_key = ":".join(sorted(sense_ids)) + return uuid.uuid5(_KAIKKI_UUID_NS, f"kaikki:lemma:{lang_code}:{word}:{pos}:{etymology_number}:{sense_key}") + + +def _sense_uuid(kaikki_sense_id: str) -> uuid.UUID: + return uuid.uuid5(_KAIKKI_UUID_NS, f"kaikki:sense:{kaikki_sense_id}") + + +def _wordform_uuid(lemma_id: uuid.UUID, form: str, tags: list[str]) -> uuid.UUID: + tags_key = ",".join(sorted(tags)) + return uuid.uuid5(_KAIKKI_UUID_NS, f"kaikki:wordform:{lemma_id}:{form}:{tags_key}") + + # --------------------------------------------------------------------------- # Standalone table definitions — no app imports, no Settings() call # --------------------------------------------------------------------------- @@ -158,16 +192,34 @@ def _parse_entry(record: dict, lang_code: str) -> dict | None: if not word: return None - pos_raw = (record.get("pos") or "").strip() + # Skip entries that are inflected forms of another lemma (e.g. conjugations, + # plurals). These appear as top-level JSONL records but are already captured + # as wordforms via the parent lemma's `forms` array. + for sense in record.get("senses") or []: + if sense.get("form_of"): + return None - lemma_id = uuid.uuid4() + pos_raw = (record.get("pos") or "").strip() + etymology_number = record.get("etymology_number", 0) + raw_senses = record.get("senses") or [] + + # Collect kaikki sense IDs up front so the lemma UUID can incorporate them. + # This disambiguates entries that share (word, pos, etymology_number) but + # have genuinely different senses — kaikki has ~349 such cases in French. 
+ kaikki_sense_ids = [ + s.get("id") or f"{lang_code}:{word}:{pos_raw}:{etymology_number}:{i}" + for i, s in enumerate(raw_senses) + ] + + lemma_id = _lemma_uuid(lang_code, word, pos_raw, etymology_number, kaikki_sense_ids) _GENDER_TAGS = {"masculine", "feminine", "neuter"} gender: str | None = None senses = [] sense_links = [] - for i, sense_record in enumerate(record.get("senses") or []): - sense_id = uuid.uuid4() + for i, sense_record in enumerate(raw_senses): + kaikki_sense_id = kaikki_sense_ids[i] + sense_id = _sense_uuid(kaikki_sense_id) glosses = sense_record.get("glosses") or [] gloss = glosses[0] if glosses else "" topics = sense_record.get("topics") or [] @@ -192,25 +244,34 @@ def _parse_entry(record: dict, lang_code: str) -> dict | None: for link_pair in (sense_record.get("links") or []): if isinstance(link_pair, list) and len(link_pair) == 2: + link_text, link_target = link_pair[0], link_pair[1] + link_id = uuid.uuid5( + _KAIKKI_UUID_NS, + f"kaikki:link:{sense_id}:{link_text}:{link_target}", + ) sense_links.append( { - "id": uuid.uuid4(), + "id": link_id, "sense_id": sense_id, - "link_text": link_pair[0], - "link_target": link_pair[1], + "link_text": link_text, + "link_target": link_target, "target_lemma_id": None, } ) + _METADATA_FORM_TAGS = {"table-tags", "inflection-template"} + wordforms = [] for f in record.get("forms") or []: form_text = (f.get("form") or "").strip() if not form_text or form_text == word: continue form_tags = f.get("tags") or [] + if _METADATA_FORM_TAGS.intersection(form_tags): + continue wordforms.append( { - "id": uuid.uuid4(), + "id": _wordform_uuid(lemma_id, form_text, form_tags), "lemma_id": lemma_id, "form": form_text, "tags": form_tags, @@ -231,7 +292,7 @@ def _parse_entry(record: dict, lang_code: str) -> dict | None: "sense_links": sense_links, "wordforms": wordforms, "raw": { - "id": uuid.uuid4(), + "id": uuid.uuid5(_KAIKKI_UUID_NS, f"kaikki:raw:{lemma_id}"), "lemma_id": lemma_id, "language": lang_code, "raw": record, @@ 
-251,16 +312,69 @@ async def _flush_batch(conn: sa.ext.asyncio.AsyncConnection, batch: list[dict]) wordform_rows = [w for e in batch for w in e["wordforms"]] raw_rows = [e["raw"] for e in batch] - if lemma_rows: - await conn.execute(_lemma_table.insert(), lemma_rows) - if sense_rows: - await conn.execute(_sense_table.insert(), sense_rows) - if sense_link_rows: - await conn.execute(_sense_link_table.insert(), sense_link_rows) - if wordform_rows: - await conn.execute(_wordform_table.insert(), wordform_rows) - if raw_rows: - await conn.execute(_raw_table.insert(), raw_rows) + # asyncpg caps query parameters at 32767. Split each row list into chunks + # sized so that rows × columns stays comfortably under that limit. + def _chunks(rows: list[dict], n_cols: int) -> list[list[dict]]: + size = max(1, 32767 // n_cols) + return [rows[i : i + size] for i in range(0, len(rows), size)] + + # Deduplicate by id: safety net for truly identical rows (should be rare + # now that the lemma UUID incorporates sense IDs). 
+ def _dedup(rows: list[dict]) -> list[dict]: + seen: dict = {} + for row in rows: + seen[row["id"]] = row + return list(seen.values()) + + lemma_rows = _dedup(lemma_rows) + sense_rows = _dedup(sense_rows) + wordform_rows = _dedup(wordform_rows) + raw_rows = _dedup(raw_rows) + sense_link_rows = _dedup(sense_link_rows) + + for chunk in _chunks(lemma_rows, len(_lemma_table.columns)): + stmt = pg_insert(_lemma_table).values(chunk) + await conn.execute(stmt.on_conflict_do_update( + index_elements=["id"], + set_={ + "headword": stmt.excluded.headword, + "pos_raw": stmt.excluded.pos_raw, + "pos_normalised": stmt.excluded.pos_normalised, + "gender": stmt.excluded.gender, + "tags": stmt.excluded.tags, + }, + )) + + for chunk in _chunks(sense_rows, len(_sense_table.columns)): + stmt = pg_insert(_sense_table).values(chunk) + await conn.execute(stmt.on_conflict_do_update( + index_elements=["id"], + set_={ + "sense_index": stmt.excluded.sense_index, + "gloss": stmt.excluded.gloss, + "topics": stmt.excluded.topics, + "tags": stmt.excluded.tags, + }, + )) + + for chunk in _chunks(wordform_rows, len(_wordform_table.columns)): + stmt = pg_insert(_wordform_table).values(chunk) + await conn.execute(stmt.on_conflict_do_update( + index_elements=["id"], + set_={"tags": stmt.excluded.tags}, + )) + + for chunk in _chunks(raw_rows, len(_raw_table.columns)): + stmt = pg_insert(_raw_table).values(chunk) + await conn.execute(stmt.on_conflict_do_update( + index_elements=["id"], + set_={"raw": stmt.excluded.raw}, + )) + + for chunk in _chunks(sense_link_rows, len(_sense_link_table.columns)): + await conn.execute( + pg_insert(_sense_link_table).values(chunk).on_conflict_do_nothing() + ) await conn.commit() @@ -350,12 +464,9 @@ async def run_import(lang_code: str, batch_size: int = 1000) -> None: try: async with engine.connect() as conn: - print(f"Deleting existing entries for language={lang_code!r}...") - await conn.execute( - _lemma_table.delete().where(_lemma_table.c.language == lang_code) - 
) - await conn.commit() - + # No upfront delete — rows are upserted so existing FK references + # (word bank entries, pack entries) are preserved across re-imports. + # To fully wipe and start fresh, run clear_dictionary.py first. print(f"Importing {jsonl_path} ...") batch: list[dict] = [] total_lemmas = 0