scripts: improve the import/clear scripts for the dictionary

This commit is contained in:
wilson 2026-04-15 21:01:52 +01:00
parent c6fab5fdbb
commit f4e97f2f29
2 changed files with 236 additions and 26 deletions

View file

@ -0,0 +1,99 @@
#!/usr/bin/env python
"""
Clear all rows from dictionary tables and re-import from source JSONL files.
Usage (from api/ directory):
uv run ./scripts/clear_dictionary.py
# Dry-run: clear only, no re-import
uv run ./scripts/clear_dictionary.py --no-import
DATABASE_URL defaults to postgresql+asyncpg://langlearn:changeme@localhost:5432/langlearn
which is intended to match the docker-compose dev credentials when the DB port is exposed on the host.
"""
import argparse
import asyncio
import os
import sys
from pathlib import Path
import sqlalchemy as sa
from sqlalchemy.ext.asyncio import create_async_engine
# Re-use table definitions and run_import from the sibling script so there is
# no duplication of schema knowledge.
sys.path.insert(0, str(Path(__file__).parent))
from import_dictionary import ( # noqa: E402
_LANG_FILE_MAP,
_lemma_table,
_raw_table,
_sense_link_table,
_sense_table,
_wordform_table,
run_import,
)
# Delete order respects foreign-key dependencies — children are deleted
# before their parents so no statement ever violates an FK constraint:
#   sense_link → sense
#   sense → lemma
#   wordform → lemma
#   raw → lemma
#   lemma (parent, deleted last)
_DELETE_ORDER = [
    _sense_link_table,
    _sense_table,
    _wordform_table,
    _raw_table,
    _lemma_table,
]
async def clear_all(database_url: str) -> None:
    """Delete every row from the dictionary tables.

    Tables are emptied in `_DELETE_ORDER` (children before parents) inside a
    single transaction, then committed. The engine is always disposed, even
    if a delete fails.

    Args:
        database_url: SQLAlchemy async URL for the target database.
    """
    engine = create_async_engine(database_url, echo=False)
    try:
        async with engine.connect() as conn:
            print("Clearing all dictionary tables...")
            for tbl in _DELETE_ORDER:
                outcome = await conn.execute(sa.delete(tbl))
                print(f" Deleted {outcome.rowcount} rows from {tbl.name}")
            # One commit for the whole wipe: either all tables are cleared
            # or none are.
            await conn.commit()
            print("All dictionary tables cleared.")
    finally:
        await engine.dispose()
async def main(run_reimport: bool, batch_size: int) -> None:
    """Wipe the dictionary tables, then (unless disabled) re-import.

    Args:
        run_reimport: when False, only clear — skip the re-import phase.
        batch_size: rows per commit passed through to ``run_import``.
    """
    # Connection URL comes from the environment, with a local dev fallback.
    db_url = os.environ.get(
        "DATABASE_URL",
        "postgresql+asyncpg://langlearn:changeme@localhost:5432/langlearn",
    )
    await clear_all(db_url)
    if run_reimport:
        # Re-import every configured language from its source JSONL file.
        for code in _LANG_FILE_MAP:
            print(f"\nRe-importing language={code!r}...")
            await run_import(code, batch_size)
if __name__ == "__main__":
    cli = argparse.ArgumentParser(
        description="Clear all dictionary tables and optionally re-import."
    )
    cli.add_argument(
        "--no-import",
        action="store_true",
        help="Clear tables only; skip re-import.",
    )
    cli.add_argument(
        "--batch-size",
        type=int,
        default=1000,
        help="Rows per commit during re-import (default: 1000)",
    )
    opts = cli.parse_args()
    # Default behaviour is clear + re-import; --no-import turns the
    # re-import phase off.
    asyncio.run(main(run_reimport=not opts.no_import, batch_size=opts.batch_size))

View file

@ -23,7 +23,8 @@ from pathlib import Path
import sqlalchemy as sa
from sqlalchemy.dialects.postgresql import ARRAY, JSONB
from sqlalchemy.dialects.postgresql import UUID as PG_UUID
from sqlalchemy.ext.asyncio import AsyncSession, async_sessionmaker, create_async_engine
from sqlalchemy.dialects.postgresql import insert as pg_insert
from sqlalchemy.ext.asyncio import create_async_engine
_API_DIR = Path(__file__).parent.parent
_REPO_ROOT = _API_DIR.parent
@ -69,6 +70,39 @@ _GENDER_MAP: dict[str, str] = {
"common": "common",
}
# ---------------------------------------------------------------------------
# Deterministic UUID namespace
#
# All dictionary entity IDs are derived via uuid5(namespace, natural_key) so
# that re-importing the same kaikki data always produces the same UUIDs. This
# means:
# • Re-imports update rows in place (upsert) without changing PKs, so
# learnable_word_bank_entry / word_bank_pack_entry FK references are never
# nullified by a re-import.
# • WordPacks developed in one environment can be transferred to another
# environment that imported from the same kaikki dataset, because sense UUIDs
# will be identical in both.
# ---------------------------------------------------------------------------
_KAIKKI_UUID_NS = uuid.UUID("c7d8e9f0-1234-5678-abcd-ef0123456789")
def _lemma_uuid(lang_code: str, word: str, pos: str, etymology_number: int, sense_ids: list[str]) -> uuid.UUID:
# Include sorted sense IDs so that two kaikki entries with the same
# (word, pos, etymology_number) but different senses get distinct UUIDs.
sense_key = ":".join(sorted(sense_ids))
return uuid.uuid5(_KAIKKI_UUID_NS, f"kaikki:lemma:{lang_code}:{word}:{pos}:{etymology_number}:{sense_key}")
def _sense_uuid(kaikki_sense_id: str) -> uuid.UUID:
return uuid.uuid5(_KAIKKI_UUID_NS, f"kaikki:sense:{kaikki_sense_id}")
def _wordform_uuid(lemma_id: uuid.UUID, form: str, tags: list[str]) -> uuid.UUID:
tags_key = ",".join(sorted(tags))
return uuid.uuid5(_KAIKKI_UUID_NS, f"kaikki:wordform:{lemma_id}:{form}:{tags_key}")
# ---------------------------------------------------------------------------
# Standalone table definitions — no app imports, no Settings() call
# ---------------------------------------------------------------------------
@ -158,16 +192,34 @@ def _parse_entry(record: dict, lang_code: str) -> dict | None:
if not word:
return None
pos_raw = (record.get("pos") or "").strip()
# Skip entries that are inflected forms of another lemma (e.g. conjugations,
# plurals). These appear as top-level JSONL records but are already captured
# as wordforms via the parent lemma's `forms` array.
for sense in record.get("senses") or []:
if sense.get("form_of"):
return None
lemma_id = uuid.uuid4()
pos_raw = (record.get("pos") or "").strip()
etymology_number = record.get("etymology_number", 0)
raw_senses = record.get("senses") or []
# Collect kaikki sense IDs up front so the lemma UUID can incorporate them.
# This disambiguates entries that share (word, pos, etymology_number) but
# have genuinely different senses — kaikki has ~349 such cases in French.
kaikki_sense_ids = [
s.get("id") or f"{lang_code}:{word}:{pos_raw}:{etymology_number}:{i}"
for i, s in enumerate(raw_senses)
]
lemma_id = _lemma_uuid(lang_code, word, pos_raw, etymology_number, kaikki_sense_ids)
_GENDER_TAGS = {"masculine", "feminine", "neuter"}
gender: str | None = None
senses = []
sense_links = []
for i, sense_record in enumerate(record.get("senses") or []):
sense_id = uuid.uuid4()
for i, sense_record in enumerate(raw_senses):
kaikki_sense_id = kaikki_sense_ids[i]
sense_id = _sense_uuid(kaikki_sense_id)
glosses = sense_record.get("glosses") or []
gloss = glosses[0] if glosses else ""
topics = sense_record.get("topics") or []
@ -192,25 +244,34 @@ def _parse_entry(record: dict, lang_code: str) -> dict | None:
for link_pair in (sense_record.get("links") or []):
if isinstance(link_pair, list) and len(link_pair) == 2:
link_text, link_target = link_pair[0], link_pair[1]
link_id = uuid.uuid5(
_KAIKKI_UUID_NS,
f"kaikki:link:{sense_id}:{link_text}:{link_target}",
)
sense_links.append(
{
"id": uuid.uuid4(),
"id": link_id,
"sense_id": sense_id,
"link_text": link_pair[0],
"link_target": link_pair[1],
"link_text": link_text,
"link_target": link_target,
"target_lemma_id": None,
}
)
_METADATA_FORM_TAGS = {"table-tags", "inflection-template"}
wordforms = []
for f in record.get("forms") or []:
form_text = (f.get("form") or "").strip()
if not form_text or form_text == word:
continue
form_tags = f.get("tags") or []
if _METADATA_FORM_TAGS.intersection(form_tags):
continue
wordforms.append(
{
"id": uuid.uuid4(),
"id": _wordform_uuid(lemma_id, form_text, form_tags),
"lemma_id": lemma_id,
"form": form_text,
"tags": form_tags,
@ -231,7 +292,7 @@ def _parse_entry(record: dict, lang_code: str) -> dict | None:
"sense_links": sense_links,
"wordforms": wordforms,
"raw": {
"id": uuid.uuid4(),
"id": uuid.uuid5(_KAIKKI_UUID_NS, f"kaikki:raw:{lemma_id}"),
"lemma_id": lemma_id,
"language": lang_code,
"raw": record,
@ -251,16 +312,69 @@ async def _flush_batch(conn: sa.ext.asyncio.AsyncConnection, batch: list[dict])
wordform_rows = [w for e in batch for w in e["wordforms"]]
raw_rows = [e["raw"] for e in batch]
if lemma_rows:
await conn.execute(_lemma_table.insert(), lemma_rows)
if sense_rows:
await conn.execute(_sense_table.insert(), sense_rows)
if sense_link_rows:
await conn.execute(_sense_link_table.insert(), sense_link_rows)
if wordform_rows:
await conn.execute(_wordform_table.insert(), wordform_rows)
if raw_rows:
await conn.execute(_raw_table.insert(), raw_rows)
# asyncpg caps query parameters at 32767. Split each row list into chunks
# sized so that rows × columns stays comfortably under that limit.
def _chunks(rows: list[dict], n_cols: int) -> list[list[dict]]:
size = max(1, 32767 // n_cols)
return [rows[i : i + size] for i in range(0, len(rows), size)]
# Deduplicate by id: safety net for truly identical rows (should be rare
# now that the lemma UUID incorporates sense IDs).
def _dedup(rows: list[dict]) -> list[dict]:
seen: dict = {}
for row in rows:
seen[row["id"]] = row
return list(seen.values())
lemma_rows = _dedup(lemma_rows)
sense_rows = _dedup(sense_rows)
wordform_rows = _dedup(wordform_rows)
raw_rows = _dedup(raw_rows)
sense_link_rows = _dedup(sense_link_rows)
for chunk in _chunks(lemma_rows, len(_lemma_table.columns)):
stmt = pg_insert(_lemma_table).values(chunk)
await conn.execute(stmt.on_conflict_do_update(
index_elements=["id"],
set_={
"headword": stmt.excluded.headword,
"pos_raw": stmt.excluded.pos_raw,
"pos_normalised": stmt.excluded.pos_normalised,
"gender": stmt.excluded.gender,
"tags": stmt.excluded.tags,
},
))
for chunk in _chunks(sense_rows, len(_sense_table.columns)):
stmt = pg_insert(_sense_table).values(chunk)
await conn.execute(stmt.on_conflict_do_update(
index_elements=["id"],
set_={
"sense_index": stmt.excluded.sense_index,
"gloss": stmt.excluded.gloss,
"topics": stmt.excluded.topics,
"tags": stmt.excluded.tags,
},
))
for chunk in _chunks(wordform_rows, len(_wordform_table.columns)):
stmt = pg_insert(_wordform_table).values(chunk)
await conn.execute(stmt.on_conflict_do_update(
index_elements=["id"],
set_={"tags": stmt.excluded.tags},
))
for chunk in _chunks(raw_rows, len(_raw_table.columns)):
stmt = pg_insert(_raw_table).values(chunk)
await conn.execute(stmt.on_conflict_do_update(
index_elements=["id"],
set_={"raw": stmt.excluded.raw},
))
for chunk in _chunks(sense_link_rows, len(_sense_link_table.columns)):
await conn.execute(
pg_insert(_sense_link_table).values(chunk).on_conflict_do_nothing()
)
await conn.commit()
@ -350,12 +464,9 @@ async def run_import(lang_code: str, batch_size: int = 1000) -> None:
try:
async with engine.connect() as conn:
print(f"Deleting existing entries for language={lang_code!r}...")
await conn.execute(
_lemma_table.delete().where(_lemma_table.c.language == lang_code)
)
await conn.commit()
# No upfront delete — rows are upserted so existing FK references
# (word bank entries, pack entries) are preserved across re-imports.
# To fully wipe and start fresh, run clear_dictionary.py first.
print(f"Importing {jsonl_path} ...")
batch: list[dict] = []
total_lemmas = 0