scripts: improve the import/clean scripts for the dictionary
This commit is contained in:
parent
c6fab5fdbb
commit
f4e97f2f29
2 changed files with 236 additions and 26 deletions
99
api/scripts/clear_dictionary.py
Normal file
99
api/scripts/clear_dictionary.py
Normal file
|
|
@ -0,0 +1,99 @@
|
|||
#!/usr/bin/env python
|
||||
"""
|
||||
Clear all rows from dictionary tables and re-import from source JSONL files.
|
||||
|
||||
Usage (from api/ directory):
|
||||
uv run ./scripts/clear_dictionary.py
|
||||
|
||||
# Clear only, skip the re-import (NOT a dry run: all rows are still deleted)
|
||||
uv run ./scripts/clear_dictionary.py --no-import
|
||||
|
||||
DATABASE_URL defaults to postgresql+asyncpg://langlearn:changeme@localhost:5432/langlearn
|
||||
which matches the docker-compose dev credentials when the DB port is exposed on the host.
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import asyncio
|
||||
import os
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
import sqlalchemy as sa
|
||||
from sqlalchemy.ext.asyncio import create_async_engine
|
||||
|
||||
# Re-use table definitions and run_import from the sibling script so there is
|
||||
# no duplication of schema knowledge.
|
||||
sys.path.insert(0, str(Path(__file__).parent))
|
||||
from import_dictionary import ( # noqa: E402
|
||||
_LANG_FILE_MAP,
|
||||
_lemma_table,
|
||||
_raw_table,
|
||||
_sense_link_table,
|
||||
_sense_table,
|
||||
_wordform_table,
|
||||
run_import,
|
||||
)
|
||||
|
||||
# Tables are emptied child-first so that no DELETE trips a foreign-key
# constraint.  Dependencies (child → parent):
#   sense_link → sense
#   sense      → lemma
#   wordform   → lemma
#   raw        → lemma
#   lemma      (parent of everything above; deleted last)
_DELETE_ORDER = [_sense_link_table, _sense_table, _wordform_table, _raw_table, _lemma_table]
|
||||
|
||||
|
||||
async def clear_all(database_url: str) -> None:
    """Delete every row from each dictionary table, then commit once.

    Tables are processed in ``_DELETE_ORDER`` and the number of rows removed
    from each one is printed.  All deletes share a single connection and are
    committed together after the last table has been processed.

    Args:
        database_url: SQLAlchemy URL handed to ``create_async_engine``.
    """
    engine = create_async_engine(database_url, echo=False)
    try:
        async with engine.connect() as connection:
            print("Clearing all dictionary tables...")
            for tbl in _DELETE_ORDER:
                outcome = await connection.execute(sa.delete(tbl))
                print(f" Deleted {outcome.rowcount} rows from {tbl.name}")
            await connection.commit()
        print("All dictionary tables cleared.")
    finally:
        # Always release the engine's connections, even if a delete failed.
        await engine.dispose()
|
||||
|
||||
|
||||
async def main(run_reimport: bool, batch_size: int) -> None:
    """Wipe the dictionary tables and, if requested, re-import every language.

    Args:
        run_reimport: When false, stop after the tables have been cleared.
        batch_size: Forwarded to ``run_import`` for each language.
    """
    fallback_url = "postgresql+asyncpg://langlearn:changeme@localhost:5432/langlearn"
    await clear_all(os.environ.get("DATABASE_URL", fallback_url))

    if run_reimport:
        # NOTE(review): run_import is not given the URL resolved above, so it
        # must pick its own connection settings — presumably from the same
        # DATABASE_URL variable; confirm both scripts agree on the default.
        for code in _LANG_FILE_MAP:
            print(f"\nRe-importing language={code!r}...")
            await run_import(code, batch_size)
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Command-line entry point: parse the two options and hand off to main().
    cli = argparse.ArgumentParser(
        description="Clear all dictionary tables and optionally re-import."
    )
    cli.add_argument(
        "--no-import",
        action="store_true",
        help="Clear tables only; skip re-import.",
    )
    cli.add_argument(
        "--batch-size",
        type=int,
        default=1000,
        help="Rows per commit during re-import (default: 1000)",
    )
    options = cli.parse_args()

    # The flag is phrased negatively on the CLI but positively in main().
    should_reimport = not options.no_import
    asyncio.run(main(run_reimport=should_reimport, batch_size=options.batch_size))
|
||||
|
|
@ -23,7 +23,8 @@ from pathlib import Path
|
|||
import sqlalchemy as sa
|
||||
from sqlalchemy.dialects.postgresql import ARRAY, JSONB
|
||||
from sqlalchemy.dialects.postgresql import UUID as PG_UUID
|
||||
from sqlalchemy.ext.asyncio import AsyncSession, async_sessionmaker, create_async_engine
|
||||
from sqlalchemy.dialects.postgresql import insert as pg_insert
|
||||
from sqlalchemy.ext.asyncio import create_async_engine
|
||||
|
||||
_API_DIR = Path(__file__).parent.parent
|
||||
_REPO_ROOT = _API_DIR.parent
|
||||
|
|
@ -69,6 +70,39 @@ _GENDER_MAP: dict[str, str] = {
|
|||
"common": "common",
|
||||
}
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Deterministic UUID namespace
|
||||
#
|
||||
# All dictionary entity IDs are derived via uuid5(namespace, natural_key) so
|
||||
# that re-importing the same kaikki data always produces the same UUIDs. This
|
||||
# means:
|
||||
# • Re-imports update rows in place (upsert) without changing PKs, so
|
||||
# learnable_word_bank_entry / word_bank_pack_entry FK references are never
|
||||
# nullified by a re-import.
|
||||
# • WordPacks developed in one environment can be transferred to another
|
||||
# environment that imported from the same kaikki dataset, because sense UUIDs
|
||||
# will be identical in both.
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
_KAIKKI_UUID_NS = uuid.UUID("c7d8e9f0-1234-5678-abcd-ef0123456789")


def _lemma_uuid(
    lang_code: str,
    word: str,
    pos: str,
    etymology_number: int,
    sense_ids: list[str],
) -> uuid.UUID:
    """Return the deterministic UUID for a lemma.

    The UUID is a uuid5 under ``_KAIKKI_UUID_NS`` of the language, headword,
    part of speech, etymology number and the sorted sense IDs.  Sorting makes
    the result independent of the order in which sense IDs are supplied;
    including them keeps two entries that share (word, pos, etymology_number)
    but carry different senses from colliding.
    """
    headword_key = f"kaikki:lemma:{lang_code}:{word}:{pos}:{etymology_number}"
    senses_key = ":".join(sorted(sense_ids))
    return uuid.uuid5(_KAIKKI_UUID_NS, f"{headword_key}:{senses_key}")
|
||||
|
||||
|
||||
def _sense_uuid(kaikki_sense_id: str) -> uuid.UUID:
    """Return the deterministic UUID for a sense, derived from its kaikki sense ID."""
    name = f"kaikki:sense:{kaikki_sense_id}"
    return uuid.uuid5(_KAIKKI_UUID_NS, name)
|
||||
|
||||
|
||||
def _wordform_uuid(lemma_id: uuid.UUID, form: str, tags: list[str]) -> uuid.UUID:
    """Return the deterministic UUID for a word form of a lemma.

    The key combines the owning lemma's ID, the form text and the form's tags.
    Tags are sorted first so the same tag set always yields the same UUID,
    whatever order the tags arrive in.
    """
    canonical_tags = ",".join(sorted(tags))
    name = f"kaikki:wordform:{lemma_id}:{form}:{canonical_tags}"
    return uuid.uuid5(_KAIKKI_UUID_NS, name)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Standalone table definitions — no app imports, no Settings() call
|
||||
# ---------------------------------------------------------------------------
|
||||
|
|
@ -158,16 +192,34 @@ def _parse_entry(record: dict, lang_code: str) -> dict | None:
|
|||
if not word:
|
||||
return None
|
||||
|
||||
pos_raw = (record.get("pos") or "").strip()
|
||||
# Skip entries that are inflected forms of another lemma (e.g. conjugations,
|
||||
# plurals). These appear as top-level JSONL records but are already captured
|
||||
# as wordforms via the parent lemma's `forms` array.
|
||||
for sense in record.get("senses") or []:
|
||||
if sense.get("form_of"):
|
||||
return None
|
||||
|
||||
lemma_id = uuid.uuid4()
|
||||
pos_raw = (record.get("pos") or "").strip()
|
||||
etymology_number = record.get("etymology_number", 0)
|
||||
raw_senses = record.get("senses") or []
|
||||
|
||||
# Collect kaikki sense IDs up front so the lemma UUID can incorporate them.
|
||||
# This disambiguates entries that share (word, pos, etymology_number) but
|
||||
# have genuinely different senses — kaikki has ~349 such cases in French.
|
||||
kaikki_sense_ids = [
|
||||
s.get("id") or f"{lang_code}:{word}:{pos_raw}:{etymology_number}:{i}"
|
||||
for i, s in enumerate(raw_senses)
|
||||
]
|
||||
|
||||
lemma_id = _lemma_uuid(lang_code, word, pos_raw, etymology_number, kaikki_sense_ids)
|
||||
|
||||
_GENDER_TAGS = {"masculine", "feminine", "neuter"}
|
||||
gender: str | None = None
|
||||
senses = []
|
||||
sense_links = []
|
||||
for i, sense_record in enumerate(record.get("senses") or []):
|
||||
sense_id = uuid.uuid4()
|
||||
for i, sense_record in enumerate(raw_senses):
|
||||
kaikki_sense_id = kaikki_sense_ids[i]
|
||||
sense_id = _sense_uuid(kaikki_sense_id)
|
||||
glosses = sense_record.get("glosses") or []
|
||||
gloss = glosses[0] if glosses else ""
|
||||
topics = sense_record.get("topics") or []
|
||||
|
|
@ -192,25 +244,34 @@ def _parse_entry(record: dict, lang_code: str) -> dict | None:
|
|||
|
||||
for link_pair in (sense_record.get("links") or []):
|
||||
if isinstance(link_pair, list) and len(link_pair) == 2:
|
||||
link_text, link_target = link_pair[0], link_pair[1]
|
||||
link_id = uuid.uuid5(
|
||||
_KAIKKI_UUID_NS,
|
||||
f"kaikki:link:{sense_id}:{link_text}:{link_target}",
|
||||
)
|
||||
sense_links.append(
|
||||
{
|
||||
"id": uuid.uuid4(),
|
||||
"id": link_id,
|
||||
"sense_id": sense_id,
|
||||
"link_text": link_pair[0],
|
||||
"link_target": link_pair[1],
|
||||
"link_text": link_text,
|
||||
"link_target": link_target,
|
||||
"target_lemma_id": None,
|
||||
}
|
||||
)
|
||||
|
||||
_METADATA_FORM_TAGS = {"table-tags", "inflection-template"}
|
||||
|
||||
wordforms = []
|
||||
for f in record.get("forms") or []:
|
||||
form_text = (f.get("form") or "").strip()
|
||||
if not form_text or form_text == word:
|
||||
continue
|
||||
form_tags = f.get("tags") or []
|
||||
if _METADATA_FORM_TAGS.intersection(form_tags):
|
||||
continue
|
||||
wordforms.append(
|
||||
{
|
||||
"id": uuid.uuid4(),
|
||||
"id": _wordform_uuid(lemma_id, form_text, form_tags),
|
||||
"lemma_id": lemma_id,
|
||||
"form": form_text,
|
||||
"tags": form_tags,
|
||||
|
|
@ -231,7 +292,7 @@ def _parse_entry(record: dict, lang_code: str) -> dict | None:
|
|||
"sense_links": sense_links,
|
||||
"wordforms": wordforms,
|
||||
"raw": {
|
||||
"id": uuid.uuid4(),
|
||||
"id": uuid.uuid5(_KAIKKI_UUID_NS, f"kaikki:raw:{lemma_id}"),
|
||||
"lemma_id": lemma_id,
|
||||
"language": lang_code,
|
||||
"raw": record,
|
||||
|
|
@ -251,16 +312,69 @@ async def _flush_batch(conn: sa.ext.asyncio.AsyncConnection, batch: list[dict])
|
|||
wordform_rows = [w for e in batch for w in e["wordforms"]]
|
||||
raw_rows = [e["raw"] for e in batch]
|
||||
|
||||
if lemma_rows:
|
||||
await conn.execute(_lemma_table.insert(), lemma_rows)
|
||||
if sense_rows:
|
||||
await conn.execute(_sense_table.insert(), sense_rows)
|
||||
if sense_link_rows:
|
||||
await conn.execute(_sense_link_table.insert(), sense_link_rows)
|
||||
if wordform_rows:
|
||||
await conn.execute(_wordform_table.insert(), wordform_rows)
|
||||
if raw_rows:
|
||||
await conn.execute(_raw_table.insert(), raw_rows)
|
||||
# asyncpg caps query parameters at 32767, and a multi-row INSERT binds one
# parameter per column per row.  Slice each row list so that rows × columns
# never exceeds that cap.
def _chunks(rows: list[dict], n_cols: int) -> list[list[dict]]:
    """Split *rows* into consecutive slices small enough for one INSERT.

    Each slice holds at most ``32767 // n_cols`` rows, and never fewer than
    one row per slice even when the table is wider than the cap.
    """
    rows_per_chunk = max(1, 32767 // n_cols)
    starts = range(0, len(rows), rows_per_chunk)
    return [rows[start : start + rows_per_chunk] for start in starts]
|
||||
|
||||
# Collapse rows that share an id.  This is a safety net for truly identical
# rows, which should be rare now that the lemma UUID incorporates sense IDs.
def _dedup(rows: list[dict]) -> list[dict]:
    """Return *rows* with at most one row per ``"id"`` value.

    When several rows share an id, the last one wins but it keeps the
    position at which that id first appeared.
    """
    latest_by_id = {row["id"]: row for row in rows}
    return list(latest_by_id.values())
|
||||
|
||||
lemma_rows = _dedup(lemma_rows)
|
||||
sense_rows = _dedup(sense_rows)
|
||||
wordform_rows = _dedup(wordform_rows)
|
||||
raw_rows = _dedup(raw_rows)
|
||||
sense_link_rows = _dedup(sense_link_rows)
|
||||
|
||||
for chunk in _chunks(lemma_rows, len(_lemma_table.columns)):
|
||||
stmt = pg_insert(_lemma_table).values(chunk)
|
||||
await conn.execute(stmt.on_conflict_do_update(
|
||||
index_elements=["id"],
|
||||
set_={
|
||||
"headword": stmt.excluded.headword,
|
||||
"pos_raw": stmt.excluded.pos_raw,
|
||||
"pos_normalised": stmt.excluded.pos_normalised,
|
||||
"gender": stmt.excluded.gender,
|
||||
"tags": stmt.excluded.tags,
|
||||
},
|
||||
))
|
||||
|
||||
for chunk in _chunks(sense_rows, len(_sense_table.columns)):
|
||||
stmt = pg_insert(_sense_table).values(chunk)
|
||||
await conn.execute(stmt.on_conflict_do_update(
|
||||
index_elements=["id"],
|
||||
set_={
|
||||
"sense_index": stmt.excluded.sense_index,
|
||||
"gloss": stmt.excluded.gloss,
|
||||
"topics": stmt.excluded.topics,
|
||||
"tags": stmt.excluded.tags,
|
||||
},
|
||||
))
|
||||
|
||||
for chunk in _chunks(wordform_rows, len(_wordform_table.columns)):
|
||||
stmt = pg_insert(_wordform_table).values(chunk)
|
||||
await conn.execute(stmt.on_conflict_do_update(
|
||||
index_elements=["id"],
|
||||
set_={"tags": stmt.excluded.tags},
|
||||
))
|
||||
|
||||
for chunk in _chunks(raw_rows, len(_raw_table.columns)):
|
||||
stmt = pg_insert(_raw_table).values(chunk)
|
||||
await conn.execute(stmt.on_conflict_do_update(
|
||||
index_elements=["id"],
|
||||
set_={"raw": stmt.excluded.raw},
|
||||
))
|
||||
|
||||
for chunk in _chunks(sense_link_rows, len(_sense_link_table.columns)):
|
||||
await conn.execute(
|
||||
pg_insert(_sense_link_table).values(chunk).on_conflict_do_nothing()
|
||||
)
|
||||
|
||||
await conn.commit()
|
||||
|
||||
|
|
@ -350,12 +464,9 @@ async def run_import(lang_code: str, batch_size: int = 1000) -> None:
|
|||
|
||||
try:
|
||||
async with engine.connect() as conn:
|
||||
print(f"Deleting existing entries for language={lang_code!r}...")
|
||||
await conn.execute(
|
||||
_lemma_table.delete().where(_lemma_table.c.language == lang_code)
|
||||
)
|
||||
await conn.commit()
|
||||
|
||||
# No upfront delete — rows are upserted so existing FK references
|
||||
# (word bank entries, pack entries) are preserved across re-imports.
|
||||
# To fully wipe and start fresh, run clear_dictionary.py first.
|
||||
print(f"Importing {jsonl_path} ...")
|
||||
batch: list[dict] = []
|
||||
total_lemmas = 0
|
||||
|
|
|
|||
Loading…
Reference in a new issue