scripts: improve the import/clear scripts for the dictionary
This commit is contained in:
parent
c6fab5fdbb
commit
f4e97f2f29
2 changed files with 236 additions and 26 deletions
99
api/scripts/clear_dictionary.py
Normal file
99
api/scripts/clear_dictionary.py
Normal file
|
|
@ -0,0 +1,99 @@
|
||||||
|
#!/usr/bin/env python
"""
Clear all rows from dictionary tables and re-import from source JSONL files.

Usage (from api/ directory):
    uv run ./scripts/clear_dictionary.py

    # Dry-run: clear only, no re-import
    uv run ./scripts/clear_dictionary.py --no-import

DATABASE_URL defaults to postgresql+asyncpg://langlearn:changeme@localhost:5432/langlearn
which matches the docker-compose dev credentials when the DB port is exposed on the host.
"""

import argparse
import asyncio
import os
import sys
from pathlib import Path

import sqlalchemy as sa
from sqlalchemy.ext.asyncio import create_async_engine

# Re-use table definitions and run_import from the sibling script so there is
# no duplication of schema knowledge.
sys.path.insert(0, str(Path(__file__).parent))
from import_dictionary import (  # noqa: E402
    _LANG_FILE_MAP,
    _lemma_table,
    _raw_table,
    _sense_link_table,
    _sense_table,
    _wordform_table,
    run_import,
)

# Delete order respects foreign-key dependencies:
#   sense_link → sense
#   sense → lemma
#   wordform → lemma
#   raw → lemma
#   lemma (parent)
_DELETE_ORDER = [
    _sense_link_table,
    _sense_table,
    _wordform_table,
    _raw_table,
    _lemma_table,
]
|
||||||
|
|
||||||
|
|
||||||
|
async def clear_all(database_url: str) -> None:
    """Delete every row from the dictionary tables, children first.

    Opens one connection, deletes table-by-table in the FK-safe order
    given by _DELETE_ORDER, and commits once at the end so the wipe is
    all-or-nothing. The engine is disposed even if a delete fails.
    """
    engine = create_async_engine(database_url, echo=False)
    try:
        async with engine.connect() as conn:
            print("Clearing all dictionary tables...")
            for tbl in _DELETE_ORDER:
                outcome = await conn.execute(sa.delete(tbl))
                print(f" Deleted {outcome.rowcount} rows from {tbl.name}")
            await conn.commit()
            print("All dictionary tables cleared.")
    finally:
        await engine.dispose()
|
||||||
|
|
||||||
|
|
||||||
|
async def main(run_reimport: bool, batch_size: int) -> None:
    """Wipe the dictionary tables, then (optionally) re-import every language.

    The target database comes from DATABASE_URL, falling back to the
    docker-compose dev credentials on localhost.
    """
    url = os.environ.get(
        "DATABASE_URL",
        "postgresql+asyncpg://langlearn:changeme@localhost:5432/langlearn",
    )

    await clear_all(url)

    if run_reimport:
        for code in _LANG_FILE_MAP:
            print(f"\nRe-importing language={code!r}...")
            await run_import(code, batch_size)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # CLI: `--no-import` turns the script into a pure wipe; `--batch-size`
    # is forwarded to run_import for commit sizing.
    cli = argparse.ArgumentParser(
        description="Clear all dictionary tables and optionally re-import."
    )
    cli.add_argument(
        "--no-import",
        action="store_true",
        help="Clear tables only; skip re-import.",
    )
    cli.add_argument(
        "--batch-size",
        type=int,
        default=1000,
        help="Rows per commit during re-import (default: 1000)",
    )
    ns = cli.parse_args()

    asyncio.run(main(run_reimport=not ns.no_import, batch_size=ns.batch_size))
|
||||||
|
|
@ -23,7 +23,8 @@ from pathlib import Path
|
||||||
import sqlalchemy as sa
|
import sqlalchemy as sa
|
||||||
from sqlalchemy.dialects.postgresql import ARRAY, JSONB
|
from sqlalchemy.dialects.postgresql import ARRAY, JSONB
|
||||||
from sqlalchemy.dialects.postgresql import UUID as PG_UUID
|
from sqlalchemy.dialects.postgresql import UUID as PG_UUID
|
||||||
from sqlalchemy.ext.asyncio import AsyncSession, async_sessionmaker, create_async_engine
|
from sqlalchemy.dialects.postgresql import insert as pg_insert
|
||||||
|
from sqlalchemy.ext.asyncio import create_async_engine
|
||||||
|
|
||||||
_API_DIR = Path(__file__).parent.parent
|
_API_DIR = Path(__file__).parent.parent
|
||||||
_REPO_ROOT = _API_DIR.parent
|
_REPO_ROOT = _API_DIR.parent
|
||||||
|
|
@ -69,6 +70,39 @@ _GENDER_MAP: dict[str, str] = {
|
||||||
"common": "common",
|
"common": "common",
|
||||||
}
|
}
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Deterministic UUID namespace
|
||||||
|
#
|
||||||
|
# All dictionary entity IDs are derived via uuid5(namespace, natural_key) so
|
||||||
|
# that re-importing the same kaikki data always produces the same UUIDs. This
|
||||||
|
# means:
|
||||||
|
# • Re-imports update rows in place (upsert) without changing PKs, so
|
||||||
|
# learnable_word_bank_entry / word_bank_pack_entry FK references are never
|
||||||
|
# nullified by a re-import.
|
||||||
|
# • WordPacks developed in one environment can be transferred to another
|
||||||
|
# environment that imported from the same kaikki dataset, because sense UUIDs
|
||||||
|
# will be identical in both.
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
_KAIKKI_UUID_NS = uuid.UUID("c7d8e9f0-1234-5678-abcd-ef0123456789")
|
||||||
|
|
||||||
|
|
||||||
|
def _lemma_uuid(lang_code: str, word: str, pos: str, etymology_number: int, sense_ids: list[str]) -> uuid.UUID:
|
||||||
|
# Include sorted sense IDs so that two kaikki entries with the same
|
||||||
|
# (word, pos, etymology_number) but different senses get distinct UUIDs.
|
||||||
|
sense_key = ":".join(sorted(sense_ids))
|
||||||
|
return uuid.uuid5(_KAIKKI_UUID_NS, f"kaikki:lemma:{lang_code}:{word}:{pos}:{etymology_number}:{sense_key}")
|
||||||
|
|
||||||
|
|
||||||
|
def _sense_uuid(kaikki_sense_id: str) -> uuid.UUID:
|
||||||
|
return uuid.uuid5(_KAIKKI_UUID_NS, f"kaikki:sense:{kaikki_sense_id}")
|
||||||
|
|
||||||
|
|
||||||
|
def _wordform_uuid(lemma_id: uuid.UUID, form: str, tags: list[str]) -> uuid.UUID:
|
||||||
|
tags_key = ",".join(sorted(tags))
|
||||||
|
return uuid.uuid5(_KAIKKI_UUID_NS, f"kaikki:wordform:{lemma_id}:{form}:{tags_key}")
|
||||||
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
# Standalone table definitions — no app imports, no Settings() call
|
# Standalone table definitions — no app imports, no Settings() call
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
|
|
@ -158,16 +192,34 @@ def _parse_entry(record: dict, lang_code: str) -> dict | None:
|
||||||
if not word:
|
if not word:
|
||||||
return None
|
return None
|
||||||
|
|
||||||
pos_raw = (record.get("pos") or "").strip()
|
# Skip entries that are inflected forms of another lemma (e.g. conjugations,
|
||||||
|
# plurals). These appear as top-level JSONL records but are already captured
|
||||||
|
# as wordforms via the parent lemma's `forms` array.
|
||||||
|
for sense in record.get("senses") or []:
|
||||||
|
if sense.get("form_of"):
|
||||||
|
return None
|
||||||
|
|
||||||
lemma_id = uuid.uuid4()
|
pos_raw = (record.get("pos") or "").strip()
|
||||||
|
etymology_number = record.get("etymology_number", 0)
|
||||||
|
raw_senses = record.get("senses") or []
|
||||||
|
|
||||||
|
# Collect kaikki sense IDs up front so the lemma UUID can incorporate them.
|
||||||
|
# This disambiguates entries that share (word, pos, etymology_number) but
|
||||||
|
# have genuinely different senses — kaikki has ~349 such cases in French.
|
||||||
|
kaikki_sense_ids = [
|
||||||
|
s.get("id") or f"{lang_code}:{word}:{pos_raw}:{etymology_number}:{i}"
|
||||||
|
for i, s in enumerate(raw_senses)
|
||||||
|
]
|
||||||
|
|
||||||
|
lemma_id = _lemma_uuid(lang_code, word, pos_raw, etymology_number, kaikki_sense_ids)
|
||||||
|
|
||||||
_GENDER_TAGS = {"masculine", "feminine", "neuter"}
|
_GENDER_TAGS = {"masculine", "feminine", "neuter"}
|
||||||
gender: str | None = None
|
gender: str | None = None
|
||||||
senses = []
|
senses = []
|
||||||
sense_links = []
|
sense_links = []
|
||||||
for i, sense_record in enumerate(record.get("senses") or []):
|
for i, sense_record in enumerate(raw_senses):
|
||||||
sense_id = uuid.uuid4()
|
kaikki_sense_id = kaikki_sense_ids[i]
|
||||||
|
sense_id = _sense_uuid(kaikki_sense_id)
|
||||||
glosses = sense_record.get("glosses") or []
|
glosses = sense_record.get("glosses") or []
|
||||||
gloss = glosses[0] if glosses else ""
|
gloss = glosses[0] if glosses else ""
|
||||||
topics = sense_record.get("topics") or []
|
topics = sense_record.get("topics") or []
|
||||||
|
|
@ -192,25 +244,34 @@ def _parse_entry(record: dict, lang_code: str) -> dict | None:
|
||||||
|
|
||||||
for link_pair in (sense_record.get("links") or []):
|
for link_pair in (sense_record.get("links") or []):
|
||||||
if isinstance(link_pair, list) and len(link_pair) == 2:
|
if isinstance(link_pair, list) and len(link_pair) == 2:
|
||||||
|
link_text, link_target = link_pair[0], link_pair[1]
|
||||||
|
link_id = uuid.uuid5(
|
||||||
|
_KAIKKI_UUID_NS,
|
||||||
|
f"kaikki:link:{sense_id}:{link_text}:{link_target}",
|
||||||
|
)
|
||||||
sense_links.append(
|
sense_links.append(
|
||||||
{
|
{
|
||||||
"id": uuid.uuid4(),
|
"id": link_id,
|
||||||
"sense_id": sense_id,
|
"sense_id": sense_id,
|
||||||
"link_text": link_pair[0],
|
"link_text": link_text,
|
||||||
"link_target": link_pair[1],
|
"link_target": link_target,
|
||||||
"target_lemma_id": None,
|
"target_lemma_id": None,
|
||||||
}
|
}
|
||||||
)
|
)
|
||||||
|
|
||||||
|
_METADATA_FORM_TAGS = {"table-tags", "inflection-template"}
|
||||||
|
|
||||||
wordforms = []
|
wordforms = []
|
||||||
for f in record.get("forms") or []:
|
for f in record.get("forms") or []:
|
||||||
form_text = (f.get("form") or "").strip()
|
form_text = (f.get("form") or "").strip()
|
||||||
if not form_text or form_text == word:
|
if not form_text or form_text == word:
|
||||||
continue
|
continue
|
||||||
form_tags = f.get("tags") or []
|
form_tags = f.get("tags") or []
|
||||||
|
if _METADATA_FORM_TAGS.intersection(form_tags):
|
||||||
|
continue
|
||||||
wordforms.append(
|
wordforms.append(
|
||||||
{
|
{
|
||||||
"id": uuid.uuid4(),
|
"id": _wordform_uuid(lemma_id, form_text, form_tags),
|
||||||
"lemma_id": lemma_id,
|
"lemma_id": lemma_id,
|
||||||
"form": form_text,
|
"form": form_text,
|
||||||
"tags": form_tags,
|
"tags": form_tags,
|
||||||
|
|
@ -231,7 +292,7 @@ def _parse_entry(record: dict, lang_code: str) -> dict | None:
|
||||||
"sense_links": sense_links,
|
"sense_links": sense_links,
|
||||||
"wordforms": wordforms,
|
"wordforms": wordforms,
|
||||||
"raw": {
|
"raw": {
|
||||||
"id": uuid.uuid4(),
|
"id": uuid.uuid5(_KAIKKI_UUID_NS, f"kaikki:raw:{lemma_id}"),
|
||||||
"lemma_id": lemma_id,
|
"lemma_id": lemma_id,
|
||||||
"language": lang_code,
|
"language": lang_code,
|
||||||
"raw": record,
|
"raw": record,
|
||||||
|
|
@ -251,16 +312,69 @@ async def _flush_batch(conn: sa.ext.asyncio.AsyncConnection, batch: list[dict])
|
||||||
wordform_rows = [w for e in batch for w in e["wordforms"]]
|
wordform_rows = [w for e in batch for w in e["wordforms"]]
|
||||||
raw_rows = [e["raw"] for e in batch]
|
raw_rows = [e["raw"] for e in batch]
|
||||||
|
|
||||||
if lemma_rows:
|
# asyncpg caps query parameters at 32767. Split each row list into chunks
|
||||||
await conn.execute(_lemma_table.insert(), lemma_rows)
|
# sized so that rows × columns stays comfortably under that limit.
|
||||||
if sense_rows:
|
def _chunks(rows: list[dict], n_cols: int) -> list[list[dict]]:
|
||||||
await conn.execute(_sense_table.insert(), sense_rows)
|
size = max(1, 32767 // n_cols)
|
||||||
if sense_link_rows:
|
return [rows[i : i + size] for i in range(0, len(rows), size)]
|
||||||
await conn.execute(_sense_link_table.insert(), sense_link_rows)
|
|
||||||
if wordform_rows:
|
# Deduplicate by id: safety net for truly identical rows (should be rare
|
||||||
await conn.execute(_wordform_table.insert(), wordform_rows)
|
# now that the lemma UUID incorporates sense IDs).
|
||||||
if raw_rows:
|
def _dedup(rows: list[dict]) -> list[dict]:
|
||||||
await conn.execute(_raw_table.insert(), raw_rows)
|
seen: dict = {}
|
||||||
|
for row in rows:
|
||||||
|
seen[row["id"]] = row
|
||||||
|
return list(seen.values())
|
||||||
|
|
||||||
|
lemma_rows = _dedup(lemma_rows)
|
||||||
|
sense_rows = _dedup(sense_rows)
|
||||||
|
wordform_rows = _dedup(wordform_rows)
|
||||||
|
raw_rows = _dedup(raw_rows)
|
||||||
|
sense_link_rows = _dedup(sense_link_rows)
|
||||||
|
|
||||||
|
for chunk in _chunks(lemma_rows, len(_lemma_table.columns)):
|
||||||
|
stmt = pg_insert(_lemma_table).values(chunk)
|
||||||
|
await conn.execute(stmt.on_conflict_do_update(
|
||||||
|
index_elements=["id"],
|
||||||
|
set_={
|
||||||
|
"headword": stmt.excluded.headword,
|
||||||
|
"pos_raw": stmt.excluded.pos_raw,
|
||||||
|
"pos_normalised": stmt.excluded.pos_normalised,
|
||||||
|
"gender": stmt.excluded.gender,
|
||||||
|
"tags": stmt.excluded.tags,
|
||||||
|
},
|
||||||
|
))
|
||||||
|
|
||||||
|
for chunk in _chunks(sense_rows, len(_sense_table.columns)):
|
||||||
|
stmt = pg_insert(_sense_table).values(chunk)
|
||||||
|
await conn.execute(stmt.on_conflict_do_update(
|
||||||
|
index_elements=["id"],
|
||||||
|
set_={
|
||||||
|
"sense_index": stmt.excluded.sense_index,
|
||||||
|
"gloss": stmt.excluded.gloss,
|
||||||
|
"topics": stmt.excluded.topics,
|
||||||
|
"tags": stmt.excluded.tags,
|
||||||
|
},
|
||||||
|
))
|
||||||
|
|
||||||
|
for chunk in _chunks(wordform_rows, len(_wordform_table.columns)):
|
||||||
|
stmt = pg_insert(_wordform_table).values(chunk)
|
||||||
|
await conn.execute(stmt.on_conflict_do_update(
|
||||||
|
index_elements=["id"],
|
||||||
|
set_={"tags": stmt.excluded.tags},
|
||||||
|
))
|
||||||
|
|
||||||
|
for chunk in _chunks(raw_rows, len(_raw_table.columns)):
|
||||||
|
stmt = pg_insert(_raw_table).values(chunk)
|
||||||
|
await conn.execute(stmt.on_conflict_do_update(
|
||||||
|
index_elements=["id"],
|
||||||
|
set_={"raw": stmt.excluded.raw},
|
||||||
|
))
|
||||||
|
|
||||||
|
for chunk in _chunks(sense_link_rows, len(_sense_link_table.columns)):
|
||||||
|
await conn.execute(
|
||||||
|
pg_insert(_sense_link_table).values(chunk).on_conflict_do_nothing()
|
||||||
|
)
|
||||||
|
|
||||||
await conn.commit()
|
await conn.commit()
|
||||||
|
|
||||||
|
|
@ -350,12 +464,9 @@ async def run_import(lang_code: str, batch_size: int = 1000) -> None:
|
||||||
|
|
||||||
try:
|
try:
|
||||||
async with engine.connect() as conn:
|
async with engine.connect() as conn:
|
||||||
print(f"Deleting existing entries for language={lang_code!r}...")
|
# No upfront delete — rows are upserted so existing FK references
|
||||||
await conn.execute(
|
# (word bank entries, pack entries) are preserved across re-imports.
|
||||||
_lemma_table.delete().where(_lemma_table.c.language == lang_code)
|
# To fully wipe and start fresh, run clear_dictionary.py first.
|
||||||
)
|
|
||||||
await conn.commit()
|
|
||||||
|
|
||||||
print(f"Importing {jsonl_path} ...")
|
print(f"Importing {jsonl_path} ...")
|
||||||
batch: list[dict] = []
|
batch: list[dict] = []
|
||||||
total_lemmas = 0
|
total_lemmas = 0
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue