# language-learning-app/api/scripts/clear_dictionary.py
#!/usr/bin/env python
"""
Clear all rows from dictionary tables and re-import from source JSONL files.
Usage (from api/ directory):
uv run ./scripts/clear_dictionary.py
# Dry-run: clear only, no re-import
uv run ./scripts/clear_dictionary.py --no-import
DATABASE_URL defaults to postgresql+asyncpg://langlearn:changeme@localhost:5432/langlearn,
which should match the docker-compose dev credentials when the DB port is exposed on the host.
"""
import argparse
import asyncio
import os
import sys
from pathlib import Path
import sqlalchemy as sa
from sqlalchemy.ext.asyncio import create_async_engine
# Re-use table definitions and run_import from the sibling script so there is
# no duplication of schema knowledge.
sys.path.insert(0, str(Path(__file__).parent))
from import_dictionary import ( # noqa: E402
_LANG_FILE_MAP,
_lemma_table,
_raw_table,
_sense_link_table,
_sense_table,
_wordform_table,
run_import,
)
# Delete order respects foreign-key dependencies:
#   sense_link → sense
#   sense → lemma
#   wordform → lemma
#   raw → lemma
#   lemma (parent)
# Children are deleted before their parents so that no DELETE ever
# violates a foreign-key constraint. Do not reorder.
_DELETE_ORDER = [
    _sense_link_table,
    _sense_table,
    _wordform_table,
    _raw_table,
    _lemma_table,
]
async def clear_all(database_url: str) -> None:
    """Delete every row from the dictionary tables, children first.

    Opens an engine for *database_url*, issues one DELETE per table in
    ``_DELETE_ORDER`` inside a single transaction, and always disposes
    of the engine afterwards, even on failure.
    """
    engine = create_async_engine(database_url, echo=False)
    try:
        # engine.begin() commits on successful exit (and rolls back on
        # error), replacing an explicit conn.commit().
        async with engine.begin() as conn:
            print("Clearing all dictionary tables...")
            for tbl in _DELETE_ORDER:
                outcome = await conn.execute(sa.delete(tbl))
                print(f" Deleted {outcome.rowcount} rows from {tbl.name}")
        print("All dictionary tables cleared.")
    finally:
        await engine.dispose()
async def main(run_reimport: bool, batch_size: int) -> None:
    """Clear the dictionary tables, then optionally re-import each language.

    The target database is taken from the DATABASE_URL environment
    variable, falling back to the local dev default below.
    """
    # NOTE(review): confirm this default password ("changeme") matches the
    # actual docker-compose dev credentials.
    fallback_url = "postgresql+asyncpg://langlearn:changeme@localhost:5432/langlearn"
    database_url = os.environ.get("DATABASE_URL", fallback_url)
    await clear_all(database_url)
    if run_reimport:
        for lang_code in _LANG_FILE_MAP:
            print(f"\nRe-importing language={lang_code!r}...")
            await run_import(lang_code, batch_size)
if __name__ == "__main__":
    # CLI entry point: parse flags, then run the async workflow.
    cli = argparse.ArgumentParser(
        description="Clear all dictionary tables and optionally re-import."
    )
    cli.add_argument(
        "--no-import",
        action="store_true",
        help="Clear tables only; skip re-import.",
    )
    cli.add_argument(
        "--batch-size",
        type=int,
        default=1000,
        help="Rows per commit during re-import (default: 1000)",
    )
    opts = cli.parse_args()
    asyncio.run(main(run_reimport=not opts.no_import, batch_size=opts.batch_size))