323 lines
9.5 KiB
Python
323 lines
9.5 KiB
Python
|
|
#!/usr/bin/env python
|
||
|
|
"""
|
||
|
|
CLI import script for kaikki/wiktextract JSONL dictionary data.
|
||
|
|
|
||
|
|
Usage (from api/ directory):
|
||
|
|
uv run ./scripts/import_dictionary.py --lang fr
|
||
|
|
|
||
|
|
# or via Make from the repo root:
|
||
|
|
make import-dictionary lang=fr
|
||
|
|
|
||
|
|
DATABASE_URL defaults to postgresql+asyncpg://langlearn:changeme@localhost:5432/langlearn
|
||
|
|
which matches the docker-compose dev credentials when the DB port is exposed on the host.
|
||
|
|
"""
|
||
|
|
|
||
|
|
import argparse
|
||
|
|
import asyncio
|
||
|
|
import json
|
||
|
|
import os
|
||
|
|
import sys
|
||
|
|
import uuid
|
||
|
|
from pathlib import Path
|
||
|
|
|
||
|
|
import sqlalchemy as sa
|
||
|
|
from sqlalchemy.dialects.postgresql import ARRAY, JSONB
|
||
|
|
from sqlalchemy.dialects.postgresql import UUID as PG_UUID
|
||
|
|
from sqlalchemy.ext.asyncio import AsyncSession, async_sessionmaker, create_async_engine
|
||
|
|
|
||
|
|
# Filesystem layout, resolved relative to this script's own location so the
# script does not depend on the current working directory.
# Two levels up from this file (the module docstring calls this the api/ directory).
_API_DIR = Path(__file__).parent.parent
# Parent of the API directory.
_REPO_ROOT = _API_DIR.parent
# Directory the per-language kaikki JSONL files are read from.
_DICT_DIR = _REPO_ROOT / "dictionaries" / "kaikki"
|
||
|
|
|
||
|
|
# Maps a language code (the value given to --lang) to the JSONL filename
# expected inside _DICT_DIR. run_import exits with status 1 for any code that
# has no entry here.
_LANG_FILE_MAP: dict[str, str] = {
    "fr": "french.jsonl",
}
|
||
|
|
|
||
|
|
# Maps a raw part-of-speech label to the value stored in pos_normalised.
# Keys are matched by _normalise_pos after lower-casing and stripping the raw
# label; labels with no entry here normalise to None.
# NOTE(review): the values look like Universal Dependencies UPOS tag names —
# confirm that is the intended tag set.
_POS_MAP: dict[str, str] = {
    "noun": "NOUN",
    "verb": "VERB",
    "adj": "ADJ",
    "adv": "ADV",
    "det": "DET",
    "article": "DET",
    "pron": "PRON",
    "prep": "ADP",
    "adp": "ADP",
    # A bare "conj" label is mapped to CCONJ, the same as "cconj".
    "conj": "CCONJ",
    "cconj": "CCONJ",
    "sconj": "SCONJ",
    "intj": "INTJ",
    "num": "NUM",
    "numeral": "NUM",
    "part": "PART",
    "particle": "PART",
    "name": "PROPN",
    "propn": "PROPN",
    "proper noun": "PROPN",
    "punct": "PUNCT",
    "sym": "SYM",
}
|
||
|
|
|
||
|
|
# Maps a tag string to the value stored in the lemma's gender column.
# _normalise_gender looks tags up verbatim (no case folding or stripping), so
# only these exact spellings are recognised.
_GENDER_MAP: dict[str, str] = {
    "masculine": "masculine",
    "masc": "masculine",
    "feminine": "feminine",
    "fem": "feminine",
    "neuter": "neuter",
    "common": "common",
}
|
||
|
|
|
||
|
|
# ---------------------------------------------------------------------------
|
||
|
|
# Standalone table definitions — no app imports, no Settings() call
|
||
|
|
# ---------------------------------------------------------------------------
|
||
|
|
|
||
|
|
# Private MetaData registry holding only the table definitions declared in this script.
_meta = sa.MetaData()
|
||
|
|
|
||
|
|
# One row per imported dictionary entry (headword + part of speech).
# run_import deletes rows from this table by language before re-importing.
_lemma_table = sa.Table(
    "dictionary_lemma",
    _meta,
    sa.Column("id", PG_UUID(as_uuid=True), primary_key=True),
    sa.Column("headword", sa.Text(), nullable=False),
    # Two-character language code, e.g. "fr".
    sa.Column("language", sa.String(2), nullable=False),
    # Part-of-speech label as found in the source record (may be an empty string).
    sa.Column("pos_raw", sa.Text(), nullable=False),
    # Result of _normalise_pos; NULL when the raw label has no mapping.
    sa.Column("pos_normalised", sa.Text(), nullable=True),
    # Result of _normalise_gender; NULL when no gender tag is recognised.
    sa.Column("gender", sa.Text(), nullable=True),
    sa.Column("tags", ARRAY(sa.Text()), nullable=False),
)
|
||
|
|
|
||
|
|
# One row per sense of a lemma.
# NOTE(review): lemma_id is declared without a ForeignKey here, and run_import
# deletes only dictionary_lemma rows — presumably the real schema cascades the
# delete to this table; confirm against the migrations.
_sense_table = sa.Table(
    "dictionary_sense",
    _meta,
    sa.Column("id", PG_UUID(as_uuid=True), primary_key=True),
    sa.Column("lemma_id", PG_UUID(as_uuid=True), nullable=False),
    # Zero-based position of the sense within the source record's "senses" list.
    sa.Column("sense_index", sa.Integer(), nullable=False),
    # First gloss of the sense; _parse_entry stores "" when the sense has none.
    sa.Column("gloss", sa.Text(), nullable=False),
    sa.Column("topics", ARRAY(sa.Text()), nullable=False),
    sa.Column("tags", ARRAY(sa.Text()), nullable=False),
)
|
||
|
|
|
||
|
|
# One row per form listed for a lemma. _parse_entry omits forms that are empty
# or identical to the headword.
# NOTE(review): lemma_id has no ForeignKey declared in this standalone
# definition — confirm how child rows are removed when a lemma is deleted.
_wordform_table = sa.Table(
    "dictionary_wordform",
    _meta,
    sa.Column("id", PG_UUID(as_uuid=True), primary_key=True),
    sa.Column("lemma_id", PG_UUID(as_uuid=True), nullable=False),
    sa.Column("form", sa.Text(), nullable=False),
    sa.Column("tags", ARRAY(sa.Text()), nullable=False),
)
|
||
|
|
|
||
|
|
# One row per lemma holding the complete decoded source record as JSONB.
# NOTE(review): lemma_id has no ForeignKey declared in this standalone
# definition — confirm how these rows are removed when a lemma is deleted.
_raw_table = sa.Table(
    "dictionary_lemma_raw",
    _meta,
    sa.Column("id", PG_UUID(as_uuid=True), primary_key=True),
    sa.Column("lemma_id", PG_UUID(as_uuid=True), nullable=False),
    sa.Column("language", sa.String(2), nullable=False),
    # The whole record exactly as decoded from the JSONL line.
    sa.Column("raw", JSONB(), nullable=False),
)
|
||
|
|
|
||
|
|
# ---------------------------------------------------------------------------
|
||
|
|
# Normalisation helpers
|
||
|
|
# ---------------------------------------------------------------------------
|
||
|
|
|
||
|
|
|
||
|
|
def _normalise_pos(pos_raw: str) -> str | None:
    """Return the normalised tag for a raw part-of-speech label.

    The label is lower-cased and stripped before the lookup in ``_POS_MAP``;
    labels without a mapping yield None.
    """
    lookup_key = pos_raw.lower().strip()
    return _POS_MAP.get(lookup_key)
|
||
|
|
|
||
|
|
|
||
|
|
def _normalise_gender(tags: list) -> str | None:
    """Return the gender for the first tag recognised by ``_GENDER_MAP``.

    Tags are looked up verbatim, in list order. None is returned when no tag
    maps to a gender (including when ``tags`` is empty).
    """
    candidates = (_GENDER_MAP.get(tag) for tag in tags)
    return next((gender for gender in candidates if gender), None)
|
||
|
|
|
||
|
|
|
||
|
|
# ---------------------------------------------------------------------------
|
||
|
|
# Parsing
|
||
|
|
# ---------------------------------------------------------------------------
|
||
|
|
|
||
|
|
|
||
|
|
def _parse_entry(record: dict, lang_code: str) -> dict | None:
|
||
|
|
"""Parse one kaikki JSONL record into insertion-ready row dicts.
|
||
|
|
|
||
|
|
Returns None if the entry should be skipped.
|
||
|
|
"""
|
||
|
|
if record.get("lang_code") != lang_code:
|
||
|
|
return None
|
||
|
|
|
||
|
|
word = (record.get("word") or "").strip()
|
||
|
|
if not word:
|
||
|
|
return None
|
||
|
|
|
||
|
|
pos_raw = (record.get("pos") or "").strip()
|
||
|
|
top_tags = record.get("tags") or []
|
||
|
|
|
||
|
|
lemma_id = uuid.uuid4()
|
||
|
|
|
||
|
|
senses = []
|
||
|
|
for i, sense_record in enumerate(record.get("senses") or []):
|
||
|
|
sense_id = uuid.uuid4()
|
||
|
|
glosses = sense_record.get("glosses") or []
|
||
|
|
gloss = glosses[0] if glosses else ""
|
||
|
|
topics = sense_record.get("topics") or []
|
||
|
|
sense_tags = sense_record.get("tags") or []
|
||
|
|
|
||
|
|
senses.append(
|
||
|
|
{
|
||
|
|
"id": sense_id,
|
||
|
|
"lemma_id": lemma_id,
|
||
|
|
"sense_index": i,
|
||
|
|
"gloss": gloss,
|
||
|
|
"topics": topics,
|
||
|
|
"tags": sense_tags,
|
||
|
|
}
|
||
|
|
)
|
||
|
|
|
||
|
|
wordforms = []
|
||
|
|
for f in record.get("forms") or []:
|
||
|
|
form_text = (f.get("form") or "").strip()
|
||
|
|
if not form_text or form_text == word:
|
||
|
|
continue
|
||
|
|
form_tags = f.get("tags") or []
|
||
|
|
wordforms.append(
|
||
|
|
{
|
||
|
|
"id": uuid.uuid4(),
|
||
|
|
"lemma_id": lemma_id,
|
||
|
|
"form": form_text,
|
||
|
|
"tags": form_tags,
|
||
|
|
}
|
||
|
|
)
|
||
|
|
|
||
|
|
return {
|
||
|
|
"lemma": {
|
||
|
|
"id": lemma_id,
|
||
|
|
"headword": word,
|
||
|
|
"language": lang_code,
|
||
|
|
"pos_raw": pos_raw,
|
||
|
|
"pos_normalised": _normalise_pos(pos_raw),
|
||
|
|
"gender": _normalise_gender(top_tags),
|
||
|
|
"tags": top_tags,
|
||
|
|
},
|
||
|
|
"senses": senses,
|
||
|
|
"wordforms": wordforms,
|
||
|
|
"raw": {
|
||
|
|
"id": uuid.uuid4(),
|
||
|
|
"lemma_id": lemma_id,
|
||
|
|
"language": lang_code,
|
||
|
|
"raw": record,
|
||
|
|
},
|
||
|
|
}
|
||
|
|
|
||
|
|
|
||
|
|
# ---------------------------------------------------------------------------
|
||
|
|
# DB operations
|
||
|
|
# ---------------------------------------------------------------------------
|
||
|
|
|
||
|
|
|
||
|
|
async def _flush_batch(conn: sa.ext.asyncio.AsyncConnection, batch: list[dict]) -> None:
    """Insert every row of a batch of parsed entries, then commit.

    Args:
        conn: Open async connection the inserts and the commit are issued on.
        batch: Parsed entries as returned by ``_parse_entry``.

    Lemma rows are inserted first, followed by senses, wordforms and raw
    records. A table with no rows to insert is skipped; the commit is always
    issued.
    """
    inserts = (
        (_lemma_table, [entry["lemma"] for entry in batch]),
        (_sense_table, [row for entry in batch for row in entry["senses"]]),
        (_wordform_table, [row for entry in batch for row in entry["wordforms"]]),
        (_raw_table, [entry["raw"] for entry in batch]),
    )

    for table, rows in inserts:
        if rows:
            await conn.execute(table.insert(), rows)

    await conn.commit()
|
||
|
|
|
||
|
|
|
||
|
|
async def run_import(lang_code: str, batch_size: int = 1000) -> None:
    """Replace the stored dictionary for one language with its JSONL file.

    Args:
        lang_code: Language code; must have an entry in ``_LANG_FILE_MAP``.
        batch_size: Number of parsed entries collected before each flush/commit.

    Exits the process with status 1 when the language has no file mapping or
    the JSONL file does not exist. The database URL is read from the
    DATABASE_URL environment variable, with a local default.

    Existing lemma rows for the language are deleted and committed before any
    line is read, and each batch is committed on its own, so an error part-way
    through leaves a partial import behind.
    """
    filename = _LANG_FILE_MAP.get(lang_code)
    if not filename:
        print(
            f"No file mapping for lang_code={lang_code!r}. Known: {list(_LANG_FILE_MAP)}",
            file=sys.stderr,
        )
        sys.exit(1)

    source_path = _DICT_DIR / filename
    if not source_path.exists():
        print(f"JSONL file not found: {source_path}", file=sys.stderr)
        sys.exit(1)

    fallback_url = "postgresql+asyncpg://langlearn:changeme@localhost:5432/langlearn"
    engine = create_async_engine(
        os.environ.get("DATABASE_URL", fallback_url),
        echo=False,
    )

    try:
        async with engine.connect() as conn:
            print(f"Deleting existing entries for language={lang_code!r}...")
            purge = _lemma_table.delete().where(_lemma_table.c.language == lang_code)
            await conn.execute(purge)
            await conn.commit()

            print(f"Importing {source_path} ...")
            pending: list[dict] = []
            imported = 0
            skipped = 0

            with source_path.open(encoding="utf-8") as handle:
                for line_no, raw_line in enumerate(handle, 1):
                    text = raw_line.strip()
                    if not text:
                        # Blank lines are ignored and not counted as skipped.
                        continue

                    try:
                        record = json.loads(text)
                    except json.JSONDecodeError as exc:
                        print(
                            f" Line {line_no}: JSON parse error: {exc}",
                            file=sys.stderr,
                        )
                        skipped += 1
                        continue

                    entry = _parse_entry(record, lang_code)
                    if entry is None:
                        skipped += 1
                        continue

                    pending.append(entry)
                    if len(pending) < batch_size:
                        continue

                    await _flush_batch(conn, pending)
                    imported += len(pending)
                    print(f" Committed {imported} lemmas...")
                    pending = []

                # Whatever is left after the last full batch.
                if pending:
                    await _flush_batch(conn, pending)
                    imported += len(pending)

            print(f"Done. Imported {imported} lemmas, skipped {skipped} lines.")
    finally:
        await engine.dispose()
|
||
|
|
|
||
|
|
|
||
|
|
def _positive_int(value: str) -> int:
    """argparse ``type=`` converter that accepts only integers >= 1."""
    try:
        number = int(value)
    except ValueError:
        raise argparse.ArgumentTypeError(f"invalid int value: {value!r}") from None
    if number < 1:
        raise argparse.ArgumentTypeError(f"must be a positive integer, got {number}")
    return number


def main() -> None:
    """Parse the command line and run the import.

    Options:
        --lang: required language code, passed to ``run_import``.
        --batch-size: entries per commit, default 1000. Must be at least 1:
            with zero or a negative value the ``len(batch) >= batch_size``
            check in ``run_import`` would be true after every entry, silently
            committing each one separately.

    Invalid arguments end the process with argparse's usage error (exit
    status 2) before any database work starts.
    """
    parser = argparse.ArgumentParser(
        description="Import kaikki dictionary JSONL into Postgres."
    )
    parser.add_argument(
        "--lang", required=True, help="Language code to import (e.g. fr)"
    )
    parser.add_argument(
        "--batch-size",
        type=_positive_int,
        default=1000,
        help="Rows per commit (default: 1000)",
    )
    args = parser.parse_args()

    asyncio.run(run_import(args.lang, args.batch_size))
|
||
|
|
|
||
|
|
|
||
|
|
# Run the CLI only when this file is executed as a script, not when it is imported.
if __name__ == "__main__":
    main()
|