2026-04-08 19:26:26 +00:00
|
|
|
|
#!/usr/bin/env python
|
|
|
|
|
|
"""
|
|
|
|
|
|
CLI import script for kaikki/wiktextract JSONL dictionary data.
|
|
|
|
|
|
|
|
|
|
|
|
Usage (from api/ directory):
|
|
|
|
|
|
uv run ./scripts/import_dictionary.py --lang fr
|
|
|
|
|
|
|
|
|
|
|
|
# or via Make from the repo root:
|
|
|
|
|
|
make import-dictionary lang=fr
|
|
|
|
|
|
|
|
|
|
|
|
DATABASE_URL defaults to postgresql+asyncpg://langlearn:changeme@localhost:5432/langlearn
|
|
|
|
|
|
which matches the docker-compose dev credentials when the DB port is exposed on the host.
|
|
|
|
|
|
"""
|
|
|
|
|
|
|
|
|
|
|
|
import argparse
|
|
|
|
|
|
import asyncio
|
|
|
|
|
|
import json
|
|
|
|
|
|
import os
|
|
|
|
|
|
import sys
|
|
|
|
|
|
import uuid
|
|
|
|
|
|
from pathlib import Path
|
|
|
|
|
|
|
|
|
|
|
|
import sqlalchemy as sa
|
|
|
|
|
|
from sqlalchemy.dialects.postgresql import ARRAY, JSONB
|
|
|
|
|
|
from sqlalchemy.dialects.postgresql import UUID as PG_UUID
|
2026-04-15 20:01:52 +00:00
|
|
|
|
from sqlalchemy.dialects.postgresql import insert as pg_insert
|
|
|
|
|
|
from sqlalchemy.ext.asyncio import create_async_engine
|
2026-04-08 19:26:26 +00:00
|
|
|
|
|
|
|
|
|
|
# Resolve the kaikki data directory relative to this script's location:
# scripts/ -> api/ -> repo root -> dictionaries/kaikki.
_API_DIR = Path(__file__).parent.parent
_REPO_ROOT = _API_DIR.parent
_DICT_DIR = _REPO_ROOT / "dictionaries" / "kaikki"
|
|
|
|
|
|
|
|
|
|
|
|
# Maps a supported language code to its kaikki JSONL filename under _DICT_DIR.
_LANG_FILE_MAP: dict[str, str] = {
    "fr": "french.jsonl",
}
|
|
|
|
|
|
|
|
|
|
|
|
# Maps kaikki part-of-speech labels (lowercased/stripped by _normalise_pos)
# to Universal Dependencies UPOS tags. Labels not listed normalise to None.
_POS_MAP: dict[str, str] = {
    "noun": "NOUN",
    "verb": "VERB",
    "adj": "ADJ",
    "adv": "ADV",
    "det": "DET",
    "article": "DET",
    "pron": "PRON",
    "prep": "ADP",
    "adp": "ADP",
    "conj": "CCONJ",
    "cconj": "CCONJ",
    "sconj": "SCONJ",
    "intj": "INTJ",
    "num": "NUM",
    "numeral": "NUM",
    "part": "PART",
    "particle": "PART",
    "name": "PROPN",
    "propn": "PROPN",
    "proper noun": "PROPN",
    "punct": "PUNCT",
    "sym": "SYM",
}
|
|
|
|
|
|
|
|
|
|
|
|
# Maps kaikki gender tags and common abbreviations to canonical gender names.
_GENDER_MAP: dict[str, str] = {
    "f": "feminine",
    "m": "masculine",
    "masculine": "masculine",
    "masc": "masculine",
    "feminine": "feminine",
    "fem": "feminine",
    "neuter": "neuter",
    "common": "common",
}
|
|
|
|
|
|
|
2026-04-15 20:01:52 +00:00
|
|
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
# Deterministic UUID namespace
|
|
|
|
|
|
#
|
|
|
|
|
|
# All dictionary entity IDs are derived via uuid5(namespace, natural_key) so
|
|
|
|
|
|
# that re-importing the same kaikki data always produces the same UUIDs. This
|
|
|
|
|
|
# means:
|
|
|
|
|
|
# • Re-imports update rows in place (upsert) without changing PKs, so
|
|
|
|
|
|
# learnable_word_bank_entry / word_bank_pack_entry FK references are never
|
|
|
|
|
|
# nullified by a re-import.
|
|
|
|
|
|
# • WordPacks developed in one environment can be transferred to another
|
|
|
|
|
|
# environment that imported from the same kaikki dataset, because sense UUIDs
|
|
|
|
|
|
# will be identical in both.
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
|
|
|
|
|
|
# Fixed uuid5 namespace; all entity IDs below are derived from it so that
# re-imports of the same dataset reproduce identical UUIDs (see block above).
_KAIKKI_UUID_NS = uuid.UUID("c7d8e9f0-1234-5678-abcd-ef0123456789")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _lemma_uuid(lang_code: str, word: str, pos: str, etymology_number: int, sense_ids: list[str]) -> uuid.UUID:
    """Derive the deterministic UUID for a lemma.

    Sorted kaikki sense IDs are folded into the natural key so that two
    entries sharing (word, pos, etymology_number) but carrying different
    senses still receive distinct UUIDs.
    """
    joined_senses = ":".join(sorted(sense_ids))
    natural_key = f"kaikki:lemma:{lang_code}:{word}:{pos}:{etymology_number}:{joined_senses}"
    return uuid.uuid5(_KAIKKI_UUID_NS, natural_key)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _sense_uuid(kaikki_sense_id: str) -> uuid.UUID:
    """Derive the deterministic UUID for a sense from its kaikki sense ID."""
    natural_key = f"kaikki:sense:{kaikki_sense_id}"
    return uuid.uuid5(_KAIKKI_UUID_NS, natural_key)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _wordform_uuid(lemma_id: uuid.UUID, form: str, tags: list[str]) -> uuid.UUID:
    """Derive the deterministic UUID for an inflected form of *lemma_id*.

    Tags are sorted before joining so tag order in the source data does not
    change the resulting UUID.
    """
    sorted_tags = sorted(tags)
    return uuid.uuid5(
        _KAIKKI_UUID_NS,
        f"kaikki:wordform:{lemma_id}:{form}:{','.join(sorted_tags)}",
    )
|
|
|
|
|
|
|
|
|
|
|
|
|
2026-04-08 19:26:26 +00:00
|
|
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
# Standalone table definitions — no app imports, no Settings() call
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
|
|
|
|
|
|
# Standalone metadata: tables are declared here instead of importing the app's
# models, so the script runs without loading app settings.
_meta = sa.MetaData()

# One row per lemma entry (headword + POS + etymology + sense set).
_lemma_table = sa.Table(
    "dictionary_lemma",
    _meta,
    sa.Column("id", PG_UUID(as_uuid=True), primary_key=True),
    sa.Column("headword", sa.Text(), nullable=False),
    sa.Column("language", sa.String(2), nullable=False),
    sa.Column("pos_raw", sa.Text(), nullable=False),
    sa.Column("pos_normalised", sa.Text(), nullable=True),
    sa.Column("gender", sa.Text(), nullable=True),
    sa.Column("tags", ARRAY(sa.Text()), nullable=False),
)
|
|
|
|
|
|
|
|
|
|
|
|
# One row per gloss/sense of a lemma; sense_index preserves kaikki's order.
_sense_table = sa.Table(
    "dictionary_sense",
    _meta,
    sa.Column("id", PG_UUID(as_uuid=True), primary_key=True),
    sa.Column("lemma_id", PG_UUID(as_uuid=True), nullable=False),
    sa.Column("sense_index", sa.Integer(), nullable=False),
    sa.Column("gloss", sa.Text(), nullable=False),
    sa.Column("topics", ARRAY(sa.Text()), nullable=False),
    sa.Column("tags", ARRAY(sa.Text()), nullable=False),
)
|
|
|
|
|
|
|
|
|
|
|
|
# Inflected forms attached to a lemma (the headword itself is also stored here
# for non-verb POS — see _parse_entry).
_wordform_table = sa.Table(
    "dictionary_wordform",
    _meta,
    sa.Column("id", PG_UUID(as_uuid=True), primary_key=True),
    sa.Column("lemma_id", PG_UUID(as_uuid=True), nullable=False),
    sa.Column("form", sa.Text(), nullable=False),
    sa.Column("tags", ARRAY(sa.Text()), nullable=False),
)
|
|
|
|
|
|
|
|
|
|
|
|
# Stores the complete original kaikki JSON record for each lemma as JSONB.
_raw_table = sa.Table(
    "dictionary_lemma_raw",
    _meta,
    sa.Column("id", PG_UUID(as_uuid=True), primary_key=True),
    sa.Column("lemma_id", PG_UUID(as_uuid=True), nullable=False),
    sa.Column("language", sa.String(2), nullable=False),
    sa.Column("raw", JSONB(), nullable=False),
)
|
|
|
|
|
|
|
2026-04-10 20:12:40 +00:00
|
|
|
|
# Cross-reference links extracted from each sense's "links" pairs.
# target_lemma_id starts NULL and is filled in by _resolve_links after import.
_sense_link_table = sa.Table(
    "dictionary_sense_link",
    _meta,
    sa.Column("id", PG_UUID(as_uuid=True), primary_key=True),
    sa.Column("sense_id", PG_UUID(as_uuid=True), nullable=False),
    sa.Column("link_text", sa.Text(), nullable=False),
    sa.Column("link_target", sa.Text(), nullable=False),
    sa.Column("target_lemma_id", PG_UUID(as_uuid=True), nullable=True),
)
|
|
|
|
|
|
|
2026-04-08 19:26:26 +00:00
|
|
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
# Normalisation helpers
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _normalise_pos(pos_raw: str) -> str | None:
    """Map a raw kaikki POS label to a UPOS tag, or None if unknown."""
    key = pos_raw.lower().strip()
    return _POS_MAP.get(key)
|
|
|
|
|
|
|
|
|
|
|
|
|
2026-04-10 20:12:40 +00:00
|
|
|
|
def _normalise_gender(value: str | None) -> str | None:
    """Map a raw kaikki gender tag to its canonical name.

    Returns None for None input or unrecognised tags. The input is lowercased
    and stripped before lookup, consistent with _normalise_pos, so variants
    like "F" or " m " still resolve.
    """
    if value is None:
        return None
    return _GENDER_MAP.get(value.lower().strip())
|
2026-04-08 19:26:26 +00:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
# Parsing
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _parse_entry(record: dict, lang_code: str) -> dict | None:
    """Parse one kaikki JSONL record into insertion-ready row dicts.

    Returns a dict with keys "lemma", "senses", "sense_links", "wordforms"
    and "raw", each holding row dict(s) matching the table definitions above.
    Returns None if the entry should be skipped (wrong language, empty
    headword, or an inflected-form entry).
    """
    if record.get("lang_code") != lang_code:
        return None

    word = (record.get("word") or "").strip()
    if not word:
        return None

    # Skip entries that are inflected forms of another lemma (e.g. conjugations,
    # plurals). These appear as top-level JSONL records but are already captured
    # as wordforms via the parent lemma's `forms` array.
    for sense in record.get("senses") or []:
        if sense.get("form_of"):
            return None

    pos_raw = (record.get("pos") or "").strip()
    etymology_number = record.get("etymology_number", 0)
    raw_senses = record.get("senses") or []

    # Collect kaikki sense IDs up front so the lemma UUID can incorporate them.
    # This disambiguates entries that share (word, pos, etymology_number) but
    # have genuinely different senses — kaikki has ~349 such cases in French.
    # A synthetic fallback key is used when a sense carries no "id".
    kaikki_sense_ids = [
        s.get("id") or f"{lang_code}:{word}:{pos_raw}:{etymology_number}:{i}"
        for i, s in enumerate(raw_senses)
    ]

    lemma_id = _lemma_uuid(lang_code, word, pos_raw, etymology_number, kaikki_sense_ids)

    # Lemma-level gender is taken from the first sense tag that names one.
    _GENDER_TAGS = {"masculine", "feminine", "neuter"}
    gender: str | None = None
    senses = []
    sense_links = []
    for i, sense_record in enumerate(raw_senses):
        kaikki_sense_id = kaikki_sense_ids[i]
        sense_id = _sense_uuid(kaikki_sense_id)
        # Only the first gloss is stored; empty string if the sense has none.
        glosses = sense_record.get("glosses") or []
        gloss = glosses[0] if glosses else ""
        topics = sense_record.get("topics") or []
        sense_tags = sense_record.get("tags") or []

        if gender is None:
            for tag in sense_tags:
                if tag in _GENDER_TAGS:
                    gender = tag
                    break

        senses.append(
            {
                "id": sense_id,
                "lemma_id": lemma_id,
                "sense_index": i,
                "gloss": gloss,
                "topics": topics,
                "tags": sense_tags,
            }
        )

        # kaikki links are [text, target] pairs; anything malformed is ignored.
        # target_lemma_id stays None here and is resolved after the full import.
        for link_pair in (sense_record.get("links") or []):
            if isinstance(link_pair, list) and len(link_pair) == 2:
                link_text, link_target = link_pair[0], link_pair[1]
                link_id = uuid.uuid5(
                    _KAIKKI_UUID_NS,
                    f"kaikki:link:{sense_id}:{link_text}:{link_target}",
                )
                sense_links.append(
                    {
                        "id": link_id,
                        "sense_id": sense_id,
                        "link_text": link_text,
                        "link_target": link_target,
                        "target_lemma_id": None,
                    }
                )

    # Forms tagged as inflection-table metadata are not real wordforms.
    _METADATA_FORM_TAGS = {"table-tags", "inflection-template"}

    wordforms = []
    for f in record.get("forms") or []:
        form_text = (f.get("form") or "").strip()
        # Skip blanks and the headword itself (handled separately below).
        if not form_text or form_text == word:
            continue
        form_tags = f.get("tags") or []
        if _METADATA_FORM_TAGS.intersection(form_tags):
            continue
        wordforms.append(
            {
                "id": _wordform_uuid(lemma_id, form_text, form_tags),
                "lemma_id": lemma_id,
                "form": form_text,
                "tags": form_tags,
            }
        )

    # Verbs have a dedicated kaikki entry for each conjugated form (including
    # the infinitive itself), so the headword is already covered. For all other
    # POS (nouns, adjectives, …) no such entry exists, so we add the headword
    # form explicitly here.
    if pos_raw != "verb":
        wordforms.append(
            {
                "id": _wordform_uuid(lemma_id, word, []),
                "lemma_id": lemma_id,
                "form": word,
                "tags": [],
            }
        )

    return {
        "lemma": {
            "id": lemma_id,
            "headword": word,
            "language": lang_code,
            "pos_raw": pos_raw,
            "pos_normalised": _normalise_pos(pos_raw),
            "gender": gender,
            "tags": record.get("tags") or [],
        },
        "senses": senses,
        "sense_links": sense_links,
        "wordforms": wordforms,
        "raw": {
            "id": uuid.uuid5(_KAIKKI_UUID_NS, f"kaikki:raw:{lemma_id}"),
            "lemma_id": lemma_id,
            "language": lang_code,
            "raw": record,
        },
    }
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
# DB operations
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
async def _flush_batch(conn: sa.ext.asyncio.AsyncConnection, batch: list[dict]) -> None:
    """Upsert one batch of parsed entries into all five dictionary tables.

    Each element of *batch* is a dict produced by _parse_entry. Rows are
    upserted keyed on their deterministic UUIDs, then the transaction is
    committed.
    """
    # Flatten the per-entry structure into one row list per table.
    lemma_rows = [e["lemma"] for e in batch]
    sense_rows = [s for e in batch for s in e["senses"]]
    sense_link_rows = [lnk for e in batch for lnk in e["sense_links"]]
    wordform_rows = [w for e in batch for w in e["wordforms"]]
    raw_rows = [e["raw"] for e in batch]

    # asyncpg caps query parameters at 32767. Split each row list into chunks
    # sized so that rows × columns stays comfortably under that limit.
    def _chunks(rows: list[dict], n_cols: int) -> list[list[dict]]:
        size = max(1, 32767 // n_cols)
        return [rows[i : i + size] for i in range(0, len(rows), size)]

    # Deduplicate by id: safety net for truly identical rows (should be rare
    # now that the lemma UUID incorporates sense IDs). Last occurrence wins.
    def _dedup(rows: list[dict]) -> list[dict]:
        seen: dict = {}
        for row in rows:
            seen[row["id"]] = row
        return list(seen.values())

    lemma_rows = _dedup(lemma_rows)
    sense_rows = _dedup(sense_rows)
    wordform_rows = _dedup(wordform_rows)
    raw_rows = _dedup(raw_rows)
    sense_link_rows = _dedup(sense_link_rows)

    # Lemmas: update every mutable column on conflict (id is the natural key).
    for chunk in _chunks(lemma_rows, len(_lemma_table.columns)):
        stmt = pg_insert(_lemma_table).values(chunk)
        await conn.execute(stmt.on_conflict_do_update(
            index_elements=["id"],
            set_={
                "headword": stmt.excluded.headword,
                "pos_raw": stmt.excluded.pos_raw,
                "pos_normalised": stmt.excluded.pos_normalised,
                "gender": stmt.excluded.gender,
                "tags": stmt.excluded.tags,
            },
        ))

    for chunk in _chunks(sense_rows, len(_sense_table.columns)):
        stmt = pg_insert(_sense_table).values(chunk)
        await conn.execute(stmt.on_conflict_do_update(
            index_elements=["id"],
            set_={
                "sense_index": stmt.excluded.sense_index,
                "gloss": stmt.excluded.gloss,
                "topics": stmt.excluded.topics,
                "tags": stmt.excluded.tags,
            },
        ))

    # Wordforms: id already encodes lemma_id/form/tags, so only tags need
    # refreshing on conflict.
    for chunk in _chunks(wordform_rows, len(_wordform_table.columns)):
        stmt = pg_insert(_wordform_table).values(chunk)
        await conn.execute(stmt.on_conflict_do_update(
            index_elements=["id"],
            set_={"tags": stmt.excluded.tags},
        ))

    for chunk in _chunks(raw_rows, len(_raw_table.columns)):
        stmt = pg_insert(_raw_table).values(chunk)
        await conn.execute(stmt.on_conflict_do_update(
            index_elements=["id"],
            set_={"raw": stmt.excluded.raw},
        ))

    # Sense links are immutable once written (resolution updates them later),
    # so conflicts are simply ignored.
    for chunk in _chunks(sense_link_rows, len(_sense_link_table.columns)):
        await conn.execute(
            pg_insert(_sense_link_table).values(chunk).on_conflict_do_nothing()
        )

    await conn.commit()
|
|
|
|
|
|
|
|
|
|
|
|
|
2026-04-10 20:12:40 +00:00
|
|
|
|
# Wiktionary language-section names, used to recognise link targets of the
# form "word#Section" (e.g. "maboul#French") in _resolve_links.
_LANG_SECTION_MAP: dict[str, str] = {
    "fr": "French",
    "de": "German",
    "es": "Spanish",
    "it": "Italian",
    "pt": "Portuguese",
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
async def _resolve_links(conn: sa.ext.asyncio.AsyncConnection, lang_code: str) -> int:
    """Resolve target_lemma_id for sense links whose target matches lang_code.

    Links in kaikki data look like ``["maboul", "maboul#French"]``. After all
    lemmas have been imported we can attempt to match the target word to a row
    in dictionary_lemma and store the foreign key.

    Returns the number of links resolved. Commits the transaction.
    """
    section = _LANG_SECTION_MAP.get(lang_code)
    if not section:
        return 0

    suffix = f"#{section}"

    result = await conn.execute(
        sa.select(
            _sense_link_table.c.id,
            _sense_link_table.c.link_target,
        ).where(_sense_link_table.c.target_lemma_id.is_(None))
    )
    all_links = result.fetchall()

    # Filter to links that point at this language and extract the target word.
    candidates: list[tuple[uuid.UUID, str]] = []
    for row in all_links:
        if row.link_target.endswith(suffix):
            word = row.link_target[: -len(suffix)]
            candidates.append((row.id, word))

    if not candidates:
        return 0

    # Look up all target headwords in one query.
    target_words = list({w for _, w in candidates})
    lemma_result = await conn.execute(
        sa.select(_lemma_table.c.id, _lemma_table.c.headword)
        .where(_lemma_table.c.language == lang_code)
        .where(_lemma_table.c.headword.in_(target_words))
    )
    lemma_map: dict[str, uuid.UUID] = {r.headword: r.id for r in lemma_result}

    # Apply all updates in a single executemany instead of one round trip per
    # link — with hundreds of thousands of links the per-row loop dominated
    # runtime.
    updates = [
        {"b_link_id": link_id, "b_lemma_id": lemma_map[word]}
        for link_id, word in candidates
        if word in lemma_map
    ]
    if updates:
        await conn.execute(
            _sense_link_table.update()
            .where(_sense_link_table.c.id == sa.bindparam("b_link_id"))
            .values(target_lemma_id=sa.bindparam("b_lemma_id")),
            updates,
        )

    await conn.commit()
    return len(updates)
|
|
|
|
|
|
|
|
|
|
|
|
|
2026-04-08 19:26:26 +00:00
|
|
|
|
async def run_import(lang_code: str, batch_size: int = 1000) -> None:
    """Stream the kaikki JSONL file for *lang_code* into Postgres.

    Exits the process with status 1 if the language has no file mapping or
    the JSONL file is missing. Entries are parsed and upserted in batches of
    *batch_size*; after the import pass, sense links are resolved on a fresh
    connection.
    """
    lang_file = _LANG_FILE_MAP.get(lang_code)
    if not lang_file:
        print(
            f"No file mapping for lang_code={lang_code!r}. Known: {list(_LANG_FILE_MAP)}",
            file=sys.stderr,
        )
        sys.exit(1)

    jsonl_path = _DICT_DIR / lang_file
    if not jsonl_path.exists():
        print(f"JSONL file not found: {jsonl_path}", file=sys.stderr)
        sys.exit(1)

    database_url = os.environ.get(
        "DATABASE_URL",
        "postgresql+asyncpg://langlearn:changeme@localhost:5432/langlearn",
    )

    engine = create_async_engine(database_url, echo=False)

    try:
        async with engine.connect() as conn:
            # No upfront delete — rows are upserted so existing FK references
            # (word bank entries, pack entries) are preserved across re-imports.
            # To fully wipe and start fresh, run clear_dictionary.py first.
            print(f"Importing {jsonl_path} ...")
            batch: list[dict] = []
            total_lemmas = 0
            skipped = 0

            with open(jsonl_path, encoding="utf-8") as f:
                for line_num, line in enumerate(f, 1):
                    line = line.strip()
                    if not line:
                        continue

                    # Malformed lines are reported and skipped, never fatal.
                    try:
                        record = json.loads(line)
                    except json.JSONDecodeError as exc:
                        print(
                            f" Line {line_num}: JSON parse error: {exc}",
                            file=sys.stderr,
                        )
                        skipped += 1
                        continue

                    parsed = _parse_entry(record, lang_code)
                    if parsed is None:
                        skipped += 1
                        continue

                    batch.append(parsed)

                    # Flush (and commit) once the batch is full.
                    if len(batch) >= batch_size:
                        await _flush_batch(conn, batch)
                        total_lemmas += len(batch)
                        print(f" Committed {total_lemmas} lemmas...")
                        batch = []

            # Flush the final partial batch.
            if batch:
                await _flush_batch(conn, batch)
                total_lemmas += len(batch)

        print(f"Done. Imported {total_lemmas} lemmas, skipped {skipped} lines.")

        # Second pass on a fresh connection: wire up sense-link foreign keys
        # now that all lemmas exist.
        async with engine.connect() as conn:
            print("Resolving sense links...")
            resolved = await _resolve_links(conn, lang_code)
            print(f"Resolved {resolved} sense links.")
    finally:
        await engine.dispose()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def main() -> None:
    """Entry point: parse CLI flags and run the async importer."""
    arg_parser = argparse.ArgumentParser(
        description="Import kaikki dictionary JSONL into Postgres."
    )
    arg_parser.add_argument(
        "--lang", required=True, help="Language code to import (e.g. fr)"
    )
    arg_parser.add_argument(
        "--batch-size", type=int, default=1000, help="Rows per commit (default: 1000)"
    )
    opts = arg_parser.parse_args()

    asyncio.run(run_import(opts.lang, opts.batch_size))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Script entry point guard — allows importing this module without side effects.
if __name__ == "__main__":
    main()
|