#!/usr/bin/env python
"""
CLI import script for kaikki/wiktextract JSONL dictionary data.

Usage (from api/ directory):

    uv run ./scripts/import_dictionary.py --lang fr

    # or via Make from the repo root:
    make import-dictionary lang=fr

DATABASE_URL defaults to
    postgresql+asyncpg://langlearn:langlearn@localhost:5432/langlearn
which matches the docker-compose dev credentials when the DB port is exposed
on the host.
"""

import argparse
import asyncio
import json
import os
import sys
import uuid
from pathlib import Path

import sqlalchemy as sa
from sqlalchemy.dialects.postgresql import ARRAY, JSONB
from sqlalchemy.dialects.postgresql import UUID as PG_UUID
from sqlalchemy.ext.asyncio import AsyncConnection, create_async_engine

_API_DIR = Path(__file__).parent.parent
_REPO_ROOT = _API_DIR.parent
_DICT_DIR = _REPO_ROOT / "dictionaries" / "kaikki"

_LANG_FILE_MAP: dict[str, str] = {
    "fr": "french.jsonl",
}

_POS_MAP: dict[str, str] = {
    "noun": "NOUN",
    "verb": "VERB",
    "adj": "ADJ",
    "adv": "ADV",
    "det": "DET",
    "article": "DET",
    "pron": "PRON",
    "prep": "ADP",
    "adp": "ADP",
    "conj": "CCONJ",
    "cconj": "CCONJ",
    "sconj": "SCONJ",
    "intj": "INTJ",
    "num": "NUM",
    "numeral": "NUM",
    "part": "PART",
    "particle": "PART",
    "name": "PROPN",
    "propn": "PROPN",
    "proper noun": "PROPN",
    "punct": "PUNCT",
    "sym": "SYM",
}

_GENDER_MAP: dict[str, str] = {
    "masculine": "masculine",
    "masc": "masculine",
    "feminine": "feminine",
    "fem": "feminine",
    "neuter": "neuter",
    "common": "common",
}

# ---------------------------------------------------------------------------
# Standalone table definitions — no app imports, no Settings() call
# ---------------------------------------------------------------------------

_meta = sa.MetaData()

_lemma_table = sa.Table(
    "dictionary_lemma",
    _meta,
    sa.Column("id", PG_UUID(as_uuid=True), primary_key=True),
    sa.Column("headword", sa.Text(), nullable=False),
    sa.Column("language", sa.String(2), nullable=False),
    sa.Column("pos_raw", sa.Text(), nullable=False),
    sa.Column("pos_normalised", sa.Text(), nullable=True),
    sa.Column("gender", sa.Text(), nullable=True),
    sa.Column("tags", ARRAY(sa.Text()), nullable=False),
)

_sense_table = sa.Table(
    "dictionary_sense",
    _meta,
    sa.Column("id", PG_UUID(as_uuid=True), primary_key=True),
    sa.Column("lemma_id", PG_UUID(as_uuid=True), nullable=False),
    sa.Column("sense_index", sa.Integer(), nullable=False),
    sa.Column("gloss", sa.Text(), nullable=False),
    sa.Column("topics", ARRAY(sa.Text()), nullable=False),
    sa.Column("tags", ARRAY(sa.Text()), nullable=False),
)

_wordform_table = sa.Table(
    "dictionary_wordform",
    _meta,
    sa.Column("id", PG_UUID(as_uuid=True), primary_key=True),
    sa.Column("lemma_id", PG_UUID(as_uuid=True), nullable=False),
    sa.Column("form", sa.Text(), nullable=False),
    sa.Column("tags", ARRAY(sa.Text()), nullable=False),
)

_raw_table = sa.Table(
    "dictionary_lemma_raw",
    _meta,
    sa.Column("id", PG_UUID(as_uuid=True), primary_key=True),
    sa.Column("lemma_id", PG_UUID(as_uuid=True), nullable=False),
    sa.Column("language", sa.String(2), nullable=False),
    sa.Column("raw", JSONB(), nullable=False),
)

# ---------------------------------------------------------------------------
# Normalisation helpers
# ---------------------------------------------------------------------------


def _normalise_pos(pos_raw: str) -> str | None:
    """Map a raw kaikki POS label to its normalised tag, if known."""
    return _POS_MAP.get(pos_raw.lower().strip())


def _normalise_gender(tags: list[str]) -> str | None:
    """Return the first recognised grammatical gender found in the tag list."""
    for tag in tags:
        mapped = _GENDER_MAP.get(tag)
        if mapped:
            return mapped
    return None

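# For reference, a few illustrative (not exhaustive) examples of the helpers above,
# derived directly from the maps:
#   _normalise_pos(" Adj ")                       -> "ADJ"
#   _normalise_pos("gerund")                      -> None   (unmapped labels fall through)
#   _normalise_gender(["feminine", "countable"])  -> "feminine"
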

# ---------------------------------------------------------------------------
# Parsing
# ---------------------------------------------------------------------------


def _parse_entry(record: dict, lang_code: str) -> dict | None:
    """Parse one kaikki JSONL record into insertion-ready row dicts.

    Returns None if the entry should be skipped.
    """
    if record.get("lang_code") != lang_code:
        return None

    word = (record.get("word") or "").strip()
    if not word:
        return None

    pos_raw = (record.get("pos") or "").strip()
    top_tags = record.get("tags") or []

    lemma_id = uuid.uuid4()

    senses = []
    for i, sense_record in enumerate(record.get("senses") or []):
        sense_id = uuid.uuid4()
        glosses = sense_record.get("glosses") or []
        gloss = glosses[0] if glosses else ""
        topics = sense_record.get("topics") or []
        sense_tags = sense_record.get("tags") or []
        senses.append(
            {
                "id": sense_id,
                "lemma_id": lemma_id,
                "sense_index": i,
                "gloss": gloss,
                "topics": topics,
                "tags": sense_tags,
            }
        )

    wordforms = []
    for f in record.get("forms") or []:
        form_text = (f.get("form") or "").strip()
        if not form_text or form_text == word:
            continue
        form_tags = f.get("tags") or []
        wordforms.append(
            {
                "id": uuid.uuid4(),
                "lemma_id": lemma_id,
                "form": form_text,
                "tags": form_tags,
            }
        )

    return {
        "lemma": {
            "id": lemma_id,
            "headword": word,
            "language": lang_code,
            "pos_raw": pos_raw,
            "pos_normalised": _normalise_pos(pos_raw),
            "gender": _normalise_gender(top_tags),
            "tags": top_tags,
        },
        "senses": senses,
        "wordforms": wordforms,
        "raw": {
            "id": uuid.uuid4(),
            "lemma_id": lemma_id,
            "language": lang_code,
            "raw": record,
        },
    }


# ---------------------------------------------------------------------------
# DB operations
# ---------------------------------------------------------------------------


async def _flush_batch(conn: AsyncConnection, batch: list[dict]) -> None:
    """Bulk-insert one batch of parsed entries and commit."""
    lemma_rows = [e["lemma"] for e in batch]
    sense_rows = [s for e in batch for s in e["senses"]]
    wordform_rows = [w for e in batch for w in e["wordforms"]]
    raw_rows = [e["raw"] for e in batch]

    if lemma_rows:
        await conn.execute(_lemma_table.insert(), lemma_rows)
    if sense_rows:
        await conn.execute(_sense_table.insert(), sense_rows)
    if wordform_rows:
        await conn.execute(_wordform_table.insert(), wordform_rows)
    if raw_rows:
        await conn.execute(_raw_table.insert(), raw_rows)
    await conn.commit()


async def run_import(lang_code: str, batch_size: int = 1000) -> None:
    lang_file = _LANG_FILE_MAP.get(lang_code)
    if not lang_file:
        print(
            f"No file mapping for lang_code={lang_code!r}. Known: {list(_LANG_FILE_MAP)}",
            file=sys.stderr,
        )
        sys.exit(1)

    jsonl_path = _DICT_DIR / lang_file
    if not jsonl_path.exists():
        print(f"JSONL file not found: {jsonl_path}", file=sys.stderr)
        sys.exit(1)

    database_url = os.environ.get(
        "DATABASE_URL",
        "postgresql+asyncpg://langlearn:langlearn@localhost:5432/langlearn",
    )
    engine = create_async_engine(database_url, echo=False)

    try:
        async with engine.connect() as conn:
            print(f"Deleting existing entries for language={lang_code!r}...")
            # Only lemma rows are deleted directly; dependent sense/wordform/raw
            # rows are assumed to be cleaned up by the live schema (e.g. ON DELETE
            # CASCADE), which is not declared in the standalone table definitions above.
            await conn.execute(
                _lemma_table.delete().where(_lemma_table.c.language == lang_code)
            )
            await conn.commit()

            print(f"Importing {jsonl_path} ...")
            batch: list[dict] = []
            total_lemmas = 0
            skipped = 0

            with open(jsonl_path, encoding="utf-8") as f:
                for line_num, line in enumerate(f, 1):
                    line = line.strip()
                    if not line:
                        continue
                    try:
                        record = json.loads(line)
                    except json.JSONDecodeError as exc:
                        print(
                            f" Line {line_num}: JSON parse error: {exc}",
                            file=sys.stderr,
                        )
                        skipped += 1
                        continue

                    parsed = _parse_entry(record, lang_code)
                    if parsed is None:
                        skipped += 1
                        continue

                    batch.append(parsed)
                    if len(batch) >= batch_size:
                        await _flush_batch(conn, batch)
                        total_lemmas += len(batch)
                        print(f" Committed {total_lemmas} lemmas...")
                        batch = []

            if batch:
                await _flush_batch(conn, batch)
                total_lemmas += len(batch)

            print(f"Done. Imported {total_lemmas} lemmas, skipped {skipped} lines.")
    finally:
        await engine.dispose()


def main() -> None:
    parser = argparse.ArgumentParser(
        description="Import kaikki dictionary JSONL into Postgres."
    )
    parser.add_argument(
        "--lang", required=True, help="Language code to import (e.g. fr)"
    )
    parser.add_argument(
        "--batch-size", type=int, default=1000, help="Rows per commit (default: 1000)"
    )
    args = parser.parse_args()

    asyncio.run(run_import(args.lang, args.batch_size))


if __name__ == "__main__":
    main()
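
# Illustrative post-import sanity check (run manually against the same database,
# e.g. via psql); table and column names are those defined above:
#   SELECT pos_normalised, count(*) FROM dictionary_lemma
#   WHERE language = 'fr' GROUP BY pos_normalised ORDER BY count(*) DESC;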