diff --git a/Makefile b/Makefile index 8d4e229..d2a419b 100644 --- a/Makefile +++ b/Makefile @@ -1,4 +1,4 @@ -.PHONY: down build up logs shell lock migrate migration +.PHONY: down build up logs shell lock migrate migration import-dictionary build: docker compose build @@ -28,3 +28,10 @@ lock: cd api && uv pip compile pyproject.toml -o requirements.txt rebuild: down build up + +# Import a kaikki dictionary JSONL into Postgres. +# Requires the DB to be running with its port exposed on localhost (docker compose up). +# DATABASE_URL defaults to the docker-compose dev credentials. +# Usage: make import-dictionary lang=fr +import-dictionary: + cd api && python scripts/import_dictionary.py --lang $(lang) diff --git a/api/alembic/env.py b/api/alembic/env.py index 69d3756..3fa77c1 100644 --- a/api/alembic/env.py +++ b/api/alembic/env.py @@ -10,6 +10,7 @@ from app.outbound.postgres.database import Base import app.outbound.postgres.entities.summarise_job_entity import app.outbound.postgres.entities.user_entity +import app.outbound.postgres.entities.dictionary_entities config = context.config config.set_main_option("sqlalchemy.url", settings.database_url) diff --git a/api/alembic/versions/20260407_0007_add_dictionary_tables.py b/api/alembic/versions/20260407_0007_add_dictionary_tables.py new file mode 100644 index 0000000..9b1591d --- /dev/null +++ b/api/alembic/versions/20260407_0007_add_dictionary_tables.py @@ -0,0 +1,89 @@ +"""add dictionary tables + +Revision ID: 0007 +Revises: 0006 +Create Date: 2026-04-07 + +""" +from typing import Sequence, Union + +import sqlalchemy as sa +from alembic import op +from sqlalchemy.dialects import postgresql + +revision: str = "0007" +down_revision: Union[str, None] = "0006" +branch_labels: Union[str, Sequence[str], None] = None +depends_on: Union[str, Sequence[str], None] = None + + +def upgrade() -> None: + op.create_table( + "dictionary_lemma", + sa.Column("id", postgresql.UUID(as_uuid=True), primary_key=True), + sa.Column("headword", 
sa.Text(), nullable=False), + sa.Column("language", sa.String(2), nullable=False), + sa.Column("pos_raw", sa.Text(), nullable=False), + sa.Column("pos_normalised", sa.Text(), nullable=True), + sa.Column("gender", sa.Text(), nullable=True), + sa.Column("tags", postgresql.ARRAY(sa.Text()), nullable=False, server_default="{}"), + ) + op.create_index("ix_dictionary_lemma_headword_language", "dictionary_lemma", ["headword", "language"]) + op.create_index("ix_dictionary_lemma_language", "dictionary_lemma", ["language"]) + + op.create_table( + "dictionary_sense", + sa.Column("id", postgresql.UUID(as_uuid=True), primary_key=True), + sa.Column( + "lemma_id", + postgresql.UUID(as_uuid=True), + sa.ForeignKey("dictionary_lemma.id", ondelete="CASCADE"), + nullable=False, + ), + sa.Column("sense_index", sa.Integer(), nullable=False), + sa.Column("gloss", sa.Text(), nullable=False, server_default=""), + sa.Column("topics", postgresql.ARRAY(sa.Text()), nullable=False, server_default="{}"), + sa.Column("tags", postgresql.ARRAY(sa.Text()), nullable=False, server_default="{}"), + ) + op.create_index("ix_dictionary_sense_lemma_id", "dictionary_sense", ["lemma_id"]) + + op.create_table( + "dictionary_wordform", + sa.Column("id", postgresql.UUID(as_uuid=True), primary_key=True), + sa.Column( + "lemma_id", + postgresql.UUID(as_uuid=True), + sa.ForeignKey("dictionary_lemma.id", ondelete="CASCADE"), + nullable=False, + ), + sa.Column("form", sa.Text(), nullable=False), + sa.Column("tags", postgresql.ARRAY(sa.Text()), nullable=False, server_default="{}"), + ) + op.create_index("ix_dictionary_wordform_lemma_id", "dictionary_wordform", ["lemma_id"]) + op.create_index("ix_dictionary_wordform_form", "dictionary_wordform", ["form"]) + + op.create_table( + "dictionary_lemma_raw", + sa.Column("id", postgresql.UUID(as_uuid=True), primary_key=True), + sa.Column( + "lemma_id", + postgresql.UUID(as_uuid=True), + sa.ForeignKey("dictionary_lemma.id", ondelete="CASCADE"), + nullable=False, + unique=True, 
+ ), + sa.Column("language", sa.String(2), nullable=False), + sa.Column("raw", postgresql.JSONB(), nullable=False), + ) + + +def downgrade() -> None: + op.drop_table("dictionary_lemma_raw") + op.drop_index("ix_dictionary_wordform_form", table_name="dictionary_wordform") + op.drop_index("ix_dictionary_wordform_lemma_id", table_name="dictionary_wordform") + op.drop_table("dictionary_wordform") + op.drop_index("ix_dictionary_sense_lemma_id", table_name="dictionary_sense") + op.drop_table("dictionary_sense") + op.drop_index("ix_dictionary_lemma_language", table_name="dictionary_lemma") + op.drop_index("ix_dictionary_lemma_headword_language", table_name="dictionary_lemma") + op.drop_table("dictionary_lemma") diff --git a/api/app/domain/models/dictionary.py b/api/app/domain/models/dictionary.py new file mode 100644 index 0000000..327efac --- /dev/null +++ b/api/app/domain/models/dictionary.py @@ -0,0 +1,32 @@ +from dataclasses import dataclass, field + + +@dataclass +class Wordform: + id: str + lemma_id: str + form: str + tags: list[str] + + +@dataclass +class Sense: + id: str + lemma_id: str + sense_index: int + gloss: str + topics: list[str] + tags: list[str] + + +@dataclass +class Lemma: + id: str + headword: str + language: str + pos_raw: str + pos_normalised: str | None + gender: str | None + tags: list[str] + senses: list[Sense] = field(default_factory=list) + wordforms: list[Wordform] = field(default_factory=list) diff --git a/api/app/outbound/postgres/entities/dictionary_entities.py b/api/app/outbound/postgres/entities/dictionary_entities.py new file mode 100644 index 0000000..4036018 --- /dev/null +++ b/api/app/outbound/postgres/entities/dictionary_entities.py @@ -0,0 +1,63 @@ +import uuid + +from sqlalchemy import String, Text, ForeignKey, Integer +from sqlalchemy.orm import Mapped, mapped_column +from sqlalchemy.dialects.postgresql import UUID, ARRAY, JSONB + +from ..database import Base + + +class DictionaryLemmaEntity(Base): + __tablename__ = "dictionary_lemma" + + 
id: Mapped[uuid.UUID] = mapped_column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4) + headword: Mapped[str] = mapped_column(Text, nullable=False) + language: Mapped[str] = mapped_column(String(2), nullable=False, index=True) + pos_raw: Mapped[str] = mapped_column(Text, nullable=False) + pos_normalised: Mapped[str | None] = mapped_column(Text, nullable=True) + gender: Mapped[str | None] = mapped_column(Text, nullable=True) + tags: Mapped[list[str]] = mapped_column(ARRAY(Text), nullable=False, server_default="{}") + + +class DictionarySenseEntity(Base): + __tablename__ = "dictionary_sense" + + id: Mapped[uuid.UUID] = mapped_column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4) + lemma_id: Mapped[uuid.UUID] = mapped_column( + UUID(as_uuid=True), + ForeignKey("dictionary_lemma.id", ondelete="CASCADE"), + nullable=False, + index=True, + ) + sense_index: Mapped[int] = mapped_column(Integer, nullable=False) + gloss: Mapped[str] = mapped_column(Text, nullable=False, server_default="") + topics: Mapped[list[str]] = mapped_column(ARRAY(Text), nullable=False, server_default="{}") + tags: Mapped[list[str]] = mapped_column(ARRAY(Text), nullable=False, server_default="{}") + + +class DictionaryWordformEntity(Base): + __tablename__ = "dictionary_wordform" + + id: Mapped[uuid.UUID] = mapped_column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4) + lemma_id: Mapped[uuid.UUID] = mapped_column( + UUID(as_uuid=True), + ForeignKey("dictionary_lemma.id", ondelete="CASCADE"), + nullable=False, + index=True, + ) + form: Mapped[str] = mapped_column(Text, nullable=False, index=True) + tags: Mapped[list[str]] = mapped_column(ARRAY(Text), nullable=False, server_default="{}") + + +class DictionaryLemmaRawEntity(Base): + __tablename__ = "dictionary_lemma_raw" + + id: Mapped[uuid.UUID] = mapped_column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4) + lemma_id: Mapped[uuid.UUID] = mapped_column( + UUID(as_uuid=True), + ForeignKey("dictionary_lemma.id", 
ondelete="CASCADE"), + nullable=False, + unique=True, + ) + language: Mapped[str] = mapped_column(String(2), nullable=False) + raw: Mapped[dict] = mapped_column(JSONB, nullable=False) diff --git a/api/app/outbound/postgres/repositories/dictionary_repository.py b/api/app/outbound/postgres/repositories/dictionary_repository.py new file mode 100644 index 0000000..3901e6c --- /dev/null +++ b/api/app/outbound/postgres/repositories/dictionary_repository.py @@ -0,0 +1,80 @@ +import uuid +from typing import Protocol + +from sqlalchemy import select +from sqlalchemy.ext.asyncio import AsyncSession + +from ..entities.dictionary_entities import ( + DictionaryLemmaEntity, + DictionarySenseEntity, + DictionaryWordformEntity, +) +from ....domain.models.dictionary import Lemma, Sense, Wordform + + +class DictionaryRepository(Protocol): + async def get_senses_for_headword(self, headword: str, language: str) -> list[Sense]: ... + async def find_senses_by_english_gloss(self, text: str, target_lang: str) -> list[Sense]: ... + async def get_wordforms_for_lemma(self, lemma_id: uuid.UUID) -> list[Wordform]: ... 
+ + +def _sense_to_model(entity: DictionarySenseEntity) -> Sense: + return Sense( + id=str(entity.id), + lemma_id=str(entity.lemma_id), + sense_index=entity.sense_index, + gloss=entity.gloss, + topics=entity.topics or [], + tags=entity.tags or [], + ) + + +def _wordform_to_model(entity: DictionaryWordformEntity) -> Wordform: + return Wordform( + id=str(entity.id), + lemma_id=str(entity.lemma_id), + form=entity.form, + tags=entity.tags or [], + ) + + +class PostgresDictionaryRepository: + def __init__(self, db: AsyncSession) -> None: + self.db = db + + async def get_senses_for_headword(self, headword: str, language: str) -> list[Sense]: + result = await self.db.execute( + select(DictionarySenseEntity) + .join(DictionaryLemmaEntity, DictionarySenseEntity.lemma_id == DictionaryLemmaEntity.id) + .where( + DictionaryLemmaEntity.headword == headword, + DictionaryLemmaEntity.language == language, + ) + .order_by(DictionarySenseEntity.sense_index) + ) + return [_sense_to_model(e) for e in result.scalars().all()] + + async def find_senses_by_english_gloss(self, text: str, target_lang: str) -> list[Sense]: + """EN→target direction: find senses whose gloss matches the given English text. + + Uses a case-insensitive exact match on the gloss column, filtered to the + target language via the joined lemma row. 
+ """ + result = await self.db.execute( + select(DictionarySenseEntity) + .join(DictionaryLemmaEntity, DictionarySenseEntity.lemma_id == DictionaryLemmaEntity.id) + .where( + DictionarySenseEntity.gloss.ilike(text), + DictionaryLemmaEntity.language == target_lang, + ) + .order_by(DictionarySenseEntity.sense_index) + ) + return [_sense_to_model(e) for e in result.scalars().all()] + + async def get_wordforms_for_lemma(self, lemma_id: uuid.UUID) -> list[Wordform]: + result = await self.db.execute( + select(DictionaryWordformEntity).where( + DictionaryWordformEntity.lemma_id == lemma_id + ) + ) + return [_wordform_to_model(e) for e in result.scalars().all()] diff --git a/api/scripts/import_dictionary.py b/api/scripts/import_dictionary.py new file mode 100644 index 0000000..7bded71 --- /dev/null +++ b/api/scripts/import_dictionary.py @@ -0,0 +1,322 @@ +#!/usr/bin/env python +""" +CLI import script for kaikki/wiktextract JSONL dictionary data. + +Usage (from api/ directory): + uv run ./scripts/import_dictionary.py --lang fr + + # or via Make from the repo root: + make import-dictionary lang=fr + +DATABASE_URL defaults to postgresql+asyncpg://langlearn:langlearn@localhost:5432/langlearn +which matches the docker-compose dev credentials when the DB port is exposed on the host. 
+""" + +import argparse +import asyncio +import json +import os +import sys +import uuid +from pathlib import Path + +import sqlalchemy as sa +from sqlalchemy.dialects.postgresql import ARRAY, JSONB +from sqlalchemy.dialects.postgresql import UUID as PG_UUID +from sqlalchemy.ext.asyncio import AsyncSession, async_sessionmaker, create_async_engine + +_API_DIR = Path(__file__).parent.parent +_REPO_ROOT = _API_DIR.parent +_DICT_DIR = _REPO_ROOT / "dictionaries" / "kaikki" + +_LANG_FILE_MAP: dict[str, str] = { + "fr": "french.jsonl", +} + +_POS_MAP: dict[str, str] = { + "noun": "NOUN", + "verb": "VERB", + "adj": "ADJ", + "adv": "ADV", + "det": "DET", + "article": "DET", + "pron": "PRON", + "prep": "ADP", + "adp": "ADP", + "conj": "CCONJ", + "cconj": "CCONJ", + "sconj": "SCONJ", + "intj": "INTJ", + "num": "NUM", + "numeral": "NUM", + "part": "PART", + "particle": "PART", + "name": "PROPN", + "propn": "PROPN", + "proper noun": "PROPN", + "punct": "PUNCT", + "sym": "SYM", +} + +_GENDER_MAP: dict[str, str] = { + "masculine": "masculine", + "masc": "masculine", + "feminine": "feminine", + "fem": "feminine", + "neuter": "neuter", + "common": "common", +} + +# --------------------------------------------------------------------------- +# Standalone table definitions — no app imports, no Settings() call +# --------------------------------------------------------------------------- + +_meta = sa.MetaData() + +_lemma_table = sa.Table( + "dictionary_lemma", + _meta, + sa.Column("id", PG_UUID(as_uuid=True), primary_key=True), + sa.Column("headword", sa.Text(), nullable=False), + sa.Column("language", sa.String(2), nullable=False), + sa.Column("pos_raw", sa.Text(), nullable=False), + sa.Column("pos_normalised", sa.Text(), nullable=True), + sa.Column("gender", sa.Text(), nullable=True), + sa.Column("tags", ARRAY(sa.Text()), nullable=False), +) + +_sense_table = sa.Table( + "dictionary_sense", + _meta, + sa.Column("id", PG_UUID(as_uuid=True), primary_key=True), + 
sa.Column("lemma_id", PG_UUID(as_uuid=True), nullable=False), + sa.Column("sense_index", sa.Integer(), nullable=False), + sa.Column("gloss", sa.Text(), nullable=False), + sa.Column("topics", ARRAY(sa.Text()), nullable=False), + sa.Column("tags", ARRAY(sa.Text()), nullable=False), +) + +_wordform_table = sa.Table( + "dictionary_wordform", + _meta, + sa.Column("id", PG_UUID(as_uuid=True), primary_key=True), + sa.Column("lemma_id", PG_UUID(as_uuid=True), nullable=False), + sa.Column("form", sa.Text(), nullable=False), + sa.Column("tags", ARRAY(sa.Text()), nullable=False), +) + +_raw_table = sa.Table( + "dictionary_lemma_raw", + _meta, + sa.Column("id", PG_UUID(as_uuid=True), primary_key=True), + sa.Column("lemma_id", PG_UUID(as_uuid=True), nullable=False), + sa.Column("language", sa.String(2), nullable=False), + sa.Column("raw", JSONB(), nullable=False), +) + +# --------------------------------------------------------------------------- +# Normalisation helpers +# --------------------------------------------------------------------------- + + +def _normalise_pos(pos_raw: str) -> str | None: + return _POS_MAP.get(pos_raw.lower().strip()) + + +def _normalise_gender(tags: list) -> str | None: + for tag in tags: + mapped = _GENDER_MAP.get(tag) + if mapped: + return mapped + return None + + +# --------------------------------------------------------------------------- +# Parsing +# --------------------------------------------------------------------------- + + +def _parse_entry(record: dict, lang_code: str) -> dict | None: + """Parse one kaikki JSONL record into insertion-ready row dicts. + + Returns None if the entry should be skipped. 
+ """ + if record.get("lang_code") != lang_code: + return None + + word = (record.get("word") or "").strip() + if not word: + return None + + pos_raw = (record.get("pos") or "").strip() + top_tags = record.get("tags") or [] + + lemma_id = uuid.uuid4() + + senses = [] + for i, sense_record in enumerate(record.get("senses") or []): + sense_id = uuid.uuid4() + glosses = sense_record.get("glosses") or [] + gloss = glosses[0] if glosses else "" + topics = sense_record.get("topics") or [] + sense_tags = sense_record.get("tags") or [] + + senses.append( + { + "id": sense_id, + "lemma_id": lemma_id, + "sense_index": i, + "gloss": gloss, + "topics": topics, + "tags": sense_tags, + } + ) + + wordforms = [] + for f in record.get("forms") or []: + form_text = (f.get("form") or "").strip() + if not form_text or form_text == word: + continue + form_tags = f.get("tags") or [] + wordforms.append( + { + "id": uuid.uuid4(), + "lemma_id": lemma_id, + "form": form_text, + "tags": form_tags, + } + ) + + return { + "lemma": { + "id": lemma_id, + "headword": word, + "language": lang_code, + "pos_raw": pos_raw, + "pos_normalised": _normalise_pos(pos_raw), + "gender": _normalise_gender(top_tags), + "tags": top_tags, + }, + "senses": senses, + "wordforms": wordforms, + "raw": { + "id": uuid.uuid4(), + "lemma_id": lemma_id, + "language": lang_code, + "raw": record, + }, + } + + +# --------------------------------------------------------------------------- +# DB operations +# --------------------------------------------------------------------------- + + +async def _flush_batch(conn: sa.ext.asyncio.AsyncConnection, batch: list[dict]) -> None: + lemma_rows = [e["lemma"] for e in batch] + sense_rows = [s for e in batch for s in e["senses"]] + wordform_rows = [w for e in batch for w in e["wordforms"]] + raw_rows = [e["raw"] for e in batch] + + if lemma_rows: + await conn.execute(_lemma_table.insert(), lemma_rows) + if sense_rows: + await conn.execute(_sense_table.insert(), sense_rows) + if 
wordform_rows: + await conn.execute(_wordform_table.insert(), wordform_rows) + if raw_rows: + await conn.execute(_raw_table.insert(), raw_rows) + + await conn.commit() + + +async def run_import(lang_code: str, batch_size: int = 1000) -> None: + lang_file = _LANG_FILE_MAP.get(lang_code) + if not lang_file: + print( + f"No file mapping for lang_code={lang_code!r}. Known: {list(_LANG_FILE_MAP)}", + file=sys.stderr, + ) + sys.exit(1) + + jsonl_path = _DICT_DIR / lang_file + if not jsonl_path.exists(): + print(f"JSONL file not found: {jsonl_path}", file=sys.stderr) + sys.exit(1) + + database_url = os.environ.get( + "DATABASE_URL", + "postgresql+asyncpg://langlearn:changeme@localhost:5432/langlearn", + ) + + engine = create_async_engine(database_url, echo=False) + + try: + async with engine.connect() as conn: + print(f"Deleting existing entries for language={lang_code!r}...") + await conn.execute( + _lemma_table.delete().where(_lemma_table.c.language == lang_code) + ) + await conn.commit() + + print(f"Importing {jsonl_path} ...") + batch: list[dict] = [] + total_lemmas = 0 + skipped = 0 + + with open(jsonl_path, encoding="utf-8") as f: + for line_num, line in enumerate(f, 1): + line = line.strip() + if not line: + continue + + try: + record = json.loads(line) + except json.JSONDecodeError as exc: + print( + f" Line {line_num}: JSON parse error: {exc}", + file=sys.stderr, + ) + skipped += 1 + continue + + parsed = _parse_entry(record, lang_code) + if parsed is None: + skipped += 1 + continue + + batch.append(parsed) + + if len(batch) >= batch_size: + await _flush_batch(conn, batch) + total_lemmas += len(batch) + print(f" Committed {total_lemmas} lemmas...") + batch = [] + + if batch: + await _flush_batch(conn, batch) + total_lemmas += len(batch) + + print(f"Done. 
Imported {total_lemmas} lemmas, skipped {skipped} lines.") finally: await engine.dispose() + + +def main() -> None: + parser = argparse.ArgumentParser( + description="Import kaikki dictionary JSONL into Postgres." + ) + parser.add_argument( + "--lang", required=True, help="Language code to import (e.g. fr)" + ) + parser.add_argument( + "--batch-size", type=int, default=1000, help="Rows per commit (default: 1000)" + ) + args = parser.parse_args() + + asyncio.run(run_import(args.lang, args.batch_size)) + + +if __name__ == "__main__": + main() diff --git a/dictionaries/.gitignore b/dictionaries/.gitignore new file mode 100644 index 0000000..2fb4e2d --- /dev/null +++ b/dictionaries/.gitignore @@ -0,0 +1 @@ +*.jsonl diff --git a/dictionaries/README.md b/dictionaries/README.md new file mode 100644 index 0000000..93cf50e --- /dev/null +++ b/dictionaries/README.md @@ -0,0 +1,3 @@ +# Dictionaries + +This module contains dictionaries of words, namely from the [Kaikki](https://kaikki.org/dictionary/index.html) project. It is responsible for generating lexical information about words, for both the system and the user, to help describe the language they are using. \ No newline at end of file