feat: Build the bilingual dictionary data
This commit is contained in:
parent
2cae5d9445
commit
873ebacd4d
9 changed files with 599 additions and 1 deletions
9
Makefile
9
Makefile
|
|
@ -1,4 +1,4 @@
|
||||||
.PHONY: down build up logs shell lock migrate migration import-dictionary

build:
	docker compose build

# ... (unchanged targets elided in this view) ...

lock:
	cd api && uv pip compile pyproject.toml -o requirements.txt

rebuild: down build up

# Import a kaikki dictionary JSONL into Postgres.
# Requires the DB to be running with its port exposed on localhost (docker compose up).
# DATABASE_URL defaults to the docker-compose dev credentials.
# Usage: make import-dictionary lang=fr
import-dictionary:
	cd api && python scripts/import_dictionary.py --lang $(lang)
|
||||||
|
|
|
||||||
|
|
@ -10,6 +10,7 @@ from app.outbound.postgres.database import Base
|
||||||
|
|
||||||
# These imports are needed only for their side effect: importing each entity
# module registers its tables on Base.metadata so Alembic autogenerate can
# see them.
import app.outbound.postgres.entities.summarise_job_entity
import app.outbound.postgres.entities.user_entity
import app.outbound.postgres.entities.dictionary_entities

config = context.config
# Point Alembic at the application's configured database rather than a URL
# hard-coded in alembic.ini.
config.set_main_option("sqlalchemy.url", settings.database_url)
|
||||||
|
|
|
||||||
89
api/alembic/versions/20260407_0007_add_dictionary_tables.py
Normal file
89
api/alembic/versions/20260407_0007_add_dictionary_tables.py
Normal file
|
|
@ -0,0 +1,89 @@
|
||||||
|
"""add dictionary tables
|
||||||
|
|
||||||
|
Revision ID: 0007
|
||||||
|
Revises: 0006
|
||||||
|
Create Date: 2026-04-07
|
||||||
|
|
||||||
|
"""
|
||||||
|
from typing import Sequence, Union
|
||||||
|
|
||||||
|
import sqlalchemy as sa
|
||||||
|
from alembic import op
|
||||||
|
from sqlalchemy.dialects import postgresql
|
||||||
|
|
||||||
|
revision: str = "0007"
|
||||||
|
down_revision: Union[str, None] = "0006"
|
||||||
|
branch_labels: Union[str, Sequence[str], None] = None
|
||||||
|
depends_on: Union[str, Sequence[str], None] = None
|
||||||
|
|
||||||
|
|
||||||
|
def upgrade() -> None:
    """Create the four dictionary tables and their lookup indexes."""
    # Lemma: one row per headword entry (per part of speech) from the dump.
    op.create_table(
        "dictionary_lemma",
        sa.Column("id", postgresql.UUID(as_uuid=True), primary_key=True),
        sa.Column("headword", sa.Text(), nullable=False),
        sa.Column("language", sa.String(2), nullable=False),
        sa.Column("pos_raw", sa.Text(), nullable=False),
        sa.Column("pos_normalised", sa.Text(), nullable=True),
        sa.Column("gender", sa.Text(), nullable=True),
        sa.Column("tags", postgresql.ARRAY(sa.Text()), nullable=False, server_default="{}"),
    )
    # Composite index supports lookup by (headword, language); the single
    # language index supports per-language scans/deletes.
    op.create_index("ix_dictionary_lemma_headword_language", "dictionary_lemma", ["headword", "language"])
    op.create_index("ix_dictionary_lemma_language", "dictionary_lemma", ["language"])

    # Sense: ordered glosses belonging to a lemma; removed with the lemma.
    op.create_table(
        "dictionary_sense",
        sa.Column("id", postgresql.UUID(as_uuid=True), primary_key=True),
        sa.Column(
            "lemma_id",
            postgresql.UUID(as_uuid=True),
            sa.ForeignKey("dictionary_lemma.id", ondelete="CASCADE"),
            nullable=False,
        ),
        sa.Column("sense_index", sa.Integer(), nullable=False),
        sa.Column("gloss", sa.Text(), nullable=False, server_default=""),
        sa.Column("topics", postgresql.ARRAY(sa.Text()), nullable=False, server_default="{}"),
        sa.Column("tags", postgresql.ARRAY(sa.Text()), nullable=False, server_default="{}"),
    )
    op.create_index("ix_dictionary_sense_lemma_id", "dictionary_sense", ["lemma_id"])

    # Wordform: inflected surface forms of a lemma; indexed on the form so
    # arbitrary inflected text can be resolved back to its lemma.
    op.create_table(
        "dictionary_wordform",
        sa.Column("id", postgresql.UUID(as_uuid=True), primary_key=True),
        sa.Column(
            "lemma_id",
            postgresql.UUID(as_uuid=True),
            sa.ForeignKey("dictionary_lemma.id", ondelete="CASCADE"),
            nullable=False,
        ),
        sa.Column("form", sa.Text(), nullable=False),
        sa.Column("tags", postgresql.ARRAY(sa.Text()), nullable=False, server_default="{}"),
    )
    op.create_index("ix_dictionary_wordform_lemma_id", "dictionary_wordform", ["lemma_id"])
    op.create_index("ix_dictionary_wordform_form", "dictionary_wordform", ["form"])

    # Raw: the untouched source JSON record, exactly one per lemma
    # (lemma_id is unique).
    op.create_table(
        "dictionary_lemma_raw",
        sa.Column("id", postgresql.UUID(as_uuid=True), primary_key=True),
        sa.Column(
            "lemma_id",
            postgresql.UUID(as_uuid=True),
            sa.ForeignKey("dictionary_lemma.id", ondelete="CASCADE"),
            nullable=False,
            unique=True,
        ),
        sa.Column("language", sa.String(2), nullable=False),
        sa.Column("raw", postgresql.JSONB(), nullable=False),
    )
|
||||||
|
|
||||||
|
|
||||||
|
def downgrade() -> None:
    """Drop the dictionary tables in reverse dependency order.

    Child tables (and their explicit indexes) go first so the final
    drop of dictionary_lemma has no remaining foreign-key references.
    """
    op.drop_table("dictionary_lemma_raw")
    op.drop_index("ix_dictionary_wordform_form", table_name="dictionary_wordform")
    op.drop_index("ix_dictionary_wordform_lemma_id", table_name="dictionary_wordform")
    op.drop_table("dictionary_wordform")
    op.drop_index("ix_dictionary_sense_lemma_id", table_name="dictionary_sense")
    op.drop_table("dictionary_sense")
    op.drop_index("ix_dictionary_lemma_language", table_name="dictionary_lemma")
    op.drop_index("ix_dictionary_lemma_headword_language", table_name="dictionary_lemma")
    op.drop_table("dictionary_lemma")
|
||||||
32
api/app/domain/models/dictionary.py
Normal file
32
api/app/domain/models/dictionary.py
Normal file
|
|
@ -0,0 +1,32 @@
|
||||||
|
from dataclasses import dataclass
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
|
||||||
|
class Wordform:
|
||||||
|
id: str
|
||||||
|
lemma_id: str
|
||||||
|
form: str
|
||||||
|
tags: list[str]
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
|
||||||
|
class Sense:
|
||||||
|
id: str
|
||||||
|
lemma_id: str
|
||||||
|
sense_index: int
|
||||||
|
gloss: str
|
||||||
|
topics: list[str]
|
||||||
|
tags: list[str]
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
|
||||||
|
class Lemma:
|
||||||
|
id: str
|
||||||
|
headword: str
|
||||||
|
language: str
|
||||||
|
pos_raw: str
|
||||||
|
pos_normalised: str | None
|
||||||
|
gender: str | None
|
||||||
|
tags: list[str]
|
||||||
|
senses: list[Sense] = field(default_factory=list)
|
||||||
|
wordforms: list[Wordform] = field(default_factory=list)
|
||||||
63
api/app/outbound/postgres/entities/dictionary_entities.py
Normal file
63
api/app/outbound/postgres/entities/dictionary_entities.py
Normal file
|
|
@ -0,0 +1,63 @@
|
||||||
|
import uuid
|
||||||
|
|
||||||
|
from sqlalchemy import String, Text, ForeignKey, Integer
|
||||||
|
from sqlalchemy.orm import Mapped, mapped_column
|
||||||
|
from sqlalchemy.dialects.postgresql import UUID, ARRAY, JSONB
|
||||||
|
|
||||||
|
from ..database import Base
|
||||||
|
|
||||||
|
|
||||||
|
class DictionaryLemmaEntity(Base):
    """ORM mapping for one dictionary headword entry (per part of speech)."""

    __tablename__ = "dictionary_lemma"

    id: Mapped[uuid.UUID] = mapped_column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)
    headword: Mapped[str] = mapped_column(Text, nullable=False)
    # Two-character language code (presumably ISO 639-1 — confirm with callers).
    language: Mapped[str] = mapped_column(String(2), nullable=False, index=True)
    # POS label exactly as it appears in the source dump.
    pos_raw: Mapped[str] = mapped_column(Text, nullable=False)
    # Normalised POS; None when the raw label could not be mapped.
    pos_normalised: Mapped[str | None] = mapped_column(Text, nullable=True)
    gender: Mapped[str | None] = mapped_column(Text, nullable=True)
    tags: Mapped[list[str]] = mapped_column(ARRAY(Text), nullable=False, server_default="{}")
|
||||||
|
|
||||||
|
|
||||||
|
class DictionarySenseEntity(Base):
    """ORM mapping for one meaning (gloss) of a lemma."""

    __tablename__ = "dictionary_sense"

    id: Mapped[uuid.UUID] = mapped_column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)
    # Rows are deleted automatically when the owning lemma is deleted.
    lemma_id: Mapped[uuid.UUID] = mapped_column(
        UUID(as_uuid=True),
        ForeignKey("dictionary_lemma.id", ondelete="CASCADE"),
        nullable=False,
        index=True,
    )
    # Position of this sense within its lemma's sense list.
    sense_index: Mapped[int] = mapped_column(Integer, nullable=False)
    gloss: Mapped[str] = mapped_column(Text, nullable=False, server_default="")
    topics: Mapped[list[str]] = mapped_column(ARRAY(Text), nullable=False, server_default="{}")
    tags: Mapped[list[str]] = mapped_column(ARRAY(Text), nullable=False, server_default="{}")
|
||||||
|
|
||||||
|
|
||||||
|
class DictionaryWordformEntity(Base):
    """ORM mapping for one inflected surface form of a lemma."""

    __tablename__ = "dictionary_wordform"

    id: Mapped[uuid.UUID] = mapped_column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)
    # Rows are deleted automatically when the owning lemma is deleted.
    lemma_id: Mapped[uuid.UUID] = mapped_column(
        UUID(as_uuid=True),
        ForeignKey("dictionary_lemma.id", ondelete="CASCADE"),
        nullable=False,
        index=True,
    )
    # Indexed so inflected text can be resolved back to its lemma.
    form: Mapped[str] = mapped_column(Text, nullable=False, index=True)
    tags: Mapped[list[str]] = mapped_column(ARRAY(Text), nullable=False, server_default="{}")
|
||||||
|
|
||||||
|
|
||||||
|
class DictionaryLemmaRawEntity(Base):
    """ORM mapping for the untouched source JSON record of a lemma."""

    __tablename__ = "dictionary_lemma_raw"

    id: Mapped[uuid.UUID] = mapped_column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)
    # Exactly one raw record per lemma (unique); removed with the lemma.
    lemma_id: Mapped[uuid.UUID] = mapped_column(
        UUID(as_uuid=True),
        ForeignKey("dictionary_lemma.id", ondelete="CASCADE"),
        nullable=False,
        unique=True,
    )
    language: Mapped[str] = mapped_column(String(2), nullable=False)
    # Full original JSONL record as stored in the source dump.
    raw: Mapped[dict] = mapped_column(JSONB, nullable=False)
|
||||||
|
|
@ -0,0 +1,80 @@
|
||||||
|
import uuid
|
||||||
|
from typing import Protocol
|
||||||
|
|
||||||
|
from sqlalchemy import select
|
||||||
|
from sqlalchemy.ext.asyncio import AsyncSession
|
||||||
|
|
||||||
|
from ..entities.dictionary_entities import (
|
||||||
|
DictionaryLemmaEntity,
|
||||||
|
DictionarySenseEntity,
|
||||||
|
DictionaryWordformEntity,
|
||||||
|
)
|
||||||
|
from ....domain.models.dictionary import Lemma, Sense, Wordform
|
||||||
|
|
||||||
|
|
||||||
|
class DictionaryRepository(Protocol):
    """Read-side port for bilingual dictionary lookups."""

    # Senses of a headword in the given language, ordered by sense_index.
    async def get_senses_for_headword(self, headword: str, language: str) -> list[Sense]: ...

    # EN→target direction: senses whose English gloss matches the given text.
    async def find_senses_by_english_gloss(self, text: str, target_lang: str) -> list[Sense]: ...

    # All stored inflected forms belonging to one lemma.
    async def get_wordforms_for_lemma(self, lemma_id: uuid.UUID) -> list[Wordform]: ...
|
||||||
|
|
||||||
|
|
||||||
|
def _sense_to_model(entity: DictionarySenseEntity) -> Sense:
    """Map a sense ORM row onto the domain Sense model."""
    kwargs = {
        "id": str(entity.id),
        "lemma_id": str(entity.lemma_id),
        "sense_index": entity.sense_index,
        "gloss": entity.gloss,
        # NULL array columns become empty lists in the domain model.
        "topics": entity.topics or [],
        "tags": entity.tags or [],
    }
    return Sense(**kwargs)
|
||||||
|
|
||||||
|
|
||||||
|
def _wordform_to_model(entity: DictionaryWordformEntity) -> Wordform:
    """Map a wordform ORM row onto the domain Wordform model."""
    kwargs = {
        "id": str(entity.id),
        "lemma_id": str(entity.lemma_id),
        "form": entity.form,
        # NULL array columns become empty lists in the domain model.
        "tags": entity.tags or [],
    }
    return Wordform(**kwargs)
|
||||||
|
|
||||||
|
|
||||||
|
class PostgresDictionaryRepository:
    """SQLAlchemy-backed implementation of the DictionaryRepository port."""

    def __init__(self, db: AsyncSession) -> None:
        self.db = db

    async def get_senses_for_headword(self, headword: str, language: str) -> list[Sense]:
        """Return all senses for *headword* in *language*, ordered by sense_index."""
        result = await self.db.execute(
            select(DictionarySenseEntity)
            .join(DictionaryLemmaEntity, DictionarySenseEntity.lemma_id == DictionaryLemmaEntity.id)
            .where(
                DictionaryLemmaEntity.headword == headword,
                DictionaryLemmaEntity.language == language,
            )
            .order_by(DictionarySenseEntity.sense_index)
        )
        return [_sense_to_model(e) for e in result.scalars().all()]

    async def find_senses_by_english_gloss(self, text: str, target_lang: str) -> list[Sense]:
        """EN→target direction: find senses whose gloss matches the given English text.

        Uses a case-insensitive exact match on the gloss column, filtered to the
        target language via the joined lemma row.
        """
        # Escape LIKE metacharacters so user-supplied '%' / '_' / '\' are
        # matched literally; a bare ilike(text) would treat them as wildcards
        # and break the documented "exact match" contract.
        pattern = text.replace("\\", "\\\\").replace("%", "\\%").replace("_", "\\_")
        result = await self.db.execute(
            select(DictionarySenseEntity)
            .join(DictionaryLemmaEntity, DictionarySenseEntity.lemma_id == DictionaryLemmaEntity.id)
            .where(
                DictionarySenseEntity.gloss.ilike(pattern, escape="\\"),
                DictionaryLemmaEntity.language == target_lang,
            )
            .order_by(DictionarySenseEntity.sense_index)
        )
        return [_sense_to_model(e) for e in result.scalars().all()]

    async def get_wordforms_for_lemma(self, lemma_id: uuid.UUID) -> list[Wordform]:
        """Return all stored inflected forms for the given lemma."""
        result = await self.db.execute(
            select(DictionaryWordformEntity).where(
                DictionaryWordformEntity.lemma_id == lemma_id
            )
        )
        return [_wordform_to_model(e) for e in result.scalars().all()]
|
||||||
322
api/scripts/import_dictionary.py
Normal file
322
api/scripts/import_dictionary.py
Normal file
|
|
@ -0,0 +1,322 @@
|
||||||
|
#!/usr/bin/env python
"""
CLI import script for kaikki/wiktextract JSONL dictionary data.

Usage (from api/ directory):
    uv run ./scripts/import_dictionary.py --lang fr

    # or via Make from the repo root:
    make import-dictionary lang=fr

DATABASE_URL defaults to postgresql+asyncpg://langlearn:changeme@localhost:5432/langlearn
(the hard-coded fallback in run_import below), which should match the docker-compose dev
credentials when the DB port is exposed on the host — verify against docker-compose.yml.
"""
|
||||||
|
|
||||||
|
import argparse
import asyncio
import json
import os
import sys
import uuid
from pathlib import Path

import sqlalchemy as sa
from sqlalchemy.dialects.postgresql import ARRAY, JSONB
from sqlalchemy.dialects.postgresql import UUID as PG_UUID
from sqlalchemy.ext.asyncio import (
    AsyncConnection,
    AsyncSession,
    async_sessionmaker,
    create_async_engine,
)
|
||||||
|
|
||||||
|
# Resolve paths relative to this file so the script works from any CWD.
_API_DIR = Path(__file__).parent.parent
_REPO_ROOT = _API_DIR.parent
_DICT_DIR = _REPO_ROOT / "dictionaries" / "kaikki"

# Language code -> JSONL filename under dictionaries/kaikki/.
_LANG_FILE_MAP: dict[str, str] = {
    "fr": "french.jsonl",
}

# kaikki POS label -> normalised POS tag (Universal-POS-style values).
# Labels not present here normalise to None.
_POS_MAP: dict[str, str] = {
    "noun": "NOUN",
    "verb": "VERB",
    "adj": "ADJ",
    "adv": "ADV",
    "det": "DET",
    "article": "DET",
    "pron": "PRON",
    "prep": "ADP",
    "adp": "ADP",
    "conj": "CCONJ",
    "cconj": "CCONJ",
    "sconj": "SCONJ",
    "intj": "INTJ",
    "num": "NUM",
    "numeral": "NUM",
    "part": "PART",
    "particle": "PART",
    "name": "PROPN",
    "propn": "PROPN",
    "proper noun": "PROPN",
    "punct": "PUNCT",
    "sym": "SYM",
}

# kaikki gender tag -> normalised gender value.
_GENDER_MAP: dict[str, str] = {
    "masculine": "masculine",
    "masc": "masculine",
    "feminine": "feminine",
    "fem": "feminine",
    "neuter": "neuter",
    "common": "common",
}

# ---------------------------------------------------------------------------
# Standalone table definitions — no app imports, no Settings() call
# ---------------------------------------------------------------------------

_meta = sa.MetaData()

# NOTE: these Table objects mirror the schema created by the Alembic
# migration; they are redeclared here so the script can run without
# importing the application package.
_lemma_table = sa.Table(
    "dictionary_lemma",
    _meta,
    sa.Column("id", PG_UUID(as_uuid=True), primary_key=True),
    sa.Column("headword", sa.Text(), nullable=False),
    sa.Column("language", sa.String(2), nullable=False),
    sa.Column("pos_raw", sa.Text(), nullable=False),
    sa.Column("pos_normalised", sa.Text(), nullable=True),
    sa.Column("gender", sa.Text(), nullable=True),
    sa.Column("tags", ARRAY(sa.Text()), nullable=False),
)

_sense_table = sa.Table(
    "dictionary_sense",
    _meta,
    sa.Column("id", PG_UUID(as_uuid=True), primary_key=True),
    sa.Column("lemma_id", PG_UUID(as_uuid=True), nullable=False),
    sa.Column("sense_index", sa.Integer(), nullable=False),
    sa.Column("gloss", sa.Text(), nullable=False),
    sa.Column("topics", ARRAY(sa.Text()), nullable=False),
    sa.Column("tags", ARRAY(sa.Text()), nullable=False),
)

_wordform_table = sa.Table(
    "dictionary_wordform",
    _meta,
    sa.Column("id", PG_UUID(as_uuid=True), primary_key=True),
    sa.Column("lemma_id", PG_UUID(as_uuid=True), nullable=False),
    sa.Column("form", sa.Text(), nullable=False),
    sa.Column("tags", ARRAY(sa.Text()), nullable=False),
)

_raw_table = sa.Table(
    "dictionary_lemma_raw",
    _meta,
    sa.Column("id", PG_UUID(as_uuid=True), primary_key=True),
    sa.Column("lemma_id", PG_UUID(as_uuid=True), nullable=False),
    sa.Column("language", sa.String(2), nullable=False),
    sa.Column("raw", JSONB(), nullable=False),
)
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Normalisation helpers
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
def _normalise_pos(pos_raw: str) -> str | None:
    """Return the normalised POS tag for a raw kaikki label, or None if unknown."""
    key = pos_raw.strip().lower()
    return _POS_MAP.get(key)
|
||||||
|
|
||||||
|
|
||||||
|
def _normalise_gender(tags: list) -> str | None:
    """Return the first recognised gender among *tags*, normalised, else None."""
    return next((_GENDER_MAP[t] for t in tags if t in _GENDER_MAP), None)
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Parsing
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
def _parse_entry(record: dict, lang_code: str) -> dict | None:
    """Parse one kaikki JSONL record into insertion-ready row dicts.

    Returns None if the entry should be skipped.
    """
    # Skip records for other languages or without a usable headword.
    if record.get("lang_code") != lang_code:
        return None

    headword = (record.get("word") or "").strip()
    if not headword:
        return None

    pos_raw = (record.get("pos") or "").strip()
    lemma_tags = record.get("tags") or []
    lemma_id = uuid.uuid4()

    # One sense row per entry in "senses"; only the first gloss is kept.
    sense_rows = [
        {
            "id": uuid.uuid4(),
            "lemma_id": lemma_id,
            "sense_index": idx,
            "gloss": (s.get("glosses") or [""])[0],
            "topics": s.get("topics") or [],
            "tags": s.get("tags") or [],
        }
        for idx, s in enumerate(record.get("senses") or [])
    ]

    # Inflected forms; the headword itself and empty forms are dropped.
    wordform_rows = []
    for form_record in record.get("forms") or []:
        form_text = (form_record.get("form") or "").strip()
        if not form_text or form_text == headword:
            continue
        wordform_rows.append(
            {
                "id": uuid.uuid4(),
                "lemma_id": lemma_id,
                "form": form_text,
                "tags": form_record.get("tags") or [],
            }
        )

    return {
        "lemma": {
            "id": lemma_id,
            "headword": headword,
            "language": lang_code,
            "pos_raw": pos_raw,
            "pos_normalised": _normalise_pos(pos_raw),
            "gender": _normalise_gender(lemma_tags),
            "tags": lemma_tags,
        },
        "senses": sense_rows,
        "wordforms": wordform_rows,
        # The full original record is preserved alongside the parsed rows.
        "raw": {
            "id": uuid.uuid4(),
            "lemma_id": lemma_id,
            "language": lang_code,
            "raw": record,
        },
    }
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# DB operations
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
async def _flush_batch(conn: AsyncConnection, batch: list[dict]) -> None:
    """Bulk-insert one batch of parsed entries and commit.

    Each element of *batch* is the dict produced by _parse_entry; the rows
    for all four tables are inserted with executemany-style inserts
    (parents first, so foreign keys resolve), then committed in one go.

    Fix: the annotation previously used ``sa.ext.asyncio.AsyncConnection``,
    which only resolves because an earlier ``from sqlalchemy.ext.asyncio
    import ...`` happens to populate ``sys.modules``; AsyncConnection is now
    imported explicitly at the top of the file.
    """
    lemma_rows = [e["lemma"] for e in batch]
    sense_rows = [s for e in batch for s in e["senses"]]
    wordform_rows = [w for e in batch for w in e["wordforms"]]
    raw_rows = [e["raw"] for e in batch]

    if lemma_rows:
        await conn.execute(_lemma_table.insert(), lemma_rows)
    if sense_rows:
        await conn.execute(_sense_table.insert(), sense_rows)
    if wordform_rows:
        await conn.execute(_wordform_table.insert(), wordform_rows)
    if raw_rows:
        await conn.execute(_raw_table.insert(), raw_rows)

    await conn.commit()
|
||||||
|
|
||||||
|
|
||||||
|
async def run_import(lang_code: str, batch_size: int = 1000) -> None:
    """Re-import the JSONL dump for *lang_code* into Postgres.

    Existing rows for the language are deleted first, then the file is
    streamed line by line and committed every *batch_size* lemmas, so
    re-running the import is idempotent and memory usage stays bounded.
    Exits with status 1 when the language or its file is unknown/missing.
    """
    lang_file = _LANG_FILE_MAP.get(lang_code)
    if not lang_file:
        print(
            f"No file mapping for lang_code={lang_code!r}. Known: {list(_LANG_FILE_MAP)}",
            file=sys.stderr,
        )
        sys.exit(1)

    jsonl_path = _DICT_DIR / lang_file
    if not jsonl_path.exists():
        print(f"JSONL file not found: {jsonl_path}", file=sys.stderr)
        sys.exit(1)

    # Fallback URL should match the docker-compose dev credentials — verify.
    database_url = os.environ.get(
        "DATABASE_URL",
        "postgresql+asyncpg://langlearn:changeme@localhost:5432/langlearn",
    )

    engine = create_async_engine(database_url, echo=False)

    try:
        async with engine.connect() as conn:
            # Deleting lemmas also removes senses/wordforms/raw rows via the
            # ondelete="CASCADE" foreign keys defined in the migration.
            print(f"Deleting existing entries for language={lang_code!r}...")
            await conn.execute(
                _lemma_table.delete().where(_lemma_table.c.language == lang_code)
            )
            await conn.commit()

            print(f"Importing {jsonl_path} ...")
            batch: list[dict] = []
            total_lemmas = 0
            skipped = 0

            with open(jsonl_path, encoding="utf-8") as f:
                for line_num, line in enumerate(f, 1):
                    line = line.strip()
                    if not line:
                        continue

                    # Malformed lines are reported and skipped rather than
                    # aborting the whole import.
                    try:
                        record = json.loads(line)
                    except json.JSONDecodeError as exc:
                        print(
                            f"  Line {line_num}: JSON parse error: {exc}",
                            file=sys.stderr,
                        )
                        skipped += 1
                        continue

                    parsed = _parse_entry(record, lang_code)
                    if parsed is None:
                        skipped += 1
                        continue

                    batch.append(parsed)

                    if len(batch) >= batch_size:
                        await _flush_batch(conn, batch)
                        total_lemmas += len(batch)
                        print(f"  Committed {total_lemmas} lemmas...")
                        batch = []

            # Flush the final partial batch.
            if batch:
                await _flush_batch(conn, batch)
                total_lemmas += len(batch)

            print(f"Done. Imported {total_lemmas} lemmas, skipped {skipped} lines.")
    finally:
        await engine.dispose()
|
||||||
|
|
||||||
|
|
||||||
|
def main() -> None:
    """Parse CLI arguments and run the async import."""
    parser = argparse.ArgumentParser(
        description="Import kaikki dictionary JSONL into Postgres."
    )
    parser.add_argument(
        "--lang", required=True, help="Language code to import (e.g. fr)"
    )
    parser.add_argument(
        "--batch-size", type=int, default=1000, help="Rows per commit (default: 1000)"
    )
    cli = parser.parse_args()
    asyncio.run(run_import(cli.lang, cli.batch_size))


if __name__ == "__main__":
    main()
|
||||||
1
dictionaries/.gitignore
vendored
Normal file
1
dictionaries/.gitignore
vendored
Normal file
|
|
@ -0,0 +1 @@
|
||||||
|
*.jsonl
|
||||||
3
dictionaries/README.md
Normal file
3
dictionaries/README.md
Normal file
|
|
@ -0,0 +1,3 @@
|
||||||
|
# Dictionaries
|
||||||
|
|
||||||
|
This module contains dictionaries of words, namely from the [Kaikki](https://kaikki.org/dictionary/index.html) project. It is responsible for generating lexical information about words, for both the system and the user, to help describe the language they are using.
|
||||||
Loading…
Reference in a new issue