feat: Build the bilingual dictionary data
This commit is contained in:
parent
2cae5d9445
commit
873ebacd4d
9 changed files with 599 additions and 1 deletions
9
Makefile
9
Makefile
|
|
@ -1,4 +1,4 @@
|
|||
# Targets that do not correspond to real files on disk.
.PHONY: down build up logs shell lock migrate migration import-dictionary

build:
	docker compose build

lock:
	cd api && uv pip compile pyproject.toml -o requirements.txt

# Full cycle: stop, rebuild images, start again.
rebuild: down build up

# Import a kaikki dictionary JSONL into Postgres.
# Requires the DB to be running with its port exposed on localhost (docker compose up).
# DATABASE_URL defaults to the docker-compose dev credentials.
# Usage: make import-dictionary lang=fr
import-dictionary:
	cd api && python scripts/import_dictionary.py --lang $(lang)
|
|
|
|||
|
|
@ -10,6 +10,7 @@ from app.outbound.postgres.database import Base
|
|||
|
||||
# Imported for their side effect: each entity module registers its tables on
# Base.metadata so Alembic autogenerate can see them.
import app.outbound.postgres.entities.summarise_job_entity
import app.outbound.postgres.entities.user_entity
import app.outbound.postgres.entities.dictionary_entities

config = context.config
# Point Alembic at the application's configured database URL.
config.set_main_option("sqlalchemy.url", settings.database_url)
|
||||
|
|
|
|||
89
api/alembic/versions/20260407_0007_add_dictionary_tables.py
Normal file
89
api/alembic/versions/20260407_0007_add_dictionary_tables.py
Normal file
|
|
@ -0,0 +1,89 @@
|
|||
"""add dictionary tables
|
||||
|
||||
Revision ID: 0007
|
||||
Revises: 0006
|
||||
Create Date: 2026-04-07
|
||||
|
||||
"""
|
||||
from typing import Sequence, Union
|
||||
|
||||
import sqlalchemy as sa
|
||||
from alembic import op
|
||||
from sqlalchemy.dialects import postgresql
|
||||
|
||||
# Alembic revision identifiers: this migration (0007) follows 0006.
revision: str = "0007"
down_revision: Union[str, None] = "0006"
branch_labels: Union[str, Sequence[str], None] = None
depends_on: Union[str, Sequence[str], None] = None
|
||||
|
||||
|
||||
def upgrade() -> None:
    """Create the bilingual dictionary tables.

    Four tables: lemma (headwords), sense (numbered glosses per lemma),
    wordform (inflected forms per lemma), and lemma_raw (the original
    source JSON for each lemma). Child tables cascade on lemma delete.
    """
    # One row per dictionary entry (headword + part of speech).
    op.create_table(
        "dictionary_lemma",
        sa.Column("id", postgresql.UUID(as_uuid=True), primary_key=True),
        sa.Column("headword", sa.Text(), nullable=False),
        # Two-letter language code.
        sa.Column("language", sa.String(2), nullable=False),
        sa.Column("pos_raw", sa.Text(), nullable=False),
        sa.Column("pos_normalised", sa.Text(), nullable=True),
        sa.Column("gender", sa.Text(), nullable=True),
        sa.Column("tags", postgresql.ARRAY(sa.Text()), nullable=False, server_default="{}"),
    )
    # Composite index backs the (headword, language) lookup path.
    op.create_index("ix_dictionary_lemma_headword_language", "dictionary_lemma", ["headword", "language"])
    op.create_index("ix_dictionary_lemma_language", "dictionary_lemma", ["language"])

    # Numbered senses (glosses) belonging to a lemma.
    op.create_table(
        "dictionary_sense",
        sa.Column("id", postgresql.UUID(as_uuid=True), primary_key=True),
        sa.Column(
            "lemma_id",
            postgresql.UUID(as_uuid=True),
            sa.ForeignKey("dictionary_lemma.id", ondelete="CASCADE"),
            nullable=False,
        ),
        sa.Column("sense_index", sa.Integer(), nullable=False),
        sa.Column("gloss", sa.Text(), nullable=False, server_default=""),
        sa.Column("topics", postgresql.ARRAY(sa.Text()), nullable=False, server_default="{}"),
        sa.Column("tags", postgresql.ARRAY(sa.Text()), nullable=False, server_default="{}"),
    )
    op.create_index("ix_dictionary_sense_lemma_id", "dictionary_sense", ["lemma_id"])

    # Inflected forms of a lemma; indexed on the surface form for reverse lookup.
    op.create_table(
        "dictionary_wordform",
        sa.Column("id", postgresql.UUID(as_uuid=True), primary_key=True),
        sa.Column(
            "lemma_id",
            postgresql.UUID(as_uuid=True),
            sa.ForeignKey("dictionary_lemma.id", ondelete="CASCADE"),
            nullable=False,
        ),
        sa.Column("form", sa.Text(), nullable=False),
        sa.Column("tags", postgresql.ARRAY(sa.Text()), nullable=False, server_default="{}"),
    )
    op.create_index("ix_dictionary_wordform_lemma_id", "dictionary_wordform", ["lemma_id"])
    op.create_index("ix_dictionary_wordform_form", "dictionary_wordform", ["form"])

    # Original source JSON per lemma (one-to-one via unique lemma_id).
    op.create_table(
        "dictionary_lemma_raw",
        sa.Column("id", postgresql.UUID(as_uuid=True), primary_key=True),
        sa.Column(
            "lemma_id",
            postgresql.UUID(as_uuid=True),
            sa.ForeignKey("dictionary_lemma.id", ondelete="CASCADE"),
            nullable=False,
            unique=True,
        ),
        sa.Column("language", sa.String(2), nullable=False),
        sa.Column("raw", postgresql.JSONB(), nullable=False),
    )
|
||||
|
||||
|
||||
def downgrade() -> None:
    """Drop the dictionary tables in reverse dependency order (children first)."""
    op.drop_table("dictionary_lemma_raw")
    op.drop_index("ix_dictionary_wordform_form", table_name="dictionary_wordform")
    op.drop_index("ix_dictionary_wordform_lemma_id", table_name="dictionary_wordform")
    op.drop_table("dictionary_wordform")
    op.drop_index("ix_dictionary_sense_lemma_id", table_name="dictionary_sense")
    op.drop_table("dictionary_sense")
    op.drop_index("ix_dictionary_lemma_language", table_name="dictionary_lemma")
    op.drop_index("ix_dictionary_lemma_headword_language", table_name="dictionary_lemma")
    op.drop_table("dictionary_lemma")
|
||||
32
api/app/domain/models/dictionary.py
Normal file
32
api/app/domain/models/dictionary.py
Normal file
|
|
@ -0,0 +1,32 @@
|
|||
from dataclasses import dataclass, field
|
||||
|
||||
|
||||
@dataclass
class Wordform:
    """One inflected surface form belonging to a lemma."""

    id: str
    # Id of the owning Lemma.
    lemma_id: str
    # The inflected surface text.
    form: str
    # Grammatical tags carried over from the source data.
    tags: list[str]
|
||||
|
||||
|
||||
@dataclass
class Sense:
    """One numbered meaning (gloss) of a lemma."""

    id: str
    # Id of the owning Lemma.
    lemma_id: str
    # Position of this sense within the lemma's sense list.
    sense_index: int
    # The definition text (may be empty).
    gloss: str
    # Topic labels from the source data.
    topics: list[str]
    # Grammatical/usage tags from the source data.
    tags: list[str]
|
||||
|
||||
|
||||
@dataclass
class Lemma:
    """A dictionary headword together with its senses and inflected forms."""

    id: str
    headword: str
    # Two-letter language code.
    language: str
    # Part of speech exactly as given by the source data.
    pos_raw: str
    # Normalised POS tag, when the raw value could be mapped.
    pos_normalised: str | None
    gender: str | None
    tags: list[str]
    # Child collections default to empty; populated by repository queries.
    senses: list[Sense] = field(default_factory=list)
    wordforms: list[Wordform] = field(default_factory=list)
|
||||
63
api/app/outbound/postgres/entities/dictionary_entities.py
Normal file
63
api/app/outbound/postgres/entities/dictionary_entities.py
Normal file
|
|
@ -0,0 +1,63 @@
|
|||
import uuid
|
||||
|
||||
from sqlalchemy import String, Text, ForeignKey, Integer
|
||||
from sqlalchemy.orm import Mapped, mapped_column
|
||||
from sqlalchemy.dialects.postgresql import UUID, ARRAY, JSONB
|
||||
|
||||
from ..database import Base
|
||||
|
||||
|
||||
class DictionaryLemmaEntity(Base):
    """ORM row for one dictionary entry (headword + part of speech)."""

    __tablename__ = "dictionary_lemma"

    id: Mapped[uuid.UUID] = mapped_column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)
    headword: Mapped[str] = mapped_column(Text, nullable=False)
    # Two-letter language code.
    language: Mapped[str] = mapped_column(String(2), nullable=False, index=True)
    # Part of speech exactly as given by the source data.
    pos_raw: Mapped[str] = mapped_column(Text, nullable=False)
    # Normalised POS tag; None when the raw value could not be mapped.
    pos_normalised: Mapped[str | None] = mapped_column(Text, nullable=True)
    gender: Mapped[str | None] = mapped_column(Text, nullable=True)
    tags: Mapped[list[str]] = mapped_column(ARRAY(Text), nullable=False, server_default="{}")
|
||||
|
||||
|
||||
class DictionarySenseEntity(Base):
    """ORM row for one numbered sense (gloss) of a lemma."""

    __tablename__ = "dictionary_sense"

    id: Mapped[uuid.UUID] = mapped_column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)
    # Owning lemma; rows are removed with their lemma (ON DELETE CASCADE).
    lemma_id: Mapped[uuid.UUID] = mapped_column(
        UUID(as_uuid=True),
        ForeignKey("dictionary_lemma.id", ondelete="CASCADE"),
        nullable=False,
        index=True,
    )
    # Position of this sense within the lemma's sense list.
    sense_index: Mapped[int] = mapped_column(Integer, nullable=False)
    gloss: Mapped[str] = mapped_column(Text, nullable=False, server_default="")
    topics: Mapped[list[str]] = mapped_column(ARRAY(Text), nullable=False, server_default="{}")
    tags: Mapped[list[str]] = mapped_column(ARRAY(Text), nullable=False, server_default="{}")
|
||||
|
||||
|
||||
class DictionaryWordformEntity(Base):
    """ORM row for one inflected surface form of a lemma."""

    __tablename__ = "dictionary_wordform"

    id: Mapped[uuid.UUID] = mapped_column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)
    # Owning lemma; rows are removed with their lemma (ON DELETE CASCADE).
    lemma_id: Mapped[uuid.UUID] = mapped_column(
        UUID(as_uuid=True),
        ForeignKey("dictionary_lemma.id", ondelete="CASCADE"),
        nullable=False,
        index=True,
    )
    # Indexed to support looking a form up without knowing its lemma.
    form: Mapped[str] = mapped_column(Text, nullable=False, index=True)
    tags: Mapped[list[str]] = mapped_column(ARRAY(Text), nullable=False, server_default="{}")
|
||||
|
||||
|
||||
class DictionaryLemmaRawEntity(Base):
    """ORM row holding the original source JSON record for a lemma."""

    __tablename__ = "dictionary_lemma_raw"

    id: Mapped[uuid.UUID] = mapped_column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)
    # One raw record per lemma (unique), removed with it (ON DELETE CASCADE).
    lemma_id: Mapped[uuid.UUID] = mapped_column(
        UUID(as_uuid=True),
        ForeignKey("dictionary_lemma.id", ondelete="CASCADE"),
        nullable=False,
        unique=True,
    )
    # Two-letter language code, duplicated here for direct filtering.
    language: Mapped[str] = mapped_column(String(2), nullable=False)
    # The unmodified source record as JSONB.
    raw: Mapped[dict] = mapped_column(JSONB, nullable=False)
|
||||
|
|
@ -0,0 +1,80 @@
|
|||
import uuid
|
||||
from typing import Protocol
|
||||
|
||||
from sqlalchemy import select
|
||||
from sqlalchemy.ext.asyncio import AsyncSession
|
||||
|
||||
from ..entities.dictionary_entities import (
|
||||
DictionaryLemmaEntity,
|
||||
DictionarySenseEntity,
|
||||
DictionaryWordformEntity,
|
||||
)
|
||||
from ....domain.models.dictionary import Lemma, Sense, Wordform
|
||||
|
||||
|
||||
class DictionaryRepository(Protocol):
    """Structural (duck-typed) read-side port for dictionary lookups."""

    # Target-language headword -> its senses.
    async def get_senses_for_headword(self, headword: str, language: str) -> list[Sense]: ...
    # English gloss text -> matching senses in the target language.
    async def find_senses_by_english_gloss(self, text: str, target_lang: str) -> list[Sense]: ...
    # All stored inflected forms of one lemma.
    async def get_wordforms_for_lemma(self, lemma_id: uuid.UUID) -> list[Wordform]: ...
|
||||
|
||||
|
||||
def _sense_to_model(entity: DictionarySenseEntity) -> Sense:
    """Translate a persistence-layer sense row into the domain Sense model."""
    topics = entity.topics or []
    tags = entity.tags or []
    return Sense(
        id=str(entity.id),
        lemma_id=str(entity.lemma_id),
        sense_index=entity.sense_index,
        gloss=entity.gloss,
        topics=topics,
        tags=tags,
    )
|
||||
|
||||
|
||||
def _wordform_to_model(entity: DictionaryWordformEntity) -> Wordform:
    """Translate a persistence-layer wordform row into the domain Wordform model."""
    tags = entity.tags or []
    return Wordform(
        id=str(entity.id),
        lemma_id=str(entity.lemma_id),
        form=entity.form,
        tags=tags,
    )
|
||||
|
||||
|
||||
class PostgresDictionaryRepository:
    """Postgres-backed implementation of the DictionaryRepository port."""

    def __init__(self, db: AsyncSession) -> None:
        # The caller owns the session's lifecycle; this class only runs queries.
        self.db = db

    async def get_senses_for_headword(self, headword: str, language: str) -> list[Sense]:
        """Return every sense of *headword* in *language*, ordered by sense index."""
        query = (
            select(DictionarySenseEntity)
            .join(DictionaryLemmaEntity, DictionarySenseEntity.lemma_id == DictionaryLemmaEntity.id)
            .where(
                DictionaryLemmaEntity.headword == headword,
                DictionaryLemmaEntity.language == language,
            )
            .order_by(DictionarySenseEntity.sense_index)
        )
        result = await self.db.execute(query)
        return [_sense_to_model(row) for row in result.scalars().all()]

    async def find_senses_by_english_gloss(self, text: str, target_lang: str) -> list[Sense]:
        """EN→target direction: return senses whose gloss matches *text*.

        Matching is a case-insensitive exact comparison (ILIKE) on the gloss
        column, restricted to *target_lang* through the joined lemma row.
        """
        query = (
            select(DictionarySenseEntity)
            .join(DictionaryLemmaEntity, DictionarySenseEntity.lemma_id == DictionaryLemmaEntity.id)
            .where(
                DictionarySenseEntity.gloss.ilike(text),
                DictionaryLemmaEntity.language == target_lang,
            )
            .order_by(DictionarySenseEntity.sense_index)
        )
        result = await self.db.execute(query)
        return [_sense_to_model(row) for row in result.scalars().all()]

    async def get_wordforms_for_lemma(self, lemma_id: uuid.UUID) -> list[Wordform]:
        """Return all stored inflected forms of the lemma with *lemma_id*."""
        query = select(DictionaryWordformEntity).where(
            DictionaryWordformEntity.lemma_id == lemma_id
        )
        result = await self.db.execute(query)
        return [_wordform_to_model(row) for row in result.scalars().all()]
|
||||
322
api/scripts/import_dictionary.py
Normal file
322
api/scripts/import_dictionary.py
Normal file
|
|
@ -0,0 +1,322 @@
|
|||
#!/usr/bin/env python
|
||||
"""
|
||||
CLI import script for kaikki/wiktextract JSONL dictionary data.
|
||||
|
||||
Usage (from api/ directory):
|
||||
uv run ./scripts/import_dictionary.py --lang fr
|
||||
|
||||
# or via Make from the repo root:
|
||||
make import-dictionary lang=fr
|
||||
|
||||
DATABASE_URL defaults to postgresql+asyncpg://langlearn:changeme@localhost:5432/langlearn
|
||||
which matches the docker-compose dev credentials when the DB port is exposed on the host.
|
||||
"""
|
||||
|
||||
import argparse
import asyncio
import json
import os
import sys
import uuid
from pathlib import Path

import sqlalchemy as sa
from sqlalchemy.dialects.postgresql import ARRAY, JSONB
from sqlalchemy.dialects.postgresql import UUID as PG_UUID
from sqlalchemy.ext.asyncio import (
    AsyncConnection,
    AsyncSession,
    async_sessionmaker,
    create_async_engine,
)
|
||||
|
||||
# Resolve data paths relative to this script so it works from any CWD.
_API_DIR = Path(__file__).parent.parent
_REPO_ROOT = _API_DIR.parent
_DICT_DIR = _REPO_ROOT / "dictionaries" / "kaikki"

# Supported language codes mapped to their kaikki JSONL filenames in _DICT_DIR.
_LANG_FILE_MAP: dict[str, str] = {
    "fr": "french.jsonl",
}

# Raw kaikki part-of-speech strings -> normalised uppercase POS tags.
# Several raw spellings collapse to the same tag (e.g. "prep"/"adp" -> "ADP").
_POS_MAP: dict[str, str] = {
    "noun": "NOUN",
    "verb": "VERB",
    "adj": "ADJ",
    "adv": "ADV",
    "det": "DET",
    "article": "DET",
    "pron": "PRON",
    "prep": "ADP",
    "adp": "ADP",
    "conj": "CCONJ",
    "cconj": "CCONJ",
    "sconj": "SCONJ",
    "intj": "INTJ",
    "num": "NUM",
    "numeral": "NUM",
    "part": "PART",
    "particle": "PART",
    "name": "PROPN",
    "propn": "PROPN",
    "proper noun": "PROPN",
    "punct": "PUNCT",
    "sym": "SYM",
}

# Raw kaikki gender tags -> canonical gender labels (abbreviations expanded).
_GENDER_MAP: dict[str, str] = {
    "masculine": "masculine",
    "masc": "masculine",
    "feminine": "feminine",
    "fem": "feminine",
    "neuter": "neuter",
    "common": "common",
}
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Standalone table definitions — no app imports, no Settings() call
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
_meta = sa.MetaData()

# Mirrors the "dictionary_lemma" table created by the Alembic migration.
_lemma_table = sa.Table(
    "dictionary_lemma",
    _meta,
    sa.Column("id", PG_UUID(as_uuid=True), primary_key=True),
    sa.Column("headword", sa.Text(), nullable=False),
    sa.Column("language", sa.String(2), nullable=False),
    sa.Column("pos_raw", sa.Text(), nullable=False),
    sa.Column("pos_normalised", sa.Text(), nullable=True),
    sa.Column("gender", sa.Text(), nullable=True),
    sa.Column("tags", ARRAY(sa.Text()), nullable=False),
)

# Mirrors "dictionary_sense"; FK constraints are omitted — inserts only.
_sense_table = sa.Table(
    "dictionary_sense",
    _meta,
    sa.Column("id", PG_UUID(as_uuid=True), primary_key=True),
    sa.Column("lemma_id", PG_UUID(as_uuid=True), nullable=False),
    sa.Column("sense_index", sa.Integer(), nullable=False),
    sa.Column("gloss", sa.Text(), nullable=False),
    sa.Column("topics", ARRAY(sa.Text()), nullable=False),
    sa.Column("tags", ARRAY(sa.Text()), nullable=False),
)

# Mirrors "dictionary_wordform".
_wordform_table = sa.Table(
    "dictionary_wordform",
    _meta,
    sa.Column("id", PG_UUID(as_uuid=True), primary_key=True),
    sa.Column("lemma_id", PG_UUID(as_uuid=True), nullable=False),
    sa.Column("form", sa.Text(), nullable=False),
    sa.Column("tags", ARRAY(sa.Text()), nullable=False),
)

# Mirrors "dictionary_lemma_raw".
_raw_table = sa.Table(
    "dictionary_lemma_raw",
    _meta,
    sa.Column("id", PG_UUID(as_uuid=True), primary_key=True),
    sa.Column("lemma_id", PG_UUID(as_uuid=True), nullable=False),
    sa.Column("language", sa.String(2), nullable=False),
    sa.Column("raw", JSONB(), nullable=False),
)
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Normalisation helpers
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def _normalise_pos(pos_raw: str) -> str | None:
    """Map a raw kaikki POS string to its normalised tag, or None if unknown."""
    key = pos_raw.strip().lower()
    return _POS_MAP.get(key)
|
||||
|
||||
|
||||
def _normalise_gender(tags: list) -> str | None:
    """Return the canonical gender for the first recognised tag, or None."""
    return next((_GENDER_MAP[tag] for tag in tags if tag in _GENDER_MAP), None)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Parsing
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def _parse_entry(record: dict, lang_code: str) -> dict | None:
    """Parse one kaikki JSONL record into insertion-ready row dicts.

    Returns a dict with "lemma", "senses", "wordforms" and "raw" row data,
    or None when the record belongs to another language or has no headword.
    """
    if record.get("lang_code") != lang_code:
        return None

    word = (record.get("word") or "").strip()
    if not word:
        return None

    pos_raw = (record.get("pos") or "").strip()
    top_tags = record.get("tags") or []
    lemma_id = uuid.uuid4()

    # One sense row per gloss entry, numbered in source order.
    senses = [
        {
            "id": uuid.uuid4(),
            "lemma_id": lemma_id,
            "sense_index": index,
            "gloss": (sense_record.get("glosses") or [""])[0],
            "topics": sense_record.get("topics") or [],
            "tags": sense_record.get("tags") or [],
        }
        for index, sense_record in enumerate(record.get("senses") or [])
    ]

    wordforms = []
    for form_record in record.get("forms") or []:
        surface = (form_record.get("form") or "").strip()
        # Skip empty forms and the headword itself.
        if not surface or surface == word:
            continue
        wordforms.append(
            {
                "id": uuid.uuid4(),
                "lemma_id": lemma_id,
                "form": surface,
                "tags": form_record.get("tags") or [],
            }
        )

    return {
        "lemma": {
            "id": lemma_id,
            "headword": word,
            "language": lang_code,
            "pos_raw": pos_raw,
            "pos_normalised": _normalise_pos(pos_raw),
            "gender": _normalise_gender(top_tags),
            "tags": top_tags,
        },
        "senses": senses,
        "wordforms": wordforms,
        "raw": {
            "id": uuid.uuid4(),
            "lemma_id": lemma_id,
            "language": lang_code,
            "raw": record,
        },
    }
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# DB operations
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
async def _flush_batch(conn: AsyncConnection, batch: list[dict]) -> None:
    """Bulk-insert one batch of parsed entries and commit.

    Issues one executemany-style insert per table (lemmas, senses, wordforms,
    raw records), then commits so progress is durable between batches.

    Note: the annotation previously used `sa.ext.asyncio.AsyncConnection`,
    which only resolved because the module's `from sqlalchemy.ext.asyncio
    import ...` happens to register the submodule on `sa` as a side effect;
    the explicitly imported `AsyncConnection` removes that fragility.
    """
    lemma_rows = [entry["lemma"] for entry in batch]
    sense_rows = [s for entry in batch for s in entry["senses"]]
    wordform_rows = [w for entry in batch for w in entry["wordforms"]]
    raw_rows = [entry["raw"] for entry in batch]

    if lemma_rows:
        await conn.execute(_lemma_table.insert(), lemma_rows)
    if sense_rows:
        await conn.execute(_sense_table.insert(), sense_rows)
    if wordform_rows:
        await conn.execute(_wordform_table.insert(), wordform_rows)
    if raw_rows:
        await conn.execute(_raw_table.insert(), raw_rows)

    await conn.commit()
|
||||
|
||||
|
||||
async def run_import(lang_code: str, batch_size: int = 1000) -> None:
    """Stream a kaikki JSONL file for *lang_code* into Postgres in batches.

    First deletes any previously imported lemmas for the language (child
    rows follow via the ON DELETE CASCADE foreign keys), making the import
    idempotent. Exits the process with status 1 when the language has no
    file mapping or the JSONL file is missing.
    """
    lang_file = _LANG_FILE_MAP.get(lang_code)
    if not lang_file:
        print(
            f"No file mapping for lang_code={lang_code!r}. Known: {list(_LANG_FILE_MAP)}",
            file=sys.stderr,
        )
        sys.exit(1)

    jsonl_path = _DICT_DIR / lang_file
    if not jsonl_path.exists():
        print(f"JSONL file not found: {jsonl_path}", file=sys.stderr)
        sys.exit(1)

    # Default matches the docker-compose dev credentials with the DB port
    # exposed on localhost.
    database_url = os.environ.get(
        "DATABASE_URL",
        "postgresql+asyncpg://langlearn:changeme@localhost:5432/langlearn",
    )

    engine = create_async_engine(database_url, echo=False)

    try:
        async with engine.connect() as conn:
            # Clear any earlier import of this language before re-importing.
            print(f"Deleting existing entries for language={lang_code!r}...")
            await conn.execute(
                _lemma_table.delete().where(_lemma_table.c.language == lang_code)
            )
            await conn.commit()

            print(f"Importing {jsonl_path} ...")
            batch: list[dict] = []
            total_lemmas = 0
            skipped = 0

            with open(jsonl_path, encoding="utf-8") as f:
                for line_num, line in enumerate(f, 1):
                    line = line.strip()
                    if not line:
                        # Blank lines are silently ignored (not counted as skipped).
                        continue

                    try:
                        record = json.loads(line)
                    except json.JSONDecodeError as exc:
                        # Malformed lines are reported and skipped, never fatal.
                        print(
                            f"  Line {line_num}: JSON parse error: {exc}",
                            file=sys.stderr,
                        )
                        skipped += 1
                        continue

                    parsed = _parse_entry(record, lang_code)
                    if parsed is None:
                        skipped += 1
                        continue

                    batch.append(parsed)

                    # Flush periodically to bound memory and persist progress.
                    if len(batch) >= batch_size:
                        await _flush_batch(conn, batch)
                        total_lemmas += len(batch)
                        print(f"  Committed {total_lemmas} lemmas...")
                        batch = []

            # Flush the final partial batch, if any.
            if batch:
                await _flush_batch(conn, batch)
                total_lemmas += len(batch)

            print(f"Done. Imported {total_lemmas} lemmas, skipped {skipped} lines.")
    finally:
        await engine.dispose()
|
||||
|
||||
|
||||
def main() -> None:
    """Parse CLI arguments and kick off the async import."""
    parser = argparse.ArgumentParser(description="Import kaikki dictionary JSONL into Postgres.")
    parser.add_argument("--lang", required=True, help="Language code to import (e.g. fr)")
    parser.add_argument("--batch-size", type=int, default=1000, help="Rows per commit (default: 1000)")
    args = parser.parse_args()

    asyncio.run(run_import(args.lang, args.batch_size))


if __name__ == "__main__":
    main()
|
||||
1
dictionaries/.gitignore
vendored
Normal file
1
dictionaries/.gitignore
vendored
Normal file
|
|
@ -0,0 +1 @@
|
|||
*.jsonl
|
||||
3
dictionaries/README.md
Normal file
3
dictionaries/README.md
Normal file
|
|
@ -0,0 +1,3 @@
|
|||
# Dictionaries
|
||||
|
||||
This module contains dictionaries of words, namely from the [Kaikki](https://kaikki.org/dictionary/index.html) project. It is responsible for generating lexical information about words, for both the system and the user, to help describe the language they are using.
|
||||
Loading…
Reference in a new issue