feat: link dictionary senses to lemmas
Some checks are pending
/ test (push) Waiting to run

This commit is contained in:
wilson 2026-04-10 21:12:40 +01:00
parent eb21d8b2f0
commit 7f0977d8e5
2 changed files with 151 additions and 9 deletions

View file

@ -0,0 +1,41 @@
"""add dictionary_sense_link table
Revision ID: 0010
Revises: 0009
Create Date: 2026-04-10
"""
from typing import Sequence, Union
import sqlalchemy as sa
from alembic import op
from sqlalchemy.dialects import postgresql
revision: str = "0010"
down_revision: Union[str, None] = "0009"
branch_labels: Union[str, Sequence[str], None] = None
depends_on: Union[str, Sequence[str], None] = None
def upgrade() -> None:
    """Create the dictionary_sense_link table and its two lookup indexes."""
    # SQLAlchemy type instances are stateless and safe to share across columns.
    uuid_type = postgresql.UUID(as_uuid=True)
    op.create_table(
        "dictionary_sense_link",
        sa.Column("id", uuid_type, primary_key=True),
        sa.Column(
            "sense_id",
            uuid_type,
            sa.ForeignKey("dictionary_sense.id", ondelete="CASCADE"),
            nullable=False,
        ),
        sa.Column("link_text", sa.Text(), nullable=False),
        sa.Column("link_target", sa.Text(), nullable=False),
        sa.Column("target_lemma_id", uuid_type, nullable=True),
    )
    # Index both foreign-key-ish columns; lookups happen from either side.
    for column in ("sense_id", "target_lemma_id"):
        op.create_index(
            f"ix_dictionary_sense_link_{column}",
            "dictionary_sense_link",
            [column],
        )
def downgrade() -> None:
    """Drop the indexes first, then the dictionary_sense_link table."""
    for index_name in (
        "ix_dictionary_sense_link_target_lemma_id",
        "ix_dictionary_sense_link_sense_id",
    ):
        op.drop_index(index_name, table_name="dictionary_sense_link")
    op.drop_table("dictionary_sense_link")

View file

@ -59,6 +59,8 @@ _POS_MAP: dict[str, str] = {
} }
_GENDER_MAP: dict[str, str] = { _GENDER_MAP: dict[str, str] = {
"f": "feminine",
"m": "masculine",
"masculine": "masculine", "masculine": "masculine",
"masc": "masculine", "masc": "masculine",
"feminine": "feminine", "feminine": "feminine",
@ -114,6 +116,16 @@ _raw_table = sa.Table(
sa.Column("raw", JSONB(), nullable=False), sa.Column("raw", JSONB(), nullable=False),
) )
# Core-table mirror of dictionary_sense_link (added in migration 0010) so the
# importer can bulk-insert rows without going through ORM models.
# target_lemma_id stays NULL at insert time and is filled in by a later
# resolution pass once all lemmas exist.
_sense_link_table = sa.Table(
    "dictionary_sense_link",
    _meta,
    sa.Column("id", PG_UUID(as_uuid=True), primary_key=True),
    sa.Column("sense_id", PG_UUID(as_uuid=True), nullable=False),
    sa.Column("link_text", sa.Text(), nullable=False),
    sa.Column("link_target", sa.Text(), nullable=False),
    sa.Column("target_lemma_id", PG_UUID(as_uuid=True), nullable=True),
)
# --------------------------------------------------------------------------- # ---------------------------------------------------------------------------
# Normalisation helpers # Normalisation helpers
# --------------------------------------------------------------------------- # ---------------------------------------------------------------------------
@ -123,12 +135,10 @@ def _normalise_pos(pos_raw: str) -> str | None:
return _POS_MAP.get(pos_raw.lower().strip()) return _POS_MAP.get(pos_raw.lower().strip())
def _normalise_gender(tags: list) -> str | None: def _normalise_gender(value: str | None) -> str | None:
for tag in tags: if value is None:
mapped = _GENDER_MAP.get(tag)
if mapped:
return mapped
return None return None
return _GENDER_MAP.get(value)
# --------------------------------------------------------------------------- # ---------------------------------------------------------------------------
@ -149,11 +159,13 @@ def _parse_entry(record: dict, lang_code: str) -> dict | None:
return None return None
pos_raw = (record.get("pos") or "").strip() pos_raw = (record.get("pos") or "").strip()
top_tags = record.get("tags") or []
lemma_id = uuid.uuid4() lemma_id = uuid.uuid4()
_GENDER_TAGS = {"masculine", "feminine", "neuter"}
gender: str | None = None
senses = [] senses = []
sense_links = []
for i, sense_record in enumerate(record.get("senses") or []): for i, sense_record in enumerate(record.get("senses") or []):
sense_id = uuid.uuid4() sense_id = uuid.uuid4()
glosses = sense_record.get("glosses") or [] glosses = sense_record.get("glosses") or []
@ -161,6 +173,12 @@ def _parse_entry(record: dict, lang_code: str) -> dict | None:
topics = sense_record.get("topics") or [] topics = sense_record.get("topics") or []
sense_tags = sense_record.get("tags") or [] sense_tags = sense_record.get("tags") or []
if gender is None:
for tag in sense_tags:
if tag in _GENDER_TAGS:
gender = tag
break
senses.append( senses.append(
{ {
"id": sense_id, "id": sense_id,
@ -172,6 +190,18 @@ def _parse_entry(record: dict, lang_code: str) -> dict | None:
} }
) )
for link_pair in (sense_record.get("links") or []):
if isinstance(link_pair, list) and len(link_pair) == 2:
sense_links.append(
{
"id": uuid.uuid4(),
"sense_id": sense_id,
"link_text": link_pair[0],
"link_target": link_pair[1],
"target_lemma_id": None,
}
)
wordforms = [] wordforms = []
for f in record.get("forms") or []: for f in record.get("forms") or []:
form_text = (f.get("form") or "").strip() form_text = (f.get("form") or "").strip()
@ -194,10 +224,11 @@ def _parse_entry(record: dict, lang_code: str) -> dict | None:
"language": lang_code, "language": lang_code,
"pos_raw": pos_raw, "pos_raw": pos_raw,
"pos_normalised": _normalise_pos(pos_raw), "pos_normalised": _normalise_pos(pos_raw),
"gender": _normalise_gender(top_tags), "gender": gender,
"tags": top_tags, "tags": record.get("tags") or [],
}, },
"senses": senses, "senses": senses,
"sense_links": sense_links,
"wordforms": wordforms, "wordforms": wordforms,
"raw": { "raw": {
"id": uuid.uuid4(), "id": uuid.uuid4(),
@ -216,6 +247,7 @@ def _parse_entry(record: dict, lang_code: str) -> dict | None:
async def _flush_batch(conn: sa.ext.asyncio.AsyncConnection, batch: list[dict]) -> None: async def _flush_batch(conn: sa.ext.asyncio.AsyncConnection, batch: list[dict]) -> None:
lemma_rows = [e["lemma"] for e in batch] lemma_rows = [e["lemma"] for e in batch]
sense_rows = [s for e in batch for s in e["senses"]] sense_rows = [s for e in batch for s in e["senses"]]
sense_link_rows = [lnk for e in batch for lnk in e["sense_links"]]
wordform_rows = [w for e in batch for w in e["wordforms"]] wordform_rows = [w for e in batch for w in e["wordforms"]]
raw_rows = [e["raw"] for e in batch] raw_rows = [e["raw"] for e in batch]
@ -223,6 +255,8 @@ async def _flush_batch(conn: sa.ext.asyncio.AsyncConnection, batch: list[dict])
await conn.execute(_lemma_table.insert(), lemma_rows) await conn.execute(_lemma_table.insert(), lemma_rows)
if sense_rows: if sense_rows:
await conn.execute(_sense_table.insert(), sense_rows) await conn.execute(_sense_table.insert(), sense_rows)
if sense_link_rows:
await conn.execute(_sense_link_table.insert(), sense_link_rows)
if wordform_rows: if wordform_rows:
await conn.execute(_wordform_table.insert(), wordform_rows) await conn.execute(_wordform_table.insert(), wordform_rows)
if raw_rows: if raw_rows:
@ -231,6 +265,68 @@ async def _flush_batch(conn: sa.ext.asyncio.AsyncConnection, batch: list[dict])
await conn.commit() await conn.commit()
# Maps our language codes to the section suffix kaikki uses in link targets
# (e.g. "maboul#French").
_LANG_SECTION_MAP: dict[str, str] = {
    "fr": "French",
    "de": "German",
    "es": "Spanish",
    "it": "Italian",
    "pt": "Portuguese",
}


async def _resolve_links(conn: sa.ext.asyncio.AsyncConnection, lang_code: str) -> int:
    """Resolve target_lemma_id for sense links whose target matches lang_code.

    Links in kaikki data look like ``["maboul", "maboul#French"]``. After all
    lemmas have been imported we can attempt to match the target word to a row
    in dictionary_lemma and store the foreign key.

    Returns the number of links that were resolved.
    """
    section = _LANG_SECTION_MAP.get(lang_code)
    if not section:
        return 0
    suffix = f"#{section}"

    result = await conn.execute(
        sa.select(
            _sense_link_table.c.id,
            _sense_link_table.c.link_target,
        ).where(_sense_link_table.c.target_lemma_id.is_(None))
    )
    # Filter to links that point at this language and extract the target word.
    candidates: list[tuple[uuid.UUID, str]] = [
        (row.id, row.link_target[: -len(suffix)])
        for row in result
        if row.link_target.endswith(suffix)
    ]
    if not candidates:
        return 0

    target_words = list({w for _, w in candidates})
    lemma_result = await conn.execute(
        sa.select(_lemma_table.c.id, _lemma_table.c.headword)
        .where(_lemma_table.c.language == lang_code)
        .where(_lemma_table.c.headword.in_(target_words))
    )
    lemma_map: dict[str, uuid.UUID] = {r.headword: r.id for r in lemma_result}

    # Batch every match into one executemany UPDATE instead of issuing one
    # round trip per resolved link (the previous N+1 pattern).
    params = [
        {"link_id": link_id, "lemma_id": lemma_map[word]}
        for link_id, word in candidates
        if word in lemma_map
    ]
    if params:
        await conn.execute(
            _sense_link_table.update()
            .where(_sense_link_table.c.id == sa.bindparam("link_id"))
            .values(target_lemma_id=sa.bindparam("lemma_id")),
            params,
        )
    await conn.commit()
    return len(params)
async def run_import(lang_code: str, batch_size: int = 1000) -> None: async def run_import(lang_code: str, batch_size: int = 1000) -> None:
lang_file = _LANG_FILE_MAP.get(lang_code) lang_file = _LANG_FILE_MAP.get(lang_code)
if not lang_file: if not lang_file:
@ -299,6 +395,11 @@ async def run_import(lang_code: str, batch_size: int = 1000) -> None:
total_lemmas += len(batch) total_lemmas += len(batch)
print(f"Done. Imported {total_lemmas} lemmas, skipped {skipped} lines.") print(f"Done. Imported {total_lemmas} lemmas, skipped {skipped} lines.")
async with engine.connect() as conn:
print("Resolving sense links...")
resolved = await _resolve_links(conn, lang_code)
print(f"Resolved {resolved} sense links.")
finally: finally:
await engine.dispose() await engine.dispose()