This commit is contained in:
parent
eb21d8b2f0
commit
7f0977d8e5
2 changed files with 151 additions and 9 deletions
|
|
@ -0,0 +1,41 @@
|
|||
"""add dictionary_sense_link table
|
||||
|
||||
Revision ID: 0010
|
||||
Revises: 0009
|
||||
Create Date: 2026-04-10
|
||||
|
||||
"""
|
||||
from typing import Sequence, Union
|
||||
|
||||
import sqlalchemy as sa
|
||||
from alembic import op
|
||||
from sqlalchemy.dialects import postgresql
|
||||
|
||||
revision: str = "0010"
|
||||
down_revision: Union[str, None] = "0009"
|
||||
branch_labels: Union[str, Sequence[str], None] = None
|
||||
depends_on: Union[str, Sequence[str], None] = None
|
||||
|
||||
|
||||
def upgrade() -> None:
    """Create dictionary_sense_link plus its two lookup indexes.

    Each row records one in-sense cross-reference (link text + raw target);
    target_lemma_id starts NULL and is filled in by a post-import resolution
    pass once all lemmas exist.
    """
    op.create_table(
        "dictionary_sense_link",
        sa.Column("id", postgresql.UUID(as_uuid=True), primary_key=True),
        sa.Column(
            "sense_id",
            postgresql.UUID(as_uuid=True),
            sa.ForeignKey("dictionary_sense.id", ondelete="CASCADE"),
            nullable=False,
        ),
        sa.Column("link_text", sa.Text(), nullable=False),
        sa.Column("link_target", sa.Text(), nullable=False),
        # Nullable on purpose: resolved after import, and many targets will
        # never match a lemma in this language.
        sa.Column("target_lemma_id", postgresql.UUID(as_uuid=True), nullable=True),
    )
    # Indexes for the two access paths: links of a sense, and links into a lemma.
    for column in ("sense_id", "target_lemma_id"):
        op.create_index(
            f"ix_dictionary_sense_link_{column}",
            "dictionary_sense_link",
            [column],
        )
|
||||
|
||||
|
||||
def downgrade() -> None:
    """Undo the upgrade: drop both indexes, then the table itself."""
    for index_name in (
        "ix_dictionary_sense_link_target_lemma_id",
        "ix_dictionary_sense_link_sense_id",
    ):
        op.drop_index(index_name, table_name="dictionary_sense_link")
    op.drop_table("dictionary_sense_link")
|
||||
|
|
@ -59,6 +59,8 @@ _POS_MAP: dict[str, str] = {
|
|||
}
|
||||
|
||||
_GENDER_MAP: dict[str, str] = {
|
||||
"f": "feminine",
|
||||
"m": "masculine",
|
||||
"masculine": "masculine",
|
||||
"masc": "masculine",
|
||||
"feminine": "feminine",
|
||||
|
|
@ -114,6 +116,16 @@ _raw_table = sa.Table(
|
|||
sa.Column("raw", JSONB(), nullable=False),
|
||||
)
|
||||
|
||||
# Core-level mirror of the dictionary_sense_link table (created by migration
# 0010); used for bulk inserts during import and for the link-resolution pass.
# NOTE(review): no ForeignKey objects here — constraints live in the migration;
# this definition is only for query/insert construction.
_sense_link_table = sa.Table(
    "dictionary_sense_link",
    _meta,
    sa.Column("id", PG_UUID(as_uuid=True), primary_key=True),
    sa.Column("sense_id", PG_UUID(as_uuid=True), nullable=False),
    sa.Column("link_text", sa.Text(), nullable=False),
    sa.Column("link_target", sa.Text(), nullable=False),
    # NULL until _resolve_links matches link_target to a lemma row.
    sa.Column("target_lemma_id", PG_UUID(as_uuid=True), nullable=True),
)
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Normalisation helpers
|
||||
# ---------------------------------------------------------------------------
|
||||
|
|
@ -123,12 +135,10 @@ def _normalise_pos(pos_raw: str) -> str | None:
|
|||
return _POS_MAP.get(pos_raw.lower().strip())
|
||||
|
||||
|
||||
def _normalise_gender(tags: list) -> str | None:
|
||||
for tag in tags:
|
||||
mapped = _GENDER_MAP.get(tag)
|
||||
if mapped:
|
||||
return mapped
|
||||
return None
|
||||
def _normalise_gender(value: str | None) -> str | None:
    """Map a raw gender marker (e.g. "f", "masc") to its canonical form.

    Mirrors ``_normalise_pos``: the input is stripped and lower-cased before
    the ``_GENDER_MAP`` lookup, so casing/whitespace variants of known
    markers still match.  Returns ``None`` for ``None`` input or for any
    marker not present in the map.
    """
    if value is None:
        return None
    return _GENDER_MAP.get(value.strip().lower())
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
|
|
@ -149,11 +159,13 @@ def _parse_entry(record: dict, lang_code: str) -> dict | None:
|
|||
return None
|
||||
|
||||
pos_raw = (record.get("pos") or "").strip()
|
||||
top_tags = record.get("tags") or []
|
||||
|
||||
lemma_id = uuid.uuid4()
|
||||
|
||||
_GENDER_TAGS = {"masculine", "feminine", "neuter"}
|
||||
gender: str | None = None
|
||||
senses = []
|
||||
sense_links = []
|
||||
for i, sense_record in enumerate(record.get("senses") or []):
|
||||
sense_id = uuid.uuid4()
|
||||
glosses = sense_record.get("glosses") or []
|
||||
|
|
@ -161,6 +173,12 @@ def _parse_entry(record: dict, lang_code: str) -> dict | None:
|
|||
topics = sense_record.get("topics") or []
|
||||
sense_tags = sense_record.get("tags") or []
|
||||
|
||||
if gender is None:
|
||||
for tag in sense_tags:
|
||||
if tag in _GENDER_TAGS:
|
||||
gender = tag
|
||||
break
|
||||
|
||||
senses.append(
|
||||
{
|
||||
"id": sense_id,
|
||||
|
|
@ -172,6 +190,18 @@ def _parse_entry(record: dict, lang_code: str) -> dict | None:
|
|||
}
|
||||
)
|
||||
|
||||
for link_pair in (sense_record.get("links") or []):
|
||||
if isinstance(link_pair, list) and len(link_pair) == 2:
|
||||
sense_links.append(
|
||||
{
|
||||
"id": uuid.uuid4(),
|
||||
"sense_id": sense_id,
|
||||
"link_text": link_pair[0],
|
||||
"link_target": link_pair[1],
|
||||
"target_lemma_id": None,
|
||||
}
|
||||
)
|
||||
|
||||
wordforms = []
|
||||
for f in record.get("forms") or []:
|
||||
form_text = (f.get("form") or "").strip()
|
||||
|
|
@ -194,10 +224,11 @@ def _parse_entry(record: dict, lang_code: str) -> dict | None:
|
|||
"language": lang_code,
|
||||
"pos_raw": pos_raw,
|
||||
"pos_normalised": _normalise_pos(pos_raw),
|
||||
"gender": _normalise_gender(top_tags),
|
||||
"tags": top_tags,
|
||||
"gender": gender,
|
||||
"tags": record.get("tags") or [],
|
||||
},
|
||||
"senses": senses,
|
||||
"sense_links": sense_links,
|
||||
"wordforms": wordforms,
|
||||
"raw": {
|
||||
"id": uuid.uuid4(),
|
||||
|
|
@ -216,6 +247,7 @@ def _parse_entry(record: dict, lang_code: str) -> dict | None:
|
|||
async def _flush_batch(conn: sa.ext.asyncio.AsyncConnection, batch: list[dict]) -> None:
|
||||
lemma_rows = [e["lemma"] for e in batch]
|
||||
sense_rows = [s for e in batch for s in e["senses"]]
|
||||
sense_link_rows = [lnk for e in batch for lnk in e["sense_links"]]
|
||||
wordform_rows = [w for e in batch for w in e["wordforms"]]
|
||||
raw_rows = [e["raw"] for e in batch]
|
||||
|
||||
|
|
@ -223,6 +255,8 @@ async def _flush_batch(conn: sa.ext.asyncio.AsyncConnection, batch: list[dict])
|
|||
await conn.execute(_lemma_table.insert(), lemma_rows)
|
||||
if sense_rows:
|
||||
await conn.execute(_sense_table.insert(), sense_rows)
|
||||
if sense_link_rows:
|
||||
await conn.execute(_sense_link_table.insert(), sense_link_rows)
|
||||
if wordform_rows:
|
||||
await conn.execute(_wordform_table.insert(), wordform_rows)
|
||||
if raw_rows:
|
||||
|
|
@ -231,6 +265,68 @@ async def _flush_batch(conn: sa.ext.asyncio.AsyncConnection, batch: list[dict])
|
|||
await conn.commit()
|
||||
|
||||
|
||||
# ISO 639-1 language code -> Wiktionary section name as it appears in kaikki
# link targets (e.g. "maboul#French").  _resolve_links uses this to recognise
# links that point at the language currently being imported; codes absent
# here are simply skipped by the resolution pass.
_LANG_SECTION_MAP: dict[str, str] = {
    "fr": "French",
    "de": "German",
    "es": "Spanish",
    "it": "Italian",
    "pt": "Portuguese",
}
|
||||
|
||||
|
||||
async def _resolve_links(conn: sa.ext.asyncio.AsyncConnection, lang_code: str) -> int:
    """Resolve target_lemma_id for sense links whose target matches lang_code.

    Links in kaikki data look like ``["maboul", "maboul#French"]``.  After all
    lemmas have been imported we can attempt to match the target word to a row
    in dictionary_lemma and store the foreign key.

    Returns the number of links resolved; 0 when the language has no known
    Wiktionary section name.
    """
    section = _LANG_SECTION_MAP.get(lang_code)
    if not section:
        return 0

    suffix = f"#{section}"

    # Fetch every still-unresolved link; language filtering happens in Python
    # below because the suffix match is simpler to express here.
    unresolved = (
        await conn.execute(
            sa.select(
                _sense_link_table.c.id,
                _sense_link_table.c.link_target,
            ).where(_sense_link_table.c.target_lemma_id.is_(None))
        )
    ).fetchall()

    # Keep links aimed at this language; strip "#Section" to get the bare word.
    candidates: list[tuple[uuid.UUID, str]] = [
        (row.id, row.link_target[: -len(suffix)])
        for row in unresolved
        if row.link_target.endswith(suffix)
    ]
    if not candidates:
        return 0

    # Single query mapping the distinct target words to existing lemma ids.
    lemma_rows = await conn.execute(
        sa.select(_lemma_table.c.id, _lemma_table.c.headword)
        .where(_lemma_table.c.language == lang_code)
        .where(_lemma_table.c.headword.in_(list({word for _, word in candidates})))
    )
    lemma_by_headword: dict[str, uuid.UUID] = {r.headword: r.id for r in lemma_rows}

    resolved = 0
    for link_id, word in candidates:
        lemma_id = lemma_by_headword.get(word)
        if lemma_id is None:
            continue
        await conn.execute(
            _sense_link_table.update()
            .where(_sense_link_table.c.id == link_id)
            .values(target_lemma_id=lemma_id)
        )
        resolved += 1

    await conn.commit()
    return resolved
|
||||
|
||||
|
||||
async def run_import(lang_code: str, batch_size: int = 1000) -> None:
|
||||
lang_file = _LANG_FILE_MAP.get(lang_code)
|
||||
if not lang_file:
|
||||
|
|
@ -299,6 +395,11 @@ async def run_import(lang_code: str, batch_size: int = 1000) -> None:
|
|||
total_lemmas += len(batch)
|
||||
|
||||
print(f"Done. Imported {total_lemmas} lemmas, skipped {skipped} lines.")
|
||||
|
||||
async with engine.connect() as conn:
|
||||
print("Resolving sense links...")
|
||||
resolved = await _resolve_links(conn, lang_code)
|
||||
print(f"Resolved {resolved} sense links.")
|
||||
finally:
|
||||
await engine.dispose()
|
||||
|
||||
|
|
|
|||
Loading…
Reference in a new issue