"""add dictionary_sense_link table

Revision ID: 0010
Revises: 0009
Create Date: 2026-04-10

"""
from typing import Sequence, Union

import sqlalchemy as sa
from alembic import op
from sqlalchemy.dialects import postgresql

revision: str = "0010"
down_revision: Union[str, None] = "0009"
branch_labels: Union[str, Sequence[str], None] = None
depends_on: Union[str, Sequence[str], None] = None


def upgrade() -> None:
    """Create dictionary_sense_link plus its two lookup indexes."""
    op.create_table(
        "dictionary_sense_link",
        sa.Column("id", postgresql.UUID(as_uuid=True), primary_key=True),
        # ON DELETE CASCADE: removing a sense removes its links as well.
        sa.Column(
            "sense_id",
            postgresql.UUID(as_uuid=True),
            sa.ForeignKey("dictionary_sense.id", ondelete="CASCADE"),
            nullable=False,
        ),
        sa.Column("link_text", sa.Text(), nullable=False),
        sa.Column("link_target", sa.Text(), nullable=False),
        # Nullable on purpose: filled in by a post-import resolution pass and
        # may legitimately stay NULL when the target lemma was never imported.
        sa.Column("target_lemma_id", postgresql.UUID(as_uuid=True), nullable=True),
    )
    op.create_index(
        "ix_dictionary_sense_link_sense_id",
        "dictionary_sense_link",
        ["sense_id"],
    )
    op.create_index(
        "ix_dictionary_sense_link_target_lemma_id",
        "dictionary_sense_link",
        ["target_lemma_id"],
    )


def downgrade() -> None:
    """Undo upgrade(): drop indexes first, then the table."""
    op.drop_index(
        "ix_dictionary_sense_link_target_lemma_id",
        table_name="dictionary_sense_link",
    )
    op.drop_index(
        "ix_dictionary_sense_link_sense_id",
        table_name="dictionary_sense_link",
    )
    op.drop_table("dictionary_sense_link")
def _normalise_gender(value: str | None) -> str | None:
    """Map a raw gender tag (e.g. "f", "m", "masc", "feminine") to its
    canonical form via _GENDER_MAP.

    Returns None when *value* is None or is not a recognised gender tag.
    """
    if value is None:
        return None
    # Be consistent with _normalise_pos, which also lowercases and strips its
    # input before the map lookup: _GENDER_MAP keys are all lowercase, so
    # case/whitespace noise in source data would otherwise silently miss.
    return _GENDER_MAP.get(value.lower().strip())
_LANG_SECTION_MAP: dict[str, str] = {
    "fr": "French",
    "de": "German",
    "es": "Spanish",
    "it": "Italian",
    "pt": "Portuguese",
}


async def _resolve_links(conn: sa.ext.asyncio.AsyncConnection, lang_code: str) -> int:
    """Resolve target_lemma_id for sense links whose target matches lang_code.

    Links in kaikki data look like ``["maboul", "maboul#French"]``.  After all
    lemmas have been imported we attempt to match each link's target word to a
    row in dictionary_lemma and store the foreign key.

    Commits on success and returns the number of links resolved; returns 0
    (without committing) when the language is unknown or nothing matches.
    """
    section = _LANG_SECTION_MAP.get(lang_code)
    if not section:
        # Unknown language: no "#Section" suffix to look for.
        return 0

    suffix = f"#{section}"

    # Fetch only unresolved links that point at this language.  Filtering in
    # SQL (LIKE '%#Section') avoids loading every other language's links into
    # memory, which the previous Python-side endswith() filter required.
    result = await conn.execute(
        sa.select(
            _sense_link_table.c.id,
            _sense_link_table.c.link_target,
        )
        .where(_sense_link_table.c.target_lemma_id.is_(None))
        .where(_sense_link_table.c.link_target.endswith(suffix))
    )

    # (link id, bare target word) pairs: "maboul#French" -> "maboul".
    candidates: list[tuple[uuid.UUID, str]] = [
        (row.id, row.link_target[: -len(suffix)]) for row in result
    ]
    if not candidates:
        return 0

    # Resolve every distinct target word in a single query.
    target_words = list({word for _, word in candidates})
    lemma_result = await conn.execute(
        sa.select(_lemma_table.c.id, _lemma_table.c.headword)
        .where(_lemma_table.c.language == lang_code)
        .where(_lemma_table.c.headword.in_(target_words))
    )
    lemma_map: dict[str, uuid.UUID] = {r.headword: r.id for r in lemma_result}

    # Batch the updates as one executemany instead of one round trip per link
    # (the previous loop was O(n) statements).  Bind-parameter names must not
    # collide with column names, hence "link_id"/"lemma_id".
    update_rows = [
        {"link_id": link_id, "lemma_id": lemma_map[word]}
        for link_id, word in candidates
        if word in lemma_map
    ]
    if update_rows:
        await conn.execute(
            _sense_link_table.update()
            .where(_sense_link_table.c.id == sa.bindparam("link_id"))
            .values(target_lemma_id=sa.bindparam("lemma_id")),
            update_rows,
        )

    await conn.commit()
    return len(update_rows)