feat: link dictionary senses to lemmas
Some checks are pending
/ test (push) Waiting to run

This commit is contained in:
wilson 2026-04-10 21:12:40 +01:00
parent eb21d8b2f0
commit 7f0977d8e5
2 changed files with 151 additions and 9 deletions

View file

@ -0,0 +1,41 @@
"""add dictionary_sense_link table
Revision ID: 0010
Revises: 0009
Create Date: 2026-04-10
"""
from typing import Sequence, Union
import sqlalchemy as sa
from alembic import op
from sqlalchemy.dialects import postgresql
revision: str = "0010"
down_revision: Union[str, None] = "0009"
branch_labels: Union[str, Sequence[str], None] = None
depends_on: Union[str, Sequence[str], None] = None
def upgrade() -> None:
    """Create the dictionary_sense_link table and its two lookup indexes."""
    # SQLAlchemy type instances are stateless and safe to share across columns.
    uuid_type = postgresql.UUID(as_uuid=True)
    op.create_table(
        "dictionary_sense_link",
        sa.Column("id", uuid_type, primary_key=True),
        sa.Column(
            "sense_id",
            uuid_type,
            sa.ForeignKey("dictionary_sense.id", ondelete="CASCADE"),
            nullable=False,
        ),
        sa.Column("link_text", sa.Text(), nullable=False),
        sa.Column("link_target", sa.Text(), nullable=False),
        sa.Column("target_lemma_id", uuid_type, nullable=True),
    )
    # Index both foreign-key-ish columns; lookups happen from either side.
    for column in ("sense_id", "target_lemma_id"):
        op.create_index(
            f"ix_dictionary_sense_link_{column}",
            "dictionary_sense_link",
            [column],
        )
def downgrade() -> None:
    """Drop the indexes first, then the dictionary_sense_link table."""
    for index_name in (
        "ix_dictionary_sense_link_target_lemma_id",
        "ix_dictionary_sense_link_sense_id",
    ):
        op.drop_index(index_name, table_name="dictionary_sense_link")
    op.drop_table("dictionary_sense_link")

View file

@ -59,6 +59,8 @@ _POS_MAP: dict[str, str] = {
} }
_GENDER_MAP: dict[str, str] = { _GENDER_MAP: dict[str, str] = {
"f": "feminine",
"m": "masculine",
"masculine": "masculine", "masculine": "masculine",
"masc": "masculine", "masc": "masculine",
"feminine": "feminine", "feminine": "feminine",
@ -114,6 +116,16 @@ _raw_table = sa.Table(
sa.Column("raw", JSONB(), nullable=False), sa.Column("raw", JSONB(), nullable=False),
) )
# Core-table mirror of dictionary_sense_link (added in migration 0010) so the
# importer can bulk-insert rows without going through ORM models.
# target_lemma_id stays NULL at insert time and is filled in by a later
# resolution pass once all lemmas exist.
_sense_link_table = sa.Table(
    "dictionary_sense_link",
    _meta,
    sa.Column("id", PG_UUID(as_uuid=True), primary_key=True),
    sa.Column("sense_id", PG_UUID(as_uuid=True), nullable=False),
    sa.Column("link_text", sa.Text(), nullable=False),
    sa.Column("link_target", sa.Text(), nullable=False),
    sa.Column("target_lemma_id", PG_UUID(as_uuid=True), nullable=True),
)
# --------------------------------------------------------------------------- # ---------------------------------------------------------------------------
# Normalisation helpers # Normalisation helpers
# --------------------------------------------------------------------------- # ---------------------------------------------------------------------------
@ -123,12 +135,10 @@ def _normalise_pos(pos_raw: str) -> str | None:
return _POS_MAP.get(pos_raw.lower().strip()) return _POS_MAP.get(pos_raw.lower().strip())
def _normalise_gender(tags: list) -> str | None: def _normalise_gender(value: str | None) -> str | None:
for tag in tags: if value is None:
mapped = _GENDER_MAP.get(tag)
if mapped:
return mapped
return None return None
return _GENDER_MAP.get(value)
# --------------------------------------------------------------------------- # ---------------------------------------------------------------------------
@ -149,11 +159,13 @@ def _parse_entry(record: dict, lang_code: str) -> dict | None:
return None return None
pos_raw = (record.get("pos") or "").strip() pos_raw = (record.get("pos") or "").strip()
top_tags = record.get("tags") or []
lemma_id = uuid.uuid4() lemma_id = uuid.uuid4()
_GENDER_TAGS = {"masculine", "feminine", "neuter"}
gender: str | None = None
senses = [] senses = []
sense_links = []
for i, sense_record in enumerate(record.get("senses") or []): for i, sense_record in enumerate(record.get("senses") or []):
sense_id = uuid.uuid4() sense_id = uuid.uuid4()
glosses = sense_record.get("glosses") or [] glosses = sense_record.get("glosses") or []
@ -161,6 +173,12 @@ def _parse_entry(record: dict, lang_code: str) -> dict | None:
topics = sense_record.get("topics") or [] topics = sense_record.get("topics") or []
sense_tags = sense_record.get("tags") or [] sense_tags = sense_record.get("tags") or []
if gender is None:
for tag in sense_tags:
if tag in _GENDER_TAGS:
gender = tag
break
senses.append( senses.append(
{ {
"id": sense_id, "id": sense_id,
@ -172,6 +190,18 @@ def _parse_entry(record: dict, lang_code: str) -> dict | None:
} }
) )
for link_pair in (sense_record.get("links") or []):
if isinstance(link_pair, list) and len(link_pair) == 2:
sense_links.append(
{
"id": uuid.uuid4(),
"sense_id": sense_id,
"link_text": link_pair[0],
"link_target": link_pair[1],
"target_lemma_id": None,
}
)
wordforms = [] wordforms = []
for f in record.get("forms") or []: for f in record.get("forms") or []:
form_text = (f.get("form") or "").strip() form_text = (f.get("form") or "").strip()
@ -194,10 +224,11 @@ def _parse_entry(record: dict, lang_code: str) -> dict | None:
"language": lang_code, "language": lang_code,
"pos_raw": pos_raw, "pos_raw": pos_raw,
"pos_normalised": _normalise_pos(pos_raw), "pos_normalised": _normalise_pos(pos_raw),
"gender": _normalise_gender(top_tags), "gender": gender,
"tags": top_tags, "tags": record.get("tags") or [],
}, },
"senses": senses, "senses": senses,
"sense_links": sense_links,
"wordforms": wordforms, "wordforms": wordforms,
"raw": { "raw": {
"id": uuid.uuid4(), "id": uuid.uuid4(),
@ -216,6 +247,7 @@ def _parse_entry(record: dict, lang_code: str) -> dict | None:
async def _flush_batch(conn: sa.ext.asyncio.AsyncConnection, batch: list[dict]) -> None: async def _flush_batch(conn: sa.ext.asyncio.AsyncConnection, batch: list[dict]) -> None:
lemma_rows = [e["lemma"] for e in batch] lemma_rows = [e["lemma"] for e in batch]
sense_rows = [s for e in batch for s in e["senses"]] sense_rows = [s for e in batch for s in e["senses"]]
sense_link_rows = [lnk for e in batch for lnk in e["sense_links"]]
wordform_rows = [w for e in batch for w in e["wordforms"]] wordform_rows = [w for e in batch for w in e["wordforms"]]
raw_rows = [e["raw"] for e in batch] raw_rows = [e["raw"] for e in batch]
@ -223,6 +255,8 @@ async def _flush_batch(conn: sa.ext.asyncio.AsyncConnection, batch: list[dict])
await conn.execute(_lemma_table.insert(), lemma_rows) await conn.execute(_lemma_table.insert(), lemma_rows)
if sense_rows: if sense_rows:
await conn.execute(_sense_table.insert(), sense_rows) await conn.execute(_sense_table.insert(), sense_rows)
if sense_link_rows:
await conn.execute(_sense_link_table.insert(), sense_link_rows)
if wordform_rows: if wordform_rows:
await conn.execute(_wordform_table.insert(), wordform_rows) await conn.execute(_wordform_table.insert(), wordform_rows)
if raw_rows: if raw_rows:
@ -231,6 +265,68 @@ async def _flush_batch(conn: sa.ext.asyncio.AsyncConnection, batch: list[dict])
await conn.commit() await conn.commit()
# Maps our language codes to the section suffix kaikki uses in link targets
# (e.g. "maboul#French").
_LANG_SECTION_MAP: dict[str, str] = {
    "fr": "French",
    "de": "German",
    "es": "Spanish",
    "it": "Italian",
    "pt": "Portuguese",
}


async def _resolve_links(conn: sa.ext.asyncio.AsyncConnection, lang_code: str) -> int:
    """Resolve target_lemma_id for sense links whose target matches lang_code.

    Links in kaikki data look like ``["maboul", "maboul#French"]``. After all
    lemmas have been imported we can attempt to match the target word to a row
    in dictionary_lemma and store the foreign key.

    Returns the number of links that were resolved.
    """
    section = _LANG_SECTION_MAP.get(lang_code)
    if not section:
        return 0
    suffix = f"#{section}"

    result = await conn.execute(
        sa.select(
            _sense_link_table.c.id,
            _sense_link_table.c.link_target,
        ).where(_sense_link_table.c.target_lemma_id.is_(None))
    )
    # Filter to links that point at this language and extract the target word.
    candidates: list[tuple[uuid.UUID, str]] = [
        (row.id, row.link_target[: -len(suffix)])
        for row in result
        if row.link_target.endswith(suffix)
    ]
    if not candidates:
        return 0

    target_words = list({w for _, w in candidates})
    lemma_result = await conn.execute(
        sa.select(_lemma_table.c.id, _lemma_table.c.headword)
        .where(_lemma_table.c.language == lang_code)
        .where(_lemma_table.c.headword.in_(target_words))
    )
    lemma_map: dict[str, uuid.UUID] = {r.headword: r.id for r in lemma_result}

    # Batch every match into one executemany UPDATE instead of issuing one
    # round trip per resolved link (the previous N+1 pattern).
    params = [
        {"link_id": link_id, "lemma_id": lemma_map[word]}
        for link_id, word in candidates
        if word in lemma_map
    ]
    if params:
        await conn.execute(
            _sense_link_table.update()
            .where(_sense_link_table.c.id == sa.bindparam("link_id"))
            .values(target_lemma_id=sa.bindparam("lemma_id")),
            params,
        )
    await conn.commit()
    return len(params)
async def run_import(lang_code: str, batch_size: int = 1000) -> None: async def run_import(lang_code: str, batch_size: int = 1000) -> None:
lang_file = _LANG_FILE_MAP.get(lang_code) lang_file = _LANG_FILE_MAP.get(lang_code)
if not lang_file: if not lang_file:
@ -299,6 +395,11 @@ async def run_import(lang_code: str, batch_size: int = 1000) -> None:
total_lemmas += len(batch) total_lemmas += len(batch)
print(f"Done. Imported {total_lemmas} lemmas, skipped {skipped} lines.") print(f"Done. Imported {total_lemmas} lemmas, skipped {skipped} lines.")
async with engine.connect() as conn:
print("Resolving sense links...")
resolved = await _resolve_links(conn, lang_code)
print(f"Resolved {resolved} sense links.")
finally: finally:
await engine.dispose() await engine.dispose()