This commit is contained in:
parent
eb21d8b2f0
commit
7f0977d8e5
2 changed files with 151 additions and 9 deletions
|
|
@ -0,0 +1,41 @@
|
||||||
|
"""add dictionary_sense_link table
|
||||||
|
|
||||||
|
Revision ID: 0010
|
||||||
|
Revises: 0009
|
||||||
|
Create Date: 2026-04-10
|
||||||
|
|
||||||
|
"""
|
||||||
|
from typing import Sequence, Union
|
||||||
|
|
||||||
|
import sqlalchemy as sa
|
||||||
|
from alembic import op
|
||||||
|
from sqlalchemy.dialects import postgresql
|
||||||
|
|
||||||
|
revision: str = "0010"
|
||||||
|
down_revision: Union[str, None] = "0009"
|
||||||
|
branch_labels: Union[str, Sequence[str], None] = None
|
||||||
|
depends_on: Union[str, Sequence[str], None] = None
|
||||||
|
|
||||||
|
|
||||||
|
def upgrade() -> None:
    """Create the dictionary_sense_link table and its two lookup indexes."""

    def _uuid() -> postgresql.UUID:
        # A fresh native-UUID column type per column (values come back as uuid.UUID).
        return postgresql.UUID(as_uuid=True)

    op.create_table(
        "dictionary_sense_link",
        sa.Column("id", _uuid(), primary_key=True),
        sa.Column(
            "sense_id",
            _uuid(),
            # Links belong to a sense; remove them when the sense goes away.
            sa.ForeignKey("dictionary_sense.id", ondelete="CASCADE"),
            nullable=False,
        ),
        sa.Column("link_text", sa.Text(), nullable=False),
        sa.Column("link_target", sa.Text(), nullable=False),
        # Filled in lazily once the target lemma has been imported.
        sa.Column("target_lemma_id", _uuid(), nullable=True),
    )

    for column in ("sense_id", "target_lemma_id"):
        op.create_index(
            f"ix_dictionary_sense_link_{column}",
            "dictionary_sense_link",
            [column],
        )
|
||||||
|
|
||||||
|
|
||||||
|
def downgrade() -> None:
    """Reverse upgrade(): drop both indexes, then the table itself."""
    for index_name in (
        "ix_dictionary_sense_link_target_lemma_id",
        "ix_dictionary_sense_link_sense_id",
    ):
        op.drop_index(index_name, table_name="dictionary_sense_link")
    op.drop_table("dictionary_sense_link")
|
||||||
|
|
@ -59,6 +59,8 @@ _POS_MAP: dict[str, str] = {
|
||||||
}
|
}
|
||||||
|
|
||||||
_GENDER_MAP: dict[str, str] = {
|
_GENDER_MAP: dict[str, str] = {
|
||||||
|
"f": "feminine",
|
||||||
|
"m": "masculine",
|
||||||
"masculine": "masculine",
|
"masculine": "masculine",
|
||||||
"masc": "masculine",
|
"masc": "masculine",
|
||||||
"feminine": "feminine",
|
"feminine": "feminine",
|
||||||
|
|
@ -114,6 +116,16 @@ _raw_table = sa.Table(
|
||||||
sa.Column("raw", JSONB(), nullable=False),
|
sa.Column("raw", JSONB(), nullable=False),
|
||||||
)
|
)
|
||||||
|
|
||||||
|
_sense_link_table = sa.Table(
|
||||||
|
"dictionary_sense_link",
|
||||||
|
_meta,
|
||||||
|
sa.Column("id", PG_UUID(as_uuid=True), primary_key=True),
|
||||||
|
sa.Column("sense_id", PG_UUID(as_uuid=True), nullable=False),
|
||||||
|
sa.Column("link_text", sa.Text(), nullable=False),
|
||||||
|
sa.Column("link_target", sa.Text(), nullable=False),
|
||||||
|
sa.Column("target_lemma_id", PG_UUID(as_uuid=True), nullable=True),
|
||||||
|
)
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
# Normalisation helpers
|
# Normalisation helpers
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
|
|
@ -123,12 +135,10 @@ def _normalise_pos(pos_raw: str) -> str | None:
|
||||||
return _POS_MAP.get(pos_raw.lower().strip())
|
return _POS_MAP.get(pos_raw.lower().strip())
|
||||||
|
|
||||||
|
|
||||||
def _normalise_gender(value: str | None) -> str | None:
    """Map a raw gender label to its canonical form via _GENDER_MAP.

    Returns None when no value is given or the label is unknown
    (dict.get already yields None for unmapped keys).
    """
    return None if value is None else _GENDER_MAP.get(value)
|
|
||||||
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
|
|
@ -149,11 +159,13 @@ def _parse_entry(record: dict, lang_code: str) -> dict | None:
|
||||||
return None
|
return None
|
||||||
|
|
||||||
pos_raw = (record.get("pos") or "").strip()
|
pos_raw = (record.get("pos") or "").strip()
|
||||||
top_tags = record.get("tags") or []
|
|
||||||
|
|
||||||
lemma_id = uuid.uuid4()
|
lemma_id = uuid.uuid4()
|
||||||
|
|
||||||
|
_GENDER_TAGS = {"masculine", "feminine", "neuter"}
|
||||||
|
gender: str | None = None
|
||||||
senses = []
|
senses = []
|
||||||
|
sense_links = []
|
||||||
for i, sense_record in enumerate(record.get("senses") or []):
|
for i, sense_record in enumerate(record.get("senses") or []):
|
||||||
sense_id = uuid.uuid4()
|
sense_id = uuid.uuid4()
|
||||||
glosses = sense_record.get("glosses") or []
|
glosses = sense_record.get("glosses") or []
|
||||||
|
|
@ -161,6 +173,12 @@ def _parse_entry(record: dict, lang_code: str) -> dict | None:
|
||||||
topics = sense_record.get("topics") or []
|
topics = sense_record.get("topics") or []
|
||||||
sense_tags = sense_record.get("tags") or []
|
sense_tags = sense_record.get("tags") or []
|
||||||
|
|
||||||
|
if gender is None:
|
||||||
|
for tag in sense_tags:
|
||||||
|
if tag in _GENDER_TAGS:
|
||||||
|
gender = tag
|
||||||
|
break
|
||||||
|
|
||||||
senses.append(
|
senses.append(
|
||||||
{
|
{
|
||||||
"id": sense_id,
|
"id": sense_id,
|
||||||
|
|
@ -172,6 +190,18 @@ def _parse_entry(record: dict, lang_code: str) -> dict | None:
|
||||||
}
|
}
|
||||||
)
|
)
|
||||||
|
|
||||||
|
for link_pair in (sense_record.get("links") or []):
|
||||||
|
if isinstance(link_pair, list) and len(link_pair) == 2:
|
||||||
|
sense_links.append(
|
||||||
|
{
|
||||||
|
"id": uuid.uuid4(),
|
||||||
|
"sense_id": sense_id,
|
||||||
|
"link_text": link_pair[0],
|
||||||
|
"link_target": link_pair[1],
|
||||||
|
"target_lemma_id": None,
|
||||||
|
}
|
||||||
|
)
|
||||||
|
|
||||||
wordforms = []
|
wordforms = []
|
||||||
for f in record.get("forms") or []:
|
for f in record.get("forms") or []:
|
||||||
form_text = (f.get("form") or "").strip()
|
form_text = (f.get("form") or "").strip()
|
||||||
|
|
@ -194,10 +224,11 @@ def _parse_entry(record: dict, lang_code: str) -> dict | None:
|
||||||
"language": lang_code,
|
"language": lang_code,
|
||||||
"pos_raw": pos_raw,
|
"pos_raw": pos_raw,
|
||||||
"pos_normalised": _normalise_pos(pos_raw),
|
"pos_normalised": _normalise_pos(pos_raw),
|
||||||
"gender": _normalise_gender(top_tags),
|
"gender": gender,
|
||||||
"tags": top_tags,
|
"tags": record.get("tags") or [],
|
||||||
},
|
},
|
||||||
"senses": senses,
|
"senses": senses,
|
||||||
|
"sense_links": sense_links,
|
||||||
"wordforms": wordforms,
|
"wordforms": wordforms,
|
||||||
"raw": {
|
"raw": {
|
||||||
"id": uuid.uuid4(),
|
"id": uuid.uuid4(),
|
||||||
|
|
@ -216,6 +247,7 @@ def _parse_entry(record: dict, lang_code: str) -> dict | None:
|
||||||
async def _flush_batch(conn: sa.ext.asyncio.AsyncConnection, batch: list[dict]) -> None:
|
async def _flush_batch(conn: sa.ext.asyncio.AsyncConnection, batch: list[dict]) -> None:
|
||||||
lemma_rows = [e["lemma"] for e in batch]
|
lemma_rows = [e["lemma"] for e in batch]
|
||||||
sense_rows = [s for e in batch for s in e["senses"]]
|
sense_rows = [s for e in batch for s in e["senses"]]
|
||||||
|
sense_link_rows = [lnk for e in batch for lnk in e["sense_links"]]
|
||||||
wordform_rows = [w for e in batch for w in e["wordforms"]]
|
wordform_rows = [w for e in batch for w in e["wordforms"]]
|
||||||
raw_rows = [e["raw"] for e in batch]
|
raw_rows = [e["raw"] for e in batch]
|
||||||
|
|
||||||
|
|
@ -223,6 +255,8 @@ async def _flush_batch(conn: sa.ext.asyncio.AsyncConnection, batch: list[dict])
|
||||||
await conn.execute(_lemma_table.insert(), lemma_rows)
|
await conn.execute(_lemma_table.insert(), lemma_rows)
|
||||||
if sense_rows:
|
if sense_rows:
|
||||||
await conn.execute(_sense_table.insert(), sense_rows)
|
await conn.execute(_sense_table.insert(), sense_rows)
|
||||||
|
if sense_link_rows:
|
||||||
|
await conn.execute(_sense_link_table.insert(), sense_link_rows)
|
||||||
if wordform_rows:
|
if wordform_rows:
|
||||||
await conn.execute(_wordform_table.insert(), wordform_rows)
|
await conn.execute(_wordform_table.insert(), wordform_rows)
|
||||||
if raw_rows:
|
if raw_rows:
|
||||||
|
|
@ -231,6 +265,68 @@ async def _flush_batch(conn: sa.ext.asyncio.AsyncConnection, batch: list[dict])
|
||||||
await conn.commit()
|
await conn.commit()
|
||||||
|
|
||||||
|
|
||||||
|
_LANG_SECTION_MAP: dict[str, str] = {
|
||||||
|
"fr": "French",
|
||||||
|
"de": "German",
|
||||||
|
"es": "Spanish",
|
||||||
|
"it": "Italian",
|
||||||
|
"pt": "Portuguese",
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
async def _resolve_links(conn: sa.ext.asyncio.AsyncConnection, lang_code: str) -> int:
    """Resolve target_lemma_id for sense links whose target matches lang_code.

    Links in kaikki data look like ``["maboul", "maboul#French"]``. After all
    lemmas have been imported we can attempt to match the target word to a row
    in dictionary_lemma and store the foreign key.

    Returns the number of links that were resolved.
    """
    section = _LANG_SECTION_MAP.get(lang_code)
    if not section:
        # Unknown language code: no section suffix to match against.
        return 0

    suffix = f"#{section}"

    # Fetch every still-unresolved link, regardless of language.
    result = await conn.execute(
        sa.select(
            _sense_link_table.c.id,
            _sense_link_table.c.link_target,
        ).where(_sense_link_table.c.target_lemma_id.is_(None))
    )

    # Keep only links pointing at this language section and strip the
    # "#Section" suffix to recover the bare target word.  Targets that are
    # nothing but the suffix would leave an empty word, so skip those.
    candidates: list[tuple[uuid.UUID, str]] = [
        (row.id, row.link_target[: -len(suffix)])
        for row in result.fetchall()
        if row.link_target.endswith(suffix) and len(row.link_target) > len(suffix)
    ]
    if not candidates:
        return 0

    # One lookup query covering every distinct target word.
    target_words = list({word for _, word in candidates})
    lemma_result = await conn.execute(
        sa.select(_lemma_table.c.id, _lemma_table.c.headword)
        .where(_lemma_table.c.language == lang_code)
        .where(_lemma_table.c.headword.in_(target_words))
    )
    lemma_map: dict[str, uuid.UUID] = {r.headword: r.id for r in lemma_result}

    # Batch all updates into a single executemany-style execute instead of
    # issuing one UPDATE round trip per resolved link (the previous N+1
    # pattern).  bindparam names are prefixed to avoid colliding with the
    # column names used in values()/where().
    params = [
        {"b_link_id": link_id, "b_lemma_id": lemma_map[word]}
        for link_id, word in candidates
        if word in lemma_map
    ]
    if params:
        await conn.execute(
            _sense_link_table.update()
            .where(_sense_link_table.c.id == sa.bindparam("b_link_id"))
            .values(target_lemma_id=sa.bindparam("b_lemma_id")),
            params,
        )

    await conn.commit()
    return len(params)
|
||||||
|
|
||||||
|
|
||||||
async def run_import(lang_code: str, batch_size: int = 1000) -> None:
|
async def run_import(lang_code: str, batch_size: int = 1000) -> None:
|
||||||
lang_file = _LANG_FILE_MAP.get(lang_code)
|
lang_file = _LANG_FILE_MAP.get(lang_code)
|
||||||
if not lang_file:
|
if not lang_file:
|
||||||
|
|
@ -299,6 +395,11 @@ async def run_import(lang_code: str, batch_size: int = 1000) -> None:
|
||||||
total_lemmas += len(batch)
|
total_lemmas += len(batch)
|
||||||
|
|
||||||
print(f"Done. Imported {total_lemmas} lemmas, skipped {skipped} lines.")
|
print(f"Done. Imported {total_lemmas} lemmas, skipped {skipped} lines.")
|
||||||
|
|
||||||
|
async with engine.connect() as conn:
|
||||||
|
print("Resolving sense links...")
|
||||||
|
resolved = await _resolve_links(conn, lang_code)
|
||||||
|
print(f"Resolved {resolved} sense links.")
|
||||||
finally:
|
finally:
|
||||||
await engine.dispose()
|
await engine.dispose()
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue