This commit is contained in:
parent
eb21d8b2f0
commit
7f0977d8e5
2 changed files with 151 additions and 9 deletions
|
|
@ -0,0 +1,41 @@
|
|||
"""add dictionary_sense_link table
|
||||
|
||||
Revision ID: 0010
|
||||
Revises: 0009
|
||||
Create Date: 2026-04-10
|
||||
|
||||
"""
|
||||
from typing import Sequence, Union
|
||||
|
||||
import sqlalchemy as sa
|
||||
from alembic import op
|
||||
from sqlalchemy.dialects import postgresql
|
||||
|
||||
revision: str = "0010"
|
||||
down_revision: Union[str, None] = "0009"
|
||||
branch_labels: Union[str, Sequence[str], None] = None
|
||||
depends_on: Union[str, Sequence[str], None] = None
|
||||
|
||||
|
||||
def upgrade() -> None:
    """Create dictionary_sense_link plus its two lookup indexes.

    Each row records one in-sense cross-reference (link text + raw target);
    target_lemma_id starts NULL and is filled in by a post-import resolution
    pass once all lemmas exist.
    """
    op.create_table(
        "dictionary_sense_link",
        sa.Column("id", postgresql.UUID(as_uuid=True), primary_key=True),
        sa.Column(
            "sense_id",
            postgresql.UUID(as_uuid=True),
            sa.ForeignKey("dictionary_sense.id", ondelete="CASCADE"),
            nullable=False,
        ),
        sa.Column("link_text", sa.Text(), nullable=False),
        sa.Column("link_target", sa.Text(), nullable=False),
        # Nullable on purpose: resolved after import, and many targets will
        # never match a lemma in this language.
        sa.Column("target_lemma_id", postgresql.UUID(as_uuid=True), nullable=True),
    )
    # Indexes for the two access paths: links of a sense, and links into a lemma.
    for column in ("sense_id", "target_lemma_id"):
        op.create_index(
            f"ix_dictionary_sense_link_{column}",
            "dictionary_sense_link",
            [column],
        )
|
||||
|
||||
|
||||
def downgrade() -> None:
    """Undo the upgrade: drop both indexes, then the table itself."""
    for index_name in (
        "ix_dictionary_sense_link_target_lemma_id",
        "ix_dictionary_sense_link_sense_id",
    ):
        op.drop_index(index_name, table_name="dictionary_sense_link")
    op.drop_table("dictionary_sense_link")
|
||||
|
|
@ -59,6 +59,8 @@ _POS_MAP: dict[str, str] = {
|
|||
}
|
||||
|
||||
_GENDER_MAP: dict[str, str] = {
|
||||
"f": "feminine",
|
||||
"m": "masculine",
|
||||
"masculine": "masculine",
|
||||
"masc": "masculine",
|
||||
"feminine": "feminine",
|
||||
|
|
@ -114,6 +116,16 @@ _raw_table = sa.Table(
|
|||
sa.Column("raw", JSONB(), nullable=False),
|
||||
)
|
||||
|
||||
# Core-level mirror of the dictionary_sense_link table (created by migration
# 0010); used for bulk inserts during import and for the link-resolution pass.
# NOTE(review): no ForeignKey objects here — constraints live in the migration;
# this definition is only for query/insert construction.
_sense_link_table = sa.Table(
    "dictionary_sense_link",
    _meta,
    sa.Column("id", PG_UUID(as_uuid=True), primary_key=True),
    sa.Column("sense_id", PG_UUID(as_uuid=True), nullable=False),
    sa.Column("link_text", sa.Text(), nullable=False),
    sa.Column("link_target", sa.Text(), nullable=False),
    # NULL until _resolve_links matches link_target to a lemma row.
    sa.Column("target_lemma_id", PG_UUID(as_uuid=True), nullable=True),
)
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Normalisation helpers
|
||||
# ---------------------------------------------------------------------------
|
||||
|
|
@ -123,12 +135,10 @@ def _normalise_pos(pos_raw: str) -> str | None:
|
|||
return _POS_MAP.get(pos_raw.lower().strip())
|
||||
|
||||
|
||||
def _normalise_gender(tags: list) -> str | None:
|
||||
for tag in tags:
|
||||
mapped = _GENDER_MAP.get(tag)
|
||||
if mapped:
|
||||
return mapped
|
||||
return None
|
||||
def _normalise_gender(value: str | None) -> str | None:
    """Map a raw gender marker (e.g. "f", "masc") to its canonical form.

    Mirrors ``_normalise_pos``: the input is stripped and lower-cased before
    the ``_GENDER_MAP`` lookup, so casing/whitespace variants of known
    markers still match.  Returns ``None`` for ``None`` input or for any
    marker not present in the map.
    """
    if value is None:
        return None
    return _GENDER_MAP.get(value.strip().lower())
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
|
|
@ -149,11 +159,13 @@ def _parse_entry(record: dict, lang_code: str) -> dict | None:
|
|||
return None
|
||||
|
||||
pos_raw = (record.get("pos") or "").strip()
|
||||
top_tags = record.get("tags") or []
|
||||
|
||||
lemma_id = uuid.uuid4()
|
||||
|
||||
_GENDER_TAGS = {"masculine", "feminine", "neuter"}
|
||||
gender: str | None = None
|
||||
senses = []
|
||||
sense_links = []
|
||||
for i, sense_record in enumerate(record.get("senses") or []):
|
||||
sense_id = uuid.uuid4()
|
||||
glosses = sense_record.get("glosses") or []
|
||||
|
|
@ -161,6 +173,12 @@ def _parse_entry(record: dict, lang_code: str) -> dict | None:
|
|||
topics = sense_record.get("topics") or []
|
||||
sense_tags = sense_record.get("tags") or []
|
||||
|
||||
if gender is None:
|
||||
for tag in sense_tags:
|
||||
if tag in _GENDER_TAGS:
|
||||
gender = tag
|
||||
break
|
||||
|
||||
senses.append(
|
||||
{
|
||||
"id": sense_id,
|
||||
|
|
@ -172,6 +190,18 @@ def _parse_entry(record: dict, lang_code: str) -> dict | None:
|
|||
}
|
||||
)
|
||||
|
||||
for link_pair in (sense_record.get("links") or []):
|
||||
if isinstance(link_pair, list) and len(link_pair) == 2:
|
||||
sense_links.append(
|
||||
{
|
||||
"id": uuid.uuid4(),
|
||||
"sense_id": sense_id,
|
||||
"link_text": link_pair[0],
|
||||
"link_target": link_pair[1],
|
||||
"target_lemma_id": None,
|
||||
}
|
||||
)
|
||||
|
||||
wordforms = []
|
||||
for f in record.get("forms") or []:
|
||||
form_text = (f.get("form") or "").strip()
|
||||
|
|
@ -194,10 +224,11 @@ def _parse_entry(record: dict, lang_code: str) -> dict | None:
|
|||
"language": lang_code,
|
||||
"pos_raw": pos_raw,
|
||||
"pos_normalised": _normalise_pos(pos_raw),
|
||||
"gender": _normalise_gender(top_tags),
|
||||
"tags": top_tags,
|
||||
"gender": gender,
|
||||
"tags": record.get("tags") or [],
|
||||
},
|
||||
"senses": senses,
|
||||
"sense_links": sense_links,
|
||||
"wordforms": wordforms,
|
||||
"raw": {
|
||||
"id": uuid.uuid4(),
|
||||
|
|
@ -216,6 +247,7 @@ def _parse_entry(record: dict, lang_code: str) -> dict | None:
|
|||
async def _flush_batch(conn: sa.ext.asyncio.AsyncConnection, batch: list[dict]) -> None:
|
||||
lemma_rows = [e["lemma"] for e in batch]
|
||||
sense_rows = [s for e in batch for s in e["senses"]]
|
||||
sense_link_rows = [lnk for e in batch for lnk in e["sense_links"]]
|
||||
wordform_rows = [w for e in batch for w in e["wordforms"]]
|
||||
raw_rows = [e["raw"] for e in batch]
|
||||
|
||||
|
|
@ -223,6 +255,8 @@ async def _flush_batch(conn: sa.ext.asyncio.AsyncConnection, batch: list[dict])
|
|||
await conn.execute(_lemma_table.insert(), lemma_rows)
|
||||
if sense_rows:
|
||||
await conn.execute(_sense_table.insert(), sense_rows)
|
||||
if sense_link_rows:
|
||||
await conn.execute(_sense_link_table.insert(), sense_link_rows)
|
||||
if wordform_rows:
|
||||
await conn.execute(_wordform_table.insert(), wordform_rows)
|
||||
if raw_rows:
|
||||
|
|
@ -231,6 +265,68 @@ async def _flush_batch(conn: sa.ext.asyncio.AsyncConnection, batch: list[dict])
|
|||
await conn.commit()
|
||||
|
||||
|
||||
# ISO 639-1 language code -> Wiktionary section name as it appears in kaikki
# link targets (e.g. "maboul#French").  _resolve_links uses this to recognise
# links that point at the language currently being imported; codes absent
# here are simply skipped by the resolution pass.
_LANG_SECTION_MAP: dict[str, str] = {
    "fr": "French",
    "de": "German",
    "es": "Spanish",
    "it": "Italian",
    "pt": "Portuguese",
}
|
||||
|
||||
|
||||
async def _resolve_links(conn: sa.ext.asyncio.AsyncConnection, lang_code: str) -> int:
    """Resolve target_lemma_id for sense links whose target matches lang_code.

    Links in kaikki data look like ``["maboul", "maboul#French"]``.  After all
    lemmas have been imported we can attempt to match the target word to a row
    in dictionary_lemma and store the foreign key.

    Returns the number of links resolved; 0 when the language has no known
    Wiktionary section name.
    """
    section = _LANG_SECTION_MAP.get(lang_code)
    if not section:
        return 0

    suffix = f"#{section}"

    # Fetch every still-unresolved link; language filtering happens in Python
    # below because the suffix match is simpler to express here.
    unresolved = (
        await conn.execute(
            sa.select(
                _sense_link_table.c.id,
                _sense_link_table.c.link_target,
            ).where(_sense_link_table.c.target_lemma_id.is_(None))
        )
    ).fetchall()

    # Keep links aimed at this language; strip "#Section" to get the bare word.
    candidates: list[tuple[uuid.UUID, str]] = [
        (row.id, row.link_target[: -len(suffix)])
        for row in unresolved
        if row.link_target.endswith(suffix)
    ]
    if not candidates:
        return 0

    # Single query mapping the distinct target words to existing lemma ids.
    lemma_rows = await conn.execute(
        sa.select(_lemma_table.c.id, _lemma_table.c.headword)
        .where(_lemma_table.c.language == lang_code)
        .where(_lemma_table.c.headword.in_(list({word for _, word in candidates})))
    )
    lemma_by_headword: dict[str, uuid.UUID] = {r.headword: r.id for r in lemma_rows}

    resolved = 0
    for link_id, word in candidates:
        lemma_id = lemma_by_headword.get(word)
        if lemma_id is None:
            continue
        await conn.execute(
            _sense_link_table.update()
            .where(_sense_link_table.c.id == link_id)
            .values(target_lemma_id=lemma_id)
        )
        resolved += 1

    await conn.commit()
    return resolved
|
||||
|
||||
|
||||
async def run_import(lang_code: str, batch_size: int = 1000) -> None:
|
||||
lang_file = _LANG_FILE_MAP.get(lang_code)
|
||||
if not lang_file:
|
||||
|
|
@ -299,6 +395,11 @@ async def run_import(lang_code: str, batch_size: int = 1000) -> None:
|
|||
total_lemmas += len(batch)
|
||||
|
||||
print(f"Done. Imported {total_lemmas} lemmas, skipped {skipped} lines.")
|
||||
|
||||
async with engine.connect() as conn:
|
||||
print("Resolving sense links...")
|
||||
resolved = await _resolve_links(conn, lang_code)
|
||||
print(f"Resolved {resolved} sense links.")
|
||||
finally:
|
||||
await engine.dispose()
|
||||
|
||||
|
|
|
|||
Loading…
Reference in a new issue