108 lines
4.6 KiB
Python
108 lines
4.6 KiB
Python
import uuid
|
|
from dataclasses import dataclass, field
|
|
|
|
from ..models.dictionary import Sense, Wordform
|
|
from ...outbound.postgres.repositories.dictionary_repository import DictionaryRepository
|
|
|
|
|
|
@dataclass
class TokenLookupResult:
    """Outcome of matching one spaCy token against the dictionary.

    Attributes:
        senses: Ranked candidate senses offered for disambiguation.
        wordform_id: Identifier of the matching ``dictionary_wordform`` row
            when the surface form was found there, so the vocab bank entry
            can be pre-linked to the exact inflected form; ``None`` otherwise.
        matched_via: Name of the lookup strategy that produced the result,
            one of ``"wordform"``, ``"lemma_pos"``, ``"lemma"`` or ``"none"``.
        matched_wordforms: Wordform rows found for the surface form. Defaults
            to a fresh empty list for results that carry no wordform rows.
    """

    senses: list[Sense]
    wordform_id: str | None
    matched_via: str  # one of: "wordform", "lemma_pos", "lemma", "none"
    # default_factory gives every instance its own list (no shared mutable default).
    matched_wordforms: list[Wordform] = field(default_factory=list)
|
|
|
|
|
|
class DictionaryLookupService:
    """Map a spaCy token (surface form, lemma, UD POS) onto candidate senses.

    Three strategies are tried in order; the first one that yields a hit wins.

    1. Wordform lookup (most precise).
       The inflected surface form is searched in ``dictionary_wordform`` for
       the target language, e.g. "allons" → wordform row → lemma "aller".
       If every matching row belongs to one and the same lemma, the result
       carries a pre-populated ``wordform_id``.

    2. Lemma + POS.
       Used when no wordform row exists. The spaCy lemma is matched against
       ``dictionary_lemma.headword`` and filtered by ``pos_normalised`` (the
       UD tag), which cuts down false matches for homographs that differ in
       part of speech.

    3. Lemma only.
       Last resort: the POS filter is dropped and all senses for the headword
       are returned regardless of POS.

    Usage::

        service = DictionaryLookupService(PostgresDictionaryRepository(db))

        result = await service.lookup_token(
            surface="allons",
            spacy_lemma="aller",
            pos_ud="VERB",
            language="fr",
        )
        # result.senses       — candidate Sense rows for disambiguation
        # result.wordform_id  — pre-resolved wordform UUID string, or None
        # result.matched_via  — "wordform" | "lemma_pos" | "lemma" | "none"
    """

    def __init__(self, dict_repo: DictionaryRepository) -> None:
        """Store the dictionary repository used by every lookup stage."""
        self.dict_repo = dict_repo

    async def lookup_token(
        self,
        surface: str,
        spacy_lemma: str,
        pos_ud: str,
        language: str,
    ) -> TokenLookupResult:
        """Resolve one spaCy token to candidate senses via the staged fallback.

        Args:
            surface: Raw token text (e.g. ``"allons"``).
            spacy_lemma: spaCy's lemmatisation of the token (e.g. ``"aller"``).
            pos_ud: Universal Dependencies POS tag (e.g. ``"VERB"``).
            language: Target language code (e.g. ``"fr"``).

        Returns:
            A :class:`TokenLookupResult` holding the candidate senses. When
            the surface form was found in the wordform table and maps to a
            single lemma, ``wordform_id`` is set so it can be stored on the
            vocab bank entry for precise inflection tracking. ``matched_via``
            is ``"none"`` and ``senses`` is empty when every stage misses.
        """
        repo = self.dict_repo

        # --- Stage 1: exact inflected surface form in the wordform table ---
        matches = await repo.get_wordforms_by_form(surface, language)
        if matches:
            # Ordered de-duplication: dict keys keep first-seen order.
            lemma_ids = list({wf.lemma_id: None for wf in matches})

            # Senses are fetched lemma by lemma, one awaited call at a time.
            candidates: list[Sense] = [
                sense
                for lemma_id in lemma_ids
                for sense in await repo.get_senses_for_lemma(uuid.UUID(lemma_id))
            ]

            # A wordform id is only pinned when all matches share one lemma
            # (the first matching row is used). With several lemmas in play
            # the ambiguity is left for the user to resolve.
            if len(lemma_ids) == 1:
                resolved_wordform_id = matches[0].id
            else:
                resolved_wordform_id = None

            return TokenLookupResult(
                senses=candidates,
                wordform_id=resolved_wordform_id,
                matched_via="wordform",
                matched_wordforms=matches,
            )

        # --- Stage 2: spaCy lemma restricted by UD POS ---
        by_lemma_and_pos = await repo.get_senses_for_headword_and_pos(
            spacy_lemma, language, pos_ud
        )
        if by_lemma_and_pos:
            return TokenLookupResult(
                senses=by_lemma_and_pos,
                wordform_id=None,
                matched_via="lemma_pos",
            )

        # --- Stage 3: spaCy lemma alone, POS filter dropped ---
        by_lemma = await repo.get_senses_for_headword(spacy_lemma, language)
        if not by_lemma:
            # Nothing matched at any stage.
            return TokenLookupResult(senses=[], wordform_id=None, matched_via="none")

        return TokenLookupResult(senses=by_lemma, wordform_id=None, matched_via="lemma")
|