# language-learning-app/api/app/domain/services/dictionary_lookup_service.py

import uuid
from dataclasses import dataclass, field
from ..models.dictionary import Sense, Wordform
from ...outbound.postgres.repositories.dictionary_repository import DictionaryRepository
@dataclass
class TokenLookupResult:
    """The result of resolving a spaCy token against the dictionary.

    ``senses`` is the ranked list of candidate senses for disambiguation.
    ``wordform_id`` is set when the surface form was found in
    ``dictionary_wordform`` and all matches share a single lemma, allowing the
    vocab bank entry to be pre-linked to the exact inflected form.
    ``matched_via`` describes which lookup strategy succeeded.
    ``matched_wordforms`` carries the wordform rows matched by surface form
    (populated only by the wordform-table lookup stage).
    """

    # Candidate senses, in repository order, for user disambiguation.
    senses: list[Sense]
    # UUID string of the matched wordform row, or None when absent/ambiguous.
    wordform_id: str | None
    # Which lookup strategy produced this result.
    matched_via: str  # "wordform" | "lemma_pos" | "lemma" | "none"
    # All wordform rows that matched the surface form (empty for lemma lookups).
    matched_wordforms: list[Wordform] = field(default_factory=list)
class DictionaryLookupService:
    """Resolves a spaCy token (surface form + UD POS + lemma) to candidate
    dictionary senses via a three-stage fallback strategy.

    Stage 1 — wordform table lookup (most precise): searches
    ``dictionary_wordform`` for an exact match on the inflected surface form
    within the target language (e.g. the surface "allons" hits a wordform row
    whose lemma is "aller"). When exactly one lemma matches, ``wordform_id``
    is pre-populated on the result.

    Stage 2 — lemma + POS fallback: if no wordform row exists, tries the
    spaCy-provided lemma string against ``dictionary_lemma.headword`` filtered
    by ``pos_normalised`` (UD tag). Reduces false matches for homographs with
    different parts of speech.

    Stage 3 — lemma-only fallback: drops the POS filter as a last resort and
    returns all senses for the headword regardless of POS.

    Usage::

        service = DictionaryLookupService(PostgresDictionaryRepository(db))
        result = await service.lookup_token(
            surface="allons",
            spacy_lemma="aller",
            pos_ud="VERB",
            language="fr",
        )
        # result.senses — candidate Sense rows for disambiguation
        # result.wordform_id — pre-resolved wordform UUID string, or None
        # result.matched_via — "wordform" | "lemma_pos" | "lemma" | "none"
    """

    def __init__(self, dict_repo: DictionaryRepository) -> None:
        self.dict_repo = dict_repo

    async def lookup_token(
        self,
        surface: str,
        spacy_lemma: str,
        pos_ud: str,
        language: str,
    ) -> TokenLookupResult:
        """Resolve a spaCy token to candidate senses using the three-stage fallback.

        ``surface`` — the raw token text (e.g. ``"allons"``).
        ``spacy_lemma`` — spaCy's lemmatisation of the token (e.g. ``"aller"``).
        ``pos_ud`` — the Universal Dependencies POS tag (e.g. ``"VERB"``).
        ``language`` — the target language code (e.g. ``"fr"``).

        Returns a :class:`TokenLookupResult` with the candidate senses and,
        when the surface form matched the wordform table unambiguously, a
        ``wordform_id`` that can be stored on the vocab bank entry for precise
        inflection tracking.
        """
        # Stage 1: exact surface-form hit in the wordform table.
        matches = await self.dict_repo.get_wordforms_by_form(surface, language)
        if matches:
            # Deduplicate lemma ids while preserving first-seen order.
            lemma_ids = list({wf.lemma_id: None for wf in matches})
            candidate_senses: list[Sense] = [
                sense
                for lemma_id in lemma_ids
                for sense in await self.dict_repo.get_senses_for_lemma(uuid.UUID(lemma_id))
            ]
            # Pre-link the wordform only when every match points at the same
            # lemma — cross-lemma ambiguity must be resolved by the user, and
            # we cannot confidently pick one.
            linked_id = matches[0].id if len(lemma_ids) == 1 else None
            return TokenLookupResult(
                senses=candidate_senses,
                wordform_id=linked_id,
                matched_via="wordform",
                matched_wordforms=matches,
            )

        # Stage 2: spaCy lemma constrained by the UD POS tag.
        candidate_senses = await self.dict_repo.get_senses_for_headword_and_pos(
            spacy_lemma, language, pos_ud
        )
        if candidate_senses:
            return TokenLookupResult(
                senses=candidate_senses, wordform_id=None, matched_via="lemma_pos"
            )

        # Stage 3: spaCy lemma with the POS filter dropped.
        candidate_senses = await self.dict_repo.get_senses_for_headword(spacy_lemma, language)
        if candidate_senses:
            return TokenLookupResult(
                senses=candidate_senses, wordform_id=None, matched_via="lemma"
            )

        return TokenLookupResult(senses=[], wordform_id=None, matched_via="none")