import uuid
from dataclasses import dataclass, field

from ..models.dictionary import Sense, Wordform
from ...outbound.postgres.repositories.dictionary_repository import DictionaryRepository


@dataclass
class TokenLookupResult:
    """The result of resolving a spaCy token against the dictionary.

    ``senses`` is the ranked list of candidate senses for disambiguation.
    ``wordform_id`` is set when the surface form was found in
    ``dictionary_wordform``, allowing the vocab bank entry to be pre-linked to
    the exact inflected form. ``matched_via`` describes which lookup strategy
    succeeded.
    """

    senses: list[Sense]
    wordform_id: str | None
    matched_via: str  # "wordform" | "lemma_pos" | "lemma" | "none"
    matched_wordforms: list[Wordform] = field(default_factory=list)


class DictionaryLookupService:
    """Resolves a spaCy token (surface form + UD POS + lemma) to candidate
    dictionary senses, using a three-stage fallback strategy.

    Stage 1 — wordform table lookup (most precise):
        Searches ``dictionary_wordform`` for an exact match on the inflected
        surface form within the target language. "allons" → wordform row →
        lemma "aller". When exactly one lemma matches, ``wordform_id`` is
        pre-populated on the result.

    Stage 2 — lemma + POS fallback:
        If no wordform row exists, tries the spaCy-provided lemma string
        against ``dictionary_lemma.headword`` filtered by ``pos_normalised``
        (UD tag). Reduces false matches for homographs with different parts
        of speech.

    Stage 3 — lemma-only fallback:
        Drops the POS filter as a last resort. Returns all senses for the
        headword regardless of POS.

    Usage::

        service = DictionaryLookupService(PostgresDictionaryRepository(db))
        result = await service.lookup_token(
            surface="allons",
            spacy_lemma="aller",
            pos_ud="VERB",
            language="fr",
        )
        # result.senses — candidate Sense rows for disambiguation
        # result.wordform_id — pre-resolved wordform UUID string, or None
        # result.matched_via — "wordform" | "lemma_pos" | "lemma" | "none"
    """

    def __init__(self, dict_repo: DictionaryRepository) -> None:
        self.dict_repo = dict_repo

    async def lookup_token(
        self,
        surface: str,
        spacy_lemma: str,
        pos_ud: str,
        language: str,
    ) -> TokenLookupResult:
        """Resolve a spaCy token to candidate senses using a three-stage fallback.

        ``surface`` is the raw token text (e.g. ``"allons"``).
        ``spacy_lemma`` is spaCy's lemmatisation of the token (e.g. ``"aller"``).
        ``pos_ud`` is the Universal Dependencies POS tag (e.g. ``"VERB"``).
        ``language`` is the target language code (e.g. ``"fr"``).

        Returns a :class:`TokenLookupResult` with the candidate senses and,
        when the surface form was found in the wordform table, a
        ``wordform_id`` that can be stored on the vocab bank entry for
        precise inflection tracking.
        """
        # Stage 1: wordform table lookup by inflected surface form
        wordforms = await self.dict_repo.get_wordforms_by_form(surface, language)
        if wordforms:
            return await self._resolve_wordform_match(wordforms)

        # Stage 2: spaCy lemma + UD POS filter
        senses = await self.dict_repo.get_senses_for_headword_and_pos(
            spacy_lemma, language, pos_ud
        )
        if senses:
            return TokenLookupResult(senses=senses, wordform_id=None, matched_via="lemma_pos")

        # Stage 3: spaCy lemma only — no POS filter
        senses = await self.dict_repo.get_senses_for_headword(spacy_lemma, language)
        if senses:
            return TokenLookupResult(senses=senses, wordform_id=None, matched_via="lemma")

        return TokenLookupResult(senses=[], wordform_id=None, matched_via="none")

    async def _resolve_wordform_match(self, wordforms: list[Wordform]) -> TokenLookupResult:
        """Build the Stage-1 result from the wordform rows matching the surface form.

        Collects senses for every distinct lemma the matched wordforms point
        at, preserving first-seen order.
        """
        # dict.fromkeys deduplicates while keeping insertion order — several
        # wordform rows can share one lemma (e.g. morphological variants).
        unique_lemma_ids = list(dict.fromkeys(wf.lemma_id for wf in wordforms))
        senses: list[Sense] = []
        for lemma_id in unique_lemma_ids:
            senses.extend(await self.dict_repo.get_senses_for_lemma(uuid.UUID(lemma_id)))

        # Only pre-assign wordform_id when all matched wordforms belong to a
        # single lemma — if rows from different lemmas matched, the ambiguity
        # must be resolved by the user and we cannot confidently pick one.
        # NOTE(review): when one lemma has several matching wordform rows
        # (distinct features for the same surface form), the first row is
        # chosen arbitrarily — confirm this is acceptable for inflection
        # tracking upstream.
        wordform_id = wordforms[0].id if len(unique_lemma_ids) == 1 else None
        return TokenLookupResult(
            senses=senses,
            wordform_id=wordform_id,
            matched_via="wordform",
            matched_wordforms=wordforms,
        )