# language-learning-app/api/app/domain/services/dictionary_lookup_service.py

import uuid
from dataclasses import dataclass, field

from ..models.dictionary import Sense, Wordform
from ...outbound.postgres.repositories.dictionary_repository import DictionaryRepository


@dataclass
class TokenLookupResult:
    """Outcome of matching one spaCy token against the dictionary.

    Attributes:
        senses: Candidate senses offered for disambiguation; empty when
            nothing matched.
        wordform_id: Id of the ``dictionary_wordform`` row the surface form
            resolved to, so a vocab bank entry can be pre-linked to the exact
            inflected form.  ``None`` when no single form could be picked.
        matched_via: Name of the lookup strategy that produced the result.
        matched_wordforms: Wordform rows that matched the surface form;
            defaults to a fresh empty list per instance.
    """

    senses: list[Sense]
    wordform_id: str | None
    # One of: "wordform" | "lemma_pos" | "lemma" | "none"
    matched_via: str
    matched_wordforms: list[Wordform] = field(default_factory=list)


class DictionaryLookupService:
    """Maps a spaCy token (surface form, lemma, UD POS) to candidate dictionary senses.

    Three strategies are tried in order; the first one that yields a result wins.

    1. Wordform lookup (most precise):
        The inflected surface form is looked up in ``dictionary_wordform`` for the
        target language, e.g. "allons" → wordform row → lemma "aller".  If every
        matching row points at the same lemma, ``wordform_id`` is filled in on the
        result.

    2. Lemma + POS:
        Used only when no wordform row matched.  The spaCy lemma is matched against
        ``dictionary_lemma.headword`` and filtered by ``pos_normalised`` (UD tag),
        which cuts down false matches between homographs of different parts of
        speech.

    3. Lemma only:
        Last resort — the POS filter is dropped and every sense of the headword is
        returned, whatever its part of speech.

    Usage::

        service = DictionaryLookupService(PostgresDictionaryRepository(db))

        result = await service.lookup_token(
            surface="allons",
            spacy_lemma="aller",
            pos_ud="VERB",
            language="fr",
        )
        # result.senses  — candidate Sense rows for disambiguation
        # result.wordform_id  — pre-resolved wordform UUID string, or None
        # result.matched_via  — "wordform" | "lemma_pos" | "lemma" | "none"
    """

    def __init__(self, dict_repo: DictionaryRepository) -> None:
        """Store the dictionary repository that every lookup stage queries."""
        self.dict_repo = dict_repo

    async def lookup_token(
        self,
        surface: str,
        spacy_lemma: str,
        pos_ud: str,
        language: str,
    ) -> TokenLookupResult:
        """Find candidate senses for a spaCy token, falling back through three stages.

        Args:
            surface: Raw token text (e.g. ``"allons"``).
            spacy_lemma: spaCy's lemmatisation of the token (e.g. ``"aller"``).
            pos_ud: Universal Dependencies POS tag (e.g. ``"VERB"``).
            language: Target language code (e.g. ``"fr"``).

        Returns:
            A :class:`TokenLookupResult` holding the candidate senses and the name
            of the stage that matched.  ``wordform_id`` is only ever set by the
            wordform stage; when nothing matches at all the result has no senses
            and ``matched_via`` is ``"none"``.
        """
        # Stage 1: exact inflected form in the wordform table.
        wordform_hit = await self._lookup_by_wordform(surface, language)
        if wordform_hit is not None:
            return wordform_hit

        # Stage 2: spaCy lemma as headword, restricted to the token's UD POS.
        pos_filtered = await self.dict_repo.get_senses_for_headword_and_pos(
            spacy_lemma, language, pos_ud
        )
        if pos_filtered:
            return TokenLookupResult(
                senses=pos_filtered, wordform_id=None, matched_via="lemma_pos"
            )

        # Stage 3: spaCy lemma as headword, any POS.
        unfiltered = await self.dict_repo.get_senses_for_headword(spacy_lemma, language)
        if not unfiltered:
            return TokenLookupResult(senses=[], wordform_id=None, matched_via="none")
        return TokenLookupResult(senses=unfiltered, wordform_id=None, matched_via="lemma")

    async def _lookup_by_wordform(
        self, surface: str, language: str
    ) -> TokenLookupResult | None:
        """Run the wordform-table stage; return ``None`` if no wordform row matched."""
        form_rows = await self.dict_repo.get_wordforms_by_form(surface, language)
        if not form_rows:
            return None

        # Distinct lemma ids in first-seen order (dict keys keep insertion order).
        lemma_ids = list({row.lemma_id: None for row in form_rows})

        candidates: list[Sense] = []
        for lemma_id in lemma_ids:
            lemma_senses = await self.dict_repo.get_senses_for_lemma(uuid.UUID(lemma_id))
            candidates.extend(lemma_senses)

        # A wordform id is pre-assigned only when all matching rows belong to one
        # lemma.  With several lemmas in play the ambiguity has to be resolved by
        # the user, so no row is picked on their behalf.
        # NOTE(review): when several rows share that single lemma, the first row's
        # id is used — confirm the repository's ordering makes that choice meaningful.
        if len(lemma_ids) == 1:
            resolved_id = form_rows[0].id
        else:
            resolved_id = None

        return TokenLookupResult(
            senses=candidates,
            wordform_id=resolved_id,
            matched_via="wordform",
            matched_wordforms=form_rows,
        )