# language-learning-app/api/app/domain/services/dictionary_lookup_service.py

import uuid
from dataclasses import dataclass, field
from ..models.dictionary import Sense, Wordform
from ...outbound.postgres.repositories.dictionary_repository import DictionaryRepository
@dataclass
class TokenLookupResult:
    """The result of resolving a spaCy token against the dictionary.

    ``senses`` is the ranked list of candidate senses for disambiguation.
    ``wordform_id`` is set when the surface form was found in
    ``dictionary_wordform`` and all matches share a single lemma, allowing the
    vocab bank entry to be pre-linked to the exact inflected form.
    ``matched_via`` describes which lookup strategy succeeded.
    ``matched_wordforms`` carries the wordform rows matched by surface form
    (populated only by the wordform-table lookup stage).
    """

    # Candidate senses, in repository order, for user disambiguation.
    senses: list[Sense]
    # UUID string of the matched wordform row, or None when absent/ambiguous.
    wordform_id: str | None
    # Which lookup strategy produced this result.
    matched_via: str  # "wordform" | "lemma_pos" | "lemma" | "none"
    # All wordform rows that matched the surface form (empty for lemma lookups).
    matched_wordforms: list[Wordform] = field(default_factory=list)
class DictionaryLookupService:
    """Resolves a spaCy token (surface form + UD POS + lemma) to candidate
    dictionary senses via a three-stage fallback strategy.

    Stage 1 — wordform table lookup (most precise): searches
    ``dictionary_wordform`` for an exact match on the inflected surface form
    within the target language (e.g. the surface "allons" hits a wordform row
    whose lemma is "aller"). When exactly one lemma matches, ``wordform_id``
    is pre-populated on the result.

    Stage 2 — lemma + POS fallback: if no wordform row exists, tries the
    spaCy-provided lemma string against ``dictionary_lemma.headword`` filtered
    by ``pos_normalised`` (UD tag). Reduces false matches for homographs with
    different parts of speech.

    Stage 3 — lemma-only fallback: drops the POS filter as a last resort and
    returns all senses for the headword regardless of POS.

    Usage::

        service = DictionaryLookupService(PostgresDictionaryRepository(db))
        result = await service.lookup_token(
            surface="allons",
            spacy_lemma="aller",
            pos_ud="VERB",
            language="fr",
        )
        # result.senses — candidate Sense rows for disambiguation
        # result.wordform_id — pre-resolved wordform UUID string, or None
        # result.matched_via — "wordform" | "lemma_pos" | "lemma" | "none"
    """

    def __init__(self, dict_repo: DictionaryRepository) -> None:
        self.dict_repo = dict_repo

    async def lookup_token(
        self,
        surface: str,
        spacy_lemma: str,
        pos_ud: str,
        language: str,
    ) -> TokenLookupResult:
        """Resolve a spaCy token to candidate senses using the three-stage fallback.

        ``surface`` — the raw token text (e.g. ``"allons"``).
        ``spacy_lemma`` — spaCy's lemmatisation of the token (e.g. ``"aller"``).
        ``pos_ud`` — the Universal Dependencies POS tag (e.g. ``"VERB"``).
        ``language`` — the target language code (e.g. ``"fr"``).

        Returns a :class:`TokenLookupResult` with the candidate senses and,
        when the surface form matched the wordform table unambiguously, a
        ``wordform_id`` that can be stored on the vocab bank entry for precise
        inflection tracking.
        """
        # Stage 1: exact surface-form hit in the wordform table.
        matches = await self.dict_repo.get_wordforms_by_form(surface, language)
        if matches:
            # Deduplicate lemma ids while preserving first-seen order.
            lemma_ids = list({wf.lemma_id: None for wf in matches})
            candidate_senses: list[Sense] = [
                sense
                for lemma_id in lemma_ids
                for sense in await self.dict_repo.get_senses_for_lemma(uuid.UUID(lemma_id))
            ]
            # Pre-link the wordform only when every match points at the same
            # lemma — cross-lemma ambiguity must be resolved by the user, and
            # we cannot confidently pick one.
            linked_id = matches[0].id if len(lemma_ids) == 1 else None
            return TokenLookupResult(
                senses=candidate_senses,
                wordform_id=linked_id,
                matched_via="wordform",
                matched_wordforms=matches,
            )

        # Stage 2: spaCy lemma constrained by the UD POS tag.
        candidate_senses = await self.dict_repo.get_senses_for_headword_and_pos(
            spacy_lemma, language, pos_ud
        )
        if candidate_senses:
            return TokenLookupResult(
                senses=candidate_senses, wordform_id=None, matched_via="lemma_pos"
            )

        # Stage 3: spaCy lemma with the POS filter dropped.
        candidate_senses = await self.dict_repo.get_senses_for_headword(spacy_lemma, language)
        if candidate_senses:
            return TokenLookupResult(
                senses=candidate_senses, wordform_id=None, matched_via="lemma"
            )

        return TokenLookupResult(senses=[], wordform_id=None, matched_via="none")