feat: Create the Dictionary Lookup Service; methods for finding
Some checks are pending
/ test (push) Waiting to run
Some checks are pending
/ test (push) Waiting to run
vocabulary and words
This commit is contained in:
parent
27f7a7c3f3
commit
aa4987981d
4 changed files with 275 additions and 0 deletions
108
api/app/domain/services/dictionary_lookup_service.py
Normal file
108
api/app/domain/services/dictionary_lookup_service.py
Normal file
|
|
@ -0,0 +1,108 @@
|
||||||
|
import uuid
|
||||||
|
from dataclasses import dataclass, field
|
||||||
|
|
||||||
|
from ..models.dictionary import Sense, Wordform
|
||||||
|
from ...outbound.postgres.repositories.dictionary_repository import DictionaryRepository
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class TokenLookupResult:
    """Outcome of resolving one spaCy token against the dictionary.

    Attributes:
        senses: Ranked candidate senses to disambiguate between.
        wordform_id: Set when the surface form matched a row in
            ``dictionary_wordform``, allowing the vocab bank entry to be
            pre-linked to the exact inflected form.
        matched_via: Lookup strategy that produced the result —
            ``"wordform"``, ``"lemma_pos"``, ``"lemma"``, or ``"none"``.
        matched_wordforms: Raw wordform rows that matched the surface form.
    """

    senses: list[Sense]
    wordform_id: str | None
    matched_via: str  # "wordform" | "lemma_pos" | "lemma" | "none"
    matched_wordforms: list[Wordform] = field(default_factory=list)
|
||||||
|
|
||||||
|
|
||||||
|
class DictionaryLookupService:
    """Map a spaCy token (surface form + UD POS + lemma) onto candidate
    dictionary senses via a three-stage fallback.

    Stage 1 — wordform table lookup (most precise):
        The inflected surface form is searched in ``dictionary_wordform``
        within the target language. "allons" → wordform row → lemma "aller".
        When exactly one lemma matches, ``wordform_id`` is pre-populated on
        the result.

    Stage 2 — lemma + POS fallback:
        If no wordform row exists, the spaCy-provided lemma string is tried
        against ``dictionary_lemma.headword`` filtered by ``pos_normalised``
        (UD tag), which trims false matches for homographs with a different
        part of speech.

    Stage 3 — lemma-only fallback:
        The POS filter is dropped as a last resort and all senses for the
        headword are returned regardless of POS.

    Usage::

        service = DictionaryLookupService(PostgresDictionaryRepository(db))

        result = await service.lookup_token(
            surface="allons",
            spacy_lemma="aller",
            pos_ud="VERB",
            language="fr",
        )
        # result.senses — candidate Sense rows for disambiguation
        # result.wordform_id — pre-resolved wordform UUID string, or None
        # result.matched_via — "wordform" | "lemma_pos" | "lemma" | "none"
    """

    def __init__(self, dict_repo: DictionaryRepository) -> None:
        self.dict_repo = dict_repo

    async def _resolve_wordform_match(self, wordforms: list[Wordform]) -> TokenLookupResult:
        """Build a stage-1 result from the wordform rows matching a surface form."""
        # Deduplicate lemma ids while preserving first-seen order.
        lemma_ids: list[str] = []
        for wf in wordforms:
            if wf.lemma_id not in lemma_ids:
                lemma_ids.append(wf.lemma_id)

        candidate_senses: list[Sense] = []
        for lemma_id in lemma_ids:
            candidate_senses += await self.dict_repo.get_senses_for_lemma(uuid.UUID(lemma_id))

        # Pre-assign wordform_id only when every match points at a single
        # lemma — with wordforms from several lemmas the ambiguity must be
        # resolved by the user and no single row can be picked confidently.
        unambiguous = len(lemma_ids) == 1
        return TokenLookupResult(
            senses=candidate_senses,
            wordform_id=wordforms[0].id if unambiguous else None,
            matched_via="wordform",
            matched_wordforms=wordforms,
        )

    async def lookup_token(
        self,
        surface: str,
        spacy_lemma: str,
        pos_ud: str,
        language: str,
    ) -> TokenLookupResult:
        """Resolve a spaCy token to candidate senses using the three-stage fallback.

        ``surface`` is the raw token text (e.g. ``"allons"``);
        ``spacy_lemma`` is spaCy's lemmatisation of the token (e.g. ``"aller"``);
        ``pos_ud`` is the Universal Dependencies POS tag (e.g. ``"VERB"``);
        ``language`` is the target language code (e.g. ``"fr"``).

        Returns a :class:`TokenLookupResult` with the candidate senses and,
        when the surface form was found in the wordform table, a
        ``wordform_id`` that can be stored on the vocab bank entry for
        precise inflection tracking.
        """
        # Stage 1: inflected surface form against the wordform table.
        wordforms = await self.dict_repo.get_wordforms_by_form(surface, language)
        if wordforms:
            return await self._resolve_wordform_match(wordforms)

        # Stage 2: spaCy lemma narrowed by the UD POS tag.
        pos_senses = await self.dict_repo.get_senses_for_headword_and_pos(
            spacy_lemma, language, pos_ud
        )
        if pos_senses:
            return TokenLookupResult(senses=pos_senses, wordform_id=None, matched_via="lemma_pos")

        # Stage 3: lemma alone, POS filter dropped.
        lemma_senses = await self.dict_repo.get_senses_for_headword(spacy_lemma, language)
        if lemma_senses:
            return TokenLookupResult(senses=lemma_senses, wordform_id=None, matched_via="lemma")

        return TokenLookupResult(senses=[], wordform_id=None, matched_via="none")
|
||||||
|
|
@ -1,5 +1,6 @@
|
||||||
import uuid
|
import uuid
|
||||||
|
|
||||||
|
from ..models.dictionary import Sense
|
||||||
from ..models.vocab import LearnableWordBankEntry
|
from ..models.vocab import LearnableWordBankEntry
|
||||||
from ...outbound.postgres.repositories.vocab_repository import VocabRepository
|
from ...outbound.postgres.repositories.vocab_repository import VocabRepository
|
||||||
from ...outbound.postgres.repositories.dictionary_repository import DictionaryRepository
|
from ...outbound.postgres.repositories.dictionary_repository import DictionaryRepository
|
||||||
|
|
@ -35,6 +36,7 @@ class VocabService:
|
||||||
language_pair_id: uuid.UUID,
|
language_pair_id: uuid.UUID,
|
||||||
pathway: str,
|
pathway: str,
|
||||||
is_phrase: bool = False,
|
is_phrase: bool = False,
|
||||||
|
wordform_id: uuid.UUID | None = None,
|
||||||
source_article_id: uuid.UUID | None = None,
|
source_article_id: uuid.UUID | None = None,
|
||||||
) -> LearnableWordBankEntry:
|
) -> LearnableWordBankEntry:
|
||||||
"""Add a word or phrase to the user's vocab bank, automatically linking it to a
|
"""Add a word or phrase to the user's vocab bank, automatically linking it to a
|
||||||
|
|
@ -108,6 +110,59 @@ class VocabService:
|
||||||
entry_pathway=pathway,
|
entry_pathway=pathway,
|
||||||
is_phrase=False,
|
is_phrase=False,
|
||||||
sense_id=sense_id,
|
sense_id=sense_id,
|
||||||
|
wordform_id=wordform_id,
|
||||||
|
source_article_id=source_article_id,
|
||||||
|
disambiguation_status=status,
|
||||||
|
)
|
||||||
|
|
||||||
|
async def add_token_to_bank(
    self,
    user_id: uuid.UUID,
    surface_text: str,
    language_pair_id: uuid.UUID,
    senses: list[Sense],
    wordform_id: uuid.UUID | None,
    source_article_id: uuid.UUID | None = None,
) -> LearnableWordBankEntry:
    """Add a token from the NLP pipeline to the vocab bank using pre-resolved
    lookup results, skipping the redundant dictionary query that
    ``add_word_to_bank`` would otherwise perform.

    ``senses`` and ``wordform_id`` come from :class:`DictionaryLookupService`
    and are stored directly on the bank entry. Auto-resolution still applies:
    exactly one sense means ``auto_resolved``; anything else means ``pending``.

    Raises ``ValueError`` when ``language_pair_id`` does not exist.

    Usage::

        result = await lookup_service.lookup_token("allons", "aller", "VERB", "fr")
        wf_id = uuid.UUID(result.wordform_id) if result.wordform_id else None
        entry = await vocab_service.add_token_to_bank(
            user_id=user_id,
            surface_text="allons",
            language_pair_id=pair_id,
            senses=result.senses,
            wordform_id=wf_id,
        )
        # entry.wordform_id == result.wordform_id (pre-linked to "allons" wordform)
    """
    pair = await self.vocab_repo.get_language_pair(language_pair_id)
    if pair is None:
        raise ValueError(f"Language pair {language_pair_id} not found")

    # Exactly one candidate sense → link it and mark auto-resolved;
    # zero or several → leave unlinked and pending user disambiguation.
    sense_id = uuid.UUID(senses[0].id) if len(senses) == 1 else None
    status = "auto_resolved" if sense_id is not None else "pending"

    return await self.vocab_repo.add_entry(
        user_id=user_id,
        language_pair_id=language_pair_id,
        surface_text=surface_text,
        entry_pathway="nlp_extraction",
        wordform_id=wordform_id,
        sense_id=sense_id,
        source_article_id=source_article_id,
        disambiguation_status=status,
    )
|
||||||
|
|
|
||||||
|
|
@ -14,9 +14,12 @@ from ....domain.models.dictionary import Lemma, Sense, Wordform
|
||||||
|
|
||||||
class DictionaryRepository(Protocol):
|
class DictionaryRepository(Protocol):
|
||||||
async def get_senses_for_headword(self, headword: str, language: str) -> list[Sense]: ...
|
async def get_senses_for_headword(self, headword: str, language: str) -> list[Sense]: ...
|
||||||
|
async def get_senses_for_headword_and_pos(self, headword: str, language: str, pos_normalised: str) -> list[Sense]: ...
|
||||||
|
async def get_senses_for_lemma(self, lemma_id: uuid.UUID) -> list[Sense]: ...
|
||||||
async def find_senses_by_english_gloss(self, text: str, target_lang: str) -> list[Sense]: ...
|
async def find_senses_by_english_gloss(self, text: str, target_lang: str) -> list[Sense]: ...
|
||||||
async def get_sense(self, sense_id: uuid.UUID) -> Sense | None: ...
|
async def get_sense(self, sense_id: uuid.UUID) -> Sense | None: ...
|
||||||
async def get_lemma(self, lemma_id: uuid.UUID) -> Lemma | None: ...
|
async def get_lemma(self, lemma_id: uuid.UUID) -> Lemma | None: ...
|
||||||
|
async def get_wordforms_by_form(self, form: str, language: str) -> list[Wordform]: ...
|
||||||
async def get_wordforms_for_lemma(self, lemma_id: uuid.UUID) -> list[Wordform]: ...
|
async def get_wordforms_for_lemma(self, lemma_id: uuid.UUID) -> list[Wordform]: ...
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -99,6 +102,40 @@ class PostgresDictionaryRepository:
|
||||||
entity = result.scalar_one_or_none()
|
entity = result.scalar_one_or_none()
|
||||||
return _lemma_to_model(entity) if entity else None
|
return _lemma_to_model(entity) if entity else None
|
||||||
|
|
||||||
|
async def get_senses_for_headword_and_pos(
    self, headword: str, language: str, pos_normalised: str
) -> list[Sense]:
    """Return senses for ``headword`` in ``language`` restricted to one
    normalised (UD) POS tag, ordered by sense index."""
    stmt = (
        select(DictionarySenseEntity)
        .join(DictionaryLemmaEntity, DictionarySenseEntity.lemma_id == DictionaryLemmaEntity.id)
        .where(
            DictionaryLemmaEntity.headword == headword,
            DictionaryLemmaEntity.language == language,
            DictionaryLemmaEntity.pos_normalised == pos_normalised,
        )
        .order_by(DictionarySenseEntity.sense_index)
    )
    rows = (await self.db.execute(stmt)).scalars().all()
    return [_sense_to_model(row) for row in rows]
|
||||||
|
|
||||||
|
async def get_senses_for_lemma(self, lemma_id: uuid.UUID) -> list[Sense]:
    """Return every sense belonging to ``lemma_id``, ordered by sense index."""
    stmt = (
        select(DictionarySenseEntity)
        .where(DictionarySenseEntity.lemma_id == lemma_id)
        .order_by(DictionarySenseEntity.sense_index)
    )
    rows = (await self.db.execute(stmt)).scalars().all()
    return [_sense_to_model(row) for row in rows]
|
||||||
|
|
||||||
|
async def get_wordforms_by_form(self, form: str, language: str) -> list[Wordform]:
    """Return wordform rows whose inflected ``form`` matches exactly, limited
    to lemmas of the given ``language``."""
    stmt = (
        select(DictionaryWordformEntity)
        .join(DictionaryLemmaEntity, DictionaryWordformEntity.lemma_id == DictionaryLemmaEntity.id)
        .where(
            DictionaryWordformEntity.form == form,
            DictionaryLemmaEntity.language == language,
        )
    )
    rows = (await self.db.execute(stmt)).scalars().all()
    return [_wordform_to_model(row) for row in rows]
|
||||||
|
|
||||||
async def get_wordforms_for_lemma(self, lemma_id: uuid.UUID) -> list[Wordform]:
|
async def get_wordforms_for_lemma(self, lemma_id: uuid.UUID) -> list[Wordform]:
|
||||||
result = await self.db.execute(
|
result = await self.db.execute(
|
||||||
select(DictionaryWordformEntity).where(
|
select(DictionaryWordformEntity).where(
|
||||||
|
|
|
||||||
|
|
@ -5,6 +5,7 @@ from pydantic import BaseModel
|
||||||
from sqlalchemy.ext.asyncio import AsyncSession
|
from sqlalchemy.ext.asyncio import AsyncSession
|
||||||
|
|
||||||
from ...auth import verify_token
|
from ...auth import verify_token
|
||||||
|
from ...domain.services.dictionary_lookup_service import DictionaryLookupService, TokenLookupResult
|
||||||
from ...domain.services.vocab_service import VocabService
|
from ...domain.services.vocab_service import VocabService
|
||||||
from ...outbound.postgres.database import get_db
|
from ...outbound.postgres.database import get_db
|
||||||
from ...outbound.postgres.repositories.dictionary_repository import PostgresDictionaryRepository
|
from ...outbound.postgres.repositories.dictionary_repository import PostgresDictionaryRepository
|
||||||
|
|
@ -21,6 +22,28 @@ class AddWordRequest(BaseModel):
|
||||||
source_article_id: str | None = None
|
source_article_id: str | None = None
|
||||||
|
|
||||||
|
|
||||||
|
class AddFromTokenRequest(BaseModel):
    """Request body for adding a spaCy token to the vocab bank via dictionary lookup."""

    # UUID (as string) of the user's language pair the entry belongs to.
    language_pair_id: str
    # Raw inflected token text, e.g. "allons".
    surface: str
    # spaCy's lemmatisation of the token, e.g. "aller".
    spacy_lemma: str
    # Universal Dependencies POS tag, e.g. "VERB".
    pos_ud: str
    # Target language code, e.g. "fr".
    language: str
    # Optional UUID (as string) of the article the token was encountered in.
    source_article_id: str | None = None
|
||||||
|
|
||||||
|
|
||||||
|
class SenseCandidateResponse(BaseModel):
    """One candidate dictionary sense offered to the client for disambiguation."""

    # Sense UUID as a string.
    id: str
    gloss: str
    topics: list[str]
    tags: list[str]
|
||||||
|
|
||||||
|
|
||||||
|
class FromTokenResponse(BaseModel):
    """Response for the from-token endpoint: the created entry plus lookup metadata."""

    # Forward reference — WordBankEntryResponse is defined elsewhere in this module.
    entry: "WordBankEntryResponse"
    # Candidate senses returned by the lookup; empty when nothing matched.
    sense_candidates: list[SenseCandidateResponse]
    # Which lookup strategy succeeded: "wordform" | "lemma_pos" | "lemma" | "none".
    matched_via: str
|
||||||
|
|
||||||
|
|
||||||
class SetSenseRequest(BaseModel):
|
class SetSenseRequest(BaseModel):
|
||||||
sense_id: str
|
sense_id: str
|
||||||
|
|
||||||
|
|
@ -80,6 +103,58 @@ async def add_word(
|
||||||
return _to_response(entry)
|
return _to_response(entry)
|
||||||
|
|
||||||
|
|
||||||
|
@router.post("/from-token", response_model=FromTokenResponse, status_code=201)
async def add_from_token(
    request: AddFromTokenRequest,
    db: AsyncSession = Depends(get_db),
    token_data: dict = Depends(verify_token),
) -> FromTokenResponse:
    """Resolve a spaCy token against the dictionary and add it to the vocab bank.

    Runs the three-stage :class:`DictionaryLookupService` lookup, stores the
    entry with any pre-resolved sense/wordform, and returns the candidate
    senses so the client can prompt the user for disambiguation when needed.

    Raises HTTP 400 for malformed UUIDs in the request and HTTP 404 when the
    language pair does not exist.
    """
    user_id = uuid.UUID(token_data["sub"])

    try:
        language_pair_id = uuid.UUID(request.language_pair_id)
    except ValueError as exc:
        # Chain with `from exc` so the original parse failure stays in the traceback.
        raise HTTPException(status_code=400, detail="Invalid language_pair_id") from exc

    source_article_id = None
    if request.source_article_id:
        try:
            source_article_id = uuid.UUID(request.source_article_id)
        except ValueError as exc:
            raise HTTPException(status_code=400, detail="Invalid source_article_id") from exc

    lookup_service = DictionaryLookupService(PostgresDictionaryRepository(db))
    result: TokenLookupResult = await lookup_service.lookup_token(
        surface=request.surface,
        spacy_lemma=request.spacy_lemma,
        pos_ud=request.pos_ud,
        language=request.language,
    )

    # Convert the lookup's string UUID (if any) for storage on the entry.
    wordform_id = uuid.UUID(result.wordform_id) if result.wordform_id else None

    try:
        entry = await _service(db).add_token_to_bank(
            user_id=user_id,
            surface_text=request.surface,
            language_pair_id=language_pair_id,
            senses=result.senses,
            wordform_id=wordform_id,
            source_article_id=source_article_id,
        )
    except ValueError as exc:
        # The service signals a missing language pair with ValueError.
        raise HTTPException(status_code=404, detail=str(exc)) from exc

    candidates = [
        SenseCandidateResponse(id=s.id, gloss=s.gloss, topics=s.topics, tags=s.tags)
        for s in result.senses
    ]
    return FromTokenResponse(
        entry=_to_response(entry),
        sense_candidates=candidates,
        matched_via=result.matched_via,
    )
||||||
|
|
||||||
|
|
||||||
@router.get("", response_model=list[WordBankEntryResponse])
|
@router.get("", response_model=list[WordBankEntryResponse])
|
||||||
async def list_entries(
|
async def list_entries(
|
||||||
language_pair_id: str,
|
language_pair_id: str,
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue