feat: Create the Dictionary Lookup Service; methods for finding
Some checks are pending
/ test (push) Waiting to run

vocabulary and words
This commit is contained in:
wilson 2026-04-10 07:11:57 +01:00
parent 27f7a7c3f3
commit aa4987981d
4 changed files with 275 additions and 0 deletions

View file

@ -0,0 +1,108 @@
import uuid
from dataclasses import dataclass, field
from ..models.dictionary import Sense, Wordform
from ...outbound.postgres.repositories.dictionary_repository import DictionaryRepository
@dataclass
class TokenLookupResult:
    """The result of resolving a spaCy token against the dictionary.

    ``senses`` is the ranked list of candidate senses for disambiguation.
    ``wordform_id`` is set when the surface form was found in ``dictionary_wordform``,
    allowing the vocab bank entry to be pre-linked to the exact inflected form.
    ``matched_via`` describes which lookup strategy succeeded.
    """

    # Candidate senses to present to the user (or auto-resolve when exactly one).
    senses: list[Sense]
    # UUID string of the matched wordform row, or None when no unambiguous match.
    wordform_id: str | None
    matched_via: str  # "wordform" | "lemma_pos" | "lemma" | "none"
    # Raw wordform rows from stage 1; empty when a lemma fallback matched instead.
    matched_wordforms: list[Wordform] = field(default_factory=list)
class DictionaryLookupService:
    """Maps a spaCy token (surface text, lemma, UD POS tag) onto candidate
    dictionary senses via three progressively looser lookups.

    1. Inflected-form match in ``dictionary_wordform`` (most precise): the raw
       surface, e.g. "allons", is matched against wordform rows for the target
       language; "allons" resolves through its wordform row to lemma "aller".
       When the matches all point at a single lemma, ``wordform_id`` is
       pre-populated on the result.
    2. spaCy lemma + POS: the lemma string is tried against
       ``dictionary_lemma.headword`` filtered by ``pos_normalised`` (UD tag),
       which keeps homographs with different parts of speech apart.
    3. spaCy lemma only: the POS filter is dropped as a last resort and every
       sense for the headword is returned.

    Usage::

        service = DictionaryLookupService(PostgresDictionaryRepository(db))
        result = await service.lookup_token(
            surface="allons",
            spacy_lemma="aller",
            pos_ud="VERB",
            language="fr",
        )
        # result.senses — candidate Sense rows for disambiguation
        # result.wordform_id — pre-resolved wordform UUID string, or None
        # result.matched_via — "wordform" | "lemma_pos" | "lemma" | "none"
    """

    def __init__(self, dict_repo: DictionaryRepository) -> None:
        self.dict_repo = dict_repo

    async def lookup_token(
        self,
        surface: str,
        spacy_lemma: str,
        pos_ud: str,
        language: str,
    ) -> TokenLookupResult:
        """Resolve a spaCy token to candidate senses using a three-stage fallback.

        ``surface`` is the raw token text (e.g. ``"allons"``).
        ``spacy_lemma`` is spaCy's lemmatisation of the token (e.g. ``"aller"``).
        ``pos_ud`` is the Universal Dependencies POS tag (e.g. ``"VERB"``).
        ``language`` is the target language code (e.g. ``"fr"``).

        Returns a :class:`TokenLookupResult` with the candidate senses and, when
        the surface form was found in the wordform table, a ``wordform_id`` that
        can be stored on the vocab bank entry for precise inflection tracking.
        """
        # Stage 1 — exact surface-form hit in the wordform table.
        forms = await self.dict_repo.get_wordforms_by_form(surface, language)
        if forms:
            # Distinct lemma ids, first-seen order preserved.
            seen: set[str] = set()
            lemma_ids: list[str] = []
            for wf in forms:
                if wf.lemma_id not in seen:
                    seen.add(wf.lemma_id)
                    lemma_ids.append(wf.lemma_id)
            candidate_senses: list[Sense] = []
            for lid in lemma_ids:
                candidate_senses.extend(
                    await self.dict_repo.get_senses_for_lemma(uuid.UUID(lid))
                )
            # Pre-assign the wordform only when every match shares one lemma;
            # matches spanning several lemmas are ambiguous and must be left
            # for the user to resolve.
            single_lemma = len(lemma_ids) == 1
            return TokenLookupResult(
                senses=candidate_senses,
                wordform_id=forms[0].id if single_lemma else None,
                matched_via="wordform",
                matched_wordforms=forms,
            )

        # Stage 2 — spaCy lemma constrained by the UD POS tag.
        by_pos = await self.dict_repo.get_senses_for_headword_and_pos(
            spacy_lemma, language, pos_ud
        )
        if by_pos:
            return TokenLookupResult(senses=by_pos, wordform_id=None, matched_via="lemma_pos")

        # Stage 3 — spaCy lemma with no POS filter.
        by_lemma = await self.dict_repo.get_senses_for_headword(spacy_lemma, language)
        if by_lemma:
            return TokenLookupResult(senses=by_lemma, wordform_id=None, matched_via="lemma")

        return TokenLookupResult(senses=[], wordform_id=None, matched_via="none")

View file

@ -1,5 +1,6 @@
import uuid
from ..models.dictionary import Sense
from ..models.vocab import LearnableWordBankEntry
from ...outbound.postgres.repositories.vocab_repository import VocabRepository
from ...outbound.postgres.repositories.dictionary_repository import DictionaryRepository
@ -35,6 +36,7 @@ class VocabService:
language_pair_id: uuid.UUID,
pathway: str,
is_phrase: bool = False,
wordform_id: uuid.UUID | None = None,
source_article_id: uuid.UUID | None = None,
) -> LearnableWordBankEntry:
"""Add a word or phrase to the user's vocab bank, automatically linking it to a
@ -108,6 +110,59 @@ class VocabService:
entry_pathway=pathway,
is_phrase=False,
sense_id=sense_id,
wordform_id=wordform_id,
source_article_id=source_article_id,
disambiguation_status=status,
)
async def add_token_to_bank(
    self,
    user_id: uuid.UUID,
    surface_text: str,
    language_pair_id: uuid.UUID,
    senses: list[Sense],
    wordform_id: uuid.UUID | None,
    source_article_id: uuid.UUID | None = None,
) -> LearnableWordBankEntry:
    """Add a token from the NLP pipeline to the vocab bank using pre-resolved
    lookup results, avoiding the dictionary query ``add_word_to_bank`` would
    otherwise repeat.

    ``senses`` and ``wordform_id`` come from :class:`DictionaryLookupService`
    and are stored directly on the bank entry. Auto-resolution still applies:
    exactly one sense means ``auto_resolved``; anything else means ``pending``.

    Usage::

        result = await lookup_service.lookup_token("allons", "aller", "VERB", "fr")
        wf_id = uuid.UUID(result.wordform_id) if result.wordform_id else None
        entry = await vocab_service.add_token_to_bank(
            user_id=user_id,
            surface_text="allons",
            language_pair_id=pair_id,
            senses=result.senses,
            wordform_id=wf_id,
        )
        # entry.wordform_id == result.wordform_id (pre-linked to "allons" wordform)

    Raises ``ValueError`` when the language pair does not exist.
    """
    # Validate the pair before writing anything.
    if await self.vocab_repo.get_language_pair(language_pair_id) is None:
        raise ValueError(f"Language pair {language_pair_id} not found")

    # A single candidate sense can be linked immediately; otherwise the entry
    # stays pending until the user disambiguates.
    auto = len(senses) == 1
    resolved_sense: uuid.UUID | None = uuid.UUID(senses[0].id) if auto else None

    return await self.vocab_repo.add_entry(
        user_id=user_id,
        language_pair_id=language_pair_id,
        surface_text=surface_text,
        entry_pathway="nlp_extraction",
        wordform_id=wordform_id,
        sense_id=resolved_sense,
        source_article_id=source_article_id,
        disambiguation_status="auto_resolved" if auto else "pending",
    )

View file

@ -14,9 +14,12 @@ from ....domain.models.dictionary import Lemma, Sense, Wordform
class DictionaryRepository(Protocol):
    """Structural (duck-typed) port for dictionary reads: lemmas, senses and
    wordforms. Domain services depend on this Protocol; the Postgres
    implementation in this module satisfies it.
    """

    # Senses for a headword in a language, across every POS.
    async def get_senses_for_headword(self, headword: str, language: str) -> list[Sense]: ...
    # Senses for a headword restricted to a normalised (UD) POS tag.
    async def get_senses_for_headword_and_pos(self, headword: str, language: str, pos_normalised: str) -> list[Sense]: ...
    # Senses attached directly to a known lemma id.
    async def get_senses_for_lemma(self, lemma_id: uuid.UUID) -> list[Sense]: ...
    # NOTE(review): presumably matches senses by English gloss text — confirm
    # against the implementation before relying on exact match semantics.
    async def find_senses_by_english_gloss(self, text: str, target_lang: str) -> list[Sense]: ...
    async def get_sense(self, sense_id: uuid.UUID) -> Sense | None: ...
    async def get_lemma(self, lemma_id: uuid.UUID) -> Lemma | None: ...
    # Inflected-form lookup used by stage 1 of token resolution.
    async def get_wordforms_by_form(self, form: str, language: str) -> list[Wordform]: ...
    async def get_wordforms_for_lemma(self, lemma_id: uuid.UUID) -> list[Wordform]: ...
@ -99,6 +102,40 @@ class PostgresDictionaryRepository:
entity = result.scalar_one_or_none()
return _lemma_to_model(entity) if entity else None
async def get_senses_for_headword_and_pos(
    self, headword: str, language: str, pos_normalised: str
) -> list[Sense]:
    """Return senses whose lemma matches *headword*, *language* and the
    normalised POS tag, ordered by sense index."""
    stmt = (
        select(DictionarySenseEntity)
        .join(
            DictionaryLemmaEntity,
            DictionarySenseEntity.lemma_id == DictionaryLemmaEntity.id,
        )
        .where(DictionaryLemmaEntity.headword == headword)
        .where(DictionaryLemmaEntity.language == language)
        .where(DictionaryLemmaEntity.pos_normalised == pos_normalised)
        .order_by(DictionarySenseEntity.sense_index)
    )
    rows = (await self.db.execute(stmt)).scalars().all()
    return [_sense_to_model(row) for row in rows]
async def get_senses_for_lemma(self, lemma_id: uuid.UUID) -> list[Sense]:
    """Return every sense belonging to *lemma_id*, ordered by sense index."""
    stmt = (
        select(DictionarySenseEntity)
        .where(DictionarySenseEntity.lemma_id == lemma_id)
        .order_by(DictionarySenseEntity.sense_index)
    )
    rows = (await self.db.execute(stmt)).scalars().all()
    return [_sense_to_model(row) for row in rows]
async def get_wordforms_by_form(self, form: str, language: str) -> list[Wordform]:
    """Return wordform rows whose surface *form* matches exactly, restricted
    (via the lemma join) to the given *language*."""
    stmt = (
        select(DictionaryWordformEntity)
        .join(
            DictionaryLemmaEntity,
            DictionaryWordformEntity.lemma_id == DictionaryLemmaEntity.id,
        )
        .where(DictionaryWordformEntity.form == form)
        .where(DictionaryLemmaEntity.language == language)
    )
    rows = (await self.db.execute(stmt)).scalars().all()
    return [_wordform_to_model(row) for row in rows]
async def get_wordforms_for_lemma(self, lemma_id: uuid.UUID) -> list[Wordform]:
result = await self.db.execute(
select(DictionaryWordformEntity).where(

View file

@ -5,6 +5,7 @@ from pydantic import BaseModel
from sqlalchemy.ext.asyncio import AsyncSession
from ...auth import verify_token
from ...domain.services.dictionary_lookup_service import DictionaryLookupService, TokenLookupResult
from ...domain.services.vocab_service import VocabService
from ...outbound.postgres.database import get_db
from ...outbound.postgres.repositories.dictionary_repository import PostgresDictionaryRepository
@ -21,6 +22,28 @@ class AddWordRequest(BaseModel):
source_article_id: str | None = None
class AddFromTokenRequest(BaseModel):
    """Request body for ``POST /from-token``: a spaCy token to resolve against
    the dictionary and add to the vocab bank."""

    language_pair_id: str  # UUID string; validated server-side
    surface: str  # raw token text, e.g. "allons"
    spacy_lemma: str  # spaCy's lemmatisation, e.g. "aller"
    pos_ud: str  # Universal Dependencies POS tag, e.g. "VERB"
    language: str  # target language code, e.g. "fr"
    source_article_id: str | None = None  # optional UUID string of the source article
class SenseCandidateResponse(BaseModel):
    """One candidate dictionary sense offered to the client for disambiguation."""

    id: str
    gloss: str
    topics: list[str]
    tags: list[str]
class FromTokenResponse(BaseModel):
    """Response for ``POST /from-token``: the created bank entry plus the sense
    candidates and which lookup stage matched."""

    entry: "WordBankEntryResponse"
    sense_candidates: list[SenseCandidateResponse]
    matched_via: str  # "wordform" | "lemma_pos" | "lemma" | "none"
class SetSenseRequest(BaseModel):
    """Request body for manually assigning a sense to a bank entry."""

    sense_id: str  # UUID string of the chosen sense
@ -80,6 +103,58 @@ async def add_word(
return _to_response(entry)
@router.post("/from-token", response_model=FromTokenResponse, status_code=201)
async def add_from_token(
    request: AddFromTokenRequest,
    db: AsyncSession = Depends(get_db),
    token_data: dict = Depends(verify_token),
) -> FromTokenResponse:
    """Resolve a spaCy token against the dictionary and add it to the caller's
    vocab bank in one round trip.

    Runs the three-stage dictionary lookup, stores the entry with any
    pre-resolved sense/wordform, and returns the created entry together with
    the candidate senses so the client can offer disambiguation.

    Raises HTTP 400 for malformed UUID fields and HTTP 404 when the language
    pair is unknown.
    """
    user_id = uuid.UUID(token_data["sub"])
    try:
        language_pair_id = uuid.UUID(request.language_pair_id)
    except ValueError as exc:
        # Chain the cause (B904) so the original parse error is preserved in
        # tracebacks instead of surfacing as "During handling of the above...".
        raise HTTPException(status_code=400, detail="Invalid language_pair_id") from exc
    source_article_id = None
    if request.source_article_id:
        try:
            source_article_id = uuid.UUID(request.source_article_id)
        except ValueError as exc:
            raise HTTPException(status_code=400, detail="Invalid source_article_id") from exc
    lookup_service = DictionaryLookupService(PostgresDictionaryRepository(db))
    result: TokenLookupResult = await lookup_service.lookup_token(
        surface=request.surface,
        spacy_lemma=request.spacy_lemma,
        pos_ud=request.pos_ud,
        language=request.language,
    )
    # The lookup returns the wordform id as a UUID string (or None); the
    # service layer expects a uuid.UUID.
    wordform_id = uuid.UUID(result.wordform_id) if result.wordform_id else None
    try:
        entry = await _service(db).add_token_to_bank(
            user_id=user_id,
            surface_text=request.surface,
            language_pair_id=language_pair_id,
            senses=result.senses,
            wordform_id=wordform_id,
            source_article_id=source_article_id,
        )
    except ValueError as exc:
        # Unknown language pair — surface the service's message as the detail.
        raise HTTPException(status_code=404, detail=str(exc)) from exc
    candidates = [
        SenseCandidateResponse(id=s.id, gloss=s.gloss, topics=s.topics, tags=s.tags)
        for s in result.senses
    ]
    return FromTokenResponse(
        entry=_to_response(entry),
        sense_candidates=candidates,
        matched_via=result.matched_via,
    )
@router.get("", response_model=list[WordBankEntryResponse])
async def list_entries(
language_pair_id: str,