@dataclass
class TokenLookupResult:
    """Outcome of resolving one spaCy token against the dictionary tables.

    ``senses`` holds the ranked candidate senses for the disambiguation step.
    ``wordform_id`` is the id (string) of the matched ``dictionary_wordform``
    row, set only when the surface form resolved to a single lemma so the
    vocab bank entry can be pre-linked to the exact inflected form.
    ``matched_via`` names the lookup stage that produced the result:
    ``"wordform"``, ``"lemma_pos"``, ``"lemma"`` or ``"none"``.
    ``matched_wordforms`` lists every wordform row that matched the surface.
    """

    senses: list[Sense]
    wordform_id: str | None
    matched_via: str  # "wordform" | "lemma_pos" | "lemma" | "none"
    matched_wordforms: list[Wordform] = field(default_factory=list)


class DictionaryLookupService:
    """Maps a spaCy token (surface form + lemma + UD POS) to candidate senses.

    Resolution runs through three stages, stopping at the first that yields rows:

    1. Exact inflected-surface match in ``dictionary_wordform`` within the
       target language ("allons" -> wordform row -> lemma "aller"). When
       exactly one lemma is involved, ``wordform_id`` is pre-populated.
    2. The spaCy lemma against ``dictionary_lemma.headword`` filtered by
       ``pos_normalised`` (UD tag) — reduces cross-POS homograph noise.
    3. The spaCy lemma alone, with no POS filter, as the last resort.

    Example::

        service = DictionaryLookupService(PostgresDictionaryRepository(db))
        result = await service.lookup_token(
            surface="allons",
            spacy_lemma="aller",
            pos_ud="VERB",
            language="fr",
        )
        # result.senses      — candidate Sense rows for disambiguation
        # result.wordform_id — pre-resolved wordform UUID string, or None
        # result.matched_via — "wordform" | "lemma_pos" | "lemma" | "none"
    """

    def __init__(self, dict_repo: DictionaryRepository) -> None:
        self.dict_repo = dict_repo

    async def lookup_token(
        self,
        surface: str,
        spacy_lemma: str,
        pos_ud: str,
        language: str,
    ) -> TokenLookupResult:
        """Resolve one token to candidate senses via the three-stage fallback.

        ``surface`` is the raw token text (e.g. ``"allons"``), ``spacy_lemma``
        spaCy's lemmatisation (``"aller"``), ``pos_ud`` the Universal
        Dependencies tag (``"VERB"``) and ``language`` the target language
        code (``"fr"``).

        Returns a :class:`TokenLookupResult`; ``wordform_id`` is set only for
        an unambiguous stage-1 match so the vocab bank entry can track the
        exact inflection.
        """
        # Stage 1 — inflected surface form against the wordform table.
        matches = await self.dict_repo.get_wordforms_by_form(surface, language)
        if matches:
            # Deduplicate lemma ids while preserving first-seen order.
            lemma_ids: dict[str, None] = {}
            for wf in matches:
                lemma_ids.setdefault(wf.lemma_id)

            candidates: list[Sense] = []
            for lemma_id in lemma_ids:
                candidates += await self.dict_repo.get_senses_for_lemma(uuid.UUID(lemma_id))

            # Pre-link the wordform only when every match points at the same
            # lemma; otherwise the ambiguity is the user's to resolve and no
            # single wordform can be picked with confidence.
            # NOTE(review): a wordform match with zero senses still returns
            # here instead of falling through to stage 2 — confirm intended.
            resolved_id = matches[0].id if len(lemma_ids) == 1 else None
            return TokenLookupResult(
                senses=candidates,
                wordform_id=resolved_id,
                matched_via="wordform",
                matched_wordforms=matches,
            )

        # Stages 2 and 3 — lemma headword with, then without, the POS filter.
        fallbacks = (
            (
                "lemma_pos",
                lambda: self.dict_repo.get_senses_for_headword_and_pos(
                    spacy_lemma, language, pos_ud
                ),
            ),
            (
                "lemma",
                lambda: self.dict_repo.get_senses_for_headword(spacy_lemma, language),
            ),
        )
        for via, fetch in fallbacks:
            found = await fetch()
            if found:
                return TokenLookupResult(senses=found, wordform_id=None, matched_via=via)

        return TokenLookupResult(senses=[], wordform_id=None, matched_via="none")
uuid.UUID | None = None, ) -> LearnableWordBankEntry: """Add a word or phrase to the user's vocab bank, automatically linking it to a @@ -108,6 +110,59 @@ class VocabService: entry_pathway=pathway, is_phrase=False, sense_id=sense_id, + wordform_id=wordform_id, + source_article_id=source_article_id, + disambiguation_status=status, + ) + + async def add_token_to_bank( + self, + user_id: uuid.UUID, + surface_text: str, + language_pair_id: uuid.UUID, + senses: list[Sense], + wordform_id: uuid.UUID | None, + source_article_id: uuid.UUID | None = None, + ) -> LearnableWordBankEntry: + """Add a token from the NLP pipeline to the vocab bank using pre-resolved lookup + results, skipping the redundant dictionary query that ``add_word_to_bank`` would + otherwise perform. + + ``senses`` and ``wordform_id`` come from :class:`DictionaryLookupService` and + are stored directly on the bank entry. Auto-resolution still applies: exactly + one sense means ``auto_resolved``; anything else means ``pending``. 
+ + Usage:: + + result = await lookup_service.lookup_token("allons", "aller", "VERB", "fr") + wf_id = uuid.UUID(result.wordform_id) if result.wordform_id else None + entry = await vocab_service.add_token_to_bank( + user_id=user_id, + surface_text="allons", + language_pair_id=pair_id, + senses=result.senses, + wordform_id=wf_id, + ) + # entry.wordform_id == result.wordform_id (pre-linked to "allons" wordform) + """ + pair = await self.vocab_repo.get_language_pair(language_pair_id) + if pair is None: + raise ValueError(f"Language pair {language_pair_id} not found") + + if len(senses) == 1: + sense_id: uuid.UUID | None = uuid.UUID(senses[0].id) + status = "auto_resolved" + else: + sense_id = None + status = "pending" + + return await self.vocab_repo.add_entry( + user_id=user_id, + language_pair_id=language_pair_id, + surface_text=surface_text, + entry_pathway="nlp_extraction", + wordform_id=wordform_id, + sense_id=sense_id, source_article_id=source_article_id, disambiguation_status=status, ) diff --git a/api/app/outbound/postgres/repositories/dictionary_repository.py b/api/app/outbound/postgres/repositories/dictionary_repository.py index 9328ecf..dff9d7d 100644 --- a/api/app/outbound/postgres/repositories/dictionary_repository.py +++ b/api/app/outbound/postgres/repositories/dictionary_repository.py @@ -14,9 +14,12 @@ from ....domain.models.dictionary import Lemma, Sense, Wordform class DictionaryRepository(Protocol): async def get_senses_for_headword(self, headword: str, language: str) -> list[Sense]: ... + async def get_senses_for_headword_and_pos(self, headword: str, language: str, pos_normalised: str) -> list[Sense]: ... + async def get_senses_for_lemma(self, lemma_id: uuid.UUID) -> list[Sense]: ... async def find_senses_by_english_gloss(self, text: str, target_lang: str) -> list[Sense]: ... async def get_sense(self, sense_id: uuid.UUID) -> Sense | None: ... async def get_lemma(self, lemma_id: uuid.UUID) -> Lemma | None: ... 
+ async def get_wordforms_by_form(self, form: str, language: str) -> list[Wordform]: ... async def get_wordforms_for_lemma(self, lemma_id: uuid.UUID) -> list[Wordform]: ... @@ -99,6 +102,40 @@ class PostgresDictionaryRepository: entity = result.scalar_one_or_none() return _lemma_to_model(entity) if entity else None + async def get_senses_for_headword_and_pos( + self, headword: str, language: str, pos_normalised: str + ) -> list[Sense]: + result = await self.db.execute( + select(DictionarySenseEntity) + .join(DictionaryLemmaEntity, DictionarySenseEntity.lemma_id == DictionaryLemmaEntity.id) + .where( + DictionaryLemmaEntity.headword == headword, + DictionaryLemmaEntity.language == language, + DictionaryLemmaEntity.pos_normalised == pos_normalised, + ) + .order_by(DictionarySenseEntity.sense_index) + ) + return [_sense_to_model(e) for e in result.scalars().all()] + + async def get_senses_for_lemma(self, lemma_id: uuid.UUID) -> list[Sense]: + result = await self.db.execute( + select(DictionarySenseEntity) + .where(DictionarySenseEntity.lemma_id == lemma_id) + .order_by(DictionarySenseEntity.sense_index) + ) + return [_sense_to_model(e) for e in result.scalars().all()] + + async def get_wordforms_by_form(self, form: str, language: str) -> list[Wordform]: + result = await self.db.execute( + select(DictionaryWordformEntity) + .join(DictionaryLemmaEntity, DictionaryWordformEntity.lemma_id == DictionaryLemmaEntity.id) + .where( + DictionaryWordformEntity.form == form, + DictionaryLemmaEntity.language == language, + ) + ) + return [_wordform_to_model(e) for e in result.scalars().all()] + async def get_wordforms_for_lemma(self, lemma_id: uuid.UUID) -> list[Wordform]: result = await self.db.execute( select(DictionaryWordformEntity).where( diff --git a/api/app/routers/api/vocab.py b/api/app/routers/api/vocab.py index 40bedde..fd1f30c 100644 --- a/api/app/routers/api/vocab.py +++ b/api/app/routers/api/vocab.py @@ -5,6 +5,7 @@ from pydantic import BaseModel from 
class AddFromTokenRequest(BaseModel):
    # Token data produced by the spaCy pipeline for one surface token.
    language_pair_id: str
    surface: str
    spacy_lemma: str
    pos_ud: str
    language: str
    source_article_id: str | None = None


class SenseCandidateResponse(BaseModel):
    # One candidate dictionary sense offered to the client for disambiguation.
    id: str
    gloss: str
    topics: list[str]
    tags: list[str]


class FromTokenResponse(BaseModel):
    # The created bank entry plus everything the UI needs to disambiguate it.
    entry: "WordBankEntryResponse"
    sense_candidates: list[SenseCandidateResponse]
    matched_via: str


@router.post("/from-token", response_model=FromTokenResponse, status_code=201)
async def add_from_token(
    request: AddFromTokenRequest,
    db: AsyncSession = Depends(get_db),
    token_data: dict = Depends(verify_token),
) -> FromTokenResponse:
    """Resolve a spaCy token against the dictionary and add it to the vocab bank.

    Runs the three-stage :class:`DictionaryLookupService` lookup, stores the
    entry with any pre-resolved sense/wordform via ``add_token_to_bank``, and
    returns the candidate senses so the client can disambiguate when needed.

    Raises 400 for malformed UUIDs in the request and 404 when the language
    pair does not exist.
    """
    user_id = uuid.UUID(token_data["sub"])

    try:
        language_pair_id = uuid.UUID(request.language_pair_id)
    except ValueError as exc:
        # Chain the cause (B904) so logs keep the original parse error.
        raise HTTPException(status_code=400, detail="Invalid language_pair_id") from exc

    source_article_id = None
    if request.source_article_id:
        try:
            source_article_id = uuid.UUID(request.source_article_id)
        except ValueError as exc:
            raise HTTPException(status_code=400, detail="Invalid source_article_id") from exc

    lookup_service = DictionaryLookupService(PostgresDictionaryRepository(db))
    result: TokenLookupResult = await lookup_service.lookup_token(
        surface=request.surface,
        spacy_lemma=request.spacy_lemma,
        pos_ud=request.pos_ud,
        language=request.language,
    )

    # lookup_token returns the wordform id as a string; the service wants a UUID.
    wordform_id = uuid.UUID(result.wordform_id) if result.wordform_id else None

    try:
        entry = await _service(db).add_token_to_bank(
            user_id=user_id,
            surface_text=request.surface,
            language_pair_id=language_pair_id,
            senses=result.senses,
            wordform_id=wordform_id,
            source_article_id=source_article_id,
        )
    except ValueError as exc:
        # The service raises ValueError for an unknown language pair.
        raise HTTPException(status_code=404, detail=str(exc)) from exc

    candidates = [
        SenseCandidateResponse(id=s.id, gloss=s.gloss, topics=s.topics, tags=s.tags)
        for s in result.senses
    ]
    return FromTokenResponse(
        entry=_to_response(entry),
        sense_candidates=candidates,
        matched_via=result.matched_via,
    )