@dataclass
class TokenLookupResult:
    """Outcome of resolving one spaCy token against the dictionary tables.

    ``senses`` holds the ranked candidate senses for the disambiguation step.
    ``wordform_id`` is the id (string) of the matched ``dictionary_wordform``
    row, set only when the surface form resolved to a single lemma so the
    vocab bank entry can be pre-linked to the exact inflected form.
    ``matched_via`` names the lookup stage that produced the result:
    ``"wordform"``, ``"lemma_pos"``, ``"lemma"`` or ``"none"``.
    ``matched_wordforms`` lists every wordform row that matched the surface.
    """

    senses: list[Sense]
    wordform_id: str | None
    matched_via: str  # "wordform" | "lemma_pos" | "lemma" | "none"
    matched_wordforms: list[Wordform] = field(default_factory=list)


class DictionaryLookupService:
    """Maps a spaCy token (surface form + lemma + UD POS) to candidate senses.

    Resolution runs through three stages, stopping at the first that yields rows:

    1. Exact inflected-surface match in ``dictionary_wordform`` within the
       target language ("allons" -> wordform row -> lemma "aller"). When
       exactly one lemma is involved, ``wordform_id`` is pre-populated.
    2. The spaCy lemma against ``dictionary_lemma.headword`` filtered by
       ``pos_normalised`` (UD tag) — reduces cross-POS homograph noise.
    3. The spaCy lemma alone, with no POS filter, as the last resort.

    Example::

        service = DictionaryLookupService(PostgresDictionaryRepository(db))
        result = await service.lookup_token(
            surface="allons",
            spacy_lemma="aller",
            pos_ud="VERB",
            language="fr",
        )
        # result.senses      — candidate Sense rows for disambiguation
        # result.wordform_id — pre-resolved wordform UUID string, or None
        # result.matched_via — "wordform" | "lemma_pos" | "lemma" | "none"
    """

    def __init__(self, dict_repo: DictionaryRepository) -> None:
        self.dict_repo = dict_repo

    async def lookup_token(
        self,
        surface: str,
        spacy_lemma: str,
        pos_ud: str,
        language: str,
    ) -> TokenLookupResult:
        """Resolve one token to candidate senses via the three-stage fallback.

        ``surface`` is the raw token text (e.g. ``"allons"``), ``spacy_lemma``
        spaCy's lemmatisation (``"aller"``), ``pos_ud`` the Universal
        Dependencies tag (``"VERB"``) and ``language`` the target language
        code (``"fr"``).

        Returns a :class:`TokenLookupResult`; ``wordform_id`` is set only for
        an unambiguous stage-1 match so the vocab bank entry can track the
        exact inflection.
        """
        # Stage 1 — inflected surface form against the wordform table.
        matches = await self.dict_repo.get_wordforms_by_form(surface, language)
        if matches:
            # Deduplicate lemma ids while preserving first-seen order.
            lemma_ids: dict[str, None] = {}
            for wf in matches:
                lemma_ids.setdefault(wf.lemma_id)

            candidates: list[Sense] = []
            for lemma_id in lemma_ids:
                candidates += await self.dict_repo.get_senses_for_lemma(uuid.UUID(lemma_id))

            # Pre-link the wordform only when every match points at the same
            # lemma; otherwise the ambiguity is the user's to resolve and no
            # single wordform can be picked with confidence.
            # NOTE(review): a wordform match with zero senses still returns
            # here instead of falling through to stage 2 — confirm intended.
            resolved_id = matches[0].id if len(lemma_ids) == 1 else None
            return TokenLookupResult(
                senses=candidates,
                wordform_id=resolved_id,
                matched_via="wordform",
                matched_wordforms=matches,
            )

        # Stages 2 and 3 — lemma headword with, then without, the POS filter.
        fallbacks = (
            (
                "lemma_pos",
                lambda: self.dict_repo.get_senses_for_headword_and_pos(
                    spacy_lemma, language, pos_ud
                ),
            ),
            (
                "lemma",
                lambda: self.dict_repo.get_senses_for_headword(spacy_lemma, language),
            ),
        )
        for via, fetch in fallbacks:
            found = await fetch()
            if found:
                return TokenLookupResult(senses=found, wordform_id=None, matched_via=via)

        return TokenLookupResult(senses=[], wordform_id=None, matched_via="none")
uuid.UUID | None = None, ) -> LearnableWordBankEntry: """Add a word or phrase to the user's vocab bank, automatically linking it to a @@ -108,6 +110,59 @@ class VocabService: entry_pathway=pathway, is_phrase=False, sense_id=sense_id, + wordform_id=wordform_id, + source_article_id=source_article_id, + disambiguation_status=status, + ) + + async def add_token_to_bank( + self, + user_id: uuid.UUID, + surface_text: str, + language_pair_id: uuid.UUID, + senses: list[Sense], + wordform_id: uuid.UUID | None, + source_article_id: uuid.UUID | None = None, + ) -> LearnableWordBankEntry: + """Add a token from the NLP pipeline to the vocab bank using pre-resolved lookup + results, skipping the redundant dictionary query that ``add_word_to_bank`` would + otherwise perform. + + ``senses`` and ``wordform_id`` come from :class:`DictionaryLookupService` and + are stored directly on the bank entry. Auto-resolution still applies: exactly + one sense means ``auto_resolved``; anything else means ``pending``. 
+ + Usage:: + + result = await lookup_service.lookup_token("allons", "aller", "VERB", "fr") + wf_id = uuid.UUID(result.wordform_id) if result.wordform_id else None + entry = await vocab_service.add_token_to_bank( + user_id=user_id, + surface_text="allons", + language_pair_id=pair_id, + senses=result.senses, + wordform_id=wf_id, + ) + # entry.wordform_id == result.wordform_id (pre-linked to "allons" wordform) + """ + pair = await self.vocab_repo.get_language_pair(language_pair_id) + if pair is None: + raise ValueError(f"Language pair {language_pair_id} not found") + + if len(senses) == 1: + sense_id: uuid.UUID | None = uuid.UUID(senses[0].id) + status = "auto_resolved" + else: + sense_id = None + status = "pending" + + return await self.vocab_repo.add_entry( + user_id=user_id, + language_pair_id=language_pair_id, + surface_text=surface_text, + entry_pathway="nlp_extraction", + wordform_id=wordform_id, + sense_id=sense_id, source_article_id=source_article_id, disambiguation_status=status, ) diff --git a/api/app/outbound/postgres/repositories/dictionary_repository.py b/api/app/outbound/postgres/repositories/dictionary_repository.py index 9328ecf..dff9d7d 100644 --- a/api/app/outbound/postgres/repositories/dictionary_repository.py +++ b/api/app/outbound/postgres/repositories/dictionary_repository.py @@ -14,9 +14,12 @@ from ....domain.models.dictionary import Lemma, Sense, Wordform class DictionaryRepository(Protocol): async def get_senses_for_headword(self, headword: str, language: str) -> list[Sense]: ... + async def get_senses_for_headword_and_pos(self, headword: str, language: str, pos_normalised: str) -> list[Sense]: ... + async def get_senses_for_lemma(self, lemma_id: uuid.UUID) -> list[Sense]: ... async def find_senses_by_english_gloss(self, text: str, target_lang: str) -> list[Sense]: ... async def get_sense(self, sense_id: uuid.UUID) -> Sense | None: ... async def get_lemma(self, lemma_id: uuid.UUID) -> Lemma | None: ... 
+ async def get_wordforms_by_form(self, form: str, language: str) -> list[Wordform]: ... async def get_wordforms_for_lemma(self, lemma_id: uuid.UUID) -> list[Wordform]: ... @@ -99,6 +102,40 @@ class PostgresDictionaryRepository: entity = result.scalar_one_or_none() return _lemma_to_model(entity) if entity else None + async def get_senses_for_headword_and_pos( + self, headword: str, language: str, pos_normalised: str + ) -> list[Sense]: + result = await self.db.execute( + select(DictionarySenseEntity) + .join(DictionaryLemmaEntity, DictionarySenseEntity.lemma_id == DictionaryLemmaEntity.id) + .where( + DictionaryLemmaEntity.headword == headword, + DictionaryLemmaEntity.language == language, + DictionaryLemmaEntity.pos_normalised == pos_normalised, + ) + .order_by(DictionarySenseEntity.sense_index) + ) + return [_sense_to_model(e) for e in result.scalars().all()] + + async def get_senses_for_lemma(self, lemma_id: uuid.UUID) -> list[Sense]: + result = await self.db.execute( + select(DictionarySenseEntity) + .where(DictionarySenseEntity.lemma_id == lemma_id) + .order_by(DictionarySenseEntity.sense_index) + ) + return [_sense_to_model(e) for e in result.scalars().all()] + + async def get_wordforms_by_form(self, form: str, language: str) -> list[Wordform]: + result = await self.db.execute( + select(DictionaryWordformEntity) + .join(DictionaryLemmaEntity, DictionaryWordformEntity.lemma_id == DictionaryLemmaEntity.id) + .where( + DictionaryWordformEntity.form == form, + DictionaryLemmaEntity.language == language, + ) + ) + return [_wordform_to_model(e) for e in result.scalars().all()] + async def get_wordforms_for_lemma(self, lemma_id: uuid.UUID) -> list[Wordform]: result = await self.db.execute( select(DictionaryWordformEntity).where( diff --git a/api/app/routers/api/vocab.py b/api/app/routers/api/vocab.py index 40bedde..fd1f30c 100644 --- a/api/app/routers/api/vocab.py +++ b/api/app/routers/api/vocab.py @@ -5,6 +5,7 @@ from pydantic import BaseModel from 
class AddFromTokenRequest(BaseModel):
    # Token data produced by the spaCy pipeline for one surface token.
    language_pair_id: str
    surface: str
    spacy_lemma: str
    pos_ud: str
    language: str
    source_article_id: str | None = None


class SenseCandidateResponse(BaseModel):
    # One candidate dictionary sense offered to the client for disambiguation.
    id: str
    gloss: str
    topics: list[str]
    tags: list[str]


class FromTokenResponse(BaseModel):
    # The created bank entry plus everything the UI needs to disambiguate it.
    entry: "WordBankEntryResponse"
    sense_candidates: list[SenseCandidateResponse]
    matched_via: str


@router.post("/from-token", response_model=FromTokenResponse, status_code=201)
async def add_from_token(
    request: AddFromTokenRequest,
    db: AsyncSession = Depends(get_db),
    token_data: dict = Depends(verify_token),
) -> FromTokenResponse:
    """Resolve a spaCy token against the dictionary and add it to the vocab bank.

    Runs the three-stage :class:`DictionaryLookupService` lookup, stores the
    entry with any pre-resolved sense/wordform via ``add_token_to_bank``, and
    returns the candidate senses so the client can disambiguate when needed.

    Raises 400 for malformed UUIDs in the request and 404 when the language
    pair does not exist.
    """
    user_id = uuid.UUID(token_data["sub"])

    try:
        language_pair_id = uuid.UUID(request.language_pair_id)
    except ValueError as exc:
        # Chain the cause (B904) so logs keep the original parse error.
        raise HTTPException(status_code=400, detail="Invalid language_pair_id") from exc

    source_article_id = None
    if request.source_article_id:
        try:
            source_article_id = uuid.UUID(request.source_article_id)
        except ValueError as exc:
            raise HTTPException(status_code=400, detail="Invalid source_article_id") from exc

    lookup_service = DictionaryLookupService(PostgresDictionaryRepository(db))
    result: TokenLookupResult = await lookup_service.lookup_token(
        surface=request.surface,
        spacy_lemma=request.spacy_lemma,
        pos_ud=request.pos_ud,
        language=request.language,
    )

    # lookup_token returns the wordform id as a string; the service wants a UUID.
    wordform_id = uuid.UUID(result.wordform_id) if result.wordform_id else None

    try:
        entry = await _service(db).add_token_to_bank(
            user_id=user_id,
            surface_text=request.surface,
            language_pair_id=language_pair_id,
            senses=result.senses,
            wordform_id=wordform_id,
            source_article_id=source_article_id,
        )
    except ValueError as exc:
        # The service raises ValueError for an unknown language pair.
        raise HTTPException(status_code=404, detail=str(exc)) from exc

    candidates = [
        SenseCandidateResponse(id=s.id, gloss=s.gloss, topics=s.topics, tags=s.tags)
        for s in result.senses
    ]
    return FromTokenResponse(
        entry=_to_response(entry),
        sense_candidates=candidates,
        matched_via=result.matched_via,
    )