feat: Create the Dictionary Lookup Service; methods for finding
Some checks are pending
/ test (push) Waiting to run

vocabulary and words
This commit is contained in:
wilson 2026-04-10 07:11:57 +01:00
parent 27f7a7c3f3
commit aa4987981d
4 changed files with 275 additions and 0 deletions

View file

@ -0,0 +1,108 @@
import uuid
from dataclasses import dataclass, field
from ..models.dictionary import Sense, Wordform
from ...outbound.postgres.repositories.dictionary_repository import DictionaryRepository
@dataclass
class TokenLookupResult:
    """The result of resolving a spaCy token against the dictionary.

    ``senses`` is the ranked list of candidate senses for disambiguation.
    ``wordform_id`` is set when the surface form was found in ``dictionary_wordform``,
    allowing the vocab bank entry to be pre-linked to the exact inflected form.
    ``matched_via`` describes which lookup strategy succeeded.
    """

    # Candidate senses to present to the user (or auto-resolve when exactly one).
    senses: list[Sense]
    # UUID string of the matched wordform row, or None when no unambiguous match.
    wordform_id: str | None
    matched_via: str  # "wordform" | "lemma_pos" | "lemma" | "none"
    # Raw wordform rows from stage 1; empty when a lemma fallback matched instead.
    matched_wordforms: list[Wordform] = field(default_factory=list)
class DictionaryLookupService:
    """Maps a spaCy token (surface text, lemma, UD POS tag) onto candidate
    dictionary senses via three progressively looser lookups.

    1. Inflected-form match in ``dictionary_wordform`` (most precise): the raw
       surface, e.g. "allons", is matched against wordform rows for the target
       language; "allons" resolves through its wordform row to lemma "aller".
       When the matches all point at a single lemma, ``wordform_id`` is
       pre-populated on the result.
    2. spaCy lemma + POS: the lemma string is tried against
       ``dictionary_lemma.headword`` filtered by ``pos_normalised`` (UD tag),
       which keeps homographs with different parts of speech apart.
    3. spaCy lemma only: the POS filter is dropped as a last resort and every
       sense for the headword is returned.

    Usage::

        service = DictionaryLookupService(PostgresDictionaryRepository(db))
        result = await service.lookup_token(
            surface="allons",
            spacy_lemma="aller",
            pos_ud="VERB",
            language="fr",
        )
        # result.senses — candidate Sense rows for disambiguation
        # result.wordform_id — pre-resolved wordform UUID string, or None
        # result.matched_via — "wordform" | "lemma_pos" | "lemma" | "none"
    """

    def __init__(self, dict_repo: DictionaryRepository) -> None:
        self.dict_repo = dict_repo

    async def lookup_token(
        self,
        surface: str,
        spacy_lemma: str,
        pos_ud: str,
        language: str,
    ) -> TokenLookupResult:
        """Resolve a spaCy token to candidate senses using a three-stage fallback.

        ``surface`` is the raw token text (e.g. ``"allons"``).
        ``spacy_lemma`` is spaCy's lemmatisation of the token (e.g. ``"aller"``).
        ``pos_ud`` is the Universal Dependencies POS tag (e.g. ``"VERB"``).
        ``language`` is the target language code (e.g. ``"fr"``).

        Returns a :class:`TokenLookupResult` with the candidate senses and, when
        the surface form was found in the wordform table, a ``wordform_id`` that
        can be stored on the vocab bank entry for precise inflection tracking.
        """
        # Stage 1 — exact surface-form hit in the wordform table.
        forms = await self.dict_repo.get_wordforms_by_form(surface, language)
        if forms:
            # Distinct lemma ids, first-seen order preserved.
            seen: set[str] = set()
            lemma_ids: list[str] = []
            for wf in forms:
                if wf.lemma_id not in seen:
                    seen.add(wf.lemma_id)
                    lemma_ids.append(wf.lemma_id)
            candidate_senses: list[Sense] = []
            for lid in lemma_ids:
                candidate_senses.extend(
                    await self.dict_repo.get_senses_for_lemma(uuid.UUID(lid))
                )
            # Pre-assign the wordform only when every match shares one lemma;
            # matches spanning several lemmas are ambiguous and must be left
            # for the user to resolve.
            single_lemma = len(lemma_ids) == 1
            return TokenLookupResult(
                senses=candidate_senses,
                wordform_id=forms[0].id if single_lemma else None,
                matched_via="wordform",
                matched_wordforms=forms,
            )

        # Stage 2 — spaCy lemma constrained by the UD POS tag.
        by_pos = await self.dict_repo.get_senses_for_headword_and_pos(
            spacy_lemma, language, pos_ud
        )
        if by_pos:
            return TokenLookupResult(senses=by_pos, wordform_id=None, matched_via="lemma_pos")

        # Stage 3 — spaCy lemma with no POS filter.
        by_lemma = await self.dict_repo.get_senses_for_headword(spacy_lemma, language)
        if by_lemma:
            return TokenLookupResult(senses=by_lemma, wordform_id=None, matched_via="lemma")

        return TokenLookupResult(senses=[], wordform_id=None, matched_via="none")

View file

@ -1,5 +1,6 @@
import uuid
from ..models.dictionary import Sense
from ..models.vocab import LearnableWordBankEntry
from ...outbound.postgres.repositories.vocab_repository import VocabRepository
from ...outbound.postgres.repositories.dictionary_repository import DictionaryRepository
@ -35,6 +36,7 @@ class VocabService:
language_pair_id: uuid.UUID,
pathway: str,
is_phrase: bool = False,
wordform_id: uuid.UUID | None = None,
source_article_id: uuid.UUID | None = None,
) -> LearnableWordBankEntry:
"""Add a word or phrase to the user's vocab bank, automatically linking it to a
@ -108,6 +110,59 @@ class VocabService:
entry_pathway=pathway,
is_phrase=False,
sense_id=sense_id,
wordform_id=wordform_id,
source_article_id=source_article_id,
disambiguation_status=status,
)
async def add_token_to_bank(
    self,
    user_id: uuid.UUID,
    surface_text: str,
    language_pair_id: uuid.UUID,
    senses: list[Sense],
    wordform_id: uuid.UUID | None,
    source_article_id: uuid.UUID | None = None,
) -> LearnableWordBankEntry:
    """Add a token from the NLP pipeline to the vocab bank using pre-resolved
    lookup results, avoiding the dictionary query ``add_word_to_bank`` would
    otherwise repeat.

    ``senses`` and ``wordform_id`` come from :class:`DictionaryLookupService`
    and are stored directly on the bank entry. Auto-resolution still applies:
    exactly one sense means ``auto_resolved``; anything else means ``pending``.

    Usage::

        result = await lookup_service.lookup_token("allons", "aller", "VERB", "fr")
        wf_id = uuid.UUID(result.wordform_id) if result.wordform_id else None
        entry = await vocab_service.add_token_to_bank(
            user_id=user_id,
            surface_text="allons",
            language_pair_id=pair_id,
            senses=result.senses,
            wordform_id=wf_id,
        )
        # entry.wordform_id == result.wordform_id (pre-linked to "allons" wordform)

    Raises ``ValueError`` when the language pair does not exist.
    """
    # Validate the pair before writing anything.
    if await self.vocab_repo.get_language_pair(language_pair_id) is None:
        raise ValueError(f"Language pair {language_pair_id} not found")

    # A single candidate sense can be linked immediately; otherwise the entry
    # stays pending until the user disambiguates.
    auto = len(senses) == 1
    resolved_sense: uuid.UUID | None = uuid.UUID(senses[0].id) if auto else None

    return await self.vocab_repo.add_entry(
        user_id=user_id,
        language_pair_id=language_pair_id,
        surface_text=surface_text,
        entry_pathway="nlp_extraction",
        wordform_id=wordform_id,
        sense_id=resolved_sense,
        source_article_id=source_article_id,
        disambiguation_status="auto_resolved" if auto else "pending",
    )

View file

@ -14,9 +14,12 @@ from ....domain.models.dictionary import Lemma, Sense, Wordform
class DictionaryRepository(Protocol):
    """Structural (duck-typed) port for dictionary reads: lemmas, senses and
    wordforms. Domain services depend on this Protocol; the Postgres
    implementation in this module satisfies it.
    """

    # Senses for a headword in a language, across every POS.
    async def get_senses_for_headword(self, headword: str, language: str) -> list[Sense]: ...
    # Senses for a headword restricted to a normalised (UD) POS tag.
    async def get_senses_for_headword_and_pos(self, headword: str, language: str, pos_normalised: str) -> list[Sense]: ...
    # Senses attached directly to a known lemma id.
    async def get_senses_for_lemma(self, lemma_id: uuid.UUID) -> list[Sense]: ...
    # NOTE(review): presumably matches senses by English gloss text — confirm
    # against the implementation before relying on exact match semantics.
    async def find_senses_by_english_gloss(self, text: str, target_lang: str) -> list[Sense]: ...
    async def get_sense(self, sense_id: uuid.UUID) -> Sense | None: ...
    async def get_lemma(self, lemma_id: uuid.UUID) -> Lemma | None: ...
    # Inflected-form lookup used by stage 1 of token resolution.
    async def get_wordforms_by_form(self, form: str, language: str) -> list[Wordform]: ...
    async def get_wordforms_for_lemma(self, lemma_id: uuid.UUID) -> list[Wordform]: ...
@ -99,6 +102,40 @@ class PostgresDictionaryRepository:
entity = result.scalar_one_or_none()
return _lemma_to_model(entity) if entity else None
async def get_senses_for_headword_and_pos(
    self, headword: str, language: str, pos_normalised: str
) -> list[Sense]:
    """Return senses whose lemma matches *headword*, *language* and the
    normalised POS tag, ordered by sense index."""
    stmt = (
        select(DictionarySenseEntity)
        .join(
            DictionaryLemmaEntity,
            DictionarySenseEntity.lemma_id == DictionaryLemmaEntity.id,
        )
        .where(DictionaryLemmaEntity.headword == headword)
        .where(DictionaryLemmaEntity.language == language)
        .where(DictionaryLemmaEntity.pos_normalised == pos_normalised)
        .order_by(DictionarySenseEntity.sense_index)
    )
    rows = (await self.db.execute(stmt)).scalars().all()
    return [_sense_to_model(row) for row in rows]
async def get_senses_for_lemma(self, lemma_id: uuid.UUID) -> list[Sense]:
    """Return every sense belonging to *lemma_id*, ordered by sense index."""
    stmt = (
        select(DictionarySenseEntity)
        .where(DictionarySenseEntity.lemma_id == lemma_id)
        .order_by(DictionarySenseEntity.sense_index)
    )
    rows = (await self.db.execute(stmt)).scalars().all()
    return [_sense_to_model(row) for row in rows]
async def get_wordforms_by_form(self, form: str, language: str) -> list[Wordform]:
    """Return wordform rows whose surface *form* matches exactly, restricted
    (via the lemma join) to the given *language*."""
    stmt = (
        select(DictionaryWordformEntity)
        .join(
            DictionaryLemmaEntity,
            DictionaryWordformEntity.lemma_id == DictionaryLemmaEntity.id,
        )
        .where(DictionaryWordformEntity.form == form)
        .where(DictionaryLemmaEntity.language == language)
    )
    rows = (await self.db.execute(stmt)).scalars().all()
    return [_wordform_to_model(row) for row in rows]
async def get_wordforms_for_lemma(self, lemma_id: uuid.UUID) -> list[Wordform]:
result = await self.db.execute(
select(DictionaryWordformEntity).where(

View file

@ -5,6 +5,7 @@ from pydantic import BaseModel
from sqlalchemy.ext.asyncio import AsyncSession
from ...auth import verify_token
from ...domain.services.dictionary_lookup_service import DictionaryLookupService, TokenLookupResult
from ...domain.services.vocab_service import VocabService
from ...outbound.postgres.database import get_db
from ...outbound.postgres.repositories.dictionary_repository import PostgresDictionaryRepository
@ -21,6 +22,28 @@ class AddWordRequest(BaseModel):
source_article_id: str | None = None
class AddFromTokenRequest(BaseModel):
    """Request body for ``POST /from-token``: a spaCy token to resolve against
    the dictionary and add to the vocab bank."""

    language_pair_id: str  # UUID string; validated server-side
    surface: str  # raw token text, e.g. "allons"
    spacy_lemma: str  # spaCy's lemmatisation, e.g. "aller"
    pos_ud: str  # Universal Dependencies POS tag, e.g. "VERB"
    language: str  # target language code, e.g. "fr"
    source_article_id: str | None = None  # optional UUID string of the source article
class SenseCandidateResponse(BaseModel):
    """One candidate dictionary sense offered to the client for disambiguation."""

    id: str
    gloss: str
    topics: list[str]
    tags: list[str]
class FromTokenResponse(BaseModel):
    """Response for ``POST /from-token``: the created bank entry plus the sense
    candidates and which lookup stage matched."""

    entry: "WordBankEntryResponse"
    sense_candidates: list[SenseCandidateResponse]
    matched_via: str  # "wordform" | "lemma_pos" | "lemma" | "none"
class SetSenseRequest(BaseModel):
    """Request body for manually assigning a sense to a bank entry."""

    sense_id: str  # UUID string of the chosen sense
@ -80,6 +103,58 @@ async def add_word(
return _to_response(entry)
@router.post("/from-token", response_model=FromTokenResponse, status_code=201)
async def add_from_token(
    request: AddFromTokenRequest,
    db: AsyncSession = Depends(get_db),
    token_data: dict = Depends(verify_token),
) -> FromTokenResponse:
    """Resolve a spaCy token against the dictionary and add it to the caller's
    vocab bank in one round trip.

    Runs the three-stage dictionary lookup, stores the entry with any
    pre-resolved sense/wordform, and returns the created entry together with
    the candidate senses so the client can offer disambiguation.

    Raises HTTP 400 for malformed UUID fields and HTTP 404 when the language
    pair is unknown.
    """
    user_id = uuid.UUID(token_data["sub"])
    try:
        language_pair_id = uuid.UUID(request.language_pair_id)
    except ValueError as exc:
        # Chain the cause (B904) so the original parse error is preserved in
        # tracebacks instead of surfacing as "During handling of the above...".
        raise HTTPException(status_code=400, detail="Invalid language_pair_id") from exc
    source_article_id = None
    if request.source_article_id:
        try:
            source_article_id = uuid.UUID(request.source_article_id)
        except ValueError as exc:
            raise HTTPException(status_code=400, detail="Invalid source_article_id") from exc
    lookup_service = DictionaryLookupService(PostgresDictionaryRepository(db))
    result: TokenLookupResult = await lookup_service.lookup_token(
        surface=request.surface,
        spacy_lemma=request.spacy_lemma,
        pos_ud=request.pos_ud,
        language=request.language,
    )
    # The lookup returns the wordform id as a UUID string (or None); the
    # service layer expects a uuid.UUID.
    wordform_id = uuid.UUID(result.wordform_id) if result.wordform_id else None
    try:
        entry = await _service(db).add_token_to_bank(
            user_id=user_id,
            surface_text=request.surface,
            language_pair_id=language_pair_id,
            senses=result.senses,
            wordform_id=wordform_id,
            source_article_id=source_article_id,
        )
    except ValueError as exc:
        # Unknown language pair — surface the service's message as the detail.
        raise HTTPException(status_code=404, detail=str(exc)) from exc
    candidates = [
        SenseCandidateResponse(id=s.id, gloss=s.gloss, topics=s.topics, tags=s.tags)
        for s in result.senses
    ]
    return FromTokenResponse(
        entry=_to_response(entry),
        sense_candidates=candidates,
        matched_via=result.matched_via,
    )
@router.get("", response_model=list[WordBankEntryResponse])
async def list_entries(
language_pair_id: str,