feat: Create the Dictionary Lookup Service; methods for finding
Some checks are pending
/ test (push) Waiting to run
Some checks are pending
/ test (push) Waiting to run
vocabulary and words
This commit is contained in:
parent
27f7a7c3f3
commit
aa4987981d
4 changed files with 275 additions and 0 deletions
108
api/app/domain/services/dictionary_lookup_service.py
Normal file
108
api/app/domain/services/dictionary_lookup_service.py
Normal file
|
|
@ -0,0 +1,108 @@
|
||||||
|
import uuid
|
||||||
|
from dataclasses import dataclass, field
|
||||||
|
|
||||||
|
from ..models.dictionary import Sense, Wordform
|
||||||
|
from ...outbound.postgres.repositories.dictionary_repository import DictionaryRepository
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class TokenLookupResult:
    """Outcome of resolving one spaCy token against the dictionary.

    Attributes:
        senses: Ranked candidate senses to disambiguate between.
        wordform_id: Set when the surface form matched a row in
            ``dictionary_wordform``, allowing the vocab bank entry to be
            pre-linked to the exact inflected form.
        matched_via: Lookup strategy that produced the result —
            ``"wordform"``, ``"lemma_pos"``, ``"lemma"``, or ``"none"``.
        matched_wordforms: Raw wordform rows that matched the surface form.
    """

    senses: list[Sense]
    wordform_id: str | None
    matched_via: str  # "wordform" | "lemma_pos" | "lemma" | "none"
    matched_wordforms: list[Wordform] = field(default_factory=list)
|
||||||
|
|
||||||
|
|
||||||
|
class DictionaryLookupService:
    """Map a spaCy token (surface form + UD POS + lemma) onto candidate
    dictionary senses via a three-stage fallback.

    Stage 1 — wordform table lookup (most precise):
        The inflected surface form is searched in ``dictionary_wordform``
        within the target language. "allons" → wordform row → lemma "aller".
        When exactly one lemma matches, ``wordform_id`` is pre-populated on
        the result.

    Stage 2 — lemma + POS fallback:
        If no wordform row exists, the spaCy-provided lemma string is tried
        against ``dictionary_lemma.headword`` filtered by ``pos_normalised``
        (UD tag), which trims false matches for homographs with a different
        part of speech.

    Stage 3 — lemma-only fallback:
        The POS filter is dropped as a last resort and all senses for the
        headword are returned regardless of POS.

    Usage::

        service = DictionaryLookupService(PostgresDictionaryRepository(db))

        result = await service.lookup_token(
            surface="allons",
            spacy_lemma="aller",
            pos_ud="VERB",
            language="fr",
        )
        # result.senses — candidate Sense rows for disambiguation
        # result.wordform_id — pre-resolved wordform UUID string, or None
        # result.matched_via — "wordform" | "lemma_pos" | "lemma" | "none"
    """

    def __init__(self, dict_repo: DictionaryRepository) -> None:
        self.dict_repo = dict_repo

    async def _resolve_wordform_match(self, wordforms: list[Wordform]) -> TokenLookupResult:
        """Build a stage-1 result from the wordform rows matching a surface form."""
        # Deduplicate lemma ids while preserving first-seen order.
        lemma_ids: list[str] = []
        for wf in wordforms:
            if wf.lemma_id not in lemma_ids:
                lemma_ids.append(wf.lemma_id)

        candidate_senses: list[Sense] = []
        for lemma_id in lemma_ids:
            candidate_senses += await self.dict_repo.get_senses_for_lemma(uuid.UUID(lemma_id))

        # Pre-assign wordform_id only when every match points at a single
        # lemma — with wordforms from several lemmas the ambiguity must be
        # resolved by the user and no single row can be picked confidently.
        unambiguous = len(lemma_ids) == 1
        return TokenLookupResult(
            senses=candidate_senses,
            wordform_id=wordforms[0].id if unambiguous else None,
            matched_via="wordform",
            matched_wordforms=wordforms,
        )

    async def lookup_token(
        self,
        surface: str,
        spacy_lemma: str,
        pos_ud: str,
        language: str,
    ) -> TokenLookupResult:
        """Resolve a spaCy token to candidate senses using the three-stage fallback.

        ``surface`` is the raw token text (e.g. ``"allons"``);
        ``spacy_lemma`` is spaCy's lemmatisation of the token (e.g. ``"aller"``);
        ``pos_ud`` is the Universal Dependencies POS tag (e.g. ``"VERB"``);
        ``language`` is the target language code (e.g. ``"fr"``).

        Returns a :class:`TokenLookupResult` with the candidate senses and,
        when the surface form was found in the wordform table, a
        ``wordform_id`` that can be stored on the vocab bank entry for
        precise inflection tracking.
        """
        # Stage 1: inflected surface form against the wordform table.
        wordforms = await self.dict_repo.get_wordforms_by_form(surface, language)
        if wordforms:
            return await self._resolve_wordform_match(wordforms)

        # Stage 2: spaCy lemma narrowed by the UD POS tag.
        pos_senses = await self.dict_repo.get_senses_for_headword_and_pos(
            spacy_lemma, language, pos_ud
        )
        if pos_senses:
            return TokenLookupResult(senses=pos_senses, wordform_id=None, matched_via="lemma_pos")

        # Stage 3: lemma alone, POS filter dropped.
        lemma_senses = await self.dict_repo.get_senses_for_headword(spacy_lemma, language)
        if lemma_senses:
            return TokenLookupResult(senses=lemma_senses, wordform_id=None, matched_via="lemma")

        return TokenLookupResult(senses=[], wordform_id=None, matched_via="none")
|
||||||
|
|
@ -1,5 +1,6 @@
|
||||||
import uuid
|
import uuid
|
||||||
|
|
||||||
|
from ..models.dictionary import Sense
|
||||||
from ..models.vocab import LearnableWordBankEntry
|
from ..models.vocab import LearnableWordBankEntry
|
||||||
from ...outbound.postgres.repositories.vocab_repository import VocabRepository
|
from ...outbound.postgres.repositories.vocab_repository import VocabRepository
|
||||||
from ...outbound.postgres.repositories.dictionary_repository import DictionaryRepository
|
from ...outbound.postgres.repositories.dictionary_repository import DictionaryRepository
|
||||||
|
|
@ -35,6 +36,7 @@ class VocabService:
|
||||||
language_pair_id: uuid.UUID,
|
language_pair_id: uuid.UUID,
|
||||||
pathway: str,
|
pathway: str,
|
||||||
is_phrase: bool = False,
|
is_phrase: bool = False,
|
||||||
|
wordform_id: uuid.UUID | None = None,
|
||||||
source_article_id: uuid.UUID | None = None,
|
source_article_id: uuid.UUID | None = None,
|
||||||
) -> LearnableWordBankEntry:
|
) -> LearnableWordBankEntry:
|
||||||
"""Add a word or phrase to the user's vocab bank, automatically linking it to a
|
"""Add a word or phrase to the user's vocab bank, automatically linking it to a
|
||||||
|
|
@ -108,6 +110,59 @@ class VocabService:
|
||||||
entry_pathway=pathway,
|
entry_pathway=pathway,
|
||||||
is_phrase=False,
|
is_phrase=False,
|
||||||
sense_id=sense_id,
|
sense_id=sense_id,
|
||||||
|
wordform_id=wordform_id,
|
||||||
|
source_article_id=source_article_id,
|
||||||
|
disambiguation_status=status,
|
||||||
|
)
|
||||||
|
|
||||||
|
async def add_token_to_bank(
    self,
    user_id: uuid.UUID,
    surface_text: str,
    language_pair_id: uuid.UUID,
    senses: list[Sense],
    wordform_id: uuid.UUID | None,
    source_article_id: uuid.UUID | None = None,
) -> LearnableWordBankEntry:
    """Add a token from the NLP pipeline to the vocab bank using pre-resolved
    lookup results, skipping the redundant dictionary query that
    ``add_word_to_bank`` would otherwise perform.

    ``senses`` and ``wordform_id`` come from :class:`DictionaryLookupService`
    and are stored directly on the bank entry. Auto-resolution still applies:
    exactly one sense means ``auto_resolved``; anything else means ``pending``.

    Raises ``ValueError`` when ``language_pair_id`` does not exist.

    Usage::

        result = await lookup_service.lookup_token("allons", "aller", "VERB", "fr")
        wf_id = uuid.UUID(result.wordform_id) if result.wordform_id else None
        entry = await vocab_service.add_token_to_bank(
            user_id=user_id,
            surface_text="allons",
            language_pair_id=pair_id,
            senses=result.senses,
            wordform_id=wf_id,
        )
        # entry.wordform_id == result.wordform_id (pre-linked to "allons" wordform)
    """
    pair = await self.vocab_repo.get_language_pair(language_pair_id)
    if pair is None:
        raise ValueError(f"Language pair {language_pair_id} not found")

    # Exactly one candidate sense → link it and mark auto-resolved;
    # zero or several → leave unlinked and pending user disambiguation.
    sense_id = uuid.UUID(senses[0].id) if len(senses) == 1 else None
    status = "auto_resolved" if sense_id is not None else "pending"

    return await self.vocab_repo.add_entry(
        user_id=user_id,
        language_pair_id=language_pair_id,
        surface_text=surface_text,
        entry_pathway="nlp_extraction",
        wordform_id=wordform_id,
        sense_id=sense_id,
        source_article_id=source_article_id,
        disambiguation_status=status,
    )
|
||||||
|
|
|
||||||
|
|
@ -14,9 +14,12 @@ from ....domain.models.dictionary import Lemma, Sense, Wordform
|
||||||
|
|
||||||
class DictionaryRepository(Protocol):
|
class DictionaryRepository(Protocol):
|
||||||
async def get_senses_for_headword(self, headword: str, language: str) -> list[Sense]: ...
|
async def get_senses_for_headword(self, headword: str, language: str) -> list[Sense]: ...
|
||||||
|
async def get_senses_for_headword_and_pos(self, headword: str, language: str, pos_normalised: str) -> list[Sense]: ...
|
||||||
|
async def get_senses_for_lemma(self, lemma_id: uuid.UUID) -> list[Sense]: ...
|
||||||
async def find_senses_by_english_gloss(self, text: str, target_lang: str) -> list[Sense]: ...
|
async def find_senses_by_english_gloss(self, text: str, target_lang: str) -> list[Sense]: ...
|
||||||
async def get_sense(self, sense_id: uuid.UUID) -> Sense | None: ...
|
async def get_sense(self, sense_id: uuid.UUID) -> Sense | None: ...
|
||||||
async def get_lemma(self, lemma_id: uuid.UUID) -> Lemma | None: ...
|
async def get_lemma(self, lemma_id: uuid.UUID) -> Lemma | None: ...
|
||||||
|
async def get_wordforms_by_form(self, form: str, language: str) -> list[Wordform]: ...
|
||||||
async def get_wordforms_for_lemma(self, lemma_id: uuid.UUID) -> list[Wordform]: ...
|
async def get_wordforms_for_lemma(self, lemma_id: uuid.UUID) -> list[Wordform]: ...
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -99,6 +102,40 @@ class PostgresDictionaryRepository:
|
||||||
entity = result.scalar_one_or_none()
|
entity = result.scalar_one_or_none()
|
||||||
return _lemma_to_model(entity) if entity else None
|
return _lemma_to_model(entity) if entity else None
|
||||||
|
|
||||||
|
async def get_senses_for_headword_and_pos(
    self, headword: str, language: str, pos_normalised: str
) -> list[Sense]:
    """Return senses for ``headword`` in ``language`` restricted to one
    normalised (UD) POS tag, ordered by sense index."""
    stmt = (
        select(DictionarySenseEntity)
        .join(DictionaryLemmaEntity, DictionarySenseEntity.lemma_id == DictionaryLemmaEntity.id)
        .where(
            DictionaryLemmaEntity.headword == headword,
            DictionaryLemmaEntity.language == language,
            DictionaryLemmaEntity.pos_normalised == pos_normalised,
        )
        .order_by(DictionarySenseEntity.sense_index)
    )
    rows = (await self.db.execute(stmt)).scalars().all()
    return [_sense_to_model(row) for row in rows]
|
||||||
|
|
||||||
|
async def get_senses_for_lemma(self, lemma_id: uuid.UUID) -> list[Sense]:
    """Return every sense belonging to ``lemma_id``, ordered by sense index."""
    stmt = (
        select(DictionarySenseEntity)
        .where(DictionarySenseEntity.lemma_id == lemma_id)
        .order_by(DictionarySenseEntity.sense_index)
    )
    rows = (await self.db.execute(stmt)).scalars().all()
    return [_sense_to_model(row) for row in rows]
|
||||||
|
|
||||||
|
async def get_wordforms_by_form(self, form: str, language: str) -> list[Wordform]:
    """Return wordform rows whose inflected ``form`` matches exactly, limited
    to lemmas of the given ``language``."""
    stmt = (
        select(DictionaryWordformEntity)
        .join(DictionaryLemmaEntity, DictionaryWordformEntity.lemma_id == DictionaryLemmaEntity.id)
        .where(
            DictionaryWordformEntity.form == form,
            DictionaryLemmaEntity.language == language,
        )
    )
    rows = (await self.db.execute(stmt)).scalars().all()
    return [_wordform_to_model(row) for row in rows]
|
||||||
|
|
||||||
async def get_wordforms_for_lemma(self, lemma_id: uuid.UUID) -> list[Wordform]:
|
async def get_wordforms_for_lemma(self, lemma_id: uuid.UUID) -> list[Wordform]:
|
||||||
result = await self.db.execute(
|
result = await self.db.execute(
|
||||||
select(DictionaryWordformEntity).where(
|
select(DictionaryWordformEntity).where(
|
||||||
|
|
|
||||||
|
|
@ -5,6 +5,7 @@ from pydantic import BaseModel
|
||||||
from sqlalchemy.ext.asyncio import AsyncSession
|
from sqlalchemy.ext.asyncio import AsyncSession
|
||||||
|
|
||||||
from ...auth import verify_token
|
from ...auth import verify_token
|
||||||
|
from ...domain.services.dictionary_lookup_service import DictionaryLookupService, TokenLookupResult
|
||||||
from ...domain.services.vocab_service import VocabService
|
from ...domain.services.vocab_service import VocabService
|
||||||
from ...outbound.postgres.database import get_db
|
from ...outbound.postgres.database import get_db
|
||||||
from ...outbound.postgres.repositories.dictionary_repository import PostgresDictionaryRepository
|
from ...outbound.postgres.repositories.dictionary_repository import PostgresDictionaryRepository
|
||||||
|
|
@ -21,6 +22,28 @@ class AddWordRequest(BaseModel):
|
||||||
source_article_id: str | None = None
|
source_article_id: str | None = None
|
||||||
|
|
||||||
|
|
||||||
|
class AddFromTokenRequest(BaseModel):
    """Request body for adding a spaCy token to the vocab bank via dictionary lookup."""

    # UUID (as string) of the user's language pair the entry belongs to.
    language_pair_id: str
    # Raw inflected token text, e.g. "allons".
    surface: str
    # spaCy's lemmatisation of the token, e.g. "aller".
    spacy_lemma: str
    # Universal Dependencies POS tag, e.g. "VERB".
    pos_ud: str
    # Target language code, e.g. "fr".
    language: str
    # Optional UUID (as string) of the article the token was encountered in.
    source_article_id: str | None = None
|
||||||
|
|
||||||
|
|
||||||
|
class SenseCandidateResponse(BaseModel):
    """One candidate dictionary sense offered to the client for disambiguation."""

    # Sense UUID as a string.
    id: str
    gloss: str
    topics: list[str]
    tags: list[str]
|
||||||
|
|
||||||
|
|
||||||
|
class FromTokenResponse(BaseModel):
    """Response for the from-token endpoint: the created entry plus lookup metadata."""

    # Forward reference — WordBankEntryResponse is defined elsewhere in this module.
    entry: "WordBankEntryResponse"
    # Candidate senses returned by the lookup; empty when nothing matched.
    sense_candidates: list[SenseCandidateResponse]
    # Which lookup strategy succeeded: "wordform" | "lemma_pos" | "lemma" | "none".
    matched_via: str
|
||||||
|
|
||||||
|
|
||||||
class SetSenseRequest(BaseModel):
|
class SetSenseRequest(BaseModel):
|
||||||
sense_id: str
|
sense_id: str
|
||||||
|
|
||||||
|
|
@ -80,6 +103,58 @@ async def add_word(
|
||||||
return _to_response(entry)
|
return _to_response(entry)
|
||||||
|
|
||||||
|
|
||||||
|
@router.post("/from-token", response_model=FromTokenResponse, status_code=201)
async def add_from_token(
    request: AddFromTokenRequest,
    db: AsyncSession = Depends(get_db),
    token_data: dict = Depends(verify_token),
) -> FromTokenResponse:
    """Resolve a spaCy token against the dictionary and add it to the vocab bank.

    Runs the three-stage :class:`DictionaryLookupService` lookup, stores the
    entry with any pre-resolved sense/wordform, and returns the candidate
    senses so the client can prompt the user for disambiguation when needed.

    Raises HTTP 400 for malformed UUIDs in the request and HTTP 404 when the
    language pair does not exist.
    """
    user_id = uuid.UUID(token_data["sub"])

    try:
        language_pair_id = uuid.UUID(request.language_pair_id)
    except ValueError as exc:
        # Chain with `from exc` so the original parse failure stays in the traceback.
        raise HTTPException(status_code=400, detail="Invalid language_pair_id") from exc

    source_article_id = None
    if request.source_article_id:
        try:
            source_article_id = uuid.UUID(request.source_article_id)
        except ValueError as exc:
            raise HTTPException(status_code=400, detail="Invalid source_article_id") from exc

    lookup_service = DictionaryLookupService(PostgresDictionaryRepository(db))
    result: TokenLookupResult = await lookup_service.lookup_token(
        surface=request.surface,
        spacy_lemma=request.spacy_lemma,
        pos_ud=request.pos_ud,
        language=request.language,
    )

    # Convert the lookup's string UUID (if any) for storage on the entry.
    wordform_id = uuid.UUID(result.wordform_id) if result.wordform_id else None

    try:
        entry = await _service(db).add_token_to_bank(
            user_id=user_id,
            surface_text=request.surface,
            language_pair_id=language_pair_id,
            senses=result.senses,
            wordform_id=wordform_id,
            source_article_id=source_article_id,
        )
    except ValueError as exc:
        # The service signals a missing language pair with ValueError.
        raise HTTPException(status_code=404, detail=str(exc)) from exc

    candidates = [
        SenseCandidateResponse(id=s.id, gloss=s.gloss, topics=s.topics, tags=s.tags)
        for s in result.senses
    ]
    return FromTokenResponse(
        entry=_to_response(entry),
        sense_candidates=candidates,
        matched_via=result.matched_via,
    )
||||||
|
|
||||||
|
|
||||||
@router.get("", response_model=list[WordBankEntryResponse])
|
@router.get("", response_model=list[WordBankEntryResponse])
|
||||||
async def list_entries(
|
async def list_entries(
|
||||||
language_pair_id: str,
|
language_pair_id: str,
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue