feat: [api] Create better "search" functionality for the dictionary

This commit is contained in:
wilson 2026-04-18 17:26:09 +01:00
parent fd96396c30
commit c9dd9d0b4c
3 changed files with 213 additions and 20 deletions

View file

@ -0,0 +1,23 @@
"""enable unaccent extension
Revision ID: 0015
Revises: 0014
Create Date: 2026-04-17
"""
from typing import Sequence, Union
from alembic import op
revision: str = "0015"
down_revision: Union[str, None] = "0014"
branch_labels: Union[str, Sequence[str], None] = None
depends_on: Union[str, Sequence[str], None] = None
def upgrade() -> None:
    """Install PostgreSQL's ``unaccent`` extension.

    Needed by the dictionary search queries, which call
    ``func.unaccent(...)`` for accent-insensitive prefix matching.
    IF NOT EXISTS keeps the migration idempotent on databases where the
    extension was already created manually.
    """
    op.execute("CREATE EXTENSION IF NOT EXISTS unaccent")
def downgrade() -> None:
    """Remove the ``unaccent`` extension.

    IF EXISTS makes the downgrade safe to run even if the extension was
    dropped out-of-band. NOTE(review): dropping the extension will break
    any other database objects that reference unaccent() — confirm none
    exist outside this application before downgrading in production.
    """
    op.execute("DROP EXTENSION IF EXISTS unaccent")

View file

@ -1,25 +1,37 @@
import uuid
from dataclasses import dataclass
from typing import Protocol
from sqlalchemy import select
from sqlalchemy import func, select
from sqlalchemy.ext.asyncio import AsyncSession
from ....domain.models.dictionary import Lemma, Sense, Wordform
from ..entities.dictionary_entities import (
DictionaryLemmaEntity,
DictionarySenseEntity,
DictionaryWordformEntity,
)
from ....domain.models.dictionary import Lemma, Sense, Wordform
class DictionaryRepository(Protocol):
    """Structural interface for read access to dictionary data.

    Implementations (e.g. ``PostgresDictionaryRepository`` below) provide
    lookups over lemmas, senses and wordforms. All methods are async and
    return domain models, never persistence entities.

    Fix: removed the stale pre-reformat duplicate signatures left by the
    diff, and added ``search_senses_by_prefix``, which the concrete
    repository implements and the ``/senses`` endpoint relies on but which
    was missing from this Protocol.
    """

    async def get_senses_for_headword(
        self, headword: str, language: str
    ) -> list[Sense]: ...

    async def get_senses_for_headword_and_pos(
        self, headword: str, language: str, pos_normalised: str
    ) -> list[Sense]: ...

    async def get_senses_for_lemma(self, lemma_id: uuid.UUID) -> list[Sense]: ...

    async def find_senses_by_english_gloss(
        self, text: str, target_lang: str
    ) -> list[Sense]: ...

    async def get_sense(self, sense_id: uuid.UUID) -> Sense | None: ...

    async def get_lemma(self, lemma_id: uuid.UUID) -> Lemma | None: ...

    async def get_wordforms_by_form(
        self, form: str, language: str
    ) -> list[Wordform]: ...

    async def search_wordforms_by_prefix(
        self, prefix: str, language: str
    ) -> list[Wordform]: ...

    async def search_senses_by_prefix(
        self, prefix: str, lang: str
    ) -> list[tuple[Sense, Lemma]]: ...

    async def get_wordforms_for_lemma(self, lemma_id: uuid.UUID) -> list[Wordform]: ...
@ -59,10 +71,15 @@ class PostgresDictionaryRepository:
def __init__(self, db: AsyncSession) -> None:
self.db = db
async def get_senses_for_headword(self, headword: str, language: str) -> list[Sense]:
async def get_senses_for_headword(
self, headword: str, language: str
) -> list[Sense]:
result = await self.db.execute(
select(DictionarySenseEntity)
.join(DictionaryLemmaEntity, DictionarySenseEntity.lemma_id == DictionaryLemmaEntity.id)
.join(
DictionaryLemmaEntity,
DictionarySenseEntity.lemma_id == DictionaryLemmaEntity.id,
)
.where(
DictionaryLemmaEntity.headword == headword,
DictionaryLemmaEntity.language == language,
@ -71,7 +88,9 @@ class PostgresDictionaryRepository:
)
return [_sense_to_model(e) for e in result.scalars().all()]
async def find_senses_by_english_gloss(self, text: str, target_lang: str) -> list[Sense]:
async def find_senses_by_english_gloss(
self, text: str, target_lang: str
) -> list[Sense]:
"""EN→target direction: find senses whose gloss matches the given English text.
Uses a case-insensitive exact match on the gloss column, filtered to the
@ -79,7 +98,10 @@ class PostgresDictionaryRepository:
"""
result = await self.db.execute(
select(DictionarySenseEntity)
.join(DictionaryLemmaEntity, DictionarySenseEntity.lemma_id == DictionaryLemmaEntity.id)
.join(
DictionaryLemmaEntity,
DictionarySenseEntity.lemma_id == DictionaryLemmaEntity.id,
)
.where(
DictionarySenseEntity.gloss.ilike(text),
DictionaryLemmaEntity.language == target_lang,
@ -107,7 +129,10 @@ class PostgresDictionaryRepository:
) -> list[Sense]:
result = await self.db.execute(
select(DictionarySenseEntity)
.join(DictionaryLemmaEntity, DictionarySenseEntity.lemma_id == DictionaryLemmaEntity.id)
.join(
DictionaryLemmaEntity,
DictionarySenseEntity.lemma_id == DictionaryLemmaEntity.id,
)
.where(
DictionaryLemmaEntity.headword == headword,
DictionaryLemmaEntity.language == language,
@ -128,7 +153,10 @@ class PostgresDictionaryRepository:
async def get_wordforms_by_form(self, form: str, language: str) -> list[Wordform]:
result = await self.db.execute(
select(DictionaryWordformEntity)
.join(DictionaryLemmaEntity, DictionaryWordformEntity.lemma_id == DictionaryLemmaEntity.id)
.join(
DictionaryLemmaEntity,
DictionaryWordformEntity.lemma_id == DictionaryLemmaEntity.id,
)
.where(
DictionaryWordformEntity.form == form,
DictionaryLemmaEntity.language == language,
@ -136,6 +164,47 @@ class PostgresDictionaryRepository:
)
return [_wordform_to_model(e) for e in result.scalars().all()]
async def search_wordforms_by_prefix(
    self, prefix: str, language: str
) -> list[Wordform]:
    """Return wordforms in *language* whose surface form starts with *prefix*.

    Matching is case-insensitive (ILIKE) and accent-insensitive — unaccent
    is applied to both the stored form and the search prefix, so e.g.
    "chatea" matches "château".
    """
    # Hoist the search pattern: unaccent the caller's prefix once and
    # append the SQL LIKE wildcard.
    pattern = func.unaccent(prefix) + "%"
    query = (
        select(DictionaryWordformEntity)
        .join(
            DictionaryLemmaEntity,
            DictionaryWordformEntity.lemma_id == DictionaryLemmaEntity.id,
        )
        .where(
            func.unaccent(DictionaryWordformEntity.form).ilike(pattern),
            DictionaryLemmaEntity.language == language,
        )
    )
    rows = await self.db.execute(query)
    return [_wordform_to_model(entity) for entity in rows.scalars().all()]
async def search_senses_by_prefix(
    self, prefix: str, lang: str
) -> list[tuple[Sense, Lemma]]:
    """Find senses whose gloss starts with *prefix*, case-insensitively.

    Returns ``(sense, lemma)`` pairs restricted to lemmas in *lang*.

    Fix: the gloss filter previously used ``ilike(prefix)`` with no
    trailing wildcard — an exact match, contradicting the method name and
    the prefix semantics of ``search_wordforms_by_prefix``. A trailing
    ``%`` makes it a true prefix match.
    NOTE(review): LIKE metacharacters (``%``/``_``) inside *prefix* are
    not escaped — confirm whether callers can pass untrusted patterns.
    """
    result = await self.db.execute(
        select(DictionarySenseEntity, DictionaryLemmaEntity)
        .join(
            DictionaryLemmaEntity,
            DictionarySenseEntity.lemma_id == DictionaryLemmaEntity.id,
        )
        .where(
            # Trailing "%" turns the case-insensitive match into a prefix match.
            DictionarySenseEntity.gloss.ilike(prefix + "%"),
            DictionaryLemmaEntity.language == lang,
        )
    )
    results: list[tuple[Sense, Lemma]] = []
    for row in result.all():
        sense_entity, lemma_entity = row.tuple()
        results.append((_sense_to_model(sense_entity), _lemma_to_model(lemma_entity)))
    return results
async def get_wordforms_for_lemma(self, lemma_id: uuid.UUID) -> list[Wordform]:
result = await self.db.execute(
select(DictionaryWordformEntity).where(

View file

@ -4,15 +4,20 @@ from fastapi import APIRouter, Depends, HTTPException
from pydantic import BaseModel
from sqlalchemy.ext.asyncio import AsyncSession
from app.domain.models.dictionary import Lemma, Sense
from ...auth import verify_token
from ...outbound.postgres.database import get_db
from ...outbound.postgres.repositories.dictionary_repository import PostgresDictionaryRepository
from ...outbound.postgres.repositories.dictionary_repository import (
PostgresDictionaryRepository,
)
router = APIRouter(prefix="/dictionary", tags=["dictionary"])
# ── Response models ───────────────────────────────────────────────────────────
class SenseResponse(BaseModel):
id: str
sense_index: int
@ -31,33 +36,61 @@ class LemmaResponse(BaseModel):
tags: list[str]
def _sense_to_response(s: Sense) -> SenseResponse:
    """Map a domain Sense onto its API response model, field by field."""
    payload = {
        "id": s.id,
        "sense_index": s.sense_index,
        "gloss": s.gloss,
        "topics": s.topics,
        "tags": s.tags,
    }
    return SenseResponse(**payload)
def _lemma_to_response(lemma: Lemma) -> LemmaResponse:
    """Map a domain Lemma onto its API response model, field by field."""
    payload = {
        "id": lemma.id,
        "headword": lemma.headword,
        "language": lemma.language,
        "pos_raw": lemma.pos_raw,
        "pos_normalised": lemma.pos_normalised,
        "gender": lemma.gender,
        "tags": lemma.tags,
    }
    return LemmaResponse(**payload)
class WordformMatch(BaseModel):
    """One matched lemma together with all of its senses.

    Returned by the wordform search endpoints; a single surface form may
    produce several of these when homographs map to distinct lemmas.
    """

    lemma: LemmaResponse
    senses: list[SenseResponse]
class SenseMatch(BaseModel):
    """One matched sense paired with the lemma it belongs to.

    Returned by the gloss (definition) search endpoint.
    """

    sense: SenseResponse
    lemma: LemmaResponse
# ── Endpoint ──────────────────────────────────────────────────────────────────
@router.get("/wordforms", response_model=list[WordformMatch])
async def search_wordforms(
@router.get("/search", response_model=list[WordformMatch])
async def search_wordforms_prefix(
lang_code: str,
text: str,
db: AsyncSession = Depends(get_db),
_: dict = Depends(verify_token),
) -> list[WordformMatch]:
"""
Search for a wordform by surface text within a language.
Search for wordforms whose surface text starts with the given prefix.
Returns one entry per matching lemma, each with the lemma's senses. A single
form (e.g. "allons") may resolve to more than one lemma when homographs exist.
Uses accent-insensitive, case-insensitive prefix matching so that e.g.
"chatea" returns both "château" and "châteaux", and "lent" returns all
four forms of the adjective. Returns one entry per matching lemma.
"""
repo = PostgresDictionaryRepository(db)
wordforms = await repo.get_wordforms_by_form(text, lang_code)
wordforms = await repo.search_wordforms_by_prefix(text, lang_code)
if not wordforms:
return []
# Deduplicate lemma IDs — multiple wordform rows may point to the same lemma
seen_lemma_ids: set[str] = set()
results: list[WordformMatch] = []
@ -97,3 +130,71 @@ async def search_wordforms(
)
return results
@router.get("/senses", response_model=list[SenseMatch])
async def search_senses(
    lang_code: str,
    text: str,
    db: AsyncSession = Depends(get_db),
    _: dict = Depends(verify_token),
) -> list[SenseMatch]:
    """
    Search for senses by their (English) definition text.

    Returns one entry per matching sense, each paired with its lemma.
    """
    repo = PostgresDictionaryRepository(db)
    senses = await repo.search_senses_by_prefix(text, lang_code)
    # No explicit empty-check needed: a comprehension over an empty
    # result list already yields [].
    return [
        SenseMatch(lemma=_lemma_to_response(lemma), sense=_sense_to_response(sense))
        for (sense, lemma) in senses
    ]
@router.get("/wordforms", response_model=list[WordformMatch])
async def search_wordforms(
    lang_code: str,
    text: str,
    db: AsyncSession = Depends(get_db),
    _: dict = Depends(verify_token),
) -> list[WordformMatch]:
    """
    Search for a wordform by surface text within a language.
    Returns one entry per matching lemma, each with the lemma's senses. A single
    form (e.g. "allons") may resolve to more than one lemma when homographs exist.
    """
    repo = PostgresDictionaryRepository(db)
    wordforms = await repo.get_wordforms_by_form(text, lang_code)
    if not wordforms:
        return []
    # Several wordform rows can share one lemma; emit each lemma only once.
    seen_lemma_ids: set[str] = set()
    matches: list[WordformMatch] = []
    for wordform in wordforms:
        lemma_key = wordform.lemma_id
        if lemma_key in seen_lemma_ids:
            continue
        seen_lemma_ids.add(lemma_key)
        lemma_uuid = uuid.UUID(lemma_key)
        lemma = await repo.get_lemma(lemma_uuid)
        if lemma is None:
            # Dangling lemma reference — drop the wordform rather than 500.
            continue
        senses = await repo.get_senses_for_lemma(lemma_uuid)
        matches.append(
            WordformMatch(
                lemma=_lemma_to_response(lemma),
                senses=[_sense_to_response(s) for s in senses],
            )
        )
    return matches