feat: [api] Create better "search" functionality for the dictionary

This commit is contained in:
wilson 2026-04-18 17:26:09 +01:00
parent fd96396c30
commit c9dd9d0b4c
3 changed files with 213 additions and 20 deletions

View file

@ -0,0 +1,23 @@
"""enable unaccent extension
Revision ID: 0015
Revises: 0014
Create Date: 2026-04-17
"""
from typing import Sequence, Union
from alembic import op
revision: str = "0015"
down_revision: Union[str, None] = "0014"
branch_labels: Union[str, Sequence[str], None] = None
depends_on: Union[str, Sequence[str], None] = None
def upgrade() -> None:
    # Enable Postgres's "unaccent" extension so the dictionary search can do
    # accent-insensitive matching (e.g. "chateau" matching "château").
    # IF NOT EXISTS keeps the migration idempotent on databases where the
    # extension was already installed by hand.
    op.execute("CREATE EXTENSION IF NOT EXISTS unaccent")
def downgrade() -> None:
    # Mirror of upgrade(): drop the extension; IF EXISTS makes the
    # downgrade safe to re-run.
    op.execute("DROP EXTENSION IF EXISTS unaccent")

View file

@ -1,25 +1,37 @@
import uuid import uuid
from dataclasses import dataclass
from typing import Protocol from typing import Protocol
from sqlalchemy import select from sqlalchemy import func, select
from sqlalchemy.ext.asyncio import AsyncSession from sqlalchemy.ext.asyncio import AsyncSession
from ....domain.models.dictionary import Lemma, Sense, Wordform
from ..entities.dictionary_entities import ( from ..entities.dictionary_entities import (
DictionaryLemmaEntity, DictionaryLemmaEntity,
DictionarySenseEntity, DictionarySenseEntity,
DictionaryWordformEntity, DictionaryWordformEntity,
) )
from ....domain.models.dictionary import Lemma, Sense, Wordform
class DictionaryRepository(Protocol): class DictionaryRepository(Protocol):
async def get_senses_for_headword(self, headword: str, language: str) -> list[Sense]: ... async def get_senses_for_headword(
async def get_senses_for_headword_and_pos(self, headword: str, language: str, pos_normalised: str) -> list[Sense]: ... self, headword: str, language: str
) -> list[Sense]: ...
async def get_senses_for_headword_and_pos(
self, headword: str, language: str, pos_normalised: str
) -> list[Sense]: ...
async def get_senses_for_lemma(self, lemma_id: uuid.UUID) -> list[Sense]: ... async def get_senses_for_lemma(self, lemma_id: uuid.UUID) -> list[Sense]: ...
async def find_senses_by_english_gloss(self, text: str, target_lang: str) -> list[Sense]: ... async def find_senses_by_english_gloss(
self, text: str, target_lang: str
) -> list[Sense]: ...
async def get_sense(self, sense_id: uuid.UUID) -> Sense | None: ... async def get_sense(self, sense_id: uuid.UUID) -> Sense | None: ...
async def get_lemma(self, lemma_id: uuid.UUID) -> Lemma | None: ... async def get_lemma(self, lemma_id: uuid.UUID) -> Lemma | None: ...
async def get_wordforms_by_form(self, form: str, language: str) -> list[Wordform]: ... async def get_wordforms_by_form(
self, form: str, language: str
) -> list[Wordform]: ...
async def search_wordforms_by_prefix(
self, prefix: str, language: str
) -> list[Wordform]: ...
async def get_wordforms_for_lemma(self, lemma_id: uuid.UUID) -> list[Wordform]: ... async def get_wordforms_for_lemma(self, lemma_id: uuid.UUID) -> list[Wordform]: ...
@ -59,10 +71,15 @@ class PostgresDictionaryRepository:
def __init__(self, db: AsyncSession) -> None: def __init__(self, db: AsyncSession) -> None:
self.db = db self.db = db
async def get_senses_for_headword(self, headword: str, language: str) -> list[Sense]: async def get_senses_for_headword(
self, headword: str, language: str
) -> list[Sense]:
result = await self.db.execute( result = await self.db.execute(
select(DictionarySenseEntity) select(DictionarySenseEntity)
.join(DictionaryLemmaEntity, DictionarySenseEntity.lemma_id == DictionaryLemmaEntity.id) .join(
DictionaryLemmaEntity,
DictionarySenseEntity.lemma_id == DictionaryLemmaEntity.id,
)
.where( .where(
DictionaryLemmaEntity.headword == headword, DictionaryLemmaEntity.headword == headword,
DictionaryLemmaEntity.language == language, DictionaryLemmaEntity.language == language,
@ -71,7 +88,9 @@ class PostgresDictionaryRepository:
) )
return [_sense_to_model(e) for e in result.scalars().all()] return [_sense_to_model(e) for e in result.scalars().all()]
async def find_senses_by_english_gloss(self, text: str, target_lang: str) -> list[Sense]: async def find_senses_by_english_gloss(
self, text: str, target_lang: str
) -> list[Sense]:
"""EN→target direction: find senses whose gloss matches the given English text. """EN→target direction: find senses whose gloss matches the given English text.
Uses a case-insensitive exact match on the gloss column, filtered to the Uses a case-insensitive exact match on the gloss column, filtered to the
@ -79,7 +98,10 @@ class PostgresDictionaryRepository:
""" """
result = await self.db.execute( result = await self.db.execute(
select(DictionarySenseEntity) select(DictionarySenseEntity)
.join(DictionaryLemmaEntity, DictionarySenseEntity.lemma_id == DictionaryLemmaEntity.id) .join(
DictionaryLemmaEntity,
DictionarySenseEntity.lemma_id == DictionaryLemmaEntity.id,
)
.where( .where(
DictionarySenseEntity.gloss.ilike(text), DictionarySenseEntity.gloss.ilike(text),
DictionaryLemmaEntity.language == target_lang, DictionaryLemmaEntity.language == target_lang,
@ -107,7 +129,10 @@ class PostgresDictionaryRepository:
) -> list[Sense]: ) -> list[Sense]:
result = await self.db.execute( result = await self.db.execute(
select(DictionarySenseEntity) select(DictionarySenseEntity)
.join(DictionaryLemmaEntity, DictionarySenseEntity.lemma_id == DictionaryLemmaEntity.id) .join(
DictionaryLemmaEntity,
DictionarySenseEntity.lemma_id == DictionaryLemmaEntity.id,
)
.where( .where(
DictionaryLemmaEntity.headword == headword, DictionaryLemmaEntity.headword == headword,
DictionaryLemmaEntity.language == language, DictionaryLemmaEntity.language == language,
@ -128,7 +153,10 @@ class PostgresDictionaryRepository:
async def get_wordforms_by_form(self, form: str, language: str) -> list[Wordform]: async def get_wordforms_by_form(self, form: str, language: str) -> list[Wordform]:
result = await self.db.execute( result = await self.db.execute(
select(DictionaryWordformEntity) select(DictionaryWordformEntity)
.join(DictionaryLemmaEntity, DictionaryWordformEntity.lemma_id == DictionaryLemmaEntity.id) .join(
DictionaryLemmaEntity,
DictionaryWordformEntity.lemma_id == DictionaryLemmaEntity.id,
)
.where( .where(
DictionaryWordformEntity.form == form, DictionaryWordformEntity.form == form,
DictionaryLemmaEntity.language == language, DictionaryLemmaEntity.language == language,
@ -136,6 +164,47 @@ class PostgresDictionaryRepository:
) )
return [_wordform_to_model(e) for e in result.scalars().all()] return [_wordform_to_model(e) for e in result.scalars().all()]
async def search_wordforms_by_prefix(
    self, prefix: str, language: str
) -> list[Wordform]:
    """Accent- and case-insensitive prefix search over wordform surface text.

    Both the stored form and the user's prefix are passed through Postgres's
    ``unaccent`` before the ILIKE comparison, so "chatea" matches "château".

    Args:
        prefix: Surface-text prefix typed by the user. LIKE metacharacters
            in it are escaped so e.g. a literal "%" does not match everything.
        language: Language code used to filter the owning lemmas.

    Returns:
        All wordforms in ``language`` whose unaccented form starts with the
        unaccented prefix.
    """
    # Escape LIKE metacharacters so user input is matched literally;
    # "\" is declared as the escape character on ilike() below.
    escaped = (
        prefix.replace("\\", "\\\\").replace("%", "\\%").replace("_", "\\_")
    )
    result = await self.db.execute(
        select(DictionaryWordformEntity)
        .join(
            DictionaryLemmaEntity,
            DictionaryWordformEntity.lemma_id == DictionaryLemmaEntity.id,
        )
        .where(
            func.unaccent(DictionaryWordformEntity.form).ilike(
                func.unaccent(escaped) + "%", escape="\\"
            ),
            DictionaryLemmaEntity.language == language,
        )
    )
    return [_wordform_to_model(e) for e in result.scalars().all()]
async def search_senses_by_prefix(
    self, prefix: str, lang: str
) -> list[tuple[Sense, Lemma]]:
    """Find senses whose English gloss starts with *prefix*, case-insensitively.

    The original implementation used ``gloss.ilike(prefix)`` with no trailing
    wildcard, which is an exact match and contradicts both this method's name
    and the prefix semantics of ``search_wordforms_by_prefix``; a "%" suffix
    is now appended (with LIKE metacharacters in the user input escaped).

    Args:
        prefix: Gloss prefix typed by the user.
        lang: Language code of the target-language lemmas to search within.

    Returns:
        One ``(Sense, Lemma)`` pair per matching sense row.
    """
    # Escape LIKE metacharacters so user input is matched literally;
    # "\" is declared as the escape character on ilike() below.
    escaped = (
        prefix.replace("\\", "\\\\").replace("%", "\\%").replace("_", "\\_")
    )
    result = await self.db.execute(
        select(DictionarySenseEntity, DictionaryLemmaEntity)
        .join(
            DictionaryLemmaEntity,
            DictionarySenseEntity.lemma_id == DictionaryLemmaEntity.id,
        )
        .where(
            DictionarySenseEntity.gloss.ilike(escaped + "%", escape="\\"),
            DictionaryLemmaEntity.language == lang,
        )
    )
    results: list[tuple[Sense, Lemma]] = []
    for row in result.all():
        sense, lemma = row.tuple()
        results.append((_sense_to_model(sense), _lemma_to_model(lemma)))
    return results
async def get_wordforms_for_lemma(self, lemma_id: uuid.UUID) -> list[Wordform]: async def get_wordforms_for_lemma(self, lemma_id: uuid.UUID) -> list[Wordform]:
result = await self.db.execute( result = await self.db.execute(
select(DictionaryWordformEntity).where( select(DictionaryWordformEntity).where(

View file

@ -4,15 +4,20 @@ from fastapi import APIRouter, Depends, HTTPException
from pydantic import BaseModel from pydantic import BaseModel
from sqlalchemy.ext.asyncio import AsyncSession from sqlalchemy.ext.asyncio import AsyncSession
from app.domain.models.dictionary import Lemma, Sense
from ...auth import verify_token from ...auth import verify_token
from ...outbound.postgres.database import get_db from ...outbound.postgres.database import get_db
from ...outbound.postgres.repositories.dictionary_repository import PostgresDictionaryRepository from ...outbound.postgres.repositories.dictionary_repository import (
PostgresDictionaryRepository,
)
router = APIRouter(prefix="/dictionary", tags=["dictionary"]) router = APIRouter(prefix="/dictionary", tags=["dictionary"])
# ── Response models ─────────────────────────────────────────────────────────── # ── Response models ───────────────────────────────────────────────────────────
class SenseResponse(BaseModel): class SenseResponse(BaseModel):
id: str id: str
sense_index: int sense_index: int
@ -31,33 +36,61 @@ class LemmaResponse(BaseModel):
tags: list[str] tags: list[str]
def _sense_to_response(s: Sense) -> SenseResponse:
    """Map a domain Sense onto its API response model."""
    fields = {
        "id": s.id,
        "sense_index": s.sense_index,
        "gloss": s.gloss,
        "topics": s.topics,
        "tags": s.tags,
    }
    return SenseResponse(**fields)
def _lemma_to_response(lemma: Lemma) -> LemmaResponse:
    """Map a domain Lemma onto its API response model."""
    fields = {
        "id": lemma.id,
        "headword": lemma.headword,
        "language": lemma.language,
        "pos_raw": lemma.pos_raw,
        "pos_normalised": lemma.pos_normalised,
        "gender": lemma.gender,
        "tags": lemma.tags,
    }
    return LemmaResponse(**fields)
class WordformMatch(BaseModel): class WordformMatch(BaseModel):
lemma: LemmaResponse lemma: LemmaResponse
senses: list[SenseResponse] senses: list[SenseResponse]
class SenseMatch(BaseModel):
    """One gloss-search hit: the matching sense plus the lemma it belongs to."""

    # sense: the sense whose gloss matched the query text
    # lemma: the owning headword entry, for display alongside the sense
    sense: SenseResponse
    lemma: LemmaResponse
# ── Endpoint ────────────────────────────────────────────────────────────────── # ── Endpoint ──────────────────────────────────────────────────────────────────
@router.get("/wordforms", response_model=list[WordformMatch])
async def search_wordforms( @router.get("/search", response_model=list[WordformMatch])
async def search_wordforms_prefix(
lang_code: str, lang_code: str,
text: str, text: str,
db: AsyncSession = Depends(get_db), db: AsyncSession = Depends(get_db),
_: dict = Depends(verify_token), _: dict = Depends(verify_token),
) -> list[WordformMatch]: ) -> list[WordformMatch]:
""" """
Search for a wordform by surface text within a language. Search for wordforms whose surface text starts with the given prefix.
Returns one entry per matching lemma, each with the lemma's senses. A single Uses accent-insensitive, case-insensitive prefix matching so that e.g.
form (e.g. "allons") may resolve to more than one lemma when homographs exist. "chatea" returns both "château" and "châteaux", and "lent" returns all
four forms of the adjective. Returns one entry per matching lemma.
""" """
repo = PostgresDictionaryRepository(db) repo = PostgresDictionaryRepository(db)
wordforms = await repo.get_wordforms_by_form(text, lang_code) wordforms = await repo.search_wordforms_by_prefix(text, lang_code)
if not wordforms: if not wordforms:
return [] return []
# Deduplicate lemma IDs — multiple wordform rows may point to the same lemma
seen_lemma_ids: set[str] = set() seen_lemma_ids: set[str] = set()
results: list[WordformMatch] = [] results: list[WordformMatch] = []
@ -97,3 +130,71 @@ async def search_wordforms(
) )
return results return results
@router.get("/senses", response_model=list[SenseMatch])
async def search_senses(
    lang_code: str,
    text: str,
    db: AsyncSession = Depends(get_db),
    _: dict = Depends(verify_token),
) -> list[SenseMatch]:
    """
    Search senses by their English gloss (definition text).

    Returns one entry per matching sense, each paired with the lemma it
    belongs to, restricted to lemmas in the target language ``lang_code``.
    """
    repo = PostgresDictionaryRepository(db)
    matches = await repo.search_senses_by_prefix(text, lang_code)
    if not matches:
        return []
    # Repository rows come back as (sense, lemma) pairs; convert both halves
    # to their response models.
    return [
        SenseMatch(lemma=_lemma_to_response(lemma), sense=_sense_to_response(sense))
        for (sense, lemma) in matches
    ]
@router.get("/wordforms", response_model=list[WordformMatch])
async def search_wordforms(
    lang_code: str,
    text: str,
    db: AsyncSession = Depends(get_db),
    _: dict = Depends(verify_token),
) -> list[WordformMatch]:
    """
    Search for a wordform by surface text within a language.

    Returns one entry per matching lemma, each with the lemma's senses. A single
    form (e.g. "allons") may resolve to more than one lemma when homographs exist.
    """
    repo = PostgresDictionaryRepository(db)
    wordforms = await repo.get_wordforms_by_form(text, lang_code)

    matches: list[WordformMatch] = []
    handled_lemmas: set[str] = set()  # several wordform rows may share a lemma
    for wordform in wordforms:
        lemma_id = wordform.lemma_id
        if lemma_id in handled_lemmas:
            continue
        handled_lemmas.add(lemma_id)
        lemma = await repo.get_lemma(uuid.UUID(lemma_id))
        if lemma is None:
            # Orphaned wordform row — skip it rather than fail the request.
            continue
        senses = await repo.get_senses_for_lemma(uuid.UUID(lemma_id))
        matches.append(
            WordformMatch(
                lemma=_lemma_to_response(lemma),
                senses=[_sense_to_response(s) for s in senses],
            )
        )
    return matches