feat: [api] Create better "search" functionality for the dictionary

This commit is contained in:
wilson 2026-04-18 17:26:09 +01:00
parent fd96396c30
commit c9dd9d0b4c
3 changed files with 213 additions and 20 deletions

View file

@ -0,0 +1,23 @@
"""enable unaccent extension
Revision ID: 0015
Revises: 0014
Create Date: 2026-04-17
"""
from typing import Sequence, Union
from alembic import op
revision: str = "0015"
down_revision: Union[str, None] = "0014"
branch_labels: Union[str, Sequence[str], None] = None
depends_on: Union[str, Sequence[str], None] = None
def upgrade() -> None:
    # Enable Postgres's "unaccent" extension so the dictionary search can do
    # accent-insensitive matching (e.g. "chateau" matching "château").
    # IF NOT EXISTS keeps the migration idempotent on databases where the
    # extension was already installed by hand.
    op.execute("CREATE EXTENSION IF NOT EXISTS unaccent")
def downgrade() -> None:
    # Mirror of upgrade(): drop the extension; IF EXISTS makes the
    # downgrade safe to re-run.
    op.execute("DROP EXTENSION IF EXISTS unaccent")

View file

@ -1,25 +1,37 @@
import uuid import uuid
from dataclasses import dataclass
from typing import Protocol from typing import Protocol
from sqlalchemy import select from sqlalchemy import func, select
from sqlalchemy.ext.asyncio import AsyncSession from sqlalchemy.ext.asyncio import AsyncSession
from ....domain.models.dictionary import Lemma, Sense, Wordform
from ..entities.dictionary_entities import ( from ..entities.dictionary_entities import (
DictionaryLemmaEntity, DictionaryLemmaEntity,
DictionarySenseEntity, DictionarySenseEntity,
DictionaryWordformEntity, DictionaryWordformEntity,
) )
from ....domain.models.dictionary import Lemma, Sense, Wordform
class DictionaryRepository(Protocol): class DictionaryRepository(Protocol):
async def get_senses_for_headword(self, headword: str, language: str) -> list[Sense]: ... async def get_senses_for_headword(
async def get_senses_for_headword_and_pos(self, headword: str, language: str, pos_normalised: str) -> list[Sense]: ... self, headword: str, language: str
) -> list[Sense]: ...
async def get_senses_for_headword_and_pos(
self, headword: str, language: str, pos_normalised: str
) -> list[Sense]: ...
async def get_senses_for_lemma(self, lemma_id: uuid.UUID) -> list[Sense]: ... async def get_senses_for_lemma(self, lemma_id: uuid.UUID) -> list[Sense]: ...
async def find_senses_by_english_gloss(self, text: str, target_lang: str) -> list[Sense]: ... async def find_senses_by_english_gloss(
self, text: str, target_lang: str
) -> list[Sense]: ...
async def get_sense(self, sense_id: uuid.UUID) -> Sense | None: ... async def get_sense(self, sense_id: uuid.UUID) -> Sense | None: ...
async def get_lemma(self, lemma_id: uuid.UUID) -> Lemma | None: ... async def get_lemma(self, lemma_id: uuid.UUID) -> Lemma | None: ...
async def get_wordforms_by_form(self, form: str, language: str) -> list[Wordform]: ... async def get_wordforms_by_form(
self, form: str, language: str
) -> list[Wordform]: ...
async def search_wordforms_by_prefix(
self, prefix: str, language: str
) -> list[Wordform]: ...
async def get_wordforms_for_lemma(self, lemma_id: uuid.UUID) -> list[Wordform]: ... async def get_wordforms_for_lemma(self, lemma_id: uuid.UUID) -> list[Wordform]: ...
@ -59,10 +71,15 @@ class PostgresDictionaryRepository:
def __init__(self, db: AsyncSession) -> None: def __init__(self, db: AsyncSession) -> None:
self.db = db self.db = db
async def get_senses_for_headword(self, headword: str, language: str) -> list[Sense]: async def get_senses_for_headword(
self, headword: str, language: str
) -> list[Sense]:
result = await self.db.execute( result = await self.db.execute(
select(DictionarySenseEntity) select(DictionarySenseEntity)
.join(DictionaryLemmaEntity, DictionarySenseEntity.lemma_id == DictionaryLemmaEntity.id) .join(
DictionaryLemmaEntity,
DictionarySenseEntity.lemma_id == DictionaryLemmaEntity.id,
)
.where( .where(
DictionaryLemmaEntity.headword == headword, DictionaryLemmaEntity.headword == headword,
DictionaryLemmaEntity.language == language, DictionaryLemmaEntity.language == language,
@ -71,7 +88,9 @@ class PostgresDictionaryRepository:
) )
return [_sense_to_model(e) for e in result.scalars().all()] return [_sense_to_model(e) for e in result.scalars().all()]
async def find_senses_by_english_gloss(self, text: str, target_lang: str) -> list[Sense]: async def find_senses_by_english_gloss(
self, text: str, target_lang: str
) -> list[Sense]:
"""EN→target direction: find senses whose gloss matches the given English text. """EN→target direction: find senses whose gloss matches the given English text.
Uses a case-insensitive exact match on the gloss column, filtered to the Uses a case-insensitive exact match on the gloss column, filtered to the
@ -79,7 +98,10 @@ class PostgresDictionaryRepository:
""" """
result = await self.db.execute( result = await self.db.execute(
select(DictionarySenseEntity) select(DictionarySenseEntity)
.join(DictionaryLemmaEntity, DictionarySenseEntity.lemma_id == DictionaryLemmaEntity.id) .join(
DictionaryLemmaEntity,
DictionarySenseEntity.lemma_id == DictionaryLemmaEntity.id,
)
.where( .where(
DictionarySenseEntity.gloss.ilike(text), DictionarySenseEntity.gloss.ilike(text),
DictionaryLemmaEntity.language == target_lang, DictionaryLemmaEntity.language == target_lang,
@ -107,7 +129,10 @@ class PostgresDictionaryRepository:
) -> list[Sense]: ) -> list[Sense]:
result = await self.db.execute( result = await self.db.execute(
select(DictionarySenseEntity) select(DictionarySenseEntity)
.join(DictionaryLemmaEntity, DictionarySenseEntity.lemma_id == DictionaryLemmaEntity.id) .join(
DictionaryLemmaEntity,
DictionarySenseEntity.lemma_id == DictionaryLemmaEntity.id,
)
.where( .where(
DictionaryLemmaEntity.headword == headword, DictionaryLemmaEntity.headword == headword,
DictionaryLemmaEntity.language == language, DictionaryLemmaEntity.language == language,
@ -128,7 +153,10 @@ class PostgresDictionaryRepository:
async def get_wordforms_by_form(self, form: str, language: str) -> list[Wordform]: async def get_wordforms_by_form(self, form: str, language: str) -> list[Wordform]:
result = await self.db.execute( result = await self.db.execute(
select(DictionaryWordformEntity) select(DictionaryWordformEntity)
.join(DictionaryLemmaEntity, DictionaryWordformEntity.lemma_id == DictionaryLemmaEntity.id) .join(
DictionaryLemmaEntity,
DictionaryWordformEntity.lemma_id == DictionaryLemmaEntity.id,
)
.where( .where(
DictionaryWordformEntity.form == form, DictionaryWordformEntity.form == form,
DictionaryLemmaEntity.language == language, DictionaryLemmaEntity.language == language,
@ -136,6 +164,47 @@ class PostgresDictionaryRepository:
) )
return [_wordform_to_model(e) for e in result.scalars().all()] return [_wordform_to_model(e) for e in result.scalars().all()]
async def search_wordforms_by_prefix(
    self, prefix: str, language: str
) -> list[Wordform]:
    """Accent- and case-insensitive prefix search over wordform surface text.

    Both the stored form and the user's prefix are passed through Postgres's
    ``unaccent`` before the ILIKE comparison, so "chatea" matches "château".

    Args:
        prefix: Surface-text prefix typed by the user. LIKE metacharacters
            in it are escaped so e.g. a literal "%" does not match everything.
        language: Language code used to filter the owning lemmas.

    Returns:
        All wordforms in ``language`` whose unaccented form starts with the
        unaccented prefix.
    """
    # Escape LIKE metacharacters so user input is matched literally;
    # "\" is declared as the escape character on ilike() below.
    escaped = (
        prefix.replace("\\", "\\\\").replace("%", "\\%").replace("_", "\\_")
    )
    result = await self.db.execute(
        select(DictionaryWordformEntity)
        .join(
            DictionaryLemmaEntity,
            DictionaryWordformEntity.lemma_id == DictionaryLemmaEntity.id,
        )
        .where(
            func.unaccent(DictionaryWordformEntity.form).ilike(
                func.unaccent(escaped) + "%", escape="\\"
            ),
            DictionaryLemmaEntity.language == language,
        )
    )
    return [_wordform_to_model(e) for e in result.scalars().all()]
async def search_senses_by_prefix(
    self, prefix: str, lang: str
) -> list[tuple[Sense, Lemma]]:
    """Find senses whose English gloss starts with *prefix*, case-insensitively.

    The original implementation used ``gloss.ilike(prefix)`` with no trailing
    wildcard, which is an exact match and contradicts both this method's name
    and the prefix semantics of ``search_wordforms_by_prefix``; a "%" suffix
    is now appended (with LIKE metacharacters in the user input escaped).

    Args:
        prefix: Gloss prefix typed by the user.
        lang: Language code of the target-language lemmas to search within.

    Returns:
        One ``(Sense, Lemma)`` pair per matching sense row.
    """
    # Escape LIKE metacharacters so user input is matched literally;
    # "\" is declared as the escape character on ilike() below.
    escaped = (
        prefix.replace("\\", "\\\\").replace("%", "\\%").replace("_", "\\_")
    )
    result = await self.db.execute(
        select(DictionarySenseEntity, DictionaryLemmaEntity)
        .join(
            DictionaryLemmaEntity,
            DictionarySenseEntity.lemma_id == DictionaryLemmaEntity.id,
        )
        .where(
            DictionarySenseEntity.gloss.ilike(escaped + "%", escape="\\"),
            DictionaryLemmaEntity.language == lang,
        )
    )
    results: list[tuple[Sense, Lemma]] = []
    for row in result.all():
        sense, lemma = row.tuple()
        results.append((_sense_to_model(sense), _lemma_to_model(lemma)))
    return results
async def get_wordforms_for_lemma(self, lemma_id: uuid.UUID) -> list[Wordform]: async def get_wordforms_for_lemma(self, lemma_id: uuid.UUID) -> list[Wordform]:
result = await self.db.execute( result = await self.db.execute(
select(DictionaryWordformEntity).where( select(DictionaryWordformEntity).where(

View file

@ -4,15 +4,20 @@ from fastapi import APIRouter, Depends, HTTPException
from pydantic import BaseModel from pydantic import BaseModel
from sqlalchemy.ext.asyncio import AsyncSession from sqlalchemy.ext.asyncio import AsyncSession
from app.domain.models.dictionary import Lemma, Sense
from ...auth import verify_token from ...auth import verify_token
from ...outbound.postgres.database import get_db from ...outbound.postgres.database import get_db
from ...outbound.postgres.repositories.dictionary_repository import PostgresDictionaryRepository from ...outbound.postgres.repositories.dictionary_repository import (
PostgresDictionaryRepository,
)
router = APIRouter(prefix="/dictionary", tags=["dictionary"]) router = APIRouter(prefix="/dictionary", tags=["dictionary"])
# ── Response models ─────────────────────────────────────────────────────────── # ── Response models ───────────────────────────────────────────────────────────
class SenseResponse(BaseModel): class SenseResponse(BaseModel):
id: str id: str
sense_index: int sense_index: int
@ -31,33 +36,61 @@ class LemmaResponse(BaseModel):
tags: list[str] tags: list[str]
def _sense_to_response(s: Sense) -> SenseResponse:
    """Map a domain Sense onto its API response model."""
    fields = {
        "id": s.id,
        "sense_index": s.sense_index,
        "gloss": s.gloss,
        "topics": s.topics,
        "tags": s.tags,
    }
    return SenseResponse(**fields)
def _lemma_to_response(lemma: Lemma) -> LemmaResponse:
    """Map a domain Lemma onto its API response model."""
    fields = {
        "id": lemma.id,
        "headword": lemma.headword,
        "language": lemma.language,
        "pos_raw": lemma.pos_raw,
        "pos_normalised": lemma.pos_normalised,
        "gender": lemma.gender,
        "tags": lemma.tags,
    }
    return LemmaResponse(**fields)
class WordformMatch(BaseModel): class WordformMatch(BaseModel):
lemma: LemmaResponse lemma: LemmaResponse
senses: list[SenseResponse] senses: list[SenseResponse]
class SenseMatch(BaseModel):
    """One gloss-search hit: the matching sense plus the lemma it belongs to."""

    # sense: the sense whose gloss matched the query text
    # lemma: the owning headword entry, for display alongside the sense
    sense: SenseResponse
    lemma: LemmaResponse
# ── Endpoint ────────────────────────────────────────────────────────────────── # ── Endpoint ──────────────────────────────────────────────────────────────────
@router.get("/wordforms", response_model=list[WordformMatch])
async def search_wordforms( @router.get("/search", response_model=list[WordformMatch])
async def search_wordforms_prefix(
lang_code: str, lang_code: str,
text: str, text: str,
db: AsyncSession = Depends(get_db), db: AsyncSession = Depends(get_db),
_: dict = Depends(verify_token), _: dict = Depends(verify_token),
) -> list[WordformMatch]: ) -> list[WordformMatch]:
""" """
Search for a wordform by surface text within a language. Search for wordforms whose surface text starts with the given prefix.
Returns one entry per matching lemma, each with the lemma's senses. A single Uses accent-insensitive, case-insensitive prefix matching so that e.g.
form (e.g. "allons") may resolve to more than one lemma when homographs exist. "chatea" returns both "château" and "châteaux", and "lent" returns all
four forms of the adjective. Returns one entry per matching lemma.
""" """
repo = PostgresDictionaryRepository(db) repo = PostgresDictionaryRepository(db)
wordforms = await repo.get_wordforms_by_form(text, lang_code) wordforms = await repo.search_wordforms_by_prefix(text, lang_code)
if not wordforms: if not wordforms:
return [] return []
# Deduplicate lemma IDs — multiple wordform rows may point to the same lemma
seen_lemma_ids: set[str] = set() seen_lemma_ids: set[str] = set()
results: list[WordformMatch] = [] results: list[WordformMatch] = []
@ -97,3 +130,71 @@ async def search_wordforms(
) )
return results return results
@router.get("/senses", response_model=list[SenseMatch])
async def search_senses(
    lang_code: str,
    text: str,
    db: AsyncSession = Depends(get_db),
    _: dict = Depends(verify_token),
) -> list[SenseMatch]:
    """
    Search senses by their English gloss (definition text).

    Returns one entry per matching sense, each paired with the lemma it
    belongs to, restricted to lemmas in the target language ``lang_code``.
    """
    repo = PostgresDictionaryRepository(db)
    matches = await repo.search_senses_by_prefix(text, lang_code)
    if not matches:
        return []
    # Repository rows come back as (sense, lemma) pairs; convert both halves
    # to their response models.
    return [
        SenseMatch(lemma=_lemma_to_response(lemma), sense=_sense_to_response(sense))
        for (sense, lemma) in matches
    ]
@router.get("/wordforms", response_model=list[WordformMatch])
async def search_wordforms(
    lang_code: str,
    text: str,
    db: AsyncSession = Depends(get_db),
    _: dict = Depends(verify_token),
) -> list[WordformMatch]:
    """
    Search for a wordform by surface text within a language.

    Returns one entry per matching lemma, each with the lemma's senses. A single
    form (e.g. "allons") may resolve to more than one lemma when homographs exist.
    """
    repo = PostgresDictionaryRepository(db)
    wordforms = await repo.get_wordforms_by_form(text, lang_code)

    matches: list[WordformMatch] = []
    handled_lemmas: set[str] = set()  # several wordform rows may share a lemma
    for wordform in wordforms:
        lemma_id = wordform.lemma_id
        if lemma_id in handled_lemmas:
            continue
        handled_lemmas.add(lemma_id)
        lemma = await repo.get_lemma(uuid.UUID(lemma_id))
        if lemma is None:
            # Orphaned wordform row — skip it rather than fail the request.
            continue
        senses = await repo.get_senses_for_lemma(uuid.UUID(lemma_id))
        matches.append(
            WordformMatch(
                lemma=_lemma_to_response(lemma),
                senses=[_sense_to_response(s) for s in senses],
            )
        )
    return matches