feat: [api] Create better "search" functionality for the dictionary

This commit is contained in:
wilson 2026-04-18 17:26:09 +01:00
parent fd96396c30
commit c9dd9d0b4c
3 changed files with 213 additions and 20 deletions

View file

@ -0,0 +1,23 @@
"""enable unaccent extension
Revision ID: 0015
Revises: 0014
Create Date: 2026-04-17
"""
from typing import Sequence, Union
from alembic import op
revision: str = "0015"
down_revision: Union[str, None] = "0014"
branch_labels: Union[str, Sequence[str], None] = None
depends_on: Union[str, Sequence[str], None] = None
def upgrade() -> None:
    """Install PostgreSQL's ``unaccent`` extension.

    Needed by the dictionary search queries, which call
    ``func.unaccent(...)`` for accent-insensitive prefix matching.
    IF NOT EXISTS keeps the migration idempotent on databases where the
    extension was already created manually.
    """
    op.execute("CREATE EXTENSION IF NOT EXISTS unaccent")
def downgrade() -> None:
    """Remove the ``unaccent`` extension.

    IF EXISTS makes the downgrade safe to run even if the extension was
    dropped out-of-band. NOTE(review): dropping the extension will break
    any other database objects that reference unaccent() — confirm none
    exist outside this application before downgrading in production.
    """
    op.execute("DROP EXTENSION IF EXISTS unaccent")

View file

@ -1,25 +1,37 @@
import uuid
from dataclasses import dataclass
from typing import Protocol
from sqlalchemy import select
from sqlalchemy import func, select
from sqlalchemy.ext.asyncio import AsyncSession
from ....domain.models.dictionary import Lemma, Sense, Wordform
from ..entities.dictionary_entities import (
DictionaryLemmaEntity,
DictionarySenseEntity,
DictionaryWordformEntity,
)
from ....domain.models.dictionary import Lemma, Sense, Wordform
class DictionaryRepository(Protocol):
    """Structural interface for read access to dictionary data.

    Implementations (e.g. ``PostgresDictionaryRepository`` below) provide
    lookups over lemmas, senses and wordforms. All methods are async and
    return domain models, never persistence entities.

    Fix: removed the stale pre-reformat duplicate signatures left by the
    diff, and added ``search_senses_by_prefix``, which the concrete
    repository implements and the ``/senses`` endpoint relies on but which
    was missing from this Protocol.
    """

    async def get_senses_for_headword(
        self, headword: str, language: str
    ) -> list[Sense]: ...

    async def get_senses_for_headword_and_pos(
        self, headword: str, language: str, pos_normalised: str
    ) -> list[Sense]: ...

    async def get_senses_for_lemma(self, lemma_id: uuid.UUID) -> list[Sense]: ...

    async def find_senses_by_english_gloss(
        self, text: str, target_lang: str
    ) -> list[Sense]: ...

    async def get_sense(self, sense_id: uuid.UUID) -> Sense | None: ...

    async def get_lemma(self, lemma_id: uuid.UUID) -> Lemma | None: ...

    async def get_wordforms_by_form(
        self, form: str, language: str
    ) -> list[Wordform]: ...

    async def search_wordforms_by_prefix(
        self, prefix: str, language: str
    ) -> list[Wordform]: ...

    async def search_senses_by_prefix(
        self, prefix: str, lang: str
    ) -> list[tuple[Sense, Lemma]]: ...

    async def get_wordforms_for_lemma(self, lemma_id: uuid.UUID) -> list[Wordform]: ...
@ -59,10 +71,15 @@ class PostgresDictionaryRepository:
def __init__(self, db: AsyncSession) -> None:
self.db = db
async def get_senses_for_headword(self, headword: str, language: str) -> list[Sense]:
async def get_senses_for_headword(
self, headword: str, language: str
) -> list[Sense]:
result = await self.db.execute(
select(DictionarySenseEntity)
.join(DictionaryLemmaEntity, DictionarySenseEntity.lemma_id == DictionaryLemmaEntity.id)
.join(
DictionaryLemmaEntity,
DictionarySenseEntity.lemma_id == DictionaryLemmaEntity.id,
)
.where(
DictionaryLemmaEntity.headword == headword,
DictionaryLemmaEntity.language == language,
@ -71,7 +88,9 @@ class PostgresDictionaryRepository:
)
return [_sense_to_model(e) for e in result.scalars().all()]
async def find_senses_by_english_gloss(self, text: str, target_lang: str) -> list[Sense]:
async def find_senses_by_english_gloss(
self, text: str, target_lang: str
) -> list[Sense]:
"""EN→target direction: find senses whose gloss matches the given English text.
Uses a case-insensitive exact match on the gloss column, filtered to the
@ -79,7 +98,10 @@ class PostgresDictionaryRepository:
"""
result = await self.db.execute(
select(DictionarySenseEntity)
.join(DictionaryLemmaEntity, DictionarySenseEntity.lemma_id == DictionaryLemmaEntity.id)
.join(
DictionaryLemmaEntity,
DictionarySenseEntity.lemma_id == DictionaryLemmaEntity.id,
)
.where(
DictionarySenseEntity.gloss.ilike(text),
DictionaryLemmaEntity.language == target_lang,
@ -107,7 +129,10 @@ class PostgresDictionaryRepository:
) -> list[Sense]:
result = await self.db.execute(
select(DictionarySenseEntity)
.join(DictionaryLemmaEntity, DictionarySenseEntity.lemma_id == DictionaryLemmaEntity.id)
.join(
DictionaryLemmaEntity,
DictionarySenseEntity.lemma_id == DictionaryLemmaEntity.id,
)
.where(
DictionaryLemmaEntity.headword == headword,
DictionaryLemmaEntity.language == language,
@ -128,7 +153,10 @@ class PostgresDictionaryRepository:
async def get_wordforms_by_form(self, form: str, language: str) -> list[Wordform]:
result = await self.db.execute(
select(DictionaryWordformEntity)
.join(DictionaryLemmaEntity, DictionaryWordformEntity.lemma_id == DictionaryLemmaEntity.id)
.join(
DictionaryLemmaEntity,
DictionaryWordformEntity.lemma_id == DictionaryLemmaEntity.id,
)
.where(
DictionaryWordformEntity.form == form,
DictionaryLemmaEntity.language == language,
@ -136,6 +164,47 @@ class PostgresDictionaryRepository:
)
return [_wordform_to_model(e) for e in result.scalars().all()]
async def search_wordforms_by_prefix(
    self, prefix: str, language: str
) -> list[Wordform]:
    """Return wordforms in *language* whose surface form starts with *prefix*.

    Matching is case-insensitive (ILIKE) and accent-insensitive — unaccent
    is applied to both the stored form and the search prefix, so e.g.
    "chatea" matches "château".
    """
    # Hoist the search pattern: unaccent the caller's prefix once and
    # append the SQL LIKE wildcard.
    pattern = func.unaccent(prefix) + "%"
    query = (
        select(DictionaryWordformEntity)
        .join(
            DictionaryLemmaEntity,
            DictionaryWordformEntity.lemma_id == DictionaryLemmaEntity.id,
        )
        .where(
            func.unaccent(DictionaryWordformEntity.form).ilike(pattern),
            DictionaryLemmaEntity.language == language,
        )
    )
    rows = await self.db.execute(query)
    return [_wordform_to_model(entity) for entity in rows.scalars().all()]
async def search_senses_by_prefix(
    self, prefix: str, lang: str
) -> list[tuple[Sense, Lemma]]:
    """Find senses whose gloss starts with *prefix*, case-insensitively.

    Returns ``(sense, lemma)`` pairs restricted to lemmas in *lang*.

    Fix: the gloss filter previously used ``ilike(prefix)`` with no
    trailing wildcard — an exact match, contradicting the method name and
    the prefix semantics of ``search_wordforms_by_prefix``. A trailing
    ``%`` makes it a true prefix match.
    NOTE(review): LIKE metacharacters (``%``/``_``) inside *prefix* are
    not escaped — confirm whether callers can pass untrusted patterns.
    """
    result = await self.db.execute(
        select(DictionarySenseEntity, DictionaryLemmaEntity)
        .join(
            DictionaryLemmaEntity,
            DictionarySenseEntity.lemma_id == DictionaryLemmaEntity.id,
        )
        .where(
            # Trailing "%" turns the case-insensitive match into a prefix match.
            DictionarySenseEntity.gloss.ilike(prefix + "%"),
            DictionaryLemmaEntity.language == lang,
        )
    )
    results: list[tuple[Sense, Lemma]] = []
    for row in result.all():
        sense_entity, lemma_entity = row.tuple()
        results.append((_sense_to_model(sense_entity), _lemma_to_model(lemma_entity)))
    return results
async def get_wordforms_for_lemma(self, lemma_id: uuid.UUID) -> list[Wordform]:
result = await self.db.execute(
select(DictionaryWordformEntity).where(

View file

@ -4,15 +4,20 @@ from fastapi import APIRouter, Depends, HTTPException
from pydantic import BaseModel
from sqlalchemy.ext.asyncio import AsyncSession
from app.domain.models.dictionary import Lemma, Sense
from ...auth import verify_token
from ...outbound.postgres.database import get_db
from ...outbound.postgres.repositories.dictionary_repository import PostgresDictionaryRepository
from ...outbound.postgres.repositories.dictionary_repository import (
PostgresDictionaryRepository,
)
router = APIRouter(prefix="/dictionary", tags=["dictionary"])
# ── Response models ───────────────────────────────────────────────────────────
class SenseResponse(BaseModel):
id: str
sense_index: int
@ -31,33 +36,61 @@ class LemmaResponse(BaseModel):
tags: list[str]
def _sense_to_response(s: Sense) -> SenseResponse:
    """Map a domain Sense onto its API response model, field by field."""
    payload = {
        "id": s.id,
        "sense_index": s.sense_index,
        "gloss": s.gloss,
        "topics": s.topics,
        "tags": s.tags,
    }
    return SenseResponse(**payload)
def _lemma_to_response(lemma: Lemma) -> LemmaResponse:
    """Map a domain Lemma onto its API response model, field by field."""
    payload = {
        "id": lemma.id,
        "headword": lemma.headword,
        "language": lemma.language,
        "pos_raw": lemma.pos_raw,
        "pos_normalised": lemma.pos_normalised,
        "gender": lemma.gender,
        "tags": lemma.tags,
    }
    return LemmaResponse(**payload)
class WordformMatch(BaseModel):
    """One matched lemma together with all of its senses.

    Returned by the wordform search endpoints; a single surface form may
    produce several of these when homographs map to distinct lemmas.
    """

    lemma: LemmaResponse
    senses: list[SenseResponse]
class SenseMatch(BaseModel):
    """One matched sense paired with the lemma it belongs to.

    Returned by the gloss (definition) search endpoint.
    """

    sense: SenseResponse
    lemma: LemmaResponse
# ── Endpoint ──────────────────────────────────────────────────────────────────
@router.get("/wordforms", response_model=list[WordformMatch])
async def search_wordforms(
@router.get("/search", response_model=list[WordformMatch])
async def search_wordforms_prefix(
lang_code: str,
text: str,
db: AsyncSession = Depends(get_db),
_: dict = Depends(verify_token),
) -> list[WordformMatch]:
"""
Search for a wordform by surface text within a language.
Search for wordforms whose surface text starts with the given prefix.
Returns one entry per matching lemma, each with the lemma's senses. A single
form (e.g. "allons") may resolve to more than one lemma when homographs exist.
Uses accent-insensitive, case-insensitive prefix matching so that e.g.
"chatea" returns both "château" and "châteaux", and "lent" returns all
four forms of the adjective. Returns one entry per matching lemma.
"""
repo = PostgresDictionaryRepository(db)
wordforms = await repo.get_wordforms_by_form(text, lang_code)
wordforms = await repo.search_wordforms_by_prefix(text, lang_code)
if not wordforms:
return []
# Deduplicate lemma IDs — multiple wordform rows may point to the same lemma
seen_lemma_ids: set[str] = set()
results: list[WordformMatch] = []
@ -97,3 +130,71 @@ async def search_wordforms(
)
return results
@router.get("/senses", response_model=list[SenseMatch])
async def search_senses(
    lang_code: str,
    text: str,
    db: AsyncSession = Depends(get_db),
    _: dict = Depends(verify_token),
) -> list[SenseMatch]:
    """
    Search for senses by their (English) definition text.

    Returns one entry per matching sense, each paired with its lemma.
    """
    repo = PostgresDictionaryRepository(db)
    senses = await repo.search_senses_by_prefix(text, lang_code)
    # No explicit empty-check needed: a comprehension over an empty
    # result list already yields [].
    return [
        SenseMatch(lemma=_lemma_to_response(lemma), sense=_sense_to_response(sense))
        for (sense, lemma) in senses
    ]
@router.get("/wordforms", response_model=list[WordformMatch])
async def search_wordforms(
    lang_code: str,
    text: str,
    db: AsyncSession = Depends(get_db),
    _: dict = Depends(verify_token),
) -> list[WordformMatch]:
    """
    Search for a wordform by surface text within a language.
    Returns one entry per matching lemma, each with the lemma's senses. A single
    form (e.g. "allons") may resolve to more than one lemma when homographs exist.
    """
    repo = PostgresDictionaryRepository(db)
    wordforms = await repo.get_wordforms_by_form(text, lang_code)
    if not wordforms:
        return []
    # Several wordform rows can share one lemma; emit each lemma only once.
    seen_lemma_ids: set[str] = set()
    matches: list[WordformMatch] = []
    for wordform in wordforms:
        lemma_key = wordform.lemma_id
        if lemma_key in seen_lemma_ids:
            continue
        seen_lemma_ids.add(lemma_key)
        lemma_uuid = uuid.UUID(lemma_key)
        lemma = await repo.get_lemma(lemma_uuid)
        if lemma is None:
            # Dangling lemma reference — drop the wordform rather than 500.
            continue
        senses = await repo.get_senses_for_lemma(lemma_uuid)
        matches.append(
            WordformMatch(
                lemma=_lemma_to_response(lemma),
                senses=[_sense_to_response(s) for s in senses],
            )
        )
    return matches