From c9dd9d0b4c416c0a164ed94f9ca7b9e144c1fa8d Mon Sep 17 00:00:00 2001
From: wilson <wilson@thomaswilson.xyz>
Date: Sat, 18 Apr 2026 17:26:09 +0100
Subject: [PATCH] feat: [api] Create better "search" functionality for the
 dictionary

---
 ...20260417_0015_enable_unaccent_extension.py |  23 ++++
 .../repositories/dictionary_repository.py     |  93 ++++++++++++--
 api/app/routers/api/dictionary.py             | 117 ++++++++++++++++--
 3 files changed, 213 insertions(+), 20 deletions(-)
 create mode 100644 api/alembic/versions/20260417_0015_enable_unaccent_extension.py

diff --git a/api/alembic/versions/20260417_0015_enable_unaccent_extension.py b/api/alembic/versions/20260417_0015_enable_unaccent_extension.py
new file mode 100644
index 0000000..ed13ff8
--- /dev/null
+++ b/api/alembic/versions/20260417_0015_enable_unaccent_extension.py
@@ -0,0 +1,23 @@
+"""enable unaccent extension
+
+Revision ID: 0015
+Revises: 0014
+Create Date: 2026-04-17
+
+"""
+from typing import Sequence, Union
+
+from alembic import op
+
+revision: str = "0015"
+down_revision: Union[str, None] = "0014"
+branch_labels: Union[str, Sequence[str], None] = None
+depends_on: Union[str, Sequence[str], None] = None
+
+
+def upgrade() -> None:
+    op.execute("CREATE EXTENSION IF NOT EXISTS unaccent")  # no-op if already installed
+
+
+def downgrade() -> None:
+    op.execute("DROP EXTENSION IF EXISTS unaccent")  # drops even a pre-existing install
diff --git a/api/app/outbound/postgres/repositories/dictionary_repository.py b/api/app/outbound/postgres/repositories/dictionary_repository.py
index dff9d7d..ad9bb45 100644
--- a/api/app/outbound/postgres/repositories/dictionary_repository.py
+++ b/api/app/outbound/postgres/repositories/dictionary_repository.py
@@ -1,25 +1,37 @@
 import uuid
+from dataclasses import dataclass
 from typing import Protocol
 
-from sqlalchemy import select
+from sqlalchemy import func, select
 from sqlalchemy.ext.asyncio import AsyncSession
 
+from ....domain.models.dictionary import Lemma, Sense, Wordform
 from ..entities.dictionary_entities import (
     DictionaryLemmaEntity,
     DictionarySenseEntity,
     DictionaryWordformEntity,
 )
-from ....domain.models.dictionary import Lemma, Sense, Wordform
 
 
 class DictionaryRepository(Protocol):
-    async def get_senses_for_headword(self, headword: str, language: str) -> list[Sense]: ...
-    async def get_senses_for_headword_and_pos(self, headword: str, language: str, pos_normalised: str) -> list[Sense]: ...
+    async def get_senses_for_headword(
+        self, headword: str, language: str
+    ) -> list[Sense]: ...
+    async def get_senses_for_headword_and_pos(
+        self, headword: str, language: str, pos_normalised: str
+    ) -> list[Sense]: ...
     async def get_senses_for_lemma(self, lemma_id: uuid.UUID) -> list[Sense]: ...
-    async def find_senses_by_english_gloss(self, text: str, target_lang: str) -> list[Sense]: ...
+    async def find_senses_by_english_gloss(
+        self, text: str, target_lang: str
+    ) -> list[Sense]: ...
     async def get_sense(self, sense_id: uuid.UUID) -> Sense | None: ...
     async def get_lemma(self, lemma_id: uuid.UUID) -> Lemma | None: ...
-    async def get_wordforms_by_form(self, form: str, language: str) -> list[Wordform]: ...
+    async def get_wordforms_by_form(
+        self, form: str, language: str
+    ) -> list[Wordform]: ...
+    async def search_wordforms_by_prefix(
+        self, prefix: str, language: str
+    ) -> list[Wordform]: ...
     async def get_wordforms_for_lemma(self, lemma_id: uuid.UUID) -> list[Wordform]: ...
 
 
@@ -59,10 +71,15 @@ class PostgresDictionaryRepository:
     def __init__(self, db: AsyncSession) -> None:
         self.db = db
 
-    async def get_senses_for_headword(self, headword: str, language: str) -> list[Sense]:
+    async def get_senses_for_headword(
+        self, headword: str, language: str
+    ) -> list[Sense]:
         result = await self.db.execute(
             select(DictionarySenseEntity)
-            .join(DictionaryLemmaEntity, DictionarySenseEntity.lemma_id == DictionaryLemmaEntity.id)
+            .join(
+                DictionaryLemmaEntity,
+                DictionarySenseEntity.lemma_id == DictionaryLemmaEntity.id,
+            )
             .where(
                 DictionaryLemmaEntity.headword == headword,
                 DictionaryLemmaEntity.language == language,
@@ -71,7 +88,9 @@ class PostgresDictionaryRepository:
         )
         return [_sense_to_model(e) for e in result.scalars().all()]
 
-    async def find_senses_by_english_gloss(self, text: str, target_lang: str) -> list[Sense]:
+    async def find_senses_by_english_gloss(
+        self, text: str, target_lang: str
+    ) -> list[Sense]:
         """EN→target direction: find senses whose gloss matches the given English text.
 
         Uses a case-insensitive exact match on the gloss column, filtered to the
@@ -79,7 +98,10 @@ class PostgresDictionaryRepository:
         """
         result = await self.db.execute(
             select(DictionarySenseEntity)
-            .join(DictionaryLemmaEntity, DictionarySenseEntity.lemma_id == DictionaryLemmaEntity.id)
+            .join(
+                DictionaryLemmaEntity,
+                DictionarySenseEntity.lemma_id == DictionaryLemmaEntity.id,
+            )
             .where(
                 DictionarySenseEntity.gloss.ilike(text),
                 DictionaryLemmaEntity.language == target_lang,
@@ -107,7 +129,10 @@ class PostgresDictionaryRepository:
     ) -> list[Sense]:
         result = await self.db.execute(
             select(DictionarySenseEntity)
-            .join(DictionaryLemmaEntity, DictionarySenseEntity.lemma_id == DictionaryLemmaEntity.id)
+            .join(
+                DictionaryLemmaEntity,
+                DictionarySenseEntity.lemma_id == DictionaryLemmaEntity.id,
+            )
             .where(
                 DictionaryLemmaEntity.headword == headword,
                 DictionaryLemmaEntity.language == language,
@@ -128,7 +153,10 @@ class PostgresDictionaryRepository:
     async def get_wordforms_by_form(self, form: str, language: str) -> list[Wordform]:
         result = await self.db.execute(
             select(DictionaryWordformEntity)
-            .join(DictionaryLemmaEntity, DictionaryWordformEntity.lemma_id == DictionaryLemmaEntity.id)
+            .join(
+                DictionaryLemmaEntity,
+                DictionaryWordformEntity.lemma_id == DictionaryLemmaEntity.id,
+            )
             .where(
                 DictionaryWordformEntity.form == form,
                 DictionaryLemmaEntity.language == language,
@@ -136,6 +164,47 @@ class PostgresDictionaryRepository:
         )
         return [_wordform_to_model(e) for e in result.scalars().all()]
 
+    async def search_wordforms_by_prefix(
+        self, prefix: str, language: str
+    ) -> list[Wordform]:
+        """Prefix-search ``language`` wordforms, ignoring case and accents."""
+        # NOTE: any "%" or "_" inside ``prefix`` is passed through unescaped, so
+        # it still acts as a LIKE wildcard.
+        form_matches = func.unaccent(DictionaryWordformEntity.form).ilike(
+            func.unaccent(prefix) + "%"
+        )
+        on_lemma = DictionaryWordformEntity.lemma_id == DictionaryLemmaEntity.id
+        query = (
+            select(DictionaryWordformEntity)
+            .join(DictionaryLemmaEntity, on_lemma)
+            .where(form_matches, DictionaryLemmaEntity.language == language)
+        )
+        rows = await self.db.execute(query)
+        return [_wordform_to_model(entity) for entity in rows.scalars().all()]
+
+    async def search_senses_by_prefix(
+        self, prefix: str, lang: str
+    ) -> list[tuple[Sense, Lemma]]:
+        """Find senses in ``lang`` whose gloss starts with ``prefix``.
+
+        Case-insensitive ``ILIKE`` prefix match; each sense comes with its lemma.
+        """
+        result = await self.db.execute(
+            select(DictionarySenseEntity, DictionaryLemmaEntity)
+            .join(
+                DictionaryLemmaEntity,
+                DictionarySenseEntity.lemma_id == DictionaryLemmaEntity.id,
+            )
+            .where(
+                DictionarySenseEntity.gloss.ilike(prefix + "%"),
+                DictionaryLemmaEntity.language == lang,
+            )
+        )
+        return [
+            (_sense_to_model(sense), _lemma_to_model(lemma))
+            for sense, lemma in result.all()
+        ]
+
     async def get_wordforms_for_lemma(self, lemma_id: uuid.UUID) -> list[Wordform]:
         result = await self.db.execute(
             select(DictionaryWordformEntity).where(
diff --git a/api/app/routers/api/dictionary.py b/api/app/routers/api/dictionary.py
index dca336a..72af717 100644
--- a/api/app/routers/api/dictionary.py
+++ b/api/app/routers/api/dictionary.py
@@ -4,15 +4,20 @@ from fastapi import APIRouter, Depends, HTTPException
 from pydantic import BaseModel
 from sqlalchemy.ext.asyncio import AsyncSession
 
+from app.domain.models.dictionary import Lemma, Sense
+
 from ...auth import verify_token
 from ...outbound.postgres.database import get_db
-from ...outbound.postgres.repositories.dictionary_repository import PostgresDictionaryRepository
+from ...outbound.postgres.repositories.dictionary_repository import (
+    PostgresDictionaryRepository,
+)
 
 router = APIRouter(prefix="/dictionary", tags=["dictionary"])
 
 
 # ── Response models ───────────────────────────────────────────────────────────
 
+
 class SenseResponse(BaseModel):
     id: str
     sense_index: int
@@ -31,33 +36,61 @@ class LemmaResponse(BaseModel):
     tags: list[str]
 
 
+def _sense_to_response(s: Sense) -> SenseResponse:  # domain Sense -> API model
+    return SenseResponse(
+        id=s.id,
+        sense_index=s.sense_index,
+        gloss=s.gloss,
+        topics=s.topics,
+        tags=s.tags,
+    )
+
+
+def _lemma_to_response(lemma: Lemma) -> LemmaResponse:  # domain Lemma -> API model
+    return LemmaResponse(
+        id=lemma.id,
+        headword=lemma.headword,
+        language=lemma.language,
+        pos_raw=lemma.pos_raw,
+        pos_normalised=lemma.pos_normalised,
+        gender=lemma.gender,
+        tags=lemma.tags,
+    )
+
+
 class WordformMatch(BaseModel):
     lemma: LemmaResponse
     senses: list[SenseResponse]
 
 
+class SenseMatch(BaseModel):  # one item of the /senses response list
+    sense: SenseResponse  # the matching sense
+    lemma: LemmaResponse  # the lemma that sense belongs to
+
+
 # ── Endpoint ──────────────────────────────────────────────────────────────────
 
-@router.get("/wordforms", response_model=list[WordformMatch])
-async def search_wordforms(
+
+@router.get("/search", response_model=list[WordformMatch])
+async def search_wordforms_prefix(
     lang_code: str,
     text: str,
     db: AsyncSession = Depends(get_db),
     _: dict = Depends(verify_token),
 ) -> list[WordformMatch]:
     """
-    Search for a wordform by surface text within a language.
+    Search for wordforms whose surface text starts with the given prefix.
 
-    Returns one entry per matching lemma, each with the lemma's senses. A single
-    form (e.g. "allons") may resolve to more than one lemma when homographs exist.
+    Uses accent-insensitive, case-insensitive prefix matching so that e.g.
+    "chatea" returns both "château" and "châteaux", and "lent" returns all
+    four forms of the adjective.  Returns one entry per matching lemma.
     """
     repo = PostgresDictionaryRepository(db)
-    wordforms = await repo.get_wordforms_by_form(text, lang_code)
+    wordforms = await repo.search_wordforms_by_prefix(text, lang_code)
 
     if not wordforms:
         return []
 
-    # Deduplicate lemma IDs — multiple wordform rows may point to the same lemma
     seen_lemma_ids: set[str] = set()
     results: list[WordformMatch] = []
 
@@ -97,3 +130,71 @@ async def search_wordforms(
         )
 
     return results
+
+
+@router.get("/senses", response_model=list[SenseMatch])
+async def search_senses(
+    lang_code: str,
+    text: str,
+    db: AsyncSession = Depends(get_db),
+    _: dict = Depends(verify_token),
+) -> list[SenseMatch]:
+    """
+    Search for senses by their (English) definition.
+
+    Returns one entry per matching sense, each paired with the lemma it
+    belongs to; an empty list means nothing matched. The lookup itself is
+    delegated to ``PostgresDictionaryRepository.search_senses_by_prefix``.
+    """
+    repo = PostgresDictionaryRepository(db)
+    pairs = await repo.search_senses_by_prefix(text, lang_code)
+
+    # No early exit needed: zero matches simply yields an empty list.
+    return [
+        SenseMatch(sense=_sense_to_response(sense), lemma=_lemma_to_response(lemma))
+        for sense, lemma in pairs
+    ]
+
+
+@router.get("/wordforms", response_model=list[WordformMatch])
+async def search_wordforms(
+    lang_code: str,
+    text: str,
+    db: AsyncSession = Depends(get_db),
+    _: dict = Depends(verify_token),
+) -> list[WordformMatch]:
+    """
+    Search for a wordform by surface text within a language.
+
+    Returns one entry per matching lemma, each with the lemma's senses. A single
+    form (e.g. "allons") may resolve to more than one lemma when homographs exist.
+    """
+    repo = PostgresDictionaryRepository(db)
+    matched_forms = await repo.get_wordforms_by_form(text, lang_code)
+
+    if not matched_forms:
+        return []
+
+    # Several wordform rows may point to the same lemma: keep each lemma ID
+    # once, in first-seen order (dict keys preserve insertion order).
+    unique_lemma_ids = dict.fromkeys(form.lemma_id for form in matched_forms)
+
+    # One result per distinct lemma, in the order the lemmas were first seen.
+    matches: list[WordformMatch] = []
+    for lemma_id in unique_lemma_ids:
+        lemma_uuid = uuid.UUID(lemma_id)
+
+        lemma = await repo.get_lemma(lemma_uuid)
+        if lemma is None:
+            # No lemma was found for this ID; skip it.
+            continue
+
+        lemma_senses = await repo.get_senses_for_lemma(lemma_uuid)
+        matches.append(
+            WordformMatch(
+                lemma=_lemma_to_response(lemma),
+                senses=[_sense_to_response(sense) for sense in lemma_senses],
+            )
+        )
+
+    return matches