From c9dd9d0b4c416c0a164ed94f9ca7b9e144c1fa8d Mon Sep 17 00:00:00 2001 From: wilson Date: Sat, 18 Apr 2026 17:26:09 +0100 Subject: [PATCH] feat: [api] Create better "search" functionality for the dictionary --- ...20260417_0015_enable_unaccent_extension.py | 23 ++++ .../repositories/dictionary_repository.py | 93 ++++++++++++-- api/app/routers/api/dictionary.py | 117 ++++++++++++++++-- 3 files changed, 213 insertions(+), 20 deletions(-) create mode 100644 api/alembic/versions/20260417_0015_enable_unaccent_extension.py diff --git a/api/alembic/versions/20260417_0015_enable_unaccent_extension.py b/api/alembic/versions/20260417_0015_enable_unaccent_extension.py new file mode 100644 index 0000000..ed13ff8 --- /dev/null +++ b/api/alembic/versions/20260417_0015_enable_unaccent_extension.py @@ -0,0 +1,23 @@ +"""enable unaccent extension + +Revision ID: 0015 +Revises: 0014 +Create Date: 2026-04-17 + +""" +from typing import Sequence, Union + +from alembic import op + +revision: str = "0015" +down_revision: Union[str, None] = "0014" +branch_labels: Union[str, Sequence[str], None] = None +depends_on: Union[str, Sequence[str], None] = None + + +def upgrade() -> None: + op.execute("CREATE EXTENSION IF NOT EXISTS unaccent") + + +def downgrade() -> None: + op.execute("DROP EXTENSION IF EXISTS unaccent") diff --git a/api/app/outbound/postgres/repositories/dictionary_repository.py b/api/app/outbound/postgres/repositories/dictionary_repository.py index dff9d7d..ad9bb45 100644 --- a/api/app/outbound/postgres/repositories/dictionary_repository.py +++ b/api/app/outbound/postgres/repositories/dictionary_repository.py @@ -1,25 +1,37 @@ import uuid +from dataclasses import dataclass from typing import Protocol -from sqlalchemy import select +from sqlalchemy import func, select from sqlalchemy.ext.asyncio import AsyncSession +from ....domain.models.dictionary import Lemma, Sense, Wordform from ..entities.dictionary_entities import ( DictionaryLemmaEntity, DictionarySenseEntity, 
DictionaryWordformEntity, ) -from ....domain.models.dictionary import Lemma, Sense, Wordform class DictionaryRepository(Protocol): - async def get_senses_for_headword(self, headword: str, language: str) -> list[Sense]: ... - async def get_senses_for_headword_and_pos(self, headword: str, language: str, pos_normalised: str) -> list[Sense]: ... + async def get_senses_for_headword( + self, headword: str, language: str + ) -> list[Sense]: ... + async def get_senses_for_headword_and_pos( + self, headword: str, language: str, pos_normalised: str + ) -> list[Sense]: ... async def get_senses_for_lemma(self, lemma_id: uuid.UUID) -> list[Sense]: ... - async def find_senses_by_english_gloss(self, text: str, target_lang: str) -> list[Sense]: ... + async def find_senses_by_english_gloss( + self, text: str, target_lang: str + ) -> list[Sense]: ... async def get_sense(self, sense_id: uuid.UUID) -> Sense | None: ... async def get_lemma(self, lemma_id: uuid.UUID) -> Lemma | None: ... - async def get_wordforms_by_form(self, form: str, language: str) -> list[Wordform]: ... + async def get_wordforms_by_form( + self, form: str, language: str + ) -> list[Wordform]: ... + async def search_wordforms_by_prefix( + self, prefix: str, language: str + ) -> list[Wordform]: ... async def get_wordforms_for_lemma(self, lemma_id: uuid.UUID) -> list[Wordform]: ... 
@@ -59,10 +71,15 @@ class PostgresDictionaryRepository: def __init__(self, db: AsyncSession) -> None: self.db = db - async def get_senses_for_headword(self, headword: str, language: str) -> list[Sense]: + async def get_senses_for_headword( + self, headword: str, language: str + ) -> list[Sense]: result = await self.db.execute( select(DictionarySenseEntity) - .join(DictionaryLemmaEntity, DictionarySenseEntity.lemma_id == DictionaryLemmaEntity.id) + .join( + DictionaryLemmaEntity, + DictionarySenseEntity.lemma_id == DictionaryLemmaEntity.id, + ) .where( DictionaryLemmaEntity.headword == headword, DictionaryLemmaEntity.language == language, @@ -71,7 +88,9 @@ class PostgresDictionaryRepository: ) return [_sense_to_model(e) for e in result.scalars().all()] - async def find_senses_by_english_gloss(self, text: str, target_lang: str) -> list[Sense]: + async def find_senses_by_english_gloss( + self, text: str, target_lang: str + ) -> list[Sense]: """EN→target direction: find senses whose gloss matches the given English text. 
Uses a case-insensitive exact match on the gloss column, filtered to the @@ -79,7 +98,10 @@ class PostgresDictionaryRepository: """ result = await self.db.execute( select(DictionarySenseEntity) - .join(DictionaryLemmaEntity, DictionarySenseEntity.lemma_id == DictionaryLemmaEntity.id) + .join( + DictionaryLemmaEntity, + DictionarySenseEntity.lemma_id == DictionaryLemmaEntity.id, + ) .where( DictionarySenseEntity.gloss.ilike(text), DictionaryLemmaEntity.language == target_lang, @@ -107,7 +129,10 @@ class PostgresDictionaryRepository: ) -> list[Sense]: result = await self.db.execute( select(DictionarySenseEntity) - .join(DictionaryLemmaEntity, DictionarySenseEntity.lemma_id == DictionaryLemmaEntity.id) + .join( + DictionaryLemmaEntity, + DictionarySenseEntity.lemma_id == DictionaryLemmaEntity.id, + ) .where( DictionaryLemmaEntity.headword == headword, DictionaryLemmaEntity.language == language, @@ -128,7 +153,10 @@ class PostgresDictionaryRepository: async def get_wordforms_by_form(self, form: str, language: str) -> list[Wordform]: result = await self.db.execute( select(DictionaryWordformEntity) - .join(DictionaryLemmaEntity, DictionaryWordformEntity.lemma_id == DictionaryLemmaEntity.id) + .join( + DictionaryLemmaEntity, + DictionaryWordformEntity.lemma_id == DictionaryLemmaEntity.id, + ) .where( DictionaryWordformEntity.form == form, DictionaryLemmaEntity.language == language, @@ -136,6 +164,47 @@ class PostgresDictionaryRepository: ) return [_wordform_to_model(e) for e in result.scalars().all()] + async def search_wordforms_by_prefix( + self, prefix: str, language: str + ) -> list[Wordform]: + result = await self.db.execute( + select(DictionaryWordformEntity) + .join( + DictionaryLemmaEntity, + DictionaryWordformEntity.lemma_id == DictionaryLemmaEntity.id, + ) + .where( + func.unaccent(DictionaryWordformEntity.form).ilike( + func.unaccent(prefix) + "%" + ), + DictionaryLemmaEntity.language == language, + ) + ) + return [_wordform_to_model(e) for e in 
result.scalars().all()] + + async def search_senses_by_prefix( + self, prefix: str, lang: str + ) -> list[tuple[Sense, Lemma]]: + result = await self.db.execute( + select(DictionarySenseEntity, DictionaryLemmaEntity) + .join( + DictionaryLemmaEntity, + DictionarySenseEntity.lemma_id == DictionaryLemmaEntity.id, + ) + .where( + DictionarySenseEntity.gloss.ilike(prefix + "%"), + DictionaryLemmaEntity.language == lang, + ) + ) + + results: list[tuple[Sense, Lemma]] = [] + + for sense_with_lemma in result.all(): + sense, lemma = sense_with_lemma.tuple() + results.append((_sense_to_model(sense), _lemma_to_model(lemma))) + + return results + async def get_wordforms_for_lemma(self, lemma_id: uuid.UUID) -> list[Wordform]: result = await self.db.execute( select(DictionaryWordformEntity).where( diff --git a/api/app/routers/api/dictionary.py b/api/app/routers/api/dictionary.py index dca336a..72af717 100644 --- a/api/app/routers/api/dictionary.py +++ b/api/app/routers/api/dictionary.py @@ -4,15 +4,20 @@ from fastapi import APIRouter, Depends, HTTPException from pydantic import BaseModel from sqlalchemy.ext.asyncio import AsyncSession +from app.domain.models.dictionary import Lemma, Sense + from ...auth import verify_token from ...outbound.postgres.database import get_db -from ...outbound.postgres.repositories.dictionary_repository import PostgresDictionaryRepository +from ...outbound.postgres.repositories.dictionary_repository import ( + PostgresDictionaryRepository, +) router = APIRouter(prefix="/dictionary", tags=["dictionary"]) # ── Response models ─────────────────────────────────────────────────────────── + class SenseResponse(BaseModel): id: str sense_index: int @@ -31,33 +36,61 @@ class LemmaResponse(BaseModel): tags: list[str] +def _sense_to_response(s: Sense) -> SenseResponse: + return SenseResponse( + id=s.id, + sense_index=s.sense_index, + gloss=s.gloss, + topics=s.topics, + tags=s.tags, + ) + + +def _lemma_to_response(lemma: Lemma) -> LemmaResponse: + return 
LemmaResponse( + id=lemma.id, + headword=lemma.headword, + language=lemma.language, + pos_raw=lemma.pos_raw, + pos_normalised=lemma.pos_normalised, + gender=lemma.gender, + tags=lemma.tags, + ) + + class WordformMatch(BaseModel): lemma: LemmaResponse senses: list[SenseResponse] +class SenseMatch(BaseModel): + sense: SenseResponse + lemma: LemmaResponse + + # ── Endpoint ────────────────────────────────────────────────────────────────── -@router.get("/wordforms", response_model=list[WordformMatch]) -async def search_wordforms( + +@router.get("/search", response_model=list[WordformMatch]) +async def search_wordforms_prefix( lang_code: str, text: str, db: AsyncSession = Depends(get_db), _: dict = Depends(verify_token), ) -> list[WordformMatch]: """ - Search for a wordform by surface text within a language. + Search for wordforms whose surface text starts with the given prefix. - Returns one entry per matching lemma, each with the lemma's senses. A single - form (e.g. "allons") may resolve to more than one lemma when homographs exist. + Uses accent-insensitive, case-insensitive prefix matching so that e.g. + "chatea" returns both "château" and "châteaux", and "lent" returns all + four forms of the adjective. Returns one entry per matching lemma. 
""" repo = PostgresDictionaryRepository(db) - wordforms = await repo.get_wordforms_by_form(text, lang_code) + wordforms = await repo.search_wordforms_by_prefix(text, lang_code) if not wordforms: return [] - # Deduplicate lemma IDs — multiple wordform rows may point to the same lemma seen_lemma_ids: set[str] = set() results: list[WordformMatch] = [] @@ -97,3 +130,71 @@ async def search_wordforms( ) return results + + +@router.get("/senses", response_model=list[SenseMatch]) +async def search_senses( + lang_code: str, + text: str, + db: AsyncSession = Depends(get_db), + _: dict = Depends(verify_token), +) -> list[SenseMatch]: + """ + Search for senses by their (English) definition. + + Returns one entry per matching sense, each paired with its lemma. + """ + repo = PostgresDictionaryRepository(db) + senses = await repo.search_senses_by_prefix(text, lang_code) + + if not senses: + return [] + + return [ + SenseMatch(lemma=_lemma_to_response(lemma), sense=_sense_to_response(sense)) + for (sense, lemma) in senses + ] + + +@router.get("/wordforms", response_model=list[WordformMatch]) +async def search_wordforms( + lang_code: str, + text: str, + db: AsyncSession = Depends(get_db), + _: dict = Depends(verify_token), +) -> list[WordformMatch]: + """ + Search for a wordform by surface text within a language. + + Returns one entry per matching lemma, each with the lemma's senses. A single + form (e.g. "allons") may resolve to more than one lemma when homographs exist. 
+ """ + repo = PostgresDictionaryRepository(db) + wordforms = await repo.get_wordforms_by_form(text, lang_code) + + if not wordforms: + return [] + + # Deduplicate lemma IDs — multiple wordform rows may point to the same lemma + seen_lemma_ids: set[str] = set() + results: list[WordformMatch] = [] + + for wf in wordforms: + if wf.lemma_id in seen_lemma_ids: + continue + seen_lemma_ids.add(wf.lemma_id) + + lemma = await repo.get_lemma(uuid.UUID(wf.lemma_id)) + if lemma is None: + continue + + senses = await repo.get_senses_for_lemma(uuid.UUID(wf.lemma_id)) + + results.append( + WordformMatch( + lemma=_lemma_to_response(lemma), + senses=[_sense_to_response(s) for s in senses], + ) + ) + + return results