feat: [api] Create better "search" functionality for the dictionary
This commit is contained in:
parent
fd96396c30
commit
c9dd9d0b4c
3 changed files with 213 additions and 20 deletions
|
|
@ -0,0 +1,23 @@
|
|||
"""enable unaccent extension

Revision ID: 0015
Revises: 0014
Create Date: 2026-04-17

"""
from typing import Sequence, Union

from alembic import op

# Alembic revision identifiers.
revision: str = "0015"
down_revision: Union[str, None] = "0014"
branch_labels: Union[str, Sequence[str], None] = None
depends_on: Union[str, Sequence[str], None] = None


def upgrade() -> None:
    """Install the Postgres ``unaccent`` extension used by dictionary search."""
    op.execute("CREATE EXTENSION IF NOT EXISTS unaccent")


def downgrade() -> None:
    """Drop the ``unaccent`` extension if it is present."""
    op.execute("DROP EXTENSION IF EXISTS unaccent")
|
||||
|
|
@ -1,25 +1,37 @@
|
|||
import uuid
|
||||
from dataclasses import dataclass
|
||||
from typing import Protocol
|
||||
|
||||
from sqlalchemy import select
|
||||
from sqlalchemy import func, select
|
||||
from sqlalchemy.ext.asyncio import AsyncSession
|
||||
|
||||
from ....domain.models.dictionary import Lemma, Sense, Wordform
|
||||
from ..entities.dictionary_entities import (
|
||||
DictionaryLemmaEntity,
|
||||
DictionarySenseEntity,
|
||||
DictionaryWordformEntity,
|
||||
)
|
||||
from ....domain.models.dictionary import Lemma, Sense, Wordform
|
||||
|
||||
|
||||
class DictionaryRepository(Protocol):
    """Read-side port for dictionary lookups.

    Implemented by the Postgres adapter; all methods are async and return
    domain models, never ORM entities.
    """

    async def get_senses_for_headword(
        self, headword: str, language: str
    ) -> list[Sense]: ...

    async def get_senses_for_headword_and_pos(
        self, headword: str, language: str, pos_normalised: str
    ) -> list[Sense]: ...

    async def get_senses_for_lemma(self, lemma_id: uuid.UUID) -> list[Sense]: ...

    async def find_senses_by_english_gloss(
        self, text: str, target_lang: str
    ) -> list[Sense]: ...

    async def get_sense(self, sense_id: uuid.UUID) -> Sense | None: ...

    async def get_lemma(self, lemma_id: uuid.UUID) -> Lemma | None: ...

    async def get_wordforms_by_form(
        self, form: str, language: str
    ) -> list[Wordform]: ...

    async def search_wordforms_by_prefix(
        self, prefix: str, language: str
    ) -> list[Wordform]: ...

    # Declared so callers typed against the port can use the new sense
    # search; the Postgres implementation already provides it.
    async def search_senses_by_prefix(
        self, prefix: str, lang: str
    ) -> list[tuple[Sense, Lemma]]: ...

    async def get_wordforms_for_lemma(self, lemma_id: uuid.UUID) -> list[Wordform]: ...
|
||||
|
||||
|
||||
|
|
@ -59,10 +71,15 @@ class PostgresDictionaryRepository:
|
|||
def __init__(self, db: AsyncSession) -> None:
|
||||
self.db = db
|
||||
|
||||
async def get_senses_for_headword(self, headword: str, language: str) -> list[Sense]:
|
||||
async def get_senses_for_headword(
|
||||
self, headword: str, language: str
|
||||
) -> list[Sense]:
|
||||
result = await self.db.execute(
|
||||
select(DictionarySenseEntity)
|
||||
.join(DictionaryLemmaEntity, DictionarySenseEntity.lemma_id == DictionaryLemmaEntity.id)
|
||||
.join(
|
||||
DictionaryLemmaEntity,
|
||||
DictionarySenseEntity.lemma_id == DictionaryLemmaEntity.id,
|
||||
)
|
||||
.where(
|
||||
DictionaryLemmaEntity.headword == headword,
|
||||
DictionaryLemmaEntity.language == language,
|
||||
|
|
@ -71,7 +88,9 @@ class PostgresDictionaryRepository:
|
|||
)
|
||||
return [_sense_to_model(e) for e in result.scalars().all()]
|
||||
|
||||
async def find_senses_by_english_gloss(self, text: str, target_lang: str) -> list[Sense]:
|
||||
async def find_senses_by_english_gloss(
|
||||
self, text: str, target_lang: str
|
||||
) -> list[Sense]:
|
||||
"""EN→target direction: find senses whose gloss matches the given English text.
|
||||
|
||||
Uses a case-insensitive exact match on the gloss column, filtered to the
|
||||
|
|
@ -79,7 +98,10 @@ class PostgresDictionaryRepository:
|
|||
"""
|
||||
result = await self.db.execute(
|
||||
select(DictionarySenseEntity)
|
||||
.join(DictionaryLemmaEntity, DictionarySenseEntity.lemma_id == DictionaryLemmaEntity.id)
|
||||
.join(
|
||||
DictionaryLemmaEntity,
|
||||
DictionarySenseEntity.lemma_id == DictionaryLemmaEntity.id,
|
||||
)
|
||||
.where(
|
||||
DictionarySenseEntity.gloss.ilike(text),
|
||||
DictionaryLemmaEntity.language == target_lang,
|
||||
|
|
@ -107,7 +129,10 @@ class PostgresDictionaryRepository:
|
|||
) -> list[Sense]:
|
||||
result = await self.db.execute(
|
||||
select(DictionarySenseEntity)
|
||||
.join(DictionaryLemmaEntity, DictionarySenseEntity.lemma_id == DictionaryLemmaEntity.id)
|
||||
.join(
|
||||
DictionaryLemmaEntity,
|
||||
DictionarySenseEntity.lemma_id == DictionaryLemmaEntity.id,
|
||||
)
|
||||
.where(
|
||||
DictionaryLemmaEntity.headword == headword,
|
||||
DictionaryLemmaEntity.language == language,
|
||||
|
|
@ -128,7 +153,10 @@ class PostgresDictionaryRepository:
|
|||
async def get_wordforms_by_form(self, form: str, language: str) -> list[Wordform]:
|
||||
result = await self.db.execute(
|
||||
select(DictionaryWordformEntity)
|
||||
.join(DictionaryLemmaEntity, DictionaryWordformEntity.lemma_id == DictionaryLemmaEntity.id)
|
||||
.join(
|
||||
DictionaryLemmaEntity,
|
||||
DictionaryWordformEntity.lemma_id == DictionaryLemmaEntity.id,
|
||||
)
|
||||
.where(
|
||||
DictionaryWordformEntity.form == form,
|
||||
DictionaryLemmaEntity.language == language,
|
||||
|
|
@ -136,6 +164,47 @@ class PostgresDictionaryRepository:
|
|||
)
|
||||
return [_wordform_to_model(e) for e in result.scalars().all()]
|
||||
|
||||
async def search_wordforms_by_prefix(
    self, prefix: str, language: str
) -> list[Wordform]:
    """Accent- and case-insensitive prefix search over wordform surface text.

    Both the stored form and the user-supplied prefix go through Postgres'
    ``unaccent`` before comparison, so e.g. "chatea" matches "château".

    Args:
        prefix: User-typed prefix. LIKE metacharacters in it are escaped so
            they match literally instead of acting as wildcards.
        language: Language code the matching lemmas must belong to.

    Returns:
        All wordforms whose form starts with the (unaccented) prefix.
    """
    # Escape %, _ and \ so a user typing "%" gets a literal match rather
    # than an unbounded wildcard scan.
    escaped = (
        prefix.replace("\\", "\\\\").replace("%", "\\%").replace("_", "\\_")
    )
    result = await self.db.execute(
        select(DictionaryWordformEntity)
        .join(
            DictionaryLemmaEntity,
            DictionaryWordformEntity.lemma_id == DictionaryLemmaEntity.id,
        )
        .where(
            func.unaccent(DictionaryWordformEntity.form).ilike(
                func.unaccent(escaped) + "%", escape="\\"
            ),
            DictionaryLemmaEntity.language == language,
        )
    )
    return [_wordform_to_model(e) for e in result.scalars().all()]
|
||||
|
||||
async def search_senses_by_prefix(
    self, prefix: str, lang: str
) -> list[tuple[Sense, Lemma]]:
    """Case-insensitive prefix search over sense glosses.

    Args:
        prefix: User-typed gloss prefix. LIKE metacharacters are escaped so
            they match literally.
        lang: Language code of the lemma the sense belongs to.

    Returns:
        (sense, lemma) pairs, so callers can render the headword a gloss
        belongs to without a second lookup.
    """
    # Escape LIKE metacharacters and anchor as a prefix pattern — the
    # previous exact-ilike match contradicted this method's name.
    escaped = (
        prefix.replace("\\", "\\\\").replace("%", "\\%").replace("_", "\\_")
    )
    result = await self.db.execute(
        select(DictionarySenseEntity, DictionaryLemmaEntity)
        .join(
            DictionaryLemmaEntity,
            DictionarySenseEntity.lemma_id == DictionaryLemmaEntity.id,
        )
        .where(
            DictionarySenseEntity.gloss.ilike(escaped + "%", escape="\\"),
            DictionaryLemmaEntity.language == lang,
        )
    )

    results: list[tuple[Sense, Lemma]] = []
    for row in result.all():
        sense, lemma = row.tuple()
        results.append((_sense_to_model(sense), _lemma_to_model(lemma)))
    return results
|
||||
|
||||
async def get_wordforms_for_lemma(self, lemma_id: uuid.UUID) -> list[Wordform]:
|
||||
result = await self.db.execute(
|
||||
select(DictionaryWordformEntity).where(
|
||||
|
|
|
|||
|
|
@ -4,15 +4,20 @@ from fastapi import APIRouter, Depends, HTTPException
|
|||
from pydantic import BaseModel
|
||||
from sqlalchemy.ext.asyncio import AsyncSession
|
||||
|
||||
from app.domain.models.dictionary import Lemma, Sense
|
||||
|
||||
from ...auth import verify_token
|
||||
from ...outbound.postgres.database import get_db
|
||||
from ...outbound.postgres.repositories.dictionary_repository import PostgresDictionaryRepository
|
||||
from ...outbound.postgres.repositories.dictionary_repository import (
|
||||
PostgresDictionaryRepository,
|
||||
)
|
||||
|
||||
router = APIRouter(prefix="/dictionary", tags=["dictionary"])
|
||||
|
||||
|
||||
# ── Response models ───────────────────────────────────────────────────────────
|
||||
|
||||
|
||||
class SenseResponse(BaseModel):
|
||||
id: str
|
||||
sense_index: int
|
||||
|
|
@ -31,33 +36,61 @@ class LemmaResponse(BaseModel):
|
|||
tags: list[str]
|
||||
|
||||
|
||||
def _sense_to_response(s: Sense) -> SenseResponse:
    """Map a domain ``Sense`` onto its API response model."""
    fields = {
        "id": s.id,
        "sense_index": s.sense_index,
        "gloss": s.gloss,
        "topics": s.topics,
        "tags": s.tags,
    }
    return SenseResponse(**fields)
|
||||
|
||||
|
||||
def _lemma_to_response(lemma: Lemma) -> LemmaResponse:
    """Map a domain ``Lemma`` onto its API response model."""
    fields = {
        "id": lemma.id,
        "headword": lemma.headword,
        "language": lemma.language,
        "pos_raw": lemma.pos_raw,
        "pos_normalised": lemma.pos_normalised,
        "gender": lemma.gender,
        "tags": lemma.tags,
    }
    return LemmaResponse(**fields)
|
||||
|
||||
|
||||
class WordformMatch(BaseModel):
    """One matched lemma together with all of its senses."""

    lemma: LemmaResponse
    senses: list[SenseResponse]
|
||||
|
||||
|
||||
class SenseMatch(BaseModel):
    """One matched sense paired with the lemma it belongs to."""

    sense: SenseResponse
    lemma: LemmaResponse
|
||||
|
||||
|
||||
# ── Endpoint ──────────────────────────────────────────────────────────────────
|
||||
|
||||
@router.get("/wordforms", response_model=list[WordformMatch])
|
||||
async def search_wordforms(
|
||||
|
||||
@router.get("/search", response_model=list[WordformMatch])
|
||||
async def search_wordforms_prefix(
|
||||
lang_code: str,
|
||||
text: str,
|
||||
db: AsyncSession = Depends(get_db),
|
||||
_: dict = Depends(verify_token),
|
||||
) -> list[WordformMatch]:
|
||||
"""
|
||||
Search for a wordform by surface text within a language.
|
||||
Search for wordforms whose surface text starts with the given prefix.
|
||||
|
||||
Returns one entry per matching lemma, each with the lemma's senses. A single
|
||||
form (e.g. "allons") may resolve to more than one lemma when homographs exist.
|
||||
Uses accent-insensitive, case-insensitive prefix matching so that e.g.
|
||||
"chatea" returns both "château" and "châteaux", and "lent" returns all
|
||||
four forms of the adjective. Returns one entry per matching lemma.
|
||||
"""
|
||||
repo = PostgresDictionaryRepository(db)
|
||||
wordforms = await repo.get_wordforms_by_form(text, lang_code)
|
||||
wordforms = await repo.search_wordforms_by_prefix(text, lang_code)
|
||||
|
||||
if not wordforms:
|
||||
return []
|
||||
|
||||
# Deduplicate lemma IDs — multiple wordform rows may point to the same lemma
|
||||
seen_lemma_ids: set[str] = set()
|
||||
results: list[WordformMatch] = []
|
||||
|
||||
|
|
@ -97,3 +130,71 @@ async def search_wordforms(
|
|||
)
|
||||
|
||||
return results
|
||||
|
||||
|
||||
@router.get("/senses", response_model=list[SenseMatch])
async def search_senses(
    lang_code: str,
    text: str,
    db: AsyncSession = Depends(get_db),
    _: dict = Depends(verify_token),
) -> list[SenseMatch]:
    """
    Search for senses by their (English) gloss text.

    Returns one entry per matching sense, each paired with the lemma it
    belongs to.
    """
    repo = PostgresDictionaryRepository(db)
    matches = await repo.search_senses_by_prefix(text, lang_code)

    # An empty match list naturally yields [] — no separate guard needed.
    return [
        SenseMatch(sense=_sense_to_response(sense), lemma=_lemma_to_response(lemma))
        for sense, lemma in matches
    ]
|
||||
|
||||
|
||||
@router.get("/wordforms", response_model=list[WordformMatch])
async def search_wordforms(
    lang_code: str,
    text: str,
    db: AsyncSession = Depends(get_db),
    _: dict = Depends(verify_token),
) -> list[WordformMatch]:
    """
    Search for a wordform by surface text within a language.

    Returns one entry per matching lemma, each with the lemma's senses. A single
    form (e.g. "allons") may resolve to more than one lemma when homographs exist.
    """
    repo = PostgresDictionaryRepository(db)
    wordforms = await repo.get_wordforms_by_form(text, lang_code)
    if not wordforms:
        return []

    # Several wordform rows can point at the same lemma — emit each lemma once.
    seen: set[str] = set()
    matches: list[WordformMatch] = []

    for wordform in wordforms:
        lemma_key = wordform.lemma_id
        if lemma_key in seen:
            continue
        seen.add(lemma_key)

        lemma = await repo.get_lemma(uuid.UUID(lemma_key))
        if lemma is None:
            continue

        senses = await repo.get_senses_for_lemma(uuid.UUID(lemma_key))

        matches.append(
            WordformMatch(
                lemma=_lemma_to_response(lemma),
                senses=[_sense_to_response(s) for s in senses],
            )
        )

    return matches
|
||||
|
|
|
|||
Loading…
Reference in a new issue