feat: [api] Create better "search" functionality for the dictionary
This commit is contained in:
parent
fd96396c30
commit
c9dd9d0b4c
3 changed files with 213 additions and 20 deletions
|
|
"""enable unaccent extension

Revision ID: 0015
Revises: 0014
Create Date: 2026-04-17

"""

from typing import Sequence, Union

from alembic import op

# Alembic revision identifiers.
revision: str = "0015"
down_revision: Union[str, None] = "0014"
branch_labels: Union[str, Sequence[str], None] = None
depends_on: Union[str, Sequence[str], None] = None


def upgrade() -> None:
    """Install the Postgres ``unaccent`` extension used by prefix search."""
    op.execute("CREATE EXTENSION IF NOT EXISTS unaccent")


def downgrade() -> None:
    """Remove the ``unaccent`` extension."""
    op.execute("DROP EXTENSION IF EXISTS unaccent")
|
@ -1,25 +1,37 @@
|
||||||
import uuid
|
import uuid
|
||||||
|
from dataclasses import dataclass
|
||||||
from typing import Protocol
|
from typing import Protocol
|
||||||
|
|
||||||
from sqlalchemy import select
|
from sqlalchemy import func, select
|
||||||
from sqlalchemy.ext.asyncio import AsyncSession
|
from sqlalchemy.ext.asyncio import AsyncSession
|
||||||
|
|
||||||
|
from ....domain.models.dictionary import Lemma, Sense, Wordform
|
||||||
from ..entities.dictionary_entities import (
|
from ..entities.dictionary_entities import (
|
||||||
DictionaryLemmaEntity,
|
DictionaryLemmaEntity,
|
||||||
DictionarySenseEntity,
|
DictionarySenseEntity,
|
||||||
DictionaryWordformEntity,
|
DictionaryWordformEntity,
|
||||||
)
|
)
|
||||||
from ....domain.models.dictionary import Lemma, Sense, Wordform
|
|
||||||
|
|
||||||
|
|
||||||
class DictionaryRepository(Protocol):
    """Read-only port over the dictionary store (lemmas, senses, wordforms)."""

    async def get_senses_for_headword(
        self, headword: str, language: str
    ) -> list[Sense]: ...

    async def get_senses_for_headword_and_pos(
        self, headword: str, language: str, pos_normalised: str
    ) -> list[Sense]: ...

    async def get_senses_for_lemma(self, lemma_id: uuid.UUID) -> list[Sense]: ...

    async def find_senses_by_english_gloss(
        self, text: str, target_lang: str
    ) -> list[Sense]: ...

    async def get_sense(self, sense_id: uuid.UUID) -> Sense | None: ...

    async def get_lemma(self, lemma_id: uuid.UUID) -> Lemma | None: ...

    async def get_wordforms_by_form(
        self, form: str, language: str
    ) -> list[Wordform]: ...

    async def search_wordforms_by_prefix(
        self, prefix: str, language: str
    ) -> list[Wordform]: ...

    # Declared here because PostgresDictionaryRepository implements it and the
    # /senses router endpoint calls it through this port.
    async def search_senses_by_prefix(
        self, prefix: str, lang: str
    ) -> list[tuple[Sense, Lemma]]: ...

    async def get_wordforms_for_lemma(self, lemma_id: uuid.UUID) -> list[Wordform]: ...
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -59,10 +71,15 @@ class PostgresDictionaryRepository:
|
||||||
def __init__(self, db: AsyncSession) -> None:
|
def __init__(self, db: AsyncSession) -> None:
|
||||||
self.db = db
|
self.db = db
|
||||||
|
|
||||||
async def get_senses_for_headword(self, headword: str, language: str) -> list[Sense]:
|
async def get_senses_for_headword(
|
||||||
|
self, headword: str, language: str
|
||||||
|
) -> list[Sense]:
|
||||||
result = await self.db.execute(
|
result = await self.db.execute(
|
||||||
select(DictionarySenseEntity)
|
select(DictionarySenseEntity)
|
||||||
.join(DictionaryLemmaEntity, DictionarySenseEntity.lemma_id == DictionaryLemmaEntity.id)
|
.join(
|
||||||
|
DictionaryLemmaEntity,
|
||||||
|
DictionarySenseEntity.lemma_id == DictionaryLemmaEntity.id,
|
||||||
|
)
|
||||||
.where(
|
.where(
|
||||||
DictionaryLemmaEntity.headword == headword,
|
DictionaryLemmaEntity.headword == headword,
|
||||||
DictionaryLemmaEntity.language == language,
|
DictionaryLemmaEntity.language == language,
|
||||||
|
|
@ -71,7 +88,9 @@ class PostgresDictionaryRepository:
|
||||||
)
|
)
|
||||||
return [_sense_to_model(e) for e in result.scalars().all()]
|
return [_sense_to_model(e) for e in result.scalars().all()]
|
||||||
|
|
||||||
async def find_senses_by_english_gloss(self, text: str, target_lang: str) -> list[Sense]:
|
async def find_senses_by_english_gloss(
|
||||||
|
self, text: str, target_lang: str
|
||||||
|
) -> list[Sense]:
|
||||||
"""EN→target direction: find senses whose gloss matches the given English text.
|
"""EN→target direction: find senses whose gloss matches the given English text.
|
||||||
|
|
||||||
Uses a case-insensitive exact match on the gloss column, filtered to the
|
Uses a case-insensitive exact match on the gloss column, filtered to the
|
||||||
|
|
@ -79,7 +98,10 @@ class PostgresDictionaryRepository:
|
||||||
"""
|
"""
|
||||||
result = await self.db.execute(
|
result = await self.db.execute(
|
||||||
select(DictionarySenseEntity)
|
select(DictionarySenseEntity)
|
||||||
.join(DictionaryLemmaEntity, DictionarySenseEntity.lemma_id == DictionaryLemmaEntity.id)
|
.join(
|
||||||
|
DictionaryLemmaEntity,
|
||||||
|
DictionarySenseEntity.lemma_id == DictionaryLemmaEntity.id,
|
||||||
|
)
|
||||||
.where(
|
.where(
|
||||||
DictionarySenseEntity.gloss.ilike(text),
|
DictionarySenseEntity.gloss.ilike(text),
|
||||||
DictionaryLemmaEntity.language == target_lang,
|
DictionaryLemmaEntity.language == target_lang,
|
||||||
|
|
@ -107,7 +129,10 @@ class PostgresDictionaryRepository:
|
||||||
) -> list[Sense]:
|
) -> list[Sense]:
|
||||||
result = await self.db.execute(
|
result = await self.db.execute(
|
||||||
select(DictionarySenseEntity)
|
select(DictionarySenseEntity)
|
||||||
.join(DictionaryLemmaEntity, DictionarySenseEntity.lemma_id == DictionaryLemmaEntity.id)
|
.join(
|
||||||
|
DictionaryLemmaEntity,
|
||||||
|
DictionarySenseEntity.lemma_id == DictionaryLemmaEntity.id,
|
||||||
|
)
|
||||||
.where(
|
.where(
|
||||||
DictionaryLemmaEntity.headword == headword,
|
DictionaryLemmaEntity.headword == headword,
|
||||||
DictionaryLemmaEntity.language == language,
|
DictionaryLemmaEntity.language == language,
|
||||||
|
|
@ -128,7 +153,10 @@ class PostgresDictionaryRepository:
|
||||||
async def get_wordforms_by_form(self, form: str, language: str) -> list[Wordform]:
|
async def get_wordforms_by_form(self, form: str, language: str) -> list[Wordform]:
|
||||||
result = await self.db.execute(
|
result = await self.db.execute(
|
||||||
select(DictionaryWordformEntity)
|
select(DictionaryWordformEntity)
|
||||||
.join(DictionaryLemmaEntity, DictionaryWordformEntity.lemma_id == DictionaryLemmaEntity.id)
|
.join(
|
||||||
|
DictionaryLemmaEntity,
|
||||||
|
DictionaryWordformEntity.lemma_id == DictionaryLemmaEntity.id,
|
||||||
|
)
|
||||||
.where(
|
.where(
|
||||||
DictionaryWordformEntity.form == form,
|
DictionaryWordformEntity.form == form,
|
||||||
DictionaryLemmaEntity.language == language,
|
DictionaryLemmaEntity.language == language,
|
||||||
|
|
@ -136,6 +164,47 @@ class PostgresDictionaryRepository:
|
||||||
)
|
)
|
||||||
return [_wordform_to_model(e) for e in result.scalars().all()]
|
return [_wordform_to_model(e) for e in result.scalars().all()]
|
||||||
|
|
||||||
|
async def search_wordforms_by_prefix(
|
||||||
|
self, prefix: str, language: str
|
||||||
|
) -> list[Wordform]:
|
||||||
|
result = await self.db.execute(
|
||||||
|
select(DictionaryWordformEntity)
|
||||||
|
.join(
|
||||||
|
DictionaryLemmaEntity,
|
||||||
|
DictionaryWordformEntity.lemma_id == DictionaryLemmaEntity.id,
|
||||||
|
)
|
||||||
|
.where(
|
||||||
|
func.unaccent(DictionaryWordformEntity.form).ilike(
|
||||||
|
func.unaccent(prefix) + "%"
|
||||||
|
),
|
||||||
|
DictionaryLemmaEntity.language == language,
|
||||||
|
)
|
||||||
|
)
|
||||||
|
return [_wordform_to_model(e) for e in result.scalars().all()]
|
||||||
|
|
||||||
|
async def search_senses_by_prefix(
|
||||||
|
self, prefix: str, lang: str
|
||||||
|
) -> list[tuple[Sense, Lemma]]:
|
||||||
|
result = await self.db.execute(
|
||||||
|
select(DictionarySenseEntity, DictionaryLemmaEntity)
|
||||||
|
.join(
|
||||||
|
DictionaryLemmaEntity,
|
||||||
|
DictionarySenseEntity.lemma_id == DictionaryLemmaEntity.id,
|
||||||
|
)
|
||||||
|
.where(
|
||||||
|
DictionarySenseEntity.gloss.ilike(prefix),
|
||||||
|
DictionaryLemmaEntity.language == lang,
|
||||||
|
)
|
||||||
|
)
|
||||||
|
|
||||||
|
results: list[tuple[Sense, Lemma]] = []
|
||||||
|
|
||||||
|
for sense_with_lemma in result.all():
|
||||||
|
sense, lemma = sense_with_lemma.tuple()
|
||||||
|
results.append((_sense_to_model(sense), _lemma_to_model(lemma)))
|
||||||
|
|
||||||
|
return results
|
||||||
|
|
||||||
async def get_wordforms_for_lemma(self, lemma_id: uuid.UUID) -> list[Wordform]:
|
async def get_wordforms_for_lemma(self, lemma_id: uuid.UUID) -> list[Wordform]:
|
||||||
result = await self.db.execute(
|
result = await self.db.execute(
|
||||||
select(DictionaryWordformEntity).where(
|
select(DictionaryWordformEntity).where(
|
||||||
|
|
|
||||||
|
|
@ -4,15 +4,20 @@ from fastapi import APIRouter, Depends, HTTPException
|
||||||
from pydantic import BaseModel
|
from pydantic import BaseModel
|
||||||
from sqlalchemy.ext.asyncio import AsyncSession
|
from sqlalchemy.ext.asyncio import AsyncSession
|
||||||
|
|
||||||
|
from app.domain.models.dictionary import Lemma, Sense
|
||||||
|
|
||||||
from ...auth import verify_token
|
from ...auth import verify_token
|
||||||
from ...outbound.postgres.database import get_db
|
from ...outbound.postgres.database import get_db
|
||||||
from ...outbound.postgres.repositories.dictionary_repository import PostgresDictionaryRepository
|
from ...outbound.postgres.repositories.dictionary_repository import (
|
||||||
|
PostgresDictionaryRepository,
|
||||||
|
)
|
||||||
|
|
||||||
router = APIRouter(prefix="/dictionary", tags=["dictionary"])
|
router = APIRouter(prefix="/dictionary", tags=["dictionary"])
|
||||||
|
|
||||||
|
|
||||||
# ── Response models ───────────────────────────────────────────────────────────
|
# ── Response models ───────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
|
||||||
class SenseResponse(BaseModel):
|
class SenseResponse(BaseModel):
|
||||||
id: str
|
id: str
|
||||||
sense_index: int
|
sense_index: int
|
||||||
|
|
@ -31,33 +36,61 @@ class LemmaResponse(BaseModel):
|
||||||
tags: list[str]
|
tags: list[str]
|
||||||
|
|
||||||
|
|
||||||
|
def _sense_to_response(s: Sense) -> SenseResponse:
    """Map a domain-layer Sense onto its API response model."""
    fields = {
        "id": s.id,
        "sense_index": s.sense_index,
        "gloss": s.gloss,
        "topics": s.topics,
        "tags": s.tags,
    }
    return SenseResponse(**fields)
|
||||||
|
|
||||||
|
|
||||||
|
def _lemma_to_response(lemma: Lemma) -> LemmaResponse:
    """Map a domain-layer Lemma onto its API response model."""
    fields = {
        "id": lemma.id,
        "headword": lemma.headword,
        "language": lemma.language,
        "pos_raw": lemma.pos_raw,
        "pos_normalised": lemma.pos_normalised,
        "gender": lemma.gender,
        "tags": lemma.tags,
    }
    return LemmaResponse(**fields)
|
||||||
|
|
||||||
|
|
||||||
class WordformMatch(BaseModel):
    """One matched lemma together with all of its senses."""

    lemma: LemmaResponse
    senses: list[SenseResponse]
|
||||||
|
|
||||||
|
|
||||||
|
class SenseMatch(BaseModel):
    """One matched sense paired with its owning lemma."""

    sense: SenseResponse
    lemma: LemmaResponse
|
||||||
|
|
||||||
|
|
||||||
# ── Endpoint ──────────────────────────────────────────────────────────────────
|
# ── Endpoint ──────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
@router.get("/wordforms", response_model=list[WordformMatch])
|
|
||||||
async def search_wordforms(
|
@router.get("/search", response_model=list[WordformMatch])
|
||||||
|
async def search_wordforms_prefix(
|
||||||
lang_code: str,
|
lang_code: str,
|
||||||
text: str,
|
text: str,
|
||||||
db: AsyncSession = Depends(get_db),
|
db: AsyncSession = Depends(get_db),
|
||||||
_: dict = Depends(verify_token),
|
_: dict = Depends(verify_token),
|
||||||
) -> list[WordformMatch]:
|
) -> list[WordformMatch]:
|
||||||
"""
|
"""
|
||||||
Search for a wordform by surface text within a language.
|
Search for wordforms whose surface text starts with the given prefix.
|
||||||
|
|
||||||
Returns one entry per matching lemma, each with the lemma's senses. A single
|
Uses accent-insensitive, case-insensitive prefix matching so that e.g.
|
||||||
form (e.g. "allons") may resolve to more than one lemma when homographs exist.
|
"chatea" returns both "château" and "châteaux", and "lent" returns all
|
||||||
|
four forms of the adjective. Returns one entry per matching lemma.
|
||||||
"""
|
"""
|
||||||
repo = PostgresDictionaryRepository(db)
|
repo = PostgresDictionaryRepository(db)
|
||||||
wordforms = await repo.get_wordforms_by_form(text, lang_code)
|
wordforms = await repo.search_wordforms_by_prefix(text, lang_code)
|
||||||
|
|
||||||
if not wordforms:
|
if not wordforms:
|
||||||
return []
|
return []
|
||||||
|
|
||||||
# Deduplicate lemma IDs — multiple wordform rows may point to the same lemma
|
|
||||||
seen_lemma_ids: set[str] = set()
|
seen_lemma_ids: set[str] = set()
|
||||||
results: list[WordformMatch] = []
|
results: list[WordformMatch] = []
|
||||||
|
|
||||||
|
|
@ -97,3 +130,71 @@ async def search_wordforms(
|
||||||
)
|
)
|
||||||
|
|
||||||
return results
|
return results
|
||||||
|
|
||||||
|
|
||||||
|
@router.get("/senses", response_model=list[SenseMatch])
async def search_senses(
    lang_code: str,
    text: str,
    db: AsyncSession = Depends(get_db),
    _: dict = Depends(verify_token),
) -> list[SenseMatch]:
    """
    Search for senses by (English) definition.

    Returns one entry per matching sense, each paired with its owning lemma.
    """
    repo = PostgresDictionaryRepository(db)
    matches = await repo.search_senses_by_prefix(text, lang_code)

    # An empty result simply yields an empty list.
    return [
        SenseMatch(sense=_sense_to_response(sense), lemma=_lemma_to_response(lemma))
        for sense, lemma in matches
    ]
|
||||||
|
|
||||||
|
|
||||||
|
@router.get("/wordforms", response_model=list[WordformMatch])
async def search_wordforms(
    lang_code: str,
    text: str,
    db: AsyncSession = Depends(get_db),
    _: dict = Depends(verify_token),
) -> list[WordformMatch]:
    """
    Search for a wordform by surface text within a language.

    Returns one entry per matching lemma, each with the lemma's senses. A single
    form (e.g. "allons") may resolve to more than one lemma when homographs exist.
    """
    repo = PostgresDictionaryRepository(db)
    wordforms = await repo.get_wordforms_by_form(text, lang_code)

    matches: list[WordformMatch] = []
    # Multiple wordform rows may point at the same lemma — emit each lemma once.
    handled_lemma_ids: set[str] = set()

    for wordform in wordforms:
        if wordform.lemma_id in handled_lemma_ids:
            continue
        handled_lemma_ids.add(wordform.lemma_id)

        lemma_uuid = uuid.UUID(wordform.lemma_id)
        lemma = await repo.get_lemma(lemma_uuid)
        if lemma is None:
            continue

        senses = await repo.get_senses_for_lemma(lemma_uuid)
        matches.append(
            WordformMatch(
                lemma=_lemma_to_response(lemma),
                senses=[_sense_to_response(s) for s in senses],
            )
        )

    return matches
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue