Compare commits
No commits in common. "aa4987981d06e3fc2643d03560f4d4f76c094272" and "2cae5d9445d350dfeb8617b91a4cb60da551f59b" have entirely different histories.
aa4987981d
...
2cae5d9445
30 changed files with 18 additions and 2454 deletions
9
Makefile
9
Makefile
|
|
@ -1,4 +1,4 @@
|
|||
.PHONY: down build up logs shell lock migrate migration import-dictionary
|
||||
.PHONY: down build up logs shell lock migrate migration
|
||||
|
||||
build:
|
||||
docker compose build
|
||||
|
|
@ -28,10 +28,3 @@ lock:
|
|||
cd api && uv pip compile pyproject.toml -o requirements.txt
|
||||
|
||||
rebuild: down build up
|
||||
|
||||
# Import a kaikki dictionary JSONL into Postgres.
|
||||
# Requires the DB to be running with its port exposed on localhost (docker compose up).
|
||||
# DATABASE_URL defaults to the docker-compose dev credentials.
|
||||
# Usage: make import-dictionary lang=fr
|
||||
import-dictionary:
|
||||
cd api && python scripts/import_dictionary.py --lang $(lang)
|
||||
|
|
|
|||
|
|
@ -10,7 +10,6 @@ from app.outbound.postgres.database import Base
|
|||
|
||||
import app.outbound.postgres.entities.summarise_job_entity
|
||||
import app.outbound.postgres.entities.user_entity
|
||||
import app.outbound.postgres.entities.dictionary_entities
|
||||
|
||||
config = context.config
|
||||
config.set_main_option("sqlalchemy.url", settings.database_url)
|
||||
|
|
|
|||
|
|
@ -1,89 +0,0 @@
|
|||
"""add dictionary tables
|
||||
|
||||
Revision ID: 0007
|
||||
Revises: 0006
|
||||
Create Date: 2026-04-07
|
||||
|
||||
"""
|
||||
from typing import Sequence, Union
|
||||
|
||||
import sqlalchemy as sa
|
||||
from alembic import op
|
||||
from sqlalchemy.dialects import postgresql
|
||||
|
||||
revision: str = "0007"
|
||||
down_revision: Union[str, None] = "0006"
|
||||
branch_labels: Union[str, Sequence[str], None] = None
|
||||
depends_on: Union[str, Sequence[str], None] = None
|
||||
|
||||
|
||||
def upgrade() -> None:
|
||||
op.create_table(
|
||||
"dictionary_lemma",
|
||||
sa.Column("id", postgresql.UUID(as_uuid=True), primary_key=True),
|
||||
sa.Column("headword", sa.Text(), nullable=False),
|
||||
sa.Column("language", sa.String(2), nullable=False),
|
||||
sa.Column("pos_raw", sa.Text(), nullable=False),
|
||||
sa.Column("pos_normalised", sa.Text(), nullable=True),
|
||||
sa.Column("gender", sa.Text(), nullable=True),
|
||||
sa.Column("tags", postgresql.ARRAY(sa.Text()), nullable=False, server_default="{}"),
|
||||
)
|
||||
op.create_index("ix_dictionary_lemma_headword_language", "dictionary_lemma", ["headword", "language"])
|
||||
op.create_index("ix_dictionary_lemma_language", "dictionary_lemma", ["language"])
|
||||
|
||||
op.create_table(
|
||||
"dictionary_sense",
|
||||
sa.Column("id", postgresql.UUID(as_uuid=True), primary_key=True),
|
||||
sa.Column(
|
||||
"lemma_id",
|
||||
postgresql.UUID(as_uuid=True),
|
||||
sa.ForeignKey("dictionary_lemma.id", ondelete="CASCADE"),
|
||||
nullable=False,
|
||||
),
|
||||
sa.Column("sense_index", sa.Integer(), nullable=False),
|
||||
sa.Column("gloss", sa.Text(), nullable=False, server_default=""),
|
||||
sa.Column("topics", postgresql.ARRAY(sa.Text()), nullable=False, server_default="{}"),
|
||||
sa.Column("tags", postgresql.ARRAY(sa.Text()), nullable=False, server_default="{}"),
|
||||
)
|
||||
op.create_index("ix_dictionary_sense_lemma_id", "dictionary_sense", ["lemma_id"])
|
||||
|
||||
op.create_table(
|
||||
"dictionary_wordform",
|
||||
sa.Column("id", postgresql.UUID(as_uuid=True), primary_key=True),
|
||||
sa.Column(
|
||||
"lemma_id",
|
||||
postgresql.UUID(as_uuid=True),
|
||||
sa.ForeignKey("dictionary_lemma.id", ondelete="CASCADE"),
|
||||
nullable=False,
|
||||
),
|
||||
sa.Column("form", sa.Text(), nullable=False),
|
||||
sa.Column("tags", postgresql.ARRAY(sa.Text()), nullable=False, server_default="{}"),
|
||||
)
|
||||
op.create_index("ix_dictionary_wordform_lemma_id", "dictionary_wordform", ["lemma_id"])
|
||||
op.create_index("ix_dictionary_wordform_form", "dictionary_wordform", ["form"])
|
||||
|
||||
op.create_table(
|
||||
"dictionary_lemma_raw",
|
||||
sa.Column("id", postgresql.UUID(as_uuid=True), primary_key=True),
|
||||
sa.Column(
|
||||
"lemma_id",
|
||||
postgresql.UUID(as_uuid=True),
|
||||
sa.ForeignKey("dictionary_lemma.id", ondelete="CASCADE"),
|
||||
nullable=False,
|
||||
unique=True,
|
||||
),
|
||||
sa.Column("language", sa.String(2), nullable=False),
|
||||
sa.Column("raw", postgresql.JSONB(), nullable=False),
|
||||
)
|
||||
|
||||
|
||||
def downgrade() -> None:
|
||||
op.drop_table("dictionary_lemma_raw")
|
||||
op.drop_index("ix_dictionary_wordform_form", table_name="dictionary_wordform")
|
||||
op.drop_index("ix_dictionary_wordform_lemma_id", table_name="dictionary_wordform")
|
||||
op.drop_table("dictionary_wordform")
|
||||
op.drop_index("ix_dictionary_sense_lemma_id", table_name="dictionary_sense")
|
||||
op.drop_table("dictionary_sense")
|
||||
op.drop_index("ix_dictionary_lemma_language", table_name="dictionary_lemma")
|
||||
op.drop_index("ix_dictionary_lemma_headword_language", table_name="dictionary_lemma")
|
||||
op.drop_table("dictionary_lemma")
|
||||
|
|
@ -1,96 +0,0 @@
|
|||
"""add vocab bank tables
|
||||
|
||||
Revision ID: 0008
|
||||
Revises: 0007
|
||||
Create Date: 2026-04-08
|
||||
|
||||
"""
|
||||
from typing import Sequence, Union
|
||||
|
||||
import sqlalchemy as sa
|
||||
from alembic import op
|
||||
from sqlalchemy.dialects import postgresql
|
||||
|
||||
revision: str = "0008"
|
||||
down_revision: Union[str, None] = "0007"
|
||||
branch_labels: Union[str, Sequence[str], None] = None
|
||||
depends_on: Union[str, Sequence[str], None] = None
|
||||
|
||||
|
||||
def upgrade() -> None:
|
||||
op.create_table(
|
||||
"user_language_pair",
|
||||
sa.Column("id", postgresql.UUID(as_uuid=True), primary_key=True),
|
||||
sa.Column(
|
||||
"user_id",
|
||||
postgresql.UUID(as_uuid=True),
|
||||
sa.ForeignKey("users.id", ondelete="CASCADE"),
|
||||
nullable=False,
|
||||
),
|
||||
sa.Column("source_lang", sa.String(2), nullable=False),
|
||||
sa.Column("target_lang", sa.String(2), nullable=False),
|
||||
sa.UniqueConstraint("user_id", "source_lang", "target_lang", name="uq_user_language_pair"),
|
||||
)
|
||||
op.create_index("ix_user_language_pair_user_id", "user_language_pair", ["user_id"])
|
||||
|
||||
op.create_table(
|
||||
"learnable_word_bank_entry",
|
||||
sa.Column("id", postgresql.UUID(as_uuid=True), primary_key=True),
|
||||
sa.Column(
|
||||
"user_id",
|
||||
postgresql.UUID(as_uuid=True),
|
||||
sa.ForeignKey("users.id", ondelete="CASCADE"),
|
||||
nullable=False,
|
||||
),
|
||||
sa.Column(
|
||||
"language_pair_id",
|
||||
postgresql.UUID(as_uuid=True),
|
||||
sa.ForeignKey("user_language_pair.id", ondelete="CASCADE"),
|
||||
nullable=False,
|
||||
),
|
||||
sa.Column(
|
||||
"sense_id",
|
||||
postgresql.UUID(as_uuid=True),
|
||||
sa.ForeignKey("dictionary_sense.id", ondelete="SET NULL"),
|
||||
nullable=True,
|
||||
),
|
||||
sa.Column(
|
||||
"wordform_id",
|
||||
postgresql.UUID(as_uuid=True),
|
||||
sa.ForeignKey("dictionary_wordform.id", ondelete="SET NULL"),
|
||||
nullable=True,
|
||||
),
|
||||
sa.Column("surface_text", sa.Text(), nullable=False),
|
||||
sa.Column("is_phrase", sa.Boolean(), nullable=False, server_default="false"),
|
||||
sa.Column("entry_pathway", sa.Text(), nullable=False),
|
||||
sa.Column("source_article_id", postgresql.UUID(as_uuid=True), nullable=True),
|
||||
sa.Column("disambiguation_status", sa.Text(), nullable=False, server_default="pending"),
|
||||
sa.Column(
|
||||
"created_at",
|
||||
sa.DateTime(timezone=True),
|
||||
nullable=False,
|
||||
server_default=sa.func.now(),
|
||||
),
|
||||
)
|
||||
op.create_index(
|
||||
"ix_learnable_word_bank_entry_user_id", "learnable_word_bank_entry", ["user_id"]
|
||||
)
|
||||
op.create_index(
|
||||
"ix_learnable_word_bank_entry_language_pair_id",
|
||||
"learnable_word_bank_entry",
|
||||
["language_pair_id"],
|
||||
)
|
||||
op.create_index(
|
||||
"ix_learnable_word_bank_entry_sense_id", "learnable_word_bank_entry", ["sense_id"]
|
||||
)
|
||||
|
||||
|
||||
def downgrade() -> None:
|
||||
op.drop_index("ix_learnable_word_bank_entry_sense_id", table_name="learnable_word_bank_entry")
|
||||
op.drop_index(
|
||||
"ix_learnable_word_bank_entry_language_pair_id", table_name="learnable_word_bank_entry"
|
||||
)
|
||||
op.drop_index("ix_learnable_word_bank_entry_user_id", table_name="learnable_word_bank_entry")
|
||||
op.drop_table("learnable_word_bank_entry")
|
||||
op.drop_index("ix_user_language_pair_user_id", table_name="user_language_pair")
|
||||
op.drop_table("user_language_pair")
|
||||
|
|
@ -1,88 +0,0 @@
|
|||
"""add flashcard tables
|
||||
|
||||
Revision ID: 0009
|
||||
Revises: 0008
|
||||
Create Date: 2026-04-08
|
||||
|
||||
"""
|
||||
from typing import Sequence, Union
|
||||
|
||||
import sqlalchemy as sa
|
||||
from alembic import op
|
||||
from sqlalchemy.dialects import postgresql
|
||||
|
||||
revision: str = "0009"
|
||||
down_revision: Union[str, None] = "0008"
|
||||
branch_labels: Union[str, Sequence[str], None] = None
|
||||
depends_on: Union[str, Sequence[str], None] = None
|
||||
|
||||
|
||||
def upgrade() -> None:
|
||||
op.create_table(
|
||||
"flashcard",
|
||||
sa.Column("id", postgresql.UUID(as_uuid=True), primary_key=True),
|
||||
sa.Column(
|
||||
"user_id",
|
||||
postgresql.UUID(as_uuid=True),
|
||||
sa.ForeignKey("users.id", ondelete="CASCADE"),
|
||||
nullable=False,
|
||||
),
|
||||
sa.Column(
|
||||
"bank_entry_id",
|
||||
postgresql.UUID(as_uuid=True),
|
||||
sa.ForeignKey("learnable_word_bank_entry.id", ondelete="CASCADE"),
|
||||
nullable=False,
|
||||
),
|
||||
sa.Column("source_lang", sa.Text(), nullable=False),
|
||||
sa.Column("target_lang", sa.Text(), nullable=False),
|
||||
sa.Column("prompt_text", sa.Text(), nullable=False),
|
||||
sa.Column("answer_text", sa.Text(), nullable=False),
|
||||
sa.Column("prompt_context_text", sa.Text(), nullable=True),
|
||||
sa.Column("answer_context_text", sa.Text(), nullable=True),
|
||||
sa.Column("card_direction", sa.Text(), nullable=False),
|
||||
sa.Column("prompt_modality", sa.Text(), nullable=False, server_default="text"),
|
||||
sa.Column(
|
||||
"created_at",
|
||||
sa.DateTime(timezone=True),
|
||||
nullable=False,
|
||||
server_default=sa.func.now(),
|
||||
),
|
||||
)
|
||||
op.create_index("ix_flashcard_user_id", "flashcard", ["user_id"])
|
||||
op.create_index("ix_flashcard_bank_entry_id", "flashcard", ["bank_entry_id"])
|
||||
|
||||
op.create_table(
|
||||
"flashcard_event",
|
||||
sa.Column("id", postgresql.UUID(as_uuid=True), primary_key=True),
|
||||
sa.Column(
|
||||
"flashcard_id",
|
||||
postgresql.UUID(as_uuid=True),
|
||||
sa.ForeignKey("flashcard.id", ondelete="CASCADE"),
|
||||
nullable=False,
|
||||
),
|
||||
sa.Column(
|
||||
"user_id",
|
||||
postgresql.UUID(as_uuid=True),
|
||||
sa.ForeignKey("users.id", ondelete="CASCADE"),
|
||||
nullable=False,
|
||||
),
|
||||
sa.Column("event_type", sa.Text(), nullable=False),
|
||||
sa.Column("user_response", sa.Text(), nullable=True),
|
||||
sa.Column(
|
||||
"created_at",
|
||||
sa.DateTime(timezone=True),
|
||||
nullable=False,
|
||||
server_default=sa.func.now(),
|
||||
),
|
||||
)
|
||||
op.create_index("ix_flashcard_event_flashcard_id", "flashcard_event", ["flashcard_id"])
|
||||
op.create_index("ix_flashcard_event_user_id", "flashcard_event", ["user_id"])
|
||||
|
||||
|
||||
def downgrade() -> None:
|
||||
op.drop_index("ix_flashcard_event_user_id", table_name="flashcard_event")
|
||||
op.drop_index("ix_flashcard_event_flashcard_id", table_name="flashcard_event")
|
||||
op.drop_table("flashcard_event")
|
||||
op.drop_index("ix_flashcard_bank_entry_id", table_name="flashcard")
|
||||
op.drop_index("ix_flashcard_user_id", table_name="flashcard")
|
||||
op.drop_table("flashcard")
|
||||
|
|
@ -1,14 +0,0 @@
|
|||
from dataclasses import dataclass, field
from datetime import datetime


@dataclass
class Account:
    """A user's account profile plus the set of languages they are learning."""

    # String form of the user's UUID primary key.
    id: str
    email: str
    is_active: bool
    is_email_verified: bool
    created_at: datetime
    # Defaults to empty so an Account can be built before (or without) the
    # learnable-language relation being loaded.
    learnable_languages: list[LearnableLanguage] = field(default_factory=list)
|
||||
|
|
@ -1,32 +0,0 @@
|
|||
from dataclasses import dataclass
|
||||
|
||||
|
||||
@dataclass
|
||||
class Wordform:
|
||||
id: str
|
||||
lemma_id: str
|
||||
form: str
|
||||
tags: list[str]
|
||||
|
||||
|
||||
@dataclass
|
||||
class Sense:
|
||||
id: str
|
||||
lemma_id: str
|
||||
sense_index: int
|
||||
gloss: str
|
||||
topics: list[str]
|
||||
tags: list[str]
|
||||
|
||||
|
||||
@dataclass
|
||||
class Lemma:
|
||||
id: str
|
||||
headword: str
|
||||
language: str
|
||||
pos_raw: str
|
||||
pos_normalised: str | None
|
||||
gender: str | None
|
||||
tags: list[str]
|
||||
senses: list[Sense] = field(default_factory=list)
|
||||
wordforms: list[Wordform] = field(default_factory=list)
|
||||
|
|
@ -1,28 +0,0 @@
|
|||
from dataclasses import dataclass
from datetime import datetime


@dataclass
class Flashcard:
    """A single study card with a prompt side and an answer side."""

    id: str
    user_id: str
    bank_entry_id: str
    source_lang: str
    target_lang: str
    prompt_text: str
    answer_text: str
    prompt_context_text: str | None
    answer_context_text: str | None
    # Which way the card is asked; FlashcardService uses
    # "target_to_en" / "en_to_target".
    card_direction: str
    prompt_modality: str
    created_at: datetime


@dataclass
class FlashcardEvent:
    """A recorded interaction with a flashcard (e.g. shown/answered/skipped)."""

    id: str
    flashcard_id: str
    user_id: str
    event_type: str
    # Free-text answer; per FlashcardService it is only meaningful for
    # "answered" events and is stored without grading.
    user_response: str | None
    created_at: datetime
|
||||
|
|
@ -1,25 +0,0 @@
|
|||
from dataclasses import dataclass
from datetime import datetime


@dataclass
class UserLanguagePair:
    """A (source, target) language combination a user is studying."""

    id: str
    user_id: str
    source_lang: str
    target_lang: str


@dataclass
class LearnableWordBankEntry:
    """A word or phrase a user has added to their vocab bank.

    ``sense_id`` and ``wordform_id`` stay ``None`` until the entry is linked to a
    dictionary sense / inflected form; ``disambiguation_status`` tracks whether
    that resolution is still pending.
    """

    id: str
    user_id: str
    language_pair_id: str
    sense_id: str | None
    wordform_id: str | None
    surface_text: str
    is_phrase: bool
    # How the word entered the bank (e.g. "highlight", "manual" — see VocabService usage).
    entry_pathway: str
    source_article_id: str | None
    disambiguation_status: str
    created_at: datetime
|
||||
|
|
@ -1,150 +0,0 @@
|
|||
import uuid

from sqlalchemy import select
from sqlalchemy.exc import IntegrityError
from sqlalchemy.ext.asyncio import AsyncSession

from ..models.account import Account
from ..models.learnable_language import LearnableLanguage
from ...auth import hash_password
from ...outbound.postgres.entities.user_entity import User as UserEntity
from ...outbound.postgres.repositories import learnable_language_repository, user_repository


class AccountService:
    """Handles account-level operations: registration, profile retrieval, and managing
    the set of languages a user is learning.

    All methods operate on behalf of a single authenticated user (or, for
    ``create_account``, the user being created).

    Usage::

        service = AccountService(db)

        # Registration
        account = await service.create_account("alice@example.com", "s3cr3t")

        # Profile retrieval
        account = await service.get_account(user_id)
        print(account.learnable_languages)  # [LearnableLanguage(...), ...]

        # Add French (B1) to the account
        lang = await service.add_learnable_language(
            user_id, source_language="en", target_language="fr", proficiencies=["B1"]
        )

        # Remove it again
        await service.remove_learnable_language(user_id, lang.id)
    """

    def __init__(self, db: AsyncSession) -> None:
        self.db = db

    async def create_account(self, email: str, password: str) -> Account:
        """Create a new user account, hashing the plain-text password before storage.

        Raises ``ValueError`` if the email address is already registered, so the
        caller does not need to catch SQLAlchemy exceptions directly.

        Usage::

            try:
                account = await service.create_account("alice@example.com", "s3cr3t")
            except ValueError:
                # email already taken
                ...
        """
        try:
            user = await user_repository.create(
                self.db,
                email=email,
                hashed_password=hash_password(password),
            )
        except IntegrityError:
            # Roll back so the session remains usable after the failed INSERT.
            await self.db.rollback()
            raise ValueError("Email already registered")

        # New accounts have no learnable languages yet, so the default empty
        # list on Account is correct here.
        return Account(
            id=str(user.id),
            email=user.email,
            is_active=user.is_active,
            is_email_verified=user.is_email_verified,
            created_at=user.created_at,
        )

    async def get_account(self, user_id: uuid.UUID) -> Account:
        """Retrieve a user's account profile including all their learnable languages.

        Raises ``ValueError`` if no user exists for the given ``user_id``.

        Usage::

            account = await service.get_account(user_id)
            for lang in account.learnable_languages:
                print(lang.target_language, lang.proficiencies)
        """
        # user_repository only exposes get_by_email; query by id directly
        result = await self.db.execute(
            select(UserEntity).where(UserEntity.id == user_id)
        )
        user = result.scalar_one_or_none()
        if user is None:
            raise ValueError(f"User {user_id} not found")

        languages = await learnable_language_repository.list_for_user(self.db, user_id)

        return Account(
            id=str(user.id),
            email=user.email,
            is_active=user.is_active,
            is_email_verified=user.is_email_verified,
            created_at=user.created_at,
            learnable_languages=languages,
        )

    async def add_learnable_language(
        self,
        user_id: uuid.UUID,
        source_language: str,
        target_language: str,
        proficiencies: list[str],
    ) -> LearnableLanguage:
        """Add a language pair to the user's account, or update proficiency levels if
        the pair already exists (upsert semantics).

        Usage::

            lang = await service.add_learnable_language(
                user_id,
                source_language="en",
                target_language="fr",
                proficiencies=["B1", "B2"],
            )
            print(lang.id)  # UUID of the learnable_language row
        """
        return await learnable_language_repository.upsert(
            self.db,
            user_id=user_id,
            source_language=source_language,
            target_language=target_language,
            proficiencies=proficiencies,
        )

    async def remove_learnable_language(
        self, user_id: uuid.UUID, language_id: uuid.UUID
    ) -> None:
        """Remove a learnable language from the user's account by its row ID.

        Raises ``ValueError`` if the language entry does not exist or does not belong
        to ``user_id``.

        Usage::

            await service.remove_learnable_language(user_id, lang.id)
        """
        # Scoping the delete by user_id means a wrong-owner id is
        # indistinguishable from a missing id — both raise below.
        deleted = await learnable_language_repository.delete(
            self.db, user_id=user_id, language_id=language_id
        )
        if not deleted:
            raise ValueError(f"Learnable language {language_id} not found for this user")
|
||||
|
|
@ -1,108 +0,0 @@
|
|||
import uuid
from dataclasses import dataclass, field

from ..models.dictionary import Sense, Wordform
from ...outbound.postgres.repositories.dictionary_repository import DictionaryRepository


@dataclass
class TokenLookupResult:
    """Outcome of resolving one spaCy token against the dictionary.

    ``senses`` holds the ranked candidate senses for disambiguation.
    ``wordform_id`` is populated only when the surface form matched a row in
    ``dictionary_wordform`` unambiguously, so the vocab bank entry can be
    pre-linked to the exact inflected form.
    ``matched_via`` records which lookup strategy produced the result.
    """
    senses: list[Sense]
    wordform_id: str | None
    matched_via: str  # "wordform" | "lemma_pos" | "lemma" | "none"
    matched_wordforms: list[Wordform] = field(default_factory=list)


class DictionaryLookupService:
    """Resolves a spaCy token (surface form + UD POS + lemma) to candidate dictionary
    senses via three progressively looser lookup stages.

    Stage 1 — wordform table (most precise): exact match on the inflected surface
    form within the target language, e.g. "allons" → wordform row → lemma "aller".
    ``wordform_id`` is pre-populated when exactly one lemma matches.

    Stage 2 — lemma + POS: the spaCy lemma string against
    ``dictionary_lemma.headword`` filtered by ``pos_normalised`` (UD tag), which
    cuts false matches between homographs of different parts of speech.

    Stage 3 — lemma only: the POS filter is dropped as a last resort, returning
    every sense for the headword.

    Usage::

        service = DictionaryLookupService(PostgresDictionaryRepository(db))

        result = await service.lookup_token(
            surface="allons",
            spacy_lemma="aller",
            pos_ud="VERB",
            language="fr",
        )
        # result.senses — candidate Sense rows for disambiguation
        # result.wordform_id — pre-resolved wordform UUID string, or None
        # result.matched_via — "wordform" | "lemma_pos" | "lemma" | "none"
    """

    def __init__(self, dict_repo: DictionaryRepository) -> None:
        self.dict_repo = dict_repo

    async def lookup_token(
        self,
        surface: str,
        spacy_lemma: str,
        pos_ud: str,
        language: str,
    ) -> TokenLookupResult:
        """Resolve one spaCy token to candidate senses using the three-stage fallback.

        ``surface`` is the raw token text (e.g. ``"allons"``), ``spacy_lemma`` is
        spaCy's lemmatisation (e.g. ``"aller"``), ``pos_ud`` the Universal
        Dependencies tag (e.g. ``"VERB"``), and ``language`` the target language
        code (e.g. ``"fr"``).

        Returns a :class:`TokenLookupResult`; when the surface form hit the
        wordform table unambiguously, ``wordform_id`` can be stored on the vocab
        bank entry for precise inflection tracking.
        """
        repo = self.dict_repo

        # Stage 1: exact inflected-surface-form match in the wordform table.
        forms = await repo.get_wordforms_by_form(surface, language)
        if forms:
            lemma_ids = list(dict.fromkeys(wf.lemma_id for wf in forms))
            candidate_senses: list[Sense] = [
                sense
                for lid in lemma_ids
                for sense in await repo.get_senses_for_lemma(uuid.UUID(lid))
            ]

            # A wordform id is only trustworthy when every match shares one
            # lemma; across multiple lemmas the ambiguity belongs to the user
            # and nothing is pre-assigned.
            resolved_form_id = forms[0].id if len(lemma_ids) == 1 else None
            return TokenLookupResult(
                senses=candidate_senses,
                wordform_id=resolved_form_id,
                matched_via="wordform",
                matched_wordforms=forms,
            )

        # Stage 2: headword lookup constrained by the UD POS tag.
        pos_filtered = await repo.get_senses_for_headword_and_pos(
            spacy_lemma, language, pos_ud
        )
        if pos_filtered:
            return TokenLookupResult(senses=pos_filtered, wordform_id=None, matched_via="lemma_pos")

        # Stage 3: headword lookup with no POS constraint.
        unfiltered = await repo.get_senses_for_headword(spacy_lemma, language)
        if unfiltered:
            return TokenLookupResult(senses=unfiltered, wordform_id=None, matched_via="lemma")

        return TokenLookupResult(senses=[], wordform_id=None, matched_via="none")
|
||||
|
|
@ -1,152 +0,0 @@
|
|||
import uuid

from ..models.flashcard import Flashcard, FlashcardEvent
from ...outbound.postgres.repositories.flashcard_repository import FlashcardRepository
from ...outbound.postgres.repositories.vocab_repository import VocabRepository
from ...outbound.postgres.repositories.dictionary_repository import DictionaryRepository

# Accepted values for card_direction / event_type arguments; validated below.
VALID_DIRECTIONS = {"target_to_en", "en_to_target"}
VALID_EVENT_TYPES = {"shown", "answered", "skipped"}


class FlashcardService:
    """Generates flashcards from resolved vocab bank entries and records study events.

    Flashcard text is derived directly from the dictionary: the lemma headword is the
    target-language side and the sense gloss is the English side. Both directions are
    created by default.

    Usage::

        service = FlashcardService(
            flashcard_repo=PostgresFlashcardRepository(db),
            vocab_repo=PostgresVocabRepository(db),
            dict_repo=PostgresDictionaryRepository(db),
        )

        # Generate both directions for a resolved bank entry
        cards = await service.generate_flashcard_from_entry(entry_id)

        # Record that the user answered correctly
        event = await service.record_flashcard_event(
            flashcard_id=cards[0].id,
            user_id=user_id,
            event_type="answered",
            response="banque",
        )
    """

    def __init__(
        self,
        flashcard_repo: FlashcardRepository,
        vocab_repo: VocabRepository,
        dict_repo: DictionaryRepository,
    ) -> None:
        self.flashcard_repo = flashcard_repo
        self.vocab_repo = vocab_repo
        self.dict_repo = dict_repo

    async def generate_flashcard_from_entry(
        self,
        entry_id: uuid.UUID,
        direction: str | None = None,
    ) -> list[Flashcard]:
        """Create flashcard(s) from a vocab bank entry that has a resolved sense.

        Looks up the sense gloss (English meaning) and lemma headword (target-language
        word) and creates one card per direction. Pass ``direction`` to generate only
        ``"target_to_en"`` or ``"en_to_target"``; omit it to create both.

        Raises ``ValueError`` if the entry does not exist, has no resolved sense, or
        if the underlying sense or lemma rows cannot be found in the dictionary.

        Usage::

            # Both directions — typical case
            cards = await service.generate_flashcard_from_entry(entry_id)
            assert len(cards) == 2

            # One direction only
            cards = await service.generate_flashcard_from_entry(
                entry_id, direction="target_to_en"
            )
        """
        if direction is not None and direction not in VALID_DIRECTIONS:
            raise ValueError(f"Invalid direction '{direction}'. Must be one of {VALID_DIRECTIONS}")

        entry = await self.vocab_repo.get_entry(entry_id)
        if entry is None:
            raise ValueError(f"Bank entry {entry_id} not found")
        if entry.sense_id is None:
            raise ValueError(
                "Entry has no resolved sense; disambiguate before generating flashcards"
            )

        # Resolve the dictionary rows the card text is derived from; each miss
        # is surfaced as a ValueError rather than a None propagating downstream.
        sense = await self.dict_repo.get_sense(uuid.UUID(entry.sense_id))
        if sense is None:
            raise ValueError(f"Sense {entry.sense_id} not found in dictionary")

        lemma = await self.dict_repo.get_lemma(uuid.UUID(sense.lemma_id))
        if lemma is None:
            raise ValueError(f"Lemma for sense {entry.sense_id} not found in dictionary")

        pair = await self.vocab_repo.get_language_pair(uuid.UUID(entry.language_pair_id))
        if pair is None:
            raise ValueError(f"Language pair {entry.language_pair_id} not found")

        user_id = uuid.UUID(entry.user_id)
        directions = [direction] if direction else ["target_to_en", "en_to_target"]

        flashcards = []
        for d in directions:
            # headword is the target-language side, gloss the English side;
            # swap them for the reverse direction.
            if d == "target_to_en":
                prompt, answer = lemma.headword, sense.gloss
            else:
                prompt, answer = sense.gloss, lemma.headword

            card = await self.flashcard_repo.create_flashcard(
                user_id=user_id,
                bank_entry_id=entry_id,
                source_lang=pair.source_lang,
                target_lang=pair.target_lang,
                prompt_text=prompt,
                answer_text=answer,
                card_direction=d,
            )
            flashcards.append(card)

        return flashcards

    async def record_flashcard_event(
        self,
        flashcard_id: uuid.UUID,
        user_id: uuid.UUID,
        event_type: str,
        response: str | None = None,
    ) -> FlashcardEvent:
        """Record a study event against a flashcard — shown, answered, or skipped.

        ``response`` is the user's free-text answer and is only meaningful for
        ``event_type="answered"``; it is stored as-is without grading.

        Raises ``ValueError`` for unrecognised event types.

        Usage::

            event = await service.record_flashcard_event(
                flashcard_id=card.id,
                user_id=user_id,
                event_type="answered",
                response="banque",
            )
        """
        if event_type not in VALID_EVENT_TYPES:
            raise ValueError(
                f"Invalid event_type '{event_type}'. Must be one of {VALID_EVENT_TYPES}"
            )
        return await self.flashcard_repo.record_event(
            flashcard_id=flashcard_id,
            user_id=user_id,
            event_type=event_type,
            user_response=response,
        )
|
||||
|
|
@ -1,190 +0,0 @@
|
|||
import uuid
|
||||
|
||||
from ..models.dictionary import Sense
|
||||
from ..models.vocab import LearnableWordBankEntry
|
||||
from ...outbound.postgres.repositories.vocab_repository import VocabRepository
|
||||
from ...outbound.postgres.repositories.dictionary_repository import DictionaryRepository
|
||||
|
||||
|
||||
class VocabService:
    """Manages a user's learnable word bank — adding words from various sources and
    resolving which dictionary sense a word belongs to.

    Usage:
        service = VocabService(
            vocab_repo=PostgresVocabRepository(db),
            dict_repo=PostgresDictionaryRepository(db),
        )
        entry = await service.add_word_to_bank(
            user_id=user.id,
            surface_text="banque",
            language_pair_id=pair.id,
            pathway="highlight",
        )
        # entry.disambiguation_status is "auto_resolved" if "banque" has exactly one
        # dictionary sense, or "pending" if the user needs to pick from multiple senses.
    """

    def __init__(self, vocab_repo: VocabRepository, dict_repo: DictionaryRepository) -> None:
        self.vocab_repo = vocab_repo
        self.dict_repo = dict_repo

    @staticmethod
    def _resolve_sense(senses: list[Sense]) -> tuple[uuid.UUID | None, str]:
        """Map a sense-lookup result to ``(sense_id, disambiguation_status)``.

        Exactly one sense auto-resolves to that sense; zero or multiple senses
        leave the entry pending user disambiguation. Shared by
        :meth:`add_word_to_bank` and :meth:`add_token_to_bank` so the
        auto-resolution rule lives in one place.
        """
        if len(senses) == 1:
            return uuid.UUID(senses[0].id), "auto_resolved"
        return None, "pending"

    async def add_word_to_bank(
        self,
        user_id: uuid.UUID,
        surface_text: str,
        language_pair_id: uuid.UUID,
        pathway: str,
        is_phrase: bool = False,
        wordform_id: uuid.UUID | None = None,
        source_article_id: uuid.UUID | None = None,
    ) -> LearnableWordBankEntry:
        """Add a word or phrase to the user's vocab bank, automatically linking it to a
        dictionary sense when exactly one match exists, or flagging it as pending
        disambiguation when zero or multiple senses are found.

        Phrases (``is_phrase=True``) bypass dictionary lookup entirely and are always
        created with ``disambiguation_status="pending"`` since they cannot be resolved
        to a single headword.

        Raises ``ValueError`` when ``language_pair_id`` does not exist.

        Usage::

            # Word with a single sense — auto-resolved immediately
            entry = await service.add_word_to_bank(
                user_id=user_id,
                surface_text="bisque",
                language_pair_id=fr_en_pair_id,
                pathway="highlight",
            )
            assert entry.disambiguation_status == "auto_resolved"

            # Common word with many senses — user must pick one
            entry = await service.add_word_to_bank(
                user_id=user_id,
                surface_text="avoir",
                language_pair_id=fr_en_pair_id,
                pathway="manual",
            )
            assert entry.disambiguation_status == "pending"

            # Multi-word expression — skips lookup, always pending
            entry = await service.add_word_to_bank(
                user_id=user_id,
                surface_text="avoir l'air",
                language_pair_id=fr_en_pair_id,
                pathway="manual",
                is_phrase=True,
            )
        """
        pair = await self.vocab_repo.get_language_pair(language_pair_id)
        if pair is None:
            raise ValueError(f"Language pair {language_pair_id} not found")

        if is_phrase:
            # Phrases have no single headword, so dictionary lookup is skipped.
            return await self.vocab_repo.add_entry(
                user_id=user_id,
                language_pair_id=language_pair_id,
                surface_text=surface_text,
                entry_pathway=pathway,
                is_phrase=True,
                source_article_id=source_article_id,
                disambiguation_status="pending",
            )

        senses = await self.dict_repo.get_senses_for_headword(surface_text, pair.target_lang)
        # Previously this was a three-way branch whose >1 and 0 arms were
        # identical; the shared helper folds them together.
        sense_id, status = self._resolve_sense(senses)

        return await self.vocab_repo.add_entry(
            user_id=user_id,
            language_pair_id=language_pair_id,
            surface_text=surface_text,
            entry_pathway=pathway,
            is_phrase=False,
            sense_id=sense_id,
            wordform_id=wordform_id,
            source_article_id=source_article_id,
            disambiguation_status=status,
        )

    async def add_token_to_bank(
        self,
        user_id: uuid.UUID,
        surface_text: str,
        language_pair_id: uuid.UUID,
        senses: list[Sense],
        wordform_id: uuid.UUID | None,
        source_article_id: uuid.UUID | None = None,
    ) -> LearnableWordBankEntry:
        """Add a token from the NLP pipeline to the vocab bank using pre-resolved lookup
        results, skipping the redundant dictionary query that ``add_word_to_bank`` would
        otherwise perform.

        ``senses`` and ``wordform_id`` come from :class:`DictionaryLookupService` and
        are stored directly on the bank entry. Auto-resolution still applies: exactly
        one sense means ``auto_resolved``; anything else means ``pending``.

        Raises ``ValueError`` when ``language_pair_id`` does not exist.

        Usage::

            result = await lookup_service.lookup_token("allons", "aller", "VERB", "fr")
            wf_id = uuid.UUID(result.wordform_id) if result.wordform_id else None
            entry = await vocab_service.add_token_to_bank(
                user_id=user_id,
                surface_text="allons",
                language_pair_id=pair_id,
                senses=result.senses,
                wordform_id=wf_id,
            )
            # entry.wordform_id == result.wordform_id (pre-linked to "allons" wordform)
        """
        pair = await self.vocab_repo.get_language_pair(language_pair_id)
        if pair is None:
            raise ValueError(f"Language pair {language_pair_id} not found")

        sense_id, status = self._resolve_sense(senses)

        return await self.vocab_repo.add_entry(
            user_id=user_id,
            language_pair_id=language_pair_id,
            surface_text=surface_text,
            entry_pathway="nlp_extraction",
            wordform_id=wordform_id,
            sense_id=sense_id,
            source_article_id=source_article_id,
            disambiguation_status=status,
        )

    async def resolve_disambiguation(
        self, entry_id: uuid.UUID, sense_id: uuid.UUID
    ) -> LearnableWordBankEntry:
        """Attach a specific dictionary sense to a pending vocab bank entry, marking it
        as ``resolved`` so it can be used for flashcard generation.

        This is called after the user selects the correct sense from the list presented
        during disambiguation — for example, choosing "bank (finance)" over
        "bank (river)" for the French word "banque".

        Usage::

            resolved_entry = await service.resolve_disambiguation(
                entry_id=pending_entry.id,
                sense_id=finance_sense_id,
            )
            assert resolved_entry.disambiguation_status == "resolved"
            assert resolved_entry.sense_id == str(finance_sense_id)
        """
        return await self.vocab_repo.set_sense(entry_id, sense_id)
|
||||
|
|
@ -1,63 +0,0 @@
|
|||
import uuid
|
||||
|
||||
from sqlalchemy import String, Text, ForeignKey, Integer
|
||||
from sqlalchemy.orm import Mapped, mapped_column
|
||||
from sqlalchemy.dialects.postgresql import UUID, ARRAY, JSONB
|
||||
|
||||
from ..database import Base
|
||||
|
||||
|
||||
class DictionaryLemmaEntity(Base):
    """ORM row for one dictionary headword (lemma) in a given language.

    Sibling tables (``dictionary_sense``, ``dictionary_wordform``,
    ``dictionary_lemma_raw``) reference this table via ``lemma_id`` with
    ``ON DELETE CASCADE``, so deleting a lemma removes its dependents.
    """

    __tablename__ = "dictionary_lemma"

    # Surrogate primary key, generated client-side via uuid4.
    id: Mapped[uuid.UUID] = mapped_column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)
    # Canonical dictionary form of the word.
    headword: Mapped[str] = mapped_column(Text, nullable=False)
    # Two-letter language code; indexed — repository lookups filter on it.
    language: Mapped[str] = mapped_column(String(2), nullable=False, index=True)
    # Part-of-speech string as imported; pos_normalised holds the cleaned-up
    # form and may be absent.
    pos_raw: Mapped[str] = mapped_column(Text, nullable=False)
    pos_normalised: Mapped[str | None] = mapped_column(Text, nullable=True)
    # Grammatical gender, where applicable; nullable.
    gender: Mapped[str | None] = mapped_column(Text, nullable=True)
    # Free-form tags; database default is an empty array.
    tags: Mapped[list[str]] = mapped_column(ARRAY(Text), nullable=False, server_default="{}")
|
||||
|
||||
|
||||
class DictionarySenseEntity(Base):
    """ORM row for one sense (meaning) of a lemma, ordered by ``sense_index``."""

    __tablename__ = "dictionary_sense"

    # Surrogate primary key, generated client-side via uuid4.
    id: Mapped[uuid.UUID] = mapped_column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)
    # Owning lemma; rows are deleted with the lemma (ON DELETE CASCADE).
    lemma_id: Mapped[uuid.UUID] = mapped_column(
        UUID(as_uuid=True),
        ForeignKey("dictionary_lemma.id", ondelete="CASCADE"),
        nullable=False,
        index=True,
    )
    # Position of this sense within its lemma's sense list; repository
    # queries order results by it.
    sense_index: Mapped[int] = mapped_column(Integer, nullable=False)
    # Definition text; database default is the empty string.
    gloss: Mapped[str] = mapped_column(Text, nullable=False, server_default="")
    # Topic labels and free-form tags; both default to empty arrays.
    topics: Mapped[list[str]] = mapped_column(ARRAY(Text), nullable=False, server_default="{}")
    tags: Mapped[list[str]] = mapped_column(ARRAY(Text), nullable=False, server_default="{}")
|
||||
|
||||
|
||||
class DictionaryWordformEntity(Base):
    """ORM row for one inflected surface form of a lemma (e.g. a conjugation)."""

    __tablename__ = "dictionary_wordform"

    # Surrogate primary key, generated client-side via uuid4.
    id: Mapped[uuid.UUID] = mapped_column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)
    # Owning lemma; rows are deleted with the lemma (ON DELETE CASCADE).
    lemma_id: Mapped[uuid.UUID] = mapped_column(
        UUID(as_uuid=True),
        ForeignKey("dictionary_lemma.id", ondelete="CASCADE"),
        nullable=False,
        index=True,
    )
    # The surface form itself; indexed — repository lookups search by form.
    form: Mapped[str] = mapped_column(Text, nullable=False, index=True)
    # Free-form tags; database default is an empty array.
    tags: Mapped[list[str]] = mapped_column(ARRAY(Text), nullable=False, server_default="{}")
|
||||
|
||||
|
||||
class DictionaryLemmaRawEntity(Base):
    """ORM row holding the original source record for a lemma as JSONB.

    ``lemma_id`` is unique, so each lemma has at most one raw record
    (a one-to-one relationship), removed with the lemma via CASCADE.
    """

    __tablename__ = "dictionary_lemma_raw"

    # Surrogate primary key, generated client-side via uuid4.
    id: Mapped[uuid.UUID] = mapped_column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)
    lemma_id: Mapped[uuid.UUID] = mapped_column(
        UUID(as_uuid=True),
        ForeignKey("dictionary_lemma.id", ondelete="CASCADE"),
        nullable=False,
        unique=True,
    )
    # Two-letter language code of the source record.
    language: Mapped[str] = mapped_column(String(2), nullable=False)
    # Unmodified source payload stored as JSONB.
    raw: Mapped[dict] = mapped_column(JSONB, nullable=False)
|
||||
|
|
@ -1,64 +0,0 @@
|
|||
import uuid
|
||||
from datetime import datetime, timezone
|
||||
|
||||
from sqlalchemy import DateTime, ForeignKey, Text
|
||||
from sqlalchemy.orm import Mapped, mapped_column
|
||||
from sqlalchemy.dialects.postgresql import UUID
|
||||
|
||||
from ..database import Base
|
||||
|
||||
|
||||
class FlashcardEntity(Base):
    """ORM row for a generated flashcard tied to a word-bank entry.

    Deleting the owning user or the bank entry cascades to the card.
    """

    __tablename__ = "flashcard"

    # Surrogate primary key, generated client-side via uuid4.
    id: Mapped[uuid.UUID] = mapped_column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)
    # Owning user; indexed — cards are listed per user.
    user_id: Mapped[uuid.UUID] = mapped_column(
        UUID(as_uuid=True),
        ForeignKey("users.id", ondelete="CASCADE"),
        nullable=False,
        index=True,
    )
    # Word-bank entry the card was generated from; indexed for per-entry lookups.
    bank_entry_id: Mapped[uuid.UUID] = mapped_column(
        UUID(as_uuid=True),
        ForeignKey("learnable_word_bank_entry.id", ondelete="CASCADE"),
        nullable=False,
        index=True,
    )
    # Language codes for the two sides of the card.
    source_lang: Mapped[str] = mapped_column(Text, nullable=False)
    target_lang: Mapped[str] = mapped_column(Text, nullable=False)
    # Front/back text, with optional surrounding context sentences.
    prompt_text: Mapped[str] = mapped_column(Text, nullable=False)
    answer_text: Mapped[str] = mapped_column(Text, nullable=False)
    prompt_context_text: Mapped[str | None] = mapped_column(Text, nullable=True)
    answer_context_text: Mapped[str | None] = mapped_column(Text, nullable=True)
    # Direction of the card (which language prompts which) — stored as text.
    card_direction: Mapped[str] = mapped_column(Text, nullable=False)
    # How the prompt is presented; defaults to "text".
    prompt_modality: Mapped[str] = mapped_column(Text, nullable=False, default="text")
    # Timezone-aware creation timestamp, set client-side in UTC.
    created_at: Mapped[datetime] = mapped_column(
        DateTime(timezone=True),
        nullable=False,
        default=lambda: datetime.now(timezone.utc),
    )
|
||||
|
||||
|
||||
class FlashcardEventEntity(Base):
    """ORM row for one user interaction with a flashcard (append-only log)."""

    __tablename__ = "flashcard_event"

    # Surrogate primary key, generated client-side via uuid4.
    id: Mapped[uuid.UUID] = mapped_column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)
    # The card the event belongs to; removed with the card (CASCADE).
    flashcard_id: Mapped[uuid.UUID] = mapped_column(
        UUID(as_uuid=True),
        ForeignKey("flashcard.id", ondelete="CASCADE"),
        nullable=False,
        index=True,
    )
    # The acting user; removed with the user (CASCADE).
    user_id: Mapped[uuid.UUID] = mapped_column(
        UUID(as_uuid=True),
        ForeignKey("users.id", ondelete="CASCADE"),
        nullable=False,
        index=True,
    )
    # Event discriminator stored as text (validated at the service layer).
    event_type: Mapped[str] = mapped_column(Text, nullable=False)
    # Free-text answer; nullable since not every event type carries one.
    user_response: Mapped[str | None] = mapped_column(Text, nullable=True)
    # Timezone-aware creation timestamp, set client-side in UTC.
    created_at: Mapped[datetime] = mapped_column(
        DateTime(timezone=True),
        nullable=False,
        default=lambda: datetime.now(timezone.utc),
    )
|
||||
|
|
@ -1,64 +0,0 @@
|
|||
import uuid
|
||||
from datetime import datetime, timezone
|
||||
|
||||
from sqlalchemy import Boolean, ForeignKey, String, Text, UniqueConstraint, DateTime
|
||||
from sqlalchemy.orm import Mapped, mapped_column
|
||||
from sqlalchemy.dialects.postgresql import UUID
|
||||
|
||||
from ..database import Base
|
||||
|
||||
|
||||
class UserLanguagePairEntity(Base):
    """ORM row for a (user, source language, target language) learning pair.

    The composite unique constraint guarantees at most one row per pair per
    user, which is what makes ``get_or_create_language_pair`` safe.
    """

    __tablename__ = "user_language_pair"
    __table_args__ = (UniqueConstraint("user_id", "source_lang", "target_lang"),)

    # Surrogate primary key, generated client-side via uuid4.
    id: Mapped[uuid.UUID] = mapped_column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)
    # Owning user; rows are removed with the user (CASCADE).
    user_id: Mapped[uuid.UUID] = mapped_column(
        UUID(as_uuid=True),
        ForeignKey("users.id", ondelete="CASCADE"),
        nullable=False,
        index=True,
    )
    # Two-letter language codes for the pair.
    source_lang: Mapped[str] = mapped_column(String(2), nullable=False)
    target_lang: Mapped[str] = mapped_column(String(2), nullable=False)
|
||||
|
||||
|
||||
class LearnableWordBankEntryEntity(Base):
    """ORM row for one word or phrase in a user's learnable word bank.

    Dictionary links (``sense_id``, ``wordform_id``) use ``ON DELETE SET NULL``
    so re-importing the dictionary does not destroy user entries, while user
    and language-pair deletions cascade.
    """

    __tablename__ = "learnable_word_bank_entry"

    # Surrogate primary key, generated client-side via uuid4.
    id: Mapped[uuid.UUID] = mapped_column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)
    # Owning user; removed with the user (CASCADE).
    user_id: Mapped[uuid.UUID] = mapped_column(
        UUID(as_uuid=True),
        ForeignKey("users.id", ondelete="CASCADE"),
        nullable=False,
        index=True,
    )
    # Language pair the entry was added under; removed with the pair (CASCADE).
    language_pair_id: Mapped[uuid.UUID] = mapped_column(
        UUID(as_uuid=True),
        ForeignKey("user_language_pair.id", ondelete="CASCADE"),
        nullable=False,
        index=True,
    )
    # Resolved dictionary sense, if any; survives dictionary deletes as NULL.
    sense_id: Mapped[uuid.UUID | None] = mapped_column(
        UUID(as_uuid=True),
        ForeignKey("dictionary_sense.id", ondelete="SET NULL"),
        nullable=True,
        index=True,
    )
    # Matching dictionary wordform, if any; survives dictionary deletes as NULL.
    wordform_id: Mapped[uuid.UUID | None] = mapped_column(
        UUID(as_uuid=True),
        ForeignKey("dictionary_wordform.id", ondelete="SET NULL"),
        nullable=True,
    )
    # The text exactly as the user encountered it.
    surface_text: Mapped[str] = mapped_column(Text, nullable=False)
    # Multi-word expressions skip dictionary lookup at the service layer.
    is_phrase: Mapped[bool] = mapped_column(Boolean, nullable=False, default=False)
    # How the entry was created (e.g. "highlight", "manual", "nlp_extraction").
    entry_pathway: Mapped[str] = mapped_column(Text, nullable=False)
    # Article the word was seen in, if any; plain UUID, no FK constraint here.
    source_article_id: Mapped[uuid.UUID | None] = mapped_column(
        UUID(as_uuid=True), nullable=True
    )
    # "pending" | "auto_resolved" | "resolved" — managed by VocabService.
    disambiguation_status: Mapped[str] = mapped_column(Text, nullable=False, default="pending")
    # Timezone-aware creation timestamp, set client-side in UTC.
    created_at: Mapped[datetime] = mapped_column(
        DateTime(timezone=True),
        nullable=False,
        default=lambda: datetime.now(timezone.utc),
    )
|
||||
|
|
@ -1,145 +0,0 @@
|
|||
import uuid
|
||||
from typing import Protocol
|
||||
|
||||
from sqlalchemy import select
|
||||
from sqlalchemy.ext.asyncio import AsyncSession
|
||||
|
||||
from ..entities.dictionary_entities import (
|
||||
DictionaryLemmaEntity,
|
||||
DictionarySenseEntity,
|
||||
DictionaryWordformEntity,
|
||||
)
|
||||
from ....domain.models.dictionary import Lemma, Sense, Wordform
|
||||
|
||||
|
||||
class DictionaryRepository(Protocol):
    """Structural interface for dictionary lookups (senses, lemmas, wordforms).

    Implementations return domain models (``Sense``, ``Lemma``, ``Wordform``),
    never ORM entities. ``PostgresDictionaryRepository`` below is the
    production implementation.
    """

    # Sense lookups — list-returning methods are ordered by sense_index.
    async def get_senses_for_headword(self, headword: str, language: str) -> list[Sense]: ...
    async def get_senses_for_headword_and_pos(self, headword: str, language: str, pos_normalised: str) -> list[Sense]: ...
    async def get_senses_for_lemma(self, lemma_id: uuid.UUID) -> list[Sense]: ...
    async def find_senses_by_english_gloss(self, text: str, target_lang: str) -> list[Sense]: ...
    # Single-row lookups — return None when the id is unknown.
    async def get_sense(self, sense_id: uuid.UUID) -> Sense | None: ...
    async def get_lemma(self, lemma_id: uuid.UUID) -> Lemma | None: ...
    # Wordform lookups.
    async def get_wordforms_by_form(self, form: str, language: str) -> list[Wordform]: ...
    async def get_wordforms_for_lemma(self, lemma_id: uuid.UUID) -> list[Wordform]: ...
|
||||
|
||||
|
||||
def _sense_to_model(entity: DictionarySenseEntity) -> Sense:
    """Translate a ``dictionary_sense`` ORM row into the domain ``Sense`` model."""
    fields = {
        "id": str(entity.id),
        "lemma_id": str(entity.lemma_id),
        "sense_index": entity.sense_index,
        "gloss": entity.gloss,
        "topics": entity.topics or [],
        "tags": entity.tags or [],
    }
    return Sense(**fields)
|
||||
|
||||
|
||||
def _lemma_to_model(entity: DictionaryLemmaEntity) -> Lemma:
    """Translate a ``dictionary_lemma`` ORM row into the domain ``Lemma`` model."""
    fields = {
        "id": str(entity.id),
        "headword": entity.headword,
        "language": entity.language,
        "pos_raw": entity.pos_raw,
        "pos_normalised": entity.pos_normalised,
        "gender": entity.gender,
        "tags": entity.tags or [],
    }
    return Lemma(**fields)
|
||||
|
||||
|
||||
def _wordform_to_model(entity: DictionaryWordformEntity) -> Wordform:
    """Translate a ``dictionary_wordform`` ORM row into the domain ``Wordform`` model."""
    fields = {
        "id": str(entity.id),
        "lemma_id": str(entity.lemma_id),
        "form": entity.form,
        "tags": entity.tags or [],
    }
    return Wordform(**fields)
|
||||
|
||||
|
||||
class PostgresDictionaryRepository:
    """SQLAlchemy-backed implementation of :class:`DictionaryRepository`.

    All methods return domain models, never ORM entities. The three
    headword/gloss sense lookups previously duplicated the same
    join + where + order_by skeleton verbatim; it now lives in
    :meth:`_sense_stmt` / :meth:`_senses`.
    """

    def __init__(self, db: AsyncSession) -> None:
        self.db = db

    def _sense_stmt(self, *conditions):
        """Build a sense query joined to its lemma, filtered by ``conditions``
        and ordered by ``sense_index``."""
        return (
            select(DictionarySenseEntity)
            .join(DictionaryLemmaEntity, DictionarySenseEntity.lemma_id == DictionaryLemmaEntity.id)
            .where(*conditions)
            .order_by(DictionarySenseEntity.sense_index)
        )

    async def _senses(self, stmt) -> list[Sense]:
        """Execute ``stmt`` and map every row to a domain ``Sense``."""
        result = await self.db.execute(stmt)
        return [_sense_to_model(e) for e in result.scalars().all()]

    async def get_senses_for_headword(self, headword: str, language: str) -> list[Sense]:
        """All senses whose lemma matches ``headword`` exactly in ``language``."""
        return await self._senses(
            self._sense_stmt(
                DictionaryLemmaEntity.headword == headword,
                DictionaryLemmaEntity.language == language,
            )
        )

    async def find_senses_by_english_gloss(self, text: str, target_lang: str) -> list[Sense]:
        """EN→target direction: find senses whose gloss matches the given English text.

        Uses a case-insensitive exact match on the gloss column, filtered to the
        target language via the joined lemma row.
        """
        return await self._senses(
            self._sense_stmt(
                DictionarySenseEntity.gloss.ilike(text),
                DictionaryLemmaEntity.language == target_lang,
            )
        )

    async def get_sense(self, sense_id: uuid.UUID) -> Sense | None:
        """Single sense by id, or ``None`` when it does not exist."""
        result = await self.db.execute(
            select(DictionarySenseEntity).where(DictionarySenseEntity.id == sense_id)
        )
        entity = result.scalar_one_or_none()
        return _sense_to_model(entity) if entity else None

    async def get_lemma(self, lemma_id: uuid.UUID) -> Lemma | None:
        """Single lemma by id, or ``None`` when it does not exist."""
        result = await self.db.execute(
            select(DictionaryLemmaEntity).where(DictionaryLemmaEntity.id == lemma_id)
        )
        entity = result.scalar_one_or_none()
        return _lemma_to_model(entity) if entity else None

    async def get_senses_for_headword_and_pos(
        self, headword: str, language: str, pos_normalised: str
    ) -> list[Sense]:
        """Like :meth:`get_senses_for_headword` but also constrained to a
        normalised part of speech."""
        return await self._senses(
            self._sense_stmt(
                DictionaryLemmaEntity.headword == headword,
                DictionaryLemmaEntity.language == language,
                DictionaryLemmaEntity.pos_normalised == pos_normalised,
            )
        )

    async def get_senses_for_lemma(self, lemma_id: uuid.UUID) -> list[Sense]:
        """All senses belonging to one lemma, ordered by ``sense_index``.

        No lemma join needed here — the filter is on the sense table itself.
        """
        return await self._senses(
            select(DictionarySenseEntity)
            .where(DictionarySenseEntity.lemma_id == lemma_id)
            .order_by(DictionarySenseEntity.sense_index)
        )

    async def get_wordforms_by_form(self, form: str, language: str) -> list[Wordform]:
        """All wordforms matching ``form`` exactly whose lemma is in ``language``."""
        result = await self.db.execute(
            select(DictionaryWordformEntity)
            .join(DictionaryLemmaEntity, DictionaryWordformEntity.lemma_id == DictionaryLemmaEntity.id)
            .where(
                DictionaryWordformEntity.form == form,
                DictionaryLemmaEntity.language == language,
            )
        )
        return [_wordform_to_model(e) for e in result.scalars().all()]

    async def get_wordforms_for_lemma(self, lemma_id: uuid.UUID) -> list[Wordform]:
        """All wordforms belonging to one lemma."""
        result = await self.db.execute(
            select(DictionaryWordformEntity).where(
                DictionaryWordformEntity.lemma_id == lemma_id
            )
        )
        return [_wordform_to_model(e) for e in result.scalars().all()]
|
||||
|
|
@ -1,136 +0,0 @@
|
|||
import uuid
|
||||
from datetime import datetime, timezone
|
||||
from typing import Protocol
|
||||
|
||||
from sqlalchemy import select
|
||||
from sqlalchemy.ext.asyncio import AsyncSession
|
||||
|
||||
from ..entities.flashcard_entities import FlashcardEntity, FlashcardEventEntity
|
||||
from ....domain.models.flashcard import Flashcard, FlashcardEvent
|
||||
|
||||
|
||||
class FlashcardRepository(Protocol):
    """Structural interface for flashcard persistence and event logging.

    Implementations return domain models (``Flashcard``, ``FlashcardEvent``),
    never ORM entities. ``PostgresFlashcardRepository`` below is the
    production implementation.
    """

    async def create_flashcard(
        self,
        user_id: uuid.UUID,
        bank_entry_id: uuid.UUID,
        source_lang: str,
        target_lang: str,
        prompt_text: str,
        answer_text: str,
        card_direction: str,
        prompt_modality: str = "text",
        prompt_context_text: str | None = None,
        answer_context_text: str | None = None,
    ) -> Flashcard: ...

    # List queries for existing cards.
    async def get_flashcards_for_user(self, user_id: uuid.UUID) -> list[Flashcard]: ...

    async def get_flashcards_for_entry(self, bank_entry_id: uuid.UUID) -> list[Flashcard]: ...

    # Append one interaction (e.g. an answer) to a card's event log.
    async def record_event(
        self,
        flashcard_id: uuid.UUID,
        user_id: uuid.UUID,
        event_type: str,
        user_response: str | None = None,
    ) -> FlashcardEvent: ...
|
||||
|
||||
|
||||
def _flashcard_to_model(entity: FlashcardEntity) -> Flashcard:
    """Translate a ``flashcard`` ORM row into the domain ``Flashcard`` model."""
    fields = {
        "id": str(entity.id),
        "user_id": str(entity.user_id),
        "bank_entry_id": str(entity.bank_entry_id),
        "source_lang": entity.source_lang,
        "target_lang": entity.target_lang,
        "prompt_text": entity.prompt_text,
        "answer_text": entity.answer_text,
        "prompt_context_text": entity.prompt_context_text,
        "answer_context_text": entity.answer_context_text,
        "card_direction": entity.card_direction,
        "prompt_modality": entity.prompt_modality,
        "created_at": entity.created_at,
    }
    return Flashcard(**fields)
|
||||
|
||||
|
||||
def _event_to_model(entity: FlashcardEventEntity) -> FlashcardEvent:
    """Translate a ``flashcard_event`` ORM row into the domain ``FlashcardEvent`` model."""
    fields = {
        "id": str(entity.id),
        "flashcard_id": str(entity.flashcard_id),
        "user_id": str(entity.user_id),
        "event_type": entity.event_type,
        "user_response": entity.user_response,
        "created_at": entity.created_at,
    }
    return FlashcardEvent(**fields)
|
||||
|
||||
|
||||
class PostgresFlashcardRepository:
    """SQLAlchemy-backed implementation of :class:`FlashcardRepository`.

    The add/commit/refresh persist sequence and the where + order_by list
    query previously appeared twice each; they now live in :meth:`_persist`
    and :meth:`_cards_where`.
    """

    def __init__(self, db: AsyncSession) -> None:
        self.db = db

    async def _persist(self, entity):
        """Insert ``entity``, commit, and refresh so DB-populated fields load."""
        self.db.add(entity)
        await self.db.commit()
        await self.db.refresh(entity)
        return entity

    async def _cards_where(self, *conditions) -> list[Flashcard]:
        """Fetch flashcards matching ``conditions``, newest first."""
        result = await self.db.execute(
            select(FlashcardEntity)
            .where(*conditions)
            .order_by(FlashcardEntity.created_at.desc())
        )
        return [_flashcard_to_model(e) for e in result.scalars().all()]

    async def create_flashcard(
        self,
        user_id: uuid.UUID,
        bank_entry_id: uuid.UUID,
        source_lang: str,
        target_lang: str,
        prompt_text: str,
        answer_text: str,
        card_direction: str,
        prompt_modality: str = "text",
        prompt_context_text: str | None = None,
        answer_context_text: str | None = None,
    ) -> Flashcard:
        """Create and persist one flashcard, returning its domain model."""
        entity = FlashcardEntity(
            user_id=user_id,
            bank_entry_id=bank_entry_id,
            source_lang=source_lang,
            target_lang=target_lang,
            prompt_text=prompt_text,
            answer_text=answer_text,
            prompt_context_text=prompt_context_text,
            answer_context_text=answer_context_text,
            card_direction=card_direction,
            prompt_modality=prompt_modality,
            created_at=datetime.now(timezone.utc),
        )
        return _flashcard_to_model(await self._persist(entity))

    async def get_flashcards_for_user(self, user_id: uuid.UUID) -> list[Flashcard]:
        """All of one user's flashcards, newest first."""
        return await self._cards_where(FlashcardEntity.user_id == user_id)

    async def get_flashcards_for_entry(self, bank_entry_id: uuid.UUID) -> list[Flashcard]:
        """All flashcards generated from one word-bank entry, newest first."""
        return await self._cards_where(FlashcardEntity.bank_entry_id == bank_entry_id)

    async def record_event(
        self,
        flashcard_id: uuid.UUID,
        user_id: uuid.UUID,
        event_type: str,
        user_response: str | None = None,
    ) -> FlashcardEvent:
        """Append one interaction event to a flashcard's log and return it."""
        entity = FlashcardEventEntity(
            flashcard_id=flashcard_id,
            user_id=user_id,
            event_type=event_type,
            user_response=user_response,
            created_at=datetime.now(timezone.utc),
        )
        return _event_to_model(await self._persist(entity))
|
||||
|
|
@ -7,26 +7,6 @@ from ..entities.learnable_language_entity import LearnableLanguageEntity
|
|||
from ....domain.models.learnable_language import LearnableLanguage
|
||||
|
||||
|
||||
async def delete(db: AsyncSession, user_id: uuid.UUID, language_id: uuid.UUID) -> bool:
    """Delete a learnable language row owned by ``user_id``.

    Returns ``True`` if a row was deleted, ``False`` if no matching row was found.
    The ``user_id`` check prevents one user from deleting another's data.
    """
    stmt = select(LearnableLanguageEntity).where(
        LearnableLanguageEntity.id == language_id,
        LearnableLanguageEntity.user_id == user_id,
    )
    row = (await db.execute(stmt)).scalar_one_or_none()
    if row is None:
        return False
    await db.delete(row)
    await db.commit()
    return True
|
||||
|
||||
|
||||
def _to_model(entity: LearnableLanguageEntity) -> LearnableLanguage:
|
||||
return LearnableLanguage(
|
||||
id=str(entity.id),
|
||||
|
|
|
|||
|
|
@ -1,177 +0,0 @@
|
|||
import uuid
|
||||
from datetime import datetime, timezone
|
||||
from typing import Protocol
|
||||
|
||||
from sqlalchemy import select
|
||||
from sqlalchemy.ext.asyncio import AsyncSession
|
||||
|
||||
from ..entities.vocab_entities import LearnableWordBankEntryEntity, UserLanguagePairEntity
|
||||
from ....domain.models.vocab import LearnableWordBankEntry, UserLanguagePair
|
||||
|
||||
|
||||
class VocabRepository(Protocol):
    """Structural interface for word-bank and language-pair persistence.

    Implementations return domain models (``LearnableWordBankEntry``,
    ``UserLanguagePair``), never ORM entities. ``PostgresVocabRepository``
    below is the production implementation.
    """

    # Language-pair management.
    async def get_or_create_language_pair(
        self, user_id: uuid.UUID, source_lang: str, target_lang: str
    ) -> UserLanguagePair: ...

    async def get_language_pair(self, language_pair_id: uuid.UUID) -> UserLanguagePair | None: ...

    # Create one word-bank entry; defaults mirror the Postgres implementation.
    async def add_entry(
        self,
        user_id: uuid.UUID,
        language_pair_id: uuid.UUID,
        surface_text: str,
        entry_pathway: str,
        is_phrase: bool = False,
        sense_id: uuid.UUID | None = None,
        wordform_id: uuid.UUID | None = None,
        source_article_id: uuid.UUID | None = None,
        disambiguation_status: str = "pending",
    ) -> LearnableWordBankEntry: ...

    async def get_entries_for_user(
        self, user_id: uuid.UUID, language_pair_id: uuid.UUID
    ) -> list[LearnableWordBankEntry]: ...

    # Attach a chosen sense to an entry (used by disambiguation resolution).
    async def set_sense(
        self, entry_id: uuid.UUID, sense_id: uuid.UUID
    ) -> LearnableWordBankEntry: ...

    async def get_entry(self, entry_id: uuid.UUID) -> LearnableWordBankEntry | None: ...

    async def get_pending_disambiguation(self, user_id: uuid.UUID) -> list[LearnableWordBankEntry]: ...
|
||||
|
||||
|
||||
def _pair_to_model(entity: UserLanguagePairEntity) -> UserLanguagePair:
    """Translate a ``user_language_pair`` ORM row into the domain ``UserLanguagePair``."""
    fields = {
        "id": str(entity.id),
        "user_id": str(entity.user_id),
        "source_lang": entity.source_lang,
        "target_lang": entity.target_lang,
    }
    return UserLanguagePair(**fields)
|
||||
|
||||
|
||||
def _entry_to_model(entity: LearnableWordBankEntryEntity) -> LearnableWordBankEntry:
    """Translate a ``learnable_word_bank_entry`` ORM row into its domain model.

    Optional UUID links are stringified when present, otherwise kept as ``None``.
    """
    sense = entity.sense_id
    wordform = entity.wordform_id
    article = entity.source_article_id
    fields = {
        "id": str(entity.id),
        "user_id": str(entity.user_id),
        "language_pair_id": str(entity.language_pair_id),
        "sense_id": str(sense) if sense else None,
        "wordform_id": str(wordform) if wordform else None,
        "surface_text": entity.surface_text,
        "is_phrase": entity.is_phrase,
        "entry_pathway": entity.entry_pathway,
        "source_article_id": str(article) if article else None,
        "disambiguation_status": entity.disambiguation_status,
        "created_at": entity.created_at,
    }
    return LearnableWordBankEntry(**fields)
|
||||
|
||||
|
||||
class PostgresVocabRepository:
    """SQLAlchemy-backed implementation of :class:`VocabRepository`.

    The add/commit/refresh persist sequence and the entry-by-id select were
    each duplicated; they now live in :meth:`_commit_refresh` and
    :meth:`_entry_stmt`. Note ``get_or_create_language_pair`` deliberately
    uses ``flush`` (not commit) so pair creation joins the caller's
    transaction.
    """

    def __init__(self, db: AsyncSession) -> None:
        self.db = db

    async def _commit_refresh(self, entity):
        """Commit the session and refresh ``entity`` so DB state is loaded."""
        await self.db.commit()
        await self.db.refresh(entity)
        return entity

    @staticmethod
    def _entry_stmt(entry_id: uuid.UUID):
        """Select statement for one word-bank entry by primary key."""
        return select(LearnableWordBankEntryEntity).where(
            LearnableWordBankEntryEntity.id == entry_id
        )

    async def get_or_create_language_pair(
        self, user_id: uuid.UUID, source_lang: str, target_lang: str
    ) -> UserLanguagePair:
        """Return the user's pair for (source, target), creating it if absent.

        New rows are flushed (to obtain the id) but not committed here.
        """
        result = await self.db.execute(
            select(UserLanguagePairEntity).where(
                UserLanguagePairEntity.user_id == user_id,
                UserLanguagePairEntity.source_lang == source_lang,
                UserLanguagePairEntity.target_lang == target_lang,
            )
        )
        entity = result.scalar_one_or_none()
        if entity is None:
            entity = UserLanguagePairEntity(
                user_id=user_id,
                source_lang=source_lang,
                target_lang=target_lang,
            )
            self.db.add(entity)
            await self.db.flush()
        return _pair_to_model(entity)

    async def get_language_pair(self, language_pair_id: uuid.UUID) -> UserLanguagePair | None:
        """Single language pair by id, or ``None`` when it does not exist."""
        result = await self.db.execute(
            select(UserLanguagePairEntity).where(UserLanguagePairEntity.id == language_pair_id)
        )
        entity = result.scalar_one_or_none()
        return _pair_to_model(entity) if entity else None

    async def add_entry(
        self,
        user_id: uuid.UUID,
        language_pair_id: uuid.UUID,
        surface_text: str,
        entry_pathway: str,
        is_phrase: bool = False,
        sense_id: uuid.UUID | None = None,
        wordform_id: uuid.UUID | None = None,
        source_article_id: uuid.UUID | None = None,
        disambiguation_status: str = "pending",
    ) -> LearnableWordBankEntry:
        """Insert one word-bank entry and return its domain model."""
        entity = LearnableWordBankEntryEntity(
            user_id=user_id,
            language_pair_id=language_pair_id,
            surface_text=surface_text,
            entry_pathway=entry_pathway,
            is_phrase=is_phrase,
            sense_id=sense_id,
            wordform_id=wordform_id,
            source_article_id=source_article_id,
            disambiguation_status=disambiguation_status,
            created_at=datetime.now(timezone.utc),
        )
        self.db.add(entity)
        return _entry_to_model(await self._commit_refresh(entity))

    async def get_entries_for_user(
        self, user_id: uuid.UUID, language_pair_id: uuid.UUID
    ) -> list[LearnableWordBankEntry]:
        """All of one user's entries for a language pair, newest first."""
        result = await self.db.execute(
            select(LearnableWordBankEntryEntity)
            .where(
                LearnableWordBankEntryEntity.user_id == user_id,
                LearnableWordBankEntryEntity.language_pair_id == language_pair_id,
            )
            .order_by(LearnableWordBankEntryEntity.created_at.desc())
        )
        return [_entry_to_model(e) for e in result.scalars().all()]

    async def set_sense(
        self, entry_id: uuid.UUID, sense_id: uuid.UUID
    ) -> LearnableWordBankEntry:
        """Attach ``sense_id`` to an entry and mark it resolved.

        Uses ``scalar_one`` — raises if the entry does not exist, matching
        the original behavior.
        """
        result = await self.db.execute(self._entry_stmt(entry_id))
        entity = result.scalar_one()
        entity.sense_id = sense_id
        entity.disambiguation_status = "resolved"
        return _entry_to_model(await self._commit_refresh(entity))

    async def get_entry(self, entry_id: uuid.UUID) -> LearnableWordBankEntry | None:
        """Single entry by id, or ``None`` when it does not exist."""
        result = await self.db.execute(self._entry_stmt(entry_id))
        entity = result.scalar_one_or_none()
        return _entry_to_model(entity) if entity else None

    async def get_pending_disambiguation(self, user_id: uuid.UUID) -> list[LearnableWordBankEntry]:
        """All of one user's entries still awaiting sense selection, newest first."""
        result = await self.db.execute(
            select(LearnableWordBankEntryEntity)
            .where(
                LearnableWordBankEntryEntity.user_id == user_id,
                LearnableWordBankEntryEntity.disambiguation_status == "pending",
            )
            .order_by(LearnableWordBankEntryEntity.created_at.desc())
        )
        return [_entry_to_model(e) for e in result.scalars().all()]
|
||||
|
|
@ -1,97 +0,0 @@
|
|||
import uuid
|
||||
|
||||
from fastapi import APIRouter, Depends, HTTPException, status
|
||||
from pydantic import BaseModel, field_validator
|
||||
from sqlalchemy.ext.asyncio import AsyncSession
|
||||
|
||||
from ...auth import verify_token
|
||||
from ...domain.services.account_service import AccountService
|
||||
from ...languages import SUPPORTED_LANGUAGES, SUPPORTED_LEVELS
|
||||
from ...outbound.postgres.database import get_db
|
||||
|
||||
router = APIRouter(prefix="/account", tags=["account"])
|
||||
|
||||
|
||||
class AddLearnableLanguageRequest(BaseModel):
    """Payload for registering a source→target language pair for a user."""

    source_language: str
    target_language: str
    proficiencies: list[str]

    @field_validator("proficiencies")
    @classmethod
    def validate_proficiencies(cls, v: list[str]) -> list[str]:
        """Require 1–2 levels, each drawn from SUPPORTED_LEVELS."""
        if not (1 <= len(v) <= 2):
            raise ValueError("proficiencies must contain 1 or 2 levels")
        invalid = [p for p in v if p not in SUPPORTED_LEVELS]
        if invalid:
            raise ValueError(f"Invalid proficiency levels: {invalid}. Supported: {sorted(SUPPORTED_LEVELS)}")
        return v


class LearnableLanguageResponse(BaseModel):
    """API representation of a stored learnable language pair."""

    id: str
    source_language: str
    target_language: str
    proficiencies: list[str]
|
||||
|
||||
|
||||
@router.post(
    "/learnable-languages",
    response_model=LearnableLanguageResponse,
    status_code=status.HTTP_201_CREATED,
)
async def add_learnable_language(
    body: AddLearnableLanguageRequest,
    db: AsyncSession = Depends(get_db),
    token_data: dict = Depends(verify_token),
) -> LearnableLanguageResponse:
    """Register a new learnable language pair for the authenticated user."""
    # Both ends of the pair must be in the supported set; reject with the
    # field-specific message the client expects.
    for label, code in (
        ("source language", body.source_language),
        ("target language", body.target_language),
    ):
        if code not in SUPPORTED_LANGUAGES:
            raise HTTPException(
                status_code=status.HTTP_400_BAD_REQUEST,
                detail=f"Unsupported {label} '{code}'. Supported: {list(SUPPORTED_LANGUAGES)}",
            )
    if body.source_language == body.target_language:
        raise HTTPException(
            status_code=status.HTTP_400_BAD_REQUEST,
            detail="source_language and target_language must differ",
        )

    user_id = uuid.UUID(token_data["sub"])
    created = await AccountService(db).add_learnable_language(
        user_id=user_id,
        source_language=body.source_language,
        target_language=body.target_language,
        proficiencies=body.proficiencies,
    )
    return LearnableLanguageResponse(
        id=created.id,
        source_language=created.source_language,
        target_language=created.target_language,
        proficiencies=created.proficiencies,
    )
|
||||
|
||||
|
||||
@router.delete(
    "/learnable-languages/{language_id}",
    status_code=status.HTTP_204_NO_CONTENT,
)
async def remove_learnable_language(
    language_id: str,
    db: AsyncSession = Depends(get_db),
    token_data: dict = Depends(verify_token),
) -> None:
    """Delete one of the authenticated user's learnable language pairs."""
    try:
        parsed_id = uuid.UUID(language_id)
    except ValueError:
        raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail="Invalid language_id")

    owner_id = uuid.UUID(token_data["sub"])
    try:
        await AccountService(db).remove_learnable_language(user_id=owner_id, language_id=parsed_id)
    except ValueError as exc:
        # The service signals "not found / not yours" with ValueError.
        raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail=str(exc))
|
||||
|
|
@ -1,143 +0,0 @@
|
|||
import uuid
|
||||
|
||||
from fastapi import APIRouter, Depends, HTTPException, status
|
||||
from pydantic import BaseModel
|
||||
from sqlalchemy.ext.asyncio import AsyncSession
|
||||
|
||||
from ...auth import verify_token
|
||||
from ...domain.services.flashcard_service import FlashcardService
|
||||
from ...outbound.postgres.database import get_db
|
||||
from ...outbound.postgres.repositories.dictionary_repository import PostgresDictionaryRepository
|
||||
from ...outbound.postgres.repositories.flashcard_repository import PostgresFlashcardRepository
|
||||
from ...outbound.postgres.repositories.vocab_repository import PostgresVocabRepository
|
||||
|
||||
router = APIRouter(tags=["flashcards"])
|
||||
|
||||
|
||||
class FlashcardResponse(BaseModel):
    """API representation of a generated flashcard."""

    id: str
    user_id: str
    bank_entry_id: str
    source_lang: str
    target_lang: str
    prompt_text: str
    answer_text: str
    prompt_context_text: str | None
    answer_context_text: str | None
    card_direction: str
    prompt_modality: str
    created_at: str


class FlashcardEventResponse(BaseModel):
    """API representation of a recorded flashcard review event."""

    id: str
    flashcard_id: str
    user_id: str
    event_type: str
    user_response: str | None
    created_at: str


class GenerateFlashcardsRequest(BaseModel):
    """Options for flashcard generation; direction is optional."""

    direction: str | None = None


class RecordEventRequest(BaseModel):
    """Payload for recording a review event against a flashcard."""

    event_type: str
    user_response: str | None = None
|
||||
|
||||
|
||||
def _service(db: AsyncSession) -> FlashcardService:
    """Build a FlashcardService wired to the Postgres-backed repositories."""
    flashcards = PostgresFlashcardRepository(db)
    vocab = PostgresVocabRepository(db)
    dictionary = PostgresDictionaryRepository(db)
    return FlashcardService(
        flashcard_repo=flashcards,
        vocab_repo=vocab,
        dict_repo=dictionary,
    )
|
||||
|
||||
|
||||
@router.post(
    "/vocab/{entry_id}/flashcards",
    response_model=list[FlashcardResponse],
    status_code=status.HTTP_201_CREATED,
)
async def generate_flashcards(
    entry_id: str,
    body: GenerateFlashcardsRequest,
    db: AsyncSession = Depends(get_db),
    token_data: dict = Depends(verify_token),
) -> list[FlashcardResponse]:
    """Generate flashcards from a word-bank entry, optionally for one direction."""
    try:
        entry_uuid = uuid.UUID(entry_id)
    except ValueError:
        raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail="Invalid entry_id")

    try:
        generated = await _service(db).generate_flashcard_from_entry(entry_uuid, direction=body.direction)
    except ValueError as exc:
        # Domain validation failures (unknown entry, bad direction) → 422.
        raise HTTPException(status_code=status.HTTP_422_UNPROCESSABLE_ENTITY, detail=str(exc))

    return [_flashcard_response(card) for card in generated]
|
||||
|
||||
|
||||
@router.get("/flashcards", response_model=list[FlashcardResponse])
async def list_flashcards(
    db: AsyncSession = Depends(get_db),
    token_data: dict = Depends(verify_token),
) -> list[FlashcardResponse]:
    """Return every flashcard belonging to the authenticated user."""
    owner = uuid.UUID(token_data["sub"])
    repo = PostgresFlashcardRepository(db)
    owned_cards = await repo.get_flashcards_for_user(owner)
    return [_flashcard_response(card) for card in owned_cards]
|
||||
|
||||
|
||||
@router.post(
    "/flashcards/{flashcard_id}/events",
    response_model=FlashcardEventResponse,
    status_code=status.HTTP_201_CREATED,
)
async def record_event(
    flashcard_id: str,
    body: RecordEventRequest,
    db: AsyncSession = Depends(get_db),
    token_data: dict = Depends(verify_token),
) -> FlashcardEventResponse:
    """Record a review event (e.g. answered/skipped) against a flashcard."""
    try:
        card_id = uuid.UUID(flashcard_id)
    except ValueError:
        raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail="Invalid flashcard_id")

    owner = uuid.UUID(token_data["sub"])
    try:
        event = await _service(db).record_flashcard_event(
            flashcard_id=card_id,
            user_id=owner,
            event_type=body.event_type,
            response=body.user_response,
        )
    except ValueError as exc:
        # The service rejects unknown cards / invalid event types.
        raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail=str(exc))

    return FlashcardEventResponse(
        id=event.id,
        flashcard_id=event.flashcard_id,
        user_id=event.user_id,
        event_type=event.event_type,
        user_response=event.user_response,
        created_at=event.created_at.isoformat(),
    )
|
||||
|
||||
|
||||
def _flashcard_response(card) -> FlashcardResponse:
    """Map a domain flashcard onto the API response model."""
    payload = {
        "id": card.id,
        "user_id": card.user_id,
        "bank_entry_id": card.bank_entry_id,
        "source_lang": card.source_lang,
        "target_lang": card.target_lang,
        "prompt_text": card.prompt_text,
        "answer_text": card.answer_text,
        "prompt_context_text": card.prompt_context_text,
        "answer_context_text": card.answer_context_text,
        "card_direction": card.card_direction,
        "prompt_modality": card.prompt_modality,
        # Timestamps are serialised as ISO-8601 strings at the API boundary.
        "created_at": card.created_at.isoformat(),
    }
    return FlashcardResponse(**payload)
|
||||
|
|
@ -1,21 +1,15 @@
|
|||
from .account import router as account_router
from .flashcards import router as flashcards_router
from .pos import router as pos_router
from .translate import router as translate_router
from .generation import router as generation_router
from .jobs import router as jobs_router
from .learnable_languages import router as learnable_languages_router
from .vocab import router as vocab_router

from fastapi import APIRouter

api_router = APIRouter(prefix="/api", tags=["api"])

# Register every feature router under /api; order is preserved.
for _feature_router in (
    account_router,
    flashcards_router,
    pos_router,
    translate_router,
    generation_router,
    jobs_router,
    learnable_languages_router,
    vocab_router,
):
    api_router.include_router(_feature_router)
|
||||
|
|
|
|||
|
|
@ -1,218 +0,0 @@
|
|||
import uuid
|
||||
|
||||
from fastapi import APIRouter, Depends, HTTPException
|
||||
from pydantic import BaseModel
|
||||
from sqlalchemy.ext.asyncio import AsyncSession
|
||||
|
||||
from ...auth import verify_token
|
||||
from ...domain.services.dictionary_lookup_service import DictionaryLookupService, TokenLookupResult
|
||||
from ...domain.services.vocab_service import VocabService
|
||||
from ...outbound.postgres.database import get_db
|
||||
from ...outbound.postgres.repositories.dictionary_repository import PostgresDictionaryRepository
|
||||
from ...outbound.postgres.repositories.vocab_repository import PostgresVocabRepository
|
||||
|
||||
router = APIRouter(prefix="/vocab", tags=["vocab"])
|
||||
|
||||
|
||||
class AddWordRequest(BaseModel):
    """Payload for manually adding a word or phrase to the word bank."""

    language_pair_id: str
    surface_text: str
    entry_pathway: str = "manual"
    is_phrase: bool = False
    source_article_id: str | None = None


class AddFromTokenRequest(BaseModel):
    """Payload for adding a tokenised word (with spaCy analysis) to the bank."""

    language_pair_id: str
    surface: str
    spacy_lemma: str
    pos_ud: str
    language: str
    source_article_id: str | None = None


class SenseCandidateResponse(BaseModel):
    """One candidate dictionary sense offered for disambiguation."""

    id: str
    gloss: str
    topics: list[str]
    tags: list[str]


class FromTokenResponse(BaseModel):
    """Result of a token add: the created entry plus its sense candidates."""

    entry: "WordBankEntryResponse"
    sense_candidates: list[SenseCandidateResponse]
    matched_via: str


class SetSenseRequest(BaseModel):
    """Payload for resolving an entry to a chosen dictionary sense."""

    sense_id: str


class WordBankEntryResponse(BaseModel):
    """API representation of a word-bank entry."""

    id: str
    user_id: str
    language_pair_id: str
    sense_id: str | None
    wordform_id: str | None
    surface_text: str
    is_phrase: bool
    entry_pathway: str
    source_article_id: str | None
    disambiguation_status: str
    created_at: str
|
||||
|
||||
|
||||
def _service(db: AsyncSession) -> VocabService:
    """Build a VocabService wired to the Postgres-backed repositories."""
    vocab = PostgresVocabRepository(db)
    dictionary = PostgresDictionaryRepository(db)
    return VocabService(vocab_repo=vocab, dict_repo=dictionary)
|
||||
|
||||
|
||||
@router.post("", response_model=WordBankEntryResponse, status_code=201)
async def add_word(
    request: AddWordRequest,
    db: AsyncSession = Depends(get_db),
    token_data: dict = Depends(verify_token),
) -> WordBankEntryResponse:
    """Manually add a word or phrase to the authenticated user's word bank."""

    def _uuid_or_400(value: str, field: str) -> uuid.UUID:
        # Translate a malformed UUID into a client error naming the field.
        try:
            return uuid.UUID(value)
        except ValueError:
            raise HTTPException(status_code=400, detail=f"Invalid {field}")

    user_id = uuid.UUID(token_data["sub"])
    language_pair_id = _uuid_or_400(request.language_pair_id, "language_pair_id")
    source_article_id = (
        _uuid_or_400(request.source_article_id, "source_article_id")
        if request.source_article_id
        else None
    )

    try:
        entry = await _service(db).add_word_to_bank(
            user_id=user_id,
            surface_text=request.surface_text.strip(),
            language_pair_id=language_pair_id,
            pathway=request.entry_pathway,
            is_phrase=request.is_phrase,
            source_article_id=source_article_id,
        )
    except ValueError as exc:
        # The service signals unknown references with ValueError.
        raise HTTPException(status_code=404, detail=str(exc))

    return _to_response(entry)
|
||||
|
||||
|
||||
@router.post("/from-token", response_model=FromTokenResponse, status_code=201)
async def add_from_token(
    request: AddFromTokenRequest,
    db: AsyncSession = Depends(get_db),
    token_data: dict = Depends(verify_token),
) -> FromTokenResponse:
    """Add a tokenised word to the bank, returning candidate dictionary senses."""

    def _uuid_or_400(value: str, field: str) -> uuid.UUID:
        try:
            return uuid.UUID(value)
        except ValueError:
            raise HTTPException(status_code=400, detail=f"Invalid {field}")

    user_id = uuid.UUID(token_data["sub"])
    language_pair_id = _uuid_or_400(request.language_pair_id, "language_pair_id")
    source_article_id = (
        _uuid_or_400(request.source_article_id, "source_article_id")
        if request.source_article_id
        else None
    )

    # Resolve the token against the dictionary before creating the entry.
    lookup = DictionaryLookupService(PostgresDictionaryRepository(db))
    result: TokenLookupResult = await lookup.lookup_token(
        surface=request.surface,
        spacy_lemma=request.spacy_lemma,
        pos_ud=request.pos_ud,
        language=request.language,
    )

    wordform_id = uuid.UUID(result.wordform_id) if result.wordform_id else None

    try:
        entry = await _service(db).add_token_to_bank(
            user_id=user_id,
            surface_text=request.surface,
            language_pair_id=language_pair_id,
            senses=result.senses,
            wordform_id=wordform_id,
            source_article_id=source_article_id,
        )
    except ValueError as exc:
        raise HTTPException(status_code=404, detail=str(exc))

    return FromTokenResponse(
        entry=_to_response(entry),
        sense_candidates=[
            SenseCandidateResponse(id=s.id, gloss=s.gloss, topics=s.topics, tags=s.tags)
            for s in result.senses
        ],
        matched_via=result.matched_via,
    )
|
||||
|
||||
|
||||
@router.get("", response_model=list[WordBankEntryResponse])
async def list_entries(
    language_pair_id: str,
    db: AsyncSession = Depends(get_db),
    token_data: dict = Depends(verify_token),
) -> list[WordBankEntryResponse]:
    """List the authenticated user's word-bank entries for one language pair."""
    owner = uuid.UUID(token_data["sub"])
    try:
        pair_uuid = uuid.UUID(language_pair_id)
    except ValueError:
        raise HTTPException(status_code=400, detail="Invalid language_pair_id")

    rows = await PostgresVocabRepository(db).get_entries_for_user(owner, pair_uuid)
    return [_to_response(row) for row in rows]
|
||||
|
||||
|
||||
@router.get("/pending-disambiguation", response_model=list[WordBankEntryResponse])
async def pending_disambiguation(
    db: AsyncSession = Depends(get_db),
    token_data: dict = Depends(verify_token),
) -> list[WordBankEntryResponse]:
    """List the user's word-bank entries that still need a sense chosen."""
    owner = uuid.UUID(token_data["sub"])
    repo = PostgresVocabRepository(db)
    pending = await repo.get_pending_disambiguation(owner)
    return [_to_response(item) for item in pending]
|
||||
|
||||
|
||||
@router.patch("/{entry_id}/sense", response_model=WordBankEntryResponse)
async def resolve_sense(
    entry_id: str,
    request: SetSenseRequest,
    db: AsyncSession = Depends(get_db),
    token_data: dict = Depends(verify_token),
) -> WordBankEntryResponse:
    """Attach a chosen dictionary sense to a word-bank entry (disambiguation)."""
    try:
        eid = uuid.UUID(entry_id)
        sid = uuid.UUID(request.sense_id)
    except ValueError:
        raise HTTPException(status_code=400, detail="Invalid UUID")

    try:
        entry = await _service(db).resolve_disambiguation(eid, sid)
    # NOTE(review): this bare ``except Exception`` maps *any* failure —
    # including programming errors and DB outages — to a 404. Consider
    # narrowing to the specific not-found exception the service raises.
    except Exception:
        raise HTTPException(status_code=404, detail="Entry not found")

    return _to_response(entry)
|
||||
|
||||
|
||||
def _to_response(entry) -> WordBankEntryResponse:
    """Map a domain word-bank entry onto the API response model."""
    fields = {
        "id": entry.id,
        "user_id": entry.user_id,
        "language_pair_id": entry.language_pair_id,
        "sense_id": entry.sense_id,
        "wordform_id": entry.wordform_id,
        "surface_text": entry.surface_text,
        "is_phrase": entry.is_phrase,
        "entry_pathway": entry.entry_pathway,
        "source_article_id": entry.source_article_id,
        "disambiguation_status": entry.disambiguation_status,
        # Timestamps are serialised as ISO-8601 strings at the API boundary.
        "created_at": entry.created_at.isoformat(),
    }
    return WordBankEntryResponse(**fields)
|
||||
|
|
@ -1,9 +1,9 @@
|
|||
from fastapi import APIRouter, Depends, HTTPException, status
|
||||
from pydantic import BaseModel, EmailStr
|
||||
from sqlalchemy.exc import IntegrityError
|
||||
from sqlalchemy.ext.asyncio import AsyncSession
|
||||
|
||||
from ..auth import create_access_token, verify_password
|
||||
from ..domain.services.account_service import AccountService
|
||||
from ..auth import create_access_token, hash_password, verify_password
|
||||
from ..outbound.postgres.database import get_db
|
||||
from ..outbound.postgres.repositories import user_repository
|
||||
|
||||
|
|
@ -27,15 +27,24 @@ class TokenResponse(BaseModel):
|
|||
|
||||
@router.post("/register", status_code=status.HTTP_201_CREATED)
|
||||
async def register(body: RegisterRequest, db: AsyncSession = Depends(get_db)):
|
||||
try:
|
||||
user = await user_repository.create(
|
||||
db,
|
||||
email=body.email,
|
||||
hashed_password=hash_password(body.password),
|
||||
)
|
||||
except IntegrityError:
|
||||
await db.rollback()
|
||||
raise HTTPException(
|
||||
status_code=status.HTTP_409_CONFLICT,
|
||||
detail="Email already registered",
|
||||
)
|
||||
|
||||
# TODO(email-verification): send verification email here once transactional
|
||||
# email is implemented. Set is_email_verified=False on the User model and
|
||||
# require verification before allowing login.
|
||||
try:
|
||||
account = await AccountService(db).create_account(body.email, body.password)
|
||||
except ValueError as exc:
|
||||
raise HTTPException(status_code=status.HTTP_409_CONFLICT, detail=str(exc))
|
||||
|
||||
return {"id": account.id, "email": account.email}
|
||||
return {"id": str(user.id), "email": user.email}
|
||||
|
||||
|
||||
@router.post("/login", response_model=TokenResponse)
|
||||
|
|
|
|||
|
|
@ -46,9 +46,3 @@ Example Api Clients in their own modules are:
|
|||
- `AnthropicClient` to communicate with Anthropic's LLM, i.e. Claude, to generate text and synthesis.
|
||||
- `GeminiClient` to communicate with Google's Gemini for text-to-speech generation
|
||||
- `DeepgramClient` for timestamped speech-to-text transcription
|
||||
|
||||
## Deployment
|
||||
|
||||
The application has not been deployed yet, but local development should mimic the deployed environment as much as possible.
|
||||
|
||||
It will be deployed on a VPS using containerisation technologies (docker, podman). At the root of the project there is a `docker-compose.yaml` file which will describe each dependency (e.g. database, queue, storage).
|
||||
|
|
|
|||
|
|
@ -1,7 +1,7 @@
|
|||
[project]
|
||||
name = "language-learning-api"
|
||||
version = "0.1.0"
|
||||
requires-python = "==3.13.*"
|
||||
requires-python = ">=3.11"
|
||||
dependencies = [
|
||||
"fastapi>=0.115.0",
|
||||
"uvicorn[standard]>=0.30.0",
|
||||
|
|
|
|||
|
|
@ -1,322 +0,0 @@
|
|||
#!/usr/bin/env python
|
||||
"""
|
||||
CLI import script for kaikki/wiktextract JSONL dictionary data.
|
||||
|
||||
Usage (from api/ directory):
|
||||
uv run ./scripts/import_dictionary.py --lang fr
|
||||
|
||||
# or via Make from the repo root:
|
||||
make import-dictionary lang=fr
|
||||
|
||||
DATABASE_URL defaults to postgresql+asyncpg://langlearn:changeme@localhost:5432/langlearn
|
||||
which matches the docker-compose dev credentials when the DB port is exposed on the host.
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import asyncio
|
||||
import json
|
||||
import os
|
||||
import sys
|
||||
import uuid
|
||||
from pathlib import Path
|
||||
|
||||
import sqlalchemy as sa
|
||||
from sqlalchemy.dialects.postgresql import ARRAY, JSONB
|
||||
from sqlalchemy.dialects.postgresql import UUID as PG_UUID
|
||||
from sqlalchemy.ext.asyncio import AsyncSession, async_sessionmaker, create_async_engine
|
||||
|
||||
_API_DIR = Path(__file__).parent.parent
|
||||
_REPO_ROOT = _API_DIR.parent
|
||||
_DICT_DIR = _REPO_ROOT / "dictionaries" / "kaikki"
|
||||
|
||||
_LANG_FILE_MAP: dict[str, str] = {
|
||||
"fr": "french.jsonl",
|
||||
}
|
||||
|
||||
_POS_MAP: dict[str, str] = {
|
||||
"noun": "NOUN",
|
||||
"verb": "VERB",
|
||||
"adj": "ADJ",
|
||||
"adv": "ADV",
|
||||
"det": "DET",
|
||||
"article": "DET",
|
||||
"pron": "PRON",
|
||||
"prep": "ADP",
|
||||
"adp": "ADP",
|
||||
"conj": "CCONJ",
|
||||
"cconj": "CCONJ",
|
||||
"sconj": "SCONJ",
|
||||
"intj": "INTJ",
|
||||
"num": "NUM",
|
||||
"numeral": "NUM",
|
||||
"part": "PART",
|
||||
"particle": "PART",
|
||||
"name": "PROPN",
|
||||
"propn": "PROPN",
|
||||
"proper noun": "PROPN",
|
||||
"punct": "PUNCT",
|
||||
"sym": "SYM",
|
||||
}
|
||||
|
||||
_GENDER_MAP: dict[str, str] = {
|
||||
"masculine": "masculine",
|
||||
"masc": "masculine",
|
||||
"feminine": "feminine",
|
||||
"fem": "feminine",
|
||||
"neuter": "neuter",
|
||||
"common": "common",
|
||||
}
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Standalone table definitions — no app imports, no Settings() call
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
_meta = sa.MetaData()
|
||||
|
||||
_lemma_table = sa.Table(
|
||||
"dictionary_lemma",
|
||||
_meta,
|
||||
sa.Column("id", PG_UUID(as_uuid=True), primary_key=True),
|
||||
sa.Column("headword", sa.Text(), nullable=False),
|
||||
sa.Column("language", sa.String(2), nullable=False),
|
||||
sa.Column("pos_raw", sa.Text(), nullable=False),
|
||||
sa.Column("pos_normalised", sa.Text(), nullable=True),
|
||||
sa.Column("gender", sa.Text(), nullable=True),
|
||||
sa.Column("tags", ARRAY(sa.Text()), nullable=False),
|
||||
)
|
||||
|
||||
_sense_table = sa.Table(
|
||||
"dictionary_sense",
|
||||
_meta,
|
||||
sa.Column("id", PG_UUID(as_uuid=True), primary_key=True),
|
||||
sa.Column("lemma_id", PG_UUID(as_uuid=True), nullable=False),
|
||||
sa.Column("sense_index", sa.Integer(), nullable=False),
|
||||
sa.Column("gloss", sa.Text(), nullable=False),
|
||||
sa.Column("topics", ARRAY(sa.Text()), nullable=False),
|
||||
sa.Column("tags", ARRAY(sa.Text()), nullable=False),
|
||||
)
|
||||
|
||||
_wordform_table = sa.Table(
|
||||
"dictionary_wordform",
|
||||
_meta,
|
||||
sa.Column("id", PG_UUID(as_uuid=True), primary_key=True),
|
||||
sa.Column("lemma_id", PG_UUID(as_uuid=True), nullable=False),
|
||||
sa.Column("form", sa.Text(), nullable=False),
|
||||
sa.Column("tags", ARRAY(sa.Text()), nullable=False),
|
||||
)
|
||||
|
||||
_raw_table = sa.Table(
|
||||
"dictionary_lemma_raw",
|
||||
_meta,
|
||||
sa.Column("id", PG_UUID(as_uuid=True), primary_key=True),
|
||||
sa.Column("lemma_id", PG_UUID(as_uuid=True), nullable=False),
|
||||
sa.Column("language", sa.String(2), nullable=False),
|
||||
sa.Column("raw", JSONB(), nullable=False),
|
||||
)
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Normalisation helpers
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def _normalise_pos(pos_raw: str) -> str | None:
    """Map a raw kaikki POS label onto its Universal Dependencies tag, or None."""
    key = pos_raw.strip().lower()
    return _POS_MAP.get(key)
|
||||
|
||||
|
||||
def _normalise_gender(tags: list) -> str | None:
    """Return the first canonical gender found among *tags*, else None."""
    return next((_GENDER_MAP[tag] for tag in tags if tag in _GENDER_MAP), None)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Parsing
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def _parse_entry(record: dict, lang_code: str) -> dict | None:
    """Parse one kaikki JSONL record into insertion-ready row dicts.

    Returns None if the entry should be skipped (wrong language or no
    headword). The returned dict carries ``lemma``, ``senses``,
    ``wordforms`` and ``raw`` row groups sharing one generated lemma id.
    """
    if record.get("lang_code") != lang_code:
        return None

    word = (record.get("word") or "").strip()
    if not word:
        return None

    pos_raw = (record.get("pos") or "").strip()
    top_tags = record.get("tags") or []
    lemma_id = uuid.uuid4()

    # One row per sense; the first gloss (or "") represents the sense.
    senses = [
        {
            "id": uuid.uuid4(),
            "lemma_id": lemma_id,
            "sense_index": index,
            "gloss": (sense.get("glosses") or [""])[0],
            "topics": sense.get("topics") or [],
            "tags": sense.get("tags") or [],
        }
        for index, sense in enumerate(record.get("senses") or [])
    ]

    # Inflected forms; skip blanks and forms identical to the headword.
    wordforms = []
    for form_record in record.get("forms") or []:
        form_text = (form_record.get("form") or "").strip()
        if not form_text or form_text == word:
            continue
        wordforms.append(
            {
                "id": uuid.uuid4(),
                "lemma_id": lemma_id,
                "form": form_text,
                "tags": form_record.get("tags") or [],
            }
        )

    return {
        "lemma": {
            "id": lemma_id,
            "headword": word,
            "language": lang_code,
            "pos_raw": pos_raw,
            "pos_normalised": _normalise_pos(pos_raw),
            "gender": _normalise_gender(top_tags),
            "tags": top_tags,
        },
        "senses": senses,
        "wordforms": wordforms,
        # Keep the full source record for reprocessing/debugging.
        "raw": {
            "id": uuid.uuid4(),
            "lemma_id": lemma_id,
            "language": lang_code,
            "raw": record,
        },
    }
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# DB operations
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
async def _flush_batch(conn: sa.ext.asyncio.AsyncConnection, batch: list[dict]) -> None:
    """Insert one batch of parsed entries into all four tables and commit.

    NOTE(review): the ``sa.ext.asyncio`` attribute path only resolves because
    the module-level ``from sqlalchemy.ext.asyncio import ...`` already loaded
    that submodule — importing ``AsyncConnection`` explicitly would be sturdier.
    """
    # Fan the parsed entries out into per-table row lists.
    lemma_rows = [e["lemma"] for e in batch]
    sense_rows = [s for e in batch for s in e["senses"]]
    wordform_rows = [w for e in batch for w in e["wordforms"]]
    raw_rows = [e["raw"] for e in batch]

    # executemany-style bulk inserts; skip tables with nothing to insert.
    if lemma_rows:
        await conn.execute(_lemma_table.insert(), lemma_rows)
    if sense_rows:
        await conn.execute(_sense_table.insert(), sense_rows)
    if wordform_rows:
        await conn.execute(_wordform_table.insert(), wordform_rows)
    if raw_rows:
        await conn.execute(_raw_table.insert(), raw_rows)

    await conn.commit()
|
||||
|
||||
|
||||
async def run_import(lang_code: str, batch_size: int = 1000) -> None:
    """Replace all dictionary rows for *lang_code* with the JSONL file contents.

    Deletes existing lemmas for the language, then streams the kaikki JSONL
    file line by line, committing every *batch_size* parsed lemmas. Exits the
    process with status 1 when the language is unmapped or the file is missing.
    """
    lang_file = _LANG_FILE_MAP.get(lang_code)
    if not lang_file:
        print(
            f"No file mapping for lang_code={lang_code!r}. Known: {list(_LANG_FILE_MAP)}",
            file=sys.stderr,
        )
        sys.exit(1)

    jsonl_path = _DICT_DIR / lang_file
    if not jsonl_path.exists():
        print(f"JSONL file not found: {jsonl_path}", file=sys.stderr)
        sys.exit(1)

    # Defaults to the docker-compose dev credentials with the DB port exposed.
    database_url = os.environ.get(
        "DATABASE_URL",
        "postgresql+asyncpg://langlearn:changeme@localhost:5432/langlearn",
    )

    engine = create_async_engine(database_url, echo=False)

    try:
        async with engine.connect() as conn:
            # Make the import idempotent: wipe the language before re-importing.
            # NOTE(review): assumes sense/wordform/raw rows are removed via
            # FK ON DELETE CASCADE — confirm against the migration.
            print(f"Deleting existing entries for language={lang_code!r}...")
            await conn.execute(
                _lemma_table.delete().where(_lemma_table.c.language == lang_code)
            )
            await conn.commit()

            print(f"Importing {jsonl_path} ...")
            batch: list[dict] = []
            total_lemmas = 0
            skipped = 0

            with open(jsonl_path, encoding="utf-8") as f:
                for line_num, line in enumerate(f, 1):
                    line = line.strip()
                    if not line:
                        continue

                    try:
                        record = json.loads(line)
                    except json.JSONDecodeError as exc:
                        # Malformed lines are reported and skipped, not fatal.
                        print(
                            f" Line {line_num}: JSON parse error: {exc}",
                            file=sys.stderr,
                        )
                        skipped += 1
                        continue

                    parsed = _parse_entry(record, lang_code)
                    if parsed is None:
                        skipped += 1
                        continue

                    batch.append(parsed)

                    # Commit in batches to bound memory and transaction size.
                    if len(batch) >= batch_size:
                        await _flush_batch(conn, batch)
                        total_lemmas += len(batch)
                        print(f" Committed {total_lemmas} lemmas...")
                        batch = []

            # Flush the final partial batch.
            if batch:
                await _flush_batch(conn, batch)
                total_lemmas += len(batch)

            print(f"Done. Imported {total_lemmas} lemmas, skipped {skipped} lines.")
    finally:
        await engine.dispose()
|
||||
|
||||
|
||||
def main() -> None:
    """CLI entry point: parse arguments and run the async import."""
    parser = argparse.ArgumentParser(
        description="Import kaikki dictionary JSONL into Postgres."
    )
    parser.add_argument(
        "--lang", required=True, help="Language code to import (e.g. fr)"
    )
    parser.add_argument(
        "--batch-size", type=int, default=1000, help="Rows per commit (default: 1000)"
    )
    options = parser.parse_args()
    asyncio.run(run_import(options.lang, options.batch_size))
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
1
dictionaries/.gitignore
vendored
1
dictionaries/.gitignore
vendored
|
|
@ -1 +0,0 @@
|
|||
*.jsonl
|
||||
|
|
@ -1,3 +0,0 @@
|
|||
# Dictionaries
|
||||
|
||||
This module contains dictionaries of words, namely from the [Kaikki](https://kaikki.org/dictionary/index.html) project. It is responsible for generating lexical information about words, for both the system and the user, to help describe the language they are using.
|
||||
Loading…
Reference in a new issue