Compare commits
2 commits
407d423a4c
...
746aa5f382
| Author | SHA1 | Date |
|---|---|---|
|  | 746aa5f382 |  |
|  | 3d5551c3d9 |  |
14 changed files with 206 additions and 48 deletions
|
|
@ -0,0 +1,29 @@
|
||||||
|
"""add source_pos_data, target_pos_data, audio_transcript to jobs
|
||||||
|
|
||||||
|
Revision ID: 0003
|
||||||
|
Revises: 0002
|
||||||
|
Create Date: 2026-03-27
|
||||||
|
|
||||||
|
"""
|
||||||
|
from typing import Sequence, Union
|
||||||
|
|
||||||
|
import sqlalchemy as sa
|
||||||
|
from alembic import op
|
||||||
|
from sqlalchemy.dialects import postgresql
|
||||||
|
|
||||||
|
revision: str = "0003"
|
||||||
|
down_revision: Union[str, None] = "0002"
|
||||||
|
branch_labels: Union[str, Sequence[str], None] = None
|
||||||
|
depends_on: Union[str, Sequence[str], None] = None
|
||||||
|
|
||||||
|
|
||||||
|
def upgrade() -> None:
    """Apply migration 0003: add nullable JSONB columns to the ``jobs`` table.

    Adds ``source_pos_data``, ``target_pos_data`` and ``audio_transcript``,
    all nullable so existing rows remain valid.
    """
    for column_name in ("source_pos_data", "target_pos_data", "audio_transcript"):
        op.add_column("jobs", sa.Column(column_name, postgresql.JSONB(), nullable=True))
|
||||||
|
|
||||||
|
|
||||||
|
def downgrade() -> None:
    """Revert migration 0003: drop the JSONB columns added by ``upgrade``.

    Columns are dropped in the reverse order of their creation.
    """
    for column_name in ("audio_transcript", "target_pos_data", "source_pos_data"):
        op.drop_column("jobs", column_name)
|
||||||
|
|
@ -6,6 +6,7 @@ class Settings(BaseSettings):
|
||||||
jwt_secret: str
|
jwt_secret: str
|
||||||
anthropic_api_key: str
|
anthropic_api_key: str
|
||||||
deepl_api_key: str
|
deepl_api_key: str
|
||||||
|
deepgram_api_key: str
|
||||||
gemini_api_key: str
|
gemini_api_key: str
|
||||||
storage_endpoint_url: str
|
storage_endpoint_url: str
|
||||||
storage_access_key: str
|
storage_access_key: str
|
||||||
|
|
|
||||||
|
|
@ -15,6 +15,9 @@ class SummariseJob:
|
||||||
translated_text: str
|
translated_text: str
|
||||||
error_message: str
|
error_message: str
|
||||||
audio_url: str
|
audio_url: str
|
||||||
|
source_pos_data: dict | None
|
||||||
|
target_pos_data: dict | None
|
||||||
|
audio_transcript: dict | None
|
||||||
created_at: datetime
|
created_at: datetime
|
||||||
started_at: datetime | None = None
|
started_at: datetime | None = None
|
||||||
completed_at: datetime | None = None
|
completed_at: datetime | None = None
|
||||||
|
|
|
||||||
|
|
@ -19,18 +19,17 @@ class AnthropicClient():
|
||||||
) -> str:
|
) -> str:
|
||||||
return (
|
return (
|
||||||
f"You are a language learning content creator.\n"
|
f"You are a language learning content creator.\n"
|
||||||
f"The user will provide input, you will generate an engaging realistic summary text in {to_language} "
|
f"You generate markdown summaries of user-provided content.\n"
|
||||||
f"at {complexity_level} proficiency level (CEFR scale).\n\n"
|
f"You will provide content in {to_language} at {complexity_level} proficiency level on the CEFR scale.\n"
|
||||||
f"The text you generate will:\n"
|
f"The text you generate will:\n"
|
||||||
f"- Contain ONLY the generated text in {to_language}.\n"
|
f"- Contain ONLY the generated summary text in {to_language}.\n"
|
||||||
f"- Be appropriate for a {complexity_level} {to_language} speaker.\n"
|
f"- Never contain inappropriate (hateful, sexual, violent) content. It is preferable to return no text than to generate such content.\n"
|
||||||
f"- Never generate inappropriate (hateful, sexual, violent) content. It is preferable to return no text than to generate such content.\n"
|
|
||||||
f"- Speak directly to the reader/listener, adopting the tone and style of a semi-formal news reporter or podcaster.\n"
|
f"- Speak directly to the reader/listener, adopting the tone and style of a semi-formal news reporter or podcaster.\n"
|
||||||
f"- Where appropriate (fluency level, content), use a small number of idiomatic expressions.\n"
|
f"- Where appropriate (fluency level, content), use a small number of idiomatic expressions.\n"
|
||||||
f"- Be formatted in markdown with paragraphs and line breaks.\n"
|
f"- Where appropriate use at least one additional tense, beyond the default of the content.\n"
|
||||||
f"- Be {length_preference} long.\n"
|
f"- Be formatted in markdown. Contain a single level 1 header (#) at the top, followed by paragraphs and line breaks.\n"
|
||||||
f"- Be inspired by the following source material "
|
f"- Be around {length_preference} long.\n"
|
||||||
f"(but written originally in {from_language}):\n\n"
|
f"- Be inspired by the content, but not the tone, of the source material."
|
||||||
)
|
)
|
||||||
|
|
||||||
def _create_prompt_summarise_text(
|
def _create_prompt_summarise_text(
|
||||||
|
|
|
||||||
0
api/app/outbound/deepgram/__init__.py
Normal file
0
api/app/outbound/deepgram/__init__.py
Normal file
24
api/app/outbound/deepgram/deepgram_client.py
Normal file
24
api/app/outbound/deepgram/deepgram_client.py
Normal file
|
|
@ -0,0 +1,24 @@
|
||||||
|
import asyncio
|
||||||
|
from deepgram import (
|
||||||
|
AsyncDeepgramClient,
|
||||||
|
)
|
||||||
|
|
||||||
|
class LocalDeepgramClient:
    """Thin async wrapper around the Deepgram SDK for speech-to-text."""

    def __init__(self, api_key: str):
        self.deepgram_client = AsyncDeepgramClient(api_key=api_key)

    async def transcribe_bytes(self, audio_bytes: bytes, language_code: str) -> dict:
        """Transcribe raw audio bytes and return the Deepgram results payload.

        Uses the nova-3 model with utterance segmentation and smart
        formatting enabled.

        NOTE(review): the annotated return type is ``dict`` but the value is
        whatever ``response.results.json()`` yields — confirm the SDK returns
        a dict here rather than a JSON string.
        """
        response = await self.deepgram_client.listen.v1.media.transcribe_file(
            request=audio_bytes,
            model="nova-3",
            language=language_code,
            utterances=True,
            smart_format=True,
        )
        return response.results.json()

    async def transcribe_local_file(self, local_file_path: str, language_code: str) -> dict:
        """Transcribe an audio file on local disk via :meth:`transcribe_bytes`.

        Fix: the previous version passed the open file object where
        ``transcribe_bytes`` expects raw ``bytes``; read the file contents
        first.
        """
        with open(local_file_path, "rb") as audio_file:
            return await self.transcribe_bytes(audio_file.read(), language_code)
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -3,7 +3,7 @@ from datetime import datetime, timezone
|
||||||
|
|
||||||
from sqlalchemy import String, Text, DateTime, ForeignKey
|
from sqlalchemy import String, Text, DateTime, ForeignKey
|
||||||
from sqlalchemy.orm import Mapped, mapped_column
|
from sqlalchemy.orm import Mapped, mapped_column
|
||||||
from sqlalchemy.dialects.postgresql import UUID
|
from sqlalchemy.dialects.postgresql import UUID, JSONB
|
||||||
|
|
||||||
from ..database import Base
|
from ..database import Base
|
||||||
|
|
||||||
|
|
@ -25,6 +25,9 @@ class SummariseJobEntity(Base):
|
||||||
translated_text: Mapped[str | None] = mapped_column(Text, nullable=True)
|
translated_text: Mapped[str | None] = mapped_column(Text, nullable=True)
|
||||||
error_message: Mapped[str | None] = mapped_column(Text, nullable=True)
|
error_message: Mapped[str | None] = mapped_column(Text, nullable=True)
|
||||||
audio_url: Mapped[str | None] = mapped_column(Text, nullable=True)
|
audio_url: Mapped[str | None] = mapped_column(Text, nullable=True)
|
||||||
|
source_pos_data: Mapped[dict | None] = mapped_column(JSONB, nullable=True)
|
||||||
|
target_pos_data: Mapped[dict | None] = mapped_column(JSONB, nullable=True)
|
||||||
|
audio_transcript: Mapped[dict | None] = mapped_column(JSONB, nullable=True)
|
||||||
created_at: Mapped[datetime] = mapped_column(
|
created_at: Mapped[datetime] = mapped_column(
|
||||||
DateTime(timezone=True),
|
DateTime(timezone=True),
|
||||||
default=lambda: datetime.now(timezone.utc),
|
default=lambda: datetime.now(timezone.utc),
|
||||||
|
|
|
||||||
|
|
@ -46,6 +46,9 @@ class PostgresSummariseJobRepository:
|
||||||
translated_text=entity.translated_text,
|
translated_text=entity.translated_text,
|
||||||
error_message=entity.error_message,
|
error_message=entity.error_message,
|
||||||
audio_url=entity.audio_url,
|
audio_url=entity.audio_url,
|
||||||
|
source_pos_data=entity.source_pos_data,
|
||||||
|
target_pos_data=entity.target_pos_data,
|
||||||
|
audio_transcript=entity.audio_transcript,
|
||||||
created_at=entity.created_at,
|
created_at=entity.created_at,
|
||||||
started_at=entity.started_at,
|
started_at=entity.started_at,
|
||||||
completed_at=entity.completed_at,
|
completed_at=entity.completed_at,
|
||||||
|
|
@ -112,6 +115,26 @@ async def save_translated_text(
|
||||||
await update(db, job)
|
await update(db, job)
|
||||||
|
|
||||||
|
|
||||||
|
async def save_pos_data(
    db: AsyncSession,
    job: SummariseJobEntity,
    source_pos_data: dict,
    target_pos_data: dict,
) -> None:
    """Store the part-of-speech payloads for both texts on *job* and persist it.

    Persistence goes through update(); commit semantics are whatever that
    helper provides.
    """
    job.target_pos_data = target_pos_data
    job.source_pos_data = source_pos_data
    await update(db, job)
|
||||||
|
|
||||||
|
|
||||||
|
async def save_audio_transcript(
    db: AsyncSession,
    job: SummariseJobEntity,
    audio_transcript: dict,
) -> None:
    """Store the audio transcript payload on *job* and persist it via update()."""
    job.audio_transcript = audio_transcript
    await update(db, job)
|
||||||
|
|
||||||
|
|
||||||
async def mark_succeeded(db: AsyncSession, job: SummariseJobEntity, audio_url: str) -> None:
|
async def mark_succeeded(db: AsyncSession, job: SummariseJobEntity, audio_url: str) -> None:
|
||||||
job.status = "succeeded"
|
job.status = "succeeded"
|
||||||
job.audio_url = audio_url
|
job.audio_url = audio_url
|
||||||
|
|
|
||||||
0
api/app/outbound/spacy/__init__.py
Normal file
0
api/app/outbound/spacy/__init__.py
Normal file
46
api/app/outbound/spacy/spacy_client.py
Normal file
46
api/app/outbound/spacy/spacy_client.py
Normal file
|
|
@ -0,0 +1,46 @@
|
||||||
|
import spacy
|
||||||
|
|
||||||
|
# Maps ISO 639-1 language codes to the name of the small spaCy pipeline
# used for that language. Any code not listed here is unsupported.
LANGUAGE_MODELS: dict[str, str] = {
    "en": "en_core_web_sm",
    "fr": "fr_core_news_sm",
    "es": "es_core_news_sm",
    "it": "it_core_news_sm",
    "de": "de_core_news_sm",
}
|
||||||
|
|
||||||
|
|
||||||
|
class UnsupportedLanguageError(ValueError):
    """Raised when analysis is requested for a language with no model entry."""

    def __init__(self, language: str):
        # Keep the offending code around so callers can report it.
        self.language = language
        supported = list(LANGUAGE_MODELS)
        super().__init__(
            f"Unsupported language '{language}'. Supported: {supported}"
        )
|
||||||
|
|
||||||
|
|
||||||
|
class SpacyClient:
    """Lazily loads and caches spaCy pipelines, one per language code."""

    def __init__(self):
        # Loaded pipelines keyed by language code; populated on first use.
        self._cache: dict[str, spacy.Language] = {}

    def _get_nlp(self, language: str) -> spacy.Language:
        """Return the cached pipeline for *language*, loading it on first use.

        Raises UnsupportedLanguageError for codes absent from LANGUAGE_MODELS.
        """
        if language not in LANGUAGE_MODELS:
            raise UnsupportedLanguageError(language)
        nlp = self._cache.get(language)
        if nlp is None:
            nlp = spacy.load(LANGUAGE_MODELS[language])
            self._cache[language] = nlp
        return nlp

    def get_parts_of_speech(self, text: str, language: str) -> dict:
        """Tokenize *text* and return per-token POS data.

        Returns ``{"language": ..., "tokens": [...]}`` where each token entry
        carries text, lemma, coarse/fine POS tags, dependency label and
        stop-word flag. Whitespace-only tokens are dropped.
        """
        doc = self._get_nlp(language)(text)
        token_entries = []
        for token in doc:
            if token.is_space:
                continue
            token_entries.append(
                {
                    "text": token.text,
                    "lemma": token.lemma_,
                    "pos": token.pos_,
                    "tag": token.tag_,
                    "dep": token.dep_,
                    "is_stop": token.is_stop,
                }
            )
        return {"language": language, "tokens": token_entries}
|
||||||
|
|
@ -1,6 +1,10 @@
|
||||||
|
import asyncio
|
||||||
|
import random
|
||||||
import uuid
|
import uuid
|
||||||
from functools import partial
|
from functools import partial
|
||||||
|
from typing import Any, Callable, Coroutine
|
||||||
|
|
||||||
|
import anthropic
|
||||||
from fastapi import APIRouter, Depends, HTTPException
|
from fastapi import APIRouter, Depends, HTTPException
|
||||||
from pydantic import BaseModel
|
from pydantic import BaseModel
|
||||||
from sqlalchemy.ext.asyncio import AsyncSession
|
from sqlalchemy.ext.asyncio import AsyncSession
|
||||||
|
|
@ -11,13 +15,48 @@ from ...storage import upload_audio
|
||||||
from ...outbound.postgres.database import get_db, AsyncSessionLocal
|
from ...outbound.postgres.database import get_db, AsyncSessionLocal
|
||||||
from ...outbound.postgres.repositories import summarise_job_repository
|
from ...outbound.postgres.repositories import summarise_job_repository
|
||||||
from ...outbound.anthropic.anthropic_client import AnthropicClient
|
from ...outbound.anthropic.anthropic_client import AnthropicClient
|
||||||
|
from ...outbound.deepgram.deepgram_client import LocalDeepgramClient
|
||||||
from ...outbound.deepl.deepl_client import DeepLClient
|
from ...outbound.deepl.deepl_client import DeepLClient
|
||||||
from ...outbound.gemini.gemini_client import GeminiClient
|
from ...outbound.gemini.gemini_client import GeminiClient
|
||||||
|
from ...outbound.spacy.spacy_client import SpacyClient
|
||||||
from ...config import settings
|
from ...config import settings
|
||||||
from ... import worker
|
from ... import worker
|
||||||
|
|
||||||
router = APIRouter(prefix="/generate", tags=["api"])
|
router = APIRouter(prefix="/generate", tags=["api"])
|
||||||
|
|
||||||
|
# Anthropic SDK exception types that indicate transient conditions
# (rate limiting, server errors, timeouts, connection failures) and are
# therefore safe to retry.
_ANTHROPIC_RETRYABLE = (
    anthropic.RateLimitError,
    anthropic.InternalServerError,
    anthropic.APITimeoutError,
    anthropic.APIConnectionError,
)
# Retry budget and backoff bounds used by _anthropic_with_backoff.
_MAX_RETRIES = 4
_BASE_DELAY = 1.0  # seconds
_MAX_DELAY = 60.0  # seconds
|
||||||
|
|
||||||
|
|
||||||
|
async def _anthropic_with_backoff(
    coro_fn: Callable[..., Coroutine[Any, Any, Any]],
    *args: Any,
    **kwargs: Any,
) -> Any:
    """Await ``coro_fn(*args, **kwargs)``, retrying transient Anthropic errors.

    Retries up to ``_MAX_RETRIES`` times on the exception types listed in
    ``_ANTHROPIC_RETRYABLE``, then re-raises. The delay honours a numeric
    ``retry-after`` header on rate-limit responses; otherwise (or when the
    header is not a plain number of seconds) it falls back to capped
    exponential backoff. Jitter of ±20 % is applied either way.
    """
    for attempt in range(_MAX_RETRIES + 1):
        try:
            return await coro_fn(*args, **kwargs)
        except _ANTHROPIC_RETRYABLE as exc:
            if attempt == _MAX_RETRIES:
                raise
            retry_after: float | None = None
            if isinstance(exc, anthropic.RateLimitError):
                raw = exc.response.headers.get("retry-after")
                if raw is not None:
                    # Fix: Retry-After may be an HTTP-date rather than a
                    # number of seconds (RFC 9110); previously float(raw)
                    # would raise and abort the retry loop.
                    try:
                        retry_after = float(raw)
                    except ValueError:
                        retry_after = None
            if retry_after is None:
                retry_after = min(_BASE_DELAY * (2 ** attempt), _MAX_DELAY)
            # ±20 % jitter to spread out concurrent retries
            jittered = retry_after * (0.8 + random.random() * 0.4)
            await asyncio.sleep(jittered)
|
||||||
|
|
||||||
|
|
||||||
class GenerationRequest(BaseModel):
|
class GenerationRequest(BaseModel):
|
||||||
target_language: str
|
target_language: str
|
||||||
|
|
@ -32,8 +71,10 @@ class GenerationResponse(BaseModel):
|
||||||
|
|
||||||
async def _run_generation(job_id: uuid.UUID, request: GenerationRequest) -> None:
|
async def _run_generation(job_id: uuid.UUID, request: GenerationRequest) -> None:
|
||||||
anthropic_client = AnthropicClient.new(settings.anthropic_api_key)
|
anthropic_client = AnthropicClient.new(settings.anthropic_api_key)
|
||||||
|
deepgram_client = LocalDeepgramClient(settings.deepgram_api_key)
|
||||||
deepl_client = DeepLClient(settings.deepl_api_key)
|
deepl_client = DeepLClient(settings.deepl_api_key)
|
||||||
gemini_client = GeminiClient(settings.gemini_api_key)
|
gemini_client = GeminiClient(settings.gemini_api_key)
|
||||||
|
spacy_client = SpacyClient()
|
||||||
|
|
||||||
async with AsyncSessionLocal() as db:
|
async with AsyncSessionLocal() as db:
|
||||||
job = await summarise_job_repository.get_by_id(db, job_id)
|
job = await summarise_job_repository.get_by_id(db, job_id)
|
||||||
|
|
@ -44,7 +85,8 @@ async def _run_generation(job_id: uuid.UUID, request: GenerationRequest) -> None
|
||||||
|
|
||||||
source_material = "\n\n".join(request.input_texts[:3])
|
source_material = "\n\n".join(request.input_texts[:3])
|
||||||
|
|
||||||
generated_text = await anthropic_client.generate_summary_text(
|
generated_text = await _anthropic_with_backoff(
|
||||||
|
anthropic_client.generate_summary_text,
|
||||||
content_to_summarise=source_material,
|
content_to_summarise=source_material,
|
||||||
complexity_level=request.complexity_level,
|
complexity_level=request.complexity_level,
|
||||||
from_language=language_name,
|
from_language=language_name,
|
||||||
|
|
@ -60,11 +102,19 @@ async def _run_generation(job_id: uuid.UUID, request: GenerationRequest) -> None
|
||||||
|
|
||||||
await summarise_job_repository.save_translated_text(db, job, translated_text)
|
await summarise_job_repository.save_translated_text(db, job, translated_text)
|
||||||
|
|
||||||
|
target_pos_data = spacy_client.get_parts_of_speech(generated_text, request.target_language)
|
||||||
|
source_pos_data = spacy_client.get_parts_of_speech(translated_text, request.source_language)
|
||||||
|
|
||||||
|
await summarise_job_repository.save_pos_data(db, job, source_pos_data, target_pos_data)
|
||||||
|
|
||||||
voice = gemini_client.get_voice_by_language(request.target_language)
|
voice = gemini_client.get_voice_by_language(request.target_language)
|
||||||
wav_bytes = await gemini_client.generate_audio(generated_text, voice)
|
wav_bytes = await gemini_client.generate_audio(generated_text, voice)
|
||||||
audio_key = f"audio/{job_id}.wav"
|
audio_key = f"audio/{job_id}.wav"
|
||||||
upload_audio(audio_key, wav_bytes)
|
upload_audio(audio_key, wav_bytes)
|
||||||
|
|
||||||
|
transcript = await deepgram_client.transcribe_bytes(wav_bytes, request.target_language)
|
||||||
|
await summarise_job_repository.save_audio_transcript(db, job, transcript)
|
||||||
|
|
||||||
await summarise_job_repository.mark_succeeded(db, job, audio_key)
|
await summarise_job_repository.mark_succeeded(db, job, audio_key)
|
||||||
|
|
||||||
except Exception as exc:
|
except Exception as exc:
|
||||||
|
|
|
||||||
|
|
@ -28,9 +28,12 @@ class JobResponse(BaseModel):
|
||||||
completed_at: datetime | None = None
|
completed_at: datetime | None = None
|
||||||
# only present on success
|
# only present on success
|
||||||
generated_text: str | None = None
|
generated_text: str | None = None
|
||||||
|
generated_text_pos: dict | None = None
|
||||||
translated_text: str | None = None
|
translated_text: str | None = None
|
||||||
|
translated_text_pos: dict | None = None
|
||||||
input_summary: str | None = None
|
input_summary: str | None = None
|
||||||
audio_url: str | None = None
|
audio_url: str | None = None
|
||||||
|
audio_transcript: dict | None = None
|
||||||
# only present on failure
|
# only present on failure
|
||||||
error_message: str | None = None
|
error_message: str | None = None
|
||||||
model_config = {"from_attributes": True}
|
model_config = {"from_attributes": True}
|
||||||
|
|
@ -40,6 +43,8 @@ class JobSummary(BaseModel):
|
||||||
id: uuid.UUID
|
id: uuid.UUID
|
||||||
status: str
|
status: str
|
||||||
created_at: datetime
|
created_at: datetime
|
||||||
|
completed_at: datetime | None = None
|
||||||
|
error_message: str | None = None
|
||||||
|
|
||||||
|
|
||||||
class JobListResponse(BaseModel):
|
class JobListResponse(BaseModel):
|
||||||
|
|
@ -85,9 +90,12 @@ async def get_job(
|
||||||
|
|
||||||
if job.status == "succeeded":
|
if job.status == "succeeded":
|
||||||
response.generated_text = job.generated_text
|
response.generated_text = job.generated_text
|
||||||
|
response.generated_text_pos = job.target_pos_data
|
||||||
response.translated_text = job.translated_text
|
response.translated_text = job.translated_text
|
||||||
|
response.translated_text_pos = job.source_pos_data
|
||||||
response.input_summary = job.input_summary
|
response.input_summary = job.input_summary
|
||||||
response.audio_url = job.audio_url
|
response.audio_url = job.audio_url
|
||||||
|
response.audio_transcript = job.audio_transcript
|
||||||
elif job.status == "failed":
|
elif job.status == "failed":
|
||||||
response.error_message = job.error_message
|
response.error_message = job.error_message
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -1,31 +1,12 @@
|
||||||
from fastapi import APIRouter, Depends, HTTPException
|
from fastapi import APIRouter, Depends, HTTPException
|
||||||
from pydantic import BaseModel
|
from pydantic import BaseModel
|
||||||
import spacy
|
|
||||||
|
|
||||||
from ...auth import verify_token
|
from ...auth import verify_token
|
||||||
|
from ...outbound.spacy.spacy_client import SpacyClient, UnsupportedLanguageError
|
||||||
|
|
||||||
router = APIRouter(prefix="/pos", tags=["api", "pos"])
|
router = APIRouter(prefix="/pos", tags=["api", "pos"])
|
||||||
|
|
||||||
LANGUAGE_MODELS: dict[str, str] = {
|
_spacy_client = SpacyClient()
|
||||||
"en": "en_core_web_sm",
|
|
||||||
"fr": "fr_core_news_sm",
|
|
||||||
"es": "es_core_news_sm",
|
|
||||||
"it": "it_core_news_sm",
|
|
||||||
"de": "de_core_news_sm",
|
|
||||||
}
|
|
||||||
|
|
||||||
_nlp_cache: dict[str, spacy.Language] = {}
|
|
||||||
|
|
||||||
|
|
||||||
def _get_nlp(language: str) -> spacy.Language:
|
|
||||||
if language not in LANGUAGE_MODELS:
|
|
||||||
raise HTTPException(
|
|
||||||
status_code=400,
|
|
||||||
detail=f"Unsupported language '{language}'. Supported: {list(LANGUAGE_MODELS)}",
|
|
||||||
)
|
|
||||||
if language not in _nlp_cache:
|
|
||||||
_nlp_cache[language] = spacy.load(LANGUAGE_MODELS[language])
|
|
||||||
return _nlp_cache[language]
|
|
||||||
|
|
||||||
|
|
||||||
class POSRequest(BaseModel):
|
class POSRequest(BaseModel):
|
||||||
|
|
@ -49,18 +30,8 @@ class POSResponse(BaseModel):
|
||||||
|
|
||||||
@router.post("/", response_model=POSResponse)
|
@router.post("/", response_model=POSResponse)
|
||||||
def analyze_pos(request: POSRequest, _: dict = Depends(verify_token)) -> POSResponse:
|
def analyze_pos(request: POSRequest, _: dict = Depends(verify_token)) -> POSResponse:
|
||||||
nlp = _get_nlp(request.language)
|
try:
|
||||||
doc = nlp(request.text)
|
result = _spacy_client.get_parts_of_speech(request.text, request.language)
|
||||||
tokens = [
|
except UnsupportedLanguageError as e:
|
||||||
TokenInfo(
|
raise HTTPException(status_code=400, detail=str(e))
|
||||||
text=token.text,
|
return POSResponse(**result)
|
||||||
lemma=token.lemma_,
|
|
||||||
pos=token.pos_,
|
|
||||||
tag=token.tag_,
|
|
||||||
dep=token.dep_,
|
|
||||||
is_stop=token.is_stop,
|
|
||||||
)
|
|
||||||
for token in doc
|
|
||||||
if not token.is_space
|
|
||||||
]
|
|
||||||
return POSResponse(language=request.language, tokens=tokens)
|
|
||||||
|
|
|
||||||
|
|
@ -17,6 +17,7 @@ dependencies = [
|
||||||
"google-genai>=1.0.0",
|
"google-genai>=1.0.0",
|
||||||
"boto3>=1.35.0",
|
"boto3>=1.35.0",
|
||||||
"httpx>=0.28.1",
|
"httpx>=0.28.1",
|
||||||
|
"deepgram-sdk>=6.1.0"
|
||||||
]
|
]
|
||||||
|
|
||||||
[build-system]
|
[build-system]
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue