api: Add Deepgram and Spacy clients

This commit is contained in:
wilson 2026-03-27 07:54:00 +00:00
parent 407d423a4c
commit 3d5551c3d9
6 changed files with 75 additions and 36 deletions

View file

View file

@ -0,0 +1,21 @@
import asyncio
from deepgram import (
AsyncDeepgramClient,
)
class LocalDeepgramClient:
    """Thin async wrapper around the Deepgram SDK for transcribing local audio files."""

    def __init__(self, api_key: str):
        # Underlying SDK client; every call is delegated to it.
        self.deepgram_client = AsyncDeepgramClient(api_key=api_key)

    async def transcribe_local_file(self, local_file_path: str, language_code: str):
        """Transcribe the audio file at *local_file_path*.

        Args:
            local_file_path: Path to an audio file readable in binary mode.
            language_code: Language hint passed straight to Deepgram.

        Returns:
            The SDK response's ``results`` payload (shape defined by the
            Deepgram SDK — not inspected here).
        """
        # Read and close the file before awaiting the network call; the
        # original held the file handle open for the whole round trip.
        with open(local_file_path, "rb") as audio_file:
            audio_bytes = audio_file.read()
        response = await self.deepgram_client.listen.v1.media.transcribe_file(
            request=audio_bytes,
            model="nova-3",
            language=language_code,
            utterances=True,
            smart_format=True,
        )
        return response.results

View file

View file

@ -0,0 +1,46 @@
import spacy
# Maps ISO 639-1 language codes to the spaCy pipeline name loaded for them.
# NOTE(review): all entries are the small ("_sm") pipelines — confirm they are
# installed in the runtime image, since spacy.load fails otherwise.
LANGUAGE_MODELS: dict[str, str] = {
    "en": "en_core_web_sm",
    "fr": "fr_core_news_sm",
    "es": "es_core_news_sm",
    "it": "it_core_news_sm",
    "de": "de_core_news_sm",
}
class UnsupportedLanguageError(ValueError):
    """Raised when a caller requests a language with no configured spaCy model."""

    def __init__(self, language: str):
        # Keep the offending code on the exception so callers can branch on it.
        self.language = language
        supported = list(LANGUAGE_MODELS)
        message = f"Unsupported language '{language}'. Supported: {supported}"
        super().__init__(message)
class SpacyClient:
    """Lazily loads and caches one spaCy pipeline per supported language."""

    def __init__(self):
        # language code -> loaded pipeline; populated on first use only.
        self._cache: dict[str, spacy.Language] = {}

    def _get_nlp(self, language: str) -> spacy.Language:
        """Return the cached pipeline for *language*, loading it on first request.

        Raises:
            UnsupportedLanguageError: if *language* is not in LANGUAGE_MODELS.
        """
        try:
            model_name = LANGUAGE_MODELS[language]
        except KeyError:
            raise UnsupportedLanguageError(language) from None
        if language not in self._cache:
            self._cache[language] = spacy.load(model_name)
        return self._cache[language]

    def get_parts_of_speech(self, text: str, language: str) -> dict:
        """Tokenize *text* and return per-token POS annotations.

        Returns a dict of the form ``{"language": ..., "tokens": [...]}``;
        pure-whitespace tokens are dropped.
        """
        doc = self._get_nlp(language)(text)
        tokens = []
        for token in doc:
            if token.is_space:
                continue
            tokens.append(
                {
                    "text": token.text,
                    "lemma": token.lemma_,
                    "pos": token.pos_,
                    "tag": token.tag_,
                    "dep": token.dep_,
                    "is_stop": token.is_stop,
                }
            )
        return {"language": language, "tokens": tokens}

View file

@ -1,31 +1,12 @@
from fastapi import APIRouter, Depends, HTTPException
from pydantic import BaseModel

from ...auth import verify_token
from ...outbound.spacy.spacy_client import SpacyClient, UnsupportedLanguageError
# POS-tagging routes. The side-by-side diff columns were fused into invalid
# text here; this is the reconstructed post-change (right-hand) code. All
# spaCy specifics moved to SpacyClient in the outbound layer — this module
# only translates its errors into HTTP responses.
router = APIRouter(prefix="/pos", tags=["api", "pos"])

# Module-level singleton so loaded spaCy pipelines are shared across requests.
_spacy_client = SpacyClient()
class POSRequest(BaseModel):
@ -49,18 +30,8 @@ class POSResponse(BaseModel):
@router.post("/", response_model=POSResponse)
def analyze_pos(request: POSRequest, _: dict = Depends(verify_token)) -> POSResponse:
    """Return part-of-speech annotations for ``request.text``.

    Delegates to the shared SpacyClient and translates its
    UnsupportedLanguageError into a 400 so the HTTP contract does not depend
    on the outbound layer's exception types.
    """
    try:
        result = _spacy_client.get_parts_of_speech(request.text, request.language)
    except UnsupportedLanguageError as e:
        # Chain the cause so server logs keep the original traceback.
        raise HTTPException(status_code=400, detail=str(e)) from e
    return POSResponse(**result)

View file

@ -17,6 +17,7 @@ dependencies = [
    "google-genai>=1.0.0",
    "boto3>=1.35.0",
    "httpx>=0.28.1",
    "deepgram-sdk>=6.1.0"
]

[build-system]