api: Add Deepgram and spaCy clients
This commit is contained in:
parent
407d423a4c
commit
3d5551c3d9
6 changed files with 75 additions and 36 deletions
0
api/app/outbound/deepgram/__init__.py
Normal file
0
api/app/outbound/deepgram/__init__.py
Normal file
21
api/app/outbound/deepgram/deepgram_client.py
Normal file
21
api/app/outbound/deepgram/deepgram_client.py
Normal file
|
|
@ -0,0 +1,21 @@
|
||||||
|
from deepgram import (
    AsyncDeepgramClient,
)


class LocalDeepgramClient:
    """Thin async wrapper around the Deepgram SDK for transcribing local audio files."""

    def __init__(self, api_key: str):
        # The SDK client is async; callers must await transcribe_local_file.
        self.deepgram_client = AsyncDeepgramClient(api_key=api_key)

    async def transcribe_local_file(self, local_file_path: str, language_code: str):
        """Transcribe the audio file at *local_file_path* with Deepgram's nova-3 model.

        Args:
            local_file_path: Path to an audio file readable by this process.
            language_code: Language code passed through to Deepgram unchanged.

        Returns:
            The ``results`` section of the Deepgram transcription response.

        Raises:
            OSError: If the file cannot be opened or read.
        """
        # Read the file up front so the handle is not held open across the
        # awaited network call (the original kept the `with` around the await).
        # NOTE(review): open()/read() block the event loop for large files —
        # consider asyncio.to_thread for the read if inputs can be big.
        with open(local_file_path, "rb") as audio_file:
            audio_bytes = audio_file.read()
        response = await self.deepgram_client.listen.v1.media.transcribe_file(
            request=audio_bytes,
            model="nova-3",
            language=language_code,
            utterances=True,
            smart_format=True,
        )
        return response.results
0
api/app/outbound/spacy/__init__.py
Normal file
0
api/app/outbound/spacy/__init__.py
Normal file
46
api/app/outbound/spacy/spacy_client.py
Normal file
46
api/app/outbound/spacy/spacy_client.py
Normal file
|
|
@ -0,0 +1,46 @@
|
||||||
|
import spacy
|
||||||
|
|
||||||
|
# Maps ISO 639-1 language codes to the small spaCy pipeline trained for that
# language. The keys double as the set of supported languages, and insertion
# order is surfaced in UnsupportedLanguageError's message.
LANGUAGE_MODELS: dict[str, str] = {
    "en": "en_core_web_sm",
    "fr": "fr_core_news_sm",
    "es": "es_core_news_sm",
    "it": "it_core_news_sm",
    "de": "de_core_news_sm",
}
|
|
||||||
|
|
||||||
|
class UnsupportedLanguageError(ValueError):
    """Raised when a caller requests a language with no configured spaCy model."""

    def __init__(self, language: str):
        # Keep the offending code available to handlers (e.g. for HTTP error bodies).
        self.language = language
        supported = list(LANGUAGE_MODELS)
        super().__init__(f"Unsupported language '{language}'. Supported: {supported}")
|
|
||||||
|
class SpacyClient:
    """Lazily loads and caches spaCy pipelines and exposes part-of-speech tagging."""

    def __init__(self):
        # language code -> loaded spaCy pipeline; populated on first use.
        self._cache: dict[str, spacy.Language] = {}

    def _get_nlp(self, language: str) -> spacy.Language:
        """Return the cached pipeline for *language*, loading it on first request.

        Raises:
            UnsupportedLanguageError: If *language* has no configured model.
        """
        if language not in LANGUAGE_MODELS:
            raise UnsupportedLanguageError(language)
        pipeline = self._cache.get(language)
        if pipeline is None:
            pipeline = spacy.load(LANGUAGE_MODELS[language])
            self._cache[language] = pipeline
        return pipeline

    def get_parts_of_speech(self, text: str, language: str) -> dict:
        """Tokenize *text* and return per-token POS/lemma/dependency annotations.

        Whitespace-only tokens are dropped. Returns a dict of the form
        ``{"language": <code>, "tokens": [<token dict>, ...]}``.

        Raises:
            UnsupportedLanguageError: If *language* has no configured model.
        """
        nlp = self._get_nlp(language)
        annotated = nlp(text)
        tokens = []
        for token in annotated:
            if token.is_space:
                continue
            tokens.append(
                {
                    "text": token.text,
                    "lemma": token.lemma_,
                    "pos": token.pos_,
                    "tag": token.tag_,
                    "dep": token.dep_,
                    "is_stop": token.is_stop,
                }
            )
        return {"language": language, "tokens": tokens}
|
@ -1,31 +1,12 @@
|
||||||
from fastapi import APIRouter, Depends, HTTPException
from pydantic import BaseModel

from ...auth import verify_token
from ...outbound.spacy.spacy_client import SpacyClient, UnsupportedLanguageError

router = APIRouter(prefix="/pos", tags=["api", "pos"])

# Module-level singleton so loaded spaCy pipelines are shared across requests.
_spacy_client = SpacyClient()
||||||
class POSRequest(BaseModel):
|
class POSRequest(BaseModel):
|
||||||
|
|
@ -49,18 +30,8 @@ class POSResponse(BaseModel):
|
||||||
|
|
||||||
@router.post("/", response_model=POSResponse)
def analyze_pos(request: POSRequest, _: dict = Depends(verify_token)) -> POSResponse:
    """Tokenize the request text and return part-of-speech annotations.

    Raises:
        HTTPException: 400 when the requested language is not supported.
    """
    try:
        result = _spacy_client.get_parts_of_speech(request.text, request.language)
    except UnsupportedLanguageError as e:
        # Chain the cause so the original error survives in logs/tracebacks (B904).
        raise HTTPException(status_code=400, detail=str(e)) from e
    return POSResponse(**result)
|
|
||||||
|
|
@ -17,6 +17,7 @@ dependencies = [
|
||||||
    "google-genai>=1.0.0",
    "boto3>=1.35.0",
    "httpx>=0.28.1",
    # NOTE(review): spacy (and the *_core_*_sm models) are used by
    # api/app/outbound/spacy/spacy_client.py but are not added in this hunk —
    # confirm they are declared elsewhere in this dependency list.
    "deepgram-sdk>=6.1.0",
]
|
||||||
|
|
||||||
[build-system]
|
[build-system]
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue