language-learning-app/api/app/outbound/spacy/spacy_client.py

47 lines
1.3 KiB
Python
Raw Normal View History

2026-03-27 07:54:00 +00:00
import spacy
# Maps each supported language code to the name of the spaCy model that is
# loaded for it.
LANGUAGE_MODELS: dict[str, str] = dict(
    en="en_core_web_sm",
    fr="fr_core_news_sm",
    es="es_core_news_sm",
    it="it_core_news_sm",
    de="de_core_news_sm",
)
class UnsupportedLanguageError(ValueError):
    """Raised for a language code that has no entry in ``LANGUAGE_MODELS``.

    The rejected code is kept on the ``language`` attribute, and the error
    message lists the codes that are currently supported.
    """

    def __init__(self, language: str):
        message = (
            f"Unsupported language '{language}'. Supported: {list(LANGUAGE_MODELS)}"
        )
        super().__init__(message)
        self.language = language
class SpacyClient:
    """Wrapper around spaCy that loads one pipeline per language on demand."""

    def __init__(self):
        # Loaded pipelines keyed by language code; populated by _get_nlp.
        self._cache: dict[str, spacy.Language] = {}

    def _get_nlp(self, language: str) -> spacy.Language:
        """Return the pipeline for ``language``, loading it on first request.

        Raises:
            UnsupportedLanguageError: if ``language`` is not a key of
                ``LANGUAGE_MODELS``.
        """
        model_name = LANGUAGE_MODELS.get(language)
        if model_name is None:
            raise UnsupportedLanguageError(language)

        # Reuse an already-loaded pipeline instead of calling spacy.load again.
        if language in self._cache:
            return self._cache[language]

        pipeline = spacy.load(model_name)
        self._cache[language] = pipeline
        return pipeline

    def get_parts_of_speech(self, text: str, language: str) -> dict:
        """Process ``text`` with the ``language`` pipeline and describe each token.

        Returns:
            A dict holding the requested ``language`` and a ``tokens`` list.
            Each entry carries the token's ``text``, ``lemma``, ``pos``,
            ``tag``, ``dep`` and ``is_stop`` values. Tokens flagged
            ``is_space`` are left out.

        Raises:
            UnsupportedLanguageError: propagated from ``_get_nlp`` when
                ``language`` is not supported.
        """
        doc = self._get_nlp(language)(text)

        tokens = []
        for tok in doc:
            if tok.is_space:
                continue
            tokens.append(
                {
                    "text": tok.text,
                    "lemma": tok.lemma_,
                    "pos": tok.pos_,
                    "tag": tok.tag_,
                    "dep": tok.dep_,
                    "is_stop": tok.is_stop,
                }
            )

        return {"language": language, "tokens": tokens}