# language-learning-app/api/app/outbound/spacy/spacy_client.py

import spacy
# Maps ISO 639-1 language codes to the spaCy pipeline package used for that
# language. Only languages listed here are accepted by SpacyClient.
LANGUAGE_MODELS: dict[str, str] = {
    "en": "en_core_web_sm",   # English (web corpus)
    "fr": "fr_core_news_sm",  # French (news corpus)
    "es": "es_core_news_sm",  # Spanish (news corpus)
    "it": "it_core_news_sm",  # Italian (news corpus)
    "de": "de_core_news_sm",  # German (news corpus)
}
class UnsupportedLanguageError(ValueError):
    """Raised when a language code has no entry in LANGUAGE_MODELS.

    The offending code is kept on ``self.language`` so callers can
    inspect it programmatically instead of parsing the message.
    """

    def __init__(self, language: str):
        self.language = language
        message = f"Unsupported language '{language}'. Supported: {list(LANGUAGE_MODELS)}"
        super().__init__(message)
class SpacyClient:
    """Thin wrapper around spaCy that lazily loads one pipeline per language."""

    def __init__(self):
        # Loaded pipelines keyed by language code; filled on first use.
        self._cache: dict[str, spacy.Language] = {}

    def _get_nlp(self, language: str) -> spacy.Language:
        """Return the pipeline for *language*, loading and caching it on demand.

        Raises:
            UnsupportedLanguageError: if *language* is not in LANGUAGE_MODELS.
        """
        if language not in LANGUAGE_MODELS:
            raise UnsupportedLanguageError(language)
        nlp = self._cache.get(language)
        if nlp is None:
            # First request for this language: load the model and memoize it.
            nlp = spacy.load(LANGUAGE_MODELS[language])
            self._cache[language] = nlp
        return nlp

    def get_parts_of_speech(self, text: str, language: str) -> dict:
        """Use SpaCy to get parts of speech for the given text and language,
        broken down by sentences and then by tokens.

        Returns a dict of the form
        ``{"language": ..., "sentences": [{"text": ..., "tokens": [...]}]}``;
        whitespace-only tokens are skipped.
        """
        doc = self._get_nlp(language)(text)
        sentence_payloads = []
        for sentence in doc.sents:
            token_payloads = []
            for token in sentence:
                if token.is_space:
                    continue
                token_payloads.append(
                    {
                        "text": token.text,
                        "lemma": token.lemma_,
                        # Empty entity type is reported as None, not "".
                        "type": token.ent_type_ or None,
                        "pos": token.pos_,
                        "tag": token.tag_,
                        "dep": token.dep_,
                        "is_stop": token.is_stop,
                        "is_punct": token.is_punct,
                        "is_alpha": token.is_alpha,
                    }
                )
            sentence_payloads.append(
                {"text": sentence.text, "tokens": token_payloads}
            )
        return {"language": language, "sentences": sentence_payloads}