import spacy

# Mapping of ISO 639-1 language codes to the spaCy pipeline package that
# serves them. Only these languages are supported.
LANGUAGE_MODELS: dict[str, str] = {
    "en": "en_core_web_sm",
    "fr": "fr_core_news_sm",
    "es": "es_core_news_sm",
    "it": "it_core_news_sm",
    "de": "de_core_news_sm",
}


class UnsupportedLanguageError(ValueError):
    """Raised when a language code has no configured spaCy model."""

    def __init__(self, language: str):
        self.language = language
        super().__init__(
            f"Unsupported language '{language}'. Supported: {list(LANGUAGE_MODELS)}"
        )


class SpacyClient:
    """Lazily loads and caches spaCy pipelines, one per supported language."""

    def __init__(self):
        # language code -> loaded pipeline; populated on first use per language
        self._cache: dict[str, spacy.Language] = {}

    def _get_nlp(self, language: str) -> spacy.Language:
        """Return the pipeline for *language*, loading and caching it on first use.

        Raises:
            UnsupportedLanguageError: if *language* is not in LANGUAGE_MODELS.
        """
        if language not in LANGUAGE_MODELS:
            raise UnsupportedLanguageError(language)
        if language not in self._cache:
            # spacy.load reads the model package from disk, which is expensive,
            # so each model is loaded at most once per client instance.
            self._cache[language] = spacy.load(LANGUAGE_MODELS[language])
        return self._cache[language]

    def get_parts_of_speech(self, text: str, language: str) -> dict:
        """Use SpaCy to get parts of speech for the given text and language,
        broken down by sentences and then by tokens.

        Returns a dict of the form::

            {"language": <code>, "sentences": [{"text": ..., "tokens": [...]}]}

        where each token entry carries lemma, POS tag, dependency label,
        named-entity type (or None), and is_stop/is_punct/is_alpha flags.
        Whitespace-only tokens are omitted.

        Raises:
            UnsupportedLanguageError: if *language* is not supported.
        """
        nlp = self._get_nlp(language)
        doc = nlp(text)
        sentences = [
            {
                "text": sent.text,
                "tokens": [
                    {
                        "text": token.text,
                        "lemma": token.lemma_,
                        # ent_type_ is "" for non-entity tokens; normalize to None
                        "type": token.ent_type_ if token.ent_type_ else None,
                        "pos": token.pos_,
                        "tag": token.tag_,
                        "dep": token.dep_,
                        "is_stop": token.is_stop,
                        "is_punct": token.is_punct,
                        "is_alpha": token.is_alpha,
                    }
                    for token in sent
                    if not token.is_space
                ],
            }
            for sent in doc.sents
        ]
        return {"language": language, "sentences": sentences}