# language-learning-app/api/app/outbound/spacy/spacy_client.py

import spacy
# Maps ISO 639-1 language codes to the spaCy pipeline package used for that
# language. Only languages listed here are accepted by SpacyClient.
LANGUAGE_MODELS: dict[str, str] = {
    "en": "en_core_web_sm",   # English (web corpus)
    "fr": "fr_core_news_sm",  # French (news corpus)
    "es": "es_core_news_sm",  # Spanish (news corpus)
    "it": "it_core_news_sm",  # Italian (news corpus)
    "de": "de_core_news_sm",  # German (news corpus)
}
class UnsupportedLanguageError(ValueError):
    """Raised when a language code has no entry in LANGUAGE_MODELS.

    The offending code is kept on ``self.language`` so callers can
    inspect it programmatically instead of parsing the message.
    """

    def __init__(self, language: str):
        self.language = language
        message = f"Unsupported language '{language}'. Supported: {list(LANGUAGE_MODELS)}"
        super().__init__(message)
class SpacyClient:
    """Thin wrapper around spaCy that lazily loads one pipeline per language."""

    def __init__(self):
        # Loaded pipelines keyed by language code; filled on first use.
        self._cache: dict[str, spacy.Language] = {}

    def _get_nlp(self, language: str) -> spacy.Language:
        """Return the pipeline for *language*, loading and caching it on demand.

        Raises:
            UnsupportedLanguageError: if *language* is not in LANGUAGE_MODELS.
        """
        if language not in LANGUAGE_MODELS:
            raise UnsupportedLanguageError(language)
        nlp = self._cache.get(language)
        if nlp is None:
            # First request for this language: load the model and memoize it.
            nlp = spacy.load(LANGUAGE_MODELS[language])
            self._cache[language] = nlp
        return nlp

    def get_parts_of_speech(self, text: str, language: str) -> dict:
        """Use SpaCy to get parts of speech for the given text and language,
        broken down by sentences and then by tokens.

        Returns a dict of the form
        ``{"language": ..., "sentences": [{"text": ..., "tokens": [...]}]}``;
        whitespace-only tokens are skipped.
        """
        doc = self._get_nlp(language)(text)
        sentence_payloads = []
        for sentence in doc.sents:
            token_payloads = []
            for token in sentence:
                if token.is_space:
                    continue
                token_payloads.append(
                    {
                        "text": token.text,
                        "lemma": token.lemma_,
                        # Empty entity type is reported as None, not "".
                        "type": token.ent_type_ or None,
                        "pos": token.pos_,
                        "tag": token.tag_,
                        "dep": token.dep_,
                        "is_stop": token.is_stop,
                        "is_punct": token.is_punct,
                        "is_alpha": token.is_alpha,
                    }
                )
            sentence_payloads.append(
                {"text": sentence.text, "tokens": token_payloads}
            )
        return {"language": language, "sentences": sentence_payloads}