2026-03-27 07:54:00 +00:00
|
|
|
import spacy
|
|
|
|
|
|
|
|
|
|
# Maps ISO 639-1 language codes to the spaCy pipeline package to load for
# that language. Only languages listed here are supported; anything else
# triggers UnsupportedLanguageError. All entries use the small ("sm")
# models, trading accuracy for download size and load time.
LANGUAGE_MODELS: dict[str, str] = {
    "en": "en_core_web_sm",
    "fr": "fr_core_news_sm",
    "es": "es_core_news_sm",
    "it": "it_core_news_sm",
    "de": "de_core_news_sm",
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class UnsupportedLanguageError(ValueError):
    """Raised when a language code has no entry in LANGUAGE_MODELS.

    The offending code is kept on ``self.language`` so callers can report
    or log it without parsing the message string.
    """

    def __init__(self, language: str):
        self.language = language
        supported = list(LANGUAGE_MODELS)
        message = f"Unsupported language '{language}'. Supported: {supported}"
        super().__init__(message)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class SpacyClient:
    """Lazily loads spaCy pipelines per language and exposes POS tagging.

    Loaded pipelines are cached on the instance so each model is read from
    disk at most once per client.
    """

    def __init__(self):
        # Cache of loaded pipelines, keyed by language code.
        self._cache: dict[str, spacy.Language] = {}

    def _get_nlp(self, language: str) -> spacy.Language:
        """Return the (cached) pipeline for *language*, loading it on first use.

        Raises:
            UnsupportedLanguageError: if *language* is not in LANGUAGE_MODELS.
        """
        if language not in LANGUAGE_MODELS:
            raise UnsupportedLanguageError(language)
        if language not in self._cache:
            nlp = spacy.load(LANGUAGE_MODELS[language])
            # Recognise line-breaks as always being sentence boundaries, even
            # if the model doesn't. This is important for the frontend to be
            # able to show line-breaks in the source text.
            #
            # BUGFIX: the sentencizer is added exactly once, at load time.
            # Previously get_parts_of_speech() called add_pipe on every
            # request, and spaCy raises ValueError when a component named
            # "sentencizer" already exists in the pipeline — so the second
            # call for any language crashed. The pipe_names guard also keeps
            # this safe if a model ever ships with its own sentencizer.
            if "sentencizer" not in nlp.pipe_names:
                nlp.add_pipe("sentencizer", before="parser")
            self._cache[language] = nlp
        return self._cache[language]

    def get_parts_of_speech(self, text: str, language: str) -> dict:
        """Use SpaCy to get parts of speech for the given text and language,
        broken down by sentences and then by tokens."""
        nlp = self._get_nlp(language)
        doc = nlp(text)
        sentences = [
            {
                "text": sent.text,
                "tokens": [
                    {
                        "text": token.text,
                        "lemma": token.lemma_,
                        # Named-entity label, or None when the token is not
                        # part of a recognised entity (ent_type_ is "").
                        "type": token.ent_type_ if token.ent_type_ else None,
                        "pos": token.pos_,
                        "tag": token.tag_,
                        "dep": token.dep_,
                        "is_stop": token.is_stop,
                        "is_punct": token.is_punct,
                        "is_alpha": token.is_alpha,
                    }
                    # Whitespace-only tokens (e.g. the line-break tokens that
                    # delimit sentences) carry no linguistic information.
                    for token in sent
                    if not token.is_space
                ],
            }
            for sent in doc.sents
        ]
        return {"language": language, "sentences": sentences}
|