2026-03-27 07:54:00 +00:00
|
|
|
import spacy
|
|
|
|
|
|
|
|
|
|
# Maps ISO 639-1 language codes to the spaCy pipeline package to load for
# that language. Only languages listed here are supported; anything else
# triggers UnsupportedLanguageError. All entries use the small ("sm")
# models, trading accuracy for download size and load time.
LANGUAGE_MODELS: dict[str, str] = {
    "en": "en_core_web_sm",
    "fr": "fr_core_news_sm",
    "es": "es_core_news_sm",
    "it": "it_core_news_sm",
    "de": "de_core_news_sm",
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class UnsupportedLanguageError(ValueError):
    """Raised when a language code has no entry in LANGUAGE_MODELS.

    The offending code is kept on ``self.language`` so callers can report
    or log it without parsing the message string.
    """

    def __init__(self, language: str):
        self.language = language
        supported = list(LANGUAGE_MODELS)
        message = f"Unsupported language '{language}'. Supported: {supported}"
        super().__init__(message)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class SpacyClient:
    """Lazily loads spaCy pipelines per language and exposes POS tagging.

    Loaded pipelines are cached on the instance so each model is read from
    disk at most once per client.
    """

    def __init__(self):
        # Cache of loaded pipelines, keyed by language code.
        self._cache: dict[str, spacy.Language] = {}

    def _get_nlp(self, language: str) -> spacy.Language:
        """Return the (cached) pipeline for *language*, loading it on first use.

        Raises:
            UnsupportedLanguageError: if *language* is not in LANGUAGE_MODELS.
        """
        if language not in LANGUAGE_MODELS:
            raise UnsupportedLanguageError(language)
        if language not in self._cache:
            nlp = spacy.load(LANGUAGE_MODELS[language])
            # Recognise line-breaks as always being sentence boundaries, even
            # if the model doesn't. This is important for the frontend to be
            # able to show line-breaks in the source text.
            #
            # BUGFIX: the sentencizer is added exactly once, at load time.
            # Previously get_parts_of_speech() called add_pipe on every
            # request, and spaCy raises ValueError when a component named
            # "sentencizer" already exists in the pipeline — so the second
            # call for any language crashed. The pipe_names guard also keeps
            # this safe if a model ever ships with its own sentencizer.
            if "sentencizer" not in nlp.pipe_names:
                nlp.add_pipe("sentencizer", before="parser")
            self._cache[language] = nlp
        return self._cache[language]

    def get_parts_of_speech(self, text: str, language: str) -> dict:
        """Use SpaCy to get parts of speech for the given text and language,
        broken down by sentences and then by tokens."""
        nlp = self._get_nlp(language)
        doc = nlp(text)
        sentences = [
            {
                "text": sent.text,
                "tokens": [
                    {
                        "text": token.text,
                        "lemma": token.lemma_,
                        # Named-entity label, or None when the token is not
                        # part of a recognised entity (ent_type_ is "").
                        "type": token.ent_type_ if token.ent_type_ else None,
                        "pos": token.pos_,
                        "tag": token.tag_,
                        "dep": token.dep_,
                        "is_stop": token.is_stop,
                        "is_punct": token.is_punct,
                        "is_alpha": token.is_alpha,
                    }
                    # Whitespace-only tokens (e.g. the line-break tokens that
                    # delimit sentences) carry no linguistic information.
                    for token in sent
                    if not token.is_space
                ],
            }
            for sent in doc.sents
        ]
        return {"language": language, "sentences": sentences}
|