From: Trenton H <797416+stumpylog@users.noreply.github.com>
Date: Wed, 24 May 2023 18:54:12 +0000 (-0700)
Subject: Just in case, catch a sometimes nltk error and return the basic processed content...
X-Git-Tag: v1.15.0~1^2~19
X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=c1641f6fb8a1cbe24d96de432a64217bec4a4936;p=thirdparty%2Fpaperless-ngx.git

Just in case, catch a sometimes nltk error and return the basic processed content instead
---

diff --git a/src/documents/classifier.py b/src/documents/classifier.py
index 0848e01053..e4f92b9eaf 100644
--- a/src/documents/classifier.py
+++ b/src/documents/classifier.py
@@ -341,20 +341,42 @@ class DocumentClassifier:
         # set the search path for NLTK data to the single location it should be in
         nltk.data.path = [settings.NLTK_DIR]
 
-        # Do some one time setup
-        if self._stemmer is None:
-            self._stemmer = SnowballStemmer(settings.NLTK_LANGUAGE)
-        if self._stop_words is None:
-            self._stop_words = set(stopwords.words(settings.NLTK_LANGUAGE))
-
-        # Tokenize
-        words: List[str] = word_tokenize(content, language=settings.NLTK_LANGUAGE)
-        # Remove stop words
-        meaningful_words = [w for w in words if w not in self._stop_words]
-        # Stem words
-        meaningful_words = [self._stemmer.stem(w) for w in meaningful_words]
-
-        return " ".join(meaningful_words)
+        try:
+            # Preload the corpus early, to force the lazy loader to transform
+            stopwords.ensure_loaded()
+
+            # Do some one time setup
+            # Sometimes, somehow, there's multiple threads loading the corpus
+            # and it's not thread safe, raising an AttributeError
+            if self._stemmer is None:
+                self._stemmer = SnowballStemmer(settings.NLTK_LANGUAGE)
+            if self._stop_words is None:
+                self._stop_words = set(stopwords.words(settings.NLTK_LANGUAGE))
+
+            # Tokenize
+            # This splits the content into tokens, roughly words
+            words: List[str] = word_tokenize(
+                content,
+                language=settings.NLTK_LANGUAGE,
+            )
+
+            meaningful_words = []
+            for word in words:
+                # Skip stop words
+                # These are words like "a", "and", "the" which add little meaning
+                if word in self._stop_words:
+                    continue
+                # Stem the words
+                # This reduces the words to their stems.
+                # "amazement" returns "amaz"
+                # "amaze" returns "amaz"
+                # "amazed" returns "amaz"
+                meaningful_words.append(self._stemmer.stem(word))
+
+            return " ".join(meaningful_words)
+
+        except AttributeError:
+            return content
 
         return content

diff --git a/src/paperless/settings.py b/src/paperless/settings.py
index c3e75e402c..d3c239b433 100644
--- a/src/paperless/settings.py
+++ b/src/paperless/settings.py
@@ -921,6 +921,10 @@ def _get_nltk_language_setting(ocr_lang: str) -> Optional[str]:
     languages for all the NLTK data used.
 
     Assumption: The primary language is first
+
+    NLTK Languages:
+    - https://www.nltk.org/api/nltk.stem.snowball.html#nltk.stem.snowball.SnowballStemmer
+
     """
     ocr_lang = ocr_lang.split("+")[0]
     iso_code_to_nltk = {