From: Trenton H <797416+stumpylog@users.noreply.github.com>
Date: Wed, 24 May 2023 18:54:12 +0000 (-0700)
Subject: Just in case, catch a sometimes nltk error and return the basic processed content...
X-Git-Tag: v1.15.0~1^2~19
X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=c1641f6fb8a1cbe24d96de432a64217bec4a4936;p=thirdparty%2Fpaperless-ngx.git

Just in case, catch a sometimes nltk error and return the basic processed content instead
---

diff --git a/src/documents/classifier.py b/src/documents/classifier.py
index 0848e01053..e4f92b9eaf 100644
--- a/src/documents/classifier.py
+++ b/src/documents/classifier.py
@@ -341,20 +341,42 @@ class DocumentClassifier:
         # set the search path for NLTK data to the single location it should be in
         nltk.data.path = [settings.NLTK_DIR]
 
-        # Do some one time setup
-        if self._stemmer is None:
-            self._stemmer = SnowballStemmer(settings.NLTK_LANGUAGE)
-        if self._stop_words is None:
-            self._stop_words = set(stopwords.words(settings.NLTK_LANGUAGE))
-
-        # Tokenize
-        words: List[str] = word_tokenize(content, language=settings.NLTK_LANGUAGE)
-        # Remove stop words
-        meaningful_words = [w for w in words if w not in self._stop_words]
-        # Stem words
-        meaningful_words = [self._stemmer.stem(w) for w in meaningful_words]
-
-        return " ".join(meaningful_words)
+        try:
+            # Preload the corpus early, to force the lazy loader to transform
+            stopwords.ensure_loaded()
+
+            # Do some one time setup
+            # Sometimes, somehow, there's multiple threads loading the corpus
+            # and it's not thread safe, raising an AttributeError
+            if self._stemmer is None:
+                self._stemmer = SnowballStemmer(settings.NLTK_LANGUAGE)
+            if self._stop_words is None:
+                self._stop_words = set(stopwords.words(settings.NLTK_LANGUAGE))
+
+            # Tokenize
+            # This splits the content into tokens, roughly words
+            words: List[str] = word_tokenize(
+                content,
+                language=settings.NLTK_LANGUAGE,
+            )
+
+            meaningful_words = []
+            for word in words:
+                # Skip stop words
+                # These are words like "a", "and", "the" which add little meaning
+                if word in self._stop_words:
+                    continue
+                # Stem the words
+                # This reduces the words to their stems.
+                # "amazement" returns "amaz"
+                # "amaze" returns "amaz"
+                # "amazed" returns "amaz"
+                meaningful_words.append(self._stemmer.stem(word))
+
+            return " ".join(meaningful_words)
+
+        except AttributeError:
+            return content
 
         return content

diff --git a/src/paperless/settings.py b/src/paperless/settings.py
index c3e75e402c..d3c239b433 100644
--- a/src/paperless/settings.py
+++ b/src/paperless/settings.py
@@ -921,6 +921,10 @@ def _get_nltk_language_setting(ocr_lang: str) -> Optional[str]:
     languages for all the NLTK data used.
 
     Assumption: The primary language is first
+
+    NLTK Languages:
+    - https://www.nltk.org/api/nltk.stem.snowball.html#nltk.stem.snowball.SnowballStemmer
+
     """
     ocr_lang = ocr_lang.split("+")[0]
     iso_code_to_nltk = {