Just in case, catch an occasional NLTK error and return the basic processed content...
author Trenton H <797416+stumpylog@users.noreply.github.com>
Wed, 24 May 2023 18:54:12 +0000 (11:54 -0700)
committer Trenton H <797416+stumpylog@users.noreply.github.com>
Thu, 25 May 2023 02:34:49 +0000 (19:34 -0700)
src/documents/classifier.py
src/paperless/settings.py

index 0848e01053d83f0670d44633ae4889e09df341dd..e4f92b9eaf6084b87d6ffaa0edcc788e57fd1464 100644 (file)
@@ -341,20 +341,42 @@ class DocumentClassifier:
             # set the search path for NLTK data to the single location it should be in
             nltk.data.path = [settings.NLTK_DIR]
 
-            # Do some one time setup
-            if self._stemmer is None:
-                self._stemmer = SnowballStemmer(settings.NLTK_LANGUAGE)
-            if self._stop_words is None:
-                self._stop_words = set(stopwords.words(settings.NLTK_LANGUAGE))
-
-            # Tokenize
-            words: List[str] = word_tokenize(content, language=settings.NLTK_LANGUAGE)
-            # Remove stop words
-            meaningful_words = [w for w in words if w not in self._stop_words]
-            # Stem words
-            meaningful_words = [self._stemmer.stem(w) for w in meaningful_words]
-
-            return " ".join(meaningful_words)
+            try:
+                # Preload the corpus early, to force the lazy loader to transform itself into the real corpus
+                stopwords.ensure_loaded()
+
+                # Do some one time setup
+                # Sometimes, somehow, there are multiple threads loading the corpus
+                # and it's not thread safe, raising an AttributeError
+                if self._stemmer is None:
+                    self._stemmer = SnowballStemmer(settings.NLTK_LANGUAGE)
+                if self._stop_words is None:
+                    self._stop_words = set(stopwords.words(settings.NLTK_LANGUAGE))
+
+                # Tokenize
+                # This splits the content into tokens, roughly words
+                words: List[str] = word_tokenize(
+                    content,
+                    language=settings.NLTK_LANGUAGE,
+                )
+
+                meaningful_words = []
+                for word in words:
+                    # Skip stop words
+                    # These are words like "a", "and", "the" which add little meaning
+                    if word in self._stop_words:
+                        continue
+                    # Stem the words
+                    # This reduces the words to their stems.
+                    # "amazement" returns "amaz"
+                    # "amaze" returns "amaz
+                    # "amazed" returns "amaz"
+                    meaningful_words.append(self._stemmer.stem(word))
+
+                return " ".join(meaningful_words)
+
+            except AttributeError:
+                return content
 
         return content
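
For reference, the pattern this hunk introduces reduces to a small standalone sketch. This is illustrative only, assuming the NLTK "punkt" and "stopwords" data have already been downloaded; the function name preprocess is hypothetical and not part of the commit:

    from nltk.corpus import stopwords
    from nltk.stem import SnowballStemmer
    from nltk.tokenize import word_tokenize

    def preprocess(content: str, language: str = "english") -> str:
        try:
            # Force the LazyCorpusLoader to materialize up front; concurrent
            # first access is not thread safe and can raise AttributeError
            stopwords.ensure_loaded()
            stemmer = SnowballStemmer(language)
            stop_words = set(stopwords.words(language))
            words = word_tokenize(content, language=language)
            return " ".join(stemmer.stem(w) for w in words if w not in stop_words)
        except AttributeError:
            # Fall back to the content as-is rather than failing the whole task
            return content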
 
index c3e75e402ccc86aea679a78992e0932343ea6770..d3c239b433a2bc771ddd9391062413b2fa5d48dc 100644 (file)
@@ -921,6 +921,10 @@ def _get_nltk_language_setting(ocr_lang: str) -> Optional[str]:
     languages for all the NLTK data used.
 
     Assumption: The primary language is first
+
+    NLTK Languages:
+      - https://www.nltk.org/api/nltk.stem.snowball.html#nltk.stem.snowball.SnowballStemmer
+
     """
     ocr_lang = ocr_lang.split("+")[0]
     iso_code_to_nltk = {
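
The hunk above documents where the language names come from; a hedged sketch of how such a lookup resolves an OCR language string follows. The dict below is a small illustrative subset, not the full table from settings.py, and the function name get_nltk_language is hypothetical:

    from typing import Optional

    def get_nltk_language(ocr_lang: str) -> Optional[str]:
        # Assumption from the docstring: the primary language is listed first,
        # e.g. "deu+eng" resolves via "deu"
        primary = ocr_lang.split("+")[0]
        iso_code_to_nltk = {
            "eng": "english",
            "deu": "german",
            "fra": "french",
            "spa": "spanish",
        }
        # None signals the language has no matching NLTK stemmer/stopword data
        return iso_code_to_nltk.get(primary)

    # Example: get_nltk_language("deu+eng") returns "german"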