Performance: Classifier performance optimizations (#10363)
author Antoine Mérino <antoine.merino.dev@gmail.com>
Wed, 6 Aug 2025 20:00:11 +0000 (22:00 +0200)
committer GitHub <noreply@github.com>
Wed, 6 Aug 2025 20:00:11 +0000 (16:00 -0400)
.github/workflows/ci.yml
.pre-commit-config.yaml
src/documents/caching.py
src/documents/classifier.py
src/documents/tests/samples/content.txt [new file with mode: 0644]
src/documents/tests/samples/preprocessed_content.txt [new file with mode: 0644]
src/documents/tests/samples/preprocessed_content_advanced.txt [new file with mode: 0644]
src/documents/tests/test_caching.py [new file with mode: 0644]
src/documents/tests/test_classifier.py

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index d0e070e15c08e1e9fea896546386d2c2937656bb..e042683deadb0a43f92e42d851e0df5575b550bc 100644
@@ -15,6 +15,7 @@ env:
   DEFAULT_UV_VERSION: "0.8.x"
   # This is the default version of Python to use in most steps which aren't specific
   DEFAULT_PYTHON_VERSION: "3.11"
+  NLTK_DATA: "/usr/share/nltk_data"
 jobs:
   pre-commit:
     # We want to run on external PRs, but not on our own internal PRs as they'll be run
@@ -121,8 +122,11 @@ jobs:
       - name: List installed Python dependencies
         run: |
           uv pip list
+      - name: Install or update NLTK dependencies
+        run: uv run python -m nltk.downloader punkt punkt_tab snowball_data stopwords -d ${{ env.NLTK_DATA }}
       - name: Tests
         env:
+          NLTK_DATA: ${{ env.NLTK_DATA }}
           PAPERLESS_CI_TEST: 1
           # Enable paperless_mail testing against real server
           PAPERLESS_MAIL_TEST_HOST: ${{ secrets.TEST_MAIL_HOST }}
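
A quick way to check that the corpora downloaded by the CI step above are actually visible to NLTK — a hedged sketch, assuming the same /usr/share/nltk_data location as the NLTK_DATA variable:

    import nltk

    # Point NLTK at the same directory the CI step downloads into.
    nltk.data.path = ["/usr/share/nltk_data"]

    # find() raises LookupError if a resource is missing.
    nltk.data.find("tokenizers/punkt")
    nltk.data.find("corpora/stopwords")
    print("NLTK data is available")
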
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 64a18318665c83d37aef2335de8e3b526966d06c..2c50e81a6f1bde2a8ba55b7232b20491f8281d52 100644
@@ -31,7 +31,7 @@ repos:
     rev: v2.4.1
     hooks:
       - id: codespell
-        exclude: "(^src-ui/src/locale/)|(^src-ui/pnpm-lock.yaml)|(^src-ui/e2e/)|(^src/paperless_mail/tests/samples/)"
+        exclude: "(^src-ui/src/locale/)|(^src-ui/pnpm-lock.yaml)|(^src-ui/e2e/)|(^src/paperless_mail/tests/samples/)|(^src/documents/tests/samples/)"
         exclude_types:
           - pofile
           - json
diff --git a/src/documents/caching.py b/src/documents/caching.py
index 1099a7a73d976176d58e54a85b5f16c38cae86e6..ed7f6dbc17f02f7c87d112584da1b27c5b174256 100644
@@ -1,16 +1,23 @@
 from __future__ import annotations
 
 import logging
+import pickle
 from binascii import hexlify
+from collections import OrderedDict
 from dataclasses import dataclass
 from typing import TYPE_CHECKING
+from typing import Any
 from typing import Final
 
+from django.conf import settings
 from django.core.cache import cache
+from django.core.cache import caches
 
 from documents.models import Document
 
 if TYPE_CHECKING:
+    from django.core.cache.backends.base import BaseCache
+
     from documents.classifier import DocumentClassifier
 
 logger = logging.getLogger("paperless.caching")
@@ -39,6 +46,80 @@ CACHE_1_MINUTE: Final[int] = 60
 CACHE_5_MINUTES: Final[int] = 5 * CACHE_1_MINUTE
 CACHE_50_MINUTES: Final[int] = 50 * CACHE_1_MINUTE
 
+read_cache = caches["read-cache"]
+
+
+class LRUCache:
+    def __init__(self, capacity: int = 128):
+        self._data = OrderedDict()
+        self.capacity = capacity
+
+    def get(self, key, default=None) -> Any | None:
+        if key in self._data:
+            self._data.move_to_end(key)
+            return self._data[key]
+        return default
+
+    def set(self, key, value) -> None:
+        self._data[key] = value
+        self._data.move_to_end(key)
+        while len(self._data) > self.capacity:
+            self._data.popitem(last=False)
+
+
+class StoredLRUCache(LRUCache):
+    """
+    LRU cache that can persist its entire contents as a single entry in a backend cache.
+
+    Useful for sharing a cache across multiple workers or processes.
+
+    Workflow:
+        1. Load the cache state from the backend using `load()`.
+        2. Use `get()` and `set()` locally as usual.
+        3. Persist changes back to the backend using `save()`.
+    """
+
+    def __init__(
+        self,
+        backend_key: str,
+        capacity: int = 128,
+        backend: BaseCache = read_cache,
+        backend_ttl=settings.CACHALOT_TIMEOUT,
+    ):
+        if backend_key is None:
+            raise ValueError("backend_key is mandatory")
+        super().__init__(capacity)
+        self._backend_key = backend_key
+        self._backend = backend
+        self.backend_ttl = backend_ttl
+
+    def load(self) -> None:
+        """
+        Load the whole cache content from backend storage.
+
+        If nothing is stored in the backend, the local cache is reset to empty.
+        """
+        serialized_data = self._backend.get(self._backend_key)
+        try:
+            self._data = (
+                pickle.loads(serialized_data) if serialized_data else OrderedDict()
+            )
+        except pickle.PickleError:
+            logger.warning(
+                "Cache exists in backend but could not be read (possibly invalid format)",
+            )
+
+    def save(self) -> None:
+        """Save the entire local cache to the backend as a serialized object.
+
+        The backend entry will expire after the configured TTL.
+        """
+        self._backend.set(
+            self._backend_key,
+            pickle.dumps(self._data),
+            self.backend_ttl,
+        )
+
 
 def get_suggestion_cache_key(document_id: int) -> str:
     """
diff --git a/src/documents/classifier.py b/src/documents/classifier.py
index 728c8322898377c1b319ff6d6aa0d5258dd52720..613c1d5adc8f39a97cc70f2013a73981860edf52 100644
@@ -16,16 +16,29 @@ if TYPE_CHECKING:
 
 from django.conf import settings
 from django.core.cache import cache
+from django.core.cache import caches
 
+from documents.caching import CACHE_5_MINUTES
 from documents.caching import CACHE_50_MINUTES
 from documents.caching import CLASSIFIER_HASH_KEY
 from documents.caching import CLASSIFIER_MODIFIED_KEY
 from documents.caching import CLASSIFIER_VERSION_KEY
+from documents.caching import StoredLRUCache
 from documents.models import Document
 from documents.models import MatchingModel
 
 logger = logging.getLogger("paperless.classifier")
 
+ADVANCED_TEXT_PROCESSING_ENABLED = (
+    settings.NLTK_LANGUAGE is not None and settings.NLTK_ENABLED
+)
+
+read_cache = caches["read-cache"]
+
+
+RE_DIGIT = re.compile(r"\d")
+RE_WORD = re.compile(r"\b[\w]+\b")  # words that may contain digits
+
 
 class IncompatibleClassifierVersionError(Exception):
     def __init__(self, message: str, *args: object) -> None:
@@ -92,15 +105,28 @@ class DocumentClassifier:
         self.last_auto_type_hash: bytes | None = None
 
         self.data_vectorizer = None
+        self.data_vectorizer_hash = None
         self.tags_binarizer = None
         self.tags_classifier = None
         self.correspondent_classifier = None
         self.document_type_classifier = None
         self.storage_path_classifier = None
-
         self._stemmer = None
+        # 10,000 elements use roughly 200 to 500 KB per worker,
+        # and about the same again in the shared Redis cache.
+        # Keep this cache small to minimize lookup and I/O latency.
+        if ADVANCED_TEXT_PROCESSING_ENABLED:
+            self._stem_cache = StoredLRUCache(
+                f"stem_cache_v{self.FORMAT_VERSION}",
+                capacity=10000,
+            )
         self._stop_words = None
 
+    def _update_data_vectorizer_hash(self):
+        self.data_vectorizer_hash = sha256(
+            pickle.dumps(self.data_vectorizer),
+        ).hexdigest()
+
     def load(self) -> None:
         from sklearn.exceptions import InconsistentVersionWarning
 
@@ -119,6 +145,7 @@ class DocumentClassifier:
                         self.last_auto_type_hash = pickle.load(f)
 
                         self.data_vectorizer = pickle.load(f)
+                        self._update_data_vectorizer_hash()
                         self.tags_binarizer = pickle.load(f)
 
                         self.tags_classifier = pickle.load(f)
@@ -269,7 +296,7 @@ class DocumentClassifier:
             Generates the content for documents, one at a time
             """
             for doc in docs_queryset:
-                yield self.preprocess_content(doc.content)
+                yield self.preprocess_content(doc.content, shared_cache=False)
 
         self.data_vectorizer = CountVectorizer(
             analyzer="word",
@@ -347,6 +374,7 @@ class DocumentClassifier:
 
         self.last_doc_change_time = latest_doc_change
         self.last_auto_type_hash = hasher.digest()
+        self._update_data_vectorizer_hash()
 
         # Set the classifier information into the cache
         # Caching for 50 minutes, so slightly less than the normal retrain time
@@ -356,30 +384,15 @@ class DocumentClassifier:
 
         return True
 
-    def preprocess_content(self, content: str) -> str:  # pragma: no cover
-        """
-        Process to contents of a document, distilling it down into
-        words which are meaningful to the content
-        """
-
-        # Lower case the document
-        content = content.lower().strip()
-        # Reduce spaces
-        content = re.sub(r"\s+", " ", content)
-        # Get only the letters
-        content = re.sub(r"[^\w\s]", " ", content)
-
-        # If the NLTK language is supported, do further processing
-        if settings.NLTK_LANGUAGE is not None and settings.NLTK_ENABLED:
+    def _init_advanced_text_processing(self):
+        if self._stop_words is None or self._stemmer is None:
             import nltk
             from nltk.corpus import stopwords
             from nltk.stem import SnowballStemmer
-            from nltk.tokenize import word_tokenize
 
             # Not really hacky, since it isn't private and is documented, but
             # set the search path for NLTK data to the single location it should be in
             nltk.data.path = [settings.NLTK_DIR]
-
             try:
                 # Preload the corpus early, to force the lazy loader to transform
                 stopwords.ensure_loaded()
@@ -387,41 +400,100 @@ class DocumentClassifier:
                 # Do some one time setup
                 # Sometimes, somehow, there's multiple threads loading the corpus
                 # and it's not thread safe, raising an AttributeError
-                if self._stemmer is None:
-                    self._stemmer = SnowballStemmer(settings.NLTK_LANGUAGE)
-                if self._stop_words is None:
-                    self._stop_words = set(stopwords.words(settings.NLTK_LANGUAGE))
-
-                # Tokenize
-                # This splits the content into tokens, roughly words
-                words: list[str] = word_tokenize(
-                    content,
-                    language=settings.NLTK_LANGUAGE,
-                )
-
-                meaningful_words = []
-                for word in words:
-                    # Skip stop words
-                    # These are words like "a", "and", "the" which add little meaning
-                    if word in self._stop_words:
-                        continue
-                    # Stem the words
-                    # This reduces the words to their stems.
-                    # "amazement" returns "amaz"
-                    # "amaze" returns "amaz
-                    # "amazed" returns "amaz"
-                    meaningful_words.append(self._stemmer.stem(word))
-
-                return " ".join(meaningful_words)
-
+                self._stemmer = SnowballStemmer(settings.NLTK_LANGUAGE)
+                self._stop_words = frozenset(stopwords.words(settings.NLTK_LANGUAGE))
             except AttributeError:
+                logger.debug("Could not initialize NLTK for advanced text processing.")
+                return False
+        return True
+
+    def stem_and_skip_stop_words(self, words: list[str], *, shared_cache=True):
+        """
+        Reduce a list of words to their stems. Stop words are converted to empty strings.
+        :param words: the list of words to stem
+        :param shared_cache: if True, load the stem cache from the shared backend before
+            stemming and persist it back afterwards
+        """
+
+        def _stem_and_skip_stop_word(word: str):
+            """
+            Reduce a given word to its stem. If it's a stop word, return an empty string.
+            E.g. "amazement", "amaze" and "amazed" all return "amaz".
+            """
+            cached = self._stem_cache.get(word)
+            if cached is not None:
+                return cached
+            elif word in self._stop_words:
+                return ""
+            # Assumption: words that contain numbers are never stemmed
+            elif RE_DIGIT.search(word):
+                return word
+            else:
+                result = self._stemmer.stem(word)
+                self._stem_cache.set(word, result)
+                return result
+
+        if shared_cache:
+            self._stem_cache.load()
+
+        # Stem the words and skip stop words
+        result = " ".join(
+            filter(None, (_stem_and_skip_stop_word(w) for w in words)),
+        )
+        if shared_cache:
+            self._stem_cache.save()
+        return result
+
+    def preprocess_content(
+        self,
+        content: str,
+        *,
+        shared_cache=True,
+    ) -> str:
+        """
+        Process the contents of a document, distilling it down into
+        words which are meaningful to the content.
+
+        The stemmer cache can be shared across workers via the "shared_cache" parameter.
+        Sharing it is unnecessary when training the classifier.
+        """
+
+        # Lower-case the document, collapse whitespace,
+        # and keep only letters and digits.
+        content = " ".join(match.group().lower() for match in RE_WORD.finditer(content))
+
+        if ADVANCED_TEXT_PROCESSING_ENABLED:
+            from nltk.tokenize import word_tokenize
+
+            if not self._init_advanced_text_processing():
                 return content
+            # Tokenize
+            # This splits the content into tokens, roughly words
+            words = word_tokenize(content, language=settings.NLTK_LANGUAGE)
+            # Stem the words and skip stop words
+            content = self.stem_and_skip_stop_words(words, shared_cache=shared_cache)
 
         return content
 
+    def _get_vectorizer_cache_key(self, content: str):
+        hash = sha256(content.encode())
+        hash.update(
+            f"|{self.FORMAT_VERSION}|{settings.NLTK_LANGUAGE}|{settings.NLTK_ENABLED}|{self.data_vectorizer_hash}".encode(),
+        )
+        return f"vectorized_content_{hash.hexdigest()}"
+
+    def _vectorize(self, content: str):
+        key = self._get_vectorizer_cache_key(content)
+        serialized_result = read_cache.get(key)
+        if serialized_result is None:
+            result = self.data_vectorizer.transform([self.preprocess_content(content)])
+            read_cache.set(key, pickle.dumps(result), CACHE_5_MINUTES)
+        else:
+            read_cache.touch(key, CACHE_5_MINUTES)
+            result = pickle.loads(serialized_result)
+        return result
+
     def predict_correspondent(self, content: str) -> int | None:
         if self.correspondent_classifier:
-            X = self.data_vectorizer.transform([self.preprocess_content(content)])
+            X = self._vectorize(content)
             correspondent_id = self.correspondent_classifier.predict(X)
             if correspondent_id != -1:
                 return correspondent_id
@@ -432,7 +504,7 @@ class DocumentClassifier:
 
     def predict_document_type(self, content: str) -> int | None:
         if self.document_type_classifier:
-            X = self.data_vectorizer.transform([self.preprocess_content(content)])
+            X = self._vectorize(content)
             document_type_id = self.document_type_classifier.predict(X)
             if document_type_id != -1:
                 return document_type_id
@@ -445,7 +517,7 @@ class DocumentClassifier:
         from sklearn.utils.multiclass import type_of_target
 
         if self.tags_classifier:
-            X = self.data_vectorizer.transform([self.preprocess_content(content)])
+            X = self._vectorize(content)
             y = self.tags_classifier.predict(X)
             tags_ids = self.tags_binarizer.inverse_transform(y)[0]
             if type_of_target(y).startswith("multilabel"):
@@ -464,7 +536,7 @@ class DocumentClassifier:
 
     def predict_storage_path(self, content: str) -> int | None:
         if self.storage_path_classifier:
-            X = self.data_vectorizer.transform([self.preprocess_content(content)])
+            X = self._vectorize(content)
             storage_path_id = self.storage_path_classifier.predict(X)
             if storage_path_id != -1:
                 return storage_path_id
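
For context, the new prediction path is a cache-aside pattern: _vectorize() first looks for an already-vectorized representation in the shared read cache, keyed by a hash that mixes the content with the vectorizer hash and the NLTK settings, and only recomputes on a miss. A rough standalone sketch of the same pattern — illustrative names, assuming a Django-style cache backend with get/set/touch:

    import hashlib
    import pickle

    def cached_transform(cache, vectorizer, vectorizer_hash: str, content: str, ttl: int = 300):
        # The key depends on both the content and the fitted vectorizer,
        # so retraining the model naturally invalidates old entries.
        digest = hashlib.sha256(content.encode())
        digest.update(f"|{vectorizer_hash}".encode())
        key = f"vectorized_content_{digest.hexdigest()}"

        serialized = cache.get(key)
        if serialized is None:
            # (paperless-ngx preprocesses the content before transforming it)
            result = vectorizer.transform([content])
            cache.set(key, pickle.dumps(result), ttl)
            return result
        cache.touch(key, ttl)  # refresh the TTL on a hit
        return pickle.loads(serialized)
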
diff --git a/src/documents/tests/samples/content.txt b/src/documents/tests/samples/content.txt
new file mode 100644
index 0000000..0766642
--- /dev/null
@@ -0,0 +1,34 @@
+Sample textual document content.
+Include as many characters as possible, to check the classifier's vectorization.
+
+Hey 00, this is "a" test0707 content.
+This is an example document — created on 2025-06-25.
+
+Digits: 0123456789
+Punctuation: . , ; : ! ? ' " ( ) [ ] { } — – …
+English text: The quick brown fox jumps over the lazy dog.
+English stop words: We’ve been doing it before.
+Accented Latin (diacritics): àâäæçéèêëîïôœùûüÿñ
+Arabic: لقد قام المترجم بعمل جيد
+Greek: Αλφα, Βήτα, Γάμμα, Δέλτα, Ωμέγα
+Cyrillic: Привет, как дела? Добро пожаловать!
+Chinese (Simplified): 你好,世界!今天的天气很好。
+Chinese (Traditional): 歡迎來到世界,今天天氣很好。
+Japanese (Kanji, Hiragana, Katakana): 東京へ行きます。カタカナ、ひらがな、漢字。
+Korean (Hangul): 안녕하세요. 오늘 날씨 어때요?
+Arabic: مرحبًا، كيف حالك؟
+Hebrew: שלום, מה שלומך?
+Emoji: 😀 🐍 📘 ✅ ©️ 🇺🇳
+Symbols: © ® ™ § ¶ † ‡ ∞ µ ∑ ∆ √
+Math: ∫₀^∞ x² dx = ∞, π ≈ 3.14159, ∇·E = ρ/ε₀
+Currency: 1$ € ¥ £ ₹
+Date formats: 25/06/2025, June 25, 2025, 2025年6月25日
+Quote in French: « Bonjour, ça va ? »
+Quote in German: „Guten Tag! Wie geht's?“
+Newline test:
+\r\n
+\r
+
+Tab\ttest\tspacing
+/ = +) ( []) ~ * #192 +33601010101 § ¤
+End of document.
diff --git a/src/documents/tests/samples/preprocessed_content.txt b/src/documents/tests/samples/preprocessed_content.txt
new file mode 100644
index 0000000..aac6beb
--- /dev/null
@@ -0,0 +1 @@
+sample textual document content include as many characters as possible to check the classifier s vectorization hey 00 this is a test0707 content this is an example document created on 2025 06 25 digits 0123456789 punctuation english text the quick brown fox jumps over the lazy dog english stop words we ve been doing it before accented latin diacritics àâäæçéèêëîïôœùûüÿñ arabic لقد قام المترجم بعمل جيد greek αλφα βήτα γάμμα δέλτα ωμέγα cyrillic привет как дела добро пожаловать chinese simplified 你好 世界 今天的天气很好 chinese traditional 歡迎來到世界 今天天氣很好 japanese kanji hiragana katakana 東京へ行きます カタカナ ひらがな 漢字 korean hangul 안녕하세요 오늘 날씨 어때요 arabic مرحب ا كيف حالك hebrew שלום מה שלומך emoji symbols µ math ₀ x² dx π 3 14159 e ρ ε₀ currency 1 date formats 25 06 2025 june 25 2025 2025年6月25日 quote in french bonjour ça va quote in german guten tag wie geht s newline test r n r tab ttest tspacing 192 33601010101 end of document
diff --git a/src/documents/tests/samples/preprocessed_content_advanced.txt b/src/documents/tests/samples/preprocessed_content_advanced.txt
new file mode 100644
index 0000000..c67627c
--- /dev/null
@@ -0,0 +1 @@
+sampl textual document content includ mani charact possibl check classifi vector hey 00 test0707 content exampl document creat 2025 06 25 digit 0123456789 punctuat english text quick brown fox jump lazi dog english stop word accent latin diacrit àâäæçéèêëîïôœùûüÿñ arab لقد قام المترجم بعمل جيد greek αλφα βήτα γάμμα δέλτα ωμέγα cyril привет как дела добро пожаловать chines simplifi 你好 世界 今天的天气很好 chines tradit 歡迎來到世界 今天天氣很好 japanes kanji hiragana katakana 東京へ行きます カタカナ ひらがな 漢字 korean hangul 안녕하세요 오늘 날씨 어때요 arab مرحب ا كيف حالك hebrew שלום מה שלומך emoji symbol µ math ₀ x² dx π 3 14159 e ρ ε₀ currenc 1 date format 25 06 2025 june 25 2025 2025年6月25日 quot french bonjour ça va quot german guten tag wie geht newlin test r n r tab ttest tspace 192 33601010101 end document
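
The two fixtures above are the expected output of the new preprocessing path for content.txt, with and without NLTK. A rough standalone approximation of that path (RE_WORD tokenization, then stop-word removal and Snowball stemming; digit-containing words are kept as-is) — a sketch, not the exact classifier code:

    import re

    from nltk.corpus import stopwords
    from nltk.stem import SnowballStemmer
    from nltk.tokenize import word_tokenize

    RE_DIGIT = re.compile(r"\d")
    RE_WORD = re.compile(r"\b[\w]+\b")  # words that may contain digits

    def rough_preprocess(content: str, language: str = "english") -> str:
        # Lower-case, collapse whitespace, keep only word characters.
        content = " ".join(m.group().lower() for m in RE_WORD.finditer(content))

        stemmer = SnowballStemmer(language)
        stop_words = frozenset(stopwords.words(language))
        kept = []
        for word in word_tokenize(content, language=language):
            if word in stop_words:
                continue
            # Words containing digits are never stemmed, matching the diff above.
            kept.append(word if RE_DIGIT.search(word) else stemmer.stem(word))
        return " ".join(kept)
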
diff --git a/src/documents/tests/test_caching.py b/src/documents/tests/test_caching.py
new file mode 100644
index 0000000..4c8d2ea
--- /dev/null
@@ -0,0 +1,45 @@
+import pickle
+
+from documents.caching import StoredLRUCache
+
+
+def test_lru_cache_entries():
+    CACHE_TTL = 1
+    # LRU cache with a capacity of 2 elements
+    cache = StoredLRUCache("test_lru_cache_key", 2, backend_ttl=CACHE_TTL)
+    cache.set(1, 1)
+    cache.set(2, 2)
+    assert cache.get(2) == 2
+    assert cache.get(1) == 1
+
+    # The least recently used entry (2) should be evicted
+    cache.set(3, 3)
+    assert cache.get(3) == 3
+    assert not cache.get(2)
+    assert cache.get(1) == 1
+
+    # Save the cache, then reload it and check that loading overwrites the in-memory state
+    cache.save()
+    cache.set(4, 4)
+    assert not cache.get(3)
+    cache.load()
+    assert not cache.get(4)
+    assert cache.get(3) == 3
+    assert cache.get(1) == 1
+
+
+def test_stored_lru_cache_key_ttl(mocker):
+    mock_backend = mocker.Mock()
+    cache = StoredLRUCache("test_key", backend=mock_backend, backend_ttl=321)
+
+    # Simulate storing values
+    cache.set("x", "X")
+    cache.set("y", "Y")
+    cache.save()
+
+    # Assert backend.set was called with pickled data, key and TTL
+    mock_backend.set.assert_called_once()
+    key, data, timeout = mock_backend.set.call_args[0]
+    assert key == "test_key"
+    assert timeout == 321
+    assert pickle.loads(data) == {"x": "X", "y": "Y"}
diff --git a/src/documents/tests/test_classifier.py b/src/documents/tests/test_classifier.py
index d1bc8e04fc65f8f790cc7f56fee14e535e118101..b1317f70bff97ef7077c3cd7a78c899998659d14 100644
@@ -21,7 +21,7 @@ from documents.models import Tag
 from documents.tests.utils import DirectoriesMixin
 
 
-def dummy_preprocess(content: str):
+def dummy_preprocess(content: str, **kwargs):
     """
     Simpler, faster pre-processing for testing purposes
     """
@@ -223,24 +223,47 @@ class TestClassifier(DirectoriesMixin, TestCase):
         self.generate_test_data()
         self.classifier.train()
 
-        self.assertEqual(
-            self.classifier.predict_correspondent(self.doc1.content),
-            self.c1.pk,
-        )
-        self.assertEqual(self.classifier.predict_correspondent(self.doc2.content), None)
-        self.assertListEqual(
-            self.classifier.predict_tags(self.doc1.content),
-            [self.t1.pk],
-        )
-        self.assertListEqual(
-            self.classifier.predict_tags(self.doc2.content),
-            [self.t1.pk, self.t3.pk],
-        )
-        self.assertEqual(
-            self.classifier.predict_document_type(self.doc1.content),
-            self.dt.pk,
-        )
-        self.assertEqual(self.classifier.predict_document_type(self.doc2.content), None)
+        with (
+            mock.patch.object(
+                self.classifier.data_vectorizer,
+                "transform",
+                wraps=self.classifier.data_vectorizer.transform,
+            ) as mock_transform,
+            mock.patch.object(
+                self.classifier,
+                "preprocess_content",
+                wraps=self.classifier.preprocess_content,
+            ) as mock_preprocess_content,
+        ):
+            self.assertEqual(
+                self.classifier.predict_correspondent(self.doc1.content),
+                self.c1.pk,
+            )
+            self.assertEqual(
+                self.classifier.predict_correspondent(self.doc2.content),
+                None,
+            )
+            self.assertListEqual(
+                self.classifier.predict_tags(self.doc1.content),
+                [self.t1.pk],
+            )
+            self.assertListEqual(
+                self.classifier.predict_tags(self.doc2.content),
+                [self.t1.pk, self.t3.pk],
+            )
+            self.assertEqual(
+                self.classifier.predict_document_type(self.doc1.content),
+                self.dt.pk,
+            )
+            self.assertEqual(
+                self.classifier.predict_document_type(self.doc2.content),
+                None,
+            )
+
+            # Check that vectorization and text preprocessing results are cached:
+            # despite three predictions each, they run only once per document (doc1 and doc2)
+            self.assertEqual(mock_preprocess_content.call_count, 2)
+            self.assertEqual(mock_transform.call_count, 2)
 
     def test_no_retrain_if_no_change(self):
         """
@@ -694,3 +717,67 @@ class TestClassifier(DirectoriesMixin, TestCase):
         mock_load.side_effect = Exception()
         with self.assertRaises(Exception):
             load_classifier(raise_exception=True)
+
+
+def test_preprocess_content():
+    """
+    GIVEN:
+        - Advanced text processing is enabled (default)
+    WHEN:
+        - Classifier preprocesses a document's content
+    THEN:
+        - Processed content matches the expected output (stemmed words)
+    """
+    with (Path(__file__).parent / "samples" / "content.txt").open("r") as f:
+        content = f.read()
+    with (Path(__file__).parent / "samples" / "preprocessed_content_advanced.txt").open(
+        "r",
+    ) as f:
+        expected_preprocess_content = f.read().rstrip()
+    classifier = DocumentClassifier()
+    result = classifier.preprocess_content(content)
+    assert result == expected_preprocess_content
+
+
+def test_preprocess_content_nltk_disabled():
+    """
+    GIVEN:
+        - Advanced text processing is disabled
+    WHEN:
+        - Classifier preprocesses a document's content
+    THEN:
+        - Processed content matches the expected output (unstemmed words)
+    """
+    with (Path(__file__).parent / "samples" / "content.txt").open("r") as f:
+        content = f.read()
+    with (Path(__file__).parent / "samples" / "preprocessed_content.txt").open(
+        "r",
+    ) as f:
+        expected_preprocess_content = f.read().rstrip()
+    classifier = DocumentClassifier()
+    with mock.patch("documents.classifier.ADVANCED_TEXT_PROCESSING_ENABLED", new=False):
+        result = classifier.preprocess_content(content)
+    assert result == expected_preprocess_content
+
+
+def test_preprocess_content_nltk_load_fail(mocker):
+    """
+    GIVEN:
+        - NLTK stop words fail to load
+    WHEN:
+        - Classifier preprocesses a document's content
+    THEN:
+        - Processed content matches the expected output (unstemmed words)
+    """
+    _module = mocker.MagicMock(name="nltk_corpus_mock")
+    _module.stopwords.words.side_effect = AttributeError()
+    mocker.patch.dict("sys.modules", {"nltk.corpus": _module})
+    classifier = DocumentClassifier()
+    with (Path(__file__).parent / "samples" / "content.txt").open("r") as f:
+        content = f.read()
+    with (Path(__file__).parent / "samples" / "preprocessed_content.txt").open(
+        "r",
+    ) as f:
+        expected_preprocess_content = f.read().rstrip()
+    result = classifier.preprocess_content(content)
+    assert result == expected_preprocess_content