--- /dev/null
+import logging
+from binascii import hexlify
+from dataclasses import dataclass
+from typing import TYPE_CHECKING
+from typing import Final
+from typing import Optional
+
+from django.core.cache import cache
+
+from documents.models import Document
+
+if TYPE_CHECKING:
+ from documents.classifier import DocumentClassifier
+
+logger = logging.getLogger("paperless.caching")
+
+
+@dataclass(frozen=True)
+class MetadataCacheData:
+ original_checksum: str
+ original_metadata: list
+ archive_checksum: Optional[str]
+ archive_metadata: Optional[list]
+
+
+@dataclass(frozen=True)
+class SuggestionCacheData:
+ classifier_version: int
+ classifier_hash: str
+ suggestions: dict
+
+
+CLASSIFIER_VERSION_KEY: Final[str] = "classifier_version"
+CLASSIFIER_HASH_KEY: Final[str] = "classifier_hash"
+CLASSIFIER_MODIFIED_KEY: Final[str] = "classifier_modified"
+
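+# Cache timeouts; 50 minutes is slightly less than the normal classifier
+# retrain interval, so cached entries expire shortly before a retrain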
+CACHE_1_MINUTE: Final[int] = 60
+CACHE_5_MINUTES: Final[int] = 5 * CACHE_1_MINUTE
+CACHE_50_MINUTES: Final[int] = 50 * CACHE_1_MINUTE
+
+
+def get_suggestion_cache_key(document_id: int) -> str:
+ """
+ Returns the basic key for a document's suggestions
+ """
+ return f"doc_{document_id}_suggest"
+
+
+def get_suggestion_cache(document_id: int) -> Optional[SuggestionCacheData]:
+ """
+ If possible, return the cached suggestions for the given document ID.
+ The classifier must match in both format version and hash, and the
+ suggestions must have been cached previously.
+ """
+ from documents.classifier import DocumentClassifier
+
+ doc_key = get_suggestion_cache_key(document_id)
+ cache_hits = cache.get_many([CLASSIFIER_VERSION_KEY, CLASSIFIER_HASH_KEY, doc_key])
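+ # get_many retrieves all three keys in a single cache call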
+ # The document suggestions are in the cache
+ if doc_key in cache_hits:
+ doc_suggestions: SuggestionCacheData = cache_hits[doc_key]
+ # The classifier format is the same
+ # The classifier hash is the same
+ # Then the suggestions can be used
+ if (
+ CLASSIFIER_VERSION_KEY in cache_hits
+ and cache_hits[CLASSIFIER_VERSION_KEY] == DocumentClassifier.FORMAT_VERSION
+ and cache_hits[CLASSIFIER_VERSION_KEY] == doc_suggestions.classifier_version
+ ) and (
+ CLASSIFIER_HASH_KEY in cache_hits
+ and cache_hits[CLASSIFIER_HASH_KEY] == doc_suggestions.classifier_hash
+ ):
+ return doc_suggestions
+ else: # pragma: no cover
+ # Remove the key because something didn't match
+ cache.delete(doc_key)
+ return None
+
+
+def set_suggestions_cache(
+ document_id: int,
+ suggestions: dict,
+ classifier: Optional["DocumentClassifier"],
+ *,
+ timeout: int = CACHE_50_MINUTES,
+) -> None:
+ """
+ Caches the given suggestions, which were generated by the given classifier. If there is no classifier,
+ this function is a no-op (there won't be suggestions then anyway)
+ """
+ if classifier is not None:
+ doc_key = get_suggestion_cache_key(document_id)
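+ # Store the hash hex-encoded so it matches the hexdigest() the classifier caches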
+ cache.set(
+ doc_key,
+ SuggestionCacheData(
+ classifier.FORMAT_VERSION,
+ hexlify(classifier.last_auto_type_hash).decode(),
+ suggestions,
+ ),
+ timeout,
+ )
+
+
+def refresh_suggestions_cache(
+ document_id: int,
+ *,
+ timeout: int = CACHE_50_MINUTES,
+) -> None:
+ """
+ Refreshes the expiration of the suggestions for the given document ID
+ to the given timeout
+ """
+ doc_key = get_suggestion_cache_key(document_id)
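+ # touch() only refreshes the expiry; the cached value itself is unchanged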
+ cache.touch(doc_key, timeout)
+
+
+def get_metadata_cache_key(document_id: int) -> str:
+ """
+ Returns the basic key for a document's metadata
+ """
+ return f"doc_{document_id}_metadata"
+
+
+def get_metadata_cache(document_id: int) -> Optional[MetadataCacheData]:
+ """
+ Returns the cached document metadata for the given document ID, as long as the metadata
+ was cached once and the checksums have not changed
+ """
+ doc_key = get_metadata_cache_key(document_id)
+ doc_metadata: Optional[MetadataCacheData] = cache.get(doc_key)
+ # The metadata exists in the cache
+ if doc_metadata is not None:
+ try:
+ doc = Document.objects.get(pk=document_id)
+ # The original checksums match
+ # If it has one, the archive checksums match
+ # Then, we can use the metadata
+ if doc_metadata.original_checksum == doc.checksum and (
+ not doc.has_archive_version
+ or (
+ doc_metadata.archive_checksum is not None
+ and doc_metadata.archive_checksum == doc.archive_checksum
+ )
+ ):
+ # Refresh cache
+ cache.touch(doc_key, CACHE_50_MINUTES)
+ return doc_metadata
+ else: # pragma: no cover
+ # Something didn't match, delete the key
+ cache.delete(doc_key)
+ except Document.DoesNotExist: # pragma: no cover
+ # Unlikely, but the cache key existed while the Document didn't
+ cache.delete(doc_key)
+ return None
+
+
+def set_metadata_cache(
+ document: Document,
+ original_metadata: list,
+ archive_metadata: Optional[list],
+ *,
+ timeout: int = CACHE_50_MINUTES,
+) -> None:
+ """
+ Sets the metadata into cache for the given Document
+ """
+ doc_key = get_metadata_cache_key(document.pk)
+ cache.set(
+ doc_key,
+ MetadataCacheData(
+ document.checksum,
+ original_metadata,
+ document.archive_checksum,
+ archive_metadata,
+ ),
+ timeout,
+ )
+
+
+def refresh_metadata_cache(
+ document_id: int,
+ *,
+ timeout: int = CACHE_50_MINUTES,
+) -> None:
+ """
+ Refreshes the expiration of the metadata for the given document ID
+ to the given timeout
+ """
+ doc_key = get_metadata_cache_key(document_id)
+ cache.touch(doc_key, timeout)
+
+
+def get_thumbnail_modified_key(document_id: int) -> str:
+ """
+ Builds the key to store a thumbnail's timestamp
+ """
+ return f"doc_{document_id}_thumbnail_modified"
from typing import Optional
from django.conf import settings
+from django.core.cache import cache
from sklearn.exceptions import InconsistentVersionWarning
+from documents.caching import CACHE_50_MINUTES
+from documents.caching import CLASSIFIER_HASH_KEY
+from documents.caching import CLASSIFIER_MODIFIED_KEY
+from documents.caching import CLASSIFIER_VERSION_KEY
from documents.models import Document
from documents.models import MatchingModel
and self.last_doc_change_time >= latest_doc_change
) and self.last_auto_type_hash == hasher.digest():
logger.info("No updates since last training")
+ # Set the classifier information into the cache
+ # Caching for 50 minutes, so slightly less than the normal retrain time
+ cache.set(
+ CLASSIFIER_MODIFIED_KEY,
+ self.last_doc_change_time,
+ CACHE_50_MINUTES,
+ )
+ cache.set(CLASSIFIER_HASH_KEY, hasher.hexdigest(), CACHE_50_MINUTES)
+ cache.set(CLASSIFIER_VERSION_KEY, self.FORMAT_VERSION, CACHE_50_MINUTES)
return False
# subtract 1 since -1 (null) is also part of the classes.
self.last_doc_change_time = latest_doc_change
self.last_auto_type_hash = hasher.digest()
+ # Set the classifier information into the cache
+ # Caching for 50 minutes, so slightly less than the normal retrain time
+ cache.set(CLASSIFIER_MODIFIED_KEY, self.last_doc_change_time, CACHE_50_MINUTES)
+ cache.set(CLASSIFIER_HASH_KEY, hasher.hexdigest(), CACHE_50_MINUTES)
+ cache.set(CLASSIFIER_VERSION_KEY, self.FORMAT_VERSION, CACHE_50_MINUTES)
+
return True
def preprocess_content(self, content: str) -> str: # pragma: no cover
-import pickle
from datetime import datetime
+from datetime import timezone
from typing import Optional
from django.conf import settings
+from django.core.cache import cache
+from documents.caching import CACHE_5_MINUTES
+from documents.caching import CACHE_50_MINUTES
+from documents.caching import CLASSIFIER_HASH_KEY
+from documents.caching import CLASSIFIER_MODIFIED_KEY
+from documents.caching import CLASSIFIER_VERSION_KEY
+from documents.caching import get_thumbnail_modified_key
from documents.classifier import DocumentClassifier
from documents.models import Document
suggestions if the classifier has not been changed and the suggested dates
setting is also unchanged
- TODO: It would be nice to not duplicate the partial loading and the loading
- between here and the actual classifier
"""
+ # If no model file, no etag at all
if not settings.MODEL_FILE.exists():
return None
- with open(settings.MODEL_FILE, "rb") as f:
- schema_version = pickle.load(f)
- if schema_version != DocumentClassifier.FORMAT_VERSION:
- return None
- _ = pickle.load(f)
- last_auto_type_hash: bytes = pickle.load(f)
- return f"{last_auto_type_hash}:{settings.NUMBER_OF_SUGGESTED_DATES}"
+ # Check cache information
+ cache_hits = cache.get_many(
+ [CLASSIFIER_VERSION_KEY, CLASSIFIER_HASH_KEY],
+ )
+ # If the version differs somehow, no etag
+ if (
+ CLASSIFIER_VERSION_KEY in cache_hits
+ and cache_hits[CLASSIFIER_VERSION_KEY] != DocumentClassifier.FORMAT_VERSION
+ ):
+ return None
+ elif CLASSIFIER_HASH_KEY in cache_hits:
+ # Refresh the cache and return the hash digest and the dates setting
+ cache.touch(CLASSIFIER_HASH_KEY, CACHE_5_MINUTES)
+ return f"{cache_hits[CLASSIFIER_HASH_KEY]}:{settings.NUMBER_OF_SUGGESTED_DATES}"
+ return None
def suggestions_last_modified(request, pk: int) -> Optional[datetime]:
as there is no way to track when the suggested dates setting was modified, but it
seems unlikely that it changes too often
"""
+ # No file, no last modified
if not settings.MODEL_FILE.exists():
return None
- with open(settings.MODEL_FILE, "rb") as f:
- schema_version = pickle.load(f)
- if schema_version != DocumentClassifier.FORMAT_VERSION:
- return None
- last_doc_change_time = pickle.load(f)
- return last_doc_change_time
+ cache_hits = cache.get_many(
+ [CLASSIFIER_VERSION_KEY, CLASSIFIER_MODIFIED_KEY],
+ )
+ # If the version differs somehow, no last modified
+ if (
+ CLASSIFIER_VERSION_KEY in cache_hits
+ and cache_hits[CLASSIFIER_VERSION_KEY] != DocumentClassifier.FORMAT_VERSION
+ ):
+ return None
+ elif CLASSIFIER_MODIFIED_KEY in cache_hits:
+ # Refresh the cache and return the last modified
+ cache.touch(CLASSIFIER_MODIFIED_KEY, CACHE_5_MINUTES)
+ return cache_hits[CLASSIFIER_MODIFIED_KEY]
+ return None
def metadata_etag(request, pk: int) -> Optional[str]:
try:
doc = Document.objects.get(pk=pk)
return doc.checksum
- except Document.DoesNotExist:
+ except Document.DoesNotExist: # pragma: no cover
return None
return None
try:
doc = Document.objects.get(pk=pk)
return doc.modified
- except Document.DoesNotExist:
+ except Document.DoesNotExist: # pragma: no cover
return None
return None
and request.query_params["original"] == "true"
)
return doc.checksum if use_original else doc.archive_checksum
- except Document.DoesNotExist:
+ except Document.DoesNotExist: # pragma: no cover
+ return None
+ return None
+
+
+def preview_last_modified(request, pk: int) -> Optional[datetime]:
+ """
+ Uses the document's modified time to set the Last-Modified header. Not strictly
+ correct, but close enough and quick.
+ """
+ try:
+ doc = Document.objects.get(pk=pk)
+ return doc.modified
+ except Document.DoesNotExist: # pragma: no cover
return None
return None
+
+
+def thumbnail_last_modified(request, pk: int) -> Optional[datetime]:
+ """
+ Returns the thumbnail file's last modified time, either from the cache or from
+ the filesystem. The cache should be slightly faster than a filesystem stat.
+ """
+ try:
+ doc = Document.objects.get(pk=pk)
+ if not doc.thumbnail_path.exists():
+ return None
+ doc_key = get_thumbnail_modified_key(pk)
+
+ cache_hit = cache.get(doc_key)
+ if cache_hit is not None:
+ cache.touch(doc_key, CACHE_50_MINUTES)
+ return cache_hit
+
+ # No cache, get the timestamp and cache the datetime
+ last_modified = datetime.fromtimestamp(
+ doc.thumbnail_path.stat().st_mtime,
+ tz=timezone.utc,
+ )
+ cache.set(doc_key, last_modified, CACHE_50_MINUTES)
+ return last_modified
+ except Document.DoesNotExist: # pragma: no cover
+ return None
import tempfile
import uuid
import zoneinfo
+from binascii import hexlify
from datetime import timedelta
from pathlib import Path
from unittest import mock
from django.conf import settings
from django.contrib.auth.models import Permission
from django.contrib.auth.models import User
+from django.core.cache import cache
from django.test import override_settings
from django.utils import timezone
from guardian.shortcuts import assign_perm
from rest_framework import status
from rest_framework.test import APITestCase
+from documents.caching import CACHE_50_MINUTES
+from documents.caching import CLASSIFIER_HASH_KEY
+from documents.caching import CLASSIFIER_MODIFIED_KEY
+from documents.caching import CLASSIFIER_VERSION_KEY
from documents.models import Correspondent
from documents.models import CustomField
from documents.models import CustomFieldInstance
self.user = User.objects.create_superuser(username="temp_admin")
self.client.force_authenticate(user=self.user)
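+ # Start each test with an empty cache so entries don't leak between tests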
+ cache.clear()
def testDocuments(self):
response = self.client.get("/api/documents/").data
self.assertEqual(meta["original_size"], os.stat(source_file).st_size)
self.assertEqual(meta["archive_size"], os.stat(archive_file).st_size)
+ response = self.client.get(f"/api/documents/{doc.pk}/metadata/")
+ self.assertEqual(response.status_code, status.HTTP_200_OK)
+
def test_get_metadata_invalid_doc(self):
response = self.client.get("/api/documents/34576/metadata/")
self.assertEqual(response.status_code, status.HTTP_404_NOT_FOUND)
},
)
- @mock.patch("documents.conditionals.pickle.load")
+ @mock.patch("documents.views.load_classifier")
@mock.patch("documents.views.match_storage_paths")
@mock.patch("documents.views.match_document_types")
@mock.patch("documents.views.match_tags")
match_tags,
match_document_types,
match_storage_paths,
- mocked_pickle_load,
+ mocked_load,
):
"""
GIVEN:
- Classifier has not been modified
THEN:
- Subsequent requests are returned alright
- - ETag and last modified are called
+ - ETag and last modified headers are set
"""
- settings.MODEL_FILE.touch()
+ # Set up the cache the same way the classifier does
from documents.classifier import DocumentClassifier
- last_modified = timezone.now()
+ settings.MODEL_FILE.touch()
- # ETag first, then modified
- mock_effect = [
- DocumentClassifier.FORMAT_VERSION,
- "dont care",
- b"thisisachecksum",
- DocumentClassifier.FORMAT_VERSION,
- last_modified,
+ classifier_checksum_bytes = b"thisisachecksum"
+ classifier_checksum_hex = hexlify(classifier_checksum_bytes).decode()
+
+ # Two loads, so two side effects
+ mocked_load.side_effect = [
+ mock.Mock(
+ last_auto_type_hash=classifier_checksum_bytes,
+ FORMAT_VERSION=DocumentClassifier.FORMAT_VERSION,
+ ),
+ mock.Mock(
+ last_auto_type_hash=classifier_checksum_bytes,
+ FORMAT_VERSION=DocumentClassifier.FORMAT_VERSION,
+ ),
]
- mocked_pickle_load.side_effect = mock_effect
- doc = Document.objects.create(
- title="test",
- mime_type="application/pdf",
- content="this is an invoice from 12.04.2022!",
+ last_modified = timezone.now()
+ cache.set(CLASSIFIER_MODIFIED_KEY, last_modified, CACHE_50_MINUTES)
+ cache.set(CLASSIFIER_HASH_KEY, classifier_checksum_hex, CACHE_50_MINUTES)
+ cache.set(
+ CLASSIFIER_VERSION_KEY,
+ DocumentClassifier.FORMAT_VERSION,
+ CACHE_50_MINUTES,
)
+ # Mock the matching
match_correspondents.return_value = [Correspondent(id=88), Correspondent(id=2)]
match_tags.return_value = [Tag(id=56), Tag(id=123)]
match_document_types.return_value = [DocumentType(id=23)]
match_storage_paths.return_value = [StoragePath(id=99), StoragePath(id=77)]
+ doc = Document.objects.create(
+ title="test",
+ mime_type="application/pdf",
+ content="this is an invoice from 12.04.2022!",
+ )
+
response = self.client.get(f"/api/documents/{doc.pk}/suggestions/")
+ self.assertEqual(response.status_code, status.HTTP_200_OK)
self.assertEqual(
response.data,
{
"dates": ["2022-04-12"],
},
)
- mocked_pickle_load.assert_called()
self.assertIn("Last-Modified", response.headers)
self.assertEqual(
response.headers["Last-Modified"],
self.assertIn("ETag", response.headers)
self.assertEqual(
response.headers["ETag"],
- f"\"b'thisisachecksum':{settings.NUMBER_OF_SUGGESTED_DATES}\"",
+ f'"{classifier_checksum_hex}:{settings.NUMBER_OF_SUGGESTED_DATES}"',
)
- mocked_pickle_load.rest_mock()
- mocked_pickle_load.side_effect = mock_effect
-
response = self.client.get(f"/api/documents/{doc.pk}/suggestions/")
self.assertEqual(response.status_code, status.HTTP_200_OK)
- mocked_pickle_load.assert_called()
@mock.patch("documents.parsers.parse_date_generator")
@override_settings(NUMBER_OF_SUGGESTED_DATES=0)
from django.views import View
from django.views.decorators.cache import cache_control
from django.views.decorators.http import condition
+from django.views.decorators.http import last_modified
from django.views.generic import TemplateView
from django_filters.rest_framework import DjangoFilterBackend
from langdetect import detect
from documents.bulk_download import ArchiveOnlyStrategy
from documents.bulk_download import OriginalAndArchiveStrategy
from documents.bulk_download import OriginalsOnlyStrategy
+from documents.caching import CACHE_50_MINUTES
+from documents.caching import get_metadata_cache
+from documents.caching import get_suggestion_cache
+from documents.caching import refresh_metadata_cache
+from documents.caching import refresh_suggestions_cache
+from documents.caching import set_metadata_cache
+from documents.caching import set_suggestions_cache
from documents.classifier import load_classifier
from documents.conditionals import metadata_etag
from documents.conditionals import metadata_last_modified
from documents.conditionals import preview_etag
+from documents.conditionals import preview_last_modified
from documents.conditionals import suggestions_etag
from documents.conditionals import suggestions_last_modified
+from documents.conditionals import thumbnail_last_modified
from documents.data_models import ConsumableDocument
from documents.data_models import DocumentMetadataOverrides
from documents.data_models import DocumentSource
try:
return parser.extract_metadata(file, mime_type)
- except Exception:
+ except Exception: # pragma: no cover
+ logger.exception(f"Issue getting metadata for {file}")
# TODO: cover GPG errors, remove later.
return []
- else:
+ else: # pragma: no cover
+ logger.warning(f"No parser for {mime_type}")
return []
def get_filesize(self, filename):
except Document.DoesNotExist:
raise Http404
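+ # Try the cache first and only parse metadata from the file(s) on a miss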
+ document_cached_metadata = get_metadata_cache(doc.pk)
+
+ archive_metadata = None
+ archive_filesize = None
+ if document_cached_metadata is not None:
+ original_metadata = document_cached_metadata.original_metadata
+ archive_metadata = document_cached_metadata.archive_metadata
+ # The filesize isn't cached, so look it up again on a cache hit
+ if doc.has_archive_version:
+ archive_filesize = self.get_filesize(doc.archive_path)
+ refresh_metadata_cache(doc.pk)
+ else:
+ original_metadata = self.get_metadata(doc.source_path, doc.mime_type)
+
+ if doc.has_archive_version:
+ archive_filesize = self.get_filesize(doc.archive_path)
+ archive_metadata = self.get_metadata(
+ doc.archive_path,
+ "application/pdf",
+ )
+ set_metadata_cache(doc, original_metadata, archive_metadata)
+
meta = {
"original_checksum": doc.checksum,
"original_size": self.get_filesize(doc.source_path),
"original_mime_type": doc.mime_type,
"media_filename": doc.filename,
"has_archive_version": doc.has_archive_version,
- "original_metadata": self.get_metadata(doc.source_path, doc.mime_type),
+ "original_metadata": original_metadata,
"archive_checksum": doc.archive_checksum,
"archive_media_filename": doc.archive_filename,
"original_filename": doc.original_filename,
+ "archive_size": archive_filesize,
+ "archive_metadata": archive_metadata,
}
lang = "en"
pass
meta["lang"] = lang
- if doc.has_archive_version:
- meta["archive_size"] = self.get_filesize(doc.archive_path)
- meta["archive_metadata"] = self.get_metadata(
- doc.archive_path,
- "application/pdf",
- )
- else:
- meta["archive_size"] = None
- meta["archive_metadata"] = None
-
return Response(meta)
@action(methods=["get"], detail=True)
):
return HttpResponseForbidden("Insufficient permissions")
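+ # Serve cached suggestions if the classifier hasn't changed since they were stored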
+ document_suggestions = get_suggestion_cache(doc.pk)
+
+ if document_suggestions is not None:
+ refresh_suggestions_cache(doc.pk)
+ return Response(document_suggestions.suggestions)
+
classifier = load_classifier()
dates = []
{i for i in itertools.islice(gen, settings.NUMBER_OF_SUGGESTED_DATES)},
)
- return Response(
- {
- "correspondents": [
- c.id for c in match_correspondents(doc, classifier, request.user)
- ],
- "tags": [t.id for t in match_tags(doc, classifier, request.user)],
- "document_types": [
- dt.id for dt in match_document_types(doc, classifier, request.user)
- ],
- "storage_paths": [
- dt.id for dt in match_storage_paths(doc, classifier, request.user)
- ],
- "dates": [
- date.strftime("%Y-%m-%d") for date in dates if date is not None
- ],
- },
- )
+ resp_data = {
+ "correspondents": [
+ c.id for c in match_correspondents(doc, classifier, request.user)
+ ],
+ "tags": [t.id for t in match_tags(doc, classifier, request.user)],
+ "document_types": [
+ dt.id for dt in match_document_types(doc, classifier, request.user)
+ ],
+ "storage_paths": [
+ dt.id for dt in match_storage_paths(doc, classifier, request.user)
+ ],
+ "dates": [date.strftime("%Y-%m-%d") for date in dates if date is not None],
+ }
+
+ # Cache the suggestions and the classifier hash for later
+ set_suggestions_cache(doc.pk, resp_data, classifier)
+
+ return Response(resp_data)
@action(methods=["get"], detail=True)
@method_decorator(cache_control(public=False, max_age=5 * 60))
- @method_decorator(condition(etag_func=preview_etag))
+ @method_decorator(
+ condition(etag_func=preview_etag, last_modified_func=preview_last_modified),
+ )
def preview(self, request, pk=None):
try:
response = self.file_response(pk, request, "inline")
raise Http404
@action(methods=["get"], detail=True)
- @method_decorator(cache_control(public=False, max_age=315360000))
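+ # Cache briefly on the client; Last-Modified makes revalidation cheap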
+ @method_decorator(cache_control(public=False, max_age=CACHE_50_MINUTES))
+ @method_decorator(last_modified(thumbnail_last_modified))
def thumb(self, request, pk=None):
try:
doc = Document.objects.get(id=pk)
handle = GnuPG.decrypted(doc.thumbnail_file)
else:
handle = doc.thumbnail_file
- # TODO: Send ETag information and use that to send new thumbnails
- # if available
return HttpResponse(handle, content_type="image/webp")
except (FileNotFoundError, Document.DoesNotExist):
# django setting.
CACHES = {
"default": {
- "BACKEND": "django.core.cache.backends.redis.RedisCache",
+ "BACKEND": os.environ.get(
+ "PAPERLESS_CACHE_BACKEND",
+ "django.core.cache.backends.redis.RedisCache",
+ ),
"LOCATION": _CHANNELS_REDIS_URL,
+ "KEY_PREFIX": os.getenv("PAPERLESS_REDIS_PREFIX", ""),
},
}
addopts = --pythonwarnings=all --cov --cov-report=html --cov-report=xml --numprocesses auto --maxprocesses=16 --quiet --durations=50
env =
PAPERLESS_DISABLE_DBHANDLER=true
+ PAPERLESS_CACHE_BACKEND=django.core.cache.backends.locmem.LocMemCache
[coverage:run]
source =