--- /dev/null
+import logging
+from binascii import hexlify
+from dataclasses import dataclass
+from typing import TYPE_CHECKING
+from typing import Final
+from typing import Optional
+
+from django.core.cache import cache
+
+from documents.models import Document
+
+if TYPE_CHECKING:
+ from documents.classifier import DocumentClassifier
+
+logger = logging.getLogger("paperless.caching")
+
+
+@dataclass(frozen=True)
+class MetadataCacheData:
+ original_checksum: str
+ original_metadata: list
+ archive_checksum: Optional[str]
+ archive_metadata: Optional[list]
+
+
+@dataclass(frozen=True)
+class SuggestionCacheData:
+ classifier_version: int
+ classifier_hash: str
+ suggestions: dict
+
+
+CLASSIFIER_VERSION_KEY: Final[str] = "classifier_version"
+CLASSIFIER_HASH_KEY: Final[str] = "classifier_hash"
+CLASSIFIER_MODIFIED_KEY: Final[str] = "classifier_modified"
+
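+# Cache timeouts; 50 minutes is slightly less than the normal classifier
+# retrain interval, so cached entries expire shortly before a retrain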
+CACHE_1_MINUTE: Final[int] = 60
+CACHE_5_MINUTES: Final[int] = 5 * CACHE_1_MINUTE
+CACHE_50_MINUTES: Final[int] = 50 * CACHE_1_MINUTE
+
+
+def get_suggestion_cache_key(document_id: int) -> str:
+ """
+ Returns the basic key for a document's suggestions
+ """
+ return f"doc_{document_id}_suggest"
+
+
+def get_suggestion_cache(document_id: int) -> Optional[SuggestionCacheData]:
+ """
+ If possible, return the cached suggestions for the given document ID.
+ The classifier must match in both format version and hash, and the
+ suggestions must have been cached previously.
+ """
+ from documents.classifier import DocumentClassifier
+
+ doc_key = get_suggestion_cache_key(document_id)
+ cache_hits = cache.get_many([CLASSIFIER_VERSION_KEY, CLASSIFIER_HASH_KEY, doc_key])
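+ # get_many retrieves all three keys in a single cache call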
+ # The document suggestions are in the cache
+ if doc_key in cache_hits:
+ doc_suggestions: SuggestionCacheData = cache_hits[doc_key]
+ # The classifier format is the same
+ # The classifier hash is the same
+ # Then the suggestions can be used
+ if (
+ CLASSIFIER_VERSION_KEY in cache_hits
+ and cache_hits[CLASSIFIER_VERSION_KEY] == DocumentClassifier.FORMAT_VERSION
+ and cache_hits[CLASSIFIER_VERSION_KEY] == doc_suggestions.classifier_version
+ ) and (
+ CLASSIFIER_HASH_KEY in cache_hits
+ and cache_hits[CLASSIFIER_HASH_KEY] == doc_suggestions.classifier_hash
+ ):
+ return doc_suggestions
+ else: # pragma: no cover
+ # Remove the key because something didn't match
+ cache.delete(doc_key)
+ return None
+
+
+def set_suggestions_cache(
+ document_id: int,
+ suggestions: dict,
+ classifier: Optional["DocumentClassifier"],
+ *,
+ timeout: int = CACHE_50_MINUTES,
+) -> None:
+ """
+ Caches the given suggestions, which were generated by the given classifier. If there is no classifier,
+ this function is a no-op (there won't be suggestions then anyway)
+ """
+ if classifier is not None:
+ doc_key = get_suggestion_cache_key(document_id)
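+ # Store the hash hex-encoded so it matches the hexdigest() the classifier caches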
+ cache.set(
+ doc_key,
+ SuggestionCacheData(
+ classifier.FORMAT_VERSION,
+ hexlify(classifier.last_auto_type_hash).decode(),
+ suggestions,
+ ),
+ timeout,
+ )
+
+
+def refresh_suggestions_cache(
+ document_id: int,
+ *,
+ timeout: int = CACHE_50_MINUTES,
+) -> None:
+ """
+ Refreshes the expiration of the suggestions for the given document ID
+ to the given timeout
+ """
+ doc_key = get_suggestion_cache_key(document_id)
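+ # touch() only refreshes the expiry; the cached value itself is unchanged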
+ cache.touch(doc_key, timeout)
+
+
+def get_metadata_cache_key(document_id: int) -> str:
+ """
+ Returns the basic key for a document's metadata
+ """
+ return f"doc_{document_id}_metadata"
+
+
+def get_metadata_cache(document_id: int) -> Optional[MetadataCacheData]:
+ """
+ Returns the cached document metadata for the given document ID, as long as the metadata
+ was cached once and the checksums have not changed
+ """
+ doc_key = get_metadata_cache_key(document_id)
+ doc_metadata: Optional[MetadataCacheData] = cache.get(doc_key)
+ # The metadata exists in the cache
+ if doc_metadata is not None:
+ try:
+ doc = Document.objects.get(pk=document_id)
+ # The original checksums match
+ # If it has one, the archive checksums match
+ # Then, we can use the metadata
+ if doc_metadata.original_checksum == doc.checksum and (
+ not doc.has_archive_version
+ or (
+ doc_metadata.archive_checksum is not None
+ and doc_metadata.archive_checksum == doc.archive_checksum
+ )
+ ):
+ # Refresh cache
+ cache.touch(doc_key, CACHE_50_MINUTES)
+ return doc_metadata
+ else: # pragma: no cover
+ # Something didn't match, delete the key
+ cache.delete(doc_key)
+ except Document.DoesNotExist: # pragma: no cover
+ # Unlikely, but the cache key existed while the Document didn't
+ cache.delete(doc_key)
+ return None
+
+
+def set_metadata_cache(
+ document: Document,
+ original_metadata: list,
+ archive_metadata: Optional[list],
+ *,
+ timeout: int = CACHE_50_MINUTES,
+) -> None:
+ """
+ Sets the metadata into cache for the given Document
+ """
+ doc_key = get_metadata_cache_key(document.pk)
+ cache.set(
+ doc_key,
+ MetadataCacheData(
+ document.checksum,
+ original_metadata,
+ document.archive_checksum,
+ archive_metadata,
+ ),
+ timeout,
+ )
+
+
+def refresh_metadata_cache(
+ document_id: int,
+ *,
+ timeout: int = CACHE_50_MINUTES,
+) -> None:
+ """
+ Refreshes the expiration of the metadata for the given document ID
+ to the given timeout
+ """
+ doc_key = get_metadata_cache_key(document_id)
+ cache.touch(doc_key, timeout)
+
+
+def get_thumbnail_modified_key(document_id: int) -> str:
+ """
+ Builds the key to store a thumbnail's timestamp
+ """
+ return f"doc_{document_id}_thumbnail_modified"
from typing import Optional
from django.conf import settings
+from django.core.cache import cache
from sklearn.exceptions import InconsistentVersionWarning
+from documents.caching import CACHE_50_MINUTES
+from documents.caching import CLASSIFIER_HASH_KEY
+from documents.caching import CLASSIFIER_MODIFIED_KEY
+from documents.caching import CLASSIFIER_VERSION_KEY
from documents.models import Document
from documents.models import MatchingModel
and self.last_doc_change_time >= latest_doc_change
) and self.last_auto_type_hash == hasher.digest():
logger.info("No updates since last training")
+ # Set the classifier information into the cache
+ # Caching for 50 minutes, so slightly less than the normal retrain time
+ cache.set(
+ CLASSIFIER_MODIFIED_KEY,
+ self.last_doc_change_time,
+ CACHE_50_MINUTES,
+ )
+ cache.set(CLASSIFIER_HASH_KEY, hasher.hexdigest(), CACHE_50_MINUTES)
+ cache.set(CLASSIFIER_VERSION_KEY, self.FORMAT_VERSION, CACHE_50_MINUTES)
return False
# subtract 1 since -1 (null) is also part of the classes.
self.last_doc_change_time = latest_doc_change
self.last_auto_type_hash = hasher.digest()
+ # Set the classifier information into the cache
+ # Caching for 50 minutes, so slightly less than the normal retrain time
+ cache.set(CLASSIFIER_MODIFIED_KEY, self.last_doc_change_time, CACHE_50_MINUTES)
+ cache.set(CLASSIFIER_HASH_KEY, hasher.hexdigest(), CACHE_50_MINUTES)
+ cache.set(CLASSIFIER_VERSION_KEY, self.FORMAT_VERSION, CACHE_50_MINUTES)
+
return True
def preprocess_content(self, content: str) -> str: # pragma: no cover
-import pickle
from datetime import datetime
+from datetime import timezone
from typing import Optional
from django.conf import settings
+from django.core.cache import cache
+from documents.caching import CACHE_5_MINUTES
+from documents.caching import CACHE_50_MINUTES
+from documents.caching import CLASSIFIER_HASH_KEY
+from documents.caching import CLASSIFIER_MODIFIED_KEY
+from documents.caching import CLASSIFIER_VERSION_KEY
+from documents.caching import get_thumbnail_modified_key
from documents.classifier import DocumentClassifier
from documents.models import Document
suggestions if the classifier has not been changed and the suggested dates
setting is also unchanged
- TODO: It would be nice to not duplicate the partial loading and the loading
- between here and the actual classifier
"""
+ # If no model file, no etag at all
if not settings.MODEL_FILE.exists():
return None
- with open(settings.MODEL_FILE, "rb") as f:
- schema_version = pickle.load(f)
- if schema_version != DocumentClassifier.FORMAT_VERSION:
- return None
- _ = pickle.load(f)
- last_auto_type_hash: bytes = pickle.load(f)
- return f"{last_auto_type_hash}:{settings.NUMBER_OF_SUGGESTED_DATES}"
+ # Check cache information
+ cache_hits = cache.get_many(
+ [CLASSIFIER_VERSION_KEY, CLASSIFIER_HASH_KEY],
+ )
+ # If the version differs somehow, no etag
+ if (
+ CLASSIFIER_VERSION_KEY in cache_hits
+ and cache_hits[CLASSIFIER_VERSION_KEY] != DocumentClassifier.FORMAT_VERSION
+ ):
+ return None
+ elif CLASSIFIER_HASH_KEY in cache_hits:
+ # Refresh the cache and return the hash digest and the dates setting
+ cache.touch(CLASSIFIER_HASH_KEY, CACHE_5_MINUTES)
+ return f"{cache_hits[CLASSIFIER_HASH_KEY]}:{settings.NUMBER_OF_SUGGESTED_DATES}"
+ return None
def suggestions_last_modified(request, pk: int) -> Optional[datetime]:
as there is no way to track when the suggested dates setting was modified, but it
seems unlikely that it changes too often
"""
+ # No file, no last modified
if not settings.MODEL_FILE.exists():
return None
- with open(settings.MODEL_FILE, "rb") as f:
- schema_version = pickle.load(f)
- if schema_version != DocumentClassifier.FORMAT_VERSION:
- return None
- last_doc_change_time = pickle.load(f)
- return last_doc_change_time
+ cache_hits = cache.get_many(
+ [CLASSIFIER_VERSION_KEY, CLASSIFIER_MODIFIED_KEY],
+ )
+ # If the version differs somehow, no last modified
+ if (
+ CLASSIFIER_VERSION_KEY in cache_hits
+ and cache_hits[CLASSIFIER_VERSION_KEY] != DocumentClassifier.FORMAT_VERSION
+ ):
+ return None
+ elif CLASSIFIER_MODIFIED_KEY in cache_hits:
+ # Refresh the cache and return the last modified
+ cache.touch(CLASSIFIER_MODIFIED_KEY, CACHE_5_MINUTES)
+ return cache_hits[CLASSIFIER_MODIFIED_KEY]
+ return None
def metadata_etag(request, pk: int) -> Optional[str]:
try:
doc = Document.objects.get(pk=pk)
return doc.checksum
- except Document.DoesNotExist:
+ except Document.DoesNotExist: # pragma: no cover
return None
return None
try:
doc = Document.objects.get(pk=pk)
return doc.modified
- except Document.DoesNotExist:
+ except Document.DoesNotExist: # pragma: no cover
return None
return None
and request.query_params["original"] == "true"
)
return doc.checksum if use_original else doc.archive_checksum
- except Document.DoesNotExist:
+ except Document.DoesNotExist: # pragma: no cover
+ return None
+ return None
+
+
+def preview_last_modified(request, pk: int) -> Optional[datetime]:
+ """
+ Uses the document's modified time to set the Last-Modified header. Not strictly
+ correct, but close enough and quick.
+ """
+ try:
+ doc = Document.objects.get(pk=pk)
+ return doc.modified
+ except Document.DoesNotExist: # pragma: no cover
return None
return None
+
+
+def thumbnail_last_modified(request, pk: int) -> Optional[datetime]:
+ """
+ Returns the thumbnail file's last modified time, either from the cache or from
+ the filesystem. The cache should be slightly faster than a filesystem stat.
+ """
+ try:
+ doc = Document.objects.get(pk=pk)
+ if not doc.thumbnail_path.exists():
+ return None
+ doc_key = get_thumbnail_modified_key(pk)
+
+ cache_hit = cache.get(doc_key)
+ if cache_hit is not None:
+ cache.touch(doc_key, CACHE_50_MINUTES)
+ return cache_hit
+
+ # No cache, get the timestamp and cache the datetime
+ last_modified = datetime.fromtimestamp(
+ doc.thumbnail_path.stat().st_mtime,
+ tz=timezone.utc,
+ )
+ cache.set(doc_key, last_modified, CACHE_50_MINUTES)
+ return last_modified
+ except Document.DoesNotExist: # pragma: no cover
+ return None
import tempfile
import uuid
import zoneinfo
+from binascii import hexlify
from datetime import timedelta
from pathlib import Path
from unittest import mock
from django.conf import settings
from django.contrib.auth.models import Permission
from django.contrib.auth.models import User
+from django.core.cache import cache
from django.test import override_settings
from django.utils import timezone
from guardian.shortcuts import assign_perm
from rest_framework import status
from rest_framework.test import APITestCase
+from documents.caching import CACHE_50_MINUTES
+from documents.caching import CLASSIFIER_HASH_KEY
+from documents.caching import CLASSIFIER_MODIFIED_KEY
+from documents.caching import CLASSIFIER_VERSION_KEY
from documents.models import Correspondent
from documents.models import CustomField
from documents.models import CustomFieldInstance
self.user = User.objects.create_superuser(username="temp_admin")
self.client.force_authenticate(user=self.user)
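+ # Start each test with an empty cache so entries don't leak between tests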
+ cache.clear()
def testDocuments(self):
response = self.client.get("/api/documents/").data
self.assertEqual(meta["original_size"], os.stat(source_file).st_size)
self.assertEqual(meta["archive_size"], os.stat(archive_file).st_size)
+ response = self.client.get(f"/api/documents/{doc.pk}/metadata/")
+ self.assertEqual(response.status_code, status.HTTP_200_OK)
+
def test_get_metadata_invalid_doc(self):
response = self.client.get("/api/documents/34576/metadata/")
self.assertEqual(response.status_code, status.HTTP_404_NOT_FOUND)
},
)
- @mock.patch("documents.conditionals.pickle.load")
+ @mock.patch("documents.views.load_classifier")
@mock.patch("documents.views.match_storage_paths")
@mock.patch("documents.views.match_document_types")
@mock.patch("documents.views.match_tags")
match_tags,
match_document_types,
match_storage_paths,
- mocked_pickle_load,
+ mocked_load,
):
"""
GIVEN:
- Classifier has not been modified
THEN:
- Subsequent requests are returned alright
- - ETag and last modified are called
+ - ETag and last modified headers are set
"""
- settings.MODEL_FILE.touch()
+ # Set up the cache the same way the classifier does
from documents.classifier import DocumentClassifier
- last_modified = timezone.now()
+ settings.MODEL_FILE.touch()
- # ETag first, then modified
- mock_effect = [
- DocumentClassifier.FORMAT_VERSION,
- "dont care",
- b"thisisachecksum",
- DocumentClassifier.FORMAT_VERSION,
- last_modified,
+ classifier_checksum_bytes = b"thisisachecksum"
+ classifier_checksum_hex = hexlify(classifier_checksum_bytes).decode()
+
+ # Two loads, so two side effects
+ mocked_load.side_effect = [
+ mock.Mock(
+ last_auto_type_hash=classifier_checksum_bytes,
+ FORMAT_VERSION=DocumentClassifier.FORMAT_VERSION,
+ ),
+ mock.Mock(
+ last_auto_type_hash=classifier_checksum_bytes,
+ FORMAT_VERSION=DocumentClassifier.FORMAT_VERSION,
+ ),
]
- mocked_pickle_load.side_effect = mock_effect
- doc = Document.objects.create(
- title="test",
- mime_type="application/pdf",
- content="this is an invoice from 12.04.2022!",
+ last_modified = timezone.now()
+ cache.set(CLASSIFIER_MODIFIED_KEY, last_modified, CACHE_50_MINUTES)
+ cache.set(CLASSIFIER_HASH_KEY, classifier_checksum_hex, CACHE_50_MINUTES)
+ cache.set(
+ CLASSIFIER_VERSION_KEY,
+ DocumentClassifier.FORMAT_VERSION,
+ CACHE_50_MINUTES,
)
+ # Mock the matching
match_correspondents.return_value = [Correspondent(id=88), Correspondent(id=2)]
match_tags.return_value = [Tag(id=56), Tag(id=123)]
match_document_types.return_value = [DocumentType(id=23)]
match_storage_paths.return_value = [StoragePath(id=99), StoragePath(id=77)]
+ doc = Document.objects.create(
+ title="test",
+ mime_type="application/pdf",
+ content="this is an invoice from 12.04.2022!",
+ )
+
response = self.client.get(f"/api/documents/{doc.pk}/suggestions/")
+ self.assertEqual(response.status_code, status.HTTP_200_OK)
self.assertEqual(
response.data,
{
"dates": ["2022-04-12"],
},
)
- mocked_pickle_load.assert_called()
self.assertIn("Last-Modified", response.headers)
self.assertEqual(
response.headers["Last-Modified"],
self.assertIn("ETag", response.headers)
self.assertEqual(
response.headers["ETag"],
- f"\"b'thisisachecksum':{settings.NUMBER_OF_SUGGESTED_DATES}\"",
+ f'"{classifier_checksum_hex}:{settings.NUMBER_OF_SUGGESTED_DATES}"',
)
- mocked_pickle_load.rest_mock()
- mocked_pickle_load.side_effect = mock_effect
-
response = self.client.get(f"/api/documents/{doc.pk}/suggestions/")
self.assertEqual(response.status_code, status.HTTP_200_OK)
- mocked_pickle_load.assert_called()
@mock.patch("documents.parsers.parse_date_generator")
@override_settings(NUMBER_OF_SUGGESTED_DATES=0)
from django.views import View
from django.views.decorators.cache import cache_control
from django.views.decorators.http import condition
+from django.views.decorators.http import last_modified
from django.views.generic import TemplateView
from django_filters.rest_framework import DjangoFilterBackend
from langdetect import detect
from documents.bulk_download import ArchiveOnlyStrategy
from documents.bulk_download import OriginalAndArchiveStrategy
from documents.bulk_download import OriginalsOnlyStrategy
+from documents.caching import CACHE_50_MINUTES
+from documents.caching import get_metadata_cache
+from documents.caching import get_suggestion_cache
+from documents.caching import refresh_metadata_cache
+from documents.caching import refresh_suggestions_cache
+from documents.caching import set_metadata_cache
+from documents.caching import set_suggestions_cache
from documents.classifier import load_classifier
from documents.conditionals import metadata_etag
from documents.conditionals import metadata_last_modified
from documents.conditionals import preview_etag
+from documents.conditionals import preview_last_modified
from documents.conditionals import suggestions_etag
from documents.conditionals import suggestions_last_modified
+from documents.conditionals import thumbnail_last_modified
from documents.data_models import ConsumableDocument
from documents.data_models import DocumentMetadataOverrides
from documents.data_models import DocumentSource
try:
return parser.extract_metadata(file, mime_type)
- except Exception:
+ except Exception: # pragma: no cover
+ logger.exception(f"Issue getting metadata for {file}")
# TODO: cover GPG errors, remove later.
return []
- else:
+ else: # pragma: no cover
+ logger.warning(f"No parser for {mime_type}")
return []
def get_filesize(self, filename):
except Document.DoesNotExist:
raise Http404
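+ # Try the cache first and only parse metadata from the file(s) on a miss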
+ document_cached_metadata = get_metadata_cache(doc.pk)
+
+ archive_metadata = None
+ archive_filesize = None
+ if document_cached_metadata is not None:
+ original_metadata = document_cached_metadata.original_metadata
+ archive_metadata = document_cached_metadata.archive_metadata
+ # The filesize isn't cached, so look it up again on a cache hit
+ if doc.has_archive_version:
+ archive_filesize = self.get_filesize(doc.archive_path)
+ refresh_metadata_cache(doc.pk)
+ else:
+ original_metadata = self.get_metadata(doc.source_path, doc.mime_type)
+
+ if doc.has_archive_version:
+ archive_filesize = self.get_filesize(doc.archive_path)
+ archive_metadata = self.get_metadata(
+ doc.archive_path,
+ "application/pdf",
+ )
+ set_metadata_cache(doc, original_metadata, archive_metadata)
+
meta = {
"original_checksum": doc.checksum,
"original_size": self.get_filesize(doc.source_path),
"original_mime_type": doc.mime_type,
"media_filename": doc.filename,
"has_archive_version": doc.has_archive_version,
- "original_metadata": self.get_metadata(doc.source_path, doc.mime_type),
+ "original_metadata": original_metadata,
"archive_checksum": doc.archive_checksum,
"archive_media_filename": doc.archive_filename,
"original_filename": doc.original_filename,
+ "archive_size": archive_filesize,
+ "archive_metadata": archive_metadata,
}
lang = "en"
pass
meta["lang"] = lang
- if doc.has_archive_version:
- meta["archive_size"] = self.get_filesize(doc.archive_path)
- meta["archive_metadata"] = self.get_metadata(
- doc.archive_path,
- "application/pdf",
- )
- else:
- meta["archive_size"] = None
- meta["archive_metadata"] = None
-
return Response(meta)
@action(methods=["get"], detail=True)
):
return HttpResponseForbidden("Insufficient permissions")
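+ # Serve cached suggestions if the classifier hasn't changed since they were stored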
+ document_suggestions = get_suggestion_cache(doc.pk)
+
+ if document_suggestions is not None:
+ refresh_suggestions_cache(doc.pk)
+ return Response(document_suggestions.suggestions)
+
classifier = load_classifier()
dates = []
{i for i in itertools.islice(gen, settings.NUMBER_OF_SUGGESTED_DATES)},
)
- return Response(
- {
- "correspondents": [
- c.id for c in match_correspondents(doc, classifier, request.user)
- ],
- "tags": [t.id for t in match_tags(doc, classifier, request.user)],
- "document_types": [
- dt.id for dt in match_document_types(doc, classifier, request.user)
- ],
- "storage_paths": [
- dt.id for dt in match_storage_paths(doc, classifier, request.user)
- ],
- "dates": [
- date.strftime("%Y-%m-%d") for date in dates if date is not None
- ],
- },
- )
+ resp_data = {
+ "correspondents": [
+ c.id for c in match_correspondents(doc, classifier, request.user)
+ ],
+ "tags": [t.id for t in match_tags(doc, classifier, request.user)],
+ "document_types": [
+ dt.id for dt in match_document_types(doc, classifier, request.user)
+ ],
+ "storage_paths": [
+ dt.id for dt in match_storage_paths(doc, classifier, request.user)
+ ],
+ "dates": [date.strftime("%Y-%m-%d") for date in dates if date is not None],
+ }
+
+ # Cache the suggestions and the classifier hash for later
+ set_suggestions_cache(doc.pk, resp_data, classifier)
+
+ return Response(resp_data)
@action(methods=["get"], detail=True)
@method_decorator(cache_control(public=False, max_age=5 * 60))
- @method_decorator(condition(etag_func=preview_etag))
+ @method_decorator(
+ condition(etag_func=preview_etag, last_modified_func=preview_last_modified),
+ )
def preview(self, request, pk=None):
try:
response = self.file_response(pk, request, "inline")
raise Http404
@action(methods=["get"], detail=True)
- @method_decorator(cache_control(public=False, max_age=315360000))
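+ # Cache briefly on the client; Last-Modified makes revalidation cheap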
+ @method_decorator(cache_control(public=False, max_age=CACHE_50_MINUTES))
+ @method_decorator(last_modified(thumbnail_last_modified))
def thumb(self, request, pk=None):
try:
doc = Document.objects.get(id=pk)
handle = GnuPG.decrypted(doc.thumbnail_file)
else:
handle = doc.thumbnail_file
- # TODO: Send ETag information and use that to send new thumbnails
- # if available
return HttpResponse(handle, content_type="image/webp")
except (FileNotFoundError, Document.DoesNotExist):
# django setting.
CACHES = {
"default": {
- "BACKEND": "django.core.cache.backends.redis.RedisCache",
+ "BACKEND": os.environ.get(
+ "PAPERLESS_CACHE_BACKEND",
+ "django.core.cache.backends.redis.RedisCache",
+ ),
"LOCATION": _CHANNELS_REDIS_URL,
+ "KEY_PREFIX": os.getenv("PAPERLESS_REDIS_PREFIX", ""),
},
}
addopts = --pythonwarnings=all --cov --cov-report=html --cov-report=xml --numprocesses auto --maxprocesses=16 --quiet --durations=50
env =
PAPERLESS_DISABLE_DBHANDLER=true
+ PAPERLESS_CACHE_BACKEND=django.core.cache.backends.locmem.LocMemCache
[coverage:run]
source =