git.ipfire.org Git - thirdparty/paperless-ngx.git/commitdiff
Detect and reset invalid ASNs to 0 during indexing with a loud error to the user
author: Trenton H <797416+stumpylog@users.noreply.github.com>
Thu, 2 Feb 2023 16:19:59 +0000 (08:19 -0800)
committer: Trenton H <797416+stumpylog@users.noreply.github.com>
Fri, 3 Feb 2023 16:31:45 +0000 (08:31 -0800)
src/documents/consumer.py
src/documents/index.py
src/documents/models.py
src/documents/tests/test_index.py

index 8c80304d3c857db717975a7f7990ddf70a1c939d..1896415b12d228747843be11d48828dbb02952ba 100644 (file)
@@ -146,11 +146,16 @@ class Consumer(LoggingMixin):
             return
         # Validate the range is above zero and less than uint32_t max
         # otherwise, Whoosh can't handle it in the index
-        if self.override_asn < 0 or self.override_asn > 0xFF_FF_FF_FF:
+        if (
+            self.override_asn < Document.ARCHIVE_SERIAL_NUMBER_MIN
+            or self.override_asn > Document.ARCHIVE_SERIAL_NUMBER_MAX
+        ):
             self._fail(
                 MESSAGE_ASN_RANGE,
                 f"Not consuming {self.filename}: "
-                f"Given ASN {self.override_asn} is out of range [0, 4,294,967,295]",
+                f"Given ASN {self.override_asn} is out of range "
+                f"[{Document.ARCHIVE_SERIAL_NUMBER_MIN:,}, "
+                f"{Document.ARCHIVE_SERIAL_NUMBER_MAX:,}]",
             )
         if Document.objects.filter(archive_serial_number=self.override_asn).exists():
             self._fail(
index 575e57e8be1dfbe3cddac7539e6d1b1677e1c00b..e11708f459a9206877862ea8a874f7578866a2cb 100644 (file)
@@ -90,10 +90,22 @@ def open_index_searcher():
         searcher.close()
 
 
-def update_document(writer, doc):
+def update_document(writer: AsyncWriter, doc: Document):
     tags = ",".join([t.name for t in doc.tags.all()])
     tags_ids = ",".join([str(t.id) for t in doc.tags.all()])
     comments = ",".join([str(c.comment) for c in Comment.objects.filter(document=doc)])
+    asn = doc.archive_serial_number
+    if asn is not None and (
+        asn < Document.ARCHIVE_SERIAL_NUMBER_MIN
+        or asn > Document.ARCHIVE_SERIAL_NUMBER_MAX
+    ):
+        logger.error(
+            f"Not indexing Archive Serial Number {asn} of document {doc.pk}. "
+            f"ASN is out of range "
+            f"[{Document.ARCHIVE_SERIAL_NUMBER_MIN:,}, "
+            f"{Document.ARCHIVE_SERIAL_NUMBER_MAX:,}.",
+        )
+        asn = 0
     writer.update_document(
         id=doc.pk,
         title=doc.title,
@@ -109,7 +121,7 @@ def update_document(writer, doc):
         has_type=doc.document_type is not None,
         created=doc.created,
         added=doc.added,
-        asn=doc.archive_serial_number,
+        asn=asn,
         modified=doc.modified,
         path=doc.storage_path.name if doc.storage_path else None,
         path_id=doc.storage_path.id if doc.storage_path else None,
index 84e96a79acd277e049ff27bd020202b647be2fef..a3c7cc4e6b68ec6313171756a16597d2e421e9f6 100644 (file)
@@ -3,6 +3,7 @@ import logging
 import os
 import re
 from collections import OrderedDict
+from typing import Final
 from typing import Optional
 
 import dateutil.parser
@@ -229,6 +230,9 @@ class Document(models.Model):
         help_text=_("The original name of the file when it was uploaded"),
     )
 
+    ARCHIVE_SERIAL_NUMBER_MIN: Final[int] = 0
+    ARCHIVE_SERIAL_NUMBER_MAX: Final[int] = 0xFF_FF_FF_FF
+
     archive_serial_number = models.PositiveIntegerField(
         _("archive serial number"),
         blank=True,
@@ -236,8 +240,8 @@ class Document(models.Model):
         unique=True,
         db_index=True,
         validators=[
-            MaxValueValidator(0xFF_FF_FF_FF),
-            MinValueValidator(0),
+            MaxValueValidator(ARCHIVE_SERIAL_NUMBER_MAX),
+            MinValueValidator(ARCHIVE_SERIAL_NUMBER_MIN),
         ],
         help_text=_(
             "The position of this document in your physical document " "archive.",
index 696648427ecd54eeb39938663dea49233c4a6078..bf1865a433575b73baad10285449d1517fe8ffc1 100644 (file)
@@ -1,3 +1,5 @@
+from unittest import mock
+
 from django.test import TestCase
 from documents import index
 from documents.models import Document
@@ -31,3 +33,60 @@ class TestAutoComplete(DirectoriesMixin, TestCase):
         )
         self.assertListEqual(index.autocomplete(ix, "tes", limit=1), [b"test3"])
         self.assertListEqual(index.autocomplete(ix, "tes", limit=0), [])
+
+    def test_archive_serial_number_ranging(self):
+        """
+        GIVEN:
+            - Document with an archive serial number above schema allowed size
+        WHEN:
+            - Document is provided to the index
+        THEN:
+            - Error is logged
+            - Document ASN is reset to 0 for the index
+        """
+        doc1 = Document.objects.create(
+            title="doc1",
+            checksum="A",
+            content="test test2 test3",
+            # yes, this is allowed, unless full_clean is run
+            # DRF does call the validators, this test won't
+            archive_serial_number=Document.ARCHIVE_SERIAL_NUMBER_MAX + 1,
+        )
+        with self.assertLogs("paperless.index", level="ERROR") as cm:
+            with mock.patch(
+                "documents.index.AsyncWriter.update_document",
+            ) as mocked_update_doc:
+                index.add_or_update_document(doc1)
+
+                mocked_update_doc.assert_called_once()
+                _, kwargs = mocked_update_doc.call_args
+
+                self.assertEqual(kwargs["asn"], 0)
+
+                error_str = cm.output[0]
+                expected_str = "ERROR:paperless.index:Not indexing Archive Serial Number 4294967296 of document 1"
+                self.assertIn(expected_str, error_str)
+
+    def test_archive_serial_number_is_none(self):
+        """
+        GIVEN:
+            - Document with no archive serial number
+        WHEN:
+            - Document is provided to the index
+        THEN:
+            - ASN isn't touched
+        """
+        doc1 = Document.objects.create(
+            title="doc1",
+            checksum="A",
+            content="test test2 test3",
+        )
+        with mock.patch(
+            "documents.index.AsyncWriter.update_document",
+        ) as mocked_update_doc:
+            index.add_or_update_document(doc1)
+
+            mocked_update_doc.assert_called_once()
+            _, kwargs = mocked_update_doc.call_args
+
+            self.assertIsNone(kwargs["asn"])