]> git.ipfire.org Git - thirdparty/paperless-ngx.git/commitdiff
Fix: Document metadata is lost during barcode splitting (#4982)
authorTrenton H <797416+stumpylog@users.noreply.github.com>
Fri, 15 Dec 2023 17:17:25 +0000 (09:17 -0800)
committerGitHub <noreply@github.com>
Fri, 15 Dec 2023 17:17:25 +0000 (09:17 -0800)
* Fixes barcode splitting dropping metadata that might be needed for the round 2

src/documents/barcodes.py
src/documents/tasks.py
src/documents/tests/test_barcodes.py
src/documents/tests/utils.py

index cac02e3e973e946bca5b2d3f43cc071144195c42..3e2b309b685acf85790b3b8bc4a1a27a695476a7 100644 (file)
@@ -14,6 +14,8 @@ from pikepdf import Pdf
 from PIL import Image
 
 from documents.converters import convert_from_tiff_to_pdf
+from documents.data_models import ConsumableDocument
+from documents.data_models import DocumentMetadataOverrides
 from documents.data_models import DocumentSource
 from documents.utils import copy_basic_file_stats
 from documents.utils import copy_file_with_basic_stats
@@ -53,6 +55,7 @@ class BarcodeReader:
         self.mime: Final[str] = mime_type
         self.pdf_file: Path = self.file
         self.barcodes: list[Barcode] = []
+        self._tiff_conversion_done = False
         self.temp_dir: Optional[tempfile.TemporaryDirectory] = None
 
         if settings.CONSUMER_BARCODE_TIFF_SUPPORT:
@@ -150,12 +153,14 @@ class BarcodeReader:
 
     def convert_from_tiff_to_pdf(self):
         """
-        May convert a TIFF image into a PDF, if the input is a TIFF
+        May convert a TIFF image into a PDF, if the input is a TIFF and
+        the TIFF has not been made into a PDF
         """
         # Nothing to do, pdf_file is already assigned correctly
-        if self.mime != "image/tiff":
+        if self.mime != "image/tiff" or self._tiff_conversion_done:
             return
 
+        self._tiff_conversion_done = True
         self.pdf_file = convert_from_tiff_to_pdf(self.file, Path(self.temp_dir.name))
 
     def detect(self) -> None:
@@ -167,6 +172,9 @@ class BarcodeReader:
         if self.barcodes:
             return
 
+        # No op if not a TIFF
+        self.convert_from_tiff_to_pdf()
+
         # Choose the library for reading
         if settings.CONSUMER_BARCODE_SCANNER == "PYZBAR":
             reader = self.read_barcodes_pyzbar
@@ -240,7 +248,7 @@ class BarcodeReader:
         """
 
         document_paths = []
-        fname = self.file.with_suffix("").name
+        fname = self.file.stem
         with Pdf.open(self.pdf_file) as input_pdf:
             # Start with an empty document
             current_document: list[Page] = []
@@ -290,7 +298,7 @@ class BarcodeReader:
     def separate(
         self,
         source: DocumentSource,
-        override_name: Optional[str] = None,
+        overrides: DocumentMetadataOverrides,
     ) -> bool:
         """
         Separates the document, based on barcodes and configuration, creating new
@@ -316,27 +324,23 @@ class BarcodeReader:
             logger.warning("No pages to split on!")
             return False
 
-        # Create the split documents
-        doc_paths = self.separate_pages(separator_pages)
+        tmp_dir = Path(tempfile.mkdtemp(prefix="paperless-barcode-split-")).resolve()
 
-        # Save the new documents to correct folder
-        if source != DocumentSource.ConsumeFolder:
-            # The given file is somewhere in SCRATCH_DIR,
-            # and new documents must be moved to the CONSUMPTION_DIR
-            # for the consumer to notice them
-            save_to_dir = settings.CONSUMPTION_DIR
-        else:
-            # The given file is somewhere in CONSUMPTION_DIR,
-            # and may be some levels down for recursive tagging
-            # so use the file's parent to preserve any metadata
-            save_to_dir = self.file.parent
-
-        for idx, document_path in enumerate(doc_paths):
-            if override_name is not None:
-                newname = f"{idx}_{override_name}"
-                dest = save_to_dir / newname
-            else:
-                dest = save_to_dir
-            logger.info(f"Saving {document_path} to {dest}")
-            copy_file_with_basic_stats(document_path, dest)
+        from documents import tasks
+
+        # Create the split document tasks
+        for new_document in self.separate_pages(separator_pages):
+            copy_file_with_basic_stats(new_document, tmp_dir / new_document.name)
+
+            tasks.consume_file.delay(
+                ConsumableDocument(
+                    # Same source, for templates
+                    source=source,
+                    # Can't use same folder or the consume might grab it again
+                    original_file=(tmp_dir / new_document.name).resolve(),
+                ),
+                # All the same metadata
+                overrides,
+            )
+        logger.info("Barcode splitting complete!")
         return True
index 10a44a8fe10d6111933e656e7683f4d5ffb4ea56..d0728a719b6da8e421750552b9f22d8d1848ca66 100644 (file)
@@ -140,7 +140,7 @@ def consume_file(
         with BarcodeReader(input_doc.original_file, input_doc.mime_type) as reader:
             if settings.CONSUMER_ENABLE_BARCODES and reader.separate(
                 input_doc.source,
-                overrides.filename,
+                overrides,
             ):
                 # notify the sender, otherwise the progress bar
                 # in the UI stays stuck
index 70f7807cc81c25e2dea563c822af5afe00ae0e66..e4d8ccc57aaeb6f6cd0f06324207397609b66a93 100644 (file)
@@ -1,5 +1,4 @@
 import shutil
-from pathlib import Path
 from unittest import mock
 
 import pytest
@@ -11,10 +10,13 @@ from documents import tasks
 from documents.barcodes import BarcodeReader
 from documents.consumer import ConsumerError
 from documents.data_models import ConsumableDocument
+from documents.data_models import DocumentMetadataOverrides
 from documents.data_models import DocumentSource
 from documents.models import Document
 from documents.tests.utils import DirectoriesMixin
+from documents.tests.utils import DocumentConsumeDelayMixin
 from documents.tests.utils import FileSystemAssertsMixin
+from documents.tests.utils import SampleDirMixin
 
 try:
     import zxingcpp  # noqa: F401
@@ -25,11 +27,7 @@ except ImportError:
 
 
 @override_settings(CONSUMER_BARCODE_SCANNER="PYZBAR")
-class TestBarcode(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
-    SAMPLE_DIR = Path(__file__).parent / "samples"
-
-    BARCODE_SAMPLE_DIR = SAMPLE_DIR / "barcodes"
-
+class TestBarcode(DirectoriesMixin, FileSystemAssertsMixin, SampleDirMixin, TestCase):
     def test_scan_file_for_separating_barcodes(self):
         """
         GIVEN:
@@ -48,6 +46,46 @@ class TestBarcode(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
             self.assertEqual(reader.pdf_file, test_file)
             self.assertDictEqual(separator_page_numbers, {0: False})
 
+    @override_settings(
+        CONSUMER_BARCODE_TIFF_SUPPORT=True,
+    )
+    def test_scan_tiff_for_separating_barcodes(self):
+        """
+        GIVEN:
+            - TIFF image containing barcodes
+        WHEN:
+            - Consume task returns
+        THEN:
+            - The file was split
+        """
+        test_file = self.BARCODE_SAMPLE_DIR / "patch-code-t-middle.tiff"
+
+        with BarcodeReader(test_file, "image/tiff") as reader:
+            reader.detect()
+            separator_page_numbers = reader.get_separation_pages()
+
+            self.assertDictEqual(separator_page_numbers, {1: False})
+
+    @override_settings(
+        CONSUMER_BARCODE_TIFF_SUPPORT=True,
+    )
+    def test_scan_tiff_with_alpha_for_separating_barcodes(self):
+        """
+        GIVEN:
+            - TIFF image containing barcodes
+        WHEN:
+            - Consume task returns
+        THEN:
+            - The file was split
+        """
+        test_file = self.BARCODE_SAMPLE_DIR / "patch-code-t-middle-alpha.tiff"
+
+        with BarcodeReader(test_file, "image/tiff") as reader:
+            reader.detect()
+            separator_page_numbers = reader.get_separation_pages()
+
+            self.assertDictEqual(separator_page_numbers, {1: False})
+
     def test_scan_file_for_separating_barcodes_none_present(self):
         """
         GIVEN:
@@ -285,6 +323,28 @@ class TestBarcode(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
             self.assertGreater(len(reader.barcodes), 0)
             self.assertDictEqual(separator_page_numbers, {1: False})
 
+    def test_scan_file_for_separating_barcodes_password(self):
+        """
+        GIVEN:
+            - Password protected PDF
+        WHEN:
+            - File is scanned for barcode
+        THEN:
+            - Scanning handles the exception without crashing
+        """
+        test_file = self.SAMPLE_DIR / "password-is-test.pdf"
+        with self.assertLogs("paperless.barcodes", level="WARNING") as cm:
+            with BarcodeReader(test_file, "application/pdf") as reader:
+                reader.detect()
+                warning = cm.output[0]
+                expected_str = "WARNING:paperless.barcodes:File is likely password protected, not checking for barcodes"
+                self.assertTrue(warning.startswith(expected_str))
+
+                separator_page_numbers = reader.get_separation_pages()
+
+                self.assertEqual(reader.pdf_file, test_file)
+                self.assertDictEqual(separator_page_numbers, {})
+
     def test_separate_pages(self):
         """
         GIVEN:
@@ -332,8 +392,12 @@ class TestBarcode(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
 
         with self.assertLogs("paperless.barcodes", level="WARNING") as cm:
             with BarcodeReader(test_file, "application/pdf") as reader:
-                success = reader.separate(DocumentSource.ApiUpload)
-                self.assertFalse(success)
+                self.assertFalse(
+                    reader.separate(
+                        DocumentSource.ApiUpload,
+                        DocumentMetadataOverrides(),
+                    ),
+                )
                 self.assertEqual(
                     cm.output,
                     [
@@ -341,215 +405,6 @@ class TestBarcode(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
                     ],
                 )
 
-    def test_save_to_dir_given_name(self):
-        """
-        GIVEN:
-            - File to save to a directory
-            - There is a name override
-        WHEN:
-            - The file is saved
-        THEN:
-            - The file exists
-        """
-        test_file = self.BARCODE_SAMPLE_DIR / "patch-code-t-middle.pdf"
-        with BarcodeReader(test_file, "application/pdf") as reader:
-            reader.separate(DocumentSource.ApiUpload, "newname.pdf")
-
-            self.assertEqual(reader.pdf_file, test_file)
-            target_file1 = settings.CONSUMPTION_DIR / "0_newname.pdf"
-            target_file2 = settings.CONSUMPTION_DIR / "1_newname.pdf"
-            self.assertIsFile(target_file1)
-            self.assertIsFile(target_file2)
-
-    def test_barcode_splitter_api_upload(self):
-        """
-        GIVEN:
-            - Input file containing barcodes
-        WHEN:
-            - Input file is split on barcodes
-        THEN:
-            - Correct number of files produced
-        """
-        sample_file = self.BARCODE_SAMPLE_DIR / "patch-code-t-middle.pdf"
-        test_file = settings.SCRATCH_DIR / "patch-code-t-middle.pdf"
-        shutil.copy(sample_file, test_file)
-
-        with BarcodeReader(test_file, "application/pdf") as reader:
-            reader.separate(DocumentSource.ApiUpload)
-
-            self.assertEqual(reader.pdf_file, test_file)
-
-            target_file1 = (
-                settings.CONSUMPTION_DIR / "patch-code-t-middle_document_0.pdf"
-            )
-
-            target_file2 = (
-                settings.CONSUMPTION_DIR / "patch-code-t-middle_document_1.pdf"
-            )
-
-            self.assertIsFile(target_file1)
-            self.assertIsFile(target_file2)
-
-    def test_barcode_splitter_consume_dir(self):
-        """
-        GIVEN:
-            - Input file containing barcodes
-        WHEN:
-            - Input file is split on barcodes
-        THEN:
-            - Correct number of files produced
-        """
-        sample_file = self.BARCODE_SAMPLE_DIR / "patch-code-t-middle.pdf"
-        test_file = settings.CONSUMPTION_DIR / "patch-code-t-middle.pdf"
-        shutil.copy(sample_file, test_file)
-
-        with BarcodeReader(test_file, "application/pdf") as reader:
-            reader.detect()
-            reader.separate(DocumentSource.ConsumeFolder)
-
-            self.assertEqual(reader.pdf_file, test_file)
-
-            target_file1 = (
-                settings.CONSUMPTION_DIR / "patch-code-t-middle_document_0.pdf"
-            )
-
-            target_file2 = (
-                settings.CONSUMPTION_DIR / "patch-code-t-middle_document_1.pdf"
-            )
-
-            self.assertIsFile(target_file1)
-            self.assertIsFile(target_file2)
-
-    def test_barcode_splitter_consume_dir_recursive(self):
-        """
-        GIVEN:
-            - Input file containing barcodes
-            - Input file is within a directory structure of the consume folder
-        WHEN:
-            - Input file is split on barcodes
-        THEN:
-            - Correct number of files produced
-            - Output files are within the same directory structure
-        """
-        sample_file = self.BARCODE_SAMPLE_DIR / "patch-code-t-middle.pdf"
-        test_file = (
-            settings.CONSUMPTION_DIR / "tag1" / "tag2" / "patch-code-t-middle.pdf"
-        )
-        test_file.parent.mkdir(parents=True)
-        shutil.copy(sample_file, test_file)
-
-        with BarcodeReader(test_file, "application/pdf") as reader:
-            reader.separate(DocumentSource.ConsumeFolder)
-
-            self.assertEqual(reader.pdf_file, test_file)
-
-            target_file1 = (
-                settings.CONSUMPTION_DIR
-                / "tag1"
-                / "tag2"
-                / "patch-code-t-middle_document_0.pdf"
-            )
-
-            target_file2 = (
-                settings.CONSUMPTION_DIR
-                / "tag1"
-                / "tag2"
-                / "patch-code-t-middle_document_1.pdf"
-            )
-
-            self.assertIsFile(target_file1)
-            self.assertIsFile(target_file2)
-
-    @override_settings(CONSUMER_ENABLE_BARCODES=True)
-    def test_consume_barcode_file(self):
-        """
-        GIVEN:
-            - Input file with barcodes given to consume task
-        WHEN:
-            - Consume task returns
-        THEN:
-            - The file was split
-        """
-        test_file = self.BARCODE_SAMPLE_DIR / "patch-code-t-middle.pdf"
-
-        dst = settings.SCRATCH_DIR / "patch-code-t-middle.pdf"
-        shutil.copy(test_file, dst)
-
-        with mock.patch("documents.tasks.async_to_sync"):
-            self.assertEqual(
-                tasks.consume_file(
-                    ConsumableDocument(
-                        source=DocumentSource.ConsumeFolder,
-                        original_file=dst,
-                    ),
-                    None,
-                ),
-                "File successfully split",
-            )
-
-    @override_settings(
-        CONSUMER_ENABLE_BARCODES=True,
-        CONSUMER_BARCODE_TIFF_SUPPORT=True,
-    )
-    def test_consume_barcode_tiff_file(self):
-        """
-        GIVEN:
-            - TIFF image containing barcodes
-        WHEN:
-            - Consume task returns
-        THEN:
-            - The file was split
-        """
-        test_file = self.BARCODE_SAMPLE_DIR / "patch-code-t-middle.tiff"
-
-        dst = settings.SCRATCH_DIR / "patch-code-t-middle.tiff"
-        shutil.copy(test_file, dst)
-
-        with mock.patch("documents.tasks.async_to_sync"):
-            self.assertEqual(
-                tasks.consume_file(
-                    ConsumableDocument(
-                        source=DocumentSource.ConsumeFolder,
-                        original_file=dst,
-                    ),
-                    None,
-                ),
-                "File successfully split",
-            )
-        self.assertIsNotFile(dst)
-
-    @override_settings(
-        CONSUMER_ENABLE_BARCODES=True,
-        CONSUMER_BARCODE_TIFF_SUPPORT=True,
-    )
-    def test_consume_barcode_tiff_file_with_alpha(self):
-        """
-        GIVEN:
-            - TIFF image containing barcodes
-            - TIFF image has an alpha layer
-        WHEN:
-            - Consume task handles the alpha layer and returns
-        THEN:
-            - The file was split without issue
-        """
-        test_file = self.BARCODE_SAMPLE_DIR / "patch-code-t-middle-alpha.tiff"
-
-        dst = settings.SCRATCH_DIR / "patch-code-t-middle.tiff"
-        shutil.copy(test_file, dst)
-
-        with mock.patch("documents.tasks.async_to_sync"):
-            self.assertEqual(
-                tasks.consume_file(
-                    ConsumableDocument(
-                        source=DocumentSource.ConsumeFolder,
-                        original_file=dst,
-                    ),
-                    None,
-                ),
-                "File successfully split",
-            )
-        self.assertIsNotFile(dst)
-
     @override_settings(
         CONSUMER_ENABLE_BARCODES=True,
         CONSUMER_BARCODE_TIFF_SUPPORT=True,
@@ -597,60 +452,6 @@ class TestBarcode(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
         self.assertIsNone(kwargs["override_document_type_id"])
         self.assertIsNone(kwargs["override_tag_ids"])
 
-    @override_settings(
-        CONSUMER_ENABLE_BARCODES=True,
-        CONSUMER_BARCODE_TIFF_SUPPORT=True,
-    )
-    def test_consume_barcode_supported_no_extension_file(self):
-        """
-        GIVEN:
-            - TIFF image containing barcodes
-            - TIFF file is given without extension
-        WHEN:
-            - Consume task returns
-        THEN:
-            - The file was split
-        """
-        test_file = self.BARCODE_SAMPLE_DIR / "patch-code-t-middle.tiff"
-
-        dst = settings.SCRATCH_DIR / "patch-code-t-middle"
-        shutil.copy(test_file, dst)
-
-        with mock.patch("documents.tasks.async_to_sync"):
-            self.assertEqual(
-                tasks.consume_file(
-                    ConsumableDocument(
-                        source=DocumentSource.ConsumeFolder,
-                        original_file=dst,
-                    ),
-                    None,
-                ),
-                "File successfully split",
-            )
-        self.assertIsNotFile(dst)
-
-    def test_scan_file_for_separating_barcodes_password(self):
-        """
-        GIVEN:
-            - Password protected PDF
-        WHEN:
-            - File is scanned for barcode
-        THEN:
-            - Scanning handles the exception without crashing
-        """
-        test_file = self.SAMPLE_DIR / "password-is-test.pdf"
-        with self.assertLogs("paperless.barcodes", level="WARNING") as cm:
-            with BarcodeReader(test_file, "application/pdf") as reader:
-                reader.detect()
-                warning = cm.output[0]
-                expected_str = "WARNING:paperless.barcodes:File is likely password protected, not checking for barcodes"
-                self.assertTrue(warning.startswith(expected_str))
-
-                separator_page_numbers = reader.get_separation_pages()
-
-                self.assertEqual(reader.pdf_file, test_file)
-                self.assertDictEqual(separator_page_numbers, {})
-
     @override_settings(
         CONSUMER_ENABLE_BARCODES=True,
         CONSUMER_ENABLE_ASN_BARCODE=True,
@@ -722,11 +523,64 @@ class TestBarcode(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
             self.assertEqual(len(document_list), 5)
 
 
-class TestAsnBarcode(DirectoriesMixin, TestCase):
-    SAMPLE_DIR = Path(__file__).parent / "samples"
+@override_settings(CONSUMER_BARCODE_SCANNER="PYZBAR")
+class TestBarcodeNewConsume(
+    DirectoriesMixin,
+    FileSystemAssertsMixin,
+    SampleDirMixin,
+    DocumentConsumeDelayMixin,
+    TestCase,
+):
+    @override_settings(CONSUMER_ENABLE_BARCODES=True)
+    def test_consume_barcode_file(self):
+        """
+        GIVEN:
+            - Incoming file with at 1 barcode producing 2 documents
+            - Document includes metadata override information
+        WHEN:
+            - The document is split
+        THEN:
+            - Two new consume tasks are created
+            - Metadata overrides are preserved for the new consume
+            - The document source is unchanged (for consume templates)
+        """
+        test_file = self.BARCODE_SAMPLE_DIR / "patch-code-t-middle.pdf"
+        temp_copy = self.dirs.scratch_dir / test_file.name
+        shutil.copy(test_file, temp_copy)
 
-    BARCODE_SAMPLE_DIR = SAMPLE_DIR / "barcodes"
+        overrides = DocumentMetadataOverrides(tag_ids=[1, 2, 9])
 
+        with mock.patch("documents.tasks.async_to_sync") as progress_mocker:
+            self.assertEqual(
+                tasks.consume_file(
+                    ConsumableDocument(
+                        source=DocumentSource.ConsumeFolder,
+                        original_file=temp_copy,
+                    ),
+                    overrides,
+                ),
+                "File successfully split",
+            )
+            # We let the consumer know progress is done
+            progress_mocker.assert_called_once()
+            # 2 new document consume tasks created
+            self.assertEqual(self.consume_file_mock.call_count, 2)
+
+            self.assertIsNotFile(temp_copy)
+
+            # Check the split files exist
+            # Check the source is unchanged
+            # Check the overrides are unchanged
+            for (
+                new_input_doc,
+                new_doc_overrides,
+            ) in self.get_all_consume_delay_call_args():
+                self.assertEqual(new_input_doc.source, DocumentSource.ConsumeFolder)
+                self.assertIsFile(new_input_doc.original_file)
+                self.assertEqual(overrides, new_doc_overrides)
+
+
+class TestAsnBarcode(DirectoriesMixin, SampleDirMixin, TestCase):
     @override_settings(CONSUMER_ASN_BARCODE_PREFIX="CUSTOM-PREFIX-")
     def test_scan_file_for_asn_custom_prefix(self):
         """
index b1e9c0523d62afb1d3cf36db2942dd31c1d609f1..fe7dbb0599a58bfb2dbce24513cf16e09bd247cb 100644 (file)
@@ -235,8 +235,10 @@ class DocumentConsumeDelayMixin:
         """
         Iterates over all calls to the async task and returns the arguments
         """
+        # Must be at least 1 call
+        self.consume_file_mock.assert_called()
 
-        for args, _ in self.consume_file_mock.call_args_list:
+        for args, kwargs in self.consume_file_mock.call_args_list:
             input_doc, overrides = args
 
             yield (input_doc, overrides)
@@ -244,7 +246,7 @@ class DocumentConsumeDelayMixin:
     def get_specific_consume_delay_call_args(
         self,
         index: int,
-    ) -> Iterator[tuple[ConsumableDocument, DocumentMetadataOverrides]]:
+    ) -> tuple[ConsumableDocument, DocumentMetadataOverrides]:
         """
         Returns the arguments of a specific call to the async task
         """
@@ -299,3 +301,9 @@ class TestMigrations(TransactionTestCase):
 
     def setUpBeforeMigration(self, apps):
         pass
+
+
+class SampleDirMixin:
+    SAMPLE_DIR = Path(__file__).parent / "samples"
+
+    BARCODE_SAMPLE_DIR = SAMPLE_DIR / "barcodes"