]> git.ipfire.org Git - thirdparty/paperless-ngx.git/commitdiff
Feature: Allow tagging by putting barcode stickers on documents (#5580)
authorpkrahmer <5699756+pkrahmer@users.noreply.github.com>
Mon, 5 Feb 2024 17:38:19 +0000 (18:38 +0100)
committerGitHub <noreply@github.com>
Mon, 5 Feb 2024 17:38:19 +0000 (17:38 +0000)
docs/configuration.md
paperless.conf.example
src/documents/barcodes.py
src/documents/tests/test_barcodes.py
src/paperless/settings.py

index f473921cb9351575ce394894861661cdd7f05aee..e99e0a0857c3d35840399ceee6ccd2b780e42cb9 100644 (file)
@@ -1173,6 +1173,55 @@ combination with PAPERLESS_CONSUMER_BARCODE_UPSCALE bigger than 1.0.
 
     Defaults to "300"
 
+#### [`PAPERLESS_CONSUMER_ENABLE_TAG_BARCODE=<bool>`](#PAPERLESS_CONSUMER_ENABLE_TAG_BARCODE) {#PAPERLESS_CONSUMER_ENABLE_TAG_BARCODE}
+
+: Enables the detection of barcodes in the scanned document and
+assigns or creates tags if a properly formatted barcode is detected.
+
+    The barcode must match one of the (configurable) regular expressions.
+    If the barcode text contains ',' (comma), it is split into multiple
+    barcodes which are individually processed for tagging.
+
+    Matching is case insensitive.
+
+    Defaults to false.
+
+#### [`PAPERLESS_CONSUMER_TAG_BARCODE_MAPPING=<json dict>`](#PAPERLESS_CONSUMER_TAG_BARCODE_MAPPING) {#PAPERLESS_CONSUMER_TAG_BARCODE_MAPPING}
+
+: Defines a dictionary of filter regex and substitute expressions.
+
+    Syntax: {"<regex>": "<substitute>" [,...]}
+
+    A barcode is considered for tagging if the barcode text matches
+    at least one of the provided <regex> patterns.
+
+    If a match is found, the <substitute> rule is applied. This allows very
+    versatile reformatting and mapping of barcode patterns to tag values.
+
+    If a tag is not found it will be created.
+
+    Defaults to:
+
+    {"TAG:(.*)": "\\g<1>"} which defines
+    - a regex TAG:(.*) which includes barcodes beginning with TAG:
+      followed by any text that gets stored into match group #1 and
+    - a substitute \\g<1> that replaces the original barcode text
+      by the content in match group #1.
+    Consequently, the tag is the barcode text without its TAG: prefix.
+
+    More examples:
+
+    {"ASN12.*": "JOHN", "ASN13.*": "SMITH"} for example maps
+    - ASN12nnnn barcodes to the tag JOHN and
+    - ASN13nnnn barcodes to the tag SMITH.
+
+    {"T-J": "JOHN", "T-S": "SMITH", "T-D": "DOE"} directly maps
+    - T-J barcodes to the tag JOHN,
+    - T-S barcodes to the tag SMITH and
+    - T-D barcodes to the tag DOE.
+
+    Please refer to the Python regex documentation for more information.
+
 ## Audit Trail
 
 #### [`PAPERLESS_AUDIT_LOG_ENABLED=<bool>`](#PAPERLESS_AUDIT_LOG_ENABLED) {#PAPERLESS_AUDIT_LOG_ENABLED}
index 1610dcda9552c5b3333387d94192e2b6d6b08396..db557a7b60ea8e9a4e06122bba3c1296402b6871 100644 (file)
@@ -68,6 +68,8 @@
 #PAPERLESS_CONSUMER_BARCODE_STRING=PATCHT
 #PAPERLESS_CONSUMER_BARCODE_UPSCALE=0.0
 #PAPERLESS_CONSUMER_BARCODE_DPI=300
+#PAPERLESS_CONSUMER_ENABLE_TAG_BARCODE=false
+#PAPERLESS_CONSUMER_TAG_BARCODE_MAPPING={"TAG:(.*)": "\\g<1>"}
 #PAPERLESS_CONSUMER_ENABLE_COLLATE_DOUBLE_SIDED=false
 #PAPERLESS_CONSUMER_COLLATE_DOUBLE_SIDED_SUBDIR_NAME=double-sided
 #PAPERLESS_CONSUMER_COLLATE_DOUBLE_SIDED_TIFF_SUPPORT=false
index 606451f84396a3d052c4cc9d449b9bf8ccfefcf4..4bfb9b791f152752646ccc72b005d16e533a6bc8 100644 (file)
@@ -14,6 +14,7 @@ from PIL import Image
 
 from documents.converters import convert_from_tiff_to_pdf
 from documents.data_models import ConsumableDocument
+from documents.models import Tag
 from documents.plugins.base import ConsumeTaskPlugin
 from documents.plugins.base import StopConsumeTaskError
 from documents.plugins.helpers import ProgressStatusOptions
@@ -65,7 +66,9 @@ class BarcodePlugin(ConsumeTaskPlugin):
             supported_mimes = {"application/pdf"}
 
         return (
-            settings.CONSUMER_ENABLE_ASN_BARCODE or settings.CONSUMER_ENABLE_BARCODES
+            settings.CONSUMER_ENABLE_ASN_BARCODE
+            or settings.CONSUMER_ENABLE_BARCODES
+            or settings.CONSUMER_ENABLE_TAG_BARCODE
         ) and self.input_doc.mime_type in supported_mimes
 
     def setup(self):
@@ -90,6 +93,16 @@ class BarcodePlugin(ConsumeTaskPlugin):
             logger.info(f"Found ASN in barcode: {located_asn}")
             self.metadata.asn = located_asn
 
+        # try reading tags from barcodes
+        if settings.CONSUMER_ENABLE_TAG_BARCODE:
+            tags = self.tags
+            if tags is not None and len(tags) > 0:
+                if self.metadata.tag_ids:
+                    self.metadata.tag_ids += tags
+                else:
+                    self.metadata.tag_ids = tags
+                logger.info(f"Found tags in barcode: {tags}")
+
         separator_pages = self.get_separation_pages()
         if not separator_pages:
             return "No pages to split on!"
@@ -279,6 +292,53 @@ class BarcodePlugin(ConsumeTaskPlugin):
 
         return asn
 
+    @property
+    def tags(self) -> Optional[list[int]]:
+        """
+        Collect the ids of all tags referenced by the detected barcodes.
+
+        Each barcode value is split on "," and every piece is tested, in
+        iteration order, against the regexes in
+        settings.CONSUMER_TAG_BARCODE_MAPPING (case-insensitively, anchored
+        at the start of the piece because re.match is used). The first
+        matching regex wins: the piece is rewritten with its substitute via
+        re.sub, or used unchanged when the substitute is empty. The
+        resulting name is looked up case-insensitively and the tag is
+        created if it does not exist yet.
+
+        Any exception raised while handling a piece is logged and that piece
+        is skipped; it is not propagated to the caller.
+
+        Returns the detected tag ids in discovery order (or empty list).
+        Ids are not de-duplicated, and despite the Optional annotation this
+        property never returns None.
+        """
+        tags = []
+
+        # Ensure the barcodes have been read
+        self.detect()
+
+        for x in self.barcodes:
+            tag_texts = x.value
+
+            # A single barcode may carry several comma-separated tag texts
+            for raw in tag_texts.split(","):
+                try:
+                    tag = None
+                    for regex in settings.CONSUMER_TAG_BARCODE_MAPPING:
+                        if re.match(regex, raw, flags=re.IGNORECASE):
+                            sub = settings.CONSUMER_TAG_BARCODE_MAPPING[regex]
+                            # An empty substitute keeps the barcode text as-is
+                            tag = (
+                                re.sub(regex, sub, raw, flags=re.IGNORECASE)
+                                if sub
+                                else raw
+                            )
+                            # Only the first matching mapping entry is applied
+                            break
+
+                    if tag:
+                        # From here on "tag" is the Tag model instance rather
+                        # than the substituted name
+                        tag = Tag.objects.get_or_create(
+                            name__iexact=tag,
+                            defaults={"name": tag},
+                        )[0]
+
+                        logger.debug(
+                            f"Found Tag Barcode '{raw}', substituted "
+                            f"to '{tag}' and mapped to "
+                            f"tag #{tag.pk}.",
+                        )
+                        tags.append(tag.pk)
+
+                # Deliberately broad: a failing piece is logged and skipped
+                # instead of breaking the import
+                except Exception as e:
+                    logger.error(
+                        f"Failed to find or create TAG '{raw}' because: {e}",
+                    )
+
+        return tags
+
     def get_separation_pages(self) -> dict[int, bool]:
         """
         Search the parsed barcodes for separators and returns a dict of page
index 4552a2b77fb0a5004ca485f39acd782f06674283..3dd6d62ff366486cc79ff36a7e0f4ad46c8ce06d 100644 (file)
@@ -14,6 +14,7 @@ from documents.barcodes import BarcodePlugin
 from documents.data_models import ConsumableDocument
 from documents.data_models import DocumentMetadataOverrides
 from documents.data_models import DocumentSource
+from documents.models import Tag
 from documents.tests.utils import DirectoriesMixin
 from documents.tests.utils import DocumentConsumeDelayMixin
 from documents.tests.utils import DummyProgressManager
@@ -741,3 +742,125 @@ class TestBarcodeZxing(TestBarcode):
 @override_settings(CONSUMER_BARCODE_SCANNER="ZXING")
 class TestAsnBarcodesZxing(TestAsnBarcode):
     pass
+
+
+class TestTagBarcode(DirectoriesMixin, SampleDirMixin, GetReaderPluginMixin, TestCase):
+    """
+    Tests for assigning tags to a document from barcodes via BarcodePlugin.
+    """
+
+    @contextmanager
+    def get_reader(self, filepath: Path) -> BarcodePlugin:
+        """
+        Yield a BarcodePlugin for the given file on which setup() has been
+        called. cleanup() is called once the with-block finishes without
+        raising.
+        """
+        reader = BarcodePlugin(
+            ConsumableDocument(DocumentSource.ConsumeFolder, original_file=filepath),
+            DocumentMetadataOverrides(),
+            DummyProgressManager(filepath.name, None),
+            self.dirs.scratch_dir,
+            "task-id",
+        )
+        reader.setup()
+        yield reader
+        reader.cleanup()
+
+    @override_settings(CONSUMER_ENABLE_TAG_BARCODE=True)
+    def test_scan_file_without_matching_barcodes(self):
+        """
+        GIVEN:
+            - PDF containing barcodes but none with matching prefix (default "TAG:")
+        WHEN:
+            - File is scanned for barcodes
+        THEN:
+            - No TAG has been created
+        """
+        test_file = self.BARCODE_SAMPLE_DIR / "barcode-39-asn-custom-prefix.pdf"
+        with self.get_reader(test_file) as reader:
+            reader.run()
+            tags = reader.metadata.tag_ids
+            self.assertEqual(tags, None)
+
+    @override_settings(
+        CONSUMER_ENABLE_TAG_BARCODE=False,
+        CONSUMER_TAG_BARCODE_MAPPING={"CUSTOM-PREFIX-(.*)": "\\g<1>"},
+    )
+    def test_scan_file_with_matching_barcode_but_function_disabled(self):
+        """
+        GIVEN:
+            - PDF containing a tag barcode with matching custom prefix
+            - The tag barcode functionality is disabled
+        WHEN:
+            - File is scanned for barcodes
+        THEN:
+            - No TAG has been created
+        """
+        test_file = self.BARCODE_SAMPLE_DIR / "barcode-39-asn-custom-prefix.pdf"
+        with self.get_reader(test_file) as reader:
+            reader.run()
+            tags = reader.metadata.tag_ids
+            self.assertEqual(tags, None)
+
+    @override_settings(
+        CONSUMER_ENABLE_TAG_BARCODE=True,
+        CONSUMER_TAG_BARCODE_MAPPING={"CUSTOM-PREFIX-(.*)": "\\g<1>"},
+    )
+    def test_scan_file_for_tag_custom_prefix(self):
+        """
+        GIVEN:
+            - PDF containing a tag barcode with custom prefix
+            - The barcode mapping accepts this prefix and removes it from the mapped tag value
+            - The created tag is the non-prefixed value
+            - The metadata overrides already contain one tag id
+        WHEN:
+            - File is scanned for barcodes
+        THEN:
+            - The TAG is located
+            - One TAG has been created
+            - The new tag id is appended after the existing one
+        """
+        test_file = self.BARCODE_SAMPLE_DIR / "barcode-39-asn-custom-prefix.pdf"
+        with self.get_reader(test_file) as reader:
+            # Pre-populate the overrides to check that barcode tags are
+            # appended rather than replacing existing ids
+            reader.metadata.tag_ids = [99]
+            reader.run()
+            self.assertEqual(reader.pdf_file, test_file)
+            tags = reader.metadata.tag_ids
+            self.assertEqual(len(tags), 2)
+            self.assertEqual(tags[0], 99)
+            self.assertEqual(Tag.objects.get(name__iexact="00123").pk, tags[1])
+
+    @override_settings(
+        CONSUMER_ENABLE_TAG_BARCODE=True,
+        CONSUMER_TAG_BARCODE_MAPPING={"ASN(.*)": "\\g<1>"},
+    )
+    def test_scan_file_for_many_custom_tags(self):
+        """
+        GIVEN:
+            - PDF containing multiple tag barcodes with custom prefix
+            - The barcode mapping accepts this prefix and removes it from the mapped tag value
+            - The created tags are the non-prefixed values
+        WHEN:
+            - File is scanned for barcodes
+        THEN:
+            - The TAGs are located
+            - Five Tags have been created
+        """
+        test_file = self.BARCODE_SAMPLE_DIR / "split-by-asn-1.pdf"
+        with self.get_reader(test_file) as reader:
+            reader.run()
+            tags = reader.metadata.tag_ids
+            self.assertEqual(len(tags), 5)
+            self.assertEqual(Tag.objects.get(name__iexact="00123").pk, tags[0])
+            self.assertEqual(Tag.objects.get(name__iexact="00124").pk, tags[1])
+            self.assertEqual(Tag.objects.get(name__iexact="00125").pk, tags[2])
+            self.assertEqual(Tag.objects.get(name__iexact="00126").pk, tags[3])
+            self.assertEqual(Tag.objects.get(name__iexact="00127").pk, tags[4])
+
+    @override_settings(
+        CONSUMER_ENABLE_TAG_BARCODE=True,
+        CONSUMER_TAG_BARCODE_MAPPING={"CUSTOM-PREFIX-(.*)": "\\g<3>"},
+    )
+    def test_scan_file_for_tag_raises_value_error(self):
+        """
+        GIVEN:
+            - Any error occurs during tag barcode processing (here the
+              substitute refers to group 3, which the regex does not define)
+        THEN:
+            - The processing should be skipped and not break the import
+        """
+        test_file = self.BARCODE_SAMPLE_DIR / "barcode-39-asn-custom-prefix.pdf"
+        with self.get_reader(test_file) as reader:
+            reader.run()
+            # expect error to be caught and logged only
+            tags = reader.metadata.tag_ids
+            self.assertEqual(tags, None)
index 7179f0358511be9c13125fdf343992fd68b679b3..4f7894acc2ff18da0fb07865293b8d8b13142b6d 100644 (file)
@@ -853,6 +853,19 @@ CONSUMER_BARCODE_UPSCALE: Final[float] = __get_float(
 
 CONSUMER_BARCODE_DPI: Final[int] = __get_int("PAPERLESS_CONSUMER_BARCODE_DPI", 300)
 
+# Switch for assigning tags from barcodes, read as a boolean from the
+# environment (the documented default is false).
+CONSUMER_ENABLE_TAG_BARCODE: Final[bool] = __get_boolean(
+    "PAPERLESS_CONSUMER_ENABLE_TAG_BARCODE",
+)
+
+# Mapping of barcode regex -> substitute expression used to derive tag names,
+# parsed from a JSON object held in the environment variable. The default
+# accepts barcodes starting with "TAG:" and replaces the match with capture
+# group 1, i.e. the text after the prefix.
+# NOTE(review): json.loads runs when this module is imported, so a malformed
+# value presumably makes loading the settings fail - confirm this is intended.
+CONSUMER_TAG_BARCODE_MAPPING = dict(
+    json.loads(
+        os.getenv(
+            "PAPERLESS_CONSUMER_TAG_BARCODE_MAPPING",
+            '{"TAG:(.*)": "\\\\g<1>"}',
+        ),
+    ),
+)
+
 CONSUMER_ENABLE_COLLATE_DOUBLE_SIDED: Final[bool] = __get_boolean(
     "PAPERLESS_CONSUMER_ENABLE_COLLATE_DOUBLE_SIDED",
 )