Updates ignore path filtering so files in a folder in an ignored folder will be ignor...

author Trenton H <797416+stumpylog@users.noreply.github.com>

Tue, 14 Feb 2023 20:54:03 +0000 (12:54 -0800)

committer Trenton H <797416+stumpylog@users.noreply.github.com>

Thu, 16 Feb 2023 17:05:11 +0000 (09:05 -0800)
author Trenton H <797416+stumpylog@users.noreply.github.com>
Tue, 14 Feb 2023 20:54:03 +0000 (12:54 -0800)
committer Trenton H <797416+stumpylog@users.noreply.github.com>
Thu, 16 Feb 2023 17:05:11 +0000 (09:05 -0800)
diff --git a/docs/configuration.md b/docs/configuration.md

index 27b3f1b28787af576aecf7f5c75002fac91deb62..aeea1c7a439234aac3c2169ebc2c75d3c0c07e1f 100644 (file)
--- a/docs/configuration.md
+++ b/docs/configuration.md
@@ -999,13 +999,20 @@ within your documents.
  `PAPERLESS_CONSUMER_IGNORE_PATTERNS=<json>`
  
  : By default, paperless ignores certain files and folders in the
-consumption directory, such as system files created by the Mac OS.
+consumption directory, such as system files created by macOS
+or hidden folders some tools use to store data.
  
      This can be adjusted by configuring a custom json array with
      patterns to exclude.
  
+    For example, `.DS_STORE/*` will ignore any files found in a folder
+    named `.DS_STORE`, including `.DS_STORE/bar.pdf` and `foo/.DS_STORE/bar.pdf`.
+
+    A pattern like `._*` will ignore anything starting with `._`, including
+    `._foo.pdf` and `._bar/foo.pdf`.
+
      Defaults to
-    `[".DS_STORE/*", "._*", ".stfolder/*", ".stversions/*", ".localized/*", "desktop.ini"]`.
+    `[".DS_STORE/*", "._*", ".stfolder/*", ".stversions/*", ".localized/*", "desktop.ini", "@eaDir/*"]`.
  
  ## Binaries
  
diff --git a/src/documents/management/commands/document_consumer.py b/src/documents/management/commands/document_consumer.py

index 9107d574a63b729b0e632cecdfabee5254085421..c3f6bbed4871d5c9031b6d1f4e117e647a0d972e 100644 (file)
--- a/src/documents/management/commands/document_consumer.py
+++ b/src/documents/management/commands/document_consumer.py
@@ -1,5 +1,6 @@
  import logging
  import os
+from fnmatch import filter
  from pathlib import Path
  from pathlib import PurePath
  from threading import Event
@@ -7,6 +8,7 @@ from threading import Thread
  from time import monotonic
  from time import sleep
  from typing import Final
+from typing import Set
  
  from django.conf import settings
  from django.core.management.base import BaseCommand
@@ -25,15 +27,15 @@ except ImportError:  # pragma: nocover
  logger = logging.getLogger("paperless.management.consumer")
  
  
-def _tags_from_path(filepath):
-    """Walk up the directory tree from filepath to CONSUMPTION_DIR
+def _tags_from_path(filepath) -> Set[Tag]:
+    """
+    Walk up the directory tree from filepath to CONSUMPTION_DIR
      and get or create Tag IDs for every directory.
+
+    Returns set of Tag models
      """
-    normalized_consumption_dir = os.path.abspath(
-        os.path.normpath(settings.CONSUMPTION_DIR),
-    )
      tag_ids = set()
-    path_parts = Path(filepath).relative_to(normalized_consumption_dir).parent.parts
+    path_parts = Path(filepath).relative_to(settings.CONSUMPTION_DIR).parent.parts
      for part in path_parts:
          tag_ids.add(
              Tag.objects.get_or_create(name__iexact=part, defaults={"name": part})[0].pk,
@@ -43,14 +45,41 @@ def _tags_from_path(filepath):
  
  
  def _is_ignored(filepath: str) -> bool:
-    normalized_consumption_dir = os.path.abspath(
-        os.path.normpath(settings.CONSUMPTION_DIR),
+    """
+    Checks if the given file should be ignored, based on configured
+    patterns.
+
+    Returns True if the file is ignored, False otherwise
+    """
+    filepath = os.path.abspath(
+        os.path.normpath(filepath),
      )
-    filepath_relative = PurePath(filepath).relative_to(normalized_consumption_dir)
-    return any(filepath_relative.match(p) for p in settings.CONSUMER_IGNORE_PATTERNS)
  
+    # Trim out the consume directory, leaving only the filename and its
+    # path relative to the consume directory
+    filepath_relative = PurePath(filepath).relative_to(settings.CONSUMPTION_DIR)
  
-def _consume(filepath):
+    # March through the components of the path, including directories and the filename
+    # looking for anything matching
+    # foo/bar/baz/file.pdf -> (foo, bar, baz, file.pdf)
+    parts = []
+    for part in filepath_relative.parts:
+        # If the part is not the name (ie, it's a dir)
+        # Need to append the trailing slash or fnmatch doesn't match
+        # fnmatch("dir", "dir/*") == False
+        # fnmatch("dir/", "dir/*") == True
+        if part != filepath_relative.name:
+            part = part + "/"
+        parts.append(part)
+
+    for pattern in settings.CONSUMER_IGNORE_PATTERNS:
+        if len(filter(parts, pattern)):
+            return True
+
+    return False
+
+
+def _consume(filepath: str) -> None:
      if os.path.isdir(filepath) or _is_ignored(filepath):
          return
  
@@ -103,7 +132,13 @@ def _consume(filepath):
          logger.exception("Error while consuming document")
  
  
-def _consume_wait_unmodified(file):
+def _consume_wait_unmodified(file: str) -> None:
+    """
+    Waits for the given file to appear unmodified based on file size
+    and modification time.  Will wait a configured number of seconds
+    and retry a configured number of times before either consuming or
+    giving up
+    """
      if _is_ignored(file):
          return
  
diff --git a/src/documents/tests/test_management_consumer.py b/src/documents/tests/test_management_consumer.py

index 822a7ed074c19ec1af332468912993e216fc0198..3db8de0343e5686a018ac3ed68792c77033b25b7 100644 (file)
--- a/src/documents/tests/test_management_consumer.py
+++ b/src/documents/tests/test_management_consumer.py
@@ -247,22 +247,85 @@ class TestConsumer(DirectoriesMixin, ConsumerMixin, TransactionTestCase):
  
      def test_is_ignored(self):
          test_paths = [
-            (os.path.join(self.dirs.consumption_dir, "foo.pdf"), False),
-            (os.path.join(self.dirs.consumption_dir, "foo", "bar.pdf"), False),
-            (os.path.join(self.dirs.consumption_dir, ".DS_STORE", "foo.pdf"), True),
-            (
-                os.path.join(self.dirs.consumption_dir, "foo", ".DS_STORE", "bar.pdf"),
-                True,
-            ),
-            (os.path.join(self.dirs.consumption_dir, ".stfolder", "foo.pdf"), True),
-            (os.path.join(self.dirs.consumption_dir, "._foo.pdf"), True),
-            (os.path.join(self.dirs.consumption_dir, "._foo", "bar.pdf"), False),
+            {
+                "path": os.path.join(self.dirs.consumption_dir, "foo.pdf"),
+                "ignore": False,
+            },
+            {
+                "path": os.path.join(self.dirs.consumption_dir, "foo", "bar.pdf"),
+                "ignore": False,
+            },
+            {
+                "path": os.path.join(self.dirs.consumption_dir, ".DS_STORE", "foo.pdf"),
+                "ignore": True,
+            },
+            {
+                "path": os.path.join(
+                    self.dirs.consumption_dir,
+                    "foo",
+                    ".DS_STORE",
+                    "bar.pdf",
+                ),
+                "ignore": True,
+            },
+            {
+                "path": os.path.join(
+                    self.dirs.consumption_dir,
+                    ".DS_STORE",
+                    "foo",
+                    "bar.pdf",
+                ),
+                "ignore": True,
+            },
+            {
+                "path": os.path.join(self.dirs.consumption_dir, ".stfolder", "foo.pdf"),
+                "ignore": True,
+            },
+            {
+                "path": os.path.join(self.dirs.consumption_dir, ".stfolder.pdf"),
+                "ignore": False,
+            },
+            {
+                "path": os.path.join(
+                    self.dirs.consumption_dir,
+                    ".stversions",
+                    "foo.pdf",
+                ),
+                "ignore": True,
+            },
+            {
+                "path": os.path.join(self.dirs.consumption_dir, ".stversions.pdf"),
+                "ignore": False,
+            },
+            {
+                "path": os.path.join(self.dirs.consumption_dir, "._foo.pdf"),
+                "ignore": True,
+            },
+            {
+                "path": os.path.join(self.dirs.consumption_dir, "my_foo.pdf"),
+                "ignore": False,
+            },
+            {
+                "path": os.path.join(self.dirs.consumption_dir, "._foo", "bar.pdf"),
+                "ignore": True,
+            },
+            {
+                "path": os.path.join(
+                    self.dirs.consumption_dir,
+                    "@eaDir",
+                    "SYNO@.fileindexdb",
+                    "_1jk.fnm",
+                ),
+                "ignore": True,
+            },
          ]
-        for file_path, expected_ignored in test_paths:
+        for test_setup in test_paths:
+            filepath = test_setup["path"]
+            expected_ignored_result = test_setup["ignore"]
              self.assertEqual(
-                expected_ignored,
-                document_consumer._is_ignored(file_path),
-                f'_is_ignored("{file_path}") != {expected_ignored}',
+                expected_ignored_result,
+                document_consumer._is_ignored(filepath),
+                f'_is_ignored("{filepath}") != {expected_ignored_result}',
              )
  
      @mock.patch("documents.management.commands.document_consumer.open")
diff --git a/src/paperless/settings.py b/src/paperless/settings.py

index 5cac5e621769a97b58bbc384e00f4ba5da926c97..409579bda99441d234e969368048b76b7662c212 100644 (file)
--- a/src/paperless/settings.py
+++ b/src/paperless/settings.py
@@ -673,7 +673,7 @@ CONSUMER_IGNORE_PATTERNS = list(
      json.loads(
          os.getenv(
              "PAPERLESS_CONSUMER_IGNORE_PATTERNS",
-            '[".DS_STORE/*", "._*", ".stfolder/*", ".stversions/*", ".localized/*", "desktop.ini"]',  # noqa: E501
+            '[".DS_STORE/*", "._*", ".stfolder/*", ".stversions/*", ".localized/*", "desktop.ini", "@eaDir/*"]',  # noqa: E501
          ),
      ),
  )
author	Trenton H <797416+stumpylog@users.noreply.github.com>
	Tue, 14 Feb 2023 20:54:03 +0000 (12:54 -0800)
committer	Trenton H <797416+stumpylog@users.noreply.github.com>
	Thu, 16 Feb 2023 17:05:11 +0000 (09:05 -0800)
docs/configuration.md		patch \| blob \| blame \| history
src/documents/management/commands/document_consumer.py		patch \| blob \| blame \| history
src/documents/tests/test_management_consumer.py		patch \| blob \| blame \| history
src/paperless/settings.py		patch \| blob \| blame \| history