`PAPERLESS_CONSUMER_IGNORE_PATTERNS=<json>`
: By default, paperless ignores certain files and folders in the
-consumption directory, such as system files created by the Mac OS.
+consumption directory, such as system files created by macOS
+or hidden folders that some tools use to store their data.
This can be adjusted by configuring a custom json array with
patterns to exclude.
+ For example, `.DS_STORE/*` will ignore any files found in a folder
+ named `.DS_STORE` at any depth, including `.DS_STORE/bar.pdf` and `foo/.DS_STORE/bar.pdf`.
+
+ A pattern like `._*` will ignore any file or folder whose name starts
+ with `._`, including `._foo.pdf` and `._bar/foo.pdf`.
+
Defaults to
- `[".DS_STORE/*", "._*", ".stfolder/*", ".stversions/*", ".localized/*", "desktop.ini"]`.
+ `[".DS_STORE/*", "._*", ".stfolder/*", ".stversions/*", ".localized/*", "desktop.ini", "@eaDir/*"]`.
## Binaries
import logging
import os
+from fnmatch import filter
from pathlib import Path
from pathlib import PurePath
from threading import Event
from time import monotonic
from time import sleep
from typing import Final
+from typing import Set
from django.conf import settings
from django.core.management.base import BaseCommand
logger = logging.getLogger("paperless.management.consumer")
-def _tags_from_path(filepath):
- """Walk up the directory tree from filepath to CONSUMPTION_DIR
+def _tags_from_path(filepath) -> Set[Tag]:
+ """
+ Walk up the directory tree from filepath to CONSUMPTION_DIR
and get or create Tag IDs for every directory.
+
+ Returns set of Tag models
"""
- normalized_consumption_dir = os.path.abspath(
- os.path.normpath(settings.CONSUMPTION_DIR),
- )
tag_ids = set()
- path_parts = Path(filepath).relative_to(normalized_consumption_dir).parent.parts
+ path_parts = Path(filepath).relative_to(settings.CONSUMPTION_DIR).parent.parts
for part in path_parts:
tag_ids.add(
Tag.objects.get_or_create(name__iexact=part, defaults={"name": part})[0].pk,
def _is_ignored(filepath: str) -> bool:
- normalized_consumption_dir = os.path.abspath(
- os.path.normpath(settings.CONSUMPTION_DIR),
+ """
+ Checks if the given file should be ignored, based on configured
+ patterns.
+
+ Returns True if the file is ignored, False otherwise
+ """
+ filepath = os.path.abspath(
+ os.path.normpath(filepath),
)
- filepath_relative = PurePath(filepath).relative_to(normalized_consumption_dir)
- return any(filepath_relative.match(p) for p in settings.CONSUMER_IGNORE_PATTERNS)
+ # Trim out the consume directory, leaving only the filename and its
+ # path relative to the consume directory
+ filepath_relative = PurePath(filepath).relative_to(settings.CONSUMPTION_DIR)
-def _consume(filepath):
+ # Check every component of the relative path (each directory plus the
+ # filename) against the ignore patterns, e.g.
+ # foo/bar/baz/file.pdf -> (foo, bar, baz, file.pdf)
+ parts = []
+ for part in filepath_relative.parts:
+ # If the part is not the name (i.e., it's a directory), append a
+ # trailing slash; otherwise fnmatch doesn't match patterns like "dir/*":
+ # fnmatch("dir", "dir/*") == False
+ # fnmatch("dir/", "dir/*") == True
+ if part != filepath_relative.name:
+ part = part + "/"
+ parts.append(part)
+
+ for pattern in settings.CONSUMER_IGNORE_PATTERNS:
+ if len(filter(parts, pattern)):
+ return True
+
+ return False
+
+
+def _consume(filepath: str) -> None:
if os.path.isdir(filepath) or _is_ignored(filepath):
return
logger.exception("Error while consuming document")
-def _consume_wait_unmodified(file):
+def _consume_wait_unmodified(file: str) -> None:
+ """
+ Waits for the given file to appear unmodified based on file size
+ and modification time. Will wait a configured number of seconds
+ and retry a configured number of times before either consuming or
+ giving up.
+ """
if _is_ignored(file):
return
def test_is_ignored(self):
test_paths = [
- (os.path.join(self.dirs.consumption_dir, "foo.pdf"), False),
- (os.path.join(self.dirs.consumption_dir, "foo", "bar.pdf"), False),
- (os.path.join(self.dirs.consumption_dir, ".DS_STORE", "foo.pdf"), True),
- (
- os.path.join(self.dirs.consumption_dir, "foo", ".DS_STORE", "bar.pdf"),
- True,
- ),
- (os.path.join(self.dirs.consumption_dir, ".stfolder", "foo.pdf"), True),
- (os.path.join(self.dirs.consumption_dir, "._foo.pdf"), True),
- (os.path.join(self.dirs.consumption_dir, "._foo", "bar.pdf"), False),
+ {
+ "path": os.path.join(self.dirs.consumption_dir, "foo.pdf"),
+ "ignore": False,
+ },
+ {
+ "path": os.path.join(self.dirs.consumption_dir, "foo", "bar.pdf"),
+ "ignore": False,
+ },
+ {
+ "path": os.path.join(self.dirs.consumption_dir, ".DS_STORE", "foo.pdf"),
+ "ignore": True,
+ },
+ {
+ "path": os.path.join(
+ self.dirs.consumption_dir,
+ "foo",
+ ".DS_STORE",
+ "bar.pdf",
+ ),
+ "ignore": True,
+ },
+ {
+ "path": os.path.join(
+ self.dirs.consumption_dir,
+ ".DS_STORE",
+ "foo",
+ "bar.pdf",
+ ),
+ "ignore": True,
+ },
+ {
+ "path": os.path.join(self.dirs.consumption_dir, ".stfolder", "foo.pdf"),
+ "ignore": True,
+ },
+ {
+ "path": os.path.join(self.dirs.consumption_dir, ".stfolder.pdf"),
+ "ignore": False,
+ },
+ {
+ "path": os.path.join(
+ self.dirs.consumption_dir,
+ ".stversions",
+ "foo.pdf",
+ ),
+ "ignore": True,
+ },
+ {
+ "path": os.path.join(self.dirs.consumption_dir, ".stversions.pdf"),
+ "ignore": False,
+ },
+ {
+ "path": os.path.join(self.dirs.consumption_dir, "._foo.pdf"),
+ "ignore": True,
+ },
+ {
+ "path": os.path.join(self.dirs.consumption_dir, "my_foo.pdf"),
+ "ignore": False,
+ },
+ {
+ "path": os.path.join(self.dirs.consumption_dir, "._foo", "bar.pdf"),
+ "ignore": True,
+ },
+ {
+ "path": os.path.join(
+ self.dirs.consumption_dir,
+ "@eaDir",
+ "SYNO@.fileindexdb",
+ "_1jk.fnm",
+ ),
+ "ignore": True,
+ },
]
- for file_path, expected_ignored in test_paths:
+ for test_setup in test_paths:
+ filepath = test_setup["path"]
+ expected_ignored_result = test_setup["ignore"]
self.assertEqual(
- expected_ignored,
- document_consumer._is_ignored(file_path),
- f'_is_ignored("{file_path}") != {expected_ignored}',
+ expected_ignored_result,
+ document_consumer._is_ignored(filepath),
+ f'_is_ignored("{filepath}") != {expected_ignored_result}',
)
@mock.patch("documents.management.commands.document_consumer.open")
json.loads(
os.getenv(
"PAPERLESS_CONSUMER_IGNORE_PATTERNS",
- '[".DS_STORE/*", "._*", ".stfolder/*", ".stversions/*", ".localized/*", "desktop.ini"]', # noqa: E501
+ '[".DS_STORE/*", "._*", ".stfolder/*", ".stversions/*", ".localized/*", "desktop.ini", "@eaDir/*"]', # noqa: E501
),
),
)