Cleans up the help text of several management commands and adds a `--processes` option to control the process count for commands that use a multiprocessing Pool
Use this command to re-create document thumbnails. Optionally include the `--document {id}` option to generate thumbnails for a specific document only.
+You may also specify `--processes` to control the number of processes used to generate new thumbnails. The default is to use
+a quarter of the available processors, with a minimum of one process.
+
```
document_thumbnails
```
document_fuzzy_match [--ratio] [--processes N]
```
-| Option | Required | Default | Description |
-| ----------- | -------- | ------- | ------------------------------------------------------------------------------------------------------------------------------ |
-| --ratio | No | 85.0 | a number between 0 and 100, setting how similar a document must be for it to be reported. Higher numbers mean more similarity. |
-| --processes | No | 4 | Number of processes to use for matching. Setting 1 disables multiple processes |
+| Option | Required | Default | Description |
+| ----------- | -------- | ------------------- | ------------------------------------------------------------------------------------------------------------------------------ |
+| --ratio | No | 85.0 | a number between 0 and 100, setting how similar a document must be for it to be reported. Higher numbers mean more similarity. |
+| --processes | No | 1/4 of system cores | Number of processes to use for matching. Setting this to 1 disables multiprocessing. |
def add_arguments(self, parser):
parser.add_argument(
"--passphrase",
- help="If PAPERLESS_PASSPHRASE isn't set already, you need to "
- "specify it here",
+ help=(
+ "If PAPERLESS_PASSPHRASE isn't set already, you need to "
+ "specify it here"
+ ),
)
def handle(self, *args, **options):
try:
- print(
- "\n\nWARNING: This script is going to work directly on your "
- "document originals, so\nWARNING: you probably shouldn't run "
- "this unless you've got a recent backup\nWARNING: handy. It "
- "*should* work without a hitch, but be safe and backup your\n"
- "WARNING: stuff first.\n\nHit Ctrl+C to exit now, or Enter to "
- "continue.\n\n",
+ self.stdout.write(
+ self.style.WARNING(
+ "\n\n"
+ "WARNING: This script is going to work directly on your "
+ "document originals, so\n"
+ "WARNING: you probably shouldn't run "
+ "this unless you've got a recent backup\n"
+ "WARNING: handy. It "
+ "*should* work without a hitch, but be safe and backup your\n"
+ "WARNING: stuff first.\n\n"
+ "Hit Ctrl+C to exit now, or Enter to "
+ "continue.\n\n",
+ ),
)
_ = input()
except KeyboardInterrupt:
self.__gpg_to_unencrypted(passphrase)
- @staticmethod
- def __gpg_to_unencrypted(passphrase):
+ def __gpg_to_unencrypted(self, passphrase: str):
encrypted_files = Document.objects.filter(
storage_type=Document.STORAGE_TYPE_GPG,
)
for document in encrypted_files:
- print(f"Decrypting {document}".encode())
+ self.stdout.write(f"Decrypting {document}")
old_paths = [document.source_path, document.thumbnail_path]
from django.conf import settings
from django.core.management.base import BaseCommand
+from documents.management.commands.mixins import MultiProcessMixin
+from documents.management.commands.mixins import ProgressBarMixin
from documents.models import Document
from documents.tasks import update_document_archive_file
logger = logging.getLogger("paperless.management.archiver")
-class Command(BaseCommand):
- help = """
- Using the current classification model, assigns correspondents, tags
- and document types to all documents, effectively allowing you to
- back-tag all previously indexed documents with metadata created (or
- modified) after their initial import.
- """.replace(
- " ",
- "",
+class Command(MultiProcessMixin, ProgressBarMixin, BaseCommand):
+ help = (
+ "Using the current classification model, assigns correspondents, tags "
+ "and document types to all documents, effectively allowing you to "
+ "back-tag all previously indexed documents with metadata created (or "
+ "modified) after their initial import."
)
def add_arguments(self, parser):
"--overwrite",
default=False,
action="store_true",
- help="Recreates the archived document for documents that already "
- "have an archived version.",
+ help=(
+ "Recreates the archived document for documents that already "
+ "have an archived version."
+ ),
)
parser.add_argument(
"-d",
default=None,
type=int,
required=False,
- help="Specify the ID of a document, and this command will only "
- "run on this specific document.",
- )
- parser.add_argument(
- "--no-progress-bar",
- default=False,
- action="store_true",
- help="If set, the progress bar will not be shown",
+ help=(
+ "Specify the ID of a document, and this command will only "
+ "run on this specific document."
+ ),
)
+ self.add_argument_progress_bar_mixin(parser)
+ self.add_argument_processes_mixin(parser)
def handle(self, *args, **options):
+ self.handle_processes_mixin(**options)
+ self.handle_progress_bar_mixin(**options)
+
os.makedirs(settings.SCRATCH_DIR, exist_ok=True)
overwrite = options["overwrite"]
)
# Note to future self: this prevents django from reusing database
- # conncetions between processes, which is bad and does not work
+ # connections between processes, which is bad and does not work
# with postgres.
db.connections.close_all()
try:
logging.getLogger().handlers[0].level = logging.ERROR
- with multiprocessing.Pool(processes=settings.TASK_WORKERS) as pool:
- list(
- tqdm.tqdm(
- pool.imap_unordered(update_document_archive_file, document_ids),
- total=len(document_ids),
- disable=options["no_progress_bar"],
- ),
- )
+
+ if self.process_count == 1:
+ for doc_id in document_ids:
+ update_document_archive_file(doc_id)
+ else: # pragma: no cover
+ with multiprocessing.Pool(self.process_count) as pool:
+ list(
+ tqdm.tqdm(
+ pool.imap_unordered(
+ update_document_archive_file,
+ document_ids,
+ ),
+ total=len(document_ids),
+ disable=self.no_progress_bar,
+ ),
+ )
except KeyboardInterrupt:
self.stdout.write(self.style.NOTICE("Aborting..."))
class Command(BaseCommand):
- help = """
- Trains the classifier on your data and saves the resulting models to a
- file. The document consumer will then automatically use this new model.
- """.replace(
- " ",
- "",
+ help = (
+ "Trains the classifier on your data and saves the resulting models to a "
+ "file. The document consumer will then automatically use this new model."
)
- def __init__(self, *args, **kwargs):
- BaseCommand.__init__(self, *args, **kwargs)
-
def handle(self, *args, **options):
train_classifier()
class Command(BaseCommand):
- help = """
- Decrypt and rename all files in our collection into a given target
- directory. And include a manifest file containing document data for
- easy import.
- """.replace(
- " ",
- "",
+ help = (
+ "Decrypt and rename all files in our collection into a given target "
+ "directory. And include a manifest file containing document data for "
+ "easy import."
)
def add_arguments(self, parser):
"--compare-checksums",
default=False,
action="store_true",
- help="Compare file checksums when determining whether to export "
- "a file or not. If not specified, file size and time "
- "modified is used instead.",
+ help=(
+ "Compare file checksums when determining whether to export "
+ "a file or not. If not specified, file size and time "
+ "modified is used instead."
+ ),
)
parser.add_argument(
"--delete",
default=False,
action="store_true",
- help="After exporting, delete files in the export directory that "
- "do not belong to the current export, such as files from "
- "deleted documents.",
+ help=(
+ "After exporting, delete files in the export directory that "
+ "do not belong to the current export, such as files from "
+ "deleted documents."
+ ),
)
parser.add_argument(
"--use-filename-format",
default=False,
action="store_true",
- help="Use PAPERLESS_FILENAME_FORMAT for storing files in the "
- "export directory, if configured.",
+ help=(
+ "Use PAPERLESS_FILENAME_FORMAT for storing files in the "
+ "export directory, if configured."
+ ),
)
parser.add_argument(
"--use-folder-prefix",
default=False,
action="store_true",
- help="Export files in dedicated folders according to their nature: "
- "archive, originals or thumbnails",
+ help=(
+ "Export files in dedicated folders according to their nature: "
+ "archive, originals or thumbnails"
+ ),
)
parser.add_argument(
from django.core.management import BaseCommand
from django.core.management import CommandError
+from documents.management.commands.mixins import MultiProcessMixin
+from documents.management.commands.mixins import ProgressBarMixin
from documents.models import Document
return _WorkResult(work.first_doc.pk, work.second_doc.pk, match)
-class Command(BaseCommand):
+class Command(MultiProcessMixin, ProgressBarMixin, BaseCommand):
help = "Searches for documents where the content almost matches"
def add_arguments(self, parser):
type=float,
help="Ratio to consider documents a match",
)
- parser.add_argument(
- "--processes",
- default=4,
- type=int,
- help="Number of processes to distribute work amongst",
- )
- parser.add_argument(
- "--no-progress-bar",
- default=False,
- action="store_true",
- help="If set, the progress bar will not be shown",
- )
+ self.add_argument_progress_bar_mixin(parser)
+ self.add_argument_processes_mixin(parser)
def handle(self, *args, **options):
RATIO_MIN: Final[float] = 0.0
RATIO_MAX: Final[float] = 100.0
+ self.handle_processes_mixin(**options)
+ self.handle_progress_bar_mixin(**options)
+
opt_ratio = options["ratio"]
checked_pairs: set[tuple[int, int]] = set()
work_pkgs: list[_WorkPackage] = []
if opt_ratio < RATIO_MIN or opt_ratio > RATIO_MAX:
raise CommandError("The ratio must be between 0 and 100")
- if options["processes"] < 1:
- raise CommandError("There must be at least 1 process")
-
all_docs = Document.objects.all().order_by("id")
# Build work packages for processing
work_pkgs.append(_WorkPackage(first_doc, second_doc))
# Don't spin up a pool of 1 process
- if options["processes"] == 1:
+ if self.process_count == 1:
results = []
- for work in tqdm.tqdm(work_pkgs, disable=options["no_progress_bar"]):
+ for work in tqdm.tqdm(work_pkgs, disable=self.no_progress_bar):
results.append(_process_and_match(work))
- else:
- with multiprocessing.Pool(processes=options["processes"]) as pool:
+ else: # pragma: no cover
+ with multiprocessing.Pool(processes=self.process_count) as pool:
results = list(
tqdm.tqdm(
pool.imap_unordered(_process_and_match, work_pkgs),
total=len(work_pkgs),
- disable=options["no_progress_bar"],
+ disable=self.no_progress_bar,
),
)
class Command(BaseCommand):
- help = """
- Using a manifest.json file, load the data from there, and import the
- documents it refers to.
- """.replace(
- " ",
- "",
+ help = (
+ "Using a manifest.json file, load the data from there, and import the "
+ "documents it refers to."
)
def add_arguments(self, parser):
from django.core.management import BaseCommand
from django.db import transaction
+from documents.management.commands.mixins import ProgressBarMixin
from documents.tasks import index_optimize
from documents.tasks import index_reindex
-class Command(BaseCommand):
+class Command(ProgressBarMixin, BaseCommand):
help = "Manages the document index."
def add_arguments(self, parser):
parser.add_argument("command", choices=["reindex", "optimize"])
- parser.add_argument(
- "--no-progress-bar",
- default=False,
- action="store_true",
- help="If set, the progress bar will not be shown",
- )
+ self.add_argument_progress_bar_mixin(parser)
def handle(self, *args, **options):
+ self.handle_progress_bar_mixin(**options)
with transaction.atomic():
if options["command"] == "reindex":
- index_reindex(progress_bar_disable=options["no_progress_bar"])
+ index_reindex(progress_bar_disable=self.no_progress_bar)
elif options["command"] == "optimize":
index_optimize()
from django.core.management.base import BaseCommand
from django.db.models.signals import post_save
+from documents.management.commands.mixins import ProgressBarMixin
from documents.models import Document
-class Command(BaseCommand):
- help = """
- This will rename all documents to match the latest filename format.
- """.replace(
- " ",
- "",
- )
+class Command(ProgressBarMixin, BaseCommand):
+ help = "This will rename all documents to match the latest filename format."
def add_arguments(self, parser):
- parser.add_argument(
- "--no-progress-bar",
- default=False,
- action="store_true",
- help="If set, the progress bar will not be shown",
- )
+ self.add_argument_progress_bar_mixin(parser)
def handle(self, *args, **options):
+ self.handle_progress_bar_mixin(**options)
logging.getLogger().handlers[0].level = logging.ERROR
for document in tqdm.tqdm(
Document.objects.all(),
- disable=options["no_progress_bar"],
+ disable=self.no_progress_bar,
):
post_save.send(Document, instance=document)
from django.core.management.base import BaseCommand
from documents.classifier import load_classifier
+from documents.management.commands.mixins import ProgressBarMixin
from documents.models import Document
from documents.signals.handlers import set_correspondent
from documents.signals.handlers import set_document_type
logger = logging.getLogger("paperless.management.retagger")
-class Command(BaseCommand):
- help = """
- Using the current classification model, assigns correspondents, tags
- and document types to all documents, effectively allowing you to
- back-tag all previously indexed documents with metadata created (or
- modified) after their initial import.
- """.replace(
- " ",
- "",
+class Command(ProgressBarMixin, BaseCommand):
+ help = (
+ "Using the current classification model, assigns correspondents, tags "
+ "and document types to all documents, effectively allowing you to "
+ "back-tag all previously indexed documents with metadata created (or "
+ "modified) after their initial import."
)
def add_arguments(self, parser):
"--use-first",
default=False,
action="store_true",
- help="By default this command won't try to assign a correspondent "
- "if more than one matches the document. Use this flag if "
- "you'd rather it just pick the first one it finds.",
+ help=(
+ "By default this command won't try to assign a correspondent "
+ "if more than one matches the document. Use this flag if "
+ "you'd rather it just pick the first one it finds."
+ ),
)
parser.add_argument(
"-f",
"--overwrite",
default=False,
action="store_true",
- help="If set, the document retagger will overwrite any previously"
- "set correspondent, document and remove correspondents, types"
- "and tags that do not match anymore due to changed rules.",
- )
- parser.add_argument(
- "--no-progress-bar",
- default=False,
- action="store_true",
- help="If set, the progress bar will not be shown",
+ help=(
+ "If set, the document retagger will overwrite any previously "
+ "set correspondent, document and remove correspondents, types "
+ "and tags that do not match anymore due to changed rules."
+ ),
)
+ self.add_argument_progress_bar_mixin(parser)
parser.add_argument(
"--suggest",
default=False,
)
def handle(self, *args, **options):
+ self.handle_progress_bar_mixin(**options)
# Detect if we support color
color = self.style.ERROR("test") != "test"
classifier = load_classifier()
- for document in tqdm.tqdm(documents, disable=options["no_progress_bar"]):
+ for document in tqdm.tqdm(documents, disable=self.no_progress_bar):
if options["correspondent"]:
set_correspondent(
sender=None,
from django.core.management.base import BaseCommand
+from documents.management.commands.mixins import ProgressBarMixin
from documents.sanity_checker import check_sanity
-class Command(BaseCommand):
- help = """
- This command checks your document archive for issues.
- """.replace(
- " ",
- "",
- )
+class Command(ProgressBarMixin, BaseCommand):
+ help = "This command checks your document archive for issues."
def add_arguments(self, parser):
- parser.add_argument(
- "--no-progress-bar",
- default=False,
- action="store_true",
- help="If set, the progress bar will not be shown",
- )
+ self.add_argument_progress_bar_mixin(parser)
def handle(self, *args, **options):
- messages = check_sanity(progress=not options["no_progress_bar"])
+ self.handle_progress_bar_mixin(**options)
+ messages = check_sanity(progress=self.use_progress_bar)
messages.log_messages()
from django import db
from django.core.management.base import BaseCommand
+from documents.management.commands.mixins import MultiProcessMixin
+from documents.management.commands.mixins import ProgressBarMixin
from documents.models import Document
from documents.parsers import get_parser_class_for_mime_type
parser.cleanup()
-class Command(BaseCommand):
- help = """
- This will regenerate the thumbnails for all documents.
- """.replace(
- " ",
- "",
- )
+class Command(MultiProcessMixin, ProgressBarMixin, BaseCommand):
+ help = "This will regenerate the thumbnails for all documents."
def add_arguments(self, parser):
parser.add_argument(
default=None,
type=int,
required=False,
- help="Specify the ID of a document, and this command will only "
- "run on this specific document.",
- )
- parser.add_argument(
- "--no-progress-bar",
- default=False,
- action="store_true",
- help="If set, the progress bar will not be shown",
+ help=(
+ "Specify the ID of a document, and this command will only "
+ "run on this specific document."
+ ),
)
+ self.add_argument_progress_bar_mixin(parser)
+ self.add_argument_processes_mixin(parser)
def handle(self, *args, **options):
logging.getLogger().handlers[0].level = logging.ERROR
+ self.handle_processes_mixin(**options)
+ self.handle_progress_bar_mixin(**options)
+
if options["document"]:
documents = Document.objects.filter(pk=options["document"])
else:
# with postgres.
db.connections.close_all()
- with multiprocessing.Pool() as pool:
- list(
- tqdm.tqdm(
- pool.imap_unordered(_process_document, ids),
- total=len(ids),
- disable=options["no_progress_bar"],
- ),
- )
+ if self.process_count == 1:
+ for doc_id in ids:
+ _process_document(doc_id)
+ else: # pragma: no cover
+ with multiprocessing.Pool(processes=self.process_count) as pool:
+ list(
+ tqdm.tqdm(
+ pool.imap_unordered(_process_document, ids),
+ total=len(ids),
+ disable=self.no_progress_bar,
+ ),
+ )
import logging
import os
+from argparse import RawTextHelpFormatter
from django.contrib.auth.models import User
from django.core.management.base import BaseCommand
class Command(BaseCommand):
- help = """
- Creates a Django superuser:
- User named: admin
- Email: root@localhost
- with password based on env variable.
- No superuser will be created, when:
- - The username is taken already exists
- - A superuser already exists
- - PAPERLESS_ADMIN_PASSWORD is not set
- """.replace(
- " ",
- "",
+ help = (
+ "Creates a Django superuser:\n"
+ " User named: admin\n"
+ " Email: root@localhost\n"
+ " Password: based on env variable PAPERLESS_ADMIN_PASSWORD\n"
+ "No superuser will be created, when:\n"
+ " - The username already exists\n"
+ " - A superuser already exists\n"
+ " - PAPERLESS_ADMIN_PASSWORD is not set"
)
+ def create_parser(self, *args, **kwargs):
+ parser = super().create_parser(*args, **kwargs)
+ parser.formatter_class = RawTextHelpFormatter
+ return parser
+
def handle(self, *args, **options):
username = os.getenv("PAPERLESS_ADMIN_USER", "admin")
mail = os.getenv("PAPERLESS_ADMIN_MAIL", "root@localhost")
--- /dev/null
+import os
+from argparse import ArgumentParser
+
+from django.core.management import CommandError
+
+
class MultiProcessMixin:
    """
    Mixin for management commands which distribute work over several processes.

    It registers a ``--processes`` argument on the command's parser and,
    once the options are parsed, validates the value and stores it on the
    command instance as ``self.process_count``.
    """

    @staticmethod
    def _default_process_count(cpu_count) -> int:
        """
        Return a quarter of ``cpu_count``, but never less than 1.

        ``cpu_count`` is expected to be the result of ``os.cpu_count()``,
        which returns ``None`` when the number of CPUs cannot be determined.
        ``None`` (or 0) is treated as a single CPU so that registering the
        argument does not fail with a ``TypeError`` on ``None // 4``.
        """
        return max(1, (cpu_count or 1) // 4)

    def add_argument_processes_mixin(self, parser: ArgumentParser):
        """
        Add the ``--processes`` integer option to ``parser``.

        The default is a quarter of the detected CPU count, with a floor of 1.
        """
        parser.add_argument(
            "--processes",
            default=self._default_process_count(os.cpu_count()),
            type=int,
            help="Number of processes to distribute work amongst",
        )

    def handle_processes_mixin(self, *args, **options):
        """
        Store ``options["processes"]`` as ``self.process_count``.

        Raises:
            CommandError: if fewer than 1 process was requested.
        """
        self.process_count = options["processes"]
        if self.process_count < 1:
            raise CommandError("There must be at least 1 process")
+
+
class ProgressBarMixin:
    """
    Mixin for management commands which show a progress bar.

    It registers a ``--no-progress-bar`` flag on the command's parser and,
    once the options are parsed, records the choice on the command instance
    as ``self.no_progress_bar`` and its inverse ``self.use_progress_bar``.
    """

    def add_argument_progress_bar_mixin(self, parser: ArgumentParser):
        """
        Add the ``--no-progress-bar`` boolean flag (off by default) to ``parser``.
        """
        flag_settings = {
            "default": False,
            "action": "store_true",
            "help": "If set, the progress bar will not be shown",
        }
        parser.add_argument("--no-progress-bar", **flag_settings)

    def handle_progress_bar_mixin(self, *args, **options):
        """
        Copy ``options["no_progress_bar"]`` onto the instance.

        Sets ``self.no_progress_bar`` to the parsed value and
        ``self.use_progress_bar`` to its negation.
        """
        disabled = options["no_progress_bar"]
        self.no_progress_bar = disabled
        self.use_progress_bar = not disabled
os.path.join(self.dirs.originals_dir, f"{doc.id:07}.pdf"),
)
- call_command("document_archiver")
+ call_command("document_archiver", "--processes", "1")
def test_handle_document(self):
doc = self.make_models()
def test_command(self):
self.assertIsNotFile(self.d1.thumbnail_path)
self.assertIsNotFile(self.d2.thumbnail_path)
- call_command("document_thumbnails")
+ call_command("document_thumbnails", "--processes", "1")
self.assertIsFile(self.d1.thumbnail_path)
self.assertIsFile(self.d2.thumbnail_path)
def test_command_documentid(self):
self.assertIsNotFile(self.d1.thumbnail_path)
self.assertIsNotFile(self.d2.thumbnail_path)
- call_command("document_thumbnails", "-d", f"{self.d1.id}")
+ call_command("document_thumbnails", "--processes", "1", "-d", f"{self.d1.id}")
self.assertIsFile(self.d1.thumbnail_path)
self.assertIsNotFile(self.d2.thumbnail_path)
class Command(BaseCommand):
- help = """
- """.replace(
- " ",
- "",
- )
+ help = "Manually triggers a fetching and processing of all mail accounts"
def handle(self, *args, **options):
tasks.process_mail_accounts()