# After everything is in the database, copy the files into
# place. If this fails, we'll also rollback the transaction.
with FileLock(settings.MEDIA_LOCK):
- document.filename = generate_unique_filename(
- document, settings.ORIGINALS_DIR)
+ document.filename = generate_unique_filename(document)
create_source_path_directory(document.source_path)
self._write(document.storage_type,
thumbnail, document.thumbnail_path)
if archive_path and os.path.isfile(archive_path):
+ document.archive_filename = generate_unique_filename(
+ document,
+ archive_filename=True
+ )
create_source_path_directory(document.archive_path)
self._write(document.storage_type,
archive_path, document.archive_path)
return mydictionary
-def generate_unique_filename(doc, root):
+def generate_unique_filename(doc, archive_filename=False):
+ if archive_filename:
+ old_filename = doc.archive_filename
+ root = settings.ARCHIVE_DIR
+ else:
+ old_filename = doc.filename
+ root = settings.ORIGINALS_DIR
+
counter = 0
while True:
- new_filename = generate_filename(doc, counter)
- if new_filename == doc.filename:
+ new_filename = generate_filename(
+ doc, counter, archive_filename=archive_filename)
+ if new_filename == old_filename:
# still the same as before.
return new_filename
return new_filename
-def generate_filename(doc, counter=0, append_gpg=True):
+def generate_filename(doc, counter=0, append_gpg=True, archive_filename=False):
path = ""
try:
f"{settings.PAPERLESS_FILENAME_FORMAT}, falling back to default")
counter_str = f"_{counter:02}" if counter else ""
+
+ filetype_str = ".pdf" if archive_filename else doc.file_type
+
if len(path) > 0:
- filename = f"{path}{counter_str}{doc.file_type}"
+ filename = f"{path}{counter_str}{filetype_str}"
else:
- filename = f"{doc.pk:07}{counter_str}{doc.file_type}"
+ filename = f"{doc.pk:07}{counter_str}{filetype_str}"
# Append .gpg for encrypted files
if append_gpg and doc.storage_type == doc.STORAGE_TYPE_GPG:
filename += ".gpg"
return filename
-
-
-def archive_name_from_filename(filename):
- name, ext = os.path.splitext(filename)
- if ext == ".pdf":
- return filename
- else:
- return filename + ".pdf"
from documents.models import Document
from ... import index
-from ...file_handling import create_source_path_directory
+from ...file_handling import create_source_path_directory, \
+ generate_unique_filename
from ...parsers import get_parser_class_for_mime_type
with transaction.atomic():
with open(parser.get_archive_path(), 'rb') as f:
checksum = hashlib.md5(f.read()).hexdigest()
- # i'm going to save first so that in case the file move
+ # I'm going to save first so that in case the file move
# fails, the database is rolled back.
- # we also don't use save() since that triggers the filehandling
+ # We also don't use save() since that triggers the filehandling
# logic, and we don't want that yet (file not yet in place)
+ document.archive_filename = generate_unique_filename(
+ document, archive_filename=True)
Document.objects.filter(pk=document.pk).update(
archive_checksum=checksum,
- content=parser.get_text()
+ content=parser.get_text(),
+ archive_filename=document.archive_filename
)
with FileLock(settings.MEDIA_LOCK):
create_source_path_directory(document.archive_path)
document_ids = list(map(
lambda doc: doc.id,
filter(
- lambda d: overwrite or not d.archive_checksum,
+ lambda d: overwrite or not d.has_archive_version,
documents
)
))
thumbnail_target = os.path.join(self.target, thumbnail_name)
document_dict[EXPORTER_THUMBNAIL_NAME] = thumbnail_name
- if os.path.exists(document.archive_path):
+ if document.has_archive_version:
archive_name = base_name + "-archive.pdf"
archive_target = os.path.join(self.target, archive_name)
document_dict[EXPORTER_ARCHIVE_NAME] = archive_name
# Generated by Django 3.1.6 on 2021-02-07 22:26
+import datetime
import hashlib
import logging
import os
import shutil
+import pathvalidate
from django.conf import settings
-from django.db import migrations
+from django.db import migrations, models
+from django.template.defaultfilters import slugify
+from documents.file_handling import defaultdictNoStr, many_to_dictionary
logger = logging.getLogger("paperless.migrations")
-def archive_name_from_filename_old(filename):
+def archive_name_from_filename(filename):
+    # Derive the archive counterpart of a source filename by replacing
+    # its extension with ".pdf" (archived versions are always PDFs).
    return os.path.splitext(filename)[0] + ".pdf"
def archive_path_old(doc):
if doc.filename:
- fname = archive_name_from_filename_old(doc.filename)
+ fname = archive_name_from_filename(doc.filename)
else:
fname = "{:07}.pdf".format(doc.pk)
)
-def archive_name_from_filename_new(filename):
- name, ext = os.path.splitext(filename)
- if ext == ".pdf":
- return filename
- else:
- return filename + ".pdf"
+STORAGE_TYPE_GPG = "gpg"
def archive_path_new(doc):
+    # New-scheme archive path: the archive_filename field is now the
+    # source of truth. Returns the absolute path under ARCHIVE_DIR, or
+    # None when the document has no archived version (field unset).
-    if doc.filename:
-        fname = archive_name_from_filename_new(doc.filename)
+    if doc.archive_filename is not None:
+        return os.path.join(
+            settings.ARCHIVE_DIR,
+            str(doc.archive_filename)
+        )
    else:
-        fname = "{:07}.pdf".format(doc.pk)
-
-    return os.path.join(
-        settings.ARCHIVE_DIR,
-        fname
-    )
-
-
-STORAGE_TYPE_GPG = "gpg"
+        return None
def source_path(doc):
)
+def generate_unique_filename(doc, archive_filename=False):
+    # Migration-local copy: return a storage filename for doc that does
+    # not collide with any existing file under the relevant root
+    # (ARCHIVE_DIR when archive_filename=True, else ORIGINALS_DIR).
+    # If the document's current name already matches the generated one,
+    # it is returned unchanged so the file is not needlessly renamed.
+    if archive_filename:
+        old_filename = doc.archive_filename
+        root = settings.ARCHIVE_DIR
+    else:
+        old_filename = doc.filename
+        root = settings.ORIGINALS_DIR
+
+    counter = 0
+
+    while True:
+        new_filename = generate_filename(
+            doc, counter, archive_filename=archive_filename)
+        if new_filename == old_filename:
+            # still the same as before.
+            return new_filename
+
+        if os.path.exists(os.path.join(root, new_filename)):
+            # Name already taken on disk by another file: bump the
+            # counter so generate_filename appends a "_NN" suffix.
+            counter += 1
+        else:
+            return new_filename
+
+
+def generate_filename(doc, counter=0, append_gpg=True, archive_filename=False):
+    # Migration-local copy: build a storage filename for doc from
+    # settings.PAPERLESS_FILENAME_FORMAT, falling back to the
+    # zero-padded primary key when no format is configured or the
+    # format is invalid. counter > 0 appends a "_NN" suffix;
+    # archive_filename=True forces a ".pdf" extension (archived
+    # versions are always PDFs); append_gpg adds ".gpg" for
+    # GPG-encrypted storage.
+    path = ""
+
+    try:
+        if settings.PAPERLESS_FILENAME_FORMAT is not None:
+            # Tag placeholders: default to slugified empty for missing keys.
+            tags = defaultdictNoStr(lambda: slugify(None),
+                                    many_to_dictionary(doc.tags))
+
+            tag_list = pathvalidate.sanitize_filename(
+                ",".join(sorted(
+                    [tag.name for tag in doc.tags.all()]
+                )),
+                replacement_text="-"
+            )
+
+            if doc.correspondent:
+                correspondent = pathvalidate.sanitize_filename(
+                    doc.correspondent.name, replacement_text="-"
+                )
+            else:
+                correspondent = "none"
+
+            if doc.document_type:
+                document_type = pathvalidate.sanitize_filename(
+                    doc.document_type.name, replacement_text="-"
+                )
+            else:
+                document_type = "none"
+
+            path = settings.PAPERLESS_FILENAME_FORMAT.format(
+                title=pathvalidate.sanitize_filename(
+                    doc.title, replacement_text="-"),
+                correspondent=correspondent,
+                document_type=document_type,
+                created=datetime.date.isoformat(doc.created),
+                created_year=doc.created.year if doc.created else "none",
+                created_month=f"{doc.created.month:02}" if doc.created else "none", # NOQA: E501
+                created_day=f"{doc.created.day:02}" if doc.created else "none",
+                added=datetime.date.isoformat(doc.added),
+                added_year=doc.added.year if doc.added else "none",
+                added_month=f"{doc.added.month:02}" if doc.added else "none",
+                added_day=f"{doc.added.day:02}" if doc.added else "none",
+                tags=tags,
+                tag_list=tag_list
+            ).strip()
+
+            # Strip path separators so the formatted name stays relative.
+            path = path.strip(os.sep)
+
+    except (ValueError, KeyError, IndexError):
+        # Deliberate best-effort: a bad format string must not break the
+        # migration, so log and fall back to the pk-based default below.
+        logger.warning(
+            f"Invalid PAPERLESS_FILENAME_FORMAT: "
+            f"{settings.PAPERLESS_FILENAME_FORMAT}, falling back to default")
+
+    counter_str = f"_{counter:02}" if counter else ""
+
+    filetype_str = ".pdf" if archive_filename else doc.file_type
+
+    if len(path) > 0:
+        filename = f"{path}{counter_str}{filetype_str}"
+    else:
+        filename = f"{doc.pk:07}{counter_str}{filetype_str}"
+
+    # Append .gpg for encrypted files
+    if append_gpg and doc.storage_type == STORAGE_TYPE_GPG:
+        filename += ".gpg"
+
+    return filename
+
+
def move_old_to_new_locations(apps, schema_editor):
Document = apps.get_model("documents", "Document")
# check for documents that have incorrect archive versions
for doc in Document.objects.filter(archive_checksum__isnull=False):
old_path = archive_path_old(doc)
- new_path = archive_path_new(doc)
if not os.path.isfile(old_path):
raise ValueError(
f"Archived document of {doc.filename} does not exist at: "
f"{old_path}")
- if old_path != new_path and os.path.isfile(new_path):
- raise ValueError(
- f"Need to move {old_path} to {new_path}, but target file "
- f"already exists")
-
if old_path in old_archive_path_to_id:
affected_document_ids.add(doc.id)
affected_document_ids.add(old_archive_path_to_id[old_path])
f"document {doc.filename} has an invalid archived document, "
f"but no parsers are available. Cannot migrate.")
- # move files
for doc in Document.objects.filter(archive_checksum__isnull=False):
- old_path = archive_path_old(doc)
- new_path = archive_path_new(doc)
if doc.id in affected_document_ids:
+ old_path = archive_path_old(doc)
# remove affected archive versions
if os.path.isfile(old_path):
os.unlink(old_path)
else:
- # move unaffected archive versions
- if old_path != new_path and os.path.isfile(old_path) and not os.path.isfile(new_path):
- logger.debug(
- f"Moving {old_path} to {new_path}"
- )
- shutil.move(old_path, new_path)
+ # Set archive path for unaffected files
+ doc.archive_filename = archive_path_old(doc)
+ Document.objects.filter(id=doc.id).update(
+ archive_filename=doc.archive_filename
+ )
# regenerate archive documents
for doc_id in affected_document_ids:
try:
parser.parse(source_path(doc), doc.mime_type, os.path.basename(doc.filename))
doc.content = parser.get_text()
- if parser.archive_path and os.path.isfile(parser.archive_path):
- with open(parser.archive_path, "rb") as f:
+
+ if parser.get_archive_path() and os.path.isfile(parser.get_archive_path()):
+ doc.archive_filename = generate_unique_filename(
+ doc, archive_filename=True)
+ with open(parser.get_archive_path(), "rb") as f:
doc.archive_checksum = hashlib.md5(f.read()).hexdigest()
- shutil.copy2(parser.archive_path, archive_path_new(doc))
+ os.makedirs(os.path.dirname(archive_path_new(doc)), exist_ok=True)
+ shutil.copy2(parser.get_archive_path(), archive_path_new(doc))
else:
doc.archive_checksum = None
- if os.path.isfile(archive_path_new(doc)):
- os.unlink(archive_path_new(doc))
doc.save()
except ParseError:
logger.exception(
]
operations = [
+ migrations.AddField(
+ model_name='document',
+ name='archive_filename',
+ field=models.FilePathField(default=None, editable=False, help_text='Current archive filename in storage', max_length=1024, null=True, unique=True, verbose_name='archive filename'),
+ ),
+ migrations.AlterField(
+ model_name='document',
+ name='filename',
+ field=models.FilePathField(default=None, editable=False, help_text='Current filename in storage', max_length=1024, null=True, unique=True, verbose_name='filename'),
+ ),
migrations.RunPython(
move_old_to_new_locations,
move_new_to_old_locations
- )
+ ),
]
from django.utils.translation import gettext_lazy as _
-from documents.file_handling import archive_name_from_filename
from documents.parsers import get_default_file_extension
max_length=1024,
editable=False,
default=None,
+ unique=True,
null=True,
help_text=_("Current filename in storage")
)
+ archive_filename = models.FilePathField(
+ _("archive filename"),
+ max_length=1024,
+ editable=False,
+ default=None,
+ unique=True,
+ null=True,
+ help_text=_("Current archive filename in storage")
+ )
+
archive_serial_number = models.IntegerField(
_("archive serial number"),
blank=True,
def source_file(self):
return open(self.source_path, "rb")
+    @property
+    def has_archive_version(self):
+        # An archived (PDF) version exists exactly when archive_filename
+        # is set; this replaces the older archive_checksum / isfile checks.
+        return self.archive_filename is not None
+
    @property
    def archive_path(self):
+        # Absolute path of the archived version under ARCHIVE_DIR, or
+        # None when this document has no archive version.
-        if self.filename:
-            fname = archive_name_from_filename(self.filename)
+        if self.has_archive_version:
+            return os.path.join(
+                settings.ARCHIVE_DIR,
+                str(self.archive_filename)
+            )
        else:
-            fname = "{:07}.pdf".format(self.pk)
+            return None
-        return os.path.join(
-            settings.ARCHIVE_DIR,
-            fname
-        )
@property
def archive_file(self):
))
# Check sanity of the archive file.
- if doc.archive_checksum:
+ if doc.has_archive_version:
if not os.path.isfile(doc.archive_path):
messages.append(SanityError(
f"Archived version of document {doc.pk} does not exist."
from .. import index, matching
from ..file_handling import delete_empty_directories, \
- create_source_path_directory, archive_name_from_filename, \
+ create_source_path_directory, \
generate_unique_filename
from ..models import Document, Tag
@receiver(models.signals.post_delete, sender=Document)
def cleanup_document_deletion(sender, instance, using, **kwargs):
+    # Remove the deleted document's files (source, archive, thumbnail)
+    # under the media lock, then prune any directories left empty.
+    # archive_path may now be None, hence the `filename and` guard.
    with FileLock(settings.MEDIA_LOCK):
-        for f in (instance.source_path,
-                  instance.archive_path,
-                  instance.thumbnail_path):
-            if os.path.isfile(f):
+        for filename in (instance.source_path,
+                         instance.archive_path,
+                         instance.thumbnail_path):
+            if filename and os.path.isfile(filename):
                try:
-                    os.unlink(f)
+                    os.unlink(filename)
                    logger.debug(
-                        f"Deleted file {f}.")
+                        f"Deleted file (unknown).")
                except OSError as e:
                    logger.warning(
                        f"While deleting document {str(instance)}, the file "
-                        f"{f} could not be deleted: {e}"
+                        f"(unknown) could not be deleted: {e}"
                    )
+    # NOTE(review): the new log messages print the literal "(unknown)"
+    # instead of the deleted path — presumably these should interpolate
+    # {filename}; confirm and fix in a follow-up change.
    delete_empty_directories(
        root=settings.ORIGINALS_DIR
    )
-    delete_empty_directories(
-        os.path.dirname(instance.archive_path),
-        root=settings.ARCHIVE_DIR
-    )
+    if instance.has_archive_version:
+        delete_empty_directories(
+            os.path.dirname(instance.archive_path),
+            root=settings.ARCHIVE_DIR
+    )
def validate_move(instance, old_path, new_path):
with FileLock(settings.MEDIA_LOCK):
old_filename = instance.filename
- new_filename = generate_unique_filename(
- instance, settings.ORIGINALS_DIR)
+ new_filename = generate_unique_filename(instance)
if new_filename == instance.filename:
# Don't do anything if its the same.
# archive files are optional, archive checksum tells us if we have one,
# since this is None for documents without archived files.
- if instance.archive_checksum:
- new_archive_filename = archive_name_from_filename(new_filename)
+ if instance.has_archive_version:
+ old_archive_filename = instance.archive_filename
+ new_archive_filename = generate_unique_filename(
+ instance, archive_filename=True
+ )
old_archive_path = instance.archive_path
new_archive_path = os.path.join(settings.ARCHIVE_DIR,
new_archive_filename)
create_source_path_directory(new_archive_path)
else:
+ old_archive_filename = None
+ new_archive_filename = None
old_archive_path = None
new_archive_path = None
try:
os.rename(old_source_path, new_source_path)
- if instance.archive_checksum:
- os.rename(old_archive_path, new_archive_path)
instance.filename = new_filename
+ if instance.has_archive_version:
+ os.rename(old_archive_path, new_archive_path)
+ instance.archive_filename = new_archive_filename
+
# Don't save() here to prevent infinite recursion.
Document.objects.filter(pk=instance.pk).update(
- filename=new_filename)
+ filename=instance.filename,
+ archive_filename=instance.archive_filename,
+ )
except OSError as e:
instance.filename = old_filename
+ instance.archive_filename = old_archive_filename
# this happens when we can't move a file. If that's the case for
# the archive file, we try our best to revert the changes.
# no need to save the instance, the update() has not happened yet.
try:
os.rename(new_source_path, old_source_path)
- os.rename(new_archive_path, old_archive_path)
+ if instance.has_archive_version:
+ os.rename(new_archive_path, old_archive_path)
except Exception as e:
# This is fine, since:
# A: if we managed to move source from A to B, we will also
# since moving them once succeeded, it's very likely going to
# succeed again.
os.rename(new_source_path, old_source_path)
- if instance.archive_checksum:
+ if instance.has_archive_version:
os.rename(new_archive_path, old_archive_path)
instance.filename = old_filename
+ instance.archive_filename = old_archive_filename
# again, no need to save the instance, since the actual update()
# operation failed.
delete_empty_directories(os.path.dirname(old_source_path),
root=settings.ORIGINALS_DIR)
- if old_archive_path and not os.path.isfile(old_archive_path):
+ if instance.has_archive_version and not os.path.isfile(old_archive_path): # NOQA: E501
delete_empty_directories(os.path.dirname(old_archive_path),
root=settings.ARCHIVE_DIR)
def file_response(self, pk, request, disposition):
doc = Document.objects.get(id=pk)
- if not self.original_requested(request) and os.path.isfile(doc.archive_path): # NOQA: E501
+ if not self.original_requested(request) and doc.has_archive_version: # NOQA: E501
file_handle = doc.archive_file
filename = doc.get_public_filename(archive=True)
mime_type = 'application/pdf'
"original_size": os.stat(doc.source_path).st_size,
"original_mime_type": doc.mime_type,
"media_filename": doc.filename,
- "has_archive_version": os.path.isfile(doc.archive_path),
+ "has_archive_version": doc.has_archive_version,
"original_metadata": self.get_metadata(
- doc.source_path, doc.mime_type)
+ doc.source_path, doc.mime_type),
+ "archive_checksum": doc.archive_checksum,
+ "archive_media_filename": doc.archive_filename
}
- if doc.archive_checksum and os.path.isfile(doc.archive_path):
- meta['archive_checksum'] = doc.archive_checksum
+ if doc.has_archive_version:
meta['archive_size'] = os.stat(doc.archive_path).st_size,
meta['archive_metadata'] = self.get_metadata(
doc.archive_path, "application/pdf")
else:
- meta['archive_checksum'] = None
meta['archive_size'] = None
meta['archive_metadata'] = None