from documents.models import CustomFieldInstance
from documents.models import Document
from documents.models import DocumentType
-from documents.models import FileInfo
from documents.models import StoragePath
from documents.models import Tag
from documents.models import WorkflowTrigger
) -> Document:
# If someone gave us the original filename, use it instead of doc.
- file_info = FileInfo.from_filename(self.filename)
-
self.log.debug("Saving record to database")
if self.metadata.created is not None:
self.log.debug(
f"Creation date from post_documents parameter: {create_date}",
)
- elif file_info.created is not None:
- create_date = file_info.created
- self.log.debug(f"Creation date from FileInfo: {create_date}")
elif date is not None:
create_date = date
self.log.debug(f"Creation date from parse_date: {create_date}")
storage_type = Document.STORAGE_TYPE_UNENCRYPTED
- title = file_info.title
+ if self.metadata.filename:
+ title = Path(self.metadata.filename).stem
+ else:
+ title = self.input_doc.original_file.stem
+
if self.metadata.title is not None:
try:
title = self._parse_title_placeholders(self.metadata.title)
import datetime
-import os
-import re
-from collections import OrderedDict
from pathlib import Path
from typing import Final
-import dateutil.parser
import pathvalidate
from celery import states
from django.conf import settings
return f"SavedViewFilterRule: {self.rule_type} : {self.value}"
-# TODO: why is this in the models file?
-# TODO: how about, what is this and where is it documented?
-# It appears to parsing JSON from an environment variable to get a title and date from
-# the filename, if possible, as a higher priority than either document filename or
-# content parsing
-class FileInfo:
- REGEXES = OrderedDict(
- [
- (
- "created-title",
- re.compile(
- r"^(?P<created>\d{8}(\d{6})?Z) - (?P<title>.*)$",
- flags=re.IGNORECASE,
- ),
- ),
- ("title", re.compile(r"(?P<title>.*)$", flags=re.IGNORECASE)),
- ],
- )
-
- def __init__(
- self,
- created=None,
- correspondent=None,
- title=None,
- tags=(),
- extension=None,
- ):
- self.created = created
- self.title = title
- self.extension = extension
- self.correspondent = correspondent
- self.tags = tags
-
- @classmethod
- def _get_created(cls, created):
- try:
- return dateutil.parser.parse(f"{created[:-1]:0<14}Z")
- except ValueError:
- return None
-
- @classmethod
- def _get_title(cls, title):
- return title
-
- @classmethod
- def _mangle_property(cls, properties, name):
- if name in properties:
- properties[name] = getattr(cls, f"_get_{name}")(properties[name])
-
- @classmethod
- def from_filename(cls, filename) -> "FileInfo":
- # Mutate filename in-place before parsing its components
- # by applying at most one of the configured transformations.
- for pattern, repl in settings.FILENAME_PARSE_TRANSFORMS:
- (filename, count) = pattern.subn(repl, filename)
- if count:
- break
-
- # do this after the transforms so that the transforms can do whatever
- # with the file extension.
- filename_no_ext = os.path.splitext(filename)[0]
-
- if filename_no_ext == filename and filename.startswith("."):
- # This is a very special case where there is no text before the
- # file type.
- # TODO: this should be handled better. The ext is not removed
- # because usually, files like '.pdf' are just hidden files
- # with the name pdf, but in our case, its more likely that
- # there's just no name to begin with.
- filename = ""
- # This isn't too bad either, since we'll just not match anything
- # and return an empty title. TODO: actually, this is kinda bad.
- else:
- filename = filename_no_ext
-
- # Parse filename components.
- for regex in cls.REGEXES.values():
- m = regex.match(filename)
- if m:
- properties = m.groupdict()
- cls._mangle_property(properties, "created")
- cls._mangle_property(properties, "title")
- return cls(**properties)
-
-
# Extending User Model Using a One-To-One Link
class UiSettings(models.Model):
user = models.OneToOneField(
import datetime
import os
-import re
import shutil
import stat
import tempfile
import zoneinfo
from pathlib import Path
-from unittest import TestCase as UnittestTestCase
from unittest import mock
from unittest.mock import MagicMock
from documents.models import CustomField
from documents.models import Document
from documents.models import DocumentType
-from documents.models import FileInfo
from documents.models import StoragePath
from documents.models import Tag
from documents.parsers import DocumentParser
from paperless_mail.parsers import MailDocumentParser
-class TestAttributes(UnittestTestCase):
- TAGS = ("tag1", "tag2", "tag3")
-
- def _test_guess_attributes_from_name(self, filename, sender, title, tags):
- file_info = FileInfo.from_filename(filename)
-
- if sender:
- self.assertEqual(file_info.correspondent.name, sender, filename)
- else:
- self.assertIsNone(file_info.correspondent, filename)
-
- self.assertEqual(file_info.title, title, filename)
-
- self.assertEqual(tuple(t.name for t in file_info.tags), tags, filename)
-
- def test_guess_attributes_from_name_when_title_starts_with_dash(self):
- self._test_guess_attributes_from_name(
- "- weird but should not break.pdf",
- None,
- "- weird but should not break",
- (),
- )
-
- def test_guess_attributes_from_name_when_title_ends_with_dash(self):
- self._test_guess_attributes_from_name(
- "weird but should not break -.pdf",
- None,
- "weird but should not break -",
- (),
- )
-
-
-class TestFieldPermutations(TestCase):
- valid_dates = (
- "20150102030405Z",
- "20150102Z",
- )
- valid_correspondents = ["timmy", "Dr. McWheelie", "Dash Gor-don", "o Θεpμaoτής", ""]
- valid_titles = ["title", "Title w Spaces", "Title a-dash", "Tίτλoς", ""]
- valid_tags = ["tag", "tig,tag", "tag1,tag2,tag-3"]
-
- def _test_guessed_attributes(
- self,
- filename,
- created=None,
- correspondent=None,
- title=None,
- tags=None,
- ):
- info = FileInfo.from_filename(filename)
-
- # Created
- if created is None:
- self.assertIsNone(info.created, filename)
- else:
- self.assertEqual(info.created.year, int(created[:4]), filename)
- self.assertEqual(info.created.month, int(created[4:6]), filename)
- self.assertEqual(info.created.day, int(created[6:8]), filename)
-
- # Correspondent
- if correspondent:
- self.assertEqual(info.correspondent.name, correspondent, filename)
- else:
- self.assertEqual(info.correspondent, None, filename)
-
- # Title
- self.assertEqual(info.title, title, filename)
-
- # Tags
- if tags is None:
- self.assertEqual(info.tags, (), filename)
- else:
- self.assertEqual([t.name for t in info.tags], tags.split(","), filename)
-
- def test_just_title(self):
- template = "{title}.pdf"
- for title in self.valid_titles:
- spec = dict(title=title)
- filename = template.format(**spec)
- self._test_guessed_attributes(filename, **spec)
-
- def test_created_and_title(self):
- template = "{created} - {title}.pdf"
-
- for created in self.valid_dates:
- for title in self.valid_titles:
- spec = {"created": created, "title": title}
- self._test_guessed_attributes(template.format(**spec), **spec)
-
- def test_invalid_date_format(self):
- info = FileInfo.from_filename("06112017Z - title.pdf")
- self.assertEqual(info.title, "title")
- self.assertIsNone(info.created)
-
- def test_filename_parse_transforms(self):
- filename = "tag1,tag2_20190908_180610_0001.pdf"
- all_patt = re.compile("^.*$")
- none_patt = re.compile("$a")
- re.compile("^([a-z0-9,]+)_(\\d{8})_(\\d{6})_([0-9]+)\\.")
-
- # No transformations configured (= default)
- info = FileInfo.from_filename(filename)
- self.assertEqual(info.title, "tag1,tag2_20190908_180610_0001")
- self.assertEqual(info.tags, ())
- self.assertIsNone(info.created)
-
- # Pattern doesn't match (filename unaltered)
- with self.settings(FILENAME_PARSE_TRANSFORMS=[(none_patt, "none.gif")]):
- info = FileInfo.from_filename(filename)
- self.assertEqual(info.title, "tag1,tag2_20190908_180610_0001")
-
- # Simple transformation (match all)
- with self.settings(FILENAME_PARSE_TRANSFORMS=[(all_patt, "all.gif")]):
- info = FileInfo.from_filename(filename)
- self.assertEqual(info.title, "all")
-
- # Multiple transformations configured (first pattern matches)
- with self.settings(
- FILENAME_PARSE_TRANSFORMS=[
- (all_patt, "all.gif"),
- (all_patt, "anotherall.gif"),
- ],
- ):
- info = FileInfo.from_filename(filename)
- self.assertEqual(info.title, "all")
-
- # Multiple transformations configured (second pattern matches)
- with self.settings(
- FILENAME_PARSE_TRANSFORMS=[
- (none_patt, "none.gif"),
- (all_patt, "anotherall.gif"),
- ],
- ):
- info = FileInfo.from_filename(filename)
- self.assertEqual(info.title, "anotherall")
-
-
class _BaseTestParser(DocumentParser):
def get_settings(self):
"""