"INP001",
"T201",
]
-lint.per-file-ignores."src/documents/management/commands/document_consumer.py" = [
- "PTH",
-] # TODO Enable & remove
-lint.per-file-ignores."src/documents/migrations/1012_fix_archive_files.py" = [
- "PTH",
-] # TODO Enable & remove
lint.per-file-ignores."src/documents/models.py" = [
"SIM115",
]
-lint.per-file-ignores."src/documents/parsers.py" = [
- "PTH",
-] # TODO Enable & remove
lint.per-file-ignores."src/paperless_tesseract/tests/test_parser.py" = [
"RUF001",
]
logger = logging.getLogger("paperless.management.consumer")
def _tags_from_path(filepath: Path) -> list[int]:
    """
    Walk up the directory tree from filepath to CONSUMPTION_DIR
    and get or create Tag IDs for every directory.
    """
    db.close_old_connections()
    # Every directory between CONSUMPTION_DIR and the file becomes a tag.
    relative_parts = filepath.relative_to(settings.CONSUMPTION_DIR).parent.parts
    tag_ids = {
        Tag.objects.get_or_create(name__iexact=part, defaults={"name": part})[0].pk
        for part in relative_parts
    }
    return list(tag_ids)
-def _is_ignored(filepath: str) -> bool:
+def _is_ignored(filepath: Path) -> bool:
"""
Checks if the given file should be ignored, based on configured
patterns.
Returns True if the file is ignored, False otherwise
"""
- filepath = os.path.abspath(
- os.path.normpath(filepath),
- )
-
# Trim out the consume directory, leaving only filename and it's
# path relative to the consume directory
filepath_relative = PurePath(filepath).relative_to(settings.CONSUMPTION_DIR)
return False
-def _consume(filepath: str) -> None:
- if os.path.isdir(filepath) or _is_ignored(filepath):
+def _consume(filepath: Path) -> None:
+ if filepath.is_dir() or _is_ignored(filepath):
return
- if not os.path.isfile(filepath):
+ if not filepath.is_file():
logger.debug(f"Not consuming file {filepath}: File has moved.")
return
- if not is_file_ext_supported(os.path.splitext(filepath)[1]):
+ if not is_file_ext_supported(filepath.suffix):
logger.warning(f"Not consuming file {filepath}: Unknown file extension.")
return
while (read_try_count < os_error_retry_count) and not file_open_ok:
try:
- with open(filepath, "rb"):
+ with filepath.open("rb"):
file_open_ok = True
except OSError as e:
read_try_count += 1
logger.exception("Error while consuming document")
-def _consume_wait_unmodified(file: str) -> None:
+def _consume_wait_unmodified(file: Path) -> None:
"""
Waits for the given file to appear unmodified based on file size
and modification time. Will wait a configured number of seconds
current_try = 0
while current_try < settings.CONSUMER_POLLING_RETRY_COUNT:
try:
- stat_data = os.stat(file)
+ stat_data = file.stat()
new_mtime = stat_data.st_mtime
new_size = stat_data.st_size
except FileNotFoundError:
self._pool = pool
def on_created(self, event):
    # A new file appeared; queue it for consumption once it stops changing.
    src = Path(event.src_path)
    self._pool.submit(_consume_wait_unmodified, src)
def on_moved(self, event):
    # A file was moved into (or within) the watched tree; consume its new location.
    dest = Path(event.dest_path)
    self._pool.submit(_consume_wait_unmodified, dest)
class Command(BaseCommand):
if not directory:
raise CommandError("CONSUMPTION_DIR does not appear to be set.")
- directory = os.path.abspath(directory)
+ directory = Path(directory).resolve()
- if not os.path.isdir(directory):
+ if not directory.is_dir():
raise CommandError(f"Consumption directory {directory} does not exist")
# Consumer will need this
if recursive:
for dirpath, _, filenames in os.walk(directory):
for filename in filenames:
- filepath = os.path.join(dirpath, filename)
+ filepath = Path(dirpath) / filename
_consume(filepath)
else:
- for entry in os.scandir(directory):
- _consume(entry.path)
+ for filepath in directory.iterdir():
+ _consume(filepath)
if options["oneshot"]:
return
try:
for event in inotify.read(timeout=timeout_ms):
path = inotify.get_path(event.wd) if recursive else directory
- filepath = os.path.join(path, event.name)
+ filepath = Path(path) / event.name
if flags.MODIFY in flags.from_mask(event.mask):
notified_files.pop(filepath, None)
else:
# Also make sure the file exists still, some scanners might write a
# temporary file first
- file_still_exists = os.path.exists(filepath) and os.path.isfile(
- filepath,
- )
+ file_still_exists = filepath.exists() and filepath.is_file()
if waited_long_enough and file_still_exists:
_consume(filepath)
import os
import shutil
from collections import defaultdict
+from pathlib import Path
from time import sleep
import pathvalidate
return mydictionary
def archive_name_from_filename(filename: Path) -> Path:
    """Derive the archive (PDF) filename for a given original filename.

    Uses with_suffix so that any directory components of *filename* are
    preserved — matching the historical ``os.path.splitext(...)[0] + ".pdf"``
    behaviour this migration must reproduce. Building the result from
    ``filename.stem`` would silently drop parent directories for documents
    stored under a nested filename format.
    """
    # Path(...) also tolerates a plain-str argument from older callers.
    return Path(filename).with_suffix(".pdf")
def archive_path_old(doc) -> Path:
    """Return the pre-migration archive path for *doc*.

    Falls back to a zero-padded primary-key name when the document has
    no stored filename.
    """
    name = (
        archive_name_from_filename(Path(doc.filename))
        if doc.filename
        else Path(f"{doc.pk:07}.pdf")
    )
    return settings.ARCHIVE_DIR / name
STORAGE_TYPE_GPG = "gpg"
def archive_path_new(doc) -> Path | None:
    """Return the post-migration archive path for *doc*.

    Returns None when the document has no archive file.
    """
    if doc.archive_filename is None:
        return None
    return settings.ARCHIVE_DIR / doc.archive_filename
def source_path(doc) -> Path:
    """Return the filesystem path of the original file for *doc*."""
    fname = doc.filename if doc.filename else f"{doc.pk:07}{doc.file_type}"
    # GPG-encrypted documents carry an extra .gpg extension on disk.
    if doc.storage_type == STORAGE_TYPE_GPG:
        fname = Path(f"{fname}.gpg")  # pragma: no cover
    return settings.ORIGINALS_DIR / fname
def generate_unique_filename(doc, *, archive_filename=False):
# still the same as before.
return new_filename
- if os.path.exists(os.path.join(root, new_filename)):
+ if (root / new_filename).exists():
counter += 1
else:
return new_filename
parser,
source_path(doc),
doc.mime_type,
- os.path.basename(doc.filename),
+ Path(doc.filename).name,
)
doc.content = parser.get_text()
- if parser.get_archive_path() and os.path.isfile(parser.get_archive_path()):
+ if parser.get_archive_path() and Path(parser.get_archive_path()).is_file():
doc.archive_filename = generate_unique_filename(
doc,
archive_filename=True,
)
- with open(parser.get_archive_path(), "rb") as f:
+ with Path(parser.get_archive_path()).open("rb") as f:
doc.archive_checksum = hashlib.md5(f.read()).hexdigest()
- os.makedirs(os.path.dirname(archive_path_new(doc)), exist_ok=True)
+ archive_path_new(doc).parent.mkdir(parents=True, exist_ok=True)
shutil.copy2(parser.get_archive_path(), archive_path_new(doc))
else:
doc.archive_checksum = None
# check that archive files of all unaffected documents are in place
for doc in Document.objects.filter(archive_checksum__isnull=False):
old_path = archive_path_old(doc)
- if doc.id not in affected_document_ids and not os.path.isfile(old_path):
+ if doc.id not in affected_document_ids and not old_path.is_file():
raise ValueError(
f"Archived document ID:{doc.id} does not exist at: {old_path}",
)
if doc.id in affected_document_ids:
old_path = archive_path_old(doc)
# remove affected archive versions
- if os.path.isfile(old_path):
+ if old_path.is_file():
logger.debug(f"Removing {old_path}")
- os.unlink(old_path)
+ old_path.unlink()
else:
# Set archive path for unaffected files
- doc.archive_filename = archive_name_from_filename(doc.filename)
+ doc.archive_filename = archive_name_from_filename(Path(doc.filename))
Document.objects.filter(id=doc.id).update(
archive_filename=doc.archive_filename,
)
f"filename.",
)
old_archive_paths.add(old_archive_path)
- if new_archive_path != old_archive_path and os.path.isfile(old_archive_path):
+ if new_archive_path != old_archive_path and old_archive_path.is_file():
raise ValueError(
f"Cannot migrate: Cannot move {new_archive_path} to "
f"{old_archive_path}: file already exists.",
args += ["-depth", str(depth)] if depth else []
args += ["-auto-orient"] if auto_orient else []
args += ["-define", "pdf:use-cropbox=true"] if use_cropbox else []
- args += [input_file, output_file]
+ args += [str(input_file), str(output_file)]
logger.debug("Execute: " + " ".join(args), extra={"group": logging_group})
return (Path(__file__).parent / "resources" / "document.webp").resolve()
-def make_thumbnail_from_pdf_gs_fallback(in_path, temp_dir, logging_group=None) -> str:
- out_path = os.path.join(temp_dir, "convert_gs.webp")
+def make_thumbnail_from_pdf_gs_fallback(in_path, temp_dir, logging_group=None) -> Path:
+ out_path: Path = Path(temp_dir) / "convert_gs.webp"
# if convert fails, fall back to extracting
# the first PDF page as a PNG using Ghostscript
extra={"group": logging_group},
)
# Ghostscript doesn't handle WebP outputs
- gs_out_path = os.path.join(temp_dir, "gs_out.png")
+ gs_out_path: Path = Path(temp_dir) / "gs_out.png"
cmd = [settings.GS_BINARY, "-q", "-sDEVICE=pngalpha", "-o", gs_out_path, in_path]
try:
# The caller might expect a generated thumbnail that can be moved,
# so we need to copy it before it gets moved.
# https://github.com/paperless-ngx/paperless-ngx/issues/3631
- default_thumbnail_path = os.path.join(temp_dir, "document.webp")
+ default_thumbnail_path: Path = Path(temp_dir) / "document.webp"
copy_file_with_basic_stats(get_default_thumbnail(), default_thumbnail_path)
return default_thumbnail_path
-def make_thumbnail_from_pdf(in_path, temp_dir, logging_group=None) -> Path:
+def make_thumbnail_from_pdf(in_path: Path, temp_dir: Path, logging_group=None) -> Path:
"""
The thumbnail of a PDF is just a 500px wide image of the first page.
"""
- out_path = temp_dir / "convert.webp"
+ out_path: Path = temp_dir / "convert.webp"
# Run convert to get a decent thumbnail
try:
},
)
@override_settings(
- MODEL_FILE=(Path(__file__).parent / "data" / "model.pickle").as_posix(),
+ MODEL_FILE=str(Path(__file__).parent / "data" / "model.pickle"),
)
@pytest.mark.skip(
reason="Disabled caching due to high memory usage - need to investigate.",
# https://github.com/jonaswinkler/paperless-ng/discussions/1037
filename = self.get_test_file()
- shadow_file = Path(self.dirs.scratch_dir / "._sample.pdf")
+ shadow_file = Path(self.dirs.scratch_dir) / "._sample.pdf"
shutil.copy(filename, shadow_file)
def test_is_ignored(self):
test_paths = [
{
- "path": (Path(self.dirs.consumption_dir) / "foo.pdf").as_posix(),
+ "path": str(Path(self.dirs.consumption_dir) / "foo.pdf"),
"ignore": False,
},
{
- "path": (
- Path(self.dirs.consumption_dir) / "foo" / "bar.pdf"
- ).as_posix(),
+ "path": str(
+ Path(self.dirs.consumption_dir) / "foo" / "bar.pdf",
+ ),
"ignore": False,
},
{
- "path": (Path(self.dirs.consumption_dir) / ".DS_STORE").as_posix(),
+ "path": str(Path(self.dirs.consumption_dir) / ".DS_STORE"),
"ignore": True,
},
{
- "path": (Path(self.dirs.consumption_dir) / ".DS_Store").as_posix(),
+ "path": str(Path(self.dirs.consumption_dir) / ".DS_Store"),
"ignore": True,
},
{
- "path": (
- Path(self.dirs.consumption_dir) / ".stfolder" / "foo.pdf"
- ).as_posix(),
+ "path": str(
+ Path(self.dirs.consumption_dir) / ".stfolder" / "foo.pdf",
+ ),
"ignore": True,
},
{
- "path": (Path(self.dirs.consumption_dir) / ".stfolder.pdf").as_posix(),
+ "path": str(Path(self.dirs.consumption_dir) / ".stfolder.pdf"),
"ignore": False,
},
{
- "path": (
- Path(self.dirs.consumption_dir) / ".stversions" / "foo.pdf"
- ).as_posix(),
+ "path": str(
+ Path(self.dirs.consumption_dir) / ".stversions" / "foo.pdf",
+ ),
"ignore": True,
},
{
- "path": (
- Path(self.dirs.consumption_dir) / ".stversions.pdf"
- ).as_posix(),
+ "path": str(
+ Path(self.dirs.consumption_dir) / ".stversions.pdf",
+ ),
"ignore": False,
},
{
- "path": (Path(self.dirs.consumption_dir) / "._foo.pdf").as_posix(),
+ "path": str(Path(self.dirs.consumption_dir) / "._foo.pdf"),
"ignore": True,
},
{
- "path": (Path(self.dirs.consumption_dir) / "my_foo.pdf").as_posix(),
+ "path": str(Path(self.dirs.consumption_dir) / "my_foo.pdf"),
"ignore": False,
},
{
- "path": (
- Path(self.dirs.consumption_dir) / "._foo" / "bar.pdf"
- ).as_posix(),
+ "path": str(
+ Path(self.dirs.consumption_dir) / "._foo" / "bar.pdf",
+ ),
"ignore": True,
},
{
- "path": (
+ "path": str(
Path(self.dirs.consumption_dir)
/ "@eaDir"
/ "SYNO@.fileindexdb"
- / "_1jk.fnm"
- ).as_posix(),
+ / "_1jk.fnm",
+ ),
"ignore": True,
},
]
f'_is_ignored("{filepath}") != {expected_ignored_result}',
)
- @mock.patch("documents.management.commands.document_consumer.open")
+ @mock.patch("documents.management.commands.document_consumer.Path.open")
def test_consume_file_busy(self, open_mock):
# Calling this mock always raises this
open_mock.side_effect = OSError
for element in manifest:
if element["model"] == "documents.document":
- fname = (
- self.target / element[document_exporter.EXPORTER_FILE_NAME]
- ).as_posix()
+ fname = str(
+ self.target / element[document_exporter.EXPORTER_FILE_NAME],
+ )
self.assertIsFile(fname)
self.assertIsFile(
self.target / element[document_exporter.EXPORTER_THUMBNAIL_NAME],
call_command(*args)
- expected_file = (
- self.target / f"export-{timezone.localdate().isoformat()}.zip"
- ).as_posix()
+ expected_file = str(
+ self.target / f"export-{timezone.localdate().isoformat()}.zip",
+ )
self.assertIsFile(expected_file)
):
call_command(*args)
- expected_file = (
- self.target / f"export-{timezone.localdate().isoformat()}.zip"
- ).as_posix()
+ expected_file = str(
+ self.target / f"export-{timezone.localdate().isoformat()}.zip",
+ )
self.assertIsFile(expected_file)
call_command(*args)
- expected_file = (
- self.target / f"export-{timezone.localdate().isoformat()}.zip"
- ).as_posix()
+ expected_file = str(
+ self.target / f"export-{timezone.localdate().isoformat()}.zip",
+ )
self.assertIsFile(expected_file)
self.assertIsNotFile(existing_file)
)
def archive_name_from_filename(filename: Path) -> Path:
    """Return the archive filename (".pdf") derived from *filename*.

    Note: deliberately keeps only the stem, discarding any parent
    directories, matching the existing behaviour of this helper.
    """
    base = filename.stem
    return Path(f"{base}.pdf")
def archive_path_old(self) -> Path:
    """Return the old-style archive path for this document.

    Uses the stored filename when present, otherwise a zero-padded
    primary-key based name.
    """
    fname = (
        archive_name_from_filename(Path(self.filename))
        if self.filename
        else Path(f"{self.pk:07}.pdf")
    )
    return Path(settings.ARCHIVE_DIR) / fname
databases = {
"default": {
"ENGINE": "django.db.backends.sqlite3",
- "NAME": str(DATA_DIR / "db.sqlite3"),
+ "NAME": DATA_DIR / "db.sqlite3",
"OPTIONS": {},
},
}
("zh-tw", _("Chinese Traditional")),
]
-LOCALE_PATHS = [str(BASE_DIR / "locale")]
+LOCALE_PATHS = [BASE_DIR / "locale"]
TIME_ZONE = os.getenv("PAPERLESS_TIME_ZONE", "UTC")
"file_paperless": {
"class": "concurrent_log_handler.ConcurrentRotatingFileHandler",
"formatter": "verbose",
- "filename": str(LOGGING_DIR / "paperless.log"),
+ "filename": LOGGING_DIR / "paperless.log",
"maxBytes": LOGROTATE_MAX_SIZE,
"backupCount": LOGROTATE_MAX_BACKUPS,
},
"file_mail": {
"class": "concurrent_log_handler.ConcurrentRotatingFileHandler",
"formatter": "verbose",
- "filename": str(LOGGING_DIR / "mail.log"),
+ "filename": LOGGING_DIR / "mail.log",
"maxBytes": LOGROTATE_MAX_SIZE,
"backupCount": LOGROTATE_MAX_BACKUPS,
},
"file_celery": {
"class": "concurrent_log_handler.ConcurrentRotatingFileHandler",
"formatter": "verbose",
- "filename": str(LOGGING_DIR / "celery.log"),
+ "filename": LOGGING_DIR / "celery.log",
"maxBytes": LOGROTATE_MAX_SIZE,
"backupCount": LOGROTATE_MAX_BACKUPS,
},
CELERY_BEAT_SCHEDULE = _parse_beat_schedule()
# https://docs.celeryq.dev/en/stable/userguide/configuration.html#beat-schedule-filename
-CELERY_BEAT_SCHEDULE_FILENAME = str(DATA_DIR / "celerybeat-schedule.db")
+CELERY_BEAT_SCHEDULE_FILENAME = DATA_DIR / "celerybeat-schedule.db"
# Cachalot: Database read cache.
"""
parser = RasterisedDocumentParser(uuid.uuid4())
page_count = parser.get_page_count(
- (self.SAMPLE_FILES / "simple-digital.pdf").as_posix(),
+ str(self.SAMPLE_FILES / "simple-digital.pdf"),
"application/pdf",
)
self.assertEqual(page_count, 1)
page_count = parser.get_page_count(
- (self.SAMPLE_FILES / "multi-page-mixed.pdf").as_posix(),
+ str(self.SAMPLE_FILES / "multi-page-mixed.pdf"),
"application/pdf",
)
self.assertEqual(page_count, 6)
parser = RasterisedDocumentParser(uuid.uuid4())
with self.assertLogs("paperless.parsing.tesseract", level="WARNING") as cm:
page_count = parser.get_page_count(
- (self.SAMPLE_FILES / "password-protected.pdf").as_posix(),
+ str(self.SAMPLE_FILES / "password-protected.pdf"),
"application/pdf",
)
self.assertEqual(page_count, None)
def test_thumbnail(self):
parser = RasterisedDocumentParser(uuid.uuid4())
thumb = parser.get_thumbnail(
- (self.SAMPLE_FILES / "simple-digital.pdf").as_posix(),
+ str(self.SAMPLE_FILES / "simple-digital.pdf"),
"application/pdf",
)
self.assertIsFile(thumb)
@mock.patch("documents.parsers.run_convert")
def test_thumbnail_fallback(self, m):
def call_convert(input_file, output_file, **kwargs):
- if ".pdf" in input_file:
+ if ".pdf" in str(input_file):
raise ParseError("Does not compute.")
else:
run_convert(input_file=input_file, output_file=output_file, **kwargs)
parser = RasterisedDocumentParser(uuid.uuid4())
thumb = parser.get_thumbnail(
- (self.SAMPLE_FILES / "simple-digital.pdf").as_posix(),
+ str(self.SAMPLE_FILES / "simple-digital.pdf"),
"application/pdf",
)
self.assertIsFile(thumb)
def test_thumbnail_encrypted(self):
parser = RasterisedDocumentParser(uuid.uuid4())
thumb = parser.get_thumbnail(
- (self.SAMPLE_FILES / "encrypted.pdf").as_posix(),
+ str(self.SAMPLE_FILES / "encrypted.pdf"),
"application/pdf",
)
self.assertIsFile(thumb)
def test_get_dpi(self):
parser = RasterisedDocumentParser(None)
- dpi = parser.get_dpi((self.SAMPLE_FILES / "simple-no-dpi.png").as_posix())
+ dpi = parser.get_dpi(str(self.SAMPLE_FILES / "simple-no-dpi.png"))
self.assertEqual(dpi, None)
- dpi = parser.get_dpi((self.SAMPLE_FILES / "simple.png").as_posix())
+ dpi = parser.get_dpi(str(self.SAMPLE_FILES / "simple.png"))
self.assertEqual(dpi, 72)
def test_simple_digital(self):
parser = RasterisedDocumentParser(None)
parser.parse(
- (self.SAMPLE_FILES / "simple-digital.pdf").as_posix(),
+ str(self.SAMPLE_FILES / "simple-digital.pdf"),
"application/pdf",
)
parser = RasterisedDocumentParser(None)
parser.parse(
- (self.SAMPLE_FILES / "with-form.pdf").as_posix(),
+ str(self.SAMPLE_FILES / "with-form.pdf"),
"application/pdf",
)
parser = RasterisedDocumentParser(None)
parser.parse(
- (self.SAMPLE_FILES / "with-form.pdf").as_posix(),
+ str(self.SAMPLE_FILES / "with-form.pdf"),
"application/pdf",
)
def test_signed(self):
parser = RasterisedDocumentParser(None)
- parser.parse((self.SAMPLE_FILES / "signed.pdf").as_posix(), "application/pdf")
+ parser.parse(str(self.SAMPLE_FILES / "signed.pdf"), "application/pdf")
self.assertIsNone(parser.archive_path)
self.assertContainsStrings(
parser = RasterisedDocumentParser(None)
parser.parse(
- (self.SAMPLE_FILES / "encrypted.pdf").as_posix(),
+ str(self.SAMPLE_FILES / "encrypted.pdf"),
"application/pdf",
)
def test_with_form_error_notext(self):
parser = RasterisedDocumentParser(None)
parser.parse(
- (self.SAMPLE_FILES / "with-form.pdf").as_posix(),
+ str(self.SAMPLE_FILES / "with-form.pdf"),
"application/pdf",
)
parser = RasterisedDocumentParser(None)
parser.parse(
- (self.SAMPLE_FILES / "with-form.pdf").as_posix(),
+ str(self.SAMPLE_FILES / "with-form.pdf"),
"application/pdf",
)
def test_image_simple(self):
parser = RasterisedDocumentParser(None)
- parser.parse((self.SAMPLE_FILES / "simple.png").as_posix(), "image/png")
+ parser.parse(str(self.SAMPLE_FILES / "simple.png"), "image/png")
self.assertIsFile(parser.archive_path)
dest_file = Path(tempdir) / "simple-alpha.png"
shutil.copy(sample_file, dest_file)
- parser.parse(dest_file.as_posix(), "image/png")
+ parser.parse(str(dest_file), "image/png")
self.assertIsFile(parser.archive_path)
parser = RasterisedDocumentParser(None)
dpi = parser.calculate_a4_dpi(
- (self.SAMPLE_FILES / "simple-no-dpi.png").as_posix(),
+ str(self.SAMPLE_FILES / "simple-no-dpi.png"),
)
self.assertEqual(dpi, 62)
def f():
parser.parse(
- (self.SAMPLE_FILES / "simple-no-dpi.png").as_posix(),
+ str(self.SAMPLE_FILES / "simple-no-dpi.png"),
"image/png",
)
def test_image_no_dpi_default(self):
parser = RasterisedDocumentParser(None)
- parser.parse((self.SAMPLE_FILES / "simple-no-dpi.png").as_posix(), "image/png")
+ parser.parse(str(self.SAMPLE_FILES / "simple-no-dpi.png"), "image/png")
self.assertIsFile(parser.archive_path)
def test_multi_page(self):
parser = RasterisedDocumentParser(None)
parser.parse(
- (self.SAMPLE_FILES / "multi-page-digital.pdf").as_posix(),
+ str(self.SAMPLE_FILES / "multi-page-digital.pdf"),
"application/pdf",
)
self.assertIsFile(parser.archive_path)
def test_multi_page_pages_skip(self):
parser = RasterisedDocumentParser(None)
parser.parse(
- (self.SAMPLE_FILES / "multi-page-digital.pdf").as_posix(),
+ str(self.SAMPLE_FILES / "multi-page-digital.pdf"),
"application/pdf",
)
self.assertIsFile(parser.archive_path)
def test_multi_page_pages_redo(self):
parser = RasterisedDocumentParser(None)
parser.parse(
- (self.SAMPLE_FILES / "multi-page-digital.pdf").as_posix(),
+ str(self.SAMPLE_FILES / "multi-page-digital.pdf"),
"application/pdf",
)
self.assertIsFile(parser.archive_path)
def test_multi_page_pages_force(self):
parser = RasterisedDocumentParser(None)
parser.parse(
- (self.SAMPLE_FILES / "multi-page-digital.pdf").as_posix(),
+ str(self.SAMPLE_FILES / "multi-page-digital.pdf"),
"application/pdf",
)
self.assertIsFile(parser.archive_path)
def test_multi_page_analog_pages_skip(self):
parser = RasterisedDocumentParser(None)
parser.parse(
- (self.SAMPLE_FILES / "multi-page-images.pdf").as_posix(),
+ str(self.SAMPLE_FILES / "multi-page-images.pdf"),
"application/pdf",
)
self.assertIsFile(parser.archive_path)
"""
parser = RasterisedDocumentParser(None)
parser.parse(
- (self.SAMPLE_FILES / "multi-page-images.pdf").as_posix(),
+ str(self.SAMPLE_FILES / "multi-page-images.pdf"),
"application/pdf",
)
self.assertIsFile(parser.archive_path)
"""
parser = RasterisedDocumentParser(None)
parser.parse(
- (self.SAMPLE_FILES / "multi-page-images.pdf").as_posix(),
+ str(self.SAMPLE_FILES / "multi-page-images.pdf"),
"application/pdf",
)
self.assertIsFile(parser.archive_path)
"""
parser = RasterisedDocumentParser(None)
parser.parse(
- (self.SAMPLE_FILES / "multi-page-digital.pdf").as_posix(),
+ str(self.SAMPLE_FILES / "multi-page-digital.pdf"),
"application/pdf",
)
self.assertIsNone(parser.archive_path)
"""
parser = RasterisedDocumentParser(None)
parser.parse(
- (self.SAMPLE_FILES / "multi-page-images.pdf").as_posix(),
+ str(self.SAMPLE_FILES / "multi-page-images.pdf"),
"application/pdf",
)
"""
parser = RasterisedDocumentParser(None)
parser.parse(
- (self.SAMPLE_FILES / "multi-page-digital.pdf").as_posix(),
+ str(self.SAMPLE_FILES / "multi-page-digital.pdf"),
"application/pdf",
)
self.assertIsNotNone(parser.archive_path)
"""
parser = RasterisedDocumentParser(None)
parser.parse(
- (self.SAMPLE_FILES / "multi-page-images.pdf").as_posix(),
+ str(self.SAMPLE_FILES / "multi-page-images.pdf"),
"application/pdf",
)
self.assertIsNotNone(parser.archive_path)
"""
parser = RasterisedDocumentParser(None)
parser.parse(
- (self.SAMPLE_FILES / "multi-page-digital.pdf").as_posix(),
+ str(self.SAMPLE_FILES / "multi-page-digital.pdf"),
"application/pdf",
)
self.assertIsNone(parser.archive_path)
"""
parser = RasterisedDocumentParser(None)
parser.parse(
- (self.SAMPLE_FILES / "multi-page-images.pdf").as_posix(),
+ str(self.SAMPLE_FILES / "multi-page-images.pdf"),
"application/pdf",
)
self.assertIsNotNone(parser.archive_path)
"""
parser = RasterisedDocumentParser(None)
parser.parse(
- (self.SAMPLE_FILES / "multi-page-digital.pdf").as_posix(),
+ str(self.SAMPLE_FILES / "multi-page-digital.pdf"),
"application/pdf",
)
self.assertIsNone(parser.archive_path)
"""
parser = RasterisedDocumentParser(None)
parser.parse(
- (self.SAMPLE_FILES / "multi-page-images.pdf").as_posix(),
+ str(self.SAMPLE_FILES / "multi-page-images.pdf"),
"application/pdf",
)
self.assertIsNone(parser.archive_path)
"""
parser = RasterisedDocumentParser(None)
parser.parse(
- (self.SAMPLE_FILES / "multi-page-mixed.pdf").as_posix(),
+ str(self.SAMPLE_FILES / "multi-page-mixed.pdf"),
"application/pdf",
)
self.assertIsNotNone(parser.archive_path)
"""
parser = RasterisedDocumentParser(None)
parser.parse(
- (self.SAMPLE_FILES / "single-page-mixed.pdf").as_posix(),
+ str(self.SAMPLE_FILES / "single-page-mixed.pdf"),
"application/pdf",
)
self.assertIsNotNone(parser.archive_path)
"""
parser = RasterisedDocumentParser(None)
parser.parse(
- (self.SAMPLE_FILES / "multi-page-mixed.pdf").as_posix(),
+ str(self.SAMPLE_FILES / "multi-page-mixed.pdf"),
"application/pdf",
)
self.assertIsNone(parser.archive_path)
@override_settings(OCR_MODE="skip", OCR_ROTATE_PAGES=True)
def test_rotate(self):
parser = RasterisedDocumentParser(None)
- parser.parse((self.SAMPLE_FILES / "rotated.pdf").as_posix(), "application/pdf")
+ parser.parse(str(self.SAMPLE_FILES / "rotated.pdf"), "application/pdf")
self.assertContainsStrings(
parser.get_text(),
[
"""
parser = RasterisedDocumentParser(None)
parser.parse(
- (self.SAMPLE_FILES / "multi-page-images.tiff").as_posix(),
+ str(self.SAMPLE_FILES / "multi-page-images.tiff"),
"image/tiff",
)
self.assertIsFile(parser.archive_path)
- Text from all pages extracted
"""
parser = RasterisedDocumentParser(None)
- sample_file = (
- self.SAMPLE_FILES / "multi-page-images-alpha-rgb.tiff"
- ).as_posix()
+ sample_file = str(
+ self.SAMPLE_FILES / "multi-page-images-alpha-rgb.tiff",
+ )
with tempfile.NamedTemporaryFile() as tmp_file:
shutil.copy(sample_file, tmp_file.name)
parser.parse(
parser = RasterisedDocumentParser(None)
parser.parse(
- (self.SAMPLE_FILES / "rtl-test.pdf").as_posix(),
+ str(self.SAMPLE_FILES / "rtl-test.pdf"),
"application/pdf",
)
self.assertRaises(
ParseError,
parser.parse,
- (self.SAMPLE_FILES / "simple-digital.pdf").as_posix(),
+ str(self.SAMPLE_FILES / "simple-digital.pdf"),
"application/pdf",
)
def test_bmp(self):
parser = RasterisedDocumentParser(None)
- parser.parse((self.SAMPLE_FILES / "simple.bmp").as_posix(), "image/bmp")
+ parser.parse(str(self.SAMPLE_FILES / "simple.bmp"), "image/bmp")
self.assertIsFile(parser.archive_path)
self.assertIn("this is a test document", parser.get_text().lower())
def test_jpg(self):
parser = RasterisedDocumentParser(None)
- parser.parse((self.SAMPLE_FILES / "simple.jpg").as_posix(), "image/jpeg")
+ parser.parse(str(self.SAMPLE_FILES / "simple.jpg"), "image/jpeg")
self.assertIsFile(parser.archive_path)
self.assertIn("this is a test document", parser.get_text().lower())
def test_heic(self):
parser = RasterisedDocumentParser(None)
- parser.parse((self.SAMPLE_FILES / "simple.heic").as_posix(), "image/heic")
+ parser.parse(str(self.SAMPLE_FILES / "simple.heic"), "image/heic")
self.assertIsFile(parser.archive_path)
self.assertIn("pizza", parser.get_text().lower())
@override_settings(OCR_IMAGE_DPI=200)
def test_gif(self):
parser = RasterisedDocumentParser(None)
- parser.parse((self.SAMPLE_FILES / "simple.gif").as_posix(), "image/gif")
+ parser.parse(str(self.SAMPLE_FILES / "simple.gif"), "image/gif")
self.assertIsFile(parser.archive_path)
self.assertIn("this is a test document", parser.get_text().lower())
def test_tiff(self):
parser = RasterisedDocumentParser(None)
- parser.parse((self.SAMPLE_FILES / "simple.tif").as_posix(), "image/tiff")
+ parser.parse(str(self.SAMPLE_FILES / "simple.tif"), "image/tiff")
self.assertIsFile(parser.archive_path)
self.assertIn("this is a test document", parser.get_text().lower())
def test_webp(self):
parser = RasterisedDocumentParser(None)
parser.parse(
- (self.SAMPLE_FILES / "document.webp").as_posix(),
+ str(self.SAMPLE_FILES / "document.webp"),
"image/webp",
)
self.assertIsFile(parser.archive_path)