uvicorn = {extras = ["standard"], version = "==0.25.0"}
watchdog = "~=4.0"
whitenoise = "~=6.7"
-whoosh="~=2.7"
+whoosh = "~=2.7"
zxing-cpp = {version = "*", platform_machine = "== 'x86_64'"}
[dev-packages]
pytest-env = "*"
pytest-sugar = "*"
pytest-xdist = "*"
+pytest-mock = "*"
pytest-rerunfailures = "*"
imagehash = "*"
daphne = "*"
{
"_meta": {
"hash": {
- "sha256": "37d8a84e16b6f6785d0daa79b249beab7fbef0c177a13eccfce79816bf61ccd0"
+ "sha256": "272a69e9011a60f2d326b77d99d261425b66ebcc8ae929372213700ae47de0f5"
},
"pipfile-spec": 6,
"requires": {},
"markers": "python_version >= '3.9'",
"version": "==0.30.0"
},
+ "pytest-mock": {
+ "hashes": [
+ "sha256:0b72c38033392a5f4621342fe11e9219ac11ec9d375f8e2a0c164539e0d70f6f",
+ "sha256:2719255a1efeceadbc056d6bf3df3d1c5015530fb40cf347c0f9afac88410bd0"
+ ],
+ "index": "pypi",
+ "markers": "python_version >= '3.8'",
+ "version": "==3.14.0"
+ },
"pytest-rerunfailures": {
"hashes": [
"sha256:4197bdd2eaeffdbf50b5ea6e7236f47ff0e44d1def8dae08e409f536d84e7b32",
return default_thumbnail_path
-def make_thumbnail_from_pdf(in_path, temp_dir, logging_group=None) -> str:
+def make_thumbnail_from_pdf(in_path, temp_dir, logging_group=None) -> Path:
"""
The thumbnail of a PDF is just a 500px wide image of the first page.
"""
- out_path = os.path.join(temp_dir, "convert.webp")
+ out_path = temp_dir / "convert.webp"
# Run convert to get a decent thumbnail
try:
auto_orient=True,
use_cropbox=True,
input_file=f"{in_path}[0]",
- output_file=out_path,
+ output_file=str(out_path),
logging_group=logging_group,
)
except ParseError as e:
--- /dev/null
+import zoneinfo
+
+import pytest
+from pytest_django.fixtures import SettingsWrapper
+
+
+@pytest.fixture()
+def settings_timezone(settings: SettingsWrapper) -> zoneinfo.ZoneInfo:
+    """Return the ZoneInfo built from TIME_ZONE of the ``settings`` fixture."""
+    return zoneinfo.ZoneInfo(settings.TIME_ZONE)
import datetime
+from zoneinfo import ZoneInfo
-from dateutil import tz
-from django.conf import settings
-from django.test import TestCase
-from django.test import override_settings
+from pytest_django.fixtures import SettingsWrapper
from documents.parsers import parse_date
from documents.parsers import parse_date_generator
-class TestDate(TestCase):
+class TestDate:
def test_date_format_1(self):
text = "lorem ipsum 130218 lorem ipsum"
- self.assertEqual(parse_date("", text), None)
+ assert parse_date("", text) is None
def test_date_format_2(self):
text = "lorem ipsum 2018 lorem ipsum"
- self.assertEqual(parse_date("", text), None)
+ assert parse_date("", text) is None
def test_date_format_3(self):
text = "lorem ipsum 20180213 lorem ipsum"
- self.assertEqual(parse_date("", text), None)
+ assert parse_date("", text) is None
- def test_date_format_4(self):
+ def test_date_format_4(self, settings_timezone: ZoneInfo):
text = "lorem ipsum 13.02.2018 lorem ipsum"
date = parse_date("", text)
- self.assertEqual(
- date,
- datetime.datetime(2018, 2, 13, 0, 0, tzinfo=tz.gettz(settings.TIME_ZONE)),
- )
+ assert date == datetime.datetime(2018, 2, 13, 0, 0, tzinfo=settings_timezone)
- def test_date_format_5(self):
+ def test_date_format_5(self, settings_timezone: ZoneInfo):
text = "lorem ipsum 130218, 2018, 20180213 and lorem 13.02.2018 lorem ipsum"
date = parse_date("", text)
- self.assertEqual(
- date,
- datetime.datetime(2018, 2, 13, 0, 0, tzinfo=tz.gettz(settings.TIME_ZONE)),
- )
+ assert date == datetime.datetime(2018, 2, 13, 0, 0, tzinfo=settings_timezone)
def test_date_format_6(self):
text = (
"BIC\n"
"lorem ipsum"
)
- self.assertEqual(parse_date("", text), None)
+ assert parse_date("", text) is None
- def test_date_format_7(self):
+ def test_date_format_7(self, settings_timezone: ZoneInfo):
text = "lorem ipsum\nMärz 2019\nlorem ipsum"
date = parse_date("", text)
- self.assertEqual(
- date,
- datetime.datetime(2019, 3, 1, 0, 0, tzinfo=tz.gettz(settings.TIME_ZONE)),
- )
+ assert date == datetime.datetime(2019, 3, 1, 0, 0, tzinfo=settings_timezone)
- def test_date_format_8(self):
+ def test_date_format_8(self, settings_timezone: ZoneInfo):
text = (
"lorem ipsum\n"
"Wohnort\n"
"lorem ipsum\n"
"März 2020"
)
- self.assertEqual(
- parse_date("", text),
- datetime.datetime(2020, 3, 1, 0, 0, tzinfo=tz.gettz(settings.TIME_ZONE)),
+ assert parse_date("", text) == datetime.datetime(
+ 2020,
+ 3,
+ 1,
+ 0,
+ 0,
+ tzinfo=settings_timezone,
)
- def test_date_format_9(self):
+ def test_date_format_9(self, settings_timezone: ZoneInfo):
text = "lorem ipsum\n27. Nullmonth 2020\nMärz 2020\nlorem ipsum"
- self.assertEqual(
- parse_date("", text),
- datetime.datetime(2020, 3, 1, 0, 0, tzinfo=tz.gettz(settings.TIME_ZONE)),
+ assert parse_date("", text) == datetime.datetime(
+ 2020,
+ 3,
+ 1,
+ 0,
+ 0,
+ tzinfo=settings_timezone,
)
- def test_date_format_10(self):
+ def test_date_format_10(self, settings_timezone: ZoneInfo):
text = "Customer Number Currency 22-MAR-2022 Credit Card 1934829304"
- self.assertEqual(
- parse_date("", text),
- datetime.datetime(2022, 3, 22, 0, 0, tzinfo=tz.gettz(settings.TIME_ZONE)),
+ assert parse_date("", text) == datetime.datetime(
+ 2022,
+ 3,
+ 22,
+ 0,
+ 0,
+ tzinfo=settings_timezone,
)
- def test_date_format_11(self):
+ def test_date_format_11(self, settings_timezone: ZoneInfo):
text = "Customer Number Currency 22 MAR 2022 Credit Card 1934829304"
- self.assertEqual(
- parse_date("", text),
- datetime.datetime(2022, 3, 22, 0, 0, tzinfo=tz.gettz(settings.TIME_ZONE)),
+ assert parse_date("", text) == datetime.datetime(
+ 2022,
+ 3,
+ 22,
+ 0,
+ 0,
+ tzinfo=settings_timezone,
)
- def test_date_format_12(self):
+ def test_date_format_12(self, settings_timezone: ZoneInfo):
text = "Customer Number Currency 22/MAR/2022 Credit Card 1934829304"
- self.assertEqual(
- parse_date("", text),
- datetime.datetime(2022, 3, 22, 0, 0, tzinfo=tz.gettz(settings.TIME_ZONE)),
+ assert parse_date("", text) == datetime.datetime(
+ 2022,
+ 3,
+ 22,
+ 0,
+ 0,
+ tzinfo=settings_timezone,
)
- def test_date_format_13(self):
+ def test_date_format_13(self, settings_timezone: ZoneInfo):
text = "Customer Number Currency 22.MAR.2022 Credit Card 1934829304"
- self.assertEqual(
- parse_date("", text),
- datetime.datetime(2022, 3, 22, 0, 0, tzinfo=tz.gettz(settings.TIME_ZONE)),
+ assert parse_date("", text) == datetime.datetime(
+ 2022,
+ 3,
+ 22,
+ 0,
+ 0,
+ tzinfo=settings_timezone,
)
- def test_date_format_14(self):
+ def test_date_format_14(self, settings_timezone: ZoneInfo):
text = "Customer Number Currency 22.MAR 2022 Credit Card 1934829304"
- self.assertEqual(
- parse_date("", text),
- datetime.datetime(2022, 3, 22, 0, 0, tzinfo=tz.gettz(settings.TIME_ZONE)),
+ assert parse_date("", text) == datetime.datetime(
+ 2022,
+ 3,
+ 22,
+ 0,
+ 0,
+ tzinfo=settings_timezone,
)
def test_date_format_15(self):
text = "Customer Number Currency 22.MAR.22 Credit Card 1934829304"
- self.assertIsNone(parse_date("", text), None)
+ assert parse_date("", text) is None
def test_date_format_16(self):
text = "Customer Number Currency 22.MAR,22 Credit Card 1934829304"
- self.assertIsNone(parse_date("", text), None)
+ assert parse_date("", text) is None
def test_date_format_17(self):
text = "Customer Number Currency 22,MAR,2022 Credit Card 1934829304"
- self.assertIsNone(parse_date("", text), None)
+ assert parse_date("", text) is None
def test_date_format_18(self):
text = "Customer Number Currency 22 MAR,2022 Credit Card 1934829304"
- self.assertIsNone(parse_date("", text), None)
+ assert parse_date("", text) is None
- def test_date_format_19(self):
+ def test_date_format_19(self, settings_timezone: ZoneInfo):
text = "Customer Number Currency 21st MAR 2022 Credit Card 1934829304"
- self.assertEqual(
- parse_date("", text),
- datetime.datetime(2022, 3, 21, 0, 0, tzinfo=tz.gettz(settings.TIME_ZONE)),
+ assert parse_date("", text) == datetime.datetime(
+ 2022,
+ 3,
+ 21,
+ 0,
+ 0,
+ tzinfo=settings_timezone,
)
- def test_date_format_20(self):
+ def test_date_format_20(self, settings_timezone: ZoneInfo):
text = "Customer Number Currency 22nd March 2022 Credit Card 1934829304"
- self.assertEqual(
- parse_date("", text),
- datetime.datetime(2022, 3, 22, 0, 0, tzinfo=tz.gettz(settings.TIME_ZONE)),
+ assert parse_date("", text) == datetime.datetime(
+ 2022,
+ 3,
+ 22,
+ 0,
+ 0,
+ tzinfo=settings_timezone,
)
- def test_date_format_21(self):
+ def test_date_format_21(self, settings_timezone: ZoneInfo):
text = "Customer Number Currency 2nd MAR 2022 Credit Card 1934829304"
- self.assertEqual(
- parse_date("", text),
- datetime.datetime(2022, 3, 2, 0, 0, tzinfo=tz.gettz(settings.TIME_ZONE)),
+ assert parse_date("", text) == datetime.datetime(
+ 2022,
+ 3,
+ 2,
+ 0,
+ 0,
+ tzinfo=settings_timezone,
)
- def test_date_format_22(self):
+ def test_date_format_22(self, settings_timezone: ZoneInfo):
text = "Customer Number Currency 23rd MAR 2022 Credit Card 1934829304"
- self.assertEqual(
- parse_date("", text),
- datetime.datetime(2022, 3, 23, 0, 0, tzinfo=tz.gettz(settings.TIME_ZONE)),
+ assert parse_date("", text) == datetime.datetime(
+ 2022,
+ 3,
+ 23,
+ 0,
+ 0,
+ tzinfo=settings_timezone,
)
- def test_date_format_23(self):
+ def test_date_format_23(self, settings_timezone: ZoneInfo):
text = "Customer Number Currency 24th MAR 2022 Credit Card 1934829304"
- self.assertEqual(
- parse_date("", text),
- datetime.datetime(2022, 3, 24, 0, 0, tzinfo=tz.gettz(settings.TIME_ZONE)),
+ assert parse_date("", text) == datetime.datetime(
+ 2022,
+ 3,
+ 24,
+ 0,
+ 0,
+ tzinfo=settings_timezone,
)
- def test_date_format_24(self):
+ def test_date_format_24(self, settings_timezone: ZoneInfo):
text = "Customer Number Currency 21-MAR-2022 Credit Card 1934829304"
- self.assertEqual(
- parse_date("", text),
- datetime.datetime(2022, 3, 21, 0, 0, tzinfo=tz.gettz(settings.TIME_ZONE)),
+ assert parse_date("", text) == datetime.datetime(
+ 2022,
+ 3,
+ 21,
+ 0,
+ 0,
+ tzinfo=settings_timezone,
)
- def test_date_format_25(self):
+ def test_date_format_25(self, settings_timezone: ZoneInfo):
text = "Customer Number Currency 25TH MAR 2022 Credit Card 1934829304"
- self.assertEqual(
- parse_date("", text),
- datetime.datetime(2022, 3, 25, 0, 0, tzinfo=tz.gettz(settings.TIME_ZONE)),
+ assert parse_date("", text) == datetime.datetime(
+ 2022,
+ 3,
+ 25,
+ 0,
+ 0,
+ tzinfo=settings_timezone,
)
- def test_date_format_26(self):
+ def test_date_format_26(self, settings_timezone: ZoneInfo):
text = "CHASE 0 September 25, 2019 JPMorgan Chase Bank, NA. P0 Box 182051"
- self.assertEqual(
- parse_date("", text),
- datetime.datetime(2019, 9, 25, 0, 0, tzinfo=tz.gettz(settings.TIME_ZONE)),
+ assert parse_date("", text) == datetime.datetime(
+ 2019,
+ 9,
+ 25,
+ 0,
+ 0,
+ tzinfo=settings_timezone,
)
def test_crazy_date_past(self):
- self.assertIsNone(parse_date("", "01-07-0590 00:00:00"))
+ assert parse_date("", "01-07-0590 00:00:00") is None
def test_crazy_date_future(self):
- self.assertIsNone(parse_date("", "01-07-2350 00:00:00"))
+ assert parse_date("", "01-07-2350 00:00:00") is None
def test_crazy_date_with_spaces(self):
- self.assertIsNone(parse_date("", "20 408000l 2475"))
-
- def test_utf_month_names(self):
- self.assertEqual(
- parse_date("", "13 décembre 2023"),
- datetime.datetime(2023, 12, 13, 0, 0, tzinfo=tz.gettz(settings.TIME_ZONE)),
- )
- self.assertEqual(
- parse_date("", "13 août 2022"),
- datetime.datetime(2022, 8, 13, 0, 0, tzinfo=tz.gettz(settings.TIME_ZONE)),
- )
- self.assertEqual(
- parse_date("", "11 März 2020"),
- datetime.datetime(2020, 3, 11, 0, 0, tzinfo=tz.gettz(settings.TIME_ZONE)),
- )
- self.assertEqual(
- parse_date("", "17. ožujka 2018."),
- datetime.datetime(2018, 3, 17, 0, 0, tzinfo=tz.gettz(settings.TIME_ZONE)),
- )
- self.assertEqual(
- parse_date("", "1. veljače 2016."),
- datetime.datetime(2016, 2, 1, 0, 0, tzinfo=tz.gettz(settings.TIME_ZONE)),
- )
- self.assertEqual(
- parse_date("", "15. února 1985"),
- datetime.datetime(1985, 2, 15, 0, 0, tzinfo=tz.gettz(settings.TIME_ZONE)),
- )
- self.assertEqual(
- parse_date("", "30. září 2011"),
- datetime.datetime(2011, 9, 30, 0, 0, tzinfo=tz.gettz(settings.TIME_ZONE)),
- )
- self.assertEqual(
- parse_date("", "28. května 1990"),
- datetime.datetime(1990, 5, 28, 0, 0, tzinfo=tz.gettz(settings.TIME_ZONE)),
- )
- self.assertEqual(
- parse_date("", "1. grudzień 1997"),
- datetime.datetime(1997, 12, 1, 0, 0, tzinfo=tz.gettz(settings.TIME_ZONE)),
- )
- self.assertEqual(
- parse_date("", "17 Şubat 2024"),
- datetime.datetime(2024, 2, 17, 0, 0, tzinfo=tz.gettz(settings.TIME_ZONE)),
- )
- self.assertEqual(
- parse_date("", "30 Ağustos 2012"),
- datetime.datetime(2012, 8, 30, 0, 0, tzinfo=tz.gettz(settings.TIME_ZONE)),
- )
- self.assertEqual(
- parse_date("", "17 Eylül 2000"),
- datetime.datetime(2000, 9, 17, 0, 0, tzinfo=tz.gettz(settings.TIME_ZONE)),
- )
- self.assertEqual(
- parse_date("", "5. október 1992"),
- datetime.datetime(1992, 10, 5, 0, 0, tzinfo=tz.gettz(settings.TIME_ZONE)),
- )
-
- def test_multiple_dates(self):
+ assert parse_date("", "20 408000l 2475") is None
+
+ def test_utf_month_names(self, settings_timezone: ZoneInfo):
+ assert parse_date("", "13 décembre 2023") == datetime.datetime(
+ 2023,
+ 12,
+ 13,
+ 0,
+ 0,
+ tzinfo=settings_timezone,
+ )
+ assert parse_date("", "13 août 2022") == datetime.datetime(
+ 2022,
+ 8,
+ 13,
+ 0,
+ 0,
+ tzinfo=settings_timezone,
+ )
+ assert parse_date("", "11 März 2020") == datetime.datetime(
+ 2020,
+ 3,
+ 11,
+ 0,
+ 0,
+ tzinfo=settings_timezone,
+ )
+ assert parse_date("", "17. ožujka 2018.") == datetime.datetime(
+ 2018,
+ 3,
+ 17,
+ 0,
+ 0,
+ tzinfo=settings_timezone,
+ )
+ assert parse_date("", "1. veljače 2016.") == datetime.datetime(
+ 2016,
+ 2,
+ 1,
+ 0,
+ 0,
+ tzinfo=settings_timezone,
+ )
+ assert parse_date("", "15. února 1985") == datetime.datetime(
+ 1985,
+ 2,
+ 15,
+ 0,
+ 0,
+ tzinfo=settings_timezone,
+ )
+ assert parse_date("", "30. září 2011") == datetime.datetime(
+ 2011,
+ 9,
+ 30,
+ 0,
+ 0,
+ tzinfo=settings_timezone,
+ )
+ assert parse_date("", "28. května 1990") == datetime.datetime(
+ 1990,
+ 5,
+ 28,
+ 0,
+ 0,
+ tzinfo=settings_timezone,
+ )
+ assert parse_date("", "1. grudzień 1997") == datetime.datetime(
+ 1997,
+ 12,
+ 1,
+ 0,
+ 0,
+ tzinfo=settings_timezone,
+ )
+ assert parse_date("", "17 Şubat 2024") == datetime.datetime(
+ 2024,
+ 2,
+ 17,
+ 0,
+ 0,
+ tzinfo=settings_timezone,
+ )
+ assert parse_date("", "30 Ağustos 2012") == datetime.datetime(
+ 2012,
+ 8,
+ 30,
+ 0,
+ 0,
+ tzinfo=settings_timezone,
+ )
+ assert parse_date("", "17 Eylül 2000") == datetime.datetime(
+ 2000,
+ 9,
+ 17,
+ 0,
+ 0,
+ tzinfo=settings_timezone,
+ )
+ assert parse_date("", "5. október 1992") == datetime.datetime(
+ 1992,
+ 10,
+ 5,
+ 0,
+ 0,
+ tzinfo=settings_timezone,
+ )
+
+ def test_multiple_dates(self, settings_timezone: ZoneInfo):
text = """This text has multiple dates.
For example 02.02.2018, 22 July 2022 and December 2021.
But not 24-12-9999 because it's in the future..."""
dates = list(parse_date_generator("", text))
- self.assertEqual(len(dates), 3)
- self.assertEqual(
- dates[0],
- datetime.datetime(2018, 2, 2, 0, 0, tzinfo=tz.gettz(settings.TIME_ZONE)),
- )
- self.assertEqual(
- dates[1],
- datetime.datetime(2022, 7, 22, 0, 0, tzinfo=tz.gettz(settings.TIME_ZONE)),
- )
- self.assertEqual(
- dates[2],
- datetime.datetime(2021, 12, 1, 0, 0, tzinfo=tz.gettz(settings.TIME_ZONE)),
- )
- @override_settings(FILENAME_DATE_ORDER="YMD")
- def test_filename_date_parse_valid_ymd(self, *args):
+ assert dates == [
+ datetime.datetime(2018, 2, 2, 0, 0, tzinfo=settings_timezone),
+ datetime.datetime(
+ 2022,
+ 7,
+ 22,
+ 0,
+ 0,
+ tzinfo=settings_timezone,
+ ),
+ datetime.datetime(
+ 2021,
+ 12,
+ 1,
+ 0,
+ 0,
+ tzinfo=settings_timezone,
+ ),
+ ]
+
+ def test_filename_date_parse_valid_ymd(
+ self,
+ settings: SettingsWrapper,
+ settings_timezone: ZoneInfo,
+ ):
"""
GIVEN:
- Date parsing from the filename is enabled
THEN:
- Should parse the date from the filename
"""
- self.assertEqual(
- parse_date("/tmp/Scan-2022-04-01.pdf", "No date in here"),
- datetime.datetime(2022, 4, 1, 0, 0, tzinfo=tz.gettz(settings.TIME_ZONE)),
- )
-
- @override_settings(FILENAME_DATE_ORDER="DMY")
- def test_filename_date_parse_valid_dmy(self, *args):
+ settings.FILENAME_DATE_ORDER = "YMD"
+
+ assert parse_date(
+ "/tmp/Scan-2022-04-01.pdf",
+ "No date in here",
+ ) == datetime.datetime(2022, 4, 1, 0, 0, tzinfo=settings_timezone)
+
+ def test_filename_date_parse_valid_dmy(
+ self,
+ settings: SettingsWrapper,
+ settings_timezone: ZoneInfo,
+ ):
"""
GIVEN:
- Date parsing from the filename is enabled
THEN:
- Should parse the date from the filename
"""
- self.assertEqual(
- parse_date("/tmp/Scan-10.01.2021.pdf", "No date in here"),
- datetime.datetime(2021, 1, 10, 0, 0, tzinfo=tz.gettz(settings.TIME_ZONE)),
- )
+ settings.FILENAME_DATE_ORDER = "DMY"
+ assert parse_date(
+ "/tmp/Scan-10.01.2021.pdf",
+ "No date in here",
+ ) == datetime.datetime(2021, 1, 10, 0, 0, tzinfo=settings_timezone)
- @override_settings(FILENAME_DATE_ORDER="YMD")
- def test_filename_date_parse_invalid(self, *args):
+ def test_filename_date_parse_invalid(self, settings: SettingsWrapper):
"""
GIVEN:
- Date parsing from the filename is enabled
THEN:
- No date is parsed
"""
- self.assertIsNone(
- parse_date("/tmp/20 408000l 2475 - test.pdf", "No date in here"),
- )
-
- @override_settings(
- FILENAME_DATE_ORDER="YMD",
- IGNORE_DATES=(datetime.date(2022, 4, 1),),
- )
- def test_filename_date_ignored_use_content(self, *args):
+ settings.FILENAME_DATE_ORDER = "YMD"
+ assert parse_date("/tmp/20 408000l 2475 - test.pdf", "No date in here") is None
+
+ def test_filename_date_ignored_use_content(
+ self,
+ settings: SettingsWrapper,
+ settings_timezone: ZoneInfo,
+ ):
"""
GIVEN:
- Date parsing from the filename is enabled
THEN:
- Should parse the date from the content not filename
"""
- self.assertEqual(
- parse_date("/tmp/Scan-2022-04-01.pdf", "The matching date is 24.03.2022"),
- datetime.datetime(2022, 3, 24, 0, 0, tzinfo=tz.gettz(settings.TIME_ZONE)),
- )
-
- @override_settings(
- IGNORE_DATES=(datetime.date(2019, 11, 3), datetime.date(2020, 1, 17)),
- )
- def test_ignored_dates_default_order(self, *args):
+ settings.FILENAME_DATE_ORDER = "YMD"
+ settings.IGNORE_DATES = (datetime.date(2022, 4, 1),)
+ assert parse_date(
+ "/tmp/Scan-2022-04-01.pdf",
+ "The matching date is 24.03.2022",
+ ) == datetime.datetime(2022, 3, 24, 0, 0, tzinfo=settings_timezone)
+
+ def test_ignored_dates_default_order(
+ self,
+ settings: SettingsWrapper,
+ settings_timezone: ZoneInfo,
+ ):
"""
GIVEN:
- Ignore dates have been set
THEN:
- Should parse the date non-ignored date from content
"""
+ settings.IGNORE_DATES = (datetime.date(2019, 11, 3), datetime.date(2020, 1, 17))
text = "lorem ipsum 110319, 20200117 and lorem 13.02.2018 lorem ipsum"
- self.assertEqual(
- parse_date("", text),
- datetime.datetime(2018, 2, 13, 0, 0, tzinfo=tz.gettz(settings.TIME_ZONE)),
- )
-
- @override_settings(
- IGNORE_DATES=(datetime.date(2019, 11, 3), datetime.date(2020, 1, 17)),
- DATE_ORDER="YMD",
- )
- def test_ignored_dates_order_ymd(self, *args):
+ assert parse_date("", text) == datetime.datetime(
+ 2018,
+ 2,
+ 13,
+ 0,
+ 0,
+ tzinfo=settings_timezone,
+ )
+
+ def test_ignored_dates_order_ymd(
+ self,
+ settings: SettingsWrapper,
+ settings_timezone: ZoneInfo,
+ ):
"""
GIVEN:
- Ignore dates have been set
THEN:
- Should parse the date non-ignored date from content
"""
+
+ # Content date order, as in the override_settings(DATE_ORDER="YMD") this replaces;
+ # no filename is passed below, so FILENAME_DATE_ORDER would not be exercised.
+ settings.DATE_ORDER = "YMD"
+ settings.IGNORE_DATES = (datetime.date(2019, 11, 3), datetime.date(2020, 1, 17))
+
text = "lorem ipsum 190311, 20200117 and lorem 13.02.2018 lorem ipsum"
- self.assertEqual(
- parse_date("", text),
- datetime.datetime(2018, 2, 13, 0, 0, tzinfo=tz.gettz(settings.TIME_ZONE)),
+ assert parse_date("", text) == datetime.datetime(
+ 2018,
+ 2,
+ 13,
+ 0,
+ 0,
+ tzinfo=settings_timezone,
)
return PdfAFormat.A3b
return None
- def get_thumbnail(self, document_path: Path, mime_type: str, file_name=None):
+ def get_thumbnail(
+ self,
+ document_path: Path,
+ mime_type: str,
+ file_name=None,
+ ) -> Path:
if not self.archive_path:
self.archive_path = self.generate_pdf(
self.parse_file_to_message(document_path),
--- /dev/null
+import os
+from collections.abc import Generator
+from pathlib import Path
+
+import pytest
+
+from paperless_mail.mail import MailAccountHandler
+from paperless_mail.models import MailAccount
+from paperless_mail.parsers import MailDocumentParser
+
+
+@pytest.fixture(scope="session")
+def sample_dir() -> Path:
+    """Return the resolved path of the ``samples`` directory next to this file."""
+    return (Path(__file__).parent / Path("samples")).resolve()
+
+
+@pytest.fixture(scope="session")
+def broken_email_file(sample_dir: Path) -> Path:
+    """Return the path of ``broken.eml`` inside the samples directory."""
+    return sample_dir / "broken.eml"
+
+
+@pytest.fixture(scope="session")
+def simple_txt_email_file(sample_dir: Path) -> Path:
+    """Return the path of ``simple_text.eml`` inside the samples directory."""
+    return sample_dir / "simple_text.eml"
+
+
+@pytest.fixture(scope="session")
+def simple_txt_email_pdf_file(sample_dir: Path) -> Path:
+    """Return the path of ``simple_text.eml.pdf`` inside the samples directory."""
+    return sample_dir / "simple_text.eml.pdf"
+
+
+@pytest.fixture(scope="session")
+def simple_txt_email_thumbnail_file(sample_dir: Path) -> Path:
+    """Return the path of ``simple_text.eml.pdf.webp`` inside the samples directory."""
+    return sample_dir / "simple_text.eml.pdf.webp"
+
+
+@pytest.fixture(scope="session")
+def html_email_file(sample_dir: Path) -> Path:
+    """Return the path of ``html.eml`` inside the samples directory."""
+    return sample_dir / "html.eml"
+
+
+@pytest.fixture(scope="session")
+def html_email_pdf_file(sample_dir: Path) -> Path:
+    """Return the path of ``html.eml.pdf`` inside the samples directory."""
+    return sample_dir / "html.eml.pdf"
+
+
+@pytest.fixture(scope="session")
+def html_email_thumbnail_file(sample_dir: Path) -> Path:
+    """Return the path of ``html.eml.pdf.webp`` inside the samples directory."""
+    return sample_dir / "html.eml.pdf.webp"
+
+
+@pytest.fixture(scope="session")
+def html_email_html_file(sample_dir: Path) -> Path:
+    """Return the path of ``html.eml.html`` inside the samples directory."""
+    return sample_dir / "html.eml.html"
+
+
+@pytest.fixture(scope="session")
+def merged_pdf_first(sample_dir: Path) -> Path:
+    """Return the path of ``first.pdf`` inside the samples directory."""
+    return sample_dir / "first.pdf"
+
+
+@pytest.fixture(scope="session")
+def merged_pdf_second(sample_dir: Path) -> Path:
+    """Return the path of ``second.pdf`` inside the samples directory."""
+    return sample_dir / "second.pdf"
+
+
+@pytest.fixture()
+def mail_parser() -> Generator[MailDocumentParser, None, None]:
+    """
+    Yield a fresh MailDocumentParser (no logging group) for a single test and
+    call its ``cleanup()`` once the test has finished, whether it passed or not.
+    """
+    parser = MailDocumentParser(logging_group=None)
+    try:
+        yield parser
+    finally:
+        # Mirrors the unittest tearDown() this fixture replaces, which also
+        # called cleanup() on the parser after every test.
+        parser.cleanup()
+
+
+@pytest.fixture()
+def live_mail_account() -> Generator[MailAccount, None, None]:
+    """
+    Create a MailAccount from the PAPERLESS_MAIL_TEST_HOST / _USER / _PASSWD
+    environment variables (IMAP port 993), yield it to the test and delete it
+    afterwards.
+
+    Raises KeyError if one of the environment variables is not set.
+    """
+    # Create the account before entering the try block: if creation fails
+    # there is nothing to delete, and touching an unbound ``account`` in
+    # ``finally`` would hide the real error behind an UnboundLocalError.
+    account = MailAccount.objects.create(
+        name="test",
+        imap_server=os.environ["PAPERLESS_MAIL_TEST_HOST"],
+        username=os.environ["PAPERLESS_MAIL_TEST_USER"],
+        password=os.environ["PAPERLESS_MAIL_TEST_PASSWD"],
+        imap_port=993,
+    )
+    try:
+        yield account
+    finally:
+        account.delete()
+
+
+@pytest.fixture()
+def mail_account_handler() -> MailAccountHandler:
+    """Return a new MailAccountHandler instance for each test."""
+    return MailAccountHandler()
import os
+import warnings
import pytest
-from django.test import TestCase
from paperless_mail.mail import MailAccountHandler
from paperless_mail.mail import MailError
or not len(os.environ["PAPERLESS_MAIL_TEST_HOST"]),
reason="Live server testing not enabled",
)
-class TestMailLiveServer(TestCase):
- def setUp(self) -> None:
- self.mail_account_handler = MailAccountHandler()
- self.account = MailAccount.objects.create(
- name="test",
- imap_server=os.environ["PAPERLESS_MAIL_TEST_HOST"],
- username=os.environ["PAPERLESS_MAIL_TEST_USER"],
- password=os.environ["PAPERLESS_MAIL_TEST_PASSWD"],
- imap_port=993,
- )
-
- return super().setUp()
-
- def tearDown(self) -> None:
- self.account.delete()
- return super().tearDown()
-
- def test_process_non_gmail_server_flag(self):
+@pytest.mark.django_db()
+class TestMailLiveServer:
+ def test_process_non_gmail_server_flag(
+ self,
+ mail_account_handler: MailAccountHandler,
+ live_mail_account: MailAccount,
+ ):
try:
rule1 = MailRule.objects.create(
name="testrule",
- account=self.account,
+ account=live_mail_account,
action=MailRule.MailAction.FLAG,
)
- self.mail_account_handler.handle_mail_account(self.account)
+ mail_account_handler.handle_mail_account(live_mail_account)
rule1.delete()
except MailError as e:
- self.fail(f"Failure: {e}")
- except Exception:
- pass
-
- def test_process_non_gmail_server_tag(self):
+ pytest.fail(f"Failure: {e}")
+ except Exception as e:
+ warnings.warn(f"Unhandled exception: {e}")
+
+ def test_process_non_gmail_server_tag(
+ self,
+ mail_account_handler: MailAccountHandler,
+ live_mail_account: MailAccount,
+ ):
try:
rule2 = MailRule.objects.create(
name="testrule",
- account=self.account,
+ account=live_mail_account,
action=MailRule.MailAction.TAG,
)
- self.mail_account_handler.handle_mail_account(self.account)
+ mail_account_handler.handle_mail_account(live_mail_account)
rule2.delete()
except MailError as e:
- self.fail(f"Failure: {e}")
- except Exception:
- pass
+ pytest.fail(f"Failure: {e}")
+ except Exception as e:
+ warnings.warn(f"Unhandled exception: {e}")
import datetime
+import logging
from pathlib import Path
-from unittest import mock
import httpx
-from django.test import TestCase
+import pytest
+from django.test.html import parse_html
+from pytest_django.fixtures import SettingsWrapper
+from pytest_httpx import HTTPXMock
+from pytest_mock import MockerFixture
from documents.parsers import ParseError
-from documents.tests.utils import FileSystemAssertsMixin
from paperless_mail.parsers import MailDocumentParser
-from paperless_tika.tests.utils import HttpxMockMixin
-class BaseMailParserTestCase(TestCase):
- """
- Basic setup for the below test cases
- """
-
- SAMPLE_DIR = Path(__file__).parent / "samples"
-
- def setUp(self) -> None:
- super().setUp()
- self.parser = MailDocumentParser(logging_group=None)
-
- def tearDown(self) -> None:
- super().tearDown()
- self.parser.cleanup()
-
-
-class TestEmailFileParsing(FileSystemAssertsMixin, BaseMailParserTestCase):
+class TestEmailFileParsing:
"""
Tests around reading a file and parsing it into a
MailMessage
"""
- def test_parse_error_missing_file(self):
+ def test_parse_error_missing_file(
+ self,
+ mail_parser: MailDocumentParser,
+ sample_dir: Path,
+ ):
"""
GIVEN:
- Fresh parser
- An Exception is thrown
"""
# Check if exception is raised when parsing fails.
- test_file = self.SAMPLE_DIR / "doesntexist.eml"
-
- self.assertIsNotFile(test_file)
- self.assertRaises(
- ParseError,
- self.parser.parse,
- test_file,
- "messages/rfc822",
- )
+ test_file = sample_dir / "doesntexist.eml"
+
+ assert not test_file.exists()
- def test_parse_error_invalid_email(self):
+ with pytest.raises(ParseError):
+ mail_parser.parse(test_file, "messages/rfc822")
+
+ def test_parse_error_invalid_email(
+ self,
+ mail_parser: MailDocumentParser,
+ broken_email_file: Path,
+ ):
"""
GIVEN:
- Fresh parser
- An Exception is thrown
"""
# Check if exception is raised when the mail is faulty.
- self.assertRaises(
- ParseError,
- self.parser.parse,
- self.SAMPLE_DIR / "broken.eml",
- "messages/rfc822",
- )
- def test_parse_simple_text_email_file(self):
+ with pytest.raises(ParseError):
+ mail_parser.parse(broken_email_file, "messages/rfc822")
+
+ def test_parse_simple_text_email_file(
+ self,
+ mail_parser: MailDocumentParser,
+ simple_txt_email_file: Path,
+ ):
"""
GIVEN:
- Fresh parser
- The content of the mail should be available in the parse result.
"""
# Parse Test file and check relevant content
- parsed1 = self.parser.parse_file_to_message(
- self.SAMPLE_DIR / "simple_text.eml",
- )
-
- self.assertEqual(parsed1.date.year, 2022)
- self.assertEqual(parsed1.date.month, 10)
- self.assertEqual(parsed1.date.day, 12)
- self.assertEqual(parsed1.date.hour, 21)
- self.assertEqual(parsed1.date.minute, 40)
- self.assertEqual(parsed1.date.second, 43)
- self.assertEqual(parsed1.date.tzname(), "UTC+02:00")
- self.assertEqual(parsed1.from_, "mail@someserver.de")
- self.assertEqual(parsed1.subject, "Simple Text Mail")
- self.assertEqual(parsed1.text, "This is just a simple Text Mail.\n")
- self.assertEqual(parsed1.to, ("some@one.de",))
-
-
-class TestEmailMetadataExtraction(BaseMailParserTestCase):
+ parsed_msg = mail_parser.parse_file_to_message(simple_txt_email_file)
+
+ assert parsed_msg.date.year == 2022
+ assert parsed_msg.date.month == 10
+ assert parsed_msg.date.day == 12
+ assert parsed_msg.date.hour == 21
+ assert parsed_msg.date.minute == 40
+ assert parsed_msg.date.second == 43
+ assert parsed_msg.date.tzname() == "UTC+02:00"
+ assert parsed_msg.from_ == "mail@someserver.de"
+ assert parsed_msg.subject == "Simple Text Mail"
+ assert parsed_msg.text == "This is just a simple Text Mail.\n"
+ assert parsed_msg.to == ("some@one.de",)
+
+
+class TestEmailMetadataExtraction:
"""
Tests extraction of metadata from an email
"""
- def test_extract_metadata_fail(self):
+ def test_extract_metadata_fail(
+ self,
+ caplog: pytest.LogCaptureFixture,
+ mail_parser: MailDocumentParser,
+ ):
"""
GIVEN:
- Fresh start
- A log warning should be generated
"""
# Validate if warning is logged when parsing fails
- with self.assertLogs("paperless.parsing.mail", level="WARNING") as cm:
- self.assertEqual([], self.parser.extract_metadata("na", "message/rfc822"))
- self.assertIn(
- "WARNING:paperless.parsing.mail:Error while fetching document metadata for na",
- cm.output[0],
- )
+ assert mail_parser.extract_metadata("na", "message/rfc822") == []
+
+ assert len(caplog.records) == 1
+ record = caplog.records[0]
+
+ assert record.levelno == logging.WARNING
+ assert record.name == "paperless.parsing.mail"
+ assert "Error while fetching document metadata for na" in record.message
- def test_extract_metadata(self):
+ def test_extract_metadata(
+ self,
+ mail_parser: MailDocumentParser,
+ simple_txt_email_file: Path,
+ ):
"""
GIVEN:
- Fresh start
- metadata is returned
"""
# Validate Metadata parsing returns the expected results
- metadata = self.parser.extract_metadata(
- self.SAMPLE_DIR / "simple_text.eml",
- "message/rfc822",
- )
-
- self.assertIn(
- {"namespace": "", "prefix": "", "key": "attachments", "value": ""},
- metadata,
- )
- self.assertIn(
- {
- "namespace": "",
- "prefix": "",
- "key": "date",
- "value": "2022-10-12 21:40:43 UTC+02:00",
- },
- metadata,
- )
- self.assertIn(
- {
- "namespace": "",
- "prefix": "header",
- "key": "content-language",
- "value": "en-US",
- },
- metadata,
- )
- self.assertIn(
- {
- "namespace": "",
- "prefix": "header",
- "key": "content-type",
- "value": "text/plain; charset=UTF-8; format=flowed",
- },
- metadata,
- )
- self.assertIn(
- {
- "namespace": "",
- "prefix": "header",
- "key": "date",
- "value": "Wed, 12 Oct 2022 21:40:43 +0200",
- },
- metadata,
- )
- self.assertIn(
- {
- "namespace": "",
- "prefix": "header",
- "key": "delivered-to",
- "value": "mail@someserver.de",
- },
- metadata,
- )
- self.assertIn(
- {
- "namespace": "",
- "prefix": "header",
- "key": "from",
- "value": "Some One <mail@someserver.de>",
- },
- metadata,
- )
- self.assertIn(
- {
- "namespace": "",
- "prefix": "header",
- "key": "message-id",
- "value": "<6e99e34d-e20a-80c4-ea61-d8234b612be9@someserver.de>",
- },
- metadata,
- )
- self.assertIn(
- {
- "namespace": "",
- "prefix": "header",
- "key": "mime-version",
- "value": "1.0",
- },
- metadata,
- )
- self.assertIn(
- {
- "namespace": "",
- "prefix": "header",
- "key": "received",
- "value": "from mail.someserver.org ([::1])\n\tby e1acdba3bd07 with LMTP\n\tid KBKZGD2YR2NTCgQAjubtDA\n\t(envelope-from <mail@someserver.de>)\n\tfor <mail@someserver.de>; Wed, 10 Oct 2022 11:40:46 +0200, from [127.0.0.1] (localhost [127.0.0.1]) by localhost (Mailerdaemon) with ESMTPSA id 2BC9064C1616\n\tfor <some@one.de>; Wed, 12 Oct 2022 21:40:46 +0200 (CEST)",
- },
- metadata,
- )
- self.assertIn(
- {
- "namespace": "",
- "prefix": "header",
- "key": "return-path",
- "value": "<mail@someserver.de>",
- },
- metadata,
- )
- self.assertIn(
- {
- "namespace": "",
- "prefix": "header",
- "key": "subject",
- "value": "Simple Text Mail",
- },
- metadata,
- )
- self.assertIn(
- {"namespace": "", "prefix": "header", "key": "to", "value": "some@one.de"},
- metadata,
- )
- self.assertIn(
- {
- "namespace": "",
- "prefix": "header",
- "key": "user-agent",
- "value": "Mozilla/5.0 (X11; Linux x86_64; rv:102.0) Gecko/20100101\n Thunderbird/102.3.1",
- },
- metadata,
- )
- self.assertIn(
- {
- "namespace": "",
- "prefix": "header",
- "key": "x-last-tls-session-version",
- "value": "TLSv1.3",
- },
- metadata,
- )
-
-
-class TestEmailThumbnailGenerate(BaseMailParserTestCase):
+ metadata = mail_parser.extract_metadata(simple_txt_email_file, "message/rfc822")
+
+ assert {
+ "namespace": "",
+ "prefix": "",
+ "key": "attachments",
+ "value": "",
+ } in metadata
+ assert {
+ "namespace": "",
+ "prefix": "",
+ "key": "date",
+ "value": "2022-10-12 21:40:43 UTC+02:00",
+ } in metadata
+ assert {
+ "namespace": "",
+ "prefix": "header",
+ "key": "content-language",
+ "value": "en-US",
+ } in metadata
+ assert {
+ "namespace": "",
+ "prefix": "header",
+ "key": "content-type",
+ "value": "text/plain; charset=UTF-8; format=flowed",
+ } in metadata
+ assert {
+ "namespace": "",
+ "prefix": "header",
+ "key": "date",
+ "value": "Wed, 12 Oct 2022 21:40:43 +0200",
+ } in metadata
+ assert {
+ "namespace": "",
+ "prefix": "header",
+ "key": "delivered-to",
+ "value": "mail@someserver.de",
+ } in metadata
+ assert {
+ "namespace": "",
+ "prefix": "header",
+ "key": "from",
+ "value": "Some One <mail@someserver.de>",
+ } in metadata
+ assert {
+ "namespace": "",
+ "prefix": "header",
+ "key": "message-id",
+ "value": "<6e99e34d-e20a-80c4-ea61-d8234b612be9@someserver.de>",
+ } in metadata
+ assert {
+ "namespace": "",
+ "prefix": "header",
+ "key": "mime-version",
+ "value": "1.0",
+ } in metadata
+ assert {
+ "namespace": "",
+ "prefix": "header",
+ "key": "received",
+ "value": "from mail.someserver.org ([::1])\n\tby e1acdba3bd07 with LMTP\n\tid KBKZGD2YR2NTCgQAjubtDA\n\t(envelope-from <mail@someserver.de>)\n\tfor <mail@someserver.de>; Wed, 10 Oct 2022 11:40:46 +0200, from [127.0.0.1] (localhost [127.0.0.1]) by localhost (Mailerdaemon) with ESMTPSA id 2BC9064C1616\n\tfor <some@one.de>; Wed, 12 Oct 2022 21:40:46 +0200 (CEST)",
+ } in metadata
+ assert {
+ "namespace": "",
+ "prefix": "header",
+ "key": "return-path",
+ "value": "<mail@someserver.de>",
+ } in metadata
+ assert {
+ "namespace": "",
+ "prefix": "header",
+ "key": "subject",
+ "value": "Simple Text Mail",
+ } in metadata
+ assert {
+ "namespace": "",
+ "prefix": "header",
+ "key": "to",
+ "value": "some@one.de",
+ } in metadata
+ assert {
+ "namespace": "",
+ "prefix": "header",
+ "key": "user-agent",
+ "value": "Mozilla/5.0 (X11; Linux x86_64; rv:102.0) Gecko/20100101\n Thunderbird/102.3.1",
+ } in metadata
+ assert {
+ "namespace": "",
+ "prefix": "header",
+ "key": "x-last-tls-session-version",
+ "value": "TLSv1.3",
+ } in metadata
+
+
+class TestEmailThumbnailGenerate:
"""
Tests the correct generation of an thumbnail for an email
"""
- @mock.patch("paperless_mail.parsers.MailDocumentParser.generate_pdf")
- @mock.patch("paperless_mail.parsers.make_thumbnail_from_pdf")
def test_get_thumbnail(
self,
- mock_make_thumbnail_from_pdf: mock.MagicMock,
- mock_generate_pdf: mock.MagicMock,
+ mocker: MockerFixture,
+ mail_parser: MailDocumentParser,
+ simple_txt_email_file: Path,
):
"""
GIVEN:
- The parser should call the functions which generate the thumbnail
"""
mocked_return = "Passing the return value through.."
+ mock_make_thumbnail_from_pdf = mocker.patch(
+ "paperless_mail.parsers.make_thumbnail_from_pdf",
+ )
mock_make_thumbnail_from_pdf.return_value = mocked_return
+ mock_generate_pdf = mocker.patch(
+ "paperless_mail.parsers.MailDocumentParser.generate_pdf",
+ )
mock_generate_pdf.return_value = "Mocked return value.."
- test_file = self.SAMPLE_DIR / "simple_text.eml"
-
- thumb = self.parser.get_thumbnail(
- test_file,
- "message/rfc822",
- )
+ thumb = mail_parser.get_thumbnail(simple_txt_email_file, "message/rfc822")
mock_generate_pdf.assert_called_once()
mock_make_thumbnail_from_pdf.assert_called_once_with(
"Mocked return value..",
- self.parser.tempdir,
+ mail_parser.tempdir,
None,
)
- self.assertEqual(mocked_return, thumb)
+ assert mocked_return == thumb
-class TestTikaHtmlParse(HttpxMockMixin, BaseMailParserTestCase):
- def test_tika_parse_unsuccessful(self):
+class TestTikaHtmlParse:
+ def test_tika_parse_unsuccessful(
+ self,
+ httpx_mock: HTTPXMock,
+ mail_parser: MailDocumentParser,
+ ):
"""
GIVEN:
- Fresh start
- the parser should return an empty string
"""
# Check unsuccessful parsing
- self.httpx_mock.add_response(
+ httpx_mock.add_response(
json={"Content-Type": "text/html", "X-TIKA:Parsed-By": []},
)
- parsed = self.parser.tika_parse("None")
- self.assertEqual("", parsed)
+ parsed = mail_parser.tika_parse("None")
+ assert parsed == ""
- def test_tika_parse(self):
+ def test_tika_parse(self, httpx_mock: HTTPXMock, mail_parser: MailDocumentParser):
"""
GIVEN:
- Fresh start
html = '<html><head><meta http-equiv="content-type" content="text/html; charset=UTF-8"></head><body><p>Some Text</p></body></html>'
expected_text = "Some Text"
- self.httpx_mock.add_response(
+ httpx_mock.add_response(
json={
"Content-Type": "text/html",
"X-TIKA:Parsed-By": [],
"X-TIKA:content": expected_text,
},
)
- parsed = self.parser.tika_parse(html)
- self.assertEqual(expected_text, parsed.strip())
- self.assertIn("http://localhost:9998", str(self.httpx_mock.get_request().url))
+ parsed = mail_parser.tika_parse(html)
+ assert expected_text == parsed.strip()
+ assert "http://localhost:9998" in str(httpx_mock.get_request().url)
- def test_tika_parse_exception(self):
+ def test_tika_parse_exception(
+ self,
+ httpx_mock: HTTPXMock,
+ mail_parser: MailDocumentParser,
+ ):
"""
GIVEN:
- Fresh start
"""
html = '<html><head><meta http-equiv="content-type" content="text/html; charset=UTF-8"></head><body><p>Some Text</p></body></html>'
- self.httpx_mock.add_response(status_code=httpx.codes.INTERNAL_SERVER_ERROR)
+ httpx_mock.add_response(status_code=httpx.codes.INTERNAL_SERVER_ERROR)
- self.assertRaises(ParseError, self.parser.tika_parse, html)
+ with pytest.raises(ParseError):
+ mail_parser.tika_parse(html)
- def test_tika_parse_unreachable(self):
+ def test_tika_parse_unreachable(
+ self,
+ settings: SettingsWrapper,
+ mail_parser: MailDocumentParser,
+ ):
"""
GIVEN:
- Fresh start
html = '<html><head><meta http-equiv="content-type" content="text/html; charset=UTF-8"></head><body><p>Some Text</p></body></html>'
# Check if exception is raised when Tika cannot be reached.
- self.parser.tika_server = ""
- self.assertRaises(ParseError, self.parser.tika_parse, html)
+ with pytest.raises(ParseError):
+ settings.TIKA_ENDPOINT = "http://does-not-exist:9998"
+ mail_parser.tika_parse(html)
-class TestParser(FileSystemAssertsMixin, HttpxMockMixin, BaseMailParserTestCase):
- def test_parse_no_file(self):
- """
- GIVEN:
- - Fresh start
- WHEN:
- - parsing is attempted with nonexistent file
- THEN:
- - Exception is thrown
- """
- # Check if exception is raised when parsing fails.
- self.assertRaises(
- ParseError,
- self.parser.parse,
- self.SAMPLE_DIR / "na.eml",
- "message/rfc822",
- )
-
- @mock.patch("paperless_mail.parsers.MailDocumentParser.generate_pdf")
- def test_parse_eml_simple(self, mock_generate_pdf: mock.MagicMock):
+class TestParser:
+ def test_parse_eml_simple(
+ self,
+ mocker: MockerFixture,
+ mail_parser: MailDocumentParser,
+ simple_txt_email_file: Path,
+ ):
"""
GIVEN:
- Fresh start
- parsed information is available
"""
# Validate parsing returns the expected results
-
- self.parser.parse(
- self.SAMPLE_DIR / "simple_text.eml",
- "message/rfc822",
+ mock_generate_pdf = mocker.patch(
+ "paperless_mail.parsers.MailDocumentParser.generate_pdf",
)
+
+ mail_parser.parse(simple_txt_email_file, "message/rfc822")
text_expected = (
"Subject: Simple Text Mail\n\n"
"From: Some One <mail@someserver.de>\n\n"
"BCC: fdf@fvf.de\n\n"
"\n\nThis is just a simple Text Mail."
)
- self.assertEqual(text_expected, self.parser.text)
- self.assertEqual(
+ assert text_expected == mail_parser.text
+ assert (
datetime.datetime(
2022,
10,
40,
43,
tzinfo=datetime.timezone(datetime.timedelta(seconds=7200)),
- ),
- self.parser.date,
+ )
+ == mail_parser.date
)
# Just check if tried to generate archive, the unittest for generate_pdf() goes deeper.
mock_generate_pdf.assert_called()
- @mock.patch("paperless_mail.parsers.MailDocumentParser.generate_pdf")
- def test_parse_eml_html(self, mock_generate_pdf: mock.MagicMock):
+ def test_parse_eml_html(
+ self,
+ mocker: MockerFixture,
+ httpx_mock: HTTPXMock,
+ mail_parser: MailDocumentParser,
+ html_email_file: Path,
+ ):
"""
GIVEN:
- Fresh start
THEN:
- Tika is called, parsed information from non html parts is available
"""
+
+ mock_generate_pdf = mocker.patch(
+ "paperless_mail.parsers.MailDocumentParser.generate_pdf",
+ )
+
# Validate parsing returns the expected results
text_expected = (
"Subject: HTML Message\n\n"
"Some Text and an embedded image."
)
- self.httpx_mock.add_response(
+ httpx_mock.add_response(
json={
"Content-Type": "text/html",
"X-TIKA:Parsed-By": [],
},
)
- self.parser.parse(self.SAMPLE_DIR / "html.eml", "message/rfc822")
+ mail_parser.parse(html_email_file, "message/rfc822")
mock_generate_pdf.assert_called_once()
- self.assertEqual(text_expected, self.parser.text)
- self.assertEqual(
+ assert text_expected == mail_parser.text
+ assert (
datetime.datetime(
2022,
10,
23,
19,
tzinfo=datetime.timezone(datetime.timedelta(seconds=7200)),
- ),
- self.parser.date,
+ )
+ == mail_parser.date
)
- def test_generate_pdf_parse_error(self):
+ def test_generate_pdf_parse_error(
+ self,
+ httpx_mock: HTTPXMock,
+ mail_parser: MailDocumentParser,
+ simple_txt_email_file: Path,
+ ):
"""
GIVEN:
- Fresh start
THEN:
- a ParseError Exception is thrown
"""
- self.httpx_mock.add_response(status_code=httpx.codes.INTERNAL_SERVER_ERROR)
+ httpx_mock.add_response(status_code=httpx.codes.INTERNAL_SERVER_ERROR)
- self.assertRaises(
- ParseError,
- self.parser.parse,
- self.SAMPLE_DIR / "simple_text.eml",
- "message/rfc822",
- )
+ with pytest.raises(ParseError):
+ mail_parser.parse(simple_txt_email_file, "message/rfc822")
- def test_generate_pdf_simple_email(self):
+ def test_generate_pdf_simple_email(
+ self,
+ httpx_mock: HTTPXMock,
+ mail_parser: MailDocumentParser,
+ simple_txt_email_file: Path,
+ simple_txt_email_pdf_file: Path,
+ ):
"""
GIVEN:
- Simple text email with no HTML content
- Archive file is generated
"""
- self.httpx_mock.add_response(
+ httpx_mock.add_response(
url="http://localhost:3000/forms/chromium/convert/html",
method="POST",
- content=(self.SAMPLE_DIR / "simple_text.eml.pdf").read_bytes(),
+ content=simple_txt_email_pdf_file.read_bytes(),
)
- self.parser.parse(self.SAMPLE_DIR / "simple_text.eml", "message/rfc822")
+ mail_parser.parse(simple_txt_email_file, "message/rfc822")
- self.assertIsNotNone(self.parser.archive_path)
+ assert mail_parser.archive_path is not None
- def test_generate_pdf_html_email(self):
+ def test_generate_pdf_html_email(
+ self,
+ httpx_mock: HTTPXMock,
+ mail_parser: MailDocumentParser,
+ html_email_file: Path,
+ html_email_pdf_file: Path,
+ ):
"""
GIVEN:
- email with HTML content
- Gotenberg is used to merge the two PDFs
- Archive file is generated
"""
- self.httpx_mock.add_response(
+ httpx_mock.add_response(
url="http://localhost:9998/tika/text",
method="PUT",
json={
"X-TIKA:content": "This is some Tika HTML text",
},
)
- self.httpx_mock.add_response(
+ httpx_mock.add_response(
url="http://localhost:3000/forms/chromium/convert/html",
method="POST",
- content=(self.SAMPLE_DIR / "html.eml.pdf").read_bytes(),
+ content=html_email_pdf_file.read_bytes(),
)
- self.httpx_mock.add_response(
+ httpx_mock.add_response(
url="http://localhost:3000/forms/pdfengines/merge",
method="POST",
content=b"Pretend merged PDF content",
)
- self.parser.parse(self.SAMPLE_DIR / "html.eml", "message/rfc822")
+ mail_parser.parse(html_email_file, "message/rfc822")
- self.assertIsNotNone(self.parser.archive_path)
+ assert mail_parser.archive_path is not None
- def test_generate_pdf_html_email_html_to_pdf_failure(self):
+ def test_generate_pdf_html_email_html_to_pdf_failure(
+ self,
+ httpx_mock: HTTPXMock,
+ mail_parser: MailDocumentParser,
+ html_email_file: Path,
+ html_email_pdf_file: Path,
+ ):
"""
GIVEN:
- email with HTML content
THEN:
- ParseError is raised
"""
- self.httpx_mock.add_response(
+ httpx_mock.add_response(
url="http://localhost:9998/tika/text",
method="PUT",
json={
"X-TIKA:content": "This is some Tika HTML text",
},
)
- self.httpx_mock.add_response(
+ httpx_mock.add_response(
url="http://localhost:3000/forms/chromium/convert/html",
method="POST",
- content=(self.SAMPLE_DIR / "html.eml.pdf").read_bytes(),
+ content=html_email_pdf_file.read_bytes(),
)
- self.httpx_mock.add_response(
+ httpx_mock.add_response(
url="http://localhost:3000/forms/chromium/convert/html",
method="POST",
status_code=httpx.codes.INTERNAL_SERVER_ERROR,
)
- with self.assertRaises(ParseError):
- self.parser.parse(self.SAMPLE_DIR / "html.eml", "message/rfc822")
+ with pytest.raises(ParseError):
+ mail_parser.parse(html_email_file, "message/rfc822")
- def test_generate_pdf_html_email_merge_failure(self):
+ def test_generate_pdf_html_email_merge_failure(
+ self,
+ httpx_mock: HTTPXMock,
+ mail_parser: MailDocumentParser,
+ html_email_file: Path,
+ html_email_pdf_file: Path,
+ ):
"""
GIVEN:
- email with HTML content
THEN:
- ParseError is raised
"""
- self.httpx_mock.add_response(
+ httpx_mock.add_response(
url="http://localhost:9998/tika/text",
method="PUT",
json={
"X-TIKA:content": "This is some Tika HTML text",
},
)
- self.httpx_mock.add_response(
+ httpx_mock.add_response(
url="http://localhost:3000/forms/chromium/convert/html",
method="POST",
- content=(self.SAMPLE_DIR / "html.eml.pdf").read_bytes(),
+ content=html_email_pdf_file.read_bytes(),
)
- self.httpx_mock.add_response(
+ httpx_mock.add_response(
url="http://localhost:3000/forms/pdfengines/merge",
method="POST",
status_code=httpx.codes.INTERNAL_SERVER_ERROR,
)
- with self.assertRaises(ParseError):
- self.parser.parse(self.SAMPLE_DIR / "html.eml", "message/rfc822")
+ with pytest.raises(ParseError):
+ mail_parser.parse(html_email_file, "message/rfc822")
- def test_mail_to_html(self):
+ def test_mail_to_html(
+ self,
+ mail_parser: MailDocumentParser,
+ html_email_file: Path,
+ html_email_html_file: Path,
+ ):
"""
GIVEN:
- Email message with HTML content
THEN:
- Resulting HTML is as expected
"""
- mail = self.parser.parse_file_to_message(self.SAMPLE_DIR / "html.eml")
- html_file = self.parser.mail_to_html(mail)
- expected_html_file = self.SAMPLE_DIR / "html.eml.html"
+ mail = mail_parser.parse_file_to_message(html_email_file)
+ html_file = mail_parser.mail_to_html(mail)
+
+ expected_html = parse_html(html_email_html_file.read_text())
+ actual_html = parse_html(html_file.read_text())
- self.assertHTMLEqual(expected_html_file.read_text(), html_file.read_text())
+ assert expected_html == actual_html
def test_generate_pdf_from_mail(
self,
+ httpx_mock: HTTPXMock,
+ mail_parser: MailDocumentParser,
+ html_email_file: Path,
):
"""
GIVEN:
- Gotenberg is used to convert HTML to PDF
"""
- self.httpx_mock.add_response(content=b"Content")
+ httpx_mock.add_response(content=b"Content")
- mail = self.parser.parse_file_to_message(self.SAMPLE_DIR / "html.eml")
+ mail = mail_parser.parse_file_to_message(html_email_file)
- retval = self.parser.generate_pdf_from_mail(mail)
- self.assertEqual(b"Content", retval.read_bytes())
+ retval = mail_parser.generate_pdf_from_mail(mail)
+ assert retval.read_bytes() == b"Content"
- request = self.httpx_mock.get_request()
+ request = httpx_mock.get_request()
- self.assertEqual(
- str(request.url),
- "http://localhost:3000/forms/chromium/convert/html",
- )
+ assert str(request.url) == "http://localhost:3000/forms/chromium/convert/html"
import subprocess
import tempfile
from pathlib import Path
-from unittest import mock
import httpx
import pytest
-from django.test import TestCase
from imagehash import average_hash
from PIL import Image
+from pytest_mock import MockerFixture
-from documents.tests.utils import FileSystemAssertsMixin
from documents.tests.utils import util_call_with_backoff
-from paperless_mail.tests.test_parsers import BaseMailParserTestCase
+from paperless_mail.parsers import MailDocumentParser
def extract_text(pdf_path: Path) -> str:
"PAPERLESS_CI_TEST" not in os.environ,
reason="No Gotenberg/Tika servers to test with",
)
-class TestUrlCanary(TestCase):
+class TestUrlCanary:
"""
Verify certain URLs are still available so testing is valid still
"""
whether this image stays online forever, so here we check if we can detect if is not
available anymore.
"""
- with self.assertRaises(httpx.HTTPStatusError) as cm:
+ with pytest.raises(httpx.HTTPStatusError) as exec_info:
resp = httpx.get(
"https://upload.wikimedia.org/wikipedia/en/f/f7/nonexistent.png",
)
resp.raise_for_status()
- self.assertEqual(cm.exception.response.status_code, httpx.codes.NOT_FOUND)
+ assert exec_info.value.response.status_code == httpx.codes.NOT_FOUND
def test_is_online_image_still_available(self):
"""
"PAPERLESS_CI_TEST" not in os.environ,
reason="No Gotenberg/Tika servers to test with",
)
-class TestParserLive(FileSystemAssertsMixin, BaseMailParserTestCase):
+class TestParserLive:
@staticmethod
def imagehash(file, hash_size=18):
return f"{average_hash(Image.open(file), hash_size)}"
- @mock.patch("paperless_mail.parsers.MailDocumentParser.generate_pdf")
- def test_get_thumbnail(self, mock_generate_pdf: mock.MagicMock):
+ def test_get_thumbnail(
+ self,
+ mocker: MockerFixture,
+ mail_parser: MailDocumentParser,
+ simple_txt_email_file: Path,
+ simple_txt_email_pdf_file: Path,
+ simple_txt_email_thumbnail_file: Path,
+ ):
"""
GIVEN:
- Fresh start
THEN:
- The returned thumbnail image file is as expected
"""
- mock_generate_pdf.return_value = self.SAMPLE_DIR / "simple_text.eml.pdf"
- thumb = self.parser.get_thumbnail(
- self.SAMPLE_DIR / "simple_text.eml",
- "message/rfc822",
+ mock_generate_pdf = mocker.patch(
+ "paperless_mail.parsers.MailDocumentParser.generate_pdf",
)
- self.assertIsFile(thumb)
+ mock_generate_pdf.return_value = simple_txt_email_pdf_file
- expected = self.SAMPLE_DIR / "simple_text.eml.pdf.webp"
+ thumb = mail_parser.get_thumbnail(simple_txt_email_file, "message/rfc822")
- self.assertEqual(
- self.imagehash(thumb),
- self.imagehash(expected),
- f"Created Thumbnail {thumb} differs from expected file {expected}",
- )
+ assert thumb.exists()
+ assert thumb.is_file()
+
+ assert (
+ self.imagehash(thumb) == self.imagehash(simple_txt_email_thumbnail_file)
+ ), f"Created Thumbnail {thumb} differs from expected file {simple_txt_email_thumbnail_file}"
- def test_tika_parse_successful(self):
+ def test_tika_parse_successful(self, mail_parser: MailDocumentParser):
"""
GIVEN:
- Fresh start
expected_text = "Some Text"
# Check successful parsing
- parsed = self.parser.tika_parse(html)
- self.assertEqual(expected_text, parsed.strip())
+ parsed = mail_parser.tika_parse(html)
+ assert expected_text == parsed.strip()
- @mock.patch("paperless_mail.parsers.MailDocumentParser.generate_pdf_from_mail")
- @mock.patch("paperless_mail.parsers.MailDocumentParser.generate_pdf_from_html")
def test_generate_pdf_gotenberg_merging(
self,
- mock_generate_pdf_from_html: mock.MagicMock,
- mock_generate_pdf_from_mail: mock.MagicMock,
+ mocker: MockerFixture,
+ mail_parser: MailDocumentParser,
+ html_email_file: Path,
+ merged_pdf_first: Path,
+ merged_pdf_second: Path,
):
"""
GIVEN:
THEN:
- gotenberg is called to merge files and the resulting file is returned
"""
- mock_generate_pdf_from_mail.return_value = self.SAMPLE_DIR / "first.pdf"
- mock_generate_pdf_from_html.return_value = self.SAMPLE_DIR / "second.pdf"
-
- msg = self.parser.parse_file_to_message(
- self.SAMPLE_DIR / "html.eml",
+ mock_generate_pdf_from_html = mocker.patch(
+ "paperless_mail.parsers.MailDocumentParser.generate_pdf_from_html",
)
+ mock_generate_pdf_from_mail = mocker.patch(
+ "paperless_mail.parsers.MailDocumentParser.generate_pdf_from_mail",
+ )
+ mock_generate_pdf_from_mail.return_value = merged_pdf_first
+ mock_generate_pdf_from_html.return_value = merged_pdf_second
+
+ msg = mail_parser.parse_file_to_message(html_email_file)
_, pdf_path = util_call_with_backoff(
- self.parser.generate_pdf,
+ mail_parser.generate_pdf,
[msg],
)
- self.assertIsFile(pdf_path)
+ assert pdf_path.exists()
+ assert pdf_path.is_file()
extracted = extract_text(pdf_path)
expected = (
"first PDF to be merged.\n\x0csecond PDF to be merged.\n\x0c"
)
- self.assertEqual(expected, extracted)
+ assert expected == extracted
- def test_generate_pdf_from_mail(self):
+ def test_generate_pdf_from_mail(
+ self,
+ mail_parser: MailDocumentParser,
+ html_email_file: Path,
+ html_email_pdf_file: Path,
+ html_email_thumbnail_file: Path,
+ ):
"""
GIVEN:
- Fresh start
WHEN:
- pdf generation from simple eml file is requested
THEN:
- - gotenberg is called and the resulting file is returned and look as expected.
+ - Gotenberg is called and the resulting file is returned and looks as expected.
"""
- util_call_with_backoff(
- self.parser.parse,
- [self.SAMPLE_DIR / "html.eml", "message/rfc822"],
- )
+ util_call_with_backoff(mail_parser.parse, [html_email_file, "message/rfc822"])
# Check the archive PDF
- archive_path = self.parser.get_archive_path()
+ archive_path = mail_parser.get_archive_path()
archive_text = extract_text(archive_path)
- expected_archive_text = extract_text(self.SAMPLE_DIR / "html.eml.pdf")
+ expected_archive_text = extract_text(html_email_pdf_file)
# Archive includes the HTML content, so use in
- self.assertIn(expected_archive_text, archive_text)
+ assert expected_archive_text in archive_text
# Check the thumbnail
- generated_thumbnail = self.parser.get_thumbnail(
- self.SAMPLE_DIR / "html.eml",
+ generated_thumbnail = mail_parser.get_thumbnail(
+ html_email_file,
"message/rfc822",
)
generated_thumbnail_hash = self.imagehash(generated_thumbnail)
# The created pdf is not reproducible. But the converted image should always look the same.
- expected_hash = self.imagehash(self.SAMPLE_DIR / "html.eml.pdf.webp")
+ expected_hash = self.imagehash(html_email_thumbnail_file)
- self.assertEqual(
- generated_thumbnail_hash,
- expected_hash,
- f"PDF looks different. Check if {generated_thumbnail} looks weird.",
- )
+ assert (
+ generated_thumbnail_hash == expected_hash
+ ), f"PDF looks different. Check if {generated_thumbnail} looks weird."
-import os
+from pathlib import Path
from django.conf import settings
from PIL import Image
logging_name = "paperless.parsing.text"
- def get_thumbnail(self, document_path, mime_type, file_name=None):
+ def get_thumbnail(self, document_path: Path, mime_type, file_name=None) -> Path:
text = self.read_file_handle_unicode_errors(document_path)
img = Image.new("RGB", (500, 700), color="white")
)
draw.text((5, 5), text, font=font, fill="black")
- out_path = os.path.join(self.tempdir, "thumb.webp")
+ out_path = self.tempdir / "thumb.webp"
img.save(out_path, format="WEBP")
return out_path
--- /dev/null
+from collections.abc import Generator
+from pathlib import Path
+
+import pytest
+
+from paperless_text.parsers import TextDocumentParser
+
+
+@pytest.fixture(scope="session")
+def sample_dir() -> Path:
+ """Return the resolved path of the ``samples`` directory located next to this file."""
+ return (Path(__file__).parent / Path("samples")).resolve()
+
+
+@pytest.fixture()
+def text_parser() -> Generator[TextDocumentParser, None, None]:
+    """
+    Yield a fresh TextDocumentParser and call its cleanup() on teardown.
+
+    The parser is created before entering the ``try`` block: if construction
+    itself fails there is nothing to clean up, and the original error
+    propagates instead of being masked by an UnboundLocalError raised from
+    the ``finally`` clause.
+    """
+    parser = TextDocumentParser(logging_group=None)
+    try:
+        yield parser
+    finally:
+        # Teardown: always invoke the parser's cleanup hook
+        parser.cleanup()
+
+
+@pytest.fixture(scope="session")
+def sample_txt_file(sample_dir: Path) -> Path:
+ """Return the path of ``test.txt`` inside the samples directory."""
+ return sample_dir / "test.txt"
+
+
+@pytest.fixture(scope="session")
+def malformed_txt_file(sample_dir: Path) -> Path:
+ """Return the path of ``decode_error.txt`` inside the samples directory."""
+ return sample_dir / "decode_error.txt"
from pathlib import Path
-from django.test import TestCase
-
-from documents.tests.utils import DirectoriesMixin
-from documents.tests.utils import FileSystemAssertsMixin
from paperless_text.parsers import TextDocumentParser
-class TestTextParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
- SAMPLE_DIR = Path(__file__).resolve().parent / "samples"
-
- def test_thumbnail(self):
- parser = TextDocumentParser(None)
-
+class TestTextParser:
+ def test_thumbnail(self, text_parser: TextDocumentParser, sample_txt_file: Path):
# just make sure that it does not crash
- f = parser.get_thumbnail(
- self.SAMPLE_DIR / "test.txt",
- "text/plain",
- )
- self.assertIsFile(f)
-
- def test_parse(self):
- parser = TextDocumentParser(None)
+ f = text_parser.get_thumbnail(sample_txt_file, "text/plain")
+ assert f.exists()
+ assert f.is_file()
- parser.parse(
- self.SAMPLE_DIR / "test.txt",
- "text/plain",
- )
+ def test_parse(self, text_parser: TextDocumentParser, sample_txt_file: Path):
+ text_parser.parse(sample_txt_file, "text/plain")
- self.assertEqual(parser.get_text(), "This is a test file.\n")
- self.assertIsNone(parser.get_archive_path())
+ assert text_parser.get_text() == "This is a test file.\n"
+ assert text_parser.get_archive_path() is None
- def test_parse_invalid_bytes(self):
+ def test_parse_invalid_bytes(
+ self,
+ text_parser: TextDocumentParser,
+ malformed_txt_file: Path,
+ ):
"""
GIVEN:
- Text file which contains invalid UTF bytes
- Parsing continues
- Invalid bytes are removed
"""
- parser = TextDocumentParser(None)
- parser.parse(
- self.SAMPLE_DIR / "decode_error.txt",
- "text/plain",
- )
+ text_parser.parse(malformed_txt_file, "text/plain")
- self.assertEqual(parser.get_text(), "Pantothens�ure\n")
- self.assertIsNone(parser.get_archive_path())
+ assert text_parser.get_text() == "Pantothens�ure\n"
+ assert text_parser.get_archive_path() is None
--- /dev/null
+from collections.abc import Generator
+from pathlib import Path
+
+import pytest
+
+from paperless_tika.parsers import TikaDocumentParser
+
+
+@pytest.fixture()
+def tika_parser() -> Generator[TikaDocumentParser, None, None]:
+    """
+    Yield a fresh TikaDocumentParser and call its cleanup() on teardown.
+
+    The parser is created before entering the ``try`` block: if construction
+    itself fails there is nothing to clean up, and the original error
+    propagates instead of being masked by an UnboundLocalError raised from
+    the ``finally`` clause.
+    """
+    parser = TikaDocumentParser(logging_group=None)
+    try:
+        yield parser
+    finally:
+        # Teardown: always invoke the parser's cleanup hook
+        parser.cleanup()
+
+
+@pytest.fixture(scope="session")
+def sample_dir() -> Path:
+ """Return the resolved path of the ``samples`` directory located next to this file."""
+ return (Path(__file__).parent / Path("samples")).resolve()
+
+
+@pytest.fixture(scope="session")
+def sample_odt_file(sample_dir: Path) -> Path:
+ """Return the path of ``sample.odt`` inside the samples directory."""
+ return sample_dir / "sample.odt"
+
+
+@pytest.fixture(scope="session")
+def sample_docx_file(sample_dir: Path) -> Path:
+ """Return the path of ``sample.docx`` inside the samples directory."""
+ return sample_dir / "sample.docx"
+
+
+@pytest.fixture(scope="session")
+def sample_doc_file(sample_dir: Path) -> Path:
+ """Return the path of ``sample.doc`` inside the samples directory."""
+ return sample_dir / "sample.doc"
+
+
+@pytest.fixture(scope="session")
+def sample_broken_odt(sample_dir: Path) -> Path:
+ """Return the path of ``multi-part-broken.odt`` inside the samples directory."""
+ return sample_dir / "multi-part-broken.odt"
import os
from pathlib import Path
-from typing import Final
import pytest
-from django.test import TestCase
from documents.tests.utils import util_call_with_backoff
from paperless_tika.parsers import TikaDocumentParser
"PAPERLESS_CI_TEST" not in os.environ,
reason="No Gotenberg/Tika servers to test with",
)
-class TestTikaParserAgainstServer(TestCase):
+@pytest.mark.django_db()
+class TestTikaParserAgainstServer:
"""
This test case tests the Tika parsing against a live tika server,
if the environment contains the correct value indicating such a server
is available.
"""
- SAMPLE_DIR: Final[Path] = (Path(__file__).parent / Path("samples")).resolve()
-
- def setUp(self) -> None:
- self.parser = TikaDocumentParser(logging_group=None)
-
- def tearDown(self) -> None:
- self.parser.cleanup()
-
- def test_basic_parse_odt(self):
+ def test_basic_parse_odt(
+ self,
+ tika_parser: TikaDocumentParser,
+ sample_odt_file: Path,
+ ):
"""
GIVEN:
- An input ODT format document
- Document content is correct
- Document date is correct
"""
- test_file = self.SAMPLE_DIR / Path("sample.odt")
-
util_call_with_backoff(
- self.parser.parse,
- [test_file, "application/vnd.oasis.opendocument.text"],
+ tika_parser.parse,
+ [sample_odt_file, "application/vnd.oasis.opendocument.text"],
)
- self.assertEqual(
- self.parser.text,
- "This is an ODT test document, created September 14, 2022",
+ assert (
+ tika_parser.text
+ == "This is an ODT test document, created September 14, 2022"
)
- self.assertIsNotNone(self.parser.archive_path)
- with open(self.parser.archive_path, "rb") as f:
- # PDFs begin with the bytes PDF-x.y
- self.assertTrue(b"PDF-" in f.read()[:10])
+ assert tika_parser.archive_path is not None
+ assert b"PDF-" in tika_parser.archive_path.read_bytes()[:10]
# TODO: Unsure what can set the Creation-Date field in a document, enable when possible
- # self.assertEqual(self.parser.date, datetime.datetime(2022, 9, 14))
+ # assert tika_parser.date == datetime.datetime(2022, 9, 14)
- def test_basic_parse_docx(self):
+ def test_basic_parse_docx(
+ self,
+ tika_parser: TikaDocumentParser,
+ sample_docx_file: Path,
+ ):
"""
GIVEN:
- An input DOCX format document
- Document content is correct
- Document date is correct
"""
- test_file = self.SAMPLE_DIR / Path("sample.docx")
-
util_call_with_backoff(
- self.parser.parse,
+ tika_parser.parse,
[
- test_file,
+ sample_docx_file,
"application/vnd.openxmlformats-officedocument.wordprocessingml.document",
],
)
- self.assertEqual(
- self.parser.text,
- "This is an DOCX test document, also made September 14, 2022",
+ assert (
+ tika_parser.text
+ == "This is an DOCX test document, also made September 14, 2022"
)
- self.assertIsNotNone(self.parser.archive_path)
- with open(self.parser.archive_path, "rb") as f:
- self.assertTrue(b"PDF-" in f.read()[:10])
+ assert tika_parser.archive_path is not None
+ with open(tika_parser.archive_path, "rb") as f:
+ assert b"PDF-" in f.read()[:10]
- # self.assertEqual(self.parser.date, datetime.datetime(2022, 9, 14))
+ # assert tika_parser.date == datetime.datetime(2022, 9, 14)
- def test_basic_parse_doc(self):
+ def test_basic_parse_doc(
+ self,
+ tika_parser: TikaDocumentParser,
+ sample_doc_file: Path,
+ ):
"""
GIVEN:
- An input DOC format document
- Document content is correct
- Document date is correct
"""
- test_file = self.SAMPLE_DIR / "sample.doc"
-
util_call_with_backoff(
- self.parser.parse,
- [test_file, "application/msword"],
+ tika_parser.parse,
+ [sample_doc_file, "application/msword"],
)
- self.assertIn(
- "his is a test document, saved in the older .doc format",
- self.parser.text,
+ assert (
+ "This is a test document, saved in the older .doc format"
+ in tika_parser.text
)
- self.assertIsNotNone(self.parser.archive_path)
- with open(self.parser.archive_path, "rb") as f:
- self.assertTrue(b"PDF-" in f.read()[:10])
-
- def test_tika_fails_multi_part(self):
+ assert tika_parser.archive_path is not None
+ with open(tika_parser.archive_path, "rb") as f:
+ assert b"PDF-" in f.read()[:10]
+
+ def test_tika_fails_multi_part(
+ self,
+ tika_parser: TikaDocumentParser,
+ sample_broken_odt: Path,
+ ):
"""
GIVEN:
- An input ODT format document
See also:
- https://issues.apache.org/jira/browse/TIKA-4110
"""
- test_file = self.SAMPLE_DIR / "multi-part-broken.odt"
-
util_call_with_backoff(
- self.parser.parse,
- [test_file, "application/vnd.oasis.opendocument.text"],
+ tika_parser.parse,
+ [sample_broken_odt, "application/vnd.oasis.opendocument.text"],
)
- self.assertIsNotNone(self.parser.archive_path)
- with open(self.parser.archive_path, "rb") as f:
- self.assertTrue(b"PDF-" in f.read()[:10])
+ assert tika_parser.archive_path is not None
+ with open(tika_parser.archive_path, "rb") as f:
+ assert b"PDF-" in f.read()[:10]
import datetime
-import os
import zoneinfo
+from http import HTTPStatus
from pathlib import Path
-from django.test import TestCase
-from django.test import override_settings
+import pytest
from httpx import codes
from httpx._multipart import DataField
-from rest_framework import status
+from pytest_django.fixtures import SettingsWrapper
+from pytest_httpx import HTTPXMock
from documents.parsers import ParseError
from paperless_tika.parsers import TikaDocumentParser
-from paperless_tika.tests.utils import HttpxMockMixin
-class TestTikaParser(HttpxMockMixin, TestCase):
- def setUp(self) -> None:
- self.parser = TikaDocumentParser(logging_group=None)
-
- def tearDown(self) -> None:
- self.parser.cleanup()
-
- @override_settings(TIME_ZONE="America/Chicago")
- def test_parse(self):
+@pytest.mark.django_db()
+class TestTikaParser:
+ def test_parse(
+ self,
+ httpx_mock: HTTPXMock,
+ settings: SettingsWrapper,
+ tika_parser: TikaDocumentParser,
+ sample_odt_file: Path,
+ ):
+ settings.TIME_ZONE = "America/Chicago"
# Pretend parse response
- self.httpx_mock.add_response(
+ httpx_mock.add_response(
json={
"Content-Type": "application/vnd.oasis.opendocument.text",
"X-TIKA:Parsed-By": [],
},
)
# Pretend convert to PDF response
- self.httpx_mock.add_response(content=b"PDF document")
-
- file = Path(os.path.join(self.parser.tempdir, "input.odt"))
- file.touch()
-
- self.parser.parse(file, "application/vnd.oasis.opendocument.text")
-
- self.assertEqual(self.parser.text, "the content")
- self.assertIsNotNone(self.parser.archive_path)
- with open(self.parser.archive_path, "rb") as f:
- self.assertEqual(f.read(), b"PDF document")
-
- self.assertEqual(
- self.parser.date,
- datetime.datetime(
- 2020,
- 11,
- 21,
- tzinfo=zoneinfo.ZoneInfo("America/Chicago"),
- ),
+ httpx_mock.add_response(content=b"PDF document")
+
+ odt_mime_type = "application/vnd.oasis.opendocument.text"
+ tika_parser.parse(sample_odt_file, odt_mime_type)
+
+ # Text comes from the mocked parse response, the archive from the mocked conversion
+ assert tika_parser.text == "the content"
+ assert tika_parser.archive_path is not None
+ archive_bytes = Path(tika_parser.archive_path).read_bytes()
+ assert archive_bytes == b"PDF document"
+
+ # The parsed date is compared as an aware datetime in the time zone set above
+ chicago = zoneinfo.ZoneInfo("America/Chicago")
+ assert tika_parser.date == datetime.datetime(
+ 2020,
+ 11,
+ 21,
+ tzinfo=chicago,
)
- def test_metadata(self):
- self.httpx_mock.add_response(
+ def test_metadata(
+ self,
+ httpx_mock: HTTPXMock,
+ tika_parser: TikaDocumentParser,
+ sample_odt_file: Path,
+ ):
+ httpx_mock.add_response(
json={
"Content-Type": "application/vnd.oasis.opendocument.text",
"X-TIKA:Parsed-By": [],
},
)
- file = Path(os.path.join(self.parser.tempdir, "input.odt"))
- file.touch()
-
- metadata = self.parser.extract_metadata(
- file,
+ metadata = tika_parser.extract_metadata(
+ sample_odt_file,
"application/vnd.oasis.opendocument.text",
)
- self.assertTrue("dcterms:created" in [m["key"] for m in metadata])
- self.assertTrue("Some-key" in [m["key"] for m in metadata])
+ # Collect the reported keys once and check both expected entries against them
+ metadata_keys = [entry["key"] for entry in metadata]
+ assert "dcterms:created" in metadata_keys
+ assert "Some-key" in metadata_keys
- def test_convert_failure(self):
+ def test_convert_failure(
+ self,
+ httpx_mock: HTTPXMock,
+ tika_parser: TikaDocumentParser,
+ sample_odt_file: Path,
+ ):
"""
GIVEN:
- Document needs to be converted to PDF
- Parse error is raised
"""
# Pretend convert to PDF response
- self.httpx_mock.add_response(status_code=status.HTTP_500_INTERNAL_SERVER_ERROR)
-
- file = Path(os.path.join(self.parser.tempdir, "input.odt"))
- file.touch()
+ httpx_mock.add_response(status_code=HTTPStatus.INTERNAL_SERVER_ERROR)
- with self.assertRaises(ParseError):
- self.parser.convert_to_pdf(file, None)
+ with pytest.raises(ParseError):  # the mocked 500 response must surface as a ParseError
+ tika_parser.convert_to_pdf(sample_odt_file, None)
- def test_request_pdf_a_format(self):
+ @pytest.mark.parametrize(
+ ("setting_value", "expected_form_value"),
+ [
+ ("pdfa", "PDF/A-2b"),
+ ("pdfa-1", "PDF/A-2b"),
+ ("pdfa-2", "PDF/A-2b"),
+ ("pdfa-3", "PDF/A-3b"),
+ ],
+ )
+ def test_request_pdf_a_format(
+ self,
+ setting_value: str,
+ expected_form_value: str,
+ httpx_mock: HTTPXMock,
+ settings: SettingsWrapper,
+ tika_parser: TikaDocumentParser,
+ sample_odt_file: Path,
+ ):
"""
GIVEN:
- Document needs to be converted to PDF
THEN:
- Request to Gotenberg contains the expected PDF/A format string
"""
- file = Path(os.path.join(self.parser.tempdir, "input.odt"))
- file.touch()
+ # Each parametrized case applies one OCR_OUTPUT_TYPE value before converting
+ settings.OCR_OUTPUT_TYPE = setting_value
+ httpx_mock.add_response(
+ status_code=codes.OK,
+ content=b"PDF document",
+ method="POST",
+ )
- for setting, expected_key in [
- ("pdfa", "PDF/A-2b"),
- ("pdfa-2", "PDF/A-2b"),
- ("pdfa-1", "PDF/A-2b"),
- ("pdfa-3", "PDF/A-3b"),
- ]:
- with override_settings(OCR_OUTPUT_TYPE=setting):
- self.httpx_mock.add_response(
- status_code=codes.OK,
- content=b"PDF document",
- method="POST",
- )
-
- self.parser.convert_to_pdf(file, None)
-
- request = self.httpx_mock.get_request()
- found = False
- for field in request.stream.fields:
- if isinstance(field, DataField) and field.name == "pdfa":
- self.assertEqual(field.value, expected_key)
- found = True
- break
- self.assertTrue(found)
-
- self.httpx_mock.reset(assert_all_responses_were_requested=False)
+ tika_parser.convert_to_pdf(sample_odt_file, None)
+
+ request = httpx_mock.get_request()
+ # Gather the value of every "pdfa" form field sent with the conversion request
+ pdfa_values = [
+ field.value
+ for field in request.stream.fields
+ if isinstance(field, DataField) and field.name == "pdfa"
+ ]
+ assert pdfa_values, 'no "pdfa" form field was found in the request'
+ # Every "pdfa" field present must carry the expected PDF/A format string
+ assert set(pdfa_values) == {expected_form_value}
+++ /dev/null
-import pytest
-from pytest_httpx import HTTPXMock
-
-
-class HttpxMockMixin:
- @pytest.fixture(autouse=True)
- def httpx_mock_auto(self, httpx_mock: HTTPXMock):
- """
- Workaround for allowing use of a fixture with unittest style testing
- """
- self.httpx_mock = httpx_mock