git.ipfire.org Git - thirdparty/paperless-ngx.git/commitdiff
Chore: Initial conversion to pytest fixtures (#7110)
author Trenton H <797416+stumpylog@users.noreply.github.com>
Mon, 8 Jul 2024 14:46:20 +0000 (07:46 -0700)
committer GitHub <noreply@github.com>
Mon, 8 Jul 2024 14:46:20 +0000 (07:46 -0700)
17 files changed:
Pipfile
Pipfile.lock
src/documents/parsers.py
src/documents/tests/conftest.py [new file with mode: 0644]
src/documents/tests/test_date_parsing.py
src/paperless_mail/parsers.py
src/paperless_mail/tests/conftest.py [new file with mode: 0644]
src/paperless_mail/tests/test_live_mail.py
src/paperless_mail/tests/test_parsers.py
src/paperless_mail/tests/test_parsers_live.py
src/paperless_text/parsers.py
src/paperless_text/tests/conftest.py [new file with mode: 0644]
src/paperless_text/tests/test_parser.py
src/paperless_tika/tests/conftest.py [new file with mode: 0644]
src/paperless_tika/tests/test_live_tika.py
src/paperless_tika/tests/test_tika_parser.py
src/paperless_tika/tests/utils.py [deleted file]

diff --git a/Pipfile b/Pipfile
index 69c3084ec5b7452fb2d7d8bad5186ba5f641b180..77bb99bcf338091a2189d321a6617d9bede0b469 100644 (file)
--- a/Pipfile
+++ b/Pipfile
@@ -55,7 +55,7 @@ tqdm = "*"
 uvicorn = {extras = ["standard"], version = "==0.25.0"}
 watchdog = "~=4.0"
 whitenoise = "~=6.7"
-whoosh="~=2.7"
+whoosh = "~=2.7"
 zxing-cpp = {version = "*", platform_machine = "== 'x86_64'"}
 
 [dev-packages]
@@ -71,6 +71,7 @@ pytest-httpx = "*"
 pytest-env = "*"
 pytest-sugar = "*"
 pytest-xdist = "*"
+pytest-mock = "*"
 pytest-rerunfailures = "*"
 imagehash = "*"
 daphne = "*"
diff --git a/Pipfile.lock b/Pipfile.lock
index 6c8a8c7242b6afdbfb5374bc915ee27e54084c0f..cda0f7681068039c291c06fb9b3ef48d97849229 100644 (file)
--- a/Pipfile.lock
+++ b/Pipfile.lock
@@ -1,7 +1,7 @@
 {
     "_meta": {
         "hash": {
-            "sha256": "37d8a84e16b6f6785d0daa79b249beab7fbef0c177a13eccfce79816bf61ccd0"
+            "sha256": "272a69e9011a60f2d326b77d99d261425b66ebcc8ae929372213700ae47de0f5"
         },
         "pipfile-spec": 6,
         "requires": {},
             "markers": "python_version >= '3.9'",
             "version": "==0.30.0"
         },
+        "pytest-mock": {
+            "hashes": [
+                "sha256:0b72c38033392a5f4621342fe11e9219ac11ec9d375f8e2a0c164539e0d70f6f",
+                "sha256:2719255a1efeceadbc056d6bf3df3d1c5015530fb40cf347c0f9afac88410bd0"
+            ],
+            "index": "pypi",
+            "markers": "python_version >= '3.8'",
+            "version": "==3.14.0"
+        },
         "pytest-rerunfailures": {
             "hashes": [
                 "sha256:4197bdd2eaeffdbf50b5ea6e7236f47ff0e44d1def8dae08e409f536d84e7b32",
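diff --git a/src/documents/parsers.py b/src/documents/parsers.py
index 09b1442c00150492bd4e0f1589cb79356a095360..1297162e2bb6bde065efb38accbbbe8085a49919 100644 (file)
--- a/src/documents/parsers.py
+++ b/src/documents/parsers.py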
@@ -225,11 +225,11 @@ def make_thumbnail_from_pdf_gs_fallback(in_path, temp_dir, logging_group=None) -
         return default_thumbnail_path
 
 
-def make_thumbnail_from_pdf(in_path, temp_dir, logging_group=None) -> str:
+def make_thumbnail_from_pdf(in_path, temp_dir, logging_group=None) -> Path:
     """
     The thumbnail of a PDF is just a 500px wide image of the first page.
     """
-    out_path = os.path.join(temp_dir, "convert.webp")
+    out_path = temp_dir / "convert.webp"
 
     # Run convert to get a decent thumbnail
     try:
@@ -242,7 +242,7 @@ def make_thumbnail_from_pdf(in_path, temp_dir, logging_group=None) -> str:
             auto_orient=True,
             use_cropbox=True,
             input_file=f"{in_path}[0]",
-            output_file=out_path,
+            output_file=str(out_path),
             logging_group=logging_group,
         )
     except ParseError as e:
diff --git a/src/documents/tests/conftest.py b/src/documents/tests/conftest.py
new file mode 100644 (file)
index 0000000..aa86f6e
--- /dev/null
@@ -0,0 +1,9 @@
+import zoneinfo
+
+import pytest
+from pytest_django.fixtures import SettingsWrapper
+
+
+@pytest.fixture()
+def settings_timezone(settings: SettingsWrapper) -> zoneinfo.ZoneInfo:
+    return zoneinfo.ZoneInfo(settings.TIME_ZONE)
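diff --git a/src/documents/tests/test_date_parsing.py b/src/documents/tests/test_date_parsing.py
index 25309527541ea2b00f3969f58a39a7c045269079..f0afae5433106fe525a5913940ebef3fcc972bd4 100644 (file)
--- a/src/documents/tests/test_date_parsing.py
+++ b/src/documents/tests/test_date_parsing.py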
@@ -1,42 +1,34 @@
 import datetime
+from zoneinfo import ZoneInfo
 
-from dateutil import tz
-from django.conf import settings
-from django.test import TestCase
-from django.test import override_settings
+from pytest_django.fixtures import SettingsWrapper
 
 from documents.parsers import parse_date
 from documents.parsers import parse_date_generator
 
 
-class TestDate(TestCase):
+class TestDate:
     def test_date_format_1(self):
         text = "lorem ipsum 130218 lorem ipsum"
-        self.assertEqual(parse_date("", text), None)
+        assert parse_date("", text) is None
 
     def test_date_format_2(self):
         text = "lorem ipsum 2018 lorem ipsum"
-        self.assertEqual(parse_date("", text), None)
+        assert parse_date("", text) is None
 
     def test_date_format_3(self):
         text = "lorem ipsum 20180213 lorem ipsum"
-        self.assertEqual(parse_date("", text), None)
+        assert parse_date("", text) is None
 
-    def test_date_format_4(self):
+    def test_date_format_4(self, settings_timezone: ZoneInfo):
         text = "lorem ipsum 13.02.2018 lorem ipsum"
         date = parse_date("", text)
-        self.assertEqual(
-            date,
-            datetime.datetime(2018, 2, 13, 0, 0, tzinfo=tz.gettz(settings.TIME_ZONE)),
-        )
+        assert date == datetime.datetime(2018, 2, 13, 0, 0, tzinfo=settings_timezone)
 
-    def test_date_format_5(self):
+    def test_date_format_5(self, settings_timezone: ZoneInfo):
         text = "lorem ipsum 130218, 2018, 20180213 and lorem 13.02.2018 lorem ipsum"
         date = parse_date("", text)
-        self.assertEqual(
-            date,
-            datetime.datetime(2018, 2, 13, 0, 0, tzinfo=tz.gettz(settings.TIME_ZONE)),
-        )
+        assert date == datetime.datetime(2018, 2, 13, 0, 0, tzinfo=settings_timezone)
 
     def test_date_format_6(self):
         text = (
@@ -50,17 +42,14 @@ class TestDate(TestCase):
             "BIC\n"
             "lorem ipsum"
         )
-        self.assertEqual(parse_date("", text), None)
+        assert parse_date("", text) is None
 
-    def test_date_format_7(self):
+    def test_date_format_7(self, settings_timezone: ZoneInfo):
         text = "lorem ipsum\nMärz 2019\nlorem ipsum"
         date = parse_date("", text)
-        self.assertEqual(
-            date,
-            datetime.datetime(2019, 3, 1, 0, 0, tzinfo=tz.gettz(settings.TIME_ZONE)),
-        )
+        assert date == datetime.datetime(2019, 3, 1, 0, 0, tzinfo=settings_timezone)
 
-    def test_date_format_8(self):
+    def test_date_format_8(self, settings_timezone: ZoneInfo):
         text = (
             "lorem ipsum\n"
             "Wohnort\n"
@@ -73,209 +62,331 @@ class TestDate(TestCase):
             "lorem ipsum\n"
             "März 2020"
         )
-        self.assertEqual(
-            parse_date("", text),
-            datetime.datetime(2020, 3, 1, 0, 0, tzinfo=tz.gettz(settings.TIME_ZONE)),
+        assert parse_date("", text) == datetime.datetime(
+            2020,
+            3,
+            1,
+            0,
+            0,
+            tzinfo=settings_timezone,
         )
 
-    def test_date_format_9(self):
+    def test_date_format_9(self, settings_timezone: ZoneInfo):
         text = "lorem ipsum\n27. Nullmonth 2020\nMärz 2020\nlorem ipsum"
-        self.assertEqual(
-            parse_date("", text),
-            datetime.datetime(2020, 3, 1, 0, 0, tzinfo=tz.gettz(settings.TIME_ZONE)),
+        assert parse_date("", text) == datetime.datetime(
+            2020,
+            3,
+            1,
+            0,
+            0,
+            tzinfo=settings_timezone,
         )
 
-    def test_date_format_10(self):
+    def test_date_format_10(self, settings_timezone: ZoneInfo):
         text = "Customer Number Currency 22-MAR-2022 Credit Card 1934829304"
-        self.assertEqual(
-            parse_date("", text),
-            datetime.datetime(2022, 3, 22, 0, 0, tzinfo=tz.gettz(settings.TIME_ZONE)),
+        assert parse_date("", text) == datetime.datetime(
+            2022,
+            3,
+            22,
+            0,
+            0,
+            tzinfo=settings_timezone,
         )
 
-    def test_date_format_11(self):
+    def test_date_format_11(self, settings_timezone: ZoneInfo):
         text = "Customer Number Currency 22 MAR 2022 Credit Card 1934829304"
-        self.assertEqual(
-            parse_date("", text),
-            datetime.datetime(2022, 3, 22, 0, 0, tzinfo=tz.gettz(settings.TIME_ZONE)),
+        assert parse_date("", text) == datetime.datetime(
+            2022,
+            3,
+            22,
+            0,
+            0,
+            tzinfo=settings_timezone,
         )
 
-    def test_date_format_12(self):
+    def test_date_format_12(self, settings_timezone: ZoneInfo):
         text = "Customer Number Currency 22/MAR/2022 Credit Card 1934829304"
-        self.assertEqual(
-            parse_date("", text),
-            datetime.datetime(2022, 3, 22, 0, 0, tzinfo=tz.gettz(settings.TIME_ZONE)),
+        assert parse_date("", text) == datetime.datetime(
+            2022,
+            3,
+            22,
+            0,
+            0,
+            tzinfo=settings_timezone,
         )
 
-    def test_date_format_13(self):
+    def test_date_format_13(self, settings_timezone: ZoneInfo):
         text = "Customer Number Currency 22.MAR.2022 Credit Card 1934829304"
-        self.assertEqual(
-            parse_date("", text),
-            datetime.datetime(2022, 3, 22, 0, 0, tzinfo=tz.gettz(settings.TIME_ZONE)),
+        assert parse_date("", text) == datetime.datetime(
+            2022,
+            3,
+            22,
+            0,
+            0,
+            tzinfo=settings_timezone,
         )
 
-    def test_date_format_14(self):
+    def test_date_format_14(self, settings_timezone: ZoneInfo):
         text = "Customer Number Currency 22.MAR 2022 Credit Card 1934829304"
-        self.assertEqual(
-            parse_date("", text),
-            datetime.datetime(2022, 3, 22, 0, 0, tzinfo=tz.gettz(settings.TIME_ZONE)),
+        assert parse_date("", text) == datetime.datetime(
+            2022,
+            3,
+            22,
+            0,
+            0,
+            tzinfo=settings_timezone,
         )
 
     def test_date_format_15(self):
         text = "Customer Number Currency 22.MAR.22 Credit Card 1934829304"
-        self.assertIsNone(parse_date("", text), None)
+        assert parse_date("", text) is None
 
     def test_date_format_16(self):
         text = "Customer Number Currency 22.MAR,22 Credit Card 1934829304"
-        self.assertIsNone(parse_date("", text), None)
+        assert parse_date("", text) is None
 
     def test_date_format_17(self):
         text = "Customer Number Currency 22,MAR,2022 Credit Card 1934829304"
-        self.assertIsNone(parse_date("", text), None)
+        assert parse_date("", text) is None
 
     def test_date_format_18(self):
         text = "Customer Number Currency 22 MAR,2022 Credit Card 1934829304"
-        self.assertIsNone(parse_date("", text), None)
+        assert parse_date("", text) is None
 
-    def test_date_format_19(self):
+    def test_date_format_19(self, settings_timezone: ZoneInfo):
         text = "Customer Number Currency 21st MAR 2022 Credit Card 1934829304"
-        self.assertEqual(
-            parse_date("", text),
-            datetime.datetime(2022, 3, 21, 0, 0, tzinfo=tz.gettz(settings.TIME_ZONE)),
+        assert parse_date("", text) == datetime.datetime(
+            2022,
+            3,
+            21,
+            0,
+            0,
+            tzinfo=settings_timezone,
         )
 
-    def test_date_format_20(self):
+    def test_date_format_20(self, settings_timezone: ZoneInfo):
         text = "Customer Number Currency 22nd March 2022 Credit Card 1934829304"
-        self.assertEqual(
-            parse_date("", text),
-            datetime.datetime(2022, 3, 22, 0, 0, tzinfo=tz.gettz(settings.TIME_ZONE)),
+        assert parse_date("", text) == datetime.datetime(
+            2022,
+            3,
+            22,
+            0,
+            0,
+            tzinfo=settings_timezone,
         )
 
-    def test_date_format_21(self):
+    def test_date_format_21(self, settings_timezone: ZoneInfo):
         text = "Customer Number Currency 2nd MAR 2022 Credit Card 1934829304"
-        self.assertEqual(
-            parse_date("", text),
-            datetime.datetime(2022, 3, 2, 0, 0, tzinfo=tz.gettz(settings.TIME_ZONE)),
+        assert parse_date("", text) == datetime.datetime(
+            2022,
+            3,
+            2,
+            0,
+            0,
+            tzinfo=settings_timezone,
         )
 
-    def test_date_format_22(self):
+    def test_date_format_22(self, settings_timezone: ZoneInfo):
         text = "Customer Number Currency 23rd MAR 2022 Credit Card 1934829304"
-        self.assertEqual(
-            parse_date("", text),
-            datetime.datetime(2022, 3, 23, 0, 0, tzinfo=tz.gettz(settings.TIME_ZONE)),
+        assert parse_date("", text) == datetime.datetime(
+            2022,
+            3,
+            23,
+            0,
+            0,
+            tzinfo=settings_timezone,
         )
 
-    def test_date_format_23(self):
+    def test_date_format_23(self, settings_timezone: ZoneInfo):
         text = "Customer Number Currency 24th MAR 2022 Credit Card 1934829304"
-        self.assertEqual(
-            parse_date("", text),
-            datetime.datetime(2022, 3, 24, 0, 0, tzinfo=tz.gettz(settings.TIME_ZONE)),
+        assert parse_date("", text) == datetime.datetime(
+            2022,
+            3,
+            24,
+            0,
+            0,
+            tzinfo=settings_timezone,
         )
 
-    def test_date_format_24(self):
+    def test_date_format_24(self, settings_timezone: ZoneInfo):
         text = "Customer Number Currency 21-MAR-2022 Credit Card 1934829304"
-        self.assertEqual(
-            parse_date("", text),
-            datetime.datetime(2022, 3, 21, 0, 0, tzinfo=tz.gettz(settings.TIME_ZONE)),
+        assert parse_date("", text) == datetime.datetime(
+            2022,
+            3,
+            21,
+            0,
+            0,
+            tzinfo=settings_timezone,
         )
 
-    def test_date_format_25(self):
+    def test_date_format_25(self, settings_timezone: ZoneInfo):
         text = "Customer Number Currency 25TH MAR 2022 Credit Card 1934829304"
-        self.assertEqual(
-            parse_date("", text),
-            datetime.datetime(2022, 3, 25, 0, 0, tzinfo=tz.gettz(settings.TIME_ZONE)),
+        assert parse_date("", text) == datetime.datetime(
+            2022,
+            3,
+            25,
+            0,
+            0,
+            tzinfo=settings_timezone,
         )
 
-    def test_date_format_26(self):
+    def test_date_format_26(self, settings_timezone: ZoneInfo):
         text = "CHASE 0 September 25, 2019 JPMorgan Chase Bank, NA. P0 Box 182051"
-        self.assertEqual(
-            parse_date("", text),
-            datetime.datetime(2019, 9, 25, 0, 0, tzinfo=tz.gettz(settings.TIME_ZONE)),
+        assert parse_date("", text) == datetime.datetime(
+            2019,
+            9,
+            25,
+            0,
+            0,
+            tzinfo=settings_timezone,
         )
 
     def test_crazy_date_past(self):
-        self.assertIsNone(parse_date("", "01-07-0590 00:00:00"))
+        assert parse_date("", "01-07-0590 00:00:00") is None
 
     def test_crazy_date_future(self):
-        self.assertIsNone(parse_date("", "01-07-2350 00:00:00"))
+        assert parse_date("", "01-07-2350 00:00:00") is None
 
     def test_crazy_date_with_spaces(self):
-        self.assertIsNone(parse_date("", "20 408000l 2475"))
-
-    def test_utf_month_names(self):
-        self.assertEqual(
-            parse_date("", "13 décembre 2023"),
-            datetime.datetime(2023, 12, 13, 0, 0, tzinfo=tz.gettz(settings.TIME_ZONE)),
-        )
-        self.assertEqual(
-            parse_date("", "13 août 2022"),
-            datetime.datetime(2022, 8, 13, 0, 0, tzinfo=tz.gettz(settings.TIME_ZONE)),
-        )
-        self.assertEqual(
-            parse_date("", "11 März 2020"),
-            datetime.datetime(2020, 3, 11, 0, 0, tzinfo=tz.gettz(settings.TIME_ZONE)),
-        )
-        self.assertEqual(
-            parse_date("", "17. ožujka 2018."),
-            datetime.datetime(2018, 3, 17, 0, 0, tzinfo=tz.gettz(settings.TIME_ZONE)),
-        )
-        self.assertEqual(
-            parse_date("", "1. veljače 2016."),
-            datetime.datetime(2016, 2, 1, 0, 0, tzinfo=tz.gettz(settings.TIME_ZONE)),
-        )
-        self.assertEqual(
-            parse_date("", "15. února 1985"),
-            datetime.datetime(1985, 2, 15, 0, 0, tzinfo=tz.gettz(settings.TIME_ZONE)),
-        )
-        self.assertEqual(
-            parse_date("", "30. září 2011"),
-            datetime.datetime(2011, 9, 30, 0, 0, tzinfo=tz.gettz(settings.TIME_ZONE)),
-        )
-        self.assertEqual(
-            parse_date("", "28. května 1990"),
-            datetime.datetime(1990, 5, 28, 0, 0, tzinfo=tz.gettz(settings.TIME_ZONE)),
-        )
-        self.assertEqual(
-            parse_date("", "1. grudzień 1997"),
-            datetime.datetime(1997, 12, 1, 0, 0, tzinfo=tz.gettz(settings.TIME_ZONE)),
-        )
-        self.assertEqual(
-            parse_date("", "17 Şubat 2024"),
-            datetime.datetime(2024, 2, 17, 0, 0, tzinfo=tz.gettz(settings.TIME_ZONE)),
-        )
-        self.assertEqual(
-            parse_date("", "30 Ağustos 2012"),
-            datetime.datetime(2012, 8, 30, 0, 0, tzinfo=tz.gettz(settings.TIME_ZONE)),
-        )
-        self.assertEqual(
-            parse_date("", "17 Eylül 2000"),
-            datetime.datetime(2000, 9, 17, 0, 0, tzinfo=tz.gettz(settings.TIME_ZONE)),
-        )
-        self.assertEqual(
-            parse_date("", "5. október 1992"),
-            datetime.datetime(1992, 10, 5, 0, 0, tzinfo=tz.gettz(settings.TIME_ZONE)),
-        )
-
-    def test_multiple_dates(self):
+        assert parse_date("", "20 408000l 2475") is None
+
+    def test_utf_month_names(self, settings_timezone: ZoneInfo):
+        assert parse_date("", "13 décembre 2023") == datetime.datetime(
+            2023,
+            12,
+            13,
+            0,
+            0,
+            tzinfo=settings_timezone,
+        )
+        assert parse_date("", "13 août 2022") == datetime.datetime(
+            2022,
+            8,
+            13,
+            0,
+            0,
+            tzinfo=settings_timezone,
+        )
+        assert parse_date("", "11 März 2020") == datetime.datetime(
+            2020,
+            3,
+            11,
+            0,
+            0,
+            tzinfo=settings_timezone,
+        )
+        assert parse_date("", "17. ožujka 2018.") == datetime.datetime(
+            2018,
+            3,
+            17,
+            0,
+            0,
+            tzinfo=settings_timezone,
+        )
+        assert parse_date("", "1. veljače 2016.") == datetime.datetime(
+            2016,
+            2,
+            1,
+            0,
+            0,
+            tzinfo=settings_timezone,
+        )
+        assert parse_date("", "15. února 1985") == datetime.datetime(
+            1985,
+            2,
+            15,
+            0,
+            0,
+            tzinfo=settings_timezone,
+        )
+        assert parse_date("", "30. září 2011") == datetime.datetime(
+            2011,
+            9,
+            30,
+            0,
+            0,
+            tzinfo=settings_timezone,
+        )
+        assert parse_date("", "28. května 1990") == datetime.datetime(
+            1990,
+            5,
+            28,
+            0,
+            0,
+            tzinfo=settings_timezone,
+        )
+        assert parse_date("", "1. grudzień 1997") == datetime.datetime(
+            1997,
+            12,
+            1,
+            0,
+            0,
+            tzinfo=settings_timezone,
+        )
+        assert parse_date("", "17 Şubat 2024") == datetime.datetime(
+            2024,
+            2,
+            17,
+            0,
+            0,
+            tzinfo=settings_timezone,
+        )
+        assert parse_date("", "30 Ağustos 2012") == datetime.datetime(
+            2012,
+            8,
+            30,
+            0,
+            0,
+            tzinfo=settings_timezone,
+        )
+        assert parse_date("", "17 Eylül 2000") == datetime.datetime(
+            2000,
+            9,
+            17,
+            0,
+            0,
+            tzinfo=settings_timezone,
+        )
+        assert parse_date("", "5. október 1992") == datetime.datetime(
+            1992,
+            10,
+            5,
+            0,
+            0,
+            tzinfo=settings_timezone,
+        )
+
+    def test_multiple_dates(self, settings_timezone: ZoneInfo):
         text = """This text has multiple dates.
                   For example 02.02.2018, 22 July 2022 and December 2021.
                   But not 24-12-9999 because it's in the future..."""
         dates = list(parse_date_generator("", text))
-        self.assertEqual(len(dates), 3)
-        self.assertEqual(
-            dates[0],
-            datetime.datetime(2018, 2, 2, 0, 0, tzinfo=tz.gettz(settings.TIME_ZONE)),
-        )
-        self.assertEqual(
-            dates[1],
-            datetime.datetime(2022, 7, 22, 0, 0, tzinfo=tz.gettz(settings.TIME_ZONE)),
-        )
-        self.assertEqual(
-            dates[2],
-            datetime.datetime(2021, 12, 1, 0, 0, tzinfo=tz.gettz(settings.TIME_ZONE)),
-        )
 
-    @override_settings(FILENAME_DATE_ORDER="YMD")
-    def test_filename_date_parse_valid_ymd(self, *args):
+        assert dates == [
+            datetime.datetime(2018, 2, 2, 0, 0, tzinfo=settings_timezone),
+            datetime.datetime(
+                2022,
+                7,
+                22,
+                0,
+                0,
+                tzinfo=settings_timezone,
+            ),
+            datetime.datetime(
+                2021,
+                12,
+                1,
+                0,
+                0,
+                tzinfo=settings_timezone,
+            ),
+        ]
+
+    def test_filename_date_parse_valid_ymd(
+        self,
+        settings: SettingsWrapper,
+        settings_timezone: ZoneInfo,
+    ):
         """
         GIVEN:
             - Date parsing from the filename is enabled
@@ -285,13 +396,18 @@ class TestDate(TestCase):
         THEN:
             - Should parse the date from the filename
         """
-        self.assertEqual(
-            parse_date("/tmp/Scan-2022-04-01.pdf", "No date in here"),
-            datetime.datetime(2022, 4, 1, 0, 0, tzinfo=tz.gettz(settings.TIME_ZONE)),
-        )
-
-    @override_settings(FILENAME_DATE_ORDER="DMY")
-    def test_filename_date_parse_valid_dmy(self, *args):
+        settings.FILENAME_DATE_ORDER = "YMD"
+
+        assert parse_date(
+            "/tmp/Scan-2022-04-01.pdf",
+            "No date in here",
+        ) == datetime.datetime(2022, 4, 1, 0, 0, tzinfo=settings_timezone)
+
+    def test_filename_date_parse_valid_dmy(
+        self,
+        settings: SettingsWrapper,
+        settings_timezone: ZoneInfo,
+    ):
         """
         GIVEN:
             - Date parsing from the filename is enabled
@@ -301,13 +417,13 @@ class TestDate(TestCase):
         THEN:
             - Should parse the date from the filename
         """
-        self.assertEqual(
-            parse_date("/tmp/Scan-10.01.2021.pdf", "No date in here"),
-            datetime.datetime(2021, 1, 10, 0, 0, tzinfo=tz.gettz(settings.TIME_ZONE)),
-        )
+        settings.FILENAME_DATE_ORDER = "DMY"
+        assert parse_date(
+            "/tmp/Scan-10.01.2021.pdf",
+            "No date in here",
+        ) == datetime.datetime(2021, 1, 10, 0, 0, tzinfo=settings_timezone)
 
-    @override_settings(FILENAME_DATE_ORDER="YMD")
-    def test_filename_date_parse_invalid(self, *args):
+    def test_filename_date_parse_invalid(self, settings: SettingsWrapper):
         """
         GIVEN:
             - Date parsing from the filename is enabled
@@ -317,15 +433,14 @@ class TestDate(TestCase):
         THEN:
             - No date is parsed
         """
-        self.assertIsNone(
-            parse_date("/tmp/20 408000l 2475 - test.pdf", "No date in here"),
-        )
-
-    @override_settings(
-        FILENAME_DATE_ORDER="YMD",
-        IGNORE_DATES=(datetime.date(2022, 4, 1),),
-    )
-    def test_filename_date_ignored_use_content(self, *args):
+        settings.FILENAME_DATE_ORDER = "YMD"
+        assert parse_date("/tmp/20 408000l 2475 - test.pdf", "No date in here") is None
+
+    def test_filename_date_ignored_use_content(
+        self,
+        settings: SettingsWrapper,
+        settings_timezone: ZoneInfo,
+    ):
         """
         GIVEN:
             - Date parsing from the filename is enabled
@@ -338,15 +453,18 @@ class TestDate(TestCase):
         THEN:
             - Should parse the date from the content not filename
         """
-        self.assertEqual(
-            parse_date("/tmp/Scan-2022-04-01.pdf", "The matching date is 24.03.2022"),
-            datetime.datetime(2022, 3, 24, 0, 0, tzinfo=tz.gettz(settings.TIME_ZONE)),
-        )
-
-    @override_settings(
-        IGNORE_DATES=(datetime.date(2019, 11, 3), datetime.date(2020, 1, 17)),
-    )
-    def test_ignored_dates_default_order(self, *args):
+        settings.FILENAME_DATE_ORDER = "YMD"
+        settings.IGNORE_DATES = (datetime.date(2022, 4, 1),)
+        assert parse_date(
+            "/tmp/Scan-2022-04-01.pdf",
+            "The matching date is 24.03.2022",
+        ) == datetime.datetime(2022, 3, 24, 0, 0, tzinfo=settings_timezone)
+
+    def test_ignored_dates_default_order(
+        self,
+        settings: SettingsWrapper,
+        settings_timezone: ZoneInfo,
+    ):
         """
         GIVEN:
             - Ignore dates have been set
@@ -356,17 +474,22 @@ class TestDate(TestCase):
         THEN:
             - Should parse the date non-ignored date from content
         """
+        settings.IGNORE_DATES = (datetime.date(2019, 11, 3), datetime.date(2020, 1, 17))
         text = "lorem ipsum 110319, 20200117 and lorem 13.02.2018 lorem ipsum"
-        self.assertEqual(
-            parse_date("", text),
-            datetime.datetime(2018, 2, 13, 0, 0, tzinfo=tz.gettz(settings.TIME_ZONE)),
-        )
-
-    @override_settings(
-        IGNORE_DATES=(datetime.date(2019, 11, 3), datetime.date(2020, 1, 17)),
-        DATE_ORDER="YMD",
-    )
-    def test_ignored_dates_order_ymd(self, *args):
+        assert parse_date("", text) == datetime.datetime(
+            2018,
+            2,
+            13,
+            0,
+            0,
+            tzinfo=settings_timezone,
+        )
+
+    def test_ignored_dates_order_ymd(
+        self,
+        settings: SettingsWrapper,
+        settings_timezone: ZoneInfo,
+    ):
         """
         GIVEN:
             - Ignore dates have been set
@@ -377,9 +500,17 @@ class TestDate(TestCase):
         THEN:
             - Should parse the date non-ignored date from content
         """
+
+        settings.DATE_ORDER = "YMD"  # same setting the old decorator overrode
+        settings.IGNORE_DATES = (datetime.date(2019, 11, 3), datetime.date(2020, 1, 17))
+
         text = "lorem ipsum 190311, 20200117 and lorem 13.02.2018 lorem ipsum"
 
-        self.assertEqual(
-            parse_date("", text),
-            datetime.datetime(2018, 2, 13, 0, 0, tzinfo=tz.gettz(settings.TIME_ZONE)),
+        assert parse_date("", text) == datetime.datetime(
+            2018,
+            2,
+            13,
+            0,
+            0,
+            tzinfo=settings_timezone,
         )
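diff --git a/src/paperless_mail/parsers.py b/src/paperless_mail/parsers.py
index 9047b5f90c7054969ac24982b98fcdd2a4ca9985..4e83844e22d077b8528d5895d7b3bc4f6cf9ed57 100644 (file)
--- a/src/paperless_mail/parsers.py
+++ b/src/paperless_mail/parsers.py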
@@ -52,7 +52,12 @@ class MailDocumentParser(DocumentParser):
             return PdfAFormat.A3b
         return None
 
-    def get_thumbnail(self, document_path: Path, mime_type: str, file_name=None):
+    def get_thumbnail(
+        self,
+        document_path: Path,
+        mime_type: str,
+        file_name=None,
+    ) -> Path:
         if not self.archive_path:
             self.archive_path = self.generate_pdf(
                 self.parse_file_to_message(document_path),
diff --git a/src/paperless_mail/tests/conftest.py b/src/paperless_mail/tests/conftest.py
new file mode 100644 (file)
index 0000000..01a98d5
--- /dev/null
@@ -0,0 +1,89 @@
+import os
+from collections.abc import Generator
+from pathlib import Path
+
+import pytest
+
+from paperless_mail.mail import MailAccountHandler
+from paperless_mail.models import MailAccount
+from paperless_mail.parsers import MailDocumentParser
+
+
+@pytest.fixture(scope="session")
+def sample_dir() -> Path:
+    return (Path(__file__).parent / Path("samples")).resolve()
+
+
+@pytest.fixture(scope="session")
+def broken_email_file(sample_dir: Path) -> Path:
+    return sample_dir / "broken.eml"
+
+
+@pytest.fixture(scope="session")
+def simple_txt_email_file(sample_dir: Path) -> Path:
+    return sample_dir / "simple_text.eml"
+
+
+@pytest.fixture(scope="session")
+def simple_txt_email_pdf_file(sample_dir: Path) -> Path:
+    return sample_dir / "simple_text.eml.pdf"
+
+
+@pytest.fixture(scope="session")
+def simple_txt_email_thumbnail_file(sample_dir: Path) -> Path:
+    return sample_dir / "simple_text.eml.pdf.webp"
+
+
+@pytest.fixture(scope="session")
+def html_email_file(sample_dir: Path) -> Path:
+    return sample_dir / "html.eml"
+
+
+@pytest.fixture(scope="session")
+def html_email_pdf_file(sample_dir: Path) -> Path:
+    return sample_dir / "html.eml.pdf"
+
+
+@pytest.fixture(scope="session")
+def html_email_thumbnail_file(sample_dir: Path) -> Path:
+    return sample_dir / "html.eml.pdf.webp"
+
+
+@pytest.fixture(scope="session")
+def html_email_html_file(sample_dir: Path) -> Path:
+    return sample_dir / "html.eml.html"
+
+
+@pytest.fixture(scope="session")
+def merged_pdf_first(sample_dir: Path) -> Path:
+    return sample_dir / "first.pdf"
+
+
+@pytest.fixture(scope="session")
+def merged_pdf_second(sample_dir: Path) -> Path:
+    return sample_dir / "second.pdf"
+
+
+@pytest.fixture()
+def mail_parser() -> MailDocumentParser:
+    return MailDocumentParser(logging_group=None)
+
+
+@pytest.fixture()
+def live_mail_account() -> Generator[MailAccount, None, None]:
+    account = MailAccount.objects.create(  # outside try: bound for finally
+        name="test",
+        imap_server=os.environ["PAPERLESS_MAIL_TEST_HOST"],
+        username=os.environ["PAPERLESS_MAIL_TEST_USER"],
+        password=os.environ["PAPERLESS_MAIL_TEST_PASSWD"],
+        imap_port=993,
+    )
+    try:
+        yield account
+    finally:
+        account.delete()  # teardown runs even when the test body fails
+
+
+@pytest.fixture()
+def mail_account_handler() -> MailAccountHandler:
+    return MailAccountHandler()
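diff --git a/src/paperless_mail/tests/test_live_mail.py b/src/paperless_mail/tests/test_live_mail.py
index 6de2a67707649327a5058d90538b9a9c22257165..ecf9f73b65c6c4f9fac2e1591064672c13292d48 100644 (file)
--- a/src/paperless_mail/tests/test_live_mail.py
+++ b/src/paperless_mail/tests/test_live_mail.py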
@@ -1,7 +1,7 @@
 import os
+import warnings
 
 import pytest
-from django.test import TestCase
 
 from paperless_mail.mail import MailAccountHandler
 from paperless_mail.mail import MailError
@@ -16,53 +16,46 @@ from paperless_mail.models import MailRule
     or not len(os.environ["PAPERLESS_MAIL_TEST_HOST"]),
     reason="Live server testing not enabled",
 )
-class TestMailLiveServer(TestCase):
-    def setUp(self) -> None:
-        self.mail_account_handler = MailAccountHandler()
-        self.account = MailAccount.objects.create(
-            name="test",
-            imap_server=os.environ["PAPERLESS_MAIL_TEST_HOST"],
-            username=os.environ["PAPERLESS_MAIL_TEST_USER"],
-            password=os.environ["PAPERLESS_MAIL_TEST_PASSWD"],
-            imap_port=993,
-        )
-
-        return super().setUp()
-
-    def tearDown(self) -> None:
-        self.account.delete()
-        return super().tearDown()
-
-    def test_process_non_gmail_server_flag(self):
+@pytest.mark.django_db()
+class TestMailLiveServer:
+    def test_process_non_gmail_server_flag(
+        self,
+        mail_account_handler: MailAccountHandler,
+        live_mail_account: MailAccount,
+    ):
         try:
             rule1 = MailRule.objects.create(
                 name="testrule",
-                account=self.account,
+                account=live_mail_account,
                 action=MailRule.MailAction.FLAG,
             )
 
-            self.mail_account_handler.handle_mail_account(self.account)
+            mail_account_handler.handle_mail_account(live_mail_account)
 
             rule1.delete()
 
         except MailError as e:
-            self.fail(f"Failure: {e}")
-        except Exception:
-            pass
-
-    def test_process_non_gmail_server_tag(self):
+            pytest.fail(f"Failure: {e}")
+        except Exception as e:
+            warnings.warn(f"Unhandled exception: {e}")
+
+    def test_process_non_gmail_server_tag(
+        self,
+        mail_account_handler: MailAccountHandler,
+        live_mail_account: MailAccount,
+    ):
         try:
             rule2 = MailRule.objects.create(
                 name="testrule",
-                account=self.account,
+                account=live_mail_account,
                 action=MailRule.MailAction.TAG,
             )
 
-            self.mail_account_handler.handle_mail_account(self.account)
+            mail_account_handler.handle_mail_account(live_mail_account)
 
             rule2.delete()
 
         except MailError as e:
-            self.fail(f"Failure: {e}")
-        except Exception:
-            pass
+            pytest.fail(f"Failure: {e}")
+        except Exception as e:
+            warnings.warn(f"Unhandled exception: {e}")
index 5bcff19f643eb1a15b9548fed2556b859b163e8c..a0baa48218654cc6b9495fa015af1a1049e7bbc5 100644 (file)
@@ -1,39 +1,29 @@
 import datetime
+import logging
 from pathlib import Path
-from unittest import mock
 
 import httpx
-from django.test import TestCase
+import pytest
+from django.test.html import parse_html
+from pytest_django.fixtures import SettingsWrapper
+from pytest_httpx import HTTPXMock
+from pytest_mock import MockerFixture
 
 from documents.parsers import ParseError
-from documents.tests.utils import FileSystemAssertsMixin
 from paperless_mail.parsers import MailDocumentParser
-from paperless_tika.tests.utils import HttpxMockMixin
 
 
-class BaseMailParserTestCase(TestCase):
-    """
-    Basic setup for the below test cases
-    """
-
-    SAMPLE_DIR = Path(__file__).parent / "samples"
-
-    def setUp(self) -> None:
-        super().setUp()
-        self.parser = MailDocumentParser(logging_group=None)
-
-    def tearDown(self) -> None:
-        super().tearDown()
-        self.parser.cleanup()
-
-
-class TestEmailFileParsing(FileSystemAssertsMixin, BaseMailParserTestCase):
+class TestEmailFileParsing:
     """
     Tests around reading a file and parsing it into a
     MailMessage
     """
 
-    def test_parse_error_missing_file(self):
+    def test_parse_error_missing_file(
+        self,
+        mail_parser: MailDocumentParser,
+        sample_dir: Path,
+    ):
         """
         GIVEN:
             - Fresh parser
@@ -43,17 +33,18 @@ class TestEmailFileParsing(FileSystemAssertsMixin, BaseMailParserTestCase):
             - An Exception is thrown
         """
         # Check if exception is raised when parsing fails.
-        test_file = self.SAMPLE_DIR / "doesntexist.eml"
-
-        self.assertIsNotFile(test_file)
-        self.assertRaises(
-            ParseError,
-            self.parser.parse,
-            test_file,
-            "messages/rfc822",
-        )
+        test_file = sample_dir / "doesntexist.eml"
+
+        assert not test_file.exists()
 
-    def test_parse_error_invalid_email(self):
+        with pytest.raises(ParseError):
+            mail_parser.parse(test_file, "messages/rfc822")
+
+    def test_parse_error_invalid_email(
+        self,
+        mail_parser: MailDocumentParser,
+        broken_email_file: Path,
+    ):
         """
         GIVEN:
             - Fresh parser
@@ -63,14 +54,15 @@ class TestEmailFileParsing(FileSystemAssertsMixin, BaseMailParserTestCase):
             - An Exception is thrown
         """
         # Check if exception is raised when the mail is faulty.
-        self.assertRaises(
-            ParseError,
-            self.parser.parse,
-            self.SAMPLE_DIR / "broken.eml",
-            "messages/rfc822",
-        )
 
-    def test_parse_simple_text_email_file(self):
+        with pytest.raises(ParseError):
+            mail_parser.parse(broken_email_file, "messages/rfc822")
+
+    def test_parse_simple_text_email_file(
+        self,
+        mail_parser: MailDocumentParser,
+        simple_txt_email_file: Path,
+    ):
         """
         GIVEN:
             - Fresh parser
@@ -80,29 +72,31 @@ class TestEmailFileParsing(FileSystemAssertsMixin, BaseMailParserTestCase):
             - The content of the mail should be available in the parse result.
         """
         # Parse Test file and check relevant content
-        parsed1 = self.parser.parse_file_to_message(
-            self.SAMPLE_DIR / "simple_text.eml",
-        )
-
-        self.assertEqual(parsed1.date.year, 2022)
-        self.assertEqual(parsed1.date.month, 10)
-        self.assertEqual(parsed1.date.day, 12)
-        self.assertEqual(parsed1.date.hour, 21)
-        self.assertEqual(parsed1.date.minute, 40)
-        self.assertEqual(parsed1.date.second, 43)
-        self.assertEqual(parsed1.date.tzname(), "UTC+02:00")
-        self.assertEqual(parsed1.from_, "mail@someserver.de")
-        self.assertEqual(parsed1.subject, "Simple Text Mail")
-        self.assertEqual(parsed1.text, "This is just a simple Text Mail.\n")
-        self.assertEqual(parsed1.to, ("some@one.de",))
-
-
-class TestEmailMetadataExtraction(BaseMailParserTestCase):
+        parsed_msg = mail_parser.parse_file_to_message(simple_txt_email_file)
+
+        assert parsed_msg.date.year == 2022
+        assert parsed_msg.date.month == 10
+        assert parsed_msg.date.day == 12
+        assert parsed_msg.date.hour == 21
+        assert parsed_msg.date.minute == 40
+        assert parsed_msg.date.second == 43
+        assert parsed_msg.date.tzname() == "UTC+02:00"
+        assert parsed_msg.from_ == "mail@someserver.de"
+        assert parsed_msg.subject == "Simple Text Mail"
+        assert parsed_msg.text == "This is just a simple Text Mail.\n"
+        assert parsed_msg.to == ("some@one.de",)
+
+
+class TestEmailMetadataExtraction:
     """
     Tests extraction of metadata from an email
     """
 
-    def test_extract_metadata_fail(self):
+    def test_extract_metadata_fail(
+        self,
+        caplog: pytest.LogCaptureFixture,
+        mail_parser: MailDocumentParser,
+    ):
         """
         GIVEN:
             - Fresh start
@@ -112,14 +106,20 @@ class TestEmailMetadataExtraction(BaseMailParserTestCase):
             - A log warning should be generated
         """
         # Validate if warning is logged when parsing fails
-        with self.assertLogs("paperless.parsing.mail", level="WARNING") as cm:
-            self.assertEqual([], self.parser.extract_metadata("na", "message/rfc822"))
-            self.assertIn(
-                "WARNING:paperless.parsing.mail:Error while fetching document metadata for na",
-                cm.output[0],
-            )
+        assert mail_parser.extract_metadata("na", "message/rfc822") == []
+
+        assert len(caplog.records) == 1
+        record = caplog.records[0]
+
+        assert record.levelno == logging.WARNING
+        assert record.name == "paperless.parsing.mail"
+        assert "Error while fetching document metadata for na" in record.message
 
-    def test_extract_metadata(self):
+    def test_extract_metadata(
+        self,
+        mail_parser: MailDocumentParser,
+        simple_txt_email_file: Path,
+    ):
         """
         GIVEN:
             - Fresh start
@@ -129,149 +129,110 @@ class TestEmailMetadataExtraction(BaseMailParserTestCase):
             - metadata is returned
         """
         # Validate Metadata parsing returns the expected results
-        metadata = self.parser.extract_metadata(
-            self.SAMPLE_DIR / "simple_text.eml",
-            "message/rfc822",
-        )
-
-        self.assertIn(
-            {"namespace": "", "prefix": "", "key": "attachments", "value": ""},
-            metadata,
-        )
-        self.assertIn(
-            {
-                "namespace": "",
-                "prefix": "",
-                "key": "date",
-                "value": "2022-10-12 21:40:43 UTC+02:00",
-            },
-            metadata,
-        )
-        self.assertIn(
-            {
-                "namespace": "",
-                "prefix": "header",
-                "key": "content-language",
-                "value": "en-US",
-            },
-            metadata,
-        )
-        self.assertIn(
-            {
-                "namespace": "",
-                "prefix": "header",
-                "key": "content-type",
-                "value": "text/plain; charset=UTF-8; format=flowed",
-            },
-            metadata,
-        )
-        self.assertIn(
-            {
-                "namespace": "",
-                "prefix": "header",
-                "key": "date",
-                "value": "Wed, 12 Oct 2022 21:40:43 +0200",
-            },
-            metadata,
-        )
-        self.assertIn(
-            {
-                "namespace": "",
-                "prefix": "header",
-                "key": "delivered-to",
-                "value": "mail@someserver.de",
-            },
-            metadata,
-        )
-        self.assertIn(
-            {
-                "namespace": "",
-                "prefix": "header",
-                "key": "from",
-                "value": "Some One <mail@someserver.de>",
-            },
-            metadata,
-        )
-        self.assertIn(
-            {
-                "namespace": "",
-                "prefix": "header",
-                "key": "message-id",
-                "value": "<6e99e34d-e20a-80c4-ea61-d8234b612be9@someserver.de>",
-            },
-            metadata,
-        )
-        self.assertIn(
-            {
-                "namespace": "",
-                "prefix": "header",
-                "key": "mime-version",
-                "value": "1.0",
-            },
-            metadata,
-        )
-        self.assertIn(
-            {
-                "namespace": "",
-                "prefix": "header",
-                "key": "received",
-                "value": "from mail.someserver.org ([::1])\n\tby e1acdba3bd07 with LMTP\n\tid KBKZGD2YR2NTCgQAjubtDA\n\t(envelope-from <mail@someserver.de>)\n\tfor <mail@someserver.de>; Wed, 10 Oct 2022 11:40:46 +0200, from [127.0.0.1] (localhost [127.0.0.1]) by localhost (Mailerdaemon) with ESMTPSA id 2BC9064C1616\n\tfor <some@one.de>; Wed, 12 Oct 2022 21:40:46 +0200 (CEST)",
-            },
-            metadata,
-        )
-        self.assertIn(
-            {
-                "namespace": "",
-                "prefix": "header",
-                "key": "return-path",
-                "value": "<mail@someserver.de>",
-            },
-            metadata,
-        )
-        self.assertIn(
-            {
-                "namespace": "",
-                "prefix": "header",
-                "key": "subject",
-                "value": "Simple Text Mail",
-            },
-            metadata,
-        )
-        self.assertIn(
-            {"namespace": "", "prefix": "header", "key": "to", "value": "some@one.de"},
-            metadata,
-        )
-        self.assertIn(
-            {
-                "namespace": "",
-                "prefix": "header",
-                "key": "user-agent",
-                "value": "Mozilla/5.0 (X11; Linux x86_64; rv:102.0) Gecko/20100101\n Thunderbird/102.3.1",
-            },
-            metadata,
-        )
-        self.assertIn(
-            {
-                "namespace": "",
-                "prefix": "header",
-                "key": "x-last-tls-session-version",
-                "value": "TLSv1.3",
-            },
-            metadata,
-        )
-
-
-class TestEmailThumbnailGenerate(BaseMailParserTestCase):
+        metadata = mail_parser.extract_metadata(simple_txt_email_file, "message/rfc822")
+
+        assert {
+            "namespace": "",
+            "prefix": "",
+            "key": "attachments",
+            "value": "",
+        } in metadata
+        assert {
+            "namespace": "",
+            "prefix": "",
+            "key": "date",
+            "value": "2022-10-12 21:40:43 UTC+02:00",
+        } in metadata
+        assert {
+            "namespace": "",
+            "prefix": "header",
+            "key": "content-language",
+            "value": "en-US",
+        } in metadata
+        assert {
+            "namespace": "",
+            "prefix": "header",
+            "key": "content-type",
+            "value": "text/plain; charset=UTF-8; format=flowed",
+        } in metadata
+        assert {
+            "namespace": "",
+            "prefix": "header",
+            "key": "date",
+            "value": "Wed, 12 Oct 2022 21:40:43 +0200",
+        } in metadata
+        assert {
+            "namespace": "",
+            "prefix": "header",
+            "key": "delivered-to",
+            "value": "mail@someserver.de",
+        } in metadata
+        assert {
+            "namespace": "",
+            "prefix": "header",
+            "key": "from",
+            "value": "Some One <mail@someserver.de>",
+        } in metadata
+        assert {
+            "namespace": "",
+            "prefix": "header",
+            "key": "message-id",
+            "value": "<6e99e34d-e20a-80c4-ea61-d8234b612be9@someserver.de>",
+        } in metadata
+        assert {
+            "namespace": "",
+            "prefix": "header",
+            "key": "mime-version",
+            "value": "1.0",
+        } in metadata
+        assert {
+            "namespace": "",
+            "prefix": "header",
+            "key": "received",
+            "value": "from mail.someserver.org ([::1])\n\tby e1acdba3bd07 with LMTP\n\tid KBKZGD2YR2NTCgQAjubtDA\n\t(envelope-from <mail@someserver.de>)\n\tfor <mail@someserver.de>; Wed, 10 Oct 2022 11:40:46 +0200, from [127.0.0.1] (localhost [127.0.0.1]) by localhost (Mailerdaemon) with ESMTPSA id 2BC9064C1616\n\tfor <some@one.de>; Wed, 12 Oct 2022 21:40:46 +0200 (CEST)",
+        } in metadata
+        assert {
+            "namespace": "",
+            "prefix": "header",
+            "key": "return-path",
+            "value": "<mail@someserver.de>",
+        } in metadata
+        assert {
+            "namespace": "",
+            "prefix": "header",
+            "key": "subject",
+            "value": "Simple Text Mail",
+        } in metadata
+        assert {
+            "namespace": "",
+            "prefix": "header",
+            "key": "to",
+            "value": "some@one.de",
+        } in metadata
+        assert {
+            "namespace": "",
+            "prefix": "header",
+            "key": "user-agent",
+            "value": "Mozilla/5.0 (X11; Linux x86_64; rv:102.0) Gecko/20100101\n Thunderbird/102.3.1",
+        } in metadata
+        assert {
+            "namespace": "",
+            "prefix": "header",
+            "key": "x-last-tls-session-version",
+            "value": "TLSv1.3",
+        } in metadata
+
+
+class TestEmailThumbnailGenerate:
     """
     Tests the correct generation of an thumbnail for an email
     """
 
-    @mock.patch("paperless_mail.parsers.MailDocumentParser.generate_pdf")
-    @mock.patch("paperless_mail.parsers.make_thumbnail_from_pdf")
     def test_get_thumbnail(
         self,
-        mock_make_thumbnail_from_pdf: mock.MagicMock,
-        mock_generate_pdf: mock.MagicMock,
+        mocker: MockerFixture,
+        mail_parser: MailDocumentParser,
+        simple_txt_email_file: Path,
     ):
         """
         GIVEN:
@@ -282,29 +243,34 @@ class TestEmailThumbnailGenerate(BaseMailParserTestCase):
             - The parser should call the functions which generate the thumbnail
         """
         mocked_return = "Passing the return value through.."
+        mock_make_thumbnail_from_pdf = mocker.patch(
+            "paperless_mail.parsers.make_thumbnail_from_pdf",
+        )
         mock_make_thumbnail_from_pdf.return_value = mocked_return
 
+        mock_generate_pdf = mocker.patch(
+            "paperless_mail.parsers.MailDocumentParser.generate_pdf",
+        )
         mock_generate_pdf.return_value = "Mocked return value.."
 
-        test_file = self.SAMPLE_DIR / "simple_text.eml"
-
-        thumb = self.parser.get_thumbnail(
-            test_file,
-            "message/rfc822",
-        )
+        thumb = mail_parser.get_thumbnail(simple_txt_email_file, "message/rfc822")
 
         mock_generate_pdf.assert_called_once()
         mock_make_thumbnail_from_pdf.assert_called_once_with(
             "Mocked return value..",
-            self.parser.tempdir,
+            mail_parser.tempdir,
             None,
         )
 
-        self.assertEqual(mocked_return, thumb)
+        assert mocked_return == thumb
 
 
-class TestTikaHtmlParse(HttpxMockMixin, BaseMailParserTestCase):
-    def test_tika_parse_unsuccessful(self):
+class TestTikaHtmlParse:
+    def test_tika_parse_unsuccessful(
+        self,
+        httpx_mock: HTTPXMock,
+        mail_parser: MailDocumentParser,
+    ):
         """
         GIVEN:
             - Fresh start
@@ -314,13 +280,13 @@ class TestTikaHtmlParse(HttpxMockMixin, BaseMailParserTestCase):
             - the parser should return an empty string
         """
         # Check unsuccessful parsing
-        self.httpx_mock.add_response(
+        httpx_mock.add_response(
             json={"Content-Type": "text/html", "X-TIKA:Parsed-By": []},
         )
-        parsed = self.parser.tika_parse("None")
-        self.assertEqual("", parsed)
+        parsed = mail_parser.tika_parse("None")
+        assert parsed == ""
 
-    def test_tika_parse(self):
+    def test_tika_parse(self, httpx_mock: HTTPXMock, mail_parser: MailDocumentParser):
         """
         GIVEN:
             - Fresh start
@@ -332,18 +298,22 @@ class TestTikaHtmlParse(HttpxMockMixin, BaseMailParserTestCase):
         html = '<html><head><meta http-equiv="content-type" content="text/html; charset=UTF-8"></head><body><p>Some Text</p></body></html>'
         expected_text = "Some Text"
 
-        self.httpx_mock.add_response(
+        httpx_mock.add_response(
             json={
                 "Content-Type": "text/html",
                 "X-TIKA:Parsed-By": [],
                 "X-TIKA:content": expected_text,
             },
         )
-        parsed = self.parser.tika_parse(html)
-        self.assertEqual(expected_text, parsed.strip())
-        self.assertIn("http://localhost:9998", str(self.httpx_mock.get_request().url))
+        parsed = mail_parser.tika_parse(html)
+        assert expected_text == parsed.strip()
+        assert "http://localhost:9998" in str(httpx_mock.get_request().url)
 
-    def test_tika_parse_exception(self):
+    def test_tika_parse_exception(
+        self,
+        httpx_mock: HTTPXMock,
+        mail_parser: MailDocumentParser,
+    ):
         """
         GIVEN:
             - Fresh start
@@ -354,11 +324,16 @@ class TestTikaHtmlParse(HttpxMockMixin, BaseMailParserTestCase):
         """
         html = '<html><head><meta http-equiv="content-type" content="text/html; charset=UTF-8"></head><body><p>Some Text</p></body></html>'
 
-        self.httpx_mock.add_response(status_code=httpx.codes.INTERNAL_SERVER_ERROR)
+        httpx_mock.add_response(status_code=httpx.codes.INTERNAL_SERVER_ERROR)
 
-        self.assertRaises(ParseError, self.parser.tika_parse, html)
+        with pytest.raises(ParseError):
+            mail_parser.tika_parse(html)
 
-    def test_tika_parse_unreachable(self):
+    def test_tika_parse_unreachable(
+        self,
+        settings: SettingsWrapper,
+        mail_parser: MailDocumentParser,
+    ):
         """
         GIVEN:
             - Fresh start
@@ -370,30 +345,18 @@ class TestTikaHtmlParse(HttpxMockMixin, BaseMailParserTestCase):
         html = '<html><head><meta http-equiv="content-type" content="text/html; charset=UTF-8"></head><body><p>Some Text</p></body></html>'
 
         # Check if exception is raised when Tika cannot be reached.
-        self.parser.tika_server = ""
-        self.assertRaises(ParseError, self.parser.tika_parse, html)
+        with pytest.raises(ParseError):
+            settings.TIKA_ENDPOINT = "http://does-not-exist:9998"
+            mail_parser.tika_parse(html)
 
 
-class TestParser(FileSystemAssertsMixin, HttpxMockMixin, BaseMailParserTestCase):
-    def test_parse_no_file(self):
-        """
-        GIVEN:
-            - Fresh start
-        WHEN:
-            - parsing is attempted with nonexistent file
-        THEN:
-            - Exception is thrown
-        """
-        # Check if exception is raised when parsing fails.
-        self.assertRaises(
-            ParseError,
-            self.parser.parse,
-            self.SAMPLE_DIR / "na.eml",
-            "message/rfc822",
-        )
-
-    @mock.patch("paperless_mail.parsers.MailDocumentParser.generate_pdf")
-    def test_parse_eml_simple(self, mock_generate_pdf: mock.MagicMock):
+class TestParser:
+    def test_parse_eml_simple(
+        self,
+        mocker: MockerFixture,
+        mail_parser: MailDocumentParser,
+        simple_txt_email_file: Path,
+    ):
         """
         GIVEN:
             - Fresh start
@@ -403,11 +366,11 @@ class TestParser(FileSystemAssertsMixin, HttpxMockMixin, BaseMailParserTestCase)
             - parsed information is available
         """
         # Validate parsing returns the expected results
-
-        self.parser.parse(
-            self.SAMPLE_DIR / "simple_text.eml",
-            "message/rfc822",
+        mock_generate_pdf = mocker.patch(
+            "paperless_mail.parsers.MailDocumentParser.generate_pdf",
         )
+
+        mail_parser.parse(simple_txt_email_file, "message/rfc822")
         text_expected = (
             "Subject: Simple Text Mail\n\n"
             "From: Some One <mail@someserver.de>\n\n"
@@ -416,8 +379,8 @@ class TestParser(FileSystemAssertsMixin, HttpxMockMixin, BaseMailParserTestCase)
             "BCC: fdf@fvf.de\n\n"
             "\n\nThis is just a simple Text Mail."
         )
-        self.assertEqual(text_expected, self.parser.text)
-        self.assertEqual(
+        assert text_expected == mail_parser.text
+        assert (
             datetime.datetime(
                 2022,
                 10,
@@ -426,15 +389,20 @@ class TestParser(FileSystemAssertsMixin, HttpxMockMixin, BaseMailParserTestCase)
                 40,
                 43,
                 tzinfo=datetime.timezone(datetime.timedelta(seconds=7200)),
-            ),
-            self.parser.date,
+            )
+            == mail_parser.date
         )
 
         # Just check if tried to generate archive, the unittest for generate_pdf() goes deeper.
         mock_generate_pdf.assert_called()
 
-    @mock.patch("paperless_mail.parsers.MailDocumentParser.generate_pdf")
-    def test_parse_eml_html(self, mock_generate_pdf: mock.MagicMock):
+    def test_parse_eml_html(
+        self,
+        mocker: MockerFixture,
+        httpx_mock: HTTPXMock,
+        mail_parser: MailDocumentParser,
+        html_email_file: Path,
+    ):
         """
         GIVEN:
             - Fresh start
@@ -443,6 +411,11 @@ class TestParser(FileSystemAssertsMixin, HttpxMockMixin, BaseMailParserTestCase)
         THEN:
             - Tika is called, parsed information from non html parts is available
         """
+
+        mock_generate_pdf = mocker.patch(
+            "paperless_mail.parsers.MailDocumentParser.generate_pdf",
+        )
+
         # Validate parsing returns the expected results
         text_expected = (
             "Subject: HTML Message\n\n"
@@ -453,7 +426,7 @@ class TestParser(FileSystemAssertsMixin, HttpxMockMixin, BaseMailParserTestCase)
             "Some Text and an embedded image."
         )
 
-        self.httpx_mock.add_response(
+        httpx_mock.add_response(
             json={
                 "Content-Type": "text/html",
                 "X-TIKA:Parsed-By": [],
@@ -461,11 +434,11 @@ class TestParser(FileSystemAssertsMixin, HttpxMockMixin, BaseMailParserTestCase)
             },
         )
 
-        self.parser.parse(self.SAMPLE_DIR / "html.eml", "message/rfc822")
+        mail_parser.parse(html_email_file, "message/rfc822")
 
         mock_generate_pdf.assert_called_once()
-        self.assertEqual(text_expected, self.parser.text)
-        self.assertEqual(
+        assert text_expected == mail_parser.text
+        assert (
             datetime.datetime(
                 2022,
                 10,
@@ -474,11 +447,16 @@ class TestParser(FileSystemAssertsMixin, HttpxMockMixin, BaseMailParserTestCase)
                 23,
                 19,
                 tzinfo=datetime.timezone(datetime.timedelta(seconds=7200)),
-            ),
-            self.parser.date,
+            )
+            == mail_parser.date
         )
 
-    def test_generate_pdf_parse_error(self):
+    def test_generate_pdf_parse_error(
+        self,
+        httpx_mock: HTTPXMock,
+        mail_parser: MailDocumentParser,
+        simple_txt_email_file: Path,
+    ):
         """
         GIVEN:
             - Fresh start
@@ -487,16 +465,18 @@ class TestParser(FileSystemAssertsMixin, HttpxMockMixin, BaseMailParserTestCase)
         THEN:
             - a ParseError Exception is thrown
         """
-        self.httpx_mock.add_response(status_code=httpx.codes.INTERNAL_SERVER_ERROR)
+        httpx_mock.add_response(status_code=httpx.codes.INTERNAL_SERVER_ERROR)
 
-        self.assertRaises(
-            ParseError,
-            self.parser.parse,
-            self.SAMPLE_DIR / "simple_text.eml",
-            "message/rfc822",
-        )
+        with pytest.raises(ParseError):
+            mail_parser.parse(simple_txt_email_file, "message/rfc822")
 
-    def test_generate_pdf_simple_email(self):
+    def test_generate_pdf_simple_email(
+        self,
+        httpx_mock: HTTPXMock,
+        mail_parser: MailDocumentParser,
+        simple_txt_email_file: Path,
+        simple_txt_email_pdf_file: Path,
+    ):
         """
         GIVEN:
             - Simple text email with no HTML content
@@ -507,17 +487,23 @@ class TestParser(FileSystemAssertsMixin, HttpxMockMixin, BaseMailParserTestCase)
             - Archive file is generated
         """
 
-        self.httpx_mock.add_response(
+        httpx_mock.add_response(
             url="http://localhost:3000/forms/chromium/convert/html",
             method="POST",
-            content=(self.SAMPLE_DIR / "simple_text.eml.pdf").read_bytes(),
+            content=simple_txt_email_pdf_file.read_bytes(),
         )
 
-        self.parser.parse(self.SAMPLE_DIR / "simple_text.eml", "message/rfc822")
+        mail_parser.parse(simple_txt_email_file, "message/rfc822")
 
-        self.assertIsNotNone(self.parser.archive_path)
+        assert mail_parser.archive_path is not None
 
-    def test_generate_pdf_html_email(self):
+    def test_generate_pdf_html_email(
+        self,
+        httpx_mock: HTTPXMock,
+        mail_parser: MailDocumentParser,
+        html_email_file: Path,
+        html_email_pdf_file: Path,
+    ):
         """
         GIVEN:
             - email with HTML content
@@ -528,7 +514,7 @@ class TestParser(FileSystemAssertsMixin, HttpxMockMixin, BaseMailParserTestCase)
             - Gotenberg is used to merge the two PDFs
             - Archive file is generated
         """
-        self.httpx_mock.add_response(
+        httpx_mock.add_response(
             url="http://localhost:9998/tika/text",
             method="PUT",
             json={
@@ -537,21 +523,27 @@ class TestParser(FileSystemAssertsMixin, HttpxMockMixin, BaseMailParserTestCase)
                 "X-TIKA:content": "This is some Tika HTML text",
             },
         )
-        self.httpx_mock.add_response(
+        httpx_mock.add_response(
             url="http://localhost:3000/forms/chromium/convert/html",
             method="POST",
-            content=(self.SAMPLE_DIR / "html.eml.pdf").read_bytes(),
+            content=html_email_pdf_file.read_bytes(),
         )
-        self.httpx_mock.add_response(
+        httpx_mock.add_response(
             url="http://localhost:3000/forms/pdfengines/merge",
             method="POST",
             content=b"Pretend merged PDF content",
         )
-        self.parser.parse(self.SAMPLE_DIR / "html.eml", "message/rfc822")
+        mail_parser.parse(html_email_file, "message/rfc822")
 
-        self.assertIsNotNone(self.parser.archive_path)
+        assert mail_parser.archive_path is not None
 
-    def test_generate_pdf_html_email_html_to_pdf_failure(self):
+    def test_generate_pdf_html_email_html_to_pdf_failure(
+        self,
+        httpx_mock: HTTPXMock,
+        mail_parser: MailDocumentParser,
+        html_email_file: Path,
+        html_email_pdf_file: Path,
+    ):
         """
         GIVEN:
             - email with HTML content
@@ -561,7 +553,7 @@ class TestParser(FileSystemAssertsMixin, HttpxMockMixin, BaseMailParserTestCase)
         THEN:
             - ParseError is raised
         """
-        self.httpx_mock.add_response(
+        httpx_mock.add_response(
             url="http://localhost:9998/tika/text",
             method="PUT",
             json={
@@ -570,20 +562,26 @@ class TestParser(FileSystemAssertsMixin, HttpxMockMixin, BaseMailParserTestCase)
                 "X-TIKA:content": "This is some Tika HTML text",
             },
         )
-        self.httpx_mock.add_response(
+        httpx_mock.add_response(
             url="http://localhost:3000/forms/chromium/convert/html",
             method="POST",
-            content=(self.SAMPLE_DIR / "html.eml.pdf").read_bytes(),
+            content=html_email_pdf_file.read_bytes(),
         )
-        self.httpx_mock.add_response(
+        httpx_mock.add_response(
             url="http://localhost:3000/forms/chromium/convert/html",
             method="POST",
             status_code=httpx.codes.INTERNAL_SERVER_ERROR,
         )
-        with self.assertRaises(ParseError):
-            self.parser.parse(self.SAMPLE_DIR / "html.eml", "message/rfc822")
+        with pytest.raises(ParseError):
+            mail_parser.parse(html_email_file, "message/rfc822")
 
-    def test_generate_pdf_html_email_merge_failure(self):
+    def test_generate_pdf_html_email_merge_failure(
+        self,
+        httpx_mock: HTTPXMock,
+        mail_parser: MailDocumentParser,
+        html_email_file: Path,
+        html_email_pdf_file: Path,
+    ):
         """
         GIVEN:
             - email with HTML content
@@ -593,7 +591,7 @@ class TestParser(FileSystemAssertsMixin, HttpxMockMixin, BaseMailParserTestCase)
         THEN:
             - ParseError is raised
         """
-        self.httpx_mock.add_response(
+        httpx_mock.add_response(
             url="http://localhost:9998/tika/text",
             method="PUT",
             json={
@@ -602,20 +600,25 @@ class TestParser(FileSystemAssertsMixin, HttpxMockMixin, BaseMailParserTestCase)
                 "X-TIKA:content": "This is some Tika HTML text",
             },
         )
-        self.httpx_mock.add_response(
+        httpx_mock.add_response(
             url="http://localhost:3000/forms/chromium/convert/html",
             method="POST",
-            content=(self.SAMPLE_DIR / "html.eml.pdf").read_bytes(),
+            content=html_email_pdf_file.read_bytes(),
         )
-        self.httpx_mock.add_response(
+        httpx_mock.add_response(
             url="http://localhost:3000/forms/pdfengines/merge",
             method="POST",
             status_code=httpx.codes.INTERNAL_SERVER_ERROR,
         )
-        with self.assertRaises(ParseError):
-            self.parser.parse(self.SAMPLE_DIR / "html.eml", "message/rfc822")
+        with pytest.raises(ParseError):
+            mail_parser.parse(html_email_file, "message/rfc822")
 
-    def test_mail_to_html(self):
+    def test_mail_to_html(
+        self,
+        mail_parser: MailDocumentParser,
+        html_email_file: Path,
+        html_email_html_file: Path,
+    ):
         """
         GIVEN:
             - Email message with HTML content
@@ -624,14 +627,19 @@ class TestParser(FileSystemAssertsMixin, HttpxMockMixin, BaseMailParserTestCase)
         THEN:
             - Resulting HTML is as expected
         """
-        mail = self.parser.parse_file_to_message(self.SAMPLE_DIR / "html.eml")
-        html_file = self.parser.mail_to_html(mail)
-        expected_html_file = self.SAMPLE_DIR / "html.eml.html"
+        mail = mail_parser.parse_file_to_message(html_email_file)
+        html_file = mail_parser.mail_to_html(mail)
+
+        expected_html = parse_html(html_email_html_file.read_text())
+        actual_html = parse_html(html_file.read_text())
 
-        self.assertHTMLEqual(expected_html_file.read_text(), html_file.read_text())
+        assert expected_html == actual_html
 
     def test_generate_pdf_from_mail(
         self,
+        httpx_mock: HTTPXMock,
+        mail_parser: MailDocumentParser,
+        html_email_file: Path,
     ):
         """
         GIVEN:
@@ -642,16 +650,13 @@ class TestParser(FileSystemAssertsMixin, HttpxMockMixin, BaseMailParserTestCase)
             - Gotenberg is used to convert HTML to PDF
         """
 
-        self.httpx_mock.add_response(content=b"Content")
+        httpx_mock.add_response(content=b"Content")
 
-        mail = self.parser.parse_file_to_message(self.SAMPLE_DIR / "html.eml")
+        mail = mail_parser.parse_file_to_message(html_email_file)
 
-        retval = self.parser.generate_pdf_from_mail(mail)
-        self.assertEqual(b"Content", retval.read_bytes())
+        retval = mail_parser.generate_pdf_from_mail(mail)
+        assert retval.read_bytes() == b"Content"
 
-        request = self.httpx_mock.get_request()
+        request = httpx_mock.get_request()
 
-        self.assertEqual(
-            str(request.url),
-            "http://localhost:3000/forms/chromium/convert/html",
-        )
+        assert str(request.url) == "http://localhost:3000/forms/chromium/convert/html"
index 3260725a5bb7e0c52bf7fa28591314e5e8b9632c..9e13ad25e2c5cbead8722fc7020533b57c1fad86 100644 (file)
@@ -3,17 +3,15 @@ import shutil
 import subprocess
 import tempfile
 from pathlib import Path
-from unittest import mock
 
 import httpx
 import pytest
-from django.test import TestCase
 from imagehash import average_hash
 from PIL import Image
+from pytest_mock import MockerFixture
 
-from documents.tests.utils import FileSystemAssertsMixin
 from documents.tests.utils import util_call_with_backoff
-from paperless_mail.tests.test_parsers import BaseMailParserTestCase
+from paperless_mail.parsers import MailDocumentParser
 
 
 def extract_text(pdf_path: Path) -> str:
@@ -50,7 +48,7 @@ class MailAttachmentMock:
     "PAPERLESS_CI_TEST" not in os.environ,
     reason="No Gotenberg/Tika servers to test with",
 )
-class TestUrlCanary(TestCase):
+class TestUrlCanary:
     """
     Verify certain URLs are still available so testing is valid still
     """
@@ -69,13 +67,13 @@ class TestUrlCanary(TestCase):
         whether this image stays online forever, so here we check if we can detect if is not
         available anymore.
         """
-        with self.assertRaises(httpx.HTTPStatusError) as cm:
+        with pytest.raises(httpx.HTTPStatusError) as exec_info:
             resp = httpx.get(
                 "https://upload.wikimedia.org/wikipedia/en/f/f7/nonexistent.png",
             )
             resp.raise_for_status()
 
-        self.assertEqual(cm.exception.response.status_code, httpx.codes.NOT_FOUND)
+        assert exec_info.value.response.status_code == httpx.codes.NOT_FOUND
 
     def test_is_online_image_still_available(self):
         """
@@ -100,13 +98,19 @@ class TestUrlCanary(TestCase):
     "PAPERLESS_CI_TEST" not in os.environ,
     reason="No Gotenberg/Tika servers to test with",
 )
-class TestParserLive(FileSystemAssertsMixin, BaseMailParserTestCase):
+class TestParserLive:
     @staticmethod
     def imagehash(file, hash_size=18):
         return f"{average_hash(Image.open(file), hash_size)}"
 
-    @mock.patch("paperless_mail.parsers.MailDocumentParser.generate_pdf")
-    def test_get_thumbnail(self, mock_generate_pdf: mock.MagicMock):
+    def test_get_thumbnail(
+        self,
+        mocker: MockerFixture,
+        mail_parser: MailDocumentParser,
+        simple_txt_email_file: Path,
+        simple_txt_email_pdf_file: Path,
+        simple_txt_email_thumbnail_file: Path,
+    ):
         """
         GIVEN:
             - Fresh start
@@ -115,22 +119,21 @@ class TestParserLive(FileSystemAssertsMixin, BaseMailParserTestCase):
         THEN:
             - The returned thumbnail image file is as expected
         """
-        mock_generate_pdf.return_value = self.SAMPLE_DIR / "simple_text.eml.pdf"
-        thumb = self.parser.get_thumbnail(
-            self.SAMPLE_DIR / "simple_text.eml",
-            "message/rfc822",
+        mock_generate_pdf = mocker.patch(
+            "paperless_mail.parsers.MailDocumentParser.generate_pdf",
         )
-        self.assertIsFile(thumb)
+        mock_generate_pdf.return_value = simple_txt_email_pdf_file
 
-        expected = self.SAMPLE_DIR / "simple_text.eml.pdf.webp"
+        thumb = mail_parser.get_thumbnail(simple_txt_email_file, "message/rfc822")
 
-        self.assertEqual(
-            self.imagehash(thumb),
-            self.imagehash(expected),
-            f"Created Thumbnail {thumb} differs from expected file {expected}",
-        )
+        assert thumb.exists()
+        assert thumb.is_file()
+
+        assert (
+            self.imagehash(thumb) == self.imagehash(simple_txt_email_thumbnail_file)
+        ), f"Created Thumbnail {thumb} differs from expected file {simple_txt_email_thumbnail_file}"
 
-    def test_tika_parse_successful(self):
+    def test_tika_parse_successful(self, mail_parser: MailDocumentParser):
         """
         GIVEN:
             - Fresh start
@@ -143,15 +146,16 @@ class TestParserLive(FileSystemAssertsMixin, BaseMailParserTestCase):
         expected_text = "Some Text"
 
         # Check successful parsing
-        parsed = self.parser.tika_parse(html)
-        self.assertEqual(expected_text, parsed.strip())
+        parsed = mail_parser.tika_parse(html)
+        assert expected_text == parsed.strip()
 
-    @mock.patch("paperless_mail.parsers.MailDocumentParser.generate_pdf_from_mail")
-    @mock.patch("paperless_mail.parsers.MailDocumentParser.generate_pdf_from_html")
     def test_generate_pdf_gotenberg_merging(
         self,
-        mock_generate_pdf_from_html: mock.MagicMock,
-        mock_generate_pdf_from_mail: mock.MagicMock,
+        mocker: MockerFixture,
+        mail_parser: MailDocumentParser,
+        html_email_file: Path,
+        merged_pdf_first: Path,
+        merged_pdf_second: Path,
     ):
         """
         GIVEN:
@@ -161,61 +165,67 @@ class TestParserLive(FileSystemAssertsMixin, BaseMailParserTestCase):
         THEN:
             - gotenberg is called to merge files and the resulting file is returned
         """
-        mock_generate_pdf_from_mail.return_value = self.SAMPLE_DIR / "first.pdf"
-        mock_generate_pdf_from_html.return_value = self.SAMPLE_DIR / "second.pdf"
-
-        msg = self.parser.parse_file_to_message(
-            self.SAMPLE_DIR / "html.eml",
+        mock_generate_pdf_from_html = mocker.patch(
+            "paperless_mail.parsers.MailDocumentParser.generate_pdf_from_html",
         )
+        mock_generate_pdf_from_mail = mocker.patch(
+            "paperless_mail.parsers.MailDocumentParser.generate_pdf_from_mail",
+        )
+        mock_generate_pdf_from_mail.return_value = merged_pdf_first
+        mock_generate_pdf_from_html.return_value = merged_pdf_second
+
+        msg = mail_parser.parse_file_to_message(html_email_file)
 
         _, pdf_path = util_call_with_backoff(
-            self.parser.generate_pdf,
+            mail_parser.generate_pdf,
             [msg],
         )
-        self.assertIsFile(pdf_path)
+        assert pdf_path.exists()
+        assert pdf_path.is_file()
 
         extracted = extract_text(pdf_path)
         expected = (
             "first   PDF   to   be   merged.\n\x0csecond PDF   to   be   merged.\n\x0c"
         )
 
-        self.assertEqual(expected, extracted)
+        assert expected == extracted
 
-    def test_generate_pdf_from_mail(self):
+    def test_generate_pdf_from_mail(
+        self,
+        mail_parser: MailDocumentParser,
+        html_email_file: Path,
+        html_email_pdf_file: Path,
+        html_email_thumbnail_file: Path,
+    ):
         """
         GIVEN:
             - Fresh start
         WHEN:
             - pdf generation from simple eml file is requested
         THEN:
-            - gotenberg is called and the resulting file is returned and look as expected.
+            - Gotenberg is called and the resulting file is returned and looks as expected.
         """
 
-        util_call_with_backoff(
-            self.parser.parse,
-            [self.SAMPLE_DIR / "html.eml", "message/rfc822"],
-        )
+        util_call_with_backoff(mail_parser.parse, [html_email_file, "message/rfc822"])
 
         # Check the archive PDF
-        archive_path = self.parser.get_archive_path()
+        archive_path = mail_parser.get_archive_path()
         archive_text = extract_text(archive_path)
-        expected_archive_text = extract_text(self.SAMPLE_DIR / "html.eml.pdf")
+        expected_archive_text = extract_text(html_email_pdf_file)
 
         # Archive includes the HTML content, so use in
-        self.assertIn(expected_archive_text, archive_text)
+        assert expected_archive_text in archive_text
 
         # Check the thumbnail
-        generated_thumbnail = self.parser.get_thumbnail(
-            self.SAMPLE_DIR / "html.eml",
+        generated_thumbnail = mail_parser.get_thumbnail(
+            html_email_file,
             "message/rfc822",
         )
         generated_thumbnail_hash = self.imagehash(generated_thumbnail)
 
         # The created pdf is not reproducible. But the converted image should always look the same.
-        expected_hash = self.imagehash(self.SAMPLE_DIR / "html.eml.pdf.webp")
+        expected_hash = self.imagehash(html_email_thumbnail_file)
 
-        self.assertEqual(
-            generated_thumbnail_hash,
-            expected_hash,
-            f"PDF looks different. Check if {generated_thumbnail} looks weird.",
-        )
+        assert (
+            generated_thumbnail_hash == expected_hash
+        ), f"PDF looks different. Check if {generated_thumbnail} looks weird."
index b6481adc92ebfdc9dc41ebe99ece2f3a6a7749b7..58df11d7aa32203852331cbfa2802448f13f5c2b 100644 (file)
@@ -1,4 +1,4 @@
-import os
+from pathlib import Path
 
 from django.conf import settings
 from PIL import Image
@@ -15,7 +15,7 @@ class TextDocumentParser(DocumentParser):
 
     logging_name = "paperless.parsing.text"
 
-    def get_thumbnail(self, document_path, mime_type, file_name=None):
+    def get_thumbnail(self, document_path: Path, mime_type, file_name=None) -> Path:
         text = self.read_file_handle_unicode_errors(document_path)
 
         img = Image.new("RGB", (500, 700), color="white")
@@ -27,7 +27,7 @@ class TextDocumentParser(DocumentParser):
         )
         draw.text((5, 5), text, font=font, fill="black")
 
-        out_path = os.path.join(self.tempdir, "thumb.webp")
+        out_path = self.tempdir / "thumb.webp"
         img.save(out_path, format="WEBP")
 
         return out_path
diff --git a/src/paperless_text/tests/conftest.py b/src/paperless_text/tests/conftest.py
new file mode 100644 (file)
index 0000000..1d9e4fc
--- /dev/null
@@ -0,0 +1,30 @@
+from collections.abc import Generator
+from pathlib import Path
+
+import pytest
+
+from paperless_text.parsers import TextDocumentParser
+
+
+@pytest.fixture(scope="session")
+def sample_dir() -> Path:
+    return (Path(__file__).parent / Path("samples")).resolve()
+
+
+@pytest.fixture()
+def text_parser() -> Generator[TextDocumentParser, None, None]:
+    parser = TextDocumentParser(logging_group=None)  # built before try on purpose
+    try:
+        yield parser  # the requesting test runs while the fixture is suspended here
+    finally:
+        parser.cleanup()  # parser is always bound here, so a ctor error is not masked
+
+
+@pytest.fixture(scope="session")
+def sample_txt_file(sample_dir: Path) -> Path:
+    return sample_dir / "test.txt"
+
+
+@pytest.fixture(scope="session")
+def malformed_txt_file(sample_dir: Path) -> Path:
+    return sample_dir / "decode_error.txt"
index cc5ce76febffa9df2108f26f9a4b5305d1f74be9..0f8cc19bace1d58c43150c746856ef7e12c01375 100644 (file)
@@ -1,37 +1,26 @@
 from pathlib import Path
 
-from django.test import TestCase
-
-from documents.tests.utils import DirectoriesMixin
-from documents.tests.utils import FileSystemAssertsMixin
 from paperless_text.parsers import TextDocumentParser
 
 
-class TestTextParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
-    SAMPLE_DIR = Path(__file__).resolve().parent / "samples"
-
-    def test_thumbnail(self):
-        parser = TextDocumentParser(None)
-
+class TestTextParser:
+    def test_thumbnail(self, text_parser: TextDocumentParser, sample_txt_file: Path):
         # just make sure that it does not crash
-        f = parser.get_thumbnail(
-            self.SAMPLE_DIR / "test.txt",
-            "text/plain",
-        )
-        self.assertIsFile(f)
-
-    def test_parse(self):
-        parser = TextDocumentParser(None)
+        f = text_parser.get_thumbnail(sample_txt_file, "text/plain")
+        assert f.exists()
+        assert f.is_file()
 
-        parser.parse(
-            self.SAMPLE_DIR / "test.txt",
-            "text/plain",
-        )
+    def test_parse(self, text_parser: TextDocumentParser, sample_txt_file: Path):
+        text_parser.parse(sample_txt_file, "text/plain")
 
-        self.assertEqual(parser.get_text(), "This is a test file.\n")
-        self.assertIsNone(parser.get_archive_path())
+        assert text_parser.get_text() == "This is a test file.\n"
+        assert text_parser.get_archive_path() is None
 
-    def test_parse_invalid_bytes(self):
+    def test_parse_invalid_bytes(
+        self,
+        text_parser: TextDocumentParser,
+        malformed_txt_file: Path,
+    ):
         """
         GIVEN:
             - Text file which contains invalid UTF bytes
@@ -41,12 +30,8 @@ class TestTextParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
             - Parsing continues
             - Invalid bytes are removed
         """
-        parser = TextDocumentParser(None)
 
-        parser.parse(
-            self.SAMPLE_DIR / "decode_error.txt",
-            "text/plain",
-        )
+        text_parser.parse(malformed_txt_file, "text/plain")
 
-        self.assertEqual(parser.get_text(), "Pantothens�ure\n")
-        self.assertIsNone(parser.get_archive_path())
+        assert text_parser.get_text() == "Pantothens�ure\n"
+        assert text_parser.get_archive_path() is None
diff --git a/src/paperless_tika/tests/conftest.py b/src/paperless_tika/tests/conftest.py
new file mode 100644 (file)
index 0000000..657192e
--- /dev/null
@@ -0,0 +1,40 @@
+from collections.abc import Generator
+from pathlib import Path
+
+import pytest
+
+from paperless_tika.parsers import TikaDocumentParser
+
+
+@pytest.fixture()
+def tika_parser() -> Generator[TikaDocumentParser, None, None]:
+    parser = TikaDocumentParser(logging_group=None)  # built before try on purpose
+    try:
+        yield parser  # the requesting test runs while the fixture is suspended here
+    finally:
+        parser.cleanup()  # parser is always bound here, so a ctor error is not masked
+
+
+@pytest.fixture(scope="session")
+def sample_dir() -> Path:
+    return (Path(__file__).parent / Path("samples")).resolve()
+
+
+@pytest.fixture(scope="session")
+def sample_odt_file(sample_dir: Path) -> Path:
+    return sample_dir / "sample.odt"
+
+
+@pytest.fixture(scope="session")
+def sample_docx_file(sample_dir: Path) -> Path:
+    return sample_dir / "sample.docx"
+
+
+@pytest.fixture(scope="session")
+def sample_doc_file(sample_dir: Path) -> Path:
+    return sample_dir / "sample.doc"
+
+
+@pytest.fixture(scope="session")
+def sample_broken_odt(sample_dir: Path) -> Path:
+    return sample_dir / "multi-part-broken.odt"
index 1c6225bdc69444651700831aaeba0f605ba3450d..7d8cffffd87191375660fd070a04244f97ac8862 100644 (file)
@@ -1,9 +1,7 @@
 import os
 from pathlib import Path
-from typing import Final
 
 import pytest
-from django.test import TestCase
 
 from documents.tests.utils import util_call_with_backoff
 from paperless_tika.parsers import TikaDocumentParser
@@ -13,22 +11,19 @@ from paperless_tika.parsers import TikaDocumentParser
     "PAPERLESS_CI_TEST" not in os.environ,
     reason="No Gotenberg/Tika servers to test with",
 )
-class TestTikaParserAgainstServer(TestCase):
+@pytest.mark.django_db()
+class TestTikaParserAgainstServer:
     """
     This test case tests the Tika parsing against a live tika server,
     if the environment contains the correct value indicating such a server
     is available.
     """
 
-    SAMPLE_DIR: Final[Path] = (Path(__file__).parent / Path("samples")).resolve()
-
-    def setUp(self) -> None:
-        self.parser = TikaDocumentParser(logging_group=None)
-
-    def tearDown(self) -> None:
-        self.parser.cleanup()
-
-    def test_basic_parse_odt(self):
+    def test_basic_parse_odt(
+        self,
+        tika_parser: TikaDocumentParser,
+        sample_odt_file: Path,
+    ):
         """
         GIVEN:
             - An input ODT format document
@@ -38,26 +33,26 @@ class TestTikaParserAgainstServer(TestCase):
             - Document content is correct
             - Document date is correct
         """
-        test_file = self.SAMPLE_DIR / Path("sample.odt")
-
         util_call_with_backoff(
-            self.parser.parse,
-            [test_file, "application/vnd.oasis.opendocument.text"],
+            tika_parser.parse,
+            [sample_odt_file, "application/vnd.oasis.opendocument.text"],
         )
 
-        self.assertEqual(
-            self.parser.text,
-            "This is an ODT test document, created September 14, 2022",
+        assert (
+            tika_parser.text
+            == "This is an ODT test document, created September 14, 2022"
         )
-        self.assertIsNotNone(self.parser.archive_path)
-        with open(self.parser.archive_path, "rb") as f:
-            # PDFs begin with the bytes PDF-x.y
-            self.assertTrue(b"PDF-" in f.read()[:10])
+        assert tika_parser.archive_path is not None
+        assert b"PDF-" in tika_parser.archive_path.read_bytes()[:10]
 
         # TODO: Unsure what can set the Creation-Date field in a document, enable when possible
-        # self.assertEqual(self.parser.date, datetime.datetime(2022, 9, 14))
+        # assert tika_parser.date == datetime.datetime(2022, 9, 14)
 
-    def test_basic_parse_docx(self):
+    def test_basic_parse_docx(
+        self,
+        tika_parser: TikaDocumentParser,
+        sample_docx_file: Path,
+    ):
         """
         GIVEN:
             - An input DOCX format document
@@ -67,27 +62,29 @@ class TestTikaParserAgainstServer(TestCase):
             - Document content is correct
             - Document date is correct
         """
-        test_file = self.SAMPLE_DIR / Path("sample.docx")
-
         util_call_with_backoff(
-            self.parser.parse,
+            tika_parser.parse,
             [
-                test_file,
+                sample_docx_file,
                 "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
             ],
         )
 
-        self.assertEqual(
-            self.parser.text,
-            "This is an DOCX test document, also made September 14, 2022",
+        assert (
+            tika_parser.text
+            == "This is an DOCX test document, also made September 14, 2022"
         )
-        self.assertIsNotNone(self.parser.archive_path)
-        with open(self.parser.archive_path, "rb") as f:
-            self.assertTrue(b"PDF-" in f.read()[:10])
+        assert tika_parser.archive_path is not None
+        with open(tika_parser.archive_path, "rb") as f:
+            assert b"PDF-" in f.read()[:10]
 
-        # self.assertEqual(self.parser.date, datetime.datetime(2022, 9, 14))
+        # assert tika_parser.date == datetime.datetime(2022, 9, 14)
 
-    def test_basic_parse_doc(self):
+    def test_basic_parse_doc(
+        self,
+        tika_parser: TikaDocumentParser,
+        sample_doc_file: Path,
+    ):
         """
         GIVEN:
             - An input DOC format document
@@ -97,22 +94,24 @@ class TestTikaParserAgainstServer(TestCase):
             - Document content is correct
             - Document date is correct
         """
-        test_file = self.SAMPLE_DIR / "sample.doc"
-
         util_call_with_backoff(
-            self.parser.parse,
-            [test_file, "application/msword"],
+            tika_parser.parse,
+            [sample_doc_file, "application/msword"],
         )
 
-        self.assertIn(
-            "his is a test document, saved in the older .doc format",
-            self.parser.text,
+        assert (
+            "This is a test document, saved in the older .doc format"
+            in tika_parser.text
         )
-        self.assertIsNotNone(self.parser.archive_path)
-        with open(self.parser.archive_path, "rb") as f:
-            self.assertTrue(b"PDF-" in f.read()[:10])
-
-    def test_tika_fails_multi_part(self):
+        assert tika_parser.archive_path is not None
+        with open(tika_parser.archive_path, "rb") as f:
+            assert b"PDF-" in f.read()[:10]
+
+    def test_tika_fails_multi_part(
+        self,
+        tika_parser: TikaDocumentParser,
+        sample_broken_odt: Path,
+    ):
         """
         GIVEN:
             - An input ODT format document
@@ -125,13 +124,11 @@ class TestTikaParserAgainstServer(TestCase):
         See also:
             - https://issues.apache.org/jira/browse/TIKA-4110
         """
-        test_file = self.SAMPLE_DIR / "multi-part-broken.odt"
-
         util_call_with_backoff(
-            self.parser.parse,
-            [test_file, "application/vnd.oasis.opendocument.text"],
+            tika_parser.parse,
+            [sample_broken_odt, "application/vnd.oasis.opendocument.text"],
         )
 
-        self.assertIsNotNone(self.parser.archive_path)
-        with open(self.parser.archive_path, "rb") as f:
-            self.assertTrue(b"PDF-" in f.read()[:10])
+        assert tika_parser.archive_path is not None
+        with open(tika_parser.archive_path, "rb") as f:
+            assert b"PDF-" in f.read()[:10]
index ee010eb49e6770537cbe8f4858349e0788dd8219..6b048f252860afd14df2c73549c759f88e86bb4c 100644 (file)
@@ -1,30 +1,30 @@
 import datetime
-import os
 import zoneinfo
+from http import HTTPStatus
 from pathlib import Path
 
-from django.test import TestCase
-from django.test import override_settings
+import pytest
 from httpx import codes
 from httpx._multipart import DataField
-from rest_framework import status
+from pytest_django.fixtures import SettingsWrapper
+from pytest_httpx import HTTPXMock
 
 from documents.parsers import ParseError
 from paperless_tika.parsers import TikaDocumentParser
-from paperless_tika.tests.utils import HttpxMockMixin
 
 
-class TestTikaParser(HttpxMockMixin, TestCase):
-    def setUp(self) -> None:
-        self.parser = TikaDocumentParser(logging_group=None)
-
-    def tearDown(self) -> None:
-        self.parser.cleanup()
-
-    @override_settings(TIME_ZONE="America/Chicago")
-    def test_parse(self):
+@pytest.mark.django_db()
+class TestTikaParser:
+    def test_parse(
+        self,
+        httpx_mock: HTTPXMock,
+        settings: SettingsWrapper,
+        tika_parser: TikaDocumentParser,
+        sample_odt_file: Path,
+    ):
+        settings.TIME_ZONE = "America/Chicago"
         # Pretend parse response
-        self.httpx_mock.add_response(
+        httpx_mock.add_response(
             json={
                 "Content-Type": "application/vnd.oasis.opendocument.text",
                 "X-TIKA:Parsed-By": [],
@@ -33,30 +33,29 @@ class TestTikaParser(HttpxMockMixin, TestCase):
             },
         )
         # Pretend convert to PDF response
-        self.httpx_mock.add_response(content=b"PDF document")
-
-        file = Path(os.path.join(self.parser.tempdir, "input.odt"))
-        file.touch()
-
-        self.parser.parse(file, "application/vnd.oasis.opendocument.text")
-
-        self.assertEqual(self.parser.text, "the content")
-        self.assertIsNotNone(self.parser.archive_path)
-        with open(self.parser.archive_path, "rb") as f:
-            self.assertEqual(f.read(), b"PDF document")
-
-        self.assertEqual(
-            self.parser.date,
-            datetime.datetime(
-                2020,
-                11,
-                21,
-                tzinfo=zoneinfo.ZoneInfo("America/Chicago"),
-            ),
+        httpx_mock.add_response(content=b"PDF document")
+
+        tika_parser.parse(sample_odt_file, "application/vnd.oasis.opendocument.text")
+
+        assert tika_parser.text == "the content"
+        assert tika_parser.archive_path is not None
+        with open(tika_parser.archive_path, "rb") as f:
+            assert f.read() == b"PDF document"
+
+        assert tika_parser.date == datetime.datetime(
+            2020,
+            11,
+            21,
+            tzinfo=zoneinfo.ZoneInfo("America/Chicago"),
         )
 
-    def test_metadata(self):
-        self.httpx_mock.add_response(
+    def test_metadata(
+        self,
+        httpx_mock: HTTPXMock,
+        tika_parser: TikaDocumentParser,
+        sample_odt_file: Path,
+    ):
+        httpx_mock.add_response(
             json={
                 "Content-Type": "application/vnd.oasis.opendocument.text",
                 "X-TIKA:Parsed-By": [],
@@ -65,18 +64,20 @@ class TestTikaParser(HttpxMockMixin, TestCase):
             },
         )
 
-        file = Path(os.path.join(self.parser.tempdir, "input.odt"))
-        file.touch()
-
-        metadata = self.parser.extract_metadata(
-            file,
+        metadata = tika_parser.extract_metadata(
+            sample_odt_file,
             "application/vnd.oasis.opendocument.text",
         )
 
-        self.assertTrue("dcterms:created" in [m["key"] for m in metadata])
-        self.assertTrue("Some-key" in [m["key"] for m in metadata])
+        assert "dcterms:created" in [m["key"] for m in metadata]
+        assert "Some-key" in [m["key"] for m in metadata]
 
-    def test_convert_failure(self):
+    def test_convert_failure(
+        self,
+        httpx_mock: HTTPXMock,
+        tika_parser: TikaDocumentParser,
+        sample_odt_file: Path,
+    ):
         """
         GIVEN:
             - Document needs to be converted to PDF
@@ -86,15 +87,29 @@ class TestTikaParser(HttpxMockMixin, TestCase):
             - Parse error is raised
         """
         # Pretend convert to PDF response
-        self.httpx_mock.add_response(status_code=status.HTTP_500_INTERNAL_SERVER_ERROR)
-
-        file = Path(os.path.join(self.parser.tempdir, "input.odt"))
-        file.touch()
+        httpx_mock.add_response(status_code=HTTPStatus.INTERNAL_SERVER_ERROR)
 
-        with self.assertRaises(ParseError):
-            self.parser.convert_to_pdf(file, None)
+        with pytest.raises(ParseError):
+            tika_parser.convert_to_pdf(sample_odt_file, None)
 
-    def test_request_pdf_a_format(self):
+    @pytest.mark.parametrize(
+        ("setting_value", "expected_form_value"),
+        [
+            ("pdfa", "PDF/A-2b"),
+            ("pdfa-1", "PDF/A-2b"),
+            ("pdfa-2", "PDF/A-2b"),
+            ("pdfa-3", "PDF/A-3b"),
+        ],
+    )
+    def test_request_pdf_a_format(
+        self,
+        setting_value: str,
+        expected_form_value: str,
+        httpx_mock: HTTPXMock,
+        settings: SettingsWrapper,
+        tika_parser: TikaDocumentParser,
+        sample_odt_file: Path,
+    ):
         """
         GIVEN:
             - Document needs to be converted to PDF
@@ -103,31 +118,21 @@ class TestTikaParser(HttpxMockMixin, TestCase):
         THEN:
             - Request to Gotenberg contains the expected PDF/A format string
         """
-        file = Path(os.path.join(self.parser.tempdir, "input.odt"))
-        file.touch()
+        settings.OCR_OUTPUT_TYPE = setting_value
+        httpx_mock.add_response(
+            status_code=codes.OK,
+            content=b"PDF document",
+            method="POST",
+        )
 
-        for setting, expected_key in [
-            ("pdfa", "PDF/A-2b"),
-            ("pdfa-2", "PDF/A-2b"),
-            ("pdfa-1", "PDF/A-2b"),
-            ("pdfa-3", "PDF/A-3b"),
-        ]:
-            with override_settings(OCR_OUTPUT_TYPE=setting):
-                self.httpx_mock.add_response(
-                    status_code=codes.OK,
-                    content=b"PDF document",
-                    method="POST",
-                )
-
-                self.parser.convert_to_pdf(file, None)
-
-                request = self.httpx_mock.get_request()
-                found = False
-                for field in request.stream.fields:
-                    if isinstance(field, DataField) and field.name == "pdfa":
-                        self.assertEqual(field.value, expected_key)
-                        found = True
-                        break
-                self.assertTrue(found)
-
-                self.httpx_mock.reset(assert_all_responses_were_requested=False)
+        tika_parser.convert_to_pdf(sample_odt_file, None)
+
+        request = httpx_mock.get_request()
+        found = False
+        for field in request.stream.fields:
+            if isinstance(field, DataField) and field.name == "pdfa":
+                assert field.value == expected_form_value
+                found = True
+        assert found, "pdfFormat was not found"
+
+        httpx_mock.reset(assert_all_responses_were_requested=False)
diff --git a/src/paperless_tika/tests/utils.py b/src/paperless_tika/tests/utils.py
deleted file mode 100644 (file)
index b26f79e..0000000
+++ /dev/null
@@ -1,11 +0,0 @@
-import pytest
-from pytest_httpx import HTTPXMock
-
-
-class HttpxMockMixin:
-    @pytest.fixture(autouse=True)
-    def httpx_mock_auto(self, httpx_mock: HTTPXMock):
-        """
-        Workaround for allowing use of a fixture with unittest style testing
-        """
-        self.httpx_mock = httpx_mock