response = self.client.get("/api/documents/", format="json")
self.assertEqual(response.status_code, 200)
results_full = response.data["results"]
- self.assertTrue("content" in results_full[0])
- self.assertTrue("id" in results_full[0])
+ self.assertIn("content", results_full[0])
+ self.assertIn("id", results_full[0])
response = self.client.get("/api/documents/?fields=id", format="json")
self.assertEqual(response.status_code, 200)
results = response.data["results"]
self.assertFalse("content" in results[0])
- self.assertTrue("id" in results[0])
+ self.assertIn("id", results[0])
self.assertEqual(len(results[0]), 1)
response = self.client.get("/api/documents/?fields=content", format="json")
self.assertEqual(response.status_code, 200)
results = response.data["results"]
- self.assertTrue("content" in results[0])
+ self.assertIn("content", results[0])
self.assertFalse("id" in results[0])
self.assertEqual(len(results[0]), 1)
response = self.client.get("/api/documents/?fields=id,content", format="json")
self.assertEqual(response.status_code, 200)
results = response.data["results"]
- self.assertTrue("content" in results[0])
- self.assertTrue("id" in results[0])
+ self.assertIn("content", results[0])
+ self.assertIn("id", results[0])
self.assertEqual(len(results[0]), 2)
response = self.client.get(
self.assertEqual(response.status_code, 200)
results = response.data["results"]
self.assertFalse("content" in results[0])
- self.assertTrue("id" in results[0])
+ self.assertIn("id", results[0])
self.assertEqual(len(results[0]), 1)
response = self.client.get("/api/documents/?fields=", format="json")
cmd.manifest = [{"model": "documents.document"}]
with self.assertRaises(CommandError) as cm:
cmd._check_manifest()
- self.assertTrue("The manifest file contains a record" in str(cm.exception))
+ self.assertIn("The manifest file contains a record", str(cm.exception))
cmd.manifest = [
{"model": "documents.document", EXPORTER_FILE_NAME: "noexist.pdf"},
# self.assertRaises(CommandError, cmd._check_manifest)
with self.assertRaises(CommandError) as cm:
cmd._check_manifest()
- self.assertTrue(
- 'The manifest file refers to "noexist.pdf"' in str(cm.exception),
+ self.assertIn(
+ 'The manifest file refers to "noexist.pdf"',
+ str(cm.exception),
)
from tempfile import TemporaryDirectory
from unittest import mock
+from django.apps import apps
+from django.test import override_settings
from django.test import TestCase
from documents.parsers import get_default_file_extension
from documents.parsers import get_parser_class_for_mime_type
from documents.parsers import is_file_ext_supported
from paperless_tesseract.parsers import RasterisedDocumentParser
from paperless_text.parsers import TextDocumentParser
+from paperless_tika.parsers import TikaDocumentParser
class TestParserDiscovery(TestCase):
class TestParserAvailability(TestCase):
- def test_file_extensions(self):
-
+ def test_tesseract_parser(self):
+ """
+ GIVEN:
+ - Various mime types
+ WHEN:
+ - The parser class is instantiated
+ THEN:
+ - The Tesseract based parser is returned
+ """
supported_mimes_and_exts = [
("application/pdf", ".pdf"),
("image/png", ".png"),
("image/jpeg", ".jpg"),
("image/tiff", ".tif"),
("image/webp", ".webp"),
+ ]
+
+ supported_exts = get_supported_file_extensions()
+
+ for mime_type, ext in supported_mimes_and_exts:
+ self.assertIn(ext, supported_exts)
+ self.assertEqual(get_default_file_extension(mime_type), ext)
+ self.assertIsInstance(
+ get_parser_class_for_mime_type(mime_type)(logging_group=None),
+ RasterisedDocumentParser,
+ )
+
+ def test_text_parser(self):
+ """
+ GIVEN:
+ - Various mime types of a text form
+ WHEN:
+ - The parser class is instantiated
+ THEN:
+ - The text based parser is returned
+ """
+ supported_mimes_and_exts = [
("text/plain", ".txt"),
("text/csv", ".csv"),
]
for mime_type, ext in supported_mimes_and_exts:
self.assertIn(ext, supported_exts)
self.assertEqual(get_default_file_extension(mime_type), ext)
+ self.assertIsInstance(
+ get_parser_class_for_mime_type(mime_type)(logging_group=None),
+ TextDocumentParser,
+ )
+ def test_tika_parser(self):
+ """
+ GIVEN:
+ - Various mime types of an office document form
+ WHEN:
+ - The parser class is instantiated
+ THEN:
+ - The Tika/Gotenberg based parser is returned
+ """
+ supported_mimes_and_exts = [
+ ("application/vnd.oasis.opendocument.text", ".odt"),
+ ("text/rtf", ".rtf"),
+ ("application/msword", ".doc"),
+ (
+ "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
+ ".docx",
+ ),
+ ]
+
+ # Force the app ready to notice the settings override
+ with override_settings(TIKA_ENABLED=True, INSTALLED_APPS=["paperless_tika"]):
+ app = apps.get_app_config("paperless_tika")
+ app.ready()
+ supported_exts = get_supported_file_extensions()
+
+ for mime_type, ext in supported_mimes_and_exts:
+ self.assertIn(ext, supported_exts)
+ self.assertEqual(get_default_file_extension(mime_type), ext)
+ self.assertIsInstance(
+ get_parser_class_for_mime_type(mime_type)(logging_group=None),
+ TikaDocumentParser,
+ )
+
+ def test_no_parser_for_mime(self):
+ self.assertIsNone(get_parser_class_for_mime_type("text/sdgsdf"))
+
+ def test_default_extension(self):
# Test no parser declared still returns a an extension
self.assertEqual(get_default_file_extension("application/zip"), ".zip")
# Test invalid mimetype returns no extension
self.assertEqual(get_default_file_extension("aasdasd/dgfgf"), "")
- self.assertIsInstance(
- get_parser_class_for_mime_type("application/pdf")(logging_group=None),
- RasterisedDocumentParser,
- )
- self.assertIsInstance(
- get_parser_class_for_mime_type("text/plain")(logging_group=None),
- TextDocumentParser,
- )
- self.assertIsNone(get_parser_class_for_mime_type("text/sdgsdf"))
-
+ def test_file_extension_support(self):
self.assertTrue(is_file_ext_supported(".pdf"))
self.assertFalse(is_file_ext_supported(".hsdfh"))
self.assertFalse(is_file_ext_supported(""))
}
+@override_settings(CHANNEL_LAYERS=TEST_CHANNEL_LAYERS)
class TestWebSockets(TestCase):
- @override_settings(CHANNEL_LAYERS=TEST_CHANNEL_LAYERS)
async def test_no_auth(self):
communicator = WebsocketCommunicator(application, "/ws/status/")
connected, subprotocol = await communicator.connect()
self.assertFalse(connected)
await communicator.disconnect()
- @override_settings(CHANNEL_LAYERS=TEST_CHANNEL_LAYERS)
@mock.patch("paperless.consumers.StatusConsumer._authenticated")
async def test_auth(self, _authenticated):
_authenticated.return_value = True
await communicator.disconnect()
- @override_settings(CHANNEL_LAYERS=TEST_CHANNEL_LAYERS)
@mock.patch("paperless.consumers.StatusConsumer._authenticated")
async def test_receive(self, _authenticated):
_authenticated.return_value = True
class FaviconView(View):
- def get(self, request, *args, **kwargs):
+ def get(self, request, *args, **kwargs): # pragma: nocover
favicon = os.path.join(
os.path.dirname(__file__),
"static",
except Exception:
# TODO catch all for various issues with PDFminer.six.
- # If PDFminer fails, fall back to OCR.
+ # If pdftotext fails, fall back to OCR.
self.log(
"warning",
"Error while getting text from PDF document with " "pdfminer.six",
)
self.assertTrue(os.path.isfile(parser.archive_path))
self.assertContainsStrings(parser.get_text().lower(), ["page 1", "page 2"])
- self.assertFalse("page 3" in parser.get_text().lower())
+ self.assertNotIn("page 3", parser.get_text().lower())
@override_settings(OCR_PAGES=1, OCR_MODE="force")
def test_multi_page_analog_pages_force(self):
)
self.assertTrue(os.path.isfile(parser.archive_path))
self.assertContainsStrings(parser.get_text().lower(), ["page 1"])
- self.assertFalse("page 2" in parser.get_text().lower())
- self.assertFalse("page 3" in parser.get_text().lower())
+ self.assertNotIn("page 2", parser.get_text().lower())
+ self.assertNotIn("page 3", parser.get_text().lower())
@override_settings(OCR_MODE="skip_noarchive")
def test_skip_noarchive_withtext(self):
params = parser.construct_ocrmypdf_parameters("", "", "", "")
self.assertNotIn("deskew", params)
+ with override_settings(OCR_MAX_IMAGE_PIXELS=1_000_001.0):
+ params = parser.construct_ocrmypdf_parameters("", "", "", "")
+ self.assertIn("max_image_mpixels", params)
+ self.assertAlmostEqual(params["max_image_mpixels"], 1, places=4)
+
+ with override_settings(OCR_MAX_IMAGE_PIXELS=-1_000_001.0):
+ params = parser.construct_ocrmypdf_parameters("", "", "", "")
+ self.assertNotIn("max_image_mpixels", params)
+
def test_rtl_language_detection(self):
"""
GIVEN:
from pathlib import Path
from unittest import mock
+from django.test import override_settings
from django.test import TestCase
+from documents.parsers import ParseError
from paperless_tika.parsers import TikaDocumentParser
from requests import Response
self.assertTrue("Creation-Date" in [m["key"] for m in metadata])
self.assertTrue("Some-key" in [m["key"] for m in metadata])
+
+ @mock.patch("paperless_tika.parsers.parser.from_file")
+ @mock.patch("paperless_tika.parsers.requests.post")
+ def test_convert_failure(self, post, from_file):
+ """
+ GIVEN:
+ - Document needs to be converted to PDF
+ WHEN:
+ - Gotenberg server returns an error
+ THEN:
+ - Parse error is raised
+ """
+ from_file.return_value = {
+ "content": "the content",
+ "metadata": {"Creation-Date": "2020-11-21"},
+ }
+ response = Response()
+ response._content = b"PDF document"
+ response.status_code = 500
+ post.return_value = response
+
+ file = os.path.join(self.parser.tempdir, "input.odt")
+ Path(file).touch()
+
+ with self.assertRaises(ParseError):
+ self.parser.convert_to_pdf(file, None)
+
+ @mock.patch("paperless_tika.parsers.requests.post")
+ def test_request_pdf_a_format(self, post: mock.Mock):
+ """
+ GIVEN:
+ - Document needs to be converted to PDF
+ WHEN:
+ - Specific PDF/A format requested
+ THEN:
+ - Request to Gotenberg contains the expected PDF/A format string
+ """
+ file = os.path.join(self.parser.tempdir, "input.odt")
+ Path(file).touch()
+
+ response = Response()
+ response._content = b"PDF document"
+ response.status_code = 200
+ post.return_value = response
+
+ for setting, expected_key in [
+ ("pdfa", "PDF/A-2b"),
+ ("pdfa-2", "PDF/A-2b"),
+ ("pdfa-1", "PDF/A-1a"),
+ ("pdfa-3", "PDF/A-3b"),
+ ]:
+ with override_settings(OCR_OUTPUT_TYPE=setting):
+ self.parser.convert_to_pdf(file, None)
+
+ post.assert_called_once()
+ _, kwargs = post.call_args
+
+ self.assertEqual(kwargs["data"]["pdfFormat"], expected_key)
+
+ post.reset_mock()