import os
import re
-import ocrmypdf
-import pdftotext
-import pikepdf
from PIL import Image
from django.conf import settings
-from ocrmypdf import InputFileError, EncryptedPdfError
from documents.parsers import DocumentParser, ParseError, \
make_thumbnail_from_pdf
logging_name = "paperless.parsing.tesseract"
def extract_metadata(self, document_path, mime_type):
+ import pikepdf
+
namespace_pattern = re.compile(r"\{(.*)\}(.*)")
result = []
return None
def parse(self, document_path, mime_type, file_name=None):
+ import ocrmypdf
+ from ocrmypdf import InputFileError, EncryptedPdfError
+
mode = settings.OCR_MODE
text_original = get_text_from_pdf(document_path)
def get_text_from_pdf(pdf_file):
+ import pdftotext
if not os.path.isfile(pdf_file):
return None
self.assertRaises(ParseError, f)
- @mock.patch("paperless_tesseract.parsers.ocrmypdf.ocr")
- def test_image_calc_a4_dpi(self, m):
+ def test_image_calc_a4_dpi(self):
parser = RasterisedDocumentParser(None)
- parser.parse(os.path.join(self.SAMPLE_FILES, "simple-no-dpi.png"), "image/png")
-
- m.assert_called_once()
-
- args, kwargs = m.call_args
+ dpi = parser.calculate_a4_dpi(os.path.join(self.SAMPLE_FILES, "simple-no-dpi.png"))
- self.assertEqual(kwargs['image_dpi'], 62)
+ self.assertEqual(dpi, 62)
@mock.patch("paperless_tesseract.parsers.RasterisedDocumentParser.calculate_a4_dpi")
def test_image_dpi_fail(self, m):