Don't use the sidecar file when redoing the OCR; it only contains the new text

author Trenton H <797416+stumpylog@users.noreply.github.com>

Mon, 21 Nov 2022 22:45:20 +0000 (14:45 -0800)

committer Trenton H <797416+stumpylog@users.noreply.github.com>

Tue, 22 Nov 2022 15:22:41 +0000 (07:22 -0800)
author Trenton H <797416+stumpylog@users.noreply.github.com>
Mon, 21 Nov 2022 22:45:20 +0000 (14:45 -0800)
committer Trenton H <797416+stumpylog@users.noreply.github.com>
Tue, 22 Nov 2022 15:22:41 +0000 (07:22 -0800)
diff --git a/src/paperless_tesseract/parsers.py b/src/paperless_tesseract/parsers.py

index 405df07ce5fb85d81bfd6876941d72c2b75d523a..aa3ad64fa227e8998b749a75b0bf29cf1e26cce8 100644 (file)
--- a/src/paperless_tesseract/parsers.py
+++ b/src/paperless_tesseract/parsers.py
@@ -95,7 +95,13 @@ class RasterisedDocumentParser(DocumentParser):
              return None
  
      def extract_text(self, sidecar_file, pdf_file):
-        if sidecar_file and os.path.isfile(sidecar_file):
+        # When re-doing OCR, the sidecar contains ONLY the new text, not
+        # the whole text, so do not utilize it in that case
+        if (
+            sidecar_file is not None
+            and os.path.isfile(sidecar_file)
+            and settings.OCR_MODE != "redo"
+        ):
              with open(sidecar_file) as f:
                  text = f.read()
  
@@ -142,7 +148,7 @@ class RasterisedDocumentParser(DocumentParser):
              "input_file": input_file,
              "output_file": output_file,
              # need to use threads, since this will be run in daemonized
-            # processes by django-q.
+            # processes via the task library.
              "use_threads": True,
              "jobs": settings.THREADS_PER_WORKER,
              "language": settings.OCR_LANGUAGE,
@@ -165,9 +171,11 @@ class RasterisedDocumentParser(DocumentParser):
              if settings.OCR_MODE == "redo":
                  ocrmypdf_args["clean"] = True
              else:
+                # --clean-final is not compatible with --redo-ocr
                  ocrmypdf_args["clean_final"] = True
  
-        if settings.OCR_DESKEW and not settings.OCR_MODE == "redo":
+        if settings.OCR_DESKEW and settings.OCR_MODE != "redo":
+            # --deskew is not compatible with --redo-ocr
              ocrmypdf_args["deskew"] = True
  
          if settings.OCR_ROTATE_PAGES:
@@ -263,7 +271,7 @@ class RasterisedDocumentParser(DocumentParser):
  
          # Either no text was in the original or there should be an archive
          # file created, so OCR the file and create an archive with any
-        # test located via OCR
+        # text located via OCR
  
          import ocrmypdf
          from ocrmypdf import InputFileError, EncryptedPdfError
author	Trenton H <797416+stumpylog@users.noreply.github.com>
	Mon, 21 Nov 2022 22:45:20 +0000 (14:45 -0800)
committer	Trenton H <797416+stumpylog@users.noreply.github.com>
	Tue, 22 Nov 2022 15:22:41 +0000 (07:22 -0800)