From: Trenton H <797416+stumpylog@users.noreply.github.com> Date: Fri, 9 Feb 2024 20:08:23 +0000 (-0800) Subject: Fix: Test metadata items for Unicode issues (#5707) X-Git-Tag: v2.5.0~4^2~1 X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=0b1523f4e5dc37ab936c1f6ec8fe49145a5ca057;p=thirdparty%2Fpaperless-ngx.git Fix: Test metadata items for Unicode issues (#5707) Test each key for Unicode issues and reject ones which will fail inside DRF. --- diff --git a/src/paperless_mail/parsers.py b/src/paperless_mail/parsers.py index 92fb90bb15..b8cf129803 100644 --- a/src/paperless_mail/parsers.py +++ b/src/paperless_mail/parsers.py @@ -69,6 +69,11 @@ class MailDocumentParser(DocumentParser): for key, value in mail.headers.items(): value = ", ".join(i for i in value) + try: + value.encode("utf-8") + except UnicodeEncodeError as e: # pragma: no cover + self.log.debug(f"Skipping header {key}: {e}") + continue result.append( { diff --git a/src/paperless_tesseract/parsers.py b/src/paperless_tesseract/parsers.py index b6baa32893..09086585ef 100644 --- a/src/paperless_tesseract/parsers.py +++ b/src/paperless_tesseract/parsers.py @@ -55,11 +55,21 @@ class RasterisedDocumentParser(DocumentParser): value = str(value) try: m = namespace_pattern.match(key) + if m is None: # pragma: no cover + continue + namespace = m.group(1) + key_value = m.group(2) + try: + namespace.encode("utf-8") + key_value.encode("utf-8") + except UnicodeEncodeError as e: # pragma: no cover + self.log.debug(f"Skipping metadata key {key}: {e}") + continue result.append( { - "namespace": m.group(1), - "prefix": meta.REVERSE_NS[m.group(1)], - "key": m.group(2), + "namespace": namespace, + "prefix": meta.REVERSE_NS[namespace], + "key": key_value, "value": value, }, )