gh-94606: Fix error when message with Unicode surrogate not surrogateescaped string...

author Sidney Markowitz <sidney@sidney.com>

Mon, 11 Dec 2023 16:21:18 +0000 (05:21 +1300)

committer GitHub <noreply@github.com>

Mon, 11 Dec 2023 16:21:18 +0000 (18:21 +0200)
author Sidney Markowitz <sidney@sidney.com>
Mon, 11 Dec 2023 16:21:18 +0000 (05:21 +1300)
committer GitHub <noreply@github.com>
Mon, 11 Dec 2023 16:21:18 +0000 (18:21 +0200)
diff --git a/Lib/email/message.py b/Lib/email/message.py

index 411118c74dabb411de55136fa69bacc75d68d7d9..fe769580fed5d0a5db7c8158ec4b9f4875fb40a9 100644 (file)
--- a/Lib/email/message.py
+++ b/Lib/email/message.py
@@ -289,25 +289,26 @@ class Message:
          # cte might be a Header, so for now stringify it.
          cte = str(self.get('content-transfer-encoding', '')).lower()
          # payload may be bytes here.
-        if isinstance(payload, str):
-            if utils._has_surrogates(payload):
-                bpayload = payload.encode('ascii', 'surrogateescape')
-                if not decode:
+        if not decode:
+            if isinstance(payload, str) and utils._has_surrogates(payload):
+                try:
+                    bpayload = payload.encode('ascii', 'surrogateescape')
                      try:
                          payload = bpayload.decode(self.get_param('charset', 'ascii'), 'replace')
                      except LookupError:
                          payload = bpayload.decode('ascii', 'replace')
-            elif decode:
-                try:
-                    bpayload = payload.encode('ascii')
-                except UnicodeError:
-                    # This won't happen for RFC compliant messages (messages
-                    # containing only ASCII code points in the unicode input).
-                    # If it does happen, turn the string into bytes in a way
-                    # guaranteed not to fail.
-                    bpayload = payload.encode('raw-unicode-escape')
-        if not decode:
+                except UnicodeEncodeError:
+                    pass
              return payload
+        if isinstance(payload, str):
+            try:
+                bpayload = payload.encode('ascii', 'surrogateescape')
+            except UnicodeEncodeError:
+                # This won't happen for RFC compliant messages (messages
+                # containing only ASCII code points in the unicode input).
+                # If it does happen, turn the string into bytes in a way
+                # guaranteed not to fail.
+                bpayload = payload.encode('raw-unicode-escape')
          if cte == 'quoted-printable':
              return quopri.decodestring(bpayload)
          elif cte == 'base64':
diff --git a/Lib/email/utils.py b/Lib/email/utils.py

index a49a8fa986ce0cfa69b4ee01cb63de232476afbf..9175f2fdb6e69e7b9591394785ec0e5fe3f703e8 100644 (file)
--- a/Lib/email/utils.py
+++ b/Lib/email/utils.py
@@ -44,10 +44,10 @@ specialsre = re.compile(r'[][\\()<>@,:;".]')
  escapesre = re.compile(r'[\\"]')
  
  def _has_surrogates(s):
-    """Return True if s contains surrogate-escaped binary data."""
+    """Return True if s may contain surrogate-escaped binary data."""
      # This check is based on the fact that unless there are surrogates, utf8
      # (Python's default encoding) can encode any string.  This is the fastest
-    # way to check for surrogates, see issue 11454 for timings.
+    # way to check for surrogates, see bpo-11454 (moved to gh-55663) for timings.
      try:
          s.encode()
          return False
diff --git a/Lib/test/test_email/test_message.py b/Lib/test/test_email/test_message.py

index d3f396f02e7a725cdf84411f6795f0e52af7dff7..034f7626c1fc7c05c1c3b56601fa50c3e13c7a17 100644 (file)
--- a/Lib/test/test_email/test_message.py
+++ b/Lib/test/test_email/test_message.py
@@ -748,6 +748,35 @@ class TestEmailMessageBase:
          self.assertEqual(len(list(m.iter_attachments())), 2)
          self.assertEqual(m.get_payload(), orig)
  
+    get_payload_surrogate_params = {
+
+        'good_surrogateescape': (
+            "String that can be encod\udcc3\udcabd with surrogateescape",
+            b'String that can be encod\xc3\xabd with surrogateescape'
+            ),
+
+        'string_with_utf8': (
+            "String with utf-8 charactër",
+            b'String with utf-8 charact\xebr'
+            ),
+
+        'surrogate_and_utf8': (
+            "String that cannot be ëncod\udcc3\udcabd with surrogateescape",
+             b'String that cannot be \xebncod\\udcc3\\udcabd with surrogateescape'
+            ),
+
+        'out_of_range_surrogate': (
+            "String with \udfff cannot be encoded with surrogateescape",
+             b'String with \\udfff cannot be encoded with surrogateescape'
+            ),
+    }
+
+    def get_payload_surrogate_as_gh_94606(self, msg, expected):
+        """test for GH issue 94606"""
+        m = self._str_msg(msg)
+        payload = m.get_payload(decode=True)
+        self.assertEqual(expected, payload)
+
  
  class TestEmailMessage(TestEmailMessageBase, TestEmailBase):
      message = EmailMessage
diff --git a/Misc/NEWS.d/next/Core and Builtins/2022-07-07-05-37-53.gh-issue-94606.hojJ54.rst b/Misc/NEWS.d/next/Core and Builtins/2022-07-07-05-37-53.gh-issue-94606.hojJ54.rst

new file mode 100644 (file)

index 0000000..5201ab7
--- /dev/null
+++ b/Misc/NEWS.d/next/Core and Builtins/2022-07-07-05-37-53.gh-issue-94606.hojJ54.rst
@@ -0,0 +1,3 @@
+Fix UnicodeEncodeError when :func:`email.message.get_payload` reads a message
+with a Unicode surrogate character and the message content is not well-formed for
+surrogateescape encoding. Patch by Sidney Markowitz.
author	Sidney Markowitz <sidney@sidney.com>
	Mon, 11 Dec 2023 16:21:18 +0000 (05:21 +1300)
committer	GitHub <noreply@github.com>
	Mon, 11 Dec 2023 16:21:18 +0000 (18:21 +0200)
Lib/email/message.py		patch | blob | blame | history
Lib/email/utils.py		patch | blob | blame | history
Lib/test/test_email/test_message.py		patch | blob | blame | history
Misc/NEWS.d/next/Core and Builtins/2022-07-07-05-37-53.gh-issue-94606.hojJ54.rst	[new file with mode: 0644]	patch | blob