git.ipfire.org Git - thirdparty/Python/cpython.git/commitdiff
gh-94606: Fix error when message with Unicode surrogate not surrogateescaped string...
author: Sidney Markowitz <sidney@sidney.com>
Mon, 11 Dec 2023 16:21:18 +0000 (05:21 +1300)
committer: GitHub <noreply@github.com>
Mon, 11 Dec 2023 16:21:18 +0000 (18:21 +0200)
Co-authored-by: Serhiy Storchaka <storchaka@gmail.com>
Lib/email/message.py
Lib/email/utils.py
Lib/test/test_email/test_message.py
Misc/NEWS.d/next/Core and Builtins/2022-07-07-05-37-53.gh-issue-94606.hojJ54.rst [new file with mode: 0644]

index 411118c74dabb411de55136fa69bacc75d68d7d9..fe769580fed5d0a5db7c8158ec4b9f4875fb40a9 100644 (file)
@@ -289,25 +289,26 @@ class Message:
         # cte might be a Header, so for now stringify it.
         cte = str(self.get('content-transfer-encoding', '')).lower()
         # payload may be bytes here.
-        if isinstance(payload, str):
-            if utils._has_surrogates(payload):
-                bpayload = payload.encode('ascii', 'surrogateescape')
-                if not decode:
+        if not decode:
+            if isinstance(payload, str) and utils._has_surrogates(payload):
+                try:
+                    bpayload = payload.encode('ascii', 'surrogateescape')
                     try:
                         payload = bpayload.decode(self.get_param('charset', 'ascii'), 'replace')
                     except LookupError:
                         payload = bpayload.decode('ascii', 'replace')
-            elif decode:
-                try:
-                    bpayload = payload.encode('ascii')
-                except UnicodeError:
-                    # This won't happen for RFC compliant messages (messages
-                    # containing only ASCII code points in the unicode input).
-                    # If it does happen, turn the string into bytes in a way
-                    # guaranteed not to fail.
-                    bpayload = payload.encode('raw-unicode-escape')
-        if not decode:
+                except UnicodeEncodeError:
+                    pass
             return payload
+        if isinstance(payload, str):
+            try:
+                bpayload = payload.encode('ascii', 'surrogateescape')
+            except UnicodeEncodeError:
+                # This won't happen for RFC compliant messages (messages
+                # containing only ASCII code points in the unicode input).
+                # If it does happen, turn the string into bytes in a way
+                # guaranteed not to fail.
+                bpayload = payload.encode('raw-unicode-escape')
         if cte == 'quoted-printable':
             return quopri.decodestring(bpayload)
         elif cte == 'base64':
index a49a8fa986ce0cfa69b4ee01cb63de232476afbf..9175f2fdb6e69e7b9591394785ec0e5fe3f703e8 100644 (file)
@@ -44,10 +44,10 @@ specialsre = re.compile(r'[][\\()<>@,:;".]')
 escapesre = re.compile(r'[\\"]')
 
 def _has_surrogates(s):
-    """Return True if s contains surrogate-escaped binary data."""
+    """Return True if s may contain surrogate-escaped binary data."""
     # This check is based on the fact that unless there are surrogates, utf8
     # (Python's default encoding) can encode any string.  This is the fastest
-    # way to check for surrogates, see issue 11454 for timings.
+    # way to check for surrogates, see bpo-11454 (moved to gh-55663) for timings.
     try:
         s.encode()
         return False
index d3f396f02e7a725cdf84411f6795f0e52af7dff7..034f7626c1fc7c05c1c3b56601fa50c3e13c7a17 100644 (file)
@@ -748,6 +748,35 @@ class TestEmailMessageBase:
         self.assertEqual(len(list(m.iter_attachments())), 2)
         self.assertEqual(m.get_payload(), orig)
 
+    get_payload_surrogate_params = {
+
+        'good_surrogateescape': (
+            "String that can be encod\udcc3\udcabd with surrogateescape",
+            b'String that can be encod\xc3\xabd with surrogateescape'
+            ),
+
+        'string_with_utf8': (
+            "String with utf-8 charactër",
+            b'String with utf-8 charact\xebr'
+            ),
+
+        'surrogate_and_utf8': (
+            "String that cannot be ëncod\udcc3\udcabd with surrogateescape",
+             b'String that cannot be \xebncod\\udcc3\\udcabd with surrogateescape'
+            ),
+
+        'out_of_range_surrogate': (
+            "String with \udfff cannot be encoded with surrogateescape",
+             b'String with \\udfff cannot be encoded with surrogateescape'
+            ),
+    }
+
+    def get_payload_surrogate_as_gh_94606(self, msg, expected):
+        """test for GH issue 94606"""
+        m = self._str_msg(msg)
+        payload = m.get_payload(decode=True)
+        self.assertEqual(expected, payload)
+
 
 class TestEmailMessage(TestEmailMessageBase, TestEmailBase):
     message = EmailMessage
diff --git a/Misc/NEWS.d/next/Core and Builtins/2022-07-07-05-37-53.gh-issue-94606.hojJ54.rst b/Misc/NEWS.d/next/Core and Builtins/2022-07-07-05-37-53.gh-issue-94606.hojJ54.rst
new file mode 100644 (file)
index 0000000..5201ab7
--- /dev/null
@@ -0,0 +1,3 @@
+Fix UnicodeEncodeError when :meth:`email.message.Message.get_payload` reads a
+message with a Unicode surrogate character and the message content is not
+well-formed for surrogateescape encoding. Patch by Sidney Markowitz.