bpo-43323: Fix UnicodeEncodeError in the email module (GH-32137)

author Serhiy Storchaka <storchaka@gmail.com>

Sat, 30 Apr 2022 10:17:23 +0000 (13:17 +0300)

committer GitHub <noreply@github.com>

Sat, 30 Apr 2022 10:17:23 +0000 (13:17 +0300)
author Serhiy Storchaka <storchaka@gmail.com>
Sat, 30 Apr 2022 10:17:23 +0000 (13:17 +0300)
committer GitHub <noreply@github.com>
Sat, 30 Apr 2022 10:17:23 +0000 (13:17 +0300)
diff --git a/Lib/email/_encoded_words.py b/Lib/email/_encoded_words.py

index 295ae7eb21237c6d9dd3be3bdcdc24f378b7d64a..6795a606de037e2e428f95087e394e9e16a5ebbb 100644 (file)
--- a/Lib/email/_encoded_words.py
+++ b/Lib/email/_encoded_words.py
@@ -179,15 +179,15 @@ def decode(ew):
      # Turn the CTE decoded bytes into unicode.
      try:
          string = bstring.decode(charset)
-    except UnicodeError:
+    except UnicodeDecodeError:
          defects.append(errors.UndecodableBytesDefect("Encoded word "
-            "contains bytes not decodable using {} charset".format(charset)))
+            f"contains bytes not decodable using {charset!r} charset"))
          string = bstring.decode(charset, 'surrogateescape')
-    except LookupError:
+    except (LookupError, UnicodeEncodeError):
          string = bstring.decode('ascii', 'surrogateescape')
          if charset.lower() != 'unknown-8bit':
-            defects.append(errors.CharsetError("Unknown charset {} "
-                "in encoded word; decoded as unknown bytes".format(charset)))
+            defects.append(errors.CharsetError(f"Unknown charset {charset!r} "
+                f"in encoded word; decoded as unknown bytes"))
      return string, charset, lang, defects
  
  
diff --git a/Lib/email/_header_value_parser.py b/Lib/email/_header_value_parser.py

index 51d355fbb0abc5460835414998983717256e9364..8a8fb8bc42a954038aa135957718914815e87d25 100644 (file)
--- a/Lib/email/_header_value_parser.py
+++ b/Lib/email/_header_value_parser.py
@@ -781,7 +781,7 @@ class MimeParameters(TokenList):
                      else:
                          try:
                              value = value.decode(charset, 'surrogateescape')
-                        except LookupError:
+                        except (LookupError, UnicodeEncodeError):
                              # XXX: there should really be a custom defect for
                              # unknown character set to make it easy to find,
                              # because otherwise unknown charset is a silent
diff --git a/Lib/test/test_email/test__encoded_words.py b/Lib/test/test_email/test__encoded_words.py

index 0b8b1de3359aa6e68e04110e6b40de4dcd94f6a8..1713962f94caef24515b2d49fb39811cc7d9944a 100644 (file)
--- a/Lib/test/test_email/test__encoded_words.py
+++ b/Lib/test/test_email/test__encoded_words.py
@@ -130,6 +130,13 @@ class TestDecode(TestEmailBase):
                     # XXX Should this be a new Defect instead?
                     defects = [errors.CharsetError])
  
+    def test_invalid_character_in_charset(self):
+        self._test('=?utf-8\udce2\udc80\udc9d?q?foo=ACbar?=',
+                   b'foo\xacbar'.decode('ascii', 'surrogateescape'),
+                   charset = 'utf-8\udce2\udc80\udc9d',
+                   # XXX Should this be a new Defect instead?
+                   defects = [errors.CharsetError])
+
      def test_q_nonascii(self):
          self._test('=?utf-8?q?=C3=89ric?=',
                     'Éric',
diff --git a/Lib/test/test_email/test_email.py b/Lib/test/test_email/test_email.py

index 933aa4cbc1959740eb9211eee0bc42aa13b9302e..69f883a3673f26f97912f8dfa6de2f273cba9ffd 100644 (file)
--- a/Lib/test/test_email/test_email.py
+++ b/Lib/test/test_email/test_email.py
@@ -5356,6 +5356,15 @@ Content-Disposition: inline;
  Content-Transfer-Encoding: 8bit
  Content-Disposition: inline; filename*=X-UNKNOWN''myfile.txt
  
+"""
+        msg = email.message_from_string(m)
+        self.assertEqual(msg.get_filename(), 'myfile.txt')
+
+    def test_rfc2231_bad_character_in_encoding(self):
+        m = """\
+Content-Transfer-Encoding: 8bit
+Content-Disposition: inline; filename*=utf-8\udce2\udc80\udc9d''myfile.txt
+
  """
          msg = email.message_from_string(m)
          self.assertEqual(msg.get_filename(), 'myfile.txt')
diff --git a/Lib/test/test_email/test_headerregistry.py b/Lib/test/test_email/test_headerregistry.py

index 59fcd932e0ec4a89b30a7b7291535c079b4a0d90..25347ef13c21475ae9293902946f6b334ccd5776 100644 (file)
--- a/Lib/test/test_email/test_headerregistry.py
+++ b/Lib/test/test_email/test_headerregistry.py
@@ -714,6 +714,18 @@ class TestContentTypeHeader(TestHeaderBase):
              " charset*=unknown-8bit''utf-8%E2%80%9D\n",
              ),
  
+        'rfc2231_nonascii_in_charset_of_charset_parameter_value': (
+            "text/plain; charset*=utf-8”''utf-8%E2%80%9D",
+            'text/plain',
+            'text',
+            'plain',
+            {'charset': 'utf-8”'},
+            [],
+            'text/plain; charset="utf-8”"',
+            "Content-Type: text/plain;"
+            " charset*=utf-8''utf-8%E2%80%9D\n",
+            ),
+
          'rfc2231_encoded_then_unencoded_segments': (
              ('application/x-foo;'
                  '\tname*0*="us-ascii\'en-us\'My";'
diff --git a/Misc/NEWS.d/next/Library/2022-03-27-12-40-16.bpo-43323.9mFPuI.rst b/Misc/NEWS.d/next/Library/2022-03-27-12-40-16.bpo-43323.9mFPuI.rst

new file mode 100644 (file)

index 0000000..98d7310
--- /dev/null
+++ b/Misc/NEWS.d/next/Library/2022-03-27-12-40-16.bpo-43323.9mFPuI.rst
@@ -0,0 +1,2 @@
+Fix errors in the :mod:`email` module if the charset itself contains
+undecodable/unencodable characters.
author	Serhiy Storchaka <storchaka@gmail.com>
	Sat, 30 Apr 2022 10:17:23 +0000 (13:17 +0300)
committer	GitHub <noreply@github.com>
	Sat, 30 Apr 2022 10:17:23 +0000 (13:17 +0300)
Lib/email/_encoded_words.py		patch | blob | blame | history
Lib/email/_header_value_parser.py		patch | blob | blame | history
Lib/test/test_email/test__encoded_words.py		patch | blob | blame | history
Lib/test/test_email/test_email.py		patch | blob | blame | history
Lib/test/test_email/test_headerregistry.py		patch | blob | blame | history
Misc/NEWS.d/next/Library/2022-03-27-12-40-16.bpo-43323.9mFPuI.rst	[new file with mode: 0644]	patch | blob