gh-83938, gh-122476: Stop incorrectly RFC 2047 encoding non-ASCII email addresses...

author Mike Edmunds <medmunds@gmail.com>

Fri, 1 May 2026 17:52:06 +0000 (10:52 -0700)

committer GitHub <noreply@github.com>

Fri, 1 May 2026 17:52:06 +0000 (13:52 -0400)
author Mike Edmunds <medmunds@gmail.com>
Fri, 1 May 2026 17:52:06 +0000 (10:52 -0700)
committer GitHub <noreply@github.com>
Fri, 1 May 2026 17:52:06 +0000 (13:52 -0400)
diff --git a/Doc/library/email.policy.rst b/Doc/library/email.policy.rst

index 8f6e4218c97b38c685b1bbf6ba7316680c5961c4..816d02d86f4fc4b187229bb0d247016c2727c0ce 100644 (file)
--- a/Doc/library/email.policy.rst
+++ b/Doc/library/email.policy.rst
@@ -403,11 +403,26 @@ added matters.  To illustrate::
     .. attribute:: utf8
  
        If ``False``, follow :rfc:`5322`, supporting non-ASCII characters in
-      headers by encoding them as "encoded words".  If ``True``, follow
-      :rfc:`6532` and use ``utf-8`` encoding for headers.  Messages
+      headers by encoding them as :rfc:`2047` "encoded words".  If ``True``,
+      follow :rfc:`6532` and use ``utf-8`` encoding for headers.  Messages
        formatted in this way may be passed to SMTP servers that support
        the ``SMTPUTF8`` extension (:rfc:`6531`).
  
+      When ``False``, the generator will raise
+      :exc:`~email.errors.HeaderWriteError` if any header includes non-ASCII
+      characters in a context where :rfc:`2047` does not permit encoded words.
+      This particularly applies to mailboxes ("addr-spec") with non-ASCII
+      characters, which can be created via
+      :class:`~email.headerregistry.Address`. To use a mailbox with a non-ASCII
+      domain name with ``utf8=False``, first encode the domain using the
+      third-party :pypi:`idna` or :pypi:`uts46` module or with
+      :mod:`encodings.idna`. It is not possible to use a non-ASCII username
+      ("local-part") in a mailbox when ``utf8=False``.
+
+      .. versionchanged:: 3.15
+         Can trigger the raising of :exc:`~email.errors.HeaderWriteError`.
+         (Earlier versions incorrectly applied :rfc:`2047` in certain contexts,
+         most notably in addr-specs.)
  
     .. attribute:: refold_source
  
diff --git a/Doc/whatsnew/3.15.rst b/Doc/whatsnew/3.15.rst

index a687ee5115be0590dd387527ea909dae23a422b9..782b2fe002442ce8dbc9c3dbd24d6fc281847371 100644 (file)
--- a/Doc/whatsnew/3.15.rst
+++ b/Doc/whatsnew/3.15.rst
@@ -914,6 +914,16 @@ faulthandler
    (Contributed by Eric Froemling in :gh:`149085`.)
  
  
+email
+-----
+
+* Email generators now raise an error when an :class:`.EmailMessage` cannot be
+  accurately flattened due to a non-ASCII email address (mailbox) in an address
+  header. Options for supporting Email Address Internationalization (EAI) are
+  discussed in :attr:`.EmailPolicy.utf8`.
+  (Contributed by R David Murray and Mike Edmunds in :gh:`122540`.)
+
+
  functools
  ---------
  
diff --git a/Lib/email/_header_value_parser.py b/Lib/email/_header_value_parser.py

index a53903a197f39e68e0491b25c7e7bc7f33c14437..26b6e26ae652fa6724f07d13af4d0f9b0d999d22 100644 (file)
--- a/Lib/email/_header_value_parser.py
+++ b/Lib/email/_header_value_parser.py
@@ -157,10 +157,7 @@ class TokenList(list):
      def startswith_fws(self):
          return self[0].startswith_fws()
  
-    @property
-    def as_ew_allowed(self):
-        """True if all top level tokens of this part may be RFC2047 encoded."""
-        return all(part.as_ew_allowed for part in self)
+    as_ew_allowed = True
  
      @property
      def comments(self):
@@ -429,6 +426,7 @@ class NameAddr(TokenList):
  class AngleAddr(TokenList):
  
      token_type = 'angle-addr'
+    as_ew_allowed = False
  
      @property
      def local_part(self):
@@ -847,26 +845,22 @@ class ParameterizedHeaderValue(TokenList):
  
  class ContentType(ParameterizedHeaderValue):
      token_type = 'content-type'
-    as_ew_allowed = False
      maintype = 'text'
      subtype = 'plain'
  
  
  class ContentDisposition(ParameterizedHeaderValue):
      token_type = 'content-disposition'
-    as_ew_allowed = False
      content_disposition = None
  
  
  class ContentTransferEncoding(TokenList):
      token_type = 'content-transfer-encoding'
-    as_ew_allowed = False
      cte = '7bit'
  
  
  class HeaderLabel(TokenList):
      token_type = 'header-label'
-    as_ew_allowed = False
  
  
  class MsgID(TokenList):
@@ -2835,13 +2829,68 @@ def _steal_trailing_WSP_if_exists(lines):
  
  
  def _refold_parse_tree(parse_tree, *, policy):
-    """Return string of contents of parse_tree folded according to RFC rules.
-
-    """
      # max_line_length 0/None means no limit, ie: infinitely long.
      maxlen = policy.max_line_length or sys.maxsize
      encoding = 'utf-8' if policy.utf8 else 'us-ascii'
      lines = ['']  # Folded lines to be output
+    if parse_tree.as_ew_allowed:
+        _refold_with_ew(parse_tree, lines, maxlen, encoding, policy=policy)
+    else:
+        _refold_without_ew(parse_tree, lines, maxlen, encoding, policy=policy)
+    return policy.linesep.join(lines) + policy.linesep
+
+def _refold_without_ew(parse_tree, lines, maxlen, encoding, *, policy):
+    parts = list(parse_tree)
+    while parts:
+        part = parts.pop(0)
+        tstr = str(part)
+        try:
+            tstr.encode(encoding)
+        except UnicodeEncodeError:
+            if any(isinstance(x, errors.UndecodableBytesDefect)
+                   for x in part.all_defects):
+                # There is garbage data from parsing a message in binary mode,
+                # just pass it through.  Not good, but the best we can do.
+                pass
+            elif policy.utf8:
+                # If this happens, it's a programmer error.
+                raise
+            else:
+                raise errors.HeaderWriteError(
+                    f"Non-ASCII {part.token_type} '{part}' is invalid"
+                    " under current policy setting (utf8=False)"
+                )
+        if len(tstr) <= maxlen - len(lines[-1]):
+            lines[-1] += tstr
+            continue
+        # This part is too long to fit.  The RFC wants us to break at
+        # "major syntactic breaks", so if we consider this part to be
+        # one, check whether it fits on the next line by itself.
+        if (part.syntactic_break and
+                len(tstr) + 1 <= maxlen):
+            newline = _steal_trailing_WSP_if_exists(lines)
+            if newline or part.startswith_fws():
+                lines.append(newline + tstr)
+                continue
+        if not hasattr(part, 'encode'):
+            # It's not a terminal, try folding the subparts.
+            newparts = list(part)
+            parts = newparts + parts
+            continue
+        # We can't figure out how to wrap it, so give up.
+        newline = _steal_trailing_WSP_if_exists(lines)
+        if newline or part.startswith_fws():
+            lines.append(newline + tstr)
+        else:
+            # We can't fold it onto the next line either...
+            lines[-1] += tstr
+    return
+
+
+def _refold_with_ew(parse_tree, lines, maxlen, encoding, *, policy):
+    """Fold contents of parse_tree into lines, in place, per RFC rules.
+
+    """
      last_word_is_ew = False
      last_ew = None  # if there is an encoded word in the last line of lines,
                      # points to the encoded word's first character
@@ -2855,6 +2904,11 @@ def _refold_parse_tree(parse_tree, *, policy):
          if part is end_ew_not_allowed:
              wrap_as_ew_blocked -= 1
              continue
+        if part.token_type == 'mime-parameters':
+            # Mime parameter folding (using RFC2231) is extra special.
+            _fold_mime_parameters(part, lines, maxlen, encoding)
+            last_word_is_ew = False
+            continue
          tstr = str(part)
          if not want_encoding:
              if part.token_type in ('ptext', 'vtext'):
@@ -2876,14 +2930,11 @@ def _refold_parse_tree(parse_tree, *, policy):
                  charset = 'utf-8'
              want_encoding = True
  
-        if part.token_type == 'mime-parameters':
-            # Mime parameter folding (using RFC2231) is extra special.
-            _fold_mime_parameters(part, lines, maxlen, encoding)
-            last_word_is_ew = False
-            continue
-
          if want_encoding and not wrap_as_ew_blocked:
-            if not part.as_ew_allowed:
+            if any(
+                    not x.as_ew_allowed for x in part
+                    if hasattr(x, 'as_ew_allowed')
+                ):
                  want_encoding = False
                  last_ew = None
                  if part.syntactic_break:
@@ -2964,6 +3015,8 @@ def _refold_parse_tree(parse_tree, *, policy):
                      [ValueTerminal(make_quoted_pairs(p), 'ptext')
                       for p in newparts] +
                      [ValueTerminal('"', 'ptext')])
+                _refold_without_ew(newparts, lines, maxlen, encoding, policy=policy)
+                continue
              if part.token_type == 'comment':
                  newparts = (
                      [ValueTerminal('(', 'ptext')] +
@@ -2991,7 +3044,7 @@ def _refold_parse_tree(parse_tree, *, policy):
              lines[-1] += tstr
          last_word_is_ew = last_word_is_ew and not bool(tstr.strip(_WSP))
  
-    return policy.linesep.join(lines) + policy.linesep
+    return
  
  def _fold_as_ew(to_encode, lines, maxlen, last_ew, ew_combine_allowed, charset, last_word_is_ew):
      """Fold string to_encode into lines as encoded word, combining if allowed.
diff --git a/Lib/test/test_email/test__header_value_parser.py b/Lib/test/test_email/test__header_value_parser.py

index f3c03062572ba5e96d0c26b134562dd51734742d..bc698759614c36104ae0662e3d6f7e84220bb9b0 100644 (file)
--- a/Lib/test/test_email/test__header_value_parser.py
+++ b/Lib/test/test_email/test__header_value_parser.py
@@ -3374,10 +3374,12 @@ class TestFolding(TestEmailBase):
          self._test(token, expected, policy=policy)
  
      def test_encoded_word_with_undecodable_bytes(self):
-        self._test(parser.get_address_list(
-            ' =?utf-8?Q?=E5=AE=A2=E6=88=B6=E6=AD=A3=E8=A6=8F=E4=BA=A4=E7?='
+        self._test(
+            parser.get_address_list(
+                ' =?utf-8?Q?=E5=AE=A2=E6=88=B6=E6=AD=A3=E8=A6=8F=E4=BA=A4=E7?='
+                ' <xyz@abc.com>'
                  )[0],
-            ' =?unknown-8bit?b?5a6i5oi25q2j6KaP5Lqk5w==?=\n',
+            ' =?unknown-8bit?b?5a6i5oi25q2j6KaP5Lqk5w==?= <xyz@abc.com>\n',
              )
  
  
diff --git a/Lib/test/test_email/test_generator.py b/Lib/test/test_email/test_generator.py

index 3c9a86f3e8cf291145c45da53fbf9f96cf4d4d6a..8d912738029f781a22934d835dd48d634e90d037 100644 (file)
--- a/Lib/test/test_email/test_generator.py
+++ b/Lib/test/test_email/test_generator.py
@@ -1,4 +1,5 @@
  import io
+import re
  import textwrap
  import unittest
  import random
@@ -295,6 +296,69 @@ class TestGeneratorBase:
          g.flatten(msg)
          self.assertEqual(s.getvalue(), self.typ(expected))
  
+    def test_non_ascii_addr_spec_raises(self):
+        # non-ascii is not permitted in any part of an addr-spec.  If the
+        # programmer generated it, it's an error.  (See also
+        # test_non_ascii_addr_spec_preserved below.)
+        p = self.policy.clone(utf8=False, max_line_length=20)
+        g = self.genclass(self.ioclass(), policy=p)
+        # XXX The particular part detected here isn't part of a behavioral
+        # spec and may change in the future.
+        cases = [
+            ('wők@example.com', 'wők', 'local-part'),
+            ('wok@exàmple.com', 'exàmple.com', 'domain'),
+            ('wők@exàmple.com', 'wők', 'local-part'),
+            (
+                '"Name, for display" <wők@example.com>',
+                'wők@example.com',
+                'addr-spec',
+                ),
+            (
+                'Näyttönimi <wők@example.com>',
+                'wők@example.com',
+                'addr-spec',
+                ),
+            (
+                '"a lőng quoted string as the local part"@example.com',
+                'a lőng quoted string as the local part',
+                'local-part',
+                ),
+
+        ]
+        for address, badtoken, partname in cases:
+            with self.subTest(address=address):
+                msg = EmailMessage()
+                msg['To'] = address
+                expected_error = (
+                    fr"(?i)(?=.*non-ascii)"
+                    fr"(?=.*{re.escape(badtoken)})"
+                    fr"(?=.*{partname})"
+                    fr"(?=.*policy.*utf8)"
+                )
+                with self.assertRaisesRegex(
+                    email.errors.HeaderWriteError, expected_error
+                ):
+                    g.flatten(msg)
+
+    def test_local_part_quoted_string_wrapped_correctly(self):
+        msg = self.msgmaker(self.typ(textwrap.dedent("""\
+            To: <"a long local part in a quoted string"@example.com>
+            Subject: test
+
+            None
+            """)), policy=self.policy.clone(max_line_length=20))
+        expected = textwrap.dedent("""\
+            To: <"a long local part in a
+             quoted string"@example.com>
+            Subject: test
+
+            None
+            """)
+        s = self.ioclass()
+        g = self.genclass(s, policy=self.policy.clone(max_line_length=30))
+        g.flatten(msg)
+        self.assertEqual(s.getvalue(), self.typ(expected))
+
      def _test_boundary_detection(self, linesep):
          # Generate a boundary token in the same way as _make_boundary
          token = random.randrange(sys.maxsize)
@@ -515,12 +579,12 @@ class TestBytesGenerator(TestGeneratorBase, TestEmailBase):
  
      def test_smtputf8_policy(self):
          msg = EmailMessage()
-        msg['From'] = "Páolo <főo@bar.com>"
+        msg['From'] = "Páolo <főo@bàr.com>"
          msg['To'] = 'Dinsdale'
          msg['Subject'] = 'Nudge nudge, wink, wink \u1F609'
          msg.set_content("oh là là, know what I mean, know what I mean?")
          expected = textwrap.dedent("""\
-            From: Páolo <főo@bar.com>
+            From: Páolo <főo@bàr.com>
              To: Dinsdale
              Subject: Nudge nudge, wink, wink \u1F609
              Content-Type: text/plain; charset="utf-8"
@@ -555,6 +619,37 @@ class TestBytesGenerator(TestGeneratorBase, TestEmailBase):
          g.flatten(msg)
          self.assertEqual(s.getvalue(), expected)
  
+    def test_non_ascii_addr_spec_preserved(self):
+        # A defective non-ASCII addr-spec parsed from the original
+        # message is left unchanged when flattening.
+        # (See also test_non_ascii_addr_spec_raises above.)
+        source = (
+            'To: jörg@example.com, "But a long name still works with refold_source" <jörg@example.com>'
+        ).encode()
+        expected = (
+            b'To: j\xc3\xb6rg@example.com,\n'
+            b' "But a long name still works with refold_source" <j\xc3\xb6rg@example.com>\n'
+            b'\n'
+        )
+        msg = message_from_bytes(source, policy=policy.default)
+        s = io.BytesIO()
+        g = BytesGenerator(s, policy=policy.default)
+        g.flatten(msg)
+        self.assertEqual(s.getvalue(), expected)
+
+    def test_idna_encoding_preserved(self):
+        # Nothing tries to decode a pre-encoded IDNA domain.
+        msg = EmailMessage()
+        msg["To"] = Address(
+            username='jörg',
+            domain='☕.example'.encode('idna').decode()  # IDNA 2003
+        )
+        expected = 'To: jörg@xn--53h.example\n\n'.encode()
+        s = io.BytesIO()
+        g = BytesGenerator(s, policy=policy.default.clone(utf8=True))
+        g.flatten(msg)
+        self.assertEqual(s.getvalue(), expected)
+
  
  if __name__ == '__main__':
      unittest.main()
diff --git a/Misc/NEWS.d/next/Library/2024-07-31-17-22-10.gh-issue-83938.TtUa-c.rst b/Misc/NEWS.d/next/Library/2024-07-31-17-22-10.gh-issue-83938.TtUa-c.rst

new file mode 100644 (file)

index 0000000..7082c72
--- /dev/null
+++ b/Misc/NEWS.d/next/Library/2024-07-31-17-22-10.gh-issue-83938.TtUa-c.rst
@@ -0,0 +1,8 @@
+The :mod:`email` module no longer incorrectly uses :rfc:`2047` encoding for
+a mailbox with non-ASCII characters in its domain. Under a policy with
+:attr:`~email.policy.EmailPolicy.utf8` set ``False``, attempting to serialize
+such a message will now raise an :exc:`~email.errors.HeaderWriteError`.
+Either apply an appropriate IDNA encoding to convert the domain to ASCII before
+serialization, or use :data:`email.policy.SMTPUTF8` (or another policy with
+``utf8=True``) to correctly pass through the internationalized domain name
+as Unicode characters.
diff --git a/Misc/NEWS.d/next/Library/2024-07-31-17-23-06.gh-issue-122476.TtUa-c.rst b/Misc/NEWS.d/next/Library/2024-07-31-17-23-06.gh-issue-122476.TtUa-c.rst

new file mode 100644 (file)

index 0000000..29c076d
--- /dev/null
+++ b/Misc/NEWS.d/next/Library/2024-07-31-17-23-06.gh-issue-122476.TtUa-c.rst
@@ -0,0 +1,7 @@
+The :mod:`email` module no longer incorrectly uses :rfc:`2047` encoding for
+a mailbox with non-ASCII characters in its local-part. Under a policy with
+:attr:`~email.policy.EmailPolicy.utf8` set ``False``, attempting to serialize
+such a message will now raise an :exc:`~email.errors.HeaderWriteError`.
+There is no valid 7-bit encoding for an internationalized local-part. Use
+:data:`email.policy.SMTPUTF8` (or another policy with ``utf8=True``) to
+correctly pass through the local-part as Unicode characters.
author	Mike Edmunds <medmunds@gmail.com>
	Fri, 1 May 2026 17:52:06 +0000 (10:52 -0700)
committer	GitHub <noreply@github.com>
	Fri, 1 May 2026 17:52:06 +0000 (13:52 -0400)
Doc/library/email.policy.rst		patch \| blob \| blame \| history
Doc/whatsnew/3.15.rst		patch \| blob \| blame \| history
Lib/email/_header_value_parser.py		patch \| blob \| blame \| history
Lib/test/test_email/test__header_value_parser.py		patch \| blob \| blame \| history
Lib/test/test_email/test_generator.py		patch \| blob \| blame \| history
Misc/NEWS.d/next/Library/2024-07-31-17-22-10.gh-issue-83938.TtUa-c.rst	[new file with mode: 0644]	patch \| blob
Misc/NEWS.d/next/Library/2024-07-31-17-23-06.gh-issue-122476.TtUa-c.rst	[new file with mode: 0644]	patch \| blob