.. attribute:: utf8
If ``False``, follow :rfc:`5322`, supporting non-ASCII characters in
- headers by encoding them as "encoded words". If ``True``, follow
- :rfc:`6532` and use ``utf-8`` encoding for headers. Messages
+ headers by encoding them as :rfc:`2047` "encoded words". If ``True``,
+ follow :rfc:`6532` and use ``utf-8`` encoding for headers. Messages
formatted in this way may be passed to SMTP servers that support
the ``SMTPUTF8`` extension (:rfc:`6531`).
+ When ``False``, the generator will raise
+ :exc:`~email.errors.HeaderWriteError` if any header includes non-ASCII
+ characters in a context where :rfc:`2047` does not permit encoded words.
+ This particularly applies to mailboxes ("addr-spec") with non-ASCII
+ characters, which can be created via
+ :class:`~email.headerregistry.Address`. To use a mailbox with a non-ASCII
+ domain name with ``utf8=False``, first encode the domain using the
+ third-party :pypi:`idna` or :pypi:`uts46` module or with
+ :mod:`encodings.idna`. It is not possible to use a non-ASCII username
+ ("local-part") in a mailbox when ``utf8=False``.
+
+ .. versionchanged:: 3.15
+ Can trigger the raising of :exc:`~email.errors.HeaderWriteError`.
+ (Earlier versions incorrectly applied :rfc:`2047` in certain contexts,
+ most notably in addr-specs.)
.. attribute:: refold_source
(Contributed by Eric Froemling in :gh:`149085`.)
+email
+-----
+
+* Email generators now raise an error when an :class:`.EmailMessage` cannot be
+ accurately flattened due to a non-ASCII email address (mailbox) in an address
+ header. Options for supporting Email Address Internationalization (EAI) are
+ discussed in :attr:`.EmailPolicy.utf8`.
+ (Contributed by R David Murray and Mike Edmunds in :gh:`122540`.)
+
+
functools
---------
def startswith_fws(self):
return self[0].startswith_fws()
- @property
- def as_ew_allowed(self):
- """True if all top level tokens of this part may be RFC2047 encoded."""
- return all(part.as_ew_allowed for part in self)
+ as_ew_allowed = True
@property
def comments(self):
class AngleAddr(TokenList):
token_type = 'angle-addr'
+ as_ew_allowed = False
@property
def local_part(self):
class ContentType(ParameterizedHeaderValue):
token_type = 'content-type'
- as_ew_allowed = False
maintype = 'text'
subtype = 'plain'
class ContentDisposition(ParameterizedHeaderValue):
token_type = 'content-disposition'
- as_ew_allowed = False
content_disposition = None
class ContentTransferEncoding(TokenList):
token_type = 'content-transfer-encoding'
- as_ew_allowed = False
cte = '7bit'
class HeaderLabel(TokenList):
token_type = 'header-label'
- as_ew_allowed = False
class MsgID(TokenList):
def _refold_parse_tree(parse_tree, *, policy):
- """Return string of contents of parse_tree folded according to RFC rules.
-
- """
# max_line_length 0/None means no limit, ie: infinitely long.
maxlen = policy.max_line_length or sys.maxsize
encoding = 'utf-8' if policy.utf8 else 'us-ascii'
lines = [''] # Folded lines to be output
+ if parse_tree.as_ew_allowed:
+ _refold_with_ew(parse_tree, lines, maxlen, encoding, policy=policy)
+ else:
+ _refold_without_ew(parse_tree, lines, maxlen, encoding, policy=policy)
+ return policy.linesep.join(lines) + policy.linesep
+
+def _refold_without_ew(parse_tree, lines, maxlen, encoding, *, policy):
+ parts = list(parse_tree)
+ while parts:
+ part = parts.pop(0)
+ tstr = str(part)
+ try:
+ tstr.encode(encoding)
+ except UnicodeEncodeError:
+ if any(isinstance(x, errors.UndecodableBytesDefect)
+ for x in part.all_defects):
+ # There is garbage data from parsing a message in binary mode,
+ # just pass it through. Not good, but the best we can do.
+ pass
+ elif policy.utf8:
+ # If this happens, it's a programmer error.
+ raise
+ else:
+ raise errors.HeaderWriteError(
+ f"Non-ASCII {part.token_type} '{part}' is invalid"
+ " under current policy setting (utf8=False)"
+ )
+ if len(tstr) <= maxlen - len(lines[-1]):
+ lines[-1] += tstr
+ continue
+ # This part is too long to fit. The RFC wants us to break at
+ # "major syntactic breaks", so if this part is one, check whether
+ # it will fit on the next line by itself.
+ if (part.syntactic_break and
+ len(tstr) + 1 <= maxlen):
+ newline = _steal_trailing_WSP_if_exists(lines)
+ if newline or part.startswith_fws():
+ lines.append(newline + tstr)
+ continue
+ if not hasattr(part, 'encode'):
+ # It's not a terminal, try folding the subparts.
+ newparts = list(part)
+ parts = newparts + parts
+ continue
+ # We can't figure out how to wrap it, so give up.
+ newline = _steal_trailing_WSP_if_exists(lines)
+ if newline or part.startswith_fws():
+ lines.append(newline + tstr)
+ else:
+ # We can't fold it onto the next line either...
+ lines[-1] += tstr
+ return
+
+
+def _refold_with_ew(parse_tree, lines, maxlen, encoding, *, policy):
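+ """Fold the contents of parse_tree into lines according to RFC rules.
+
+ RFC 2047 encoded words may be used. The folded text is added to
+ lines in place; nothing is returned.
+ """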
last_word_is_ew = False
last_ew = None # if there is an encoded word in the last line of lines,
# points to the encoded word's first character
if part is end_ew_not_allowed:
wrap_as_ew_blocked -= 1
continue
+ if part.token_type == 'mime-parameters':
+ # Mime parameter folding (using RFC2231) is extra special.
+ _fold_mime_parameters(part, lines, maxlen, encoding)
+ last_word_is_ew = False
+ continue
tstr = str(part)
if not want_encoding:
if part.token_type in ('ptext', 'vtext'):
charset = 'utf-8'
want_encoding = True
- if part.token_type == 'mime-parameters':
- # Mime parameter folding (using RFC2231) is extra special.
- _fold_mime_parameters(part, lines, maxlen, encoding)
- last_word_is_ew = False
- continue
-
if want_encoding and not wrap_as_ew_blocked:
- if not part.as_ew_allowed:
+ if any(
+ not x.as_ew_allowed for x in part
+ if hasattr(x, 'as_ew_allowed')
+ ):
want_encoding = False
last_ew = None
if part.syntactic_break:
[ValueTerminal(make_quoted_pairs(p), 'ptext')
for p in newparts] +
[ValueTerminal('"', 'ptext')])
+ _refold_without_ew(newparts, lines, maxlen, encoding, policy=policy)
+ continue
if part.token_type == 'comment':
newparts = (
[ValueTerminal('(', 'ptext')] +
lines[-1] += tstr
last_word_is_ew = last_word_is_ew and not bool(tstr.strip(_WSP))
- return policy.linesep.join(lines) + policy.linesep
+ return
def _fold_as_ew(to_encode, lines, maxlen, last_ew, ew_combine_allowed, charset, last_word_is_ew):
"""Fold string to_encode into lines as encoded word, combining if allowed.
self._test(token, expected, policy=policy)
def test_encoded_word_with_undecodable_bytes(self):
- self._test(parser.get_address_list(
- ' =?utf-8?Q?=E5=AE=A2=E6=88=B6=E6=AD=A3=E8=A6=8F=E4=BA=A4=E7?='
+ self._test(
+ parser.get_address_list(
+ ' =?utf-8?Q?=E5=AE=A2=E6=88=B6=E6=AD=A3=E8=A6=8F=E4=BA=A4=E7?='
+ ' <xyz@abc.com>'
)[0],
- ' =?unknown-8bit?b?5a6i5oi25q2j6KaP5Lqk5w==?=\n',
+ ' =?unknown-8bit?b?5a6i5oi25q2j6KaP5Lqk5w==?= <xyz@abc.com>\n',
)
import io
+import re
import textwrap
import unittest
import random
g.flatten(msg)
self.assertEqual(s.getvalue(), self.typ(expected))
+ def test_non_ascii_addr_spec_raises(self):
+ # non-ascii is not permitted in any part of an addr-spec. If the
+ # programmer generated it, it's an error. (See also
+ # test_non_ascii_addr_spec_preserved below.)
+ p = self.policy.clone(utf8=False, max_line_length=20)
+ g = self.genclass(self.ioclass(), policy=p)
+ # XXX The particular part detected here isn't part of a behavioral
+ # spec and may change in the future.
+ cases = [
+ ('wők@example.com', 'wők', 'local-part'),
+ ('wok@exàmple.com', 'exàmple.com', 'domain'),
+ ('wők@exàmple.com', 'wők', 'local-part'),
+ (
+ '"Name, for display" <wők@example.com>',
+ 'wők@example.com',
+ 'addr-spec',
+ ),
+ (
+ 'Näyttönimi <wők@example.com>',
+ 'wők@example.com',
+ 'addr-spec',
+ ),
+ (
+ '"a lőng quoted string as the local part"@example.com',
+ 'a lőng quoted string as the local part',
+ 'local-part',
+ ),
+
+ ]
+ for address, badtoken, partname in cases:
+ with self.subTest(address=address):
+ msg = EmailMessage()
+ msg['To'] = address
+ expected_error = (
+ fr"(?i)(?=.*non-ascii)"
+ fr"(?=.*{re.escape(badtoken)})"
+ fr"(?=.*{partname})"
+ fr"(?=.*policy.*utf8)"
+ )
+ with self.assertRaisesRegex(
+ email.errors.HeaderWriteError, expected_error
+ ):
+ g.flatten(msg)
+
+ def test_local_part_quoted_string_wrapped_correctly(self):
+ msg = self.msgmaker(self.typ(textwrap.dedent("""\
+ To: <"a long local part in a quoted string"@example.com>
+ Subject: test
+
+ None
+ """)), policy=self.policy.clone(max_line_length=20))
+ expected = textwrap.dedent("""\
+ To: <"a long local part in a
+ quoted string"@example.com>
+ Subject: test
+
+ None
+ """)
+ s = self.ioclass()
+ g = self.genclass(s, policy=self.policy.clone(max_line_length=30))
+ g.flatten(msg)
+ self.assertEqual(s.getvalue(), self.typ(expected))
+
def _test_boundary_detection(self, linesep):
# Generate a boundary token in the same way as _make_boundary
token = random.randrange(sys.maxsize)
def test_smtputf8_policy(self):
msg = EmailMessage()
- msg['From'] = "Páolo <főo@bar.com>"
+ msg['From'] = "Páolo <főo@bàr.com>"
msg['To'] = 'Dinsdale'
msg['Subject'] = 'Nudge nudge, wink, wink \u1F609'
msg.set_content("oh là là, know what I mean, know what I mean?")
expected = textwrap.dedent("""\
- From: Páolo <főo@bar.com>
+ From: Páolo <főo@bàr.com>
To: Dinsdale
Subject: Nudge nudge, wink, wink \u1F609
Content-Type: text/plain; charset="utf-8"
g.flatten(msg)
self.assertEqual(s.getvalue(), expected)
+ def test_non_ascii_addr_spec_preserved(self):
+ # A defective non-ASCII addr-spec parsed from the original
+ # message is left unchanged when flattening.
+ # (See also test_non_ascii_addr_spec_raises above.)
+ source = (
+ 'To: jörg@example.com, "But a long name still works with refold_source" <jörg@example.com>'
+ ).encode()
+ expected = (
+ b'To: j\xc3\xb6rg@example.com,\n'
+ b' "But a long name still works with refold_source" <j\xc3\xb6rg@example.com>\n'
+ b'\n'
+ )
+ msg = message_from_bytes(source, policy=policy.default)
+ s = io.BytesIO()
+ g = BytesGenerator(s, policy=policy.default)
+ g.flatten(msg)
+ self.assertEqual(s.getvalue(), expected)
+
+ def test_idna_encoding_preserved(self):
+ # Nothing tries to decode a pre-encoded IDNA domain.
+ msg = EmailMessage()
+ msg["To"] = Address(
+ username='jörg',
+ domain='☕.example'.encode('idna').decode() # IDNA 2003
+ )
+ expected = 'To: jörg@xn--53h.example\n\n'.encode()
+ s = io.BytesIO()
+ g = BytesGenerator(s, policy=policy.default.clone(utf8=True))
+ g.flatten(msg)
+ self.assertEqual(s.getvalue(), expected)
+
if __name__ == '__main__':
unittest.main()
--- /dev/null
+The :mod:`email` module no longer incorrectly uses :rfc:`2047` encoding for
+a mailbox with non-ASCII characters in its domain. Under a policy with
+:attr:`~email.policy.EmailPolicy.utf8` set ``False``, attempting to serialize
+such a message will now raise an :exc:`~email.errors.HeaderWriteError`.
+Either apply an appropriate IDNA encoding to convert the domain to ASCII before
+serialization, or use :data:`email.policy.SMTPUTF8` (or another policy with
+``utf8=True``) to correctly pass through the internationalized domain name
+as Unicode characters.
--- /dev/null
+The :mod:`email` module no longer incorrectly uses :rfc:`2047` encoding for
+a mailbox with non-ASCII characters in its local-part. Under a policy with
+:attr:`~email.policy.EmailPolicy.utf8` set ``False``, attempting to serialize
+such a message will now raise an :exc:`~email.errors.HeaderWriteError`.
+There is no valid 7-bit encoding for an internationalized local-part. Use
+:data:`email.policy.SMTPUTF8` (or another policy with ``utf8=True``) to
+correctly pass through the local-part as Unicode characters.