[3.11] gh-121284: Fix email address header folding with parsed encoded-word (GH-12275...

author Miss Islington (bot) <31488909+miss-islington@users.noreply.github.com>

Thu, 3 Apr 2025 16:27:02 +0000 (18:27 +0200)

committer GitHub <noreply@github.com>

Thu, 3 Apr 2025 16:27:02 +0000 (18:27 +0200)
author Miss Islington (bot) <31488909+miss-islington@users.noreply.github.com>
Thu, 3 Apr 2025 16:27:02 +0000 (18:27 +0200)
committer GitHub <noreply@github.com>
Thu, 3 Apr 2025 16:27:02 +0000 (18:27 +0200)
diff --git a/Lib/email/_header_value_parser.py b/Lib/email/_header_value_parser.py

index 045a01bcf1e0d70ca075bb1bb42767e8fd55b6c7..0183a1508b1219be3eef49e2de7e3dfc3f838205 100644 (file)
--- a/Lib/email/_header_value_parser.py
+++ b/Lib/email/_header_value_parser.py
@@ -1047,7 +1047,7 @@ def get_fws(value):
      fws = WhiteSpaceTerminal(value[:len(value)-len(newvalue)], 'fws')
      return fws, newvalue
  
-def get_encoded_word(value):
+def get_encoded_word(value, terminal_type='vtext'):
      """ encoded-word = "=?" charset "?" encoding "?" encoded-text "?="
  
      """
@@ -1086,7 +1086,7 @@ def get_encoded_word(value):
              ew.append(token)
              continue
          chars, *remainder = _wsp_splitter(text, 1)
-        vtext = ValueTerminal(chars, 'vtext')
+        vtext = ValueTerminal(chars, terminal_type)
          _validate_xtext(vtext)
          ew.append(vtext)
          text = ''.join(remainder)
@@ -1128,7 +1128,7 @@ def get_unstructured(value):
          valid_ew = True
          if value.startswith('=?'):
              try:
-                token, value = get_encoded_word(value)
+                token, value = get_encoded_word(value, 'utext')
              except _InvalidEwError:
                  valid_ew = False
              except errors.HeaderParseError:
@@ -1157,7 +1157,7 @@ def get_unstructured(value):
          # the parser to go in an infinite loop.
          if valid_ew and rfc2047_matcher.search(tok):
              tok, *remainder = value.partition('=?')
-        vtext = ValueTerminal(tok, 'vtext')
+        vtext = ValueTerminal(tok, 'utext')
          _validate_xtext(vtext)
          unstructured.append(vtext)
          value = ''.join(remainder)
@@ -2792,7 +2792,7 @@ def _refold_parse_tree(parse_tree, *, policy):
              continue
          tstr = str(part)
          if not want_encoding:
-            if part.token_type == 'ptext':
+            if part.token_type in ('ptext', 'vtext'):
                  # Encode if tstr contains special characters.
                  want_encoding = not SPECIALSNL.isdisjoint(tstr)
              else:
diff --git a/Lib/test/test_email/test__header_value_parser.py b/Lib/test/test_email/test__header_value_parser.py

index cd6495490e3d553cc482e63f1d1d4d2ef4568bff..6025b34ac4a0f82a126604f5c8fdc9b637a406cd 100644 (file)
--- a/Lib/test/test_email/test__header_value_parser.py
+++ b/Lib/test/test_email/test__header_value_parser.py
@@ -2985,6 +2985,31 @@ class TestFolding(TestEmailBase):
              '=?utf-8?q?H=C3=BCbsch?= Kaktus <beautiful@example.com>,\n'
                  ' =?utf-8?q?bei=C3=9Ft_bei=C3=9Ft?= <biter@example.com>\n')
  
+    def test_address_list_with_specials_in_encoded_word(self):
+        # An encoded-word parsed from a structured header must remain
+        # encoded when it contains specials. Regression for gh-121284.
+        policy = self.policy.clone(max_line_length=40)
+        cases = [
+            # (to, folded)
+            ('=?utf-8?q?A_v=C3=A9ry_long_name_with=2C_comma?= <to@example.com>',
+             'A =?utf-8?q?v=C3=A9ry_long_name_with?=\n'
+             ' =?utf-8?q?=2C?= comma <to@example.com>\n'),
+            ('=?utf-8?q?This_long_name_does_not_need_encoded=2Dword?= <to@example.com>',
+             'This long name does not need\n'
+             ' encoded-word <to@example.com>\n'),
+            ('"A véry long name with, comma" <to@example.com>',
+             # (This isn't the best fold point, but it's not invalid.)
+             'A =?utf-8?q?v=C3=A9ry_long_name_with?=\n'
+             ' =?utf-8?q?=2C?= comma <to@example.com>\n'),
+            ('"A véry long name containing a, comma" <to@example.com>',
+             'A =?utf-8?q?v=C3=A9ry?= long name\n'
+             ' containing =?utf-8?q?a=2C?= comma\n'
+             ' <to@example.com>\n'),
+        ]
+        for (to, folded) in cases:
+            with self.subTest(to=to):
+                self._test(parser.get_address_list(to)[0], folded, policy=policy)
+
      def test_address_list_with_list_separator_after_fold(self):
          a = 'x' * 66 + '@example.com'
          to = f'{a}, "Hübsch Kaktus" <beautiful@example.com>'
diff --git a/Misc/NEWS.d/next/Security/2024-08-06-12-27-34.gh-issue-121284.8rwPxe.rst b/Misc/NEWS.d/next/Security/2024-08-06-12-27-34.gh-issue-121284.8rwPxe.rst

new file mode 100644 (file)

index 0000000..923e911
--- /dev/null
+++ b/Misc/NEWS.d/next/Security/2024-08-06-12-27-34.gh-issue-121284.8rwPxe.rst
@@ -0,0 +1,7 @@
+Fix a bug in the folding of RFC 2047 encoded-words when flattening an email message
+using a modern email policy. Previously, when an encoded-word was too long
+for a line, it would be decoded, split across lines, and re-encoded. But commas
+and other special characters in the original text could be left unencoded and
+unquoted. This could theoretically be used to spoof header lines using
+a carefully constructed encoded-word if the resulting rendered email was
+transmitted or re-parsed.
author	Miss Islington (bot) <31488909+miss-islington@users.noreply.github.com>
	Thu, 3 Apr 2025 16:27:02 +0000 (18:27 +0200)
committer	GitHub <noreply@github.com>
	Thu, 3 Apr 2025 16:27:02 +0000 (18:27 +0200)
Lib/email/_header_value_parser.py		patch \| blob \| blame \| history
Lib/test/test_email/test__header_value_parser.py		patch \| blob \| blame \| history
Misc/NEWS.d/next/Security/2024-08-06-12-27-34.gh-issue-121284.8rwPxe.rst	[new file with mode: 0644]	patch \| blob