gh-128110: Fix rfc2047 whitespace handling in email parser address headers (#130749)

author Mike Edmunds <medmunds@gmail.com>

Mon, 11 May 2026 22:20:09 +0000 (15:20 -0700)

committer GitHub <noreply@github.com>

Mon, 11 May 2026 22:20:09 +0000 (18:20 -0400)
author Mike Edmunds <medmunds@gmail.com>
Mon, 11 May 2026 22:20:09 +0000 (15:20 -0700)
committer GitHub <noreply@github.com>
Mon, 11 May 2026 22:20:09 +0000 (18:20 -0400)
diff --git a/Lib/email/_header_value_parser.py b/Lib/email/_header_value_parser.py

index 9873958f5c2790c1614a8caeacde27ec76d80169..792072ab9f6128a63b8a047a02736fd72933f4e3 100644 (file)
--- a/Lib/email/_header_value_parser.py
+++ b/Lib/email/_header_value_parser.py
@@ -1461,6 +1461,16 @@ def get_phrase(value):
          else:
              try:
                  token, value = get_word(value)
+                if (token[0].token_type == 'encoded-word'
+                    and phrase
+                    and phrase[-1].token_type == 'atom'
+                    and len(phrase[-1]) > 1
+                    and phrase[-1][-2].token_type == 'encoded-word'
+                    and phrase[-1][-1].token_type == 'cfws'
+                    and not phrase[-1][-1].comments
+                ):
+                    # linear ws between ews needs special handling...
+                    phrase[-1][-1] = EWWhiteSpaceTerminal(phrase[-1][-1], 'fws')
              except errors.HeaderParseError:
                  if value[0] in CFWS_LEADER:
                      token, value = get_cfws(value)
diff --git a/Lib/test/test_email/test__header_value_parser.py b/Lib/test/test_email/test__header_value_parser.py

index aded44e85ee3368d00736c77c4da4232fcd751c4..9d9fe418ee4d0670e487323d697dd0b4d26477e0 100644 (file)
--- a/Lib/test/test_email/test__header_value_parser.py
+++ b/Lib/test/test_email/test__header_value_parser.py
@@ -1060,6 +1060,78 @@ class TestParser(TestParserMixin, TestEmailBase):
          with self.assertRaises(errors.HeaderParseError):
              parser.get_phrase(' (foo) ')
  
+    def test_get_phrase_adjacent_ew(self):
+        # "'linear-white-space' that separates a pair of adjacent
+        # 'encoded-word's is ignored" (rfc2047 section 6.2)
+        self._test_get_x(parser.get_phrase, '=?ascii?q?Joi?= \t =?ascii?q?ned?=', 'Joined', 'Joined', [], '')
+
+    def test_get_phrase_adjacent_ew_different_encodings(self):
+        self._test_get_x(
+            parser.get_phrase,
+            '=?utf-8?q?B=C3=A9r?= =?iso-8859-1?q?=E9nice?=', 'Bérénice', 'Bérénice', [], ''
+        )
+
+    def test_get_phrase_adjacent_ew_encoded_spaces(self):
+        self._test_get_x(
+            parser.get_phrase,
+            '=?ascii?q?Encoded?= =?ascii?q?_spaces_?= =?ascii?q?preserved?=',
+            'Encoded spaces preserved',
+            'Encoded spaces preserved',
+            [],
+            ''
+        )
+
+    def test_get_phrase_adjacent_ew_comment_is_not_linear_white_space(self):
+        self._test_get_x(
+            parser.get_phrase,
+            '=?ascii?q?Comment?= (is not) =?ascii?q?linear-white-space?=',
+            'Comment (is not) linear-white-space',
+            'Comment linear-white-space',
+            [],
+            '',
+            comments=['is not'],
+        )
+
+    def test_get_phrase_adjacent_ew_no_error_on_defects(self):
+        self._test_get_x(
+            parser.get_phrase,
+            '=?ascii?q?Def?= =?ascii?q?ect still joins?=',
+            'Defect still joins',
+            'Defect still joins',
+            [errors.InvalidHeaderDefect],  # whitespace inside encoded word
+            ''
+        )
+
+    def test_get_phrase_adjacent_ew_ignore_non_ew(self):
+        self._test_get_x(
+            parser.get_phrase,
+            '=?ascii?q?No?= =?join?= for non-ew',
+            'No =?join?= for non-ew',
+            'No =?join?= for non-ew',
+            [],
+            ''
+        )
+
+    def test_get_phrase_adjacent_ew_ignore_invalid_ew(self):
+        self._test_get_x(
+            parser.get_phrase,
+            '=?ascii?q?No?= =?ascii?rot13?wbva= for invalid ew',
+            'No =?ascii?rot13?wbva= for invalid ew',
+            'No =?ascii?rot13?wbva= for invalid ew',
+            [],
+            ''
+        )
+
+    def test_get_phrase_adjacent_ew_missing_space(self):
+        self._test_get_x(
+            parser.get_phrase,
+            '=?ascii?q?Joi?==?ascii?q?ned?=',
+            'Joined',
+            'Joined',
+            [errors.InvalidHeaderDefect],  # missing trailing whitespace
+            ''
+        )
+
      # get_local_part
  
      def test_get_local_part_simple(self):
@@ -2387,6 +2459,22 @@ class TestParser(TestParserMixin, TestEmailBase):
          self.assertEqual(address[0].token_type,
                           'mailbox')
  
+    def test_get_address_rfc2047_display_name_adjacent_ews(self):
+        address = self._test_get_x(parser.get_address,
+            '=?utf-8?q?B=C3=A9r?= =?utf-8?q?=C3=A9nice?= <foo@example.com>',
+            'Bérénice <foo@example.com>',
+            'Bérénice <foo@example.com>',
+            [],
+            '')
+        self.assertEqual(address.token_type, 'address')
+        self.assertEqual(len(address.mailboxes), 1)
+        self.assertEqual(address.mailboxes,
+                         address.all_mailboxes)
+        self.assertEqual(address.mailboxes[0].display_name,
+                         'Bérénice')
+        self.assertEqual(address[0].token_type,
+                         'mailbox')
+
      def test_get_address_empty_group(self):
          address = self._test_get_x(parser.get_address,
              'Monty Python:;',
diff --git a/Misc/NEWS.d/next/Library/2025-03-01-13-36-02.gh-issue-128110.9wx_G0.rst b/Misc/NEWS.d/next/Library/2025-03-01-13-36-02.gh-issue-128110.9wx_G0.rst

new file mode 100644 (file)

index 0000000..b08b188
--- /dev/null
+++ b/Misc/NEWS.d/next/Library/2025-03-01-13-36-02.gh-issue-128110.9wx_G0.rst
@@ -0,0 +1,5 @@
+Fix bug in the parsing of :mod:`email` address headers that could result in
+extraneous spaces in the decoded text when using a modern email policy.
+Space between pairs of adjacent :rfc:`2047` encoded-words is now ignored, per
+section 6.2 (and consistent with existing parsing of unstructured
+headers like *Subject*).
author	Mike Edmunds <medmunds@gmail.com>
	Mon, 11 May 2026 22:20:09 +0000 (15:20 -0700)
committer	GitHub <noreply@github.com>
	Mon, 11 May 2026 22:20:09 +0000 (18:20 -0400)
Lib/email/_header_value_parser.py		patch \| blob \| blame \| history
Lib/test/test_email/test__header_value_parser.py		patch \| blob \| blame \| history
Misc/NEWS.d/next/Library/2025-03-01-13-36-02.gh-issue-128110.9wx_G0.rst	[new file with mode: 0644]	patch \| blob