gh-144156: Fix email header folding concatenating encoded words (#144692)

author Robsdedude <dev@rouvenbauer.de>

Thu, 19 Feb 2026 18:29:05 +0000 (19:29 +0100)

committer GitHub <noreply@github.com>

Thu, 19 Feb 2026 18:29:05 +0000 (13:29 -0500)
author Robsdedude <dev@rouvenbauer.de>
Thu, 19 Feb 2026 18:29:05 +0000 (19:29 +0100)
committer GitHub <noreply@github.com>
Thu, 19 Feb 2026 18:29:05 +0000 (13:29 -0500)
diff --git a/Lib/email/_header_value_parser.py b/Lib/email/_header_value_parser.py

index 172f9ef9e5f096be2d6c33c7facf5fc060168068..4c5394ab6353ac492c7aa8de6d50d6dfbaecba10 100644 (file)
--- a/Lib/email/_header_value_parser.py
+++ b/Lib/email/_header_value_parser.py
@@ -80,7 +80,8 @@ from email import utils
  # Useful constants and functions
  #
  
-WSP = set(' \t')
+_WSP = ' \t'
+WSP = set(_WSP)
  CFWS_LEADER = WSP | set('(')
  SPECIALS = set(r'()<>@,:;.\"[]')
  ATOM_ENDS = SPECIALS | WSP
@@ -2835,6 +2836,7 @@ def _steal_trailing_WSP_if_exists(lines):
              lines.pop()
      return wsp
  
+
  def _refold_parse_tree(parse_tree, *, policy):
      """Return string of contents of parse_tree folded according to RFC rules.
  
@@ -2843,11 +2845,9 @@ def _refold_parse_tree(parse_tree, *, policy):
      maxlen = policy.max_line_length or sys.maxsize
      encoding = 'utf-8' if policy.utf8 else 'us-ascii'
      lines = ['']  # Folded lines to be output
-    leading_whitespace = ''  # When we have whitespace between two encoded
-                             # words, we may need to encode the whitespace
-                             # at the beginning of the second word.
-    last_ew = None  # Points to the last encoded character if there's an ew on
-                    # the line
+    last_word_is_ew = False
+    last_ew = None  # if there is an encoded word in the last line of lines,
+                    # points to the encoded word's first character
      last_charset = None
      wrap_as_ew_blocked = 0
      want_encoding = False  # This is set to True if we need to encode this part
@@ -2882,6 +2882,7 @@ def _refold_parse_tree(parse_tree, *, policy):
          if part.token_type == 'mime-parameters':
              # Mime parameter folding (using RFC2231) is extra special.
              _fold_mime_parameters(part, lines, maxlen, encoding)
+            last_word_is_ew = False
              continue
  
          if want_encoding and not wrap_as_ew_blocked:
@@ -2898,6 +2899,7 @@ def _refold_parse_tree(parse_tree, *, policy):
                              # XXX what if encoded_part has no leading FWS?
                              lines.append(newline)
                          lines[-1] += encoded_part
+                        last_word_is_ew = False
                          continue
                  # Either this is not a major syntactic break, so we don't
                  # want it on a line by itself even if it fits, or it
@@ -2916,11 +2918,16 @@ def _refold_parse_tree(parse_tree, *, policy):
                      (last_charset == 'unknown-8bit' or
                       last_charset == 'utf-8' and charset != 'us-ascii')):
                      last_ew = None
-                last_ew = _fold_as_ew(tstr, lines, maxlen, last_ew,
-                                      part.ew_combine_allowed, charset, leading_whitespace)
-                # This whitespace has been added to the lines in _fold_as_ew()
-                # so clear it now.
-                leading_whitespace = ''
+                last_ew = _fold_as_ew(
+                    tstr,
+                    lines,
+                    maxlen,
+                    last_ew,
+                    part.ew_combine_allowed,
+                    charset,
+                    last_word_is_ew,
+                )
+                last_word_is_ew = True
                  last_charset = charset
                  want_encoding = False
                  continue
@@ -2933,28 +2940,19 @@ def _refold_parse_tree(parse_tree, *, policy):
  
          if len(tstr) <= maxlen - len(lines[-1]):
              lines[-1] += tstr
+            last_word_is_ew = last_word_is_ew and not bool(tstr.strip(_WSP))
              continue
  
          # This part is too long to fit.  The RFC wants us to break at
          # "major syntactic breaks", so unless we don't consider this
          # to be one, check if it will fit on the next line by itself.
-        leading_whitespace = ''
          if (part.syntactic_break and
                  len(tstr) + 1 <= maxlen):
              newline = _steal_trailing_WSP_if_exists(lines)
              if newline or part.startswith_fws():
-                # We're going to fold the data onto a new line here.  Due to
-                # the way encoded strings handle continuation lines, we need to
-                # be prepared to encode any whitespace if the next line turns
-                # out to start with an encoded word.
                  lines.append(newline + tstr)
-
-                whitespace_accumulator = []
-                for char in lines[-1]:
-                    if char not in WSP:
-                        break
-                    whitespace_accumulator.append(char)
-                leading_whitespace = ''.join(whitespace_accumulator)
+                last_word_is_ew = (last_word_is_ew
+                                   and not bool(lines[-1].strip(_WSP)))
                  last_ew = None
                  continue
          if not hasattr(part, 'encode'):
@@ -2994,10 +2992,11 @@ def _refold_parse_tree(parse_tree, *, policy):
          else:
              # We can't fold it onto the next line either...
              lines[-1] += tstr
+        last_word_is_ew = last_word_is_ew and not bool(tstr.strip(_WSP))
  
      return policy.linesep.join(lines) + policy.linesep
  
-def _fold_as_ew(to_encode, lines, maxlen, last_ew, ew_combine_allowed, charset, leading_whitespace):
+def _fold_as_ew(to_encode, lines, maxlen, last_ew, ew_combine_allowed, charset, last_word_is_ew):
      """Fold string to_encode into lines as encoded word, combining if allowed.
      Return the new value for last_ew, or None if ew_combine_allowed is False.
  
@@ -3012,6 +3011,16 @@ def _fold_as_ew(to_encode, lines, maxlen, last_ew, ew_combine_allowed, charset,
          to_encode = str(
              get_unstructured(lines[-1][last_ew:] + to_encode))
          lines[-1] = lines[-1][:last_ew]
+    elif last_word_is_ew:
+        # If we are following up an encoded word with another encoded word,
+        # any white space between the two will be ignored when decoded.
+        # Therefore, we encode all to-be-displayed whitespace in the second
+        # encoded word.
+        len_without_wsp = len(lines[-1].rstrip(_WSP))
+        leading_whitespace = lines[-1][len_without_wsp:]
+        lines[-1] = (lines[-1][:len_without_wsp]
+                     + (' ' if leading_whitespace else ''))
+        to_encode = leading_whitespace + to_encode
      elif to_encode[0] in WSP:
          # We're joining this to non-encoded text, so don't encode
          # the leading blank.
@@ -3040,20 +3049,13 @@ def _fold_as_ew(to_encode, lines, maxlen, last_ew, ew_combine_allowed, charset,
  
      while to_encode:
          remaining_space = maxlen - len(lines[-1])
-        text_space = remaining_space - chrome_len - len(leading_whitespace)
+        text_space = remaining_space - chrome_len
          if text_space <= 0:
-            lines.append(' ')
+            newline = _steal_trailing_WSP_if_exists(lines)
+            lines.append(newline or ' ')
+            new_last_ew = len(lines[-1])
              continue
  
-        # If we are at the start of a continuation line, prepend whitespace
-        # (we only want to do this when the line starts with an encoded word
-        # but if we're folding in this helper function, then we know that we
-        # are going to be writing out an encoded word.)
-        if len(lines) > 1 and len(lines[-1]) == 1 and leading_whitespace:
-            encoded_word = _ew.encode(leading_whitespace, charset=encode_as)
-            lines[-1] += encoded_word
-            leading_whitespace = ''
-
          to_encode_word = to_encode[:text_space]
          encoded_word = _ew.encode(to_encode_word, charset=encode_as)
          excess = len(encoded_word) - remaining_space
@@ -3065,7 +3067,6 @@ def _fold_as_ew(to_encode, lines, maxlen, last_ew, ew_combine_allowed, charset,
              excess = len(encoded_word) - remaining_space
          lines[-1] += encoded_word
          to_encode = to_encode[len(to_encode_word):]
-        leading_whitespace = ''
  
          if to_encode:
              lines.append(' ')
diff --git a/Lib/test/test_email/test_generator.py b/Lib/test/test_email/test_generator.py

index 3ca79edf6a65d9c74cf8273e114954d29c84be5e..c2d7d09d591e86138b7d7c1d8913853cae8a7c14 100644 (file)
--- a/Lib/test/test_email/test_generator.py
+++ b/Lib/test/test_email/test_generator.py
@@ -393,6 +393,50 @@ class TestBytesGenerator(TestGeneratorBase, TestEmailBase):
          g.flatten(msg)
          self.assertEqual(s.getvalue(), expected)
  
+    # gh-144156: a fold between a non-encoded and an encoded word doesn't need
+    #            to encode the separating space
+    def test_defaults_handle_spaces_at_start_of_continuation_line_2(self):
+        source = ("Re: [SOS-1495488] Commande et livraison - Demande de retour - "
+                  "bibijolie - 251210-AABBCC - Abo actualités digitales 20 semaines "
+                  "d’abonnement à 24 heures, Bilan, Tribune de Genève et tous les titres Tamedia")
+        expected = (
+            b"Subject: "
+            b"Re: [SOS-1495488] Commande et livraison - Demande de retour -\n"
+            b" bibijolie - 251210-AABBCC - Abo =?utf-8?q?actualit=C3=A9s?= digitales 20\n"
+            b" semaines =?utf-8?q?d=E2=80=99abonnement_=C3=A0?= 24 heures, Bilan, Tribune de\n"
+            b" =?utf-8?q?Gen=C3=A8ve?= et tous les titres Tamedia\n\n"
+        )
+        msg = EmailMessage()
+        msg['Subject'] = source
+        s = io.BytesIO()
+        g = BytesGenerator(s)
+        g.flatten(msg)
+        self.assertEqual(s.getvalue(), expected)
+
+    def test_ew_folding_round_trip_1(self):
+        print()
+        source = "aaaaaaaaa фффффффф "
+        msg = EmailMessage()
+        msg['Subject'] = source
+        s = io.BytesIO()
+        g = BytesGenerator(s, maxheaderlen=30)
+        g.flatten(msg)
+        flat = s.getvalue()
+        reparsed = message_from_bytes(flat, policy=policy.default)['Subject']
+        self.assertMultiLineEqual(reparsed, source)
+
+    def test_ew_folding_round_trip_2(self):
+        print()
+        source = "aaa aaaaaaa   aaa ффф фффф  "
+        msg = EmailMessage()
+        msg['Subject'] = source
+        s = io.BytesIO()
+        g = BytesGenerator(s, maxheaderlen=30)
+        g.flatten(msg)
+        flat = s.getvalue()
+        reparsed = message_from_bytes(flat, policy=policy.default)['Subject']
+        self.assertMultiLineEqual(reparsed, source)
+
      def test_cte_type_7bit_handles_unknown_8bit(self):
          source = ("Subject: Maintenant je vous présente mon "
                   "collègue\n\n").encode('utf-8')
diff --git a/Lib/test/test_email/test_headerregistry.py b/Lib/test/test_email/test_headerregistry.py

index 95c6afbee41ef5f4310a5acd5c499ba0e0f48499..c9c639515972442af3d4129303c78a2b20eb2153 100644 (file)
--- a/Lib/test/test_email/test_headerregistry.py
+++ b/Lib/test/test_email/test_headerregistry.py
@@ -1711,7 +1711,7 @@ class TestFolding(TestHeaderBase):
              'singlewordthatwontfit')
          self.assertEqual(
              h.fold(policy=policy.default.clone(max_line_length=20)),
-            'Subject: \n'
+            'Subject:\n'
              ' =?utf-8?q?thisisa?=\n'
              ' =?utf-8?q?verylon?=\n'
              ' =?utf-8?q?glineco?=\n'
@@ -1727,7 +1727,7 @@ class TestFolding(TestHeaderBase):
              'singlewordthatwontfit plusanotherverylongwordthatwontfit')
          self.assertEqual(
              h.fold(policy=policy.default.clone(max_line_length=20)),
-            'Subject: \n'
+            'Subject:\n'
              ' =?utf-8?q?thisisa?=\n'
              ' =?utf-8?q?verylon?=\n'
              ' =?utf-8?q?glineco?=\n'
diff --git a/Lib/test/test_email/test_policy.py b/Lib/test/test_email/test_policy.py

index 71ec0febb0fd862d7abaa09fdb1b16f4d7731e73..90e8e5580295f9b2f26b7ac2154fcd86a1ef174f 100644 (file)
--- a/Lib/test/test_email/test_policy.py
+++ b/Lib/test/test_email/test_policy.py
@@ -273,7 +273,7 @@ class PolicyAPITests(unittest.TestCase):
          actual = policy.fold('Subject', 'ą' * 12)
          self.assertEqual(
              actual,
-            'Subject: \n' +
+            'Subject:\n' +
              12 * ' =?utf-8?q?=C4=85?=\n')
  
      def test_short_maxlen_error(self):
diff --git a/Misc/NEWS.d/next/Core_and_Builtins/2026-02-10-22-05-51.gh-issue-144156.UbrC7F.rst b/Misc/NEWS.d/next/Core_and_Builtins/2026-02-10-22-05-51.gh-issue-144156.UbrC7F.rst

new file mode 100644 (file)

index 0000000..c4a0655
--- /dev/null
+++ b/Misc/NEWS.d/next/Core_and_Builtins/2026-02-10-22-05-51.gh-issue-144156.UbrC7F.rst
@@ -0,0 +1 @@
+Fix the folding of headers by the :mod:`email` library when :rfc:`2047` encoded words are used.  Now whitespace is correctly preserved and also correctly added between adjacent encoded words.  The latter property was broken by the fix for gh-92081, which mostly fixed previous failures to preserve whitespace.
author	Robsdedude <dev@rouvenbauer.de>
	Thu, 19 Feb 2026 18:29:05 +0000 (19:29 +0100)
committer	GitHub <noreply@github.com>
	Thu, 19 Feb 2026 18:29:05 +0000 (13:29 -0500)
Lib/email/_header_value_parser.py		patch \| blob \| blame \| history
Lib/test/test_email/test_generator.py		patch \| blob \| blame \| history
Lib/test/test_email/test_headerregistry.py		patch \| blob \| blame \| history
Lib/test/test_email/test_policy.py		patch \| blob \| blame \| history
Misc/NEWS.d/next/Core_and_Builtins/2026-02-10-22-05-51.gh-issue-144156.UbrC7F.rst	[new file with mode: 0644]	patch \| blob