# Useful constants and functions
#
-WSP = set(' \t')
+_WSP = ' \t'
+WSP = set(_WSP)
CFWS_LEADER = WSP | set('(')
SPECIALS = set(r'()<>@,:;.\"[]')
ATOM_ENDS = SPECIALS | WSP
lines.pop()
return wsp
+
def _refold_parse_tree(parse_tree, *, policy):
"""Return string of contents of parse_tree folded according to RFC rules.
maxlen = policy.max_line_length or sys.maxsize
encoding = 'utf-8' if policy.utf8 else 'us-ascii'
lines = [''] # Folded lines to be output
- leading_whitespace = '' # When we have whitespace between two encoded
- # words, we may need to encode the whitespace
- # at the beginning of the second word.
- last_ew = None # Points to the last encoded character if there's an ew on
- # the line
+ last_word_is_ew = False
+ last_ew = None # if there is an encoded word in the last line of lines,
+ # points to the encoded word's first character
last_charset = None
wrap_as_ew_blocked = 0
want_encoding = False # This is set to True if we need to encode this part
if part.token_type == 'mime-parameters':
# Mime parameter folding (using RFC2231) is extra special.
_fold_mime_parameters(part, lines, maxlen, encoding)
+ last_word_is_ew = False
continue
if want_encoding and not wrap_as_ew_blocked:
# XXX what if encoded_part has no leading FWS?
lines.append(newline)
lines[-1] += encoded_part
+ last_word_is_ew = False
continue
# Either this is not a major syntactic break, so we don't
# want it on a line by itself even if it fits, or it
(last_charset == 'unknown-8bit' or
last_charset == 'utf-8' and charset != 'us-ascii')):
last_ew = None
- last_ew = _fold_as_ew(tstr, lines, maxlen, last_ew,
- part.ew_combine_allowed, charset, leading_whitespace)
- # This whitespace has been added to the lines in _fold_as_ew()
- # so clear it now.
- leading_whitespace = ''
+ last_ew = _fold_as_ew(
+ tstr,
+ lines,
+ maxlen,
+ last_ew,
+ part.ew_combine_allowed,
+ charset,
+ last_word_is_ew,
+ )
+ last_word_is_ew = True
last_charset = charset
want_encoding = False
continue
if len(tstr) <= maxlen - len(lines[-1]):
lines[-1] += tstr
+ last_word_is_ew = last_word_is_ew and not bool(tstr.strip(_WSP))
continue
# This part is too long to fit. The RFC wants us to break at
# "major syntactic breaks", so unless we don't consider this
# to be one, check if it will fit on the next line by itself.
- leading_whitespace = ''
if (part.syntactic_break and
len(tstr) + 1 <= maxlen):
newline = _steal_trailing_WSP_if_exists(lines)
if newline or part.startswith_fws():
- # We're going to fold the data onto a new line here. Due to
- # the way encoded strings handle continuation lines, we need to
- # be prepared to encode any whitespace if the next line turns
- # out to start with an encoded word.
lines.append(newline + tstr)
-
- whitespace_accumulator = []
- for char in lines[-1]:
- if char not in WSP:
- break
- whitespace_accumulator.append(char)
- leading_whitespace = ''.join(whitespace_accumulator)
+ last_word_is_ew = (last_word_is_ew
+ and not bool(lines[-1].strip(_WSP)))
last_ew = None
continue
if not hasattr(part, 'encode'):
else:
# We can't fold it onto the next line either...
lines[-1] += tstr
+ last_word_is_ew = last_word_is_ew and not bool(tstr.strip(_WSP))
return policy.linesep.join(lines) + policy.linesep
-def _fold_as_ew(to_encode, lines, maxlen, last_ew, ew_combine_allowed, charset, leading_whitespace):
+def _fold_as_ew(to_encode, lines, maxlen, last_ew, ew_combine_allowed, charset, last_word_is_ew):
"""Fold string to_encode into lines as encoded word, combining if allowed.
Return the new value for last_ew, or None if ew_combine_allowed is False.
to_encode = str(
get_unstructured(lines[-1][last_ew:] + to_encode))
lines[-1] = lines[-1][:last_ew]
+ elif last_word_is_ew:
+ # If we are following up an encoded word with another encoded word,
+ # any white space between the two will be ignored when decoded.
+ # Therefore, we encode all to-be-displayed whitespace in the second
+ # encoded word.
+ len_without_wsp = len(lines[-1].rstrip(_WSP))
+ leading_whitespace = lines[-1][len_without_wsp:]
+ lines[-1] = (lines[-1][:len_without_wsp]
+ + (' ' if leading_whitespace else ''))
+ to_encode = leading_whitespace + to_encode
elif to_encode[0] in WSP:
# We're joining this to non-encoded text, so don't encode
# the leading blank.
while to_encode:
remaining_space = maxlen - len(lines[-1])
- text_space = remaining_space - chrome_len - len(leading_whitespace)
+ text_space = remaining_space - chrome_len
if text_space <= 0:
- lines.append(' ')
+ newline = _steal_trailing_WSP_if_exists(lines)
+ lines.append(newline or ' ')
+ new_last_ew = len(lines[-1])
continue
- # If we are at the start of a continuation line, prepend whitespace
- # (we only want to do this when the line starts with an encoded word
- # but if we're folding in this helper function, then we know that we
- # are going to be writing out an encoded word.)
- if len(lines) > 1 and len(lines[-1]) == 1 and leading_whitespace:
- encoded_word = _ew.encode(leading_whitespace, charset=encode_as)
- lines[-1] += encoded_word
- leading_whitespace = ''
-
to_encode_word = to_encode[:text_space]
encoded_word = _ew.encode(to_encode_word, charset=encode_as)
excess = len(encoded_word) - remaining_space
excess = len(encoded_word) - remaining_space
lines[-1] += encoded_word
to_encode = to_encode[len(to_encode_word):]
- leading_whitespace = ''
if to_encode:
lines.append(' ')
g.flatten(msg)
self.assertEqual(s.getvalue(), expected)
+ # gh-144156: fold between non-encoded and encoded words don't need to encoded
+ # the separating space
+ def test_defaults_handle_spaces_at_start_of_continuation_line_2(self):
+ source = ("Re: [SOS-1495488] Commande et livraison - Demande de retour - "
+ "bibijolie - 251210-AABBCC - Abo actualités digitales 20 semaines "
+ "d’abonnement à 24 heures, Bilan, Tribune de Genève et tous les titres Tamedia")
+ expected = (
+ b"Subject: "
+ b"Re: [SOS-1495488] Commande et livraison - Demande de retour -\n"
+ b" bibijolie - 251210-AABBCC - Abo =?utf-8?q?actualit=C3=A9s?= digitales 20\n"
+ b" semaines =?utf-8?q?d=E2=80=99abonnement_=C3=A0?= 24 heures, Bilan, Tribune de\n"
+ b" =?utf-8?q?Gen=C3=A8ve?= et tous les titres Tamedia\n\n"
+ )
+ msg = EmailMessage()
+ msg['Subject'] = source
+ s = io.BytesIO()
+ g = BytesGenerator(s)
+ g.flatten(msg)
+ self.assertEqual(s.getvalue(), expected)
+
+ def test_ew_folding_round_trip_1(self):
+ print()
+ source = "aaaaaaaaa фффффффф "
+ msg = EmailMessage()
+ msg['Subject'] = source
+ s = io.BytesIO()
+ g = BytesGenerator(s, maxheaderlen=30)
+ g.flatten(msg)
+ flat = s.getvalue()
+ reparsed = message_from_bytes(flat, policy=policy.default)['Subject']
+ self.assertMultiLineEqual(reparsed, source)
+
+ def test_ew_folding_round_trip_2(self):
+ print()
+ source = "aaa aaaaaaa aaa ффф фффф "
+ msg = EmailMessage()
+ msg['Subject'] = source
+ s = io.BytesIO()
+ g = BytesGenerator(s, maxheaderlen=30)
+ g.flatten(msg)
+ flat = s.getvalue()
+ reparsed = message_from_bytes(flat, policy=policy.default)['Subject']
+ self.assertMultiLineEqual(reparsed, source)
+
def test_cte_type_7bit_handles_unknown_8bit(self):
source = ("Subject: Maintenant je vous présente mon "
"collègue\n\n").encode('utf-8')