From: Barry Warsaw
Date: Mon, 14 Oct 2002 17:26:03 +0000 (+0000)
Subject: Backport bugfix microrelease of email 2.4.3 from cvs trunk.
X-Git-Tag: v2.2.2~7
X-Git-Url: http://git.ipfire.org/gitweb.cgi?a=commitdiff_plain;h=31eac4e4b39a9ced6f0192e06a2b70fa2581d307;p=thirdparty%2FPython%2Fcpython.git

Backport bugfix microrelease of email 2.4.3 from cvs trunk.
---

diff --git a/Lib/email/Charset.py b/Lib/email/Charset.py
index 67cc1ecb42d0..b852245b00af 100644
--- a/Lib/email/Charset.py
+++ b/Lib/email/Charset.py
@@ -43,6 +43,8 @@ CHARSETS = {
     'iso-2022-jp': (BASE64, None, None),
     'koi8-r': (BASE64, BASE64, None),
     'utf-8': (SHORTEST, BASE64, 'utf-8'),
+    # We're making this one up to represent raw unencoded 8-bit
+    '8bit': (None, BASE64, 'utf-8'),
     }
 
 # Aliases for other commonly-used names for character sets.  Map
@@ -53,21 +55,16 @@ ALIASES = {
     'ascii': 'us-ascii',
     }
 
-# Map charsets to their Unicode codec strings.  Note that the Japanese
-# examples included below do not (yet) come with Python!  They are available
-# from http://pseudo.grad.sccs.chukyo-u.ac.jp/~kajiyama/python/
-
-# The Chinese and Korean codecs are available from SourceForge:
-#
-#     http://sourceforge.net/projects/python-codecs/
+# Map charsets to their Unicode codec strings.  Note that Python doesn't come
+# with any Asian codecs by default.  Here's where to get them:
 #
-# although you'll need to check them out of cvs since they haven't been file
-# released yet.  You might also try to use
+#     Japanese -- http://www.asahi-net.or.jp/~rd6t-kjym/python
+#     Korean   -- http://sf.net/projects/koco
+#     Chinese  -- http://sf.net/projects/python-codecs
 #
-#     http://www.freshports.org/port-description.php3?port=6702
-#
-# if you can get logged in.  AFAICT, both the Chinese and Korean codecs are
-# fairly experimental at this point.
+# Note that these codecs have their own lifecycle and may be in varying states
+# of stability and useability.
+
 CODEC_MAP = {
     'euc-jp': 'japanese.euc-jp',
     'iso-2022-jp': 'japanese.iso-2022-jp',
diff --git a/Lib/email/Generator.py b/Lib/email/Generator.py
index 7f05218d4859..58e2f91d622a 100644
--- a/Lib/email/Generator.py
+++ b/Lib/email/Generator.py
@@ -8,7 +8,7 @@ import time
 import re
 import random
 
-from types import ListType
+from types import ListType, StringType
 from cStringIO import StringIO
 
 from email.Header import Header
@@ -35,6 +35,14 @@ SPACE8 = ' ' * 8
 
 fcre = re.compile(r'^From ', re.MULTILINE)
 
+def _is8bitstring(s):
+    if isinstance(s, StringType):
+        try:
+            unicode(s, 'us-ascii')
+        except UnicodeError:
+            return True
+    return False
+
 
 
 class Generator:
@@ -174,6 +182,14 @@
             # No line was actually longer than maxheaderlen characters, so
             # just return the original unchanged.
             return text
+        # If we have raw 8bit data in a byte string, we have no idea what the
+        # encoding is.  I think there is no safe way to split this string.  If
+        # it's ascii-subset, then we could do a normal ascii split, but if
+        # it's multibyte then we could break the string.  There's no way to
+        # know so the least harm seems to be to not split the string and risk
+        # it being too long.
+        if _is8bitstring(text):
+            return text
         # The `text' argument already has the field name prepended, so don't
         # provide it here or the first line will get folded too short.
         h = Header(text, maxlinelen=maxheaderlen,
diff --git a/Lib/email/Header.py b/Lib/email/Header.py
index a40226dc1462..0ceacc7bf309 100644
--- a/Lib/email/Header.py
+++ b/Lib/email/Header.py
@@ -153,6 +153,8 @@ class Header:
         """
         if charset is None:
             charset = USASCII
+        if not isinstance(charset, Charset):
+            charset = Charset(charset)
         self._charset = charset
         self._continuation_ws = continuation_ws
         cws_expanded_len = len(continuation_ws.replace('\t', SPACE8))
@@ -216,31 +218,52 @@
             charset = self._charset
         elif not isinstance(charset, Charset):
             charset = Charset(charset)
-        # Normalize and check the string
-        if isinstance(s, StringType):
-            # Possibly raise UnicodeError if it can't e encoded
-            unicode(s, charset.get_output_charset())
-        elif isinstance(s, UnicodeType):
-            # Convert Unicode to byte string for later concatenation
-            for charset in USASCII, charset, UTF8:
-                try:
-                    s = s.encode(charset.get_output_charset())
-                    break
-                except UnicodeError:
-                    pass
-            else:
-                assert False, 'Could not encode to utf-8'
+        # If the charset is our faux 8bit charset, leave the string unchanged
+        if charset <> '8bit':
+            # We need to test that the string can be converted to unicode and
+            # back to a byte string, given the input and output codecs of the
+            # charset.
+            if isinstance(s, StringType):
+                # Possibly raise UnicodeError if the byte string can't be
+                # converted to a unicode with the input codec of the charset.
+                incodec = charset.input_codec or 'us-ascii'
+                ustr = unicode(s, incodec)
+                # Now make sure that the unicode could be converted back to a
+                # byte string with the output codec, which may be different
+                # than the input codec.  Still, use the original byte string.
+                outcodec = charset.output_codec or 'us-ascii'
+                ustr.encode(outcodec)
+            elif isinstance(s, UnicodeType):
+                # Now we have to be sure the unicode string can be converted
+                # to a byte string with a reasonable output codec.  We want to
+                # use the byte string in the chunk.
+                for charset in USASCII, charset, UTF8:
+                    try:
+                        outcodec = charset.output_codec or 'us-ascii'
+                        s = s.encode(outcodec)
+                        break
+                    except UnicodeError:
+                        pass
+                else:
+                    assert False, 'utf-8 conversion failed'
         self._chunks.append((s, charset))
 
     def _split(self, s, charset, firstline=False):
-        # Split up a header safely for use with encode_chunks.  BAW: this
-        # appears to be a private convenience method.
+        # Split up a header safely for use with encode_chunks.
         splittable = charset.to_splittable(s)
         encoded = charset.from_splittable(splittable)
         elen = charset.encoded_header_len(encoded)
         if elen <= self._maxlinelen:
             return [(encoded, charset)]
+        # If we have undetermined raw 8bit characters sitting in a byte
+        # string, we really don't know what the right thing to do is.  We
+        # can't really split it because it might be multibyte data which we
+        # could break if we split it between pairs.  The least harm seems to
+        # be to not split the header at all, but that means they could go out
+        # longer than maxlinelen.
+        elif charset == '8bit':
+            return [(s, charset)]
         # BAW: I'm not sure what the right test here is.  What we're trying to
         # do is be faithful to RFC 2822's recommendation that ($2.2.3):
         #
@@ -346,27 +369,27 @@
             rtn.append(EMPTYSTRING.join(sublines))
         return [(chunk, charset) for chunk in rtn]
 
-    def _encode_chunks(self):
-        """MIME-encode a header with many different charsets and/or encodings.
-
-        Given a list of pairs (string, charset), return a MIME-encoded string
-        suitable for use in a header field.  Each pair may have different
-        charsets and/or encodings, and the resulting header will accurately
-        reflect each setting.
-
-        Each encoding can be email.Utils.QP (quoted-printable, for ASCII-like
-        character sets like iso-8859-1), email.Utils.BASE64 (Base64, for
-        non-ASCII like character sets like KOI8-R and iso-2022-jp), or None
-        (no encoding).
-
-        Each pair will be represented on a separate line; the resulting string
-        will be in the format:
-
-        "=?charset1?q?Mar=EDa_Gonz=E1lez_Alonso?=\n
-          =?charset2?b?SvxyZ2VuIEL2aW5n?="
-        """
+    def _encode_chunks(self, newchunks):
+        # MIME-encode a header with many different charsets and/or encodings.
+        #
+        # Given a list of pairs (string, charset), return a MIME-encoded
+        # string suitable for use in a header field.  Each pair may have
+        # different charsets and/or encodings, and the resulting header will
+        # accurately reflect each setting.
+        #
+        # Each encoding can be email.Utils.QP (quoted-printable, for
+        # ASCII-like character sets like iso-8859-1), email.Utils.BASE64
+        # (Base64, for non-ASCII like character sets like KOI8-R and
+        # iso-2022-jp), or None (no encoding).
+        #
+        # Each pair will be represented on a separate line; the resulting
+        # string will be in the format:
+        #
+        # =?charset1?q?Mar=EDa_Gonz=E1lez_Alonso?=\n
+        #  =?charset2?b?SvxyZ2VuIEL2aW5n?="
+        #
         chunks = []
-        for header, charset in self._chunks:
+        for header, charset in newchunks:
             if charset is None or charset.header_encoding is None:
                 # There's no encoding for this chunk's charsets
                 _max_append(chunks, header, self._maxlinelen)
@@ -397,5 +420,4 @@
         newchunks = []
         for s, charset in self._chunks:
             newchunks += self._split(s, charset, True)
-        self._chunks = newchunks
-        return self._encode_chunks()
+        return self._encode_chunks(newchunks)
diff --git a/Lib/email/__init__.py b/Lib/email/__init__.py
index 2945b0510f02..b784da8247fa 100644
--- a/Lib/email/__init__.py
+++ b/Lib/email/__init__.py
@@ -4,7 +4,7 @@
 """A package for parsing, handling, and generating email messages.
 """
 
-__version__ = '2.4.2'
+__version__ = '2.4.3'
 
 __all__ = [
     'base64MIME',
diff --git a/Lib/email/test/test_email.py b/Lib/email/test/test_email.py
index daf9e287a71a..176e7740c225 100644
--- a/Lib/email/test/test_email.py
+++ b/Lib/email/test/test_email.py
@@ -703,6 +703,27 @@
 from modemcable093.139-201-24.que.mc.videotron.ca ([24.201.139.93]
 \tid 17k4h5-00034i-00
 \tfor test@mems-exchange.org; Wed, 28 Aug 2002 11:25:20 -0400""")
 
+    def test_long_8bit_header(self):
+        eq = self.ndiffAssertEqual
+        msg = Message()
+        h = Header('Britische Regierung gibt', 'iso-8859-1')
+        h.append('gr\xfcnes Licht f\xfcr Offshore-Windkraftprojekte')
+        msg['Subject'] = h
+        eq(msg.as_string(), """\
+Subject: =?iso-8859-1?q?Britische_Regierung_gibt?=
+ =?iso-8859-1?q?gr=FCnes_Licht_f=FCr_Offshore-Windkraftprojekte?=
+
+""")
+
+    def test_long_8bit_header_no_charset(self):
+        eq = self.ndiffAssertEqual
+        msg = Message()
+        msg['Reply-To'] = 'Britische Regierung gibt gr\xfcnes Licht f\xfcr Offshore-Windkraftprojekte '
+        eq(msg.as_string(), """\
+Reply-To: Britische Regierung gibt gr\xfcnes Licht f\xfcr Offshore-Windkraftprojekte
+
+""")
+
     # Test mangling of "From " lines in the body of a message
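
For illustration, here is a minimal usage sketch of the behaviour this backport adds, mirroring the two new test cases above.  It is not part of the patch; it assumes the email 2.x API shown here (email.Message.Message, email.Header.Header) running under a Python 2.2-era interpreter, and reuses the same sample header strings as the tests.

    # Minimal sketch (assumes the email 2.4.3 API from this patch, Python 2.x).
    from email.Message import Message
    from email.Header import Header

    msg = Message()

    # A charset may now be given as a plain string; Header.__init__ coerces
    # it to a Charset instance before storing it.
    h = Header('Britische Regierung gibt', 'iso-8859-1')
    h.append('gr\xfcnes Licht f\xfcr Offshore-Windkraftprojekte')
    msg['Subject'] = h

    # Raw 8-bit bytes with no declared charset are no longer folded: the
    # Generator's _is8bitstring() check leaves the header on one (possibly
    # overlong) line rather than risk splitting multibyte data.
    msg['Reply-To'] = ('Britische Regierung gibt gr\xfcnes Licht '
                       'f\xfcr Offshore-Windkraftprojekte')

    print msg.as_string()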