From: Barry Warsaw Date: Wed, 8 Feb 2006 13:33:20 +0000 (+0000) Subject: Patches to address SF bugs 1409538 (Japanese codecs in CODEC_MAP) and 1409455 X-Git-Tag: v2.3.6c1~7 X-Git-Url: http://git.ipfire.org/gitweb.cgi?a=commitdiff_plain;h=f5853f7592fac39e7f6d0bccc0884b666b9edd99;p=thirdparty%2FPython%2Fcpython.git Patches to address SF bugs 1409538 (Japanese codecs in CODEC_MAP) and 1409455 (.set_payload() gives bad .get_payload() results). Specific changes include: Simplify the default CODEC_MAP in Charset.py to not include the Japanese and Korean codecs. The names of the codecs are different depending on whether you're using Python 2.4 or 2.5, which include the codecs by default, or earlier Pythons, which provide the codecs under different names as a third-party library. Now, we attempt to discover which (if either) is available and populate the CODEC_MAP as appropriate. Message.set_charset(): When the message does not already have a Content-Transfer-Encoding header, instead of just adding the header, we also encode the body as defined by the assigned Charset. As before, if the body_encoding is callable, we just call that. If not, then we add a call to body_encode() before setting the header. This way, we guarantee that a message's text payload is always encoded properly. Remove the payload encoding code from Generator._handle_text(). With the above patch, this would cause the body to be doubly encoded. Doing this in the Message class is better than only doing it in the Generator. Added some new tests to ensure everything works correctly. Also changed the way the test_email_codecs.py tests get added (using the same lookup code that the CODEC_MAP adjustments use). This resolves both issues for email 2.5/Python 2.3. I will patch forward to email 3.0 for both Python 2.4 and 2.5. 
--- diff --git a/Lib/email/Charset.py b/Lib/email/Charset.py index dd328e050152..fb4e5a9b185d 100644 --- a/Lib/email/Charset.py +++ b/Lib/email/Charset.py @@ -1,5 +1,5 @@ -# Copyright (C) 2001,2002 Python Software Foundation -# Author: che@debian.org (Ben Gertzfield), barry@zope.com (Barry Warsaw) +# Copyright (C) 2001-2006 Python Software Foundation +# Author: che@debian.org (Ben Gertzfield), barry@python.org (Barry Warsaw) from types import UnicodeType from email.Encoders import encode_7or8bit @@ -99,20 +99,13 @@ ALIASES = { # of stability and useability. CODEC_MAP = { - 'euc-jp': 'japanese.euc-jp', - 'iso-2022-jp': 'japanese.iso-2022-jp', - 'shift_jis': 'japanese.shift_jis', - 'euc-kr': 'korean.euc-kr', - 'ks_c_5601-1987': 'korean.cp949', - 'iso-2022-kr': 'korean.iso-2022-kr', - 'johab': 'korean.johab', - 'gb2132': 'eucgb2312_cn', - 'big5': 'big5_tw', - 'utf-8': 'utf-8', + 'gb2132': 'eucgb2312_cn', + 'big5': 'big5_tw', + 'utf-8': 'utf-8', # Hack: We don't want *any* conversion for stuff marked us-ascii, as all # sorts of garbage might be sent to us in the guise of 7-bit us-ascii. # Let that stuff pass through without conversion to/from Unicode. - 'us-ascii': None, + 'us-ascii': None, } @@ -165,6 +158,26 @@ def add_codec(charset, codecname): CODEC_MAP[charset] = codecname +def _find_asian_codec(charset, language): + try: + unicode('foo', charset) + return charset + except LookupError: + try: + codec = language + '.' + charset + unicode('foo', codec) + return codec + except LookupError: + return None + + +for _charset in ('euc-jp', 'iso-2022-jp', 'shift_jis'): + add_codec(_charset, _find_asian_codec(_charset, 'japanese') or _charset) + +for _charset in ('euc-kr', 'cp949', 'iso-2022-kr', 'johab'): + add_codec(_charset, _find_asian_codec(_charset, 'korean') or _charset) + + class Charset: """Map character sets to their email properties. 
@@ -229,7 +242,7 @@ class Charset: self.input_codec = CODEC_MAP.get(self.input_charset, self.input_charset) self.output_codec = CODEC_MAP.get(self.output_charset, - self.input_codec) + self.input_codec) def __str__(self): return self.input_charset.lower() diff --git a/Lib/email/Generator.py b/Lib/email/Generator.py index 56d44ea52176..bbc19cd2c6dd 100644 --- a/Lib/email/Generator.py +++ b/Lib/email/Generator.py @@ -1,8 +1,7 @@ -# Copyright (C) 2001,2002 Python Software Foundation -# Author: barry@zope.com (Barry Warsaw) +# Copyright (C) 2001-2006 Python Software Foundation +# Author: barry@python.org (Barry Warsaw) -"""Classes to generate plain text from a message object tree. -""" +"""Classes to generate plain text from a message object tree.""" import re import sys @@ -192,9 +191,6 @@ class Generator: payload = msg.get_payload() if payload is None: return - cset = msg.get_charset() - if cset is not None: - payload = cset.body_encode(payload) if not _isstring(payload): raise TypeError, 'string payload expected: %s' % type(payload) if self._mangle_from_: diff --git a/Lib/email/Message.py b/Lib/email/Message.py index 10c2921ea0a2..bb8718fe23b0 100644 --- a/Lib/email/Message.py +++ b/Lib/email/Message.py @@ -272,11 +272,14 @@ class Message: charset=charset.get_output_charset()) else: self.set_param('charset', charset.get_output_charset()) + if str(charset) <> charset.get_output_charset(): + self._payload = charset.body_encode(self._payload) if not self.has_key('Content-Transfer-Encoding'): cte = charset.get_body_encoding() if callable(cte): cte(self) else: + self._payload = charset.body_encode(self._payload) self.add_header('Content-Transfer-Encoding', cte) def get_charset(self): diff --git a/Lib/email/test/test_email.py b/Lib/email/test/test_email.py index 1925889f67ed..edb65e32a884 100644 --- a/Lib/email/test/test_email.py +++ b/Lib/email/test/test_email.py @@ -2073,7 +2073,8 @@ class TestMiscellaneous(unittest.TestCase): charset = Charset(charsets[0]) 
eq(charset.get_body_encoding(), 'base64') msg.set_payload('hello world', charset=charset) - eq(msg.get_payload(), 'hello world') + eq(msg.get_payload(), 'aGVsbG8gd29ybGQ=\n') + eq(msg.get_payload(decode=True), 'hello world') eq(msg['content-transfer-encoding'], 'base64') # Try another one msg = Message() diff --git a/Lib/email/test/test_email_codecs.py b/Lib/email/test/test_email_codecs.py index 99a3227c38c3..afba94a036f7 100644 --- a/Lib/email/test/test_email_codecs.py +++ b/Lib/email/test/test_email_codecs.py @@ -1,17 +1,16 @@ -# Copyright (C) 2002 Python Software Foundation +# Copyright (C) 2002-2006 Python Software Foundation # email package unit tests for (optional) Asian codecs import unittest from test.test_support import TestSkipped, run_unittest from email.test.test_email import TestEmailBase -from email.Charset import Charset +from email.Charset import Charset, _find_asian_codec from email.Header import Header, decode_header +from email.Message import Message # See if we have the Japanese codecs package installed -try: - unicode('foo', 'japanese.iso-2022-jp') -except LookupError: +if not _find_asian_codec('iso-2022-jp', 'japanese'): raise TestSkipped, 'Optional Japanese codecs not installed' @@ -49,6 +48,14 @@ Hello World! =?iso-2022-jp?b?GyRCJU8lbSE8JW8hPCVrJUkhKhsoQg==?= # TK: full decode comparison eq(h.__unicode__().encode('euc-jp'), long) + def test_payload_encoding(self): + jhello = '\xa5\xcf\xa5\xed\xa1\xbc\xa5\xef\xa1\xbc\xa5\xeb\xa5\xc9\xa1\xaa' + jcode = 'euc-jp' + msg = Message() + msg.set_payload(jhello, jcode) + ustr = unicode(msg.get_payload(), msg.get_content_charset()) + self.assertEqual(jhello, ustr.encode(jcode)) + def suite():