From: Barry Warsaw <barry@python.org>
Date: Wed, 8 Feb 2006 13:33:20 +0000 (+0000)
Subject: Patches to address SF bugs 1409538 (Japanese codecs in CODEC_MAP) and 1409455
X-Git-Tag: v2.3.6c1~7
X-Git-Url: http://git.ipfire.org/gitweb.cgi?a=commitdiff_plain;h=f5853f7592fac39e7f6d0bccc0884b666b9edd99;p=thirdparty%2FPython%2Fcpython.git

Patches to address SF bugs 1409538 (Japanese codecs in CODEC_MAP) and 1409455
(.set_payload() gives bad .get_payload() results).  Specific changes include:

Simplify the default CODEC_MAP in Charset.py to not include the Japanese and
Korean codecs.  The names of the codecs differ depending on whether you're
using Python 2.4 or 2.5, which include the codecs by default, or an earlier
Python, where the codecs are provided under different names by a third-party
library.  Now, we attempt to discover which (if either) is available and
populate the CODEC_MAP as appropriate.

Message.set_charset(): When the message does not already have a
Content-Transfer-Encoding header, instead of just adding the header, we also
encode the body as defined by the assigned Charset.  As before, if the
body_encoding is callable, we just call that.  If not, then we add a call to
body_encode() before setting the header.  This way, we guarantee that a
message's text payload is always encoded properly.

Remove the payload encoding code from Generator._handle_text().  With the
above patch, this would cause the body to be doubly encoded.  Doing this in
the Message class is better than only doing it in the Generator.

Added some new tests to ensure everything works correctly.  Also changed the
way the test_email_codecs.py tests get added (using the same lookup code that
the CODEC_MAP adjustments use).

This resolves both issues for email 2.5/Python 2.3.  I will patch forward to
email 3.0 for both Python 2.4 and 2.5.
---

diff --git a/Lib/email/Charset.py b/Lib/email/Charset.py
index dd328e050152..fb4e5a9b185d 100644
--- a/Lib/email/Charset.py
+++ b/Lib/email/Charset.py
@@ -1,5 +1,5 @@
-# Copyright (C) 2001,2002 Python Software Foundation
-# Author: che@debian.org (Ben Gertzfield), barry@zope.com (Barry Warsaw)
+# Copyright (C) 2001-2006 Python Software Foundation
+# Author: che@debian.org (Ben Gertzfield), barry@python.org (Barry Warsaw)
 
 from types import UnicodeType
 from email.Encoders import encode_7or8bit
@@ -99,20 +99,13 @@ ALIASES = {
 # of stability and useability.
 
 CODEC_MAP = {
-    'euc-jp':      'japanese.euc-jp',
-    'iso-2022-jp': 'japanese.iso-2022-jp',
-    'shift_jis':   'japanese.shift_jis',
-    'euc-kr':      'korean.euc-kr',
-    'ks_c_5601-1987': 'korean.cp949',
-    'iso-2022-kr': 'korean.iso-2022-kr',
-    'johab':       'korean.johab',
-    'gb2132':      'eucgb2312_cn',
-    'big5':        'big5_tw',
-    'utf-8':       'utf-8',
+    'gb2132':   'eucgb2312_cn',
+    'big5':     'big5_tw',
+    'utf-8':    'utf-8',
     # Hack: We don't want *any* conversion for stuff marked us-ascii, as all
     # sorts of garbage might be sent to us in the guise of 7-bit us-ascii.
     # Let that stuff pass through without conversion to/from Unicode.
-    'us-ascii':    None,
+    'us-ascii': None,
     }
 
 
@@ -165,6 +158,26 @@ def add_codec(charset, codecname):
     CODEC_MAP[charset] = codecname
 
 
+def _find_asian_codec(charset, language):
+    try:
+        unicode('foo', charset)
+        return charset
+    except LookupError:
+        try:
+            codec = language + '.' + charset
+            unicode('foo', codec)
+            return codec
+        except LookupError:
+            return None
+
+
+for _charset in ('euc-jp', 'iso-2022-jp', 'shift_jis'):
+    add_codec(_charset, _find_asian_codec(_charset, 'japanese') or _charset)
+
+for _charset in ('euc-kr', 'cp949', 'iso-2022-kr', 'johab'):
+    add_codec(_charset, _find_asian_codec(_charset, 'korean') or _charset)
+
+
 
 class Charset:
     """Map character sets to their email properties.
@@ -229,7 +242,7 @@ class Charset:
         self.input_codec = CODEC_MAP.get(self.input_charset,
                                          self.input_charset)
         self.output_codec = CODEC_MAP.get(self.output_charset,
-                                            self.input_codec)
+                                          self.input_codec)
 
     def __str__(self):
         return self.input_charset.lower()
diff --git a/Lib/email/Generator.py b/Lib/email/Generator.py
index 56d44ea52176..bbc19cd2c6dd 100644
--- a/Lib/email/Generator.py
+++ b/Lib/email/Generator.py
@@ -1,8 +1,7 @@
-# Copyright (C) 2001,2002 Python Software Foundation
-# Author: barry@zope.com (Barry Warsaw)
+# Copyright (C) 2001-2006 Python Software Foundation
+# Author: barry@python.org (Barry Warsaw)
 
-"""Classes to generate plain text from a message object tree.
-"""
+"""Classes to generate plain text from a message object tree."""
 
 import re
 import sys
@@ -192,9 +191,6 @@ class Generator:
         payload = msg.get_payload()
         if payload is None:
             return
-        cset = msg.get_charset()
-        if cset is not None:
-            payload = cset.body_encode(payload)
         if not _isstring(payload):
             raise TypeError, 'string payload expected: %s' % type(payload)
         if self._mangle_from_:
diff --git a/Lib/email/Message.py b/Lib/email/Message.py
index 10c2921ea0a2..bb8718fe23b0 100644
--- a/Lib/email/Message.py
+++ b/Lib/email/Message.py
@@ -272,11 +272,14 @@ class Message:
                             charset=charset.get_output_charset())
         else:
             self.set_param('charset', charset.get_output_charset())
+        if str(charset) <> charset.get_output_charset():
+            self._payload = charset.body_encode(self._payload)
         if not self.has_key('Content-Transfer-Encoding'):
             cte = charset.get_body_encoding()
             if callable(cte):
                 cte(self)
             else:
+                self._payload = charset.body_encode(self._payload)
                 self.add_header('Content-Transfer-Encoding', cte)
 
     def get_charset(self):
diff --git a/Lib/email/test/test_email.py b/Lib/email/test/test_email.py
index 1925889f67ed..edb65e32a884 100644
--- a/Lib/email/test/test_email.py
+++ b/Lib/email/test/test_email.py
@@ -2073,7 +2073,8 @@ class TestMiscellaneous(unittest.TestCase):
         charset = Charset(charsets[0])
         eq(charset.get_body_encoding(), 'base64')
         msg.set_payload('hello world', charset=charset)
-        eq(msg.get_payload(), 'hello world')
+        eq(msg.get_payload(), 'aGVsbG8gd29ybGQ=\n')
+        eq(msg.get_payload(decode=True), 'hello world')
         eq(msg['content-transfer-encoding'], 'base64')
         # Try another one
         msg = Message()
diff --git a/Lib/email/test/test_email_codecs.py b/Lib/email/test/test_email_codecs.py
index 99a3227c38c3..afba94a036f7 100644
--- a/Lib/email/test/test_email_codecs.py
+++ b/Lib/email/test/test_email_codecs.py
@@ -1,17 +1,16 @@
-# Copyright (C) 2002 Python Software Foundation
+# Copyright (C) 2002-2006 Python Software Foundation
 # email package unit tests for (optional) Asian codecs
 
 import unittest
 from test.test_support import TestSkipped, run_unittest
 
 from email.test.test_email import TestEmailBase
-from email.Charset import Charset
+from email.Charset import Charset, _find_asian_codec
 from email.Header import Header, decode_header
+from email.Message import Message
 
 # See if we have the Japanese codecs package installed
-try:
-    unicode('foo', 'japanese.iso-2022-jp')
-except LookupError:
+if not _find_asian_codec('iso-2022-jp', 'japanese'):
     raise TestSkipped, 'Optional Japanese codecs not installed'
 
 
@@ -49,6 +48,14 @@ Hello World! =?iso-2022-jp?b?GyRCJU8lbSE8JW8hPCVrJUkhKhsoQg==?=
         # TK: full decode comparison
         eq(h.__unicode__().encode('euc-jp'), long)
 
+    def test_payload_encoding(self):
+        jhello = '\xa5\xcf\xa5\xed\xa1\xbc\xa5\xef\xa1\xbc\xa5\xeb\xa5\xc9\xa1\xaa'
+        jcode  = 'euc-jp'
+        msg = Message()
+        msg.set_payload(jhello, jcode)
+        ustr = unicode(msg.get_payload(), msg.get_content_charset())
+        self.assertEqual(jhello, ustr.encode(jcode))
+
 
 
 def suite():