From: Barry Warsaw
Date: Mon, 14 Oct 2002 17:26:03 +0000 (+0000)
Subject: Backport bugfix microrelease of email 2.4.3 from cvs trunk.
X-Git-Tag: v2.2.2~7
X-Git-Url: http://git.ipfire.org/gitweb.cgi?a=commitdiff_plain;h=31eac4e4b39a9ced6f0192e06a2b70fa2581d307;p=thirdparty%2FPython%2Fcpython.git

Backport bugfix microrelease of email 2.4.3 from cvs trunk.
---

diff --git a/Lib/email/Charset.py b/Lib/email/Charset.py
index 67cc1ecb42d0..b852245b00af 100644
--- a/Lib/email/Charset.py
+++ b/Lib/email/Charset.py
@@ -43,6 +43,8 @@ CHARSETS = {
     'iso-2022-jp': (BASE64, None, None),
     'koi8-r': (BASE64, BASE64, None),
     'utf-8': (SHORTEST, BASE64, 'utf-8'),
+    # We're making this one up to represent raw unencoded 8-bit
+    '8bit': (None, BASE64, 'utf-8'),
     }
 
 # Aliases for other commonly-used names for character sets.  Map
@@ -53,21 +55,16 @@ ALIASES = {
     'ascii': 'us-ascii',
     }
 
-# Map charsets to their Unicode codec strings.  Note that the Japanese
-# examples included below do not (yet) come with Python!  They are available
-# from http://pseudo.grad.sccs.chukyo-u.ac.jp/~kajiyama/python/
-
-# The Chinese and Korean codecs are available from SourceForge:
-#
-#     http://sourceforge.net/projects/python-codecs/
+# Map charsets to their Unicode codec strings.  Note that Python doesn't come
+# with any Asian codecs by default.  Here's where to get them:
 #
-# although you'll need to check them out of cvs since they haven't been file
-# released yet.  You might also try to use
+#     Japanese -- http://www.asahi-net.or.jp/~rd6t-kjym/python
+#     Korean   -- http://sf.net/projects/koco
+#     Chinese  -- http://sf.net/projects/python-codecs
 #
-#     http://www.freshports.org/port-description.php3?port=6702
-#
-# if you can get logged in.  AFAICT, both the Chinese and Korean codecs are
-# fairly experimental at this point.
+# Note that these codecs have their own lifecycle and may be in varying states
+# of stability and useability.
+
 CODEC_MAP = {
     'euc-jp': 'japanese.euc-jp',
     'iso-2022-jp': 'japanese.iso-2022-jp',
diff --git a/Lib/email/Generator.py b/Lib/email/Generator.py
index 7f05218d4859..58e2f91d622a 100644
--- a/Lib/email/Generator.py
+++ b/Lib/email/Generator.py
@@ -8,7 +8,7 @@ import time
 import re
 import random
 
-from types import ListType
+from types import ListType, StringType
 from cStringIO import StringIO
 
 from email.Header import Header
@@ -35,6 +35,14 @@ SPACE8 = ' ' * 8
 
 fcre = re.compile(r'^From ', re.MULTILINE)
 
+def _is8bitstring(s):
+    if isinstance(s, StringType):
+        try:
+            unicode(s, 'us-ascii')
+        except UnicodeError:
+            return True
+    return False
+
 
 
 class Generator:
@@ -174,6 +182,14 @@
             # No line was actually longer than maxheaderlen characters, so
             # just return the original unchanged.
             return text
+        # If we have raw 8bit data in a byte string, we have no idea what the
+        # encoding is.  I think there is no safe way to split this string.  If
+        # it's ascii-subset, then we could do a normal ascii split, but if
+        # it's multibyte then we could break the string.  There's no way to
+        # know so the least harm seems to be to not split the string and risk
+        # it being too long.
+        if _is8bitstring(text):
+            return text
         # The `text' argument already has the field name prepended, so don't
         # provide it here or the first line will get folded too short.
         h = Header(text, maxlinelen=maxheaderlen,
diff --git a/Lib/email/Header.py b/Lib/email/Header.py
index a40226dc1462..0ceacc7bf309 100644
--- a/Lib/email/Header.py
+++ b/Lib/email/Header.py
@@ -153,6 +153,8 @@ class Header:
         """
         if charset is None:
             charset = USASCII
+        if not isinstance(charset, Charset):
+            charset = Charset(charset)
         self._charset = charset
         self._continuation_ws = continuation_ws
         cws_expanded_len = len(continuation_ws.replace('\t', SPACE8))
@@ -216,31 +218,52 @@
             charset = self._charset
         elif not isinstance(charset, Charset):
             charset = Charset(charset)
-        # Normalize and check the string
-        if isinstance(s, StringType):
-            # Possibly raise UnicodeError if it can't e encoded
-            unicode(s, charset.get_output_charset())
-        elif isinstance(s, UnicodeType):
-            # Convert Unicode to byte string for later concatenation
-            for charset in USASCII, charset, UTF8:
-                try:
-                    s = s.encode(charset.get_output_charset())
-                    break
-                except UnicodeError:
-                    pass
-            else:
-                assert False, 'Could not encode to utf-8'
+        # If the charset is our faux 8bit charset, leave the string unchanged
+        if charset <> '8bit':
+            # We need to test that the string can be converted to unicode and
+            # back to a byte string, given the input and output codecs of the
+            # charset.
+            if isinstance(s, StringType):
+                # Possibly raise UnicodeError if the byte string can't be
+                # converted to a unicode with the input codec of the charset.
+                incodec = charset.input_codec or 'us-ascii'
+                ustr = unicode(s, incodec)
+                # Now make sure that the unicode could be converted back to a
+                # byte string with the output codec, which may be different
+                # than the input codec.  Still, use the original byte string.
+                outcodec = charset.output_codec or 'us-ascii'
+                ustr.encode(outcodec)
+            elif isinstance(s, UnicodeType):
+                # Now we have to be sure the unicode string can be converted
+                # to a byte string with a reasonable output codec.  We want to
+                # use the byte string in the chunk.
+                for charset in USASCII, charset, UTF8:
+                    try:
+                        outcodec = charset.output_codec or 'us-ascii'
+                        s = s.encode(outcodec)
+                        break
+                    except UnicodeError:
+                        pass
+                else:
+                    assert False, 'utf-8 conversion failed'
         self._chunks.append((s, charset))
 
     def _split(self, s, charset, firstline=False):
-        # Split up a header safely for use with encode_chunks.  BAW: this
-        # appears to be a private convenience method.
+        # Split up a header safely for use with encode_chunks.
         splittable = charset.to_splittable(s)
         encoded = charset.from_splittable(splittable)
         elen = charset.encoded_header_len(encoded)
         if elen <= self._maxlinelen:
             return [(encoded, charset)]
+        # If we have undetermined raw 8bit characters sitting in a byte
+        # string, we really don't know what the right thing to do is.  We
+        # can't really split it because it might be multibyte data which we
+        # could break if we split it between pairs.  The least harm seems to
+        # be to not split the header at all, but that means they could go out
+        # longer than maxlinelen.
+        elif charset == '8bit':
+            return [(s, charset)]
         # BAW: I'm not sure what the right test here is.  What we're trying to
         # do is be faithful to RFC 2822's recommendation that ($2.2.3):
         #
@@ -346,27 +369,27 @@
             rtn.append(EMPTYSTRING.join(sublines))
         return [(chunk, charset) for chunk in rtn]
 
-    def _encode_chunks(self):
-        """MIME-encode a header with many different charsets and/or encodings.
-
-        Given a list of pairs (string, charset), return a MIME-encoded string
-        suitable for use in a header field.  Each pair may have different
-        charsets and/or encodings, and the resulting header will accurately
-        reflect each setting.
-
-        Each encoding can be email.Utils.QP (quoted-printable, for ASCII-like
-        character sets like iso-8859-1), email.Utils.BASE64 (Base64, for
-        non-ASCII like character sets like KOI8-R and iso-2022-jp), or None
-        (no encoding).
-
-        Each pair will be represented on a separate line; the resulting string
-        will be in the format:
-
-        "=?charset1?q?Mar=EDa_Gonz=E1lez_Alonso?=\n
-          =?charset2?b?SvxyZ2VuIEL2aW5n?="
-        """
+    def _encode_chunks(self, newchunks):
+        # MIME-encode a header with many different charsets and/or encodings.
+        #
+        # Given a list of pairs (string, charset), return a MIME-encoded
+        # string suitable for use in a header field.  Each pair may have
+        # different charsets and/or encodings, and the resulting header will
+        # accurately reflect each setting.
+        #
+        # Each encoding can be email.Utils.QP (quoted-printable, for
+        # ASCII-like character sets like iso-8859-1), email.Utils.BASE64
+        # (Base64, for non-ASCII like character sets like KOI8-R and
+        # iso-2022-jp), or None (no encoding).
+        #
+        # Each pair will be represented on a separate line; the resulting
+        # string will be in the format:
+        #
+        # =?charset1?q?Mar=EDa_Gonz=E1lez_Alonso?=\n
+        #  =?charset2?b?SvxyZ2VuIEL2aW5n?="
+        #
         chunks = []
-        for header, charset in self._chunks:
+        for header, charset in newchunks:
             if charset is None or charset.header_encoding is None:
                 # There's no encoding for this chunk's charsets
                 _max_append(chunks, header, self._maxlinelen)
@@ -397,5 +420,4 @@
         newchunks = []
         for s, charset in self._chunks:
             newchunks += self._split(s, charset, True)
-        self._chunks = newchunks
-        return self._encode_chunks()
+        return self._encode_chunks(newchunks)
diff --git a/Lib/email/__init__.py b/Lib/email/__init__.py
index 2945b0510f02..b784da8247fa 100644
--- a/Lib/email/__init__.py
+++ b/Lib/email/__init__.py
@@ -4,7 +4,7 @@
 """A package for parsing, handling, and generating email messages.
 """
 
-__version__ = '2.4.2'
+__version__ = '2.4.3'
 
 __all__ = [
     'base64MIME',
diff --git a/Lib/email/test/test_email.py b/Lib/email/test/test_email.py
index daf9e287a71a..176e7740c225 100644
--- a/Lib/email/test/test_email.py
+++ b/Lib/email/test/test_email.py
@@ -703,6 +703,27 @@
 from modemcable093.139-201-24.que.mc.videotron.ca ([24.201.139.93]
 \tid 17k4h5-00034i-00
 \tfor test@mems-exchange.org; Wed, 28 Aug 2002 11:25:20 -0400""")
 
+    def test_long_8bit_header(self):
+        eq = self.ndiffAssertEqual
+        msg = Message()
+        h = Header('Britische Regierung gibt', 'iso-8859-1')
+        h.append('gr\xfcnes Licht f\xfcr Offshore-Windkraftprojekte')
+        msg['Subject'] = h
+        eq(msg.as_string(), """\
+Subject: =?iso-8859-1?q?Britische_Regierung_gibt?=
+ =?iso-8859-1?q?gr=FCnes_Licht_f=FCr_Offshore-Windkraftprojekte?=
+
+""")
+
+    def test_long_8bit_header_no_charset(self):
+        eq = self.ndiffAssertEqual
+        msg = Message()
+        msg['Reply-To'] = 'Britische Regierung gibt gr\xfcnes Licht f\xfcr Offshore-Windkraftprojekte '
+        eq(msg.as_string(), """\
+Reply-To: Britische Regierung gibt gr\xfcnes Licht f\xfcr Offshore-Windkraftprojekte
+
+""")
+
     # Test mangling of "From " lines in the body of a message
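
For illustration, here is a minimal usage sketch of the behaviour this backport adds, mirroring the two new test cases above.  It is not part of the patch; it assumes the email 2.x API shown here (email.Message.Message, email.Header.Header) running under a Python 2.2-era interpreter, and reuses the same sample header strings as the tests.

    # Minimal sketch (assumes the email 2.4.3 API from this patch, Python 2.x).
    from email.Message import Message
    from email.Header import Header

    msg = Message()

    # A charset may now be given as a plain string; Header.__init__ coerces
    # it to a Charset instance before storing it.
    h = Header('Britische Regierung gibt', 'iso-8859-1')
    h.append('gr\xfcnes Licht f\xfcr Offshore-Windkraftprojekte')
    msg['Subject'] = h

    # Raw 8-bit bytes with no declared charset are no longer folded: the
    # Generator's _is8bitstring() check leaves the header on one (possibly
    # overlong) line rather than risk splitting multibyte data.
    msg['Reply-To'] = ('Britische Regierung gibt gr\xfcnes Licht '
                       'f\xfcr Offshore-Windkraftprojekte')

    print msg.as_string()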