'iso-2022-jp': (BASE64, None, None),
'koi8-r': (BASE64, BASE64, None),
'utf-8': (SHORTEST, BASE64, 'utf-8'),
+ # We're making this one up to represent raw unencoded 8-bit
+ '8bit': (None, BASE64, 'utf-8'),
}
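+ # Illustrative sketch, not part of the patch proper: each CHARSETS entry
+ # maps a charset name to a (header encoding, body encoding, output
+ # charset) triple, which Charset.__init__ is assumed to pick up.  For
+ # the faux '8bit' entry that would mean roughly:
+ #
+ #     >>> from email.Charset import Charset, BASE64
+ #     >>> c = Charset('8bit')
+ #     >>> c.header_encoding is None    # headers pass through unencoded
+ #     True
+ #     >>> c.body_encoding == BASE64    # bodies get base64-encoded
+ #     True
+ #     >>> c.output_charset
+ #     'utf-8'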
# Aliases for other commonly-used names for character sets.  Map
# charsets to their official names.
'ascii': 'us-ascii',
}
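+ # For example, assuming Charset normalizes names through ALIASES:
+ #
+ #     >>> Charset('ascii').input_charset
+ #     'us-ascii'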
-# Map charsets to their Unicode codec strings. Note that the Japanese
-# examples included below do not (yet) come with Python! They are available
-# from http://pseudo.grad.sccs.chukyo-u.ac.jp/~kajiyama/python/
-
-# The Chinese and Korean codecs are available from SourceForge:
-#
-# http://sourceforge.net/projects/python-codecs/
+# Map charsets to their Unicode codec strings. Note that Python doesn't come
+# with any Asian codecs by default. Here's where to get them:
#
-# although you'll need to check them out of cvs since they haven't been file
-# released yet. You might also try to use
+# Japanese -- http://www.asahi-net.or.jp/~rd6t-kjym/python
+# Korean -- http://sf.net/projects/koco
+# Chinese -- http://sf.net/projects/python-codecs
#
-# http://www.freshports.org/port-description.php3?port=6702
-#
-# if you can get logged in. AFAICT, both the Chinese and Korean codecs are
-# fairly experimental at this point.
+# Note that these codecs have their own lifecycle and may be in varying states
+# of stability and usability.
+
CODEC_MAP = {
'euc-jp': 'japanese.euc-jp',
'iso-2022-jp': 'japanese.iso-2022-jp',
"""
if charset is None:
charset = USASCII
+ if not isinstance(charset, Charset):
+ charset = Charset(charset)
self._charset = charset
self._continuation_ws = continuation_ws
cws_expanded_len = len(continuation_ws.replace('\t', SPACE8))
charset = self._charset
elif not isinstance(charset, Charset):
charset = Charset(charset)
- # Normalize and check the string
- if isinstance(s, StringType):
- # Possibly raise UnicodeError if it can't e encoded
- unicode(s, charset.get_output_charset())
- elif isinstance(s, UnicodeType):
- # Convert Unicode to byte string for later concatenation
- for charset in USASCII, charset, UTF8:
- try:
- s = s.encode(charset.get_output_charset())
- break
- except UnicodeError:
- pass
- else:
- assert False, 'Could not encode to utf-8'
+ # If the charset is our faux 8bit charset, leave the string unchanged
+ if charset != '8bit':
+ # We need to test that the string can be converted to unicode and
+ # back to a byte string, given the input and output codecs of the
+ # charset.
+ if isinstance(s, StringType):
+ # Possibly raise UnicodeError if the byte string can't be
+ # converted to unicode with the input codec of the charset.
+ incodec = charset.input_codec or 'us-ascii'
+ ustr = unicode(s, incodec)
+ # Now make sure that the unicode could be converted back to a
+ # byte string with the output codec, which may be different
+ # than the input codec. Still, use the original byte string.
+ outcodec = charset.output_codec or 'us-ascii'
+ ustr.encode(outcodec)
+ elif isinstance(s, UnicodeType):
+ # Now we have to be sure the unicode string can be converted
+ # to a byte string with a reasonable output codec. We want to
+ # use the byte string in the chunk.
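+ # Note that this loop deliberately rebinds charset, so the chunk
+ # appended below records whichever charset actually succeeded.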
+ for charset in USASCII, charset, UTF8:
+ try:
+ outcodec = charset.output_codec or 'us-ascii'
+ s = s.encode(outcodec)
+ break
+ except UnicodeError:
+ pass
+ else:
+ assert False, 'utf-8 conversion failed'
self._chunks.append((s, charset))
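+ # A rough usage sketch for append() as patched above (the exception
+ # text is illustrative, not exact):
+ #
+ #     >>> h = Header('Hello', charset='us-ascii')
+ #     >>> h.append('\xe9\xe9', '8bit')        # raw bytes pass through
+ #     >>> h.append('\xe9\xe9', 'us-ascii')    # fails the round-trip check
+ #     Traceback (most recent call last):
+ #       ...
+ #     UnicodeError: ...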
def _split(self, s, charset, firstline=False):
- # Split up a header safely for use with encode_chunks. BAW: this
- # appears to be a private convenience method.
+ # Split up a header safely for use with encode_chunks.
splittable = charset.to_splittable(s)
encoded = charset.from_splittable(splittable)
elen = charset.encoded_header_len(encoded)
if elen <= self._maxlinelen:
return [(encoded, charset)]
+ # If we have undetermined raw 8bit characters sitting in a byte
+ # string, we really don't know what the right thing to do is. We
+ # can't really split it because we might break a multibyte character
+ # if we split between its bytes. The least harm seems to be to not
+ # split the header at all, but that means it could go out longer than
+ # maxlinelen.
+ elif charset == '8bit':
+ return [(s, charset)]
# BAW: I'm not sure what the right test here is. What we're trying to
# do is be faithful to RFC 2822's recommendation (section 2.2.3) that:
#
rtn.append(EMPTYSTRING.join(sublines))
return [(chunk, charset) for chunk in rtn]
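+ # Sketch of the 8bit short-circuit above (hypothetical values; assumes
+ # to_splittable() falls back to the raw string when no '8bit' codec
+ # exists):
+ #
+ #     >>> h = Header(maxlinelen=20)
+ #     >>> chunks = h._split('x' * 100, Charset('8bit'), firstline=True)
+ #     >>> len(chunks)    # returned as a single oversized chunk
+ #     1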
- def _encode_chunks(self):
- """MIME-encode a header with many different charsets and/or encodings.
-
- Given a list of pairs (string, charset), return a MIME-encoded string
- suitable for use in a header field. Each pair may have different
- charsets and/or encodings, and the resulting header will accurately
- reflect each setting.
-
- Each encoding can be email.Utils.QP (quoted-printable, for ASCII-like
- character sets like iso-8859-1), email.Utils.BASE64 (Base64, for
- non-ASCII like character sets like KOI8-R and iso-2022-jp), or None
- (no encoding).
-
- Each pair will be represented on a separate line; the resulting string
- will be in the format:
-
- "=?charset1?q?Mar=EDa_Gonz=E1lez_Alonso?=\n
- =?charset2?b?SvxyZ2VuIEL2aW5n?="
- """
+ def _encode_chunks(self, newchunks):
+ # MIME-encode a header with many different charsets and/or encodings.
+ #
+ # Given a list of pairs (string, charset), return a MIME-encoded
+ # string suitable for use in a header field. Each pair may have
+ # different charsets and/or encodings, and the resulting header will
+ # accurately reflect each setting.
+ #
+ # Each encoding can be email.Utils.QP (quoted-printable, for
+ # ASCII-like character sets such as iso-8859-1), email.Utils.BASE64
+ # (Base64, for non-ASCII character sets such as KOI8-R and
+ # iso-2022-jp), or None (no encoding).
+ #
+ # Each pair will be represented on a separate line; the resulting
+ # string will be in the format:
+ #
+ # =?charset1?q?Mar=EDa_Gonz=E1lez_Alonso?=\n
+ #  =?charset2?b?SvxyZ2VuIEL2aW5n?=
+ #
chunks = []
- for header, charset in self._chunks:
+ for header, charset in newchunks:
if charset is None or charset.header_encoding is None:
# There's no encoding for this chunk's charset
_max_append(chunks, header, self._maxlinelen)
newchunks = []
for s, charset in self._chunks:
newchunks += self._split(s, charset, True)
- self._chunks = newchunks
- return self._encode_chunks()
+ return self._encode_chunks(newchunks)
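+ # End-to-end sketch; the output below is what the quoted-printable
+ # header encoder would be expected to produce, shown for illustration:
+ #
+ #     >>> h = Header(u'Mar\xeda', charset='iso-8859-1')
+ #     >>> h.encode()
+ #     '=?iso-8859-1?q?Mar=EDa?='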