From bf503905a7910194076c934e0e3ba3e10fa1654d Mon Sep 17 00:00:00 2001 From: Barry Warsaw Date: Thu, 10 Oct 2002 19:10:45 +0000 Subject: [PATCH] Backport email 2.4.2 changes from Python 2.3. --- Doc/lib/emailcharsets.tex | 9 +++++---- Doc/lib/emailmessage.tex | 5 +++-- Lib/email/Charset.py | 4 +++- Lib/email/Message.py | 10 ++++++---- Lib/email/__init__.py | 2 +- Lib/email/test/test_email.py | 34 ++++++++++++++++++++++++++++++++++ 6 files changed, 52 insertions(+), 12 deletions(-) diff --git a/Doc/lib/emailcharsets.tex b/Doc/lib/emailcharsets.tex index d1ae72804c00..d654adace1d8 100644 --- a/Doc/lib/emailcharsets.tex +++ b/Doc/lib/emailcharsets.tex @@ -23,10 +23,11 @@ Certain character sets must be encoded with quoted-printable or base64 when used in email headers or bodies. Certain character sets must be converted outright, and are not allowed in email. -Optional \var{input_charset} is as described below. After being alias -normalized it is also used as a lookup into the registry of character -sets to find out the header encoding, body encoding, and output -conversion codec to be used for the character set. For example, if +Optional \var{input_charset} is as described below; it is always +coerced to lower case. After being alias normalized it is also used +as a lookup into the registry of character sets to find out the header +encoding, body encoding, and output conversion codec to be used for +the character set. For example, if \var{input_charset} is \code{iso-8859-1}, then headers and bodies will be encoded using quoted-printable and no output conversion codec is necessary. If \var{input_charset} is \code{euc-jp}, then headers will diff --git a/Doc/lib/emailmessage.tex b/Doc/lib/emailmessage.tex index bfd86647cbbd..34c152db9ac8 100644 --- a/Doc/lib/emailmessage.tex +++ b/Doc/lib/emailmessage.tex @@ -443,8 +443,9 @@ have been present in the original \mailheader{Content-Type} header. \begin{methoddesc}[Message]{get_content_charset}{\optional{failobj}} Return the \code{charset} parameter of the \mailheader{Content-Type} -header. If there is no \mailheader{Content-Type} header, or if that -header has no \code{charset} parameter, \var{failobj} is returned. +header, coerced to lower case. If there is no +\mailheader{Content-Type} header, or if that header has no +\code{charset} parameter, \var{failobj} is returned. Note that this method differs from \method{get_charset()} which returns the \class{Charset} instance for the default encoding of the diff --git a/Lib/email/Charset.py b/Lib/email/Charset.py index 9a7e51097640..67cc1ecb42d0 100644 --- a/Lib/email/Charset.py +++ b/Lib/email/Charset.py @@ -177,13 +177,15 @@ class Charset: this attribute will have the same value as the input_codec. """ def __init__(self, input_charset=DEFAULT_CHARSET): + # RFC 2046, $4.1.2 says charsets are not case sensitive + input_charset = input_charset.lower() # Set the input charset after filtering through the aliases self.input_charset = ALIASES.get(input_charset, input_charset) # We can try to guess which encoding and conversion to use by the # charset_map dictionary. Try that first, but let the user override # it. henc, benc, conv = CHARSETS.get(self.input_charset, - (SHORTEST, SHORTEST, None)) + (SHORTEST, BASE64, None)) # Set the attributes, allowing the arguments to override the default. self.header_encoding = henc self.body_encoding = benc diff --git a/Lib/email/Message.py b/Lib/email/Message.py index 87ab309885cc..16ae12082eea 100644 --- a/Lib/email/Message.py +++ b/Lib/email/Message.py @@ -760,8 +760,9 @@ class Message: def get_content_charset(self, failobj=None): """Return the charset parameter of the Content-Type header. - If there is no Content-Type header, or if that header has no charset - parameter, failobj is returned. + The returned string is always coerced to lower case. If there is no + Content-Type header, or if that header has no charset parameter, + failobj is returned. """ missing = [] charset = self.get_param('charset', missing) @@ -769,8 +770,9 @@ class Message: return failobj if isinstance(charset, TupleType): # RFC 2231 encoded, so decode it, and it better end up as ascii. - return unicode(charset[2], charset[0]).encode('us-ascii') - return charset + charset = unicode(charset[2], charset[0]).encode('us-ascii') + # RFC 2046, $4.1.2 says charsets are not case sensitive + return charset.lower() def get_charsets(self, failobj=None): """Return a list containing the charset(s) used in this message. diff --git a/Lib/email/__init__.py b/Lib/email/__init__.py index 2dcf684150b2..2945b0510f02 100644 --- a/Lib/email/__init__.py +++ b/Lib/email/__init__.py @@ -4,7 +4,7 @@ """A package for parsing, handling, and generating email messages. """ -__version__ = '2.4.1' +__version__ = '2.4.2' __all__ = [ 'base64MIME', diff --git a/Lib/email/test/test_email.py b/Lib/email/test/test_email.py index 5bbb79416b59..daf9e287a71a 100644 --- a/Lib/email/test/test_email.py +++ b/Lib/email/test/test_email.py @@ -1689,6 +1689,40 @@ class TestMiscellaneous(unittest.TestCase): filename='foo\\wacky"name') eq(msg.get_filename(), 'foo\\wacky"name') + def test_get_body_encoding_with_bogus_charset(self): + charset = Charset('not a charset') + self.assertEqual(charset.get_body_encoding(), 'base64') + + def test_get_body_encoding_with_uppercase_charset(self): + eq = self.assertEqual + msg = Message() + msg['Content-Type'] = 'text/plain; charset=UTF-8' + eq(msg['content-type'], 'text/plain; charset=UTF-8') + charsets = msg.get_charsets() + eq(len(charsets), 1) + eq(charsets[0], 'utf-8') + charset = Charset(charsets[0]) + eq(charset.get_body_encoding(), 'base64') + msg.set_payload('hello world', charset=charset) + eq(msg.get_payload(), 'hello world') + eq(msg['content-transfer-encoding'], 'base64') + # Try another one + msg = Message() + msg['Content-Type'] = 'text/plain; charset="US-ASCII"' + charsets = msg.get_charsets() + eq(len(charsets), 1) + eq(charsets[0], 'us-ascii') + charset = Charset(charsets[0]) + eq(charset.get_body_encoding(), Encoders.encode_7or8bit) + msg.set_payload('hello world', charset=charset) + eq(msg.get_payload(), 'hello world') + eq(msg['content-transfer-encoding'], '7bit') + + def test_charsets_case_insensitive(self): + lc = Charset('us-ascii') + uc = Charset('US-ASCII') + self.assertEqual(lc.get_body_encoding(), uc.get_body_encoding()) + # Test the iterator/generators -- 2.47.3