From b7e173e07510cea4abcd6e8d67552e22737ea9d8 Mon Sep 17 00:00:00 2001 From: Barry Warsaw Date: Sun, 23 Jul 2006 17:02:55 +0000 Subject: [PATCH] Port r50754 to Python 2.4/email 3.0. Bump the email version number to 3.0.2. --- Lib/email/Utils.py | 61 ++++++++++----- Lib/email/test/test_email.py | 145 ++++++++++++++++++++++++++++++++--- Misc/NEWS | 12 +++ 3 files changed, 189 insertions(+), 29 deletions(-) diff --git a/Lib/email/Utils.py b/Lib/email/Utils.py index 9ba760116153..8c1e69e3336b 100644 --- a/Lib/email/Utils.py +++ b/Lib/email/Utils.py @@ -1,4 +1,4 @@ -# Copyright (C) 2001-2004 Python Software Foundation +# Copyright (C) 2001-2006 Python Software Foundation # Author: Barry Warsaw # Contact: email-sig@python.org @@ -10,6 +10,7 @@ import time import base64 import random import socket +import urllib import warnings from cStringIO import StringIO @@ -30,6 +31,7 @@ COMMASPACE = ', ' EMPTYSTRING = '' UEMPTYSTRING = u'' CRLF = '\r\n' +TICK = "'" specialsre = re.compile(r'[][\\()<>@,:;".]') escapesre = re.compile(r'[][\\()"]') @@ -215,12 +217,14 @@ def unquote(str): # RFC2231-related functions - parameter encoding and decoding def decode_rfc2231(s): """Decode string according to RFC 2231""" - import urllib - parts = s.split("'", 2) - if len(parts) == 1: + parts = s.split(TICK, 2) + if len(parts) <= 2: return None, None, urllib.unquote(s) - charset, language, s = parts - return charset, language, urllib.unquote(s) + if len(parts) > 3: + charset, language = pars[:2] + s = TICK.join(parts[2:]) + return charset, language, s + return parts def encode_rfc2231(s, charset=None, language=None): @@ -244,37 +248,54 @@ rfc2231_continuation = re.compile(r'^(?P\w+)\*((?P[0-9]+)\*?)?$') def decode_params(params): """Decode parameters list according to RFC 2231. - params is a sequence of 2-tuples containing (content type, string value). + params is a sequence of 2-tuples containing (param name, string value). """ + # Copy params so we don't mess with the original + params = params[:] new_params = [] - # maps parameter's name to a list of continuations + # Map parameter's name to a list of continuations. The values are a + # 3-tuple of the continuation number, the string value, and a flag + # specifying whether a particular segment is %-encoded. rfc2231_params = {} - # params is a sequence of 2-tuples containing (content_type, string value) - name, value = params[0] + name, value = params.pop(0) new_params.append((name, value)) - # Cycle through each of the rest of the parameters. - for name, value in params[1:]: + while params: + name, value = params.pop(0) + if name.endswith('*'): + encoded = True + else: + encoded = False value = unquote(value) mo = rfc2231_continuation.match(name) if mo: name, num = mo.group('name', 'num') if num is not None: num = int(num) - rfc2231_param1 = rfc2231_params.setdefault(name, []) - rfc2231_param1.append((num, value)) + rfc2231_params.setdefault(name, []).append((num, value, encoded)) else: new_params.append((name, '"%s"' % quote(value))) if rfc2231_params: for name, continuations in rfc2231_params.items(): value = [] + extended = False # Sort by number continuations.sort() - # And now append all values in num order - for num, continuation in continuations: - value.append(continuation) - charset, language, value = decode_rfc2231(EMPTYSTRING.join(value)) - new_params.append( - (name, (charset, language, '"%s"' % quote(value)))) + # And now append all values in numerical order, converting + # %-encodings for the encoded segments. If any of the + # continuation names ends in a *, then the entire string, after + # decoding segments and concatenating, must have the charset and + # language specifiers at the beginning of the string. + for num, s, encoded in continuations: + if encoded: + s = urllib.unquote(s) + extended = True + value.append(s) + value = quote(EMPTYSTRING.join(value)) + if extended: + charset, language, value = decode_rfc2231(value) + new_params.append((name, (charset, language, '"%s"' % value))) + else: + new_params.append((name, '"%s"' % value)) return new_params def collapse_rfc2231_value(value, errors='replace', diff --git a/Lib/email/test/test_email.py b/Lib/email/test/test_email.py index 1e4a510ad1d6..24c6efa5fdfb 100644 --- a/Lib/email/test/test_email.py +++ b/Lib/email/test/test_email.py @@ -2997,14 +2997,29 @@ Content-Type: text/html; NAME*0=file____C__DOCUMENTS_20AND_20SETTINGS_FABIEN_LOC ''' msg = email.message_from_string(m) - self.assertEqual(msg.get_param('NAME'), - (None, None, 'file____C__DOCUMENTS_20AND_20SETTINGS_FABIEN_LOCAL_20SETTINGS_TEMP_nsmail.htm')) + param = msg.get_param('NAME') + self.failIf(isinstance(param, tuple)) + self.assertEqual( + param, + 'file____C__DOCUMENTS_20AND_20SETTINGS_FABIEN_LOCAL_20SETTINGS_TEMP_nsmail.htm') def test_rfc2231_no_language_or_charset_in_filename(self): m = '''\ Content-Disposition: inline; -\tfilename*0="This%20is%20even%20more%20"; -\tfilename*1="%2A%2A%2Afun%2A%2A%2A%20"; +\tfilename*0*="''This%20is%20even%20more%20"; +\tfilename*1*="%2A%2A%2Afun%2A%2A%2A%20"; +\tfilename*2="is it not.pdf" + +''' + msg = email.message_from_string(m) + self.assertEqual(msg.get_filename(), + 'This is even more ***fun*** is it not.pdf') + + def test_rfc2231_no_language_or_charset_in_filename_encoded(self): + m = '''\ +Content-Disposition: inline; +\tfilename*0*="''This%20is%20even%20more%20"; +\tfilename*1*="%2A%2A%2Afun%2A%2A%2A%20"; \tfilename*2="is it not.pdf" ''' @@ -3012,11 +3027,37 @@ Content-Disposition: inline; self.assertEqual(msg.get_filename(), 'This is even more ***fun*** is it not.pdf') + def test_rfc2231_partly_encoded(self): + m = '''\ +Content-Disposition: inline; +\tfilename*0="''This%20is%20even%20more%20"; +\tfilename*1*="%2A%2A%2Afun%2A%2A%2A%20"; +\tfilename*2="is it not.pdf" + +''' + msg = email.message_from_string(m) + self.assertEqual( + msg.get_filename(), + 'This%20is%20even%20more%20***fun*** is it not.pdf') + + def test_rfc2231_partly_nonencoded(self): + m = '''\ +Content-Disposition: inline; +\tfilename*0="This%20is%20even%20more%20"; +\tfilename*1="%2A%2A%2Afun%2A%2A%2A%20"; +\tfilename*2="is it not.pdf" + +''' + msg = email.message_from_string(m) + self.assertEqual( + msg.get_filename(), + 'This%20is%20even%20more%20%2A%2A%2Afun%2A%2A%2A%20is it not.pdf') + def test_rfc2231_no_language_or_charset_in_boundary(self): m = '''\ Content-Type: multipart/alternative; -\tboundary*0="This%20is%20even%20more%20"; -\tboundary*1="%2A%2A%2Afun%2A%2A%2A%20"; +\tboundary*0*="''This%20is%20even%20more%20"; +\tboundary*1*="%2A%2A%2Afun%2A%2A%2A%20"; \tboundary*2="is it not.pdf" ''' @@ -3028,8 +3069,8 @@ Content-Type: multipart/alternative; # This is a nonsensical charset value, but tests the code anyway m = '''\ Content-Type: text/plain; -\tcharset*0="This%20is%20even%20more%20"; -\tcharset*1="%2A%2A%2Afun%2A%2A%2A%20"; +\tcharset*0*="This%20is%20even%20more%20"; +\tcharset*1*="%2A%2A%2Afun%2A%2A%2A%20"; \tcharset*2="is it not.pdf" ''' @@ -3040,12 +3081,98 @@ Content-Type: text/plain; def test_rfc2231_unknown_encoding(self): m = """\ Content-Transfer-Encoding: 8bit -Content-Disposition: inline; filename*0=X-UNKNOWN''myfile.txt +Content-Disposition: inline; filename*=X-UNKNOWN''myfile.txt """ msg = email.message_from_string(m) self.assertEqual(msg.get_filename(), 'myfile.txt') + def test_rfc2231_single_tick_in_filename_extended(self): + eq = self.assertEqual + m = """\ +Content-Type: application/x-foo; +\tname*0*=\"Frank's\"; name*1*=\" Document\" + +""" + msg = email.message_from_string(m) + charset, language, s = msg.get_param('name') + eq(charset, None) + eq(language, None) + eq(s, "Frank's Document") + + def test_rfc2231_single_tick_in_filename(self): + m = """\ +Content-Type: application/x-foo; name*0=\"Frank's\"; name*1=\" Document\" + +""" + msg = email.message_from_string(m) + param = msg.get_param('name') + self.failIf(isinstance(param, tuple)) + self.assertEqual(param, "Frank's Document") + + def test_rfc2231_tick_attack_extended(self): + eq = self.assertEqual + m = """\ +Content-Type: application/x-foo; +\tname*0*=\"us-ascii'en-us'Frank's\"; name*1*=\" Document\" + +""" + msg = email.message_from_string(m) + charset, language, s = msg.get_param('name') + eq(charset, 'us-ascii') + eq(language, 'en-us') + eq(s, "Frank's Document") + + def test_rfc2231_tick_attack(self): + m = """\ +Content-Type: application/x-foo; +\tname*0=\"us-ascii'en-us'Frank's\"; name*1=\" Document\" + +""" + msg = email.message_from_string(m) + param = msg.get_param('name') + self.failIf(isinstance(param, tuple)) + self.assertEqual(param, "us-ascii'en-us'Frank's Document") + + def test_rfc2231_no_extended_values(self): + eq = self.assertEqual + m = """\ +Content-Type: application/x-foo; name=\"Frank's Document\" + +""" + msg = email.message_from_string(m) + eq(msg.get_param('name'), "Frank's Document") + + def test_rfc2231_encoded_then_unencoded_segments(self): + eq = self.assertEqual + m = """\ +Content-Type: application/x-foo; +\tname*0*=\"us-ascii'en-us'My\"; +\tname*1=\" Document\"; +\tname*2*=\" For You\" + +""" + msg = email.message_from_string(m) + charset, language, s = msg.get_param('name') + eq(charset, 'us-ascii') + eq(language, 'en-us') + eq(s, 'My Document For You') + + def test_rfc2231_unencoded_then_encoded_segments(self): + eq = self.assertEqual + m = """\ +Content-Type: application/x-foo; +\tname*0=\"us-ascii'en-us'My\"; +\tname*1*=\" Document\"; +\tname*2*=\" For You\" + +""" + msg = email.message_from_string(m) + charset, language, s = msg.get_param('name') + eq(charset, 'us-ascii') + eq(language, 'en-us') + eq(s, 'My Document For You') + def _testclasses(): diff --git a/Misc/NEWS b/Misc/NEWS index 87d5ac24da4f..715a7a92c12c 100644 --- a/Misc/NEWS +++ b/Misc/NEWS @@ -59,6 +59,18 @@ Extension Modules Library ------- +- The email package has improved RFC 2231 support, specifically for + recognizing the difference between encoded (name*0*=) and non-encoded + (name*0=) parameter continuations. This may change the types of + values returned from email.message.Message.get_param() and friends. + Specifically in some cases where non-encoded continuations were used, + get_param() used to return a 3-tuple of (None, None, string) whereas now it + will just return the string (since non-encoded continuations don't have + charset and language parts). + + Also, whereas % values were decoded in all parameter continuations, they are + now only decoded in encoded parameter parts. + - Bug #822974: Honor timeout in telnetlib.{expect,read_until} even if some data are received. -- 2.47.3