From: Barry Warsaw <barry@python.org>
Date: Tue, 25 Jul 2006 13:06:56 +0000 (+0000)
Subject: Back port r50693 and r50754 from the trunk (and 2.4 branch):
X-Git-Tag: v2.3.6c1~4
X-Git-Url: http://git.ipfire.org/gitweb.cgi?a=commitdiff_plain;h=2bfcf5d1f1b0f20e5f2efe5f3dca2ce808f57263;p=thirdparty%2FPython%2Fcpython.git

Back port r50693 and r50754 from the trunk (and 2.4 branch):

decode_rfc2231(): Be more robust against buggy RFC 2231 encodings.
Specifically, instead of raising a ValueError when there is a single
tick in the parameter, simply return that the entire string unquoted, with
None for both the charset and the language.  Also, if there are more than 2
ticks in the parameter, interpret the first three parts as the standard RFC
2231 parts, then the rest of the parts as the encoded string.

More RFC 2231 improvements for the email 4.0 package.  As Mark Sapiro
rightly points out there are really two types of continued headers
defined in this RFC (i.e. "encoded" parameters with the form
"name*0*=" and unencoded parameters with the form "name*0="), but we
were were handling them both the same way and that isn't correct.

This patch should be much more RFC compliant in that only encoded
params are %-decoded and the charset/language information is only
extract if there are any encoded params in the segments.  If there are
no encoded params then the RFC says that there will be no
charset/language parts.

Note however that this will change the return value for
Message.get_param() in some cases.  For example, whereas before if you
had all unencoded param continuations you would have still gotten a
3-tuple back from this method (with charset and language == None), you
will now get just a string. I don't believe this is a backward
incompatible change though because the documentation for this method
already indicates that either return value is possible and that you
must do an isinstance(val, tuple) check to discriminate between the
two.  (Yeah that API kind of sucks but we can't change /that/ without
breaking code.)

Test cases, some documentation updates, and a NEWS item accompany this
patch.

Original fewer-than-3-parts fix by Tokio Kikuchi.

Resolves SF bug # 1218081.

Also, bump the package version number to 2.5.8 for release.
---

diff --git a/Lib/email/Utils.py b/Lib/email/Utils.py
index a409e16e9161..ccd4f561eb5b 100644
--- a/Lib/email/Utils.py
+++ b/Lib/email/Utils.py
@@ -1,14 +1,15 @@
-# Copyright (C) 2001,2002 Python Software Foundation
-# Author: barry@zope.com (Barry Warsaw)
+# Copyright (C) 2001-2006 Python Software Foundation
+# Author: Barry Warsaw
+# Contact: email-sig@python.org
 
-"""Miscellaneous utilities.
-"""
+"""Miscellaneous utilities."""
 
 import time
 import socket
 import re
 import random
 import os
+import urllib
 import warnings
 from cStringIO import StringIO
 from types import ListType
@@ -53,6 +54,7 @@ COMMASPACE = ', '
 EMPTYSTRING = ''
 UEMPTYSTRING = u''
 CRLF = '\r\n'
+TICK = "'"
 
 specialsre = re.compile(r'[][\\()<>@,:;".]')
 escapesre = re.compile(r'[][\\()"]')
@@ -277,12 +279,14 @@ def unquote(str):
 # RFC2231-related functions - parameter encoding and decoding
 def decode_rfc2231(s):
     """Decode string according to RFC 2231"""
-    import urllib
-    parts = s.split("'", 2)
-    if len(parts) == 1:
+    parts = s.split(TICK, 2)
+    if len(parts) <= 2:
         return None, None, urllib.unquote(s)
-    charset, language, s = parts
-    return charset, language, urllib.unquote(s)
+    if len(parts) > 3:
+        charset, language = pars[:2]
+        s = TICK.join(parts[2:])
+        return charset, language, s
+    return parts
 
 
 def encode_rfc2231(s, charset=None, language=None):
@@ -306,35 +310,52 @@ rfc2231_continuation = re.compile(r'^(?P<name>\w+)\*((?P<num>[0-9]+)\*?)?$')
 def decode_params(params):
     """Decode parameters list according to RFC 2231.
 
-    params is a sequence of 2-tuples containing (content type, string value).
+    params is a sequence of 2-tuples containing (param name, string value).
     """
+    # Copy params so we don't mess with the original
+    params = params[:]
     new_params = []
-    # maps parameter's name to a list of continuations
+    # Map parameter's name to a list of continuations.  The values are a
+    # 3-tuple of the continuation number, the string value, and a flag
+    # specifying whether a particular segment is %-encoded.
     rfc2231_params = {}
-    # params is a sequence of 2-tuples containing (content_type, string value)
-    name, value = params[0]
+    name, value = params.pop(0)
     new_params.append((name, value))
-    # Cycle through each of the rest of the parameters.
-    for name, value in params[1:]:
+    while params:
+        name, value = params.pop(0)
+        if name.endswith('*'):
+            encoded = True
+        else:
+            encoded = False
         value = unquote(value)
         mo = rfc2231_continuation.match(name)
         if mo:
             name, num = mo.group('name', 'num')
             if num is not None:
                 num = int(num)
-            rfc2231_param1 = rfc2231_params.setdefault(name, [])
-            rfc2231_param1.append((num, value))
+            rfc2231_params.setdefault(name, []).append((num, value, encoded))
         else:
             new_params.append((name, '"%s"' % quote(value)))
     if rfc2231_params:
         for name, continuations in rfc2231_params.items():
             value = []
+            extended = False
             # Sort by number
             continuations.sort()
-            # And now append all values in num order
-            for num, continuation in continuations:
-                value.append(continuation)
-            charset, language, value = decode_rfc2231(EMPTYSTRING.join(value))
-            new_params.append(
-                (name, (charset, language, '"%s"' % quote(value))))
+            # And now append all values in numerical order, converting
+            # %-encodings for the encoded segments.  If any of the
+            # continuation names ends in a *, then the entire string, after
+            # decoding segments and concatenating, must have the charset and
+            # language specifiers at the beginning of the string.
+            for num, s, encoded in continuations:
+                if encoded:
+                    s = urllib.unquote(s)
+                    extended = True
+                value.append(s)
+            value = quote(EMPTYSTRING.join(value))
+            if extended:
+                charset, language, value = decode_rfc2231(value)
+                new_params.append((name, (charset, language, '"%s"' % value)))
+            else:
+                new_params.append((name, '"%s"' % value))
     return new_params
diff --git a/Lib/email/__init__.py b/Lib/email/__init__.py
index 6bd14870cb42..1e03b0085f76 100644
--- a/Lib/email/__init__.py
+++ b/Lib/email/__init__.py
@@ -3,7 +3,7 @@
 
 """A package for parsing, handling, and generating email messages."""
 
-__version__ = '2.5.7'
+__version__ = '2.5.8'
 
 __all__ = [
     'base64MIME',
diff --git a/Lib/email/test/test_email.py b/Lib/email/test/test_email.py
index 44cd2e278deb..ee0e665f7b57 100644
--- a/Lib/email/test/test_email.py
+++ b/Lib/email/test/test_email.py
@@ -2756,14 +2756,17 @@ Content-Type: text/html; NAME*0=file____C__DOCUMENTS_20AND_20SETTINGS_FABIEN_LOC
 
 '''
         msg = email.message_from_string(m)
-        self.assertEqual(msg.get_param('NAME'),
-                         (None, None, 'file____C__DOCUMENTS_20AND_20SETTINGS_FABIEN_LOCAL_20SETTINGS_TEMP_nsmail.htm'))
+        param = msg.get_param('NAME')
+        self.failIf(isinstance(param, tuple))
+        self.assertEqual(
+            param,
+            'file____C__DOCUMENTS_20AND_20SETTINGS_FABIEN_LOCAL_20SETTINGS_TEMP_nsmail.htm')
 
     def test_rfc2231_no_language_or_charset_in_filename(self):
         m = '''\
 Content-Disposition: inline;
-\tfilename*0="This%20is%20even%20more%20";
-\tfilename*1="%2A%2A%2Afun%2A%2A%2A%20";
+\tfilename*0*="This%20is%20even%20more%20";
+\tfilename*1*="%2A%2A%2Afun%2A%2A%2A%20";
 \tfilename*2="is it not.pdf"
 
 '''
@@ -2774,8 +2777,8 @@ Content-Disposition: inline;
     def test_rfc2231_no_language_or_charset_in_boundary(self):
         m = '''\
 Content-Type: multipart/alternative;
-\tboundary*0="This%20is%20even%20more%20";
-\tboundary*1="%2A%2A%2Afun%2A%2A%2A%20";
+\tboundary*0*="This%20is%20even%20more%20";
+\tboundary*1*="%2A%2A%2Afun%2A%2A%2A%20";
 \tboundary*2="is it not.pdf"
 
 '''
@@ -2783,12 +2786,38 @@ Content-Type: multipart/alternative;
         self.assertEqual(msg.get_boundary(),
                          'This is even more ***fun*** is it not.pdf')
 
+    def test_rfc2231_partly_encoded(self):
+        m = '''\
+Content-Disposition: inline;
+\tfilename*0="''This%20is%20even%20more%20";
+\tfilename*1*="%2A%2A%2Afun%2A%2A%2A%20";
+\tfilename*2="is it not.pdf"
+
+'''
+        msg = email.message_from_string(m)
+        self.assertEqual(
+            msg.get_filename(),
+            'This%20is%20even%20more%20***fun*** is it not.pdf') 
+
+    def test_rfc2231_partly_nonencoded(self):
+        m = '''\
+Content-Disposition: inline;
+\tfilename*0="This%20is%20even%20more%20";
+\tfilename*1="%2A%2A%2Afun%2A%2A%2A%20";
+\tfilename*2="is it not.pdf"
+
+'''
+        msg = email.message_from_string(m)
+        self.assertEqual(
+            msg.get_filename(),
+            'This%20is%20even%20more%20%2A%2A%2Afun%2A%2A%2A%20is it not.pdf') 
+
     def test_rfc2231_no_language_or_charset_in_charset(self):
         # This is a nonsensical charset value, but tests the code anyway
         m = '''\
 Content-Type: text/plain;
-\tcharset*0="This%20is%20even%20more%20";
-\tcharset*1="%2A%2A%2Afun%2A%2A%2A%20";
+\tcharset*0*="This%20is%20even%20more%20";
+\tcharset*1*="%2A%2A%2Afun%2A%2A%2A%20";
 \tcharset*2="is it not.pdf"
 
 '''
@@ -2799,8 +2828,8 @@ Content-Type: text/plain;
     def test_rfc2231_bad_encoding_in_filename(self):
         m = '''\
 Content-Disposition: inline;
-\tfilename*0="bogus'xx'This%20is%20even%20more%20";
-\tfilename*1="%2A%2A%2Afun%2A%2A%2A%20";
+\tfilename*0*="bogus'xx'This%20is%20even%20more%20";
+\tfilename*1*="%2A%2A%2Afun%2A%2A%2A%20";
 \tfilename*2="is it not.pdf"
 
 '''
@@ -2831,9 +2860,9 @@ Content-Type: text/plain; charset*=ascii''utf-8%E2%80%9D
     def test_rfc2231_bad_character_in_filename(self):
         m = '''\
 Content-Disposition: inline;
-\tfilename*0="ascii'xx'This%20is%20even%20more%20";
-\tfilename*1="%2A%2A%2Afun%2A%2A%2A%20";
-\tfilename*2="is it not.pdf%E2"
+\tfilename*0*="ascii'xx'This%20is%20even%20more%20";
+\tfilename*1*="%2A%2A%2Afun%2A%2A%2A%20";
+\tfilename*2*="is it not.pdf%E2"
 
 '''
         msg = email.message_from_string(m)
@@ -2841,6 +2870,102 @@ Content-Disposition: inline;
                          'This is even more ***fun*** is it not.pdf\xe2')
 
 
+    def test_rfc2231_unknown_encoding(self):
+        m = """\
+Content-Transfer-Encoding: 8bit
+Content-Disposition: inline; filename*=X-UNKNOWN''myfile.txt
+
+"""
+        msg = email.message_from_string(m)
+        self.assertEqual(msg.get_filename(), 'myfile.txt')
+
+    def test_rfc2231_single_tick_in_filename_extended(self):
+        eq = self.assertEqual
+        m = """\
+Content-Type: application/x-foo;
+\tname*0*=\"Frank's\"; name*1*=\" Document\"
+
+"""
+        msg = email.message_from_string(m)
+        charset, language, s = msg.get_param('name')
+        eq(charset, None)
+        eq(language, None)
+        eq(s, "Frank's Document")
+
+    def test_rfc2231_single_tick_in_filename(self):
+        m = """\
+Content-Type: application/x-foo; name*0=\"Frank's\"; name*1=\" Document\"
+
+"""
+        msg = email.message_from_string(m)
+        param = msg.get_param('name')
+        self.failIf(isinstance(param, tuple))
+        self.assertEqual(param, "Frank's Document")
+
+    def test_rfc2231_tick_attack_extended(self):
+        eq = self.assertEqual
+        m = """\
+Content-Type: application/x-foo;
+\tname*0*=\"us-ascii'en-us'Frank's\"; name*1*=\" Document\"
+
+"""
+        msg = email.message_from_string(m)
+        charset, language, s = msg.get_param('name')
+        eq(charset, 'us-ascii')
+        eq(language, 'en-us')
+        eq(s, "Frank's Document") 
+
+    def test_rfc2231_tick_attack(self):
+        m = """\
+Content-Type: application/x-foo;
+\tname*0=\"us-ascii'en-us'Frank's\"; name*1=\" Document\"
+
+"""
+        msg = email.message_from_string(m)
+        param = msg.get_param('name')
+        self.failIf(isinstance(param, tuple))
+        self.assertEqual(param, "us-ascii'en-us'Frank's Document")
+
+    def test_rfc2231_no_extended_values(self):
+        eq = self.assertEqual
+        m = """\
+Content-Type: application/x-foo; name=\"Frank's Document\"
+
+"""
+        msg = email.message_from_string(m)
+        eq(msg.get_param('name'), "Frank's Document")
+
+    def test_rfc2231_encoded_then_unencoded_segments(self):
+        eq = self.assertEqual
+        m = """\
+Content-Type: application/x-foo;
+\tname*0*=\"us-ascii'en-us'My\";
+\tname*1=\" Document\";
+\tname*2*=\" For You\"
+
+"""
+        msg = email.message_from_string(m)
+        charset, language, s = msg.get_param('name')
+        eq(charset, 'us-ascii')
+        eq(language, 'en-us')
+        eq(s, 'My Document For You')
+
+    def test_rfc2231_unencoded_then_encoded_segments(self):
+        eq = self.assertEqual
+        m = """\
+Content-Type: application/x-foo;
+\tname*0=\"us-ascii'en-us'My\";
+\tname*1*=\" Document\";
+\tname*2*=\" For You\"
+
+"""
+        msg = email.message_from_string(m)
+        charset, language, s = msg.get_param('name')
+        eq(charset, 'us-ascii')
+        eq(language, 'en-us')
+        eq(s, 'My Document For You')
+
+
 
 def _testclasses():
     mod = sys.modules[__name__]
diff --git a/Misc/NEWS b/Misc/NEWS
index 48c54cdd7f6e..1c3e92ab8537 100644
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -28,6 +28,18 @@ Core and builtins
 Library
 -------
 
+- The email package has improved RFC 2231 support, specifically for
+  recognizing the difference between encoded (name*0*=<blah>) and non-encoded
+  (name*0=<blah>) parameter continuations.  This may change the types of
+  values returned from email.message.Message.get_param() and friends.
+  Specifically in some cases where non-encoded continuations were used,
+  get_param() used to return a 3-tuple of (None, None, string) whereas now it
+  will just return the string (since non-encoded continuations don't have
+  charset and language parts).
+
+  Also, whereas % values were decoded in all parameter continuations, they are
+  now only decoded in encoded parameter parts.
+
 - Applied a security fix to SimpleXMLRPCserver (PSF-2005-001).  This
   disables recursive traversal through instance attributes, which can
   be exploited in various ways.