From b7e173e07510cea4abcd6e8d67552e22737ea9d8 Mon Sep 17 00:00:00 2001
From: Barry Warsaw <barry@python.org>
Date: Sun, 23 Jul 2006 17:02:55 +0000
Subject: [PATCH] Port r50754 to Python 2.4/email 3.0.  Bump the email version
 number to 3.0.2.

---
 Lib/email/Utils.py           |  61 ++++++++++-----
 Lib/email/test/test_email.py | 145 ++++++++++++++++++++++++++++++++---
 Misc/NEWS                    |  12 +++
 3 files changed, 189 insertions(+), 29 deletions(-)
diff --git a/Lib/email/Utils.py b/Lib/email/Utils.py
index 9ba760116153..8c1e69e3336b 100644
--- a/Lib/email/Utils.py
+++ b/Lib/email/Utils.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2001-2004 Python Software Foundation
+# Copyright (C) 2001-2006 Python Software Foundation
 # Author: Barry Warsaw
 # Contact: email-sig@python.org
 
@@ -10,6 +10,7 @@ import time
 import base64
 import random
 import socket
+import urllib
 import warnings
 from cStringIO import StringIO
 
@@ -30,6 +31,7 @@ COMMASPACE = ', '
 EMPTYSTRING = ''
 UEMPTYSTRING = u''
 CRLF = '\r\n'
+TICK = "'"
 
 specialsre = re.compile(r'[][\\()<>@,:;".]')
 escapesre = re.compile(r'[][\\()"]')
@@ -215,12 +217,14 @@ def unquote(str):
 # RFC2231-related functions - parameter encoding and decoding
 def decode_rfc2231(s):
     """Decode string according to RFC 2231"""
-    import urllib
-    parts = s.split("'", 2)
-    if len(parts) == 1:
+    parts = s.split(TICK, 2)
+    if len(parts) <= 2:
         return None, None, urllib.unquote(s)
-    charset, language, s = parts
-    return charset, language, urllib.unquote(s)
+    if len(parts) > 3:
+        charset, language = parts[:2]
+        s = TICK.join(parts[2:])
+        return charset, language, s
+    return parts
 
 
 def encode_rfc2231(s, charset=None, language=None):
@@ -244,37 +248,54 @@ rfc2231_continuation = re.compile(r'^(?P<name>\w+)\*((?P<num>[0-9]+)\*?)?$')
 def decode_params(params):
     """Decode parameters list according to RFC 2231.
 
-    params is a sequence of 2-tuples containing (content type, string value).
+    params is a sequence of 2-tuples containing (param name, string value).
     """
+    # Copy params so we don't mess with the original
+    params = params[:]
     new_params = []
-    # maps parameter's name to a list of continuations
+    # Map each parameter's name to a list of its continuations.  Each list
+    # item is a 3-tuple of the continuation number, the string value, and a
+    # flag specifying whether that particular segment is %-encoded.
     rfc2231_params = {}
-    # params is a sequence of 2-tuples containing (content_type, string value)
-    name, value = params[0]
+    name, value = params.pop(0)
     new_params.append((name, value))
-    # Cycle through each of the rest of the parameters.
-    for name, value in params[1:]:
+    while params:
+        name, value = params.pop(0)
+        if name.endswith('*'):
+            encoded = True
+        else:
+            encoded = False
         value = unquote(value)
         mo = rfc2231_continuation.match(name)
         if mo:
             name, num = mo.group('name', 'num')
             if num is not None:
                 num = int(num)
-            rfc2231_param1 = rfc2231_params.setdefault(name, [])
-            rfc2231_param1.append((num, value))
+            rfc2231_params.setdefault(name, []).append((num, value, encoded))
         else:
             new_params.append((name, '"%s"' % quote(value)))
     if rfc2231_params:
         for name, continuations in rfc2231_params.items():
             value = []
+            extended = False
             # Sort by number
             continuations.sort()
-            # And now append all values in num order
-            for num, continuation in continuations:
-                value.append(continuation)
-            charset, language, value = decode_rfc2231(EMPTYSTRING.join(value))
-            new_params.append(
-                (name, (charset, language, '"%s"' % quote(value))))
+            # And now append all values in numerical order, converting
+            # %-encodings for the encoded segments.  If any of the
+            # continuation names ends in a *, then the entire string, after
+            # decoding segments and concatenating, must have the charset and
+            # language specifiers at the beginning of the string.
+            for num, s, encoded in continuations:
+                if encoded:
+                    s = urllib.unquote(s)
+                    extended = True
+                value.append(s)
+            value = quote(EMPTYSTRING.join(value))
+            if extended:
+                charset, language, value = decode_rfc2231(value)
+                new_params.append((name, (charset, language, '"%s"' % value)))
+            else:
+                new_params.append((name, '"%s"' % value))
     return new_params
 
 def collapse_rfc2231_value(value, errors='replace',
diff --git a/Lib/email/test/test_email.py b/Lib/email/test/test_email.py
index 1e4a510ad1d6..24c6efa5fdfb 100644
--- a/Lib/email/test/test_email.py
+++ b/Lib/email/test/test_email.py
@@ -2997,14 +2997,29 @@ Content-Type: text/html; NAME*0=file____C__DOCUMENTS_20AND_20SETTINGS_FABIEN_LOC
 
 '''
         msg = email.message_from_string(m)
-        self.assertEqual(msg.get_param('NAME'),
-                         (None, None, 'file____C__DOCUMENTS_20AND_20SETTINGS_FABIEN_LOCAL_20SETTINGS_TEMP_nsmail.htm'))
+        param = msg.get_param('NAME')
+        self.failIf(isinstance(param, tuple))
+        self.assertEqual(
+            param,
+            'file____C__DOCUMENTS_20AND_20SETTINGS_FABIEN_LOCAL_20SETTINGS_TEMP_nsmail.htm')
 
     def test_rfc2231_no_language_or_charset_in_filename(self):
         m = '''\
 Content-Disposition: inline;
-\tfilename*0="This%20is%20even%20more%20";
-\tfilename*1="%2A%2A%2Afun%2A%2A%2A%20";
+\tfilename*0*="''This%20is%20even%20more%20";
+\tfilename*1*="%2A%2A%2Afun%2A%2A%2A%20";
+\tfilename*2="is it not.pdf"
+
+'''
+        msg = email.message_from_string(m)
+        self.assertEqual(msg.get_filename(),
+                         'This is even more ***fun*** is it not.pdf')
+
+    def test_rfc2231_no_language_or_charset_in_filename_encoded(self):
+        m = '''\
+Content-Disposition: inline;
+\tfilename*0*="''This%20is%20even%20more%20";
+\tfilename*1*="%2A%2A%2Afun%2A%2A%2A%20";
 \tfilename*2="is it not.pdf"
 
 '''
@@ -3012,11 +3027,37 @@ Content-Disposition: inline;
         self.assertEqual(msg.get_filename(),
                          'This is even more ***fun*** is it not.pdf')
 
+    def test_rfc2231_partly_encoded(self):
+        m = '''\
+Content-Disposition: inline;
+\tfilename*0="''This%20is%20even%20more%20";
+\tfilename*1*="%2A%2A%2Afun%2A%2A%2A%20";
+\tfilename*2="is it not.pdf"
+
+'''
+        msg = email.message_from_string(m)
+        self.assertEqual(
+            msg.get_filename(),
+            'This%20is%20even%20more%20***fun*** is it not.pdf')
+
+    def test_rfc2231_partly_nonencoded(self):
+        m = '''\
+Content-Disposition: inline;
+\tfilename*0="This%20is%20even%20more%20";
+\tfilename*1="%2A%2A%2Afun%2A%2A%2A%20";
+\tfilename*2="is it not.pdf"
+
+'''
+        msg = email.message_from_string(m)
+        self.assertEqual(
+            msg.get_filename(),
+            'This%20is%20even%20more%20%2A%2A%2Afun%2A%2A%2A%20is it not.pdf')
+
     def test_rfc2231_no_language_or_charset_in_boundary(self):
         m = '''\
 Content-Type: multipart/alternative;
-\tboundary*0="This%20is%20even%20more%20";
-\tboundary*1="%2A%2A%2Afun%2A%2A%2A%20";
+\tboundary*0*="''This%20is%20even%20more%20";
+\tboundary*1*="%2A%2A%2Afun%2A%2A%2A%20";
 \tboundary*2="is it not.pdf"
 
 '''
@@ -3028,8 +3069,8 @@ Content-Type: multipart/alternative;
         # This is a nonsensical charset value, but tests the code anyway
         m = '''\
 Content-Type: text/plain;
-\tcharset*0="This%20is%20even%20more%20";
-\tcharset*1="%2A%2A%2Afun%2A%2A%2A%20";
+\tcharset*0*="This%20is%20even%20more%20";
+\tcharset*1*="%2A%2A%2Afun%2A%2A%2A%20";
 \tcharset*2="is it not.pdf"
 
 '''
@@ -3040,12 +3081,98 @@ Content-Type: text/plain;
     def test_rfc2231_unknown_encoding(self):
         m = """\
 Content-Transfer-Encoding: 8bit
-Content-Disposition: inline; filename*0=X-UNKNOWN''myfile.txt
+Content-Disposition: inline; filename*=X-UNKNOWN''myfile.txt
 
 """
         msg = email.message_from_string(m)
         self.assertEqual(msg.get_filename(), 'myfile.txt')
 
+    def test_rfc2231_single_tick_in_filename_extended(self):
+        eq = self.assertEqual
+        m = """\
+Content-Type: application/x-foo;
+\tname*0*=\"Frank's\"; name*1*=\" Document\"
+
+"""
+        msg = email.message_from_string(m)
+        charset, language, s = msg.get_param('name')
+        eq(charset, None)
+        eq(language, None)
+        eq(s, "Frank's Document")
+
+    def test_rfc2231_single_tick_in_filename(self):
+        m = """\
+Content-Type: application/x-foo; name*0=\"Frank's\"; name*1=\" Document\"
+
+"""
+        msg = email.message_from_string(m)
+        param = msg.get_param('name')
+        self.failIf(isinstance(param, tuple))
+        self.assertEqual(param, "Frank's Document")
+
+    def test_rfc2231_tick_attack_extended(self):
+        eq = self.assertEqual
+        m = """\
+Content-Type: application/x-foo;
+\tname*0*=\"us-ascii'en-us'Frank's\"; name*1*=\" Document\"
+
+"""
+        msg = email.message_from_string(m)
+        charset, language, s = msg.get_param('name')
+        eq(charset, 'us-ascii')
+        eq(language, 'en-us')
+        eq(s, "Frank's Document")
+
+    def test_rfc2231_tick_attack(self):
+        m = """\
+Content-Type: application/x-foo;
+\tname*0=\"us-ascii'en-us'Frank's\"; name*1=\" Document\"
+
+"""
+        msg = email.message_from_string(m)
+        param = msg.get_param('name')
+        self.failIf(isinstance(param, tuple))
+        self.assertEqual(param, "us-ascii'en-us'Frank's Document")
+
+    def test_rfc2231_no_extended_values(self):
+        eq = self.assertEqual
+        m = """\
+Content-Type: application/x-foo; name=\"Frank's Document\"
+
+"""
+        msg = email.message_from_string(m)
+        eq(msg.get_param('name'), "Frank's Document")
+
+    def test_rfc2231_encoded_then_unencoded_segments(self):
+        eq = self.assertEqual
+        m = """\
+Content-Type: application/x-foo;
+\tname*0*=\"us-ascii'en-us'My\";
+\tname*1=\" Document\";
+\tname*2*=\" For You\"
+
+"""
+        msg = email.message_from_string(m)
+        charset, language, s = msg.get_param('name')
+        eq(charset, 'us-ascii')
+        eq(language, 'en-us')
+        eq(s, 'My Document For You')
+
+    def test_rfc2231_unencoded_then_encoded_segments(self):
+        eq = self.assertEqual
+        m = """\
+Content-Type: application/x-foo;
+\tname*0=\"us-ascii'en-us'My\";
+\tname*1*=\" Document\";
+\tname*2*=\" For You\"
+
+"""
+        msg = email.message_from_string(m)
+        charset, language, s = msg.get_param('name')
+        eq(charset, 'us-ascii')
+        eq(language, 'en-us')
+        eq(s, 'My Document For You')
+
 
 
 def _testclasses():
diff --git a/Misc/NEWS b/Misc/NEWS
index 87d5ac24da4f..715a7a92c12c 100644
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -59,6 +59,18 @@ Extension Modules
 Library
 -------
 
+- The email package has improved RFC 2231 support, specifically for
+  recognizing the difference between encoded (name*0*=<blah>) and non-encoded
+  (name*0=<blah>) parameter continuations.  This may change the types of
+  values returned from email.Message.Message.get_param() and friends.
+  Specifically in some cases where non-encoded continuations were used,
+  get_param() used to return a 3-tuple of (None, None, string) whereas now it
+  will just return the string (since non-encoded continuations don't have
+  charset and language parts).
+
+  Also, whereas % values were decoded in all parameter continuations, they are
+  now only decoded in encoded parameter parts.
+
 - Bug #822974: Honor timeout in telnetlib.{expect,read_until}
   even if some data are received.
 
-- 
2.47.3