From: Ben Darnell <ben@bendarnell.com>
Date: Mon, 30 May 2011 06:32:22 +0000 (-0700)
Subject: Allow the application to determine the encoding used for url parameters (previously hard-coded to utf8).
X-Git-Tag: v2.0.0~42
X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=2f8dd6e4c02ad4a8444b0e675ca05fa125293dd5;p=thirdparty%2Ftornado.git

Allow the application to determine the encoding used for url parameters (previously hard-coded to utf8).

HTTPRequest.arguments now maps native-string keys to lists of byte-string
values.  That's slightly inconsistent, but having byte strings as dictionary
keys is awkward.
---

diff --git a/tornado/escape.py b/tornado/escape.py
index 645790964..74a1caa7e 100644
--- a/tornado/escape.py
+++ b/tornado/escape.py
@@ -26,6 +26,11 @@ import urllib
 try: bytes
 except: bytes = str
 
+try:
+    from urlparse import parse_qs  # Python 2.6+
+except ImportError:
+    from cgi import parse_qs
+
 # json module is in the standard library as of python 2.6; fall back to
 # simplejson if present for older versions.
 try:
@@ -88,7 +93,8 @@ def url_escape(value):
     return urllib.quote_plus(utf8(value))
 
 # python 3 changed things around enough that we need two separate
-# implementations of url_unescape
+# implementations of url_unescape.  We also need our own implementation
+# of parse_qs since python 3's version insists on decoding everything.
 if sys.version_info[0] < 3:
     def url_unescape(value, encoding='utf-8'):
         """Decodes the given value from a URL.
@@ -102,6 +108,8 @@ if sys.version_info[0] < 3:
             return urllib.unquote_plus(utf8(value))
         else:
             return unicode(urllib.unquote_plus(utf8(value)), encoding)
+
+    parse_qs_bytes = parse_qs
 else:
     def url_unescape(value, encoding='utf-8'):
         """Decodes the given value from a URL.
@@ -116,6 +124,24 @@ else:
         else:
             return urllib.unquote_plus(native_str(value), encoding=encoding)
 
+    def parse_qs_bytes(qs, keep_blank_values=False, strict_parsing=False):
+        """Parses a query string like urlparse.parse_qs, but returns the
+        values as byte strings.
+
+        Keys still become type str (interpreted as latin1 in python3!)
+        because it's too painful to keep them as byte strings in
+        python3 and in practice they're nearly always ascii anyway.
+        """
+        # This is gross, but python3 doesn't give us another way.
+        # Latin1 is the universal donor of character encodings.
+        result = parse_qs(qs, keep_blank_values, strict_parsing,
+                          encoding='latin1', errors='strict')
+        encoded = {}
+        for k, v in result.items():  # py3-only branch: no dict.iteritems()
+            encoded[k] = [i.encode('latin1') for i in v]
+        return encoded
+        
+
 
 _UTF8_TYPES = (bytes, type(None))
 def utf8(value):
@@ -153,6 +179,22 @@ else:
     native_str = utf8
 
 
+def recursive_unicode(obj):
+    """Walks a simple data structure, converting byte strings to unicode.
+
+    Supports lists, tuples, and dicts (keys too); other types pass through.
+    """
+    if isinstance(obj, dict):
+        return dict((recursive_unicode(k), recursive_unicode(v)) for (k,v) in obj.iteritems())
+    elif isinstance(obj, list):
+        return list(recursive_unicode(i) for i in obj)
+    elif isinstance(obj, tuple):
+        return tuple(recursive_unicode(i) for i in obj)
+    elif isinstance(obj, bytes):
+        return to_unicode(obj)
+    else:
+        return obj  # not a container or byte string: returned unchanged
+
 # I originally used the regex from 
 # http://daringfireball.net/2010/07/improved_regex_for_matching_urls
 # but it gets all exponential on certain patterns (such as too many trailing
diff --git a/tornado/httpserver.py b/tornado/httpserver.py
index bcfdc78fa..f5ab0e6c8 100644
--- a/tornado/httpserver.py
+++ b/tornado/httpserver.py
@@ -23,18 +23,13 @@ import socket
 import time
 import urlparse
 
-from tornado.escape import utf8, native_str
+from tornado.escape import utf8, native_str, parse_qs_bytes
 from tornado import httputil
 from tornado import ioloop
 from tornado import iostream
 from tornado import stack_context
 from tornado.util import b, bytes_type
 
-try:
-    from urlparse import parse_qs  # Python 2.6+
-except ImportError:
-    from cgi import parse_qs
-
 try:
     import fcntl
 except ImportError:
@@ -398,7 +393,7 @@ class HTTPConnection(object):
         content_type = self._request.headers.get("Content-Type", "")
         if self._request.method in ("POST", "PUT"):
             if content_type.startswith("application/x-www-form-urlencoded"):
-                arguments = parse_qs(native_str(self._request.body))
+                arguments = parse_qs_bytes(native_str(self._request.body))
                 for name, values in arguments.iteritems():
                     values = [v for v in values if v]
                     if values:
@@ -511,7 +506,7 @@ class HTTPRequest(object):
         scheme, netloc, path, query, fragment = urlparse.urlsplit(native_str(uri))
         self.path = path
         self.query = query
-        arguments = parse_qs(query)
+        arguments = parse_qs_bytes(query)
         self.arguments = {}
         for name, values in arguments.iteritems():
             values = [v for v in values if v]
diff --git a/tornado/test/httpserver_test.py b/tornado/test/httpserver_test.py
index 4600f77f3..def4f0fd2 100644
--- a/tornado/test/httpserver_test.py
+++ b/tornado/test/httpserver_test.py
@@ -1,7 +1,7 @@
 #!/usr/bin/env python
 
 from tornado import httpclient, simple_httpclient
-from tornado.escape import json_decode, utf8, _unicode
+from tornado.escape import json_decode, utf8, _unicode, recursive_unicode
 from tornado.iostream import IOStream
 from tornado.simple_httpclient import SimpleAsyncHTTPClient
 from tornado.testing import AsyncHTTPTestCase, LogTrapTestCase
@@ -138,7 +138,7 @@ class HTTPConnectionTest(AsyncHTTPTestCase, LogTrapTestCase):
 
 class EchoHandler(RequestHandler):
     def get(self):
-        self.write(self.request.arguments)
+        self.write(recursive_unicode(self.request.arguments))
 
 class TypeCheckHandler(RequestHandler):
     def prepare(self):
@@ -160,7 +160,7 @@ class TypeCheckHandler(RequestHandler):
         self.check_type('header_value', self.request.headers.values()[0], str)
 
         self.check_type('arg_key', self.request.arguments.keys()[0], str)
-        self.check_type('arg_value', self.request.arguments.values()[0][0], str)
+        self.check_type('arg_value', self.request.arguments.values()[0][0], bytes_type)
 
     def post(self):
         self.check_type('body', self.request.body, bytes_type)
diff --git a/tornado/test/web_test.py b/tornado/test/web_test.py
index 5e0d40ddc..70b2edb07 100644
--- a/tornado/test/web_test.py
+++ b/tornado/test/web_test.py
@@ -1,4 +1,4 @@
-from tornado.escape import json_decode, utf8
+from tornado.escape import json_decode, utf8, to_unicode, recursive_unicode, native_str
 from tornado.iostream import IOStream
 from tornado.testing import LogTrapTestCase, AsyncHTTPTestCase
 from tornado.util import b, bytes_type
@@ -148,18 +148,19 @@ class ConnectionCloseTest(AsyncHTTPTestCase, LogTrapTestCase):
 
 class EchoHandler(RequestHandler):
     def get(self, path):
-        # Type checks:  web.py interfaces convert arguments to unicode
-        # strings.  In httpserver.py (i.e. self.request.arguments),
-        # they're left as the native str type.
+        # Type checks: web.py interfaces convert argument values to
+        # unicode strings (by default, but see also decode_argument).
+        # In httpserver.py (i.e. self.request.arguments), they're left
+        # as bytes.  Keys are always native strings.
         for key in self.request.arguments:
-            assert type(key) == type(""), repr(key)
+            assert type(key) == str, repr(key)
             for value in self.request.arguments[key]:
-                assert type(value) == type(""), repr(value)
+                assert type(value) == bytes_type, repr(value)
             for value in self.get_arguments(key):
                 assert type(value) == unicode, repr(value)
         assert type(path) == unicode, repr(path)
         self.write(dict(path=path,
-                        args=self.request.arguments))
+                        args=recursive_unicode(self.request.arguments)))
 
 class RequestEncodingTest(AsyncHTTPTestCase, LogTrapTestCase):
     def get_app(self):
@@ -215,9 +216,33 @@ class TypeCheckHandler(RequestHandler):
             self.errors[name] = "expected %s, got %s" % (expected_type,
                                                          actual_type)
 
+class DecodeArgHandler(RequestHandler):  # decodes args per the "encoding" query arg
+    def decode_argument(self, value, name=None):
+        assert type(value) == bytes_type, repr(value)
+        # read request.arguments directly: get_arguments() calls decode_argument
+        if 'encoding' in self.request.arguments:
+            return value.decode(to_unicode(self.request.arguments['encoding'][0]))
+        else:
+            return value  # no encoding requested: hand back the raw bytes
+
+    def get(self, arg):
+        def describe(s):  # -> [type name, value]; bytes are hex-encoded
+            if type(s) == bytes_type:
+                return ["bytes", native_str(binascii.b2a_hex(s))]
+            elif type(s) == unicode:
+                return ["unicode", s]
+            raise Exception("unknown type")
+        self.write({'path': describe(arg),
+                    'query': describe(self.get_argument("foo")),
+                    })
+
 class WebTest(AsyncHTTPTestCase, LogTrapTestCase):
     def get_app(self):
-        return Application([url("/typecheck/(.*)", TypeCheckHandler, name='typecheck')])
+        return Application([
+                url("/typecheck/(.*)", TypeCheckHandler, name='typecheck'),
+                url("/decode_arg/(.*)", DecodeArgHandler),
+                url("/decode_arg_kw/(?P<arg>.*)", DecodeArgHandler),
+                ])
 
     def test_types(self):
         response = self.fetch("/typecheck/asdf?foo=bar",
@@ -228,3 +253,24 @@ class WebTest(AsyncHTTPTestCase, LogTrapTestCase):
         response = self.fetch("/typecheck/asdf?foo=bar", method="POST",
                               headers={"Cookie": "cook=ie"},
                               body="foo=bar")
+
+    def test_decode_argument(self):
+        # These urls all decode to the same thing: u'\u00e9' in path and query
+        urls = ["/decode_arg/%C3%A9?foo=%C3%A9&encoding=utf-8",
+                "/decode_arg/%E9?foo=%E9&encoding=latin1",
+                "/decode_arg_kw/%E9?foo=%E9&encoding=latin1",
+                ]
+        for url in urls:
+            response = self.fetch(url)
+            response.rethrow()
+            data = json_decode(response.body)
+            self.assertEqual(data, {u'path': [u'unicode', u'\u00e9'],
+                                    u'query': [u'unicode', u'\u00e9'],
+                                    })
+        # With no "encoding" argument the handler leaves both values as bytes.
+        response = self.fetch("/decode_arg/%C3%A9?foo=%C3%A9")
+        response.rethrow()
+        data = json_decode(response.body)
+        self.assertEqual(data, {u'path': [u'bytes', u'c3a9'],
+                                u'query': [u'bytes', u'c3a9'],
+                                })
diff --git a/tornado/web.py b/tornado/web.py
index d631b15bc..63e1ccf16 100644
--- a/tornado/web.py
+++ b/tornado/web.py
@@ -249,14 +249,32 @@ class RequestHandler(object):
 
         The returned values are always unicode.
         """
-        values = self.request.arguments.get(name, [])
-        # Get rid of any weird control chars
-        values = [re.sub(r"[\x00-\x08\x0e-\x1f]", " ", _unicode(x)) 
-                  for x in values]
-        if strip:
-            values = [x.strip() for x in values]
+        values = []
+        for v in self.request.arguments.get(name, []):
+            v = self.decode_argument(v, name=name)
+            if isinstance(v, unicode):
+                # Get rid of any weird control chars (unless decoding gave
+                # us bytes, in which case leave it alone)
+                v = re.sub(r"[\x00-\x08\x0e-\x1f]", " ", v)
+            if strip:
+                v = v.strip()
+            values.append(v)
         return values
 
+    def decode_argument(self, value, name=None):
+        """Decodes an argument from the request.
+
+        The argument has been percent-decoded and is now a byte string.
+        By default, this method decodes the argument as utf-8 and returns
+        a unicode string, but this may be overridden in subclasses.
+
+        This method is used as a filter for both get_argument() and for
+        values extracted from the url and passed to get()/post()/etc.
+
+        The name of the argument is provided if known, but may be None
+        (e.g. for unnamed groups in the url regex).
+        """
+        return _unicode(value)  # an override may instead return bytes
 
     @property
     def cookies(self):
@@ -881,6 +899,9 @@ class RequestHandler(object):
                 self.check_xsrf_cookie()
             self.prepare()
             if not self._finished:
+                args = [self.decode_argument(arg) for arg in args]
+                kwargs = dict((k, self.decode_argument(v, name=k))
+                              for (k,v) in kwargs.iteritems())
                 getattr(self, self.request.method.lower())(*args, **kwargs)
                 if self._auto_finish and not self._finished:
                     self.finish()
@@ -1198,15 +1219,17 @@ class Application(object):
             for spec in handlers:
                 match = spec.regex.match(request.path)
                 if match:
-                    # None-safe wrapper around urllib.unquote to handle
+                    # None-safe wrapper around url_unescape to handle
                     # unmatched optional groups correctly
                     def unquote(s):
                         if s is None: return s
-                        return _unicode(urllib.unquote(s))
+                        return escape.url_unescape(s, encoding=None)
                     handler = spec.handler_class(self, request, **spec.kwargs)
                     # Pass matched groups to the handler.  Since
                     # match.groups() includes both named and unnamed groups,
                     # we want to use either groups or groupdict but not both.
+                    # Note that args are passed as bytes so the handler can
+                    # decide what encoding to use.
                     kwargs = dict((k, unquote(v))
                                   for (k, v) in match.groupdict().iteritems())
                     if kwargs: