From: Ben Darnell Date: Sun, 12 May 2013 19:48:52 +0000 (-0400) Subject: Add a 'plus' argument to url_{un,}escape, defaulting to True. X-Git-Tag: v3.1.0~76^2~14 X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=7056d6fbd1c4778cf56177698d57152188203e0d;p=thirdparty%2Ftornado.git Add a 'plus' argument to url_{un,}escape, defaulting to True. Closes #25. --- diff --git a/tornado/escape.py b/tornado/escape.py index 546bd2ea8..688f7f087 100644 --- a/tornado/escape.py +++ b/tornado/escape.py @@ -88,41 +88,67 @@ def squeeze(value): return re.sub(r"[\x00-\x20]+", " ", value).strip() -def url_escape(value): - """Returns a URL-encoded version of the given value.""" - return urllib_parse.quote_plus(utf8(value)) +def url_escape(value, plus=True): + """Returns a URL-encoded version of the given value. + + If ``plus`` is true (the default), spaces will be represented + as "+" instead of "%20". This is appropriate for query strings + but not for the path component of a URL. Note that this default + is the reverse of Python's urllib module. + """ + quote = urllib_parse.quote_plus if plus else urllib_parse.quote + return quote(utf8(value)) + # python 3 changed things around enough that we need two separate # implementations of url_unescape. We also need our own implementation # of parse_qs since python 3's version insists on decoding everything. if sys.version_info[0] < 3: - def url_unescape(value, encoding='utf-8'): + def url_unescape(value, encoding='utf-8', plus=True): """Decodes the given value from a URL. The argument may be either a byte or unicode string. If encoding is None, the result will be a byte string. Otherwise, the result is a unicode string in the specified encoding. + + If ``plus`` is true (the default), plus signs will be interpreted + as spaces (literal plus signs must be represented as "%2B"). This + is appropriate for query strings and form-encoded values but not + for the path component of a URL. Note that this default is the + reverse of Python's urllib module. """ + unquote = (urllib_parse.unquote_plus if plus else urllib_parse.unquote) if encoding is None: - return urllib_parse.unquote_plus(utf8(value)) + return unquote(utf8(value)) else: - return unicode_type(urllib_parse.unquote_plus(utf8(value)), encoding) + return unicode_type(unquote(utf8(value)), encoding) parse_qs_bytes = _parse_qs else: - def url_unescape(value, encoding='utf-8'): + def url_unescape(value, encoding='utf-8', plus=True): """Decodes the given value from a URL. The argument may be either a byte or unicode string. If encoding is None, the result will be a byte string. Otherwise, the result is a unicode string in the specified encoding. + + If ``plus`` is true (the default), plus signs will be interpreted + as spaces (literal plus signs must be represented as "%2B"). This + is appropriate for query strings and form-encoded values but not + for the path component of a URL. Note that this default is the + reverse of Python's urllib module. """ if encoding is None: + if plus: + # unquote_to_bytes doesn't have a _plus variant + value = to_basestring(value).replace('+', ' ') return urllib_parse.unquote_to_bytes(value) else: - return urllib_parse.unquote_plus(to_basestring(value), encoding=encoding) + unquote = (urllib_parse.unquote_plus if plus + else urllib_parse.unquote) + return unquote(to_basestring(value), encoding=encoding) def parse_qs_bytes(qs, keep_blank_values=False, strict_parsing=False): """Parses a query string like urlparse.parse_qs, but returns the diff --git a/tornado/test/escape_test.py b/tornado/test/escape_test.py index 8b4522c0c..90573c166 100644 --- a/tornado/test/escape_test.py +++ b/tornado/test/escape_test.py @@ -151,7 +151,7 @@ class EscapeTestCase(unittest.TestCase): self.assertEqual(utf8(xhtml_escape(unescaped)), utf8(escaped)) self.assertEqual(utf8(unescaped), utf8(xhtml_unescape(escaped))) - def test_url_escape(self): + def test_url_escape_unicode(self): tests = [ # byte strings are passed through as-is (u('\u00e9').encode('utf8'), '%C3%A9'), @@ -163,7 +163,7 @@ class EscapeTestCase(unittest.TestCase): for unescaped, escaped in tests: self.assertEqual(url_escape(unescaped), escaped) - def test_url_unescape(self): + def test_url_unescape_unicode(self): tests = [ ('%C3%A9', u('\u00e9'), 'utf8'), ('%C3%A9', u('\u00c3\u00a9'), 'latin1'), @@ -176,6 +176,19 @@ class EscapeTestCase(unittest.TestCase): self.assertEqual(url_unescape(to_unicode(escaped), encoding), unescaped) self.assertEqual(url_unescape(utf8(escaped), encoding), unescaped) + def test_url_escape_quote_plus(self): + unescaped = '+ #%' + plus_escaped = '%2B+%23%25' + escaped = '%2B%20%23%25' + self.assertEqual(url_escape(unescaped), plus_escaped) + self.assertEqual(url_escape(unescaped, plus=False), escaped) + self.assertEqual(url_unescape(plus_escaped), unescaped) + self.assertEqual(url_unescape(escaped, plus=False), unescaped) + self.assertEqual(url_unescape(plus_escaped, encoding=None), + utf8(unescaped)) + self.assertEqual(url_unescape(escaped, encoding=None, plus=False), + utf8(unescaped)) + def test_escape_return_types(self): # On python2 the escape methods should generally return the same # type as their argument