return re.sub(r"[\x00-\x20]+", " ", value).strip()
-def url_escape(value):
- """Returns a URL-encoded version of the given value."""
- return urllib_parse.quote_plus(utf8(value))
+def url_escape(value, plus=True):
+    """Percent-encodes ``value`` for use in a URL.
+
+    ``value`` is first passed through ``utf8()`` and the result is then
+    quoted. With ``plus`` true (the default) the quoting is done by
+    ``urllib_parse.quote_plus``, which writes each space as "+"; this
+    suits query strings. With ``plus`` false it is done by
+    ``urllib_parse.quote``, which writes each space as "%20", the form
+    needed in the path component of a URL.
+
+    Note that defaulting to the "+" form is the reverse of Python's
+    urllib module.
+    """
+    encoded = utf8(value)
+    if plus:
+        return urllib_parse.quote_plus(encoded)
+    # NOTE(review): if ``urllib_parse`` is the standard library module,
+    # ``quote`` leaves "/" unescaped by default whereas ``quote_plus``
+    # escapes it -- confirm that asymmetry is intended for plus=False.
+    return urllib_parse.quote(encoded)
+
# python 3 changed things around enough that we need two separate
# implementations of url_unescape. We also need our own implementation
# of parse_qs since python 3's version insists on decoding everything.
if sys.version_info[0] < 3:
- def url_unescape(value, encoding='utf-8'):
+ def url_unescape(value, encoding='utf-8', plus=True):
"""Decodes the given value from a URL.
The argument may be either a byte or unicode string.
If encoding is None, the result will be a byte string. Otherwise,
the result is a unicode string in the specified encoding.
+
+ If ``plus`` is true (the default), plus signs will be interpreted
+ as spaces (literal plus signs must be represented as "%2B"). This
+ is appropriate for query strings and form-encoded values but not
+ for the path component of a URL. Note that this default is the
+ reverse of Python's urllib module, whose plain ``unquote`` leaves
+ plus signs untouched.
"""
+ # Pick the decoder once so that both return paths below honor ``plus``.
+ unquote = (urllib_parse.unquote_plus if plus else urllib_parse.unquote)
if encoding is None:
- return urllib_parse.unquote_plus(utf8(value))
+ return unquote(utf8(value))
else:
- return unicode_type(urllib_parse.unquote_plus(utf8(value)), encoding)
+ return unicode_type(unquote(utf8(value)), encoding)
parse_qs_bytes = _parse_qs
else:
- def url_unescape(value, encoding='utf-8'):
+ def url_unescape(value, encoding='utf-8', plus=True):
"""Decodes the given value from a URL.
The argument may be either a byte or unicode string.
If encoding is None, the result will be a byte string. Otherwise,
the result is a unicode string in the specified encoding.
+
+ If ``plus`` is true (the default), plus signs will be interpreted
+ as spaces (literal plus signs must be represented as "%2B"). This
+ is appropriate for query strings and form-encoded values but not
+ for the path component of a URL. Note that this default is the
+ reverse of Python's urllib module, whose plain ``unquote`` leaves
+ plus signs untouched.
"""
if encoding is None:
+ if plus:
+ # unquote_to_bytes doesn't have a _plus variant, so turn "+"
+ # into a space by hand. This must happen before unquoting,
+ # while a literal plus sign is still spelled "%2B".
+ # NOTE(review): only this path routes ``value`` through
+ # to_basestring; with plus=False it reaches unquote_to_bytes
+ # exactly as given -- confirm that difference is intended.
+ value = to_basestring(value).replace('+', ' ')
return urllib_parse.unquote_to_bytes(value)
else:
- return urllib_parse.unquote_plus(to_basestring(value), encoding=encoding)
+ # Text result: the chosen decoder is handed ``encoding`` directly.
+ unquote = (urllib_parse.unquote_plus if plus
+ else urllib_parse.unquote)
+ return unquote(to_basestring(value), encoding=encoding)
def parse_qs_bytes(qs, keep_blank_values=False, strict_parsing=False):
"""Parses a query string like urlparse.parse_qs, but returns the
self.assertEqual(utf8(xhtml_escape(unescaped)), utf8(escaped))
self.assertEqual(utf8(unescaped), utf8(xhtml_unescape(escaped)))
- def test_url_escape(self):
+ def test_url_escape_unicode(self):
tests = [
# byte strings are passed through as-is
(u('\u00e9').encode('utf8'), '%C3%A9'),
for unescaped, escaped in tests:
self.assertEqual(url_escape(unescaped), escaped)
- def test_url_unescape(self):
+ def test_url_unescape_unicode(self):
tests = [
('%C3%A9', u('\u00e9'), 'utf8'),
('%C3%A9', u('\u00c3\u00a9'), 'latin1'),
self.assertEqual(url_unescape(to_unicode(escaped), encoding), unescaped)
self.assertEqual(url_unescape(utf8(escaped), encoding), unescaped)
+ def test_url_escape_quote_plus(self):
+     # One sample holding a literal plus, a space and two other
+     # characters that get percent-escaped, with its two encodings.
+     raw = '+ #%'
+     with_plus = '%2B+%23%25'
+     without_plus = '%2B%20%23%25'
+
+     # Escaping: the default writes the space as "+", plus=False as "%20".
+     self.assertEqual(url_escape(raw), with_plus)
+     self.assertEqual(url_escape(raw, plus=False), without_plus)
+
+     # Unescaping must round-trip in both modes, first with the default
+     # encoding and then with encoding=None (compared against utf8(raw)).
+     for extra, expected in (({}, raw), ({'encoding': None}, utf8(raw))):
+         self.assertEqual(url_unescape(with_plus, **extra), expected)
+         self.assertEqual(url_unescape(without_plus, plus=False, **extra),
+                          expected)
+
def test_escape_return_types(self):
# On python2 the escape methods should generally return the same
# type as their argument