escape: Use the standard library where possible

author Ben Darnell <ben@bendarnell.com>

Wed, 23 Aug 2023 01:27:05 +0000 (21:27 -0400)

committer Ben Darnell <ben@bendarnell.com>

Wed, 23 Aug 2023 01:27:05 +0000 (21:27 -0400)
author Ben Darnell <ben@bendarnell.com>
Wed, 23 Aug 2023 01:27:05 +0000 (21:27 -0400)
committer Ben Darnell <ben@bendarnell.com>
Wed, 23 Aug 2023 01:27:05 +0000 (21:27 -0400)
diff --git a/tornado/escape.py b/tornado/escape.py

index 55354c30f423cc2ce15724aaa906b9ec05dea386..af2eb59b4921ca395b0089a8d1d8dca510bddc1f 100644 (file)
--- a/tornado/escape.py
+++ b/tornado/escape.py
@@ -17,9 +17,15 @@
  
  Also includes a few other miscellaneous string manipulation functions that
  have crept in over time.
+
+Many functions in this module have near-equivalents in the standard library
+(the differences mainly relate to handling of bytes and unicode strings,
+and were more relevant in Python 2). In new code, the standard library
+functions are encouraged instead of this module where applicable. See the
+docstrings on each function for details.
  """
  
-import html.entities
+import html
  import json
  import re
  import urllib.parse
@@ -30,16 +36,6 @@ import typing
  from typing import Union, Any, Optional, Dict, List, Callable
  
  
-_XHTML_ESCAPE_RE = re.compile("[&<>\"']")
-_XHTML_ESCAPE_DICT = {
-    "&": "&amp;",
-    "<": "&lt;",
-    ">": "&gt;",
-    '"': "&quot;",
-    "'": "&#39;",
-}
-
-
  def xhtml_escape(value: Union[str, bytes]) -> str:
      """Escapes a string so it is valid within HTML or XML.
  
@@ -47,25 +43,50 @@ def xhtml_escape(value: Union[str, bytes]) -> str:
      When used in attribute values the escaped strings must be enclosed
      in quotes.
  
+    Equivalent to `html.escape` except that this function also accepts
+    `bytes` input and always returns `str`; `html.escape` accepts only `str`.
+
      .. versionchanged:: 3.2
  
         Added the single quote to the list of escaped characters.
+
+    .. versionchanged:: 6.4
+
+       Now simply wraps `html.escape`. This is equivalent to the old behavior
+       except that single quotes are now escaped as ``&#x27;`` instead of
+       ``&#39;`` and performance may be different.
      """
-    return _XHTML_ESCAPE_RE.sub(
-        lambda match: _XHTML_ESCAPE_DICT[match.group(0)], to_basestring(value)
-    )
+    return html.escape(to_unicode(value))
  
  
  def xhtml_unescape(value: Union[str, bytes]) -> str:
-    """Un-escapes an XML-escaped string."""
-    return re.sub(r"&(#?)(\w+?);", _convert_entity, _unicode(value))
+    """Un-escapes an XML-escaped string.
+    
+    Equivalent to `html.unescape` except that this function also accepts
+    `bytes` input and always returns `str`; `html.unescape` accepts only `str`.
+
+    .. versionchanged:: 6.4
+
+       Now simply wraps `html.unescape`. This changes behavior for some inputs
+       as required by the HTML 5 specification
+       https://html.spec.whatwg.org/multipage/parsing.html#numeric-character-reference-end-state
+
+       Invalid references such as surrogates now yield U+FFFD, and numeric
+       references to certain ISO-8859-1 characters are now handled correctly.
+    """
+    return html.unescape(to_unicode(value))
  
  
  # The fact that json_encode wraps json.dumps is an implementation detail.
  # Please see https://github.com/tornadoweb/tornado/pull/706
  # before sending a pull request that adds **kwargs to this function.
  def json_encode(value: Any) -> str:
-    """JSON-encodes the given Python object."""
+    """JSON-encodes the given Python object.
+    
+    Equivalent to `json.dumps` with the additional guarantee that the output
+    will never contain the character sequence ``</`` which can be problematic
+    when JSON is embedded in an HTML ``<script>`` tag.
+    """
      # JSON permits but does not require forward slashes to be escaped.
      # This is useful when json data is emitted in a <script> tag
      # in HTML, as it prevents </script> tags from prematurely terminating
@@ -78,9 +99,9 @@ def json_encode(value: Any) -> str:
  def json_decode(value: Union[str, bytes]) -> Any:
      """Returns Python objects for the given JSON string.
  
-    Supports both `str` and `bytes` inputs.
+    Supports both `str` and `bytes` inputs. Equivalent to `json.loads`.
      """
-    return json.loads(to_basestring(value))
+    return json.loads(value)
  
  
  def squeeze(value: str) -> str:
@@ -91,16 +112,20 @@ def squeeze(value: str) -> str:
  def url_escape(value: Union[str, bytes], plus: bool = True) -> str:
      """Returns a URL-encoded version of the given value.
  
-    If ``plus`` is true (the default), spaces will be represented
-    as "+" instead of "%20".  This is appropriate for query strings
-    but not for the path component of a URL.  Note that this default
-    is the reverse of Python's urllib module.
+    Equivalent to either `urllib.parse.quote_plus` or `urllib.parse.quote` depending on the ``plus``
+    argument. 
+    
+    If ``plus`` is true (the default), spaces will be represented as ``+`` and slashes will be
+    represented as ``%2F``.  This is appropriate for query strings. If ``plus`` is false, spaces
+    will be represented as ``%20`` and slashes are left as-is. This is appropriate for the path
+    component of a URL. Note that the default of ``plus=True`` is effectively the
+    reverse of Python's urllib module.
  
      .. versionadded:: 3.1
          The ``plus`` argument
      """
      quote = urllib.parse.quote_plus if plus else urllib.parse.quote
-    return quote(utf8(value))
+    return quote(value)
  
  
  @typing.overload
@@ -122,14 +147,15 @@ def url_unescape(  # noqa: F811
  
      The argument may be either a byte or unicode string.
  
-    If encoding is None, the result will be a byte string.  Otherwise,
-    the result is a unicode string in the specified encoding.
+    If encoding is None, the result will be a byte string and this function is equivalent to
+    `urllib.parse.unquote_to_bytes` if ``plus=False``.  Otherwise, the result is a unicode string in
+    the specified encoding and this function is equivalent to either `urllib.parse.unquote_plus` or
+    `urllib.parse.unquote` except that this function also accepts `bytes` as input.
  
-    If ``plus`` is true (the default), plus signs will be interpreted
-    as spaces (literal plus signs must be represented as "%2B").  This
-    is appropriate for query strings and form-encoded values but not
-    for the path component of a URL.  Note that this default is the
-    reverse of Python's urllib module.
+    If ``plus`` is true (the default), plus signs will be interpreted as spaces (literal plus signs
+    must be represented as "%2B").  This is appropriate for query strings and form-encoded values
+    but not for the path component of a URL.  Note that this default is the reverse of Python's
+    urllib module.
  
      .. versionadded:: 3.1
         The ``plus`` argument
@@ -375,28 +401,3 @@ def linkify(
      # that we won't pick up &quot;, etc.
      text = _unicode(xhtml_escape(text))
      return _URL_RE.sub(make_link, text)
-
-
-def _convert_entity(m: typing.Match) -> str:
-    if m.group(1) == "#":
-        try:
-            if m.group(2)[:1].lower() == "x":
-                return chr(int(m.group(2)[1:], 16))
-            else:
-                return chr(int(m.group(2)))
-        except ValueError:
-            return "&#%s;" % m.group(2)
-    try:
-        return _HTML_UNICODE_MAP[m.group(2)]
-    except KeyError:
-        return "&%s;" % m.group(2)
-
-
-def _build_unicode_map() -> Dict[str, str]:
-    unicode_map = {}
-    for name, value in html.entities.name2codepoint.items():
-        unicode_map[name] = chr(value)
-    return unicode_map
-
-
-_HTML_UNICODE_MAP = _build_unicode_map()
diff --git a/tornado/test/escape_test.py b/tornado/test/escape_test.py

index a90d11d663581d7786cacf0f04a09499b8f260e1..6bd2ae79e444760d46a94caf7b0619f253bc9e90 100644 (file)
--- a/tornado/test/escape_test.py
+++ b/tornado/test/escape_test.py
@@ -220,7 +220,7 @@ class EscapeTestCase(unittest.TestCase):
              ("<foo>", "&lt;foo&gt;"),
              ("<foo>", "&lt;foo&gt;"),
              (b"<foo>", b"&lt;foo&gt;"),
-            ("<>&\"'", "&lt;&gt;&amp;&quot;&#39;"),
+            ("<>&\"'", "&lt;&gt;&amp;&quot;&#x27;"),
              ("&amp;", "&amp;amp;"),
              ("<\u00e9>", "&lt;\u00e9&gt;"),
              (b"<\xc3\xa9>", b"&lt;\xc3\xa9&gt;"),
author	Ben Darnell <ben@bendarnell.com>
	Wed, 23 Aug 2023 01:27:05 +0000 (21:27 -0400)
committer	Ben Darnell <ben@bendarnell.com>
	Wed, 23 Aug 2023 01:27:05 +0000 (21:27 -0400)
tornado/escape.py		patch \| blob \| blame \| history
tornado/test/escape_test.py		patch \| blob \| blame \| history