gh-140793: Improve documentation and tests for the ensure_ascii option in the json...

author Serhiy Storchaka <storchaka@gmail.com>

Sat, 8 Nov 2025 10:07:27 +0000 (12:07 +0200)

committer GitHub <noreply@github.com>

Sat, 8 Nov 2025 10:07:27 +0000 (12:07 +0200)
author Serhiy Storchaka <storchaka@gmail.com>
Sat, 8 Nov 2025 10:07:27 +0000 (12:07 +0200)
committer GitHub <noreply@github.com>
Sat, 8 Nov 2025 10:07:27 +0000 (12:07 +0200)
diff --git a/Doc/library/json.rst b/Doc/library/json.rst

index 12a5a96a3c56f3b9733760de74d62309216fe2e7..8b4217c210d5b3ea9c908936ec1e64f551732a17 100644 (file)
--- a/Doc/library/json.rst
+++ b/Doc/library/json.rst
@@ -183,8 +183,10 @@ Basic Usage
  
     :param bool ensure_ascii:
        If ``True`` (the default), the output is guaranteed to
-      have all incoming non-ASCII characters escaped.
-      If ``False``, these characters will be outputted as-is.
+      have all incoming non-ASCII and non-printable characters escaped.
+      If ``False``, all characters will be output as-is, except for
+      the characters that must be escaped: quotation mark, reverse solidus,
+      and the control characters U+0000 through U+001F.
  
     :param bool check_circular:
        If ``False``, the circular reference check for container types is skipped
@@ -495,8 +497,10 @@ Encoders and Decoders
     :class:`bool` or ``None``.  If *skipkeys* is true, such items are simply skipped.
  
     If *ensure_ascii* is true (the default), the output is guaranteed to
-   have all incoming non-ASCII characters escaped.  If *ensure_ascii* is
-   false, these characters will be output as-is.
+   have all incoming non-ASCII and non-printable characters escaped.
+   If *ensure_ascii* is false, all characters will be output as-is, except for
+   the characters that must be escaped: quotation mark, reverse solidus,
+   and the control characters U+0000 through U+001F.
  
     If *check_circular* is true (the default), then lists, dicts, and custom
     encoded objects will be checked for circular references during encoding to
@@ -636,7 +640,7 @@ UTF-32, with UTF-8 being the recommended default for maximum interoperability.
  
  As permitted, though not required, by the RFC, this module's serializer sets
  *ensure_ascii=True* by default, thus escaping the output so that the resulting
-strings only contain ASCII characters.
+strings only contain printable ASCII characters.
  
  Other than the *ensure_ascii* parameter, this module is defined strictly in
  terms of conversion between Python objects and
diff --git a/Lib/json/__init__.py b/Lib/json/__init__.py

index c8fdd0d99a0efc36684e0100fcb36ddb9f1ae8ff..89396b25a2cbb3e7f2046e1b5b647d5691ea0664 100644 (file)
--- a/Lib/json/__init__.py
+++ b/Lib/json/__init__.py
@@ -127,8 +127,9 @@ def dump(obj, fp, *, skipkeys=False, ensure_ascii=True, check_circular=True,
      instead of raising a ``TypeError``.
  
      If ``ensure_ascii`` is false, then the strings written to ``fp`` can
-    contain non-ASCII characters if they appear in strings contained in
-    ``obj``. Otherwise, all such characters are escaped in JSON strings.
+    contain non-ASCII and non-printable characters if they appear in strings
+    contained in ``obj``. Otherwise, all such characters are escaped in JSON
+    strings.
  
      If ``check_circular`` is false, then the circular reference check
      for container types will be skipped and a circular reference will
@@ -144,10 +145,11 @@ def dump(obj, fp, *, skipkeys=False, ensure_ascii=True, check_circular=True,
      level of 0 will only insert newlines. ``None`` is the most compact
      representation.
  
-    If specified, ``separators`` should be an ``(item_separator, key_separator)``
-    tuple.  The default is ``(', ', ': ')`` if *indent* is ``None`` and
-    ``(',', ': ')`` otherwise.  To get the most compact JSON representation,
-    you should specify ``(',', ':')`` to eliminate whitespace.
+    If specified, ``separators`` should be an ``(item_separator,
+    key_separator)`` tuple.  The default is ``(', ', ': ')`` if *indent* is
+    ``None`` and ``(',', ': ')`` otherwise.  To get the most compact JSON
+    representation, you should specify ``(',', ':')`` to eliminate
+    whitespace.
  
      ``default(obj)`` is a function that should return a serializable version
      of obj or raise TypeError. The default simply raises TypeError.
@@ -188,9 +190,10 @@ def dumps(obj, *, skipkeys=False, ensure_ascii=True, check_circular=True,
      (``str``, ``int``, ``float``, ``bool``, ``None``) will be skipped
      instead of raising a ``TypeError``.
  
-    If ``ensure_ascii`` is false, then the return value can contain non-ASCII
-    characters if they appear in strings contained in ``obj``. Otherwise, all
-    such characters are escaped in JSON strings.
+    If ``ensure_ascii`` is false, then the return value can contain
+    non-ASCII and non-printable characters if they appear in strings
+    contained in ``obj``.  Otherwise, all such characters are escaped in
+    JSON strings.
  
      If ``check_circular`` is false, then the circular reference check
      for container types will be skipped and a circular reference will
@@ -206,10 +209,11 @@ def dumps(obj, *, skipkeys=False, ensure_ascii=True, check_circular=True,
      level of 0 will only insert newlines. ``None`` is the most compact
      representation.
  
-    If specified, ``separators`` should be an ``(item_separator, key_separator)``
-    tuple.  The default is ``(', ', ': ')`` if *indent* is ``None`` and
-    ``(',', ': ')`` otherwise.  To get the most compact JSON representation,
-    you should specify ``(',', ':')`` to eliminate whitespace.
+    If specified, ``separators`` should be an ``(item_separator,
+    key_separator)`` tuple.  The default is ``(', ', ': ')`` if *indent* is
+    ``None`` and ``(',', ': ')`` otherwise.  To get the most compact JSON
+    representation, you should specify ``(',', ':')`` to eliminate
+    whitespace.
  
      ``default(obj)`` is a function that should return a serializable version
      of obj or raise TypeError. The default simply raises TypeError.
@@ -280,11 +284,12 @@ def load(fp, *, cls=None, object_hook=None, parse_float=None,
      ``object_hook`` will be used instead of the ``dict``. This feature
      can be used to implement custom decoders (e.g. JSON-RPC class hinting).
  
-    ``object_pairs_hook`` is an optional function that will be called with the
-    result of any object literal decoded with an ordered list of pairs.  The
-    return value of ``object_pairs_hook`` will be used instead of the ``dict``.
-    This feature can be used to implement custom decoders.  If ``object_hook``
-    is also defined, the ``object_pairs_hook`` takes priority.
+    ``object_pairs_hook`` is an optional function that will be called with
+    the result of any object literal decoded with an ordered list of pairs.
+    The return value of ``object_pairs_hook`` will be used instead of the
+    ``dict``.  This feature can be used to implement custom decoders.  If
+    ``object_hook`` is also defined, the ``object_pairs_hook`` takes
+    priority.
  
      To use a custom ``JSONDecoder`` subclass, specify it with the ``cls``
      kwarg; otherwise ``JSONDecoder`` is used.
@@ -305,11 +310,12 @@ def loads(s, *, cls=None, object_hook=None, parse_float=None,
      ``object_hook`` will be used instead of the ``dict``. This feature
      can be used to implement custom decoders (e.g. JSON-RPC class hinting).
  
-    ``object_pairs_hook`` is an optional function that will be called with the
-    result of any object literal decoded with an ordered list of pairs.  The
-    return value of ``object_pairs_hook`` will be used instead of the ``dict``.
-    This feature can be used to implement custom decoders.  If ``object_hook``
-    is also defined, the ``object_pairs_hook`` takes priority.
+    ``object_pairs_hook`` is an optional function that will be called with
+    the result of any object literal decoded with an ordered list of pairs.
+    The return value of ``object_pairs_hook`` will be used instead of the
+    ``dict``.  This feature can be used to implement custom decoders.  If
+    ``object_hook`` is also defined, the ``object_pairs_hook`` takes
+    priority.
  
      ``parse_float``, if specified, will be called with the string
      of every JSON float to be decoded. By default this is equivalent to
diff --git a/Lib/json/decoder.py b/Lib/json/decoder.py

index ff4bfcdcc407b924b55132a91e602797de21b907..92ad635255764098c0feb5aff1a3f4584d92fd90 100644 (file)
--- a/Lib/json/decoder.py
+++ b/Lib/json/decoder.py
@@ -297,10 +297,10 @@ class JSONDecoder(object):
          place of the given ``dict``.  This can be used to provide custom
          deserializations (e.g. to support JSON-RPC class hinting).
  
-        ``object_pairs_hook``, if specified will be called with the result of
-        every JSON object decoded with an ordered list of pairs.  The return
-        value of ``object_pairs_hook`` will be used instead of the ``dict``.
-        This feature can be used to implement custom decoders.
+        ``object_pairs_hook``, if specified, will be called with the result
+        of every JSON object decoded with an ordered list of pairs.  The
+        return value of ``object_pairs_hook`` will be used instead of the
+        ``dict``.  This feature can be used to implement custom decoders.
          If ``object_hook`` is also defined, the ``object_pairs_hook`` takes
          priority.
  
diff --git a/Lib/json/encoder.py b/Lib/json/encoder.py

index bc446e0f377a119f369f694e70e9810a9e3975ca..5cf6d64f3eade60b65f1324ae394454e97883c45 100644 (file)
--- a/Lib/json/encoder.py
+++ b/Lib/json/encoder.py
@@ -111,9 +111,10 @@ class JSONEncoder(object):
          encoding of keys that are not str, int, float, bool or None.
          If skipkeys is True, such items are simply skipped.
  
-        If ensure_ascii is true, the output is guaranteed to be str
-        objects with all incoming non-ASCII characters escaped.  If
-        ensure_ascii is false, the output can contain non-ASCII characters.
+        If ensure_ascii is true, the output is guaranteed to be str objects
+        with all incoming non-ASCII and non-printable characters escaped.
+        If ensure_ascii is false, the output can contain non-ASCII and
+        non-printable characters.
  
          If check_circular is true, then lists, dicts, and custom encoded
          objects will be checked for circular references during encoding to
@@ -134,14 +135,15 @@ class JSONEncoder(object):
          indent level.  An indent level of 0 will only insert newlines.
          None is the most compact representation.
  
-        If specified, separators should be an (item_separator, key_separator)
-        tuple.  The default is (', ', ': ') if *indent* is ``None`` and
-        (',', ': ') otherwise.  To get the most compact JSON representation,
-        you should specify (',', ':') to eliminate whitespace.
+        If specified, separators should be an (item_separator,
+        key_separator) tuple.  The default is (', ', ': ') if *indent* is
+        ``None`` and (',', ': ') otherwise.  To get the most compact JSON
+        representation, you should specify (',', ':') to eliminate
+        whitespace.
  
          If specified, default is a function that gets called for objects
-        that can't otherwise be serialized.  It should return a JSON encodable
-        version of the object or raise a ``TypeError``.
+        that can't otherwise be serialized.  It should return a JSON
+        encodable version of the object or raise a ``TypeError``.
  
          """
  
diff --git a/Lib/test/test_json/test_encode_basestring_ascii.py b/Lib/test/test_json/test_encode_basestring_ascii.py

index 6a39b72a09df3571ab0c754c303f19204505f208..c90d3e968e5ef9c8d5e47b00ba3e55172b09bb0a 100644 (file)
--- a/Lib/test/test_json/test_encode_basestring_ascii.py
+++ b/Lib/test/test_json/test_encode_basestring_ascii.py
@@ -8,13 +8,12 @@ CASES = [
      ('\u0123\u4567\u89ab\ucdef\uabcd\uef4a', '"\\u0123\\u4567\\u89ab\\ucdef\\uabcd\\uef4a"'),
      ('controls', '"controls"'),
      ('\x08\x0c\n\r\t', '"\\b\\f\\n\\r\\t"'),
+    ('\x00\x1f\x7f', '"\\u0000\\u001f\\u007f"'),
      ('{"object with 1 member":["array with 1 element"]}', '"{\\"object with 1 member\\":[\\"array with 1 element\\"]}"'),
      (' s p a c e d ', '" s p a c e d "'),
      ('\U0001d120', '"\\ud834\\udd20"'),
      ('\u03b1\u03a9', '"\\u03b1\\u03a9"'),
      ("`1~!@#$%^&*()_+-={':[,]}|;.</>?", '"`1~!@#$%^&*()_+-={\':[,]}|;.</>?"'),
-    ('\x08\x0c\n\r\t', '"\\b\\f\\n\\r\\t"'),
-    ('\u0123\u4567\u89ab\ucdef\uabcd\uef4a', '"\\u0123\\u4567\\u89ab\\ucdef\\uabcd\\uef4a"'),
  ]
  
  class TestEncodeBasestringAscii:
diff --git a/Lib/test/test_json/test_unicode.py b/Lib/test/test_json/test_unicode.py

index 68629cceeb9be94e32a0c243fed4bfde7ba4f2dd..1aa9546dc4630618b23f0a91414959a6d4bb3b9f 100644 (file)
--- a/Lib/test/test_json/test_unicode.py
+++ b/Lib/test/test_json/test_unicode.py
@@ -32,6 +32,29 @@ class TestUnicode:
          j = self.dumps(u + "\n", ensure_ascii=False)
          self.assertEqual(j, f'"{u}\\n"')
  
+    def test_ascii_non_printable_encode(self):
+        u = '\b\t\n\f\r\x00\x1f\x7f'
+        self.assertEqual(self.dumps(u),
+                         '"\\b\\t\\n\\f\\r\\u0000\\u001f\\u007f"')
+        self.assertEqual(self.dumps(u, ensure_ascii=False),
+                         '"\\b\\t\\n\\f\\r\\u0000\\u001f\x7f"')
+
+    def test_ascii_non_printable_decode(self):
+        self.assertEqual(self.loads('"\\b\\t\\n\\f\\r"'),
+                         '\b\t\n\f\r')
+        s = ''.join(map(chr, range(32)))
+        for c in s:
+            self.assertRaises(self.JSONDecodeError, self.loads, f'"{c}"')
+        self.assertEqual(self.loads(f'"{s}"', strict=False), s)
+        self.assertEqual(self.loads('"\x7f"'), '\x7f')
+
+    def test_escaped_decode(self):
+        self.assertEqual(self.loads('"\\b\\t\\n\\f\\r"'), '\b\t\n\f\r')
+        self.assertEqual(self.loads('"\\"\\\\\\/"'), '"\\/')
+        for c in set(map(chr, range(0x100))) - set('"\\/bfnrt'):
+            self.assertRaises(self.JSONDecodeError, self.loads, f'"\\{c}"')
+            self.assertRaises(self.JSONDecodeError, self.loads, f'"\\{c}"', strict=False)
+
      def test_big_unicode_encode(self):
          u = '\U0001d120'
          self.assertEqual(self.dumps(u), '"\\ud834\\udd20"')
@@ -48,6 +71,18 @@ class TestUnicode:
              s = f'"\\u{i:04x}"'
              self.assertEqual(self.loads(s), u)
  
+    def test_single_surrogate_encode(self):
+        self.assertEqual(self.dumps('\uD83D'), '"\\ud83d"')
+        self.assertEqual(self.dumps('\uD83D', ensure_ascii=False), '"\ud83d"')
+        self.assertEqual(self.dumps('\uDC0D'), '"\\udc0d"')
+        self.assertEqual(self.dumps('\uDC0D', ensure_ascii=False), '"\udc0d"')
+
+    def test_single_surrogate_decode(self):
+        self.assertEqual(self.loads('"\uD83D"'), '\ud83d')
+        self.assertEqual(self.loads('"\\uD83D"'), '\ud83d')
+        self.assertEqual(self.loads('"\udc0d"'), '\udc0d')
+        self.assertEqual(self.loads('"\\udc0d"'), '\udc0d')
+
      def test_unicode_preservation(self):
          self.assertEqual(type(self.loads('""')), str)
          self.assertEqual(type(self.loads('"a"')), str)
author	Serhiy Storchaka <storchaka@gmail.com>
	Sat, 8 Nov 2025 10:07:27 +0000 (12:07 +0200)
committer	GitHub <noreply@github.com>
	Sat, 8 Nov 2025 10:07:27 +0000 (12:07 +0200)
Doc/library/json.rst		patch \| blob \| blame \| history
Lib/json/__init__.py		patch \| blob \| blame \| history
Lib/json/decoder.py		patch \| blob \| blame \| history
Lib/json/encoder.py		patch \| blob \| blame \| history
Lib/test/test_json/test_encode_basestring_ascii.py		patch \| blob \| blame \| history
Lib/test/test_json/test_unicode.py		patch \| blob \| blame \| history