gh-98836: Extend PyUnicode_FromFormat() (GH-98838)

author Serhiy Storchaka <storchaka@gmail.com>

Sun, 21 May 2023 21:32:39 +0000 (00:32 +0300)

committer GitHub <noreply@github.com>

Sun, 21 May 2023 21:32:39 +0000 (00:32 +0300)
author Serhiy Storchaka <storchaka@gmail.com>
Sun, 21 May 2023 21:32:39 +0000 (00:32 +0300)
committer GitHub <noreply@github.com>
Sun, 21 May 2023 21:32:39 +0000 (00:32 +0300)
diff --git a/Doc/c-api/unicode.rst b/Doc/c-api/unicode.rst

index ab3a2e274d939513adb03e4b8dd31a06f7f91177..6771f378bfbc319be6243b44a62b0eddce0e73b3 100644 (file)
--- a/Doc/c-api/unicode.rst
+++ b/Doc/c-api/unicode.rst
@@ -394,98 +394,149 @@ APIs:
     arguments, calculate the size of the resulting Python Unicode string and return
     a string with the values formatted into it.  The variable arguments must be C
     types and must correspond exactly to the format characters in the *format*
-   ASCII-encoded string. The following format characters are allowed:
-
-   .. % This should be exactly the same as the table in PyErr_Format.
-
-   .. tabularcolumns:: |l|l|L|
-
-   +-------------------+---------------------+----------------------------------+
-   | Format Characters | Type                | Comment                          |
-   +===================+=====================+==================================+
-   | :attr:`%%`        | *n/a*               | The literal % character.         |
-   +-------------------+---------------------+----------------------------------+
-   | :attr:`%c`        | int                 | A single character,              |
-   |                   |                     | represented as a C int.          |
-   +-------------------+---------------------+----------------------------------+
-   | :attr:`%d`        | int                 | Equivalent to                    |
-   |                   |                     | ``printf("%d")``. [1]_           |
-   +-------------------+---------------------+----------------------------------+
-   | :attr:`%u`        | unsigned int        | Equivalent to                    |
-   |                   |                     | ``printf("%u")``. [1]_           |
-   +-------------------+---------------------+----------------------------------+
-   | :attr:`%ld`       | long                | Equivalent to                    |
-   |                   |                     | ``printf("%ld")``. [1]_          |
-   +-------------------+---------------------+----------------------------------+
-   | :attr:`%li`       | long                | Equivalent to                    |
-   |                   |                     | ``printf("%li")``. [1]_          |
-   +-------------------+---------------------+----------------------------------+
-   | :attr:`%lu`       | unsigned long       | Equivalent to                    |
-   |                   |                     | ``printf("%lu")``. [1]_          |
-   +-------------------+---------------------+----------------------------------+
-   | :attr:`%lld`      | long long           | Equivalent to                    |
-   |                   |                     | ``printf("%lld")``. [1]_         |
-   +-------------------+---------------------+----------------------------------+
-   | :attr:`%lli`      | long long           | Equivalent to                    |
-   |                   |                     | ``printf("%lli")``. [1]_         |
-   +-------------------+---------------------+----------------------------------+
-   | :attr:`%llu`      | unsigned long long  | Equivalent to                    |
-   |                   |                     | ``printf("%llu")``. [1]_         |
-   +-------------------+---------------------+----------------------------------+
-   | :attr:`%zd`       | :c:type:`\          | Equivalent to                    |
-   |                   | Py_ssize_t`         | ``printf("%zd")``. [1]_          |
-   +-------------------+---------------------+----------------------------------+
-   | :attr:`%zi`       | :c:type:`\          | Equivalent to                    |
-   |                   | Py_ssize_t`         | ``printf("%zi")``. [1]_          |
-   +-------------------+---------------------+----------------------------------+
-   | :attr:`%zu`       | size_t              | Equivalent to                    |
-   |                   |                     | ``printf("%zu")``. [1]_          |
-   +-------------------+---------------------+----------------------------------+
-   | :attr:`%i`        | int                 | Equivalent to                    |
-   |                   |                     | ``printf("%i")``. [1]_           |
-   +-------------------+---------------------+----------------------------------+
-   | :attr:`%x`        | int                 | Equivalent to                    |
-   |                   |                     | ``printf("%x")``. [1]_           |
-   +-------------------+---------------------+----------------------------------+
-   | :attr:`%s`        | const char\*        | A null-terminated C character    |
-   |                   |                     | array.                           |
-   +-------------------+---------------------+----------------------------------+
-   | :attr:`%p`        | const void\*        | The hex representation of a C    |
-   |                   |                     | pointer. Mostly equivalent to    |
-   |                   |                     | ``printf("%p")`` except that     |
-   |                   |                     | it is guaranteed to start with   |
-   |                   |                     | the literal ``0x`` regardless    |
-   |                   |                     | of what the platform's           |
-   |                   |                     | ``printf`` yields.               |
-   +-------------------+---------------------+----------------------------------+
-   | :attr:`%A`        | PyObject\*          | The result of calling            |
-   |                   |                     | :func:`ascii`.                   |
-   +-------------------+---------------------+----------------------------------+
-   | :attr:`%U`        | PyObject\*          | A Unicode object.                |
-   +-------------------+---------------------+----------------------------------+
-   | :attr:`%V`        | PyObject\*,         | A Unicode object (which may be   |
-   |                   | const char\*        | ``NULL``) and a null-terminated  |
-   |                   |                     | C character array as a second    |
-   |                   |                     | parameter (which will be used,   |
-   |                   |                     | if the first parameter is        |
-   |                   |                     | ``NULL``).                       |
-   +-------------------+---------------------+----------------------------------+
-   | :attr:`%S`        | PyObject\*          | The result of calling            |
-   |                   |                     | :c:func:`PyObject_Str`.          |
-   +-------------------+---------------------+----------------------------------+
-   | :attr:`%R`        | PyObject\*          | The result of calling            |
-   |                   |                     | :c:func:`PyObject_Repr`.         |
-   +-------------------+---------------------+----------------------------------+
+   ASCII-encoded string.
+
+   A conversion specifier contains two or more characters and has the following
+   components, which must occur in this order:
+
+   #. The ``'%'`` character, which marks the start of the specifier.
+
+   #. Conversion flags (optional), which affect the result of some conversion
+      types.
+
+   #. Minimum field width (optional).
+      If specified as ``'*'`` (an asterisk), the actual width is given in the
+      next argument, which must be of type :c:expr:`int`, and the value to
+      convert comes after the minimum field width and optional precision.
+
+   #. Precision (optional), given as a ``'.'`` (dot) followed by the precision.
+      If specified as ``'*'`` (an asterisk), the actual precision is given in
+      the next argument, which must be of type :c:expr:`int`, and the value to
+      convert comes after the precision.
+
+   #. Length modifier (optional).
+
+   #. Conversion type.
+
+   The conversion flag characters are:
+
+   .. tabularcolumns:: |l|L|
+
+   +-------+-------------------------------------------------------------+
+   | Flag  | Meaning                                                     |
+   +=======+=============================================================+
+   | ``0`` | The conversion will be zero padded for numeric values.      |
+   +-------+-------------------------------------------------------------+
+   | ``-`` | The converted value is left adjusted (overrides the ``0``   |
+   |       | flag if both are given).                                    |
+   +-------+-------------------------------------------------------------+
+
+   The length modifiers for the following integer conversions (``d``, ``i``,
+   ``o``, ``u``, ``x``, or ``X``) specify the type of the argument
+   (:c:expr:`int` by default):
+
+   .. tabularcolumns:: |l|L|
+
+   +----------+-----------------------------------------------------+
+   | Modifier | Types                                               |
+   +==========+=====================================================+
+   | ``l``    | :c:expr:`long` or :c:expr:`unsigned long`           |
+   +----------+-----------------------------------------------------+
+   | ``ll``   | :c:expr:`long long` or :c:expr:`unsigned long long` |
+   +----------+-----------------------------------------------------+
+   | ``j``    | :c:expr:`intmax_t` or :c:expr:`uintmax_t`           |
+   +----------+-----------------------------------------------------+
+   | ``z``    | :c:expr:`size_t` or :c:expr:`ssize_t`               |
+   +----------+-----------------------------------------------------+
+   | ``t``    | :c:expr:`ptrdiff_t`                                 |
+   +----------+-----------------------------------------------------+
+
+   The length modifier ``l`` for the ``s`` and ``V`` conversions specifies
+   that the type of the argument is :c:expr:`const wchar_t*`.
+
+   The conversion specifiers are:
+
+   .. list-table::
+      :widths: auto
+      :header-rows: 1
+
+      * - Conversion Specifier
+        - Type
+        - Comment
+
+      * - ``%``
+        - *n/a*
+        - The literal ``%`` character.
+
+      * - ``d``, ``i``
+        - Specified by the length modifier
+        - The decimal representation of a signed C integer.
+
+      * - ``u``
+        - Specified by the length modifier
+        - The decimal representation of an unsigned C integer.
+
+      * - ``o``
+        - Specified by the length modifier
+        - The octal representation of an unsigned C integer.
+
+      * - ``x``
+        - Specified by the length modifier
+        - The hexadecimal representation of an unsigned C integer (lowercase).
+
+      * - ``X``
+        - Specified by the length modifier
+        - The hexadecimal representation of an unsigned C integer (uppercase).
+
+      * - ``c``
+        - :c:expr:`int`
+        - A single character.
+
+      * - ``s``
+        - :c:expr:`const char*` or :c:expr:`const wchar_t*`
+        - A null-terminated C character array.
+
+      * - ``p``
+        - :c:expr:`const void*`
+        - The hex representation of a C pointer.
+          Mostly equivalent to ``printf("%p")`` except that it is guaranteed to
+          start with the literal ``0x`` regardless of what the platform's
+          ``printf`` yields.
+
+      * - ``A``
+        - :c:expr:`PyObject*`
+        - The result of calling :func:`ascii`.
+
+      * - ``U``
+        - :c:expr:`PyObject*`
+        - A Unicode object.
+
+      * - ``V``
+        - :c:expr:`PyObject*`, :c:expr:`const char*` or :c:expr:`const wchar_t*`
+        - A Unicode object (which may be ``NULL``) and a null-terminated
+          C character array as a second parameter (which will be used,
+          if the first parameter is ``NULL``).
+
+      * - ``S``
+        - :c:expr:`PyObject*`
+        - The result of calling :c:func:`PyObject_Str`.
+
+      * - ``R``
+        - :c:expr:`PyObject*`
+        - The result of calling :c:func:`PyObject_Repr`.
  
     .. note::
        The width formatter unit is number of characters rather than bytes.
-      The precision formatter unit is number of bytes for ``"%s"`` and
+      The precision formatter unit is number of bytes or :c:expr:`wchar_t`
+      items (if the length modifier ``l`` is used) for ``"%s"`` and
        ``"%V"`` (if the ``PyObject*`` argument is ``NULL``), and a number of
        characters for ``"%A"``, ``"%U"``, ``"%S"``, ``"%R"`` and ``"%V"``
        (if the ``PyObject*`` argument is not ``NULL``).
  
-   .. [1] For integer specifiers (d, u, ld, li, lu, lld, lli, llu, zd, zi,
-      zu, i, x): the 0-conversion flag has effect even when a precision is given.
+   .. note::
+      Unlike C :c:func:`printf`, the ``0`` flag has an effect even when
+      a precision is given for integer conversions (``d``, ``i``, ``u``, ``o``,
+      ``x``, or ``X``).
  
     .. versionchanged:: 3.2
        Support for ``"%lld"`` and ``"%llu"`` added.
@@ -498,6 +549,13 @@ APIs:
        ``"%V"``, ``"%S"``, ``"%R"`` added.
  
     .. versionchanged:: 3.12
+      Support for conversion specifiers ``o`` and ``X``.
+      Support for length modifiers ``j`` and ``t``.
+      Length modifiers are now applied to all integer conversions.
+      Length modifier ``l`` is now applied to conversion specifiers ``s`` and ``V``.
+      Support for variable width and precision ``*``.
+      Support for flag ``-``.
+
        An unrecognized format character now sets a :exc:`SystemError`.
        In previous versions it caused all the rest of the format string to be
        copied as-is to the result string, and any extra arguments discarded.
diff --git a/Doc/whatsnew/3.12.rst b/Doc/whatsnew/3.12.rst

index 14f03ef755c734e78591557c6da53609954311f1..caf21078b9bd592fb9aa4386dee363d4b47f4a3b 100644 (file)
--- a/Doc/whatsnew/3.12.rst
+++ b/Doc/whatsnew/3.12.rst
@@ -1402,6 +1402,12 @@ Porting to Python 3.12
    :py:meth:`~class.__subclasses__` (using :c:func:`PyObject_CallMethod`,
    for example).
  
+* Add support for more formatting options (left aligning, octals, uppercase
+  hexadecimals, ``intmax_t``, ``ptrdiff_t``, ``wchar_t`` C
+  strings, variable width and precision) in :c:func:`PyUnicode_FromFormat` and
+  :c:func:`PyUnicode_FromFormatV`.
+  (Contributed by Serhiy Storchaka in :gh:`98836`.)
+
  * An unrecognized format character in :c:func:`PyUnicode_FromFormat` and
    :c:func:`PyUnicode_FromFormatV` now sets a :exc:`SystemError`.
    In previous versions it caused all the rest of the format string to be
diff --git a/Lib/test/test_capi/test_unicode.py b/Lib/test/test_capi/test_unicode.py

index 00807d968a7c43047469d0537181e53b15297826..9c7662065689ea50dfd5244eaa1ce69ddcaea6a8 100644 (file)
--- a/Lib/test/test_capi/test_unicode.py
+++ b/Lib/test/test_capi/test_unicode.py
@@ -319,12 +319,17 @@ class CAPITest(unittest.TestCase):
  
      def test_from_format(self):
          """Test PyUnicode_FromFormat()"""
+        # Length modifiers "j" and "t" are not tested here because ctypes does
+        # not expose types for intmax_t and ptrdiff_t.
+        # _testcapi.test_string_from_format() has a wider coverage of all
+        # formats.
          import_helper.import_module('ctypes')
          from ctypes import (
              c_char_p,
              pythonapi, py_object, sizeof,
              c_int, c_long, c_longlong, c_ssize_t,
-            c_uint, c_ulong, c_ulonglong, c_size_t, c_void_p)
+            c_uint, c_ulong, c_ulonglong, c_size_t, c_void_p,
+            sizeof, c_wchar, c_wchar_p)
          name = "PyUnicode_FromFormat"
          _PyUnicode_FromFormat = getattr(pythonapi, name)
          _PyUnicode_FromFormat.argtypes = (c_char_p,)
@@ -449,37 +454,28 @@ class CAPITest(unittest.TestCase):
          check_format("repr=   12",
                       b'repr=%5.2V', None, b'123')
  
-        # test integer formats (%i, %d, %u)
+        # test integer formats (%i, %d, %u, %o, %x, %X)
          check_format('010',
                       b'%03i', c_int(10))
          check_format('0010',
                       b'%0.4i', c_int(10))
-        check_format('-123',
-                     b'%i', c_int(-123))
-        check_format('-123',
-                     b'%li', c_long(-123))
-        check_format('-123',
-                     b'%lli', c_longlong(-123))
-        check_format('-123',
-                     b'%zi', c_ssize_t(-123))
-
-        check_format('-123',
-                     b'%d', c_int(-123))
-        check_format('-123',
-                     b'%ld', c_long(-123))
-        check_format('-123',
-                     b'%lld', c_longlong(-123))
-        check_format('-123',
-                     b'%zd', c_ssize_t(-123))
-
-        check_format('123',
-                     b'%u', c_uint(123))
-        check_format('123',
-                     b'%lu', c_ulong(123))
-        check_format('123',
-                     b'%llu', c_ulonglong(123))
-        check_format('123',
-                     b'%zu', c_size_t(123))
+        for conv, signed, value, expected in [
+            (b'i', True, -123, '-123'),
+            (b'd', True, -123, '-123'),
+            (b'u', False, 123, '123'),
+            (b'o', False, 0o123, '123'),
+            (b'x', False, 0xabc, 'abc'),
+            (b'X', False, 0xabc, 'ABC'),
+        ]:
+            for mod, ctype in [
+                (b'', c_int if signed else c_uint),
+                (b'l', c_long if signed else c_ulong),
+                (b'll', c_longlong if signed else c_ulonglong),
+                (b'z', c_ssize_t if signed else c_size_t),
+            ]:
+                with self.subTest(format=b'%' + mod + conv):
+                    check_format(expected,
+                                 b'%' + mod + conv, ctype(value))
  
          # test long output
          min_longlong = -(2 ** (8 * sizeof(c_longlong) - 1))
@@ -494,40 +490,144 @@ class CAPITest(unittest.TestCase):
          PyUnicode_FromFormat(b'%p', c_void_p(-1))
  
          # test padding (width and/or precision)
-        check_format('123'.rjust(10, '0'),
-                     b'%010i', c_int(123))
-        check_format('123'.rjust(100),
-                     b'%100i', c_int(123))
-        check_format('123'.rjust(100, '0'),
-                     b'%.100i', c_int(123))
-        check_format('123'.rjust(80, '0').rjust(100),
-                     b'%100.80i', c_int(123))
-
-        check_format('123'.rjust(10, '0'),
-                     b'%010u', c_uint(123))
-        check_format('123'.rjust(100),
-                     b'%100u', c_uint(123))
-        check_format('123'.rjust(100, '0'),
-                     b'%.100u', c_uint(123))
-        check_format('123'.rjust(80, '0').rjust(100),
-                     b'%100.80u', c_uint(123))
-
-        check_format('123'.rjust(10, '0'),
-                     b'%010x', c_int(0x123))
-        check_format('123'.rjust(100),
-                     b'%100x', c_int(0x123))
-        check_format('123'.rjust(100, '0'),
-                     b'%.100x', c_int(0x123))
-        check_format('123'.rjust(80, '0').rjust(100),
-                     b'%100.80x', c_int(0x123))
+        check_format('123',        b'%2i', c_int(123))
+        check_format('       123', b'%10i', c_int(123))
+        check_format('0000000123', b'%010i', c_int(123))
+        check_format('123       ', b'%-10i', c_int(123))
+        check_format('123       ', b'%-010i', c_int(123))
+        check_format('123',        b'%.2i', c_int(123))
+        check_format('0000123',    b'%.7i', c_int(123))
+        check_format('       123', b'%10.2i', c_int(123))
+        check_format('   0000123', b'%10.7i', c_int(123))
+        check_format('0000000123', b'%010.7i', c_int(123))
+        check_format('0000123   ', b'%-10.7i', c_int(123))
+        check_format('0000123   ', b'%-010.7i', c_int(123))
+
+        check_format('-123',       b'%2i', c_int(-123))
+        check_format('      -123', b'%10i', c_int(-123))
+        check_format('-000000123', b'%010i', c_int(-123))
+        check_format('-123      ', b'%-10i', c_int(-123))
+        check_format('-123      ', b'%-010i', c_int(-123))
+        check_format('-123',       b'%.2i', c_int(-123))
+        check_format('-0000123',   b'%.7i', c_int(-123))
+        check_format('      -123', b'%10.2i', c_int(-123))
+        check_format('  -0000123', b'%10.7i', c_int(-123))
+        check_format('-000000123', b'%010.7i', c_int(-123))
+        check_format('-0000123  ', b'%-10.7i', c_int(-123))
+        check_format('-0000123  ', b'%-010.7i', c_int(-123))
+
+        check_format('123',        b'%2u', c_uint(123))
+        check_format('       123', b'%10u', c_uint(123))
+        check_format('0000000123', b'%010u', c_uint(123))
+        check_format('123       ', b'%-10u', c_uint(123))
+        check_format('123       ', b'%-010u', c_uint(123))
+        check_format('123',        b'%.2u', c_uint(123))
+        check_format('0000123',    b'%.7u', c_uint(123))
+        check_format('       123', b'%10.2u', c_uint(123))
+        check_format('   0000123', b'%10.7u', c_uint(123))
+        check_format('0000000123', b'%010.7u', c_uint(123))
+        check_format('0000123   ', b'%-10.7u', c_uint(123))
+        check_format('0000123   ', b'%-010.7u', c_uint(123))
+
+        check_format('123',        b'%2o', c_uint(0o123))
+        check_format('       123', b'%10o', c_uint(0o123))
+        check_format('0000000123', b'%010o', c_uint(0o123))
+        check_format('123       ', b'%-10o', c_uint(0o123))
+        check_format('123       ', b'%-010o', c_uint(0o123))
+        check_format('123',        b'%.2o', c_uint(0o123))
+        check_format('0000123',    b'%.7o', c_uint(0o123))
+        check_format('       123', b'%10.2o', c_uint(0o123))
+        check_format('   0000123', b'%10.7o', c_uint(0o123))
+        check_format('0000000123', b'%010.7o', c_uint(0o123))
+        check_format('0000123   ', b'%-10.7o', c_uint(0o123))
+        check_format('0000123   ', b'%-010.7o', c_uint(0o123))
+
+        check_format('abc',        b'%2x', c_uint(0xabc))
+        check_format('       abc', b'%10x', c_uint(0xabc))
+        check_format('0000000abc', b'%010x', c_uint(0xabc))
+        check_format('abc       ', b'%-10x', c_uint(0xabc))
+        check_format('abc       ', b'%-010x', c_uint(0xabc))
+        check_format('abc',        b'%.2x', c_uint(0xabc))
+        check_format('0000abc',    b'%.7x', c_uint(0xabc))
+        check_format('       abc', b'%10.2x', c_uint(0xabc))
+        check_format('   0000abc', b'%10.7x', c_uint(0xabc))
+        check_format('0000000abc', b'%010.7x', c_uint(0xabc))
+        check_format('0000abc   ', b'%-10.7x', c_uint(0xabc))
+        check_format('0000abc   ', b'%-010.7x', c_uint(0xabc))
+
+        check_format('ABC',        b'%2X', c_uint(0xabc))
+        check_format('       ABC', b'%10X', c_uint(0xabc))
+        check_format('0000000ABC', b'%010X', c_uint(0xabc))
+        check_format('ABC       ', b'%-10X', c_uint(0xabc))
+        check_format('ABC       ', b'%-010X', c_uint(0xabc))
+        check_format('ABC',        b'%.2X', c_uint(0xabc))
+        check_format('0000ABC',    b'%.7X', c_uint(0xabc))
+        check_format('       ABC', b'%10.2X', c_uint(0xabc))
+        check_format('   0000ABC', b'%10.7X', c_uint(0xabc))
+        check_format('0000000ABC', b'%010.7X', c_uint(0xabc))
+        check_format('0000ABC   ', b'%-10.7X', c_uint(0xabc))
+        check_format('0000ABC   ', b'%-010.7X', c_uint(0xabc))
  
          # test %A
          check_format(r"%A:'abc\xe9\uabcd\U0010ffff'",
                       b'%%A:%A', 'abc\xe9\uabcd\U0010ffff')
  
          # test %V
-        check_format('repr=abc',
-                     b'repr=%V', 'abc', b'xyz')
+        check_format('abc',
+                     b'%V', 'abc', b'xyz')
+        check_format('xyz',
+                     b'%V', None, b'xyz')
+
+        # test %ls
+        check_format('abc', b'%ls', c_wchar_p('abc'))
+        check_format('\u4eba\u6c11', b'%ls', c_wchar_p('\u4eba\u6c11'))
+        check_format('\U0001f4bb+\U0001f40d',
+                     b'%ls', c_wchar_p('\U0001f4bb+\U0001f40d'))
+        check_format('   ab', b'%5.2ls', c_wchar_p('abc'))
+        check_format('   \u4eba\u6c11', b'%5ls', c_wchar_p('\u4eba\u6c11'))
+        check_format('  \U0001f4bb+\U0001f40d',
+                     b'%5ls', c_wchar_p('\U0001f4bb+\U0001f40d'))
+        check_format('\u4eba', b'%.1ls', c_wchar_p('\u4eba\u6c11'))
+        check_format('\U0001f4bb' if sizeof(c_wchar) > 2 else '\ud83d',
+                     b'%.1ls', c_wchar_p('\U0001f4bb+\U0001f40d'))
+        check_format('\U0001f4bb+' if sizeof(c_wchar) > 2 else '\U0001f4bb',
+                     b'%.2ls', c_wchar_p('\U0001f4bb+\U0001f40d'))
+
+        # test %lV
+        check_format('abc',
+                     b'%lV', 'abc', c_wchar_p('xyz'))
+        check_format('xyz',
+                     b'%lV', None, c_wchar_p('xyz'))
+        check_format('\u4eba\u6c11',
+                     b'%lV', None, c_wchar_p('\u4eba\u6c11'))
+        check_format('\U0001f4bb+\U0001f40d',
+                     b'%lV', None, c_wchar_p('\U0001f4bb+\U0001f40d'))
+        check_format('   ab',
+                     b'%5.2lV', None, c_wchar_p('abc'))
+        check_format('   \u4eba\u6c11',
+                     b'%5lV', None, c_wchar_p('\u4eba\u6c11'))
+        check_format('  \U0001f4bb+\U0001f40d',
+                     b'%5lV', None, c_wchar_p('\U0001f4bb+\U0001f40d'))
+        check_format('\u4eba',
+                     b'%.1lV', None, c_wchar_p('\u4eba\u6c11'))
+        check_format('\U0001f4bb' if sizeof(c_wchar) > 2 else '\ud83d',
+                     b'%.1lV', None, c_wchar_p('\U0001f4bb+\U0001f40d'))
+        check_format('\U0001f4bb+' if sizeof(c_wchar) > 2 else '\U0001f4bb',
+                     b'%.2lV', None, c_wchar_p('\U0001f4bb+\U0001f40d'))
+
+        # test variable width and precision
+        check_format('  abc', b'%*s', c_int(5), b'abc')
+        check_format('ab', b'%.*s', c_int(2), b'abc')
+        check_format('   ab', b'%*.*s', c_int(5), c_int(2), b'abc')
+        check_format('  abc', b'%*U', c_int(5), 'abc')
+        check_format('ab', b'%.*U', c_int(2), 'abc')
+        check_format('   ab', b'%*.*U', c_int(5), c_int(2), 'abc')
+        check_format('   ab', b'%*.*V', c_int(5), c_int(2), None, b'abc')
+        check_format('   ab', b'%*.*lV', c_int(5), c_int(2),
+                     None, c_wchar_p('abc'))
+        check_format('     123', b'%*i', c_int(8), c_int(123))
+        check_format('00123', b'%.*i', c_int(5), c_int(123))
+        check_format('   00123', b'%*.*i', c_int(8), c_int(5), c_int(123))
  
          # test %p
          # We cannot test the exact result,
@@ -564,10 +664,11 @@ class CAPITest(unittest.TestCase):
          check_format('',
                       b'%s', b'')
  
-        # check for crashes
+        # test invalid format strings. these tests are just here
+        # to check for crashes and should not be considered as specifications
          for fmt in (b'%', b'%0', b'%01', b'%.', b'%.1',
                      b'%0%s', b'%1%s', b'%.%s', b'%.1%s', b'%1abc',
-                    b'%l', b'%ll', b'%z', b'%ls', b'%lls', b'%zs'):
+                    b'%l', b'%ll', b'%z', b'%lls', b'%zs'):
              with self.subTest(fmt=fmt):
                  self.assertRaisesRegex(SystemError, 'invalid format string',
                      PyUnicode_FromFormat, fmt, b'abc')
diff --git a/Misc/NEWS.d/next/C API/2022-10-29-10-13-20.gh-issue-98836.Cy5h_z.rst b/Misc/NEWS.d/next/C API/2022-10-29-10-13-20.gh-issue-98836.Cy5h_z.rst

new file mode 100644 (file)

index 0000000..e3730eb
--- /dev/null
+++ b/Misc/NEWS.d/next/C API/2022-10-29-10-13-20.gh-issue-98836.Cy5h_z.rst
@@ -0,0 +1,4 @@
+Add support for more formatting options (left aligning, octals, uppercase
+hexadecimals, :c:expr:`intmax_t`, :c:expr:`ptrdiff_t`, :c:expr:`wchar_t` C
+strings, variable width and precision) in :c:func:`PyUnicode_FromFormat` and
+:c:func:`PyUnicode_FromFormatV`.
diff --git a/Modules/_ssl.c b/Modules/_ssl.c

index 016a5a5cbca54817bfff646afe61652dd8ee8b7d..5bf6b3bc19b2d1f34304865dcd04cde62e29ca44 100644 (file)
--- a/Modules/_ssl.c
+++ b/Modules/_ssl.c
@@ -1330,10 +1330,8 @@ _get_peer_alt_names (_sslmodulestate *state, X509 *certificate) {
                          p[0], p[1], p[2], p[3]
                      );
                  } else if (name->d.ip->length == 16) {
-                    /* PyUnicode_FromFormat() does not support %X */
                      unsigned char *p = name->d.ip->data;
-                    len = sprintf(
-                        buf,
+                    v = PyUnicode_FromFormat(
                          "%X:%X:%X:%X:%X:%X:%X:%X",
                          p[0] << 8 | p[1],
                          p[2] << 8 | p[3],
@@ -1344,7 +1342,6 @@ _get_peer_alt_names (_sslmodulestate *state, X509 *certificate) {
                          p[12] << 8 | p[13],
                          p[14] << 8 | p[15]
                      );
-                    v = PyUnicode_FromStringAndSize(buf, len);
                  } else {
                      v = PyUnicode_FromString("<invalid>");
                  }
diff --git a/Modules/_testcapi/unicode.c b/Modules/_testcapi/unicode.c

index 7dd3b9c0c03e504018bd0c5800538070f9572d0b..73929eaffc676d7c603e17875ac2711212162d8a 100644 (file)
--- a/Modules/_testcapi/unicode.c
+++ b/Modules/_testcapi/unicode.c
@@ -1,3 +1,5 @@
+#include <stddef.h>               // ptrdiff_t
+
  #define PY_SSIZE_T_CLEAN
  #include "parts.h"
  
@@ -1130,25 +1132,48 @@ test_string_from_format(PyObject *self, PyObject *Py_UNUSED(ignored))
      CHECK_FORMAT_1(  "%c", "c", 'c');
      CHECK_FORMAT_1( "%0c", "c", 'c');
      CHECK_FORMAT_1("%00c", "c", 'c');
-    CHECK_FORMAT_1( "%2c", "c", 'c');
-    CHECK_FORMAT_1("%02c", "c", 'c');
-    CHECK_FORMAT_1("%.0c", "c", 'c');
-    CHECK_FORMAT_1("%.2c", "c", 'c');
+    CHECK_FORMAT_1( "%2c", NULL, 'c');
+    CHECK_FORMAT_1("%02c", NULL, 'c');
+    CHECK_FORMAT_1("%.0c", NULL, 'c');
+    CHECK_FORMAT_1("%.2c", NULL, 'c');
  
      // Integers
      CHECK_FORMAT_1("%d",             "123",                (int)123);
      CHECK_FORMAT_1("%i",             "123",                (int)123);
      CHECK_FORMAT_1("%u",             "123",       (unsigned int)123);
+    CHECK_FORMAT_1("%x",              "7b",       (unsigned int)123);
+    CHECK_FORMAT_1("%X",              "7B",       (unsigned int)123);
+    CHECK_FORMAT_1("%o",             "173",       (unsigned int)123);
      CHECK_FORMAT_1("%ld",            "123",               (long)123);
      CHECK_FORMAT_1("%li",            "123",               (long)123);
      CHECK_FORMAT_1("%lu",            "123",      (unsigned long)123);
+    CHECK_FORMAT_1("%lx",             "7b",      (unsigned long)123);
+    CHECK_FORMAT_1("%lX",             "7B",      (unsigned long)123);
+    CHECK_FORMAT_1("%lo",            "173",      (unsigned long)123);
      CHECK_FORMAT_1("%lld",           "123",          (long long)123);
      CHECK_FORMAT_1("%lli",           "123",          (long long)123);
      CHECK_FORMAT_1("%llu",           "123", (unsigned long long)123);
+    CHECK_FORMAT_1("%llx",            "7b", (unsigned long long)123);
+    CHECK_FORMAT_1("%llX",            "7B", (unsigned long long)123);
+    CHECK_FORMAT_1("%llo",           "173", (unsigned long long)123);
      CHECK_FORMAT_1("%zd",            "123",         (Py_ssize_t)123);
      CHECK_FORMAT_1("%zi",            "123",         (Py_ssize_t)123);
      CHECK_FORMAT_1("%zu",            "123",             (size_t)123);
-    CHECK_FORMAT_1("%x",              "7b",                (int)123);
+    CHECK_FORMAT_1("%zx",             "7b",             (size_t)123);
+    CHECK_FORMAT_1("%zX",             "7B",             (size_t)123);
+    CHECK_FORMAT_1("%zo",            "173",             (size_t)123);
+    CHECK_FORMAT_1("%td",            "123",          (ptrdiff_t)123);
+    CHECK_FORMAT_1("%ti",            "123",          (ptrdiff_t)123);
+    CHECK_FORMAT_1("%tu",            "123",          (ptrdiff_t)123);
+    CHECK_FORMAT_1("%tx",             "7b",          (ptrdiff_t)123);
+    CHECK_FORMAT_1("%tX",             "7B",          (ptrdiff_t)123);
+    CHECK_FORMAT_1("%to",            "173",          (ptrdiff_t)123);
+    CHECK_FORMAT_1("%jd",            "123",           (intmax_t)123);
+    CHECK_FORMAT_1("%ji",            "123",           (intmax_t)123);
+    CHECK_FORMAT_1("%ju",            "123",          (uintmax_t)123);
+    CHECK_FORMAT_1("%jx",             "7b",          (uintmax_t)123);
+    CHECK_FORMAT_1("%jX",             "7B",          (uintmax_t)123);
+    CHECK_FORMAT_1("%jo",            "173",          (uintmax_t)123);
  
      CHECK_FORMAT_1("%d",            "-123",               (int)-123);
      CHECK_FORMAT_1("%i",            "-123",               (int)-123);
@@ -1158,7 +1183,10 @@ test_string_from_format(PyObject *self, PyObject *Py_UNUSED(ignored))
      CHECK_FORMAT_1("%lli",          "-123",         (long long)-123);
      CHECK_FORMAT_1("%zd",           "-123",        (Py_ssize_t)-123);
      CHECK_FORMAT_1("%zi",           "-123",        (Py_ssize_t)-123);
-    CHECK_FORMAT_1("%x",        "ffffff85",               (int)-123);
+    CHECK_FORMAT_1("%td",           "-123",         (ptrdiff_t)-123);
+    CHECK_FORMAT_1("%ti",           "-123",         (ptrdiff_t)-123);
+    CHECK_FORMAT_1("%jd",           "-123",          (intmax_t)-123);
+    CHECK_FORMAT_1("%ji",           "-123",          (intmax_t)-123);
  
      // Integers: width < length
      CHECK_FORMAT_1("%1d",            "123",                (int)123);
@@ -1183,7 +1211,6 @@ test_string_from_format(PyObject *self, PyObject *Py_UNUSED(ignored))
      CHECK_FORMAT_1("%1lli",         "-123",         (long long)-123);
      CHECK_FORMAT_1("%1zd",          "-123",        (Py_ssize_t)-123);
      CHECK_FORMAT_1("%1zi",          "-123",        (Py_ssize_t)-123);
-    CHECK_FORMAT_1("%1x",       "ffffff85",               (int)-123);
  
      // Integers: width > length
      CHECK_FORMAT_1("%5d",          "  123",                (int)123);
@@ -1208,7 +1235,6 @@ test_string_from_format(PyObject *self, PyObject *Py_UNUSED(ignored))
      CHECK_FORMAT_1("%5lli",        " -123",         (long long)-123);
      CHECK_FORMAT_1("%5zd",         " -123",        (Py_ssize_t)-123);
      CHECK_FORMAT_1("%5zi",         " -123",        (Py_ssize_t)-123);
-    CHECK_FORMAT_1("%9x",      " ffffff85",               (int)-123);
  
      // Integers: width > length, 0-flag
      CHECK_FORMAT_1("%05d",         "00123",                (int)123);
@@ -1233,7 +1259,6 @@ test_string_from_format(PyObject *self, PyObject *Py_UNUSED(ignored))
      CHECK_FORMAT_1("%05lli",       "-0123",         (long long)-123);
      CHECK_FORMAT_1("%05zd",        "-0123",        (Py_ssize_t)-123);
      CHECK_FORMAT_1("%05zi",        "-0123",        (Py_ssize_t)-123);
-    CHECK_FORMAT_1("%09x",     "0ffffff85",               (int)-123);
  
      // Integers: precision < length
      CHECK_FORMAT_1("%.1d",           "123",                (int)123);
@@ -1258,7 +1283,6 @@ test_string_from_format(PyObject *self, PyObject *Py_UNUSED(ignored))
      CHECK_FORMAT_1("%.1lli",        "-123",         (long long)-123);
      CHECK_FORMAT_1("%.1zd",         "-123",        (Py_ssize_t)-123);
      CHECK_FORMAT_1("%.1zi",         "-123",        (Py_ssize_t)-123);
-    CHECK_FORMAT_1("%.1x",      "ffffff85",               (int)-123);
  
      // Integers: precision > length
      CHECK_FORMAT_1("%.5d",         "00123",                (int)123);
@@ -1283,7 +1307,6 @@ test_string_from_format(PyObject *self, PyObject *Py_UNUSED(ignored))
      CHECK_FORMAT_1("%.5lli",      "-00123",         (long long)-123);
      CHECK_FORMAT_1("%.5zd",       "-00123",        (Py_ssize_t)-123);
      CHECK_FORMAT_1("%.5zi",       "-00123",        (Py_ssize_t)-123);
-    CHECK_FORMAT_1("%.9x",     "0ffffff85",               (int)-123);
  
      // Integers: width > precision > length
      CHECK_FORMAT_1("%7.5d",      "  00123",                (int)123);
@@ -1308,7 +1331,6 @@ test_string_from_format(PyObject *self, PyObject *Py_UNUSED(ignored))
      CHECK_FORMAT_1("%7.5lli",    " -00123",         (long long)-123);
      CHECK_FORMAT_1("%7.5zd",     " -00123",        (Py_ssize_t)-123);
      CHECK_FORMAT_1("%7.5zi",     " -00123",        (Py_ssize_t)-123);
-    CHECK_FORMAT_1("%10.9x",  " 0ffffff85",               (int)-123);
  
      // Integers: width > precision > length, 0-flag
      CHECK_FORMAT_1("%07.5d",     "0000123",                (int)123);
@@ -1333,7 +1355,6 @@ test_string_from_format(PyObject *self, PyObject *Py_UNUSED(ignored))
      CHECK_FORMAT_1("%07.5lli",   "-000123",         (long long)-123);
      CHECK_FORMAT_1("%07.5zd",    "-000123",        (Py_ssize_t)-123);
      CHECK_FORMAT_1("%07.5zi",    "-000123",        (Py_ssize_t)-123);
-    CHECK_FORMAT_1("%010.9x", "00ffffff85",               (int)-123);
  
      // Integers: precision > width > length
      CHECK_FORMAT_1("%5.7d",      "0000123",                (int)123);
@@ -1358,7 +1379,6 @@ test_string_from_format(PyObject *self, PyObject *Py_UNUSED(ignored))
      CHECK_FORMAT_1("%5.7lli",   "-0000123",         (long long)-123);
      CHECK_FORMAT_1("%5.7zd",    "-0000123",        (Py_ssize_t)-123);
      CHECK_FORMAT_1("%5.7zi",    "-0000123",        (Py_ssize_t)-123);
-    CHECK_FORMAT_1("%9.10x",  "00ffffff85",               (int)-123);
  
      // Integers: precision > width > length, 0-flag
      CHECK_FORMAT_1("%05.7d",     "0000123",                (int)123);
@@ -1383,7 +1403,6 @@ test_string_from_format(PyObject *self, PyObject *Py_UNUSED(ignored))
      CHECK_FORMAT_1("%05.7lli",  "-0000123",         (long long)-123);
      CHECK_FORMAT_1("%05.7zd",   "-0000123",        (Py_ssize_t)-123);
      CHECK_FORMAT_1("%05.7zi",   "-0000123",        (Py_ssize_t)-123);
-    CHECK_FORMAT_1("%09.10x", "00ffffff85",               (int)-123);
  
      // Integers: precision = 0, arg = 0 (empty string in C)
      CHECK_FORMAT_1("%.0d",             "0",                  (int)0);
@@ -1402,66 +1421,80 @@ test_string_from_format(PyObject *self, PyObject *Py_UNUSED(ignored))
  
      // Strings
      CHECK_FORMAT_1("%s",     "None",  "None");
+    CHECK_FORMAT_1("%ls",    "None", L"None");
      CHECK_FORMAT_1("%U",     "None", unicode);
      CHECK_FORMAT_1("%A",     "None", Py_None);
      CHECK_FORMAT_1("%S",     "None", Py_None);
      CHECK_FORMAT_1("%R",     "None", Py_None);
      CHECK_FORMAT_2("%V",     "None", unicode, "ignored");
      CHECK_FORMAT_2("%V",     "None",    NULL,    "None");
+    CHECK_FORMAT_2("%lV",    "None",    NULL,   L"None");
  
      // Strings: width < length
      CHECK_FORMAT_1("%1s",    "None",  "None");
+    CHECK_FORMAT_1("%1ls",   "None", L"None");
      CHECK_FORMAT_1("%1U",    "None", unicode);
      CHECK_FORMAT_1("%1A",    "None", Py_None);
      CHECK_FORMAT_1("%1S",    "None", Py_None);
      CHECK_FORMAT_1("%1R",    "None", Py_None);
      CHECK_FORMAT_2("%1V",    "None", unicode, "ignored");
      CHECK_FORMAT_2("%1V",    "None",    NULL,    "None");
+    CHECK_FORMAT_2("%1lV",   "None",    NULL,    L"None");
  
      // Strings: width > length
      CHECK_FORMAT_1("%5s",   " None",  "None");
+    CHECK_FORMAT_1("%5ls",  " None", L"None");
      CHECK_FORMAT_1("%5U",   " None", unicode);
      CHECK_FORMAT_1("%5A",   " None", Py_None);
      CHECK_FORMAT_1("%5S",   " None", Py_None);
      CHECK_FORMAT_1("%5R",   " None", Py_None);
      CHECK_FORMAT_2("%5V",   " None", unicode, "ignored");
      CHECK_FORMAT_2("%5V",   " None",    NULL,    "None");
+    CHECK_FORMAT_2("%5lV",  " None",    NULL,   L"None");
  
      // Strings: precision < length
      CHECK_FORMAT_1("%.1s",      "N",  "None");
+    CHECK_FORMAT_1("%.1ls",     "N", L"None");
      CHECK_FORMAT_1("%.1U",      "N", unicode);
      CHECK_FORMAT_1("%.1A",      "N", Py_None);
      CHECK_FORMAT_1("%.1S",      "N", Py_None);
      CHECK_FORMAT_1("%.1R",      "N", Py_None);
      CHECK_FORMAT_2("%.1V",      "N", unicode, "ignored");
      CHECK_FORMAT_2("%.1V",      "N",    NULL,    "None");
+    CHECK_FORMAT_2("%.1lV",     "N",    NULL,   L"None");
  
      // Strings: precision > length
      CHECK_FORMAT_1("%.5s",   "None",  "None");
+    CHECK_FORMAT_1("%.5ls",  "None", L"None");
      CHECK_FORMAT_1("%.5U",   "None", unicode);
      CHECK_FORMAT_1("%.5A",   "None", Py_None);
      CHECK_FORMAT_1("%.5S",   "None", Py_None);
      CHECK_FORMAT_1("%.5R",   "None", Py_None);
      CHECK_FORMAT_2("%.5V",   "None", unicode, "ignored");
      CHECK_FORMAT_2("%.5V",   "None",    NULL,    "None");
+    CHECK_FORMAT_2("%.5lV",  "None",    NULL,   L"None");
  
      // Strings: precision < length, width > length
      CHECK_FORMAT_1("%5.1s", "    N",  "None");
+    CHECK_FORMAT_1("%5.1ls","    N", L"None");
      CHECK_FORMAT_1("%5.1U", "    N", unicode);
      CHECK_FORMAT_1("%5.1A", "    N", Py_None);
      CHECK_FORMAT_1("%5.1S", "    N", Py_None);
      CHECK_FORMAT_1("%5.1R", "    N", Py_None);
      CHECK_FORMAT_2("%5.1V", "    N", unicode, "ignored");
      CHECK_FORMAT_2("%5.1V", "    N",    NULL,    "None");
+    CHECK_FORMAT_2("%5.1lV","    N",    NULL,   L"None");
  
      // Strings: width < length, precision > length
      CHECK_FORMAT_1("%1.5s",  "None",  "None");
+    CHECK_FORMAT_1("%1.5ls", "None",  L"None");
      CHECK_FORMAT_1("%1.5U",  "None", unicode);
      CHECK_FORMAT_1("%1.5A",  "None", Py_None);
      CHECK_FORMAT_1("%1.5S",  "None", Py_None);
      CHECK_FORMAT_1("%1.5R",  "None", Py_None);
      CHECK_FORMAT_2("%1.5V",  "None", unicode, "ignored");
      CHECK_FORMAT_2("%1.5V",  "None",    NULL,    "None");
+    CHECK_FORMAT_2("%1.5lV", "None",    NULL,   L"None");
  
      Py_XDECREF(unicode);
      Py_RETURN_NONE;
diff --git a/Modules/selectmodule.c b/Modules/selectmodule.c

index 79bd5b59ab68f9651aeab569d1e83599107b8209..9a4943c9eb2f759dc92584f4d3854fef0dc67e0f 100644 (file)
--- a/Modules/selectmodule.c
+++ b/Modules/selectmodule.c
@@ -1849,14 +1849,11 @@ static PyObject *
  
  kqueue_event_repr(kqueue_event_Object *s)
  {
-    char buf[1024];
-    PyOS_snprintf(
-        buf, sizeof(buf),
+    return PyUnicode_FromFormat(
          "<select.kevent ident=%zu filter=%d flags=0x%x fflags=0x%x "
          "data=0x%llx udata=%p>",
          (size_t)(s->e.ident), (int)s->e.filter, (unsigned int)s->e.flags,
          (unsigned int)s->e.fflags, (long long)(s->e.data), (void *)s->e.udata);
-    return PyUnicode_FromString(buf);
  }
  
  static int
diff --git a/Modules/socketmodule.c b/Modules/socketmodule.c

index c11fb4400eab2fdaaf93e139a3197f5d3c775200..a86aaed501fa33ea38650c67ffb7fa925bc4205b 100644 (file)
--- a/Modules/socketmodule.c
+++ b/Modules/socketmodule.c
@@ -1339,8 +1339,6 @@ setbdaddr(const char *name, bdaddr_t *bdaddr)
  static PyObject *
  makebdaddr(bdaddr_t *bdaddr)
  {
-    char buf[(6 * 2) + 5 + 1];
-
  #ifdef MS_WINDOWS
      int i;
      unsigned int octets[6];
@@ -1349,16 +1347,14 @@ makebdaddr(bdaddr_t *bdaddr)
          octets[i] = ((*bdaddr) >> (8 * i)) & 0xFF;
      }
  
-    sprintf(buf, "%02X:%02X:%02X:%02X:%02X:%02X",
+    return PyUnicode_FromFormat("%02X:%02X:%02X:%02X:%02X:%02X",
          octets[5], octets[4], octets[3],
          octets[2], octets[1], octets[0]);
  #else
-    sprintf(buf, "%02X:%02X:%02X:%02X:%02X:%02X",
+    return PyUnicode_FromFormat("%02X:%02X:%02X:%02X:%02X:%02X",
          bdaddr->b[5], bdaddr->b[4], bdaddr->b[3],
          bdaddr->b[2], bdaddr->b[1], bdaddr->b[0]);
  #endif
-
-    return PyUnicode_FromString(buf);
  }
  #endif
  
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c

index 7726f2fb17afde0067691de7b20112f4b03fab62..ec5684b1d0950265931ea063770bffffb75b2952 100644 (file)
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -56,6 +56,7 @@ OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
  #include "pycore_unicodeobject.h" // struct _Py_unicode_state
  #include "pycore_unicodeobject_generated.h"  // _PyUnicode_InitStaticStrings()
  #include "stringlib/eq.h"         // unicode_eq()
+#include <stddef.h>               // ptrdiff_t
  
  #ifdef MS_WINDOWS
  #include <windows.h>
@@ -2285,14 +2286,15 @@ PyUnicode_AsUCS4Copy(PyObject *string)
      return as_ucs4(string, NULL, 0, 1);
  }
  
-/* maximum number of characters required for output of %lld or %p.
-   We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits,
-   plus 1 for the sign.  53/22 is an upper bound for log10(256). */
-#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
+/* maximum number of characters required for output of %jo or %jd or %p.
+   We need at most ceil(log8(256)*sizeof(intmax_t)) digits,
+   plus 1 for the sign, plus 2 for the 0x prefix (for %p),
+   plus 1 for the terminal NUL. */
+#define MAX_INTMAX_CHARS (5 + (sizeof(intmax_t)*8-1) / 3)
  
  static int
  unicode_fromformat_write_str(_PyUnicodeWriter *writer, PyObject *str,
-                             Py_ssize_t width, Py_ssize_t precision)
+                             Py_ssize_t width, Py_ssize_t precision, int flags)
  {
      Py_ssize_t length, fill, arglen;
      Py_UCS4 maxchar;
@@ -2314,8 +2316,8 @@ unicode_fromformat_write_str(_PyUnicodeWriter *writer, PyObject *str,
      if (_PyUnicodeWriter_Prepare(writer, arglen, maxchar) == -1)
          return -1;
  
-    if (width > length) {
-        fill = width - length;
+    fill = Py_MAX(width - length, 0);
+    if (fill && !(flags & F_LJUST)) {
          if (PyUnicode_Fill(writer->buffer, writer->pos, fill, ' ') == -1)
              return -1;
          writer->pos += fill;
@@ -2324,12 +2326,19 @@ unicode_fromformat_write_str(_PyUnicodeWriter *writer, PyObject *str,
      _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
                                    str, 0, length);
      writer->pos += length;
+
+    if (fill && (flags & F_LJUST)) {
+        if (PyUnicode_Fill(writer->buffer, writer->pos, fill, ' ') == -1)
+            return -1;
+        writer->pos += fill;
+    }
+
      return 0;
  }
  
  static int
  unicode_fromformat_write_cstr(_PyUnicodeWriter *writer, const char *str,
-                              Py_ssize_t width, Py_ssize_t precision)
+                              Py_ssize_t width, Py_ssize_t precision, int flags)
  {
      /* UTF-8 */
      Py_ssize_t length;
@@ -2349,24 +2358,58 @@ unicode_fromformat_write_cstr(_PyUnicodeWriter *writer, const char *str,
      if (unicode == NULL)
          return -1;
  
-    res = unicode_fromformat_write_str(writer, unicode, width, -1);
+    res = unicode_fromformat_write_str(writer, unicode, width, -1, flags);
+    Py_DECREF(unicode);
+    return res;
+}
+
+static int
+unicode_fromformat_write_wcstr(_PyUnicodeWriter *writer, const wchar_t *str,
+                              Py_ssize_t width, Py_ssize_t precision, int flags)
+{
+    /* wchar_t string, converted with PyUnicode_FromWideChar() */
+    Py_ssize_t length;
+    PyObject *unicode;
+    int res;
+
+    if (precision == -1) {
+        length = wcslen(str);
+    }
+    else {
+        length = 0;
+        while (length < precision && str[length]) {
+            length++;
+        }
+    }
+    unicode = PyUnicode_FromWideChar(str, length);
+    if (unicode == NULL)
+        return -1;
+
+    res = unicode_fromformat_write_str(writer, unicode, width, -1, flags);
      Py_DECREF(unicode);
      return res;
  }
  
+#define F_LONG 1
+#define F_LONGLONG 2
+#define F_SIZE 3
+#define F_PTRDIFF 4
+#define F_INTMAX 5
+static const char * const formats[] = {"%d", "%ld", "%lld", "%zd", "%td", "%jd"};
+static const char * const formats_o[] = {"%o", "%lo", "%llo", "%zo", "%to", "%jo"};
+static const char * const formats_u[] = {"%u", "%lu", "%llu", "%zu", "%tu", "%ju"};
+static const char * const formats_x[] = {"%x", "%lx", "%llx", "%zx", "%tx", "%jx"};
+static const char * const formats_X[] = {"%X", "%lX", "%llX", "%zX", "%tX", "%jX"};
+
  static const char*
  unicode_fromformat_arg(_PyUnicodeWriter *writer,
                         const char *f, va_list *vargs)
  {
      const char *p;
      Py_ssize_t len;
-    int zeropad;
+    int flags = 0;
      Py_ssize_t width;
      Py_ssize_t precision;
-    int longflag;
-    int longlongflag;
-    int size_tflag;
-    Py_ssize_t fill;
  
      p = f;
      f++;
@@ -2377,15 +2420,31 @@ unicode_fromformat_arg(_PyUnicodeWriter *writer,
          return f;
      }
  
-    zeropad = 0;
-    if (*f == '0') {
-        zeropad = 1;
-        f++;
+    /* Parse flags. Example: "%-i" => flags=F_LJUST. */
+    /* Flags '+', ' ' and '#' are not particularly useful.
+     * They are not worth the implementation and maintenance costs.
+     * In addition, '#' should add "0" for "o" conversions for compatibility
+     * with printf, but it would confuse Python users. */
+    while (1) {
+        switch (*f++) {
+        case '-': flags |= F_LJUST; continue;
+        case '0': flags |= F_ZERO; continue;
+        }
+        f--;
+        break;
      }
  
      /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
      width = -1;
-    if (Py_ISDIGIT((unsigned)*f)) {
+    if (*f == '*') {
+        width = va_arg(*vargs, int);
+        if (width < 0) {
+            flags |= F_LJUST;
+            width = -width;
+        }
+        f++;
+    }
+    else if (Py_ISDIGIT((unsigned)*f)) {
          width = *f - '0';
          f++;
          while (Py_ISDIGIT((unsigned)*f)) {
@@ -2401,7 +2460,14 @@ unicode_fromformat_arg(_PyUnicodeWriter *writer,
      precision = -1;
      if (*f == '.') {
          f++;
-        if (Py_ISDIGIT((unsigned)*f)) {
+        if (*f == '*') {
+            precision = va_arg(*vargs, int);
+            if (precision < 0) {
+                precision = -2;
+            }
+            f++;
+        }
+        else if (Py_ISDIGIT((unsigned)*f)) {
              precision = (*f - '0');
              f++;
              while (Py_ISDIGIT((unsigned)*f)) {
@@ -2416,30 +2482,47 @@ unicode_fromformat_arg(_PyUnicodeWriter *writer,
          }
      }
  
-    /* Handle %ld, %lu, %lld and %llu. */
-    longflag = 0;
-    longlongflag = 0;
-    size_tflag = 0;
+    int sizemod = 0;
      if (*f == 'l') {
-        if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
-            longflag = 1;
-            ++f;
-        }
-        else if (f[1] == 'l' &&
-                 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
-            longlongflag = 1;
+        if (f[1] == 'l') {
+            sizemod = F_LONGLONG;
              f += 2;
          }
+        else {
+            sizemod = F_LONG;
+            ++f;
+        }
      }
-    /* handle the size_t flag. */
-    else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) {
-        size_tflag = 1;
+    else if (*f == 'z') {
+        sizemod = F_SIZE;
+        ++f;
+    }
+    else if (*f == 't') {
+        sizemod = F_PTRDIFF;
+        ++f;
+    }
+    else if (*f == 'j') {
+        sizemod = F_INTMAX;
          ++f;
      }
-
      if (f[0] != '\0' && f[1] == '\0')
          writer->overallocate = 0;
  
+    switch (*f) {
+    case 'd': case 'i': case 'o': case 'u': case 'x': case 'X':
+        break;
+    case 'c': case 'p':
+        if (sizemod || width >= 0 || precision >= 0) goto invalid_format;
+        break;
+    case 's':
+    case 'V':
+        if (sizemod && sizemod != F_LONG) goto invalid_format;
+        break;
+    default:
+        if (sizemod) goto invalid_format;
+        break;
+    }
+
      switch (*f) {
      case 'c':
      {
@@ -2454,91 +2537,98 @@ unicode_fromformat_arg(_PyUnicodeWriter *writer,
          break;
      }
  
-    case 'i':
-    case 'd':
-    case 'u':
-    case 'x':
+    case 'd': case 'i':
+    case 'o': case 'u': case 'x': case 'X':
      {
          /* used by sprintf */
-        char buffer[MAX_LONG_LONG_CHARS];
-        Py_ssize_t arglen;
-
-        if (*f == 'u') {
-            if (longflag) {
-                len = sprintf(buffer, "%lu", va_arg(*vargs, unsigned long));
-            }
-            else if (longlongflag) {
-                len = sprintf(buffer, "%llu", va_arg(*vargs, unsigned long long));
-            }
-            else if (size_tflag) {
-                len = sprintf(buffer, "%zu", va_arg(*vargs, size_t));
-            }
-            else {
-                len = sprintf(buffer, "%u", va_arg(*vargs, unsigned int));
-            }
-        }
-        else if (*f == 'x') {
-            len = sprintf(buffer, "%x", va_arg(*vargs, int));
-        }
-        else {
-            if (longflag) {
-                len = sprintf(buffer, "%li", va_arg(*vargs, long));
-            }
-            else if (longlongflag) {
-                len = sprintf(buffer, "%lli", va_arg(*vargs, long long));
-            }
-            else if (size_tflag) {
-                len = sprintf(buffer, "%zi", va_arg(*vargs, Py_ssize_t));
-            }
-            else {
-                len = sprintf(buffer, "%i", va_arg(*vargs, int));
-            }
+        char buffer[MAX_INTMAX_CHARS];
+        const char *fmt = NULL;
+        switch (*f) {
+            case 'o': fmt = formats_o[sizemod]; break;
+            case 'u': fmt = formats_u[sizemod]; break;
+            case 'x': fmt = formats_x[sizemod]; break;
+            case 'X': fmt = formats_X[sizemod]; break;
+            default: fmt = formats[sizemod]; break;
+        }
+        int issigned = (*f == 'd' || *f == 'i');
+        switch (sizemod) {
+            case F_LONG:
+                len = issigned ?
+                    sprintf(buffer, fmt, va_arg(*vargs, long)) :
+                    sprintf(buffer, fmt, va_arg(*vargs, unsigned long));
+                break;
+            case F_LONGLONG:
+                len = issigned ?
+                    sprintf(buffer, fmt, va_arg(*vargs, long long)) :
+                    sprintf(buffer, fmt, va_arg(*vargs, unsigned long long));
+                break;
+            case F_SIZE:
+                len = issigned ?
+                    sprintf(buffer, fmt, va_arg(*vargs, Py_ssize_t)) :
+                    sprintf(buffer, fmt, va_arg(*vargs, size_t));
+                break;
+            case F_PTRDIFF:
+                len = sprintf(buffer, fmt, va_arg(*vargs, ptrdiff_t));
+                break;
+            case F_INTMAX:
+                len = issigned ?
+                    sprintf(buffer, fmt, va_arg(*vargs, intmax_t)) :
+                    sprintf(buffer, fmt, va_arg(*vargs, uintmax_t));
+                break;
+            default:
+                len = issigned ?
+                    sprintf(buffer, fmt, va_arg(*vargs, int)) :
+                    sprintf(buffer, fmt, va_arg(*vargs, unsigned int));
+                break;
          }
          assert(len >= 0);
  
-        int negative = (buffer[0] == '-');
-        len -= negative;
+        int sign = (buffer[0] == '-');
+        len -= sign;
  
          precision = Py_MAX(precision, len);
-        width = Py_MAX(width, precision + negative);
+        width = Py_MAX(width, precision + sign);
+        if ((flags & F_ZERO) && !(flags & F_LJUST)) {
+            precision = width - sign;
+        }
  
-        arglen = Py_MAX(precision, width);
-        if (_PyUnicodeWriter_Prepare(writer, arglen, 127) == -1)
-            return NULL;
+        Py_ssize_t spacepad = Py_MAX(width - precision - sign, 0);
+        Py_ssize_t zeropad = Py_MAX(precision - len, 0);
  
-        if (width > precision) {
-            if (negative && zeropad) {
-                if (_PyUnicodeWriter_WriteChar(writer, '-') == -1)
-                    return NULL;
-            }
+        if (_PyUnicodeWriter_Prepare(writer, width, 127) == -1)
+            return NULL;
  
-            Py_UCS4 fillchar = zeropad?'0':' ';
-            fill = width - precision - negative;
-            if (PyUnicode_Fill(writer->buffer, writer->pos, fill, fillchar) == -1)
+        if (spacepad && !(flags & F_LJUST)) {
+            if (PyUnicode_Fill(writer->buffer, writer->pos, spacepad, ' ') == -1)
                  return NULL;
-            writer->pos += fill;
+            writer->pos += spacepad;
+        }
  
-            if (negative && !zeropad) {
-                if (_PyUnicodeWriter_WriteChar(writer, '-') == -1)
-                    return NULL;
-            }
+        if (sign) {
+            if (_PyUnicodeWriter_WriteChar(writer, '-') == -1)
+                return NULL;
          }
  
-        if (precision > len) {
-            fill = precision - len;
-            if (PyUnicode_Fill(writer->buffer, writer->pos, fill, '0') == -1)
+        if (zeropad) {
+            if (PyUnicode_Fill(writer->buffer, writer->pos, zeropad, '0') == -1)
                  return NULL;
-            writer->pos += fill;
+            writer->pos += zeropad;
          }
  
-        if (_PyUnicodeWriter_WriteASCIIString(writer, &buffer[negative], len) < 0)
+        if (_PyUnicodeWriter_WriteASCIIString(writer, &buffer[sign], len) < 0)
              return NULL;
+
+        if (spacepad && (flags & F_LJUST)) {
+            if (PyUnicode_Fill(writer->buffer, writer->pos, spacepad, ' ') == -1)
+                return NULL;
+            writer->pos += spacepad;
+        }
          break;
      }
  
      case 'p':
      {
-        char number[MAX_LONG_LONG_CHARS];
+        char number[MAX_INTMAX_CHARS];
  
          len = sprintf(number, "%p", va_arg(*vargs, void*));
          assert(len >= 0);
@@ -2561,10 +2651,17 @@ unicode_fromformat_arg(_PyUnicodeWriter *writer,
  
      case 's':
      {
-        /* UTF-8 */
-        const char *s = va_arg(*vargs, const char*);
-        if (unicode_fromformat_write_cstr(writer, s, width, precision) < 0)
-            return NULL;
+        if (sizemod) {
+            const wchar_t *s = va_arg(*vargs, const wchar_t*);
+            if (unicode_fromformat_write_wcstr(writer, s, width, precision, flags) < 0)
+                return NULL;
+        }
+        else {
+            /* UTF-8 */
+            const char *s = va_arg(*vargs, const char*);
+            if (unicode_fromformat_write_cstr(writer, s, width, precision, flags) < 0)
+                return NULL;
+        }
          break;
      }
  
@@ -2573,7 +2670,7 @@ unicode_fromformat_arg(_PyUnicodeWriter *writer,
          PyObject *obj = va_arg(*vargs, PyObject *);
          assert(obj && _PyUnicode_CHECK(obj));
  
-        if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
+        if (unicode_fromformat_write_str(writer, obj, width, precision, flags) == -1)
              return NULL;
          break;
      }
@@ -2581,15 +2678,27 @@ unicode_fromformat_arg(_PyUnicodeWriter *writer,
      case 'V':
      {
          PyObject *obj = va_arg(*vargs, PyObject *);
-        const char *str = va_arg(*vargs, const char *);
+        const char *str;
+        const wchar_t *wstr;
+        if (sizemod) {
+            wstr = va_arg(*vargs, const wchar_t*);
+        }
+        else {
+            str = va_arg(*vargs, const char *);
+        }
          if (obj) {
              assert(_PyUnicode_CHECK(obj));
-            if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
+            if (unicode_fromformat_write_str(writer, obj, width, precision, flags) == -1)
+                return NULL;
+        }
+        else if (sizemod) {
+            assert(wstr != NULL);
+            if (unicode_fromformat_write_wcstr(writer, wstr, width, precision, flags) < 0)
                  return NULL;
          }
          else {
              assert(str != NULL);
-            if (unicode_fromformat_write_cstr(writer, str, width, precision) < 0)
+            if (unicode_fromformat_write_cstr(writer, str, width, precision, flags) < 0)
                  return NULL;
          }
          break;
@@ -2603,7 +2712,7 @@ unicode_fromformat_arg(_PyUnicodeWriter *writer,
          str = PyObject_Str(obj);
          if (!str)
              return NULL;
-        if (unicode_fromformat_write_str(writer, str, width, precision) == -1) {
+        if (unicode_fromformat_write_str(writer, str, width, precision, flags) == -1) {
              Py_DECREF(str);
              return NULL;
          }
@@ -2619,7 +2728,7 @@ unicode_fromformat_arg(_PyUnicodeWriter *writer,
          repr = PyObject_Repr(obj);
          if (!repr)
              return NULL;
-        if (unicode_fromformat_write_str(writer, repr, width, precision) == -1) {
+        if (unicode_fromformat_write_str(writer, repr, width, precision, flags) == -1) {
              Py_DECREF(repr);
              return NULL;
          }
@@ -2635,7 +2744,7 @@ unicode_fromformat_arg(_PyUnicodeWriter *writer,
          ascii = PyObject_ASCII(obj);
          if (!ascii)
              return NULL;
-        if (unicode_fromformat_write_str(writer, ascii, width, precision) == -1) {
+        if (unicode_fromformat_write_str(writer, ascii, width, precision, flags) == -1) {
              Py_DECREF(ascii);
              return NULL;
          }
@@ -2644,6 +2753,7 @@ unicode_fromformat_arg(_PyUnicodeWriter *writer,
      }
  
      default:
+    invalid_format:
          PyErr_Format(PyExc_SystemError, "invalid format string: %s", p);
          return NULL;
      }
diff --git a/Parser/tokenizer.c b/Parser/tokenizer.c

index fb94fbeac42bae64686e3b2a6c3a219b15548b4d..fc4afccbfc40083d40cc5bdc4d43bf16fcbf58f7 100644 (file)
--- a/Parser/tokenizer.c
+++ b/Parser/tokenizer.c
@@ -1556,14 +1556,11 @@ verify_identifier(struct tok_state *tok)
              tok->cur = (char *)tok->start + PyBytes_GET_SIZE(s);
          }
          Py_DECREF(s);
-        // PyUnicode_FromFormatV() does not support %X
-        char hex[9];
-        (void)PyOS_snprintf(hex, sizeof(hex), "%04X", ch);
          if (Py_UNICODE_ISPRINTABLE(ch)) {
-            syntaxerror(tok, "invalid character '%c' (U+%s)", ch, hex);
+            syntaxerror(tok, "invalid character '%c' (U+%04X)", ch, ch);
          }
          else {
-            syntaxerror(tok, "invalid non-printable character U+%s", hex);
+            syntaxerror(tok, "invalid non-printable character U+%04X", ch);
          }
          return 0;
      }
@@ -2541,9 +2538,7 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct t
      }
  
      if (!Py_UNICODE_ISPRINTABLE(c)) {
-        char hex[9];
-        (void)PyOS_snprintf(hex, sizeof(hex), "%04X", c);
-        return MAKE_TOKEN(syntaxerror(tok, "invalid non-printable character U+%s", hex));
+        return MAKE_TOKEN(syntaxerror(tok, "invalid non-printable character U+%04X", c));
      }
  
      if( c == '=' && INSIDE_FSTRING_EXPR(current_tok)) {
author	Serhiy Storchaka <storchaka@gmail.com>
	Sun, 21 May 2023 21:32:39 +0000 (00:32 +0300)
committer	GitHub <noreply@github.com>
	Sun, 21 May 2023 21:32:39 +0000 (00:32 +0300)
Doc/c-api/unicode.rst		patch \| blob \| blame \| history
Doc/whatsnew/3.12.rst		patch \| blob \| blame \| history
Lib/test/test_capi/test_unicode.py		patch \| blob \| blame \| history
Misc/NEWS.d/next/C API/2022-10-29-10-13-20.gh-issue-98836.Cy5h_z.rst	[new file with mode: 0644]	patch \| blob
Modules/_ssl.c		patch \| blob \| blame \| history
Modules/_testcapi/unicode.c		patch \| blob \| blame \| history
Modules/selectmodule.c		patch \| blob \| blame \| history
Modules/socketmodule.c		patch \| blob \| blame \| history
Objects/unicodeobject.c		patch \| blob \| blame \| history
Parser/tokenizer.c		patch \| blob \| blame \| history