git.ipfire.org Git - thirdparty/Python/cpython.git/commitdiff
gh-119182: Add PyUnicodeWriter_WriteUCS4() function (#120849)
author Victor Stinner <vstinner@python.org>
Mon, 24 Jun 2024 15:40:39 +0000 (17:40 +0200)
committer GitHub <noreply@github.com>
Mon, 24 Jun 2024 15:40:39 +0000 (17:40 +0200)
Doc/c-api/unicode.rst
Doc/whatsnew/3.14.rst
Include/cpython/unicodeobject.h
Lib/test/test_capi/test_unicode.py
Misc/NEWS.d/next/C API/2024-06-07-22-12-30.gh-issue-119182.yt8Ar7.rst
Modules/_testcapi/unicode.c
Objects/unicodeobject.c

index 4ea20bde38c1db449c7b7b2276c8982da6ab7fd1..246cf47df62e783f7c549a9c52ae5dce1b761156 100644 (file)
@@ -1563,6 +1563,15 @@ object.
    On success, return ``0``.
    On error, set an exception, leave the writer unchanged, and return ``-1``.
 
+.. c:function:: int PyUnicodeWriter_WriteUCS4(PyUnicodeWriter *writer, Py_UCS4 *str, Py_ssize_t size)
+
+   Write the UCS4 string *str* into *writer*.
+
+   *size* is a number of UCS4 characters.
+
+   On success, return ``0``.
+   On error, set an exception, leave the writer unchanged, and return ``-1``.
+
 .. c:function:: int PyUnicodeWriter_WriteStr(PyUnicodeWriter *writer, PyObject *obj)
 
    Call :c:func:`PyObject_Str` on *obj* and write the output into *writer*.
index b134ed31f6df402188d2399e240eca36ed6788a1..9662044915b8cabe79f662b1d5f4d1e1286a74cb 100644 (file)
@@ -314,6 +314,7 @@ New Features
   * :c:func:`PyUnicodeWriter_Finish`.
   * :c:func:`PyUnicodeWriter_WriteChar`.
   * :c:func:`PyUnicodeWriter_WriteUTF8`.
+  * :c:func:`PyUnicodeWriter_WriteUCS4`.
   * :c:func:`PyUnicodeWriter_WriteWideChar`.
   * :c:func:`PyUnicodeWriter_WriteStr`.
   * :c:func:`PyUnicodeWriter_WriteRepr`.
index 059bec8618c8d9a7d380ad4693f982e3f8016061..917991371012806f8f9a32d5f47725d7b699c74f 100644 (file)
@@ -463,6 +463,10 @@ PyAPI_FUNC(int) PyUnicodeWriter_WriteWideChar(
     PyUnicodeWriter *writer,
     const wchar_t *str,
     Py_ssize_t size);
+PyAPI_FUNC(int) PyUnicodeWriter_WriteUCS4(
+    PyUnicodeWriter *writer,
+    Py_UCS4 *str,
+    Py_ssize_t size);
 
 PyAPI_FUNC(int) PyUnicodeWriter_WriteStr(
     PyUnicodeWriter *writer,
index 48a802c3f8bcb29cb5472779caa15926b319dd01..9ef476a02de47d27293493c0d276b6ff7368573c 100644 (file)
@@ -1826,8 +1826,42 @@ class PyUnicodeWriterTest(unittest.TestCase):
         writer.write_widechar("latin1=\xE9")
         writer.write_widechar("-")
         writer.write_widechar("euro=\u20AC")
+        writer.write_char("-")
+        writer.write_widechar("max=\U0010ffff")
         writer.write_char('.')
-        self.assertEqual(writer.finish(), "latin1=\xE9-euro=\u20AC.")
+        self.assertEqual(writer.finish(),
+                         "latin1=\xE9-euro=\u20AC-max=\U0010ffff.")
+
+    def test_ucs4(self):
+        writer = self.create_writer(0)
+        writer.write_ucs4("ascii IGNORED", 5)
+        writer.write_char("-")
+        writer.write_ucs4("latin1=\xe9", 8)
+        writer.write_char("-")
+        writer.write_ucs4("euro=\u20ac", 6)
+        writer.write_char("-")
+        writer.write_ucs4("max=\U0010ffff", 5)
+        writer.write_char(".")
+        self.assertEqual(writer.finish(),
+                         "ascii-latin1=\xE9-euro=\u20AC-max=\U0010ffff.")
+
+        # Test some special characters
+        writer = self.create_writer(0)
+        # Lone surrogate character
+        writer.write_ucs4("lone\uDC80", 5)
+        writer.write_char("-")
+        # Surrogate pair
+        writer.write_ucs4("pair\uDBFF\uDFFF", 5)
+        writer.write_char("-")
+        writer.write_ucs4("null[\0]", 7)
+        self.assertEqual(writer.finish(),
+                         "lone\udc80-pair\udbff-null[\0]")
+
+        # invalid size
+        writer = self.create_writer(0)
+        with self.assertRaises(ValueError):
+            writer.write_ucs4("text", -1)
+
 
 
 @unittest.skipIf(ctypes is None, 'need ctypes')
index 3d1384c9f3252f0d0d322d43e33ba64727f5fd74..243f290fbd47e212f9f7a725b5767af232ba993c 100644 (file)
@@ -5,9 +5,12 @@ Add a new :c:type:`PyUnicodeWriter` API to create a Python :class:`str` object:
 * :c:func:`PyUnicodeWriter_Finish`.
 * :c:func:`PyUnicodeWriter_WriteChar`.
 * :c:func:`PyUnicodeWriter_WriteUTF8`.
+* :c:func:`PyUnicodeWriter_WriteUCS4`.
+* :c:func:`PyUnicodeWriter_WriteWideChar`.
 * :c:func:`PyUnicodeWriter_WriteStr`.
 * :c:func:`PyUnicodeWriter_WriteRepr`.
 * :c:func:`PyUnicodeWriter_WriteSubstring`.
 * :c:func:`PyUnicodeWriter_Format`.
+* :c:func:`PyUnicodeWriter_DecodeUTF8Stateful`.
 
 Patch by Victor Stinner.
index c723e087baa308835208d94beed6d62f6d419d08..b8ecf53f4f8b9c5a7d87a1f240f2cbd4da45d5ef 100644 (file)
@@ -360,6 +360,36 @@ writer_write_widechar(PyObject *self_raw, PyObject *args)
 }
 
 
+static PyObject*
+writer_write_ucs4(PyObject *self_raw, PyObject *args)
+{
+    WriterObject *self = (WriterObject *)self_raw;
+    if (writer_check(self) < 0) {
+        return NULL;
+    }
+
+    PyObject *str;
+    Py_ssize_t size;
+    if (!PyArg_ParseTuple(args, "Un", &str, &size)) {
+        return NULL;
+    }
+    Py_ssize_t len = PyUnicode_GET_LENGTH(str);
+    size = Py_MIN(size, len);
+
+    Py_UCS4 *ucs4 = PyUnicode_AsUCS4Copy(str);
+    if (ucs4 == NULL) {
+        return NULL;
+    }
+
+    int res = PyUnicodeWriter_WriteUCS4(self->writer, ucs4, size);
+    PyMem_Free(ucs4);
+    if (res < 0) {
+        return NULL;
+    }
+    Py_RETURN_NONE;
+}
+
+
 static PyObject*
 writer_write_str(PyObject *self_raw, PyObject *args)
 {
@@ -484,6 +514,7 @@ static PyMethodDef writer_methods[] = {
     {"write_char", _PyCFunction_CAST(writer_write_char), METH_VARARGS},
     {"write_utf8", _PyCFunction_CAST(writer_write_utf8), METH_VARARGS},
     {"write_widechar", _PyCFunction_CAST(writer_write_widechar), METH_VARARGS},
+    {"write_ucs4", _PyCFunction_CAST(writer_write_ucs4), METH_VARARGS},
     {"write_str", _PyCFunction_CAST(writer_write_str), METH_VARARGS},
     {"write_repr", _PyCFunction_CAST(writer_write_repr), METH_VARARGS},
     {"write_substring", _PyCFunction_CAST(writer_write_substring), METH_VARARGS},
index d11a9dca14b2805f561eb93c6a9d97210c9a3f1e..698e57f5ad0407e112591cea8aa8e2521f8723ce 100644 (file)
@@ -2035,11 +2035,9 @@ PyUnicodeWriter_WriteWideChar(PyUnicodeWriter *pub_writer,
         if (!converted) {
             return -1;
         }
-        PyObject *unicode = _PyUnicode_FromUCS4(converted, size);
-        PyMem_Free(converted);
 
-        int res = _PyUnicodeWriter_WriteStr(writer, unicode);
-        Py_DECREF(unicode);
+        int res = PyUnicodeWriter_WriteUCS4(pub_writer, converted, size);
+        PyMem_Free(converted);
         return res;
     }
 #endif
@@ -2289,6 +2287,51 @@ _PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
     return res;
 }
 
+
+int
+PyUnicodeWriter_WriteUCS4(PyUnicodeWriter *pub_writer,
+                          Py_UCS4 *str,
+                          Py_ssize_t size)
+{
+    _PyUnicodeWriter *writer = (_PyUnicodeWriter*)pub_writer;
+
+    if (size < 0) {
+        PyErr_SetString(PyExc_ValueError,
+                        "size must be positive");
+        return -1;
+    }
+
+    if (size == 0) {
+        return 0;
+    }
+
+    Py_UCS4 max_char = ucs4lib_find_max_char(str, str + size);
+
+    if (_PyUnicodeWriter_Prepare(writer, size, max_char) < 0) {
+        return -1;
+    }
+
+    int kind = writer->kind;
+    void *data = (Py_UCS1*)writer->data + writer->pos * kind;
+    if (kind == PyUnicode_1BYTE_KIND) {
+        _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1,
+                                 str, str + size,
+                                 data);
+    }
+    else if (kind == PyUnicode_2BYTE_KIND) {
+        _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2,
+                                 str, str + size,
+                                 data);
+    }
+    else {
+        memcpy(data, str, size * sizeof(Py_UCS4));
+    }
+    writer->pos += size;
+
+    return 0;
+}
+
+
 PyObject*
 PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
 {
@@ -13357,7 +13400,7 @@ PyUnicodeWriter*
 PyUnicodeWriter_Create(Py_ssize_t length)
 {
     if (length < 0) {
-        PyErr_SetString(PyExc_TypeError,
+        PyErr_SetString(PyExc_ValueError,
                         "length must be positive");
         return NULL;
     }