gh-129813, PEP 782: Use PyBytesWriter in utf8_encoder() (#138874)

author Victor Stinner <vstinner@python.org>

Tue, 23 Sep 2025 09:47:09 +0000 (11:47 +0200)

committer GitHub <noreply@github.com>

Tue, 23 Sep 2025 09:47:09 +0000 (11:47 +0200)
author Victor Stinner <vstinner@python.org>
Tue, 23 Sep 2025 09:47:09 +0000 (11:47 +0200)
committer GitHub <noreply@github.com>
Tue, 23 Sep 2025 09:47:09 +0000 (11:47 +0200)
diff --git a/Objects/stringlib/codecs.h b/Objects/stringlib/codecs.h

index 440410d0aef17d33ab793a746db329607e4bdaa7..9e53fab842909a09d7a2f87b1a0497c6a67ddc2b 100644 (file)
--- a/Objects/stringlib/codecs.h
+++ b/Objects/stringlib/codecs.h
@@ -257,16 +257,14 @@ InvalidContinuation3:
  /* UTF-8 encoder specialized for a Unicode kind to avoid the slow
     PyUnicode_READ() macro. Delete some parts of the code depending on the kind:
     UCS-1 strings don't need to handle surrogates for example. */
-Py_LOCAL_INLINE(char *)
-STRINGLIB(utf8_encoder)(_PyBytesWriter *writer,
-                        PyObject *unicode,
+Py_LOCAL_INLINE(PyBytesWriter*)
+STRINGLIB(utf8_encoder)(PyObject *unicode,
                          const STRINGLIB_CHAR *data,
                          Py_ssize_t size,
                          _Py_error_handler error_handler,
-                        const char *errors)
+                        const char *errors,
+                        char **end)
  {
-    Py_ssize_t i;                /* index into data of next input character */
-    char *p;                     /* next free byte in output buffer */
  #if STRINGLIB_SIZEOF_CHAR > 1
      PyObject *error_handler_obj = NULL;
      PyObject *exc = NULL;
@@ -284,14 +282,19 @@ STRINGLIB(utf8_encoder)(_PyBytesWriter *writer,
      if (size > PY_SSIZE_T_MAX / max_char_size) {
          /* integer overflow */
          PyErr_NoMemory();
+        *end = NULL;
          return NULL;
      }
  
-    _PyBytesWriter_Init(writer);
-    p = _PyBytesWriter_Alloc(writer, size * max_char_size);
-    if (p == NULL)
+    PyBytesWriter *writer = PyBytesWriter_Create(size * max_char_size);
+    if (writer == NULL) {
+        *end = NULL;
          return NULL;
+    }
+    /* next free byte in output buffer */
+    char *p = PyBytesWriter_GetData(writer);
  
+    Py_ssize_t i;                /* index into data of next input character */
      for (i = 0; i < size;) {
          Py_UCS4 ch = data[i++];
  
@@ -348,7 +351,7 @@ STRINGLIB(utf8_encoder)(_PyBytesWriter *writer,
  
              case _Py_ERROR_BACKSLASHREPLACE:
                  /* subtract preallocated bytes */
-                writer->min_size -= max_char_size * (endpos - startpos);
+                writer->size -= max_char_size * (endpos - startpos);
                  p = backslashreplace(writer, p,
                                       unicode, startpos, endpos);
                  if (p == NULL)
@@ -358,7 +361,7 @@ STRINGLIB(utf8_encoder)(_PyBytesWriter *writer,
  
              case _Py_ERROR_XMLCHARREFREPLACE:
                  /* subtract preallocated bytes */
-                writer->min_size -= max_char_size * (endpos - startpos);
+                writer->size -= max_char_size * (endpos - startpos);
                  p = xmlcharrefreplace(writer, p,
                                        unicode, startpos, endpos);
                  if (p == NULL)
@@ -389,22 +392,25 @@ STRINGLIB(utf8_encoder)(_PyBytesWriter *writer,
  
                  if (newpos < startpos) {
                      writer->overallocate = 1;
-                    p = _PyBytesWriter_Prepare(writer, p,
-                                               max_char_size * (startpos - newpos));
-                    if (p == NULL)
+                    p = PyBytesWriter_GrowAndUpdatePointer(writer,
+                                               max_char_size * (startpos - newpos),
+                                               p);
+                    if (p == NULL) {
                          goto error;
+                    }
                  }
                  else {
                      /* subtract preallocated bytes */
-                    writer->min_size -= max_char_size * (newpos - startpos);
+                    writer->size -= max_char_size * (newpos - startpos);
                      /* Only overallocate the buffer if it's not the last write */
                      writer->overallocate = (newpos < size);
                  }
  
+                char *rep_str;
+                Py_ssize_t rep_len;
                  if (PyBytes_Check(rep)) {
-                    p = _PyBytesWriter_WriteBytes(writer, p,
-                                                  PyBytes_AS_STRING(rep),
-                                                  PyBytes_GET_SIZE(rep));
+                    rep_str = PyBytes_AS_STRING(rep);
+                    rep_len = PyBytes_GET_SIZE(rep);
                  }
                  else {
                      /* rep is unicode */
@@ -415,13 +421,16 @@ STRINGLIB(utf8_encoder)(_PyBytesWriter *writer,
                          goto error;
                      }
  
-                    p = _PyBytesWriter_WriteBytes(writer, p,
-                                                  PyUnicode_DATA(rep),
-                                                  PyUnicode_GET_LENGTH(rep));
+                    rep_str = PyUnicode_DATA(rep);
+                    rep_len = PyUnicode_GET_LENGTH(rep);
                  }
  
-                if (p == NULL)
+                p = PyBytesWriter_GrowAndUpdatePointer(writer, rep_len, p);
+                if (p == NULL) {
                      goto error;
+                }
+                memcpy(p, rep_str, rep_len);
+                p += rep_len;
                  Py_CLEAR(rep);
  
                  i = newpos;
@@ -458,13 +467,16 @@ STRINGLIB(utf8_encoder)(_PyBytesWriter *writer,
      Py_XDECREF(error_handler_obj);
      Py_XDECREF(exc);
  #endif
-    return p;
+    *end = p;
+    return writer;
  
  #if STRINGLIB_SIZEOF_CHAR > 1
   error:
+    PyBytesWriter_Discard(writer);
      Py_XDECREF(rep);
      Py_XDECREF(error_handler_obj);
      Py_XDECREF(exc);
+    *end = NULL;
      return NULL;
  #endif
  }
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c

index 42fef029222504da02e9711addbac213eb71e553..5799d92211aa978c0cb30a6db3900f590667d25c 100644 (file)
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -828,7 +828,7 @@ unicode_result_unchanged(PyObject *unicode)
  /* Implementation of the "backslashreplace" error handler for 8-bit encodings:
     ASCII, Latin1, UTF-8, etc. */
  static char*
-backslashreplace(_PyBytesWriter *writer, char *str,
+backslashreplace(PyBytesWriter *writer, char *str,
                   PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
  {
      Py_ssize_t size, i;
@@ -861,9 +861,10 @@ backslashreplace(_PyBytesWriter *writer, char *str,
          size += incr;
      }
  
-    str = _PyBytesWriter_Prepare(writer, str, size);
-    if (str == NULL)
+    str = PyBytesWriter_GrowAndUpdatePointer(writer, size, str);
+    if (str == NULL) {
          return NULL;
+    }
  
      /* generate replacement */
      for (i = collstart; i < collend; ++i) {
@@ -894,7 +895,7 @@ backslashreplace(_PyBytesWriter *writer, char *str,
  /* Implementation of the "xmlcharrefreplace" error handler for 8-bit encodings:
     ASCII, Latin1, UTF-8, etc. */
  static char*
-xmlcharrefreplace(_PyBytesWriter *writer, char *str,
+xmlcharrefreplace(PyBytesWriter *writer, char *str,
                    PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
  {
      Py_ssize_t size, i;
@@ -935,9 +936,10 @@ xmlcharrefreplace(_PyBytesWriter *writer, char *str,
          size += incr;
      }
  
-    str = _PyBytesWriter_Prepare(writer, str, size);
-    if (str == NULL)
+    str = PyBytesWriter_GrowAndUpdatePointer(writer, size, str);
+    if (str == NULL) {
          return NULL;
+    }
  
      /* generate replacement */
      for (i = collstart; i < collend; ++i) {
@@ -5828,7 +5830,7 @@ unicode_encode_utf8(PyObject *unicode, _Py_error_handler error_handler,
      const void *data = PyUnicode_DATA(unicode);
      Py_ssize_t size = PyUnicode_GET_LENGTH(unicode);
  
-    _PyBytesWriter writer;
+    PyBytesWriter *writer;
      char *end;
  
      switch (kind) {
@@ -5837,21 +5839,24 @@ unicode_encode_utf8(PyObject *unicode, _Py_error_handler error_handler,
      case PyUnicode_1BYTE_KIND:
          /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
          assert(!PyUnicode_IS_ASCII(unicode));
-        end = ucs1lib_utf8_encoder(&writer, unicode, data, size, error_handler, errors);
+        writer = ucs1lib_utf8_encoder(unicode, data, size,
+                                      error_handler, errors, &end);
          break;
      case PyUnicode_2BYTE_KIND:
-        end = ucs2lib_utf8_encoder(&writer, unicode, data, size, error_handler, errors);
+        writer = ucs2lib_utf8_encoder(unicode, data, size,
+                                      error_handler, errors, &end);
          break;
      case PyUnicode_4BYTE_KIND:
-        end = ucs4lib_utf8_encoder(&writer, unicode, data, size, error_handler, errors);
+        writer = ucs4lib_utf8_encoder(unicode, data, size,
+                                      error_handler, errors, &end);
          break;
      }
  
-    if (end == NULL) {
-        _PyBytesWriter_Dealloc(&writer);
+    if (writer == NULL) {
+        PyBytesWriter_Discard(writer);
          return NULL;
      }
-    return _PyBytesWriter_Finish(&writer, end);
+    return PyBytesWriter_FinishWithPointer(writer, end);
  }
  
  static int
@@ -5865,37 +5870,35 @@ unicode_fill_utf8(PyObject *unicode)
      const void *data = PyUnicode_DATA(unicode);
      Py_ssize_t size = PyUnicode_GET_LENGTH(unicode);
  
-    _PyBytesWriter writer;
+    PyBytesWriter *writer;
      char *end;
  
      switch (kind) {
      default:
          Py_UNREACHABLE();
      case PyUnicode_1BYTE_KIND:
-        end = ucs1lib_utf8_encoder(&writer, unicode, data, size,
-                                   _Py_ERROR_STRICT, NULL);
+        writer = ucs1lib_utf8_encoder(unicode, data, size,
+                                      _Py_ERROR_STRICT, NULL, &end);
          break;
      case PyUnicode_2BYTE_KIND:
-        end = ucs2lib_utf8_encoder(&writer, unicode, data, size,
-                                   _Py_ERROR_STRICT, NULL);
+        writer = ucs2lib_utf8_encoder(unicode, data, size,
+                                      _Py_ERROR_STRICT, NULL, &end);
          break;
      case PyUnicode_4BYTE_KIND:
-        end = ucs4lib_utf8_encoder(&writer, unicode, data, size,
-                                   _Py_ERROR_STRICT, NULL);
+        writer = ucs4lib_utf8_encoder(unicode, data, size,
+                                      _Py_ERROR_STRICT, NULL, &end);
          break;
      }
-    if (end == NULL) {
-        _PyBytesWriter_Dealloc(&writer);
+    if (writer == NULL) {
          return -1;
      }
  
-    const char *start = writer.use_small_buffer ? writer.small_buffer :
-                    PyBytes_AS_STRING(writer.buffer);
+    const char *start = PyBytesWriter_GetData(writer);
      Py_ssize_t len = end - start;
  
      char *cache = PyMem_Malloc(len + 1);
      if (cache == NULL) {
-        _PyBytesWriter_Dealloc(&writer);
+        PyBytesWriter_Discard(writer);
          PyErr_NoMemory();
          return -1;
      }
@@ -5903,7 +5906,7 @@ unicode_fill_utf8(PyObject *unicode)
      cache[len] = '\0';
      PyUnicode_SET_UTF8_LENGTH(unicode, len);
      PyUnicode_SET_UTF8(unicode, cache);
-    _PyBytesWriter_Dealloc(&writer);
+    PyBytesWriter_Discard(writer);
      return 0;
  }
  
@@ -7323,16 +7326,12 @@ unicode_encode_ucs1(PyObject *unicode,
      Py_ssize_t pos=0, size;
      int kind;
      const void *data;
-    /* pointer into the output */
-    char *str;
      const char *encoding = (limit == 256) ? "latin-1" : "ascii";
      const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
      PyObject *error_handler_obj = NULL;
      PyObject *exc = NULL;
      _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
      PyObject *rep = NULL;
-    /* output object */
-    _PyBytesWriter writer;
  
      size = PyUnicode_GET_LENGTH(unicode);
      kind = PyUnicode_KIND(unicode);
@@ -7342,10 +7341,13 @@ unicode_encode_ucs1(PyObject *unicode,
      if (size == 0)
          return Py_GetConstant(Py_CONSTANT_EMPTY_BYTES);
  
-    _PyBytesWriter_Init(&writer);
-    str = _PyBytesWriter_Alloc(&writer, size);
-    if (str == NULL)
+    /* output object */
+    PyBytesWriter *writer = PyBytesWriter_Create(size);
+    if (writer == NULL) {
          return NULL;
+    }
+    /* pointer into the output */
+    char *str = PyBytesWriter_GetData(writer);
  
      while (pos < size) {
          Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
@@ -7367,7 +7369,7 @@ unicode_encode_ucs1(PyObject *unicode,
                  ++collend;
  
              /* Only overallocate the buffer if it's not the last write */
-            writer.overallocate = (collend < size);
+            writer->overallocate = (collend < size);
  
              /* cache callback name lookup (if not done yet, i.e. it's the first error) */
              if (error_handler == _Py_ERROR_UNKNOWN)
@@ -7388,8 +7390,8 @@ unicode_encode_ucs1(PyObject *unicode,
  
              case _Py_ERROR_BACKSLASHREPLACE:
                  /* subtract preallocated bytes */
-                writer.min_size -= (collend - collstart);
-                str = backslashreplace(&writer, str,
+                writer->size -= (collend - collstart);
+                str = backslashreplace(writer, str,
                                         unicode, collstart, collend);
                  if (str == NULL)
                      goto onError;
@@ -7398,8 +7400,8 @@ unicode_encode_ucs1(PyObject *unicode,
  
              case _Py_ERROR_XMLCHARREFREPLACE:
                  /* subtract preallocated bytes */
-                writer.min_size -= (collend - collstart);
-                str = xmlcharrefreplace(&writer, str,
+                writer->size -= (collend - collstart);
+                str = xmlcharrefreplace(writer, str,
                                          unicode, collstart, collend);
                  if (str == NULL)
                      goto onError;
@@ -7430,24 +7432,27 @@ unicode_encode_ucs1(PyObject *unicode,
                      goto onError;
  
                  if (newpos < collstart) {
-                    writer.overallocate = 1;
-                    str = _PyBytesWriter_Prepare(&writer, str,
-                                                 collstart - newpos);
-                    if (str == NULL)
+                    writer->overallocate = 1;
+                    str = PyBytesWriter_GrowAndUpdatePointer(writer,
+                                                             collstart - newpos,
+                                                             str);
+                    if (str == NULL) {
                          goto onError;
+                    }
                  }
                  else {
                      /* subtract preallocated bytes */
-                    writer.min_size -= newpos - collstart;
+                    writer->size -= newpos - collstart;
                      /* Only overallocate the buffer if it's not the last write */
-                    writer.overallocate = (newpos < size);
+                    writer->overallocate = (newpos < size);
                  }
  
+                char *rep_str;
+                Py_ssize_t rep_len;
                  if (PyBytes_Check(rep)) {
                      /* Directly copy bytes result to output. */
-                    str = _PyBytesWriter_WriteBytes(&writer, str,
-                                                    PyBytes_AS_STRING(rep),
-                                                    PyBytes_GET_SIZE(rep));
+                    rep_str = PyBytes_AS_STRING(rep);
+                    rep_len = PyBytes_GET_SIZE(rep);
                  }
                  else {
                      assert(PyUnicode_Check(rep));
@@ -7462,12 +7467,16 @@ unicode_encode_ucs1(PyObject *unicode,
                          goto onError;
                      }
                      assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
-                    str = _PyBytesWriter_WriteBytes(&writer, str,
-                                                    PyUnicode_DATA(rep),
-                                                    PyUnicode_GET_LENGTH(rep));
+                    rep_str = PyUnicode_DATA(rep);
+                    rep_len = PyUnicode_GET_LENGTH(rep);
                  }
-                if (str == NULL)
+
+                str = PyBytesWriter_GrowAndUpdatePointer(writer, rep_len, str);
+                if (str == NULL) {
                      goto onError;
+                }
+                memcpy(str, rep_str, rep_len);
+                str += rep_len;
  
                  pos = newpos;
                  Py_CLEAR(rep);
@@ -7475,17 +7484,17 @@ unicode_encode_ucs1(PyObject *unicode,
  
              /* If overallocation was disabled, ensure that it was the last
                 write. Otherwise, we missed an optimization */
-            assert(writer.overallocate || pos == size);
+            assert(writer->overallocate || pos == size);
          }
      }
  
      Py_XDECREF(error_handler_obj);
      Py_XDECREF(exc);
-    return _PyBytesWriter_Finish(&writer, str);
+    return PyBytesWriter_FinishWithPointer(writer, str);
  
    onError:
      Py_XDECREF(rep);
-    _PyBytesWriter_Dealloc(&writer);
+    PyBytesWriter_Discard(writer);
      Py_XDECREF(error_handler_obj);
      Py_XDECREF(exc);
      return NULL;
author	Victor Stinner <vstinner@python.org>
	Tue, 23 Sep 2025 09:47:09 +0000 (11:47 +0200)
committer	GitHub <noreply@github.com>
	Tue, 23 Sep 2025 09:47:09 +0000 (11:47 +0200)
Objects/stringlib/codecs.h		patch | blob | blame | history
Objects/unicodeobject.c		patch | blob | blame | history