--- /dev/null
+/*
+
+Unicode implementation based on original code by Fredrik Lundh,
+modified by Marc-Andre Lemburg <mal@lemburg.com>.
+
+Major speed upgrades to the method implementations at the Reykjavik
+NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
+
+Copyright (c) Corporation for National Research Initiatives.
+
+--------------------------------------------------------------------
+The original string type implementation is:
+
+ Copyright (c) 1999 by Secret Labs AB
+ Copyright (c) 1999 by Fredrik Lundh
+
+By obtaining, using, and/or copying this software and/or its
+associated documentation, you agree that you have read, understood,
+and will comply with the following terms and conditions:
+
+Permission to use, copy, modify, and distribute this software and its
+associated documentation for any purpose and without fee is hereby
+granted, provided that the above copyright notice appears in all
+copies, and that both that copyright notice and this permission notice
+appear in supporting documentation, and that the name of Secret Labs
+AB or the author not be used in advertising or publicity pertaining to
+distribution of the software without specific, written prior
+permission.
+
+SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
+THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
+FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
+ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
+OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+--------------------------------------------------------------------
+
+*/
+
+#include "Python.h"
+#include "pycore_freelist.h" // _Py_FREELIST_FREE()
+#include "pycore_long.h" // _PyLong_FormatWriter()
+#include "pycore_unicodeobject.h" // _PyUnicode_Result()
+
+
+#ifdef MS_WINDOWS
+ /* On Windows, overallocating by 50% is the best factor */
+# define OVERALLOCATE_FACTOR 2
+#else
+ /* On Linux, overallocating by 25% is the best factor */
+# define OVERALLOCATE_FACTOR 4
+#endif
+
+
+/* Compilation of templated routines */
+
+#define STRINGLIB_GET_EMPTY() _PyUnicode_GetEmpty()
+
+#include "stringlib/ucs1lib.h"
+#include "stringlib/find_max_char.h"
+#include "stringlib/undef.h"
+
+
+/* Copy an ASCII or latin1 char* string into a Python Unicode string.
+
+   Writes the `len` bytes starting at `str` into `unicode`, beginning at
+   character offset `index`.  Every byte is interpreted as an unsigned code
+   point in range(0x100): bytes are converted through `unsigned char` before
+   being widened, so that latin1 bytes >= 0x80 are not sign-extended on
+   platforms where plain `char` is a signed type.
+
+   WARNING: The function doesn't copy the terminating null character and
+   doesn't check the maximum character (may write a latin1 character in an
+   ASCII string). */
+static void
+unicode_write_cstr(PyObject *unicode, Py_ssize_t index,
+                   const char *str, Py_ssize_t len)
+{
+    int kind = PyUnicode_KIND(unicode);
+    const void *data = PyUnicode_DATA(unicode);
+    const char *end = str + len;
+
+    assert(index + len <= PyUnicode_GET_LENGTH(unicode));
+    switch (kind) {
+    case PyUnicode_1BYTE_KIND: {
+#ifdef Py_DEBUG
+        /* Debug builds only: an ASCII target must receive ASCII bytes. */
+        if (PyUnicode_IS_ASCII(unicode)) {
+            Py_UCS4 maxchar = ucs1lib_find_max_char(
+                (const Py_UCS1*)str,
+                (const Py_UCS1*)str + len);
+            assert(maxchar < 128);
+        }
+#endif
+        /* Same width as the source: a plain byte copy is enough. */
+        memcpy((char *) data + index, str, len);
+        break;
+    }
+    case PyUnicode_2BYTE_KIND: {
+        Py_UCS2 *start = (Py_UCS2 *)data + index;
+        Py_UCS2 *ucs2 = start;
+
+        /* Go through unsigned char first: converting a negative `char`
+           straight to Py_UCS2 would yield 0xFFxx instead of 0x00xx. */
+        for (; str < end; ++ucs2, ++str)
+            *ucs2 = (Py_UCS2)(unsigned char)*str;
+
+        assert((ucs2 - start) <= PyUnicode_GET_LENGTH(unicode));
+        break;
+    }
+    case PyUnicode_4BYTE_KIND: {
+        Py_UCS4 *start = (Py_UCS4 *)data + index;
+        Py_UCS4 *ucs4 = start;
+
+        /* Same reasoning as the 2-byte case: avoid sign extension. */
+        for (; str < end; ++ucs4, ++str)
+            *ucs4 = (Py_UCS4)(unsigned char)*str;
+
+        assert((ucs4 - start) <= PyUnicode_GET_LENGTH(unicode));
+        break;
+    }
+    default:
+        Py_UNREACHABLE();
+    }
+}
+
+
+/* Refresh the fields cached on the writer (maxchar, data, kind, size) from
+   writer->buffer.  The buffer is dereferenced here, so it is expected to be
+   non-NULL. */
+static inline void
+_PyUnicodeWriter_Update(_PyUnicodeWriter *writer)
+{
+    PyObject *buffer = writer->buffer;
+
+    writer->maxchar = PyUnicode_MAX_CHAR_VALUE(buffer);
+    writer->data = PyUnicode_DATA(buffer);
+
+    if (writer->readonly) {
+        /* A kind value below PyUnicode_1BYTE_KIND makes
+           _PyUnicodeWriter_PrepareKind() copy the buffer. */
+        writer->kind = 0;
+        assert(writer->kind <= PyUnicode_1BYTE_KIND);
+
+        /* Copy-on-write mode: with a size of 0,
+         * _PyUnicodeWriter_Prepare() will copy (and enlarge) the buffer on
+         * the next write. */
+        writer->size = 0;
+        return;
+    }
+
+    /* Writable buffer: mirror its real kind and length. */
+    writer->kind = PyUnicode_KIND(buffer);
+    writer->size = PyUnicode_GET_LENGTH(buffer);
+}
+
+
+/* Put `writer` into its initial state: every field is zeroed, then min_char
+   is set to 127. */
+void
+_PyUnicodeWriter_Init(_PyUnicodeWriter *writer)
+{
+    memset(writer, 0, sizeof(_PyUnicodeWriter));
+
+    /* ASCII is the bare minimum */
+    writer->min_char = 127;
+
+    /* The zeroed kind is smaller than PyUnicode_1BYTE_KIND, so
+       _PyUnicodeWriter_PrepareKind() will copy the buffer. */
+    assert(writer->kind == 0);
+    assert(writer->kind < PyUnicode_1BYTE_KIND);
+}
+
+
+/* Allocate and initialize a writer with room prepared for `length`
+   characters.
+
+   Returns the new writer, or NULL on failure: ValueError is set when
+   `length` is negative, PyErr_NoMemory() is called when the writer itself
+   cannot be allocated, and a failed buffer preparation discards the writer.
+   Overallocation is switched on only after the initial preparation. */
+PyUnicodeWriter*
+PyUnicodeWriter_Create(Py_ssize_t length)
+{
+    if (length < 0) {
+        PyErr_SetString(PyExc_ValueError,
+                        "length must be positive");
+        return NULL;
+    }
+
+    /* Try the unicode_writers freelist first; fall back to PyMem_Malloc()
+       when it yields NULL. */
+    PyUnicodeWriter *pub_writer = _Py_FREELIST_POP_MEM(unicode_writers);
+    if (pub_writer == NULL) {
+        pub_writer = (PyUnicodeWriter *)PyMem_Malloc(sizeof(_PyUnicodeWriter));
+        if (pub_writer == NULL) {
+            return (PyUnicodeWriter *)PyErr_NoMemory();
+        }
+    }
+
+    _PyUnicodeWriter *priv_writer = (_PyUnicodeWriter *)pub_writer;
+    _PyUnicodeWriter_Init(priv_writer);
+
+    if (_PyUnicodeWriter_Prepare(priv_writer, length, 127) < 0) {
+        PyUnicodeWriter_Discard(pub_writer);
+        return NULL;
+    }
+
+    priv_writer->overallocate = 1;
+    return pub_writer;
+}
+
+
+/* Release a writer without producing a string.  A NULL `writer` is accepted
+   and ignored; otherwise the buffer is dropped and the writer memory is
+   handed to the unicode_writers freelist (or PyMem_Free). */
+void PyUnicodeWriter_Discard(PyUnicodeWriter *writer)
+{
+    if (writer != NULL) {
+        _PyUnicodeWriter_Dealloc((_PyUnicodeWriter*)writer);
+        _Py_FREELIST_FREE(unicode_writers, writer, PyMem_Free);
+    }
+}
+
+
+// Initialize _PyUnicodeWriter with an initial buffer.
+//
+// All fields are zeroed, `buffer` is stored as-is (no new reference is taken
+// in this function), the cached fields are refreshed from it, and min_length
+// is set to the resulting writer->size.
+void
+_PyUnicodeWriter_InitWithBuffer(_PyUnicodeWriter *writer, PyObject *buffer)
+{
+    memset(writer, 0, sizeof(_PyUnicodeWriter));
+
+    writer->buffer = buffer;
+    _PyUnicodeWriter_Update(writer);
+
+    writer->min_length = writer->size;
+}
+
+
+/* Make sure the writer's buffer can take `length` more characters whose
+   code points do not exceed `maxchar`.
+
+   Three situations are handled:
+     - no buffer yet: a new string is allocated;
+     - buffer too short: it is replaced by a larger one (a fresh string when
+       it must also be widened or is read-only, an in-place resize otherwise);
+     - buffer long enough but too narrow: it is replaced by a wider string of
+       the same size.
+   When a buffer is created or grown, the target length is increased by
+   1/OVERALLOCATE_FACTOR if overallocation is enabled (and would not
+   overflow), and is never below writer->min_length.
+
+   Returns 0 on success and -1 on failure. */
+int
+_PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer,
+                                 Py_ssize_t length, Py_UCS4 maxchar)
+{
+    assert(length >= 0);
+    assert(maxchar <= _Py_MAX_UNICODE);
+
+    /* ensure that the _PyUnicodeWriter_Prepare macro was used */
+    assert((maxchar > writer->maxchar && length >= 0)
+           || length > 0);
+
+    /* pos + length must fit in a Py_ssize_t */
+    if (length > PY_SSIZE_T_MAX - writer->pos) {
+        PyErr_NoMemory();
+        return -1;
+    }
+    Py_ssize_t required = writer->pos + length;
+
+    /* Never go below the writer's configured minimum character */
+    if (maxchar < writer->min_char) {
+        maxchar = writer->min_char;
+    }
+
+    const int no_buffer = (writer->buffer == NULL);
+    const int must_grow = (!no_buffer && required > writer->size);
+
+    if (no_buffer || must_grow) {
+        /* overallocate to limit the number of realloc() */
+        const Py_ssize_t extra = required / OVERALLOCATE_FACTOR;
+        if (writer->overallocate && required <= (PY_SSIZE_T_MAX - extra)) {
+            required += extra;
+        }
+        if (required < writer->min_length) {
+            required = writer->min_length;
+        }
+    }
+
+    if (no_buffer) {
+        assert(!writer->readonly);
+        writer->buffer = PyUnicode_New(required, maxchar);
+        if (writer->buffer == NULL) {
+            return -1;
+        }
+    }
+    else if (must_grow) {
+        PyObject *replacement;
+
+        if (maxchar > writer->maxchar || writer->readonly) {
+            /* resize + widen: build a new string and copy what was
+               written so far */
+            maxchar = Py_MAX(maxchar, writer->maxchar);
+            replacement = PyUnicode_New(required, maxchar);
+            if (replacement == NULL) {
+                return -1;
+            }
+            _PyUnicode_FastCopyCharacters(replacement, 0,
+                                          writer->buffer, 0, writer->pos);
+            Py_DECREF(writer->buffer);
+            writer->readonly = 0;
+        }
+        else {
+            /* same width, writable: resize the existing string */
+            replacement = _PyUnicode_ResizeCompact(writer->buffer, required);
+            if (replacement == NULL) {
+                return -1;
+            }
+        }
+        writer->buffer = replacement;
+    }
+    else if (maxchar > writer->maxchar) {
+        /* enough room, but the characters need a wider string */
+        assert(!writer->readonly);
+        PyObject *wider = PyUnicode_New(writer->size, maxchar);
+        if (wider == NULL) {
+            return -1;
+        }
+        _PyUnicode_FastCopyCharacters(wider, 0,
+                                      writer->buffer, 0, writer->pos);
+        Py_SETREF(writer->buffer, wider);
+    }
+
+    /* Re-read maxchar/data/kind/size from the (possibly new) buffer */
+    _PyUnicodeWriter_Update(writer);
+    return 0;
+
+#undef OVERALLOCATE_FACTOR
+}
+
+/* Widen the writer's buffer so it can hold characters of `kind`.
+
+   `kind` is mapped to the largest code point it can represent and the work
+   is delegated to _PyUnicodeWriter_PrepareInternal() with a length of 0.
+   Returns that function's result. */
+int
+_PyUnicodeWriter_PrepareKindInternal(_PyUnicodeWriter *writer,
+                                     int kind)
+{
+    /* ensure that the _PyUnicodeWriter_PrepareKind macro was used */
+    assert(writer->kind < kind);
+
+    Py_UCS4 maxchar;
+    if (kind == PyUnicode_1BYTE_KIND) {
+        maxchar = 0xff;
+    }
+    else if (kind == PyUnicode_2BYTE_KIND) {
+        maxchar = 0xffff;
+    }
+    else if (kind == PyUnicode_4BYTE_KIND) {
+        maxchar = _Py_MAX_UNICODE;
+    }
+    else {
+        Py_UNREACHABLE();
+    }
+
+    return _PyUnicodeWriter_PrepareInternal(writer, 0, maxchar);
+}
+
+
+/* Out-of-line entry point for _PyUnicodeWriter_WriteCharInline(): forwards
+   `ch` to it and returns its result unchanged. */
+int
+_PyUnicodeWriter_WriteChar(_PyUnicodeWriter *writer, Py_UCS4 ch)
+{
+    int status = _PyUnicodeWriter_WriteCharInline(writer, ch);
+    return status;
+}
+
+
+/* Append the single character `ch` to the writer.
+
+   Code points above _Py_MAX_UNICODE are rejected with ValueError and -1;
+   otherwise the result of _PyUnicodeWriter_WriteChar() is returned. */
+int
+PyUnicodeWriter_WriteChar(PyUnicodeWriter *writer, Py_UCS4 ch)
+{
+    if (ch <= _Py_MAX_UNICODE) {
+        return _PyUnicodeWriter_WriteChar((_PyUnicodeWriter*)writer, ch);
+    }
+
+    PyErr_SetString(PyExc_ValueError,
+                    "character must be in range(0x110000)");
+    return -1;
+}
+
+
+/* Append the whole Unicode string `str` to the writer.
+
+   An empty `str` is a no-op.  When the writer has no buffer yet and
+   overallocation is off, `str` itself is adopted as a read-only buffer (a
+   new reference is taken) instead of being copied.  Otherwise the buffer is
+   prepared if it is too narrow or too short, and the characters are copied.
+
+   Returns 0 on success, -1 if preparing the buffer fails. */
+int
+_PyUnicodeWriter_WriteStr(_PyUnicodeWriter *writer, PyObject *str)
+{
+    assert(PyUnicode_Check(str));
+
+    const Py_ssize_t nchars = PyUnicode_GET_LENGTH(str);
+    if (nchars == 0) {
+        return 0;
+    }
+
+    const Py_UCS4 str_maxchar = PyUnicode_MAX_CHAR_VALUE(str);
+    const int too_wide = (str_maxchar > writer->maxchar);
+    const int too_long = (nchars > writer->size - writer->pos);
+
+    if (too_wide || too_long) {
+        if (writer->buffer == NULL && !writer->overallocate) {
+            /* No buffer exists and no overallocation is wanted: share the
+               string instead of copying it. */
+            assert(_PyUnicode_CheckConsistency(str, 1));
+            writer->readonly = 1;
+            writer->buffer = Py_NewRef(str);
+            _PyUnicodeWriter_Update(writer);
+            writer->pos += nchars;
+            return 0;
+        }
+        if (_PyUnicodeWriter_PrepareInternal(writer, nchars, str_maxchar) == -1) {
+            return -1;
+        }
+    }
+
+    _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
+                                  str, 0, nchars);
+    writer->pos += nchars;
+    return 0;
+}
+
+
+/* Append the str() of `obj` to the writer.
+
+   Exact str objects are written directly and exact int objects are formatted
+   in base 10 straight into the writer; any other object goes through
+   PyObject_Str().  Returns 0 on success and -1 on failure. */
+int
+PyUnicodeWriter_WriteStr(PyUnicodeWriter *writer, PyObject *obj)
+{
+    _PyUnicodeWriter *priv_writer = (_PyUnicodeWriter*)writer;
+    PyTypeObject *obj_type = Py_TYPE(obj);
+
+    if (obj_type == &PyLong_Type) {
+        return _PyLong_FormatWriter(priv_writer, obj, 10, 0);
+    }
+    if (obj_type == &PyUnicode_Type) {
+        return _PyUnicodeWriter_WriteStr(priv_writer, obj);
+    }
+
+    /* Generic path: build the string, write it, drop the temporary. */
+    PyObject *text = PyObject_Str(obj);
+    if (text == NULL) {
+        return -1;
+    }
+    int status = _PyUnicodeWriter_WriteStr(priv_writer, text);
+    Py_DECREF(text);
+    return status;
+}
+
+
+/* Append the repr() of `obj` to the writer.
+
+   Exact int objects are formatted in base 10 straight into the writer; any
+   other object goes through PyObject_Repr().  Returns 0 on success and -1
+   on failure. */
+int
+PyUnicodeWriter_WriteRepr(PyUnicodeWriter *writer, PyObject *obj)
+{
+    _PyUnicodeWriter *priv_writer = (_PyUnicodeWriter*)writer;
+
+    if (Py_TYPE(obj) == &PyLong_Type) {
+        return _PyLong_FormatWriter(priv_writer, obj, 10, 0);
+    }
+
+    PyObject *text = PyObject_Repr(obj);
+    if (text == NULL) {
+        return -1;
+    }
+    int status = _PyUnicodeWriter_WriteStr(priv_writer, text);
+    Py_DECREF(text);
+    return status;
+}
+
+
+/* Append the characters str[start:end] to the writer.
+
+   The caller must guarantee 0 <= start <= end <= len(str) (asserted only).
+   A range covering the whole string is forwarded to
+   _PyUnicodeWriter_WriteStr(); an empty range is a no-op.  The slice is
+   scanned for its maximum character only when the string as a whole is wider
+   than the writer's current buffer.
+
+   Returns 0 on success, -1 if preparing the buffer fails. */
+int
+_PyUnicodeWriter_WriteSubstring(_PyUnicodeWriter *writer, PyObject *str,
+                                Py_ssize_t start, Py_ssize_t end)
+{
+    assert(0 <= start);
+    assert(end <= PyUnicode_GET_LENGTH(str));
+    assert(start <= end);
+
+    if (start == 0 && end == PyUnicode_GET_LENGTH(str)) {
+        return _PyUnicodeWriter_WriteStr(writer, str);
+    }
+
+    const Py_ssize_t count = end - start;
+    if (count == 0) {
+        return 0;
+    }
+
+    Py_UCS4 maxchar = writer->maxchar;
+    if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar) {
+        maxchar = _PyUnicode_FindMaxChar(str, start, end);
+    }
+
+    if (_PyUnicodeWriter_Prepare(writer, count, maxchar) < 0) {
+        return -1;
+    }
+
+    _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
+                                  str, start, count);
+    writer->pos += count;
+    return 0;
+}
+
+
+/* Validating wrapper around _PyUnicodeWriter_WriteSubstring().
+
+   Checks, in this order: `str` is a str (TypeError otherwise), then
+   0 <= start <= end (ValueError otherwise), then end <= len(str)
+   (ValueError otherwise).  Returns -1 when a check fails, else the result of
+   the private function. */
+int
+PyUnicodeWriter_WriteSubstring(PyUnicodeWriter *writer, PyObject *str,
+                               Py_ssize_t start, Py_ssize_t end)
+{
+    if (!PyUnicode_Check(str)) {
+        PyErr_Format(PyExc_TypeError, "expect str, not %T", str);
+        return -1;
+    }
+
+    const int bad_start = (start < 0 || start > end);
+    if (bad_start) {
+        PyErr_Format(PyExc_ValueError, "invalid start argument");
+        return -1;
+    }
+
+    const Py_ssize_t str_len = PyUnicode_GET_LENGTH(str);
+    if (end > str_len) {
+        PyErr_Format(PyExc_ValueError, "invalid end argument");
+        return -1;
+    }
+
+    _PyUnicodeWriter *priv_writer = (_PyUnicodeWriter*)writer;
+    return _PyUnicodeWriter_WriteSubstring(priv_writer, str, start, end);
+}
+
+
+/* Append the ASCII byte string `ascii` to the writer.
+
+   `len` is the number of bytes to write.  Any negative value means the
+   string is NUL-terminated and its length is taken with strlen(); this
+   matches PyUnicodeWriter_WriteUTF8() and
+   PyUnicodeWriter_DecodeUTF8Stateful().  (Previously only -1 was recognised
+   and every other negative value was used as a length as-is.)
+
+   The bytes must all be < 128; this is only checked by an assertion.
+
+   When the writer has no buffer yet and overallocation is off, a new ASCII
+   string is created and adopted as a read-only buffer.  Otherwise the buffer
+   is prepared and the bytes are copied, widening them to the buffer's kind.
+
+   Returns 0 on success, -1 on failure. */
+int
+_PyUnicodeWriter_WriteASCIIString(_PyUnicodeWriter *writer,
+                                  const char *ascii, Py_ssize_t len)
+{
+    if (len < 0) {
+        len = (Py_ssize_t)strlen(ascii);
+    }
+
+    assert(ucs1lib_find_max_char((const Py_UCS1*)ascii, (const Py_UCS1*)ascii + len) < 128);
+
+    if (writer->buffer == NULL && !writer->overallocate) {
+        PyObject *str;
+
+        str = _PyUnicode_FromASCII(ascii, len);
+        if (str == NULL)
+            return -1;
+
+        /* The writer takes over the reference returned above. */
+        writer->readonly = 1;
+        writer->buffer = str;
+        _PyUnicodeWriter_Update(writer);
+        writer->pos += len;
+        return 0;
+    }
+
+    if (_PyUnicodeWriter_Prepare(writer, len, 127) == -1)
+        return -1;
+
+    switch (writer->kind)
+    {
+    case PyUnicode_1BYTE_KIND:
+    {
+        /* Same width as the source: plain byte copy. */
+        const Py_UCS1 *str = (const Py_UCS1 *)ascii;
+        Py_UCS1 *data = writer->data;
+
+        memcpy(data + writer->pos, str, len);
+        break;
+    }
+    case PyUnicode_2BYTE_KIND:
+    {
+        /* Widen each byte to a 2-byte character. */
+        _PyUnicode_CONVERT_BYTES(
+            Py_UCS1, Py_UCS2,
+            ascii, ascii + len,
+            (Py_UCS2 *)writer->data + writer->pos);
+        break;
+    }
+    case PyUnicode_4BYTE_KIND:
+    {
+        /* Widen each byte to a 4-byte character. */
+        _PyUnicode_CONVERT_BYTES(
+            Py_UCS1, Py_UCS4,
+            ascii, ascii + len,
+            (Py_UCS4 *)writer->data + writer->pos);
+        break;
+    }
+    default:
+        Py_UNREACHABLE();
+    }
+
+    writer->pos += len;
+    return 0;
+}
+
+
+/* Public wrapper: append the ASCII string `str` of `size` bytes by calling
+   _PyUnicodeWriter_WriteASCIIString() and returning its result.  `writer`
+   must not be NULL (asserted only). */
+int
+PyUnicodeWriter_WriteASCII(PyUnicodeWriter *writer,
+                           const char *str,
+                           Py_ssize_t size)
+{
+    assert(writer != NULL);
+    _Py_AssertHoldsTstate();
+
+    return _PyUnicodeWriter_WriteASCIIString((_PyUnicodeWriter*)writer,
+                                             str, size);
+}
+
+
+/* Decode the UTF-8 string `str` with the strict error handler and append
+   the result to the writer.
+
+   A negative `size` means `str` is NUL-terminated.  If decoding fails, the
+   write position is put back to where it was before the call.  Returns the
+   decoder's result (negative on failure). */
+int
+PyUnicodeWriter_WriteUTF8(PyUnicodeWriter *writer,
+                          const char *str,
+                          Py_ssize_t size)
+{
+    _PyUnicodeWriter *priv_writer = (_PyUnicodeWriter*)writer;
+
+    if (size < 0) {
+        size = strlen(str);
+    }
+
+    const Py_ssize_t saved_pos = priv_writer->pos;
+    int status = _PyUnicode_DecodeUTF8Writer(priv_writer, str, size,
+                                             _Py_ERROR_STRICT, NULL, NULL);
+    if (status < 0) {
+        /* roll back whatever was written before the error */
+        priv_writer->pos = saved_pos;
+    }
+    return status;
+}
+
+
+/* Decode the UTF-8 string `string` using the error handler named by
+   `errors` and append the result to the writer.
+
+   A negative `length` means `string` is NUL-terminated.  `consumed` may be
+   NULL; it is passed on to the decoder.  If decoding fails, the write
+   position is put back to where it was before the call and `*consumed`
+   (when provided) is set to 0.  Returns the decoder's result (negative on
+   failure). */
+int
+PyUnicodeWriter_DecodeUTF8Stateful(PyUnicodeWriter *writer,
+                                   const char *string,
+                                   Py_ssize_t length,
+                                   const char *errors,
+                                   Py_ssize_t *consumed)
+{
+    _PyUnicodeWriter *priv_writer = (_PyUnicodeWriter*)writer;
+
+    if (length < 0) {
+        length = strlen(string);
+    }
+
+    const Py_ssize_t saved_pos = priv_writer->pos;
+    int status = _PyUnicode_DecodeUTF8Writer(priv_writer, string, length,
+                                             _Py_ERROR_UNKNOWN, errors,
+                                             consumed);
+    if (status >= 0) {
+        return status;
+    }
+
+    /* Failure: undo the partial write and report nothing consumed. */
+    priv_writer->pos = saved_pos;
+    if (consumed != NULL) {
+        *consumed = 0;
+    }
+    return status;
+}
+
+
+/* Append the `len` bytes at `str`, interpreted as latin1, to the writer.
+
+   The maximum byte value is computed first so the buffer is prepared with a
+   suitable width.  Returns 0 on success, -1 if preparing the buffer
+   fails. */
+int
+_PyUnicodeWriter_WriteLatin1String(_PyUnicodeWriter *writer,
+                                   const char *str, Py_ssize_t len)
+{
+    const Py_UCS1 *first = (const Py_UCS1*)str;
+    Py_UCS4 maxchar = ucs1lib_find_max_char(first, first + len);
+
+    if (_PyUnicodeWriter_Prepare(writer, len, maxchar) == -1) {
+        return -1;
+    }
+
+    unicode_write_cstr(writer->buffer, writer->pos, str, len);
+    writer->pos += len;
+    return 0;
+}
+
+
+/* Turn the writer's content into the final string and detach it from the
+   writer (writer->buffer is NULL afterwards).
+
+   - Nothing written: the buffer is cleared and the empty string is returned.
+   - Read-only buffer: the buffer itself is returned.
+   - Otherwise the buffer is resized to the written length when they differ,
+     then passed through _PyUnicode_Result().
+
+   Returns NULL if resizing fails; the buffer is released in that case. */
+PyObject *
+_PyUnicodeWriter_Finish(_PyUnicodeWriter *writer)
+{
+    if (writer->pos == 0) {
+        Py_CLEAR(writer->buffer);
+        return _PyUnicode_GetEmpty();
+    }
+
+    /* Take the buffer away from the writer. */
+    PyObject *result = writer->buffer;
+    writer->buffer = NULL;
+
+    if (writer->readonly) {
+        assert(PyUnicode_GET_LENGTH(result) == writer->pos);
+        return result;
+    }
+
+    if (PyUnicode_GET_LENGTH(result) != writer->pos) {
+        /* Resize the buffer to the number of characters written. */
+        PyObject *resized = _PyUnicode_ResizeCompact(result, writer->pos);
+        if (resized == NULL) {
+            Py_DECREF(result);
+            return NULL;
+        }
+        result = resized;
+    }
+
+    assert(_PyUnicode_CheckConsistency(result, 1));
+    return _PyUnicode_Result(result);
+}
+
+
+/* Finish the writer and release it: returns what _PyUnicodeWriter_Finish()
+   returns, then hands the writer memory to the unicode_writers freelist (or
+   PyMem_Free).  The writer must not be used afterwards. */
+PyObject*
+PyUnicodeWriter_Finish(PyUnicodeWriter *writer)
+{
+    _PyUnicodeWriter *priv_writer = (_PyUnicodeWriter*)writer;
+
+    PyObject *result = _PyUnicodeWriter_Finish(priv_writer);
+    assert(priv_writer->buffer == NULL);
+
+    _Py_FREELIST_FREE(unicode_writers, writer, PyMem_Free);
+    return result;
+}
+
+
+/* Drop the writer's reference to its buffer, if any, and set the field to
+   NULL.  The writer structure itself is not freed here. */
+void
+_PyUnicodeWriter_Dealloc(_PyUnicodeWriter *writer)
+{
+    PyObject *old_buffer = writer->buffer;
+    if (old_buffer != NULL) {
+        /* Reset the field before releasing the reference. */
+        writer->buffer = NULL;
+        Py_DECREF(old_buffer);
+    }
+}
#include "pycore_codecs.h" // _PyCodec_Lookup()
#include "pycore_critical_section.h" // Py_*_CRITICAL_SECTION_SEQUENCE_FAST
#include "pycore_format.h" // F_LJUST
-#include "pycore_freelist.h" // _Py_FREELIST_FREE(), _Py_FREELIST_POP()
#include "pycore_initconfig.h" // _PyStatus_OK()
#include "pycore_interp.h" // PyInterpreterState.fs_codec
#include "pycore_long.h" // _PyLong_FormatWriter()
}
-/* Generic helper macro to convert characters of different types.
- from_type and to_type have to be valid type names, begin and end
- are pointers to the source characters which should be of type
- "from_type *". to is a pointer of type "to_type *" and points to the
- buffer where the result characters are written to. */
-#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
- do { \
- to_type *_to = (to_type *)(to); \
- const from_type *_iter = (const from_type *)(begin);\
- const from_type *_end = (const from_type *)(end);\
- Py_ssize_t n = (_end) - (_iter); \
- const from_type *_unrolled_end = \
- _iter + _Py_SIZE_ROUND_DOWN(n, 4); \
- while (_iter < (_unrolled_end)) { \
- _to[0] = (to_type) _iter[0]; \
- _to[1] = (to_type) _iter[1]; \
- _to[2] = (to_type) _iter[2]; \
- _to[3] = (to_type) _iter[3]; \
- _iter += 4; _to += 4; \
- } \
- while (_iter < (_end)) \
- *_to++ = (to_type) *_iter++; \
- } while (0)
-
#define LATIN1 _Py_LATIN1_CHR
-#ifdef MS_WINDOWS
- /* On Windows, overallocate by 50% is the best factor */
-# define OVERALLOCATE_FACTOR 2
-#else
- /* On Linux, overallocate by 25% is the best factor */
-# define OVERALLOCATE_FACTOR 4
-#endif
-
/* Forward declaration */
-static inline int
-_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch);
-static inline void
-_PyUnicodeWriter_InitWithBuffer(_PyUnicodeWriter *writer, PyObject *buffer);
static PyObject *
unicode_encode_utf8(PyObject *unicode, _Py_error_handler error_handler,
const char *errors);
unicode_decode_utf8(const char *s, Py_ssize_t size,
_Py_error_handler error_handler, const char *errors,
Py_ssize_t *consumed);
-static int
-unicode_decode_utf8_writer(_PyUnicodeWriter *writer,
- const char *s, Py_ssize_t size,
- _Py_error_handler error_handler, const char *errors,
- Py_ssize_t *consumed);
#ifdef Py_DEBUG
static inline int unicode_is_finalizing(void);
static int unicode_is_singleton(PyObject *unicode);
// Return a reference to the immortal empty string singleton.
-static inline PyObject* unicode_get_empty(void)
+PyObject*
+_PyUnicode_GetEmpty(void)
{
_Py_DECLARE_STR(empty, "");
return &_Py_STR(empty);
#define _Py_RETURN_UNICODE_EMPTY() \
do { \
- return unicode_get_empty(); \
+ return _PyUnicode_GetEmpty();\
} while (0)
#undef CHECK
}
-static PyObject*
-unicode_result(PyObject *unicode)
+PyObject*
+_PyUnicode_Result(PyObject *unicode)
{
assert(_PyUnicode_CHECK(unicode));
Py_ssize_t length = PyUnicode_GET_LENGTH(unicode);
if (length == 0) {
- PyObject *empty = unicode_get_empty();
+ PyObject *empty = _PyUnicode_GetEmpty();
if (unicode != empty) {
Py_DECREF(unicode);
}
assert(_PyUnicode_CheckConsistency(unicode, 1));
return unicode;
}
+#define unicode_result _PyUnicode_Result
static PyObject*
unicode_result_unchanged(PyObject *unicode)
/* Compilation of templated routines */
-#define STRINGLIB_GET_EMPTY() unicode_get_empty()
+#define STRINGLIB_GET_EMPTY() _PyUnicode_GetEmpty()
#include "stringlib/asciilib.h"
#include "stringlib/fastsearch.h"
return copy;
}
-static PyObject*
-resize_compact(PyObject *unicode, Py_ssize_t length)
+PyObject*
+_PyUnicode_ResizeCompact(PyObject *unicode, Py_ssize_t length)
{
Py_ssize_t char_size;
Py_ssize_t struct_size;
{
/* Optimization for empty strings */
if (size == 0) {
- return unicode_get_empty();
+ return _PyUnicode_GetEmpty();
}
PyObject *obj;
return 0;
if (length == 0) {
- PyObject *empty = unicode_get_empty();
+ PyObject *empty = _PyUnicode_GetEmpty();
Py_SETREF(*p_unicode, empty);
return 0;
}
}
if (PyUnicode_IS_COMPACT(unicode)) {
- PyObject *new_unicode = resize_compact(unicode, length);
+ PyObject *new_unicode = _PyUnicode_ResizeCompact(unicode, length);
if (new_unicode == NULL)
return -1;
*p_unicode = new_unicode;
return unicode_resize(p_unicode, length);
}
-/* Copy an ASCII or latin1 char* string into a Python Unicode string.
-
- WARNING: The function doesn't copy the terminating null character and
- doesn't check the maximum character (may write a latin1 character in an
- ASCII string). */
-static void
-unicode_write_cstr(PyObject *unicode, Py_ssize_t index,
- const char *str, Py_ssize_t len)
-{
- int kind = PyUnicode_KIND(unicode);
- const void *data = PyUnicode_DATA(unicode);
- const char *end = str + len;
-
- assert(index + len <= PyUnicode_GET_LENGTH(unicode));
- switch (kind) {
- case PyUnicode_1BYTE_KIND: {
-#ifdef Py_DEBUG
- if (PyUnicode_IS_ASCII(unicode)) {
- Py_UCS4 maxchar = ucs1lib_find_max_char(
- (const Py_UCS1*)str,
- (const Py_UCS1*)str + len);
- assert(maxchar < 128);
- }
-#endif
- memcpy((char *) data + index, str, len);
- break;
- }
- case PyUnicode_2BYTE_KIND: {
- Py_UCS2 *start = (Py_UCS2 *)data + index;
- Py_UCS2 *ucs2 = start;
-
- for (; str < end; ++ucs2, ++str)
- *ucs2 = (Py_UCS2)*str;
-
- assert((ucs2 - start) <= PyUnicode_GET_LENGTH(unicode));
- break;
- }
- case PyUnicode_4BYTE_KIND: {
- Py_UCS4 *start = (Py_UCS4 *)data + index;
- Py_UCS4 *ucs4 = start;
-
- for (; str < end; ++ucs4, ++str)
- *ucs4 = (Py_UCS4)*str;
-
- assert((ucs4 - start) <= PyUnicode_GET_LENGTH(unicode));
- break;
- }
- default:
- Py_UNREACHABLE();
- }
-}
-
static PyObject*
get_latin1_char(Py_UCS1 ch)
{
"NULL string with positive size with NULL passed to PyUnicode_FromStringAndSize");
return NULL;
}
- return unicode_get_empty();
+ return _PyUnicode_GetEmpty();
}
PyObject *
}
if (width < 0) {
- return unicode_decode_utf8_writer(writer, str, length,
- _Py_ERROR_REPLACE, "replace", pconsumed);
+ return _PyUnicode_DecodeUTF8Writer(writer, str, length,
+ _Py_ERROR_REPLACE, "replace", pconsumed);
}
PyObject *unicode = PyUnicode_DecodeUTF8Stateful(str, length,
// Used by PyUnicodeWriter_WriteUTF8() implementation
-static int
-unicode_decode_utf8_writer(_PyUnicodeWriter *writer,
- const char *s, Py_ssize_t size,
- _Py_error_handler error_handler, const char *errors,
- Py_ssize_t *consumed)
+int
+_PyUnicode_DecodeUTF8Writer(_PyUnicodeWriter *writer,
+ const char *s, Py_ssize_t size,
+ _Py_error_handler error_handler, const char *errors,
+ Py_ssize_t *consumed)
{
if (size == 0) {
if (consumed) {
}
new_size = slen + n * (len2 - len1);
if (new_size == 0) {
- u = unicode_get_empty();
+ u = _PyUnicode_GetEmpty();
goto done;
}
if (new_size > (PY_SSIZE_T_MAX / rkind)) {
}
/* Shortcuts */
- PyObject *empty = unicode_get_empty(); // Borrowed reference
+ PyObject *empty = _PyUnicode_GetEmpty(); // Borrowed reference
if (left == empty) {
return PyUnicode_FromObject(right);
}
}
/* Shortcuts */
- PyObject *empty = unicode_get_empty(); // Borrowed reference
+ PyObject *empty = _PyUnicode_GetEmpty(); // Borrowed reference
if (left == empty) {
Py_DECREF(left);
*p_left = Py_NewRef(right);
len1 = PyUnicode_GET_LENGTH(str_obj);
len2 = PyUnicode_GET_LENGTH(sep_obj);
if (kind1 < kind2 || len1 < len2) {
- PyObject *empty = unicode_get_empty(); // Borrowed reference
+ PyObject *empty = _PyUnicode_GetEmpty(); // Borrowed reference
return PyTuple_Pack(3, str_obj, empty, empty);
}
buf1 = PyUnicode_DATA(str_obj);
len1 = PyUnicode_GET_LENGTH(str_obj);
len2 = PyUnicode_GET_LENGTH(sep_obj);
if (kind1 < kind2 || len1 < len2) {
- PyObject *empty = unicode_get_empty(); // Borrowed reference
+ PyObject *empty = _PyUnicode_GetEmpty(); // Borrowed reference
return PyTuple_Pack(3, empty, empty, str_obj);
}
buf1 = PyUnicode_DATA(str_obj);
}
-static inline void
-_PyUnicodeWriter_Update(_PyUnicodeWriter *writer)
-{
- writer->maxchar = PyUnicode_MAX_CHAR_VALUE(writer->buffer);
- writer->data = PyUnicode_DATA(writer->buffer);
-
- if (!writer->readonly) {
- writer->kind = PyUnicode_KIND(writer->buffer);
- writer->size = PyUnicode_GET_LENGTH(writer->buffer);
- }
- else {
- /* use a value smaller than PyUnicode_1BYTE_KIND() so
- _PyUnicodeWriter_PrepareKind() will copy the buffer. */
- writer->kind = 0;
- assert(writer->kind <= PyUnicode_1BYTE_KIND);
-
- /* Copy-on-write mode: set buffer size to 0 so
- * _PyUnicodeWriter_Prepare() will copy (and enlarge) the buffer on
- * next write. */
- writer->size = 0;
- }
-}
-
-
-void
-_PyUnicodeWriter_Init(_PyUnicodeWriter *writer)
-{
- memset(writer, 0, sizeof(*writer));
-
- /* ASCII is the bare minimum */
- writer->min_char = 127;
-
- /* use a kind value smaller than PyUnicode_1BYTE_KIND so
- _PyUnicodeWriter_PrepareKind() will copy the buffer. */
- assert(writer->kind == 0);
- assert(writer->kind < PyUnicode_1BYTE_KIND);
-}
-
-
-PyUnicodeWriter*
-PyUnicodeWriter_Create(Py_ssize_t length)
-{
- if (length < 0) {
- PyErr_SetString(PyExc_ValueError,
- "length must be positive");
- return NULL;
- }
-
- const size_t size = sizeof(_PyUnicodeWriter);
- PyUnicodeWriter *pub_writer;
- pub_writer = _Py_FREELIST_POP_MEM(unicode_writers);
- if (pub_writer == NULL) {
- pub_writer = (PyUnicodeWriter *)PyMem_Malloc(size);
- if (pub_writer == NULL) {
- return (PyUnicodeWriter *)PyErr_NoMemory();
- }
- }
- _PyUnicodeWriter *writer = (_PyUnicodeWriter *)pub_writer;
-
- _PyUnicodeWriter_Init(writer);
- if (_PyUnicodeWriter_Prepare(writer, length, 127) < 0) {
- PyUnicodeWriter_Discard(pub_writer);
- return NULL;
- }
- writer->overallocate = 1;
-
- return pub_writer;
-}
-
-
-void PyUnicodeWriter_Discard(PyUnicodeWriter *writer)
-{
- if (writer == NULL) {
- return;
- }
- _PyUnicodeWriter_Dealloc((_PyUnicodeWriter*)writer);
- _Py_FREELIST_FREE(unicode_writers, writer, PyMem_Free);
-}
-
-
-// Initialize _PyUnicodeWriter with initial buffer
-static inline void
-_PyUnicodeWriter_InitWithBuffer(_PyUnicodeWriter *writer, PyObject *buffer)
-{
- memset(writer, 0, sizeof(*writer));
- writer->buffer = buffer;
- _PyUnicodeWriter_Update(writer);
- writer->min_length = writer->size;
-}
-
-
-int
-_PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer,
- Py_ssize_t length, Py_UCS4 maxchar)
-{
- Py_ssize_t newlen;
- PyObject *newbuffer;
-
- assert(length >= 0);
- assert(maxchar <= MAX_UNICODE);
-
- /* ensure that the _PyUnicodeWriter_Prepare macro was used */
- assert((maxchar > writer->maxchar && length >= 0)
- || length > 0);
-
- if (length > PY_SSIZE_T_MAX - writer->pos) {
- PyErr_NoMemory();
- return -1;
- }
- newlen = writer->pos + length;
-
- maxchar = Py_MAX(maxchar, writer->min_char);
-
- if (writer->buffer == NULL) {
- assert(!writer->readonly);
- if (writer->overallocate
- && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
- /* overallocate to limit the number of realloc() */
- newlen += newlen / OVERALLOCATE_FACTOR;
- }
- if (newlen < writer->min_length)
- newlen = writer->min_length;
-
- writer->buffer = PyUnicode_New(newlen, maxchar);
- if (writer->buffer == NULL)
- return -1;
- }
- else if (newlen > writer->size) {
- if (writer->overallocate
- && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
- /* overallocate to limit the number of realloc() */
- newlen += newlen / OVERALLOCATE_FACTOR;
- }
- if (newlen < writer->min_length)
- newlen = writer->min_length;
-
- if (maxchar > writer->maxchar || writer->readonly) {
- /* resize + widen */
- maxchar = Py_MAX(maxchar, writer->maxchar);
- newbuffer = PyUnicode_New(newlen, maxchar);
- if (newbuffer == NULL)
- return -1;
- _PyUnicode_FastCopyCharacters(newbuffer, 0,
- writer->buffer, 0, writer->pos);
- Py_DECREF(writer->buffer);
- writer->readonly = 0;
- }
- else {
- newbuffer = resize_compact(writer->buffer, newlen);
- if (newbuffer == NULL)
- return -1;
- }
- writer->buffer = newbuffer;
- }
- else if (maxchar > writer->maxchar) {
- assert(!writer->readonly);
- newbuffer = PyUnicode_New(writer->size, maxchar);
- if (newbuffer == NULL)
- return -1;
- _PyUnicode_FastCopyCharacters(newbuffer, 0,
- writer->buffer, 0, writer->pos);
- Py_SETREF(writer->buffer, newbuffer);
- }
- _PyUnicodeWriter_Update(writer);
- return 0;
-
-#undef OVERALLOCATE_FACTOR
-}
-
-int
-_PyUnicodeWriter_PrepareKindInternal(_PyUnicodeWriter *writer,
- int kind)
-{
- Py_UCS4 maxchar;
-
- /* ensure that the _PyUnicodeWriter_PrepareKind macro was used */
- assert(writer->kind < kind);
-
- switch (kind)
- {
- case PyUnicode_1BYTE_KIND: maxchar = 0xff; break;
- case PyUnicode_2BYTE_KIND: maxchar = 0xffff; break;
- case PyUnicode_4BYTE_KIND: maxchar = MAX_UNICODE; break;
- default:
- Py_UNREACHABLE();
- }
-
- return _PyUnicodeWriter_PrepareInternal(writer, 0, maxchar);
-}
-
-int
-_PyUnicodeWriter_WriteChar(_PyUnicodeWriter *writer, Py_UCS4 ch)
-{
- return _PyUnicodeWriter_WriteCharInline(writer, ch);
-}
-
-int
-PyUnicodeWriter_WriteChar(PyUnicodeWriter *writer, Py_UCS4 ch)
-{
- if (ch > MAX_UNICODE) {
- PyErr_SetString(PyExc_ValueError,
- "character must be in range(0x110000)");
- return -1;
- }
-
- return _PyUnicodeWriter_WriteChar((_PyUnicodeWriter*)writer, ch);
-}
-
-int
-_PyUnicodeWriter_WriteStr(_PyUnicodeWriter *writer, PyObject *str)
-{
- assert(PyUnicode_Check(str));
-
- Py_UCS4 maxchar;
- Py_ssize_t len;
-
- len = PyUnicode_GET_LENGTH(str);
- if (len == 0)
- return 0;
- maxchar = PyUnicode_MAX_CHAR_VALUE(str);
- if (maxchar > writer->maxchar || len > writer->size - writer->pos) {
- if (writer->buffer == NULL && !writer->overallocate) {
- assert(_PyUnicode_CheckConsistency(str, 1));
- writer->readonly = 1;
- writer->buffer = Py_NewRef(str);
- _PyUnicodeWriter_Update(writer);
- writer->pos += len;
- return 0;
- }
- if (_PyUnicodeWriter_PrepareInternal(writer, len, maxchar) == -1)
- return -1;
- }
- _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
- str, 0, len);
- writer->pos += len;
- return 0;
-}
-
-int
-PyUnicodeWriter_WriteStr(PyUnicodeWriter *writer, PyObject *obj)
-{
- PyTypeObject *type = Py_TYPE(obj);
- if (type == &PyUnicode_Type) {
- return _PyUnicodeWriter_WriteStr((_PyUnicodeWriter*)writer, obj);
- }
-
- if (type == &PyLong_Type) {
- return _PyLong_FormatWriter((_PyUnicodeWriter*)writer, obj, 10, 0);
- }
-
- PyObject *str = PyObject_Str(obj);
- if (str == NULL) {
- return -1;
- }
-
- int res = _PyUnicodeWriter_WriteStr((_PyUnicodeWriter*)writer, str);
- Py_DECREF(str);
- return res;
-}
-
-
-int
-PyUnicodeWriter_WriteRepr(PyUnicodeWriter *writer, PyObject *obj)
-{
- if (Py_TYPE(obj) == &PyLong_Type) {
- return _PyLong_FormatWriter((_PyUnicodeWriter*)writer, obj, 10, 0);
- }
-
- PyObject *repr = PyObject_Repr(obj);
- if (repr == NULL) {
- return -1;
- }
-
- int res = _PyUnicodeWriter_WriteStr((_PyUnicodeWriter*)writer, repr);
- Py_DECREF(repr);
- return res;
-}
-
-
-int
-_PyUnicodeWriter_WriteSubstring(_PyUnicodeWriter *writer, PyObject *str,
- Py_ssize_t start, Py_ssize_t end)
-{
- assert(0 <= start);
- assert(end <= PyUnicode_GET_LENGTH(str));
- assert(start <= end);
-
- if (start == 0 && end == PyUnicode_GET_LENGTH(str))
- return _PyUnicodeWriter_WriteStr(writer, str);
-
- Py_ssize_t len = end - start;
- if (len == 0) {
- return 0;
- }
-
- Py_UCS4 maxchar;
- if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar) {
- maxchar = _PyUnicode_FindMaxChar(str, start, end);
- }
- else {
- maxchar = writer->maxchar;
- }
- if (_PyUnicodeWriter_Prepare(writer, len, maxchar) < 0) {
- return -1;
- }
-
- _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
- str, start, len);
- writer->pos += len;
- return 0;
-}
-
-
-int
-PyUnicodeWriter_WriteSubstring(PyUnicodeWriter *writer, PyObject *str,
- Py_ssize_t start, Py_ssize_t end)
-{
- if (!PyUnicode_Check(str)) {
- PyErr_Format(PyExc_TypeError, "expect str, not %T", str);
- return -1;
- }
- if (start < 0 || start > end) {
- PyErr_Format(PyExc_ValueError, "invalid start argument");
- return -1;
- }
- if (end > PyUnicode_GET_LENGTH(str)) {
- PyErr_Format(PyExc_ValueError, "invalid end argument");
- return -1;
- }
-
- return _PyUnicodeWriter_WriteSubstring((_PyUnicodeWriter*)writer, str,
- start, end);
-}
-
-
-int
-_PyUnicodeWriter_WriteASCIIString(_PyUnicodeWriter *writer,
- const char *ascii, Py_ssize_t len)
-{
- if (len == -1)
- len = strlen(ascii);
-
- assert(ucs1lib_find_max_char((const Py_UCS1*)ascii, (const Py_UCS1*)ascii + len) < 128);
-
- if (writer->buffer == NULL && !writer->overallocate) {
- PyObject *str;
-
- str = _PyUnicode_FromASCII(ascii, len);
- if (str == NULL)
- return -1;
-
- writer->readonly = 1;
- writer->buffer = str;
- _PyUnicodeWriter_Update(writer);
- writer->pos += len;
- return 0;
- }
-
- if (_PyUnicodeWriter_Prepare(writer, len, 127) == -1)
- return -1;
-
- switch (writer->kind)
- {
- case PyUnicode_1BYTE_KIND:
- {
- const Py_UCS1 *str = (const Py_UCS1 *)ascii;
- Py_UCS1 *data = writer->data;
-
- memcpy(data + writer->pos, str, len);
- break;
- }
- case PyUnicode_2BYTE_KIND:
- {
- _PyUnicode_CONVERT_BYTES(
- Py_UCS1, Py_UCS2,
- ascii, ascii + len,
- (Py_UCS2 *)writer->data + writer->pos);
- break;
- }
- case PyUnicode_4BYTE_KIND:
- {
- _PyUnicode_CONVERT_BYTES(
- Py_UCS1, Py_UCS4,
- ascii, ascii + len,
- (Py_UCS4 *)writer->data + writer->pos);
- break;
- }
- default:
- Py_UNREACHABLE();
- }
-
- writer->pos += len;
- return 0;
-}
-
-
-int
-PyUnicodeWriter_WriteASCII(PyUnicodeWriter *writer,
- const char *str,
- Py_ssize_t size)
-{
- assert(writer != NULL);
- _Py_AssertHoldsTstate();
-
- _PyUnicodeWriter *priv_writer = (_PyUnicodeWriter*)writer;
- return _PyUnicodeWriter_WriteASCIIString(priv_writer, str, size);
-}
-
-
-int
-PyUnicodeWriter_WriteUTF8(PyUnicodeWriter *writer,
- const char *str,
- Py_ssize_t size)
-{
- if (size < 0) {
- size = strlen(str);
- }
-
- _PyUnicodeWriter *_writer = (_PyUnicodeWriter*)writer;
- Py_ssize_t old_pos = _writer->pos;
- int res = unicode_decode_utf8_writer(_writer, str, size,
- _Py_ERROR_STRICT, NULL, NULL);
- if (res < 0) {
- _writer->pos = old_pos;
- }
- return res;
-}
-
-
-int
-PyUnicodeWriter_DecodeUTF8Stateful(PyUnicodeWriter *writer,
- const char *string,
- Py_ssize_t length,
- const char *errors,
- Py_ssize_t *consumed)
-{
- if (length < 0) {
- length = strlen(string);
- }
-
- _PyUnicodeWriter *_writer = (_PyUnicodeWriter*)writer;
- Py_ssize_t old_pos = _writer->pos;
- int res = unicode_decode_utf8_writer(_writer, string, length,
- _Py_ERROR_UNKNOWN, errors, consumed);
- if (res < 0) {
- _writer->pos = old_pos;
- if (consumed) {
- *consumed = 0;
- }
- }
- return res;
-}
-
-
-int
-_PyUnicodeWriter_WriteLatin1String(_PyUnicodeWriter *writer,
- const char *str, Py_ssize_t len)
-{
- Py_UCS4 maxchar;
-
- maxchar = ucs1lib_find_max_char((const Py_UCS1*)str, (const Py_UCS1*)str + len);
- if (_PyUnicodeWriter_Prepare(writer, len, maxchar) == -1)
- return -1;
- unicode_write_cstr(writer->buffer, writer->pos, str, len);
- writer->pos += len;
- return 0;
-}
-
-PyObject *
-_PyUnicodeWriter_Finish(_PyUnicodeWriter *writer)
-{
- PyObject *str;
-
- if (writer->pos == 0) {
- Py_CLEAR(writer->buffer);
- _Py_RETURN_UNICODE_EMPTY();
- }
-
- str = writer->buffer;
- writer->buffer = NULL;
-
- if (writer->readonly) {
- assert(PyUnicode_GET_LENGTH(str) == writer->pos);
- return str;
- }
-
- if (PyUnicode_GET_LENGTH(str) != writer->pos) {
- PyObject *str2;
- str2 = resize_compact(str, writer->pos);
- if (str2 == NULL) {
- Py_DECREF(str);
- return NULL;
- }
- str = str2;
- }
-
- assert(_PyUnicode_CheckConsistency(str, 1));
- return unicode_result(str);
-}
-
-
-PyObject*
-PyUnicodeWriter_Finish(PyUnicodeWriter *writer)
-{
- PyObject *str = _PyUnicodeWriter_Finish((_PyUnicodeWriter*)writer);
- assert(((_PyUnicodeWriter*)writer)->buffer == NULL);
- _Py_FREELIST_FREE(unicode_writers, writer, PyMem_Free);
- return str;
-}
-
-
-void
-_PyUnicodeWriter_Dealloc(_PyUnicodeWriter *writer)
-{
- Py_CLEAR(writer->buffer);
-}
-
#include "stringlib/unicode_format.h"
PyDoc_STRVAR(format__doc__,
{
PyObject *unicode;
if (x == NULL) {
- unicode = unicode_get_empty();
+ unicode = _PyUnicode_GetEmpty();
}
else if (encoding == NULL && errors == NULL) {
unicode = PyObject_Str(x);
return NULL;
}
if (nargs == 0) {
- return unicode_get_empty();
+ return _PyUnicode_GetEmpty();
}
PyObject *object = args[0];
if (nargs == 1) {
if (it->it_seq != NULL) {
return Py_BuildValue("N(O)n", iter, it->it_seq, it->it_index);
} else {
- PyObject *u = unicode_get_empty();
+ PyObject *u = _PyUnicode_GetEmpty();
if (u == NULL) {
Py_XDECREF(iter);
return NULL;