gh-110289: C API: Add PyUnicode_EqualToUTF8() and PyUnicode_EqualToUTF8AndSize()...

author Serhiy Storchaka <storchaka@gmail.com>

Wed, 11 Oct 2023 13:41:58 +0000 (16:41 +0300)

committer GitHub <noreply@github.com>

Wed, 11 Oct 2023 13:41:58 +0000 (16:41 +0300)
author Serhiy Storchaka <storchaka@gmail.com>
Wed, 11 Oct 2023 13:41:58 +0000 (16:41 +0300)
committer GitHub <noreply@github.com>
Wed, 11 Oct 2023 13:41:58 +0000 (16:41 +0300)
diff --git a/Doc/c-api/unicode.rst b/Doc/c-api/unicode.rst

index 2a2cb1b8c458e7236c018b6a5dfa4e4d2f178340..5ab9f1cab23ef84b64e63f4993247419b17889f8 100644 (file)
--- a/Doc/c-api/unicode.rst
+++ b/Doc/c-api/unicode.rst
@@ -1396,6 +1396,28 @@ They all return ``NULL`` or ``-1`` if an exception occurs.
     :c:func:`PyErr_Occurred` to check for errors.
  
  
+.. c:function:: int PyUnicode_EqualToUTF8AndSize(PyObject *unicode, const char *string, Py_ssize_t size)
+
+   Compare a Unicode object with a char buffer which is interpreted as
+   being UTF-8 or ASCII encoded and return true (``1``) if they are equal,
+   or false (``0``) otherwise.
+   If the Unicode object contains surrogate characters or
+   the C string is not valid UTF-8, false (``0``) is returned.
+
+   This function does not raise exceptions.
+
+   .. versionadded:: 3.13
+
+
+.. c:function:: int PyUnicode_EqualToUTF8(PyObject *unicode, const char *string)
+
+   Similar to :c:func:`PyUnicode_EqualToUTF8AndSize`, but compute *string*
+   length using :c:func:`!strlen`.
+   If the Unicode object contains null characters, false (``0``) is returned.
+
+   .. versionadded:: 3.13
+
+
  .. c:function:: int PyUnicode_CompareWithASCIIString(PyObject *uni, const char *string)
  
     Compare a Unicode object, *uni*, with *string* and return ``-1``, ``0``, ``1`` for less
diff --git a/Doc/data/stable_abi.dat b/Doc/data/stable_abi.dat

index 5bccd5edf586f4f6ce59edd196f9bd2b1ed10f7e..6ec9c907254b04c0075f295689896e4016a311bc 100644 (file)
--- a/Doc/data/stable_abi.dat
+++ b/Doc/data/stable_abi.dat
@@ -755,6 +755,8 @@ function,PyUnicode_DecodeUnicodeEscape,3.2,,
  function,PyUnicode_EncodeCodePage,3.7,on Windows,
  function,PyUnicode_EncodeFSDefault,3.2,,
  function,PyUnicode_EncodeLocale,3.7,,
+function,PyUnicode_EqualToUTF8,3.13,,
+function,PyUnicode_EqualToUTF8AndSize,3.13,,
  function,PyUnicode_FSConverter,3.2,,
  function,PyUnicode_FSDecoder,3.2,,
  function,PyUnicode_Find,3.2,,
diff --git a/Doc/whatsnew/3.13.rst b/Doc/whatsnew/3.13.rst

index 8b67c2737cde5d1976c6ff47cc46849ec3bfa24a..bbc1fecf4964d8479872d53a67bb957b07f5064a 100644 (file)
--- a/Doc/whatsnew/3.13.rst
+++ b/Doc/whatsnew/3.13.rst
@@ -1024,6 +1024,12 @@ New Features
    functions on Python 3.11 and 3.12.
    (Contributed by Victor Stinner in :gh:`107073`.)
  
+* Add :c:func:`PyUnicode_EqualToUTF8AndSize` and :c:func:`PyUnicode_EqualToUTF8`
+  functions: compare a Unicode object with a UTF-8 encoded :c:expr:`const char*`
+  string and return true (``1``) if they are equal, or false (``0``) otherwise.
+  These functions do not raise exceptions.
+  (Contributed by Serhiy Storchaka in :gh:`110289`.)
+
  * Add :c:func:`PyThreadState_GetUnchecked()` function: similar to
    :c:func:`PyThreadState_Get()`, but don't kill the process with a fatal error
    if it is NULL. The caller is responsible to check if the result is NULL.
diff --git a/Include/unicodeobject.h b/Include/unicodeobject.h

index f00277787122aa1921e654d9f4376613cafd2722..dee00715b3c51d576da7bfbcd84ae362f5f14a0b 100644 (file)
--- a/Include/unicodeobject.h
+++ b/Include/unicodeobject.h
@@ -957,6 +957,15 @@ PyAPI_FUNC(int) PyUnicode_CompareWithASCIIString(
      const char *right           /* ASCII-encoded string */
      );
  
+#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x030D0000
+/* Compare a Unicode object with a UTF-8 encoded C string (null-terminated,
+   or with an explicit byte size for the AndSize variant). Return 1 if they
+   are equal, or 0 otherwise. These functions do not raise exceptions. */
+
+PyAPI_FUNC(int) PyUnicode_EqualToUTF8(PyObject *, const char *);
+PyAPI_FUNC(int) PyUnicode_EqualToUTF8AndSize(PyObject *, const char *, Py_ssize_t);
+#endif
+
  /* Rich compare two strings and return one of the following:
  
     - NULL in case an exception was raised
diff --git a/Lib/test/test_capi/test_unicode.py b/Lib/test/test_capi/test_unicode.py

index 622ee8993907fa053634d61534447b427921b4c4..a73e669dda7ddce0c708fa47494144e81e213ede 100644 (file)
--- a/Lib/test/test_capi/test_unicode.py
+++ b/Lib/test/test_capi/test_unicode.py
@@ -1297,6 +1297,118 @@ class CAPITest(unittest.TestCase):
          # CRASHES comparewithasciistring([], b'abc')
          # CRASHES comparewithasciistring(NULL, b'abc')
  
+    @support.cpython_only
+    @unittest.skipIf(_testcapi is None, 'need _testcapi module')
+    def test_equaltoutf8(self):
+        # Test PyUnicode_EqualToUTF8(), which reads the bytes as a null-terminated C string.
+        from _testcapi import unicode_equaltoutf8 as equaltoutf8
+        from _testcapi import unicode_asutf8andsize as asutf8andsize
+
+        strings = [
+            'abc', '\xa1\xa2\xa3', '\u4f60\u597d\u4e16',
+            '\U0001f600\U0001f601\U0001f602',
+            '\U0010ffff',
+        ]
+        for s in strings:
+            # Call PyUnicode_AsUTF8AndSize() which creates the UTF-8
+            # encoded string cached in the Unicode object.
+            asutf8andsize(s, 0)
+            b = s.encode()
+            self.assertEqual(equaltoutf8(s, b), 1)  # Use the UTF-8 cache.
+            s2 = b.decode()  # New Unicode object without the UTF-8 cache.
+            self.assertEqual(equaltoutf8(s2, b), 1)
+            self.assertEqual(equaltoutf8(s + 'x', b + b'x'), 1)
+            self.assertEqual(equaltoutf8(s + 'x', b + b'y'), 0)
+            self.assertEqual(equaltoutf8(s, b + b'\0'), 1)  # The C string ends at the null byte.
+            self.assertEqual(equaltoutf8(s2, b + b'\0'), 1)
+            self.assertEqual(equaltoutf8(s + '\0', b + b'\0'), 0)  # A null character in the str never matches.
+            self.assertEqual(equaltoutf8(s + '\0', b), 0)
+            self.assertEqual(equaltoutf8(s2, b + b'x'), 0)
+            self.assertEqual(equaltoutf8(s2, b[:-1]), 0)  # Last byte dropped.
+            self.assertEqual(equaltoutf8(s2, b[:-1] + b'x'), 0)
+
+        self.assertEqual(equaltoutf8('', b''), 1)
+        self.assertEqual(equaltoutf8('', b'\0'), 1)
+
+        # Embedded null chars/bytes: bytes after the first null byte are not compared.
+        self.assertEqual(equaltoutf8('abc', b'abc\0def\0'), 1)
+        self.assertEqual(equaltoutf8('a\0bc', b'abc'), 0)
+        self.assertEqual(equaltoutf8('abc', b'a\0bc'), 0)
+
+        # Surrogate characters are always treated as not equal
+        self.assertEqual(equaltoutf8('\udcfe',
+                            '\udcfe'.encode("utf8", "surrogateescape")), 0)
+        self.assertEqual(equaltoutf8('\udcfe',
+                            '\udcfe'.encode("utf8", "surrogatepass")), 0)
+        self.assertEqual(equaltoutf8('\ud801',
+                            '\ud801'.encode("utf8", "surrogatepass")), 0)
+
+    @support.cpython_only
+    @unittest.skipIf(_testcapi is None, 'need _testcapi module')
+    def test_equaltoutf8andsize(self):
+        # Test PyUnicode_EqualToUTF8AndSize(); without a size argument the wrapper passes len(bytes).
+        from _testcapi import unicode_equaltoutf8andsize as equaltoutf8andsize
+        from _testcapi import unicode_asutf8andsize as asutf8andsize
+
+        strings = [
+            'abc', '\xa1\xa2\xa3', '\u4f60\u597d\u4e16',
+            '\U0001f600\U0001f601\U0001f602',
+            '\U0010ffff',
+        ]
+        for s in strings:
+            # Call PyUnicode_AsUTF8AndSize() which creates the UTF-8
+            # encoded string cached in the Unicode object.
+            asutf8andsize(s, 0)
+            b = s.encode()
+            self.assertEqual(equaltoutf8andsize(s, b), 1)  # Use the UTF-8 cache.
+            s2 = b.decode()  # New Unicode object without the UTF-8 cache.
+            self.assertEqual(equaltoutf8andsize(s2, b), 1)
+            self.assertEqual(equaltoutf8andsize(s + 'x', b + b'x'), 1)
+            self.assertEqual(equaltoutf8andsize(s + 'x', b + b'y'), 0)
+            self.assertEqual(equaltoutf8andsize(s, b + b'\0'), 0)  # Null bytes are ordinary data here.
+            self.assertEqual(equaltoutf8andsize(s2, b + b'\0'), 0)
+            self.assertEqual(equaltoutf8andsize(s + '\0', b + b'\0'), 1)
+            self.assertEqual(equaltoutf8andsize(s + '\0', b), 0)
+            self.assertEqual(equaltoutf8andsize(s2, b + b'x'), 0)
+            self.assertEqual(equaltoutf8andsize(s2, b[:-1]), 0)
+            self.assertEqual(equaltoutf8andsize(s2, b[:-1] + b'x'), 0)
+            # Not null-terminated: the explicit size excludes the trailing b'x'.
+            self.assertEqual(equaltoutf8andsize(s, b + b'x', len(b)), 1)
+            self.assertEqual(equaltoutf8andsize(s2, b + b'x', len(b)), 1)
+            self.assertEqual(equaltoutf8andsize(s + '\0', b + b'\0x', len(b) + 1), 1)
+            self.assertEqual(equaltoutf8andsize(s2, b, len(b) - 1), 0)  # Size cuts off the last byte.
+
+        self.assertEqual(equaltoutf8andsize('', b''), 1)
+        self.assertEqual(equaltoutf8andsize('', b'\0'), 0)
+        self.assertEqual(equaltoutf8andsize('', b'x', 0), 1)
+
+        # embedded null chars/bytes
+        self.assertEqual(equaltoutf8andsize('abc\0def', b'abc\0def'), 1)
+        self.assertEqual(equaltoutf8andsize('abc\0def\0', b'abc\0def\0'), 1)
+
+        # Surrogate characters are always treated as not equal
+        self.assertEqual(equaltoutf8andsize('\udcfe',
+                            '\udcfe'.encode("utf8", "surrogateescape")), 0)
+        self.assertEqual(equaltoutf8andsize('\udcfe',
+                            '\udcfe'.encode("utf8", "surrogatepass")), 0)
+        self.assertEqual(equaltoutf8andsize('\ud801',
+                            '\ud801'.encode("utf8", "surrogatepass")), 0)
+
+        def check_not_equal_encoding(text, encoding):
+            self.assertEqual(equaltoutf8andsize(text, text.encode(encoding)), 0)
+            self.assertNotEqual(text.encode(encoding), text.encode("utf8"))
+
+        # Strings encoded with other encodings are not equal to the expected UTF-8 encoded string
+        check_not_equal_encoding('Stéphane', 'latin1')
+        check_not_equal_encoding('Stéphane', 'utf-16-le')  # embedded null characters
+        check_not_equal_encoding('北京市', 'gbk')
+
+        # CRASHES equaltoutf8andsize('abc', b'abc', -1)
+        # CRASHES equaltoutf8andsize(b'abc', b'abc')
+        # CRASHES equaltoutf8andsize([], b'abc')
+        # CRASHES equaltoutf8andsize(NULL, b'abc')
+        # CRASHES equaltoutf8andsize('abc', NULL)
+
      @support.cpython_only
      @unittest.skipIf(_testcapi is None, 'need _testcapi module')
      def test_richcompare(self):
diff --git a/Lib/test/test_stable_abi_ctypes.py b/Lib/test/test_stable_abi_ctypes.py

index 4691687ed9d391f27b15debf0e13f5e8353d4742..e06f9cabf4366bc2490c62ddc6644d96b8e21ded 100644 (file)
--- a/Lib/test/test_stable_abi_ctypes.py
+++ b/Lib/test/test_stable_abi_ctypes.py
@@ -770,6 +770,8 @@ SYMBOL_NAMES = (
      "PyUnicode_DecodeUnicodeEscape",
      "PyUnicode_EncodeFSDefault",
      "PyUnicode_EncodeLocale",
+    "PyUnicode_EqualToUTF8",
+    "PyUnicode_EqualToUTF8AndSize",
      "PyUnicode_FSConverter",
      "PyUnicode_FSDecoder",
      "PyUnicode_Find",
diff --git a/Misc/NEWS.d/next/C API/2023-10-03-19-01-20.gh-issue-110289.YBIHEz.rst b/Misc/NEWS.d/next/C API/2023-10-03-19-01-20.gh-issue-110289.YBIHEz.rst

new file mode 100644 (file)

index 0000000..9028e35
--- /dev/null
+++ b/Misc/NEWS.d/next/C API/2023-10-03-19-01-20.gh-issue-110289.YBIHEz.rst
@@ -0,0 +1 @@
+Add :c:func:`PyUnicode_EqualToUTF8AndSize` and :c:func:`PyUnicode_EqualToUTF8` functions.
diff --git a/Misc/stable_abi.toml b/Misc/stable_abi.toml

index 469fd27b622344d7573273110e956d9167a9a9e4..9d66b92eb8edf06675da3c43fff162a9c767b95d 100644 (file)
--- a/Misc/stable_abi.toml
+++ b/Misc/stable_abi.toml
@@ -2462,3 +2462,7 @@
      added = '3.13'
  [function.Py_IsFinalizing]
      added = '3.13'
+[function.PyUnicode_EqualToUTF8]
+    added = '3.13'
+[function.PyUnicode_EqualToUTF8AndSize]
+    added = '3.13'
diff --git a/Modules/_testcapi/unicode.c b/Modules/_testcapi/unicode.c

index 232b2ad543fca0c7088b0486b9d87d7bbd24dab4..d52d88a65d86fcc93285fd05d95a5298d80f95d3 100644 (file)
--- a/Modules/_testcapi/unicode.c
+++ b/Modules/_testcapi/unicode.c
@@ -1429,6 +1429,48 @@ unicode_comparewithasciistring(PyObject *self, PyObject *args)
      return PyLong_FromLong(result);
  }
  
+/* Test PyUnicode_EqualToUTF8(): call it with (left, right) and return its result as an int. */
+static PyObject *
+unicode_equaltoutf8(PyObject *self, PyObject *args)
+{
+    PyObject *left;
+    const char *right = NULL;
+    Py_ssize_t right_len;  /* filled by "z#" but not forwarded to the tested function */
+    int result;
+
+    if (!PyArg_ParseTuple(args, "Oz#", &left, &right, &right_len)) {
+        return NULL;
+    }
+
+    NULLABLE(left);  /* NOTE(review): macro defined outside this diff; presumably maps a placeholder to NULL */
+    result = PyUnicode_EqualToUTF8(left, right);
+    assert(!PyErr_Occurred());  /* the tested function must not set an exception */
+    return PyLong_FromLong(result);
+}
+
+/* Test PyUnicode_EqualToUTF8AndSize(): call it with (left, right[, size]) and return its result as an int. */
+static PyObject *
+unicode_equaltoutf8andsize(PyObject *self, PyObject *args)
+{
+    PyObject *left;
+    const char *right = NULL;
+    Py_ssize_t right_len;
+    Py_ssize_t size = -100;  /* sentinel: stays -100 when the optional "n" argument is omitted */
+    int result;
+
+    if (!PyArg_ParseTuple(args, "Oz#|n", &left, &right, &right_len, &size)) {
+        return NULL;
+    }
+
+    NULLABLE(left);  /* NOTE(review): macro defined outside this diff; presumably maps a placeholder to NULL */
+    if (size == -100) {  /* no explicit size given: use the full length of the parsed buffer */
+        size = right_len;
+    }
+    result = PyUnicode_EqualToUTF8AndSize(left, right, size);
+    assert(!PyErr_Occurred());  /* the tested function must not set an exception */
+    return PyLong_FromLong(result);
+}
+
  /* Test PyUnicode_RichCompare() */
  static PyObject *
  unicode_richcompare(PyObject *self, PyObject *args)
@@ -2044,6 +2086,8 @@ static PyMethodDef TestMethods[] = {
      {"unicode_replace",          unicode_replace,                METH_VARARGS},
      {"unicode_compare",          unicode_compare,                METH_VARARGS},
      {"unicode_comparewithasciistring",unicode_comparewithasciistring,METH_VARARGS},
+    {"unicode_equaltoutf8",      unicode_equaltoutf8,            METH_VARARGS},
+    {"unicode_equaltoutf8andsize",unicode_equaltoutf8andsize,    METH_VARARGS},
      {"unicode_richcompare",      unicode_richcompare,            METH_VARARGS},
      {"unicode_format",           unicode_format,                 METH_VARARGS},
      {"unicode_contains",         unicode_contains,               METH_VARARGS},
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c

index 49981a1f881c21188b7582361d9f4c7db70ed920..33cbc987d4328253a65b6e7e618519efd94c9a8f 100644 (file)
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -10673,6 +10673,82 @@ PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
      }
  }
  
+int
+PyUnicode_EqualToUTF8(PyObject *unicode, const char *str)
+{   /* Wrapper for null-terminated input: the byte size passed on is strlen(str). */
+    return PyUnicode_EqualToUTF8AndSize(unicode, str, strlen(str));
+}
+
+int
+PyUnicode_EqualToUTF8AndSize(PyObject *unicode, const char *str, Py_ssize_t size)
+{   /* Return 1 if the first size bytes of str are exactly the UTF-8 encoding of unicode, else 0. */
+    assert(_PyUnicode_CHECK(unicode));  /* preconditions are asserted only, not handled at run time */
+    assert(str);
+    /* ASCII string: compare its 1-byte data with the buffer directly. */
+    if (PyUnicode_IS_ASCII(unicode)) {
+        Py_ssize_t len = PyUnicode_GET_LENGTH(unicode);
+        return size == len &&
+            memcmp(PyUnicode_1BYTE_DATA(unicode), str, len) == 0;
+    }
+    if (PyUnicode_UTF8(unicode) != NULL) {  /* a UTF-8 form is already stored on the object */
+        Py_ssize_t len = PyUnicode_UTF8_LENGTH(unicode);
+        return size == len &&
+            memcmp(PyUnicode_UTF8(unicode), str, len) == 0;
+    }
+    /* Not ASCII and no stored UTF-8: reject sizes <= len or beyond roughly 4 * len up front. */
+    Py_ssize_t len = PyUnicode_GET_LENGTH(unicode);
+    if ((size_t)len >= (size_t)size || (size_t)len < (size_t)size / 4) {
+        return 0;
+    }
+    const unsigned char *s = (const unsigned char *)str;
+    const unsigned char *ends = s + (size_t)size;
+    int kind = PyUnicode_KIND(unicode);
+    const void *data = PyUnicode_DATA(unicode);
+    /* Encode each code point in turn and compare it with the next bytes of the buffer. */
+    for (Py_ssize_t i = 0; i < len; i++) {
+        Py_UCS4 ch = PyUnicode_READ(kind, data, i);
+        if (ch < 0x80) {  /* 1 byte: 0xxxxxxx */
+            if (ends == s || s[0] != ch) {
+                return 0;
+            }
+            s += 1;
+        }
+        else if (ch < 0x800) {  /* 2 bytes: 110xxxxx 10xxxxxx */
+            if ((ends - s) < 2 ||
+                s[0] != (0xc0 | (ch >> 6)) ||
+                s[1] != (0x80 | (ch & 0x3f)))
+            {
+                return 0;
+            }
+            s += 2;
+        }
+        else if (ch < 0x10000) {  /* 3 bytes: 1110xxxx 10xxxxxx 10xxxxxx */
+            if (Py_UNICODE_IS_SURROGATE(ch) ||  /* surrogates never compare equal */
+                (ends - s) < 3 ||
+                s[0] != (0xe0 | (ch >> 12)) ||
+                s[1] != (0x80 | ((ch >> 6) & 0x3f)) ||
+                s[2] != (0x80 | (ch & 0x3f)))
+            {
+                return 0;
+            }
+            s += 3;
+        }
+        else {  /* 4 bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
+            assert(ch <= MAX_UNICODE);
+            if ((ends - s) < 4 ||
+                s[0] != (0xf0 | (ch >> 18)) ||
+                s[1] != (0x80 | ((ch >> 12) & 0x3f)) ||
+                s[2] != (0x80 | ((ch >> 6) & 0x3f)) ||
+                s[3] != (0x80 | (ch & 0x3f)))
+            {
+                return 0;
+            }
+            s += 4;
+        }
+    }
+    return s == ends;  /* equal only if every byte of the buffer was matched */
+}
+
  int
  _PyUnicode_EqualToASCIIString(PyObject *unicode, const char *str)
  {
diff --git a/PC/python3dll.c b/PC/python3dll.c

index 785d6886f39f6d0d647a6391b9030954463b699f..7ee11746770442c6cdcfb8eb88416f33d881a0e0 100755 (executable)
--- a/PC/python3dll.c
+++ b/PC/python3dll.c
@@ -689,6 +689,8 @@ EXPORT_FUNC(PyUnicode_DecodeUTF8Stateful)
  EXPORT_FUNC(PyUnicode_EncodeCodePage)
  EXPORT_FUNC(PyUnicode_EncodeFSDefault)
  EXPORT_FUNC(PyUnicode_EncodeLocale)
+EXPORT_FUNC(PyUnicode_EqualToUTF8)
+EXPORT_FUNC(PyUnicode_EqualToUTF8AndSize)
  EXPORT_FUNC(PyUnicode_Find)
  EXPORT_FUNC(PyUnicode_FindChar)
  EXPORT_FUNC(PyUnicode_Format)
author	Serhiy Storchaka <storchaka@gmail.com>
	Wed, 11 Oct 2023 13:41:58 +0000 (16:41 +0300)
committer	GitHub <noreply@github.com>
	Wed, 11 Oct 2023 13:41:58 +0000 (16:41 +0300)
Doc/c-api/unicode.rst		patch \| blob \| blame \| history
Doc/data/stable_abi.dat		patch \| blob \| blame \| history
Doc/whatsnew/3.13.rst		patch \| blob \| blame \| history
Include/unicodeobject.h		patch \| blob \| blame \| history
Lib/test/test_capi/test_unicode.py		patch \| blob \| blame \| history
Lib/test/test_stable_abi_ctypes.py		patch \| blob \| blame \| history
Misc/NEWS.d/next/C API/2023-10-03-19-01-20.gh-issue-110289.YBIHEz.rst	[new file with mode: 0644]	patch \| blob
Misc/stable_abi.toml		patch \| blob \| blame \| history
Modules/_testcapi/unicode.c		patch \| blob \| blame \| history
Objects/unicodeobject.c		patch \| blob \| blame \| history
PC/python3dll.c		patch \| blob \| blame \| history