git.ipfire.org Git - thirdparty/Python/cpython.git/commitdiff
gh-129117: Add unicodedata.isxidstart() function (#140269)
author: Stan Ulbrych <89152624+StanFromIreland@users.noreply.github.com>
Thu, 30 Oct 2025 10:18:12 +0000 (10:18 +0000)
committer: GitHub <noreply@github.com>
Thu, 30 Oct 2025 10:18:12 +0000 (10:18 +0000)
Expose `_PyUnicode_IsXidContinue/Start` in `unicodedata`:
add isxidstart() and isxidcontinue() functions.

Co-authored-by: Victor Stinner <vstinner@python.org>
13 files changed:
Doc/library/unicodedata.rst
Doc/whatsnew/3.15.rst
Include/internal/pycore_unicodectype.h [new file with mode: 0644]
Include/internal/pycore_unicodeobject.h
Lib/test/test_unicodedata.py
Makefile.pre.in
Misc/NEWS.d/next/Library/2025-10-17-20-42-38.gh-issue-129117.X9jr4p.rst [new file with mode: 0644]
Modules/clinic/unicodedata.c.h
Modules/unicodedata.c
Objects/unicodectype.c
Objects/unicodeobject.c
PCbuild/pythoncore.vcxproj
PCbuild/pythoncore.vcxproj.filters

index 0369cd99c47c1818c3f8665daac91d926e8f39d0..c49bf6417046169ba44c41b6ce2bd91eb0eef1e2 100644 (file)
@@ -144,6 +144,36 @@ following functions:
       1
 
 
+.. function:: isxidstart(chr, /)
+
+   Return ``True`` if *chr* is a valid identifier start per the
+   `Unicode Standard Annex #31 <https://www.unicode.org/reports/tr31/>`_,
+   that is, it has the ``XID_Start`` property. Return ``False`` otherwise.
+   For example::
+
+      >>> unicodedata.isxidstart('S')
+      True
+      >>> unicodedata.isxidstart('0')
+      False
+
+   .. versionadded:: next
+
+
+.. function:: isxidcontinue(chr, /)
+
+   Return ``True`` if *chr* is a valid identifier character per the
+   `Unicode Standard Annex #31 <https://www.unicode.org/reports/tr31/>`_,
+   that is, it has the ``XID_Continue`` property. Return ``False`` otherwise.
+   For example::
+
+      >>> unicodedata.isxidcontinue('S')
+      True
+      >>> unicodedata.isxidcontinue(' ')
+      False
+
+   .. versionadded:: next
+
+
 .. function:: decomposition(chr)
 
    Returns the character decomposition mapping assigned to the character
index 85b4c12544a0c9c9f8fca0c66e79e2754d07938f..fe9adfe9f730ecc1aba479fdb32bd78d7994cbca 100644 (file)
@@ -794,6 +794,11 @@ unicodedata
 
 * The Unicode database has been updated to Unicode 17.0.0.
 
+* Add :func:`unicodedata.isxidstart` and :func:`unicodedata.isxidcontinue`
+  functions to check whether a character can start or continue a
+  `Unicode Standard Annex #31 <https://www.unicode.org/reports/tr31/>`_ identifier.
+  (Contributed by Stan Ulbrych in :gh:`129117`.)
+
 
 wave
 ----
diff --git a/Include/internal/pycore_unicodectype.h b/Include/internal/pycore_unicodectype.h
new file mode 100644 (file)
index 0000000..523bdb5
--- /dev/null
@@ -0,0 +1,25 @@
+#ifndef Py_INTERNAL_UNICODECTYPE_H
+#define Py_INTERNAL_UNICODECTYPE_H
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifndef Py_BUILD_CORE
+#  error "this header requires Py_BUILD_CORE define"
+#endif
+
+extern int _PyUnicode_ToLowerFull(Py_UCS4 ch, Py_UCS4 *res);
+extern int _PyUnicode_ToTitleFull(Py_UCS4 ch, Py_UCS4 *res);
+extern int _PyUnicode_ToUpperFull(Py_UCS4 ch, Py_UCS4 *res);
+extern int _PyUnicode_ToFoldedFull(Py_UCS4 ch, Py_UCS4 *res);
+extern int _PyUnicode_IsCaseIgnorable(Py_UCS4 ch);
+extern int _PyUnicode_IsCased(Py_UCS4 ch);
+
+// Export for 'unicodedata' shared extension.
+PyAPI_FUNC(int) _PyUnicode_IsXidStart(Py_UCS4 ch);
+PyAPI_FUNC(int) _PyUnicode_IsXidContinue(Py_UCS4 ch);
+
+#ifdef __cplusplus
+}
+#endif
+#endif /* !Py_INTERNAL_UNICODECTYPE_H */
index b83039c1869f234d95aa4f1f11db8e59bb2dd4a6..f384fad8713adcc99e72f25eb4f4d2d536f60645 100644 (file)
@@ -74,18 +74,6 @@ _PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch)
     return 0;
 }
 
-
-/* --- Characters Type APIs ----------------------------------------------- */
-
-extern int _PyUnicode_IsXidStart(Py_UCS4 ch);
-extern int _PyUnicode_IsXidContinue(Py_UCS4 ch);
-extern int _PyUnicode_ToLowerFull(Py_UCS4 ch, Py_UCS4 *res);
-extern int _PyUnicode_ToTitleFull(Py_UCS4 ch, Py_UCS4 *res);
-extern int _PyUnicode_ToUpperFull(Py_UCS4 ch, Py_UCS4 *res);
-extern int _PyUnicode_ToFoldedFull(Py_UCS4 ch, Py_UCS4 *res);
-extern int _PyUnicode_IsCaseIgnorable(Py_UCS4 ch);
-extern int _PyUnicode_IsCased(Py_UCS4 ch);
-
 /* --- Unicode API -------------------------------------------------------- */
 
 // Export for '_json' shared extension
index 8013eaf6e9d85191c7d4403d413960978258406a..a3c22a4f27ee77e24675682fcbc16d00b62c91ff 100644 (file)
@@ -276,6 +276,33 @@ class UnicodeFunctionsTest(UnicodeDatabaseTest):
         self.assertEqual(self.db.ucd_3_2_0.east_asian_width('\u231a'), 'N')
         self.assertEqual(self.db.east_asian_width('\u231a'), 'W')
 
+    def test_isxidstart(self):
+        self.assertTrue(self.db.isxidstart('S'))
+        self.assertTrue(self.db.isxidstart('\u0AD0'))  # GUJARATI OM
+        self.assertTrue(self.db.isxidstart('\u0EC6'))  # LAO KO LA
+        self.assertTrue(self.db.isxidstart('\u17DC'))  # KHMER SIGN AVAKRAHASANYA
+        self.assertTrue(self.db.isxidstart('\uA015'))  # YI SYLLABLE WU
+        self.assertTrue(self.db.isxidstart('\uFE7B'))  # ARABIC KASRA MEDIAL FORM
+
+        self.assertFalse(self.db.isxidstart(' '))
+        self.assertFalse(self.db.isxidstart('0'))
+        self.assertRaises(TypeError, self.db.isxidstart)
+        self.assertRaises(TypeError, self.db.isxidstart, 'xx')
+
+    def test_isxidcontinue(self):
+        self.assertTrue(self.db.isxidcontinue('S'))
+        self.assertTrue(self.db.isxidcontinue('_'))
+        self.assertTrue(self.db.isxidcontinue('0'))
+        self.assertTrue(self.db.isxidcontinue('\u00BA'))  # MASCULINE ORDINAL INDICATOR
+        self.assertTrue(self.db.isxidcontinue('\u0640'))  # ARABIC TATWEEL
+        self.assertTrue(self.db.isxidcontinue('\u0710'))  # SYRIAC LETTER ALAPH
+        self.assertTrue(self.db.isxidcontinue('\u0B3E'))  # ORIYA VOWEL SIGN AA
+        self.assertTrue(self.db.isxidcontinue('\u17D7'))  # KHMER SIGN LEK TOO
+
+        self.assertFalse(self.db.isxidcontinue(' '))
+        self.assertRaises(TypeError, self.db.isxidcontinue)
+        self.assertRaises(TypeError, self.db.isxidcontinue, 'xx')
+
 class UnicodeMiscTest(UnicodeDatabaseTest):
 
     @cpython_only
index 19423c11545c1984a9592088a32353365ef7385c..0a1b8d028addad5d798716c6aa644eabc80add37 100644 (file)
@@ -1433,6 +1433,7 @@ PYTHON_HEADERS= \
                $(srcdir)/Include/internal/pycore_typeobject.h \
                $(srcdir)/Include/internal/pycore_typevarobject.h \
                $(srcdir)/Include/internal/pycore_ucnhash.h \
+               $(srcdir)/Include/internal/pycore_unicodectype.h \
                $(srcdir)/Include/internal/pycore_unicodeobject.h \
                $(srcdir)/Include/internal/pycore_unicodeobject_generated.h \
                $(srcdir)/Include/internal/pycore_unionobject.h \
diff --git a/Misc/NEWS.d/next/Library/2025-10-17-20-42-38.gh-issue-129117.X9jr4p.rst b/Misc/NEWS.d/next/Library/2025-10-17-20-42-38.gh-issue-129117.X9jr4p.rst
new file mode 100644 (file)
index 0000000..8767b1b
--- /dev/null
@@ -0,0 +1,3 @@
+:mod:`unicodedata`: Add :func:`~unicodedata.isxidstart` and
+:func:`~unicodedata.isxidcontinue` functions to check whether a character can
+start or continue a `Unicode Standard Annex #31 <https://www.unicode.org/reports/tr31/>`_ identifier.
index 345440eeee89a688c212f90d286f606bfb7a8588..5fcba083c2f4ce44795bc96990a8c31143b87a4c 100644 (file)
@@ -518,6 +518,78 @@ exit:
     return return_value;
 }
 
+PyDoc_STRVAR(unicodedata_UCD_isxidstart__doc__,
+"isxidstart($self, chr, /)\n"
+"--\n"
+"\n"
+"Return True if the character has the XID_Start property, else False.");
+
+#define UNICODEDATA_UCD_ISXIDSTART_METHODDEF    \
+    {"isxidstart", (PyCFunction)unicodedata_UCD_isxidstart, METH_O, unicodedata_UCD_isxidstart__doc__},
+
+static PyObject *
+unicodedata_UCD_isxidstart_impl(PyObject *self, int chr);
+
+static PyObject *
+unicodedata_UCD_isxidstart(PyObject *self, PyObject *arg)
+{
+    PyObject *return_value = NULL;
+    int chr;
+
+    if (!PyUnicode_Check(arg)) {
+        _PyArg_BadArgument("isxidstart", "argument", "a unicode character", arg);
+        goto exit;
+    }
+    if (PyUnicode_GET_LENGTH(arg) != 1) {
+        PyErr_Format(PyExc_TypeError,
+            "isxidstart(): argument must be a unicode character, "
+            "not a string of length %zd",
+            PyUnicode_GET_LENGTH(arg));
+        goto exit;
+    }
+    chr = PyUnicode_READ_CHAR(arg, 0);
+    return_value = unicodedata_UCD_isxidstart_impl(self, chr);
+
+exit:
+    return return_value;
+}
+
+PyDoc_STRVAR(unicodedata_UCD_isxidcontinue__doc__,
+"isxidcontinue($self, chr, /)\n"
+"--\n"
+"\n"
+"Return True if the character has the XID_Continue property, else False.");
+
+#define UNICODEDATA_UCD_ISXIDCONTINUE_METHODDEF    \
+    {"isxidcontinue", (PyCFunction)unicodedata_UCD_isxidcontinue, METH_O, unicodedata_UCD_isxidcontinue__doc__},
+
+static PyObject *
+unicodedata_UCD_isxidcontinue_impl(PyObject *self, int chr);
+
+static PyObject *
+unicodedata_UCD_isxidcontinue(PyObject *self, PyObject *arg)
+{
+    PyObject *return_value = NULL;
+    int chr;
+
+    if (!PyUnicode_Check(arg)) {
+        _PyArg_BadArgument("isxidcontinue", "argument", "a unicode character", arg);
+        goto exit;
+    }
+    if (PyUnicode_GET_LENGTH(arg) != 1) {
+        PyErr_Format(PyExc_TypeError,
+            "isxidcontinue(): argument must be a unicode character, "
+            "not a string of length %zd",
+            PyUnicode_GET_LENGTH(arg));
+        goto exit;
+    }
+    chr = PyUnicode_READ_CHAR(arg, 0);
+    return_value = unicodedata_UCD_isxidcontinue_impl(self, chr);
+
+exit:
+    return return_value;
+}
+
 PyDoc_STRVAR(unicodedata_UCD_lookup__doc__,
 "lookup($self, name, /)\n"
 "--\n"
@@ -549,4 +621,4 @@ unicodedata_UCD_lookup(PyObject *self, PyObject *arg)
 exit:
     return return_value;
 }
-/*[clinic end generated code: output=8a59d430cee41058 input=a9049054013a1b77]*/
+/*[clinic end generated code: output=c5e56c8f6bb80f93 input=a9049054013a1b77]*/
index a3699beff7da0171d484e3caa74092f09e7033a0..a6094676d4194cca15e8b1293039ef22340edb8d 100644 (file)
@@ -19,6 +19,7 @@
 #include "Python.h"
 #include "pycore_object.h"        // _PyObject_VisitType()
 #include "pycore_ucnhash.h"       // _PyUnicode_Name_CAPI
+#include "pycore_unicodectype.h"  // _PyUnicode_IsXidStart()
 
 #include <stdbool.h>
 #include <stddef.h>               // offsetof()
@@ -1525,6 +1526,58 @@ unicodedata_UCD_name_impl(PyObject *self, int chr, PyObject *default_value)
     return PyUnicode_FromString(name);
 }
 
+/*[clinic input]
+unicodedata.UCD.isxidstart
+
+    self: self
+    chr: int(accept={str})
+    /
+
+Return True if the character has the XID_Start property, else False.
+
+[clinic start generated code]*/
+
+static PyObject *
+unicodedata_UCD_isxidstart_impl(PyObject *self, int chr)
+/*[clinic end generated code: output=944005823c72c3ef input=9353f88d709c21fb]*/
+{
+    if (UCD_Check(self)) {
+        const change_record *old = get_old_record(self, chr);
+        if (old->category_changed == 0) {
+            /* unassigned */
+            Py_RETURN_FALSE;
+        }
+    }
+
+    return PyBool_FromLong(_PyUnicode_IsXidStart(chr));
+}
+
+/*[clinic input]
+unicodedata.UCD.isxidcontinue
+
+    self: self
+    chr: int(accept={str})
+    /
+
+Return True if the character has the XID_Continue property, else False.
+
+[clinic start generated code]*/
+
+static PyObject *
+unicodedata_UCD_isxidcontinue_impl(PyObject *self, int chr)
+/*[clinic end generated code: output=9438dcbff5ca3e41 input=bbb8dd3ac0d2d709]*/
+{
+    if (UCD_Check(self)) {
+        const change_record *old = get_old_record(self, chr);
+        if (old->category_changed == 0) {
+            /* unassigned */
+            Py_RETURN_FALSE;
+        }
+    }
+
+    return PyBool_FromLong(_PyUnicode_IsXidContinue(chr));
+}
+
 /*[clinic input]
 unicodedata.UCD.lookup
 
@@ -1590,6 +1643,8 @@ static PyMethodDef unicodedata_functions[] = {
     UNICODEDATA_UCD_EAST_ASIAN_WIDTH_METHODDEF
     UNICODEDATA_UCD_DECOMPOSITION_METHODDEF
     UNICODEDATA_UCD_NAME_METHODDEF
+    UNICODEDATA_UCD_ISXIDSTART_METHODDEF
+    UNICODEDATA_UCD_ISXIDCONTINUE_METHODDEF
     UNICODEDATA_UCD_LOOKUP_METHODDEF
     UNICODEDATA_UCD_IS_NORMALIZED_METHODDEF
     UNICODEDATA_UCD_NORMALIZE_METHODDEF
index 7cd0dca3d1354551b8d431fec384aed97ac70033..fdd380190ac1eccfa8beb24a4b1dfd76a3d46c3d 100644 (file)
@@ -9,6 +9,7 @@
 */
 
 #include "Python.h"
+#include "pycore_unicodectype.h"   // export _PyUnicode_IsXidStart(), _PyUnicode_IsXidContinue()
 
 #define ALPHA_MASK 0x01
 #define DECIMAL_MASK 0x02
index f60f7dd2d13604992ee9378db8e6b47093b276ca..8a5638ac1406ab6e0b5c49cb11897ee740c50366 100644 (file)
@@ -57,6 +57,7 @@ OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 #include "pycore_pylifecycle.h"   // _Py_SetFileSystemEncoding()
 #include "pycore_pystate.h"       // _PyInterpreterState_GET()
 #include "pycore_ucnhash.h"       // _PyUnicode_Name_CAPI
+#include "pycore_unicodectype.h"  // _PyUnicode_IsXidStart
 #include "pycore_unicodeobject.h" // struct _Py_unicode_state
 #include "pycore_unicodeobject_generated.h"  // _PyUnicode_InitStaticStrings()
 
index 2657ee5c444e607f4e6a2e0fbbc512632506a2b9..a101c1b45cf25c7a6869f352961c6305d461019b 100644 (file)
     <ClInclude Include="..\Include\internal\pycore_typevarobject.h" />
     <ClInclude Include="..\Include\internal\pycore_ucnhash.h" />
     <ClInclude Include="..\Include\internal\pycore_unionobject.h" />
+    <ClInclude Include="..\Include\internal\pycore_unicodectype.h" />
     <ClInclude Include="..\Include\internal\pycore_unicodeobject.h" />
     <ClInclude Include="..\Include\internal\pycore_unicodeobject_generated.h" />
     <ClInclude Include="..\Include\internal\pycore_uniqueid.h" />
index 9c12be6e9356a62cce50d4a3ac657b506f68ee5d..e3f261c2b92ab97454f40bf34ab753ee84971aee 100644 (file)
     <ClInclude Include="..\Include\cpython\initconfig.h">
       <Filter>Include\cpython</Filter>
     </ClInclude>
+    <ClInclude Include="..\Include\internal\pycore_unicodectype.h">
+      <Filter>Include\internal</Filter>
+    </ClInclude>
     <ClInclude Include="..\Include\internal\pycore_unicodeobject.h">
       <Filter>Include\internal</Filter>
     </ClInclude>