From dbe3950a76cce176c6c185b873f9552503d87043 Mon Sep 17 00:00:00 2001 From: Stan Ulbrych <89152624+StanFromIreland@users.noreply.github.com> Date: Thu, 30 Oct 2025 10:18:12 +0000 Subject: [PATCH] gh-129117: Add unicodedata.isxidstart() function (#140269) Expose `_PyUnicode_IsXidContinue/Start` in `unicodedata`: add isxidstart() and isxidcontinue() functions. Co-authored-by: Victor Stinner --- Doc/library/unicodedata.rst | 30 ++++++++ Doc/whatsnew/3.15.rst | 5 ++ Include/internal/pycore_unicodectype.h | 25 +++++++ Include/internal/pycore_unicodeobject.h | 12 --- Lib/test/test_unicodedata.py | 27 +++++++ Makefile.pre.in | 1 + ...-10-17-20-42-38.gh-issue-129117.X9jr4p.rst | 3 + Modules/clinic/unicodedata.c.h | 74 ++++++++++++++++++- Modules/unicodedata.c | 55 ++++++++++++++ Objects/unicodectype.c | 1 + Objects/unicodeobject.c | 1 + PCbuild/pythoncore.vcxproj | 1 + PCbuild/pythoncore.vcxproj.filters | 3 + 13 files changed, 225 insertions(+), 13 deletions(-) create mode 100644 Include/internal/pycore_unicodectype.h create mode 100644 Misc/NEWS.d/next/Library/2025-10-17-20-42-38.gh-issue-129117.X9jr4p.rst diff --git a/Doc/library/unicodedata.rst b/Doc/library/unicodedata.rst index 0369cd99c47c..c49bf6417046 100644 --- a/Doc/library/unicodedata.rst +++ b/Doc/library/unicodedata.rst @@ -144,6 +144,36 @@ following functions: 1 +.. function:: isxidstart(chr, /) + + Return ``True`` if *chr* is a valid identifier start per the + `Unicode Standard Annex #31 <https://www.unicode.org/reports/tr31/>`_, + that is, it has the ``XID_Start`` property. Return ``False`` otherwise. + For example:: + + >>> unicodedata.isxidstart('S') + True + >>> unicodedata.isxidstart('0') + False + + .. versionadded:: next + + +.. function:: isxidcontinue(chr, /) + + Return ``True`` if *chr* is a valid identifier character per the + `Unicode Standard Annex #31 <https://www.unicode.org/reports/tr31/>`_, + that is, it has the ``XID_Continue`` property. Return ``False`` otherwise.
+ For example:: + + >>> unicodedata.isxidcontinue('S') + True + >>> unicodedata.isxidcontinue(' ') + False + + .. versionadded:: next + + .. function:: decomposition(chr) Returns the character decomposition mapping assigned to the character diff --git a/Doc/whatsnew/3.15.rst b/Doc/whatsnew/3.15.rst index 85b4c12544a0..fe9adfe9f730 100644 --- a/Doc/whatsnew/3.15.rst +++ b/Doc/whatsnew/3.15.rst @@ -794,6 +794,11 @@ unicodedata * The Unicode database has been updated to Unicode 17.0.0. +* Add :func:`unicodedata.isxidstart` and :func:`unicodedata.isxidcontinue` + functions to check whether a character can start or continue a + `Unicode Standard Annex #31 <https://www.unicode.org/reports/tr31/>`_ identifier. + (Contributed by Stan Ulbrych in :gh:`129117`.) + wave ---- diff --git a/Include/internal/pycore_unicodectype.h b/Include/internal/pycore_unicodectype.h new file mode 100644 index 000000000000..523bdb56b09c --- /dev/null +++ b/Include/internal/pycore_unicodectype.h @@ -0,0 +1,25 @@ +#ifndef Py_INTERNAL_UNICODECTYPE_H +#define Py_INTERNAL_UNICODECTYPE_H +#ifdef __cplusplus +extern "C" { +#endif + +#ifndef Py_BUILD_CORE +# error "this header requires Py_BUILD_CORE define" +#endif + +extern int _PyUnicode_ToLowerFull(Py_UCS4 ch, Py_UCS4 *res); +extern int _PyUnicode_ToTitleFull(Py_UCS4 ch, Py_UCS4 *res); +extern int _PyUnicode_ToUpperFull(Py_UCS4 ch, Py_UCS4 *res); +extern int _PyUnicode_ToFoldedFull(Py_UCS4 ch, Py_UCS4 *res); +extern int _PyUnicode_IsCaseIgnorable(Py_UCS4 ch); +extern int _PyUnicode_IsCased(Py_UCS4 ch); + +// Export for 'unicodedata' shared extension.
+PyAPI_FUNC(int) _PyUnicode_IsXidStart(Py_UCS4 ch); +PyAPI_FUNC(int) _PyUnicode_IsXidContinue(Py_UCS4 ch); + +#ifdef __cplusplus +} +#endif +#endif /* !Py_INTERNAL_UNICODECTYPE_H */ diff --git a/Include/internal/pycore_unicodeobject.h b/Include/internal/pycore_unicodeobject.h index b83039c1869f..f384fad8713a 100644 --- a/Include/internal/pycore_unicodeobject.h +++ b/Include/internal/pycore_unicodeobject.h @@ -74,18 +74,6 @@ _PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch) return 0; } - -/* --- Characters Type APIs ----------------------------------------------- */ - -extern int _PyUnicode_IsXidStart(Py_UCS4 ch); -extern int _PyUnicode_IsXidContinue(Py_UCS4 ch); -extern int _PyUnicode_ToLowerFull(Py_UCS4 ch, Py_UCS4 *res); -extern int _PyUnicode_ToTitleFull(Py_UCS4 ch, Py_UCS4 *res); -extern int _PyUnicode_ToUpperFull(Py_UCS4 ch, Py_UCS4 *res); -extern int _PyUnicode_ToFoldedFull(Py_UCS4 ch, Py_UCS4 *res); -extern int _PyUnicode_IsCaseIgnorable(Py_UCS4 ch); -extern int _PyUnicode_IsCased(Py_UCS4 ch); - /* --- Unicode API -------------------------------------------------------- */ // Export for '_json' shared extension diff --git a/Lib/test/test_unicodedata.py b/Lib/test/test_unicodedata.py index 8013eaf6e9d8..a3c22a4f27ee 100644 --- a/Lib/test/test_unicodedata.py +++ b/Lib/test/test_unicodedata.py @@ -276,6 +276,33 @@ class UnicodeFunctionsTest(UnicodeDatabaseTest): self.assertEqual(self.db.ucd_3_2_0.east_asian_width('\u231a'), 'N') self.assertEqual(self.db.east_asian_width('\u231a'), 'W') + def test_isxidstart(self): + self.assertTrue(self.db.isxidstart('S')) + self.assertTrue(self.db.isxidstart('\u0AD0')) # GUJARATI OM + self.assertTrue(self.db.isxidstart('\u0EC6')) # LAO KO LA + self.assertTrue(self.db.isxidstart('\u17DC')) # KHMER SIGN AVAKRAHASANYA + self.assertTrue(self.db.isxidstart('\uA015')) # YI SYLLABLE WU + self.assertTrue(self.db.isxidstart('\uFE7B')) # ARABIC KASRA MEDIAL FORM + + self.assertFalse(self.db.isxidstart(' ')) + 
+ self.assertFalse(self.db.isxidstart('0')) + self.assertRaises(TypeError, self.db.isxidstart) + self.assertRaises(TypeError, self.db.isxidstart, 'xx') + + def test_isxidcontinue(self): + self.assertTrue(self.db.isxidcontinue('S')) + self.assertTrue(self.db.isxidcontinue('_')) + self.assertTrue(self.db.isxidcontinue('0')) + self.assertTrue(self.db.isxidcontinue('\u00BA')) # MASCULINE ORDINAL INDICATOR + self.assertTrue(self.db.isxidcontinue('\u0640')) # ARABIC TATWEEL + self.assertTrue(self.db.isxidcontinue('\u0710')) # SYRIAC LETTER ALAPH + self.assertTrue(self.db.isxidcontinue('\u0B3E')) # ORIYA VOWEL SIGN AA + self.assertTrue(self.db.isxidcontinue('\u17D7')) # KHMER SIGN LEK TOO + + self.assertFalse(self.db.isxidcontinue(' ')) + self.assertRaises(TypeError, self.db.isxidcontinue) + self.assertRaises(TypeError, self.db.isxidcontinue, 'xx') + class UnicodeMiscTest(UnicodeDatabaseTest): @cpython_only diff --git a/Makefile.pre.in b/Makefile.pre.in index 19423c11545c..0a1b8d028add 100644 --- a/Makefile.pre.in +++ b/Makefile.pre.in @@ -1433,6 +1433,7 @@ PYTHON_HEADERS= \ $(srcdir)/Include/internal/pycore_typeobject.h \ $(srcdir)/Include/internal/pycore_typevarobject.h \ $(srcdir)/Include/internal/pycore_ucnhash.h \ + $(srcdir)/Include/internal/pycore_unicodectype.h \ $(srcdir)/Include/internal/pycore_unicodeobject.h \ $(srcdir)/Include/internal/pycore_unicodeobject_generated.h \ $(srcdir)/Include/internal/pycore_unionobject.h \ diff --git a/Misc/NEWS.d/next/Library/2025-10-17-20-42-38.gh-issue-129117.X9jr4p.rst b/Misc/NEWS.d/next/Library/2025-10-17-20-42-38.gh-issue-129117.X9jr4p.rst new file mode 100644 index 000000000000..8767b1bb4837 --- /dev/null +++ b/Misc/NEWS.d/next/Library/2025-10-17-20-42-38.gh-issue-129117.X9jr4p.rst @@ -0,0 +1,3 @@ +:mod:`unicodedata`: Add :func:`~unicodedata.isxidstart` and +:func:`~unicodedata.isxidcontinue` functions to check whether a character can +start or continue a `Unicode Standard Annex #31 <https://www.unicode.org/reports/tr31/>`_ identifier.
diff --git a/Modules/clinic/unicodedata.c.h b/Modules/clinic/unicodedata.c.h index 345440eeee89..5fcba083c2f4 100644 --- a/Modules/clinic/unicodedata.c.h +++ b/Modules/clinic/unicodedata.c.h @@ -518,6 +518,78 @@ exit: return return_value; } +PyDoc_STRVAR(unicodedata_UCD_isxidstart__doc__, +"isxidstart($self, chr, /)\n" +"--\n" +"\n" +"Return True if the character has the XID_Start property, else False."); + +#define UNICODEDATA_UCD_ISXIDSTART_METHODDEF \ + {"isxidstart", (PyCFunction)unicodedata_UCD_isxidstart, METH_O, unicodedata_UCD_isxidstart__doc__}, + +static PyObject * +unicodedata_UCD_isxidstart_impl(PyObject *self, int chr); + +static PyObject * +unicodedata_UCD_isxidstart(PyObject *self, PyObject *arg) +{ + PyObject *return_value = NULL; + int chr; + + if (!PyUnicode_Check(arg)) { + _PyArg_BadArgument("isxidstart", "argument", "a unicode character", arg); + goto exit; + } + if (PyUnicode_GET_LENGTH(arg) != 1) { + PyErr_Format(PyExc_TypeError, + "isxidstart(): argument must be a unicode character, " + "not a string of length %zd", + PyUnicode_GET_LENGTH(arg)); + goto exit; + } + chr = PyUnicode_READ_CHAR(arg, 0); + return_value = unicodedata_UCD_isxidstart_impl(self, chr); + +exit: + return return_value; +} + +PyDoc_STRVAR(unicodedata_UCD_isxidcontinue__doc__, +"isxidcontinue($self, chr, /)\n" +"--\n" +"\n" +"Return True if the character has the XID_Continue property, else False."); + +#define UNICODEDATA_UCD_ISXIDCONTINUE_METHODDEF \ + {"isxidcontinue", (PyCFunction)unicodedata_UCD_isxidcontinue, METH_O, unicodedata_UCD_isxidcontinue__doc__}, + +static PyObject * +unicodedata_UCD_isxidcontinue_impl(PyObject *self, int chr); + +static PyObject * +unicodedata_UCD_isxidcontinue(PyObject *self, PyObject *arg) +{ + PyObject *return_value = NULL; + int chr; + + if (!PyUnicode_Check(arg)) { + _PyArg_BadArgument("isxidcontinue", "argument", "a unicode character", arg); + goto exit; + } + if (PyUnicode_GET_LENGTH(arg) != 1) { + PyErr_Format(PyExc_TypeError, + 
"isxidcontinue(): argument must be a unicode character, " + "not a string of length %zd", + PyUnicode_GET_LENGTH(arg)); + goto exit; + } + chr = PyUnicode_READ_CHAR(arg, 0); + return_value = unicodedata_UCD_isxidcontinue_impl(self, chr); + +exit: + return return_value; +} + PyDoc_STRVAR(unicodedata_UCD_lookup__doc__, "lookup($self, name, /)\n" "--\n" @@ -549,4 +621,4 @@ unicodedata_UCD_lookup(PyObject *self, PyObject *arg) exit: return return_value; } -/*[clinic end generated code: output=8a59d430cee41058 input=a9049054013a1b77]*/ +/*[clinic end generated code: output=c5e56c8f6bb80f93 input=a9049054013a1b77]*/ diff --git a/Modules/unicodedata.c b/Modules/unicodedata.c index a3699beff7da..a6094676d419 100644 --- a/Modules/unicodedata.c +++ b/Modules/unicodedata.c @@ -19,6 +19,7 @@ #include "Python.h" #include "pycore_object.h" // _PyObject_VisitType() #include "pycore_ucnhash.h" // _PyUnicode_Name_CAPI +#include "pycore_unicodectype.h" // _PyUnicode_IsXidStart() #include #include // offsetof() @@ -1525,6 +1526,58 @@ unicodedata_UCD_name_impl(PyObject *self, int chr, PyObject *default_value) return PyUnicode_FromString(name); } +/*[clinic input] +unicodedata.UCD.isxidstart + + self: self + chr: int(accept={str}) + / + +Return True if the character has the XID_Start property, else False. + +[clinic start generated code]*/ + +static PyObject * +unicodedata_UCD_isxidstart_impl(PyObject *self, int chr) +/*[clinic end generated code: output=944005823c72c3ef input=9353f88d709c21fb]*/ +{ + if (UCD_Check(self)) { + const change_record *old = get_old_record(self, chr); + if (old->category_changed == 0) { + /* unassigned */ + Py_RETURN_FALSE; + } + } + + return PyBool_FromLong(_PyUnicode_IsXidStart(chr)); +} + +/*[clinic input] +unicodedata.UCD.isxidcontinue + + self: self + chr: int(accept={str}) + / + +Return True if the character has the XID_Continue property, else False. 
+ +[clinic start generated code]*/ + +static PyObject * +unicodedata_UCD_isxidcontinue_impl(PyObject *self, int chr) +/*[clinic end generated code: output=9438dcbff5ca3e41 input=bbb8dd3ac0d2d709]*/ +{ + if (UCD_Check(self)) { + const change_record *old = get_old_record(self, chr); + if (old->category_changed == 0) { + /* unassigned */ + Py_RETURN_FALSE; + } + } + + return PyBool_FromLong(_PyUnicode_IsXidContinue(chr)); +} + /*[clinic input] unicodedata.UCD.lookup @@ -1590,6 +1643,8 @@ static PyMethodDef unicodedata_functions[] = { UNICODEDATA_UCD_EAST_ASIAN_WIDTH_METHODDEF UNICODEDATA_UCD_DECOMPOSITION_METHODDEF UNICODEDATA_UCD_NAME_METHODDEF + UNICODEDATA_UCD_ISXIDSTART_METHODDEF + UNICODEDATA_UCD_ISXIDCONTINUE_METHODDEF UNICODEDATA_UCD_LOOKUP_METHODDEF UNICODEDATA_UCD_IS_NORMALIZED_METHODDEF UNICODEDATA_UCD_NORMALIZE_METHODDEF diff --git a/Objects/unicodectype.c b/Objects/unicodectype.c index 7cd0dca3d135..fdd380190ac1 100644 --- a/Objects/unicodectype.c +++ b/Objects/unicodectype.c @@ -9,6 +9,7 @@ */ #include "Python.h" +#include "pycore_unicodectype.h" // export _PyUnicode_IsXidStart(), _PyUnicode_IsXidContinue() #define ALPHA_MASK 0x01 #define DECIMAL_MASK 0x02 diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index f60f7dd2d136..8a5638ac1406 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -57,6 +57,7 @@ OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
#include "pycore_pylifecycle.h" // _Py_SetFileSystemEncoding() #include "pycore_pystate.h" // _PyInterpreterState_GET() #include "pycore_ucnhash.h" // _PyUnicode_Name_CAPI +#include "pycore_unicodectype.h" // _PyUnicode_IsXidStart #include "pycore_unicodeobject.h" // struct _Py_unicode_state #include "pycore_unicodeobject_generated.h" // _PyUnicode_InitStaticStrings() diff --git a/PCbuild/pythoncore.vcxproj b/PCbuild/pythoncore.vcxproj index 2657ee5c444e..a101c1b45cf2 100644 --- a/PCbuild/pythoncore.vcxproj +++ b/PCbuild/pythoncore.vcxproj @@ -328,6 +328,7 @@ + diff --git a/PCbuild/pythoncore.vcxproj.filters b/PCbuild/pythoncore.vcxproj.filters index 9c12be6e9356..e3f261c2b92a 100644 --- a/PCbuild/pythoncore.vcxproj.filters +++ b/PCbuild/pythoncore.vcxproj.filters @@ -528,6 +528,9 @@ Include\cpython + + Include\internal + Include\internal -- 2.47.3