1
+.. function:: isxidstart(chr, /)
+
+ Return ``True`` if *chr* is a valid identifier start per the
+ `Unicode Standard Annex #31 <https://www.unicode.org/reports/tr31/>`_,
+ that is, it has the ``XID_Start`` property. Return ``False`` otherwise.
+ For example::
+
+ >>> unicodedata.isxidstart('S')
+ True
+ >>> unicodedata.isxidstart('0')
+ False
+
+ .. versionadded:: next
+
+
+.. function:: isxidcontinue(chr, /)
+
+ Return ``True`` if *chr* is a valid identifier character per the
+ `Unicode Standard Annex #31 <https://www.unicode.org/reports/tr31/>`_,
+ that is, it has the ``XID_Continue`` property. Return ``False`` otherwise.
+ For example::
+
+ >>> unicodedata.isxidcontinue('S')
+ True
+ >>> unicodedata.isxidcontinue(' ')
+ False
+
+ .. versionadded:: next
+
+
.. function:: decomposition(chr)
Returns the character decomposition mapping assigned to the character
* The Unicode database has been updated to Unicode 17.0.0.
+* Add :func:`unicodedata.isxidstart` and :func:`unicodedata.isxidcontinue`
+ functions to check whether a character can start or continue a
+ `Unicode Standard Annex #31 <https://www.unicode.org/reports/tr31/>`_ identifier.
+ (Contributed by Stan Ulbrych in :gh:`129117`.)
+
wave
----
--- /dev/null
+#ifndef Py_INTERNAL_UNICODECTYPE_H
+#define Py_INTERNAL_UNICODECTYPE_H
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifndef Py_BUILD_CORE
+# error "this header requires Py_BUILD_CORE define"
+#endif
+
+extern int _PyUnicode_ToLowerFull(Py_UCS4 ch, Py_UCS4 *res);
+extern int _PyUnicode_ToTitleFull(Py_UCS4 ch, Py_UCS4 *res);
+extern int _PyUnicode_ToUpperFull(Py_UCS4 ch, Py_UCS4 *res);
+extern int _PyUnicode_ToFoldedFull(Py_UCS4 ch, Py_UCS4 *res);
+extern int _PyUnicode_IsCaseIgnorable(Py_UCS4 ch);
+extern int _PyUnicode_IsCased(Py_UCS4 ch);
+
+// Export for 'unicodedata' shared extension (the declarations above are plain extern, these two use PyAPI_FUNC).
+PyAPI_FUNC(int) _PyUnicode_IsXidStart(Py_UCS4 ch);
+PyAPI_FUNC(int) _PyUnicode_IsXidContinue(Py_UCS4 ch);
+
+#ifdef __cplusplus
+}
+#endif
+#endif /* !Py_INTERNAL_UNICODECTYPE_H */
return 0;
}
-
-/* --- Characters Type APIs ----------------------------------------------- */
-
-extern int _PyUnicode_IsXidStart(Py_UCS4 ch);
-extern int _PyUnicode_IsXidContinue(Py_UCS4 ch);
-extern int _PyUnicode_ToLowerFull(Py_UCS4 ch, Py_UCS4 *res);
-extern int _PyUnicode_ToTitleFull(Py_UCS4 ch, Py_UCS4 *res);
-extern int _PyUnicode_ToUpperFull(Py_UCS4 ch, Py_UCS4 *res);
-extern int _PyUnicode_ToFoldedFull(Py_UCS4 ch, Py_UCS4 *res);
-extern int _PyUnicode_IsCaseIgnorable(Py_UCS4 ch);
-extern int _PyUnicode_IsCased(Py_UCS4 ch);
-
/* --- Unicode API -------------------------------------------------------- */
// Export for '_json' shared extension
self.assertEqual(self.db.ucd_3_2_0.east_asian_width('\u231a'), 'N')
self.assertEqual(self.db.east_asian_width('\u231a'), 'W')
+ def test_isxidstart(self): # exercises self.db.isxidstart(): accepted characters, rejected characters, bad arguments
+ self.assertTrue(self.db.isxidstart('S'))
+ self.assertTrue(self.db.isxidstart('\u0AD0')) # GUJARATI OM
+ self.assertTrue(self.db.isxidstart('\u0EC6')) # LAO KO LA
+ self.assertTrue(self.db.isxidstart('\u17DC')) # KHMER SIGN AVAKRAHASANYA
+ self.assertTrue(self.db.isxidstart('\uA015')) # YI SYLLABLE WU
+ self.assertTrue(self.db.isxidstart('\uFE7B')) # ARABIC KASRA MEDIAL FORM
+
+ self.assertFalse(self.db.isxidstart(' ')) # SPACE
+ self.assertFalse(self.db.isxidstart('0')) # DIGIT ZERO: expected False here, while test_isxidcontinue expects True for it
+ self.assertRaises(TypeError, self.db.isxidstart) # called with no argument
+ self.assertRaises(TypeError, self.db.isxidstart, 'xx') # two-character string instead of a single character
+
+ def test_isxidcontinue(self): # exercises self.db.isxidcontinue(): accepted characters, rejected characters, bad arguments
+ self.assertTrue(self.db.isxidcontinue('S'))
+ self.assertTrue(self.db.isxidcontinue('_')) # LOW LINE
+ self.assertTrue(self.db.isxidcontinue('0')) # DIGIT ZERO: expected True here, while test_isxidstart expects False for it
+ self.assertTrue(self.db.isxidcontinue('\u00BA')) # MASCULINE ORDINAL INDICATOR
+ self.assertTrue(self.db.isxidcontinue('\u0640')) # ARABIC TATWEEL
+ self.assertTrue(self.db.isxidcontinue('\u0710')) # SYRIAC LETTER ALAPH
+ self.assertTrue(self.db.isxidcontinue('\u0B3E')) # ORIYA VOWEL SIGN AA
+ self.assertTrue(self.db.isxidcontinue('\u17D7')) # KHMER SIGN LEK TOO
+
+ self.assertFalse(self.db.isxidcontinue(' ')) # SPACE
+ self.assertRaises(TypeError, self.db.isxidcontinue) # called with no argument
+ self.assertRaises(TypeError, self.db.isxidcontinue, 'xx') # two-character string instead of a single character
+
class UnicodeMiscTest(UnicodeDatabaseTest):
@cpython_only
$(srcdir)/Include/internal/pycore_typeobject.h \
$(srcdir)/Include/internal/pycore_typevarobject.h \
$(srcdir)/Include/internal/pycore_ucnhash.h \
+ $(srcdir)/Include/internal/pycore_unicodectype.h \
$(srcdir)/Include/internal/pycore_unicodeobject.h \
$(srcdir)/Include/internal/pycore_unicodeobject_generated.h \
$(srcdir)/Include/internal/pycore_unionobject.h \
--- /dev/null
+:mod:`unicodedata`: Add :func:`~unicodedata.isxidstart` and
+:func:`~unicodedata.isxidcontinue` functions to check whether a character can
+start or continue a `Unicode Standard Annex #31 <https://www.unicode.org/reports/tr31/>`_ identifier.
return return_value;
}
+PyDoc_STRVAR(unicodedata_UCD_isxidstart__doc__,
+"isxidstart($self, chr, /)\n"
+"--\n"
+"\n"
+"Return True if the character has the XID_Start property, else False.");
+
+#define UNICODEDATA_UCD_ISXIDSTART_METHODDEF \
+ {"isxidstart", (PyCFunction)unicodedata_UCD_isxidstart, METH_O, unicodedata_UCD_isxidstart__doc__},
+
+static PyObject *
+unicodedata_UCD_isxidstart_impl(PyObject *self, int chr);
+
+static PyObject *
+unicodedata_UCD_isxidstart(PyObject *self, PyObject *arg)
+{
+ PyObject *return_value = NULL;
+ int chr;
+
+ if (!PyUnicode_Check(arg)) {
+ _PyArg_BadArgument("isxidstart", "argument", "a unicode character", arg);
+ goto exit;
+ }
+ if (PyUnicode_GET_LENGTH(arg) != 1) {
+ PyErr_Format(PyExc_TypeError,
+ "isxidstart(): argument must be a unicode character, "
+ "not a string of length %zd",
+ PyUnicode_GET_LENGTH(arg));
+ goto exit;
+ }
+ chr = PyUnicode_READ_CHAR(arg, 0);
+ return_value = unicodedata_UCD_isxidstart_impl(self, chr);
+
+exit:
+ return return_value;
+}
+
+PyDoc_STRVAR(unicodedata_UCD_isxidcontinue__doc__,
+"isxidcontinue($self, chr, /)\n"
+"--\n"
+"\n"
+"Return True if the character has the XID_Continue property, else False.");
+
+#define UNICODEDATA_UCD_ISXIDCONTINUE_METHODDEF \
+ {"isxidcontinue", (PyCFunction)unicodedata_UCD_isxidcontinue, METH_O, unicodedata_UCD_isxidcontinue__doc__},
+
+static PyObject *
+unicodedata_UCD_isxidcontinue_impl(PyObject *self, int chr);
+
+static PyObject *
+unicodedata_UCD_isxidcontinue(PyObject *self, PyObject *arg)
+{
+ PyObject *return_value = NULL;
+ int chr;
+
+ if (!PyUnicode_Check(arg)) {
+ _PyArg_BadArgument("isxidcontinue", "argument", "a unicode character", arg);
+ goto exit;
+ }
+ if (PyUnicode_GET_LENGTH(arg) != 1) {
+ PyErr_Format(PyExc_TypeError,
+ "isxidcontinue(): argument must be a unicode character, "
+ "not a string of length %zd",
+ PyUnicode_GET_LENGTH(arg));
+ goto exit;
+ }
+ chr = PyUnicode_READ_CHAR(arg, 0);
+ return_value = unicodedata_UCD_isxidcontinue_impl(self, chr);
+
+exit:
+ return return_value;
+}
+
PyDoc_STRVAR(unicodedata_UCD_lookup__doc__,
"lookup($self, name, /)\n"
"--\n"
exit:
return return_value;
}
-/*[clinic end generated code: output=8a59d430cee41058 input=a9049054013a1b77]*/
+/*[clinic end generated code: output=c5e56c8f6bb80f93 input=a9049054013a1b77]*/
#include "Python.h"
#include "pycore_object.h" // _PyObject_VisitType()
#include "pycore_ucnhash.h" // _PyUnicode_Name_CAPI
+#include "pycore_unicodectype.h" // _PyUnicode_IsXidStart()
#include <stdbool.h>
#include <stddef.h> // offsetof()
return PyUnicode_FromString(name);
}
+/*[clinic input]
+unicodedata.UCD.isxidstart
+
+ self: self
+ chr: int(accept={str})
+ /
+
+Return True if the character has the XID_Start property, else False.
+
+[clinic start generated code]*/
+
+static PyObject *
+unicodedata_UCD_isxidstart_impl(PyObject *self, int chr)
+/*[clinic end generated code: output=944005823c72c3ef input=9353f88d709c21fb]*/
+{
+ if (UCD_Check(self)) { /* NOTE(review): presumably true only when self is an older-version database object such as ucd_3_2_0 -- confirm */
+ const change_record *old = get_old_record(self, chr); /* presumably the per-code-point change record for that older database -- confirm */
+ if (old->category_changed == 0) {
+ /* unassigned: answer False without consulting _PyUnicode_IsXidStart() */
+ Py_RETURN_FALSE;
+ }
+ }
+
+ return PyBool_FromLong(_PyUnicode_IsXidStart(chr)); /* int result of the core predicate converted to a Python bool */
+}
+
+/*[clinic input]
+unicodedata.UCD.isxidcontinue
+
+ self: self
+ chr: int(accept={str})
+ /
+
+Return True if the character has the XID_Continue property, else False.
+
+[clinic start generated code]*/
+
+static PyObject *
+unicodedata_UCD_isxidcontinue_impl(PyObject *self, int chr)
+/*[clinic end generated code: output=9438dcbff5ca3e41 input=bbb8dd3ac0d2d709]*/
+{
+ if (UCD_Check(self)) { /* NOTE(review): presumably true only when self is an older-version database object such as ucd_3_2_0 -- confirm */
+ const change_record *old = get_old_record(self, chr); /* presumably the per-code-point change record for that older database -- confirm */
+ if (old->category_changed == 0) {
+ /* unassigned: answer False without consulting _PyUnicode_IsXidContinue() */
+ Py_RETURN_FALSE;
+ }
+ }
+
+ return PyBool_FromLong(_PyUnicode_IsXidContinue(chr)); /* int result of the core predicate converted to a Python bool */
+}
+
/*[clinic input]
unicodedata.UCD.lookup
UNICODEDATA_UCD_EAST_ASIAN_WIDTH_METHODDEF
UNICODEDATA_UCD_DECOMPOSITION_METHODDEF
UNICODEDATA_UCD_NAME_METHODDEF
+ UNICODEDATA_UCD_ISXIDSTART_METHODDEF
+ UNICODEDATA_UCD_ISXIDCONTINUE_METHODDEF
UNICODEDATA_UCD_LOOKUP_METHODDEF
UNICODEDATA_UCD_IS_NORMALIZED_METHODDEF
UNICODEDATA_UCD_NORMALIZE_METHODDEF
*/
#include "Python.h"
+#include "pycore_unicodectype.h" // export _PyUnicode_IsXidStart(), _PyUnicode_IsXidContinue()
#define ALPHA_MASK 0x01
#define DECIMAL_MASK 0x02
#include "pycore_pylifecycle.h" // _Py_SetFileSystemEncoding()
#include "pycore_pystate.h" // _PyInterpreterState_GET()
#include "pycore_ucnhash.h" // _PyUnicode_Name_CAPI
+#include "pycore_unicodectype.h" // _PyUnicode_IsXidStart
#include "pycore_unicodeobject.h" // struct _Py_unicode_state
#include "pycore_unicodeobject_generated.h" // _PyUnicode_InitStaticStrings()
<ClInclude Include="..\Include\internal\pycore_typevarobject.h" />
<ClInclude Include="..\Include\internal\pycore_ucnhash.h" />
<ClInclude Include="..\Include\internal\pycore_unionobject.h" />
+ <ClInclude Include="..\Include\internal\pycore_unicodectype.h" />
<ClInclude Include="..\Include\internal\pycore_unicodeobject.h" />
<ClInclude Include="..\Include\internal\pycore_unicodeobject_generated.h" />
<ClInclude Include="..\Include\internal\pycore_uniqueid.h" />
<ClInclude Include="..\Include\cpython\initconfig.h">
<Filter>Include\cpython</Filter>
</ClInclude>
+ <ClInclude Include="..\Include\internal\pycore_unicodectype.h">
+ <Filter>Include\internal</Filter>
+ </ClInclude>
<ClInclude Include="..\Include\internal\pycore_unicodeobject.h">
<Filter>Include\internal</Filter>
</ClInclude>