bpo-39500: Document PyUnicode_IsIdentifier() function (GH-18397)

author Victor Stinner <vstinner@python.org>

Tue, 11 Feb 2020 13:29:33 +0000 (14:29 +0100)

committer GitHub <noreply@github.com>

Tue, 11 Feb 2020 13:29:33 +0000 (14:29 +0100)
author Victor Stinner <vstinner@python.org>
Tue, 11 Feb 2020 13:29:33 +0000 (14:29 +0100)
committer GitHub <noreply@github.com>
Tue, 11 Feb 2020 13:29:33 +0000 (14:29 +0100)
diff --git a/Doc/c-api/unicode.rst b/Doc/c-api/unicode.rst

index 96d77c4084132c17c1970b25fb612fa0f31d0e4f..b1787ed1ce89cf080cb1b887fe57620eee58bce5 100644 (file)
--- a/Doc/c-api/unicode.rst
+++ b/Doc/c-api/unicode.rst
@@ -240,6 +240,16 @@ access internal read-only data of Unicode objects:
        :c:func:`PyUnicode_nBYTE_DATA` family of macros.
  
  
+.. c:function:: int PyUnicode_IsIdentifier(PyObject *o)
+
+   Return ``1`` if the string is a valid identifier according to the language
+   definition, section :ref:`identifiers`. Return ``0`` otherwise.
+
+   .. versionchanged:: 3.9
+      The function does not call :c:func:`Py_FatalError` anymore if the string
+      is not ready.
+
+
  Unicode Character Properties
  """"""""""""""""""""""""""""
  
diff --git a/Misc/NEWS.d/next/C API/2020-02-07-09-35-43.bpo-39500.xRAEgX.rst b/Misc/NEWS.d/next/C API/2020-02-07-09-35-43.bpo-39500.xRAEgX.rst

new file mode 100644 (file)

index 0000000..2ca359f
--- /dev/null
+++ b/Misc/NEWS.d/next/C API/2020-02-07-09-35-43.bpo-39500.xRAEgX.rst
@@ -0,0 +1,2 @@
+:c:func:`PyUnicode_IsIdentifier` does not call :c:func:`Py_FatalError`
+anymore if the string is not ready.
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c

index fd08ddbf57434f684b3f00f3bd2e05f1a886f427..aa874f2a12d29343480f6775ba063def0a63251d 100644 (file)
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -12198,22 +12198,33 @@ unicode_isnumeric_impl(PyObject *self)
  int
  PyUnicode_IsIdentifier(PyObject *self)
  {
-    int kind;
-    void *data;
      Py_ssize_t i;
-    Py_UCS4 first;
+    int ready = PyUnicode_IS_READY(self);
  
-    if (PyUnicode_READY(self) == -1) {
-        Py_FatalError("identifier not ready");
+    Py_ssize_t len = ready ? PyUnicode_GET_LENGTH(self) : PyUnicode_GET_SIZE(self);
+    if (len == 0) {
+        /* an empty string is not a valid identifier */
          return 0;
      }
  
-    /* Special case for empty strings */
-    if (PyUnicode_GET_LENGTH(self) == 0)
-        return 0;
-    kind = PyUnicode_KIND(self);
-    data = PyUnicode_DATA(self);
+    int kind;
+    void *data;
+    wchar_t *wstr;
+    if (ready) {
+        kind = PyUnicode_KIND(self);
+        data = PyUnicode_DATA(self);
+    }
+    else {
+        wstr = _PyUnicode_WSTR(self);
+    }
  
+    Py_UCS4 ch;
+    if (ready) {
+        ch = PyUnicode_READ(kind, data, 0);
+    }
+    else {
+        ch = wstr[0];
+    }
      /* PEP 3131 says that the first character must be in
         XID_Start and subsequent characters in XID_Continue,
         and for the ASCII range, the 2.x rules apply (i.e
@@ -12222,13 +12233,21 @@ PyUnicode_IsIdentifier(PyObject *self)
         definition of XID_Start and XID_Continue, it is sufficient
         to check just for these, except that _ must be allowed
         as starting an identifier.  */
-    first = PyUnicode_READ(kind, data, 0);
-    if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */)
+    if (!_PyUnicode_IsXidStart(ch) && ch != 0x5F /* LOW LINE */) {
          return 0;
+    }
  
-    for (i = 1; i < PyUnicode_GET_LENGTH(self); i++)
-        if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i)))
+    for (i = 1; i < len; i++) {
+        if (ready) {
+            ch = PyUnicode_READ(kind, data, i);
+        }
+        else {
+            ch = wstr[i];
+        }
+        if (!_PyUnicode_IsXidContinue(ch)) {
              return 0;
+        }
+    }
      return 1;
  }
  
diff --git a/Parser/tokenizer.c b/Parser/tokenizer.c

index f73c32684c7b73fb3169c68e3a4302dd8fe7a4b6..c37cd927df5a4142690f3b0c2f4a78eb812b2b96 100644 (file)
--- a/Parser/tokenizer.c
+++ b/Parser/tokenizer.c
@@ -1079,8 +1079,9 @@ verify_identifier(struct tok_state *tok)
      }
      result = PyUnicode_IsIdentifier(s);
      Py_DECREF(s);
-    if (result == 0)
+    if (result == 0) {
          tok->done = E_IDENTIFIER;
+    }
      return result;
  }
author	Victor Stinner <vstinner@python.org>
	Tue, 11 Feb 2020 13:29:33 +0000 (14:29 +0100)
committer	GitHub <noreply@github.com>
	Tue, 11 Feb 2020 13:29:33 +0000 (14:29 +0100)
Doc/c-api/unicode.rst		patch | blob | blame | history
Misc/NEWS.d/next/C API/2020-02-07-09-35-43.bpo-39500.xRAEgX.rst	[new file with mode: 0644]	patch | blob
Objects/unicodeobject.c		patch | blob | blame | history
Parser/tokenizer.c		patch | blob | blame | history