gh-91924: Optimize unicode_check_encoding_errors() (#93200)

author Victor Stinner <vstinner@python.org>

Thu, 26 May 2022 22:39:49 +0000 (00:39 +0200)

committer GitHub <noreply@github.com>

Thu, 26 May 2022 22:39:49 +0000 (00:39 +0200)
author Victor Stinner <vstinner@python.org>
Thu, 26 May 2022 22:39:49 +0000 (00:39 +0200)
committer GitHub <noreply@github.com>
Thu, 26 May 2022 22:39:49 +0000 (00:39 +0200)
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c

index e935829072483232089113b5030cf0fb3d5b7430..3acbf54f1c0b25ab32b0848b7584f51436835a99 100644 (file)
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -454,7 +454,14 @@ unicode_check_encoding_errors(const char *encoding, const char *errors)
          return 0;
      }
  
-    if (encoding != NULL) {
+    if (encoding != NULL
+        // Fast path: skip the codec lookup for the most common built-in
+        // encodings. Even when the codec is cached, _PyCodec_Lookup() decodes
+        // the byte string from UTF-8 into a temporary Unicode cache key.
+        && strcmp(encoding, "utf-8") != 0
+        && strcmp(encoding, "utf8") != 0
+        && strcmp(encoding, "ascii") != 0)
+    {
          PyObject *handler = _PyCodec_Lookup(encoding);
          if (handler == NULL) {
              return -1;
@@ -462,7 +469,14 @@ unicode_check_encoding_errors(const char *encoding, const char *errors)
          Py_DECREF(handler);
      }
  
-    if (errors != NULL) {
+    if (errors != NULL
+        // Fast path: skip the lookup for the common built-in error handlers.
+        && strcmp(errors, "strict") != 0
+        && strcmp(errors, "ignore") != 0
+        && strcmp(errors, "replace") != 0
+        && strcmp(errors, "surrogateescape") != 0
+        && strcmp(errors, "surrogatepass") != 0)
+    {
          PyObject *handler = PyCodec_LookupError(errors);
          if (handler == NULL) {
              return -1;
author	Victor Stinner <vstinner@python.org>
	Thu, 26 May 2022 22:39:49 +0000 (00:39 +0200)
committer	GitHub <noreply@github.com>
	Thu, 26 May 2022 22:39:49 +0000 (00:39 +0200)