From: Victor Stinner Date: Thu, 26 May 2022 22:39:49 +0000 (+0200) Subject: gh-91924: Optimize unicode_check_encoding_errors() (#93200) X-Git-Tag: v3.12.0a1~1416 X-Git-Url: http://git.ipfire.org/gitweb.cgi?a=commitdiff_plain;h=5f8c3fb99746b9a7fadd4ec24cc4025b9c5d79d0;p=thirdparty%2FPython%2Fcpython.git gh-91924: Optimize unicode_check_encoding_errors() (#93200) Skip _PyCodec_Lookup() and PyCodec_LookupError() for the most common built-in encodings and error handlers: they are already known to be valid, and the lookup would otherwise create a temporary Unicode string object. --- diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index e93582907248..3acbf54f1c0b 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -454,7 +454,14 @@ unicode_check_encoding_errors(const char *encoding, const char *errors) return 0; } - if (encoding != NULL) { + if (encoding != NULL + // Fast path for the most common built-in encodings. Even if the codec + // is cached, _PyCodec_Lookup() decodes the bytes string from UTF-8 to + // create a temporary Unicode string (the key in the cache). + && strcmp(encoding, "utf-8") != 0 + && strcmp(encoding, "utf8") != 0 + && strcmp(encoding, "ascii") != 0) + { PyObject *handler = _PyCodec_Lookup(encoding); if (handler == NULL) { return -1; @@ -462,7 +469,14 @@ unicode_check_encoding_errors(const char *encoding, const char *errors) Py_DECREF(handler); } - if (errors != NULL) { + if (errors != NULL + // Fast path for the most common built-in error handlers. + && strcmp(errors, "strict") != 0 + && strcmp(errors, "ignore") != 0 + && strcmp(errors, "replace") != 0 + && strcmp(errors, "surrogateescape") != 0 + && strcmp(errors, "surrogatepass") != 0) + { PyObject *handler = PyCodec_LookupError(errors); if (handler == NULL) { return -1;