import codecs
import sys
+from _codecs import _normalize_encoding
from . import aliases
_cache = {}
if isinstance(encoding, bytes):
encoding = str(encoding, "ascii")
- chars = []
- punct = False
- for c in encoding:
- if c.isalnum() or c == '.':
- if punct and chars:
- chars.append('_')
- if c.isascii():
- chars.append(c)
- punct = False
- else:
- punct = True
- return ''.join(chars)
+ return _normalize_encoding(encoding)
def search_function(encoding):
--- /dev/null
+:mod:`encodings`: Improve :func:`~encodings.normalize_encoding` performance
+by implementing the function in C using the private
+``_Py_normalize_encoding`` function, which has been modified to make
+lowercase conversion optional.
return PyCodec_LookupError(name);
}
+extern int _Py_normalize_encoding(const char *, char *, size_t, int);
+
+/*[clinic input]
+_codecs._normalize_encoding
+ encoding: unicode
+
+Normalize an encoding name *encoding*.
+
+Used for encodings.normalize_encoding. Does not convert to lower case.
+[clinic start generated code]*/
+
+static PyObject *
+_codecs__normalize_encoding_impl(PyObject *module, PyObject *encoding)
+/*[clinic end generated code: output=d27465d81e361f8e input=3ff3f4d64995b988]*/
+{
+    /* Normalize the encoding name held by the str object *encoding*, without
+       converting it to lower case, and return the result as a new str.
+       Return NULL with an exception set on failure. */
+    Py_ssize_t len;
+    const char *cstr = PyUnicode_AsUTF8AndSize(encoding, &len);
+    if (cstr == NULL) {
+        return NULL;
+    }
+    /* NOTE(review): _Py_normalize_encoding() is given no input length, so it
+       presumably stops at the first NUL byte; anything following an embedded
+       null character in *encoding* would then be ignored -- confirm that
+       this is the intended behavior. */
+
+    /* The output buffer needs len + 1 bytes (room for the trailing NUL).
+       len is a Py_ssize_t, so only len == PY_SSIZE_T_MAX could make that
+       sum overflow. */
+    if (len >= PY_SSIZE_T_MAX) {
+        PyErr_SetString(PyExc_OverflowError, "encoding is too large");
+        return NULL;
+    }
+    size_t size = (size_t)len + 1;
+
+    char *normalized = PyMem_Malloc(size);
+    if (normalized == NULL) {
+        return PyErr_NoMemory();
+    }
+
+    if (!_Py_normalize_encoding(cstr, normalized, size, 0)) {
+        /* The helper signals failure by returning 0 without raising, so an
+           exception must be set here before returning NULL. */
+        PyMem_Free(normalized);
+        PyErr_SetString(PyExc_RuntimeError, "failed to normalize encoding");
+        return NULL;
+    }
+
+    PyObject *result = PyUnicode_FromString(normalized);
+    PyMem_Free(normalized);
+    return result;
+}
+
/* --- Module API --------------------------------------------------------- */
static PyMethodDef _codecs_functions[] = {
_CODECS_REGISTER_ERROR_METHODDEF
_CODECS__UNREGISTER_ERROR_METHODDEF
_CODECS_LOOKUP_ERROR_METHODDEF
+ _CODECS__NORMALIZE_ENCODING_METHODDEF
{NULL, NULL} /* sentinel */
};
return return_value;
}
+PyDoc_STRVAR(_codecs__normalize_encoding__doc__,
+"_normalize_encoding($module, /, encoding)\n"
+"--\n"
+"\n"
+"Normalize an encoding name *encoding*.\n"
+"\n"
+"Used for encodings.normalize_encoding. Does not convert to lower case.");
+
+#define _CODECS__NORMALIZE_ENCODING_METHODDEF \
+ {"_normalize_encoding", _PyCFunction_CAST(_codecs__normalize_encoding), METH_FASTCALL|METH_KEYWORDS, _codecs__normalize_encoding__doc__},
+
+static PyObject *
+_codecs__normalize_encoding_impl(PyObject *module, PyObject *encoding);
+
+static PyObject *
+_codecs__normalize_encoding(PyObject *module, PyObject *const *args, Py_ssize_t nargs, PyObject *kwnames)
+{ /* Argument-parsing wrapper: extracts the single 'encoding' argument, checks that it is a str, and forwards it to _codecs__normalize_encoding_impl(). */
+ PyObject *return_value = NULL; /* stays NULL on every error path below */
+ #if defined(Py_BUILD_CORE) && !defined(Py_BUILD_CORE_MODULE)
+
+ #define NUM_KEYWORDS 1
+ static struct {
+ PyGC_Head _this_is_not_used;
+ PyObject_VAR_HEAD
+ Py_hash_t ob_hash;
+ PyObject *ob_item[NUM_KEYWORDS];
+ } _kwtuple = {
+ .ob_base = PyVarObject_HEAD_INIT(&PyTuple_Type, NUM_KEYWORDS)
+ .ob_hash = -1,
+ .ob_item = { &_Py_ID(encoding), },
+ };
+ #undef NUM_KEYWORDS
+ #define KWTUPLE (&_kwtuple.ob_base.ob_base)
+
+ #else // !Py_BUILD_CORE
+ # define KWTUPLE NULL
+ #endif // !Py_BUILD_CORE
+
+ static const char * const _keywords[] = {"encoding", NULL}; /* the only parameter name */
+ static _PyArg_Parser _parser = {
+ .keywords = _keywords,
+ .fname = "_normalize_encoding",
+ .kwtuple = KWTUPLE,
+ };
+ #undef KWTUPLE
+ PyObject *argsbuf[1];
+ PyObject *encoding;
+
+ args = _PyArg_UnpackKeywords(args, nargs, NULL, kwnames, &_parser,
+ /*minpos*/ 1, /*maxpos*/ 1, /*minkw*/ 0, /*varpos*/ 0, argsbuf);
+ if (!args) { /* unpacking failed: return NULL */
+ goto exit;
+ }
+ if (!PyUnicode_Check(args[0])) { /* reject arguments that are not str */
+ _PyArg_BadArgument("_normalize_encoding", "argument 'encoding'", "str", args[0]);
+ goto exit;
+ }
+ encoding = args[0];
+ return_value = _codecs__normalize_encoding_impl(module, encoding); /* delegate to the implementation function */
+
+exit:
+ return return_value;
+}
+
#ifndef _CODECS_MBCS_DECODE_METHODDEF
#define _CODECS_MBCS_DECODE_METHODDEF
#endif /* !defined(_CODECS_MBCS_DECODE_METHODDEF) */
#ifndef _CODECS_CODE_PAGE_ENCODE_METHODDEF
#define _CODECS_CODE_PAGE_ENCODE_METHODDEF
#endif /* !defined(_CODECS_CODE_PAGE_ENCODE_METHODDEF) */
-/*[clinic end generated code: output=ed13f20dfb09e306 input=a9049054013a1b77]*/
+/*[clinic end generated code: output=a968c493bb28be3e input=a9049054013a1b77]*/
return v;
}
-/* Normalize an encoding name: similar to encodings.normalize_encoding(), but
- also convert to lowercase. Return 1 on success, or 0 on error (encoding is
- longer than lower_len-1). */
+/* Normalize an encoding name like encodings.normalize_encoding(),
+   additionally converting it to lowercase if *to_lower* is true.
+   Return 1 on success, or 0 on error (encoding is longer than lower_len-1). */
int
_Py_normalize_encoding(const char *encoding,
char *lower,
- size_t lower_len)
+ size_t lower_len,
+ int to_lower)
{
const char *e;
char *l;
if (l == l_end) {
return 0;
}
- *l++ = Py_TOLOWER(c);
+ *l++ = to_lower ? Py_TOLOWER(c) : c;
}
else {
punct = 1;
}
/* Shortcuts for common default encodings */
- if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) {
+ if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower), 1)) {
char *lower = buflower;
/* Fast paths */
}
/* Shortcuts for common default encodings */
- if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) {
+ if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower), 1)) {
char *lower = buflower;
/* Fast paths */
#define USE_FORCE_ASCII
-extern int _Py_normalize_encoding(const char *, char *, size_t);
+extern int _Py_normalize_encoding(const char *, char *, size_t, int);
/* Workaround FreeBSD and OpenIndiana locale encoding issue with the C locale
and POSIX locale. nl_langinfo(CODESET) announces an alias of the
}
char encoding[20]; /* longest name: "iso_646.irv_1991\0" */
- if (!_Py_normalize_encoding(codeset, encoding, sizeof(encoding))) {
+ if (!_Py_normalize_encoding(codeset, encoding, sizeof(encoding), 1)) {
goto error;
}