}
+/*
+ * Return the number of decimal digits (between 1 and 7) needed to write the
+ * Unicode codepoint 'ch' in base 10.
+ *
+ * The value is found by comparing 'ch' against successive powers of ten:
+ * values below 10 need one digit, values below 100 need two, and so on up
+ * to values below 10000000, which need seven.
+ *
+ * By design, Unicode codepoints are limited to 7 decimal digits, so a larger
+ * value is treated as impossible: it reaches Py_UNREACHABLE() instead of
+ * returning a count.
+ */
+static inline int
+n_decimal_digits_for_codepoint(Py_UCS4 ch)
+{
+ if (ch < 10) return 1;
+ if (ch < 100) return 2;
+ if (ch < 1000) return 3;
+ if (ch < 10000) return 4;
+ if (ch < 100000) return 5;
+ if (ch < 1000000) return 6;
+ if (ch < 10000000) return 7;
+ // Not reached for valid input: Unicode codepoints are limited to 1114111 (7 decimal digits)
+ Py_UNREACHABLE();
+}
+
+
/*
* Create a Unicode string containing 'count' copies of the official
* Unicode REPLACEMENT CHARACTER (0xFFFD).
}
}
+
+// --- handler: 'xmlcharrefreplace' -------------------------------------------
+
PyObject *PyCodec_XMLCharRefReplaceErrors(PyObject *exc)
{
- if (!PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
+ if (!_PyIsUnicodeEncodeError(exc)) {
wrong_exception_type(exc);
return NULL;
}
Py_ssize_t ressize = 0;
for (Py_ssize_t i = start; i < end; ++i) {
- /* object is guaranteed to be "ready" */
Py_UCS4 ch = PyUnicode_READ_CHAR(obj, i);
- if (ch < 10) {
- ressize += 2 + 1 + 1;
- }
- else if (ch < 100) {
- ressize += 2 + 2 + 1;
- }
- else if (ch < 1000) {
- ressize += 2 + 3 + 1;
- }
- else if (ch < 10000) {
- ressize += 2 + 4 + 1;
- }
- else if (ch < 100000) {
- ressize += 2 + 5 + 1;
- }
- else if (ch < 1000000) {
- ressize += 2 + 6 + 1;
- }
- else {
- assert(ch < 10000000);
- ressize += 2 + 7 + 1;
- }
+ int k = n_decimal_digits_for_codepoint(ch);
+ assert(k != 0);
+ assert(k <= 7);
+ ressize += 2 + k + 1;
}
/* allocate replacement */
Py_UCS1 *outp = PyUnicode_1BYTE_DATA(res);
/* generate replacement */
for (Py_ssize_t i = start; i < end; ++i) {
- int digits, base;
Py_UCS4 ch = PyUnicode_READ_CHAR(obj, i);
- if (ch < 10) {
- digits = 1;
- base = 1;
- }
- else if (ch < 100) {
- digits = 2;
- base = 10;
- }
- else if (ch < 1000) {
- digits = 3;
- base = 100;
- }
- else if (ch < 10000) {
- digits = 4;
- base = 1000;
- }
- else if (ch < 100000) {
- digits = 5;
- base = 10000;
- }
- else if (ch < 1000000) {
- digits = 6;
- base = 100000;
- }
- else {
- assert(ch < 10000000);
- digits = 7;
- base = 1000000;
- }
+ /*
+ * Write the decimal representation of 'ch' to the buffer pointed to by
+ * 'outp' using at most 7 digits, prefixed by '&#' and suffixed by ';'.
+ */
*outp++ = '&';
*outp++ = '#';
- while (digits-- > 0) {
- assert(base >= 1);
- *outp++ = '0' + ch / base;
- ch %= base;
- base /= 10;
+ Py_UCS1 *digit_end = outp + n_decimal_digits_for_codepoint(ch);
+ for (Py_UCS1 *p_digit = digit_end - 1; p_digit >= outp; --p_digit) {
+ *p_digit = '0' + (ch % 10);
+ ch /= 10;
}
+ assert(ch == 0);
+ outp = digit_end;
*outp++ = ';';
}
assert(_PyUnicode_CheckConsistency(res, 1));
}
-static PyObject *xmlcharrefreplace_errors(PyObject *self, PyObject *exc)
+static inline PyObject *
+xmlcharrefreplace_errors(PyObject *Py_UNUSED(self), PyObject *exc) /* thin wrapper: 'self' is unused, 'exc' is forwarded unchanged and the callee's result is returned as-is */
{
return PyCodec_XMLCharRefReplaceErrors(exc);
}