Issue #15866: The xmlcharrefreplace error handler no longer produces two XML entities for a non-BMP character on narrow build.

author Serhiy Storchaka <storchaka@gmail.com>

Tue, 6 Aug 2013 13:56:26 +0000 (16:56 +0300)

committer Serhiy Storchaka <storchaka@gmail.com>

Tue, 6 Aug 2013 13:56:26 +0000 (16:56 +0300)
author Serhiy Storchaka <storchaka@gmail.com>
Tue, 6 Aug 2013 13:56:26 +0000 (16:56 +0300)
committer Serhiy Storchaka <storchaka@gmail.com>
Tue, 6 Aug 2013 13:56:26 +0000 (16:56 +0300)
diff --git a/Lib/test/test_codeccallbacks.py b/Lib/test/test_codeccallbacks.py

index 61c2df20c437df798f2c521fbd74c6c4f6ccf3e8..ecaf9970a1ddbb72b3768dd9011828aa67e5ffb2 100644 (file)
--- a/Lib/test/test_codeccallbacks.py
+++ b/Lib/test/test_codeccallbacks.py
@@ -66,15 +66,34 @@ class CodecCallbackTest(unittest.TestCase):
          # replace unencodable characters which numeric character entities.
          # For ascii, latin-1 and charmaps this is completely implemented
          # in C and should be reasonably fast.
-        s = u"\u30b9\u30d1\u30e2 \xe4nd eggs"
+        s = u"\u30b9\u30d1\u30e2 \xe4nd egg\u0161"
          self.assertEqual(
              s.encode("ascii", "xmlcharrefreplace"),
-            "&#12473;&#12497;&#12514; &#228;nd eggs"
+            "&#12473;&#12497;&#12514; &#228;nd egg&#353;"
          )
          self.assertEqual(
              s.encode("latin-1", "xmlcharrefreplace"),
-            "&#12473;&#12497;&#12514; \xe4nd eggs"
+            "&#12473;&#12497;&#12514; \xe4nd egg&#353;"
          )
+        self.assertEqual(
+            s.encode("iso-8859-15", "xmlcharrefreplace"),
+            "&#12473;&#12497;&#12514; \xe4nd egg\xa8"
+        )
+
+    def test_xmlcharrefreplace_with_surrogates(self):
+        tests = [(u'\U0001f49d', '&#128157;'),
+                 (u'\ud83d', '&#55357;'),
+                 (u'\udc9d', '&#56477;'),
+                 (u'\ud83d\udc9d', '&#128157;' if len(u'\U0001f49d') > 1 else
+                                   '&#55357;&#56477;'),
+                ]
+        for encoding in ['ascii', 'latin1', 'iso-8859-15']:
+            for s, exp in tests:
+                self.assertEqual(s.encode(encoding, 'xmlcharrefreplace'),
+                                 exp, msg='%r.encode(%r)' % (s, encoding))
+                self.assertEqual((s+'X').encode(encoding, 'xmlcharrefreplace'),
+                                 exp+'X',
+                                 msg='%r.encode(%r)' % (s + 'X', encoding))
  
      def test_xmlcharnamereplace(self):
          # This time use a named character entity for unencodable
diff --git a/Lib/test/test_unicode.py b/Lib/test/test_unicode.py

index e44fe03d240d281fabc98d4e5d954e7f2d0aff07..666cab87255eaa9ee7cbd6a78a721feba3fc5a0a 100644 (file)
--- a/Lib/test/test_unicode.py
+++ b/Lib/test/test_unicode.py
@@ -1658,6 +1658,18 @@ class UnicodeTest(
          self.assertEqual(unicode_encodedecimal(u"123\u20ac\u0660", "replace"),
                           b'123?0')
  
+    def test_encode_decimal_with_surrogates(self):
+        from _testcapi import unicode_encodedecimal
+        tests = [(u'\U0001f49d', '&#128157;'),
+                 (u'\ud83d', '&#55357;'),
+                 (u'\udc9d', '&#56477;'),
+                 (u'\ud83d\udc9d', '&#128157;' if len(u'\U0001f49d') > 1 else
+                                  '&#55357;&#56477;'),
+                ]
+        for s, exp in tests:
+            self.assertEqual(
+                    unicode_encodedecimal(u"123" + s, "xmlcharrefreplace"),
+                    '123' + exp)
  
  def test_main():
      test_support.run_unittest(__name__)
diff --git a/Misc/NEWS b/Misc/NEWS

index 64668dd4ea71a9584171d470fb652c115fa7468d..af3a94bc78a255ab9d013972aeb8faee842467a0 100644 (file)
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -9,6 +9,9 @@ What's New in Python 2.7.6?
  Core and Builtins
  -----------------
  
+- Issue #15866: The xmlcharrefreplace error handler no longer produces two XML
+  entities for a non-BMP character on narrow build.
+
  - Issue #18184: PyUnicode_FromFormat() and PyUnicode_FromFormatV() now raise
    OverflowError when an argument of %c format is out of range.
  
diff --git a/Modules/_testcapimodule.c b/Modules/_testcapimodule.c

index b0386f0cace985c24959c42261b1d02186428602..4e7d47d8593730f9957e3ac9b7a9c6b0732a7963 100644 (file)
--- a/Modules/_testcapimodule.c
+++ b/Modules/_testcapimodule.c
@@ -1118,7 +1118,7 @@ unicode_encodedecimal(PyObject *self, PyObject *args)
      if (!PyArg_ParseTuple(args, "u#|s", &unicode, &length, &errors))
          return NULL;
  
-    decimal_length = length * 7; /* len('&#8364;') */
+    decimal_length = length * 10; /* len('&#1114111;') */
      decimal = PyBytes_FromStringAndSize(NULL, decimal_length);
      if (decimal == NULL)
          return NULL;
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c

index 64a5ef557c05754d17fe1863c6251ce172d1c2ed..866eb9b0589e2bf0347e1689dd53e896849727a2 100644 (file)
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -547,6 +547,37 @@ PyObject *PyUnicode_FromString(const char *u)
      return PyUnicode_FromStringAndSize(u, size);
  }
  
+/* _Py_UNICODE_NEXT is a private macro used to retrieve the character pointed
+ * by 'ptr', possibly combining surrogate pairs on narrow builds.
+ * 'ptr' and 'end' must be Py_UNICODE*, with 'ptr' pointing at the character
+ * that should be returned and 'end' pointing to the end of the buffer.
+ * ('end' is used on narrow builds to detect a lone surrogate at the
+ * end of the buffer that should be returned unchanged.)
+ * The ptr and end arguments should be side-effect free and ptr must be an lvalue.
+ * The type of the returned char is always Py_UCS4.
+ *
+ * Note: the macro advances ptr to next char, so it might have side-effects
+ *       (especially if used with other macros).
+ */
+
+/* helper macros used by _Py_UNICODE_NEXT */
+#define _Py_UNICODE_IS_HIGH_SURROGATE(ch) (0xD800 <= ch && ch <= 0xDBFF)
+#define _Py_UNICODE_IS_LOW_SURROGATE(ch) (0xDC00 <= ch && ch <= 0xDFFF)
+/* Join two surrogate characters and return a single Py_UCS4 value. */
+#define _Py_UNICODE_JOIN_SURROGATES(high, low)  \
+    (((((Py_UCS4)(high) & 0x03FF) << 10) |      \
+      ((Py_UCS4)(low) & 0x03FF)) + 0x10000)
+
+#ifdef Py_UNICODE_WIDE
+#define _Py_UNICODE_NEXT(ptr, end) *(ptr)++
+#else
+#define _Py_UNICODE_NEXT(ptr, end)                                      \
+     (((_Py_UNICODE_IS_HIGH_SURROGATE(*(ptr)) && (ptr) < (end)) &&      \
+        _Py_UNICODE_IS_LOW_SURROGATE((ptr)[1])) ?                       \
+       ((ptr) += 2,_Py_UNICODE_JOIN_SURROGATES((ptr)[-2], (ptr)[-1])) : \
+       (Py_UCS4)*(ptr)++)
+#endif
+
  #ifdef HAVE_WCHAR_H
  
  #if (Py_UNICODE_SIZE == 2) && defined(SIZEOF_WCHAR_T) && (SIZEOF_WCHAR_T == 4)
@@ -3642,26 +3673,22 @@ static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
              case 4: /* xmlcharrefreplace */
                  respos = str-PyString_AS_STRING(res);
                  /* determine replacement size (temporarily (mis)uses p) */
-                for (p = collstart, repsize = 0; p < collend; ++p) {
-                    if (*p<10)
+                for (p = collstart, repsize = 0; p < collend;) {
+                    Py_UCS4 ch = _Py_UNICODE_NEXT(p, collend);
+                    if (ch < 10)
                          repsize += 2+1+1;
-                    else if (*p<100)
+                    else if (ch < 100)
                          repsize += 2+2+1;
-                    else if (*p<1000)
+                    else if (ch < 1000)
                          repsize += 2+3+1;
-                    else if (*p<10000)
+                    else if (ch < 10000)
                          repsize += 2+4+1;
-#ifndef Py_UNICODE_WIDE
-                    else
+                    else if (ch < 100000)
                          repsize += 2+5+1;
-#else
-                    else if (*p<100000)
-                        repsize += 2+5+1;
-                    else if (*p<1000000)
+                    else if (ch < 1000000)
                          repsize += 2+6+1;
                      else
                          repsize += 2+7+1;
-#endif
                  }
                  requiredsize = respos+repsize+(endp-collend);
                  if (requiredsize > ressize) {
@@ -3673,8 +3700,9 @@ static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
                      ressize = requiredsize;
                  }
                  /* generate replacement (temporarily (mis)uses p) */
-                for (p = collstart; p < collend; ++p) {
-                    str += sprintf(str, "&#%d;", (int)*p);
+                for (p = collstart; p < collend;) {
+                    Py_UCS4 ch = _Py_UNICODE_NEXT(p, collend);
+                    str += sprintf(str, "&#%d;", (int)ch);
                  }
                  p = collend;
                  break;
@@ -4649,11 +4677,20 @@ int charmap_encoding_error(
          *inpos = collendpos;
          break;
      case 4: /* xmlcharrefreplace */
-        /* generate replacement (temporarily (mis)uses p) */
-        for (collpos = collstartpos; collpos < collendpos; ++collpos) {
+        /* generate replacement */
+        for (collpos = collstartpos; collpos < collendpos;) {
              char buffer[2+29+1+1];
              char *cp;
-            sprintf(buffer, "&#%d;", (int)p[collpos]);
+            Py_UCS4 ch = p[collpos++];
+#ifndef Py_UNICODE_WIDE
+            if ((0xD800 <= ch && ch <= 0xDBFF) &&
+                (collpos < collendpos) &&
+                (0xDC00 <= p[collpos] && p[collpos] <= 0xDFFF)) {
+                ch = ((((ch & 0x03FF) << 10) |
+                       ((Py_UCS4)p[collpos++] & 0x03FF)) + 0x10000);
+            }
+#endif
+            sprintf(buffer, "&#%d;", (int)ch);
              for (cp = buffer; *cp; ++cp) {
                  x = charmapencode_output(*cp, mapping, res, respos);
                  if (x==enc_EXCEPTION)
@@ -5068,10 +5105,11 @@ PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
                  break;
              case 4: /* xmlcharrefreplace */
                  /* generate replacement (temporarily (mis)uses p) */
-                for (p = collstart; p < collend; ++p) {
+                for (p = collstart; p < collend;) {
                      char buffer[2+29+1+1];
                      char *cp;
-                    sprintf(buffer, "&#%d;", (int)*p);
+                    Py_UCS4 ch = _Py_UNICODE_NEXT(p, collend);
+                    sprintf(buffer, "&#%d;", (int)ch);
                      if (charmaptranslate_makespace(&res, &str,
                                                     (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
                          goto onError;
@@ -5222,8 +5260,10 @@ int PyUnicode_EncodeDecimal(Py_UNICODE *s,
              break;
          case 4: /* xmlcharrefreplace */
              /* generate replacement (temporarily (mis)uses p) */
-            for (p = collstart; p < collend; ++p)
-                output += sprintf(output, "&#%d;", (int)*p);
+            for (p = collstart; p < collend;) {
+                Py_UCS4 ch = _Py_UNICODE_NEXT(p, collend);
+                output += sprintf(output, "&#%d;", ch);
+            }
              p = collend;
              break;
          default:
diff --git a/Python/codecs.c b/Python/codecs.c

index 7334eb3e360cdf397d1d02023362240706aa3307..91147a07a32405aa9b62349b5fcd5a74ee6992a6 100644 (file)
--- a/Python/codecs.c
+++ b/Python/codecs.c
@@ -556,6 +556,7 @@ PyObject *PyCodec_XMLCharRefReplaceErrors(PyObject *exc)
          PyObject *res;
          Py_UNICODE *p;
          Py_UNICODE *startp;
+        Py_UNICODE *e;
          Py_UNICODE *outp;
          int ressize;
          if (PyUnicodeEncodeError_GetStart(exc, &start))
@@ -565,26 +566,31 @@ PyObject *PyCodec_XMLCharRefReplaceErrors(PyObject *exc)
          if (!(object = PyUnicodeEncodeError_GetObject(exc)))
              return NULL;
          startp = PyUnicode_AS_UNICODE(object);
-        for (p = startp+start, ressize = 0; p < startp+end; ++p) {
-            if (*p<10)
+        e = startp + end;
+        for (p = startp+start, ressize = 0; p < e;) {
+            Py_UCS4 ch = *p++;
+#ifndef Py_UNICODE_WIDE
+            if ((0xD800 <= ch && ch <= 0xDBFF) &&
+                (p < e) &&
+                (0xDC00 <= *p && *p <= 0xDFFF)) {
+                ch = ((((ch & 0x03FF) << 10) |
+                       ((Py_UCS4)*p++ & 0x03FF)) + 0x10000);
+            }
+#endif
+            if (ch < 10)
                  ressize += 2+1+1;
-            else if (*p<100)
+            else if (ch < 100)
                  ressize += 2+2+1;
-            else if (*p<1000)
+            else if (ch < 1000)
                  ressize += 2+3+1;
-            else if (*p<10000)
+            else if (ch < 10000)
                  ressize += 2+4+1;
-#ifndef Py_UNICODE_WIDE
-            else
-                ressize += 2+5+1;
-#else
-            else if (*p<100000)
+            else if (ch < 100000)
                  ressize += 2+5+1;
-            else if (*p<1000000)
+            else if (ch < 1000000)
                  ressize += 2+6+1;
              else
                  ressize += 2+7+1;
-#endif
          }
          /* allocate replacement */
          res = PyUnicode_FromUnicode(NULL, ressize);
@@ -593,40 +599,41 @@ PyObject *PyCodec_XMLCharRefReplaceErrors(PyObject *exc)
              return NULL;
          }
          /* generate replacement */
-        for (p = startp+start, outp = PyUnicode_AS_UNICODE(res);
-            p < startp+end; ++p) {
-            Py_UNICODE c = *p;
+        for (p = startp+start, outp = PyUnicode_AS_UNICODE(res); p < e;) {
              int digits;
              int base;
+            Py_UCS4 ch = *p++;
+#ifndef Py_UNICODE_WIDE
+            if ((0xD800 <= ch && ch <= 0xDBFF) &&
+                (p < startp+end) &&
+                (0xDC00 <= *p && *p <= 0xDFFF)) {
+                ch = ((((ch & 0x03FF) << 10) |
+                       ((Py_UCS4)*p++ & 0x03FF)) + 0x10000);
+            }
+#endif
              *outp++ = '&';
              *outp++ = '#';
-            if (*p<10) {
+            if (ch < 10) {
                  digits = 1;
                  base = 1;
              }
-            else if (*p<100) {
+            else if (ch < 100) {
                  digits = 2;
                  base = 10;
              }
-            else if (*p<1000) {
+            else if (ch < 1000) {
                  digits = 3;
                  base = 100;
              }
-            else if (*p<10000) {
+            else if (ch < 10000) {
                  digits = 4;
                  base = 1000;
              }
-#ifndef Py_UNICODE_WIDE
-            else {
-                digits = 5;
-                base = 10000;
-            }
-#else
-            else if (*p<100000) {
+            else if (ch < 100000) {
                  digits = 5;
                  base = 10000;
              }
-            else if (*p<1000000) {
+            else if (ch < 1000000) {
                  digits = 6;
                  base = 100000;
              }
@@ -634,10 +641,9 @@ PyObject *PyCodec_XMLCharRefReplaceErrors(PyObject *exc)
                  digits = 7;
                  base = 1000000;
              }
-#endif
              while (digits-->0) {
-                *outp++ = '0' + c/base;
-                c %= base;
+                *outp++ = '0' + ch/base;
+                ch %= base;
                  base /= 10;
              }
              *outp++ = ';';
author	Serhiy Storchaka <storchaka@gmail.com>
	Tue, 6 Aug 2013 13:56:26 +0000 (16:56 +0300)
committer	Serhiy Storchaka <storchaka@gmail.com>
	Tue, 6 Aug 2013 13:56:26 +0000 (16:56 +0300)
Lib/test/test_codeccallbacks.py		patch \| blob \| blame \| history
Lib/test/test_unicode.py		patch \| blob \| blame \| history
Misc/NEWS		patch \| blob \| blame \| history
Modules/_testcapimodule.c		patch \| blob \| blame \| history
Objects/unicodeobject.c		patch \| blob \| blame \| history
Python/codecs.c		patch \| blob \| blame \| history