Backport checkin:

author Walter Dörwald <walter@livinglogic.de>

Tue, 30 Aug 2005 10:46:06 +0000 (10:46 +0000)

committer Walter Dörwald <walter@livinglogic.de>

Tue, 30 Aug 2005 10:46:06 +0000 (10:46 +0000)
author Walter Dörwald <walter@livinglogic.de>
Tue, 30 Aug 2005 10:46:06 +0000 (10:46 +0000)
committer Walter Dörwald <walter@livinglogic.de>
Tue, 30 Aug 2005 10:46:06 +0000 (10:46 +0000)
diff --git a/Include/unicodeobject.h b/Include/unicodeobject.h

index 6738cbd6792daa20caf076aaff12ee7fb6053ba0..b5341875e6fc617965d1b9e80bbb5399182fcfbd 100644 (file)
--- a/Include/unicodeobject.h
+++ b/Include/unicodeobject.h
@@ -797,6 +797,16 @@ PyAPI_FUNC(PyObject*) PyUnicode_EncodeRawUnicodeEscape(
      int length                 /* Number of Py_UNICODE chars to encode */
      );
  
+/* --- Unicode Internal Codec ---------------------------------------------
+
+    Only for internal use in _codecsmodule.c */
+
+PyObject *_PyUnicode_DecodeUnicodeInternal(
+    const char *string,         /* Raw Py_UNICODE storage to decode */
+    int length,                 /* Size of string in bytes (not Py_UNICODE units) */
+    const char *errors          /* Error handler name */
+    );
+
  /* --- Latin-1 Codecs ----------------------------------------------------- 
  
     Note: Latin-1 corresponds to the first 256 Unicode ordinals.
diff --git a/Lib/test/test_codeccallbacks.py b/Lib/test/test_codeccallbacks.py

index 8f0d59046555beb84ee220586a9d835f22b32ecf..f8e59cd6e3cce45cdaea7b094591cb56fc34d033 100644 (file)
--- a/Lib/test/test_codeccallbacks.py
+++ b/Lib/test/test_codeccallbacks.py
@@ -111,7 +111,7 @@ class CodecCallbackTest(unittest.TestCase):
              sout += "\\U%08x" % sys.maxunicode
          self.assertEqual(sin.encode("iso-8859-15", "backslashreplace"), sout)
  
-    def test_relaxedutf8(self):
+    def test_decoderelaxedutf8(self):
          # This is the test for a decoding callback handler,
          # that relaxes the UTF-8 minimal encoding restriction.
          # A null byte that is encoded as "\xc0\x80" will be
@@ -158,6 +158,35 @@ class CodecCallbackTest(unittest.TestCase):
          charmap[ord("?")] = u"XYZ"
          self.assertRaises(TypeError, codecs.charmap_encode, sin, "replace", charmap)
  
+    def test_decodeunicodeinternal(self):
+        self.assertRaises(  # five zero bytes, default error handling
+            UnicodeDecodeError,
+            "\x00\x00\x00\x00\x00".decode,
+            "unicode-internal",
+        )
+        if sys.maxunicode > 0xffff:  # the expected results below are only checked on wide builds
+            def handler_unicodeinternal(exc):
+                if not isinstance(exc, UnicodeDecodeError):
+                    raise TypeError("don't know how to handle %r" % exc)
+                return (u"\x01", 1)  # (replacement text, input position to resume at)
+            # "ignore": only the complete first unit survives.
+            self.assertEqual(
+                "\x00\x00\x00\x00\x00".decode("unicode-internal", "ignore"),
+                u"\u0000"
+            )
+            # "replace": the stray trailing byte becomes U+FFFD.
+            self.assertEqual(
+                "\x00\x00\x00\x00\x00".decode("unicode-internal", "replace"),
+                u"\u0000\ufffd"
+            )
+            # Custom handler: U+0001 is inserted, then decoding restarts at offset 1.
+            codecs.register_error("test.hui", handler_unicodeinternal)
+
+            self.assertEqual(
+                "\x00\x00\x00\x00\x00".decode("unicode-internal", "test.hui"),
+                u"\u0000\u0001\u0000"
+            )
+
      def test_callbacks(self):
          def handler1(exc):
              if not isinstance(exc, UnicodeEncodeError) \
@@ -503,7 +532,8 @@ class CodecCallbackTest(unittest.TestCase):
              for (enc, bytes) in (
                  ("ascii", "\xff"),
                  ("utf-8", "\xff"),
-                ("utf-7", "+x-")
+                ("utf-7", "+x-"),
+                ("unicode-internal", "\x00"),
              ):
                  self.assertRaises(
                      TypeError,
diff --git a/Lib/test/test_codecs.py b/Lib/test/test_codecs.py

index 96ed5b82523b1b16356d7c0a7cea94c3fe72fbb6..44d7896948bb67a690ff9fbaabae9b4b64b500df 100644 (file)
--- a/Lib/test/test_codecs.py
+++ b/Lib/test/test_codecs.py
@@ -1,7 +1,7 @@
  from test import test_support
  import unittest
  import codecs
-import StringIO
+import sys, StringIO
  
  class Queue(object):
      """
@@ -455,6 +455,54 @@ class PunycodeTest(unittest.TestCase):
          for uni, puny in punycode_testcases:
              self.assertEquals(uni, puny.decode("punycode"))
  
+class UnicodeInternalTest(unittest.TestCase):  # decoder tests for the unicode_internal codec
+    def test_bug1251300(self):
+        # Regression test: decoding with unicode_internal used to not
+        # correctly handle "code points" above 0x10ffff on UCS-4 builds.
+        if sys.maxunicode > 0xffff:
+            ok = [  # (big-endian bytes, expected text)
+                ("\x00\x10\xff\xff", u"\U0010ffff"),
+                ("\x00\x00\x01\x01", u"\U00000101"),
+                ("", u""),
+            ]
+            not_ok = [  # big-endian byte strings that must fail to decode
+                "\x7f\xff\xff\xff",
+                "\x80\x00\x00\x00",
+                "\x81\x00\x00\x00",
+                "\x00",  # shorter than one code unit
+                "\x00\x00\x00\x00\x00",  # one full unit plus a stray byte
+            ]
+            for internal, uni in ok:
+                if sys.byteorder == "little":  # literals above are big-endian
+                    internal = "".join(reversed(internal))
+                self.assertEquals(uni, internal.decode("unicode_internal"))
+            for internal in not_ok:
+                if sys.byteorder == "little":
+                    internal = "".join(reversed(internal))
+                self.assertRaises(UnicodeDecodeError, internal.decode,
+                    "unicode_internal")
+    # The exception must name the codec, carry the input and span bytes 4..8.
+    def test_decode_error_attributes(self):
+        if sys.maxunicode > 0xffff:
+            try:
+                "\x00\x00\x00\x00\x00\x11\x11\x00".decode("unicode_internal")
+            except UnicodeDecodeError, ex:
+                self.assertEquals("unicode_internal", ex.encoding)
+                self.assertEquals("\x00\x00\x00\x00\x00\x11\x11\x00", ex.object)
+                self.assertEquals(4, ex.start)
+                self.assertEquals(8, ex.end)
+            else:
+                self.fail()
+    # codecs.ignore_errors registered under a custom name must drop the bad unit.
+    def test_decode_callback(self):
+        if sys.maxunicode > 0xffff:
+            codecs.register_error("UnicodeInternalTest", codecs.ignore_errors)
+            decoder = codecs.getdecoder("unicode_internal")
+            ab = u"ab".encode("unicode_internal")
+            ignored = decoder("%s\x22\x22\x22\x22%s" % (ab[:4], ab[4:]),
+                "UnicodeInternalTest")
+            self.assertEquals((u"ab", 12), ignored)  # 12 = 4 + 4 + 4 input bytes
+
  # From http://www.gnu.org/software/libidn/draft-josefsson-idn-test-vectors.html
  nameprep_tests = [
      # 3.1 Map to nothing.
@@ -696,6 +744,7 @@ def test_main():
          EscapeDecodeTest,
          RecodingTest,
          PunycodeTest,
+        UnicodeInternalTest,
          NameprepTest,
          CodecTest,
          CodecsModuleTest,
diff --git a/Misc/NEWS b/Misc/NEWS

index 9d22ee55167e1298799da7a592dfa0b5f667b141..b265e33896dde185cd9cf2dbe7efb2aa9068f843 100644 (file)
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -137,6 +137,10 @@ Library
    line ending. Remove the special handling of a "\r\n" that has been split
    between two lines.
  
+- Bug #1251300: On UCS-4 builds the "unicode-internal" codec will now complain
+  about illegal code points. The codec now supports PEP 293 style error
+  handlers.
+
  
  Build
  -----
diff --git a/Modules/_codecsmodule.c b/Modules/_codecsmodule.c

index a6c42b134bd55a773a8ff2edd394c4da1a47be7b..3441f6195856ebab14636bd73d3f0e70149fdf8e 100644 (file)
--- a/Modules/_codecsmodule.c
+++ b/Modules/_codecsmodule.c
@@ -254,8 +254,8 @@ unicode_internal_decode(PyObject *self,
      else {
         if (PyObject_AsReadBuffer(obj, (const void **)&data, &size))
             return NULL;
-       return codec_tuple(PyUnicode_FromUnicode((Py_UNICODE *)data,
-                                                size / sizeof(Py_UNICODE)),
+
+       return codec_tuple(_PyUnicode_DecodeUnicodeInternal(data, size, errors),
                            size);
      }
  }
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c

index 5e5dac55a775d3b57efe65cbb92cf5647107ef5e..5d096edee63d4073a7a9395ba63f373330b5f4db 100644 (file)
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -2273,6 +2273,81 @@ PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
                                             PyUnicode_GET_SIZE(unicode));
  }
  
+/* --- Unicode Internal Codec ------------------------------------------- */
+
+/* Decode `size` bytes of raw Py_UNICODE storage at `s` into a new unicode
+   object.  A truncated trailing unit and, on wide builds, values above
+   PyUnicode_GetMax() are handed to the `errors` handler.  Returns NULL on
+   failure. */
+PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s,
+                                          int size,
+                                          const char *errors)
+{
+    const char *starts = s;
+    const char *end = s + size;
+    const char *reason;
+    int startinpos, endinpos, outpos, truncated;
+    Py_UNICODE unimax = PyUnicode_GetMax();
+    PyUnicodeObject *v;
+    Py_UNICODE *p;
+    PyObject *errorHandler = NULL, *exc = NULL;
+
+    v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
+    if (v == NULL)
+        goto onError;
+    if (PyUnicode_GetSize((PyObject *)v) == 0)
+        return (PyObject *)v;
+    p = PyUnicode_AS_UNICODE(v);
+
+    while (s < end) {
+        truncated = (end-s < Py_UNICODE_SIZE);
+        /* Sanity check the raw data, but only read a code unit when all of
+           its bytes are present (never past the end of truncated input). */
+        if (!truncated)
+            *p = *(Py_UNICODE *)s;
+        if (
+            truncated
+            #ifdef Py_UNICODE_WIDE
+            || *p > unimax || *p < 0
+            #endif
+            ) {
+            startinpos = s - starts;
+            if (truncated) {
+                endinpos = end-starts;
+                reason = "truncated input";
+            }
+            else {
+                endinpos = s - starts + Py_UNICODE_SIZE;
+                reason = "illegal code point (> 0x10FFFF)";
+            }
+            outpos = p - PyUnicode_AS_UNICODE(v);
+            if (unicode_decode_call_errorhandler(
+                    errors, &errorHandler,
+                    "unicode_internal", reason,
+                    starts, size, &startinpos, &endinpos, &exc, &s,
+                    (PyObject **)&v, &outpos, &p)) {
+                goto onError;
+            }
+        }
+        else {
+            p++;
+            s += Py_UNICODE_SIZE;
+        }
+    }
+
+    if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))) < 0)
+        goto onError;
+    Py_XDECREF(errorHandler);
+    Py_XDECREF(exc);
+    return (PyObject *)v;
+
+ onError:
+    Py_XDECREF(v);
+    Py_XDECREF(errorHandler);
+    Py_XDECREF(exc);
+    return NULL;
+}
+
  /* --- Latin-1 Codec ------------------------------------------------------ */
  
  PyObject *PyUnicode_DecodeLatin1(const char *s,
author	Walter Dörwald <walter@livinglogic.de>
	Tue, 30 Aug 2005 10:46:06 +0000 (10:46 +0000)
committer	Walter Dörwald <walter@livinglogic.de>
	Tue, 30 Aug 2005 10:46:06 +0000 (10:46 +0000)
Include/unicodeobject.h		patch \| blob \| blame \| history
Lib/test/test_codeccallbacks.py		patch \| blob \| blame \| history
Lib/test/test_codecs.py		patch \| blob \| blame \| history
Misc/NEWS		patch \| blob \| blame \| history
Modules/_codecsmodule.c		patch \| blob \| blame \| history
Objects/unicodeobject.c		patch \| blob \| blame \| history