Issue #19543: Emit deprecation warning for known non-text encodings.

author Serhiy Storchaka <storchaka@gmail.com>

Sun, 31 May 2015 17:21:00 +0000 (20:21 +0300)

committer Serhiy Storchaka <storchaka@gmail.com>

Sun, 31 May 2015 17:21:00 +0000 (20:21 +0300)
author Serhiy Storchaka <storchaka@gmail.com>
Sun, 31 May 2015 17:21:00 +0000 (20:21 +0300)
committer Serhiy Storchaka <storchaka@gmail.com>
Sun, 31 May 2015 17:21:00 +0000 (20:21 +0300)
diff --git a/Include/codecs.h b/Include/codecs.h

index c038c6a92c698cfbd7fdf297ac7970ea43454eca..8a9041badf986bbd986ac34a4b7bbf306f7bbc02 100644 (file)
--- a/Include/codecs.h
+++ b/Include/codecs.h
@@ -81,6 +81,51 @@ PyAPI_FUNC(PyObject *) PyCodec_Decode(
         const char *errors
         );
  
+/* Text codec specific encoding and decoding API.
+
+   Checks the encoding against a list of codecs which do not
+   implement a unicode<->bytes encoding before attempting the
+   operation.
+
+   Please note that these APIs are internal and should not
+   be used in Python C extensions.
+
+   XXX (ncoghlan): should we make these, or something like them, public
+   in Python 3.5+?
+
+ */
+PyAPI_FUNC(PyObject *) _PyCodec_LookupTextEncoding(
+       const char *encoding,
+       const char *alternate_command
+       );
+
+PyAPI_FUNC(PyObject *) _PyCodec_EncodeText(
+       PyObject *object,
+       const char *encoding,
+       const char *errors
+       );
+
+PyAPI_FUNC(PyObject *) _PyCodec_DecodeText(
+       PyObject *object,
+       const char *encoding,
+       const char *errors
+       );
+
+/* These two aren't actually text encoding specific, but _io.TextIOWrapper
+ * is the only current API consumer.
+ */
+PyAPI_FUNC(PyObject *) _PyCodecInfo_GetIncrementalDecoder(
+       PyObject *codec_info,
+       const char *errors
+       );
+
+PyAPI_FUNC(PyObject *) _PyCodecInfo_GetIncrementalEncoder(
+       PyObject *codec_info,
+       const char *errors
+       );
+
+
+
  /* --- Codec Lookup APIs -------------------------------------------------- 
  
     All APIs return a codec object with incremented refcount and are
diff --git a/Lib/_pyio.py b/Lib/_pyio.py

index a7f4301cc1d69695830a46da35114f53ff673ded..694b778cc5b37db3863424d80399a3906b7f9eb5 100644 (file)
--- a/Lib/_pyio.py
+++ b/Lib/_pyio.py
@@ -7,6 +7,7 @@ from __future__ import (print_function, unicode_literals)
  import os
  import abc
  import codecs
+import sys
  import warnings
  import errno
  # Import thread instead of threading to reduce startup cost
@@ -1497,6 +1498,11 @@ class TextIOWrapper(TextIOBase):
          if not isinstance(encoding, basestring):
              raise ValueError("invalid encoding: %r" % encoding)
  
+        if sys.py3kwarning and not codecs.lookup(encoding)._is_text_encoding:
+            msg = ("%r is not a text encoding; "
+                   "use codecs.open() to handle arbitrary codecs")
+            warnings.warnpy3k(msg % encoding, stacklevel=2)
+
          if errors is None:
              errors = "strict"
          else:
diff --git a/Lib/codecs.py b/Lib/codecs.py

index 049a3f0fd1f3c65d795f5b5176212a2cb4345a39..12213e26f361613bb280c532ea578e0f1a2ec0bc 100644 (file)
--- a/Lib/codecs.py
+++ b/Lib/codecs.py
@@ -79,9 +79,19 @@ BOM64_BE = BOM_UTF32_BE
  ### Codec base classes (defining the API)
  
  class CodecInfo(tuple):
+    """Codec details when looking up the codec registry"""
+
+    # Private API to allow Python to blacklist the known non-Unicode
+    # codecs in the standard library. A more general mechanism to
+    # reliably distinguish text encodings from other codecs will hopefully
+    # be defined for Python 3.5
+    #
+    # See http://bugs.python.org/issue19619
+    _is_text_encoding = True # Assume codecs are text encodings by default
  
      def __new__(cls, encode, decode, streamreader=None, streamwriter=None,
-        incrementalencoder=None, incrementaldecoder=None, name=None):
+        incrementalencoder=None, incrementaldecoder=None, name=None,
+        _is_text_encoding=None):
          self = tuple.__new__(cls, (encode, decode, streamreader, streamwriter))
          self.name = name
          self.encode = encode
@@ -90,6 +100,8 @@ class CodecInfo(tuple):
          self.incrementaldecoder = incrementaldecoder
          self.streamwriter = streamwriter
          self.streamreader = streamreader
+        if _is_text_encoding is not None:
+            self._is_text_encoding = _is_text_encoding
          return self
  
      def __repr__(self):
diff --git a/Lib/encodings/base64_codec.py b/Lib/encodings/base64_codec.py

index f84e7808e99433f8666b986d5abb35d446047065..34ac55542881981c827cb25775f500b42802e19f 100644 (file)
--- a/Lib/encodings/base64_codec.py
+++ b/Lib/encodings/base64_codec.py
@@ -76,4 +76,5 @@ def getregentry():
          incrementaldecoder=IncrementalDecoder,
          streamwriter=StreamWriter,
          streamreader=StreamReader,
+        _is_text_encoding=False,
      )
diff --git a/Lib/encodings/bz2_codec.py b/Lib/encodings/bz2_codec.py

index 054b36b401a66e54ba39c6a96485a9d0f19c9713..136503ac1e5e907270a53fa2e615d61f279b1c90 100644 (file)
--- a/Lib/encodings/bz2_codec.py
+++ b/Lib/encodings/bz2_codec.py
@@ -99,4 +99,5 @@ def getregentry():
          incrementaldecoder=IncrementalDecoder,
          streamwriter=StreamWriter,
          streamreader=StreamReader,
+        _is_text_encoding=False,
      )
diff --git a/Lib/encodings/hex_codec.py b/Lib/encodings/hex_codec.py

index 91b38d952e1f9e638e34b867ec74fa4eb2323e9f..154488cd0ab12378e3ebab2b71a2d269ee38d09c 100644 (file)
--- a/Lib/encodings/hex_codec.py
+++ b/Lib/encodings/hex_codec.py
@@ -76,4 +76,5 @@ def getregentry():
          incrementaldecoder=IncrementalDecoder,
          streamwriter=StreamWriter,
          streamreader=StreamReader,
+        _is_text_encoding=False,
      )
diff --git a/Lib/encodings/quopri_codec.py b/Lib/encodings/quopri_codec.py

index d8683fd56d325415a25116c129ab41ae45126016..f2591496fee82412f60faf8004093a55612a2108 100644 (file)
--- a/Lib/encodings/quopri_codec.py
+++ b/Lib/encodings/quopri_codec.py
@@ -72,4 +72,5 @@ def getregentry():
          incrementaldecoder=IncrementalDecoder,
          streamwriter=StreamWriter,
          streamreader=StreamReader,
+        _is_text_encoding=False,
      )
diff --git a/Lib/encodings/rot_13.py b/Lib/encodings/rot_13.py

index 52b6431cf30d5edcc7454352cce4a8f7fe2581fe..4eaf4338f9c915f36ac0f93111a49c3e739885e2 100755 (executable)
--- a/Lib/encodings/rot_13.py
+++ b/Lib/encodings/rot_13.py
@@ -44,6 +44,7 @@ def getregentry():
          incrementaldecoder=IncrementalDecoder,
          streamwriter=StreamWriter,
          streamreader=StreamReader,
+        _is_text_encoding=False,
      )
  
  ### Decoding Map
diff --git a/Lib/encodings/uu_codec.py b/Lib/encodings/uu_codec.py

index 4b137a5474ed0fc5d21d4904605ccbb229c990e6..5cb0d2b13e07127532dd47d755f0a1d187de917a 100644 (file)
--- a/Lib/encodings/uu_codec.py
+++ b/Lib/encodings/uu_codec.py
@@ -126,4 +126,5 @@ def getregentry():
          incrementaldecoder=IncrementalDecoder,
          streamreader=StreamReader,
          streamwriter=StreamWriter,
+        _is_text_encoding=False,
      )
diff --git a/Lib/encodings/zlib_codec.py b/Lib/encodings/zlib_codec.py

index 3419f9f48f5efbf1be4802f562536ca235e2748e..0c2599d401c2396c086da6342e81e66fe5faf01b 100644 (file)
--- a/Lib/encodings/zlib_codec.py
+++ b/Lib/encodings/zlib_codec.py
@@ -99,4 +99,5 @@ def getregentry():
          incrementaldecoder=IncrementalDecoder,
          streamreader=StreamReader,
          streamwriter=StreamWriter,
+        _is_text_encoding=False,
      )
diff --git a/Lib/json/decoder.py b/Lib/json/decoder.py

index 1b4323839b96b07ee60291dec1d9d6b8e1d4437c..5141f879d956e4715ea002197fc467b6eb640188 100644 (file)
--- a/Lib/json/decoder.py
+++ b/Lib/json/decoder.py
@@ -15,10 +15,8 @@ __all__ = ['JSONDecoder']
  FLAGS = re.VERBOSE | re.MULTILINE | re.DOTALL
  
  def _floatconstants():
-    _BYTES = '7FF80000000000007FF0000000000000'.decode('hex')
-    if sys.byteorder != 'big':
-        _BYTES = _BYTES[:8][::-1] + _BYTES[8:][::-1]
-    nan, inf = struct.unpack('dd', _BYTES)
+    nan, = struct.unpack('>d', b'\x7f\xf8\x00\x00\x00\x00\x00\x00')
+    inf, = struct.unpack('>d', b'\x7f\xf0\x00\x00\x00\x00\x00\x00')
      return nan, inf, -inf
  
  NaN, PosInf, NegInf = _floatconstants()
diff --git a/Lib/test/string_tests.py b/Lib/test/string_tests.py

index 6d87eb695741287eec2854086a16f7a021592a21..b2f837bab657f19ba9f3e6e5a05a9a6f59326093 100644 (file)
--- a/Lib/test/string_tests.py
+++ b/Lib/test/string_tests.py
@@ -1295,8 +1295,10 @@ class MixinStrUserStringTest:
                    ('hex', '68656c6c6f20776f726c64'),
                    ('uu', 'begin 666 <data>\n+:&5L;&\\@=V]R;&0 \n \nend\n')]
          for encoding, data in codecs:
-            self.checkequal(data, 'hello world', 'encode', encoding)
-            self.checkequal('hello world', data, 'decode', encoding)
+            with test_support.check_py3k_warnings():
+                self.checkequal(data, 'hello world', 'encode', encoding)
+            with test_support.check_py3k_warnings():
+                self.checkequal('hello world', data, 'decode', encoding)
          # zlib is optional, so we make the test optional too...
          try:
              import zlib
@@ -1304,8 +1306,10 @@ class MixinStrUserStringTest:
              pass
          else:
              data = 'x\x9c\xcbH\xcd\xc9\xc9W(\xcf/\xcaI\x01\x00\x1a\x0b\x04]'
-            self.checkequal(data, 'hello world', 'encode', 'zlib')
-            self.checkequal('hello world', data, 'decode', 'zlib')
+            with test_support.check_py3k_warnings():
+                self.checkequal(data, 'hello world', 'encode', 'zlib')
+            with test_support.check_py3k_warnings():
+                self.checkequal('hello world', data, 'decode', 'zlib')
  
          self.checkraises(TypeError, 'xyz', 'decode', 42)
          self.checkraises(TypeError, 'xyz', 'encode', 42)
diff --git a/Lib/test/test_calendar.py b/Lib/test/test_calendar.py

index 5692642db183d1d712ed37af22a3f19fce67ca4c..46c4a6fe3b31f2dbe3ff72be83f258b4cb9b7557 100644 (file)
--- a/Lib/test/test_calendar.py
+++ b/Lib/test/test_calendar.py
@@ -513,8 +513,8 @@ class CommandLineTestCase(unittest.TestCase):
      def test_option_encoding(self):
          self.assertFailure('-e')
          self.assertFailure('--encoding')
-        stdout = self.run_ok('--encoding', 'rot-13', '2004')
-        self.assertEqual(stdout.strip(), conv(result_2004_text.encode('rot-13')).strip())
+        stdout = self.run_ok('--encoding', 'utf-16-le', '2004')
+        self.assertEqual(stdout.strip(), conv(result_2004_text.encode('utf-16-le')).strip())
  
      def test_option_locale(self):
          self.assertFailure('-L')
diff --git a/Lib/test/test_codecs.py b/Lib/test/test_codecs.py

index de80b0776c8116e806ee57f653a6bff18fbf2d63..c7072a65bed4a5da75a9c099638e74d68a821c7f 100644 (file)
--- a/Lib/test/test_codecs.py
+++ b/Lib/test/test_codecs.py
@@ -1395,14 +1395,14 @@ class EncodedFileTest(unittest.TestCase):
  class Str2StrTest(unittest.TestCase):
  
      def test_read(self):
-        sin = "\x80".encode("base64_codec")
+        sin = codecs.encode("\x80", "base64_codec")
          reader = codecs.getreader("base64_codec")(StringIO.StringIO(sin))
          sout = reader.read()
          self.assertEqual(sout, "\x80")
          self.assertIsInstance(sout, str)
  
      def test_readline(self):
-        sin = "\x80".encode("base64_codec")
+        sin = codecs.encode("\x80", "base64_codec")
          reader = codecs.getreader("base64_codec")(StringIO.StringIO(sin))
          sout = reader.readline()
          self.assertEqual(sout, "\x80")
@@ -1536,6 +1536,9 @@ broken_unicode_with_streams = [
  ]
  broken_incremental_coders = broken_unicode_with_streams[:]
  
+if sys.flags.py3k_warning:
+    broken_unicode_with_streams.append("rot_13")
+
  # The following encodings only support "strict" mode
  only_strict_mode = [
      "idna",
@@ -2135,6 +2138,47 @@ def test_main():
          # Missing "begin" line
          self.assertRaises(ValueError, codecs.decode, "", "uu-codec")
  
+    def test_text_to_binary_blacklists_binary_transforms(self):
+        # Check binary -> binary codecs give a good error for str input
+        bad_input = "bad input type"
+        for encoding in bytes_transform_encodings:
+            fmt = (r"{!r} is not a text encoding; "
+                   r"use codecs.encode\(\) to handle arbitrary codecs")
+            msg = fmt.format(encoding)
+            with self.assertRaisesRegex(LookupError, msg) as failure:
+                bad_input.encode(encoding)
+            self.assertIsNone(failure.exception.__cause__)
+
+    def test_text_to_binary_blacklists_text_transforms(self):
+        # Check str.encode gives a good error message for str -> str codecs
+        msg = (r"^'rot_13' is not a text encoding; "
+               r"use codecs.encode\(\) to handle arbitrary codecs")
+        with self.assertRaisesRegex(LookupError, msg):
+            "just an example message".encode("rot_13")
+
+    def test_binary_to_text_blacklists_binary_transforms(self):
+        # Check bytes.decode and bytearray.decode give a good error
+        # message for binary -> binary codecs
+        data = b"encode first to ensure we meet any format restrictions"
+        for encoding in bytes_transform_encodings:
+            encoded_data = codecs.encode(data, encoding)
+            fmt = (r"{!r} is not a text encoding; "
+                   r"use codecs.decode\(\) to handle arbitrary codecs")
+            msg = fmt.format(encoding)
+            with self.assertRaisesRegex(LookupError, msg):
+                encoded_data.decode(encoding)
+            with self.assertRaisesRegex(LookupError, msg):
+                bytearray(encoded_data).decode(encoding)
+
+    def test_binary_to_text_blacklists_text_transforms(self):
+        # Check str -> str codec gives a good error for binary input
+        for bad_input in (b"immutable", bytearray(b"mutable")):
+            msg = (r"^'rot_13' is not a text encoding; "
+                   r"use codecs.decode\(\) to handle arbitrary codecs")
+            with self.assertRaisesRegex(LookupError, msg) as failure:
+                bad_input.decode("rot_13")
+            self.assertIsNone(failure.exception.__cause__)
+
  
  if __name__ == "__main__":
      test_main()
diff --git a/Lib/test/test_fileinput.py b/Lib/test/test_fileinput.py

index c15ad847bbc3fb0083157499548cee78268b8023..facc56e24f66e4fcb5b337fa6464fe532a2c30a4 100644 (file)
--- a/Lib/test/test_fileinput.py
+++ b/Lib/test/test_fileinput.py
@@ -211,10 +211,11 @@ class FileInputTests(unittest.TestCase):
          except ValueError:
              pass
          try:
-            t1 = writeTmp(1, ["A\nB"], mode="wb")
-            fi = FileInput(files=t1, openhook=hook_encoded("rot13"))
+            # UTF-7 is a convenient, seldom used encoding
+            t1 = writeTmp(1, ['+AEE-\n+AEI-'], mode="wb")
+            fi = FileInput(files=t1, openhook=hook_encoded("utf-7"))
              lines = list(fi)
-            self.assertEqual(lines, ["N\n", "O"])
+            self.assertEqual(lines, [u'A\n', u'B'])
          finally:
              remove_tempfiles(t1)
  
diff --git a/Lib/test/test_io.py b/Lib/test/test_io.py

index bbc804b6a58a3aa76a96d52c6b1b305ae359c92d..1a17d814ee248d00bbed1fae3be3c1096e8e0ef1 100644 (file)
--- a/Lib/test/test_io.py
+++ b/Lib/test/test_io.py
@@ -2001,6 +2001,15 @@ class TextIOWrapperTest(unittest.TestCase):
          t.__init__(self.MockRawIO())
          self.assertEqual(t.read(0), u'')
  
+    def test_non_text_encoding_codecs_are_rejected(self):
+        # Ensure the constructor complains if passed a codec that isn't
+        # marked as a text encoding
+        # http://bugs.python.org/issue20404
+        r = self.BytesIO()
+        b = self.BufferedWriter(r)
+        with support.check_py3k_warnings():
+            self.TextIOWrapper(b, encoding="hex_codec")
+
      def test_detach(self):
          r = self.BytesIO()
          b = self.BufferedWriter(r)
@@ -2617,19 +2626,39 @@ class TextIOWrapperTest(unittest.TestCase):
  
      def test_illegal_decoder(self):
          # Issue #17106
+        # Bypass the early encoding check added in issue 20404
+        def _make_illegal_wrapper():
+            quopri = codecs.lookup("quopri_codec")
+            quopri._is_text_encoding = True
+            try:
+                t = self.TextIOWrapper(self.BytesIO(b'aaaaaa'),
+                                       newline='\n', encoding="quopri_codec")
+            finally:
+                quopri._is_text_encoding = False
+            return t
          # Crash when decoder returns non-string
-        t = self.TextIOWrapper(self.BytesIO(b'aaaaaa'), newline='\n',
-                               encoding='quopri_codec')
+        with support.check_py3k_warnings():
+            t = self.TextIOWrapper(self.BytesIO(b'aaaaaa'), newline='\n',
+                                   encoding='quopri_codec')
          with self.maybeRaises(TypeError):
              t.read(1)
-        t = self.TextIOWrapper(self.BytesIO(b'aaaaaa'), newline='\n',
-                               encoding='quopri_codec')
+        with support.check_py3k_warnings():
+            t = self.TextIOWrapper(self.BytesIO(b'aaaaaa'), newline='\n',
+                                   encoding='quopri_codec')
          with self.maybeRaises(TypeError):
              t.readline()
-        t = self.TextIOWrapper(self.BytesIO(b'aaaaaa'), newline='\n',
-                               encoding='quopri_codec')
+        with support.check_py3k_warnings():
+            t = self.TextIOWrapper(self.BytesIO(b'aaaaaa'), newline='\n',
+                                   encoding='quopri_codec')
          with self.maybeRaises(TypeError):
              t.read()
+        #else:
+            #t = _make_illegal_wrapper()
+            #self.assertRaises(TypeError, t.read, 1)
+            #t = _make_illegal_wrapper()
+            #self.assertRaises(TypeError, t.readline)
+            #t = _make_illegal_wrapper()
+            #self.assertRaises(TypeError, t.read)
  
  
  class CTextIOWrapperTest(TextIOWrapperTest):
@@ -3002,9 +3031,11 @@ class MiscIOTest(unittest.TestCase):
  
  class CMiscIOTest(MiscIOTest):
      io = io
+    shutdown_error = "RuntimeError: could not find io module state"
  
  class PyMiscIOTest(MiscIOTest):
      io = pyio
+    shutdown_error = "LookupError: unknown encoding: ascii"
  
  
  @unittest.skipIf(os.name == 'nt', 'POSIX signals required for this test.')
diff --git a/Misc/NEWS b/Misc/NEWS

index 4de6d0de4c01c51bbd54865dd9a1a83208583fb8..05cb0cca01539f58b02c846c4dc1ab75e9dc0d2b 100644 (file)
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -10,6 +10,10 @@ What's New in Python 2.7.11?
  Core and Builtins
  -----------------
  
+- Issue #19543: encode() and decode() methods and constructors of str,
+  unicode and bytearray classes now emit a deprecation warning for known
+  non-text encodings when Python is run with the -3 option.
+
  - Issue #24115: Update uses of PyObject_IsTrue(), PyObject_Not(),
    PyObject_IsInstance(), PyObject_RichCompareBool() and _PyDict_Contains()
    to check for and handle errors correctly.
@@ -26,6 +30,10 @@ Core and Builtins
  Library
  -------
  
+- Issue #19543: io.TextIOWrapper (and hence io.open()) now uses the internal
+  codec marking system added to emit a deprecation warning for known non-text
+  encodings at stream construction time when Python is run with the -3 option.
+
  - Issue #24264: Fixed buffer overflow in the imageop module.
  
  - Issue #5633: Fixed timeit when the statement is a string and the setup is not.
diff --git a/Modules/_io/textio.c b/Modules/_io/textio.c

index 8ac8a4acdea6e0eae598e92a88d4bfebf6a97b78..9981d4c3ba32c63d57bfb125f9bba12080e3d155 100644 (file)
--- a/Modules/_io/textio.c
+++ b/Modules/_io/textio.c
@@ -826,7 +826,7 @@ textiowrapper_init(textio *self, PyObject *args, PyObject *kwds)
      char *kwlist[] = {"buffer", "encoding", "errors",
                        "newline", "line_buffering",
                        NULL};
-    PyObject *buffer, *raw;
+    PyObject *buffer, *raw, *codec_info = NULL;
      char *encoding = NULL;
      char *errors = NULL;
      char *newline = NULL;
@@ -909,6 +909,17 @@ textiowrapper_init(textio *self, PyObject *args, PyObject *kwds)
                          "could not determine default encoding");
      }
  
+    /* Check we have been asked for a real text encoding */
+    codec_info = _PyCodec_LookupTextEncoding(encoding, "codecs.open()");
+    if (codec_info == NULL) {
+        Py_CLEAR(self->encoding);
+        goto error;
+    }
+
+    /* XXX: Failures beyond this point have the potential to leak elements
+     * of the partially constructed object (like self->encoding)
+     */
+
      if (errors == NULL)
          errors = "strict";
      self->errors = PyBytes_FromString(errors);
@@ -922,7 +933,7 @@ textiowrapper_init(textio *self, PyObject *args, PyObject *kwds)
      if (newline) {
          self->readnl = PyString_FromString(newline);
          if (self->readnl == NULL)
-            return -1;
+            goto error;
      }
      self->writetranslate = (newline == NULL || newline[0] != '\0');
      if (!self->readuniversal && self->writetranslate) {
@@ -944,8 +955,8 @@ textiowrapper_init(textio *self, PyObject *args, PyObject *kwds)
      if (r == -1)
          goto error;
      if (r == 1) {
-        self->decoder = PyCodec_IncrementalDecoder(
-            encoding, errors);
+        self->decoder = _PyCodecInfo_GetIncrementalDecoder(codec_info,
+                                                           errors);
          if (self->decoder == NULL)
              goto error;
  
@@ -969,17 +980,12 @@ textiowrapper_init(textio *self, PyObject *args, PyObject *kwds)
      if (r == -1)
          goto error;
      if (r == 1) {
-        PyObject *ci;
-        self->encoder = PyCodec_IncrementalEncoder(
-            encoding, errors);
+        self->encoder = _PyCodecInfo_GetIncrementalEncoder(codec_info,
+                                                           errors);
          if (self->encoder == NULL)
              goto error;
          /* Get the normalized named of the codec */
-        ci = _PyCodec_Lookup(encoding);
-        if (ci == NULL)
-            goto error;
-        res = PyObject_GetAttrString(ci, "name");
-        Py_DECREF(ci);
+        res = PyObject_GetAttrString(codec_info, "name");
          if (res == NULL) {
              if (PyErr_ExceptionMatches(PyExc_AttributeError))
                  PyErr_Clear();
@@ -999,6 +1005,9 @@ textiowrapper_init(textio *self, PyObject *args, PyObject *kwds)
          Py_XDECREF(res);
      }
  
+    /* Finished sorting out the codec details */
+    Py_DECREF(codec_info);
+
      self->buffer = buffer;
      Py_INCREF(buffer);
  
@@ -1059,6 +1068,7 @@ textiowrapper_init(textio *self, PyObject *args, PyObject *kwds)
      return 0;
  
    error:
+    Py_XDECREF(codec_info);
      return -1;
  }
  
diff --git a/Objects/bytearrayobject.c b/Objects/bytearrayobject.c

index fd201cafa75ac7da5e10e7a8046d421f23e76202..5f575805d494030c041bbb6e3cce3d115c7e85b4 100644 (file)
--- a/Objects/bytearrayobject.c
+++ b/Objects/bytearrayobject.c
@@ -783,7 +783,7 @@ bytearray_init(PyByteArrayObject *self, PyObject *args, PyObject *kwds)
      if (PyBytes_Check(arg)) {
          PyObject *new, *encoded;
          if (encoding != NULL) {
-            encoded = PyCodec_Encode(arg, encoding, errors);
+            encoded = _PyCodec_EncodeText(arg, encoding, errors);
              if (encoded == NULL)
                  return -1;
              assert(PyBytes_Check(encoded));
@@ -809,7 +809,7 @@ bytearray_init(PyByteArrayObject *self, PyObject *args, PyObject *kwds)
                              "unicode argument without an encoding");
              return -1;
          }
-        encoded = PyCodec_Encode(arg, encoding, errors);
+        encoded = _PyCodec_EncodeText(arg, encoding, errors);
          if (encoded == NULL)
              return -1;
          assert(PyBytes_Check(encoded));
@@ -2567,7 +2567,7 @@ bytearray_decode(PyObject *self, PyObject *args, PyObject *kwargs)
          return NULL;
  #endif
      }
-    return PyCodec_Decode(self, encoding, errors);
+    return _PyCodec_DecodeText(self, encoding, errors);
  }
  
  PyDoc_STRVAR(alloc_doc,
diff --git a/Objects/stringobject.c b/Objects/stringobject.c

index 46f46db0e0fce9ff2c0992422912fe18d1fd2c8a..c1e12a7aaea3fa125e0684893e83666e58a25ead 100644 (file)
--- a/Objects/stringobject.c
+++ b/Objects/stringobject.c
@@ -449,7 +449,7 @@ PyObject *PyString_AsDecodedObject(PyObject *str,
      }
  
      /* Decode via the codec registry */
-    v = PyCodec_Decode(str, encoding, errors);
+    v = _PyCodec_DecodeText(str, encoding, errors);
      if (v == NULL)
          goto onError;
  
@@ -529,7 +529,7 @@ PyObject *PyString_AsEncodedObject(PyObject *str,
      }
  
      /* Encode via the codec registry */
-    v = PyCodec_Encode(str, encoding, errors);
+    v = _PyCodec_EncodeText(str, encoding, errors);
      if (v == NULL)
          goto onError;
  
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c

index 91e75244bf4be33de9640c6de6d9fcb573796cd4..08723ac9b868b11b99fff3ebb04067ac57f0e0a1 100644 (file)
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -1259,7 +1259,7 @@ PyObject *PyUnicode_Decode(const char *s,
      buffer = PyBuffer_FromMemory((void *)s, size);
      if (buffer == NULL)
          goto onError;
-    unicode = PyCodec_Decode(buffer, encoding, errors);
+    unicode = _PyCodec_DecodeText(buffer, encoding, errors);
      if (unicode == NULL)
          goto onError;
      if (!PyUnicode_Check(unicode)) {
@@ -1292,7 +1292,7 @@ PyObject *PyUnicode_AsDecodedObject(PyObject *unicode,
          encoding = PyUnicode_GetDefaultEncoding();
  
      /* Decode via the codec registry */
-    v = PyCodec_Decode(unicode, encoding, errors);
+    v = _PyCodec_DecodeText(unicode, encoding, errors);
      if (v == NULL)
          goto onError;
      return v;
@@ -1331,7 +1331,7 @@ PyObject *PyUnicode_AsEncodedObject(PyObject *unicode,
          encoding = PyUnicode_GetDefaultEncoding();
  
      /* Encode via the codec registry */
-    v = PyCodec_Encode(unicode, encoding, errors);
+    v = _PyCodec_EncodeText(unicode, encoding, errors);
      if (v == NULL)
          goto onError;
      return v;
@@ -1369,7 +1369,7 @@ PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
      }
  
      /* Encode via the codec registry */
-    v = PyCodec_Encode(unicode, encoding, errors);
+    v = _PyCodec_EncodeText(unicode, encoding, errors);
      if (v == NULL)
          goto onError;
      if (!PyString_Check(v)) {
diff --git a/Python/codecs.c b/Python/codecs.c

index 184d1471036aa26203e0347c5fbdd5e875c1e1d3..d672362c4ce49715f85b021c6d98ac0e9034fc41 100644 (file)
--- a/Python/codecs.c
+++ b/Python/codecs.c
@@ -217,20 +217,15 @@ PyObject *codec_getitem(const char *encoding, int index)
      return v;
  }
  
-/* Helper function to create an incremental codec. */
-
+/* Helper functions to create an incremental codec. */
  static
-PyObject *codec_getincrementalcodec(const char *encoding,
-                                    const char *errors,
-                                    const char *attrname)
+PyObject *codec_makeincrementalcodec(PyObject *codec_info,
+                                     const char *errors,
+                                     const char *attrname)
  {
-    PyObject *codecs, *ret, *inccodec;
+    PyObject *ret, *inccodec;
  
-    codecs = _PyCodec_Lookup(encoding);
-    if (codecs == NULL)
-        return NULL;
-    inccodec = PyObject_GetAttrString(codecs, attrname);
-    Py_DECREF(codecs);
+    inccodec = PyObject_GetAttrString(codec_info, attrname);
      if (inccodec == NULL)
          return NULL;
      if (errors)
@@ -241,6 +236,21 @@ PyObject *codec_getincrementalcodec(const char *encoding,
      return ret;
  }
  
+static
+PyObject *codec_getincrementalcodec(const char *encoding,
+                                    const char *errors,
+                                    const char *attrname)
+{
+    PyObject *codec_info, *ret;
+
+    codec_info = _PyCodec_Lookup(encoding);
+    if (codec_info == NULL)
+        return NULL;
+    ret = codec_makeincrementalcodec(codec_info, errors, attrname);
+    Py_DECREF(codec_info);
+    return ret;
+}
+
  /* Helper function to create a stream codec. */
  
  static
@@ -264,6 +274,24 @@ PyObject *codec_getstreamcodec(const char *encoding,
      return streamcodec;
  }
  
+/* Helpers to work with the result of _PyCodec_Lookup
+
+ */
+PyObject *_PyCodecInfo_GetIncrementalDecoder(PyObject *codec_info,
+                                             const char *errors)
+{
+    return codec_makeincrementalcodec(codec_info, errors,
+                                      "incrementaldecoder");
+}
+
+PyObject *_PyCodecInfo_GetIncrementalEncoder(PyObject *codec_info,
+                                             const char *errors)
+{
+    return codec_makeincrementalcodec(codec_info, errors,
+                                      "incrementalencoder");
+}
+
+
  /* Convenience APIs to query the Codec registry.
  
     All APIs return a codec object with incremented refcount.
@@ -311,18 +339,15 @@ PyObject *PyCodec_StreamWriter(const char *encoding,
  
     errors is passed to the encoder factory as argument if non-NULL. */
  
-PyObject *PyCodec_Encode(PyObject *object,
-                         const char *encoding,
-                         const char *errors)
+static PyObject *
+_PyCodec_EncodeInternal(PyObject *object,
+                        PyObject *encoder,
+                        const char *encoding,
+                        const char *errors)
  {
-    PyObject *encoder = NULL;
      PyObject *args = NULL, *result = NULL;
      PyObject *v;
  
-    encoder = PyCodec_Encoder(encoding);
-    if (encoder == NULL)
-        goto onError;
-
      args = args_tuple(object, errors);
      if (args == NULL)
          goto onError;
@@ -358,18 +383,15 @@ PyObject *PyCodec_Encode(PyObject *object,
  
     errors is passed to the decoder factory as argument if non-NULL. */
  
-PyObject *PyCodec_Decode(PyObject *object,
-                         const char *encoding,
-                         const char *errors)
+static PyObject *
+_PyCodec_DecodeInternal(PyObject *object,
+                        PyObject *decoder,
+                        const char *encoding,
+                        const char *errors)
  {
-    PyObject *decoder = NULL;
      PyObject *args = NULL, *result = NULL;
      PyObject *v;
  
-    decoder = PyCodec_Decoder(encoding);
-    if (decoder == NULL)
-        goto onError;
-
      args = args_tuple(object, errors);
      if (args == NULL)
          goto onError;
@@ -399,6 +421,139 @@ PyObject *PyCodec_Decode(PyObject *object,
      return NULL;
  }
  
+/* Generic encoding/decoding API */
+PyObject *PyCodec_Encode(PyObject *object,
+                         const char *encoding,
+                         const char *errors)
+{   /* Generic entry point: fetch the encoder via PyCodec_Encoder, then run the shared helper. */
+    PyObject *encoder;
+
+    encoder = PyCodec_Encoder(encoding);
+    if (encoder == NULL)
+        return NULL;    /* lookup failed; nothing to release yet */
+
+    return _PyCodec_EncodeInternal(object, encoder, encoding, errors);  /* same helper as _PyCodec_EncodeText */
+}
+
+PyObject *PyCodec_Decode(PyObject *object,
+                         const char *encoding,
+                         const char *errors)
+{   /* Generic entry point: fetch the decoder via PyCodec_Decoder, then run the shared helper. */
+    PyObject *decoder;
+
+    decoder = PyCodec_Decoder(encoding);
+    if (decoder == NULL)
+        return NULL;    /* lookup failed; nothing to release yet */
+
+    return _PyCodec_DecodeInternal(object, decoder, encoding, errors);  /* same helper as _PyCodec_DecodeText */
+}
+
+/* Text encoding/decoding API */
+PyObject * _PyCodec_LookupTextEncoding(const char *encoding,
+                                       const char *alternate_command)
+{   /* Look up `encoding`; if Py_Py3kWarningFlag is set, warn when the codec is flagged non-text. */
+    PyObject *codec;
+    PyObject *attr;
+    int is_text_codec;  /* result of PyObject_IsTrue(): negative is treated as an error below */
+
+    codec = _PyCodec_Lookup(encoding);
+    if (codec == NULL)
+        return NULL;
+
+    /* Backwards compatibility: a raw tuple, or any codec object without
+     * the private _is_text_encoding attribute, is assumed to describe a
+     * text encoding.  The check is skipped unless Py_Py3kWarningFlag is set.
+     */
+    if (Py_Py3kWarningFlag && !PyTuple_CheckExact(codec)) {
+        attr = PyObject_GetAttrString(codec, "_is_text_encoding");
+        if (attr == NULL) {
+            if (!PyErr_ExceptionMatches(PyExc_AttributeError))
+                goto onError;   /* any error other than AttributeError is propagated */
+            PyErr_Clear();      /* attribute absent: treated as a text encoding */
+        } else {
+            is_text_codec = PyObject_IsTrue(attr);
+            Py_DECREF(attr);
+            if (is_text_codec < 0)
+                goto onError;   /* the truth test itself failed */
+            if (!is_text_codec) {
+                PyObject *msg = PyString_FromFormat(
+                            "'%.400s' is not a text encoding; "
+                            "use %s to handle arbitrary codecs",
+                            encoding, alternate_command);
+                if (msg == NULL)
+                    goto onError;
+                if (PyErr_WarnPy3k(PyString_AS_STRING(msg), 1) < 0) {
+                    Py_DECREF(msg);
+                    goto onError;   /* negative result from the warning call: fail the lookup */
+                }
+                Py_DECREF(msg);
+            }
+        }
+    }
+
+    /* Text encoding, or a non-text codec for which only a warning was issued */
+    return codec;
+
+ onError:
+    Py_DECREF(codec);   /* release the looked-up codec before reporting failure */
+    return NULL;
+}
+
+
+static
+PyObject *codec_getitem_checked(const char *encoding,
+                                const char *alternate_command,
+                                int index)  /* callers pass 0 (encoder) or 1 (decoder) */
+{   /* Return item `index` of the codec found by _PyCodec_LookupTextEncoding(), or NULL. */
+    PyObject *codec;
+    PyObject *v;
+
+    codec = _PyCodec_LookupTextEncoding(encoding, alternate_command);
+    if (codec == NULL)
+        return NULL;
+
+    v = PyTuple_GET_ITEM(codec, index);  /* unchecked access; assumes codec is a tuple holding `index` - TODO confirm */
+    Py_INCREF(v);       /* take our own reference before the codec is released */
+    Py_DECREF(codec);
+    return v;
+}
+
+static PyObject * _PyCodec_TextEncoder(const char *encoding)
+{   /* Item 0 of the checked codec; "codecs.encode()" fills the %s of the warning text. */
+    return codec_getitem_checked(encoding, "codecs.encode()", 0);
+}
+
+static PyObject * _PyCodec_TextDecoder(const char *encoding)
+{   /* Item 1 of the checked codec; "codecs.decode()" fills the %s of the warning text. */
+    return codec_getitem_checked(encoding, "codecs.decode()", 1);
+}
+
+PyObject *_PyCodec_EncodeText(PyObject *object,
+                              const char *encoding,
+                              const char *errors)
+{   /* Like PyCodec_Encode, but the encoder comes from the text-encoding-checked lookup. */
+    PyObject *encoder;
+
+    encoder = _PyCodec_TextEncoder(encoding);
+    if (encoder == NULL)
+        return NULL;    /* lookup or warning step failed */
+
+    return _PyCodec_EncodeInternal(object, encoder, encoding, errors);  /* same helper as PyCodec_Encode */
+}
+
+PyObject *_PyCodec_DecodeText(PyObject *object,
+                              const char *encoding,
+                              const char *errors)
+{   /* Like PyCodec_Decode, but the decoder comes from the text-encoding-checked lookup. */
+    PyObject *decoder;
+
+    decoder = _PyCodec_TextDecoder(encoding);
+    if (decoder == NULL)
+        return NULL;    /* lookup or warning step failed */
+
+    return _PyCodec_DecodeInternal(object, decoder, encoding, errors);  /* same helper as PyCodec_Decode */
+}
+
  /* Register the error handling callback function error under the name
     name. This function will be called by the codec when it encounters
     an unencodable characters/undecodable bytes and doesn't know the
author	Serhiy Storchaka <storchaka@gmail.com>
	Sun, 31 May 2015 17:21:00 +0000 (20:21 +0300)
committer	Serhiy Storchaka <storchaka@gmail.com>
	Sun, 31 May 2015 17:21:00 +0000 (20:21 +0300)
Include/codecs.h		patch \| blob \| blame \| history
Lib/_pyio.py		patch \| blob \| blame \| history
Lib/codecs.py		patch \| blob \| blame \| history
Lib/encodings/base64_codec.py		patch \| blob \| blame \| history
Lib/encodings/bz2_codec.py		patch \| blob \| blame \| history
Lib/encodings/hex_codec.py		patch \| blob \| blame \| history
Lib/encodings/quopri_codec.py		patch \| blob \| blame \| history
Lib/encodings/rot_13.py		patch \| blob \| blame \| history
Lib/encodings/uu_codec.py		patch \| blob \| blame \| history
Lib/encodings/zlib_codec.py		patch \| blob \| blame \| history
Lib/json/decoder.py		patch \| blob \| blame \| history
Lib/test/string_tests.py		patch \| blob \| blame \| history
Lib/test/test_calendar.py		patch \| blob \| blame \| history
Lib/test/test_codecs.py		patch \| blob \| blame \| history
Lib/test/test_fileinput.py		patch \| blob \| blame \| history
Lib/test/test_io.py		patch \| blob \| blame \| history
Misc/NEWS		patch \| blob \| blame \| history
Modules/_io/textio.c		patch \| blob \| blame \| history
Objects/bytearrayobject.c		patch \| blob \| blame \| history
Objects/stringobject.c		patch \| blob \| blame \| history
Objects/unicodeobject.c		patch \| blob \| blame \| history
Python/codecs.c		patch \| blob \| blame \| history