gh-62259: Add support of multi-byte encodings in the XML parser (GH-149860)

author Serhiy Storchaka <storchaka@gmail.com>

Tue, 26 May 2026 19:40:25 +0000 (22:40 +0300)

committer GitHub <noreply@github.com>

Tue, 26 May 2026 19:40:25 +0000 (19:40 +0000)
author Serhiy Storchaka <storchaka@gmail.com>
Tue, 26 May 2026 19:40:25 +0000 (22:40 +0300)
committer GitHub <noreply@github.com>
Tue, 26 May 2026 19:40:25 +0000 (19:40 +0000)
diff --git a/Doc/library/pyexpat.rst b/Doc/library/pyexpat.rst

index 2e6938b5cf68602b4597be7d96b2a07302c34945..c88411ce0b7b91fc2d6b04c6dcf1d3defa8a20f5 100644 (file)
--- a/Doc/library/pyexpat.rst
+++ b/Doc/library/pyexpat.rst
@@ -63,12 +63,26 @@ The :mod:`!xml.parsers.expat` module contains two functions:
  
  .. function:: ParserCreate(encoding=None, namespace_separator=None)
  
-   Creates and returns a new :class:`xmlparser` object.   *encoding*, if specified,
-   must be a string naming the encoding  used by the XML data.  Expat doesn't
-   support as many encodings as Python does, and its repertoire of encodings can't
-   be extended; it supports UTF-8, UTF-16, ISO-8859-1 (Latin1), and ASCII.  If
-   *encoding* [1]_ is given it will override the implicit or explicit encoding of the
-   document.
+   Creates and returns a new :class:`xmlparser` object.
+   *encoding* [1]_, if specified, must be a string naming the encoding
+   used by the XML data.
+   If it is given it will override the implicit or explicit encoding
+   of the document.
+
+   .. impl-detail::
+
+      Expat natively understands and processes UTF-8, UTF-16, UTF-16BE,
+      UTF-16LE, ISO-8859-1, and US-ASCII.
+      For other encodings (including aliases like Latin1 and ASCII) it
+      falls back to Python.
+      It supports most 8-bit encodings and many multi-byte encodings
+      like Shift_JIS, although only BMP characters (``U+0000-U+FFFF``)
+      are supported with non-native encodings (this restriction is also
+      applied to aliases like UTF8).
+      These restrictions only apply if *encoding* is not given.
+
+      .. versionchanged:: next
+         Added support for multi-byte encodings.
  
     .. _xmlparser-non-root:
  
@@ -113,7 +127,6 @@ The :mod:`!xml.parsers.expat` module contains two functions:
     XML document.  Call ``ParserCreate`` for each document to provide unique
     parser instances.
  
-
  .. seealso::
  
     `The Expat XML Parser <http://www.libexpat.org/>`_
@@ -1083,9 +1096,11 @@ The ``errors`` module has the following attributes:
  
  .. rubric:: Footnotes
  
-.. [1] The encoding string included in XML output should conform to the
-   appropriate standards. For example, "UTF-8" is valid, but "UTF8" is
-   not. See https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-EncodingDecl
+.. [1] The encoding string included in XML output should conform to
+   the appropriate standards. For example, "UTF-8" is valid, but
+   "UTF8" is not valid in an XML document's declaration, even though
+   Python accepts it as an encoding name.
+   See https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-EncodingDecl
     and https://www.iana.org/assignments/character-sets/character-sets.xhtml.
  
  
diff --git a/Doc/whatsnew/3.16.rst b/Doc/whatsnew/3.16.rst

index 8dac804b9519dad24c341b54751aacf76e2001ed..4aaf5cda57e83b841c6d5b47ae2658546a61b920 100644 (file)
--- a/Doc/whatsnew/3.16.rst
+++ b/Doc/whatsnew/3.16.rst
@@ -86,7 +86,6 @@ New modules
  Improved modules
  ================
  
-
  gzip
  ----
  
@@ -101,6 +100,21 @@ os
    process via a pidfd.  Available on Linux 5.6+.
    (Contributed by Maurycy Pawłowski-Wieroński in :gh:`149464`.)
  
+xml
+---
+
+* Add support for multiple multi-byte encodings in the :mod:`XML parser
+  <xml.parsers.expat>`: "cp932", "cp949", "cp950", "Big5", "EUC-JP",
+  "GB2312", "GBK", "johab", and "Shift_JIS".
+  Add partial support (only BMP characters) for multi-byte encodings
+  "Big5-HKSCS", "EUC_JIS-2004", "EUC_JISX0213", "Shift_JIS-2004",
+  "Shift_JISX0213", "utf-8-sig" and non-standard aliases like "UTF8"
+  (without hyphen).
+  The parser now raises :exc:`ValueError` for known unsupported
+  multi-byte encodings such as "ISO-2022-JP" or "raw-unicode-escape"
+  instead of failing later, when encountering non-ASCII data.
+  (Contributed by Serhiy Storchaka in :gh:`62259`.)
+
  .. Add improved modules above alphabetically, not here at the end.
  
  Optimizations
diff --git a/Include/internal/pycore_codecs.h b/Include/internal/pycore_codecs.h

index 52dca1362592d693e5871d87bbda01cb2cd9420d..bfa10eadf7357313287d2886576e230b602183b4 100644 (file)
--- a/Include/internal/pycore_codecs.h
+++ b/Include/internal/pycore_codecs.h
@@ -45,7 +45,7 @@ extern int _PyCodec_UnregisterError(const char *name);
     in Python 3.5+?
  
   */
-extern PyObject* _PyCodec_LookupTextEncoding(
+PyAPI_FUNC(PyObject*) _PyCodec_LookupTextEncoding(
     const char *encoding,
     const char *alternate_command);
  
diff --git a/Lib/codecs.py b/Lib/codecs.py

index e4a8010aba90a503aa25ae4a81eef68790150be3..af6ab031157e7907d36de5e7c56d50fd34950cbb 100644 (file)
--- a/Lib/codecs.py
+++ b/Lib/codecs.py
@@ -93,7 +93,7 @@ class CodecInfo(tuple):
  
      def __new__(cls, encode, decode, streamreader=None, streamwriter=None,
          incrementalencoder=None, incrementaldecoder=None, name=None,
-        *, _is_text_encoding=None):
+        *, _is_text_encoding=None, _expat_decoding_table=None):
          self = tuple.__new__(cls, (encode, decode, streamreader, streamwriter))
          self.name = name
          self.encode = encode
@@ -104,6 +104,8 @@ class CodecInfo(tuple):
          self.streamreader = streamreader
          if _is_text_encoding is not None:
              self._is_text_encoding = _is_text_encoding
+        if _expat_decoding_table is not None:
+            self._expat_decoding_table = _expat_decoding_table
          return self
  
      def __repr__(self):
diff --git a/Lib/encodings/big5.py b/Lib/encodings/big5.py

index 7adeb0e1605274414014ee7849c36b203f7589f7..4f749507d78b8b6429d310bc19b2aeb63d6ea35b 100644 (file)
--- a/Lib/encodings/big5.py
+++ b/Lib/encodings/big5.py
@@ -36,4 +36,13 @@ def getregentry():
          incrementaldecoder=IncrementalDecoder,
          streamreader=StreamReader,
          streamwriter=StreamWriter,
+        _expat_decoding_table=(*range(128),
+            -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+            -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+            -1, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
+            -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
+            -2, -2, -2, -2, -2, -2, -2, -2, -1, -2, -2, -2, -2, -2, -2, -2,
+            -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
+            -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
+            -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -1, -1, -1, -1, -1, -1),
      )
diff --git a/Lib/encodings/big5hkscs.py b/Lib/encodings/big5hkscs.py

index 350df37baaedafcd75120723bf8e9b85a7e2a4b8..a88caa5e1404c0ed9ef9116054a4d6d5d38a1f37 100644 (file)
--- a/Lib/encodings/big5hkscs.py
+++ b/Lib/encodings/big5hkscs.py
@@ -36,4 +36,13 @@ def getregentry():
          incrementaldecoder=IncrementalDecoder,
          streamreader=StreamReader,
          streamwriter=StreamWriter,
+        _expat_decoding_table=(*range(128),
+            -1, -1, -1, -1, -1, -1, -1, -2, -2, -2, -2, -2, -2, -2, -2, -2,
+            -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
+            -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
+            -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
+            -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
+            -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
+            -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
+            -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -1),
      )
diff --git a/Lib/encodings/cp932.py b/Lib/encodings/cp932.py

index e01f59b719057657c65f739e43fc0400722293e6..86e6ffe3b16c4f8435f6741ef3ac326094b2565b 100644 (file)
--- a/Lib/encodings/cp932.py
+++ b/Lib/encodings/cp932.py
@@ -36,4 +36,18 @@ def getregentry():
          incrementaldecoder=IncrementalDecoder,
          streamreader=StreamReader,
          streamwriter=StreamWriter,
+        _expat_decoding_table=(*range(128),
+            0x80, -2, -2, -2, -2, -1, -1, -2, -2, -2, -2, -2, -2, -2, -2, -2,
+            -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
+            0xf8f0, 0xff61, 0xff62, 0xff63, 0xff64, 0xff65, 0xff66, 0xff67,
+            0xff68, 0xff69, 0xff6a, 0xff6b, 0xff6c, 0xff6d, 0xff6e, 0xff6f,
+            0xff70, 0xff71, 0xff72, 0xff73, 0xff74, 0xff75, 0xff76, 0xff77,
+            0xff78, 0xff79, 0xff7a, 0xff7b, 0xff7c, 0xff7d, 0xff7e, 0xff7f,
+            0xff80, 0xff81, 0xff82, 0xff83, 0xff84, 0xff85, 0xff86, 0xff87,
+            0xff88, 0xff89, 0xff8a, 0xff8b, 0xff8c, 0xff8d, 0xff8e, 0xff8f,
+            0xff90, 0xff91, 0xff92, 0xff93, 0xff94, 0xff95, 0xff96, 0xff97,
+            0xff98, 0xff99, 0xff9a, 0xff9b, 0xff9c, 0xff9d, 0xff9e, 0xff9f,
+            -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -1, -1, -2, -2, -1,
+            -2, -2, -2, -2, -2, -2, -2, -2,
+            -2, -2, -1, -1, -1, 0xf8f1, 0xf8f2, 0xf8f3),
      )
diff --git a/Lib/encodings/cp949.py b/Lib/encodings/cp949.py

index 627c87125e2affe4bc75eae769f901a54e2f92fb..7283dba8dbb8d3242a52394cec7d17b393016c23 100644 (file)
--- a/Lib/encodings/cp949.py
+++ b/Lib/encodings/cp949.py
@@ -36,4 +36,13 @@ def getregentry():
          incrementaldecoder=IncrementalDecoder,
          streamreader=StreamReader,
          streamwriter=StreamWriter,
+        _expat_decoding_table=(*range(128),
+            -1, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
+            -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
+            -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
+            -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
+            -2, -2, -2, -2, -2, -2, -2, -2, -2, -1, -2, -2, -2, -2, -2, -2,
+            -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
+            -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
+            -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -1, -1),
      )
diff --git a/Lib/encodings/cp950.py b/Lib/encodings/cp950.py

index 39eec5ed0ddef9579cf1be928acfd1a2cf37d39b..d530f914880a32a0b689c1f67923a9bb66ef0c43 100644 (file)
--- a/Lib/encodings/cp950.py
+++ b/Lib/encodings/cp950.py
@@ -36,4 +36,13 @@ def getregentry():
          incrementaldecoder=IncrementalDecoder,
          streamreader=StreamReader,
          streamwriter=StreamWriter,
+        _expat_decoding_table=(*range(128),
+            -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+            -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+            -1, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
+            -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
+            -2, -2, -2, -2, -2, -2, -2, -2, -1, -2, -2, -2, -2, -2, -2, -2,
+            -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
+            -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
+            -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -1, -1, -1, -1, -1, -1),
      )
diff --git a/Lib/encodings/euc_jis_2004.py b/Lib/encodings/euc_jis_2004.py

index 72b87aea68862f2fabbb7b30274e683a6654d4ae..557f926b8cdb97cdcdc6c20834ca5b215f402b65 100644 (file)
--- a/Lib/encodings/euc_jis_2004.py
+++ b/Lib/encodings/euc_jis_2004.py
@@ -36,4 +36,13 @@ def getregentry():
          incrementaldecoder=IncrementalDecoder,
          streamreader=StreamReader,
          streamwriter=StreamWriter,
+        _expat_decoding_table=(*range(128),
+            -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -2, -3,
+            -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+            -1, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
+            -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
+            -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
+            -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
+            -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
+            -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -1),
      )
diff --git a/Lib/encodings/euc_jisx0213.py b/Lib/encodings/euc_jisx0213.py

index cc47d04112a187d585dce4af4bd76a38e89e0f51..bace554431ba3a64267d882f2927bf54099c0822 100644 (file)
--- a/Lib/encodings/euc_jisx0213.py
+++ b/Lib/encodings/euc_jisx0213.py
@@ -36,4 +36,13 @@ def getregentry():
          incrementaldecoder=IncrementalDecoder,
          streamreader=StreamReader,
          streamwriter=StreamWriter,
+        _expat_decoding_table=(*range(128),
+            -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -2, -3,
+            -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+            -1, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
+            -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
+            -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
+            -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
+            -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
+            -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -1),
      )
diff --git a/Lib/encodings/euc_jp.py b/Lib/encodings/euc_jp.py

index 7bcbe4147f2ad49c6a0d3a9cc6eb5417cdabdf2d..b8df1bc0e2d5fb880f2869e5cd29d9bab14ea35d 100644 (file)
--- a/Lib/encodings/euc_jp.py
+++ b/Lib/encodings/euc_jp.py
@@ -36,4 +36,13 @@ def getregentry():
          incrementaldecoder=IncrementalDecoder,
          streamreader=StreamReader,
          streamwriter=StreamWriter,
+        _expat_decoding_table=(*range(128),
+            -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -2, -3,
+            -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+            -1, -2, -2, -2, -2, -2, -2, -2, -2, -1, -1, -1, -1, -1, -1, -1,
+            -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
+            -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
+            -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
+            -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
+            -2, -2, -2, -2, -2, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1),
      )
diff --git a/Lib/encodings/euc_kr.py b/Lib/encodings/euc_kr.py

index c1fb1260e879f0c42491be24b6a0cd0b8a365122..ee54e17180b5e15e101fcaf448a3d9db054ef6d7 100644 (file)
--- a/Lib/encodings/euc_kr.py
+++ b/Lib/encodings/euc_kr.py
@@ -36,4 +36,5 @@ def getregentry():
          incrementaldecoder=IncrementalDecoder,
          streamreader=StreamReader,
          streamwriter=StreamWriter,
+        _expat_decoding_table=False,
      )
diff --git a/Lib/encodings/gb18030.py b/Lib/encodings/gb18030.py

index 34fb6c366a76147b9e2b4a62f886b2a91eb6add8..c2269a7a98105c47e286df6b69d16820c9016e9a 100644 (file)
--- a/Lib/encodings/gb18030.py
+++ b/Lib/encodings/gb18030.py
@@ -36,4 +36,5 @@ def getregentry():
          incrementaldecoder=IncrementalDecoder,
          streamreader=StreamReader,
          streamwriter=StreamWriter,
+        _expat_decoding_table=False,
      )
diff --git a/Lib/encodings/gb2312.py b/Lib/encodings/gb2312.py

index 3c3b837d618ecd262007eec7d3bd786209e91c06..0a9313b05bd75ff28cd1e2f86b5da0f96fc8c9cb 100644 (file)
--- a/Lib/encodings/gb2312.py
+++ b/Lib/encodings/gb2312.py
@@ -36,4 +36,13 @@ def getregentry():
          incrementaldecoder=IncrementalDecoder,
          streamreader=StreamReader,
          streamwriter=StreamWriter,
+        _expat_decoding_table=(*range(128),
+            -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+            -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+            -1, -2, -2, -2, -2, -2, -2, -2, -2, -2, -1, -1, -1, -1, -1, -1,
+            -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
+            -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
+            -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
+            -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
+            -2, -2, -2, -2, -2, -2, -2, -2, -1, -1, -1, -1, -1, -1, -1, -1),
      )
diff --git a/Lib/encodings/gbk.py b/Lib/encodings/gbk.py

index 1b45db89859cdf58931436a954e7eb304b5962b8..45e38bba391533ade867bae99681e932bc9fb351 100644 (file)
--- a/Lib/encodings/gbk.py
+++ b/Lib/encodings/gbk.py
@@ -36,4 +36,13 @@ def getregentry():
          incrementaldecoder=IncrementalDecoder,
          streamreader=StreamReader,
          streamwriter=StreamWriter,
+        _expat_decoding_table=(*range(128),
+            -1, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
+            -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
+            -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
+            -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
+            -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
+            -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
+            -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
+            -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -1),
      )
diff --git a/Lib/encodings/hz.py b/Lib/encodings/hz.py

index 383442a3c9ac9ae5d093d972f1ff26fe758897d5..f17f32e0f6f64c4825e1b92e930e887ea5a6bd1c 100644 (file)
--- a/Lib/encodings/hz.py
+++ b/Lib/encodings/hz.py
@@ -36,4 +36,5 @@ def getregentry():
          incrementaldecoder=IncrementalDecoder,
          streamreader=StreamReader,
          streamwriter=StreamWriter,
+        _expat_decoding_table=False,
      )
diff --git a/Lib/encodings/idna.py b/Lib/encodings/idna.py

index d31ee07ab45b76df294ab089429a081b29c57de0..c896ffdeadfef7ee63586053f6a72d8a86fa3342 100644 (file)
--- a/Lib/encodings/idna.py
+++ b/Lib/encodings/idna.py
@@ -385,4 +385,5 @@ def getregentry():
          incrementaldecoder=IncrementalDecoder,
          streamwriter=StreamWriter,
          streamreader=StreamReader,
+        _expat_decoding_table=False,
      )
diff --git a/Lib/encodings/iso2022_jp.py b/Lib/encodings/iso2022_jp.py

index ab0406069356e4dfd7cfcc4a981535c40c9c5574..40892f5c151d4d6067cc81a9386bc3b9e433e0f4 100644 (file)
--- a/Lib/encodings/iso2022_jp.py
+++ b/Lib/encodings/iso2022_jp.py
@@ -36,4 +36,5 @@ def getregentry():
          incrementaldecoder=IncrementalDecoder,
          streamreader=StreamReader,
          streamwriter=StreamWriter,
+        _expat_decoding_table=False,
      )
diff --git a/Lib/encodings/iso2022_jp_1.py b/Lib/encodings/iso2022_jp_1.py

index 997044dc3787496be0ed70f2ed1b63d9ebe1b0ac..98210d617879e2f26335da151d9fd82615917c3a 100644 (file)
--- a/Lib/encodings/iso2022_jp_1.py
+++ b/Lib/encodings/iso2022_jp_1.py
@@ -36,4 +36,5 @@ def getregentry():
          incrementaldecoder=IncrementalDecoder,
          streamreader=StreamReader,
          streamwriter=StreamWriter,
+        _expat_decoding_table=False,
      )
diff --git a/Lib/encodings/iso2022_jp_2.py b/Lib/encodings/iso2022_jp_2.py

index 9106bf762512fdd3e9deb73cba91529ab0db94ac..047cd7c9677c54c2a14d5d00ff8b7204af1d2968 100644 (file)
--- a/Lib/encodings/iso2022_jp_2.py
+++ b/Lib/encodings/iso2022_jp_2.py
@@ -36,4 +36,5 @@ def getregentry():
          incrementaldecoder=IncrementalDecoder,
          streamreader=StreamReader,
          streamwriter=StreamWriter,
+        _expat_decoding_table=False,
      )
diff --git a/Lib/encodings/iso2022_jp_2004.py b/Lib/encodings/iso2022_jp_2004.py

index 40198bf098570be21f67335c42fbf9f956204725..9b29edacce3fea4142b9f62b027218829ad3d7e1 100644 (file)
--- a/Lib/encodings/iso2022_jp_2004.py
+++ b/Lib/encodings/iso2022_jp_2004.py
@@ -36,4 +36,5 @@ def getregentry():
          incrementaldecoder=IncrementalDecoder,
          streamreader=StreamReader,
          streamwriter=StreamWriter,
+        _expat_decoding_table=False,
      )
diff --git a/Lib/encodings/iso2022_jp_3.py b/Lib/encodings/iso2022_jp_3.py

index 346e08beccbbafd347371743ec89333370d3b083..a39de6301ccdeceff3c569bd3bba0b79226585a9 100644 (file)
--- a/Lib/encodings/iso2022_jp_3.py
+++ b/Lib/encodings/iso2022_jp_3.py
@@ -36,4 +36,5 @@ def getregentry():
          incrementaldecoder=IncrementalDecoder,
          streamreader=StreamReader,
          streamwriter=StreamWriter,
+        _expat_decoding_table=False,
      )
diff --git a/Lib/encodings/iso2022_jp_ext.py b/Lib/encodings/iso2022_jp_ext.py

index 752bab9813a094e9446b084ae90957290f1324c0..b7470ec9893655988da55c7695fd847fce27be67 100644 (file)
--- a/Lib/encodings/iso2022_jp_ext.py
+++ b/Lib/encodings/iso2022_jp_ext.py
@@ -36,4 +36,5 @@ def getregentry():
          incrementaldecoder=IncrementalDecoder,
          streamreader=StreamReader,
          streamwriter=StreamWriter,
+        _expat_decoding_table=False,
      )
diff --git a/Lib/encodings/iso2022_kr.py b/Lib/encodings/iso2022_kr.py

index bf7018763eae38729d6c206379467f36e41910c3..48dff8dc68e85c80ace748566a1ef0912b9b8fa6 100644 (file)
--- a/Lib/encodings/iso2022_kr.py
+++ b/Lib/encodings/iso2022_kr.py
@@ -36,4 +36,5 @@ def getregentry():
          incrementaldecoder=IncrementalDecoder,
          streamreader=StreamReader,
          streamwriter=StreamWriter,
+        _expat_decoding_table=False,
      )
diff --git a/Lib/encodings/johab.py b/Lib/encodings/johab.py

index 512aeeb732b5221ba35585a97e3cc1b85d3c5b02..99c9cf6335aaf0f89b610bd58b1b48ee0c5387ff 100644 (file)
--- a/Lib/encodings/johab.py
+++ b/Lib/encodings/johab.py
@@ -36,4 +36,13 @@ def getregentry():
          incrementaldecoder=IncrementalDecoder,
          streamreader=StreamReader,
          streamwriter=StreamWriter,
+        _expat_decoding_table=(*range(128),
+            -1, -1, -1, -1, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
+            -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
+            -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
+            -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
+            -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
+            -2, -2, -2, -2, -1, -1, -1, -1, -1, -2, -2, -2, -2, -2, -2, -1,
+            -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
+            -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -1, -1, -1, -1, -1, -1),
      )
diff --git a/Lib/encodings/punycode.py b/Lib/encodings/punycode.py

index 268fccbd53974e53cd7af6763376f0761b9665ef..279245f435e1791bf4c1cb349ab463578cd92409 100644 (file)
--- a/Lib/encodings/punycode.py
+++ b/Lib/encodings/punycode.py
@@ -250,4 +250,5 @@ def getregentry():
          incrementaldecoder=IncrementalDecoder,
          streamwriter=StreamWriter,
          streamreader=StreamReader,
+        _expat_decoding_table=False,
      )
diff --git a/Lib/encodings/raw_unicode_escape.py b/Lib/encodings/raw_unicode_escape.py

index 46c8e070dd192ed4bae9682d91e5d56413345cf1..911f59ccf9a1dd5f983c7a4ba59c2b2abd8459cc 100644 (file)
--- a/Lib/encodings/raw_unicode_escape.py
+++ b/Lib/encodings/raw_unicode_escape.py
@@ -43,4 +43,5 @@ def getregentry():
          incrementaldecoder=IncrementalDecoder,
          streamwriter=StreamWriter,
          streamreader=StreamReader,
+        _expat_decoding_table=False,
      )
diff --git a/Lib/encodings/shift_jis.py b/Lib/encodings/shift_jis.py

index 83381172764dea0f76874716cda28852a5906f7f..4b33b6fd93cbbe2fab13fea453a9f08a647ac62a 100644 (file)
--- a/Lib/encodings/shift_jis.py
+++ b/Lib/encodings/shift_jis.py
@@ -36,4 +36,17 @@ def getregentry():
          incrementaldecoder=IncrementalDecoder,
          streamreader=StreamReader,
          streamwriter=StreamWriter,
+        _expat_decoding_table=(*range(128),
+            -1, -2, -2, -2, -2, -1, -1, -1, -2, -2, -2, -2, -2, -2, -2, -2,
+            -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
+            -1, 0xff61, 0xff62, 0xff63, 0xff64, 0xff65, 0xff66, 0xff67,
+            0xff68, 0xff69, 0xff6a, 0xff6b, 0xff6c, 0xff6d, 0xff6e, 0xff6f,
+            0xff70, 0xff71, 0xff72, 0xff73, 0xff74, 0xff75, 0xff76, 0xff77,
+            0xff78, 0xff79, 0xff7a, 0xff7b, 0xff7c, 0xff7d, 0xff7e, 0xff7f,
+            0xff80, 0xff81, 0xff82, 0xff83, 0xff84, 0xff85, 0xff86, 0xff87,
+            0xff88, 0xff89, 0xff8a, 0xff8b, 0xff8c, 0xff8d, 0xff8e, 0xff8f,
+            0xff90, 0xff91, 0xff92, 0xff93, 0xff94, 0xff95, 0xff96, 0xff97,
+            0xff98, 0xff99, 0xff9a, 0xff9b, 0xff9c, 0xff9d, 0xff9e, 0xff9f,
+            -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -1, -1, -1, -1, -1,
+            -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1),
      )
diff --git a/Lib/encodings/shift_jis_2004.py b/Lib/encodings/shift_jis_2004.py

index 161b1e86f9918a4df5c3290505a98d5de4566e0e..195519eddf50f391cbc7d02dbecee86ba2bac3ec 100644 (file)
--- a/Lib/encodings/shift_jis_2004.py
+++ b/Lib/encodings/shift_jis_2004.py
@@ -36,4 +36,18 @@ def getregentry():
          incrementaldecoder=IncrementalDecoder,
          streamreader=StreamReader,
          streamwriter=StreamWriter,
+        _expat_decoding_table=(
+            *range(0x5c), 0xa5, *range(0x5d, 0x7e), 0x203e, 0x7f,
+            -1, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
+            -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
+            -1, 0xff61, 0xff62, 0xff63, 0xff64, 0xff65, 0xff66, 0xff67,
+            0xff68, 0xff69, 0xff6a, 0xff6b, 0xff6c, 0xff6d, 0xff6e, 0xff6f,
+            0xff70, 0xff71, 0xff72, 0xff73, 0xff74, 0xff75, 0xff76, 0xff77,
+            0xff78, 0xff79, 0xff7a, 0xff7b, 0xff7c, 0xff7d, 0xff7e, 0xff7f,
+            0xff80, 0xff81, 0xff82, 0xff83, 0xff84, 0xff85, 0xff86, 0xff87,
+            0xff88, 0xff89, 0xff8a, 0xff8b, 0xff8c, 0xff8d, 0xff8e, 0xff8f,
+            0xff90, 0xff91, 0xff92, 0xff93, 0xff94, 0xff95, 0xff96, 0xff97,
+            0xff98, 0xff99, 0xff9a, 0xff9b, 0xff9c, 0xff9d, 0xff9e, 0xff9f,
+            -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
+            -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -1, -1, -1),
      )
diff --git a/Lib/encodings/shift_jisx0213.py b/Lib/encodings/shift_jisx0213.py

index cb653f53055e679c513218b0102ffa9d1b4ca82c..b533eed6c18cbfbad39b0b4382537b2b6c197fe9 100644 (file)
--- a/Lib/encodings/shift_jisx0213.py
+++ b/Lib/encodings/shift_jisx0213.py
@@ -36,4 +36,18 @@ def getregentry():
          incrementaldecoder=IncrementalDecoder,
          streamreader=StreamReader,
          streamwriter=StreamWriter,
+        _expat_decoding_table=(
+            *range(0x5c), 0xa5, *range(0x5d, 0x7e), 0x203e, 0x7f,
+            -1, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
+            -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
+            -1, 0xff61, 0xff62, 0xff63, 0xff64, 0xff65, 0xff66, 0xff67,
+            0xff68, 0xff69, 0xff6a, 0xff6b, 0xff6c, 0xff6d, 0xff6e, 0xff6f,
+            0xff70, 0xff71, 0xff72, 0xff73, 0xff74, 0xff75, 0xff76, 0xff77,
+            0xff78, 0xff79, 0xff7a, 0xff7b, 0xff7c, 0xff7d, 0xff7e, 0xff7f,
+            0xff80, 0xff81, 0xff82, 0xff83, 0xff84, 0xff85, 0xff86, 0xff87,
+            0xff88, 0xff89, 0xff8a, 0xff8b, 0xff8c, 0xff8d, 0xff8e, 0xff8f,
+            0xff90, 0xff91, 0xff92, 0xff93, 0xff94, 0xff95, 0xff96, 0xff97,
+            0xff98, 0xff99, 0xff9a, 0xff9b, 0xff9c, 0xff9d, 0xff9e, 0xff9f,
+            -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
+            -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -1, -1, -1),
      )
diff --git a/Lib/encodings/unicode_escape.py b/Lib/encodings/unicode_escape.py

index 9b1ce99b339ae08eb63334393d8d81d54ab26475..52e4dc256ce7ff77d362e2cf80620fd1e7752f3a 100644 (file)
--- a/Lib/encodings/unicode_escape.py
+++ b/Lib/encodings/unicode_escape.py
@@ -43,4 +43,5 @@ def getregentry():
          incrementaldecoder=IncrementalDecoder,
          streamwriter=StreamWriter,
          streamreader=StreamReader,
+        _expat_decoding_table=False,
      )
diff --git a/Lib/encodings/utf_16.py b/Lib/encodings/utf_16.py

index d3b9980026666f10d646569ec8f138a98bd62844..01853d46c89bf0f50961d6dbb96a2d72900a04d9 100644 (file)
--- a/Lib/encodings/utf_16.py
+++ b/Lib/encodings/utf_16.py
@@ -152,4 +152,5 @@ def getregentry():
          incrementaldecoder=IncrementalDecoder,
          streamreader=StreamReader,
          streamwriter=StreamWriter,
+        _expat_decoding_table=False,
      )
diff --git a/Lib/encodings/utf_16_be.py b/Lib/encodings/utf_16_be.py

index 86b458eb9bcd96b50f128df1978a14f5d1e41835..c4f8753e999b9031ac8d66fddaa0a5c8df8aeb91 100644 (file)
--- a/Lib/encodings/utf_16_be.py
+++ b/Lib/encodings/utf_16_be.py
@@ -39,4 +39,5 @@ def getregentry():
          incrementaldecoder=IncrementalDecoder,
          streamreader=StreamReader,
          streamwriter=StreamWriter,
+        _expat_decoding_table=False,
      )
diff --git a/Lib/encodings/utf_16_le.py b/Lib/encodings/utf_16_le.py

index ec454142eedf251e31eab481a1cbea6424273a0a..aa68f019f9ea2cfec9fd111f25f4cd13ef7f147a 100644 (file)
--- a/Lib/encodings/utf_16_le.py
+++ b/Lib/encodings/utf_16_le.py
@@ -39,4 +39,5 @@ def getregentry():
          incrementaldecoder=IncrementalDecoder,
          streamreader=StreamReader,
          streamwriter=StreamWriter,
+        _expat_decoding_table=False,
      )
diff --git a/Lib/encodings/utf_32.py b/Lib/encodings/utf_32.py

index 1924bedbb74c686d80d36aa0032948d31e28f977..446503ccb32ee0f39d85332cc84c7fbffc2e6a03 100644 (file)
--- a/Lib/encodings/utf_32.py
+++ b/Lib/encodings/utf_32.py
@@ -147,4 +147,5 @@ def getregentry():
          incrementaldecoder=IncrementalDecoder,
          streamreader=StreamReader,
          streamwriter=StreamWriter,
+        _expat_decoding_table=False,
      )
diff --git a/Lib/encodings/utf_32_be.py b/Lib/encodings/utf_32_be.py

index fe272b5fafec69e506c7cb398712d7b6dd9e6145..c430c7ee0ac897749159f54268fe13176720f14f 100644 (file)
--- a/Lib/encodings/utf_32_be.py
+++ b/Lib/encodings/utf_32_be.py
@@ -34,4 +34,5 @@ def getregentry():
          incrementaldecoder=IncrementalDecoder,
          streamreader=StreamReader,
          streamwriter=StreamWriter,
+        _expat_decoding_table=False,
      )
diff --git a/Lib/encodings/utf_32_le.py b/Lib/encodings/utf_32_le.py

index 9e48210928ee652b17fe797785a20267b1d12a6e..7fb33289054770a4851255a2de748c7d38f52752 100644 (file)
--- a/Lib/encodings/utf_32_le.py
+++ b/Lib/encodings/utf_32_le.py
@@ -34,4 +34,5 @@ def getregentry():
          incrementaldecoder=IncrementalDecoder,
          streamreader=StreamReader,
          streamwriter=StreamWriter,
+        _expat_decoding_table=False,
      )
diff --git a/Lib/encodings/utf_7.py b/Lib/encodings/utf_7.py

index 8e0567f2087d65e33b3152b8240bcaf703d855fb..9f70aaff4f7a7f2a81436fb2014f9d36c7556c5d 100644 (file)
--- a/Lib/encodings/utf_7.py
+++ b/Lib/encodings/utf_7.py
@@ -35,4 +35,5 @@ def getregentry():
          incrementaldecoder=IncrementalDecoder,
          streamreader=StreamReader,
          streamwriter=StreamWriter,
+        _expat_decoding_table=False,
      )
diff --git a/Lib/encodings/utf_8.py b/Lib/encodings/utf_8.py

index 1bf6336571547bcec3f5aa7978ff1c7f247490ce..854cb88375c37f31f9b550052392ae99b6c59459 100644 (file)
--- a/Lib/encodings/utf_8.py
+++ b/Lib/encodings/utf_8.py
@@ -39,4 +39,13 @@ def getregentry():
          incrementaldecoder=IncrementalDecoder,
          streamreader=StreamReader,
          streamwriter=StreamWriter,
+        _expat_decoding_table=(*range(128),
+            -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+            -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+            -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+            -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+            -1, -1, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
+            -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
+            -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3,
+            -4, -4, -4, -4, -4, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1),
      )
diff --git a/Lib/encodings/utf_8_sig.py b/Lib/encodings/utf_8_sig.py

index 1bb479203f365dabd6cc59bb0b6debef55e40396..cc895cddcc561bca4a4a30cbac0f9bf9db6d0846 100644 (file)
--- a/Lib/encodings/utf_8_sig.py
+++ b/Lib/encodings/utf_8_sig.py
@@ -127,4 +127,14 @@ def getregentry():
          incrementaldecoder=IncrementalDecoder,
          streamreader=StreamReader,
          streamwriter=StreamWriter,
+        # The same as for UTF-8.
+        _expat_decoding_table=(*range(128),
+            -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+            -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+            -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+            -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+            -1, -1, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
+            -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
+            -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3,
+            -4, -4, -4, -4, -4, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1),
      )
diff --git a/Lib/test/test_codecs.py b/Lib/test/test_codecs.py

index 79c8a7ef8864829312a36cfc954ebead2ced0b91..8fdd08df9e4f46a847a3a776b7395f21ece698f5 100644 (file)
--- a/Lib/test/test_codecs.py
+++ b/Lib/test/test_codecs.py
@@ -1892,9 +1892,11 @@ class CodecsModuleTest(unittest.TestCase):
          self.assertIsNot(dup, orig)
          self.assertEqual(dup, orig)
          self.assertTrue(orig._is_text_encoding)
+        self.assertIsInstance(orig._expat_decoding_table, tuple)
          self.assertEqual(dup.encode, orig.encode)
          self.assertEqual(dup.name, orig.name)
          self.assertEqual(dup.incrementalencoder, orig.incrementalencoder)
+        self.assertIs(dup._expat_decoding_table, orig._expat_decoding_table)
  
          # Test a CodecInfo with _is_text_encoding equal to false.
          orig = codecs.lookup("base64")
@@ -1902,9 +1904,11 @@ class CodecsModuleTest(unittest.TestCase):
          self.assertIsNot(dup, orig)
          self.assertEqual(dup, orig)
          self.assertFalse(orig._is_text_encoding)
+        self.assertNotHasAttr(orig, '_expat_decoding_table')
          self.assertEqual(dup.encode, orig.encode)
          self.assertEqual(dup.name, orig.name)
          self.assertEqual(dup.incrementalencoder, orig.incrementalencoder)
+        self.assertNotHasAttr(dup, '_expat_decoding_table')
  
      def test_deepcopy(self):
          orig = codecs.lookup('utf-8')
@@ -1912,9 +1916,11 @@ class CodecsModuleTest(unittest.TestCase):
          self.assertIsNot(dup, orig)
          self.assertEqual(dup, orig)
          self.assertTrue(orig._is_text_encoding)
+        self.assertIsInstance(orig._expat_decoding_table, tuple)
          self.assertEqual(dup.encode, orig.encode)
          self.assertEqual(dup.name, orig.name)
          self.assertEqual(dup.incrementalencoder, orig.incrementalencoder)
+        self.assertIs(dup._expat_decoding_table, orig._expat_decoding_table)
  
          # Test a CodecInfo with _is_text_encoding equal to false.
          orig = codecs.lookup("base64")
@@ -1922,9 +1928,11 @@ class CodecsModuleTest(unittest.TestCase):
          self.assertIsNot(dup, orig)
          self.assertEqual(dup, orig)
          self.assertFalse(orig._is_text_encoding)
+        self.assertNotHasAttr(orig, '_expat_decoding_table')
          self.assertEqual(dup.encode, orig.encode)
          self.assertEqual(dup.name, orig.name)
          self.assertEqual(dup.incrementalencoder, orig.incrementalencoder)
+        self.assertNotHasAttr(dup, '_expat_decoding_table')
  
      def test_pickle(self):
          codec_info = codecs.lookup('utf-8')
@@ -1940,6 +1948,8 @@ class CodecsModuleTest(unittest.TestCase):
                       unpickled_codec_info.incrementalencoder
                  )
                  self.assertTrue(unpickled_codec_info._is_text_encoding)
+                self.assertEqual(unpickled_codec_info._expat_decoding_table,
+                                 codec_info._expat_decoding_table)
  
          # Test a CodecInfo with _is_text_encoding equal to false.
          codec_info = codecs.lookup('base64')
@@ -1955,6 +1965,7 @@ class CodecsModuleTest(unittest.TestCase):
                       unpickled_codec_info.incrementalencoder
                  )
                  self.assertFalse(unpickled_codec_info._is_text_encoding)
+                self.assertNotHasAttr(unpickled_codec_info, '_expat_decoding_table')
  
  
  class StreamReaderTest(unittest.TestCase):
diff --git a/Lib/test/test_pyexpat.py b/Lib/test/test_pyexpat.py

index cddf52d569bcd311f232080567514d67614fcc5b..3f2c5f7021018de6870a6495e79da6ccf1786293 100644 (file)
--- a/Lib/test/test_pyexpat.py
+++ b/Lib/test/test_pyexpat.py
@@ -276,7 +276,9 @@ class ParseTest(unittest.TestCase):
                            expat.errors.XML_ERROR_FINISHED)
  
      @support.subTests('encoding', [
-        'utf-8', 'utf-16', 'utf-16be', 'utf-16le',
+        # built-in Expat encodings
+        'iso-8859-1', 'utf-8', 'utf-16', 'utf-16be', 'utf-16le',
+        # 8-bit Python encodings
          'iso8859-1', 'iso8859-2', 'iso8859-3', 'iso8859-4', 'iso8859-5',
          'iso8859-6', 'iso8859-7', 'iso8859-8', 'iso8859-9', 'iso8859-10',
          'iso8859-13', 'iso8859-14', 'iso8859-15', 'iso8859-16',
@@ -288,6 +290,12 @@ class ParseTest(unittest.TestCase):
          'mac-cyrillic', 'mac-greek', 'mac-iceland', 'mac-latin2',
          'mac-roman', 'mac-turkish',
          'koi8-r', 'koi8-t', 'koi8-u', 'kz1048', 'ptcp154',
+        # multi-byte Python encodings
+        "cp932", "cp949", "cp950",
+        "Big5","EUC-JP", "GB2312", "GBK", "johab", "Shift_JIS",
+        'UTF8', 'utf-8-sig',
+        "Big5-HKSCS", "EUC_JIS-2004", "EUC_JISX0213",
+        "Shift_JIS-2004", "Shift_JISX0213",
      ])
      def test_supported_encodings(self, encoding):
          out = self.Outputter()
@@ -305,7 +313,7 @@ class ParseTest(unittest.TestCase):
          ])
  
      @support.subTests('encoding', [
-        'UTF-8', 'utf-8', 'utf-16', 'utf-16le', 'utf-16be',
+        'UTF-8', 'utf-8', 'utf8', 'utf-16', 'utf-16le', 'utf-16be',
          'koi8-u', 'cp1125', 'cp1251', 'iso8859-5', 'mac-cyrillic',
      ])
      def test_supported_encodings2(self, encoding):
@@ -324,15 +332,46 @@ class ParseTest(unittest.TestCase):
              "End element: 'корінь'",
          ])
  
+    @support.subTests('encoding', [
+        'utf-8', 'utf-16', 'utf-16be', 'utf-16le',
+    ])
+    def test_supported_non_bmp(self, encoding):
+        out = self.Outputter()
+        parser = expat.ParserCreate()
+        self._hookup_callbacks(parser, out)
+        c = '\U00020e6d\U00028e36'
+        data = (f'<?xml version="1.0" encoding="{encoding}"?>\n'
+                f'<root>{c}</root>').encode(encoding)
+        parser.Parse(data, True)
+        self.assertEqual(out.out, [
+            ('XML declaration', ('1.0', encoding, -1)),
+            "Start element: 'root' {}",
+            f'Character data: {c!r}',
+            "End element: 'root'",
+        ])
+
+    @support.subTests('encoding', [
+        'UTF8', 'utf-8-sig',
+        "Big5-HKSCS", "EUC_JIS-2004", "EUC_JISX0213",
+        "Shift_JIS-2004", "Shift_JISX0213",
+    ])
+    def test_unsupported_non_bmp(self, encoding):
+        parser = expat.ParserCreate()
+        c = '\U00020e6d\U00028e36'
+        data = (f'<?xml version="1.0" encoding="{encoding}"?>\n'
+                f'<root>{c}</root>').encode(encoding)
+        with self.assertRaises(expat.ExpatError):
+            parser.Parse(data, True)
+
      @support.subTests('encoding', [
          'UTF-7',
-        "Big5-HKSCS", "Big5",
-        "cp932", "cp949", "cp950",
-        "EUC_JIS-2004", "EUC_JISX0213", "EUC-JP", "EUC-KR",
-        "GB18030", "GB2312", "GBK",
+        "unicode-escape", "raw-unicode-escape",
+        "EUC-KR",
+        "GB18030",
+        "HZ-GB-2312",
+        "ISO-2022-JP", "ISO-2022-JP-1", "ISO-2022-JP-2004",
+        "ISO-2022-JP-2", "ISO-2022-JP-3", "ISO-2022-JP-EXT",
          "ISO-2022-KR",
-        "johab",
-        "Shift_JIS", "Shift_JIS-2004", "Shift_JISX0213",
      ])
      def test_unsupported_encodings(self, encoding):
          parser = expat.ParserCreate()
diff --git a/Lib/test/test_xml_etree.py b/Lib/test/test_xml_etree.py

index 3a41ea97a2e0a2630dd7a00906e46a651be7d21c..8fef5bf663a7c47af1975cf2ac21674f6870817e 100644 (file)
--- a/Lib/test/test_xml_etree.py
+++ b/Lib/test/test_xml_etree.py
@@ -1008,6 +1008,8 @@ class ElementTreeTest(unittest.TestCase):
          check("iso-8859-15", '\u20ac')
          check("cp437", '\u221a')
          check("mac-roman", '\u02da')
+        check('shift-jis-2004', '\u203e\u3406\uff66')
+        check('euc-jis-2004', '\u3406\uff66')
  
          def xml(encoding, body=''):
              return "<?xml version='1.0' encoding='%s'?><xml>%s</xml>" % (encoding, body)
@@ -1026,6 +1028,12 @@ class ElementTreeTest(unittest.TestCase):
              'mac-cyrillic', 'mac-greek', 'mac-iceland', 'mac-latin2',
              'mac-roman', 'mac-turkish',
              'koi8-r', 'koi8-t', 'koi8-u', 'kz1048', 'ptcp154',
+            'big5', 'big5hkscs',
+            'cp932', 'cp949', 'cp950',
+            'euc-jp', 'euc-jis-2004', 'euc-jisx0213',
+            'gb2312', 'gbk', 'johab',
+            'shift-jis', 'shift-jis-2004', 'shift-jisx0213',
+            'utf-8-sig', 'utf8',
          ]
          for encoding in supported_encodings:
              with self.subTest(encoding=encoding):
@@ -1035,12 +1043,10 @@ class ElementTreeTest(unittest.TestCase):
                                   ('<xml>&#%d;</xml>' % ord(c)).encode())
  
          unsupported_ascii_compatible_encodings = [
-            'big5', 'big5hkscs',
-            'cp932', 'cp949', 'cp950',
-            'euc-jp', 'euc-jis-2004', 'euc-jisx0213', 'euc-kr',
-            'gb2312', 'gbk', 'gb18030',
-            'iso2022-kr', 'johab',
-            'shift-jis', 'shift-jis-2004', 'shift-jisx0213',
+            'euc-kr', 'gb18030',
+            'iso2022-jp', 'iso2022-jp-1', 'iso2022-jp-2', 'iso2022-jp-2004',
+            'iso2022-jp-3', 'iso2022-jp-ext',
+            'iso2022-kr', 'hz',
              'utf-7',
          ]
          for encoding in unsupported_ascii_compatible_encodings:
diff --git a/Misc/NEWS.d/next/Library/2026-05-14-17-01-19.gh-issue-62259.ytlFD5.rst b/Misc/NEWS.d/next/Library/2026-05-14-17-01-19.gh-issue-62259.ytlFD5.rst

new file mode 100644 (file)

index 0000000..d0af773
--- /dev/null
+++ b/Misc/NEWS.d/next/Library/2026-05-14-17-01-19.gh-issue-62259.ytlFD5.rst
@@ -0,0 +1,9 @@
+Add support for multiple multi-byte encodings in the :mod:`XML parser
+<xml.parsers.expat>`: "cp932", "cp949", "cp950", "Big5", "EUC-JP", "GB2312",
+"GBK", "johab", and "Shift_JIS". Add partial support (only BMP characters)
+for multi-byte encodings "Big5-HKSCS", "EUC_JIS-2004", "EUC_JISX0213",
+"Shift_JIS-2004", "Shift_JISX0213", "utf-8-sig" and non-standard aliases
+like "UTF8" (without hyphen). The parser now raises :exc:`ValueError` for
+known unsupported multi-byte encodings such as "ISO-2022-JP" or
+"raw-unicode-escape" instead of failing later, when encountering non-ASCII
+data.
diff --git a/Modules/pyexpat.c b/Modules/pyexpat.c

index d204b6f27d99082df940f5e4efbdefcded26a765..aef6ebad9ce578eb9f6727517aca74659eacd468 100644 (file)
--- a/Modules/pyexpat.c
+++ b/Modules/pyexpat.c
@@ -4,6 +4,7 @@
  
  #include "Python.h"
  #include "pycore_ceval.h"         // _Py_EnterRecursiveCall()
+#include "pycore_codecs.h"        // _PyCodec_LookupTextEncoding()
  #include "pycore_import.h"        // _PyImport_SetModule()
  #include "pycore_pyhash.h"        // _Py_HashSecret
  #include "pycore_traceback.h"     // _PyTraceback_Add()
@@ -1438,6 +1439,57 @@ static struct PyMethodDef xmlparse_methods[] = {
     Make it as simple as possible.
  */
  
+typedef struct {
+    int map[256];
+    char name[0];
+} pyexpat_encoding_info;
+
+static pyexpat_encoding_info *
+pyexpat_encoding_create(const char *name, PyObject *mapping)
+{
+    if (!PyTuple_Check(mapping) || PyTuple_GET_SIZE(mapping) != 256) {
+        PyErr_SetString(PyExc_ValueError,
+                        "_expat_decoding_table must be a 256-tuple of integers");
+        return NULL;
+    }
+    pyexpat_encoding_info *info = (pyexpat_encoding_info *)PyMem_Malloc(
+        sizeof(pyexpat_encoding_info) + strlen(name) + 1);
+    if (info == NULL) {
+        PyErr_NoMemory();
+        return NULL;
+    }
+    for (int i = 0; i < 256; i++) {
+        int j = PyLong_AsInt(PyTuple_GET_ITEM(mapping, i));
+        if (j == -1 && PyErr_Occurred()) {
+            PyMem_Free(info);
+            return NULL;
+        }
+        info->map[i] = j;
+    }
+    strcpy(info->name, name);
+    return info;
+}
+
+static int
+pyexpat_encoding_convert(void *data, const char *s)
+{
+    pyexpat_encoding_info *info = (pyexpat_encoding_info *)data;
+    int i = (unsigned char)s[0];
+    assert(info->map[i] < -1);
+    PyObject *u = PyUnicode_Decode(s, -info->map[i], info->name, NULL);
+    if (u == NULL) {
+        return -1;
+    }
+    if (PyUnicode_GET_LENGTH(u) != 1) {
+        Py_DECREF(u);
+        return -1;
+    }
+    Py_UCS4 ch = PyUnicode_ReadChar(u, 0);
+    Py_DECREF(u);
+    return (int)ch;
+}
+
+
  static const unsigned char template_buffer[256] =
      {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
       20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37,
@@ -1470,6 +1522,43 @@ PyUnknownEncodingHandler(void *encodingHandlerData,
      if (PyErr_Occurred())
          return XML_STATUS_ERROR;
  
+    PyObject *codec = _PyCodec_LookupTextEncoding(name, NULL);
+    if (codec == NULL) {
+        return XML_STATUS_ERROR;
+    }
+    if (!PyTuple_CheckExact(codec)) {
+        PyObject *attr;
+        if (PyObject_GetOptionalAttrString(codec, "_expat_decoding_table", &attr) < 0) {
+            Py_DECREF(codec);
+            return XML_STATUS_ERROR;
+        }
+        if (attr != NULL) {
+            if (attr == Py_False) {
+                Py_DECREF(attr);
+                Py_DECREF(codec);
+                PyErr_Format(PyExc_ValueError,
+                             "encoding '%s' is not supported",
+                             name);
+                return XML_STATUS_ERROR;
+            }
+            pyexpat_encoding_info *data = pyexpat_encoding_create(name, attr);
+            Py_DECREF(attr);
+            if (data == NULL) {
+                Py_DECREF(codec);
+                return XML_STATUS_ERROR;
+            }
+            for (i = 0; i < 256; i++) {
+                info->map[i] = data->map[i];
+            }
+            info->data = data;
+            info->convert = pyexpat_encoding_convert;
+            info->release = PyMem_Free;
+            Py_DECREF(codec);
+            return XML_STATUS_OK;
+        }
+    }
+    Py_DECREF(codec);
+
      u = PyUnicode_Decode((const char*) template_buffer, 256, name, "replace");
      if (u == NULL) {
          Py_XDECREF(u);
@@ -1478,8 +1567,9 @@ PyUnknownEncodingHandler(void *encodingHandlerData,
  
      if (PyUnicode_GET_LENGTH(u) != 256) {
          Py_DECREF(u);
-        PyErr_SetString(PyExc_ValueError,
-                        "multi-byte encodings are not supported");
+        PyErr_Format(PyExc_ValueError,
+                     "multi-byte encoding '%s' is not supported",
+                     name);
          return XML_STATUS_ERROR;
      }
  
diff --git a/Python/codecs.c b/Python/codecs.c

index 0bde56c0ac662e10ea31aece19ab73c3bbde57aa..a522e6b88068b3e118c9a09e467fd07af069cd17 100644 (file)
--- a/Python/codecs.c
+++ b/Python/codecs.c
@@ -10,6 +10,7 @@ Copyright (c) Corporation for National Research Initiatives.
  
  #include "Python.h"
  #include "pycore_call.h"          // _PyObject_CallNoArgs()
+#include "pycore_codecs.h"        // export _PyCodec_LookupTextEncoding()
  #include "pycore_interp.h"        // PyInterpreterState.codec_search_path
  #include "pycore_pyerrors.h"      // _PyErr_FormatNote()
  #include "pycore_pystate.h"       // _PyInterpreterState_GET()
author	Serhiy Storchaka <storchaka@gmail.com>
	Tue, 26 May 2026 19:40:25 +0000 (22:40 +0300)
committer	GitHub <noreply@github.com>
	Tue, 26 May 2026 19:40:25 +0000 (19:40 +0000)
Doc/library/pyexpat.rst		patch \| blob \| blame \| history
Doc/whatsnew/3.16.rst		patch \| blob \| blame \| history
Include/internal/pycore_codecs.h		patch \| blob \| blame \| history
Lib/codecs.py		patch \| blob \| blame \| history
Lib/encodings/big5.py		patch \| blob \| blame \| history
Lib/encodings/big5hkscs.py		patch \| blob \| blame \| history
Lib/encodings/cp932.py		patch \| blob \| blame \| history
Lib/encodings/cp949.py		patch \| blob \| blame \| history
Lib/encodings/cp950.py		patch \| blob \| blame \| history
Lib/encodings/euc_jis_2004.py		patch \| blob \| blame \| history
Lib/encodings/euc_jisx0213.py		patch \| blob \| blame \| history
Lib/encodings/euc_jp.py		patch \| blob \| blame \| history
Lib/encodings/euc_kr.py		patch \| blob \| blame \| history
Lib/encodings/gb18030.py		patch \| blob \| blame \| history
Lib/encodings/gb2312.py		patch \| blob \| blame \| history
Lib/encodings/gbk.py		patch \| blob \| blame \| history
Lib/encodings/hz.py		patch \| blob \| blame \| history
Lib/encodings/idna.py		patch \| blob \| blame \| history
Lib/encodings/iso2022_jp.py		patch \| blob \| blame \| history
Lib/encodings/iso2022_jp_1.py		patch \| blob \| blame \| history
Lib/encodings/iso2022_jp_2.py		patch \| blob \| blame \| history
Lib/encodings/iso2022_jp_2004.py		patch \| blob \| blame \| history
Lib/encodings/iso2022_jp_3.py		patch \| blob \| blame \| history
Lib/encodings/iso2022_jp_ext.py		patch \| blob \| blame \| history
Lib/encodings/iso2022_kr.py		patch \| blob \| blame \| history
Lib/encodings/johab.py		patch \| blob \| blame \| history
Lib/encodings/punycode.py		patch \| blob \| blame \| history
Lib/encodings/raw_unicode_escape.py		patch \| blob \| blame \| history
Lib/encodings/shift_jis.py		patch \| blob \| blame \| history
Lib/encodings/shift_jis_2004.py		patch \| blob \| blame \| history
Lib/encodings/shift_jisx0213.py		patch \| blob \| blame \| history
Lib/encodings/unicode_escape.py		patch \| blob \| blame \| history
Lib/encodings/utf_16.py		patch \| blob \| blame \| history
Lib/encodings/utf_16_be.py		patch \| blob \| blame \| history
Lib/encodings/utf_16_le.py		patch \| blob \| blame \| history
Lib/encodings/utf_32.py		patch \| blob \| blame \| history
Lib/encodings/utf_32_be.py		patch \| blob \| blame \| history
Lib/encodings/utf_32_le.py		patch \| blob \| blame \| history
Lib/encodings/utf_7.py		patch \| blob \| blame \| history
Lib/encodings/utf_8.py		patch \| blob \| blame \| history
Lib/encodings/utf_8_sig.py		patch \| blob \| blame \| history
Lib/test/test_codecs.py		patch \| blob \| blame \| history
Lib/test/test_pyexpat.py		patch \| blob \| blame \| history
Lib/test/test_xml_etree.py		patch \| blob \| blame \| history
Misc/NEWS.d/next/Library/2026-05-14-17-01-19.gh-issue-62259.ytlFD5.rst	[new file with mode: 0644]	patch \| blob
Modules/pyexpat.c		patch \| blob \| blame \| history
Python/codecs.c		patch \| blob \| blame \| history