gh-148821: Add more strict tests for XML encodings (GH-149765)

author Serhiy Storchaka <storchaka@gmail.com>

Wed, 13 May 2026 10:40:47 +0000 (13:40 +0300)

committer GitHub <noreply@github.com>

Wed, 13 May 2026 10:40:47 +0000 (13:40 +0300)
author Serhiy Storchaka <storchaka@gmail.com>
Wed, 13 May 2026 10:40:47 +0000 (13:40 +0300)
committer GitHub <noreply@github.com>
Wed, 13 May 2026 10:40:47 +0000 (13:40 +0300)
diff --git a/Lib/test/test_pyexpat.py b/Lib/test/test_pyexpat.py

index 9a1620029c6da97423fcb2c9e6df31e8c791310e..4fe2e02326f04febb43558981464e1e4f7c7d79c 100644 (file)
--- a/Lib/test/test_pyexpat.py
+++ b/Lib/test/test_pyexpat.py
@@ -227,8 +227,7 @@ class ParseTest(unittest.TestCase):
              "Character data: '\xb5'",
              "End element: 'root'",
          ]
-        for operation, expected_operation in zip(operations, expected_operations):
-            self.assertEqual(operation, expected_operation)
+        self.assertEqual(operations, expected_operations)
  
      def test_parse_bytes(self):
          out = self.Outputter()
@@ -276,6 +275,85 @@ class ParseTest(unittest.TestCase):
          self.assertEqual(expat.ErrorString(cm.exception.code),
                            expat.errors.XML_ERROR_FINISHED)
  
+    @support.subTests('encoding', [
+        'utf-8', 'utf-16', 'utf-16be', 'utf-16le',
+        'iso8859-1', 'iso8859-2', 'iso8859-3', 'iso8859-4', 'iso8859-5',
+        'iso8859-6', 'iso8859-7', 'iso8859-8', 'iso8859-9', 'iso8859-10',
+        'iso8859-13', 'iso8859-14', 'iso8859-15', 'iso8859-16',
+        'cp437', 'cp720', 'cp737', 'cp775', 'cp850', 'cp852',
+        'cp855', 'cp856', 'cp857', 'cp858', 'cp860', 'cp861', 'cp862',
+        'cp863', 'cp865', 'cp866', 'cp869', 'cp874', 'cp1006', 'cp1125',
+        'cp1250', 'cp1251', 'cp1252', 'cp1253', 'cp1254', 'cp1255',
+        'cp1256', 'cp1257', 'cp1258',
+        'mac-cyrillic', 'mac-greek', 'mac-iceland', 'mac-latin2',
+        'mac-roman', 'mac-turkish',
+        'koi8-r', 'koi8-t', 'koi8-u', 'kz1048', 'ptcp154',
+    ])
+    def test_supported_encodings(self, encoding):
+        """Parse one non-ASCII character in each supported encoding."""
+        out = self.Outputter()
+        parser = expat.ParserCreate()
+        self._hookup_callbacks(parser, out)
+        # Pick the first sample character that the encoding can represent;
+        # unencodable ones are dropped by the 'ignore' error handler.
+        c = 'éπя\u05d0\u060c€'.encode(encoding, 'ignore').decode(encoding)[0]
+        data = (f'<?xml version="1.0" encoding="{encoding}"?>\n'
+                f'<root>{c}</root>').encode(encoding)
+        parser.Parse(data, True)
+        self.assertEqual(out.out, [
+            ('XML declaration', ('1.0', encoding, -1)),
+            "Start element: 'root' {}",
+            f'Character data: {c!r}',
+            "End element: 'root'",
+        ])
+
+    @support.subTests('encoding', [
+        'UTF-8', 'utf-8', 'utf-16', 'utf-16le', 'utf-16be',
+        'koi8-u', 'cp1125', 'cp1251', 'iso8859-5', 'mac-cyrillic',
+    ])
+    def test_supported_encodings2(self, encoding):
+        """Check non-ASCII names, attributes, comments and character data."""
+        out = self.Outputter()
+        parser = expat.ParserCreate()
+        self._hookup_callbacks(parser, out)
+        data = (f'<?xml version="1.0" encoding="{encoding}"?>\n'
+                '<!-- коментар -->'
+                '<корінь атрибут="значення">зміст</корінь>').encode(encoding)
+        parser.Parse(data, True)
+        self.assertEqual(out.out, [
+            ('XML declaration', ('1.0', encoding, -1)),
+            "Comment: ' коментар '",
+            "Start element: 'корінь' {'атрибут': 'значення'}",
+            "Character data: 'зміст'",
+            "End element: 'корінь'",
+        ])
+
+    @support.subTests('encoding', [
+        'UTF-7',
+        "Big5-HKSCS", "Big5",
+        "cp932", "cp949", "cp950",
+        "EUC_JIS-2004", "EUC_JISX0213", "EUC-JP", "EUC-KR",
+        "GB18030", "GB2312", "GBK",
+        "ISO-2022-KR",
+        "johab",
+        "Shift_JIS", "Shift_JIS-2004", "Shift_JISX0213",
+    ])
+    def test_unsupported_encodings(self, encoding):
+        """Encodings Python can encode to but the parser rejects."""
+        parser = expat.ParserCreate()
+        data = (f'<?xml version="1.0" encoding="{encoding}"?>\n'
+                '<root></root>').encode(encoding)
+        with self.assertRaises(ValueError):
+            parser.Parse(data, True)
+
+    def test_unknown_encoding(self):
+        """An unrecognized encoding name in the declaration raises LookupError."""
+        parser = expat.ParserCreate()
+        data = b'<?xml version="1.0" encoding="xyz"?>\n<root></root>'
+        with self.assertRaises(LookupError):
+            parser.Parse(data, True)
+
+
  class NamespaceSeparatorTest(unittest.TestCase):
      def test_legal(self):
          # Tests that make sure we get errors when the namespace_separator value
diff --git a/Lib/test/test_xml_etree.py b/Lib/test/test_xml_etree.py

index 8f3efe9fc90794b9c13a1a5c9560533d65f81eb6..3a41ea97a2e0a2630dd7a00906e46a651be7d21c 100644 (file)
--- a/Lib/test/test_xml_etree.py
+++ b/Lib/test/test_xml_etree.py
@@ -1009,12 +1009,12 @@ class ElementTreeTest(unittest.TestCase):
          check("cp437", '\u221a')
          check("mac-roman", '\u02da')
  
-        def xml(encoding):
-            return "<?xml version='1.0' encoding='%s'?><xml />" % encoding
-        def bxml(encoding):
-            return xml(encoding).encode(encoding)
+        def xml(encoding, body=''):
+            return "<?xml version='1.0' encoding='%s'?><xml>%s</xml>" % (encoding, body)
+        def bxml(encoding, body=''):
+            return xml(encoding, body).encode(encoding)
          supported_encodings = [
-            'ascii', 'utf-8', 'utf-8-sig', 'utf-16', 'utf-16be', 'utf-16le',
+            'utf-8', 'utf-16', 'utf-16be', 'utf-16le',
              'iso8859-1', 'iso8859-2', 'iso8859-3', 'iso8859-4', 'iso8859-5',
              'iso8859-6', 'iso8859-7', 'iso8859-8', 'iso8859-9', 'iso8859-10',
              'iso8859-13', 'iso8859-14', 'iso8859-15', 'iso8859-16',
@@ -1025,13 +1025,14 @@ class ElementTreeTest(unittest.TestCase):
              'cp1256', 'cp1257', 'cp1258',
              'mac-cyrillic', 'mac-greek', 'mac-iceland', 'mac-latin2',
              'mac-roman', 'mac-turkish',
-            'iso2022-jp', 'iso2022-jp-1', 'iso2022-jp-2', 'iso2022-jp-2004',
-            'iso2022-jp-3', 'iso2022-jp-ext',
-            'koi8-r', 'koi8-t', 'koi8-u', 'kz1048',
-            'hz', 'ptcp154',
+            'koi8-r', 'koi8-t', 'koi8-u', 'kz1048', 'ptcp154',
          ]
          for encoding in supported_encodings:
-            self.assertEqual(ET.tostring(ET.XML(bxml(encoding))), b'<xml />')
+            with self.subTest(encoding=encoding):
+                self.assertEqual(ET.tostring(ET.XML(bxml(encoding))), b'<xml />')
+                c = 'éπя\u05d0\u060c€'.encode(encoding, 'ignore').decode(encoding)[0]
+                self.assertEqual(ET.tostring(ET.XML(bxml(encoding, c))),
+                                 ('<xml>&#%d;</xml>' % ord(c)).encode())
  
          unsupported_ascii_compatible_encodings = [
              'big5', 'big5hkscs',
@@ -1043,14 +1044,16 @@ class ElementTreeTest(unittest.TestCase):
              'utf-7',
          ]
          for encoding in unsupported_ascii_compatible_encodings:
-            self.assertRaises(ValueError, ET.XML, bxml(encoding))
+            with self.subTest(encoding=encoding):
+                self.assertRaises(ValueError, ET.XML, bxml(encoding))
  
          unsupported_ascii_incompatible_encodings = [
              'cp037', 'cp424', 'cp500', 'cp864', 'cp875', 'cp1026', 'cp1140',
              'utf_32', 'utf_32_be', 'utf_32_le',
          ]
          for encoding in unsupported_ascii_incompatible_encodings:
-            self.assertRaises(ET.ParseError, ET.XML, bxml(encoding))
+            with self.subTest(encoding=encoding):
+                self.assertRaises(ET.ParseError, ET.XML, bxml(encoding))
  
          self.assertRaises(ValueError, ET.XML, xml('undefined').encode('ascii'))
          self.assertRaises(LookupError, ET.XML, xml('xxx').encode('ascii'))
author	Serhiy Storchaka <storchaka@gmail.com>
	Wed, 13 May 2026 10:40:47 +0000 (13:40 +0300)
committer	GitHub <noreply@github.com>
	Wed, 13 May 2026 10:40:47 +0000 (13:40 +0300)
Lib/test/test_pyexpat.py		patch \| blob \| blame \| history
Lib/test/test_xml_etree.py		patch \| blob \| blame \| history