]> git.ipfire.org Git - thirdparty/Python/cpython.git/commitdiff
[3.12] gh-121188: Sanitize invalid XML characters in regrtest (GH-121195) (#121205)
authorMiss Islington (bot) <31488909+miss-islington@users.noreply.github.com>
Mon, 1 Jul 2024 08:47:38 +0000 (10:47 +0200)
committerGitHub <noreply@github.com>
Mon, 1 Jul 2024 08:47:38 +0000 (08:47 +0000)
gh-121188: Sanitize invalid XML characters in regrtest (GH-121195)

When creating the JUnit XML file, regrtest now escapes characters
which are invalid in XML, such as the chr(27) control character used
in ANSI escape sequences.
(cherry picked from commit af8c3d7a26d605099f5b3406a8d33ecddb77e8fb)

Co-authored-by: Victor Stinner <vstinner@python.org>
Lib/test/libregrtest/testresult.py
Lib/test/libregrtest/utils.py
Lib/test/test_regrtest.py
Misc/NEWS.d/next/Tests/2024-07-01-09-04-32.gh-issue-121188.XbuTVa.rst [new file with mode: 0644]

index de23fdd59ded957736f0ce63e1c43acaf7eb7f49..1820f354572521accb4ec7340942f8b13505d0e2 100644 (file)
@@ -9,6 +9,7 @@ import time
 import traceback
 import unittest
 from test import support
+from test.libregrtest.utils import sanitize_xml
 
 class RegressionTestResult(unittest.TextTestResult):
     USE_XML = False
@@ -65,23 +66,24 @@ class RegressionTestResult(unittest.TextTestResult):
         if capture:
             if self._stdout_buffer is not None:
                 stdout = self._stdout_buffer.getvalue().rstrip()
-                ET.SubElement(e, 'system-out').text = stdout
+                ET.SubElement(e, 'system-out').text = sanitize_xml(stdout)
             if self._stderr_buffer is not None:
                 stderr = self._stderr_buffer.getvalue().rstrip()
-                ET.SubElement(e, 'system-err').text = stderr
+                ET.SubElement(e, 'system-err').text = sanitize_xml(stderr)
 
         for k, v in args.items():
             if not k or not v:
                 continue
+
             e2 = ET.SubElement(e, k)
             if hasattr(v, 'items'):
                 for k2, v2 in v.items():
                     if k2:
-                        e2.set(k2, str(v2))
+                        e2.set(k2, sanitize_xml(str(v2)))
                     else:
-                        e2.text = str(v2)
+                        e2.text = sanitize_xml(str(v2))
             else:
-                e2.text = str(v)
+                e2.text = sanitize_xml(str(v))
 
     @classmethod
     def __makeErrorDict(cls, err_type, err_value, err_tb):
index 265dbf9ab75a75d9c9f78dc9cece4b7c565d0b34..097ba777b23b58de87010465c6e7e861f652667f 100644 (file)
@@ -5,6 +5,7 @@ import math
 import os.path
 import platform
 import random
+import re
 import shlex
 import signal
 import subprocess
@@ -710,3 +711,24 @@ def get_signal_name(exitcode):
         pass
 
     return None
+
+
+ILLEGAL_XML_CHARS_RE = re.compile(
+    '['
+    # Control characters; newline (\x0A and \x0D) and TAB (\x09) are legal
+    '\x00-\x08\x0B\x0C\x0E-\x1F'
+    # Surrogate characters
+    '\uD800-\uDFFF'
+    # Special Unicode characters
+    '\uFFFE'
+    '\uFFFF'
+    # Match multiple sequential invalid characters for better effiency
+    ']+')
+
+def _sanitize_xml_replace(regs):
+    text = regs[0]
+    return ''.join(f'\\x{ord(ch):02x}' if ch <= '\xff' else ascii(ch)[1:-1]
+                   for ch in text)
+
+def sanitize_xml(text):
+    return ILLEGAL_XML_CHARS_RE.sub(_sanitize_xml_replace, text)
index 8135a3fdad17a6e879efb1ec64362185af45f7d2..75196ac040a87e4505b1887bce350fabf66d89e7 100644 (file)
@@ -21,6 +21,8 @@ import sysconfig
 import tempfile
 import textwrap
 import unittest
+from xml.etree import ElementTree
+
 from test import support
 from test.support import os_helper
 from test.libregrtest import cmdline
@@ -2221,6 +2223,44 @@ class ArgsTestCase(BaseTestCase):
             self.check_executed_tests(output, testname, stats=1, parallel=True)
             self.assertNotIn('SPAM SPAM SPAM', output)
 
+    def test_xml(self):
+        code = textwrap.dedent(r"""
+            import unittest
+            from test import support
+
+            class VerboseTests(unittest.TestCase):
+                def test_failed(self):
+                    print("abc \x1b def")
+                    self.fail()
+        """)
+        testname = self.create_test(code=code)
+
+        # Run sequentially
+        filename = os_helper.TESTFN
+        self.addCleanup(os_helper.unlink, filename)
+
+        output = self.run_tests(testname, "--junit-xml", filename,
+                                exitcode=EXITCODE_BAD_TEST)
+        self.check_executed_tests(output, testname,
+                                  failed=testname,
+                                  stats=TestStats(1, 1, 0))
+
+        # Test generated XML
+        with open(filename, encoding="utf8") as fp:
+            content = fp.read()
+
+        testsuite = ElementTree.fromstring(content)
+        self.assertEqual(int(testsuite.get('tests')), 1)
+        self.assertEqual(int(testsuite.get('errors')), 0)
+        self.assertEqual(int(testsuite.get('failures')), 1)
+
+        testcase = testsuite[0][0]
+        self.assertEqual(testcase.get('status'), 'run')
+        self.assertEqual(testcase.get('result'), 'completed')
+        self.assertGreater(float(testcase.get('time')), 0)
+        for out in testcase.iter('system-out'):
+            self.assertEqual(out.text, r"abc \x1b def")
+
 
 class TestUtils(unittest.TestCase):
     def test_format_duration(self):
@@ -2403,6 +2443,25 @@ class TestUtils(unittest.TestCase):
             self.assertTrue(match_test(test_chdir))
             self.assertFalse(match_test(test_copy))
 
+    def test_sanitize_xml(self):
+        sanitize_xml = utils.sanitize_xml
+
+        # escape invalid XML characters
+        self.assertEqual(sanitize_xml('abc \x1b\x1f def'),
+                         r'abc \x1b\x1f def')
+        self.assertEqual(sanitize_xml('nul:\x00, bell:\x07'),
+                         r'nul:\x00, bell:\x07')
+        self.assertEqual(sanitize_xml('surrogate:\uDC80'),
+                         r'surrogate:\udc80')
+        self.assertEqual(sanitize_xml('illegal \uFFFE and \uFFFF'),
+                         r'illegal \ufffe and \uffff')
+
+        # no escape for valid XML characters
+        self.assertEqual(sanitize_xml('a\n\tb'),
+                         'a\n\tb')
+        self.assertEqual(sanitize_xml('valid t\xe9xt \u20ac'),
+                         'valid t\xe9xt \u20ac')
+
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/Misc/NEWS.d/next/Tests/2024-07-01-09-04-32.gh-issue-121188.XbuTVa.rst b/Misc/NEWS.d/next/Tests/2024-07-01-09-04-32.gh-issue-121188.XbuTVa.rst
new file mode 100644 (file)
index 0000000..c92002d
--- /dev/null
@@ -0,0 +1,3 @@
+When creating the JUnit XML file, regrtest now escapes characters which are
+invalid in XML, such as the chr(27) control character used in ANSI escape
+sequences. Patch by Victor Stinner.