git.ipfire.org Git - thirdparty/Python/cpython.git/commitdiff
gh-121188: Sanitize invalid XML characters in regrtest (#121195)
author Victor Stinner <vstinner@python.org>
Mon, 1 Jul 2024 08:30:33 +0000 (10:30 +0200)
committer GitHub <noreply@github.com>
Mon, 1 Jul 2024 08:30:33 +0000 (08:30 +0000)
When creating the JUnit XML file, regrtest now escapes characters
which are invalid in XML, such as the chr(27) control character used
in ANSI escape sequences.

Lib/test/libregrtest/testresult.py
Lib/test/libregrtest/utils.py
Lib/test/test_regrtest.py
Misc/NEWS.d/next/Tests/2024-07-01-09-04-32.gh-issue-121188.XbuTVa.rst [new file with mode: 0644]

index de23fdd59ded957736f0ce63e1c43acaf7eb7f49..1820f354572521accb4ec7340942f8b13505d0e2 100644 (file)
@@ -9,6 +9,7 @@ import time
 import traceback
 import unittest
 from test import support
+from test.libregrtest.utils import sanitize_xml
 
 class RegressionTestResult(unittest.TextTestResult):
     USE_XML = False
@@ -65,23 +66,24 @@ class RegressionTestResult(unittest.TextTestResult):
         if capture:
             if self._stdout_buffer is not None:
                 stdout = self._stdout_buffer.getvalue().rstrip()
-                ET.SubElement(e, 'system-out').text = stdout
+                ET.SubElement(e, 'system-out').text = sanitize_xml(stdout)
             if self._stderr_buffer is not None:
                 stderr = self._stderr_buffer.getvalue().rstrip()
-                ET.SubElement(e, 'system-err').text = stderr
+                ET.SubElement(e, 'system-err').text = sanitize_xml(stderr)
 
         for k, v in args.items():
             if not k or not v:
                 continue
+
             e2 = ET.SubElement(e, k)
             if hasattr(v, 'items'):
                 for k2, v2 in v.items():
                     if k2:
-                        e2.set(k2, str(v2))
+                        e2.set(k2, sanitize_xml(str(v2)))
                     else:
-                        e2.text = str(v2)
+                        e2.text = sanitize_xml(str(v2))
             else:
-                e2.text = str(v)
+                e2.text = sanitize_xml(str(v))
 
     @classmethod
     def __makeErrorDict(cls, err_type, err_value, err_tb):
index 8253d330b95b81151f74ebec79f2ddb5ffe2c6bd..0167742d388a2c3f3c852aa9316c561daacb39df 100644 (file)
@@ -5,6 +5,7 @@ import math
 import os.path
 import platform
 import random
+import re
 import shlex
 import signal
 import subprocess
@@ -712,3 +713,24 @@ def get_signal_name(exitcode):
         pass
 
     return None
+
+
+ILLEGAL_XML_CHARS_RE = re.compile(
+    '['
+    # Control characters; newline (\x0A and \x0D) and TAB (\x09) are legal
+    '\x00-\x08\x0B\x0C\x0E-\x1F'
+    # Surrogate characters
+    '\uD800-\uDFFF'
+    # Special Unicode characters
+    '\uFFFE'
+    '\uFFFF'
+    # Match multiple sequential invalid characters for better efficiency
+    ']+')
+
+def _sanitize_xml_replace(regs):
+    text = regs[0]
+    return ''.join(f'\\x{ord(ch):02x}' if ch <= '\xff' else ascii(ch)[1:-1]
+                   for ch in text)
+
+def sanitize_xml(text):
+    return ILLEGAL_XML_CHARS_RE.sub(_sanitize_xml_replace, text)
index 0a15170a99e757ebd51ecbe8907c9e7aa135eb28..44fd11bfdc3fcbfd0a4b681672038cd83ed2a8ff 100644 (file)
@@ -21,6 +21,8 @@ import sysconfig
 import tempfile
 import textwrap
 import unittest
+from xml.etree import ElementTree
+
 from test import support
 from test.support import import_helper
 from test.support import os_helper
@@ -2254,6 +2256,44 @@ class ArgsTestCase(BaseTestCase):
             self.check_executed_tests(output, testname, stats=1, parallel=True)
             self.assertNotIn('SPAM SPAM SPAM', output)
 
+    def test_xml(self):
+        code = textwrap.dedent(r"""
+            import unittest
+            from test import support
+
+            class VerboseTests(unittest.TestCase):
+                def test_failed(self):
+                    print("abc \x1b def")
+                    self.fail()
+        """)
+        testname = self.create_test(code=code)
+
+        # Run sequentially
+        filename = os_helper.TESTFN
+        self.addCleanup(os_helper.unlink, filename)
+
+        output = self.run_tests(testname, "--junit-xml", filename,
+                                exitcode=EXITCODE_BAD_TEST)
+        self.check_executed_tests(output, testname,
+                                  failed=testname,
+                                  stats=TestStats(1, 1, 0))
+
+        # Test generated XML
+        with open(filename, encoding="utf8") as fp:
+            content = fp.read()
+
+        testsuite = ElementTree.fromstring(content)
+        self.assertEqual(int(testsuite.get('tests')), 1)
+        self.assertEqual(int(testsuite.get('errors')), 0)
+        self.assertEqual(int(testsuite.get('failures')), 1)
+
+        testcase = testsuite[0][0]
+        self.assertEqual(testcase.get('status'), 'run')
+        self.assertEqual(testcase.get('result'), 'completed')
+        self.assertGreater(float(testcase.get('time')), 0)
+        for out in testcase.iter('system-out'):
+            self.assertEqual(out.text, r"abc \x1b def")
+
 
 class TestUtils(unittest.TestCase):
     def test_format_duration(self):
@@ -2437,6 +2477,25 @@ class TestUtils(unittest.TestCase):
             self.assertTrue(match_test(test_chdir))
             self.assertFalse(match_test(test_copy))
 
+    def test_sanitize_xml(self):
+        sanitize_xml = utils.sanitize_xml
+
+        # escape invalid XML characters
+        self.assertEqual(sanitize_xml('abc \x1b\x1f def'),
+                         r'abc \x1b\x1f def')
+        self.assertEqual(sanitize_xml('nul:\x00, bell:\x07'),
+                         r'nul:\x00, bell:\x07')
+        self.assertEqual(sanitize_xml('surrogate:\uDC80'),
+                         r'surrogate:\udc80')
+        self.assertEqual(sanitize_xml('illegal \uFFFE and \uFFFF'),
+                         r'illegal \ufffe and \uffff')
+
+        # no escape for valid XML characters
+        self.assertEqual(sanitize_xml('a\n\tb'),
+                         'a\n\tb')
+        self.assertEqual(sanitize_xml('valid t\xe9xt \u20ac'),
+                         'valid t\xe9xt \u20ac')
+
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/Misc/NEWS.d/next/Tests/2024-07-01-09-04-32.gh-issue-121188.XbuTVa.rst b/Misc/NEWS.d/next/Tests/2024-07-01-09-04-32.gh-issue-121188.XbuTVa.rst
new file mode 100644 (file)
index 0000000..c92002d
--- /dev/null
@@ -0,0 +1,3 @@
+When creating the JUnit XML file, regrtest now escapes characters which are
+invalid in XML, such as the chr(27) control character used in ANSI escape
+sequences. Patch by Victor Stinner.