gh-149489: Fix ElementTree serialization to HTML (GH-149490)

author Serhiy Storchaka <storchaka@gmail.com>

Fri, 29 May 2026 21:04:50 +0000 (00:04 +0300)

committer GitHub <noreply@github.com>

Fri, 29 May 2026 21:04:50 +0000 (00:04 +0300)
author Serhiy Storchaka <storchaka@gmail.com>
Fri, 29 May 2026 21:04:50 +0000 (00:04 +0300)
committer GitHub <noreply@github.com>
Fri, 29 May 2026 21:04:50 +0000 (00:04 +0300)
diff --git a/Lib/test/test_xml_etree.py b/Lib/test/test_xml_etree.py

index 3a4d4098fbf567a122935fff1f66b431c3996218..89aff568a1b4ef92119b24adc999ddc981c8ac43 100644 (file)
--- a/Lib/test/test_xml_etree.py
+++ b/Lib/test/test_xml_etree.py
@@ -1287,7 +1287,15 @@ class ElementTreeTest(unittest.TestCase):
                {'': 'http://www.w3.org/2001/XMLSchema',
                 'ns': 'http://www.w3.org/2001/XMLSchema'})
  
-    def test_processinginstruction(self):
+    def test_comment_serialization(self):
+        comm = ET.Comment('<spam> & ham')
+        # comments are not escaped
+        self.assertEqual(ET.tostring(comm), b'<!--<spam> & ham-->')
+        self.assertEqual(ET.tostring(comm, method='html'), b'<!--<spam> & ham-->')
+        # no comments in text serialization
+        self.assertEqual(ET.tostring(comm, method='text'), b'')
+
+    def test_processinginstruction_serialization(self):
          # Test ProcessingInstruction directly
  
          self.assertEqual(ET.tostring(ET.ProcessingInstruction('test', 'instruction')),
@@ -1296,12 +1304,32 @@ class ElementTreeTest(unittest.TestCase):
                  b'<?test instruction?>')
  
          # Issue #2746
-
+        # processing instructions are not escaped
          self.assertEqual(ET.tostring(ET.PI('test', '<testing&>')),
                  b'<?test <testing&>?>')
          self.assertEqual(ET.tostring(ET.PI('test', '<testing&>\xe3'), 'latin-1'),
                  b"<?xml version='1.0' encoding='latin-1'?>\n"
                  b"<?test <testing&>\xe3?>")
+        pi = ET.PI('test', 'ham & eggs < spam')
+        self.assertEqual(ET.tostring(pi), b'<?test ham & eggs < spam?>')
+        self.assertEqual(ET.tostring(pi, method='html'), b'<?test ham & eggs < spam?>')
+        # no processing instructions in text serialization
+        self.assertEqual(ET.tostring(pi, method='text'), b'')
+
+    def test_empty_attribute_serialization(self):
+        # empty attrs only work in html
+        elem = ET.Element('tag', attrib={'attr': None})
+        self.assertRaises(TypeError, ET.tostring, elem)
+        self.assertEqual(ET.tostring(elem, method='html'), b'<tag attr></tag>')
+
+    @support.subTests('tag', ("script", "style", "xmp", "iframe", "noembed", "noframes"))
+    def test_html_cdata_elems_serialization(self, tag):
+        # content of raw text elements is not escaped in html
+        tag = tag.title()
+        elem = ET.Element(tag)
+        elem.text = '<spam>&ham'
+        self.assertEqual(ET.tostring(elem, method='html'),
+                         ('<%s><spam>&ham</%s>' % (tag, tag)).encode())
  
      def test_html_empty_elems_serialization(self):
          # issue 15970
@@ -1317,6 +1345,14 @@ class ElementTreeTest(unittest.TestCase):
                                         method='html')
                  self.assertEqual(serialized, expected)
  
+    def test_html_plaintext_serialization(self):
+        # content of plaintext is not escaped in html
+        # no end tag for plaintext
+        elem = ET.Element('PlainText')
+        elem.text = '<spam>&ham'
+        self.assertEqual(ET.tostring(elem, method='html'),
+                         b'<PlainText><spam>&ham')
+
      def test_dump_attribute_order(self):
          # See BPO 34160
          e = ET.Element('cirriculum', status='public', company='example')
diff --git a/Lib/xml/etree/ElementTree.py b/Lib/xml/etree/ElementTree.py

index 75bebc0b1668abd87ab8e03e6dfb37946354e0d0..53727d7940b3f2ab397f6a65d441ce8742a2dadd 100644 (file)
--- a/Lib/xml/etree/ElementTree.py
+++ b/Lib/xml/etree/ElementTree.py
@@ -917,17 +917,20 @@ def _serialize_xml(write, elem, qnames, namespaces,
      if elem.tail:
          write(_escape_cdata(elem.tail))
  
+_CDATA_CONTENT_ELEMENTS = {"script", "style", "xmp", "iframe", "noembed",
+                           "noframes", "plaintext"}
+
  HTML_EMPTY = {"area", "base", "basefont", "br", "col", "embed", "frame", "hr",
                "img", "input", "isindex", "link", "meta", "param", "source",
-              "track", "wbr"}
+              "track", "wbr", "plaintext"}
  
  def _serialize_html(write, elem, qnames, namespaces, **kwargs):
      tag = elem.tag
      text = elem.text
      if tag is Comment:
-        write("<!--%s-->" % _escape_cdata(text))
+        write("<!--%s-->" % text)
      elif tag is ProcessingInstruction:
-        write("<?%s?>" % _escape_cdata(text))
+        write("<?%s?>" % text)
      else:
          tag = qnames[tag]
          if tag is None:
@@ -951,16 +954,19 @@ def _serialize_html(write, elem, qnames, namespaces, **kwargs):
                  for k, v in items:
                      if isinstance(k, QName):
                          k = k.text
-                    if isinstance(v, QName):
-                        v = qnames[v.text]
+                    k = qnames[k]
+                    if v is None:
+                        write(" %s" % k)  # empty attr
                      else:
-                        v = _escape_attrib_html(v)
-                    # FIXME: handle boolean attributes
-                    write(" %s=\"%s\"" % (qnames[k], v))
+                        if isinstance(v, QName):
+                            v = qnames[v.text]
+                        else:
+                            v = _escape_attrib_html(v)
+                        write(" %s=\"%s\"" % (k, v))
              write(">")
              ltag = tag.lower()
              if text:
-                if ltag == "script" or ltag == "style":
+                if ltag in _CDATA_CONTENT_ELEMENTS:
                      write(text)
                  else:
                      write(_escape_cdata(text))
diff --git a/Misc/NEWS.d/next/Library/2026-05-07-14-18-47.gh-issue-149489.bX9iHe.rst b/Misc/NEWS.d/next/Library/2026-05-07-14-18-47.gh-issue-149489.bX9iHe.rst

new file mode 100644 (file)

index 0000000..1550c89
--- /dev/null
+++ b/Misc/NEWS.d/next/Library/2026-05-07-14-18-47.gh-issue-149489.bX9iHe.rst
@@ -0,0 +1,5 @@
+Fix :mod:`~xml.etree.ElementTree` serialization to HTML. The content of
+comments, processing instructions and elements "xmp", "iframe", "noembed",
+"noframes", and "plaintext" is no longer escaped. The "plaintext" element no
+longer has a closing tag. Add support for empty attributes (with value
+``None``).
author	Serhiy Storchaka <storchaka@gmail.com>
	Fri, 29 May 2026 21:04:50 +0000 (00:04 +0300)
committer	GitHub <noreply@github.com>
	Fri, 29 May 2026 21:04:50 +0000 (00:04 +0300)
Lib/test/test_xml_etree.py		patch \| blob \| blame \| history
Lib/xml/etree/ElementTree.py		patch \| blob \| blame \| history
Misc/NEWS.d/next/Library/2026-05-07-14-18-47.gh-issue-149489.bX9iHe.rst	[new file with mode: 0644]	patch \| blob