[3.13] gh-135661: Fix CDATA section parsing in HTMLParser (GH-135665) (GH-137773)

author Miss Islington (bot) <31488909+miss-islington@users.noreply.github.com>

Thu, 14 Aug 2025 18:44:16 +0000 (20:44 +0200)

committer GitHub <noreply@github.com>

Thu, 14 Aug 2025 18:44:16 +0000 (21:44 +0300)
author Miss Islington (bot) <31488909+miss-islington@users.noreply.github.com>
Thu, 14 Aug 2025 18:44:16 +0000 (20:44 +0200)
committer GitHub <noreply@github.com>
Thu, 14 Aug 2025 18:44:16 +0000 (21:44 +0300)
diff --git a/Lib/html/parser.py b/Lib/html/parser.py

index 5d03c98df5cdd01e2a8818b1eaf11772cfe7b419..75bf8adae6d70a96ce43480c7ca8dd40d55f9bf5 100644 (file)
--- a/Lib/html/parser.py
+++ b/Lib/html/parser.py
@@ -146,6 +146,7 @@ class HTMLParser(_markupbase.ParserBase):
          self.lasttag = '???'
          self.interesting = interesting_normal
          self.cdata_elem = None
+        self._support_cdata = True
          self._escapable = True
          super().reset()
  
@@ -183,6 +184,19 @@ class HTMLParser(_markupbase.ParserBase):
          self.cdata_elem = None
          self._escapable = True
  
+    def _set_support_cdata(self, flag=True):
+        """Enable or disable support of the CDATA sections.
+        If enabled, "<![CDATA[" starts a CDATA section which ends with "]]>".
+        If disabled, "<![CDATA[" starts a bogus comment which ends with ">".
+
+        This method is not called by default. Its purpose is to be called
+        in custom handle_starttag() and handle_endtag() methods, with a
+        value that depends on the adjusted current node.
+        See https://html.spec.whatwg.org/multipage/parsing.html#markup-declaration-open-state
+        for details.
+        """
+        self._support_cdata = flag
+
      # Internal -- handle data as far as reasonable.  May leave state
      # and data to be processed by a subsequent call.  If 'end' is
      # true, force handling all data as if followed by EOF marker.
@@ -258,7 +272,10 @@ class HTMLParser(_markupbase.ParserBase):
                                  break
                          self.handle_comment(rawdata[i+4:j])
                      elif startswith("<![CDATA[", i):
-                        self.unknown_decl(rawdata[i+3:])
+                        if self._support_cdata:
+                            self.unknown_decl(rawdata[i+3:])
+                        else:
+                            self.handle_comment(rawdata[i+1:])
                      elif rawdata[i:i+9].lower() == '<!doctype':
                          self.handle_decl(rawdata[i+2:])
                      elif startswith("<!", i):
@@ -334,7 +351,14 @@ class HTMLParser(_markupbase.ParserBase):
              # this case is actually already handled in goahead()
              return self.parse_comment(i)
          elif rawdata[i:i+9] == '<![CDATA[':
-            return self.parse_marked_section(i)
+            if self._support_cdata:
+                j = rawdata.find(']]>', i+9)
+                if j < 0:
+                    return -1
+                self.unknown_decl(rawdata[i+3: j])
+                return j + 3
+            else:
+                return self.parse_bogus_comment(i)
          elif rawdata[i:i+9].lower() == '<!doctype':
              # find the closing >
              gtpos = rawdata.find('>', i+9)
diff --git a/Lib/test/test_htmlparser.py b/Lib/test/test_htmlparser.py

index 380bbe40177ec587236ce3e206a7c8db63a6dbd4..fff41dab321acd0c33370638ea1b62fc84a5ec05 100644 (file)
--- a/Lib/test/test_htmlparser.py
+++ b/Lib/test/test_htmlparser.py
@@ -10,10 +10,13 @@ from test import support
  
  class EventCollector(html.parser.HTMLParser):
  
-    def __init__(self, *args, **kw):
+    def __init__(self, *args, autocdata=False, **kw):
+        self.autocdata = autocdata
          self.events = []
          self.append = self.events.append
          html.parser.HTMLParser.__init__(self, *args, **kw)
+        if autocdata:
+            self._set_support_cdata(False)
  
      def get_events(self):
          # Normalize the list of events so that buffer artefacts don't
@@ -34,12 +37,16 @@ class EventCollector(html.parser.HTMLParser):
  
      def handle_starttag(self, tag, attrs):
          self.append(("starttag", tag, attrs))
+        if self.autocdata and tag == 'svg':
+            self._set_support_cdata(True)
  
      def handle_startendtag(self, tag, attrs):
          self.append(("startendtag", tag, attrs))
  
      def handle_endtag(self, tag):
          self.append(("endtag", tag))
+        if self.autocdata and tag == 'svg':
+            self._set_support_cdata(False)
  
      # all other markup
  
@@ -767,10 +774,6 @@ text
              ('<!', [('comment', '')]),
              ('<!-', [('comment', '-')]),
              ('<![', [('comment', '[')]),
-            ('<![CDATA[', [('unknown decl', 'CDATA[')]),
-            ('<![CDATA[x', [('unknown decl', 'CDATA[x')]),
-            ('<![CDATA[x]', [('unknown decl', 'CDATA[x]')]),
-            ('<![CDATA[x]]', [('unknown decl', 'CDATA[x]]')]),
              ('<!DOCTYPE', [('decl', 'DOCTYPE')]),
              ('<!DOCTYPE ', [('decl', 'DOCTYPE ')]),
              ('<!DOCTYPE html', [('decl', 'DOCTYPE html')]),
@@ -783,6 +786,18 @@ text
          for html, expected in data:
              self._run_check(html, expected)
  
+    @support.subTests('content', ['', 'x', 'x]', 'x]]'])
+    def test_eof_in_cdata(self, content):
+        self._run_check('<![CDATA[' + content,
+                        [('unknown decl', 'CDATA[' + content)])
+        self._run_check('<![CDATA[' + content,
+                        [('comment', '![CDATA[' + content)],
+                        collector=EventCollector(autocdata=True))
+        self._run_check('<svg><text y="100"><![CDATA[' + content,
+                        [('starttag', 'svg', []),
+                         ('starttag', 'text', [('y', '100')]),
+                         ('unknown decl', 'CDATA[' + content)])
+
      def test_bogus_comments(self):
          html = ('<!ELEMENT br EMPTY>'
                  '<! not really a comment >'
@@ -845,28 +860,53 @@ text
          ]
          self._run_check(html, expected)
  
-    def test_cdata_declarations(self):
-        # More tests should be added. See also "8.2.4.42. Markup
-        # declaration open state", "8.2.4.69. CDATA section state",
-        # and issue 32876
-        html = ('<![CDATA[just some plain text]]>')
-        expected = [('unknown decl', 'CDATA[just some plain text')]
+    @support.subTests('content', [
+        'just some plain text',
+        '<!-- not a comment -->',
+        '&not-an-entity-ref;',
+        "<not a='start tag'>",
+        '',
+        '[[I have many brackets]]',
+        'I have a > in the middle',
+        'I have a ]] in the middle',
+        '] ]>',
+        ']] >',
+        ('\n'
+         '    if (a < b && a > b) {\n'
+         '        printf("[<marquee>How?</marquee>]");\n'
+         '    }\n'),
+    ])
+    def test_cdata_section_content(self, content):
+        # See "13.2.5.42 Markup declaration open state",
+        # "13.2.5.69 CDATA section state", and issue bpo-32876.
+        html = f'<svg><text y="100"><![CDATA[{content}]]></text></svg>'
+        expected = [
+            ('starttag', 'svg', []),
+            ('starttag', 'text', [('y', '100')]),
+            ('unknown decl', 'CDATA[' + content),
+            ('endtag', 'text'),
+            ('endtag', 'svg'),
+        ]
          self._run_check(html, expected)
+        self._run_check(html, expected, collector=EventCollector(autocdata=True))
  
-    def test_cdata_declarations_multiline(self):
-        html = ('<code><![CDATA['
-                '    if (a < b && a > b) {'
-                '        printf("[<marquee>How?</marquee>]");'
-                '    }'
-                ']]></code>')
+    def test_cdata_section(self):
+        # See "13.2.5.42 Markup declaration open state".
+        html = ('<![CDATA[foo<br>bar]]>'
+                '<svg><text y="100"><![CDATA[foo<br>bar]]></text></svg>'
+                '<![CDATA[foo<br>bar]]>')
          expected = [
-            ('starttag', 'code', []),
-            ('unknown decl',
-             'CDATA[    if (a < b && a > b) {        '
-             'printf("[<marquee>How?</marquee>]");    }'),
-            ('endtag', 'code')
+            ('comment', '[CDATA[foo<br'),
+            ('data', 'bar]]>'),
+            ('starttag', 'svg', []),
+            ('starttag', 'text', [('y', '100')]),
+            ('unknown decl', 'CDATA[foo<br>bar'),
+            ('endtag', 'text'),
+            ('endtag', 'svg'),
+            ('comment', '[CDATA[foo<br'),
+            ('data', 'bar]]>'),
          ]
-        self._run_check(html, expected)
+        self._run_check(html, expected, collector=EventCollector(autocdata=True))
  
      def test_convert_charrefs_dropped_text(self):
          # #23144: make sure that all the events are triggered when
diff --git a/Misc/NEWS.d/next/Security/2025-06-18-13-34-55.gh-issue-135661.NZlpWf.rst b/Misc/NEWS.d/next/Security/2025-06-18-13-34-55.gh-issue-135661.NZlpWf.rst

new file mode 100644 (file)

index 0000000..fe000d9
--- /dev/null
+++ b/Misc/NEWS.d/next/Security/2025-06-18-13-34-55.gh-issue-135661.NZlpWf.rst
@@ -0,0 +1,5 @@
+Fix CDATA section parsing in :class:`html.parser.HTMLParser` according to
+the HTML5 standard: ``] ]>`` and ``]] >`` no longer end the CDATA section.
+Add private method ``_set_support_cdata()`` which can be used to specify
+how to parse ``<![CDATA[`` --- as a CDATA section in foreign content
+(SVG or MathML) or as a bogus comment in the HTML namespace.
author	Miss Islington (bot) <31488909+miss-islington@users.noreply.github.com>
	Thu, 14 Aug 2025 18:44:16 +0000 (20:44 +0200)
committer	GitHub <noreply@github.com>
	Thu, 14 Aug 2025 18:44:16 +0000 (21:44 +0300)
Lib/html/parser.py		patch \| blob \| blame \| history
Lib/test/test_htmlparser.py		patch \| blob \| blame \| history
Misc/NEWS.d/next/Security/2025-06-18-13-34-55.gh-issue-135661.NZlpWf.rst	[new file with mode: 0644]	patch \| blob