[3.13] gh-135661: Fix parsing unterminated bogus comments in HTMLParser (GH-137873...

author Miss Islington (bot) <31488909+miss-islington@users.noreply.github.com>

Sun, 17 Aug 2025 10:59:24 +0000 (12:59 +0200)

committer GitHub <noreply@github.com>

Sun, 17 Aug 2025 10:59:24 +0000 (10:59 +0000)
author Miss Islington (bot) <31488909+miss-islington@users.noreply.github.com>
Sun, 17 Aug 2025 10:59:24 +0000 (12:59 +0200)
committer GitHub <noreply@github.com>
Sun, 17 Aug 2025 10:59:24 +0000 (10:59 +0000)
diff --git a/Lib/html/parser.py b/Lib/html/parser.py

index 75bf8adae6d70a96ce43480c7ca8dd40d55f9bf5..5d7050dad2396bbe1e1ec52c49aacec9b724ab36 100644 (file)
--- a/Lib/html/parser.py
+++ b/Lib/html/parser.py
@@ -271,11 +271,8 @@ class HTMLParser(_markupbase.ParserBase):
                                  j -= len(suffix)
                                  break
                          self.handle_comment(rawdata[i+4:j])
-                    elif startswith("<![CDATA[", i):
-                        if self._support_cdata:
-                            self.unknown_decl(rawdata[i+3:])
-                        else:
-                            self.handle_comment(rawdata[i+1:])
+                    elif startswith("<![CDATA[", i) and self._support_cdata:
+                        self.unknown_decl(rawdata[i+3:])
                      elif rawdata[i:i+9].lower() == '<!doctype':
                          self.handle_decl(rawdata[i+2:])
                      elif startswith("<!", i):
@@ -350,15 +347,12 @@ class HTMLParser(_markupbase.ParserBase):
          if rawdata[i:i+4] == '<!--':
              # this case is actually already handled in goahead()
              return self.parse_comment(i)
-        elif rawdata[i:i+9] == '<![CDATA[':
-            if self._support_cdata:
-                j = rawdata.find(']]>', i+9)
-                if j < 0:
-                    return -1
-                self.unknown_decl(rawdata[i+3: j])
-                return j + 3
-            else:
-                return self.parse_bogus_comment(i)
+        elif rawdata[i:i+9] == '<![CDATA[' and self._support_cdata:
+            j = rawdata.find(']]>', i+9)
+            if j < 0:
+                return -1
+            self.unknown_decl(rawdata[i+3: j])
+            return j + 3
          elif rawdata[i:i+9].lower() == '<!doctype':
              # find the closing >
              gtpos = rawdata.find('>', i+9)
diff --git a/Lib/test/test_htmlparser.py b/Lib/test/test_htmlparser.py

index fff41dab321acd0c33370638ea1b62fc84a5ec05..6a1d69335a0616d69fc8c08013dd9a21af115066 100644 (file)
--- a/Lib/test/test_htmlparser.py
+++ b/Lib/test/test_htmlparser.py
@@ -791,7 +791,7 @@ text
          self._run_check('<![CDATA[' + content,
                          [('unknown decl', 'CDATA[' + content)])
          self._run_check('<![CDATA[' + content,
-                        [('comment', '![CDATA[' + content)],
+                        [('comment', '[CDATA[' + content)],
                          collector=EventCollector(autocdata=True))
          self._run_check('<svg><text y="100"><![CDATA[' + content,
                          [('starttag', 'svg', []),
author	Miss Islington (bot) <31488909+miss-islington@users.noreply.github.com>
	Sun, 17 Aug 2025 10:59:24 +0000 (12:59 +0200)
committer	GitHub <noreply@github.com>
	Sun, 17 Aug 2025 10:59:24 +0000 (10:59 +0000)
Lib/html/parser.py		patch | blob | blame | history
Lib/test/test_htmlparser.py		patch | blob | blame | history