]> git.ipfire.org Git - thirdparty/Python/cpython.git/commitdiff
gh-102555: Fix comment parsing in HTMLParser according to the HTML5 standard (GH...
authorSerhiy Storchaka <storchaka@gmail.com>
Fri, 4 Jul 2025 07:00:23 +0000 (10:00 +0300)
committerGitHub <noreply@github.com>
Fri, 4 Jul 2025 07:00:23 +0000 (07:00 +0000)
* "--!>" now ends the comment.
* "-- >" no longer ends the comment.
* Support abnormally ended empty comments "<-->" and "<--->".

---------

Co-author: Kerim Kabirov <the.privat33r+gh@pm.me>
Co-authored-by: Ezio Melotti <ezio.melotti@gmail.com>
Lib/html/parser.py
Lib/test/test_htmlparser.py
Misc/NEWS.d/next/Security/2025-06-18-13-28-08.gh-issue-102555.nADrzJ.rst [new file with mode: 0644]

index cc15de07b5bae64bda221ef47307c7450dca6658..9b4f09599134bd611f3555083fffcd12baddfe7a 100644 (file)
@@ -29,7 +29,8 @@ attr_charref = re.compile(r'&(#[0-9]+|#[xX][0-9a-fA-F]+|[a-zA-Z][a-zA-Z0-9]*)[;=
 starttagopen = re.compile('<[a-zA-Z]')
 endtagopen = re.compile('</[a-zA-Z]')
 piclose = re.compile('>')
-commentclose = re.compile(r'--\s*>')
+commentclose = re.compile(r'--!?>')
+commentabruptclose = re.compile(r'-?>')
 # Note:
 #  1) if you change tagfind/attrfind remember to update locatetagend too;
 #  2) if you change tagfind/attrfind and/or locatetagend the parser will
@@ -336,6 +337,21 @@ class HTMLParser(_markupbase.ParserBase):
         else:
             return self.parse_bogus_comment(i)
 
+    # Internal -- parse comment, return length or -1 if not terminated
+    # see https://html.spec.whatwg.org/multipage/parsing.html#comment-start-state
+    def parse_comment(self, i, report=True):
+        rawdata = self.rawdata
+        assert rawdata.startswith('<!--', i), 'unexpected call to parse_comment()'
+        match = commentclose.search(rawdata, i+4)
+        if not match:
+            match = commentabruptclose.match(rawdata, i+4)
+            if not match:
+                return -1
+        if report:
+            j = match.start()
+            self.handle_comment(rawdata[i+4: j])
+        return match.end()
+
     # Internal -- parse bogus comment, return length or -1 if not terminated
     # see https://html.spec.whatwg.org/multipage/parsing.html#bogus-comment-state
     def parse_bogus_comment(self, i, report=1):
index d0d2c54217ccafe4815165cbe70b85312021acdf..15cad061889a79bc5c8dcbade9a3b40feed4b7d2 100644 (file)
@@ -367,17 +367,45 @@ text
         html = ("<!-- I'm a valid comment -->"
                 '<!--me too!-->'
                 '<!------>'
+                '<!----->'
                 '<!---->'
+                # abrupt-closing-of-empty-comment
+                '<!--->'
+                '<!-->'
                 '<!----I have many hyphens---->'
                 '<!-- I have a > in the middle -->'
-                '<!-- and I have -- in the middle! -->')
+                '<!-- and I have -- in the middle! -->'
+                '<!--incorrectly-closed-comment--!>'
+                '<!----!>'
+                '<!----!-->'
+                '<!---- >-->'
+                '<!---!>-->'
+                '<!--!>-->'
+                # nested-comment
+                '<!-- <!-- nested --> -->'
+                '<!--<!-->'
+                '<!--<!--!>'
+        )
         expected = [('comment', " I'm a valid comment "),
                     ('comment', 'me too!'),
                     ('comment', '--'),
+                    ('comment', '-'),
+                    ('comment', ''),
+                    ('comment', ''),
                     ('comment', ''),
                     ('comment', '--I have many hyphens--'),
                     ('comment', ' I have a > in the middle '),
-                    ('comment', ' and I have -- in the middle! ')]
+                    ('comment', ' and I have -- in the middle! '),
+                    ('comment', 'incorrectly-closed-comment'),
+                    ('comment', ''),
+                    ('comment', '--!'),
+                    ('comment', '-- >'),
+                    ('comment', '-!>'),
+                    ('comment', '!>'),
+                    ('comment', ' <!-- nested '), ('data', ' -->'),
+                    ('comment', '<!'),
+                    ('comment', '<!'),
+        ]
         self._run_check(html, expected)
 
     def test_condcoms(self):
diff --git a/Misc/NEWS.d/next/Security/2025-06-18-13-28-08.gh-issue-102555.nADrzJ.rst b/Misc/NEWS.d/next/Security/2025-06-18-13-28-08.gh-issue-102555.nADrzJ.rst
new file mode 100644 (file)
index 0000000..71d15ee
--- /dev/null
@@ -0,0 +1,3 @@
+Fix comment parsing in :class:`html.parser.HTMLParser` according to the
+HTML5 standard. ``--!>`` now ends the comment. ``-- >`` no longer ends the
+comment. Support abnormally ended empty comments ``<-->`` and ``<--->``.