From: Miss Islington (bot) <31488909+miss-islington@users.noreply.github.com> Date: Fri, 4 Jul 2025 07:26:03 +0000 (+0200) Subject: [3.14] gh-102555: Fix comment parsing in HTMLParser according to the HTML5 standard... X-Git-Tag: v3.14.0b4~36 X-Git-Url: http://git.ipfire.org/gitweb.cgi?a=commitdiff_plain;h=fa7e76e4ddbe92da1f1d98a24cc15d8306ef4e2e;p=thirdparty%2FPython%2Fcpython.git [3.14] gh-102555: Fix comment parsing in HTMLParser according to the HTML5 standard (GH-135664) (GH-136271) * "--!>" now ends the comment. * "-- >" no longer ends the comment. * Support abnormally ended empty comments "<-->" and "<--->". --------- (cherry picked from commit 8ac7613dc8b8f82253d7c0e2b6ef6ed703a0a1ee) Co-author: Kerim Kabirov Co-authored-by: Serhiy Storchaka Co-authored-by: Ezio Melotti --- diff --git a/Lib/html/parser.py b/Lib/html/parser.py index cc15de07b5ba..9b4f09599134 100644 --- a/Lib/html/parser.py +++ b/Lib/html/parser.py @@ -29,7 +29,8 @@ attr_charref = re.compile(r'&(#[0-9]+|#[xX][0-9a-fA-F]+|[a-zA-Z][a-zA-Z0-9]*)[;= starttagopen = re.compile('<[a-zA-Z]') endtagopen = re.compile('') -commentclose = re.compile(r'--\s*>') +commentclose = re.compile(r'--!?>') +commentabruptclose = re.compile(r'-?>') # Note: # 1) if you change tagfind/attrfind remember to update locatetagend too; # 2) if you change tagfind/attrfind and/or locatetagend the parser will @@ -336,6 +337,21 @@ class HTMLParser(_markupbase.ParserBase): else: return self.parse_bogus_comment(i) + # Internal -- parse comment, return length or -1 if not terminated + # see https://html.spec.whatwg.org/multipage/parsing.html#comment-start-state + def parse_comment(self, i, report=True): + rawdata = self.rawdata + assert rawdata.startswith('" '' '' + '' '' + # abrupt-closing-of-empty-comment + '' + '' '' '' - '') + '' + '' + '' + '' + '' + '' + '' + # nested-comment + ' -->' + '' + '' + ) expected = [('comment', " I'm a valid comment "), ('comment', 'me too!'), ('comment', '--'), + ('comment', '-'), + ('comment', ''), + ('comment', ''), ('comment', ''), ('comment', '--I have many hyphens--'), ('comment', ' I have a > in the middle '), - ('comment', ' and I have -- in the middle! ')] + ('comment', ' and I have -- in the middle! '), + ('comment', 'incorrectly-closed-comment'), + ('comment', ''), + ('comment', '--!'), + ('comment', '-- >'), + ('comment', '-!>'), + ('comment', '!>'), + ('comment', ' '), + ('comment', '`` now ends the comment. ``-- >`` no longer ends the +comment. Support abnormally ended empty comments ``<-->`` and ``<--->``.