* "--!>" now ends the comment.
* "-- >" no longer ends the comment.
* Support abnormally ended empty comments "<-->" and "<--->".
---------
Co-author: Kerim Kabirov <the.privat33r+gh@pm.me>
Co-authored-by: Ezio Melotti <ezio.melotti@gmail.com>
starttagopen = re.compile('<[a-zA-Z]')
endtagopen = re.compile('</[a-zA-Z]')
piclose = re.compile('>')
-commentclose = re.compile(r'--\s*>')
+commentclose = re.compile(r'--!?>')
+commentabruptclose = re.compile(r'-?>')
# Note:
# 1) if you change tagfind/attrfind remember to update locatetagend too;
# 2) if you change tagfind/attrfind and/or locatetagend the parser will
else:
return self.parse_bogus_comment(i)
+ # Internal -- parse comment, return length or -1 if not terminated
+ # see https://html.spec.whatwg.org/multipage/parsing.html#comment-start-state
+ def parse_comment(self, i, report=True):
+ rawdata = self.rawdata
+ assert rawdata.startswith('<!--', i), 'unexpected call to parse_comment()'
+ match = commentclose.search(rawdata, i+4)
+ if not match:
+ match = commentabruptclose.match(rawdata, i+4)
+ if not match:
+ return -1
+ if report:
+ j = match.start()
+ self.handle_comment(rawdata[i+4: j])
+ return match.end()
+
# Internal -- parse bogus comment, return length or -1 if not terminated
# see https://html.spec.whatwg.org/multipage/parsing.html#bogus-comment-state
def parse_bogus_comment(self, i, report=1):
html = ("<!-- I'm a valid comment -->"
'<!--me too!-->'
'<!------>'
+ '<!----->'
'<!---->'
+ # abrupt-closing-of-empty-comment
+ '<!--->'
+ '<!-->'
'<!----I have many hyphens---->'
'<!-- I have a > in the middle -->'
- '<!-- and I have -- in the middle! -->')
+ '<!-- and I have -- in the middle! -->'
+ '<!--incorrectly-closed-comment--!>'
+ '<!----!>'
+ '<!----!-->'
+ '<!---- >-->'
+ '<!---!>-->'
+ '<!--!>-->'
+ # nested-comment
+ '<!-- <!-- nested --> -->'
+ '<!--<!-->'
+ '<!--<!--!>'
+ )
expected = [('comment', " I'm a valid comment "),
('comment', 'me too!'),
('comment', '--'),
+ ('comment', '-'),
+ ('comment', ''),
+ ('comment', ''),
('comment', ''),
('comment', '--I have many hyphens--'),
('comment', ' I have a > in the middle '),
- ('comment', ' and I have -- in the middle! ')]
+ ('comment', ' and I have -- in the middle! '),
+ ('comment', 'incorrectly-closed-comment'),
+ ('comment', ''),
+ ('comment', '--!'),
+ ('comment', '-- >'),
+ ('comment', '-!>'),
+ ('comment', '!>'),
+ ('comment', ' <!-- nested '), ('data', ' -->'),
+ ('comment', '<!'),
+ ('comment', '<!'),
+ ]
self._run_check(html, expected)
def test_condcoms(self):
--- /dev/null
+Fix comment parsing in :class:`html.parser.HTMLParser` according to the
+HTML5 standard. ``--!>`` now ends the comment. ``-- >`` no longer ends the
+comment. Support abnormally ended empty comments ``<-->`` and ``<--->``.