This module defines a class :class:`HTMLParser` which serves as the basis for
parsing text files formatted in HTML (HyperText Mark-up Language) and XHTML.
-.. class:: HTMLParser(*, convert_charrefs=True)
+.. class:: HTMLParser(*, convert_charrefs=True, scripting=False)
Create a parser instance able to parse invalid markup.
- If *convert_charrefs* is ``True`` (the default), all character
- references (except the ones in ``script``/``style`` elements) are
+ If *convert_charrefs* is true (the default), all character
+ references (except the ones in elements like ``script`` and ``style``) are
automatically converted to the corresponding Unicode characters.
+ If *scripting* is false (the default), the content of the ``noscript``
+ element is parsed normally; if it's true, it's returned as is without
+ being parsed.
+
An :class:`.HTMLParser` instance is fed HTML data and calls handler methods
when start tags, end tags, text, comments, and other markup elements are
encountered. The user should subclass :class:`.HTMLParser` and override its
.. versionchanged:: 3.5
The default value for argument *convert_charrefs* is now ``True``.
+ .. versionchanged:: 3.14.1
+ Added the *scripting* parameter.
+
Example HTML Parser Application
-------------------------------
.. method:: HTMLParser.handle_data(data)
This method is called to process arbitrary data (e.g. text nodes and the
- content of ``<script>...</script>`` and ``<style>...</style>``).
+ content of elements like ``script`` and ``style``).
.. method:: HTMLParser.handle_entityref(name)
This method is called to process a named character reference of the form
``&name;`` (e.g. ``&gt;``), where *name* is a general entity reference
- (e.g. ``'gt'``). This method is never called if *convert_charrefs* is
- ``True``.
+ (e.g. ``'gt'``).
+ This method is only called if *convert_charrefs* is false.
.. method:: HTMLParser.handle_charref(name)
This method is called to process decimal and hexadecimal numeric character
references of the form :samp:`&#{NNN};` and :samp:`&#x{NNN};`. For example, the decimal
equivalent for ``&gt;`` is ``&#62;``, whereas the hexadecimal is ``&#x3E;``;
- in this case the method will receive ``'62'`` or ``'x3E'``. This method
- is never called if *convert_charrefs* is ``True``.
+ in this case the method will receive ``'62'`` or ``'x3E'``.
+ This method is only called if *convert_charrefs* is false.
.. method:: HTMLParser.handle_comment(data)
Data : Python
End tag : h1
-The content of ``script`` and ``style`` elements is returned as is, without
-further parsing:
+The content of elements like ``script`` and ``style`` is returned as is,
+without further parsing:
.. doctest::
End tag : style
>>> parser.feed('<script type="text/javascript">'
- ... 'alert("<strong>hello!</strong>");</script>')
+ ... 'alert("<strong>hello! &#9786;</strong>");</script>')
Start tag: script
attr: ('type', 'text/javascript')
- Data : alert("<strong>hello!</strong>");
+ Data : alert("<strong>hello! &#9786;</strong>");
End tag : script
Parsing comments:
Feeding incomplete chunks to :meth:`~HTMLParser.feed` works, but
:meth:`~HTMLParser.handle_data` might be called more than once
-(unless *convert_charrefs* is set to ``True``):
+if *convert_charrefs* is false:
.. doctest::
argument.
"""
- CDATA_CONTENT_ELEMENTS = ("script", "style")
+ # See the HTML5 specs section "13.4 Parsing HTML fragments".
+ # https://html.spec.whatwg.org/multipage/parsing.html#parsing-html-fragments
+ # CDATA_CONTENT_ELEMENTS are parsed in RAWTEXT mode
+ CDATA_CONTENT_ELEMENTS = ("script", "style", "xmp", "iframe", "noembed", "noframes")
RCDATA_CONTENT_ELEMENTS = ("textarea", "title")
- def __init__(self, *, convert_charrefs=True):
+ def __init__(self, *, convert_charrefs=True, scripting=False):
"""Initialize and reset this instance.
- If convert_charrefs is True (the default), all character references
+ If convert_charrefs is true (the default), all character references
are automatically converted to the corresponding Unicode characters.
+
+ If *scripting* is false (the default), the content of the
+ ``noscript`` element is parsed normally; if it's true,
+ it's returned as is without being parsed.
"""
super().__init__()
self.convert_charrefs = convert_charrefs
+ self.scripting = scripting
self.reset()
def reset(self):
def set_cdata_mode(self, elem, *, escapable=False):
self.cdata_elem = elem.lower()
self._escapable = escapable
- if escapable and not self.convert_charrefs:
+ if self.cdata_elem == 'plaintext':
+ self.interesting = re.compile(r'\z')
+ elif escapable and not self.convert_charrefs:
self.interesting = re.compile(r'&|</%s(?=[\t\n\r\f />])' % self.cdata_elem,
re.IGNORECASE|re.ASCII)
else:
self.handle_startendtag(tag, attrs)
else:
self.handle_starttag(tag, attrs)
- if tag in self.CDATA_CONTENT_ELEMENTS:
- self.set_cdata_mode(tag)
+ if (tag in self.CDATA_CONTENT_ELEMENTS or
+ (self.scripting and tag == "noscript") or
+ tag == "plaintext"):
+ self.set_cdata_mode(tag, escapable=False)
elif tag in self.RCDATA_CONTENT_ELEMENTS:
self.set_cdata_mode(tag, escapable=True)
return endpos
from test import support
+SAMPLE_RCDATA = (
+ '<!-- not a comment -->'
+ "<not a='start tag'>"
+ '<![CDATA[not a cdata]]>'
+ '<!not a bogus comment>'
+ '</not a bogus comment>'
+ '\u2603'
+)
+
+SAMPLE_RAWTEXT = SAMPLE_RCDATA + '&amp;&#9786;'
+
+
class EventCollector(html.parser.HTMLParser):
def __init__(self, *args, autocdata=False, **kw):
'Date().getTime()+\'"><\\/s\'+\'cript>\');\n//]]>'),
'\n<!-- //\nvar foo = 3.14;\n// -->\n',
'<!-- \u2603 -->',
- 'foo = "</ script>"',
- 'foo = "</scripture>"',
- 'foo = "</script\v>"',
- 'foo = "</script\xa0>"',
- 'foo = "</ſcript>"',
- 'foo = "</scrıpt>"',
])
def test_script_content(self, content):
s = f'<script>{content}</script>'
- self._run_check(s, [("starttag", "script", []),
- ("data", content),
- ("endtag", "script")])
+ self._run_check(s, [
+ ("starttag", "script", []),
+ ("data", content),
+ ("endtag", "script"),
+ ])
@support.subTests('content', [
'a::before { content: "<!-- not a comment -->"; }',
'a::before { content: "&not-an-entity-ref;"; }',
'a::before { content: "<not a=\'start tag\'>"; }',
'a::before { content: "\u2603"; }',
- 'a::before { content: "< /style>"; }',
- 'a::before { content: "</ style>"; }',
- 'a::before { content: "</styled>"; }',
- 'a::before { content: "</style\v>"; }',
- 'a::before { content: "</style\xa0>"; }',
- 'a::before { content: "</ſtyle>"; }',
])
def test_style_content(self, content):
s = f'<style>{content}</style>'
("data", content),
("endtag", "style")])
- @support.subTests('content', [
- '<!-- not a comment -->',
- "<not a='start tag'>",
- '<![CDATA[not a cdata]]>',
- '<!not a bogus comment>',
- '</not a bogus comment>',
- '\u2603',
- '< /title>',
- '</ title>',
- '</titled>',
- '</title\v>',
- '</title\xa0>',
- '</tıtle>',
+ @support.subTests('tag', ['title', 'textarea'])
+ def test_rcdata_content(self, tag):
+ source = f"<{tag}>{SAMPLE_RCDATA}</{tag}>"
+ self._run_check(source, [
+ ("starttag", tag, []),
+ ("data", SAMPLE_RCDATA),
+ ("endtag", tag),
])
- def test_title_content(self, content):
- source = f"<title>{content}</title>"
+ source = f"<{tag}>&amp;</{tag}>"
self._run_check(source, [
- ("starttag", "title", []),
- ("data", content),
- ("endtag", "title"),
+ ("starttag", tag, []),
+ ('entityref', 'amp'),
+ ("endtag", tag),
])
- @support.subTests('content', [
- '<!-- not a comment -->',
- "<not a='start tag'>",
- '<![CDATA[not a cdata]]>',
- '<!not a bogus comment>',
- '</not a bogus comment>',
- '\u2603',
- '< /textarea>',
- '</ textarea>',
- '</textareable>',
- '</textarea\v>',
- '</textarea\xa0>',
+ @support.subTests('tag',
+ ['style', 'xmp', 'iframe', 'noembed', 'noframes', 'script'])
+ def test_rawtext_content(self, tag):
+ source = f"<{tag}>{SAMPLE_RAWTEXT}</{tag}>"
+ self._run_check(source, [
+ ("starttag", tag, []),
+ ("data", SAMPLE_RAWTEXT),
+ ("endtag", tag),
+ ])
+
+ def test_noscript_content(self):
+ source = f"<noscript>{SAMPLE_RAWTEXT}</noscript>"
+ # scripting=False -- normal mode
+ self._run_check(source, [
+ ('starttag', 'noscript', []),
+ ('comment', ' not a comment '),
+ ('starttag', 'not', [('a', 'start tag')]),
+ ('unknown decl', 'CDATA[not a cdata'),
+ ('comment', 'not a bogus comment'),
+ ('endtag', 'not'),
+ ('data', '☃'),
+ ('entityref', 'amp'),
+ ('charref', '9786'),
+ ('endtag', 'noscript'),
])
- def test_textarea_content(self, content):
- source = f"<textarea>{content}</textarea>"
+ # scripting=True -- RAWTEXT mode
+ self._run_check(source, [
+ ("starttag", "noscript", []),
+ ("data", SAMPLE_RAWTEXT),
+ ("endtag", "noscript"),
+ ], collector=EventCollector(scripting=True))
+
+ def test_plaintext_content(self):
+ content = SAMPLE_RAWTEXT + '</plaintext>' # not closing
+ source = f"<plaintext>{content}"
self._run_check(source, [
- ("starttag", "textarea", []),
+ ("starttag", "plaintext", []),
("data", content),
- ("endtag", "textarea"),
])
@support.subTests('endtag', ['script', 'SCRIPT', 'script ', 'script\n',
("endtag", "script")],
collector=EventCollectorNoNormalize(convert_charrefs=False))
- @support.subTests('endtag', ['style', 'STYLE', 'style ', 'style\n',
- 'style/', 'style foo=bar', 'style foo=">"'])
- def test_style_closing_tag(self, endtag):
- content = """
- b::before { content: "<!-- not a comment -->"; }
- p::before { content: "&not-an-entity-ref;"; }
- a::before { content: "<i>"; }
- a::after { content: "</i>"; }
- """
- s = f'<StyLE>{content}</{endtag}>'
- self._run_check(s, [("starttag", "style", []),
- ("data", content),
- ("endtag", "style")],
- collector=EventCollectorNoNormalize(convert_charrefs=False))
-
- @support.subTests('endtag', ['title', 'TITLE', 'title ', 'title\n',
- 'title/', 'title foo=bar', 'title foo=">"'])
- def test_title_closing_tag(self, endtag):
- content = "<!-- not a comment --><i>Egg &amp; Spam</i>"
- s = f'<TitLe>{content}</{endtag}>'
- self._run_check(s, [("starttag", "title", []),
- ('data', '<!-- not a comment --><i>Egg & Spam</i>'),
- ("endtag", "title")],
- collector=EventCollectorNoNormalize(convert_charrefs=True))
- self._run_check(s, [("starttag", "title", []),
- ('data', '<!-- not a comment --><i>Egg '),
- ('entityref', 'amp'),
- ('data', ' Spam</i>'),
- ("endtag", "title")],
- collector=EventCollectorNoNormalize(convert_charrefs=False))
-
- @support.subTests('endtag', ['textarea', 'TEXTAREA', 'textarea ', 'textarea\n',
- 'textarea/', 'textarea foo=bar', 'textarea foo=">"'])
- def test_textarea_closing_tag(self, endtag):
- content = "<!-- not a comment --><i>Egg &amp; Spam</i>"
- s = f'<TexTarEa>{content}</{endtag}>'
- self._run_check(s, [("starttag", "textarea", []),
- ('data', '<!-- not a comment --><i>Egg & Spam</i>'),
- ("endtag", "textarea")],
- collector=EventCollectorNoNormalize(convert_charrefs=True))
- self._run_check(s, [("starttag", "textarea", []),
- ('data', '<!-- not a comment --><i>Egg '),
- ('entityref', 'amp'),
- ('data', ' Spam</i>'),
- ("endtag", "textarea")],
- collector=EventCollectorNoNormalize(convert_charrefs=False))
+ @support.subTests('tag', [
+ 'script', 'style', 'xmp', 'iframe', 'noembed', 'noframes',
+ 'textarea', 'title', 'noscript',
+ ])
+ def test_closing_tag(self, tag):
+ for endtag in [tag, tag.upper(), f'{tag} ', f'{tag}\n',
+ f'{tag}/', f'{tag} foo=bar', f'{tag} foo=">"']:
+ content = "<!-- not a comment --><i>Spam</i>"
+ s = f'<{tag.upper()}>{content}</{endtag}>'
+ self._run_check(s, [
+ ("starttag", tag, []),
+ ('data', content),
+ ("endtag", tag),
+ ], collector=EventCollectorNoNormalize(convert_charrefs=False, scripting=True))
+
+ @support.subTests('tag', [
+ 'script', 'style', 'xmp', 'iframe', 'noembed', 'noframes',
+ 'textarea', 'title', 'noscript',
+ ])
+ def test_invalid_closing_tag(self, tag):
+ content = (
+ f'< /{tag}>'
+ f'</ {tag}>'
+ f'</{tag}x>'
+ f'</{tag}\v>'
+ f'</{tag}\xa0>'
+ )
+ source = f"<{tag}>{content}</{tag}>"
+ self._run_check(source, [
+ ("starttag", tag, []),
+ ("data", content),
+ ("endtag", tag),
+ ], collector=EventCollector(convert_charrefs=False, scripting=True))
+
+ @support.subTests('tag,endtag', [
+ ('title', 'tıtle'),
+ ('style', 'ſtyle'),
+ ('style', 'ﬅyle'),
+ ('style', 'ﬆyle'),
+ ('iframe', 'ıframe'),
+ ('noframes', 'noframeſ'),
+ ('noscript', 'noſcript'),
+ ('noscript', 'noscrıpt'),
+ ('script', 'ſcript'),
+ ('script', 'scrıpt'),
+ ])
+ def test_invalid_nonascii_closing_tag(self, tag, endtag):
+ content = f"<br></{endtag}>"
+ source = f"<{tag}>{content}"
+ self._run_check(source, [
+ ("starttag", tag, []),
+ ("data", content),
+ ], collector=EventCollector(convert_charrefs=False, scripting=True))
+ source = f"<{tag}>{content}</{tag}>"
+ self._run_check(source, [
+ ("starttag", tag, []),
+ ("data", content),
+ ("endtag", tag),
+ ], collector=EventCollector(convert_charrefs=False, scripting=True))
@support.subTests('tail,end', [
('', False),