import _markupbase
from html import unescape
+from html.entities import html5 as html5_entities
__all__ = ['HTMLParser']
entityref = re.compile('&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]')
charref = re.compile('&#(?:[0-9]+|[xX][0-9a-fA-F]+)[^0-9a-fA-F]')
+# Matches one candidate character reference inside an attribute value:
+# '&', then a decimal ref, a hex ref or an alphanumeric name, then an
+# optional ';' or '=' (the '=' is captured so the replacer can see it).
+attr_charref = re.compile(r'&(#[0-9]+|#[xX][0-9a-fA-F]+|[a-zA-Z][a-zA-Z0-9]*)[;=]?')
starttagopen = re.compile('<[a-zA-Z]')
piclose = re.compile('>')
# </ and the tag name, so maybe this should be fixed
endtagfind = re.compile(r'</\s*([a-zA-Z][-.a-zA-Z0-9:_]*)\s*>')
+# Character reference processing logic specific to attribute values
+# See: https://html.spec.whatwg.org/multipage/parsing.html#named-character-reference-state
+def _replace_attr_charref(match):
+    """Return the replacement text for one character-reference match.
+
+    *match* is a regex match object whose whole text (group 0) starts
+    with '&'.  The text is passed through unescape() when it is a
+    numeric/hex reference, or when it does not end with '=' and the part
+    after the '&' is a key of html5_entities.  Anything else is returned
+    exactly as it was matched.
+    """
+    text = match.group(0)
+    # Numeric and hex references are decoded unconditionally.
+    numeric = text.startswith('&#')
+    # Named references need an exact table hit and no trailing '='.
+    if numeric or (not text.endswith('=') and text[1:] in html5_entities):
+        return unescape(text)
+    # Unknown name, or a name followed by '=': leave the text alone.
+    return text
+
+def _unescape_attrvalue(s):
+    """Return attribute value *s* with its character references decoded.
+
+    Each attr_charref match in *s* is replaced by whatever
+    _replace_attr_charref returns for it; all other text is kept.
+    """
+    return re.sub(attr_charref, _replace_attr_charref, s)
class HTMLParser(_markupbase.ParserBase):
attrvalue[:1] == '"' == attrvalue[-1:]:
attrvalue = attrvalue[1:-1]
if attrvalue:
- attrvalue = unescape(attrvalue)
+ attrvalue = _unescape_attrvalue(attrvalue)
attrs.append((attrname.lower(), attrvalue))
k = m.end()
collector = lambda: EventCollectorCharrefs()
self.assertTrue(collector().convert_charrefs)
charrefs = ['"', '"', '"', '"', '"', '"']
- # check charrefs in the middle of the text/attributes
- expected = [('starttag', 'a', [('href', 'foo"zar')]),
- ('data', 'a"z'), ('endtag', 'a')]
+ # check charrefs in the middle of the text
+ expected = [('starttag', 'a', []), ('data', 'a"z'), ('endtag', 'a')]
for charref in charrefs:
- self._run_check('<a href="foo{0}zar">a{0}z</a>'.format(charref),
+ self._run_check('<a>a{0}z</a>'.format(charref),
expected, collector=collector())
- # check charrefs at the beginning/end of the text/attributes
- expected = [('data', '"'),
- ('starttag', 'a', [('x', '"'), ('y', '"X'), ('z', 'X"')]),
+ # check charrefs at the beginning/end of the text
+ expected = [('data', '"'), ('starttag', 'a', []),
('data', '"'), ('endtag', 'a'), ('data', '"')]
for charref in charrefs:
- self._run_check('{0}<a x="{0}" y="{0}X" z="X{0}">'
+ self._run_check('{0}<a>'
'{0}</a>{0}'.format(charref),
expected, collector=collector())
# check charrefs in <script>/<style> elements
self._run_check('no charrefs here', [('data', 'no charrefs here')],
collector=collector())
+    def test_convert_charrefs_in_attribute_values(self):
+        # Checks which character references get decoded inside attribute
+        # values.  The inputs must be spelled as references ('&cent;',
+        # '&#162;', ...), never as the already-decoded character, or the
+        # test exercises nothing.
+
+        # default value for convert_charrefs is now True
+        collector = lambda: EventCollectorCharrefs()
+        self.assertTrue(collector().convert_charrefs)
+
+        # always unescape terminated entity refs, numeric and hex char refs:
+        # - regardless whether they are at start, middle, end of attribute
+        # - or followed by alphanumeric, non-alphanumeric, or equals char
+        charrefs = ['&cent;', '&#162;', '&#xa2;', '&#162', '&#xa2']
+        expected = [('starttag', 'a',
+                     [('x', '¢'), ('x', 'z¢'), ('x', '¢z'),
+                      ('x', 'z¢z'), ('x', '¢ z'), ('x', '¢=z')]),
+                    ('endtag', 'a')]
+        for charref in charrefs:
+            self._run_check('<a x="{0}" x="z{0}" x="{0}z" '
+                            ' x="z{0}z" x="{0} z" x="{0}=z"></a>'
+                            .format(charref), expected, collector=collector())
+
+        # only unescape unterminated entity matches if they are not followed by
+        # an alphanumeric or an equals sign
+        charref = '&cent'
+        expected = [('starttag', 'a',
+                     [('x', '¢'), ('x', 'z¢'), ('x', '&centz'),
+                      ('x', 'z&centz'), ('x', '¢ z'), ('x', '&cent=z')]),
+                    ('endtag', 'a')]
+        self._run_check('<a x="{0}" x="z{0}" x="{0}z" '
+                        ' x="z{0}z" x="{0} z" x="{0}=z"></a>'
+                        .format(charref), expected, collector=collector())
+
# the remaining tests were for the "tolerant" parser (which is now
# the default), and check various kind of broken markup
def test_tolerant_parsing(self):