From 3937c78e36648193ec0e91ca1d10b39680d7c657 Mon Sep 17 00:00:00 2001 From: "Miss Islington (bot)" <31488909+miss-islington@users.noreply.github.com> Date: Fri, 9 May 2025 08:43:21 +0200 Subject: [PATCH] [3.14] gh-69426: HTMLParser: only unescape properly terminated character entities in attribute values (GH-95215) (GH-133704) MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit According to the HTML5 spec, named character references in attribute values should only be processed if they are not followed by an ASCII alphanumeric, or an equals sign. (cherry picked from commit 77b14a6d58e527f915966446eb0866652a46feb5) https://html.spec.whatwg.org/multipage/parsing.html#named-character-reference-state Co-authored-by: Sascha Ißbrücker --- Lib/html/parser.py | 20 ++++++++- Lib/test/test_htmlparser.py | 43 +++++++++++++++---- ...2-07-24-20-56-32.gh-issue-69426.unccw7.rst | 3 ++ 3 files changed, 57 insertions(+), 9 deletions(-) create mode 100644 Misc/NEWS.d/next/Library/2022-07-24-20-56-32.gh-issue-69426.unccw7.rst diff --git a/Lib/html/parser.py b/Lib/html/parser.py index 13c95c34e505..0a1dd3b7d3bf 100644 --- a/Lib/html/parser.py +++ b/Lib/html/parser.py @@ -12,6 +12,7 @@ import re import _markupbase from html import unescape +from html.entities import html5 as html5_entities __all__ = ['HTMLParser'] @@ -23,6 +24,7 @@ incomplete = re.compile('&[a-zA-Z#]') entityref = re.compile('&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]') charref = re.compile('&#(?:[0-9]+|[xX][0-9a-fA-F]+)[^0-9a-fA-F]') +attr_charref = re.compile(r'&(#[0-9]+|#[xX][0-9a-fA-F]+|[a-zA-Z][a-zA-Z0-9]*)[;=]?') starttagopen = re.compile('<[a-zA-Z]') piclose = re.compile('>') @@ -57,6 +59,22 @@ endendtag = re.compile('>') # ') +# Character reference processing logic specific to attribute values +# See: https://html.spec.whatwg.org/multipage/parsing.html#named-character-reference-state +def _replace_attr_charref(match): + ref = match.group(0) + # Numeric / hex char refs must 
always be unescaped + if ref.startswith('&#'): + return unescape(ref) + # Named character / entity references must only be unescaped + # if they are an exact match, and they are not followed by an equals sign + if not ref.endswith('=') and ref[1:] in html5_entities: + return unescape(ref) + # Otherwise do not unescape + return ref + +def _unescape_attrvalue(s): + return attr_charref.sub(_replace_attr_charref, s) class HTMLParser(_markupbase.ParserBase): @@ -323,7 +341,7 @@ class HTMLParser(_markupbase.ParserBase): attrvalue[:1] == '"' == attrvalue[-1:]: attrvalue = attrvalue[1:-1] if attrvalue: - attrvalue = unescape(attrvalue) + attrvalue = _unescape_attrvalue(attrvalue) attrs.append((attrname.lower(), attrvalue)) k = m.end() diff --git a/Lib/test/test_htmlparser.py b/Lib/test/test_htmlparser.py index b42a611c62c0..4fdba06cf4cc 100644 --- a/Lib/test/test_htmlparser.py +++ b/Lib/test/test_htmlparser.py @@ -348,18 +348,16 @@ text collector = lambda: EventCollectorCharrefs() self.assertTrue(collector().convert_charrefs) charrefs = ['"', '"', '"', '"', '"', '"'] - # check charrefs in the middle of the text/attributes - expected = [('starttag', 'a', [('href', 'foo"zar')]), - ('data', 'a"z'), ('endtag', 'a')] + # check charrefs in the middle of the text + expected = [('starttag', 'a', []), ('data', 'a"z'), ('endtag', 'a')] for charref in charrefs: - self._run_check('a{0}z'.format(charref), + self._run_check('a{0}z'.format(charref), expected, collector=collector()) - # check charrefs at the beginning/end of the text/attributes - expected = [('data', '"'), - ('starttag', 'a', [('x', '"'), ('y', '"X'), ('z', 'X"')]), + # check charrefs at the beginning/end of the text + expected = [('data', '"'), ('starttag', 'a', []), ('data', '"'), ('endtag', 'a'), ('data', '"')] for charref in charrefs: - self._run_check('{0}' + self._run_check('{0}' '{0}{0}'.format(charref), expected, collector=collector()) # check charrefs in