From: Ezio Melotti Date: Tue, 5 Apr 2011 17:40:52 +0000 (+0300) Subject: #7311: fix HTMLParser to accept non-ASCII attribute values. X-Git-Tag: v2.7.2rc1~186 X-Git-Url: http://git.ipfire.org/gitweb.cgi?a=commitdiff_plain;h=9f1ffb2ae932f5eef1bcf1317a0e3d8f4bad0e0c;p=thirdparty%2FPython%2Fcpython.git #7311: fix HTMLParser to accept non-ASCII attribute values. --- diff --git a/Lib/HTMLParser.py b/Lib/HTMLParser.py index 4fdc09aa7631..e0189011d115 100644 --- a/Lib/HTMLParser.py +++ b/Lib/HTMLParser.py @@ -26,7 +26,7 @@ commentclose = re.compile(r'--\s*>') tagfind = re.compile('[a-zA-Z][-.a-zA-Z0-9:_]*') attrfind = re.compile( r'\s*([a-zA-Z_][-.:a-zA-Z_0-9]*)(\s*=\s*' - r'(\'[^\']*\'|"[^"]*"|[-a-zA-Z0-9./,:;+*%?!&$\(\)_#=~@]*))?') + r'(\'[^\']*\'|"[^"]*"|[^\s"\'=<>`]*))?') locatestarttagend = re.compile(r""" <[a-zA-Z][-.a-zA-Z0-9:_]* # tag name diff --git a/Lib/test/test_htmlparser.py b/Lib/test/test_htmlparser.py index 717585ca5b9b..0620d0bdc2a9 100644 --- a/Lib/test/test_htmlparser.py +++ b/Lib/test/test_htmlparser.py @@ -208,6 +208,23 @@ DOCTYPE html [ ("starttag", "a", [("href", "mailto:xyz@example.com")]), ]) + def test_attr_nonascii(self): + # see issue 7311 + self._run_check(u"<img src=/foo/bar.png alt=\u4e2d\u6587>", [ + ("starttag", "img", [("src", "/foo/bar.png"), + ("alt", u"\u4e2d\u6587")]), + ]) + self._run_check(u"<a title='\u30c6\u30b9\u30c8' " + u"href='\u30c6\u30b9\u30c8.html'>", [ + ("starttag", "a", [("title", u"\u30c6\u30b9\u30c8"), + ("href", u"\u30c6\u30b9\u30c8.html")]), + ]) + self._run_check(u'<a title="\u30c6\u30b9\u30c8" ' + u'href="\u30c6\u30b9\u30c8.html">', [ + ("starttag", "a", [("title", u"\u30c6\u30b9\u30c8"), + ("href", u"\u30c6\u30b9\u30c8.html")]), + ]) + def test_attr_entity_replacement(self): self._run_check("""<a b='&amp;&gt;&lt;&quot;&apos;'>""", [ ("starttag", "a", [("b", "&><\"'")]), diff --git a/Misc/NEWS b/Misc/NEWS index 2c444a98b7fc..fe4605fafac3 100644 --- a/Misc/NEWS +++ b/Misc/NEWS @@ -47,6 +47,8 @@ Core and Builtins Library ------- +- Issue #7311: fix HTMLParser to accept non-ASCII attribute values. + - Issue #10963: Ensure that subprocess.communicate() never raises EPIPE. 
- Issue #11662: Make urllib and urllib2 ignore redirections if the