From dee650189497735edbc08a54edabb5b06ef1bd09 Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka Date: Mon, 21 Jul 2025 13:07:15 +0300 Subject: [PATCH] gh-135661: Fix parsing attributes with whitespaces around the "=" separator in HTMLParser (GH-136908) This fixes a regression introduced in GH-135930. --- Lib/html/parser.py | 4 +-- Lib/test/test_htmlparser.py | 28 +++++++++++-------- ...-06-25-14-13-39.gh-issue-135661.idjQ0B.rst | 5 ---- 3 files changed, 18 insertions(+), 19 deletions(-) diff --git a/Lib/html/parser.py b/Lib/html/parser.py index 9b4f09599134..7eea885cfe63 100644 --- a/Lib/html/parser.py +++ b/Lib/html/parser.py @@ -45,7 +45,7 @@ attrfind_tolerant = re.compile(r""" ( (?<=['"\t\n\r\f /])[^\t\n\r\f />][^\t\n\r\f /=>]* # attribute name ) - (= # value indicator + ([\t\n\r\f ]*=[\t\n\r\f ]* # value indicator ('[^']*' # LITA-enclosed value |"[^"]*" # LIT-enclosed value |(?!['"])[^>\t\n\r\f ]* # bare value @@ -57,7 +57,7 @@ locatetagend = re.compile(r""" [a-zA-Z][^\t\n\r\f />]* # tag name [\t\n\r\f /]* # optional whitespace before attribute name (?:(?<=['"\t\n\r\f /])[^\t\n\r\f />][^\t\n\r\f /=>]* # attribute name - (?:= # value indicator + (?:[\t\n\r\f ]*=[\t\n\r\f ]* # value indicator (?:'[^']*' # LITA-enclosed value |"[^"]*" # LIT-enclosed value |(?!['"])[^>\t\n\r\f ]* # bare value diff --git a/Lib/test/test_htmlparser.py b/Lib/test/test_htmlparser.py index 15cad061889a..47c0752fb517 100644 --- a/Lib/test/test_htmlparser.py +++ b/Lib/test/test_htmlparser.py @@ -623,7 +623,7 @@ text html = '
The rain' expected = [ - ('starttag', 'div', [('style', ''), (',', None), ('foo', None), ('=', None), ('"bar"', None)]), + ('starttag', 'div', [('style', ''), (',', None), ('foo', 'bar')]), ('starttag', 'b', []), ('data', 'The '), ('starttag', 'a', [('href', 'some_url')]), @@ -813,12 +813,12 @@ class AttributesTestCase(TestCaseBase): ] self._run_check("""""", output) self._run_check("", [('starttag', 'a', [('foo', '=bar')])]) - self._run_check("", [('starttag', 'a', [('foo', None), ('=bar', None)])]) - self._run_check("", [('starttag', 'a', [('foo', None), ('=bar', None)])]) + self._run_check("", [('starttag', 'a', [('foo', 'bar')])]) + self._run_check("", [('starttag', 'a', [('foo', 'bar')])]) self._run_check("", [('starttag', 'a', [('foo\v', 'bar')])]) self._run_check("", [('starttag', 'a', [('foo\xa0', 'bar')])]) - self._run_check("", [('starttag', 'a', [('foo', ''), ('bar', None)])]) - self._run_check("", [('starttag', 'a', [('foo', ''), ('bar', None)])]) + self._run_check("", [('starttag', 'a', [('foo', 'bar')])]) + self._run_check("", [('starttag', 'a', [('foo', 'bar')])]) self._run_check("", [('starttag', 'a', [('foo', '\vbar')])]) self._run_check("", [('starttag', 'a', [('foo', '\xa0bar')])]) @@ -829,8 +829,8 @@ class AttributesTestCase(TestCaseBase): ("d", "\txyz\n")])]) self._run_check("""""", [("starttag", "a", [("b", ""), ("c", "")])]) - self._run_check("", - [("starttag", "a", [("b", ""), ("c", "")])]) + self._run_check("", + [('starttag', 'a', [('b', 'x'), ('c', 'y')])]) self._run_check("", [("starttag", "a", [("b", "\v"), ("c", "\xa0")])]) # Regression test for SF patch #669683. @@ -899,13 +899,17 @@ class AttributesTestCase(TestCaseBase): ) expected = [ ('starttag', 'a', [('href', "test'style='color:red;bad1'")]), - ('data', 'test - bad1'), ('endtag', 'a'), + ('data', 'test - bad1'), + ('endtag', 'a'), ('starttag', 'a', [('href', "test'+style='color:red;ba2'")]), - ('data', 'test - bad2'), ('endtag', 'a'), + ('data', 'test - bad2'), + ('endtag', 'a'), ('starttag', 'a', [('href', "test'\xa0style='color:red;bad3'")]), - ('data', 'test - bad3'), ('endtag', 'a'), - ('starttag', 'a', [('href', None), ('=', None), ("test' style", 'color:red;bad4')]), - ('data', 'test - bad4'), ('endtag', 'a') + ('data', 'test - bad3'), + ('endtag', 'a'), + ('starttag', 'a', [('href', "test'\xa0style='color:red;bad4'")]), + ('data', 'test - bad4'), + ('endtag', 'a'), ] self._run_check(html, expected) diff --git a/Misc/NEWS.d/next/Security/2025-06-25-14-13-39.gh-issue-135661.idjQ0B.rst b/Misc/NEWS.d/next/Security/2025-06-25-14-13-39.gh-issue-135661.idjQ0B.rst index b6f9e104e440..27e886abdb58 100644 --- a/Misc/NEWS.d/next/Security/2025-06-25-14-13-39.gh-issue-135661.idjQ0B.rst +++ b/Misc/NEWS.d/next/Security/2025-06-25-14-13-39.gh-issue-135661.idjQ0B.rst @@ -18,8 +18,3 @@ according to the HTML5 standard. * Multiple ``=`` between attribute name and value are no longer collapsed. E.g. ```` produces attribute "foo" with value "=bar". - -* Whitespaces between the ``=`` separator and attribute name or value are no - longer ignored. E.g. ```` produces two attributes "foo" and - "=bar", both with value None; ```` produces two attributes: - "foo" with value "" and "bar" with value None. -- 2.47.2