piclose = re.compile('>')
commentclose = re.compile(r'--\s*>')
# Note:
-# 1) if you change tagfind/attrfind remember to update locatestarttagend too;
-# 2) if you change tagfind/attrfind and/or locatestarttagend the parser will
+# 1) if you change tagfind/attrfind remember to update locatetagend too;
+# 2) if you change tagfind/attrfind and/or locatetagend the parser will
# explode, so don't do it.
-# see http://www.w3.org/TR/html5/tokenization.html#tag-open-state
-# and http://www.w3.org/TR/html5/tokenization.html#tag-name-state
-tagfind_tolerant = re.compile(r'([a-zA-Z][^\t\n\r\f />\x00]*)(?:\s|/(?!>))*')
-attrfind_tolerant = re.compile(
- r'((?<=[\'"\s/])[^\s/>][^\s/=>]*)(\s*=+\s*'
- r'(\'[^\']*\'|"[^"]*"|(?![\'"])[^>\s]*))?(?:\s|/(?!>))*')
+# see the HTML5 specs section "13.2.5.6 Tag open state",
+# "13.2.5.8 Tag name state" and "13.2.5.33 Attribute name state".
+# https://html.spec.whatwg.org/multipage/parsing.html#tag-open-state
+# https://html.spec.whatwg.org/multipage/parsing.html#tag-name-state
+# https://html.spec.whatwg.org/multipage/parsing.html#attribute-name-state
+tagfind_tolerant = re.compile(r'([a-zA-Z][^\t\n\r\f />]*)(?:[\t\n\r\f ]|/(?!>))*')
+attrfind_tolerant = re.compile(r"""
+ (
+ (?<=['"\t\n\r\f /])[^\t\n\r\f />][^\t\n\r\f /=>]* # attribute name
+ )
+ (= # value indicator
+ ('[^']*' # LITA-enclosed value
+ |"[^"]*" # LIT-enclosed value
+ |(?!['"])[^>\t\n\r\f ]* # bare value
+ )
+ )?
+ (?:[\t\n\r\f ]|/(?!>))* # possibly followed by a space
+""", re.VERBOSE)
+locatetagend = re.compile(r"""
+ [a-zA-Z][^\t\n\r\f />]* # tag name
+ [\t\n\r\f /]* # optional whitespace before attribute name
+ (?:(?<=['"\t\n\r\f /])[^\t\n\r\f />][^\t\n\r\f /=>]* # attribute name
+ (?:= # value indicator
+ (?:'[^']*' # LITA-enclosed value
+ |"[^"]*" # LIT-enclosed value
+ |(?!['"])[^>\t\n\r\f ]* # bare value
+ )
+ )?
+ [\t\n\r\f /]* # possibly followed by a space
+ )*
+ >?
+""", re.VERBOSE)
+# The following variables are not used, but are temporarily left for
+# backward compatibility.
locatestarttagend_tolerant = re.compile(r"""
<[a-zA-Z][^\t\n\r\f />\x00]* # tag name
(?:[\s/]* # optional whitespace before attribute name
\s* # trailing whitespace
""", re.VERBOSE)
endendtag = re.compile('>')
-# the HTML 5 spec, section 8.1.2.2, doesn't allow spaces between
-# </ and the tag name, so maybe this should be fixed
endtagfind = re.compile(r'</\s*([a-zA-Z][-.a-zA-Z0-9:_]*)\s*>')
def set_cdata_mode(self, elem):
self.cdata_elem = elem.lower()
- self.interesting = re.compile(r'</\s*%s\s*>' % self.cdata_elem, re.I)
+ self.interesting = re.compile(r'</%s(?=[\t\n\r\f />])' % self.cdata_elem,
+ re.IGNORECASE|re.ASCII)
def clear_cdata_mode(self):
self.interesting = interesting_normal
# & near the end and see if it's followed by a space or ;.
amppos = rawdata.rfind('&', max(i, n-34))
if (amppos >= 0 and
- not re.compile(r'[\s;]').search(rawdata, amppos)):
+ not re.compile(r'[\t\n\r\f ;]').search(rawdata, amppos)):
break # wait till we get all the text
j = n
else:
else:
assert 0, "interesting.search() lied"
# end while
- if end and i < n and not self.cdata_elem:
+ if end and i < n:
if self.convert_charrefs and not self.cdata_elem:
self.handle_data(unescape(rawdata[i:n]))
else:
return self.parse_bogus_comment(i)
# Internal -- parse bogus comment, return length or -1 if not terminated
- # see http://www.w3.org/TR/html5/tokenization.html#bogus-comment-state
+ # see https://html.spec.whatwg.org/multipage/parsing.html#bogus-comment-state
def parse_bogus_comment(self, i, report=1):
rawdata = self.rawdata
assert rawdata[i:i+2] in ('<!', '</'), ('unexpected call to '
# Internal -- handle starttag, return end or -1 if not terminated
def parse_starttag(self, i):
+ # See the HTML5 specs section "13.2.5.8 Tag name state"
+ # https://html.spec.whatwg.org/multipage/parsing.html#tag-name-state
self.__starttag_text = None
endpos = self.check_for_whole_start_tag(i)
if endpos < 0:
# or -1 if incomplete.
def check_for_whole_start_tag(self, i):
rawdata = self.rawdata
- m = locatestarttagend_tolerant.match(rawdata, i)
- if m:
- j = m.end()
- next = rawdata[j:j+1]
- if next == ">":
- return j + 1
- if next == "/":
- if rawdata.startswith("/>", j):
- return j + 2
- if rawdata.startswith("/", j):
- # buffer boundary
- return -1
- # else bogus input
- if j > i:
- return j
- else:
- return i + 1
- if next == "":
- # end of input
- return -1
- if next in ("abcdefghijklmnopqrstuvwxyz=/"
- "ABCDEFGHIJKLMNOPQRSTUVWXYZ"):
- # end of input in or before attribute value, or we have the
- # '/' from a '/>' ending
- return -1
- if j > i:
- return j
- else:
- return i + 1
- raise AssertionError("we should not get here!")
+ match = locatetagend.match(rawdata, i+1)
+ assert match
+ j = match.end()
+ if rawdata[j-1] != ">":
+ return -1
+ return j
# Internal -- parse endtag, return end or -1 if incomplete
def parse_endtag(self, i):
+ # See the HTML5 specs section "13.2.5.7 End tag open state"
+ # https://html.spec.whatwg.org/multipage/parsing.html#end-tag-open-state
rawdata = self.rawdata
assert rawdata[i:i+2] == "</", "unexpected call to parse_endtag"
- match = endendtag.search(rawdata, i+1) # >
- if not match:
+ if rawdata.find('>', i+2) < 0: # fast check
return -1
- gtpos = match.end()
- match = endtagfind.match(rawdata, i) # </ + tag + >
- if not match:
- if self.cdata_elem is not None:
- self.handle_data(rawdata[i:gtpos])
- return gtpos
- # find the name: w3.org/TR/html5/tokenization.html#tag-name-state
- namematch = tagfind_tolerant.match(rawdata, i+2)
- if not namematch:
- # w3.org/TR/html5/tokenization.html#end-tag-open-state
- if rawdata[i:i+3] == '</>':
- return i+3
- else:
- return self.parse_bogus_comment(i)
- tagname = namematch.group(1).lower()
- # consume and ignore other stuff between the name and the >
- # Note: this is not 100% correct, since we might have things like
- # </tag attr=">">, but looking for > after the name should cover
- # most of the cases and is much simpler
- gtpos = rawdata.find('>', namematch.end())
- self.handle_endtag(tagname)
- return gtpos+1
+ if not endtagopen.match(rawdata, i): # </ + letter
+ if rawdata[i+2:i+3] == '>': # </> is ignored
+ # "missing-end-tag-name" parser error
+ return i+3
+ else:
+ return self.parse_bogus_comment(i)
- elem = match.group(1).lower() # script or style
- if self.cdata_elem is not None:
- if elem != self.cdata_elem:
- self.handle_data(rawdata[i:gtpos])
- return gtpos
+ match = locatetagend.match(rawdata, i+2)
+ assert match
+ j = match.end()
+ if rawdata[j-1] != ">":
+ return -1
- self.handle_endtag(elem)
+ # find the name: "13.2.5.8 Tag name state"
+ # https://html.spec.whatwg.org/multipage/parsing.html#tag-name-state
+ match = tagfind_tolerant.match(rawdata, i+2)
+ assert match
+ tag = match.group(1).lower()
+ self.handle_endtag(tag)
self.clear_cdata_mode()
- return gtpos
+ return j
# Overridable -- finish processing of start+end tag: <tag.../>
def handle_startendtag(self, tag, attrs):
self.fail('This should never be called with convert_charrefs=True')
+# The normal event collector normalizes the events in get_events,
+# so we override it to return the original list of events.
+class EventCollectorNoNormalize(EventCollector):
+ def get_events(self):
+ return self.events
+
+
class TestCaseBase(unittest.TestCase):
def get_collector(self):
("starttag", "foo:bar", [("one", "1"), ("two", "2")]),
("starttag_text", s)])
- def test_cdata_content(self):
- contents = [
+ @support.subTests('content', [
'<!-- not a comment --> ¬-an-entity-ref;',
"<not a='start tag'>",
'<a href="" /> <p> <span></span>',
'src="http://www.example.org/r=\'+new '
'Date().getTime()+\'"><\\/s\'+\'cript>\');\n//]]>'),
'\n<!-- //\nvar foo = 3.14;\n// -->\n',
- 'foo = "</sty" + "le>";',
'<!-- \u2603 -->',
- # these two should be invalid according to the HTML 5 spec,
- # section 8.1.2.2
- #'foo = </\nscript>',
- #'foo = </ script>',
- ]
- elements = ['script', 'style', 'SCRIPT', 'STYLE', 'Script', 'Style']
- for content in contents:
- for element in elements:
- element_lower = element.lower()
- s = '<{element}>{content}</{element}>'.format(element=element,
- content=content)
- self._run_check(s, [("starttag", element_lower, []),
- ("data", content),
- ("endtag", element_lower)])
-
- def test_cdata_with_closing_tags(self):
+ 'foo = "</ script>"',
+ 'foo = "</scripture>"',
+ 'foo = "</script\v>"',
+ 'foo = "</script\xa0>"',
+ 'foo = "</ſcript>"',
+ 'foo = "</scrıpt>"',
+ ])
+ def test_script_content(self, content):
+ s = f'<script>{content}</script>'
+ self._run_check(s, [("starttag", "script", []),
+ ("data", content),
+ ("endtag", "script")])
+
+ @support.subTests('content', [
+ 'a::before { content: "<!-- not a comment -->"; }',
+ 'a::before { content: "¬-an-entity-ref;"; }',
+ 'a::before { content: "<not a=\'start tag\'>"; }',
+ 'a::before { content: "\u2603"; }',
+ 'a::before { content: "< /style>"; }',
+ 'a::before { content: "</ style>"; }',
+ 'a::before { content: "</styled>"; }',
+ 'a::before { content: "</style\v>"; }',
+ 'a::before { content: "</style\xa0>"; }',
+ 'a::before { content: "</ſtyle>"; }',
+ ])
+ def test_style_content(self, content):
+ s = f'<style>{content}</style>'
+ self._run_check(s, [("starttag", "style", []),
+ ("data", content),
+ ("endtag", "style")])
+
+ @support.subTests('endtag', ['script', 'SCRIPT', 'script ', 'script\n',
+ 'script/', 'script foo=bar', 'script foo=">"'])
+ def test_script_closing_tag(self, endtag):
# see issue #13358
# make sure that HTMLParser calls handle_data only once for each CDATA.
- # The normal event collector normalizes the events in get_events,
- # so we override it to return the original list of events.
- class Collector(EventCollector):
- def get_events(self):
- return self.events
-
content = """<!-- not a comment --> ¬-an-entity-ref;
<a href="" /> </p><p> <span></span></style>
'</script' + '>'"""
- for element in [' script', 'script ', ' script ',
- '\nscript', 'script\n', '\nscript\n']:
- element_lower = element.lower().strip()
- s = '<script>{content}</{element}>'.format(element=element,
- content=content)
- self._run_check(s, [("starttag", element_lower, []),
- ("data", content),
- ("endtag", element_lower)],
- collector=Collector(convert_charrefs=False))
+ s = f'<ScrIPt>{content}</{endtag}>'
+ self._run_check(s, [("starttag", "script", []),
+ ("data", content),
+ ("endtag", "script")],
+ collector=EventCollectorNoNormalize(convert_charrefs=False))
+
+ @support.subTests('endtag', ['style', 'STYLE', 'style ', 'style\n',
+ 'style/', 'style foo=bar', 'style foo=">"'])
+ def test_style_closing_tag(self, endtag):
+ content = """
+ b::before { content: "<!-- not a comment -->"; }
+ p::before { content: "¬-an-entity-ref;"; }
+ a::before { content: "<i>"; }
+ a::after { content: "</i>"; }
+ """
+ s = f'<StyLE>{content}</{endtag}>'
+ self._run_check(s, [("starttag", "style", []),
+ ("data", content),
+ ("endtag", "style")],
+ collector=EventCollectorNoNormalize(convert_charrefs=False))
+
+ @support.subTests('tail,end', [
+ ('', False),
+ ('<', False),
+ ('</', False),
+ ('</s', False),
+ ('</script', False),
+ ('</script ', True),
+ ('</script foo=bar', True),
+ ('</script foo=">', True),
+ ])
+ def test_eof_in_script(self, tail, end):
+ content = "a = 123"
+ s = f'<ScrIPt>{content}{tail}'
+ self._run_check(s, [("starttag", "script", []),
+ ("data", content if end else content + tail)],
+ collector=EventCollectorNoNormalize(convert_charrefs=False))
def test_comments(self):
html = ("<!-- I'm a valid comment -->"
self._run_check("</$>", [('comment', '$')])
self._run_check("</", [('data', '</')])
self._run_check("</a", [])
- self._run_check("</ a>", [('endtag', 'a')])
+ self._run_check("</ a>", [('comment', ' a')])
self._run_check("</ a", [('comment', ' a')])
self._run_check("<a<a>", [('starttag', 'a<a', [])])
self._run_check("</a<a>", [('endtag', 'a<a')])
]
self._run_check(html, expected)
+ def test_slashes_in_endtag(self):
+ self._run_check('</a/>', [('endtag', 'a')])
+ self._run_check('</a foo="var"/>', [('endtag', 'a')])
+
def test_declaration_junk_chars(self):
self._run_check("<!DOCTYPE foo $ >", [('decl', 'DOCTYPE foo $ ')])
self._run_check(html, expected)
def test_broken_invalid_end_tag(self):
- # This is technically wrong (the "> shouldn't be included in the 'data')
- # but is probably not worth fixing it (in addition to all the cases of
- # the previous test, it would require a full attribute parsing).
- # see #13993
html = '<b>This</b attr=">"> confuses the parser'
expected = [('starttag', 'b', []),
('data', 'This'),
('endtag', 'b'),
- ('data', '"> confuses the parser')]
+ ('data', ' confuses the parser')]
self._run_check(html, expected)
def test_correct_detection_of_start_tags(self):
html = '<div style="", foo = "bar" ><b>The <a href="some_url">rain</a>'
expected = [
- ('starttag', 'div', [('style', ''), (',', None), ('foo', 'bar')]),
+ ('starttag', 'div', [('style', ''), (',', None), ('foo', None), ('=', None), ('"bar"', None)]),
('starttag', 'b', []),
('data', 'The '),
('starttag', 'a', [('href', 'some_url')]),
("starttag", "a", [("b", "v"), ("c", "v"), ("d", "v"), ("e", None)])
]
self._run_check("""<a b='v' c="v" d=v e>""", output)
- self._run_check("""<a b = 'v' c = "v" d = v e>""", output)
- self._run_check("""<a\nb\n=\n'v'\nc\n=\n"v"\nd\n=\nv\ne>""", output)
- self._run_check("""<a\tb\t=\t'v'\tc\t=\t"v"\td\t=\tv\te>""", output)
+ self._run_check("<a foo==bar>", [('starttag', 'a', [('foo', '=bar')])])
+ self._run_check("<a foo =bar>", [('starttag', 'a', [('foo', None), ('=bar', None)])])
+ self._run_check("<a foo\t=bar>", [('starttag', 'a', [('foo', None), ('=bar', None)])])
+ self._run_check("<a foo\v=bar>", [('starttag', 'a', [('foo\v', 'bar')])])
+ self._run_check("<a foo\xa0=bar>", [('starttag', 'a', [('foo\xa0', 'bar')])])
+ self._run_check("<a foo= bar>", [('starttag', 'a', [('foo', ''), ('bar', None)])])
+ self._run_check("<a foo=\tbar>", [('starttag', 'a', [('foo', ''), ('bar', None)])])
+ self._run_check("<a foo=\vbar>", [('starttag', 'a', [('foo', '\vbar')])])
+ self._run_check("<a foo=\xa0bar>", [('starttag', 'a', [('foo', '\xa0bar')])])
def test_attr_values(self):
self._run_check("""<a b='xxx\n\txxx' c="yyy\t\nyyy" d='\txyz\n'>""",
("d", "\txyz\n")])])
self._run_check("""<a b='' c="">""",
[("starttag", "a", [("b", ""), ("c", "")])])
+ self._run_check("<a b=\t c=\n>",
+ [("starttag", "a", [("b", ""), ("c", "")])])
+ self._run_check("<a b=\v c=\xa0>",
+ [("starttag", "a", [("b", "\v"), ("c", "\xa0")])])
# Regression test for SF patch #669683.
self._run_check("<e a=rgb(1,2,3)>",
[("starttag", "e", [("a", "rgb(1,2,3)")])])
('data', 'test - bad2'), ('endtag', 'a'),
('starttag', 'a', [('href', "test'\xa0style='color:red;bad3'")]),
('data', 'test - bad3'), ('endtag', 'a'),
- ('starttag', 'a', [('href', "test'\xa0style='color:red;bad4'")]),
+ ('starttag', 'a', [('href', None), ('=', None), ("test' style", 'color:red;bad4')]),
('data', 'test - bad4'), ('endtag', 'a')
]
self._run_check(html, expected)