"""
# Tags after which the start-tag handler calls set_cdata_mode(tag), i.e.
# with the default escapable=False.
CDATA_CONTENT_ELEMENTS = ("script", "style")
# Tags after which the start-tag handler calls
# set_cdata_mode(tag, escapable=True), so character references inside
# the element are still recognised.
RCDATA_CONTENT_ELEMENTS = ("textarea", "title")
def __init__(self, *, convert_charrefs=True):
"""Initialize and reset this instance.
self.lasttag = '???'
self.interesting = interesting_normal
self.cdata_elem = None
+ self._escapable = True
super().reset()
def feed(self, data):
"""Return full source of start tag: '<...>'."""
return self.__starttag_text
def set_cdata_mode(self, elem, *, escapable=False):
    """Start scanning the content of *elem* for its end tag only.

    Stores the lower-cased element name in ``self.cdata_elem``, records
    *escapable* in ``self._escapable`` and replaces ``self.interesting``
    with a pattern that matches the element's own closing tag
    (case-insensitively, followed by whitespace, ``/`` or ``>``).

    elem      -- tag name of the element whose content is being entered.
    escapable -- keyword-only; true when character references inside the
                 content should still be recognised.
    """
    self.cdata_elem = elem.lower()
    self._escapable = escapable
    # Only the element's own end tag terminates the content.
    pattern = r'</%s(?=[\t\n\r\f />])' % self.cdata_elem
    if escapable and not self.convert_charrefs:
        # When references are not converted in bulk, the scanner also has
        # to stop at '&' so each reference can be handled on its own.
        pattern = '&|' + pattern
    self.interesting = re.compile(pattern, re.IGNORECASE|re.ASCII)
def clear_cdata_mode(self):
    """Undo set_cdata_mode(): restore the default scanning state.

    Resets ``self.interesting`` to ``interesting_normal``, clears
    ``self.cdata_elem`` and sets ``self._escapable`` back to True.
    """
    self.interesting = interesting_normal
    self.cdata_elem = None
    self._escapable = True
# Internal -- handle data as far as reasonable. May leave state
# and data to be processed by a subsequent call. If 'end' is
break
j = n
if i < j:
- if self.convert_charrefs and not self.cdata_elem:
+ if self.convert_charrefs and self._escapable:
self.handle_data(unescape(rawdata[i:j]))
else:
self.handle_data(rawdata[i:j])
assert 0, "interesting.search() lied"
# end while
if end and i < n:
- if self.convert_charrefs and not self.cdata_elem:
+ if self.convert_charrefs and self._escapable:
self.handle_data(unescape(rawdata[i:n]))
else:
self.handle_data(rawdata[i:n])
self.handle_starttag(tag, attrs)
if tag in self.CDATA_CONTENT_ELEMENTS:
self.set_cdata_mode(tag)
+ elif tag in self.RCDATA_CONTENT_ELEMENTS:
+ self.set_cdata_mode(tag, escapable=True)
return endpos
# Internal -- check to see if we have a complete starttag; return end
("data", content),
("endtag", "style")])
@support.subTests('content', [
    '<!-- not a comment -->',
    "<not a='start tag'>",
    '<![CDATA[not a cdata]]>',
    '<!not a bogus comment>',
    '</not a bogus comment>',
    '\u2603',
    '< /title>',
    '</ title>',
    '</titled>',
    '</title\v>',
    '</title\xa0>',
    '</tıtle>',
])
def test_title_content(self, content):
    """Markup-looking text inside <title> is reported as plain data.

    Each *content* sample (comment-like, tag-like, CDATA-like, and
    near-miss end tags) must come back as a single data event between
    the title start tag and end tag.
    """
    source = f"<title>{content}</title>"
    self._run_check(source, [
        ("starttag", "title", []),
        ("data", content),
        ("endtag", "title"),
    ])
+
@support.subTests('content', [
    '<!-- not a comment -->',
    "<not a='start tag'>",
    '<![CDATA[not a cdata]]>',
    '<!not a bogus comment>',
    '</not a bogus comment>',
    '\u2603',
    '< /textarea>',
    '</ textarea>',
    '</textareable>',
    '</textarea\v>',
    '</textarea\xa0>',
])
def test_textarea_content(self, content):
    """Markup-looking text inside <textarea> is reported as plain data.

    Each *content* sample (comment-like, tag-like, CDATA-like, and
    near-miss end tags) must come back as a single data event between
    the textarea start tag and end tag.
    """
    source = f"<textarea>{content}</textarea>"
    self._run_check(source, [
        ("starttag", "textarea", []),
        ("data", content),
        ("endtag", "textarea"),
    ])
+
@support.subTests('endtag', ['script', 'SCRIPT', 'script ', 'script\n',
'script/', 'script foo=bar', 'script foo=">"'])
def test_script_closing_tag(self, endtag):
("endtag", "style")],
collector=EventCollectorNoNormalize(convert_charrefs=False))
@support.subTests('endtag', ['title', 'TITLE', 'title ', 'title\n',
                             'title/', 'title foo=bar', 'title foo=">"'])
def test_title_closing_tag(self, endtag):
    """Every accepted spelling of the </title> end tag closes the element.

    The content is checked twice: with convert_charrefs=True it arrives
    as one data event; with convert_charrefs=False the '&' is reported
    as a separate entityref event that splits the data in two.
    """
    content = "<!-- not a comment --><i>Egg & Spam</i>"
    s = f'<TitLe>{content}</{endtag}>'
    self._run_check(s, [("starttag", "title", []),
                        ('data', content),
                        ("endtag", "title")],
                    collector=EventCollectorNoNormalize(convert_charrefs=True))
    self._run_check(s, [("starttag", "title", []),
                        ('data', '<!-- not a comment --><i>Egg '),
                        ('entityref', 'amp'),
                        ('data', ' Spam</i>'),
                        ("endtag", "title")],
                    collector=EventCollectorNoNormalize(convert_charrefs=False))
+
@support.subTests('endtag', ['textarea', 'TEXTAREA', 'textarea ', 'textarea\n',
                             'textarea/', 'textarea foo=bar', 'textarea foo=">"'])
def test_textarea_closing_tag(self, endtag):
    """Every accepted spelling of the </textarea> end tag closes the element.

    The content is checked twice: with convert_charrefs=True it arrives
    as one data event; with convert_charrefs=False the '&' is reported
    as a separate entityref event that splits the data in two.
    """
    content = "<!-- not a comment --><i>Egg & Spam</i>"
    s = f'<TexTarEa>{content}</{endtag}>'
    self._run_check(s, [("starttag", "textarea", []),
                        ('data', content),
                        ("endtag", "textarea")],
                    collector=EventCollectorNoNormalize(convert_charrefs=True))
    self._run_check(s, [("starttag", "textarea", []),
                        ('data', '<!-- not a comment --><i>Egg '),
                        ('entityref', 'amp'),
                        ('data', ' Spam</i>'),
                        ("endtag", "textarea")],
                    collector=EventCollectorNoNormalize(convert_charrefs=False))
+
@support.subTests('tail,end', [
('', False),
('<', False),
("data", content if end else content + tail)],
collector=EventCollectorNoNormalize(convert_charrefs=False))
@support.subTests('tail,end', [
    ('', False),
    ('<', False),
    ('</', False),
    ('</t', False),
    ('</title', False),
    ('</title ', True),
    ('</title foo=bar', True),
    ('</title foo=">', True),
])
def test_eof_in_title(self, tail, end):
    """Input that ends inside <title> never yields an endtag event.

    *tail* is what follows the text before EOF.  When *end* is false the
    tail is expected back as part of the data; when *end* is true the
    tail is expected to be dropped from the data.
    """
    s = f'<TitLe>Egg & Spam{tail}'
    # Text expected after the ampersand: the tail is kept only when it
    # was not treated as the beginning of an end tag.
    kept_tail = '' if end else tail
    self._run_check(s, [("starttag", "title", []),
                        ("data", "Egg & Spam" + kept_tail)],
                    collector=EventCollectorNoNormalize(convert_charrefs=True))
    self._run_check(s, [("starttag", "title", []),
                        ('data', 'Egg '),
                        ('entityref', 'amp'),
                        ('data', ' Spam' + kept_tail)],
                    collector=EventCollectorNoNormalize(convert_charrefs=False))
+
def test_comments(self):
html = ("<!-- I'm a valid comment -->"
'<!--me too!-->'