#15156: HTMLParser now uses the new "html.entities.html5" dictionary.

author Ezio Melotti <ezio.melotti@gmail.com>
Sun, 24 Jun 2012 20:02:56 +0000 (22:02 +0200)
committer Ezio Melotti <ezio.melotti@gmail.com>
Sun, 24 Jun 2012 20:02:56 +0000 (22:02 +0200)
diff --git a/Doc/library/html.entities.rst b/Doc/library/html.entities.rst

index f0dd7aae8fe34150ef422b5eb2a0fad7ec37c01f..65ce817b3e14acdb932cb349c5ed1d781122c632 100644 (file)
--- a/Doc/library/html.entities.rst
+++ b/Doc/library/html.entities.rst
@@ -11,10 +11,6 @@
  
  This module defines four dictionaries, :data:`html5`,
  :data:`name2codepoint`, :data:`codepoint2name`, and :data:`entitydefs`.
-:data:`entitydefs` is used to provide the :attr:`entitydefs`
-attribute of the :class:`html.parser.HTMLParser` class.  The definition provided
-here contains all the entities defined by XHTML 1.0 that can be handled using
-simple textual substitution in the Latin-1 character set (ISO-8859-1).
  
  
  .. data:: html5
diff --git a/Lib/html/parser.py b/Lib/html/parser.py

index 494cf24fd879eea47c77cff0a01dee53b631285c..f8ac82834a3e200fe707e818b1260e2417bcf660 100644 (file)
--- a/Lib/html/parser.py
+++ b/Lib/html/parser.py
@@ -500,7 +500,6 @@ class HTMLParser(_markupbase.ParserBase):
              self.error("unknown declaration: %r" % (data,))
  
      # Internal -- helper to remove special character quoting
-    entitydefs = None
      def unescape(self, s):
          if '&' not in s:
              return s
@@ -510,24 +509,23 @@ class HTMLParser(_markupbase.ParserBase):
                  if s[0] == "#":
                      s = s[1:]
                      if s[0] in ['x','X']:
-                        c = int(s[1:], 16)
+                        c = int(s[1:].rstrip(';'), 16)
                      else:
-                        c = int(s)
+                        c = int(s.rstrip(';'))
                      return chr(c)
              except ValueError:
-                return '&#'+ s +';'
+                return '&#' + s
              else:
-                # Cannot use name2codepoint directly, because HTMLParser
-                # supports apos, which is not part of HTML 4
-                import html.entities
-                if HTMLParser.entitydefs is None:
-                    entitydefs = HTMLParser.entitydefs = {'apos':"'"}
-                    for k, v in html.entities.name2codepoint.items():
-                        entitydefs[k] = chr(v)
-                try:
-                    return self.entitydefs[s]
-                except KeyError:
-                    return '&'+s+';'
-
-        return re.sub(r"&(#?[xX]?(?:[0-9a-fA-F]+|\w{1,8}));",
+                from html.entities import html5
+                if s in html5:
+                    return html5[s]
+                elif s.endswith(';'):
+                    return '&' + s
+                for x in range(2, len(s)):
+                    if s[:x] in html5:
+                        return html5[s[:x]] + s[x:]
+                else:
+                    return '&' + s
+
+        return re.sub(r"&(#?[xX]?(?:[0-9a-fA-F]+;|\w{1,32};?))",
                        replaceEntities, s, flags=re.ASCII)
diff --git a/Lib/test/test_htmlparser.py b/Lib/test/test_htmlparser.py

index 64a4f5dfb4547887224b8bec271d3df5b5206206..c5d878dca597809e156b95d8316a71d6b4a68e9f 100644 (file)
--- a/Lib/test/test_htmlparser.py
+++ b/Lib/test/test_htmlparser.py
@@ -456,7 +456,7 @@ class HTMLParserTolerantTestCase(HTMLParserStrictTestCase):
          self._run_check('<form action="/xxx.php?a=1&amp;b=2&amp", '
                          'method="post">', [
                              ('starttag', 'form',
-                                [('action', '/xxx.php?a=1&b=2&amp'),
+                                [('action', '/xxx.php?a=1&b=2&'),
                                   (',', None), ('method', 'post')])])
  
      def test_weird_chars_in_unquoted_attribute_values(self):
@@ -541,6 +541,11 @@ class HTMLParserTolerantTestCase(HTMLParserStrictTestCase):
          self.assertEqual(p.unescape('&#0038;'),'&')
          # see #12888
          self.assertEqual(p.unescape('&#123; ' * 1050), '{ ' * 1050)
+        # see #15156
+        self.assertEqual(p.unescape('&Eacuteric&Eacute;ric'
+                                    '&alphacentauri&alpha;centauri'),
+                                    'ÉricÉric&alphacentauriαcentauri')
+        self.assertEqual(p.unescape('&co;'), '&co;')
  
      def test_broken_comments(self):
          html = ('<! not really a comment >'
diff --git a/Misc/NEWS b/Misc/NEWS

index 0ccdce58f93e01724bc77f27197946c7cbe2a88e..da574b0bf219aa804a492db33a9b0995e89785cc 100644 (file)
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -76,6 +76,8 @@ Library
    It is used automatically on platforms supporting the necessary os.openat()
    and os.unlinkat() functions. Main code by Martin von Löwis.
  
+- Issue #15156: HTMLParser now uses the new "html.entities.html5" dictionary.
+
  - Issue #11113: add a new "html5" dictionary containing the named character
    references defined by the HTML5 standard and the equivalent Unicode
    character(s) to the html.entities module.
author	Ezio Melotti <ezio.melotti@gmail.com>
	Sun, 24 Jun 2012 20:02:56 +0000 (22:02 +0200)
committer	Ezio Melotti <ezio.melotti@gmail.com>
	Sun, 24 Jun 2012 20:02:56 +0000 (22:02 +0200)
Doc/library/html.entities.rst		patch | blob | blame | history
Lib/html/parser.py		patch | blob | blame | history
Lib/test/test_htmlparser.py		patch | blob | blame | history
Misc/NEWS		patch | blob | blame | history