bpo-39337: encodings.normalize_encoding() now ignores non-ASCII characters (GH-22219)

author Hai Shi <shihai1992@gmail.com>

Wed, 14 Oct 2020 15:43:31 +0000 (23:43 +0800)

committer GitHub <noreply@github.com>

Wed, 14 Oct 2020 15:43:31 +0000 (17:43 +0200)
author Hai Shi <shihai1992@gmail.com>
Wed, 14 Oct 2020 15:43:31 +0000 (23:43 +0800)
committer GitHub <noreply@github.com>
Wed, 14 Oct 2020 15:43:31 +0000 (17:43 +0200)
diff --git a/Doc/whatsnew/3.10.rst b/Doc/whatsnew/3.10.rst

index c8ddcd2d24296e58f84d2feb62ac594ab5859039..738ef974e7867b3e12874f998765c85129977aa2 100644 (file)
--- a/Doc/whatsnew/3.10.rst
+++ b/Doc/whatsnew/3.10.rst
@@ -186,6 +186,11 @@ by :func:`curses.color_content`, :func:`curses.init_color`,
  support is provided by the underlying ncurses library.
  (Contributed by Jeffrey Kintscher and Hans Petter Jansson in :issue:`36982`.)
  
+encodings
+---------
+:func:`encodings.normalize_encoding` now ignores non-ASCII characters.
+(Contributed by Hai Shi in :issue:`39337`.)
+
  glob
  ----
  
diff --git a/Lib/encodings/__init__.py b/Lib/encodings/__init__.py

index ddd5afdcf2dab021c59d09e7fed67f952ce76129..4b37d3321c9033d86a459c05f66aaa133fbb34d9 100644 (file)
--- a/Lib/encodings/__init__.py
+++ b/Lib/encodings/__init__.py
@@ -61,7 +61,8 @@ def normalize_encoding(encoding):
          if c.isalnum() or c == '.':
              if punct and chars:
                  chars.append('_')
-            chars.append(c)
+            if c.isascii():
+                chars.append(c)
              punct = False
          else:
              punct = True
diff --git a/Lib/test/test_codecs.py b/Lib/test/test_codecs.py

index ddf4e08af6247a3984a150c7935c805796c7b2d7..09ceef76eb098da9f3cf6429fc35fd569b3f082d 100644 (file)
--- a/Lib/test/test_codecs.py
+++ b/Lib/test/test_codecs.py
@@ -3417,7 +3417,7 @@ class Rot13UtilTest(unittest.TestCase):
  
  class CodecNameNormalizationTest(unittest.TestCase):
      """Test codec name normalization"""
-    def test_normalized_encoding(self):
+    def test_codecs_lookup(self):
          FOUND = (1, 2, 3, 4)
          NOT_FOUND = (None, None, None, None)
          def search_function(encoding):
@@ -3439,6 +3439,18 @@ class CodecNameNormalizationTest(unittest.TestCase):
          self.assertEqual(NOT_FOUND, codecs.lookup('BBB.8'))
          self.assertEqual(NOT_FOUND, codecs.lookup('a\xe9\u20ac-8'))
  
+    def test_encodings_normalize_encoding(self):
+        # encodings.normalize_encoding() ignores non-ASCII characters.
+        normalize = encodings.normalize_encoding
+        self.assertEqual(normalize('utf_8'), 'utf_8')
+        self.assertEqual(normalize('utf\xE9\u20AC\U0010ffff-8'), 'utf_8')
+        self.assertEqual(normalize('utf   8'), 'utf_8')
+        # encodings.normalize_encoding() doesn't convert
+        # characters to lower case.
+        self.assertEqual(normalize('UTF 8'), 'UTF_8')
+        self.assertEqual(normalize('utf.8'), 'utf.8')
+        self.assertEqual(normalize('utf...8'), 'utf...8')
+
  
  if __name__ == "__main__":
      unittest.main()
diff --git a/Misc/NEWS.d/next/Library/2020-09-13-02-02-18.bpo-39337.L3NXTt.rst b/Misc/NEWS.d/next/Library/2020-09-13-02-02-18.bpo-39337.L3NXTt.rst

new file mode 100644 (file)

index 0000000..c2b4dbe
--- /dev/null
+++ b/Misc/NEWS.d/next/Library/2020-09-13-02-02-18.bpo-39337.L3NXTt.rst
@@ -0,0 +1 @@
+:func:`encodings.normalize_encoding` now ignores non-ASCII characters.
author	Hai Shi <shihai1992@gmail.com>
	Wed, 14 Oct 2020 15:43:31 +0000 (23:43 +0800)
committer	GitHub <noreply@github.com>
	Wed, 14 Oct 2020 15:43:31 +0000 (17:43 +0200)
Doc/whatsnew/3.10.rst		patch | blob | blame | history
Lib/encodings/__init__.py		patch | blob | blame | history
Lib/test/test_codecs.py		patch | blob | blame | history
Misc/NEWS.d/next/Library/2020-09-13-02-02-18.bpo-39337.L3NXTt.rst	[new file with mode: 0644]	patch | blob