]> git.ipfire.org Git - thirdparty/Python/cpython.git/commitdiff
gh-133890: Handle UnicodeEncodeError in tarfile (GH-134147)
authorSerhiy Storchaka <storchaka@gmail.com>
Sun, 18 May 2025 19:21:06 +0000 (22:21 +0300)
committerGitHub <noreply@github.com>
Sun, 18 May 2025 19:21:06 +0000 (22:21 +0300)
UnicodeEncodeError is now handled the same way as OSError during
TarFile member extraction.

Lib/tarfile.py
Lib/test/test_tarfile.py
Misc/NEWS.d/next/Library/2025-05-17-18-08-35.gh-issue-133890.onn9_X.rst [new file with mode: 0644]

index 13889d768021b19a014f1433af6256b5be49d81e..212b71f6509740d3a2567effaf69036ffcab3460 100644 (file)
@@ -2439,7 +2439,7 @@ class TarFile(object):
         unfiltered = tarinfo
         try:
             tarinfo = filter_function(tarinfo, path)
-        except (OSError, FilterError) as e:
+        except (OSError, UnicodeEncodeError, FilterError) as e:
             self._handle_fatal_error(e)
         except ExtractError as e:
             self._handle_nonfatal_error(e)
@@ -2460,7 +2460,7 @@ class TarFile(object):
             self._extract_member(tarinfo, os.path.join(path, tarinfo.name),
                                  set_attrs=set_attrs,
                                  numeric_owner=numeric_owner)
-        except OSError as e:
+        except (OSError, UnicodeEncodeError) as e:
             self._handle_fatal_error(e)
         except ExtractError as e:
             self._handle_nonfatal_error(e)
index 2d9649237a9382bdc8127f19b13abbd9a551af6b..2018a20afd1b18079a7379b8cacefc42bab2d196 100644 (file)
@@ -3490,11 +3490,12 @@ class ArchiveMaker:
         with t.open() as tar:
             ... # `tar` is now a TarFile with 'filename' in it!
     """
-    def __init__(self):
+    def __init__(self, **kwargs):
         self.bio = io.BytesIO()
+        self.tar_kwargs = dict(kwargs)
 
     def __enter__(self):
-        self.tar_w = tarfile.TarFile(mode='w', fileobj=self.bio)
+        self.tar_w = tarfile.TarFile(mode='w', fileobj=self.bio, **self.tar_kwargs)
         return self
 
     def __exit__(self, *exc):
@@ -4073,7 +4074,10 @@ class TestExtractionFilters(unittest.TestCase):
         # that in the test archive.)
         with tarfile.TarFile.open(tarname) as tar:
             for tarinfo in tar.getmembers():
-                filtered = tarfile.tar_filter(tarinfo, '')
+                try:
+                    filtered = tarfile.tar_filter(tarinfo, '')
+                except UnicodeEncodeError:
+                    continue
                 self.assertIs(filtered.name, tarinfo.name)
                 self.assertIs(filtered.type, tarinfo.type)
 
@@ -4084,11 +4088,48 @@ class TestExtractionFilters(unittest.TestCase):
             for tarinfo in tar.getmembers():
                 try:
                     filtered = tarfile.data_filter(tarinfo, '')
-                except tarfile.FilterError:
+                except (tarfile.FilterError, UnicodeEncodeError):
                     continue
                 self.assertIs(filtered.name, tarinfo.name)
                 self.assertIs(filtered.type, tarinfo.type)
 
+    @unittest.skipIf(sys.platform == 'win32', 'requires native bytes paths')
+    def test_filter_unencodable(self):
+        # Sanity check using a valid path.
+        tarinfo = tarfile.TarInfo(os_helper.TESTFN)
+        filtered = tarfile.tar_filter(tarinfo, '')
+        self.assertIs(filtered.name, tarinfo.name)
+        filtered = tarfile.data_filter(tarinfo, '')
+        self.assertIs(filtered.name, tarinfo.name)
+
+        tarinfo = tarfile.TarInfo('test\x00')
+        self.assertRaises(ValueError, tarfile.tar_filter, tarinfo, '')
+        self.assertRaises(ValueError, tarfile.data_filter, tarinfo, '')
+        tarinfo = tarfile.TarInfo('\ud800')
+        self.assertRaises(UnicodeEncodeError, tarfile.tar_filter, tarinfo, '')
+        self.assertRaises(UnicodeEncodeError, tarfile.data_filter, tarinfo, '')
+
+    @unittest.skipIf(sys.platform == 'win32', 'requires native bytes paths')
+    def test_extract_unencodable(self):
+        # Create a member with name \xed\xa0\x80 which is UTF-8 encoded
+        # lone surrogate \ud800.
+        with ArchiveMaker(encoding='ascii', errors='surrogateescape') as arc:
+            arc.add('\udced\udca0\udc80')
+        with os_helper.temp_cwd() as tmp:
+            tar = arc.open(encoding='utf-8', errors='surrogatepass',
+                           errorlevel=1)
+            self.assertEqual(tar.getnames(), ['\ud800'])
+            with self.assertRaises(UnicodeEncodeError):
+                tar.extractall()
+            self.assertEqual(os.listdir(), [])
+
+            tar = arc.open(encoding='utf-8', errors='surrogatepass',
+                           errorlevel=0, debug=1)
+            with support.captured_stderr() as stderr:
+                tar.extractall()
+            self.assertEqual(os.listdir(), [])
+            self.assertIn('tarfile: UnicodeEncodeError ', stderr.getvalue())
+
     def test_change_default_filter_on_instance(self):
         tar = tarfile.TarFile(tarname, 'r')
         def strict_filter(tarinfo, path):
diff --git a/Misc/NEWS.d/next/Library/2025-05-17-18-08-35.gh-issue-133890.onn9_X.rst b/Misc/NEWS.d/next/Library/2025-05-17-18-08-35.gh-issue-133890.onn9_X.rst
new file mode 100644 (file)
index 0000000..44565a5
--- /dev/null
@@ -0,0 +1,2 @@
+The :mod:`tarfile` module now handles :exc:`UnicodeEncodeError` in the same
+way as :exc:`OSError` when cannot extract a member.