]> git.ipfire.org Git - thirdparty/Python/cpython.git/commitdiff
gh-103200: Fix performance issues with `zipimport.invalidate_caches()` (GH-103208)
authorDesmond Cheong <desmondcheongzx@gmail.com>
Fri, 7 Jul 2023 22:02:13 +0000 (15:02 -0700)
committerGitHub <noreply@github.com>
Fri, 7 Jul 2023 22:02:13 +0000 (22:02 +0000)
Co-authored-by: blurb-it[bot] <43283697+blurb-it[bot]@users.noreply.github.com>
Co-authored-by: Brett Cannon <brett@python.org>
Lib/test/test_zipimport.py
Lib/zipimport.py
Misc/NEWS.d/next/Library/2023-04-03-08-09-40.gh-issue-103200.lq1Etz.rst [new file with mode: 0644]

index 14c19719e260c4cf1a1b65d8dd574c92df74a634..c12798d221e9b763bf9227740c6c08a5f232bdae 100644 (file)
@@ -520,10 +520,10 @@ class UncompressedZipImportTestCase(ImportHooksBaseTestCase):
                 z.writestr(zinfo, data)
 
         zi = zipimport.zipimporter(TEMP_ZIP)
-        self.assertEqual(zi._files.keys(), files.keys())
+        self.assertEqual(zi._get_files().keys(), files.keys())
         # Check that the file information remains accurate after reloading
         zi.invalidate_caches()
-        self.assertEqual(zi._files.keys(), files.keys())
+        self.assertEqual(zi._get_files().keys(), files.keys())
         # Add a new file to the ZIP archive
         newfile = {"spam2" + pyc_ext: (NOW, test_pyc)}
         files.update(newfile)
@@ -535,17 +535,54 @@ class UncompressedZipImportTestCase(ImportHooksBaseTestCase):
                 z.writestr(zinfo, data)
         # Check that we can detect the new file after invalidating the cache
         zi.invalidate_caches()
-        self.assertEqual(zi._files.keys(), files.keys())
+        self.assertEqual(zi._get_files().keys(), files.keys())
         spec = zi.find_spec('spam2')
         self.assertIsNotNone(spec)
         self.assertIsInstance(spec.loader, zipimport.zipimporter)
         # Check that the cached data is removed if the file is deleted
         os.remove(TEMP_ZIP)
         zi.invalidate_caches()
-        self.assertFalse(zi._files)
+        self.assertFalse(zi._get_files())
         self.assertIsNone(zipimport._zip_directory_cache.get(zi.archive))
         self.assertIsNone(zi.find_spec("name_does_not_matter"))
 
+    def testInvalidateCachesWithMultipleZipimports(self):
+        packdir = TESTPACK + os.sep
+        packdir2 = packdir + TESTPACK2 + os.sep
+        files = {packdir + "__init__" + pyc_ext: (NOW, test_pyc),
+                 packdir2 + "__init__" + pyc_ext: (NOW, test_pyc),
+                 packdir2 + TESTMOD + pyc_ext: (NOW, test_pyc),
+                 "spam" + pyc_ext: (NOW, test_pyc)}
+        self.addCleanup(os_helper.unlink, TEMP_ZIP)
+        with ZipFile(TEMP_ZIP, "w") as z:
+            for name, (mtime, data) in files.items():
+                zinfo = ZipInfo(name, time.localtime(mtime))
+                zinfo.compress_type = self.compression
+                zinfo.comment = b"spam"
+                z.writestr(zinfo, data)
+
+        zi = zipimport.zipimporter(TEMP_ZIP)
+        self.assertEqual(zi._get_files().keys(), files.keys())
+        # Zipimporter for the same path.
+        zi2 = zipimport.zipimporter(TEMP_ZIP)
+        self.assertEqual(zi2._get_files().keys(), files.keys())
+        # Add a new file to the ZIP archive to make the cache wrong.
+        newfile = {"spam2" + pyc_ext: (NOW, test_pyc)}
+        files.update(newfile)
+        with ZipFile(TEMP_ZIP, "a") as z:
+            for name, (mtime, data) in newfile.items():
+                zinfo = ZipInfo(name, time.localtime(mtime))
+                zinfo.compress_type = self.compression
+                zinfo.comment = b"spam"
+                z.writestr(zinfo, data)
+        # Invalidate the cache of the first zipimporter.
+        zi.invalidate_caches()
+        # Check that the second zipimporter detects the new file and isn't using a stale cache.
+        self.assertEqual(zi2._get_files().keys(), files.keys())
+        spec = zi2.find_spec('spam2')
+        self.assertIsNotNone(spec)
+        self.assertIsInstance(spec.loader, zipimport.zipimporter)
+
     def testZipImporterMethodsInSubDirectory(self):
         packdir = TESTPACK + os.sep
         packdir2 = packdir + TESTPACK2 + os.sep
index a7333a4c4906dea86b88208dffa37becb6b30a45..5b9f614f02f7af01de9a183e8d81e209ef8e0039 100644 (file)
@@ -88,12 +88,8 @@ class zipimporter(_bootstrap_external._LoaderBasics):
                     raise ZipImportError('not a Zip file', path=path)
                 break
 
-        try:
-            files = _zip_directory_cache[path]
-        except KeyError:
-            files = _read_directory(path)
-            _zip_directory_cache[path] = files
-        self._files = files
+        if path not in _zip_directory_cache:
+            _zip_directory_cache[path] = _read_directory(path)
         self.archive = path
         # a prefix directory following the ZIP file path.
         self.prefix = _bootstrap_external._path_join(*prefix[::-1])
@@ -152,7 +148,7 @@ class zipimporter(_bootstrap_external._LoaderBasics):
             key = pathname[len(self.archive + path_sep):]
 
         try:
-            toc_entry = self._files[key]
+            toc_entry = self._get_files()[key]
         except KeyError:
             raise OSError(0, '', key)
         return _get_data(self.archive, toc_entry)
@@ -189,7 +185,7 @@ class zipimporter(_bootstrap_external._LoaderBasics):
             fullpath = f'{path}.py'
 
         try:
-            toc_entry = self._files[fullpath]
+            toc_entry = self._get_files()[fullpath]
         except KeyError:
             # we have the module, but no source
             return None
@@ -268,14 +264,22 @@ class zipimporter(_bootstrap_external._LoaderBasics):
         return ZipReader(self, fullname)
 
 
-    def invalidate_caches(self):
-        """Reload the file data of the archive path."""
+    def _get_files(self):
+        """Return the files within the archive path."""
         try:
-            self._files = _read_directory(self.archive)
-            _zip_directory_cache[self.archive] = self._files
-        except ZipImportError:
-            _zip_directory_cache.pop(self.archive, None)
-            self._files = {}
+            files = _zip_directory_cache[self.archive]
+        except KeyError:
+            try:
+                files = _zip_directory_cache[self.archive] = _read_directory(self.archive)
+            except ZipImportError:
+                files = {}
+
+        return files
+
+
+    def invalidate_caches(self):
+        """Invalidates the cache of file data of the archive path."""
+        _zip_directory_cache.pop(self.archive, None)
 
 
     def __repr__(self):
@@ -305,15 +309,15 @@ def _is_dir(self, path):
     # of a namespace package. We test by seeing if the name, with an
     # appended path separator, exists.
     dirpath = path + path_sep
-    # If dirpath is present in self._files, we have a directory.
-    return dirpath in self._files
+    # If dirpath is present in self._get_files(), we have a directory.
+    return dirpath in self._get_files()
 
 # Return some information about a module.
 def _get_module_info(self, fullname):
     path = _get_module_path(self, fullname)
     for suffix, isbytecode, ispackage in _zip_searchorder:
         fullpath = path + suffix
-        if fullpath in self._files:
+        if fullpath in self._get_files():
             return ispackage
     return None
 
@@ -656,7 +660,7 @@ def _get_mtime_and_size_of_source(self, path):
         # strip 'c' or 'o' from *.py[co]
         assert path[-1:] in ('c', 'o')
         path = path[:-1]
-        toc_entry = self._files[path]
+        toc_entry = self._get_files()[path]
         # fetch the time stamp of the .py file for comparison
         # with an embedded pyc time stamp
         time = toc_entry[5]
@@ -676,7 +680,7 @@ def _get_pyc_source(self, path):
     path = path[:-1]
 
     try:
-        toc_entry = self._files[path]
+        toc_entry = self._get_files()[path]
     except KeyError:
         return None
     else:
@@ -692,7 +696,7 @@ def _get_module_code(self, fullname):
         fullpath = path + suffix
         _bootstrap._verbose_message('trying {}{}{}', self.archive, path_sep, fullpath, verbosity=2)
         try:
-            toc_entry = self._files[fullpath]
+            toc_entry = self._get_files()[fullpath]
         except KeyError:
             pass
         else:
diff --git a/Misc/NEWS.d/next/Library/2023-04-03-08-09-40.gh-issue-103200.lq1Etz.rst b/Misc/NEWS.d/next/Library/2023-04-03-08-09-40.gh-issue-103200.lq1Etz.rst
new file mode 100644 (file)
index 0000000..e264e0c
--- /dev/null
@@ -0,0 +1 @@
+Fix cache repopulation semantics of zipimport.invalidate_caches(). The cache is now repopulated upon retrieving files with an invalid cache, not when the cache is invalidated.