git.ipfire.org Git - thirdparty/Python/cpython.git/commitdiff
gh-121267: Improve performance of tarfile (#121267) (#121269)
author Johan Förberg <johan@forberg.se>
Wed, 30 Oct 2024 22:08:30 +0000 (23:08 +0100)
committer GitHub <noreply@github.com>
Wed, 30 Oct 2024 22:08:30 +0000 (15:08 -0700)
Tarfile in the default write mode spends much of its time resolving UIDs
into usernames and GIDs into group names. By caching these mappings, a
significant speedup can be achieved.

In my simple benchmark[1], this extra caching speeds up tarfile by 8x.

[1] https://gist.github.com/jforberg/86af759c796199740c31547ae828aef2

---------

Co-authored-by: Tian Gao <gaogaotiantian@hotmail.com>
Co-authored-by: Bénédikt Tran <10796600+picnixz@users.noreply.github.com>
Co-authored-by: Shantanu <12621235+hauntsaninja@users.noreply.github.com>
Lib/tarfile.py
Misc/NEWS.d/next/Library/2024-07-02-15-56-42.gh-issue-121267.yFBWkh.rst [new file with mode: 0644]

index 1475b3da2d32937092103bd4da29cd1430eb5b09..a0fab46b24e24924aec8081c1fb07f2cc520752d 100644 (file)
@@ -1760,6 +1760,8 @@ class TarFile(object):
                                 # current position in the archive file
         self.inodes = {}        # dictionary caching the inodes of
                                 # archive members already added
+        self._unames = {}       # Cached mappings of uid -> uname
+        self._gnames = {}       # Cached mappings of gid -> gname
 
         try:
             if self.mode == "r":
@@ -2138,16 +2140,23 @@ class TarFile(object):
         tarinfo.mtime = statres.st_mtime
         tarinfo.type = type
         tarinfo.linkname = linkname
+
+        # Calls to pwd.getpwuid() and grp.getgrgid() tend to be expensive. To
+        # speed things up, cache the resolved usernames and group names.
         if pwd:
-            try:
-                tarinfo.uname = pwd.getpwuid(tarinfo.uid)[0]
-            except KeyError:
-                pass
+            if tarinfo.uid not in self._unames:
+                try:
+                    self._unames[tarinfo.uid] = pwd.getpwuid(tarinfo.uid)[0]
+                except KeyError:
+                    self._unames[tarinfo.uid] = ''
+            tarinfo.uname = self._unames[tarinfo.uid]
         if grp:
-            try:
-                tarinfo.gname = grp.getgrgid(tarinfo.gid)[0]
-            except KeyError:
-                pass
+            if tarinfo.gid not in self._gnames:
+                try:
+                    self._gnames[tarinfo.gid] = grp.getgrgid(tarinfo.gid)[0]
+                except KeyError:
+                    self._gnames[tarinfo.gid] = ''
+            tarinfo.gname = self._gnames[tarinfo.gid]
 
         if type in (CHRTYPE, BLKTYPE):
             if hasattr(os, "major") and hasattr(os, "minor"):
diff --git a/Misc/NEWS.d/next/Library/2024-07-02-15-56-42.gh-issue-121267.yFBWkh.rst b/Misc/NEWS.d/next/Library/2024-07-02-15-56-42.gh-issue-121267.yFBWkh.rst
new file mode 100644 (file)
index 0000000..9e52405
--- /dev/null
@@ -0,0 +1,2 @@
+Improve the performance of :mod:`tarfile` when writing files, by caching user names
+and group names.