]> git.ipfire.org Git - thirdparty/Python/cpython.git/commitdiff
Issue #27199: TarFile expose copyfileobj bufsize to improve throughput
authorŁukasz Langa <lukasz@langa.pl>
Sat, 10 Sep 2016 02:48:14 +0000 (19:48 -0700)
committerŁukasz Langa <lukasz@langa.pl>
Sat, 10 Sep 2016 02:48:14 +0000 (19:48 -0700)
Patch by Jason Fried.

Lib/tarfile.py
Misc/NEWS

index 960c673067e940330f5605b01e0188bb37aeba2e..a62ab825454e95629dcab0fa0d5ffed09851966e 100755 (executable)
@@ -228,21 +228,21 @@ def calc_chksums(buf):
     signed_chksum = 256 + sum(struct.unpack_from("148b8x356b", buf))
     return unsigned_chksum, signed_chksum
 
-def copyfileobj(src, dst, length=None, exception=OSError):
+def copyfileobj(src, dst, length=None, exception=OSError, bufsize=None):
     """Copy length bytes from fileobj src to fileobj dst.
        If length is None, copy the entire content.
     """
+    bufsize = bufsize or 16 * 1024
     if length == 0:
         return
     if length is None:
-        shutil.copyfileobj(src, dst)
+        shutil.copyfileobj(src, dst, bufsize)
         return
 
-    BUFSIZE = 16 * 1024
-    blocks, remainder = divmod(length, BUFSIZE)
+    blocks, remainder = divmod(length, bufsize)
     for b in range(blocks):
-        buf = src.read(BUFSIZE)
-        if len(buf) < BUFSIZE:
+        buf = src.read(bufsize)
+        if len(buf) < bufsize:
             raise exception("unexpected end of data")
         dst.write(buf)
 
@@ -1403,7 +1403,8 @@ class TarFile(object):
 
     def __init__(self, name=None, mode="r", fileobj=None, format=None,
             tarinfo=None, dereference=None, ignore_zeros=None, encoding=None,
-            errors="surrogateescape", pax_headers=None, debug=None, errorlevel=None):
+            errors="surrogateescape", pax_headers=None, debug=None,
+            errorlevel=None, copybufsize=None):
         """Open an (uncompressed) tar archive `name'. `mode' is either 'r' to
            read from an existing archive, 'a' to append data to an existing
            file or 'w' to create a new file overwriting an existing one. `mode'
@@ -1459,6 +1460,7 @@ class TarFile(object):
             self.errorlevel = errorlevel
 
         # Init datastructures.
+        self.copybufsize = copybufsize
         self.closed = False
         self.members = []       # list of members as TarInfo objects
         self._loaded = False    # flag if all members have been read
@@ -1558,7 +1560,7 @@ class TarFile(object):
                     saved_pos = fileobj.tell()
                 try:
                     return func(name, "r", fileobj, **kwargs)
-                except (ReadError, CompressionError) as e:
+                except (ReadError, CompressionError):
                     if fileobj is not None:
                         fileobj.seek(saved_pos)
                     continue
@@ -1963,10 +1965,10 @@ class TarFile(object):
         buf = tarinfo.tobuf(self.format, self.encoding, self.errors)
         self.fileobj.write(buf)
         self.offset += len(buf)
-
+        bufsize=self.copybufsize
         # If there's data to follow, append it.
         if fileobj is not None:
-            copyfileobj(fileobj, self.fileobj, tarinfo.size)
+            copyfileobj(fileobj, self.fileobj, tarinfo.size, bufsize=bufsize)
             blocks, remainder = divmod(tarinfo.size, BLOCKSIZE)
             if remainder > 0:
                 self.fileobj.write(NUL * (BLOCKSIZE - remainder))
@@ -2148,15 +2150,16 @@ class TarFile(object):
         """
         source = self.fileobj
         source.seek(tarinfo.offset_data)
+        bufsize = self.copybufsize
         with bltn_open(targetpath, "wb") as target:
             if tarinfo.sparse is not None:
                 for offset, size in tarinfo.sparse:
                     target.seek(offset)
-                    copyfileobj(source, target, size, ReadError)
+                    copyfileobj(source, target, size, ReadError, bufsize)
                 target.seek(tarinfo.size)
                 target.truncate()
             else:
-                copyfileobj(source, target, tarinfo.size, ReadError)
+                copyfileobj(source, target, tarinfo.size, ReadError, bufsize)
 
     def makeunknown(self, tarinfo, targetpath):
         """Make a file from a TarInfo object with an unknown type
@@ -2235,7 +2238,7 @@ class TarFile(object):
                     os.lchown(targetpath, u, g)
                 else:
                     os.chown(targetpath, u, g)
-            except OSError as e:
+            except OSError:
                 raise ExtractError("could not change owner")
 
     def chmod(self, tarinfo, targetpath):
@@ -2244,7 +2247,7 @@ class TarFile(object):
         if hasattr(os, 'chmod'):
             try:
                 os.chmod(targetpath, tarinfo.mode)
-            except OSError as e:
+            except OSError:
                 raise ExtractError("could not change mode")
 
     def utime(self, tarinfo, targetpath):
@@ -2254,7 +2257,7 @@ class TarFile(object):
             return
         try:
             os.utime(targetpath, (tarinfo.mtime, tarinfo.mtime))
-        except OSError as e:
+        except OSError:
             raise ExtractError("could not change modification time")
 
     #--------------------------------------------------------------------------
index 7ce9ac17e8222136e210620124052de3726b6a75..b8cc778de117d60db3b7b6c58b730cc54132a6c2 100644 (file)
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -10,6 +10,9 @@ What's New in Python 3.6.0 beta 1
 Core and Builtins
 -----------------
 
+- Issue #27199: In tarfile, expose copyfileobj bufsize to improve throughput.
+  Patch by Jason Fried.
+
 - Issue #27948: In f-strings, only allow backslashes inside the braces
   (where the expressions are).  This is a breaking change from the 3.6
   alpha releases, where backslashes are allowed anywhere in an