git.ipfire.org Git - thirdparty/Python/cpython.git/commitdiff
gh-89550: Buffer GzipFile.write to reduce execution time by ~15% (#101251)
author: Arjun <ccldarjun@icloud.com>
Mon, 8 May 2023 17:55:59 +0000 (10:55 -0700)
committer: GitHub <noreply@github.com>
Mon, 8 May 2023 17:55:59 +0000 (17:55 +0000)
Use `io.BufferedWriter` to buffer gzip writes.

---------

Co-authored-by: Alex Waygood <Alex.Waygood@Gmail.com>
Co-authored-by: Gregory P. Smith <greg@krypto.org>
Lib/gzip.py
Misc/NEWS.d/next/Library/2023-01-22-14-53-12.gh-issue-89550.c1U23f.rst [new file with mode: 0644]

index 75c6ddc3f2cffb97044ac789331037df0d93d4ca..8796c8d9fd9a2db585bfd8f5742184fe8bb4d7dc 100644 (file)
@@ -22,6 +22,7 @@ _COMPRESS_LEVEL_TRADEOFF = 6
 _COMPRESS_LEVEL_BEST = 9
 
 READ_BUFFER_SIZE = 128 * 1024
+_WRITE_BUFFER_SIZE = 4 * io.DEFAULT_BUFFER_SIZE
 
 
 def open(filename, mode="rb", compresslevel=_COMPRESS_LEVEL_BEST,
@@ -120,6 +121,21 @@ class BadGzipFile(OSError):
     """Exception raised in some cases for invalid gzip files."""
 
 
+class _WriteBufferStream(io.RawIOBase):
+    """Minimal object to pass WriteBuffer flushes into GzipFile"""
+    def __init__(self, gzip_file):
+        self.gzip_file = gzip_file
+
+    def write(self, data):
+        return self.gzip_file._write_raw(data)
+
+    def seekable(self):
+        return False
+
+    def writable(self):
+        return True
+
+
 class GzipFile(_compression.BaseStream):
     """The GzipFile class simulates most of the methods of a file object with
     the exception of the truncate() method.
@@ -184,6 +200,7 @@ class GzipFile(_compression.BaseStream):
         if mode is None:
             mode = getattr(fileobj, 'mode', 'rb')
 
+
         if mode.startswith('r'):
             self.mode = READ
             raw = _GzipReader(fileobj)
@@ -206,6 +223,9 @@ class GzipFile(_compression.BaseStream):
                                              zlib.DEF_MEM_LEVEL,
                                              0)
             self._write_mtime = mtime
+            self._buffer_size = _WRITE_BUFFER_SIZE
+            self._buffer = io.BufferedWriter(_WriteBufferStream(self),
+                                             buffer_size=self._buffer_size)
         else:
             raise ValueError("Invalid mode: {!r}".format(mode))
 
@@ -231,6 +251,11 @@ class GzipFile(_compression.BaseStream):
         self.bufsize = 0
         self.offset = 0  # Current file offset for seek(), tell(), etc
 
+    def tell(self):
+        self._check_not_closed()
+        self._buffer.flush()
+        return super().tell()
+
     def _write_gzip_header(self, compresslevel):
         self.fileobj.write(b'\037\213')             # magic header
         self.fileobj.write(b'\010')                 # compression method
@@ -272,6 +297,10 @@ class GzipFile(_compression.BaseStream):
         if self.fileobj is None:
             raise ValueError("write() on closed GzipFile object")
 
+        return self._buffer.write(data)
+
+    def _write_raw(self, data):
+        # Called by our self._buffer underlying WriteBufferStream.
         if isinstance(data, (bytes, bytearray)):
             length = len(data)
         else:
@@ -322,9 +351,9 @@ class GzipFile(_compression.BaseStream):
         fileobj = self.fileobj
         if fileobj is None:
             return
-        self.fileobj = None
         try:
             if self.mode == WRITE:
+                self._buffer.flush()
                 fileobj.write(self.compress.flush())
                 write32u(fileobj, self.crc)
                 # self.size may exceed 2 GiB, or even 4 GiB
@@ -332,6 +361,7 @@ class GzipFile(_compression.BaseStream):
             elif self.mode == READ:
                 self._buffer.close()
         finally:
+            self.fileobj = None
             myfileobj = self.myfileobj
             if myfileobj:
                 self.myfileobj = None
@@ -341,7 +371,7 @@ class GzipFile(_compression.BaseStream):
         self._check_not_closed()
         if self.mode == WRITE:
             # Ensure the compressor's buffer is flushed
-            self.fileobj.write(self.compress.flush(zlib_mode))
+            self._buffer.flush()
             self.fileobj.flush()
 
     def fileno(self):
@@ -378,10 +408,10 @@ class GzipFile(_compression.BaseStream):
             if offset < self.offset:
                 raise OSError('Negative seek in write mode')
             count = offset - self.offset
-            chunk = b'\0' * 1024
-            for i in range(count // 1024):
+            chunk = b'\0' * self._buffer_size
+            for i in range(count // self._buffer_size):
                 self.write(chunk)
-            self.write(b'\0' * (count % 1024))
+            self.write(b'\0' * (count % self._buffer_size))
         elif self.mode == READ:
             self._check_not_closed()
             return self._buffer.seek(offset, whence)
diff --git a/Misc/NEWS.d/next/Library/2023-01-22-14-53-12.gh-issue-89550.c1U23f.rst b/Misc/NEWS.d/next/Library/2023-01-22-14-53-12.gh-issue-89550.c1U23f.rst
new file mode 100644 (file)
index 0000000..556db0e
--- /dev/null
@@ -0,0 +1,2 @@
+Decrease execution time of some :mod:`gzip` file writes by 15% by
+adding more appropriate buffering.