From: Miss Islington (bot) <31488909+miss-islington@users.noreply.github.com> Date: Mon, 1 Jun 2026 17:39:51 +0000 (+0200) Subject: [3.15] gh-121109: Fix performance of tarfile reading with "r|*" (GH-121296) (GH-150604) X-Git-Tag: v3.15.0b2~21 X-Git-Url: http://git.ipfire.org/gitweb/index.cgi?a=commitdiff_plain;h=9c1e3af17a9645ca7318b2d48481c832aebf8eae;p=thirdparty%2FPython%2Fcpython.git [3.15] gh-121109: Fix performance of tarfile reading with "r|*" (GH-121296) (GH-150604) (cherry picked from commit 6d7a19e5334636f77cac135120fe81f343a73876) Co-authored-by: Tomi Belan --- diff --git a/Lib/tarfile.py b/Lib/tarfile.py index 5bf2ede09010..55e4a4e0c9a2 100644 --- a/Lib/tarfile.py +++ b/Lib/tarfile.py @@ -380,7 +380,6 @@ class _Stream: except ImportError: raise CompressionError("bz2 module is not available") from None if mode == "r": - self.dbuf = b"" self.cmp = bz2.BZ2Decompressor() self.exception = OSError else: @@ -392,7 +391,6 @@ class _Stream: except ImportError: raise CompressionError("lzma module is not available") from None if mode == "r": - self.dbuf = b"" self.cmp = lzma.LZMADecompressor() self.exception = lzma.LZMAError else: @@ -403,7 +401,6 @@ class _Stream: except ImportError: raise CompressionError("compression.zstd module is not available") from None if mode == "r": - self.dbuf = b"" self.cmp = zstd.ZstdDecompressor() self.exception = zstd.ZstdError else: @@ -485,7 +482,6 @@ class _Stream: """Initialize for reading a gzip compressed fileobj. """ self.cmp = self.zlib.decompressobj(-self.zlib.MAX_WBITS) - self.dbuf = b"" # taken from gzip.GzipFile with some alterations if self.__read(2) != b"\037\213": @@ -543,26 +539,44 @@ class _Stream: if self.comptype == "tar": return self.__read(size) - c = len(self.dbuf) - t = [self.dbuf] + c = 0 + t = [] while c < size: - # Skip underlying buffer to avoid unaligned double buffering. - if self.buf: - buf = self.buf - self.buf = b"" + if self.comptype == "gz": + # zlib interface is different than others. + # It returns data in unconsumed_tail. + if self.buf: + cbuf = self.buf + self.buf = b"" + else: + cbuf = self.fileobj.read(self.bufsize) + if not cbuf: + break + + try: + dbuf = self.cmp.decompress(cbuf, size - c) + self.buf = self.cmp.unconsumed_tail + except self.exception as e: + raise ReadError("invalid compressed data") from e else: - buf = self.fileobj.read(self.bufsize) - if not buf: - break - try: - buf = self.cmp.decompress(buf) - except self.exception as e: - raise ReadError("invalid compressed data") from e - t.append(buf) - c += len(buf) - t = b"".join(t) - self.dbuf = t[size:] - return t[:size] + # Other decompressors have needs_input. + # decompress() can buffer data internally. + if self.cmp.needs_input: + cbuf = self.fileobj.read(self.bufsize) + if not cbuf: + break + else: + cbuf = b"" + + try: + dbuf = self.cmp.decompress(cbuf, size - c) + except self.exception as e: + raise ReadError("invalid compressed data") from e + + t.append(dbuf) + c += len(dbuf) + + return b"".join(t) def __read(self, size): """Return size bytes from stream. If internal buffer is empty, diff --git a/Misc/ACKS b/Misc/ACKS index 234d0d2d0a2a..14f0db754953 100644 --- a/Misc/ACKS +++ b/Misc/ACKS @@ -144,6 +144,7 @@ Bas van Beek Ian Beer Stefan Behnel Reimer Behrends +Tomi Belan Maxime Bélanger Ben Bell Thomas Bellman diff --git a/Misc/NEWS.d/next/Library/2024-07-02-20-57-43.gh-issue-121109.Tp6R2s.rst b/Misc/NEWS.d/next/Library/2024-07-02-20-57-43.gh-issue-121109.Tp6R2s.rst new file mode 100644 index 000000000000..eca6014e4a0a --- /dev/null +++ b/Misc/NEWS.d/next/Library/2024-07-02-20-57-43.gh-issue-121109.Tp6R2s.rst @@ -0,0 +1,2 @@ +Fix :mod:`tarfile` performance issue when reading archives in streaming mode +(e.g. ``r|*``).