[3.14] gh-151497: Avoid huge pre-allocation for oversized tarfile extended headers...

author Miss Islington (bot) <31488909+miss-islington@users.noreply.github.com>

Wed, 24 Jun 2026 09:23:36 +0000 (11:23 +0200)

committer GitHub <noreply@github.com>

Wed, 24 Jun 2026 09:23:36 +0000 (11:23 +0200)
author Miss Islington (bot) <31488909+miss-islington@users.noreply.github.com>
Wed, 24 Jun 2026 09:23:36 +0000 (11:23 +0200)
committer GitHub <noreply@github.com>
Wed, 24 Jun 2026 09:23:36 +0000 (11:23 +0200)
diff --git a/Lib/tarfile.py b/Lib/tarfile.py

index 399f906efdffa4d1f78a978b48eaffddc115214d..cb09e307c469bdd30b5ff6963cba9a28948776ae 100644 (file)
--- a/Lib/tarfile.py
+++ b/Lib/tarfile.py
@@ -258,6 +258,32 @@ def copyfileobj(src, dst, length=None, exception=OSError, bufsize=None):
          dst.write(buf)
      return
  
+# Maximum number of bytes read in a single call when reading a member's
+# extended header (a GNU long name/link or a pax header).  The size of such
+# a header is taken from the archive and is not trustworthy, so it is read in
+# bounded chunks to avoid a huge up-front allocation when a crafted or
+# truncated archive claims far more data than the file actually contains
+# (gh-151497).
+_EXTHEADER_READ_CHUNK = 1024 * 1024  # 1 MiB
+
+def _safe_read(fileobj, size):
+    """Read up to *size* bytes from *fileobj* in bounded chunks.
+
+    Returns the same bytes as ``fileobj.read(size)`` would (including a short
+    result at end of file), but limits pre-allocation, so an
+    oversized size field in a crafted header cannot force a huge allocation.
+    """
+    if size <= _EXTHEADER_READ_CHUNK:  # one read() is already bounded
+        return fileobj.read(size)
+    chunks = []
+    while size > 0:  # size = bytes still wanted
+        chunk = fileobj.read(min(size, _EXTHEADER_READ_CHUNK))
+        if not chunk:  # nothing returned: stop with a short result
+            break
+        chunks.append(chunk)
+        size -= len(chunk)  # count bytes actually returned, not requested
+    return b"".join(chunks)
+
  def _safe_print(s):
      encoding = getattr(sys.stdout, 'encoding', None)
      if encoding is not None:
@@ -1431,7 +1457,7 @@ class TarInfo(object):
          """Process the blocks that hold a GNU longname
             or longlink member.
          """
-        buf = tarfile.fileobj.read(self._block(self.size))
+        buf = _safe_read(tarfile.fileobj, self._block(self.size))
  
          # Fetch the next header and process it.
          try:
@@ -1487,7 +1513,7 @@ class TarInfo(object):
             POSIX.1-2008.
          """
          # Read the header information.
-        buf = tarfile.fileobj.read(self._block(self.size))
+        buf = _safe_read(tarfile.fileobj, self._block(self.size))
  
          # A pax header stores supplemental information for either
          # the following file (extended) or all following files
diff --git a/Lib/test/test_tarfile.py b/Lib/test/test_tarfile.py

index 045377d620cc42a69f7884fb91413626b4de016d..804c3e6d809487ad9969312322dcb40183f1e448 100644 (file)
--- a/Lib/test/test_tarfile.py
+++ b/Lib/test/test_tarfile.py
@@ -549,6 +549,53 @@ class CommonReadTest(ReadTest):
              self.assertIs(fobj.seekable(), True)
  
  
+class ReadSizeRecorder(io.BytesIO):
+    # In-memory file that records the largest size ever passed to read(), so
+    # a test can check that tarfile never requests far more data than the
+    # archive holds (on a real file such a request would pre-allocate it).
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.max_read_size = 0  # largest explicit size requested so far
+
+    def read(self, size=-1):
+        if size is not None and size >= 0:  # ignore "read everything" calls
+            self.max_read_size = max(self.max_read_size, size)
+        return super().read(size)
+
+
+@support.cpython_only
+class ExtendedHeaderMemoryTest(unittest.TestCase):
+    # gh-151497: the size of a GNU long name/link or a pax extended header is
+    # read from the archive and is untrusted.  A crafted header can claim a
+    # size far larger than the file actually contains; opening such an archive
+    # must not try to read (and so pre-allocate) the claimed size in one go.
+
+    def crafted_archive(self, hdrtype):  # header block only, no payload
+        tarinfo = tarfile.TarInfo("A")
+        tarinfo.type = hdrtype
+        tarinfo.size = 0xFFFFFFFF  # ~4 GiB claimed in a 512-byte header
+        return tarinfo.tobuf(format=tarfile.GNU_FORMAT)
+
+    def check(self, hdrtype):  # shared body of the three tests below
+        fobj = ReadSizeRecorder(self.crafted_archive(hdrtype))
+        try:
+            with tarfile.open(fileobj=fobj, mode="r:") as tar:
+                tar.getmembers()  # parse every header in the archive
+        except tarfile.ReadError:
+            pass  # a truncated header is fine; we only check the allocation
+        # The bogus ~4 GiB size must never reach a single read() call.
+        self.assertLessEqual(fobj.max_read_size, tarfile._EXTHEADER_READ_CHUNK)
+
+    def test_gnu_longname_oversized_size(self):
+        self.check(tarfile.GNUTYPE_LONGNAME)
+
+    def test_gnu_longlink_oversized_size(self):
+        self.check(tarfile.GNUTYPE_LONGLINK)
+
+    def test_pax_header_oversized_size(self):
+        self.check(tarfile.XHDTYPE)
+
+
  class MiscReadTestBase(CommonReadTest):
      is_stream = False
  
diff --git a/Misc/NEWS.d/next/Library/2026-06-15-15-32-36.gh-issue-151497.1cfmSV.rst b/Misc/NEWS.d/next/Library/2026-06-15-15-32-36.gh-issue-151497.1cfmSV.rst

new file mode 100644 (file)

index 0000000..a4c03c9
--- /dev/null
+++ b/Misc/NEWS.d/next/Library/2026-06-15-15-32-36.gh-issue-151497.1cfmSV.rst
@@ -0,0 +1,4 @@
+Opening a :mod:`tarfile` archive no longer attempts to pre-allocate a huge
+buffer when a crafted or truncated member claims an oversized extended header
+(a GNU long name/link or a pax header).  The extended header is now read in
+bounded chunks, so its size field can no longer trigger memory exhaustion.
author	Miss Islington (bot) <31488909+miss-islington@users.noreply.github.com>
	Wed, 24 Jun 2026 09:23:36 +0000 (11:23 +0200)
committer	GitHub <noreply@github.com>
	Wed, 24 Jun 2026 09:23:36 +0000 (11:23 +0200)
Lib/tarfile.py		patch \| blob \| blame \| history
Lib/test/test_tarfile.py		patch \| blob \| blame \| history
Misc/NEWS.d/next/Library/2026-06-15-15-32-36.gh-issue-151497.1cfmSV.rst	[new file with mode: 0644]	patch \| blob