dst.write(buf)
return
+# Upper bound on the size passed to a single read() call by _safe_read()
+# when fetching a member's extended header (a GNU long name/link or a pax
+# header). That size comes from the archive and is untrusted, so larger
+# requests are split into chunks of at most this many bytes; a crafted or
+# truncated archive claiming far more data than it contains then cannot
+# trigger a huge up-front allocation (gh-151497).
+_EXTHEADER_READ_CHUNK = 1024 * 1024 # 1 MiB
+
+def _safe_read(fileobj, size):
+    """Return up to *size* bytes from *fileobj*, fetched in bounded pieces.
+
+    The result matches what ``fileobj.read(size)`` would give, including a
+    short result when the end of the file is reached first.  No single
+    read() request exceeds ``_EXTHEADER_READ_CHUNK`` once *size* is above
+    that limit, so an inflated size field in a crafted header cannot force
+    one huge allocation.
+    """
+    limit = _EXTHEADER_READ_CHUNK
+    if size <= limit:
+        # Already within the bound: hand the request straight through.
+        return fileobj.read(size)
+
+    pieces = []
+    remaining = size
+    while remaining > 0:
+        piece = fileobj.read(limit if remaining > limit else remaining)
+        if not piece:
+            # Nothing more to read: the archive holds less than it claimed.
+            break
+        pieces.append(piece)
+        remaining -= len(piece)
+    return b"".join(pieces)
+
def _safe_print(s):
encoding = getattr(sys.stdout, 'encoding', None)
if encoding is not None:
"""Process the blocks that hold a GNU longname
or longlink member.
"""
- buf = tarfile.fileobj.read(self._block(self.size))
+ buf = _safe_read(tarfile.fileobj, self._block(self.size))
# Fetch the next header and process it.
try:
POSIX.1-2008.
"""
# Read the header information.
- buf = tarfile.fileobj.read(self._block(self.size))
+ buf = _safe_read(tarfile.fileobj, self._block(self.size))
# A pax header stores supplemental information for either
# the following file (extended) or all following files
self.assertIs(fobj.seekable(), True)
+class ReadSizeRecorder(io.BytesIO):
+    # An in-memory file that remembers the largest non-negative size ever
+    # requested through read().  Tests use it to verify that tarfile never
+    # asks for far more data than the archive holds (on a real file such a
+    # request would be pre-allocated).
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        # Largest explicit size seen so far; 0 until read() is called.
+        self.max_read_size = 0
+
+    def read(self, size=-1):
+        # "Read everything" requests (None or a negative size) carry no
+        # explicit size, so they are not recorded.
+        is_explicit = size is not None and size >= 0
+        if is_explicit and size > self.max_read_size:
+            self.max_read_size = size
+        return super().read(size)
+
+
+@support.cpython_only
+class ExtendedHeaderMemoryTest(unittest.TestCase):
+    # gh-151497: a GNU long name/link header and a pax extended header both
+    # carry a size field that comes from the archive and is untrusted.  A
+    # crafted header may claim far more data than the file holds; opening
+    # such an archive must not pass the claimed size to a single read()
+    # call (which would pre-allocate it).
+
+    def crafted_archive(self, hdrtype):
+        # A lone header block of type *hdrtype* with no payload behind it.
+        header = tarfile.TarInfo("A")
+        header.size = 0xFFFFFFFF  # ~4 GiB claimed in a 512-byte header
+        header.type = hdrtype
+        return header.tobuf(format=tarfile.GNU_FORMAT)
+
+    def check(self, hdrtype):
+        recorder = ReadSizeRecorder(self.crafted_archive(hdrtype))
+        try:
+            with tarfile.open(fileobj=recorder, mode="r:") as archive:
+                archive.getmembers()
+        except tarfile.ReadError:
+            # Rejecting the truncated header is acceptable; only the size
+            # of the read() requests is being checked here.
+            pass
+        # No single read() call may have been given the bogus ~4 GiB size.
+        limit = tarfile._EXTHEADER_READ_CHUNK
+        self.assertLessEqual(recorder.max_read_size, limit)
+
+    def test_gnu_longname_oversized_size(self):
+        self.check(tarfile.GNUTYPE_LONGNAME)
+
+    def test_gnu_longlink_oversized_size(self):
+        self.check(tarfile.GNUTYPE_LONGLINK)
+
+    def test_pax_header_oversized_size(self):
+        self.check(tarfile.XHDTYPE)
+
+
class MiscReadTestBase(CommonReadTest):
is_stream = False