From 38fab8e50e4e4579a92058dd5830859a3ef1dea6 Mon Sep 17 00:00:00 2001
From: Shardul Deshpande
Date: Mon, 15 Jun 2026 15:32:54 +0530
Subject: [PATCH] gh-151497: Avoid huge pre-allocation for oversized tarfile
 extended headers

tarfile reads a member's extended header (a GNU long name/link or a pax
header) with a single read sized by the header's size field:

    buf = tarfile.fileobj.read(self._block(self.size))

The size is taken from the archive and is not validated, so a ~512-byte
crafted file can claim several gigabytes (or, via base-256 encoding, far
more) and make read() pre-allocate that much memory -- on open/iterate,
before any extraction filter runs.

Read the extended-header data in bounded chunks instead, so an oversized
or truncated header can no longer force a huge allocation. The bytes
returned for valid archives are unchanged.
---
 Lib/tarfile.py                                | 30 +++++++++++-
 Lib/test/test_tarfile.py                      | 46 +++++++++++++++++++
 ...-06-15-15-32-36.gh-issue-151497.1cfmSV.rst |  4 ++
 3 files changed, 78 insertions(+), 2 deletions(-)
 create mode 100644 Misc/NEWS.d/next/Library/2026-06-15-15-32-36.gh-issue-151497.1cfmSV.rst

diff --git a/Lib/tarfile.py b/Lib/tarfile.py
index a293a0492472749..f68cbef4b85b1fd 100644
--- a/Lib/tarfile.py
+++ b/Lib/tarfile.py
@@ -256,6 +256,32 @@ def copyfileobj(src, dst, length=None, exception=OSError, bufsize=None):
         dst.write(buf)
     return
 
+# Maximum number of bytes read in a single call when reading a member's
+# extended header (a GNU long name/link or a pax header). The size of such
+# a header is taken from the archive and is not trustworthy, so it is read in
+# bounded chunks to avoid a huge up-front allocation when a crafted or
+# truncated archive claims far more data than the file actually contains
+# (gh-151497).
+_EXTHEADER_READ_CHUNK = 1024 * 1024  # 1 MiB
+
+def _safe_read(fileobj, size):
+    """Read up to *size* bytes from *fileobj* in bounded chunks.
+
+    Returns the same bytes as ``fileobj.read(size)`` would (including a short
+    result at end of file), but never pre-allocates *size* bytes, so an
+    oversized size field in a crafted header cannot force a huge allocation.
+    """
+    if size <= _EXTHEADER_READ_CHUNK:
+        return fileobj.read(size)
+    chunks = []
+    while size > 0:
+        chunk = fileobj.read(min(size, _EXTHEADER_READ_CHUNK))
+        if not chunk:
+            break
+        chunks.append(chunk)
+        size -= len(chunk)
+    return b"".join(chunks)
+
 def _safe_print(s):
     encoding = getattr(sys.stdout, 'encoding', None)
     if encoding is not None:
@@ -1424,7 +1450,7 @@ def _proc_gnulong(self, tarfile):
         """Process the blocks that hold a GNU longname
            or longlink member.
         """
-        buf = tarfile.fileobj.read(self._block(self.size))
+        buf = _safe_read(tarfile.fileobj, self._block(self.size))
 
         # Fetch the next header and process it.
         try:
@@ -1480,7 +1506,7 @@ def _proc_pax(self, tarfile):
            POSIX.1-2008.
         """
         # Read the header information.
-        buf = tarfile.fileobj.read(self._block(self.size))
+        buf = _safe_read(tarfile.fileobj, self._block(self.size))
 
         # A pax header stores supplemental information for either
         # the following file (extended) or all following files
diff --git a/Lib/test/test_tarfile.py b/Lib/test/test_tarfile.py
index 62a262740a7efa8..8d93f0103b36575 100644
--- a/Lib/test/test_tarfile.py
+++ b/Lib/test/test_tarfile.py
@@ -550,6 +550,52 @@ def test_extractfile_attrs(self):
                 self.assertIs(fobj.seekable(), True)
 
 
+class _ReadSizeRecorder(io.BytesIO):
+    # Records the largest size ever passed to read(), so a test can check
+    # that tarfile does not request far more data than the archive holds
+    # (which on a real file would pre-allocate it).
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.max_read_size = 0
+
+    def read(self, size=-1):
+        if size is not None and size >= 0:
+            self.max_read_size = max(self.max_read_size, size)
+        return super().read(size)
+
+
+class ExtendedHeaderMemoryTest(unittest.TestCase):
+    # gh-151497: the size of a GNU long name/link or a pax extended header is
+    # read from the archive and is untrusted. A crafted header can claim a
+    # size far larger than the file actually contains; opening such an archive
+    # must not try to read (and so pre-allocate) the claimed size in one go.
+
+    def _crafted_archive(self, hdrtype):
+        tarinfo = tarfile.TarInfo("A")
+        tarinfo.type = hdrtype
+        tarinfo.size = 0xFFFFFFFF  # ~4 GiB claimed in a 512-byte header
+        return tarinfo.tobuf(format=tarfile.GNU_FORMAT)
+
+    def _check(self, hdrtype):
+        fobj = _ReadSizeRecorder(self._crafted_archive(hdrtype))
+        try:
+            with tarfile.open(fileobj=fobj, mode="r:") as tar:
+                tar.getmembers()
+        except tarfile.ReadError:
+            pass  # a truncated header is fine; we only check the allocation
+        # The bogus ~4 GiB size must never reach a single read() call.
+        self.assertLess(fobj.max_read_size, 10 * 1024 * 1024)
+
+    def test_gnu_longname_oversized_size(self):
+        self._check(tarfile.GNUTYPE_LONGNAME)
+
+    def test_gnu_longlink_oversized_size(self):
+        self._check(tarfile.GNUTYPE_LONGLINK)
+
+    def test_pax_header_oversized_size(self):
+        self._check(tarfile.XHDTYPE)
+
+
 class MiscReadTestBase(CommonReadTest):
     is_stream = False
 
diff --git a/Misc/NEWS.d/next/Library/2026-06-15-15-32-36.gh-issue-151497.1cfmSV.rst b/Misc/NEWS.d/next/Library/2026-06-15-15-32-36.gh-issue-151497.1cfmSV.rst
new file mode 100644
index 000000000000000..a4c03c9d71d7618
--- /dev/null
+++ b/Misc/NEWS.d/next/Library/2026-06-15-15-32-36.gh-issue-151497.1cfmSV.rst
@@ -0,0 +1,4 @@
+Opening a :mod:`tarfile` archive no longer attempts to pre-allocate a huge
+buffer when a crafted or truncated member claims an oversized extended header
+(a GNU long name/link or a pax header). The extended header is now read in
+bounded chunks, so its size field can no longer trigger memory exhaustion.