gh-151497: Avoid huge pre-allocation for oversized tarfile extended headers

iamsharduld · iamsharduld · commit 38fab8e50e4e · 2026-06-15T15:32:54.000+05:30
tarfile reads a member's extended header (a GNU long name/link or a pax
header) with a single read sized by the header's size field:

    buf = tarfile.fileobj.read(self._block(self.size))

The size is taken from the archive and is not validated, so a ~512-byte
crafted file can claim several gigabytes (or, via base-256 encoding, far
more) and make read() pre-allocate that much memory -- on open/iterate,
before any extraction filter runs.

Read the extended-header data in bounded chunks instead, so an oversized
or truncated header can no longer force a huge allocation. The bytes
returned for valid archives are unchanged.
diff --git a/Lib/tarfile.py b/Lib/tarfile.py
@@ -256,6 +256,32 @@ def copyfileobj(src, dst, length=None, exception=OSError, bufsize=None):
         dst.write(buf)
     return
 
+# Maximum number of bytes requested by a single read() call while reading a
+# member's extended header (a GNU long name/link or a pax header).  The
+# header's size comes from the archive and is not trustworthy, so the data is
+# read in bounded chunks; a crafted or truncated archive that claims far more
+# data than the file actually contains then cannot force a huge up-front
+# allocation (gh-151497).
+_EXTHEADER_READ_CHUNK = 1024 * 1024  # 1 MiB
+
+def _safe_read(fileobj, size):
+    """Return up to *size* bytes from *fileobj*, fetched in bounded pieces.
+
+    The result equals what ``fileobj.read(size)`` would give (a short result
+    at end of file included), but no read() call is passed a size above
+    _EXTHEADER_READ_CHUNK, so a bogus size field cannot force a huge buffer.
+    """
+    if size > _EXTHEADER_READ_CHUNK:
+        pieces, remaining = [], size
+        while remaining > 0:
+            piece = fileobj.read(min(remaining, _EXTHEADER_READ_CHUNK))
+            if not piece:
+                break  # end of file: hand back the short result
+            pieces.append(piece)
+            remaining -= len(piece)
+        return b"".join(pieces)
+    return fileobj.read(size)  # size fits in one chunk: a direct read suffices
+
 def _safe_print(s):
     encoding = getattr(sys.stdout, 'encoding', None)
     if encoding is not None:
@@ -1424,7 +1450,7 @@ def _proc_gnulong(self, tarfile):
         """Process the blocks that hold a GNU longname
            or longlink member.
         """
-        buf = tarfile.fileobj.read(self._block(self.size))
+        buf = _safe_read(tarfile.fileobj, self._block(self.size))
 
         # Fetch the next header and process it.
         try:
@@ -1480,7 +1506,7 @@ def _proc_pax(self, tarfile):
            POSIX.1-2008.
         """
         # Read the header information.
-        buf = tarfile.fileobj.read(self._block(self.size))
+        buf = _safe_read(tarfile.fileobj, self._block(self.size))
 
         # A pax header stores supplemental information for either
         # the following file (extended) or all following files
diff --git a/Lib/test/test_tarfile.py b/Lib/test/test_tarfile.py
@@ -550,6 +550,52 @@ def test_extractfile_attrs(self):
             self.assertIs(fobj.seekable(), True)
 
 
+class _ReadSizeRecorder(io.BytesIO):
+    # BytesIO that remembers the largest explicit size passed to read(), so a
+    # test can spot a request that on a real file would pre-allocate too much.
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.max_read_size = 0
+
+    def read(self, size=-1):
+        explicit = size is not None and size >= 0
+        if explicit and size > self.max_read_size:
+            self.max_read_size = size
+        return super().read(size)
+
+
+class ExtendedHeaderMemoryTest(unittest.TestCase):
+    # gh-151497: a GNU long name/link or pax extended header carries its own
+    # size, which comes from the archive and cannot be trusted.  When a crafted
+    # header claims far more data than the file contains, opening the archive
+    # must not ask for (and thereby pre-allocate) that size in a single read.
+
+    def _crafted_archive(self, hdrtype):
+        header = tarfile.TarInfo("A")
+        header.size = 0xFFFFFFFF  # ~4 GiB claimed in a 512-byte header
+        header.type = hdrtype
+        return header.tobuf(format=tarfile.GNU_FORMAT)
+
+    def _check(self, hdrtype):
+        recorder = _ReadSizeRecorder(self._crafted_archive(hdrtype))
+        try:
+            with tarfile.open(fileobj=recorder, mode="r:") as archive:
+                archive.getmembers()
+        except tarfile.ReadError:
+            pass  # a truncated header is fine; we only check the allocation
+        # No single read() may be asked for anything near the bogus ~4 GiB.
+        self.assertLess(recorder.max_read_size, 10 << 20)  # 10 MiB ceiling
+
+    def test_gnu_longname_oversized_size(self):
+        self._check(tarfile.GNUTYPE_LONGNAME)
+
+    def test_gnu_longlink_oversized_size(self):
+        self._check(tarfile.GNUTYPE_LONGLINK)
+
+    def test_pax_header_oversized_size(self):
+        self._check(tarfile.XHDTYPE)
+
+
 class MiscReadTestBase(CommonReadTest):
     is_stream = False
 
diff --git a/Misc/NEWS.d/next/Library/2026-06-15-15-32-36.gh-issue-151497.1cfmSV.rst b/Misc/NEWS.d/next/Library/2026-06-15-15-32-36.gh-issue-151497.1cfmSV.rst
@@ -0,0 +1,4 @@
+Opening an archive with :mod:`tarfile` no longer attempts to pre-allocate a
+huge buffer when a crafted or truncated archive declares an oversized extended
+header (a GNU long name/link or a pax header).  The extended header is now read
+in bounded chunks, so its size field can no longer trigger memory exhaustion.