Skip to content

Commit 38fab8e

Browse files
committed
gh-151497: Avoid huge pre-allocation for oversized tarfile extended headers
tarfile reads a member's extended header (a GNU long name/link or a pax header) with a single read sized by the header's size field:

    buf = tarfile.fileobj.read(self._block(self.size))

The size is taken from the archive and is not validated, so a crafted file of about 512 bytes can claim several gigabytes (or, via base-256 encoding, far more) and make read() pre-allocate that much memory. This happens when the archive is opened or iterated, before any extraction filter runs.

Read the extended-header data in bounded chunks instead, so an oversized or truncated header can no longer force a huge allocation. The bytes returned for valid archives are unchanged.
1 parent 5b38519 commit 38fab8e

3 files changed

Lines changed: 78 additions & 2 deletions

File tree

Lib/tarfile.py

Lines changed: 28 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -256,6 +256,32 @@ def copyfileobj(src, dst, length=None, exception=OSError, bufsize=None):
256256
dst.write(buf)
257257
return
258258

259+
# Maximum number of bytes requested in a single read() call when reading a
# member's extended header (a GNU long name/link or a pax header).  The size
# of such a header is taken from the archive and is not trustworthy, so it is
# read in bounded chunks to avoid a huge up-front allocation when a crafted
# or truncated archive claims far more data than the file actually contains
# (gh-151497).
_EXTHEADER_READ_CHUNK = 1024 * 1024  # 1 MiB


def _safe_read(fileobj, size, *, chunk_size=_EXTHEADER_READ_CHUNK):
    """Read up to *size* bytes from *fileobj* in bounded chunks.

    No single ``fileobj.read()`` call made here asks for more than
    *chunk_size* bytes, so an oversized size field in a crafted header
    cannot turn into one huge read request.

    Arguments:
        fileobj: object with a ``read(n)`` method returning bytes.
        size: number of bytes wanted.  Values not larger than *chunk_size*
            (including zero and negative values) are passed straight to
            ``fileobj.read(size)``.
        chunk_size: upper bound for a single read request; defaults to
            ``_EXTHEADER_READ_CHUNK``.

    Returns the bytes that were read.  The result is shorter than *size*
    when *fileobj* runs out of data first.

    Raises ValueError if *chunk_size* is smaller than 1.
    """
    if chunk_size < 1:
        # A zero chunk would silently yield no data and a negative one would
        # turn into an unbounded read(), defeating the purpose of the bound.
        raise ValueError("chunk_size must be a positive integer")
    if size <= chunk_size:
        # Already within the bound: a single read is enough.
        return fileobj.read(size)
    chunks = []
    remaining = size
    while remaining > 0:
        chunk = fileobj.read(min(remaining, chunk_size))
        if not chunk:
            # Out of data: the header claimed more than the file contains.
            break
        chunks.append(chunk)
        remaining -= len(chunk)
    return b"".join(chunks)
284+
259285
def _safe_print(s):
260286
encoding = getattr(sys.stdout, 'encoding', None)
261287
if encoding is not None:
@@ -1424,7 +1450,7 @@ def _proc_gnulong(self, tarfile):
14241450
"""Process the blocks that hold a GNU longname
14251451
or longlink member.
14261452
"""
1427-
buf = tarfile.fileobj.read(self._block(self.size))
1453+
buf = _safe_read(tarfile.fileobj, self._block(self.size))
14281454

14291455
# Fetch the next header and process it.
14301456
try:
@@ -1480,7 +1506,7 @@ def _proc_pax(self, tarfile):
14801506
POSIX.1-2008.
14811507
"""
14821508
# Read the header information.
1483-
buf = tarfile.fileobj.read(self._block(self.size))
1509+
buf = _safe_read(tarfile.fileobj, self._block(self.size))
14841510

14851511
# A pax header stores supplemental information for either
14861512
# the following file (extended) or all following files

Lib/test/test_tarfile.py

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -550,6 +550,52 @@ def test_extractfile_attrs(self):
550550
self.assertIs(fobj.seekable(), True)
551551

552552

553+
class _ReadSizeRecorder(io.BytesIO):
    # In-memory file that remembers the largest explicit size ever passed to
    # read().  A test can inspect max_read_size afterwards to verify that
    # tarfile never requested far more data than the archive holds (which on
    # a real file would pre-allocate it).

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.max_read_size = 0

    def read(self, size=-1):
        # None and negative sizes carry no explicit byte count and are not
        # recorded; map None to -1 so it can never exceed the current maximum.
        requested = -1 if size is None else size
        if requested > self.max_read_size:
            self.max_read_size = requested
        return super().read(size)
565+
566+
567+
class ExtendedHeaderMemoryTest(unittest.TestCase):
    # gh-151497: the size of a GNU long name/link or a pax extended header is
    # read from the archive and is untrusted.  A crafted header can claim a
    # size far larger than the file actually contains; opening such an archive
    # must not try to read (and so pre-allocate) the claimed size in one go.

    def _crafted_archive(self, hdrtype):
        # Build an archive that consists of nothing but one header of type
        # *hdrtype* whose size field promises data that is not there.
        tarinfo = tarfile.TarInfo("A")
        tarinfo.type = hdrtype
        tarinfo.size = 0xFFFFFFFF  # ~4 GiB claimed in a 512-byte header
        return tarinfo.tobuf(format=tarfile.GNU_FORMAT)

    def _check(self, hdrtype):
        # Open the crafted archive through a recording file object and verify
        # that no single read() request came anywhere near the claimed size.
        fobj = _ReadSizeRecorder(self._crafted_archive(hdrtype))
        try:
            with tarfile.open(fileobj=fobj, mode="r:") as tar:
                tar.getmembers()
        except tarfile.ReadError:
            pass  # a truncated header is fine; we only check the allocation
        # The bogus ~4 GiB size must never reach a single read() call.
        self.assertLess(fobj.max_read_size, 10 * 1024 * 1024)

    def test_gnu_longname_oversized_size(self):
        self._check(tarfile.GNUTYPE_LONGNAME)

    def test_gnu_longlink_oversized_size(self):
        self._check(tarfile.GNUTYPE_LONGLINK)

    def test_pax_header_oversized_size(self):
        self._check(tarfile.XHDTYPE)

    def test_large_valid_header_roundtrip(self):
        # The crafted archives above are truncated, so they never make the
        # reader assemble more than one chunk of real data.  Check that a
        # legitimate extended header bigger than one 1 MiB read chunk still
        # comes back byte for byte.
        name = "n" * (2 * 1024 * 1024)
        for fmt in (tarfile.GNU_FORMAT, tarfile.PAX_FORMAT):
            with self.subTest(format=fmt):
                fobj = io.BytesIO()
                with tarfile.open(fileobj=fobj, mode="w:", format=fmt) as tar:
                    tar.addfile(tarfile.TarInfo(name))
                fobj.seek(0)
                with tarfile.open(fileobj=fobj, mode="r:") as tar:
                    self.assertEqual(tar.getnames(), [name])
597+
598+
553599
class MiscReadTestBase(CommonReadTest):
554600
is_stream = False
555601

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
Opening a :mod:`tarfile` archive no longer attempts to pre-allocate a huge
2+
buffer when a crafted or truncated member claims an oversized extended header
3+
(a GNU long name/link or a pax header). The extended header is now read in
4+
bounded chunks, so its size field can no longer trigger memory exhaustion.

0 commit comments

Comments
 (0)