Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
30 changes: 28 additions & 2 deletions Lib/tarfile.py
Original file line number Diff line number Diff line change
Expand Up @@ -256,6 +256,32 @@ def copyfileobj(src, dst, length=None, exception=OSError, bufsize=None):
dst.write(buf)
return

# Maximum number of bytes read in a single call when reading a member's
# extended header (a GNU long name/link or a pax header). The size of such
# a header is taken from the archive and is not trustworthy, so it is read in
# bounded chunks to avoid a huge up-front allocation when a crafted or
# truncated archive claims far more data than the file actually contains
# (gh-151497).
_EXTHEADER_READ_CHUNK = 1024 * 1024     # 1 MiB

def _safe_read(fileobj, size, chunk_size=None):
    """Read up to *size* bytes from *fileobj* in bounded chunks.

    Returns the same bytes as ``fileobj.read(size)`` would (including a short
    result at end of file), but never asks *fileobj* for more than
    *chunk_size* bytes in a single call, so an oversized size field in a
    crafted header cannot force a huge up-front allocation.

    *chunk_size* is the largest number of bytes requested per ``read()``
    call.  When omitted or None, the module-level ``_EXTHEADER_READ_CHUNK``
    is used (looked up at call time).  A non-positive *chunk_size* raises
    ValueError.
    """
    if chunk_size is None:
        chunk_size = _EXTHEADER_READ_CHUNK
    elif chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")

    if size <= chunk_size:
        # The request already fits in one bounded read.  Zero and negative
        # sizes also take this path and are handed to read() unchanged.
        return fileobj.read(size)

    chunks = []
    remaining = size
    while remaining > 0:
        chunk = fileobj.read(min(remaining, chunk_size))
        if not chunk:
            # Nothing more to read: return the short result collected so far.
            break
        chunks.append(chunk)
        remaining -= len(chunk)
    return b"".join(chunks)

def _safe_print(s):
encoding = getattr(sys.stdout, 'encoding', None)
if encoding is not None:
Expand Down Expand Up @@ -1424,7 +1450,7 @@ def _proc_gnulong(self, tarfile):
"""Process the blocks that hold a GNU longname
or longlink member.
"""
buf = tarfile.fileobj.read(self._block(self.size))
buf = _safe_read(tarfile.fileobj, self._block(self.size))

# Fetch the next header and process it.
try:
Expand Down Expand Up @@ -1480,7 +1506,7 @@ def _proc_pax(self, tarfile):
POSIX.1-2008.
"""
# Read the header information.
buf = tarfile.fileobj.read(self._block(self.size))
buf = _safe_read(tarfile.fileobj, self._block(self.size))

# A pax header stores supplemental information for either
# the following file (extended) or all following files
Expand Down
46 changes: 46 additions & 0 deletions Lib/test/test_tarfile.py
Original file line number Diff line number Diff line change
Expand Up @@ -550,6 +550,52 @@ def test_extractfile_attrs(self):
self.assertIs(fobj.seekable(), True)


class _ReadSizeRecorder(io.BytesIO):
    """BytesIO that remembers the largest explicit size given to read().

    Tests use ``max_read_size`` to verify that tarfile never asks for far
    more data than the archive holds (which on a real file would
    pre-allocate it).
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.max_read_size = 0

    def read(self, size=-1):
        # None and negative sizes are not recorded; only explicit
        # non-negative requests update the maximum.
        is_explicit = size is not None and size >= 0
        if is_explicit and size > self.max_read_size:
            self.max_read_size = size
        return super().read(size)


class ExtendedHeaderMemoryTest(unittest.TestCase):
    # gh-151497: a GNU long name/link header and a pax extended header both
    # carry a size that comes straight from the archive and is untrusted.
    # A crafted header may claim far more data than the file contains;
    # opening such an archive must not issue a single read() (and hence a
    # single allocation) of the claimed size.

    def _crafted_archive(self, hdrtype):
        # A lone header block of type *hdrtype* whose size field claims
        # ~4 GiB of payload that is not actually present.
        header = tarfile.TarInfo("A")
        header.size = 0xFFFFFFFF
        header.type = hdrtype
        return header.tobuf(format=tarfile.GNU_FORMAT)

    def _check(self, hdrtype):
        recorder = _ReadSizeRecorder(self._crafted_archive(hdrtype))
        try:
            with tarfile.open(fileobj=recorder, mode="r:") as archive:
                archive.getmembers()
        except tarfile.ReadError:
            # The archive is truncated, so a ReadError is acceptable; only
            # the size of the individual read() requests matters here.
            pass
        # The bogus ~4 GiB size must never be passed to one read() call.
        limit = 10 * 1024 * 1024
        self.assertLess(recorder.max_read_size, limit)

    def test_gnu_longname_oversized_size(self):
        self._check(tarfile.GNUTYPE_LONGNAME)

    def test_gnu_longlink_oversized_size(self):
        self._check(tarfile.GNUTYPE_LONGLINK)

    def test_pax_header_oversized_size(self):
        self._check(tarfile.XHDTYPE)


class MiscReadTestBase(CommonReadTest):
is_stream = False

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
Opening a :mod:`tarfile` archive no longer attempts to pre-allocate a huge
buffer when a crafted or truncated member claims an oversized extended header
(a GNU long name/link or a pax header). The extended header is now read in
bounded chunks, so its size field can no longer trigger memory exhaustion.
Loading