diff --git a/mkdocs/docs/configuration.md b/mkdocs/docs/configuration.md index 3f82f78895..76c0c3fdd2 100644 --- a/mkdocs/docs/configuration.md +++ b/mkdocs/docs/configuration.md @@ -48,6 +48,29 @@ The environment variable picked up by Iceberg starts with `PYICEBERG_` and then For example, `PYICEBERG_CATALOG__DEFAULT__S3__ACCESS_KEY_ID`, sets `s3.access-key-id` on the `default` catalog. +## Manifest Caching + +PyIceberg caches `ManifestFile` objects in memory and uses an LRU policy to bound the cache size. By default, up to `128` +distinct manifest files are retained. + +You can tune the `manifest-cache-size` configuration in `.pyiceberg.yaml`: + +```yaml +manifest-cache-size: 256 +``` + +Permitted values: any non-negative integer. Set the value to `0` to disable manifest caching entirely. The value is read once, when `pyiceberg.manifest` is imported; a negative or non-integer value raises a `ValueError` at that point. + +You can also set it with the `PYICEBERG_MANIFEST_CACHE_SIZE` environment variable: + +```sh +export PYICEBERG_MANIFEST_CACHE_SIZE=256 +``` + +The memory used by this cache depends on the size and number of distinct manifests your workload touches. Lower the value +if you want a tighter memory bound, or call `clear_manifest_cache()` to proactively release cached manifest metadata in +long-lived processes. + ## Tables Iceberg tables support table properties to configure table behavior. diff --git a/pyiceberg/manifest.py b/pyiceberg/manifest.py index 3811a9d894..9842f79d8e 100644 --- a/pyiceberg/manifest.py +++ b/pyiceberg/manifest.py @@ -51,6 +51,7 @@ StringType, StructType, ) +from pyiceberg.utils.config import Config UNASSIGNED_SEQ = -1 DEFAULT_BLOCK_SIZE = 67108864 # 64 * 1024 * 1024 @@ -891,17 +892,70 @@ def __hash__(self) -> int: return hash(self.manifest_path) -# Global cache for ManifestFile objects, keyed by manifest_path. -# This deduplicates ManifestFile objects across manifest lists, which commonly -# share manifests after append operations. 
-_manifest_cache: LRUCache[str, ManifestFile] = LRUCache(maxsize=128) +class _ManifestCache: + """Process-wide ManifestFile cache keyed by manifest_path. -# Lock for thread-safe cache access -_manifest_cache_lock = threading.RLock() + Consecutive snapshots often reference the same manifests after append + operations, so reusing ManifestFile instances avoids retaining duplicate + objects. + """ + + DEFAULT_SIZE = 128 + + _cache: LRUCache[str, ManifestFile] | None + + def __init__(self) -> None: + self.maxsize = self._load_configured_size() + self._cache = LRUCache(maxsize=self.maxsize) if self.maxsize > 0 else None + self._lock = threading.RLock() + + @classmethod + def _load_configured_size(cls) -> int: + configured_size = Config().get_int("manifest-cache-size") + if configured_size is None: + return cls.DEFAULT_SIZE + if configured_size < 0: + raise ValueError( + f"manifest-cache-size should be a non-negative integer or left unset. Current value: {configured_size}" + ) + return configured_size + + def clear(self) -> None: + with self._lock: + if self._cache is not None: + self._cache.clear() + + def get_or_cache(self, manifest_file: ManifestFile) -> ManifestFile: + if self._cache is None: + return manifest_file + + with self._lock: + manifest_path = manifest_file.manifest_path + if manifest_path in self._cache: + return self._cache[manifest_path] + + self._cache[manifest_path] = manifest_file + return manifest_file + + def __len__(self) -> int: + with self._lock: + return len(self._cache) if self._cache is not None else 0 + + +_manifest_cache = _ManifestCache() + + +def clear_manifest_cache() -> None: + """Clear cached ManifestFile objects. + + This is primarily useful in long-lived or memory-sensitive processes that + want to release cached manifest metadata between bursts of table reads. 
+ """ + _manifest_cache.clear() def _manifests(io: FileIO, manifest_list: str) -> tuple[ManifestFile, ...]: - """Read manifests from a manifest list, deduplicating ManifestFile objects via cache. + """Read manifests from a manifest list, reusing cached ManifestFile objects. Caches individual ManifestFile objects by manifest_path. This is memory-efficient because consecutive manifest lists typically share most of their manifests: @@ -927,17 +981,7 @@ def _manifests(io: FileIO, manifest_list: str) -> tuple[ManifestFile, ...]: file = io.new_input(manifest_list) manifest_files = list(read_manifest_list(file)) - result = [] - with _manifest_cache_lock: - for manifest_file in manifest_files: - manifest_path = manifest_file.manifest_path - if manifest_path in _manifest_cache: - result.append(_manifest_cache[manifest_path]) - else: - _manifest_cache[manifest_path] = manifest_file - result.append(manifest_file) - - return tuple(result) + return tuple(_manifest_cache.get_or_cache(manifest_file) for manifest_file in manifest_files) def read_manifest_list(input_file: InputFile) -> Iterator[ManifestFile]: diff --git a/tests/benchmark/test_memory_benchmark.py b/tests/benchmark/test_memory_benchmark.py index 82454c8574..19bb77c8a6 100644 --- a/tests/benchmark/test_memory_benchmark.py +++ b/tests/benchmark/test_memory_benchmark.py @@ -32,8 +32,9 @@ import pyarrow as pa import pytest +from pyiceberg import manifest as manifest_module from pyiceberg.catalog.memory import InMemoryCatalog -from pyiceberg.manifest import _manifest_cache +from pyiceberg.manifest import clear_manifest_cache def generate_test_dataframe() -> pa.Table: @@ -64,7 +65,7 @@ def memory_catalog(tmp_path_factory: pytest.TempPathFactory) -> InMemoryCatalog: @pytest.fixture(autouse=True) def clear_caches() -> None: """Clear caches before each test.""" - _manifest_cache.clear() + clear_manifest_cache() gc.collect() @@ -72,8 +73,8 @@ def clear_caches() -> None: def test_manifest_cache_memory_growth(memory_catalog: 
InMemoryCatalog) -> None: """Benchmark memory growth of manifest cache during repeated appends. - This test reproduces the issue from GitHub #2325 where each append creates - a new manifest list entry in the cache, causing memory to grow. + This test reproduces the issue from GitHub #2325 where the old cache stored + each manifest list result, causing memory to grow. With the old caching strategy (tuple per manifest list), memory grew as O(N²). With the new strategy (individual ManifestFile objects), memory grows as O(N). @@ -95,7 +96,7 @@ def test_manifest_cache_memory_growth(memory_catalog: InMemoryCatalog) -> None: # Sample memory at intervals if (i + 1) % 10 == 0: current, _ = tracemalloc.get_traced_memory() - cache_size = len(_manifest_cache) + cache_size = len(manifest_module._manifest_cache) memory_samples.append((i + 1, current, cache_size)) print(f" Iteration {i + 1}: Memory={current / 1024:.1f} KB, Cache entries={cache_size}") @@ -150,13 +151,13 @@ def test_memory_after_gc_with_cache_cleared(memory_catalog: InMemoryCatalog) -> gc.collect() before_clear_memory, _ = tracemalloc.get_traced_memory() - cache_size_before = len(_manifest_cache) + cache_size_before = len(manifest_module._manifest_cache) print(f" Memory before clear: {before_clear_memory / 1024:.1f} KB") print(f" Cache size: {cache_size_before}") # Phase 2: Clear cache and GC print("\nPhase 2: Clearing cache and running GC...") - _manifest_cache.clear() + clear_manifest_cache() gc.collect() gc.collect() # Multiple GC passes for thorough cleanup @@ -192,6 +193,7 @@ def test_manifest_cache_deduplication_efficiency() -> None: ManifestEntry, ManifestEntryStatus, _manifests, + clear_manifest_cache, write_manifest, write_manifest_list, ) @@ -245,7 +247,7 @@ def test_manifest_cache_deduplication_efficiency() -> None: num_lists = 10 print(f"Creating {num_lists} manifest lists with overlapping manifests...") - _manifest_cache.clear() + clear_manifest_cache() for i in range(num_lists): list_path = 
f"{tmp_dir}/manifest-list_{i}.avro" @@ -265,7 +267,7 @@ def test_manifest_cache_deduplication_efficiency() -> None: _manifests(io, list_path) # Analyze cache efficiency - cache_entries = len(_manifest_cache) + cache_entries = len(manifest_module._manifest_cache) # List i contains manifests 0..i, so only the first num_lists manifests are actually used manifests_actually_used = num_lists diff --git a/tests/utils/test_manifest.py b/tests/utils/test_manifest.py index 40ad4bf221..f2ae1e05ad 100644 --- a/tests/utils/test_manifest.py +++ b/tests/utils/test_manifest.py @@ -15,11 +15,15 @@ # specific language governing permissions and limitations # under the License. # pylint: disable=redefined-outer-name,arguments-renamed,fixme +import importlib +from pathlib import Path from tempfile import TemporaryDirectory +from typing import Any import fastavro import pytest +import pyiceberg.manifest as manifest_module from pyiceberg.avro.codecs import AvroCompressionCodec from pyiceberg.io import load_file_io from pyiceberg.io.pyarrow import PyArrowFileIO @@ -33,8 +37,8 @@ ManifestFile, PartitionFieldSummary, _inherit_from_manifest, - _manifest_cache, _manifests, + clear_manifest_cache, read_manifest_list, write_manifest, write_manifest_list, @@ -47,9 +51,8 @@ @pytest.fixture(autouse=True) -def clear_global_manifests_cache() -> None: - # Clear the global cache before each test - _manifest_cache.clear() +def reset_global_manifests_cache() -> None: + clear_manifest_cache() def _verify_metadata_with_fastavro(avro_file: str, expected_metadata: dict[str, str]) -> None: @@ -805,8 +808,8 @@ def test_manifest_cache_deduplicates_manifest_files() -> None: # Verify cache size - should only have 3 unique ManifestFile objects # instead of 1 + 2 + 3 = 6 objects as with the old approach - assert len(_manifest_cache) == 3, ( - f"Cache should contain exactly 3 unique ManifestFile objects, but has {len(_manifest_cache)}" + assert len(manifest_module._manifest_cache) == 3, ( + f"Cache should contain 
exactly 3 unique ManifestFile objects, but has {len(manifest_module._manifest_cache)}" ) @@ -880,9 +883,9 @@ def test_manifest_cache_efficiency_with_many_overlapping_lists() -> None: # With the new approach, we should have exactly N objects # Verify cache has exactly N unique entries - assert len(_manifest_cache) == num_manifests, ( + assert len(manifest_module._manifest_cache) == num_manifests, ( f"Cache should contain exactly {num_manifests} ManifestFile objects, " - f"but has {len(_manifest_cache)}. " + f"but has {len(manifest_module._manifest_cache)}. " f"Old approach would have {num_manifests * (num_manifests + 1) // 2} objects." ) @@ -973,3 +976,173 @@ def test_inherit_from_manifest_snapshot_id() -> None: assert result.snapshot_id == 3051729675574597004 assert result.sequence_number == 1 assert result.file_sequence_number == 1 + + +def _create_test_manifest_list(module: Any, io: PyArrowFileIO, tmp_dir: str, name: str, snapshot_id: int) -> str: + schema = Schema(NestedField(field_id=1, name="id", field_type=IntegerType(), required=True)) + spec = UNPARTITIONED_PARTITION_SPEC + + manifest_path = f"{tmp_dir}/manifest-{name}.avro" + with module.write_manifest( + format_version=2, + spec=spec, + schema=schema, + output_file=io.new_output(manifest_path), + snapshot_id=snapshot_id, + avro_compression="zstandard", + ) as writer: + data_file = module.DataFile.from_args( + content=module.DataFileContent.DATA, + file_path=f"{tmp_dir}/data-{name}.parquet", + file_format=module.FileFormat.PARQUET, + partition=Record(), + record_count=100, + file_size_in_bytes=1000, + ) + writer.add_entry( + module.ManifestEntry.from_args( + status=module.ManifestEntryStatus.ADDED, + snapshot_id=snapshot_id, + data_file=data_file, + ) + ) + manifest_file = writer.to_manifest_file() + + list_path = f"{tmp_dir}/manifest-list-{name}.avro" + with module.write_manifest_list( + format_version=2, + output_file=io.new_output(list_path), + snapshot_id=snapshot_id, + parent_snapshot_id=snapshot_id - 
1 if snapshot_id > 1 else None, + sequence_number=snapshot_id, + avro_compression="zstandard", + ) as list_writer: + list_writer.add_manifests([manifest_file]) + + return list_path + + +def test_clear_manifest_cache() -> None: + """Test that clear_manifest_cache() clears cache entries while keeping cache enabled.""" + io = PyArrowFileIO() + + with TemporaryDirectory() as tmp_dir: + list_path = _create_test_manifest_list(manifest_module, io, tmp_dir, name="clear", snapshot_id=1) + + # Populate the cache + _manifests(io, list_path) + + # Verify cache has entries + assert len(manifest_module._manifest_cache) > 0, "Cache should have entries after reading manifests" + + # Clear the cache + clear_manifest_cache() + + # Verify cache is empty but still enabled + assert len(manifest_module._manifest_cache) == 0, "Cache should be empty after clear" + + +def test_manifest_cache_can_be_disabled_with_size_zero(monkeypatch: pytest.MonkeyPatch) -> None: + """Test that manifest-cache-size=0 disables caching.""" + monkeypatch.setenv("PYICEBERG_MANIFEST_CACHE_SIZE", "0") + importlib.reload(manifest_module) + + try: + assert manifest_module._manifest_cache.maxsize == 0 + assert len(manifest_module._manifest_cache) == 0 + + io = PyArrowFileIO() + + with TemporaryDirectory() as tmp_dir: + list_path = _create_test_manifest_list(manifest_module, io, tmp_dir, name="disabled", snapshot_id=1) + + manifests_first_call = manifest_module._manifests(io, list_path) + manifests_second_call = manifest_module._manifests(io, list_path) + + assert len(manifest_module._manifest_cache) == 0 + assert manifests_first_call[0] is not manifests_second_call[0] + finally: + monkeypatch.delenv("PYICEBERG_MANIFEST_CACHE_SIZE", raising=False) + importlib.reload(manifest_module) + + +def test_manifest_cache_respects_positive_env_size(monkeypatch: pytest.MonkeyPatch) -> None: + """Test that a positive manifest-cache-size enables a bounded cache.""" + monkeypatch.setenv("PYICEBERG_MANIFEST_CACHE_SIZE", "1") + 
importlib.reload(manifest_module) + + try: + assert manifest_module._manifest_cache.maxsize == 1 + + io = PyArrowFileIO() + + with TemporaryDirectory() as tmp_dir: + first_list_path = _create_test_manifest_list(manifest_module, io, tmp_dir, name="first", snapshot_id=1) + second_list_path = _create_test_manifest_list(manifest_module, io, tmp_dir, name="second", snapshot_id=2) + + manifests_first_call = manifest_module._manifests(io, first_list_path) + manifests_second_call = manifest_module._manifests(io, first_list_path) + + assert manifests_first_call[0] is manifests_second_call[0] + assert len(manifest_module._manifest_cache) == 1 + + manifest_module._manifests(io, second_list_path) + + assert len(manifest_module._manifest_cache) == 1 + finally: + monkeypatch.delenv("PYICEBERG_MANIFEST_CACHE_SIZE", raising=False) + importlib.reload(manifest_module) + + +def test_manifest_cache_reads_size_from_configuration_file(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None: + """Test that manifest-cache-size can be loaded from .pyiceberg.yaml.""" + config_dir = tmp_path / "config" + config_dir.mkdir() + (config_dir / ".pyiceberg.yaml").write_text("manifest-cache-size: 2\n", encoding="utf-8") + + monkeypatch.delenv("PYICEBERG_MANIFEST_CACHE_SIZE", raising=False) + monkeypatch.setenv("PYICEBERG_HOME", str(config_dir)) + importlib.reload(manifest_module) + + try: + assert manifest_module._manifest_cache.maxsize == 2 + + io = PyArrowFileIO() + + with TemporaryDirectory() as tmp_dir: + first_list_path = _create_test_manifest_list(manifest_module, io, tmp_dir, name="first", snapshot_id=1) + second_list_path = _create_test_manifest_list(manifest_module, io, tmp_dir, name="second", snapshot_id=2) + third_list_path = _create_test_manifest_list(manifest_module, io, tmp_dir, name="third", snapshot_id=3) + + manifest_module._manifests(io, first_list_path) + manifest_module._manifests(io, second_list_path) + manifest_module._manifests(io, third_list_path) + + assert 
len(manifest_module._manifest_cache) == 2 + finally: + monkeypatch.delenv("PYICEBERG_HOME", raising=False) + importlib.reload(manifest_module) + + +def test_invalid_manifest_cache_size_raises_value_error(monkeypatch: pytest.MonkeyPatch) -> None: + """Test that invalid manifest-cache-size values raise a helpful error.""" + monkeypatch.setenv("PYICEBERG_MANIFEST_CACHE_SIZE", "not-an-int") + + try: + with pytest.raises(ValueError, match="manifest-cache-size should be an integer or left unset"): + importlib.reload(manifest_module) + finally: + monkeypatch.delenv("PYICEBERG_MANIFEST_CACHE_SIZE", raising=False) + importlib.reload(manifest_module) + + +def test_negative_manifest_cache_size_raises_value_error(monkeypatch: pytest.MonkeyPatch) -> None: + """Test that negative manifest-cache-size values raise a helpful error.""" + monkeypatch.setenv("PYICEBERG_MANIFEST_CACHE_SIZE", "-1") + + try: + with pytest.raises(ValueError, match="manifest-cache-size should be a non-negative integer or left unset"): + importlib.reload(manifest_module) + finally: + monkeypatch.delenv("PYICEBERG_MANIFEST_CACHE_SIZE", raising=False) + importlib.reload(manifest_module)