From 76b1dd101df88ec8ff0c72bcc0552083ff060899 Mon Sep 17 00:00:00 2001 From: alexfurmenkov Date: Fri, 10 Apr 2026 12:55:36 +0200 Subject: [PATCH 1/8] tests for CoW behavior in pandas --- cdisc_rules_engine/rules_engine.py | 5 +- .../test_cache/test_immutable_cache.py | 126 ++++++++++++++++++ 2 files changed, 129 insertions(+), 2 deletions(-) create mode 100644 tests/unit/test_services/test_cache/test_immutable_cache.py diff --git a/cdisc_rules_engine/rules_engine.py b/cdisc_rules_engine/rules_engine.py index 33f371800..96b8f48cc 100644 --- a/cdisc_rules_engine/rules_engine.py +++ b/cdisc_rules_engine/rules_engine.py @@ -2,6 +2,7 @@ from typing import Iterable, List, Union from dateutil.parser._parser import ParserError import traceback +import pandas as pd from business_rules import export_rule_data from business_rules.engine import run @@ -62,6 +63,8 @@ from cdisc_rules_engine.models.sdtm_dataset_metadata import SDTMDatasetMetadata from cdisc_rules_engine.enums.sensitivity import Sensitivity +pd.options.mode.copy_on_write = True + class RulesEngine: def __init__( @@ -381,8 +384,6 @@ def execute_rule( rule["conditions"], dataset.columns.to_list() ) rule_copy["conditions"].set_conditions(updated_conditions) - # Adding copy for now to avoid updating cached dataset - dataset = deepcopy(dataset) # preprocess dataset dataset_preprocessor = DatasetPreprocessor( dataset, dataset_metadata, self.data_service, self.cache diff --git a/tests/unit/test_services/test_cache/test_immutable_cache.py b/tests/unit/test_services/test_cache/test_immutable_cache.py new file mode 100644 index 000000000..9b7f6b8bc --- /dev/null +++ b/tests/unit/test_services/test_cache/test_immutable_cache.py @@ -0,0 +1,126 @@ +import pandas as pd +import pytest + + +class CacheService: + def __init__(self): + self._cache = {} + + def set(self, key, df): + self._cache[key] = df + + def get_deepcopy(self, key): + return self._cache[key].copy(deep=True) + + def get_cow(self, key): + return self._cache[key] 
# relying on CoW + + +@pytest.fixture +def sample_df(): + return pd.DataFrame({"A": [1, 2, 3], "B": [10, 20, 30]}) + + +def test_deepcopy_does_not_modify_cache(sample_df): + cache = CacheService() + cache.set("x", sample_df) + + df = cache.get_deepcopy("x") + df.loc[0, "A"] = 999 + + cached = cache._cache["x"] + + assert cached.loc[0, "A"] == 1 + assert df.loc[0, "A"] == 999 + + +def test_cow_does_not_modify_cache(sample_df): + pd.options.mode.copy_on_write = True + + cache = CacheService() + cache.set("x", sample_df) + + df = cache.get_cow("x") + df.loc[0, "A"] = 999 + + cached = cache._cache["x"] + + assert cached.loc[0, "A"] == 1 + assert df.loc[0, "A"] == 999 + + +def test_cow_shares_memory_before_write(sample_df): + pd.options.mode.copy_on_write = True + + cache = CacheService() + cache.set("x", sample_df) + + df = cache.get_cow("x") + + assert df is cache._cache["x"] + + +def test_cow_inplace_operation(sample_df): + pd.options.mode.copy_on_write = True + + cache = CacheService() + cache.set("x", sample_df) + + df = cache.get_cow("x") + + df["A"].fillna(0, inplace=True) + + cached = cache._cache["x"] + + assert cached.equals(sample_df) + + +def test_cow_chained_assignment(sample_df): + pd.options.mode.copy_on_write = True + + cache = CacheService() + cache.set("x", sample_df) + + df = cache.get_cow("x") + + df_slice = df[df["A"] > 1] + df_slice["A"] = 999 # chained assignment + + cached = cache._cache["x"] + + assert cached["A"].tolist() == [1, 2, 3] + + +def test_cow_numpy_view_can_break_isolation(sample_df): + pd.options.mode.copy_on_write = True + + cache = CacheService() + cache.set("x", sample_df) + + df = cache.get_cow("x") + + values = df["A"].values + values[0] = 999 # bypassing pandas + + cached = cache._cache["x"] + + # CoW didn't work + assert cached.loc[0, "A"] == 999 + + +def test_cow_object_dtype_mutation(): + pd.options.mode.copy_on_write = True + + df = pd.DataFrame({"A": [[1], [2], [3]]}) + + cache = CacheService() + cache.set("x", df) + + df2 
= cache.get_cow("x") + + df2.loc[0, "A"].append(999) # mutate nested + + cached = cache._cache["x"] + + # cache changed + assert cached.loc[0, "A"] == [1, 999] From 12b2dc859e9610d049f38849f602b4ef86a94ee6 Mon Sep 17 00:00:00 2001 From: alexfurmenkov Date: Mon, 13 Apr 2026 18:36:21 +0200 Subject: [PATCH 2/8] tested true CoW via shallow copy --- .../test_cache/test_immutable_cache.py | 23 +++---------------- 1 file changed, 3 insertions(+), 20 deletions(-) diff --git a/tests/unit/test_services/test_cache/test_immutable_cache.py b/tests/unit/test_services/test_cache/test_immutable_cache.py index 9b7f6b8bc..74b3a550a 100644 --- a/tests/unit/test_services/test_cache/test_immutable_cache.py +++ b/tests/unit/test_services/test_cache/test_immutable_cache.py @@ -13,7 +13,7 @@ def get_deepcopy(self, key): return self._cache[key].copy(deep=True) def get_cow(self, key): - return self._cache[key] # relying on CoW + return self._cache[key].copy(deep=False) # relying on CoW @pytest.fixture @@ -56,8 +56,8 @@ def test_cow_shares_memory_before_write(sample_df): cache.set("x", sample_df) df = cache.get_cow("x") - - assert df is cache._cache["x"] + # memory is shared between objects, but they are not the same object + assert df._mgr.blocks[0].values.base is cache._cache["x"]._mgr.blocks[0].values.base def test_cow_inplace_operation(sample_df): @@ -91,23 +91,6 @@ def test_cow_chained_assignment(sample_df): assert cached["A"].tolist() == [1, 2, 3] -def test_cow_numpy_view_can_break_isolation(sample_df): - pd.options.mode.copy_on_write = True - - cache = CacheService() - cache.set("x", sample_df) - - df = cache.get_cow("x") - - values = df["A"].values - values[0] = 999 # bypassing pandas - - cached = cache._cache["x"] - - # CoW didn't work - assert cached.loc[0, "A"] == 999 - - def test_cow_object_dtype_mutation(): pd.options.mode.copy_on_write = True From 7a4479d59e548d6feae921d52fdd1dcda7740c71 Mon Sep 17 00:00:00 2001 From: alexfurmenkov Date: Tue, 14 Apr 2026 12:36:36 +0200 Subject: 
[PATCH 3/8] added shallow copying for cached datasets --- .../services/cache/in_memory_cache_service.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/cdisc_rules_engine/services/cache/in_memory_cache_service.py b/cdisc_rules_engine/services/cache/in_memory_cache_service.py index cdc79b01c..331a85e83 100644 --- a/cdisc_rules_engine/services/cache/in_memory_cache_service.py +++ b/cdisc_rules_engine/services/cache/in_memory_cache_service.py @@ -5,7 +5,7 @@ from cdisc_rules_engine.interfaces import ( CacheServiceInterface, ) -from cdisc_rules_engine.models.dataset import DatasetInterface +from cdisc_rules_engine.models.dataset import DatasetInterface, PandasDataset from cachetools import LRUCache import psutil from multiprocessing import Lock @@ -66,7 +66,10 @@ def add_dataset(self, cache_key, data): self.dataset_cache[cache_key] = data def get_dataset(self, cache_key): - return self.dataset_cache.get(cache_key, None) + cached = self.dataset_cache.get(cache_key) + if isinstance(cached, PandasDataset): + cached.data = cached.data.copy(deep=False) + return cached def add_batch( self, @@ -82,7 +85,10 @@ def add_batch( self.add(prefix + cache_key, item) def get(self, cache_key): - return self.cache.get(cache_key, None) + cached = self.cache.get(cache_key) + if isinstance(cached, PandasDataset): + cached.data = cached.data.copy(deep=False) + return cached def get_all(self, cache_keys: List[str]): return [self.cache.get(key) for key in cache_keys] From 3fe3bb67c410b80fbbc8ebbdb98d7f527b7c1f17 Mon Sep 17 00:00:00 2001 From: alexfurmenkov Date: Wed, 15 Apr 2026 14:31:46 +0200 Subject: [PATCH 4/8] dask copy workaround --- cdisc_rules_engine/rules_engine.py | 3 +++ cdisc_rules_engine/services/cache/in_memory_cache_service.py | 5 +++-- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/cdisc_rules_engine/rules_engine.py b/cdisc_rules_engine/rules_engine.py index 96b8f48cc..b67b4a59f 100644 --- a/cdisc_rules_engine/rules_engine.py +++ 
b/cdisc_rules_engine/rules_engine.py @@ -34,6 +34,7 @@ DataServiceInterface, ) from cdisc_rules_engine.models.actions import COREActions +from cdisc_rules_engine.models.dataset import DaskDataset from cdisc_rules_engine.models.dataset.dataset_interface import DatasetInterface from cdisc_rules_engine.models.dataset_variable import DatasetVariable from cdisc_rules_engine.models.failed_validation_entity import FailedValidationEntity @@ -385,6 +386,8 @@ def execute_rule( ) rule_copy["conditions"].set_conditions(updated_conditions) # preprocess dataset + if isinstance(dataset, DaskDataset): + dataset = deepcopy(dataset) dataset_preprocessor = DatasetPreprocessor( dataset, dataset_metadata, self.data_service, self.cache ) diff --git a/cdisc_rules_engine/services/cache/in_memory_cache_service.py b/cdisc_rules_engine/services/cache/in_memory_cache_service.py index 331a85e83..0912db9a7 100644 --- a/cdisc_rules_engine/services/cache/in_memory_cache_service.py +++ b/cdisc_rules_engine/services/cache/in_memory_cache_service.py @@ -67,7 +67,8 @@ def add_dataset(self, cache_key, data): def get_dataset(self, cache_key): cached = self.dataset_cache.get(cache_key) - if isinstance(cached, PandasDataset): + # Adding DaskDataset will cause downstream issues since Dask does not support copy-on-write. 
+ if type(cached) is PandasDataset: cached.data = cached.data.copy(deep=False) return cached @@ -86,7 +87,7 @@ def add_batch( def get(self, cache_key): cached = self.cache.get(cache_key) - if isinstance(cached, PandasDataset): + if type(cached) is PandasDataset: cached.data = cached.data.copy(deep=False) return cached From 6676707c427a98ead914fb27fcec33edf55f5a5c Mon Sep 17 00:00:00 2001 From: alexfurmenkov Date: Mon, 27 Apr 2026 20:06:20 +0200 Subject: [PATCH 5/8] fix CoW tests and wrapper --- .../services/cache/in_memory_cache_service.py | 5 +- .../test_cache/test_immutable_cache.py | 138 +++++++++--------- 2 files changed, 69 insertions(+), 74 deletions(-) diff --git a/cdisc_rules_engine/services/cache/in_memory_cache_service.py b/cdisc_rules_engine/services/cache/in_memory_cache_service.py index 0912db9a7..d6aeb0970 100644 --- a/cdisc_rules_engine/services/cache/in_memory_cache_service.py +++ b/cdisc_rules_engine/services/cache/in_memory_cache_service.py @@ -67,9 +67,8 @@ def add_dataset(self, cache_key, data): def get_dataset(self, cache_key): cached = self.dataset_cache.get(cache_key) - # Adding DaskDataset will cause downstream issues since Dask does not support copy-on-write. 
if type(cached) is PandasDataset: - cached.data = cached.data.copy(deep=False) + return PandasDataset(cached.data.copy(deep=False)) return cached def add_batch( @@ -88,7 +87,7 @@ def add_batch( def get(self, cache_key): cached = self.cache.get(cache_key) if type(cached) is PandasDataset: - cached.data = cached.data.copy(deep=False) + return PandasDataset(cached.data.copy(deep=False)) return cached def get_all(self, cache_keys: List[str]): diff --git a/tests/unit/test_services/test_cache/test_immutable_cache.py b/tests/unit/test_services/test_cache/test_immutable_cache.py index 74b3a550a..5e0231aa9 100644 --- a/tests/unit/test_services/test_cache/test_immutable_cache.py +++ b/tests/unit/test_services/test_cache/test_immutable_cache.py @@ -1,109 +1,105 @@ import pandas as pd import pytest +from cdisc_rules_engine.models.dataset.pandas_dataset import PandasDataset +from cdisc_rules_engine.services.cache.in_memory_cache_service import ( + InMemoryCacheService, +) -class CacheService: - def __init__(self): - self._cache = {} - def set(self, key, df): - self._cache[key] = df - - def get_deepcopy(self, key): - return self._cache[key].copy(deep=True) - - def get_cow(self, key): - return self._cache[key].copy(deep=False) # relying on CoW +@pytest.fixture(autouse=True) +def reset_singleton(): + InMemoryCacheService._instance = None + yield + InMemoryCacheService._instance = None @pytest.fixture -def sample_df(): - return pd.DataFrame({"A": [1, 2, 3], "B": [10, 20, 30]}) - - -def test_deepcopy_does_not_modify_cache(sample_df): - cache = CacheService() - cache.set("x", sample_df) - - df = cache.get_deepcopy("x") - df.loc[0, "A"] = 999 +def cache(): + return InMemoryCacheService() - cached = cache._cache["x"] - - assert cached.loc[0, "A"] == 1 - assert df.loc[0, "A"] == 999 - - -def test_cow_does_not_modify_cache(sample_df): - pd.options.mode.copy_on_write = True - cache = CacheService() - cache.set("x", sample_df) - - df = cache.get_cow("x") - df.loc[0, "A"] = 999 
+@pytest.fixture +def sample_dataset(): + return PandasDataset(pd.DataFrame({"A": [1, 2, 3], "B": [10, 20, 30]})) - cached = cache._cache["x"] - assert cached.loc[0, "A"] == 1 - assert df.loc[0, "A"] == 999 +def test_get_returns_cow_copy(cache, sample_dataset): + cache.add("x", sample_dataset) + result = cache.get("x") + assert result is not sample_dataset + assert result.data is not sample_dataset.data -def test_cow_shares_memory_before_write(sample_df): +def test_get_cow_does_not_modify_cache_on_write(cache, sample_dataset): pd.options.mode.copy_on_write = True + cache.add("x", sample_dataset) - cache = CacheService() - cache.set("x", sample_df) + retrieved = cache.get("x") + retrieved.data.loc[0, "A"] = 999 - df = cache.get_cow("x") - # memory is shared between objects, but they are not the same object - assert df._mgr.blocks[0].values.base is cache._cache["x"]._mgr.blocks[0].values.base + cached_data = cache.cache["x"].data + assert cached_data.loc[0, "A"] == 1 -def test_cow_inplace_operation(sample_df): +def test_get_cow_shares_memory_before_write(cache, sample_dataset): pd.options.mode.copy_on_write = True + cache.add("x", sample_dataset) + retrieved = cache.get("x") + import numpy as np - cache = CacheService() - cache.set("x", sample_df) - - df = cache.get_cow("x") - - df["A"].fillna(0, inplace=True) + assert np.shares_memory(retrieved.data["A"], cache.cache["x"].data["A"]) - cached = cache._cache["x"] - assert cached.equals(sample_df) +def test_get_dataset_returns_cow_copy(cache, sample_dataset): + cache.add_dataset("x", sample_dataset) + result = cache.get_dataset("x") + assert result is not sample_dataset + assert result.data is not sample_dataset.data -def test_cow_chained_assignment(sample_df): +def test_get_dataset_cow_does_not_modify_cache_on_write(cache, sample_dataset): pd.options.mode.copy_on_write = True + cache.add_dataset("x", sample_dataset) - cache = CacheService() - cache.set("x", sample_df) + retrieved = cache.get_dataset("x") + 
retrieved.data.loc[0, "A"] = 999 - df = cache.get_cow("x") + cached_data = cache.dataset_cache["x"].data + assert cached_data.loc[0, "A"] == 1 - df_slice = df[df["A"] > 1] - df_slice["A"] = 999 # chained assignment - cached = cache._cache["x"] - - assert cached["A"].tolist() == [1, 2, 3] +def test_get_object_dtype_nested_mutation_affects_cache(cache): + """CoW can't protect in nested mutations""" + df = pd.DataFrame({"A": [[1], [2], [3]]}) + dataset = PandasDataset(df) + cache.add("x", dataset) + retrieved = cache.get("x") + retrieved.data.loc[0, "A"].append(999) -def test_cow_object_dtype_mutation(): - pd.options.mode.copy_on_write = True + cached_data = cache.cache["x"].data + assert cached_data.loc[0, "A"] == [1, 999] - df = pd.DataFrame({"A": [[1], [2], [3]]}) - cache = CacheService() - cache.set("x", df) +def test_get_non_dataset_returns_as_is(cache): + cache.add("key", {"some": "dict"}) + result = cache.get("key") + assert result == {"some": "dict"} - df2 = cache.get_cow("x") - df2.loc[0, "A"].append(999) # mutate nested +def test_get_returns_new_wrapper_not_cached_object(cache, sample_dataset): + """get() должен возвращать новый PandasDataset, а не сам объект из кэша.""" + cache.add("x", sample_dataset) + result = cache.get("x") + assert result is not cache.cache["x"] # новый wrapper + assert ( + result.data is not cache.cache["x"].data + ) # новый pd.DataFrame объект (shallow copy) - cached = cache._cache["x"] - # cache changed - assert cached.loc[0, "A"] == [1, 999] +def test_get_dataset_returns_new_wrapper_not_cached_object(cache, sample_dataset): + cache.add_dataset("x", sample_dataset) + result = cache.get_dataset("x") + assert result is not cache.dataset_cache["x"] + assert result.data is not cache.dataset_cache["x"].data From 6940e4b21fbadb3886580c73d08815f757816f89 Mon Sep 17 00:00:00 2001 From: alexfurmenkov Date: Tue, 28 Apr 2026 12:40:23 +0200 Subject: [PATCH 6/8] added tests for cache methods. 
changed cache access to get() and get_dataset() methods --- .../services/cache/in_memory_cache_service.py | 8 +- .../test_cache/test_immutable_cache.py | 277 +++++++++++++----- 2 files changed, 202 insertions(+), 83 deletions(-) diff --git a/cdisc_rules_engine/services/cache/in_memory_cache_service.py b/cdisc_rules_engine/services/cache/in_memory_cache_service.py index d6aeb0970..1bb005aeb 100644 --- a/cdisc_rules_engine/services/cache/in_memory_cache_service.py +++ b/cdisc_rules_engine/services/cache/in_memory_cache_service.py @@ -91,24 +91,24 @@ def get(self, cache_key): return cached def get_all(self, cache_keys: List[str]): - return [self.cache.get(key) for key in cache_keys] + return [self.get(key) for key in cache_keys] def get_all_by_prefix(self, prefix): items = [] for key in self.cache: if key.startswith(prefix): - items.append(self.cache[key]) + items.append(self.get(key)) return items def dataset_keys(self): return self.dataset_cache.keys() def filter_cache(self, prefix: str) -> dict: - return {k: self.cache[k] for k in self.cache.keys() if k.startswith(prefix)} + return {k: self.cache.get(k) for k in self.cache.keys() if k.startswith(prefix)} def get_by_regex(self, regex: str) -> dict: regex = regex.replace("*", ".*") - return {k: self.cache[k] for k in self.cache.keys() if re.search(regex, k)} + return {k: self.get(k) for k in self.cache.keys() if re.search(regex, k)} def exists(self, cache_key): return cache_key in self.cache diff --git a/tests/unit/test_services/test_cache/test_immutable_cache.py b/tests/unit/test_services/test_cache/test_immutable_cache.py index 5e0231aa9..1846fe135 100644 --- a/tests/unit/test_services/test_cache/test_immutable_cache.py +++ b/tests/unit/test_services/test_cache/test_immutable_cache.py @@ -1,3 +1,4 @@ +import numpy as np import pandas as pd import pytest @@ -24,82 +25,200 @@ def sample_dataset(): return PandasDataset(pd.DataFrame({"A": [1, 2, 3], "B": [10, 20, 30]})) -def test_get_returns_cow_copy(cache, 
sample_dataset): - cache.add("x", sample_dataset) - result = cache.get("x") - assert result is not sample_dataset - assert result.data is not sample_dataset.data - - -def test_get_cow_does_not_modify_cache_on_write(cache, sample_dataset): - pd.options.mode.copy_on_write = True - cache.add("x", sample_dataset) - - retrieved = cache.get("x") - retrieved.data.loc[0, "A"] = 999 - - cached_data = cache.cache["x"].data - assert cached_data.loc[0, "A"] == 1 - - -def test_get_cow_shares_memory_before_write(cache, sample_dataset): - pd.options.mode.copy_on_write = True - cache.add("x", sample_dataset) - retrieved = cache.get("x") - import numpy as np - - assert np.shares_memory(retrieved.data["A"], cache.cache["x"].data["A"]) - - -def test_get_dataset_returns_cow_copy(cache, sample_dataset): - cache.add_dataset("x", sample_dataset) - result = cache.get_dataset("x") - assert result is not sample_dataset - assert result.data is not sample_dataset.data - - -def test_get_dataset_cow_does_not_modify_cache_on_write(cache, sample_dataset): - pd.options.mode.copy_on_write = True - cache.add_dataset("x", sample_dataset) - - retrieved = cache.get_dataset("x") - retrieved.data.loc[0, "A"] = 999 - - cached_data = cache.dataset_cache["x"].data - assert cached_data.loc[0, "A"] == 1 - - -def test_get_object_dtype_nested_mutation_affects_cache(cache): - """CoW can't protect in nested mutations""" - df = pd.DataFrame({"A": [[1], [2], [3]]}) - dataset = PandasDataset(df) - cache.add("x", dataset) - - retrieved = cache.get("x") - retrieved.data.loc[0, "A"].append(999) - - cached_data = cache.cache["x"].data - assert cached_data.loc[0, "A"] == [1, 999] - - -def test_get_non_dataset_returns_as_is(cache): - cache.add("key", {"some": "dict"}) - result = cache.get("key") - assert result == {"some": "dict"} - - -def test_get_returns_new_wrapper_not_cached_object(cache, sample_dataset): - """get() должен возвращать новый PandasDataset, а не сам объект из кэша.""" - cache.add("x", sample_dataset) - 
result = cache.get("x") - assert result is not cache.cache["x"] # новый wrapper - assert ( - result.data is not cache.cache["x"].data - ) # новый pd.DataFrame объект (shallow copy) - - -def test_get_dataset_returns_new_wrapper_not_cached_object(cache, sample_dataset): - cache.add_dataset("x", sample_dataset) - result = cache.get_dataset("x") - assert result is not cache.dataset_cache["x"] - assert result.data is not cache.dataset_cache["x"].data +class TestGet: + def test_returns_new_wrapper_not_cached_object(self, cache, sample_dataset): + cache.add("x", sample_dataset) + result = cache.get("x") + assert result is not cache.cache["x"] + assert result.data is not cache.cache["x"].data + + def test_cow_does_not_modify_cache_on_write(self, cache, sample_dataset): + pd.options.mode.copy_on_write = True + cache.add("x", sample_dataset) + retrieved = cache.get("x") + retrieved.data.loc[0, "A"] = 999 + assert cache.cache["x"].data.loc[0, "A"] == 1 + + def test_shares_memory_before_write(self, cache, sample_dataset): + pd.options.mode.copy_on_write = True + cache.add("x", sample_dataset) + retrieved = cache.get("x") + assert np.shares_memory(retrieved.data["A"], cache.cache["x"].data["A"]) + + def test_add_rows_does_not_affect_cache(self, cache, sample_dataset): + pd.options.mode.copy_on_write = True + cache.add("x", sample_dataset) + retrieved = cache.get("x") + retrieved.data = pd.concat( + [retrieved.data, pd.DataFrame({"A": [999], "B": [999]})], + ignore_index=True, + ) + assert len(cache.cache["x"].data) == 3 + assert len(retrieved.data) == 4 + + def test_drop_rows_does_not_affect_cache(self, cache, sample_dataset): + pd.options.mode.copy_on_write = True + cache.add("x", sample_dataset) + retrieved = cache.get("x") + retrieved.data = retrieved.data.drop(index=0).reset_index(drop=True) + assert len(cache.cache["x"].data) == 3 + assert len(retrieved.data) == 2 + + def test_filter_rows_does_not_affect_cache(self, cache, sample_dataset): + pd.options.mode.copy_on_write = 
True + cache.add("x", sample_dataset) + retrieved = cache.get("x") + retrieved.data = retrieved.data[retrieved.data["A"] > 1].reset_index(drop=True) + assert len(cache.cache["x"].data) == 3 + assert cache.cache["x"].data["A"].tolist() == [1, 2, 3] + + def test_multiple_gets_are_independent(self, cache, sample_dataset): + pd.options.mode.copy_on_write = True + cache.add("x", sample_dataset) + first = cache.get("x") + second = cache.get("x") + first.data = first.data.drop(index=0).reset_index(drop=True) + assert len(second.data) == 3 + assert len(cache.cache["x"].data) == 3 + + def test_non_dataset_returns_as_is(self, cache): + cache.add("key", {"some": "dict"}) + assert cache.get("key") == {"some": "dict"} + + def test_object_dtype_nested_mutation_affects_cache(self, cache): + """CoW can't protect in nested mutations""" + df = pd.DataFrame({"A": [[1], [2], [3]]}) + cache.add("x", PandasDataset(df)) + retrieved = cache.get("x") + retrieved.data.loc[0, "A"].append(999) + assert cache.cache["x"].data.loc[0, "A"] == [1, 999] + + +class TestGetDataset: + def test_returns_new_wrapper_not_cached_object(self, cache, sample_dataset): + cache.add_dataset("x", sample_dataset) + result = cache.get_dataset("x") + assert result is not cache.dataset_cache["x"] + assert result.data is not cache.dataset_cache["x"].data + + def test_cow_does_not_modify_cache_on_write(self, cache, sample_dataset): + pd.options.mode.copy_on_write = True + cache.add_dataset("x", sample_dataset) + retrieved = cache.get_dataset("x") + retrieved.data.loc[0, "A"] = 999 + assert cache.dataset_cache["x"].data.loc[0, "A"] == 1 + + def test_add_rows_does_not_affect_cache(self, cache, sample_dataset): + pd.options.mode.copy_on_write = True + cache.add_dataset("x", sample_dataset) + retrieved = cache.get_dataset("x") + retrieved.data = pd.concat( + [retrieved.data, pd.DataFrame({"A": [999], "B": [999]})], + ignore_index=True, + ) + assert len(cache.dataset_cache["x"].data) == 3 + assert len(retrieved.data) == 4 + 
+ def test_drop_rows_does_not_affect_cache(self, cache, sample_dataset): + pd.options.mode.copy_on_write = True + cache.add_dataset("x", sample_dataset) + retrieved = cache.get_dataset("x") + retrieved.data = retrieved.data.drop(index=0).reset_index(drop=True) + assert len(cache.dataset_cache["x"].data) == 3 + assert len(retrieved.data) == 2 + + +class TestGetAll: + def test_returns_new_wrappers(self, cache, sample_dataset): + cache.add("x", sample_dataset) + cache.add("y", sample_dataset) + results = cache.get_all(["x", "y"]) + assert all(r is not cache.cache["x"] for r in results) + assert all(r.data is not cache.cache["x"].data for r in results) + + def test_results_are_independent(self, cache, sample_dataset): + pd.options.mode.copy_on_write = True + cache.add("x", sample_dataset) + cache.add("y", sample_dataset) + first, second = cache.get_all(["x", "y"]) + first.data = first.data.drop(index=0).reset_index(drop=True) + assert len(second.data) == 3 + assert len(cache.cache["x"].data) == 3 + + def test_cow_does_not_modify_cache_on_write(self, cache, sample_dataset): + pd.options.mode.copy_on_write = True + cache.add("x", sample_dataset) + results = cache.get_all(["x"]) + results[0].data.loc[0, "A"] = 999 + assert cache.cache["x"].data.loc[0, "A"] == 1 + + def test_missing_key_returns_none(self, cache): + assert cache.get_all(["missing"]) == [None] + + +class TestGetAllByPrefix: + def test_returns_only_matching_keys(self, cache, sample_dataset): + cache.add("ds/ae", sample_dataset) + cache.add("ds/lb", sample_dataset) + cache.add("other/ae", sample_dataset) + results = cache.get_all_by_prefix("ds/") + assert len(results) == 2 + + def test_returns_new_wrappers(self, cache, sample_dataset): + cache.add("ds/ae", sample_dataset) + results = cache.get_all_by_prefix("ds/") + assert results[0] is not cache.cache["ds/ae"] + assert results[0].data is not cache.cache["ds/ae"].data + + def test_cow_does_not_modify_cache_on_write(self, cache, sample_dataset): + 
pd.options.mode.copy_on_write = True + cache.add("ds/ae", sample_dataset) + results = cache.get_all_by_prefix("ds/") + results[0].data.loc[0, "A"] = 999 + assert cache.cache["ds/ae"].data.loc[0, "A"] == 1 + + def test_drop_rows_does_not_affect_cache(self, cache, sample_dataset): + pd.options.mode.copy_on_write = True + cache.add("ds/ae", sample_dataset) + results = cache.get_all_by_prefix("ds/") + results[0].data = results[0].data.drop(index=0).reset_index(drop=True) + assert len(cache.cache["ds/ae"].data) == 3 + + def test_no_match_returns_empty(self, cache, sample_dataset): + cache.add("ds/ae", sample_dataset) + assert cache.get_all_by_prefix("other/") == [] + + +class TestGetByRegex: + def test_returns_matching_keys(self, cache, sample_dataset): + cache.add("ae_data", sample_dataset) + cache.add("lb_data", sample_dataset) + cache.add("ae_meta", sample_dataset) + result = cache.get_by_regex("ae_*") + assert set(result.keys()) == {"ae_data", "ae_meta"} + + def test_returns_new_wrappers(self, cache, sample_dataset): + cache.add("ae_data", sample_dataset) + result = cache.get_by_regex("ae_*") + assert result["ae_data"] is not cache.cache["ae_data"] + assert result["ae_data"].data is not cache.cache["ae_data"].data + + def test_cow_does_not_modify_cache_on_write(self, cache, sample_dataset): + pd.options.mode.copy_on_write = True + cache.add("ae_data", sample_dataset) + result = cache.get_by_regex("ae_*") + result["ae_data"].data.loc[0, "A"] = 999 + assert cache.cache["ae_data"].data.loc[0, "A"] == 1 + + def test_drop_rows_does_not_affect_cache(self, cache, sample_dataset): + pd.options.mode.copy_on_write = True + cache.add("ae_data", sample_dataset) + result = cache.get_by_regex("ae_*") + result["ae_data"].data = ( + result["ae_data"].data.drop(index=0).reset_index(drop=True) + ) + assert len(cache.cache["ae_data"].data) == 3 + + def test_no_match_returns_empty_dict(self, cache, sample_dataset): + cache.add("ae_data", sample_dataset) + assert 
cache.get_by_regex("lb_*") == {} From d06d875cd96344512f8cf93823415bcc57deeb55 Mon Sep 17 00:00:00 2001 From: alexfurmenkov Date: Tue, 28 Apr 2026 12:46:06 +0200 Subject: [PATCH 7/8] readme notice about CoW usage --- README.md | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/README.md b/README.md index 4340249dc..8dd3fdab1 100644 --- a/README.md +++ b/README.md @@ -118,6 +118,11 @@ All examples below use `python core.py` for source code users. **If you're using ### Running a validation (`validate`) +Pandas Copy-on-Write (CoW) is enabled globally when using the rules engine. +In Pandas 2.x, CoW is an opt-in, process-wide setting, so enabling it affects all pandas code running in the same process. + +Note: in Pandas 3.x, CoW is enabled by default, so this behavior will become standard once the project is upgraded. + Clone the repository and run: ```bash From d14e859eb45f442b669484ba11408f7b1e25b260 Mon Sep 17 00:00:00 2001 From: alexfurmenkov Date: Wed, 29 Apr 2026 12:55:33 +0200 Subject: [PATCH 8/8] fix filter_cache access to cache --- cdisc_rules_engine/services/cache/in_memory_cache_service.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cdisc_rules_engine/services/cache/in_memory_cache_service.py b/cdisc_rules_engine/services/cache/in_memory_cache_service.py index 1bb005aeb..9f36280ab 100644 --- a/cdisc_rules_engine/services/cache/in_memory_cache_service.py +++ b/cdisc_rules_engine/services/cache/in_memory_cache_service.py @@ -104,7 +104,7 @@ def dataset_keys(self): return self.dataset_cache.keys() def filter_cache(self, prefix: str) -> dict: - return {k: self.cache.get(k) for k in self.cache.keys() if k.startswith(prefix)} + return {k: self.get(k) for k in self.cache.keys() if k.startswith(prefix)} def get_by_regex(self, regex: str) -> dict: regex = regex.replace("*", ".*")