From e1c732a209f1939fea49e29e0685d5c26adeea44 Mon Sep 17 00:00:00 2001 From: alexfurmenkov Date: Wed, 8 Apr 2026 16:24:07 +0200 Subject: [PATCH 01/14] added cached_dataset decorator to get_dataset --- cdisc_rules_engine/dataset_builders/base_dataset_builder.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/cdisc_rules_engine/dataset_builders/base_dataset_builder.py b/cdisc_rules_engine/dataset_builders/base_dataset_builder.py index 8aa2957d2..410549d61 100644 --- a/cdisc_rules_engine/dataset_builders/base_dataset_builder.py +++ b/cdisc_rules_engine/dataset_builders/base_dataset_builder.py @@ -2,6 +2,7 @@ from cdisc_rules_engine.models.library_metadata_container import ( LibraryMetadataContainer, ) +from cdisc_rules_engine.services.data_services import cached_dataset from cdisc_rules_engine.services.define_xml.define_xml_reader_factory import ( DefineXMLReaderFactory, ) @@ -36,6 +37,7 @@ def __init__( ): self.data_service = data_service self.cache = cache_service + self.cache_service = cache_service self.data_processor = data_processor self.rule_processor = rule_processor self.dataset_path = dataset_path @@ -69,6 +71,7 @@ def build_split_datasets(self, dataset_name, **kwargs) -> DatasetInterface: finally: self.dataset_path = original_path + @cached_dataset("contents") def get_dataset(self, **kwargs): # If validating dataset content, ensure split datasets are handled. if self.dataset_metadata.is_split: From b9a9b14020d46ef3d699368ecac708b52d82dcf3 Mon Sep 17 00:00:00 2001 From: alexfurmenkov Date: Thu, 9 Apr 2026 17:52:49 +0200 Subject: [PATCH 02/14] added per-builder dataset caching on rule validation. --- cdisc_rules_engine/dataset_builders/base_dataset_builder.py | 4 ++-- cdisc_rules_engine/rules_engine.py | 6 +++++- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/cdisc_rules_engine/dataset_builders/base_dataset_builder.py b/cdisc_rules_engine/dataset_builders/base_dataset_builder.py index 410549d61..9986f691a 100644 --- a/cdisc_rules_engine/dataset_builders/base_dataset_builder.py +++ b/cdisc_rules_engine/dataset_builders/base_dataset_builder.py @@ -2,10 +2,10 @@ from cdisc_rules_engine.models.library_metadata_container import ( LibraryMetadataContainer, ) -from cdisc_rules_engine.services.data_services import cached_dataset from cdisc_rules_engine.services.define_xml.define_xml_reader_factory import ( DefineXMLReaderFactory, ) +from cdisc_rules_engine.utilities.decorators import cached from cdisc_rules_engine.utilities.sdtm_utilities import get_corresponding_datasets from cdisc_rules_engine.utilities.sdtm_utilities import ( tag_source, @@ -71,7 +71,7 @@ def build_split_datasets(self, dataset_name, **kwargs) -> DatasetInterface: finally: self.dataset_path = original_path - @cached_dataset("contents") + @cached("get_dataset") def get_dataset(self, **kwargs): # If validating dataset content, ensure split datasets are handled. if self.dataset_metadata.is_split: diff --git a/cdisc_rules_engine/rules_engine.py b/cdisc_rules_engine/rules_engine.py index 33f371800..d1a8bc1a0 100644 --- a/cdisc_rules_engine/rules_engine.py +++ b/cdisc_rules_engine/rules_engine.py @@ -326,7 +326,11 @@ def validate_rule( kwargs = {} builder = self.get_dataset_builder(rule, datasets, dataset_metadata) try: - dataset = builder.get_dataset() + dataset = builder.get_dataset( + domain_name=dataset_metadata.domain, + dataset_name=dataset_metadata.name, + name=builder.__class__.__name__, + ) except Exception as e: raise DatasetBuilderError( f"Failed to build dataset for rule validation. " From c2b36b6f89a55dd44950117d0dc8d0a1d073d42b Mon Sep 17 00:00:00 2001 From: alexfurmenkov Date: Fri, 10 Apr 2026 12:16:12 +0200 Subject: [PATCH 03/14] cache tests --- .../test_data_service/test_data_service.py | 70 +++++++++++++++++++ 1 file changed, 70 insertions(+) diff --git a/tests/unit/test_services/test_data_service/test_data_service.py b/tests/unit/test_services/test_data_service/test_data_service.py index 37da3e277..a5e794a60 100644 --- a/tests/unit/test_services/test_data_service/test_data_service.py +++ b/tests/unit/test_services/test_data_service/test_data_service.py @@ -9,6 +9,7 @@ from cdisc_rules_engine.models.sdtm_dataset_metadata import SDTMDatasetMetadata from cdisc_rules_engine.models.dataset_types import DatasetTypes from cdisc_rules_engine.services.data_services import cached_dataset, LocalDataService +from cdisc_rules_engine.utilities.decorators import cached from cdisc_rules_engine.utilities.utils import get_dataset_cache_key_from_path from cdisc_rules_engine.constants.classes import ( FINDINGS, @@ -374,3 +375,72 @@ def to_be_decorated(instance, dataset_name: str): test_dataset_name, DatasetTypes.CONTENTS.value ): test_df }, "New dataset was not added to the cache" + + +def test_cached_cache_hit_call_count(): + cache_service = MagicMock() + expected_data = PandasDataset() + cache_service.get.return_value = expected_data + + instance = MagicMock() + instance.cache_service = cache_service + instance.name = "Builder" + + func_mock = MagicMock(return_value=PandasDataset()) + + @cached("get_dataset") + def func(self, **kwargs): + return func_mock() + + result = func(instance, dataset_name="ae.xpt", name="Builder", domain_name="AE") + + assert result is expected_data + assert func_mock.call_count == 0 # 🔥 ключевая проверка + + +def test_cached_cache_miss_call_count(): + cache_service = MagicMock() + cache_service.get.return_value = None + + instance = MagicMock() + instance.cache_service = cache_service + + func_mock = MagicMock(return_value=PandasDataset()) + + @cached("get_dataset") + def func(self, **kwargs): + return func_mock() + + func(instance, dataset_name="ae.xpt") + + assert func_mock.call_count == 1 + + +def test_cached_different_builders_have_different_cache(): + cache_service = MagicMock() + cache_service.get.return_value = None + + func_mock = MagicMock(side_effect=[PandasDataset(), PandasDataset()]) + + @cached("get_dataset") + def func(self, **kwargs): + return func_mock() + + # builder 1 + instance1 = MagicMock() + instance1.cache_service = cache_service + instance1.name = "BuilderA" + + # builder 2 + instance2 = MagicMock() + instance2.cache_service = cache_service + instance2.name = "BuilderB" + + func(instance1, dataset_name="ae.xpt", domain_name="AE") + func(instance2, dataset_name="ae.xpt", domain_name="AE") + + assert func_mock.call_count == 2 + assert cache_service.add.call_count == 2 + keys = [call[0][0] for call in cache_service.get.call_args_list] + + assert keys[0] != keys[1] From 68643b362bcc3895c6f02483ec516d4ab71a4d63 Mon Sep 17 00:00:00 2001 From: alexfurmenkov Date: Sun, 12 Apr 2026 20:54:13 +0200 Subject: [PATCH 04/14] fix for tests -- RulesEngine cache is cleared before each test --- cdisc_rules_engine/dataset_builders/base_dataset_builder.py | 3 +++ tests/unit/test_rules_engine.py | 5 +++++ 2 files changed, 8 insertions(+) diff --git a/cdisc_rules_engine/dataset_builders/base_dataset_builder.py b/cdisc_rules_engine/dataset_builders/base_dataset_builder.py index 9986f691a..1674668aa 100644 --- a/cdisc_rules_engine/dataset_builders/base_dataset_builder.py +++ b/cdisc_rules_engine/dataset_builders/base_dataset_builder.py @@ -77,6 +77,9 @@ def get_dataset(self, **kwargs): if self.dataset_metadata.is_split: # Handle split datasets for content checks. # A content check is any check that is not in the list of rule types + kwargs.pop( + "dataset_name", None + ) # to prevent downstream duplicate keys error dataset: DatasetInterface = self.data_service.concat_split_datasets( func_to_call=self.build_split_datasets, datasets_metadata=get_corresponding_datasets( diff --git a/tests/unit/test_rules_engine.py b/tests/unit/test_rules_engine.py index 5de91eb06..2d0b5757f 100644 --- a/tests/unit/test_rules_engine.py +++ b/tests/unit/test_rules_engine.py @@ -26,6 +26,11 @@ from cdisc_rules_engine.models.dataset import PandasDataset +@pytest.fixture(autouse=True) +def clear_lru_cache(): + RulesEngine(standard="sdtmig").cache.clear_all() + + def test_get_schema(): schema = RulesEngine().get_schema() assert "variables" in schema From 96f86379620b16cd993aaee083e86743192c87b6 Mon Sep 17 00:00:00 2001 From: alexfurmenkov Date: Mon, 13 Apr 2026 18:17:27 +0200 Subject: [PATCH 05/14] added fields for caching to BaseDatasetBuilder --- .../dataset_builders/base_dataset_builder.py | 8 ++++++++ cdisc_rules_engine/rules_engine.py | 6 +----- tests/unit/test_usdm_data.py | 1 + 3 files changed, 10 insertions(+), 5 deletions(-) diff --git a/cdisc_rules_engine/dataset_builders/base_dataset_builder.py b/cdisc_rules_engine/dataset_builders/base_dataset_builder.py index 1674668aa..79e5ea61d 100644 --- a/cdisc_rules_engine/dataset_builders/base_dataset_builder.py +++ b/cdisc_rules_engine/dataset_builders/base_dataset_builder.py @@ -50,6 +50,14 @@ def __init__( self.standard_substandard = standard_substandard self.library_metadata = library_metadata self.dataset_implementation = self.data_service.dataset_implementation + if isinstance(dataset_metadata, SDTMDatasetMetadata): + self.domain = ( + f"SUPP{dataset_metadata.rdomain}" + if dataset_metadata.rdomain + else dataset_metadata.domain + ) + self.dataset_name = dataset_metadata.name + self.name = self.__class__.__name__ @abstractmethod def build(self) -> DatasetInterface: diff --git a/cdisc_rules_engine/rules_engine.py b/cdisc_rules_engine/rules_engine.py index d1a8bc1a0..33f371800 100644 --- a/cdisc_rules_engine/rules_engine.py +++ b/cdisc_rules_engine/rules_engine.py @@ -326,11 +326,7 @@ def validate_rule( kwargs = {} builder = self.get_dataset_builder(rule, datasets, dataset_metadata) try: - dataset = builder.get_dataset( - domain_name=dataset_metadata.domain, - dataset_name=dataset_metadata.name, - name=builder.__class__.__name__, - ) + dataset = builder.get_dataset() except Exception as e: raise DatasetBuilderError( f"Failed to build dataset for rule validation. " diff --git a/tests/unit/test_usdm_data.py b/tests/unit/test_usdm_data.py index c2461b0e9..1275e302a 100644 --- a/tests/unit/test_usdm_data.py +++ b/tests/unit/test_usdm_data.py @@ -93,6 +93,7 @@ def test_validate_rule_single_dataset_check(dataset_rule_greater_than: dict): "cdisc_rules_engine.services.data_services.USDMDataService.get_dataset", return_value=dataset_mock, ): + RulesEngine(standard="sdtmig").cache.clear_all() validation_result: List[dict] = RulesEngine( standard="usdm", dataset_paths=[dataset_path] ).validate_single_dataset( From d008108a8f99a4b6ffbfdfa835270de9d5feda9c Mon Sep 17 00:00:00 2001 From: alexfurmenkov Date: Tue, 14 Apr 2026 11:02:17 +0200 Subject: [PATCH 06/14] remove comment --- tests/unit/test_services/test_data_service/test_data_service.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/unit/test_services/test_data_service/test_data_service.py b/tests/unit/test_services/test_data_service/test_data_service.py index a5e794a60..debfc3829 100644 --- a/tests/unit/test_services/test_data_service/test_data_service.py +++ b/tests/unit/test_services/test_data_service/test_data_service.py @@ -395,7 +395,7 @@ def func(self, **kwargs): result = func(instance, dataset_name="ae.xpt", name="Builder", domain_name="AE") assert result is expected_data - assert func_mock.call_count == 0 # 🔥 ключевая проверка + assert func_mock.call_count == 0 def test_cached_cache_miss_call_count(): From e439a89c85bb7da6c741d32902acb01a7a471d84 Mon Sep 17 00:00:00 2001 From: alexfurmenkov Date: Tue, 14 Apr 2026 12:45:08 +0200 Subject: [PATCH 07/14] removed unnecessary kwargs pop call --- cdisc_rules_engine/dataset_builders/base_dataset_builder.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/cdisc_rules_engine/dataset_builders/base_dataset_builder.py b/cdisc_rules_engine/dataset_builders/base_dataset_builder.py index 79e5ea61d..a61d96452 100644 --- a/cdisc_rules_engine/dataset_builders/base_dataset_builder.py +++ b/cdisc_rules_engine/dataset_builders/base_dataset_builder.py @@ -85,9 +85,6 @@ def get_dataset(self, **kwargs): if self.dataset_metadata.is_split: # Handle split datasets for content checks. # A content check is any check that is not in the list of rule types - kwargs.pop( - "dataset_name", None - ) # to prevent downstream duplicate keys error dataset: DatasetInterface = self.data_service.concat_split_datasets( func_to_call=self.build_split_datasets, datasets_metadata=get_corresponding_datasets( From 62f649c2379be618a2e3ca01f8eca6bebc9bc824 Mon Sep 17 00:00:00 2001 From: alexfurmenkov Date: Wed, 15 Apr 2026 16:22:51 +0200 Subject: [PATCH 08/14] test for ignored kwargs in split dataset build --- .../test_base_dataset_builder.py | 27 +++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/tests/unit/test_dataset_builders/test_base_dataset_builder.py b/tests/unit/test_dataset_builders/test_base_dataset_builder.py index 92936891b..1269f3c0c 100644 --- a/tests/unit/test_dataset_builders/test_base_dataset_builder.py +++ b/tests/unit/test_dataset_builders/test_base_dataset_builder.py @@ -232,3 +232,30 @@ def test_get_define_xml_variables_metadata_domain_not_found( match=r"name=AE, domain=AE is not found in Define XML", ): builder.get_define_xml_variables_metadata() + + +def test_get_dataset_kwargs_are_ignored_in_cache(monkeypatch): + # Reset singleton cache to isolate test + from cdisc_rules_engine.services.cache import InMemoryCacheService + + InMemoryCacheService._instance = None + cache_service = InMemoryCacheService.get_instance() + + dataset_metadata = MagicMock() + dataset_metadata.is_split = True + dataset_metadata.full_path = "dummy_path" + + builder = create_builder_instance(dataset_metadata) + builder.cache_service = cache_service + + def fake_concat(*args, **kwargs): + return {"value": kwargs.get("param")} + + builder.data_service.concat_split_datasets = MagicMock(side_effect=fake_concat) + + result1 = builder.get_dataset(dataset_name="test", param=1) + result2 = builder.get_dataset(dataset_name="test", param=2) + + assert result1 == result2 + + assert builder.data_service.concat_split_datasets.call_count == 1 From 252cb8ba56b0c46611d4d0577197afde819f73af Mon Sep 17 00:00:00 2001 From: alexfurmenkov Date: Thu, 16 Apr 2026 20:46:29 +0200 Subject: [PATCH 09/14] added cached decorator to JSONataDatasetBuilder --- .../jsonata_dataset_builder.py | 2 + .../test_jsonata_dataset_bilder.py | 52 +++++++++++++++++++ 2 files changed, 54 insertions(+) create mode 100644 tests/unit/test_dataset_builders/test_jsonata_dataset_bilder.py diff --git a/cdisc_rules_engine/dataset_builders/jsonata_dataset_builder.py b/cdisc_rules_engine/dataset_builders/jsonata_dataset_builder.py index e1041a79f..a50094c1b 100644 --- a/cdisc_rules_engine/dataset_builders/jsonata_dataset_builder.py +++ b/cdisc_rules_engine/dataset_builders/jsonata_dataset_builder.py @@ -1,5 +1,6 @@ from json import load from cdisc_rules_engine.dataset_builders.base_dataset_builder import BaseDatasetBuilder +from cdisc_rules_engine.utilities.decorators import cached def add_json_pointer_paths(node, path=""): @@ -19,6 +20,7 @@ def add_json_pointer_paths(node, path=""): class JSONataDatasetBuilder(BaseDatasetBuilder): + @cached("get_dataset") def get_dataset(self, **kwargs): if not self.dataset_metadata.full_path: return None diff --git a/tests/unit/test_dataset_builders/test_jsonata_dataset_bilder.py b/tests/unit/test_dataset_builders/test_jsonata_dataset_bilder.py new file mode 100644 index 000000000..2ded48ed1 --- /dev/null +++ b/tests/unit/test_dataset_builders/test_jsonata_dataset_bilder.py @@ -0,0 +1,52 @@ +from unittest.mock import MagicMock, mock_open, patch + +from cdisc_rules_engine.dataset_builders.jsonata_dataset_builder import ( + JSONataDatasetBuilder, +) + + +def test_get_dataset_uses_cache(): + metadata = MagicMock() + metadata.full_path = "fake_path.json" + + data_service = MagicMock() + data_service.read_data.return_value.__enter__.return_value = mock_open( + read_data='{"a": {"b": 1}}' + )() + + cache_service = MagicMock() + cache_service.get.side_effect = [None, {"cached": True}] + + builder = JSONataDatasetBuilder( + dataset_metadata=metadata, + data_service=data_service, + rule={}, + cache_service=cache_service, + rule_processor=MagicMock(), + data_processor=MagicMock(), + dataset_path="dummy.xpt", + datasets=[], + define_xml_path=None, + standard="USDM", + standard_version="4.0", + standard_substandard=None, + ) + builder.cache_service = cache_service + + with patch( + "cdisc_rules_engine.dataset_builders.jsonata_dataset_builder.load", + return_value={"a": {"b": 1}}, + ) as mock_load: + + result1 = builder.get_dataset() + result2 = builder.get_dataset() + + assert result1 == {"_path": "", "a": {"_path": "/a", "b": 1}} + + # second call gets from cache + assert result2 == {"cached": True} + + assert mock_load.call_count == 1 + + assert cache_service.get.call_count == 2 + assert cache_service.add.call_count == 1 From 81cd458f4401634ca644dd9894990994f73be32b Mon Sep 17 00:00:00 2001 From: alexfurmenkov Date: Wed, 22 Apr 2026 14:01:04 +0200 Subject: [PATCH 10/14] removed kwargs from get_dataset method --- .../dataset_builders/base_dataset_builder.py | 6 ++--- .../contents_dataset_builder.py | 4 +-- .../json_schema_check_dataset_builder.py | 6 ++--- .../jsonata_dataset_builder.py | 2 +- .../interfaces/data_service_interface.py | 1 - .../data_services/base_data_service.py | 6 ----- .../test_base_dataset_builder.py | 27 ------------------- 7 files changed, 8 insertions(+), 44 deletions(-) diff --git a/cdisc_rules_engine/dataset_builders/base_dataset_builder.py b/cdisc_rules_engine/dataset_builders/base_dataset_builder.py index a61d96452..450919233 100644 --- a/cdisc_rules_engine/dataset_builders/base_dataset_builder.py +++ b/cdisc_rules_engine/dataset_builders/base_dataset_builder.py @@ -80,7 +80,7 @@ def build_split_datasets(self, dataset_name, **kwargs) -> DatasetInterface: self.dataset_path = original_path @cached("get_dataset") - def get_dataset(self, **kwargs): + def get_dataset(self): # If validating dataset content, ensure split datasets are handled. if self.dataset_metadata.is_split: # Handle split datasets for content checks. @@ -90,7 +90,6 @@ def get_dataset(self, **kwargs): datasets_metadata=get_corresponding_datasets( self.datasets, self.dataset_metadata ), - **kwargs, ) else: # single dataset. the most common case @@ -98,7 +97,7 @@ def get_dataset(self, **kwargs): dataset = tag_source(dataset, self.dataset_metadata) return dataset - def get_dataset_contents(self, **kwargs): + def get_dataset_contents(self): # If validating dataset content, ensure split datasets are handled. if self.dataset_metadata.is_split: # Handle split datasets for content checks. @@ -108,7 +107,6 @@ def get_dataset_contents(self, **kwargs): datasets_metadata=get_corresponding_datasets( self.datasets, self.dataset_metadata ), - **kwargs, ) else: # single dataset. the most common case diff --git a/cdisc_rules_engine/dataset_builders/contents_dataset_builder.py b/cdisc_rules_engine/dataset_builders/contents_dataset_builder.py index 426cbdacf..b5b127fdf 100644 --- a/cdisc_rules_engine/dataset_builders/contents_dataset_builder.py +++ b/cdisc_rules_engine/dataset_builders/contents_dataset_builder.py @@ -19,8 +19,8 @@ def build_split_datasets(self, dataset_name, **kwargs): dataset_name=dataset_name, datasets=self.datasets ) - def get_dataset(self, **kwargs): - dataset = super().get_dataset(**kwargs) + def get_dataset(self): + dataset = super().get_dataset() length = sum( [ dataset.record_count diff --git a/cdisc_rules_engine/dataset_builders/json_schema_check_dataset_builder.py b/cdisc_rules_engine/dataset_builders/json_schema_check_dataset_builder.py index c139347b0..6838fb610 100644 --- a/cdisc_rules_engine/dataset_builders/json_schema_check_dataset_builder.py +++ b/cdisc_rules_engine/dataset_builders/json_schema_check_dataset_builder.py @@ -46,7 +46,7 @@ def _get_cached_dataset(self) -> dict[str, list[str]]: return errlist - def get_dataset(self, **kwargs) -> DatasetInterface: + def get_dataset(self) -> DatasetInterface: dataset = self._get_cached_dataset() records = [ {key: dataset[key][i] for key in dataset} @@ -56,10 +56,10 @@ def get_dataset(self, **kwargs) -> DatasetInterface: row for row in records if row["dataset"] == self.dataset_metadata.name ] if filtered: - result = self.dataset_implementation.from_records(filtered, **kwargs) + result = self.dataset_implementation.from_records(filtered) else: empty_row = {key: "" for key in self.dataset_template.keys()} - result = self.dataset_implementation.from_records([empty_row], **kwargs) + result = self.dataset_implementation.from_records([empty_row]) return tag_source(result, self.dataset_metadata) def list_errors(self, tree: exceptions.ErrorTree, errlist: dict[str, list]): diff --git a/cdisc_rules_engine/dataset_builders/jsonata_dataset_builder.py b/cdisc_rules_engine/dataset_builders/jsonata_dataset_builder.py index a50094c1b..2ac387f12 100644 --- a/cdisc_rules_engine/dataset_builders/jsonata_dataset_builder.py +++ b/cdisc_rules_engine/dataset_builders/jsonata_dataset_builder.py @@ -21,7 +21,7 @@ def add_json_pointer_paths(node, path=""): class JSONataDatasetBuilder(BaseDatasetBuilder): @cached("get_dataset") - def get_dataset(self, **kwargs): + def get_dataset(self): if not self.dataset_metadata.full_path: return None with self.data_service.read_data(self.dataset_metadata.full_path) as fp: diff --git a/cdisc_rules_engine/interfaces/data_service_interface.py b/cdisc_rules_engine/interfaces/data_service_interface.py index 8e3ab1778..6e03a89fd 100644 --- a/cdisc_rules_engine/interfaces/data_service_interface.py +++ b/cdisc_rules_engine/interfaces/data_service_interface.py @@ -71,7 +71,6 @@ def concat_split_datasets( self, func_to_call: Callable, datasets_metadata: Iterable[DatasetMetadata], - **kwargs, ): """ Accepts a list of split dataset filenames, diff --git a/cdisc_rules_engine/services/data_services/base_data_service.py b/cdisc_rules_engine/services/data_services/base_data_service.py index 8ac16f92e..d613a91c9 100644 --- a/cdisc_rules_engine/services/data_services/base_data_service.py +++ b/cdisc_rules_engine/services/data_services/base_data_service.py @@ -137,7 +137,6 @@ def concat_split_datasets( self, func_to_call: Callable, datasets_metadata: Iterable[DatasetMetadata], - **kwargs, ) -> DatasetInterface: """ Accepts a list of split dataset filenames, asynchronously downloads @@ -146,22 +145,17 @@ def concat_split_datasets( func_to_call must accept dataset_name and kwargs as input parameters and return pandas DataFrame. """ - # pop drop_duplicates param at the beginning to avoid passing it to func_to_call - drop_duplicates: bool = kwargs.pop("drop_duplicates", False) # download datasets asynchronously datasets: Iterator[DatasetInterface] = self._async_get_datasets( func_to_call, dataset_names=[dataset.full_path for dataset in datasets_metadata], - **kwargs, ) full_dataset = self.dataset_implementation() for dataset, dataset_metadata in zip(datasets, datasets_metadata): tagged_dataset = tag_source(dataset, dataset_metadata) full_dataset = full_dataset.concat(tagged_dataset, ignore_index=True) - if drop_duplicates: - full_dataset = full_dataset.drop_duplicates() return full_dataset def check_filepath(self, dataset_names: List[str]) -> List: diff --git a/tests/unit/test_dataset_builders/test_base_dataset_builder.py b/tests/unit/test_dataset_builders/test_base_dataset_builder.py index 1269f3c0c..92936891b 100644 --- a/tests/unit/test_dataset_builders/test_base_dataset_builder.py +++ b/tests/unit/test_dataset_builders/test_base_dataset_builder.py @@ -232,30 +232,3 @@ def test_get_define_xml_variables_metadata_domain_not_found( match=r"name=AE, domain=AE is not found in Define XML", ): builder.get_define_xml_variables_metadata() - - -def test_get_dataset_kwargs_are_ignored_in_cache(monkeypatch): - # Reset singleton cache to isolate test - from cdisc_rules_engine.services.cache import InMemoryCacheService - - InMemoryCacheService._instance = None - cache_service = InMemoryCacheService.get_instance() - - dataset_metadata = MagicMock() - dataset_metadata.is_split = True - dataset_metadata.full_path = "dummy_path" - - builder = create_builder_instance(dataset_metadata) - builder.cache_service = cache_service - - def fake_concat(*args, **kwargs): - return {"value": kwargs.get("param")} - - builder.data_service.concat_split_datasets = MagicMock(side_effect=fake_concat) - - result1 = builder.get_dataset(dataset_name="test", param=1) - result2 = builder.get_dataset(dataset_name="test", param=2) - - assert result1 == result2 - - assert builder.data_service.concat_split_datasets.call_count == 1 From 08f56fad1f00a3c5ba3c2d989001491ae0c35b2e Mon Sep 17 00:00:00 2001 From: alexfurmenkov Date: Wed, 29 Apr 2026 15:04:23 +0200 Subject: [PATCH 11/14] fix test --- tests/unit/test_dataset_builders/test_jsonata_dataset_bilder.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/tests/unit/test_dataset_builders/test_jsonata_dataset_bilder.py b/tests/unit/test_dataset_builders/test_jsonata_dataset_bilder.py index 2ded48ed1..2729a5372 100644 --- a/tests/unit/test_dataset_builders/test_jsonata_dataset_bilder.py +++ b/tests/unit/test_dataset_builders/test_jsonata_dataset_bilder.py @@ -24,8 +24,6 @@ def test_get_dataset_uses_cache(): cache_service=cache_service, rule_processor=MagicMock(), data_processor=MagicMock(), - dataset_path="dummy.xpt", - datasets=[], define_xml_path=None, standard="USDM", standard_version="4.0", From b03de55460b57040d136f623b3bdec6d5b58aee7 Mon Sep 17 00:00:00 2001 From: Samuel Johnson Date: Thu, 30 Apr 2026 16:07:11 -0400 Subject: [PATCH 12/14] removed params and unused drop_duplicates argument from get metadata --- .../dataset_builders/variables_metadata_dataset_builder.py | 3 +-- .../variables_metadata_values_dataset_builder.py | 3 +-- ...riables_metadata_with_define_and_library_dataset_builder.py | 3 +-- .../variables_metadata_with_define_dataset_builder.py | 3 +-- .../variables_metadata_with_library_metadata.py | 3 +-- cdisc_rules_engine/interfaces/data_service_interface.py | 2 +- .../services/data_services/dummy_data_service.py | 2 +- .../services/data_services/excel_data_service.py | 2 +- .../services/data_services/local_data_service.py | 2 +- cdisc_rules_engine/services/data_services/usdm_data_service.py | 2 +- 10 files changed, 10 insertions(+), 15 deletions(-) diff --git a/cdisc_rules_engine/dataset_builders/variables_metadata_dataset_builder.py b/cdisc_rules_engine/dataset_builders/variables_metadata_dataset_builder.py index 7c818bf00..0e2a08978 100644 --- a/cdisc_rules_engine/dataset_builders/variables_metadata_dataset_builder.py +++ b/cdisc_rules_engine/dataset_builders/variables_metadata_dataset_builder.py @@ -17,8 +17,7 @@ def build(self): """ # Get basic variable metadata variables_metadata = self.data_service.get_variables_metadata( - dataset_name=self.dataset_metadata.name, - drop_duplicates=True, + dataset_name=self.dataset_metadata.name ) # Check if the rule requires variable_max_size diff --git a/cdisc_rules_engine/dataset_builders/variables_metadata_values_dataset_builder.py b/cdisc_rules_engine/dataset_builders/variables_metadata_values_dataset_builder.py index 7c279538b..2234ae68e 100644 --- a/cdisc_rules_engine/dataset_builders/variables_metadata_values_dataset_builder.py +++ b/cdisc_rules_engine/dataset_builders/variables_metadata_values_dataset_builder.py @@ -22,8 +22,7 @@ def build(self): """ data_contents_long_df = super().build() variable_metadata = self.data_service.get_variables_metadata( - dataset_name=self.dataset_metadata.name, - drop_duplicates=True, + dataset_name=self.dataset_metadata.name ) merged_df = data_contents_long_df.merge( variable_metadata._data, how="left", on="variable_name" diff --git a/cdisc_rules_engine/dataset_builders/variables_metadata_with_define_and_library_dataset_builder.py b/cdisc_rules_engine/dataset_builders/variables_metadata_with_define_and_library_dataset_builder.py index 173b6a9da..930c55953 100644 --- a/cdisc_rules_engine/dataset_builders/variables_metadata_with_define_and_library_dataset_builder.py +++ b/cdisc_rules_engine/dataset_builders/variables_metadata_with_define_and_library_dataset_builder.py @@ -42,8 +42,7 @@ def build(self): """ variable_metadata: List[dict] = self.get_define_xml_variables_metadata() content_metadata: DatasetInterface = self.data_service.get_variables_metadata( - dataset_name=self.dataset_metadata.name, - drop_duplicates=True, + dataset_name=self.dataset_metadata.name ) define_metadata: DatasetInterface = self.dataset_implementation.from_records( variable_metadata diff --git a/cdisc_rules_engine/dataset_builders/variables_metadata_with_define_dataset_builder.py b/cdisc_rules_engine/dataset_builders/variables_metadata_with_define_dataset_builder.py index b5a55fa83..3c406deea 100644 --- a/cdisc_rules_engine/dataset_builders/variables_metadata_with_define_dataset_builder.py +++ b/cdisc_rules_engine/dataset_builders/variables_metadata_with_define_dataset_builder.py @@ -36,8 +36,7 @@ def build(self): variable_metadata: List[dict] = self.get_define_xml_variables_metadata() # get dataset metadata and execute the rule content_metadata: DatasetInterface = self.data_service.get_variables_metadata( - dataset_name=self.dataset_metadata.name, - drop_duplicates=True, + dataset_name=self.dataset_metadata.name ) define_metadata: DatasetInterface = self.dataset_implementation.from_records( variable_metadata diff --git a/cdisc_rules_engine/dataset_builders/variables_metadata_with_library_metadata.py b/cdisc_rules_engine/dataset_builders/variables_metadata_with_library_metadata.py index c7d87e86c..543e4ce0b 100644 --- a/cdisc_rules_engine/dataset_builders/variables_metadata_with_library_metadata.py +++ b/cdisc_rules_engine/dataset_builders/variables_metadata_with_library_metadata.py @@ -26,8 +26,7 @@ def build(self): # get dataset metadata and execute the rule content_variables_metadata: DatasetInterface = ( self.data_service.get_variables_metadata( - dataset_name=self.dataset_metadata.name, - drop_duplicates=True, + dataset_name=self.dataset_metadata.name ) ) dataset_contents = self.get_dataset_contents() diff --git a/cdisc_rules_engine/interfaces/data_service_interface.py b/cdisc_rules_engine/interfaces/data_service_interface.py index 3ebc769dc..861bcc8d5 100644 --- a/cdisc_rules_engine/interfaces/data_service_interface.py +++ b/cdisc_rules_engine/interfaces/data_service_interface.py @@ -52,7 +52,7 @@ def get_raw_dataset_metadata( """ @abstractmethod - def get_variables_metadata(self, dataset_name: str, **params) -> DatasetInterface: + def get_variables_metadata(self, dataset_name: str) -> DatasetInterface: """ Gets variables metadata of a dataset. """ diff --git a/cdisc_rules_engine/services/data_services/dummy_data_service.py b/cdisc_rules_engine/services/data_services/dummy_data_service.py index a3a98b978..27d9d78db 100644 --- a/cdisc_rules_engine/services/data_services/dummy_data_service.py +++ b/cdisc_rules_engine/services/data_services/dummy_data_service.py @@ -81,7 +81,7 @@ def _initialize_datasets_metadata(self, **kwargs) -> dict[str, SDTMDatasetMetada for dataset in self.data } - def get_variables_metadata(self, dataset_name: str, **params) -> PandasDataset: + def get_variables_metadata(self, dataset_name: str) -> PandasDataset: metadata_to_return = { "variable_name": [], "variable_order_number": [], diff --git a/cdisc_rules_engine/services/data_services/excel_data_service.py b/cdisc_rules_engine/services/data_services/excel_data_service.py index 1da5ec33e..6f79f05a3 100644 --- a/cdisc_rules_engine/services/data_services/excel_data_service.py +++ b/cdisc_rules_engine/services/data_services/excel_data_service.py @@ -202,7 +202,7 @@ def _initialize_datasets_metadata(self, **kwargs) -> dict[str, SDTMDatasetMetada return result @cached_dataset(DatasetTypes.VARIABLES_METADATA.value) - def get_variables_metadata(self, dataset_name: str, **params) -> DatasetInterface: + def get_variables_metadata(self, dataset_name: str) -> DatasetInterface: """ Gets dataset from blob storage and returns metadata of a certain variable. """ diff --git a/cdisc_rules_engine/services/data_services/local_data_service.py b/cdisc_rules_engine/services/data_services/local_data_service.py index 1fe731e5f..77eb5119a 100644 --- a/cdisc_rules_engine/services/data_services/local_data_service.py +++ b/cdisc_rules_engine/services/data_services/local_data_service.py @@ -150,7 +150,7 @@ def get_dataset(self, dataset_name: str, **params) -> DatasetInterface: return df @cached_dataset(DatasetTypes.VARIABLES_METADATA.value) - def get_variables_metadata(self, dataset_name: str, **params) -> DatasetInterface: + def get_variables_metadata(self, dataset_name: str) -> DatasetInterface: """ Gets dataset from blob storage and returns metadata of a certain variable. """ diff --git a/cdisc_rules_engine/services/data_services/usdm_data_service.py b/cdisc_rules_engine/services/data_services/usdm_data_service.py index 83052f2e6..a2d06fd1d 100644 --- a/cdisc_rules_engine/services/data_services/usdm_data_service.py +++ b/cdisc_rules_engine/services/data_services/usdm_data_service.py @@ -158,7 +158,7 @@ def _initialize_datasets_metadata(self, **kwargs) -> dict[str, SDTMDatasetMetada return result @cached_dataset(DatasetTypes.VARIABLES_METADATA.value) - def get_variables_metadata(self, dataset_name: str, **params) -> DatasetInterface: + def get_variables_metadata(self, dataset_name: str) -> DatasetInterface: """ Gets dataset from blob storage and returns metadata of a certain variable. """ From b5054046792015be90db77cc48783b9cc8b4fe71 Mon Sep 17 00:00:00 2001 From: Samuel Johnson Date: Thu, 30 Apr 2026 16:28:07 -0400 Subject: [PATCH 13/14] remove kwargs --- .../dataset_builders/domain_list_dataset_builder.py | 3 +-- cdisc_rules_engine/models/dataset/dask_dataset.py | 4 ++-- cdisc_rules_engine/models/dataset/dataset_interface.py | 2 +- cdisc_rules_engine/models/dataset/pandas_dataset.py | 4 ++-- 4 files changed, 6 insertions(+), 7 deletions(-) diff --git a/cdisc_rules_engine/dataset_builders/domain_list_dataset_builder.py b/cdisc_rules_engine/dataset_builders/domain_list_dataset_builder.py index eba882d84..23e546446 100644 --- a/cdisc_rules_engine/dataset_builders/domain_list_dataset_builder.py +++ b/cdisc_rules_engine/dataset_builders/domain_list_dataset_builder.py @@ -14,6 +14,5 @@ def build(self): """ return self.dataset_implementation.from_records( - {ds.unsplit_name: ds.filename for ds in self.data_service.get_datasets()}, - index=[0], + [{ds.unsplit_name: ds.filename for ds in self.data_service.get_datasets()}] ) diff --git a/cdisc_rules_engine/models/dataset/dask_dataset.py b/cdisc_rules_engine/models/dataset/dask_dataset.py index 7a6449d31..7303bd1dd 100644 --- a/cdisc_rules_engine/models/dataset/dask_dataset.py +++ b/cdisc_rules_engine/models/dataset/dask_dataset.py @@ -117,8 +117,8 @@ def from_dict(cls, data: dict, **kwargs): return cls(dataframe) @classmethod - def from_records(cls, data: List[dict], **kwargs): - data = pd.DataFrame.from_records(data, **kwargs) + def from_records(cls, data: List[dict]): + data = pd.DataFrame.from_records(data) dataframe = dd.from_pandas(data, npartitions=DEFAULT_NUM_PARTITIONS) return cls(dataframe) diff --git a/cdisc_rules_engine/models/dataset/dataset_interface.py b/cdisc_rules_engine/models/dataset/dataset_interface.py index 5854cef82..4aea0bc01 100644 --- a/cdisc_rules_engine/models/dataset/dataset_interface.py +++ b/cdisc_rules_engine/models/dataset/dataset_interface.py @@ -33,7 +33,7 @@ def from_dict(cls, data: dict, **kwargs): @classmethod @abstractmethod - def from_records(cls, data: List[dict], **kwargs): + def from_records(cls, data: List[dict]): """ Create the underlying dataset from provided list of records """ diff --git a/cdisc_rules_engine/models/dataset/pandas_dataset.py b/cdisc_rules_engine/models/dataset/pandas_dataset.py index 497120e3e..7b8f51e3e 100644 --- a/cdisc_rules_engine/models/dataset/pandas_dataset.py +++ b/cdisc_rules_engine/models/dataset/pandas_dataset.py @@ -56,8 +56,8 @@ def from_dict(cls, data: dict, **kwargs): return cls(dataframe) @classmethod - def from_records(cls, data: List[dict], **kwargs): - dataframe = pd.DataFrame.from_records(data, **kwargs) + def from_records(cls, data: List[dict]): + dataframe = pd.DataFrame.from_records(data) return cls(dataframe) def __getitem__( From 7f2bab2f896b903d92b4728ab7866b74d05211a8 Mon Sep 17 00:00:00 2001 From: Samuel Johnson Date: Thu, 30 Apr 2026 16:41:21 -0400 Subject: [PATCH 14/14] test --- tests/unit/test_rules_engine.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/tests/unit/test_rules_engine.py b/tests/unit/test_rules_engine.py index 8ed533dd8..bf06570ca 100644 --- a/tests/unit/test_rules_engine.py +++ b/tests/unit/test_rules_engine.py @@ -1080,9 +1080,15 @@ def test_rule_with_domain_prefix_replacement( { "domain": "AE", "dataset": "AE", - "errors": [], - "executionStatus": ExecutionStatus.SUCCESS.value, - "message": None, + "errors": [ + { + "dataset": "AE", + "error": "Empty dataset", + "message": "Dataset skipped - Dataset is empty after preprocessing and operations. rule id=TEST1, dataset=AE", + } + ], + "executionStatus": ExecutionStatus.SKIPPED.value, + "message": "Dataset skipped - Dataset is empty after preprocessing and operations. rule id=TEST1, dataset=AE", "variables": [], } ],