diff --git a/cdisc_rules_engine/dataset_builders/base_dataset_builder.py b/cdisc_rules_engine/dataset_builders/base_dataset_builder.py index 04672819b..9dfe32ff1 100644 --- a/cdisc_rules_engine/dataset_builders/base_dataset_builder.py +++ b/cdisc_rules_engine/dataset_builders/base_dataset_builder.py @@ -5,6 +5,7 @@ from cdisc_rules_engine.services.define_xml.define_xml_reader_factory import ( DefineXMLReaderFactory, ) +from cdisc_rules_engine.utilities.decorators import cached from cdisc_rules_engine.utilities.sdtm_utilities import get_corresponding_datasets from cdisc_rules_engine.utilities.sdtm_utilities import ( tag_source, @@ -34,6 +35,7 @@ def __init__( ): self.data_service = data_service self.cache = cache_service + self.cache_service = cache_service self.data_processor = data_processor self.rule_processor = rule_processor self.dataset_metadata = dataset_metadata @@ -44,6 +46,14 @@ def __init__( self.standard_substandard = standard_substandard self.library_metadata = library_metadata self.dataset_implementation = self.data_service.dataset_implementation + if isinstance(dataset_metadata, SDTMDatasetMetadata): + self.domain = ( + f"SUPP{dataset_metadata.rdomain}" + if dataset_metadata.rdomain + else dataset_metadata.domain + ) + self.dataset_name = dataset_metadata.name + self.name = self.__class__.__name__ @abstractmethod def build(self) -> DatasetInterface: @@ -67,7 +77,8 @@ def build_split_datasets(self, dataset_name: str, **kwargs) -> DatasetInterface: finally: self.dataset_metadata = original_dataset_metadata - def get_dataset(self, **kwargs): + @cached("get_dataset") + def get_dataset(self): # If validating dataset content, ensure split datasets are handled. if self.dataset_metadata.is_split: # Handle split datasets for content checks. @@ -77,7 +88,6 @@ def get_dataset(self, **kwargs): datasets_metadata=get_corresponding_datasets( self.data_service.get_datasets(), self.dataset_metadata ), - **kwargs, ) else: # single dataset. the most common case @@ -85,7 +95,7 @@ def get_dataset(self, **kwargs): dataset = tag_source(dataset, self.dataset_metadata) return dataset - def get_dataset_contents(self, **kwargs): + def get_dataset_contents(self): # If validating dataset content, ensure split datasets are handled. if self.dataset_metadata.is_split: # Handle split datasets for content checks. @@ -95,7 +105,6 @@ def get_dataset_contents(self, **kwargs): datasets_metadata=get_corresponding_datasets( self.data_service.get_datasets(), self.dataset_metadata ), - **kwargs, ) else: # single dataset. the most common case diff --git a/cdisc_rules_engine/dataset_builders/contents_dataset_builder.py b/cdisc_rules_engine/dataset_builders/contents_dataset_builder.py index 1f1c844e8..9bb822f09 100644 --- a/cdisc_rules_engine/dataset_builders/contents_dataset_builder.py +++ b/cdisc_rules_engine/dataset_builders/contents_dataset_builder.py @@ -17,8 +17,8 @@ def build_split_datasets(self, dataset_name, **kwargs): """ return self.data_service.get_dataset(dataset_name=dataset_name) - def get_dataset(self, **kwargs): - dataset = super().get_dataset(**kwargs) + def get_dataset(self): + dataset = super().get_dataset() length = sum( [ dataset.record_count diff --git a/cdisc_rules_engine/dataset_builders/domain_list_dataset_builder.py b/cdisc_rules_engine/dataset_builders/domain_list_dataset_builder.py index eba882d84..23e546446 100644 --- a/cdisc_rules_engine/dataset_builders/domain_list_dataset_builder.py +++ b/cdisc_rules_engine/dataset_builders/domain_list_dataset_builder.py @@ -14,6 +14,5 @@ def build(self): """ return self.dataset_implementation.from_records( - {ds.unsplit_name: ds.filename for ds in self.data_service.get_datasets()}, - index=[0], + [{ds.unsplit_name: ds.filename for ds in self.data_service.get_datasets()}] ) diff --git a/cdisc_rules_engine/dataset_builders/json_schema_check_dataset_builder.py b/cdisc_rules_engine/dataset_builders/json_schema_check_dataset_builder.py index c139347b0..6838fb610 100644 --- a/cdisc_rules_engine/dataset_builders/json_schema_check_dataset_builder.py +++ b/cdisc_rules_engine/dataset_builders/json_schema_check_dataset_builder.py @@ -46,7 +46,7 @@ def _get_cached_dataset(self) -> dict[str, list[str]]: return errlist - def get_dataset(self, **kwargs) -> DatasetInterface: + def get_dataset(self) -> DatasetInterface: dataset = self._get_cached_dataset() records = [ {key: dataset[key][i] for key in dataset} @@ -56,10 +56,10 @@ def get_dataset(self, **kwargs) -> DatasetInterface: row for row in records if row["dataset"] == self.dataset_metadata.name ] if filtered: - result = self.dataset_implementation.from_records(filtered, **kwargs) + result = self.dataset_implementation.from_records(filtered) else: empty_row = {key: "" for key in self.dataset_template.keys()} - result = self.dataset_implementation.from_records([empty_row], **kwargs) + result = self.dataset_implementation.from_records([empty_row]) return tag_source(result, self.dataset_metadata) def list_errors(self, tree: exceptions.ErrorTree, errlist: dict[str, list]): diff --git a/cdisc_rules_engine/dataset_builders/jsonata_dataset_builder.py b/cdisc_rules_engine/dataset_builders/jsonata_dataset_builder.py index e1041a79f..2ac387f12 100644 --- a/cdisc_rules_engine/dataset_builders/jsonata_dataset_builder.py +++ b/cdisc_rules_engine/dataset_builders/jsonata_dataset_builder.py @@ -1,5 +1,6 @@ from json import load from cdisc_rules_engine.dataset_builders.base_dataset_builder import BaseDatasetBuilder +from cdisc_rules_engine.utilities.decorators import cached def add_json_pointer_paths(node, path=""): @@ -19,7 +20,8 @@ def add_json_pointer_paths(node, path=""): class JSONataDatasetBuilder(BaseDatasetBuilder): - def get_dataset(self, **kwargs): + @cached("get_dataset") + def get_dataset(self): if not self.dataset_metadata.full_path: return None with self.data_service.read_data(self.dataset_metadata.full_path) as fp: diff --git a/cdisc_rules_engine/dataset_builders/variables_metadata_dataset_builder.py b/cdisc_rules_engine/dataset_builders/variables_metadata_dataset_builder.py index 7c818bf00..0e2a08978 100644 --- a/cdisc_rules_engine/dataset_builders/variables_metadata_dataset_builder.py +++ b/cdisc_rules_engine/dataset_builders/variables_metadata_dataset_builder.py @@ -17,8 +17,7 @@ def build(self): """ # Get basic variable metadata variables_metadata = self.data_service.get_variables_metadata( - dataset_name=self.dataset_metadata.name, - drop_duplicates=True, + dataset_name=self.dataset_metadata.name ) # Check if the rule requires variable_max_size diff --git a/cdisc_rules_engine/dataset_builders/variables_metadata_values_dataset_builder.py b/cdisc_rules_engine/dataset_builders/variables_metadata_values_dataset_builder.py index 7c279538b..2234ae68e 100644 --- a/cdisc_rules_engine/dataset_builders/variables_metadata_values_dataset_builder.py +++ b/cdisc_rules_engine/dataset_builders/variables_metadata_values_dataset_builder.py @@ -22,8 +22,7 @@ def build(self): """ data_contents_long_df = super().build() variable_metadata = self.data_service.get_variables_metadata( - dataset_name=self.dataset_metadata.name, - drop_duplicates=True, + dataset_name=self.dataset_metadata.name ) merged_df = data_contents_long_df.merge( variable_metadata._data, how="left", on="variable_name" diff --git a/cdisc_rules_engine/dataset_builders/variables_metadata_with_define_and_library_dataset_builder.py b/cdisc_rules_engine/dataset_builders/variables_metadata_with_define_and_library_dataset_builder.py index 173b6a9da..930c55953 100644 --- a/cdisc_rules_engine/dataset_builders/variables_metadata_with_define_and_library_dataset_builder.py +++ b/cdisc_rules_engine/dataset_builders/variables_metadata_with_define_and_library_dataset_builder.py @@ -42,8 +42,7 @@ def build(self): """ variable_metadata: List[dict] = self.get_define_xml_variables_metadata() content_metadata: DatasetInterface = self.data_service.get_variables_metadata( - dataset_name=self.dataset_metadata.name, - drop_duplicates=True, + dataset_name=self.dataset_metadata.name ) define_metadata: DatasetInterface = self.dataset_implementation.from_records( variable_metadata diff --git a/cdisc_rules_engine/dataset_builders/variables_metadata_with_define_dataset_builder.py b/cdisc_rules_engine/dataset_builders/variables_metadata_with_define_dataset_builder.py index b5a55fa83..3c406deea 100644 --- a/cdisc_rules_engine/dataset_builders/variables_metadata_with_define_dataset_builder.py +++ b/cdisc_rules_engine/dataset_builders/variables_metadata_with_define_dataset_builder.py @@ -36,8 +36,7 @@ def build(self): variable_metadata: List[dict] = self.get_define_xml_variables_metadata() # get dataset metadata and execute the rule content_metadata: DatasetInterface = self.data_service.get_variables_metadata( - dataset_name=self.dataset_metadata.name, - drop_duplicates=True, + dataset_name=self.dataset_metadata.name ) define_metadata: DatasetInterface = self.dataset_implementation.from_records( variable_metadata diff --git a/cdisc_rules_engine/dataset_builders/variables_metadata_with_library_metadata.py b/cdisc_rules_engine/dataset_builders/variables_metadata_with_library_metadata.py index c7d87e86c..543e4ce0b 100644 --- a/cdisc_rules_engine/dataset_builders/variables_metadata_with_library_metadata.py +++ b/cdisc_rules_engine/dataset_builders/variables_metadata_with_library_metadata.py @@ -26,8 +26,7 @@ def build(self): # get dataset metadata and execute the rule content_variables_metadata: DatasetInterface = ( self.data_service.get_variables_metadata( - dataset_name=self.dataset_metadata.name, - drop_duplicates=True, + dataset_name=self.dataset_metadata.name ) ) dataset_contents = self.get_dataset_contents() diff --git a/cdisc_rules_engine/interfaces/data_service_interface.py b/cdisc_rules_engine/interfaces/data_service_interface.py index db890ff15..861bcc8d5 100644 --- a/cdisc_rules_engine/interfaces/data_service_interface.py +++ b/cdisc_rules_engine/interfaces/data_service_interface.py @@ -52,7 +52,7 @@ def get_raw_dataset_metadata( """ @abstractmethod - def get_variables_metadata(self, dataset_name: str, **params) -> DatasetInterface: + def get_variables_metadata(self, dataset_name: str) -> DatasetInterface: """ Gets variables metadata of a dataset. """ @@ -71,7 +71,6 @@ def concat_split_datasets( self, func_to_call: Callable, datasets_metadata: Iterable[DatasetMetadata], - **kwargs, ): """ Accepts a list of split dataset filenames, diff --git a/cdisc_rules_engine/models/dataset/dask_dataset.py b/cdisc_rules_engine/models/dataset/dask_dataset.py index 7a6449d31..7303bd1dd 100644 --- a/cdisc_rules_engine/models/dataset/dask_dataset.py +++ b/cdisc_rules_engine/models/dataset/dask_dataset.py @@ -117,8 +117,8 @@ def from_dict(cls, data: dict, **kwargs): return cls(dataframe) @classmethod - def from_records(cls, data: List[dict], **kwargs): - data = pd.DataFrame.from_records(data, **kwargs) + def from_records(cls, data: List[dict]): + data = pd.DataFrame.from_records(data) dataframe = dd.from_pandas(data, npartitions=DEFAULT_NUM_PARTITIONS) return cls(dataframe) diff --git a/cdisc_rules_engine/models/dataset/dataset_interface.py b/cdisc_rules_engine/models/dataset/dataset_interface.py index 5854cef82..4aea0bc01 100644 --- a/cdisc_rules_engine/models/dataset/dataset_interface.py +++ b/cdisc_rules_engine/models/dataset/dataset_interface.py @@ -33,7 +33,7 @@ def from_dict(cls, data: dict, **kwargs): @classmethod @abstractmethod - def from_records(cls, data: List[dict], **kwargs): + def from_records(cls, data: List[dict]): """ Create the underlying dataset from provided list of records """ diff --git a/cdisc_rules_engine/models/dataset/pandas_dataset.py b/cdisc_rules_engine/models/dataset/pandas_dataset.py index 497120e3e..7b8f51e3e 100644 --- a/cdisc_rules_engine/models/dataset/pandas_dataset.py +++ b/cdisc_rules_engine/models/dataset/pandas_dataset.py @@ -56,8 +56,8 @@ def from_dict(cls, data: dict, **kwargs): return cls(dataframe) @classmethod - def from_records(cls, data: List[dict], **kwargs): - dataframe = pd.DataFrame.from_records(data, **kwargs) + def from_records(cls, data: List[dict]): + dataframe = pd.DataFrame.from_records(data) return cls(dataframe) def __getitem__( diff --git a/cdisc_rules_engine/services/data_services/base_data_service.py b/cdisc_rules_engine/services/data_services/base_data_service.py index b1566a270..9073e63d8 100644 --- a/cdisc_rules_engine/services/data_services/base_data_service.py +++ b/cdisc_rules_engine/services/data_services/base_data_service.py @@ -140,7 +140,6 @@ def concat_split_datasets( self, func_to_call: Callable, datasets_metadata: Iterable[DatasetMetadata], - **kwargs, ) -> DatasetInterface: """ Accepts a list of split dataset filenames, asynchronously downloads @@ -149,8 +148,6 @@ def concat_split_datasets( func_to_call must accept dataset_name and kwargs as input parameters and return pandas DataFrame. """ - # pop drop_duplicates param at the beginning to avoid passing it to func_to_call - drop_duplicates: bool = kwargs.pop("drop_duplicates", False) # download datasets asynchronously datasets: Iterator[DatasetInterface] = self._async_get_datasets( @@ -158,15 +155,12 @@ def concat_split_datasets( dataset_names=[ dataset_metadata.name for dataset_metadata in datasets_metadata ], - **kwargs, ) full_dataset = self.dataset_implementation() for dataset, dataset_metadata in zip(datasets, datasets_metadata): tagged_dataset = tag_source(dataset, dataset_metadata) full_dataset = full_dataset.concat(tagged_dataset, ignore_index=True) - if drop_duplicates: - full_dataset = full_dataset.drop_duplicates() return full_dataset def check_filepath(self, dataset_names: List[str]) -> List: diff --git a/cdisc_rules_engine/services/data_services/dummy_data_service.py b/cdisc_rules_engine/services/data_services/dummy_data_service.py index a3a98b978..27d9d78db 100644 --- a/cdisc_rules_engine/services/data_services/dummy_data_service.py +++ b/cdisc_rules_engine/services/data_services/dummy_data_service.py @@ -81,7 +81,7 @@ def _initialize_datasets_metadata(self, **kwargs) -> dict[str, SDTMDatasetMetada for dataset in self.data } - def get_variables_metadata(self, dataset_name: str, **params) -> PandasDataset: + def get_variables_metadata(self, dataset_name: str) -> PandasDataset: metadata_to_return = { "variable_name": [], "variable_order_number": [], diff --git a/cdisc_rules_engine/services/data_services/excel_data_service.py b/cdisc_rules_engine/services/data_services/excel_data_service.py index 1da5ec33e..6f79f05a3 100644 --- a/cdisc_rules_engine/services/data_services/excel_data_service.py +++ b/cdisc_rules_engine/services/data_services/excel_data_service.py @@ -202,7 +202,7 @@ def _initialize_datasets_metadata(self, **kwargs) -> dict[str, SDTMDatasetMetada return result @cached_dataset(DatasetTypes.VARIABLES_METADATA.value) - def get_variables_metadata(self, dataset_name: str, **params) -> DatasetInterface: + def get_variables_metadata(self, dataset_name: str) -> DatasetInterface: """ Gets dataset from blob storage and returns metadata of a certain variable. """ diff --git a/cdisc_rules_engine/services/data_services/local_data_service.py b/cdisc_rules_engine/services/data_services/local_data_service.py index 1fe731e5f..77eb5119a 100644 --- a/cdisc_rules_engine/services/data_services/local_data_service.py +++ b/cdisc_rules_engine/services/data_services/local_data_service.py @@ -150,7 +150,7 @@ def get_dataset(self, dataset_name: str, **params) -> DatasetInterface: return df @cached_dataset(DatasetTypes.VARIABLES_METADATA.value) - def get_variables_metadata(self, dataset_name: str, **params) -> DatasetInterface: + def get_variables_metadata(self, dataset_name: str) -> DatasetInterface: """ Gets dataset from blob storage and returns metadata of a certain variable. """ diff --git a/cdisc_rules_engine/services/data_services/usdm_data_service.py b/cdisc_rules_engine/services/data_services/usdm_data_service.py index 83052f2e6..a2d06fd1d 100644 --- a/cdisc_rules_engine/services/data_services/usdm_data_service.py +++ b/cdisc_rules_engine/services/data_services/usdm_data_service.py @@ -158,7 +158,7 @@ def _initialize_datasets_metadata(self, **kwargs) -> dict[str, SDTMDatasetMetada return result @cached_dataset(DatasetTypes.VARIABLES_METADATA.value) - def get_variables_metadata(self, dataset_name: str, **params) -> DatasetInterface: + def get_variables_metadata(self, dataset_name: str) -> DatasetInterface: """ Gets dataset from blob storage and returns metadata of a certain variable. """ diff --git a/tests/unit/test_dataset_builders/test_jsonata_dataset_bilder.py b/tests/unit/test_dataset_builders/test_jsonata_dataset_bilder.py new file mode 100644 index 000000000..2729a5372 --- /dev/null +++ b/tests/unit/test_dataset_builders/test_jsonata_dataset_bilder.py @@ -0,0 +1,50 @@ +from unittest.mock import MagicMock, mock_open, patch + +from cdisc_rules_engine.dataset_builders.jsonata_dataset_builder import ( + JSONataDatasetBuilder, +) + + +def test_get_dataset_uses_cache(): + metadata = MagicMock() + metadata.full_path = "fake_path.json" + + data_service = MagicMock() + data_service.read_data.return_value.__enter__.return_value = mock_open( + read_data='{"a": {"b": 1}}' + )() + + cache_service = MagicMock() + cache_service.get.side_effect = [None, {"cached": True}] + + builder = JSONataDatasetBuilder( + dataset_metadata=metadata, + data_service=data_service, + rule={}, + cache_service=cache_service, + rule_processor=MagicMock(), + data_processor=MagicMock(), + define_xml_path=None, + standard="USDM", + standard_version="4.0", + standard_substandard=None, + ) + builder.cache_service = cache_service + + with patch( + "cdisc_rules_engine.dataset_builders.jsonata_dataset_builder.load", + return_value={"a": {"b": 1}}, + ) as mock_load: + + result1 = builder.get_dataset() + result2 = builder.get_dataset() + + assert result1 == {"_path": "", "a": {"_path": "/a", "b": 1}} + + # second call gets from cache + assert result2 == {"cached": True} + + assert mock_load.call_count == 1 + + assert cache_service.get.call_count == 2 + assert cache_service.add.call_count == 1 diff --git a/tests/unit/test_rules_engine.py b/tests/unit/test_rules_engine.py index e3b988ebd..bf06570ca 100644 --- a/tests/unit/test_rules_engine.py +++ b/tests/unit/test_rules_engine.py @@ -26,6 +26,11 @@ from cdisc_rules_engine.models.dataset import PandasDataset +@pytest.fixture(autouse=True) +def clear_lru_cache(): + RulesEngine(standard="sdtmig").cache.clear_all() + + def test_get_schema(): schema = RulesEngine().get_schema() assert "variables" in schema @@ -1075,9 +1080,15 @@ def test_rule_with_domain_prefix_replacement( { "domain": "AE", "dataset": "AE", - "errors": [], - "executionStatus": ExecutionStatus.SUCCESS.value, - "message": None, + "errors": [ + { + "dataset": "AE", + "error": "Empty dataset", + "message": "Dataset skipped - Dataset is empty after preprocessing and operations. rule id=TEST1, dataset=AE", + } + ], + "executionStatus": ExecutionStatus.SKIPPED.value, + "message": "Dataset skipped - Dataset is empty after preprocessing and operations. rule id=TEST1, dataset=AE", "variables": [], } ], diff --git a/tests/unit/test_services/test_data_service/test_data_service.py b/tests/unit/test_services/test_data_service/test_data_service.py index f2cb210d3..b87677991 100644 --- a/tests/unit/test_services/test_data_service/test_data_service.py +++ b/tests/unit/test_services/test_data_service/test_data_service.py @@ -9,6 +9,7 @@ from cdisc_rules_engine.models.sdtm_dataset_metadata import SDTMDatasetMetadata from cdisc_rules_engine.models.dataset_types import DatasetTypes from cdisc_rules_engine.services.data_services import cached_dataset, LocalDataService +from cdisc_rules_engine.utilities.decorators import cached from cdisc_rules_engine.utilities.utils import get_dataset_cache_key_from_path from cdisc_rules_engine.constants.classes import ( FINDINGS, @@ -382,3 +383,72 @@ def to_be_decorated(instance, dataset_name: str): test_dataset_name, DatasetTypes.CONTENTS.value ): test_df }, "New dataset was not added to the cache" + + +def test_cached_cache_hit_call_count(): + cache_service = MagicMock() + expected_data = PandasDataset() + cache_service.get.return_value = expected_data + + instance = MagicMock() + instance.cache_service = cache_service + instance.name = "Builder" + + func_mock = MagicMock(return_value=PandasDataset()) + + @cached("get_dataset") + def func(self, **kwargs): + return func_mock() + + result = func(instance, dataset_name="ae.xpt", name="Builder", domain_name="AE") + + assert result is expected_data + assert func_mock.call_count == 0 + + +def test_cached_cache_miss_call_count(): + cache_service = MagicMock() + cache_service.get.return_value = None + + instance = MagicMock() + instance.cache_service = cache_service + + func_mock = MagicMock(return_value=PandasDataset()) + + @cached("get_dataset") + def func(self, **kwargs): + return func_mock() + + func(instance, dataset_name="ae.xpt") + + assert func_mock.call_count == 1 + + +def test_cached_different_builders_have_different_cache(): + cache_service = MagicMock() + cache_service.get.return_value = None + + func_mock = MagicMock(side_effect=[PandasDataset(), PandasDataset()]) + + @cached("get_dataset") + def func(self, **kwargs): + return func_mock() + + # builder 1 + instance1 = MagicMock() + instance1.cache_service = cache_service + instance1.name = "BuilderA" + + # builder 2 + instance2 = MagicMock() + instance2.cache_service = cache_service + instance2.name = "BuilderB" + + func(instance1, dataset_name="ae.xpt", domain_name="AE") + func(instance2, dataset_name="ae.xpt", domain_name="AE") + + assert func_mock.call_count == 2 + assert cache_service.add.call_count == 2 + keys = [call[0][0] for call in cache_service.get.call_args_list] + + assert keys[0] != keys[1] diff --git a/tests/unit/test_usdm_data.py b/tests/unit/test_usdm_data.py index 0123fbee1..8c32c0b24 100644 --- a/tests/unit/test_usdm_data.py +++ b/tests/unit/test_usdm_data.py @@ -89,6 +89,7 @@ def test_validate_rule_single_dataset_check(dataset_rule_greater_than: dict): "cdisc_rules_engine.services.data_services.USDMDataService.get_dataset", return_value=dataset_mock, ): + RulesEngine(standard="sdtmig").cache.clear_all() validation_result: List[dict] = RulesEngine( standard="usdm", dataset_paths=[dataset_path] ).validate_single_dataset(