From c516cdf91b32f4b28d625e3432801399f03b5148 Mon Sep 17 00:00:00 2001 From: Gerry Campion Date: Fri, 10 Apr 2026 13:40:25 -0400 Subject: [PATCH 01/18] before test fixes --- .../dataset_builders/base_dataset_builder.py | 14 +- .../contents_dataset_builder.py | 6 +- .../contents_define_dataset_builder.py | 6 +- .../contents_define_vlm_dataset_builder.py | 2 +- .../values_dataset_builder.py | 2 +- cdisc_rules_engine/models/dataset_metadata.py | 5 - .../operations/day_data_validator.py | 4 +- cdisc_rules_engine/operations/distinct.py | 2 +- cdisc_rules_engine/operations/domain_label.py | 4 +- .../parent_library_model_column_order.py | 2 +- .../operations/variable_count.py | 5 +- .../operations/variable_exists.py | 2 +- .../operations/variable_is_null.py | 4 +- .../operations/variable_value_count.py | 5 +- .../data_services/base_data_service.py | 79 ++++++--- .../data_services/dummy_data_service.py | 64 +++----- .../data_services/excel_data_service.py | 154 +++++++++--------- .../data_services/local_data_service.py | 97 +++++------ .../data_services/usdm_data_service.py | 68 ++++---- .../utilities/data_processor.py | 12 +- .../utilities/dataset_preprocessor.py | 38 ++--- .../utilities/rule_processor.py | 15 +- .../utilities/sdtm_utilities.py | 12 +- cdisc_rules_engine/utilities/utils.py | 12 +- .../test_excel_data_service.py | 33 ++-- .../test_local_data_service.py | 9 +- 26 files changed, 323 insertions(+), 333 deletions(-) diff --git a/cdisc_rules_engine/dataset_builders/base_dataset_builder.py b/cdisc_rules_engine/dataset_builders/base_dataset_builder.py index 8aa2957d2..1f6205a7d 100644 --- a/cdisc_rules_engine/dataset_builders/base_dataset_builder.py +++ b/cdisc_rules_engine/dataset_builders/base_dataset_builder.py @@ -56,18 +56,20 @@ def build(self) -> DatasetInterface: """ pass - def build_split_datasets(self, dataset_name, **kwargs) -> DatasetInterface: + def build_split_datasets(self, dataset_name: str, **kwargs) -> DatasetInterface: """ Returns correct dataframe to operate on. - Default implementation that temporarily sets dataset_path to dataset_name and calls build(). + Default implementation that temporarily sets dataset_metadata and calls build(). """ - original_path = self.dataset_path + original_dataset_metadata = self.dataset_metadata try: - self.dataset_path = dataset_name + self.dataset_metadata = self.data_service.get_raw_dataset_metadata( + dataset_name=dataset_name + ) result = self.build(**kwargs) return result finally: - self.dataset_path = original_path + self.dataset_metadata = original_dataset_metadata def get_dataset(self, **kwargs): # If validating dataset content, ensure split datasets are handled. @@ -102,7 +104,7 @@ def get_dataset_contents(self, **kwargs): else: # single dataset. the most common case dataset: DatasetInterface = self.data_service.get_dataset( - dataset_name=self.dataset_path + dataset_name=self.dataset_metadata.name ) dataset = tag_source(dataset, self.dataset_metadata) return dataset diff --git a/cdisc_rules_engine/dataset_builders/contents_dataset_builder.py b/cdisc_rules_engine/dataset_builders/contents_dataset_builder.py index 426cbdacf..13a8c3549 100644 --- a/cdisc_rules_engine/dataset_builders/contents_dataset_builder.py +++ b/cdisc_rules_engine/dataset_builders/contents_dataset_builder.py @@ -9,14 +9,14 @@ def build(self, **kwargs): """ Returns the contents of a file as a dataframe for evaluation. """ - return self.data_service.get_dataset(dataset_name=self.dataset_path) + return self.data_service.get_dataset(dataset_name=self.dataset_metadata.name) - def build_split_datasets(self, dataset_name, **kwargs): + def build_split_datasets(self, dataset_metadata, **kwargs): """ Returns the contents of a file as a dataframe for evaluation. """ return self.data_service.get_dataset( - dataset_name=dataset_name, datasets=self.datasets + dataset_name=dataset_metadata.name, datasets=self.datasets ) def get_dataset(self, **kwargs): diff --git a/cdisc_rules_engine/dataset_builders/contents_define_dataset_builder.py b/cdisc_rules_engine/dataset_builders/contents_define_dataset_builder.py index a64fa8d9a..ed6905ffb 100644 --- a/cdisc_rules_engine/dataset_builders/contents_define_dataset_builder.py +++ b/cdisc_rules_engine/dataset_builders/contents_define_dataset_builder.py @@ -31,12 +31,14 @@ def build(self): ..., """ data_contents_df = self.data_service.get_dataset( - dataset_name=self.dataset_path, datasets=self.datasets + dataset_name=self.dataset_metadata.name ) # Build dataset metadata dataframe size_unit: str = self.rule_processor.get_size_unit_from_rule(self.rule) dataset_metadata = self.data_service.get_dataset_metadata( - dataset_name=self.dataset_path, size_unit=size_unit, datasets=self.datasets + dataset_name=self.dataset_metadata.name, + size_unit=size_unit, + datasets=self.datasets, ).to_dict(orient="records")[0] # Build define xml dataframe define = self.get_define_xml_item_group_metadata_for_dataset(dataset_metadata) diff --git a/cdisc_rules_engine/dataset_builders/contents_define_vlm_dataset_builder.py b/cdisc_rules_engine/dataset_builders/contents_define_vlm_dataset_builder.py index 3e12bdabb..fb2374b9c 100644 --- a/cdisc_rules_engine/dataset_builders/contents_define_vlm_dataset_builder.py +++ b/cdisc_rules_engine/dataset_builders/contents_define_vlm_dataset_builder.py @@ -28,7 +28,7 @@ def build(self): """ # get dataset contents and convert it from wide to long data_contents_df: DatasetInterface = self.data_service.get_dataset( - dataset_name=self.dataset_path + dataset_name=self.dataset_metadata.name ) self.add_row_number(data_contents_df) data_contents_long_df: DatasetInterface = ValuesDatasetBuilder.build(self) diff --git a/cdisc_rules_engine/dataset_builders/values_dataset_builder.py b/cdisc_rules_engine/dataset_builders/values_dataset_builder.py index 265911569..f2de4b051 100644 --- a/cdisc_rules_engine/dataset_builders/values_dataset_builder.py +++ b/cdisc_rules_engine/dataset_builders/values_dataset_builder.py @@ -14,7 +14,7 @@ def build(self): ..., """ data_contents_df: DatasetInterface = self.data_service.get_dataset( - dataset_name=self.dataset_path + dataset_name=self.dataset_metadata.name ) self.add_row_number(data_contents_df) values_df: DatasetInterface = data_contents_df.melt( diff --git a/cdisc_rules_engine/models/dataset_metadata.py b/cdisc_rules_engine/models/dataset_metadata.py index b4b8a2dfd..eae183ba4 100644 --- a/cdisc_rules_engine/models/dataset_metadata.py +++ b/cdisc_rules_engine/models/dataset_metadata.py @@ -1,6 +1,5 @@ from dataclasses import dataclass from typing import Union -from os.path import basename @dataclass @@ -18,7 +17,3 @@ class DatasetMetadata: full_path: Union[str, None] = None first_record: Union[dict, None] = None original_path: Union[str, None] = None - - @property - def data_service_identifier(self) -> str: - return basename(self.full_path) if self.full_path else self.filename diff --git a/cdisc_rules_engine/operations/day_data_validator.py b/cdisc_rules_engine/operations/day_data_validator.py index 6a9b6269c..54b0bfb5f 100644 --- a/cdisc_rules_engine/operations/day_data_validator.py +++ b/cdisc_rules_engine/operations/day_data_validator.py @@ -23,9 +23,7 @@ def _execute_operation(self): self.data_service.get_dataset, dm_datasets ) else: - dm_data = self.data_service.get_dataset( - dataset_name=dm_datasets[0].full_path or dm_datasets[0].filename - ) + dm_data = self.data_service.get_dataset(dataset_name=dm_datasets[0].name) dm_data = tag_source(dm_data, dm_datasets[0]) new_dataset = self.evaluation_dataset.merge( diff --git a/cdisc_rules_engine/operations/distinct.py b/cdisc_rules_engine/operations/distinct.py index cecf00560..147c77426 100644 --- a/cdisc_rules_engine/operations/distinct.py +++ b/cdisc_rules_engine/operations/distinct.py @@ -71,7 +71,7 @@ def get_existing_column_names(group): def _get_referenced_datasets(self): referenced_datasets = {} for dataset_metadata in self.data_service.get_datasets(): - dataset = self.data_service.get_dataset(dataset_metadata.filename) + dataset = self.data_service.get_dataset(dataset_metadata.name) referenced_datasets[dataset_metadata.name] = dataset return referenced_datasets diff --git a/cdisc_rules_engine/operations/domain_label.py b/cdisc_rules_engine/operations/domain_label.py index 69e8ce737..468fef10e 100644 --- a/cdisc_rules_engine/operations/domain_label.py +++ b/cdisc_rules_engine/operations/domain_label.py @@ -1,6 +1,6 @@ from cdisc_rules_engine.operations.base_operation import BaseOperation from cdisc_rules_engine.utilities.utils import ( - search_in_list_of_dicts, + search_in_list, ) @@ -12,7 +12,7 @@ def _execute_operation(self): standard_data = self.library_metadata.standard_metadata domain_details = None for c in standard_data.get("classes", []): - domain_details = search_in_list_of_dicts( + domain_details = search_in_list( c.get("datasets", []), lambda item: item["name"] == self.params.domain ) if domain_details: diff --git a/cdisc_rules_engine/operations/parent_library_model_column_order.py b/cdisc_rules_engine/operations/parent_library_model_column_order.py index 665da4a6a..b4ac54e31 100644 --- a/cdisc_rules_engine/operations/parent_library_model_column_order.py +++ b/cdisc_rules_engine/operations/parent_library_model_column_order.py @@ -48,6 +48,6 @@ def _get_parent_variable_names_list(self, domain_to_datasets: dict, rdomain: str f"{rdomain} but Domain not found in datasets" ) parent_dataframe = self.data_service.get_dataset( - dataset_name=parent_datasets[0].full_path + dataset_name=parent_datasets[0].name ) return self._get_variable_names_list(rdomain, parent_dataframe) diff --git a/cdisc_rules_engine/operations/variable_count.py b/cdisc_rules_engine/operations/variable_count.py index c9962e7bb..c3b9f8680 100644 --- a/cdisc_rules_engine/operations/variable_count.py +++ b/cdisc_rules_engine/operations/variable_count.py @@ -1,4 +1,3 @@ -import pandas as pd from cdisc_rules_engine.operations.base_operation import BaseOperation from cdisc_rules_engine.models.sdtm_dataset_metadata import SDTMDatasetMetadata import asyncio @@ -35,9 +34,7 @@ async def _get_all_study_variable_counts(self) -> dict: async def _get_dataset_variable_count( self, dataset: SDTMDatasetMetadata ) -> Counter: - data: pd.DataFrame = self.data_service.get_dataset( - dataset_name=dataset.full_path - ) + data = self.data_service.get_dataset(dataset_name=dataset.name) target_variable = BaseOperation._replace_variable_wildcard( self.params.original_target, dataset.wildcard_replacement ) diff --git a/cdisc_rules_engine/operations/variable_exists.py b/cdisc_rules_engine/operations/variable_exists.py index 9fffbacfb..7c503068c 100644 --- a/cdisc_rules_engine/operations/variable_exists.py +++ b/cdisc_rules_engine/operations/variable_exists.py @@ -4,5 +4,5 @@ class VariableExists(BaseOperation): def _execute_operation(self): # get metadata - dataframe = self.data_service.get_dataset(dataset_name=self.params.dataset_path) + dataframe = self.data_service.get_dataset(dataset_name=self.params.domain) return self.params.target in dataframe diff --git a/cdisc_rules_engine/operations/variable_is_null.py b/cdisc_rules_engine/operations/variable_is_null.py index 7e14bfcd4..89a79707e 100644 --- a/cdisc_rules_engine/operations/variable_is_null.py +++ b/cdisc_rules_engine/operations/variable_is_null.py @@ -6,9 +6,7 @@ def _execute_operation(self): if self.params.source == "submission": if self.params.level == "row": raise ValueError("level: row may only be used with source: evaluation") - dataframe = self.data_service.get_dataset( - dataset_name=self.params.dataset_path - ) + dataframe = self.data_service.get_dataset(dataset_name=self.params.domain) else: dataframe = self.evaluation_dataset diff --git a/cdisc_rules_engine/operations/variable_value_count.py b/cdisc_rules_engine/operations/variable_value_count.py index f283a0e60..75e7b6b10 100644 --- a/cdisc_rules_engine/operations/variable_value_count.py +++ b/cdisc_rules_engine/operations/variable_value_count.py @@ -2,7 +2,6 @@ from cdisc_rules_engine.models.sdtm_dataset_metadata import SDTMDatasetMetadata from cdisc_rules_engine.operations.base_operation import BaseOperation import asyncio -import os from collections import Counter from typing import List from cdisc_rules_engine.utilities.sdtm_utilities import get_corresponding_datasets @@ -44,9 +43,7 @@ async def _get_dataset_variable_value_count( ) else: data: DatasetInterface = self.data_service.get_dataset( - dataset_name=os.path.join( - self.params.directory_path, dataset_metadata.filename - ) + dataset_name=dataset_metadata.name ) data = tag_source(data, dataset_metadata) target_variable = BaseOperation._replace_variable_wildcard( diff --git a/cdisc_rules_engine/services/data_services/base_data_service.py b/cdisc_rules_engine/services/data_services/base_data_service.py index 8ac16f92e..81ceb2bb3 100644 --- a/cdisc_rules_engine/services/data_services/base_data_service.py +++ b/cdisc_rules_engine/services/data_services/base_data_service.py @@ -1,5 +1,5 @@ import asyncio -from abc import ABC +from abc import ABC, abstractmethod from functools import wraps, partial from typing import Callable, List, Optional, Iterable, Iterator from concurrent.futures import ThreadPoolExecutor @@ -34,8 +34,7 @@ from cdisc_rules_engine.services.data_readers import DataReaderFactory from cdisc_rules_engine.utilities.utils import ( get_dataset_cache_key_from_path, - get_directory_path, - search_in_list_of_dicts, + search_in_list, replace_nan_values_in_df, ) from cdisc_rules_engine.utilities.sdtm_utilities import ( @@ -116,6 +115,10 @@ def __init__( self.dataset_implementation = kwargs.get( "dataset_implementation", PandasDataset ) + # Call the subclass implementation to populate metadata + self._datasets_metadata: dict[str, SDTMDatasetMetadata] = ( + self._initialize_datasets_metadata(**kwargs) + ) def get_dataset_by_type( self, dataset_name: str, dataset_type: str, **params @@ -152,7 +155,7 @@ def concat_split_datasets( # download datasets asynchronously datasets: Iterator[DatasetInterface] = self._async_get_datasets( func_to_call, - dataset_names=[dataset.full_path for dataset in datasets_metadata], + dataset_names=[dataset.name for dataset in datasets_metadata], **kwargs, ) full_dataset = self.dataset_implementation() @@ -173,7 +176,6 @@ def check_filepath(self, dataset_names: List[str]) -> List: def get_dataset_class( self, dataset: DatasetInterface, - file_path: str, datasets: Iterable[SDTMDatasetMetadata], dataset_metadata: SDTMDatasetMetadata, ) -> Optional[str]: @@ -185,9 +187,7 @@ def get_dataset_class( name = class_data.get("name") if name: return convert_library_class_name_to_ct_class(name) - return self._handle_custom_domains( - dataset, dataset_metadata, file_path, datasets - ) + return self._handle_custom_domains(dataset, dataset_metadata, datasets) def get_data_structure( self, @@ -231,11 +231,53 @@ def get_dataset_metadata( } return self.dataset_implementation.from_dict(metadata_to_return) + def get_raw_dataset_metadata( + self, dataset_name: str, **kwargs + ) -> SDTMDatasetMetadata: + """ + Returns dataset metadata from the metadata dictionary. + + Args: + dataset_name: Name or filename of the dataset + + Returns: + SDTMDatasetMetadata instance + + Raises: + KeyError: If dataset_name is not found in the metadata dictionary + """ + if dataset_name not in self._datasets_metadata: + raise KeyError( + f"Dataset '{dataset_name}' not found in metadata. " + f"Available datasets: {list(self._datasets_metadata.keys())}" + ) + return self._datasets_metadata[dataset_name] + + def get_datasets(self) -> List[SDTMDatasetMetadata]: + """ + Returns list of dataset metadata. + """ + return list(self._datasets_metadata.values()) + + @abstractmethod + def _initialize_datasets_metadata(self, **kwargs) -> dict[str, SDTMDatasetMetadata]: + """ + Initialize the dataset metadata dictionary. + + Subclasses must implement this method to populate the metadata dictionary + with their specific logic for reading and organizing dataset metadata. + + Args: + **kwargs: Additional keyword arguments passed from __init__ + + Returns: + Dictionary mapping dataset name to SDTMDatasetMetadata + """ + def _handle_custom_domains( self, dataset: DatasetInterface, dataset_metadata: SDTMDatasetMetadata, - file_path: str, datasets: Iterable[SDTMDatasetMetadata], ): if self._contains_topic_variable(dataset, dataset_metadata.domain, "TERM"): @@ -250,13 +292,12 @@ def _handle_custom_domains( return FINDINGS if dataset_metadata.is_ap: return self._get_associated_persons_inherit_class( - file_path, datasets, dataset_metadata + datasets, dataset_metadata ) return None def _get_associated_persons_inherit_class( self, - file_path, datasets: Iterable[SDTMDatasetMetadata], dataset_metadata: SDTMDatasetMetadata, ): @@ -266,24 +307,20 @@ def _get_associated_persons_inherit_class( ap_suffix = dataset_metadata.ap_suffix if not ap_suffix: return None - directory_path = get_directory_path(file_path) if len(datasets) > 1: - domain_details: SDTMDatasetMetadata = search_in_list_of_dicts( + new_dataset_metadata: SDTMDatasetMetadata = search_in_list( datasets, lambda item: item.domain == ap_suffix ) - if domain_details: - if domain_details.is_ap: + if new_dataset_metadata: + if new_dataset_metadata.is_ap: raise ValueError("Nested Associated Persons domain reference") - file_name = domain_details.filename - new_file_path = os.path.join(directory_path, file_name) - new_domain_dataset = self.get_dataset(dataset_name=new_file_path) + new_dataset = self.get_dataset(dataset_name=new_dataset_metadata.name) else: raise ValueError("Filename for domain doesn't exist") return self.get_dataset_class( - new_domain_dataset, - new_file_path, + new_dataset, datasets, - domain_details, + new_dataset_metadata, ) else: return None diff --git a/cdisc_rules_engine/services/data_services/dummy_data_service.py b/cdisc_rules_engine/services/data_services/dummy_data_service.py index 9275e2262..7953de5ec 100644 --- a/cdisc_rules_engine/services/data_services/dummy_data_service.py +++ b/cdisc_rules_engine/services/data_services/dummy_data_service.py @@ -1,14 +1,10 @@ from datetime import datetime from io import IOBase -from typing import List, Optional, Iterable, Sequence +from typing import List, Optional, Sequence -import os import pandas as pd import tempfile from cdisc_rules_engine.dummy_models.dummy_dataset import DummyDataset -from cdisc_rules_engine.exceptions.custom_exceptions import ( - DatasetNotFoundError, -) from cdisc_rules_engine.interfaces import CacheServiceInterface, ConfigInterface from cdisc_rules_engine.models.sdtm_dataset_metadata import SDTMDatasetMetadata from cdisc_rules_engine.models.dataset_types import DatasetTypes @@ -31,11 +27,11 @@ def __init__( config: ConfigInterface, **kwargs, ): + self.data: List[DummyDataset] = kwargs.get("data") + self.define_xml: str = kwargs.get("define_xml") super(DummyDataService, self).__init__( cache_service, reader_factory, config, **kwargs ) - self.data: List[DummyDataset] = kwargs.get("data") - self.define_xml: str = kwargs.get("define_xml") @classmethod def get_instance( @@ -53,15 +49,9 @@ def get_instance( **kwargs, ) - def check_dataset_exists(self, dataset_name): - dataset_name = dataset_name.replace("/", "") - if dataset_name not in self.data: - raise DatasetNotFoundError("dataset does not exist") - def get_dataset_data(self, dataset_name: str) -> Optional[DummyDataset]: - dataset_name = os.path.basename(dataset_name) for dataset in self.data: - if dataset.filename == dataset_name: + if dataset.name == dataset_name: return dataset return None @@ -75,20 +65,28 @@ def get_dataset(self, dataset_name: str, **params) -> PandasDataset: else: return PandasDataset.from_dict({}) - def get_raw_dataset_metadata( - self, dataset_name: str, **kwargs - ) -> SDTMDatasetMetadata: - dataset_metadata: dict = self.__get_dataset_metadata(dataset_name, **kwargs) - return SDTMDatasetMetadata( - name=dataset_metadata["dataset_name"][0], - first_record={"DOMAIN": dataset_metadata["dataset_name"][0]}, - label=dataset_metadata["dataset_label"][0], - modification_date=datetime.now().isoformat(), - filename=dataset_metadata["filename"][0], - file_size=dataset_metadata["dataset_size"][0], - full_path=dataset_metadata["filename"][0], - record_count=dataset_metadata["record_count"][0], - ) + def _initialize_datasets_metadata(self, **kwargs) -> dict[str, SDTMDatasetMetadata]: + """ + Initialize the dataset metadata by converting DummyDataset objects to SDTMDatasetMetadata. + + Returns: + Dictionary mapping dataset name to SDTMDatasetMetadata + """ + result = {} + for dataset in self.data: + dataset_metadata_dict: dict = dataset.get_metadata() + metadata = SDTMDatasetMetadata( + name=dataset_metadata_dict["dataset_name"][0], + first_record={"DOMAIN": dataset_metadata_dict["dataset_name"][0]}, + label=dataset_metadata_dict["dataset_label"][0], + modification_date=datetime.now().isoformat(), + filename=dataset_metadata_dict["filename"][0], + file_size=dataset_metadata_dict["dataset_size"][0], + full_path=dataset_metadata_dict["filename"][0], + record_count=dataset_metadata_dict["record_count"][0], + ) + result[metadata.name] = metadata + return result def get_variables_metadata(self, dataset_name: str, **params) -> PandasDataset: metadata_to_return = { @@ -154,13 +152,6 @@ def get_file_matching_pattern(self, prefix: str, pattern: str) -> str: def read_data(self, file_path: str) -> IOBase: return open(file_path, "rb") - def __get_dataset_metadata(self, dataset_name: str, **kwargs) -> dict: - dataset: Optional[DummyDataset] = self.get_dataset_data(dataset_name) - metadata_to_return = {} - if dataset: - metadata_to_return: dict = dataset.get_metadata() - return metadata_to_return - def to_parquet(self, file_path: str) -> str: """ Save the dataset with full_path == file_path to a parquet file. @@ -179,9 +170,6 @@ def to_parquet(self, file_path: str) -> str: return len(df.index), temp_file.name return 0, "" - def get_datasets(self) -> Iterable[SDTMDatasetMetadata]: - return self.data - @staticmethod def get_data(dataset_paths: Sequence[str], encoding: str = DEFAULT_ENCODING): json = JSONReader(encoding=encoding or DEFAULT_ENCODING).from_file( diff --git a/cdisc_rules_engine/services/data_services/excel_data_service.py b/cdisc_rules_engine/services/data_services/excel_data_service.py index 3b3e2224a..c29b6da8b 100644 --- a/cdisc_rules_engine/services/data_services/excel_data_service.py +++ b/cdisc_rules_engine/services/data_services/excel_data_service.py @@ -35,10 +35,10 @@ def __init__( config: ConfigInterface, **kwargs, ): + self.dataset_path: str = kwargs.get("dataset_path", "") super(ExcelDataService, self).__init__( cache_service, reader_factory, config, **kwargs ) - self.dataset_path: str = kwargs.get("dataset_path", "") @classmethod def get_instance( @@ -75,8 +75,7 @@ def get_file_matching_pattern(self, prefix: str, pattern: str) -> str: return f return None - @cached_dataset(DatasetTypes.CONTENTS.value) - def get_dataset(self, dataset_name: str, **params) -> DatasetInterface: + def __get_dataset(self, sheet_name: str) -> DatasetInterface: dtype_mapping = { "Char": str, "Num": float, @@ -86,7 +85,7 @@ def get_dataset(self, dataset_name: str, **params) -> DatasetInterface: } header = pd.read_excel( self.dataset_path, - sheet_name=dataset_name, + sheet_name=sheet_name, header=None, nrows=3, na_values=[""], @@ -96,7 +95,7 @@ def get_dataset(self, dataset_name: str, **params) -> DatasetInterface: dtypes = {key: dtype_mapping.get(value, str) for key, value in dtypes.items()} dataframe = pd.read_excel( self.dataset_path, - sheet_name=dataset_name, + sheet_name=sheet_name, dtype=dtypes, skiprows=(1, 2, 3), na_values=[""], @@ -108,15 +107,21 @@ def get_dataset(self, dataset_name: str, **params) -> DatasetInterface: offending = [col for col in dataframe.columns if col != col.strip()] if offending: raise ExcelTestDataError( - f"Sheet '{dataset_name}' has column headers with leading/trailing whitespace: " + f"Sheet '{sheet_name}' has column headers with leading/trailing whitespace: " f"{[repr(c) for c in offending]}." ) dataset = PandasDataset(dataframe) return dataset - def _get_dataset_name( - self, metadata: pd.DataFrame, first_record: dict, dataset_filename: str - ) -> str: + @cached_dataset(DatasetTypes.CONTENTS.value) + def get_dataset(self, dataset_name: str, **params) -> DatasetInterface: + dataset_metadata = self._datasets_metadata.get(dataset_name) + if dataset_metadata is None: + return PandasDataset.from_dict({}) + sheet_name = dataset_metadata.filename + return self.__get_dataset(sheet_name) + + def _get_dataset_name(self, first_record: dict, dataset_filename: str) -> str: if self.standard == "usdm": return first_record.get("instanceType", dataset_filename.split(".")[0]) return dataset_filename.split(".")[0].upper() @@ -130,38 +135,71 @@ def _get_datasets_worksheet(self) -> pd.DataFrame: keep_default_na=False, ) - @cached_dataset(DatasetTypes.RAW_METADATA.value) - def get_raw_dataset_metadata( - self, - dataset_name: str, - **kwargs, - ) -> SDTMDatasetMetadata: + def _initialize_datasets_metadata(self, **kwargs) -> dict[str, SDTMDatasetMetadata]: """ - Returns dataset metadata as DatasetMetadata instance. + Initialize the dataset metadata by reading metadata for all datasets in the Excel file. + + Returns: + Dictionary mapping dataset name to SDTMDatasetMetadata """ - datasets_worksheet = self._get_datasets_worksheet() - metadata = datasets_worksheet[ - datasets_worksheet[ExcelDataSheets.DATASET_FILENAME_COLUMN.value] - == dataset_name - ] - dataset = self.get_dataset(dataset_name=dataset_name) - first_record = dataset.data.iloc[0].to_dict() if not dataset.empty else {} - return SDTMDatasetMetadata( - name=self._get_dataset_name(metadata, first_record, dataset_name), - first_record=first_record, - label=( - metadata[ExcelDataSheets.DATASET_LABEL_COLUMN.value].iloc[0] - if not metadata.empty - else "" - ), - modification_date=datetime.fromtimestamp( - os.path.getmtime(self.dataset_path) - ).isoformat(), - filename=dataset_name, - full_path=dataset_name, - file_size=0, - record_count=len(dataset), + result = {} + try: + datasets_worksheet = self._get_datasets_worksheet() + except ValueError as e: + # Pandas raises ValueError when sheet is not found + if "Worksheet named" in str(e): + try: + with pd.ExcelFile(self.dataset_path) as xl: + sheet_names = xl.sheet_names + available = ", ".join(repr(s) for s in sheet_names) or "(none)" + except Exception: + available = "(unable to read sheet names)" + raise ExcelTestDataError( + f"The workbook does not contain a '{ExcelDataSheets.DATASETS_SHEET_NAME.value}' sheet. " + f"Submitted sheet names: {available}." + ) from e + raise + + # Check for required columns + missing_cols = sorted( + set(ExcelDataSheets.DATASETS_SHEET_REQUIRED_COLUMNS.value) + - set(datasets_worksheet.columns) ) + if missing_cols: + raise ExcelTestDataError( + f"The '{ExcelDataSheets.DATASETS_SHEET_NAME.value}' sheet is missing a " + f"required {ExcelDataSheets.DATASETS_SHEET_REQUIRED_COLUMNS.value} column(s): " + f"{missing_cols}. Column headers are case-sensitive. " + ) + + for dataset_filename in datasets_worksheet[ + ExcelDataSheets.DATASET_FILENAME_COLUMN.value + ]: + dataset = self.__get_dataset(dataset_filename) + first_record = dataset.data.iloc[0].to_dict() if not dataset.empty else {} + metadata_row = datasets_worksheet[ + datasets_worksheet[ExcelDataSheets.DATASET_FILENAME_COLUMN.value] + == dataset_filename + ] + dataset_name = self._get_dataset_name(first_record, dataset_filename) + dataset_metadata = SDTMDatasetMetadata( + name=dataset_name, + first_record=first_record, + label=( + metadata_row[ExcelDataSheets.DATASET_LABEL_COLUMN.value].iloc[0] + if not metadata_row.empty + else "" + ), + modification_date=datetime.fromtimestamp( + os.path.getmtime(self.dataset_path) + ).isoformat(), + filename=dataset_filename, + full_path=dataset_filename, + file_size=0, + record_count=len(dataset), + ) + result[dataset_name] = dataset_metadata + return result @cached_dataset(DatasetTypes.VARIABLES_METADATA.value) def get_variables_metadata(self, dataset_name: str, **params) -> DatasetInterface: @@ -211,46 +249,6 @@ def get_define_xml_contents(self, dataset_name: str) -> bytes: def read_data(self, file_path: str) -> IOBase: return open(file_path, "rb") - def get_datasets(self) -> List[dict]: - try: - with pd.ExcelFile(self.dataset_path) as xl: - sheet_names = xl.sheet_names - if ExcelDataSheets.DATASETS_SHEET_NAME.value not in sheet_names: - available = ", ".join(repr(s) for s in sheet_names) or "(none)" - raise ExcelTestDataError( - f"The workbook does not contain a '{ExcelDataSheets.DATASETS_SHEET_NAME.value}' sheet. " - f"Submitted sheet names: {available}." - ) - worksheet = xl.parse( - ExcelDataSheets.DATASETS_SHEET_NAME.value, - na_values=[""], - keep_default_na=False, - ) - except ExcelTestDataError: - raise - except Exception as e: - raise ExcelTestDataError( - f"Cannot read the Excel file. Ensure it is a valid .xlsx workbook. " - f"Details: {e}" - ) from e - - missing_cols = sorted( - set(ExcelDataSheets.DATASETS_SHEET_REQUIRED_COLUMNS.value) - - set(worksheet.columns) - ) - if missing_cols: - raise ExcelTestDataError( - f"The '{ExcelDataSheets.DATASETS_SHEET_NAME.value}' sheet is missing a " - f"required {ExcelDataSheets.DATASETS_SHEET_REQUIRED_COLUMNS.value} column(s): " - f"{missing_cols}. Column headers are case-sensitive. " - ) - - datasets = [ - self.get_raw_dataset_metadata(dataset_name=fn) - for fn in worksheet[ExcelDataSheets.DATASET_FILENAME_COLUMN.value] - ] - return datasets - def to_parquet(self, file_path: str) -> str: """ Stub implementation to satisfy abstract interface requirements. diff --git a/cdisc_rules_engine/services/data_services/local_data_service.py b/cdisc_rules_engine/services/data_services/local_data_service.py index 201fbe1f5..26124f28d 100644 --- a/cdisc_rules_engine/services/data_services/local_data_service.py +++ b/cdisc_rules_engine/services/data_services/local_data_service.py @@ -42,11 +42,46 @@ def __init__( config: ConfigInterface, **kwargs, ): + self.encoding: str = kwargs.get("encoding") super(LocalDataService, self).__init__( cache_service, reader_factory, config, **kwargs ) - self.dataset_paths: Iterable[str] = kwargs.get("dataset_paths", []) - self.encoding: str = kwargs.get("encoding") + + def _initialize_datasets_metadata(self, **kwargs) -> dict[str, SDTMDatasetMetadata]: + """ + Initialize the dataset metadata by reading metadata for all dataset paths. + + Args: + **kwargs: Keyword arguments including dataset_paths + + Returns: + Dictionary mapping dataset name to SDTMDatasetMetadata + """ + dataset_paths: Iterable[str] = kwargs.get("dataset_paths", []) + result = {} + for dataset_path in dataset_paths: + try: + file_metadata, contents_metadata = self.__get_dataset_metadata( + dataset_path + ) + metadata = SDTMDatasetMetadata( + name=contents_metadata["dataset_name"], + first_record=contents_metadata["first_record"], + label=contents_metadata["dataset_label"], + modification_date=contents_metadata["dataset_modification_date"], + filename=file_metadata["name"], + full_path=file_metadata["path"], + file_size=file_metadata["file_size"], + record_count=contents_metadata["dataset_length"], + ) + result[metadata.name] = metadata + except InvalidDatasetFormat: + raise + except Exception as e: + raise InvalidDatasetFormat( + f"Your data file could not be read: {dataset_path}." + ) from e + return result @classmethod def get_instance( @@ -98,33 +133,13 @@ def get_file_matching_pattern(self, prefix: str, pattern: str) -> str: @cached_dataset(DatasetTypes.CONTENTS.value) def get_dataset(self, dataset_name: str, **params) -> DatasetInterface: + full_path = self._datasets_metadata[dataset_name].full_path reader = self._reader_factory.get_service( - basename(dataset_name).split(".")[1].upper() + basename(full_path).split(".")[1].upper() ) - df = reader.from_file(dataset_name) + df = reader.from_file(full_path) return df - @cached_dataset(DatasetTypes.RAW_METADATA.value) - def get_raw_dataset_metadata( - self, dataset_name: str, **kwargs - ) -> SDTMDatasetMetadata: - """ - Returns dataset metadata as DatasetMetadata instance. - """ - file_metadata, contents_metadata = self.__get_dataset_metadata( - dataset_name, **kwargs - ) - return SDTMDatasetMetadata( - name=contents_metadata["dataset_name"], - first_record=contents_metadata["first_record"], - label=contents_metadata["dataset_label"], - modification_date=contents_metadata["dataset_modification_date"], - filename=file_metadata["name"], - full_path=file_metadata["path"], - file_size=file_metadata["file_size"], - record_count=contents_metadata["dataset_length"], - ) - @cached_dataset(DatasetTypes.VARIABLES_METADATA.value) def get_variables_metadata( self, dataset_name: str, datasets: list, **params @@ -166,18 +181,20 @@ def get_dataset_by_type( ) def read_metadata( - self, file_path: str, datasets: Optional[Iterable[SDTMDatasetMetadata]] = None + self, + dataset_path: str, + datasets: Optional[Iterable[SDTMDatasetMetadata]] = None, ) -> dict: - file_size = os.path.getsize(file_path) - file_name = basename(file_path) + file_size = os.path.getsize(dataset_path) + file_name = basename(dataset_path) file_metadata = { - "path": file_path, + "path": dataset_path, "name": file_name, "file_size": file_size, } if file_name.endswith(".parquet") and datasets: for obj in datasets: - if obj.full_path == file_path: + if obj.full_path == dataset_path: file_metadata = { "path": obj.original_path, "name": basename(obj.original_path), @@ -216,12 +233,12 @@ def read_metadata( def read_data(self, file_path: str) -> IOBase: return open(file_path, "rb") - def __get_dataset_metadata(self, dataset_name: str, **kwargs) -> Tuple[dict, dict]: + def __get_dataset_metadata(self, dataset_path: str, **kwargs) -> Tuple[dict, dict]: """ Internal method that gets dataset metadata and converts file size if needed. """ - metadata: dict = self.read_metadata(dataset_name, kwargs.get("datasets")) + metadata: dict = self.read_metadata(dataset_path, kwargs.get("datasets")) file_metadata: dict = metadata["file_metadata"] size_unit: Optional[str] = kwargs.get("size_unit") if size_unit: # convert file size from bytes to desired unit if needed @@ -236,22 +253,6 @@ def to_parquet(self, file_path: str) -> str: ) return reader.to_parquet(file_path) - def get_datasets(self) -> List[dict]: - datasets = [] - for dataset_path in self.dataset_paths: - try: - dataset_metadata = self.get_raw_dataset_metadata( - dataset_name=dataset_path - ) - datasets.append(dataset_metadata) - except InvalidDatasetFormat: - raise - except Exception as e: - raise InvalidDatasetFormat( - f"Your data file could not be read: {dataset_path}." - ) from e - return datasets - @staticmethod def is_valid_data(dataset_paths: List[str]) -> bool: for dataset_path in dataset_paths: diff --git a/cdisc_rules_engine/services/data_services/usdm_data_service.py b/cdisc_rules_engine/services/data_services/usdm_data_service.py index 30928a102..a5c5d1a8f 100644 --- a/cdisc_rules_engine/services/data_services/usdm_data_service.py +++ b/cdisc_rules_engine/services/data_services/usdm_data_service.py @@ -71,18 +71,13 @@ def __init__( config: ConfigInterface, **kwargs, ): - super(USDMDataService, self).__init__( - cache_service, reader_factory, config, **kwargs - ) self.dataset_path: str = kwargs.get("dataset_path", "") self.encoding: str = kwargs.get("encoding") with open(os.path.join("resources", "schema", "USDM.yaml")) as entity_dict: self.entity_dict: dict = safe_load(entity_dict) - self.json = self._reader_factory.get_service("USDM").from_file( - self.dataset_path - ) + self.json = reader_factory.get_service("USDM").from_file(self.dataset_path) # Build the id lookup dict once for fast reference resolution self._id_lookup = self.__build_id_lookup(self.json) @@ -93,6 +88,10 @@ def __init__( self._jsonpath_cache = {} + super(USDMDataService, self).__init__( + cache_service, reader_factory, config, **kwargs + ) + @classmethod def get_instance( cls, @@ -132,27 +131,34 @@ def get_file_matching_pattern(self, prefix: str, pattern: str) -> str: def get_dataset(self, dataset_name: str, **params) -> DatasetInterface: return self.__get_dataset(dataset_name) - @cached_dataset(DatasetTypes.RAW_METADATA.value) - def get_raw_dataset_metadata( - self, dataset_name: str, **kwargs - ) -> SDTMDatasetMetadata: + def _initialize_datasets_metadata(self, **kwargs) -> dict[str, SDTMDatasetMetadata]: """ - Returns dataset metadata as DatasetMetadata instance. + Initialize the dataset metadata by reading metadata for all datasets in the USDM JSON. + + Returns: + Dictionary mapping dataset name to SDTMDatasetMetadata """ - dataset = self.get_dataset(dataset_name=dataset_name) - domain = self.__get_domain_from_dataset_name(dataset_name) - return SDTMDatasetMetadata( - name=domain, - first_record={"DOMAIN": domain}, - label=domain, - modification_date=datetime.fromtimestamp( - os.path.getmtime(self.dataset_path) - ).isoformat(), - filename=basename(dataset_name), - full_path=dataset_name, - file_size=0, - record_count=len(dataset), - ) + result = {} + for dataset_info in self.dataset_content_index: + dataset_name = dataset_info.get("dataset_name") + if not dataset_name: + continue + dataset = self.__get_dataset(dataset_name) + domain = self.__get_domain_from_dataset_name(dataset_name) + metadata = SDTMDatasetMetadata( + name=domain, + first_record={"DOMAIN": domain}, + label=domain, + modification_date=datetime.fromtimestamp( + os.path.getmtime(self.dataset_path) + ).isoformat(), + filename=basename(dataset_name), + full_path=dataset_name, + file_size=0, + record_count=len(dataset), + ) + result[domain] = metadata + return result @cached_dataset(DatasetTypes.VARIABLES_METADATA.value) def get_variables_metadata(self, dataset_name: str, **params) -> DatasetInterface: @@ -217,18 +223,6 @@ def read_metadata(self, dataset_name: str) -> dict: def read_data(self, file_path: str) -> IOBase: return open(file_path, "rb") - def get_datasets(self) -> List[dict]: - datasets = [] - for dataset in self.dataset_content_index: - dataset_name = dataset.get("dataset_name") - if not dataset_name: - continue - dataset_metadata: SDTMDatasetMetadata = self.get_raw_dataset_metadata( - dataset_name=dataset_name - ) - datasets.append(dataset_metadata) - return datasets - def to_parquet(self, file_path: str) -> str: """ Stub implementation to satisfy abstract interface requirements. diff --git a/cdisc_rules_engine/utilities/data_processor.py b/cdisc_rules_engine/utilities/data_processor.py index 6ebab89a0..5ba977ea6 100644 --- a/cdisc_rules_engine/utilities/data_processor.py +++ b/cdisc_rules_engine/utilities/data_processor.py @@ -19,7 +19,7 @@ ) from cdisc_rules_engine.exceptions.custom_exceptions import PreprocessingError from cdisc_rules_engine.utilities.utils import ( - search_in_list_of_dicts, + search_in_list, ) from cdisc_rules_engine.utilities.sdtm_utilities import add_variable_wildcards @@ -105,13 +105,13 @@ def merge_on_relrec_record( model_metadata = ( dataset_preprocessor._data_service.library_metadata.model_metadata ) - file_info: SDTMDatasetMetadata = search_in_list_of_dicts( + dataset_metadata: SDTMDatasetMetadata = search_in_list( datasets, lambda item: item.domain == relrec_row["RDOMAIN_RIGHT"] ) - if not file_info: + if not dataset_metadata: return DatasetInterface() - right_dataset: DatasetInterface = dataset_preprocessor._download_dataset( - file_info.filename + right_dataset: DatasetInterface = ( + dataset_preprocessor._data_service.get_dataset(dataset_metadata.name) ) variables_with_wildcards = { source: f"RELREC.{target}" @@ -479,7 +479,7 @@ def column_metadata_equal_to_define_and_library( library_metadata: dict, rule: dict, ) -> bool: - define_variable_metadata: Optional[dict] = search_in_list_of_dicts( + define_variable_metadata: Optional[dict] = search_in_list( define_metadata, lambda item: item.get("define_variable_name") == column ) if not define_variable_metadata: diff --git a/cdisc_rules_engine/utilities/dataset_preprocessor.py b/cdisc_rules_engine/utilities/dataset_preprocessor.py index 0b34890a1..28ec832cc 100644 --- a/cdisc_rules_engine/utilities/dataset_preprocessor.py +++ b/cdisc_rules_engine/utilities/dataset_preprocessor.py @@ -16,7 +16,6 @@ get_sided_match_keys, ) from cdisc_rules_engine.exceptions.custom_exceptions import PreprocessingError -import os import pandas as pd @@ -66,7 +65,7 @@ def preprocess( # noqa is_child = bool(domain_details.get("child")) # download other datasets from blob storage and merge if is_child: - file_infos = [] + dataset_metadatas = [] # find parent of SUPP or SQAP dataset if ( (domain_name[:4] == "SUPP" or domain_name[:4] == "SQAP") @@ -77,7 +76,7 @@ def preprocess( # noqa domain_name == "SUPP--" or domain_name == self._dataset_metadata.name ): - file_infos: list[SDTMDatasetMetadata] = [ + dataset_metadatas: list[SDTMDatasetMetadata] = [ item for item in datasets if (item.domain == self._dataset_metadata.rdomain) @@ -87,8 +86,8 @@ def preprocess( # noqa domain_name == self._dataset_metadata.domain or domain_name == self._dataset_metadata.name ): - file_infos: list[SDTMDatasetMetadata] = self._find_parent_dataset( - datasets, domain_details + dataset_metadatas: list[SDTMDatasetMetadata] = ( + self._find_parent_dataset(datasets, domain_details) ) else: if self._is_split_domain(domain_name): @@ -96,7 +95,7 @@ def preprocess( # noqa target_domain_name: str = ( self._dataset_metadata.domain or self._dataset_metadata.name ) - file_infos: list[SDTMDatasetMetadata] = [ + dataset_metadatas: list[SDTMDatasetMetadata] = [ item for item in datasets if ( @@ -111,7 +110,7 @@ def preprocess( # noqa ) ] - if not file_infos and not ( + if not dataset_metadatas and not ( (self._dataset_metadata.is_supp and domain_name == "SUPP--") or self._dataset_metadata.name == "RELREC" ): @@ -121,18 +120,18 @@ def preprocess( # noqa ) continue - for file_info in file_infos: - if file_info.domain in merged_domains: + for dataset_metadata in dataset_metadatas: + if dataset_metadata.domain in merged_domains: continue # Try to download the dataset try: - other_dataset: DatasetInterface = self._download_dataset( - file_info.data_service_identifier + other_dataset: DatasetInterface = self._data_service.get_dataset( + dataset_metadata.name ) except Exception as e: raise PreprocessingError( - f"Failed to download dataset '{file_info.data_service_identifier}' for preprocessing: {str(e)}" + f"Failed to download dataset '{dataset_metadata.name}' for preprocessing: {str(e)}" ) referenced_targets = set( @@ -156,11 +155,11 @@ def preprocess( # noqa left_dataset=result, left_dataset_domain_name=self._dataset_metadata.domain, right_dataset=other_dataset, - right_dataset_domain_name=file_info.domain, + right_dataset_domain_name=dataset_metadata.domain, match_keys=domain_details.get("match_key"), datasets=datasets, ) - merged_domains.add(file_info.domain) + merged_domains.add(dataset_metadata.domain) else: result = self._merge_datasets( left_dataset=result, @@ -170,7 +169,9 @@ def preprocess( # noqa datasets=datasets, ) merged_domains.add( - file_info.domain if file_info.domain else file_info.name + dataset_metadata.domain + if dataset_metadata.domain + else dataset_metadata.name ) return result @@ -209,13 +210,6 @@ def _find_parent_dataset( def _is_split_domain(self, domain: str) -> bool: return domain == self._dataset_metadata.unsplit_name - def _download_dataset(self, filename: str) -> DatasetInterface: - return self._data_service.get_dataset( - dataset_name=os.path.join( - os.path.dirname(self._dataset_metadata.full_path), filename - ) - ) - def _child_merge_datasets( self, left_dataset: DatasetInterface, diff --git a/cdisc_rules_engine/utilities/rule_processor.py b/cdisc_rules_engine/utilities/rule_processor.py index 0d289d107..8cb1326ae 100644 --- a/cdisc_rules_engine/utilities/rule_processor.py +++ b/cdisc_rules_engine/utilities/rule_processor.py @@ -1,6 +1,5 @@ import re import copy -import os from typing import Iterable, List, Optional, Union, Tuple from cdisc_rules_engine.enums.rule_types import RuleTypes @@ -40,7 +39,7 @@ from cdisc_rules_engine.utilities.utils import ( get_directory_path, get_operations_cache_key, - search_in_list_of_dicts, + search_in_list, ) from cdisc_rules_engine.models.external_dictionaries_container import ( ExternalDictionariesContainer, @@ -349,7 +348,7 @@ def perform_rule_operations( # change -- pattern to domain name original_target: str = operation.get("name") target: str = original_target - domain: str = operation.get("domain", dataset_metadata.unsplit_name) + domain: str = operation.get("domain", dataset_metadata.name) wildcard_replacement: str = operation.get( "domain", dataset_metadata.wildcard_replacement ) @@ -469,7 +468,7 @@ def _execute_operation( operation_params.dataframe, operation_params.domain ): # download other domain - domain_details: dict = search_in_list_of_dicts( + dataset_metadata: DatasetMetadata = search_in_list( operation_params.datasets, lambda item: ( item.unsplit_name == operation_params.domain @@ -479,7 +478,7 @@ def _execute_operation( ) ), ) - if domain_details is None: + if dataset_metadata is None: raise DomainNotFoundError( f"Failed to execute rule operation. " f"Domain {operation_params.domain} does not exist. " @@ -487,12 +486,8 @@ def _execute_operation( f"Target: {operation_params.target}, " f"Core ID: {operation_params.core_id}" ) - file_path: str = os.path.join( - get_directory_path(operation_params.dataset_path), - domain_details.data_service_identifier, - ) operation_params.dataframe = self.data_service.get_dataset( - dataset_name=file_path + dataset_metadata.name ) # call the operation diff --git a/cdisc_rules_engine/utilities/sdtm_utilities.py b/cdisc_rules_engine/utilities/sdtm_utilities.py index c00c0d6e2..d53c3f133 100644 --- a/cdisc_rules_engine/utilities/sdtm_utilities.py +++ b/cdisc_rules_engine/utilities/sdtm_utilities.py @@ -14,7 +14,7 @@ from cdisc_rules_engine.models.dataset.dataset_interface import DatasetInterface from cdisc_rules_engine.models.dataset_metadata import DatasetMetadata from cdisc_rules_engine.utilities.utils import ( - search_in_list_of_dicts, + search_in_list, ) from cdisc_rules_engine.constants.classes import ( DETECTABLE_CLASSES, @@ -74,13 +74,13 @@ def get_class_and_dataset_metadata( """ for c in library_metadata.standard_metadata.get("classes", []): - dataset_details = search_in_list_of_dicts( + dataset_details = search_in_list( c.get("datasets", []), lambda item: item["name"] == dataset_name ) if dataset_details: return c, dataset_details for c in library_metadata.model_metadata.get("classes", []): - dataset_details = search_in_list_of_dicts( + dataset_details = search_in_list( c.get("datasets", []), lambda item: item["name"] == dataset_name ) if dataset_details: @@ -139,7 +139,7 @@ def get_variables_metadata_from_standard( # noqa ) else: class_name = data_service._handle_custom_domains( - data_service.get_dataset(dataset_name=dataset_metadata.full_path), + data_service.get_dataset(dataset_metadata.name), dataset_metadata, dataset_path, datasets, @@ -306,7 +306,7 @@ def get_class_metadata( } """ - class_metadata: Optional[dict] = search_in_list_of_dicts( + class_metadata: Optional[dict] = search_in_list( model_details.get("classes", []), lambda item: convert_library_class_name_to_ct_class(item["name"]) == dataset_class, @@ -480,7 +480,7 @@ def get_variables_metadata_from_standard_model( # noqa def get_model_domain_metadata(model_details: dict, domain_name: str) -> dict: # Get domain metadata from model - domain_details: Optional[dict] = search_in_list_of_dicts( + domain_details: Optional[dict] = search_in_list( model_details.get("datasets", []), lambda item: item["name"] == domain_name ) diff --git a/cdisc_rules_engine/utilities/utils.py b/cdisc_rules_engine/utilities/utils.py index 5e41736a2..ac169225c 100644 --- a/cdisc_rules_engine/utilities/utils.py +++ b/cdisc_rules_engine/utilities/utils.py @@ -320,9 +320,9 @@ def get_meddra_code_term_pairs_cache_key(meddra_path: str) -> str: return f"meddra_valid_code_term_pairs_{meddra_path}" -def get_item_index_by_condition( - list_of_dicts: List[dict], condition: Callable -) -> Optional[int]: +def get_item_index_by_condition[ + T +](list_of_dicts: List[T], condition: Callable[[T], bool]) -> Optional[int]: """ Uses linear search to return index of element in unsorted list which applies to the condition. @@ -332,9 +332,9 @@ def get_item_index_by_condition( return index -def search_in_list_of_dicts( - list_of_dicts: List[dict], condition: Callable -) -> Optional[dict]: +def search_in_list[ + T +](list_of_dicts: List[T], condition: Callable[[T], bool]) -> Optional[T]: """ Returns an element of unsorted list that applies to the condition. """ diff --git a/tests/unit/test_services/test_data_service/test_excel_data_service.py b/tests/unit/test_services/test_data_service/test_excel_data_service.py index 1e6623aad..e195b697e 100644 --- a/tests/unit/test_services/test_data_service/test_excel_data_service.py +++ b/tests/unit/test_services/test_data_service/test_excel_data_service.py @@ -50,14 +50,13 @@ def test_whitespace_get_dataset_raises(dataset_name): ) mock_cache = MagicMock() mock_cache.get_dataset.return_value = None - data_service = ExcelDataService.get_instance( - config=ConfigService(), - cache_service=mock_cache, - dataset_implementation=PandasDataset, - dataset_path=dataset_path, - ) with pytest.raises(ExcelTestDataError) as exc_info: - data_service.get_dataset(dataset_name=dataset_name) + ExcelDataService.get_instance( + config=ConfigService(), + cache_service=mock_cache, + dataset_implementation=PandasDataset, + dataset_path=dataset_path, + ) assert "leading/trailing whitespace" in str(exc_info.value.message) assert any(col in exc_info.value.message for col in ["STUDYID", "DOMAIN", "EXSEQ"]) @@ -213,7 +212,7 @@ def test_na_value_preserved_not_converted_to_nan(): def test_get_datasets_missing_datasets_sheet_raises_friendly_error(): """ When the workbook has no 'Datasets' sheet (e.g. tab named 'datasets' instead), - get_datasets() raises ExcelTestDataError with message that includes + initialization raises ExcelTestDataError with message that includes case-sensitive guidance. """ with tempfile.NamedTemporaryFile(suffix=".xlsx", delete=False) as tmp_file: @@ -238,12 +237,10 @@ def test_get_datasets_missing_datasets_sheet_raises_friendly_error(): mock_cache = MagicMock() mock_cache.get_dataset.return_value = None - data_service = ExcelDataService( - mock_cache, MagicMock(), MagicMock(), dataset_path=temp_path - ) - with pytest.raises(ExcelTestDataError) as exc_info: - data_service.get_datasets() + ExcelDataService( + mock_cache, MagicMock(), MagicMock(), dataset_path=temp_path + ) msg = str(exc_info.value) assert ExcelDataSheets.DATASETS_SHEET_NAME.value in msg @@ -254,7 +251,7 @@ def test_get_datasets_missing_datasets_sheet_raises_friendly_error(): def test_get_datasets_missing_label_column_raises_friendly_error(): """ When the 'Datasets' sheet exists but is missing the 'Label' column, - get_datasets() raises ExcelTestDataError with column names and + initialization raises ExcelTestDataError with column names and case-sensitive guidance. """ with tempfile.NamedTemporaryFile(suffix=".xlsx", delete=False) as tmp_file: @@ -280,12 +277,10 @@ def test_get_datasets_missing_label_column_raises_friendly_error(): mock_cache = MagicMock() mock_cache.get_dataset.return_value = None - data_service = ExcelDataService( - mock_cache, MagicMock(), MagicMock(), dataset_path=temp_path - ) - with pytest.raises(ExcelTestDataError) as exc_info: - data_service.get_datasets() + ExcelDataService( + mock_cache, MagicMock(), MagicMock(), dataset_path=temp_path + ) msg = str(exc_info.value) assert "Label" in msg diff --git a/tests/unit/test_services/test_data_service/test_local_data_service.py b/tests/unit/test_services/test_data_service/test_local_data_service.py index 4dcfe8eed..a2f576232 100644 --- a/tests/unit/test_services/test_data_service/test_local_data_service.py +++ b/tests/unit/test_services/test_data_service/test_local_data_service.py @@ -89,13 +89,12 @@ def test_get_variables_metdata(dataset_implementation): def test_get_datasets_raises_invalid_dataset_format_when_file_cannot_be_read(): - """get_datasets() raises InvalidDatasetFormat with user-friendly message when a file cannot be read.""" + """LocalDataService __init__ raises InvalidDatasetFormat with user-friendly message when a file cannot be read.""" mock_cache = MagicMock() mock_cache.get_dataset.return_value = None - data_service = LocalDataService( - mock_cache, MagicMock(), MagicMock(), dataset_paths=["/bad/path.xpt"] - ) with pytest.raises(InvalidDatasetFormat) as exc_info: - data_service.get_datasets() + LocalDataService( + mock_cache, MagicMock(), MagicMock(), dataset_paths=["/bad/path.xpt"] + ) assert "Your data file could not be read" in str(exc_info.value) assert "/bad/path.xpt" in str(exc_info.value) From ead3ac88fdc7e55a2b5f34c41cbb7bdda0882834 Mon Sep 17 00:00:00 2001 From: Gerry Campion Date: Tue, 14 Apr 2026 00:52:54 -0400 Subject: [PATCH 02/18] Fixed unit tests --- .../contents_dataset_builder.py | 4 +- .../variables_metadata_dataset_builder.py | 4 +- ...riables_metadata_values_dataset_builder.py | 4 +- ...with_define_and_library_dataset_builder.py | 4 +- ...es_metadata_with_define_dataset_builder.py | 4 +- ...ariables_metadata_with_library_metadata.py | 2 +- .../data_services/base_data_service.py | 4 +- .../data_services/excel_data_service.py | 7 ++- .../data_services/local_data_service.py | 22 ++++++--- .../data_services/usdm_data_service.py | 47 +++++++------------ .../utilities/rule_processor.py | 5 +- scripts/list_dataset_metadata_handler.py | 10 ++-- ...ntents_define_variables_dataset_builder.py | 3 +- ...est_contents_define_vlm_dataset_builder.py | 3 +- .../test_values_dataset_metadata_builder.py | 11 ++++- .../test_values_variables_metadata_builder.py | 7 ++- tests/unit/test_dataset_preprocessor.py | 46 ++++++++++++------ .../test_day_data_validator.py | 11 ++--- .../test_parent_library_model_column_order.py | 9 ++-- tests/unit/test_rules_engine.py | 34 +++++++++----- .../test_data_service/test_data_service.py | 35 +++++++------- .../test_dummy_data_service.py | 6 +-- .../test_excel_data_service.py | 6 +-- .../test_local_data_service.py | 14 ++++-- tests/unit/test_usdm_data.py | 16 ++----- 25 files changed, 186 insertions(+), 132 deletions(-) diff --git a/cdisc_rules_engine/dataset_builders/contents_dataset_builder.py b/cdisc_rules_engine/dataset_builders/contents_dataset_builder.py index 13a8c3549..564729b0d 100644 --- a/cdisc_rules_engine/dataset_builders/contents_dataset_builder.py +++ b/cdisc_rules_engine/dataset_builders/contents_dataset_builder.py @@ -11,12 +11,12 @@ def build(self, **kwargs): """ return self.data_service.get_dataset(dataset_name=self.dataset_metadata.name) - def build_split_datasets(self, dataset_metadata, **kwargs): + def build_split_datasets(self, dataset_name, **kwargs): """ Returns the contents of a file as a dataframe for evaluation. """ return self.data_service.get_dataset( - dataset_name=dataset_metadata.name, datasets=self.datasets + dataset_name=dataset_name, datasets=self.datasets ) def get_dataset(self, **kwargs): diff --git a/cdisc_rules_engine/dataset_builders/variables_metadata_dataset_builder.py b/cdisc_rules_engine/dataset_builders/variables_metadata_dataset_builder.py index cdc8bb285..5f66170bc 100644 --- a/cdisc_rules_engine/dataset_builders/variables_metadata_dataset_builder.py +++ b/cdisc_rules_engine/dataset_builders/variables_metadata_dataset_builder.py @@ -14,5 +14,7 @@ def build(self): variable_format """ return self.data_service.get_variables_metadata( - dataset_name=self.dataset_path, datasets=self.datasets, drop_duplicates=True + dataset_name=self.dataset_metadata.name, + datasets=self.datasets, + drop_duplicates=True, ) diff --git a/cdisc_rules_engine/dataset_builders/variables_metadata_values_dataset_builder.py b/cdisc_rules_engine/dataset_builders/variables_metadata_values_dataset_builder.py index 088e23387..d4c4d66d3 100644 --- a/cdisc_rules_engine/dataset_builders/variables_metadata_values_dataset_builder.py +++ b/cdisc_rules_engine/dataset_builders/variables_metadata_values_dataset_builder.py @@ -22,7 +22,9 @@ def build(self): """ data_contents_long_df = super().build() variable_metadata = self.data_service.get_variables_metadata( - dataset_name=self.dataset_path, datasets=self.datasets, drop_duplicates=True + dataset_name=self.dataset_metadata.name, + datasets=self.datasets, + drop_duplicates=True, ) merged_df = data_contents_long_df.merge( variable_metadata._data, how="left", on="variable_name" diff --git a/cdisc_rules_engine/dataset_builders/variables_metadata_with_define_and_library_dataset_builder.py b/cdisc_rules_engine/dataset_builders/variables_metadata_with_define_and_library_dataset_builder.py index ee00d2cad..82ff8934c 100644 --- a/cdisc_rules_engine/dataset_builders/variables_metadata_with_define_and_library_dataset_builder.py +++ b/cdisc_rules_engine/dataset_builders/variables_metadata_with_define_and_library_dataset_builder.py @@ -42,7 +42,9 @@ def build(self): """ variable_metadata: List[dict] = self.get_define_xml_variables_metadata() content_metadata: DatasetInterface = self.data_service.get_variables_metadata( - dataset_name=self.dataset_path, datasets=self.datasets, drop_duplicates=True + dataset_name=self.dataset_metadata.name, + datasets=self.datasets, + drop_duplicates=True, ) define_metadata: DatasetInterface = self.dataset_implementation.from_records( variable_metadata diff --git a/cdisc_rules_engine/dataset_builders/variables_metadata_with_define_dataset_builder.py b/cdisc_rules_engine/dataset_builders/variables_metadata_with_define_dataset_builder.py index 25237c09f..21edb6c6a 100644 --- a/cdisc_rules_engine/dataset_builders/variables_metadata_with_define_dataset_builder.py +++ b/cdisc_rules_engine/dataset_builders/variables_metadata_with_define_dataset_builder.py @@ -36,7 +36,9 @@ def build(self): variable_metadata: List[dict] = self.get_define_xml_variables_metadata() # get dataset metadata and execute the rule content_metadata: DatasetInterface = self.data_service.get_variables_metadata( - dataset_name=self.dataset_path, datasets=self.datasets, drop_duplicates=True + dataset_name=self.dataset_metadata.name, + datasets=self.datasets, + drop_duplicates=True, ) define_metadata: DatasetInterface = self.dataset_implementation.from_records( variable_metadata diff --git a/cdisc_rules_engine/dataset_builders/variables_metadata_with_library_metadata.py b/cdisc_rules_engine/dataset_builders/variables_metadata_with_library_metadata.py index 81dbb9894..0aeed839f 100644 --- a/cdisc_rules_engine/dataset_builders/variables_metadata_with_library_metadata.py +++ b/cdisc_rules_engine/dataset_builders/variables_metadata_with_library_metadata.py @@ -26,7 +26,7 @@ def build(self): # get dataset metadata and execute the rule content_variables_metadata: DatasetInterface = ( self.data_service.get_variables_metadata( - dataset_name=self.dataset_path, + dataset_name=self.dataset_metadata.name, datasets=self.datasets, drop_duplicates=True, ) diff --git a/cdisc_rules_engine/services/data_services/base_data_service.py b/cdisc_rules_engine/services/data_services/base_data_service.py index 81ceb2bb3..21af8b738 100644 --- a/cdisc_rules_engine/services/data_services/base_data_service.py +++ b/cdisc_rules_engine/services/data_services/base_data_service.py @@ -155,7 +155,9 @@ def concat_split_datasets( # download datasets asynchronously datasets: Iterator[DatasetInterface] = self._async_get_datasets( func_to_call, - dataset_names=[dataset.name for dataset in datasets_metadata], + dataset_names=[ + dataset_metadata.name for dataset_metadata in datasets_metadata + ], **kwargs, ) full_dataset = self.dataset_implementation() diff --git a/cdisc_rules_engine/services/data_services/excel_data_service.py b/cdisc_rules_engine/services/data_services/excel_data_service.py index c29b6da8b..1da5ec33e 100644 --- a/cdisc_rules_engine/services/data_services/excel_data_service.py +++ b/cdisc_rules_engine/services/data_services/excel_data_service.py @@ -206,9 +206,14 @@ def get_variables_metadata(self, dataset_name: str, **params) -> DatasetInterfac """ Gets dataset from blob storage and returns metadata of a certain variable. """ + # Get the sheet name from metadata + dataset_metadata = self._datasets_metadata.get(dataset_name) + if dataset_metadata is None: + return PandasDataset.from_dict({}) + dataframe = pd.read_excel( self.dataset_path, - sheet_name=dataset_name, + sheet_name=dataset_metadata.filename, header=None, nrows=4, na_values=[""], diff --git a/cdisc_rules_engine/services/data_services/local_data_service.py b/cdisc_rules_engine/services/data_services/local_data_service.py index 8103108cf..f34123972 100644 --- a/cdisc_rules_engine/services/data_services/local_data_service.py +++ b/cdisc_rules_engine/services/data_services/local_data_service.py @@ -46,6 +46,7 @@ def __init__( self.encoding: str = kwargs.get("encoding") self.variables_csv_path: str = kwargs.get("variables_csv_path") self.tables_csv_path: str = kwargs.get("tables_csv_path") + self.dataset_paths: Iterable[str] = kwargs.get("dataset_paths") or [] super(LocalDataService, self).__init__( cache_service, reader_factory, config, **kwargs ) @@ -60,9 +61,8 @@ def _initialize_datasets_metadata(self, **kwargs) -> dict[str, SDTMDatasetMetada Returns: Dictionary mapping dataset name to SDTMDatasetMetadata """ - dataset_paths: Iterable[str] = kwargs.get("dataset_paths", []) result = {} - for dataset_path in dataset_paths: + for dataset_path in self.dataset_paths: try: file_metadata, contents_metadata = self.__get_dataset_metadata( dataset_path @@ -99,8 +99,14 @@ def get_instance( runs multiple times with different encodings in the same process). """ encoding = kwargs.get("encoding") - if cls._instance is None or ( - encoding is not None and cls._instance.encoding != encoding + dataset_paths = kwargs.get("dataset_paths") + if ( + cls._instance is None + or (encoding is not None and cls._instance.encoding != encoding) + or ( + dataset_paths is not None + and cls._instance.dataset_paths != dataset_paths + ) ): service = cls( cache_service=cache_service, @@ -150,7 +156,9 @@ def get_variables_metadata( """ Gets dataset from blob storage and returns metadata of a certain variable. """ - metadata: dict = self.read_metadata(dataset_name, datasets=datasets) + metadata: dict = self.__read_metadata( + self._datasets_metadata[dataset_name].full_path, datasets=datasets + ) contents_metadata: dict = metadata["contents_metadata"] metadata_to_return: VariableMetadataContainer = VariableMetadataContainer( contents_metadata @@ -183,7 +191,7 @@ def get_dataset_by_type( dataset_name=dataset_name, **params ) - def read_metadata( + def __read_metadata( self, dataset_path: str, datasets: Optional[Iterable[SDTMDatasetMetadata]] = None, @@ -247,7 +255,7 @@ def __get_dataset_metadata(self, dataset_path: str, **kwargs) -> Tuple[dict, dic Internal method that gets dataset metadata and converts file size if needed. """ - metadata: dict = self.read_metadata(dataset_path, kwargs.get("datasets")) + metadata: dict = self.__read_metadata(dataset_path, kwargs.get("datasets")) file_metadata: dict = metadata["file_metadata"] size_unit: Optional[str] = kwargs.get("size_unit") if size_unit: # convert file size from bytes to desired unit if needed diff --git a/cdisc_rules_engine/services/data_services/usdm_data_service.py b/cdisc_rules_engine/services/data_services/usdm_data_service.py index a5c5d1a8f..83052f2e6 100644 --- a/cdisc_rules_engine/services/data_services/usdm_data_service.py +++ b/cdisc_rules_engine/services/data_services/usdm_data_service.py @@ -82,9 +82,7 @@ def __init__( # Build the id lookup dict once for fast reference resolution self._id_lookup = self.__build_id_lookup(self.json) - self.dataset_content_index: dict = self.__get_datasets_content_index( - dataset_name="USDM_content_index", json=self.json - ) + self.dataset_content_index: List[dict] = self.__get_datasets_content_index() self._jsonpath_cache = {} @@ -144,20 +142,19 @@ def _initialize_datasets_metadata(self, **kwargs) -> dict[str, SDTMDatasetMetada if not dataset_name: continue dataset = self.__get_dataset(dataset_name) - domain = self.__get_domain_from_dataset_name(dataset_name) metadata = SDTMDatasetMetadata( - name=domain, - first_record={"DOMAIN": domain}, - label=domain, + name=dataset_name, + first_record={"DOMAIN": dataset_name}, + label=dataset_name, modification_date=datetime.fromtimestamp( os.path.getmtime(self.dataset_path) ).isoformat(), - filename=basename(dataset_name), - full_path=dataset_name, + filename=basename(self.dataset_path), + full_path=self.dataset_path, file_size=0, record_count=len(dataset), ) - result[domain] = metadata + result[dataset_name] = metadata return result @cached_dataset(DatasetTypes.VARIABLES_METADATA.value) @@ -165,7 +162,7 @@ def get_variables_metadata(self, dataset_name: str, **params) -> DatasetInterfac """ Gets dataset from blob storage and returns metadata of a certain variable. """ - metadata: dict = self.read_metadata(dataset_name) + metadata: dict = self.__read_entity_metadata(dataset_name) contents_metadata: dict = metadata["contents_metadata"] metadata_to_return: VariableMetadataContainer = VariableMetadataContainer( contents_metadata @@ -183,7 +180,7 @@ def get_define_xml_contents(self, dataset_name: str) -> bytes: "Can't use 'get_define_xml_contents' in USDMDataService!" ) - def read_metadata(self, dataset_name: str) -> dict: + def __read_entity_metadata(self, dataset_name: str) -> dict: np_json_type_map: dict = {"O": "string", "float64": "float"} file_size = os.path.getsize(self.dataset_path) file_name = basename(self.dataset_path) @@ -397,9 +394,8 @@ def __get_entity_name(self, value, parent: Any, _depth=0): else: return api_type - def __read_metadata( + def __read_node_metadata( self, - json, parent_node: DatumInContext, child_value, content_path: str, @@ -409,7 +405,7 @@ def __read_metadata( f"{parent_node.path}".endswith("Id") or f"{parent_node.path}".endswith("Ids") ): - definition = self.__find_definition(json, child_value) + definition = self.__find_definition(self.json, child_value) if definition: child_value = definition ty = "reference" @@ -424,26 +420,24 @@ def __read_metadata( def __get_full_path(node: DatumInContext): return f"{node.full_path}".replace(".[", "[") - @cached_dataset(DatasetTypes.CONTENTS.value) - def __get_datasets_content_index(self, dataset_name: str, json) -> List[dict]: + def __get_datasets_content_index(self) -> List[dict]: """ This is a bit convoluted because there is a bug in jsonpath_ng where this query does not return object values within an array """ metadata = [] - for node in parse("$..*").find(json): + for node in parse("$..*").find(self.json): if type(node.value) is list: for index, child in enumerate(node.value): - if metadatum := self.__read_metadata( - json, + if metadatum := self.__read_node_metadata( node, child, f"{USDMDataService.__get_full_path(node)}[{index}]", ): metadata.append(metadatum) else: - if metadatum := self.__read_metadata( - json, node, node.value, USDMDataService.__get_full_path(node) + if metadatum := self.__read_node_metadata( + node, node.value, USDMDataService.__get_full_path(node) ): metadata.append(metadatum) dataset_dict = {} @@ -457,19 +451,12 @@ def __get_datasets_content_index(self, dataset_name: str, json) -> List[dict]: ) return [ { - "dataset_name": self.__get_dataset_name_from_domain(key), - "domain": key, + "dataset_name": key, "content_paths": value, } for key, value in dataset_dict.items() ] - def __get_dataset_name_from_domain(self, domain_name: str) -> str: - return os.path.join(self.dataset_path, "{}.json".format(domain_name)) - - def __get_domain_from_dataset_name(self, dataset_name: str) -> str: - return basename(dataset_name).split(".")[0] - @staticmethod def is_valid_data(dataset_paths: Sequence[str], encoding: str = None): if ( diff --git a/cdisc_rules_engine/utilities/rule_processor.py b/cdisc_rules_engine/utilities/rule_processor.py index 8cb1326ae..c71495ecd 100644 --- a/cdisc_rules_engine/utilities/rule_processor.py +++ b/cdisc_rules_engine/utilities/rule_processor.py @@ -238,12 +238,11 @@ def rule_applies_to_class( excluded_classes = classes.get("Exclude", []) is_included = True is_excluded = False - dataset_name = dataset_metadata.full_path if included_classes: if ALL_KEYWORD in included_classes: return True variables = self.data_service.get_variables_metadata( - dataset_name=dataset_name, datasets=datasets + dataset_name=dataset_metadata.name, datasets=datasets ).data.variable_name class_name = self.data_service.get_dataset_class( variables, @@ -257,7 +256,7 @@ def rule_applies_to_class( is_included = False if excluded_classes: variables = self.data_service.get_variables_metadata( - dataset_name=dataset_name, datasets=datasets + dataset_name=dataset_metadata.name, datasets=datasets ).data.variable_name class_name = self.data_service.get_dataset_class( variables, diff --git a/scripts/list_dataset_metadata_handler.py b/scripts/list_dataset_metadata_handler.py index 053713a5d..17c6f7b62 100644 --- a/scripts/list_dataset_metadata_handler.py +++ b/scripts/list_dataset_metadata_handler.py @@ -51,9 +51,7 @@ def list_dataset_metadata_handler(dataset_paths: Tuple[str]) -> List[dict]: raise ValueError(error_msg) cache_service = CacheServiceFactory(config).get_service() - data_service = DataServiceFactory(config, cache_service).get_service() - metadata: List[SDTMDatasetMetadata] = [ - data_service.get_raw_dataset_metadata(dataset_name=path) - for path in dataset_paths - ] - return DatasetMetadataSerializer(metadata).data + factory = DataServiceFactory(config, cache_service) + data_service = factory.get_data_service(dataset_paths=dataset_paths) + datasets_metadata: List[SDTMDatasetMetadata] = data_service.get_datasets() + return DatasetMetadataSerializer(datasets_metadata).data diff --git a/tests/unit/test_dataset_builders/test_contents_define_variables_dataset_builder.py b/tests/unit/test_dataset_builders/test_contents_define_variables_dataset_builder.py index dec0fc4d3..24b4cdbed 100644 --- a/tests/unit/test_dataset_builders/test_contents_define_variables_dataset_builder.py +++ b/tests/unit/test_dataset_builders/test_contents_define_variables_dataset_builder.py @@ -10,6 +10,7 @@ ) from cdisc_rules_engine.services.data_services import LocalDataService from cdisc_rules_engine.models.dataset import PandasDataset +from cdisc_rules_engine.models.sdtm_dataset_metadata import SDTMDatasetMetadata @pytest.mark.parametrize( @@ -103,7 +104,7 @@ def test_contents_define_variables_dataset_builder( data_processor=None, dataset_path=None, datasets=None, - dataset_metadata=None, + dataset_metadata=SDTMDatasetMetadata(name="TEST"), define_xml_path=None, standard="sdtmig", standard_version="3-4", diff --git a/tests/unit/test_dataset_builders/test_contents_define_vlm_dataset_builder.py b/tests/unit/test_dataset_builders/test_contents_define_vlm_dataset_builder.py index e789c8a23..739324010 100644 --- a/tests/unit/test_dataset_builders/test_contents_define_vlm_dataset_builder.py +++ b/tests/unit/test_dataset_builders/test_contents_define_vlm_dataset_builder.py @@ -8,6 +8,7 @@ ) from cdisc_rules_engine.services.data_services import LocalDataService from cdisc_rules_engine.models.dataset import PandasDataset, DaskDataset +from cdisc_rules_engine.models.sdtm_dataset_metadata import SDTMDatasetMetadata @pytest.mark.parametrize( @@ -125,7 +126,7 @@ def test_contents_define_vlm_dataset_builder( data_processor=None, dataset_path=None, datasets=None, - dataset_metadata=None, + dataset_metadata=SDTMDatasetMetadata(name="TEST"), define_xml_path=None, standard="sdtmig", standard_version="3-4", diff --git a/tests/unit/test_dataset_builders/test_values_dataset_metadata_builder.py b/tests/unit/test_dataset_builders/test_values_dataset_metadata_builder.py index c48c71b37..4d58b504a 100644 --- a/tests/unit/test_dataset_builders/test_values_dataset_metadata_builder.py +++ b/tests/unit/test_dataset_builders/test_values_dataset_metadata_builder.py @@ -111,6 +111,15 @@ def test_build_split_datasets(mock_build): data_service = LocalDataService(MagicMock(), MagicMock(), MagicMock()) original_get_metadata = data_service.get_dataset_metadata + data_service._datasets_metadata = { + "DM": SDTMDatasetMetadata( + name="DM", + label="Demographics", + full_path="/path/to/dm.xpt", + filename="dm.xpt", + ) + } + metadata_df = pd.DataFrame( [ { @@ -142,7 +151,7 @@ def test_build_split_datasets(mock_build): standard_version="", standard_substandard=None, ) - result = builder.build_split_datasets("dm.xpt") + result = builder.build_split_datasets("DM") assert data_service.get_dataset_metadata.called expected_columns = { diff --git a/tests/unit/test_dataset_builders/test_values_variables_metadata_builder.py b/tests/unit/test_dataset_builders/test_values_variables_metadata_builder.py index 4230041dd..dfc43d35b 100644 --- a/tests/unit/test_dataset_builders/test_values_variables_metadata_builder.py +++ b/tests/unit/test_dataset_builders/test_values_variables_metadata_builder.py @@ -133,9 +133,14 @@ def test_concat_with_split_datasets(): ) data_service = LocalDataService(MagicMock(), MagicMock(), MagicMock()) + # Set up metadata in the data service + data_service._datasets_metadata = { + "AE1": ae1_metadata, + "AE2": ae2_metadata, + } data_service.get_dataset = MagicMock( side_effect=lambda dataset_name, **kwargs: PandasDataset( - ae1_data if dataset_name == "ae1.xpt" else ae2_data + ae1_data if dataset_name == "AE1" else ae2_data ) ) metadata_df = pd.DataFrame.from_dict( diff --git a/tests/unit/test_dataset_preprocessor.py b/tests/unit/test_dataset_preprocessor.py index de6a1ce57..af3e5777e 100644 --- a/tests/unit/test_dataset_preprocessor.py +++ b/tests/unit/test_dataset_preprocessor.py @@ -614,8 +614,8 @@ def test_preprocess( # mock blob storage call path_to_dataset_map: dict = { - os.path.join("path", "ae.xpt"): ae_dataset, - os.path.join("path", "ts.xpt"): ts_dataset, + "AE": ae_dataset, + "TS": ts_dataset, } mock_get_dataset.side_effect = lambda dataset_name: path_to_dataset_map[ dataset_name @@ -634,7 +634,9 @@ def test_preprocess( preprocessor = DatasetPreprocessor( ec_dataset, SDTMDatasetMetadata( - first_record={"DOMAIN": "EC"}, full_path=os.path.join("path", "ec.xpt") + name="EC", + first_record={"DOMAIN": "EC"}, + full_path=os.path.join("path", "ec.xpt"), ), data_service, InMemoryCacheService(), @@ -642,8 +644,12 @@ def test_preprocess( preprocessed_dataset: pd.DataFrame = preprocessor.preprocess( dataset_rule_equal_to, [ - SDTMDatasetMetadata(first_record={"DOMAIN": "AE"}, filename="ae.xpt"), - SDTMDatasetMetadata(first_record={"DOMAIN": "TS"}, filename="ts.xpt"), + SDTMDatasetMetadata( + name="AE", first_record={"DOMAIN": "AE"}, filename="ae.xpt" + ), + SDTMDatasetMetadata( + name="TS", first_record={"DOMAIN": "TS"}, filename="ts.xpt" + ), ], ) assert preprocessed_dataset.data.equals(expected_dataset.data) @@ -854,8 +860,8 @@ def test_preprocess_relrec_dataset( # mock blob storage call path_to_dataset_map: dict = { - os.path.join("path", "ae.xpt"): ae_dataset, - os.path.join("path", "relrec.xpt"): relrec_dataset, + "AE": ae_dataset, + "RELREC": relrec_dataset, } mock_get_dataset.side_effect = lambda dataset_name: path_to_dataset_map[ dataset_name @@ -877,6 +883,7 @@ def test_preprocess_relrec_dataset( preprocessor = DatasetPreprocessor( ec_dataset, SDTMDatasetMetadata( + name="EC", first_record={"DOMAIN": "EC"}, full_path=os.path.join("path", "ec.xpt"), ), @@ -886,7 +893,9 @@ def test_preprocess_relrec_dataset( preprocessed_dataset: pd.DataFrame = preprocessor.preprocess( relrec_rule, [ - SDTMDatasetMetadata(first_record={"DOMAIN": "AE"}, filename="ae.xpt"), + SDTMDatasetMetadata( + name="AE", first_record={"DOMAIN": "AE"}, filename="ae.xpt" + ), SDTMDatasetMetadata(name="RELREC", filename="relrec.xpt"), ], ) @@ -938,8 +947,8 @@ def test_preprocess_with_merge_comparison( ) path_to_dataset_map: dict = { - os.path.join("study_id", "data_bundle_id", "ae.xpt"): match_dataset, - os.path.join("study_id", "data_bundle_id", "ec.xpt"): target_dataset, + "AE": match_dataset, + "EC": target_dataset, } mock_get_dataset.side_effect = lambda dataset_name: path_to_dataset_map[ dataset_name @@ -949,6 +958,7 @@ def test_preprocess_with_merge_comparison( preprocessor = DatasetPreprocessor( target_dataset, SDTMDatasetMetadata( + name="EC", first_record={"DOMAIN": "EC"}, full_path=os.path.join("study_id", "data_bundle_id", "ec.xpt"), ), @@ -958,8 +968,12 @@ def test_preprocess_with_merge_comparison( result: pd.DataFrame = preprocessor.preprocess( rule=dataset_rule_equal_to_compare_same_value, datasets=[ - SDTMDatasetMetadata(first_record={"DOMAIN": "AE"}, filename="ae.xpt"), - SDTMDatasetMetadata(first_record={"DOMAIN": "EC"}, filename="ec.xpt"), + SDTMDatasetMetadata( + name="AE", first_record={"DOMAIN": "AE"}, filename="ae.xpt" + ), + SDTMDatasetMetadata( + name="EC", first_record={"DOMAIN": "EC"}, filename="ec.xpt" + ), ], ) assert "NOTVISIT" in result @@ -995,7 +1009,7 @@ def test_preprocess_supp_with_blank_idvar_idvarval(mock_get_dataset): data_service = LocalDataService(MagicMock(), MagicMock(), MagicMock()) preprocessor = DatasetPreprocessor( main_dataset, - SDTMDatasetMetadata(first_record={"DOMAIN": "AE"}, full_path="path"), + SDTMDatasetMetadata(name="AE", first_record={"DOMAIN": "AE"}, full_path="path"), data_service, InMemoryCacheService(), ) @@ -1106,7 +1120,9 @@ def test_preprocess_supp_wildcard_matches_all_supp_datasets( preprocessor = DatasetPreprocessor( ae_dataset, SDTMDatasetMetadata( - first_record={"DOMAIN": "AE"}, full_path=os.path.join("path", "ae.xpt") + name="AE", + first_record={"DOMAIN": "AE"}, + full_path=os.path.join("path", "ae.xpt"), ), data_service, InMemoryCacheService(), @@ -1183,7 +1199,7 @@ def test_preprocess_specific_suppae_dataset( data_service = LocalDataService(MagicMock(), MagicMock(), MagicMock()) preprocessor = DatasetPreprocessor( ae_dataset, - SDTMDatasetMetadata(first_record={"DOMAIN": "AE"}, full_path="path"), + SDTMDatasetMetadata(name="AE", first_record={"DOMAIN": "AE"}, full_path="path"), data_service, InMemoryCacheService(), ) diff --git a/tests/unit/test_operations/test_day_data_validator.py b/tests/unit/test_operations/test_day_data_validator.py index e1bc801be..74b113a8b 100644 --- a/tests/unit/test_operations/test_day_data_validator.py +++ b/tests/unit/test_operations/test_day_data_validator.py @@ -55,7 +55,7 @@ def test_day_data_calculation( config = ConfigService() cache = CacheServiceFactory(config).get_cache_service() datasets_map = { - "dm.xpt": dataset_type.from_dict( + "DM": dataset_type.from_dict( { "RFSTDTC": [ "1997-07-16T19:20:30", @@ -73,18 +73,15 @@ def test_day_data_calculation( datasets = [ SDTMDatasetMetadata( **{ + "name": "DM", "first_record": {"DOMAIN": "DM"}, "filename": "dm.xpt", "full_path": "/path/to/dm.xpt", } ) ] - mock_data_service.get_dataset.side_effect = ( - lambda *args, **kwargs: datasets_map.get( - args.split("/")[-1] - if args - else kwargs.get("dataset_name", "").split("/")[-1] - ) + mock_data_service.get_dataset.side_effect = lambda **kwargs: datasets_map.get( + kwargs.get("dataset_name") ) operation_params.datasets = datasets operation_params.dataframe = PandasDataset.from_dict(data) diff --git a/tests/unit/test_operations/test_parent_library_model_column_order.py b/tests/unit/test_operations/test_parent_library_model_column_order.py index 0f40379ec..ff8cbb5b8 100644 --- a/tests/unit/test_operations/test_parent_library_model_column_order.py +++ b/tests/unit/test_operations/test_parent_library_model_column_order.py @@ -117,6 +117,7 @@ def test_get_parent_column_order_from_library( ): datasets: List[SDTMDatasetMetadata] = [ SDTMDatasetMetadata( + name="AE", first_record={"DOMAIN": "AE"}, filename="ae.xpt", full_path="ae.xpt", @@ -129,7 +130,7 @@ def test_get_parent_column_order_from_library( "AESEQ": [1, 2, 3], } ) - path_to_dataset_map: dict = {"ae.xpt": ae} + path_to_dataset_map: dict = {"AE": ae} with patch( "cdisc_rules_engine.services.data_services.LocalDataService.get_dataset", side_effect=lambda dataset_name: path_to_dataset_map[dataset_name], @@ -300,11 +301,13 @@ def test_get_parent_findings_class_column_order_from_library( ): datasets: List[dict] = [ { + "name": "AE", "first_record": {"DOMAIN": "AE"}, "filename": "ae.xpt", "full_path": "ae.xpt", }, { + "name": "EC", "first_record": {"DOMAIN": "EC"}, "filename": "ec.xpt", "full_path": "ec.xpt", @@ -336,8 +339,8 @@ def test_get_parent_findings_class_column_order_from_library( } ) path_to_dataset_map: dict = { - "ae.xpt": ae, - "ec.xpt": ec, + "AE": ae, + "EC": ec, } with patch( "cdisc_rules_engine.services.data_services.LocalDataService.get_dataset", diff --git a/tests/unit/test_rules_engine.py b/tests/unit/test_rules_engine.py index 5de91eb06..600ed07d6 100644 --- a/tests/unit/test_rules_engine.py +++ b/tests/unit/test_rules_engine.py @@ -235,8 +235,8 @@ def test_validate_rule_cross_dataset_check( mock_get_dataset_class.return_value = None # mock blob storage call path_to_dataset_map: dict = { - os.path.join("path", "ae.xpt"): ae_dataset, - os.path.join("path", "ec.xpt"): ec_dataset, + "AE": ae_dataset, + "EC": ec_dataset, } with patch( "cdisc_rules_engine.services.data_services.LocalDataService.get_dataset", @@ -350,8 +350,8 @@ def test_validate_one_to_one_rel_across_datasets(dataset_rule_one_to_one_related ) ) path_to_dataset_map: dict = { - os.path.join("path", "ae.xpt"): ae_dataset, - os.path.join("path", "ec.xpt"): ec_dataset, + "AE": ae_dataset, + "EC": ec_dataset, } with patch( "cdisc_rules_engine.services.data_services.LocalDataService.get_dataset", @@ -492,8 +492,8 @@ def test_validate_is_contained_by_distinct(mock_rule_distinct_operation: dict): ) path_to_dataset_map: dict = { - os.path.join("path", "ae.xpt"): ae_dataset, - os.path.join("path", "dm.xpt"): dm_dataset, + "AE": ae_dataset, + "DM": dm_dataset, } with patch( "cdisc_rules_engine.services.data_services.LocalDataService.get_dataset", @@ -1016,7 +1016,10 @@ def test_rule_with_domain_prefix_replacement(mock_get_dataset: MagicMock): df = PandasDataset(pd.DataFrame.from_dict({"AESTDY": [11, 12, 40, 59, 59]})) mock_get_dataset.return_value = df dataset_metadata = SDTMDatasetMetadata( - first_record={"DOMAIN": "AE"}, filename="bundle", full_path="study/bundle" + name="AE", + first_record={"DOMAIN": "AE"}, + filename="bundle", + full_path="study/bundle", ) validation_result: List[dict] = RulesEngine( standard="sdtmig" @@ -1135,6 +1138,7 @@ def test_validate_single_dataset(dataset_rule_equal_to_error_objects: dict): ): datasets = [ SDTMDatasetMetadata( + name="AE", first_record={"DOMAIN": "AE"}, filename="bundle", full_path="study/bundle", @@ -1221,6 +1225,7 @@ def test_validate_single_dataset_not_equal_to( return_value=df, ): dataset_metadata = SDTMDatasetMetadata( + name="AE", first_record={"DOMAIN": "AE"}, filename="data_bundle", full_path="study/data_bundle", @@ -1905,8 +1910,8 @@ def test_validate_record_in_parent_domain( ) ) path_to_dataset_map: dict = { - os.path.join("path", "ec.xpt"): ec_dataset, - os.path.join("path", "suppec.xpt"): suppec_dataset, + "EC": ec_dataset, + "SUPPEC": suppec_dataset, } mock_get_dataset_class.return_value = None with patch( @@ -1988,6 +1993,7 @@ def test_validate_additional_columns( return_value=dataset, ): datset_metadata = SDTMDatasetMetadata( + name="TS", first_record={"DOMAIN": "TS"}, filename="ts.xpt", full_path="CDISC01/test/ts.xpt", @@ -2077,8 +2083,8 @@ def test_validate_single_dataset_operation_dataset_larger_than_target_dataset( ) path_to_dataset_map: dict = { - os.path.join("study_id", "data_bundle_id", "ie.xpt"): target_dataset, - os.path.join("study_id", "data_bundle_id", "ti.xpt"): operation_result_dataset, + "IE": target_dataset, + "TI": operation_result_dataset, } mock_get_dataset.side_effect = lambda dataset_name: path_to_dataset_map[ dataset_name @@ -2250,7 +2256,10 @@ def test_dataset_references_invalid_whodrug_terms( {"classes": [{"name": "EVENTS", "datasets": [{"name": "AE"}]}]}, ) dataset_metadata = SDTMDatasetMetadata( - first_record={"DOMAIN": "AE"}, filename="dataset_path", full_path="dataset_path" + name="AE", + first_record={"DOMAIN": "AE"}, + filename="dataset_path", + full_path="dataset_path", ) # run validation @@ -2464,6 +2473,7 @@ def test_validate_variables_order_against_library_metadata( model_metadata=cache_data, standard_metadata=standard_data ) dataset_metadata = SDTMDatasetMetadata( + name="AE", first_record={"DOMAIN": "AE"}, filename="dataset_path", full_path="dataset_path", diff --git a/tests/unit/test_services/test_data_service/test_data_service.py b/tests/unit/test_services/test_data_service/test_data_service.py index 37da3e277..f8e1e61b9 100644 --- a/tests/unit/test_services/test_data_service/test_data_service.py +++ b/tests/unit/test_services/test_data_service/test_data_service.py @@ -27,7 +27,9 @@ ) -@patch("cdisc_rules_engine.services.data_services.LocalDataService.read_metadata") +@patch( + "cdisc_rules_engine.services.data_services.LocalDataService._LocalDataService__read_metadata" +) def test_get_dataset_metadata(mock_read_metadata: MagicMock, dataset_metadata: dict): # mock file read mock_read_metadata.return_value = dataset_metadata @@ -36,9 +38,11 @@ def test_get_dataset_metadata(mock_read_metadata: MagicMock, dataset_metadata: d cache_mock = MagicMock() cache_mock.get = lambda cache_key: None - data_service = LocalDataService(cache_mock, MagicMock(), MagicMock()) + data_service = LocalDataService( + cache_mock, MagicMock(), MagicMock(), dataset_paths=["dataset_path"] + ) actual_metadata: PandasDataset = data_service.get_dataset_metadata( - dataset_name="dataset_name" + dataset_name=dataset_metadata["contents_metadata"]["dataset_name"] ) assert actual_metadata.equals( PandasDataset.from_dict( @@ -57,7 +61,9 @@ def test_get_dataset_metadata(mock_read_metadata: MagicMock, dataset_metadata: d ) -@patch("cdisc_rules_engine.services.data_services.LocalDataService.read_metadata") +@patch( + "cdisc_rules_engine.services.data_services.LocalDataService._LocalDataService__read_metadata" +) def test_get_raw_dataset_metadata( mock_read_metadata: MagicMock, dataset_metadata: dict ): @@ -68,9 +74,11 @@ def test_get_raw_dataset_metadata( cache_mock = MagicMock() cache_mock.get_dataset = lambda cache_key: None - data_service = LocalDataService(cache_mock, MagicMock(), MagicMock()) + data_service = LocalDataService( + cache_mock, MagicMock(), MagicMock(), dataset_paths=["dataset_path"] + ) actual_metadata: SDTMDatasetMetadata = data_service.get_raw_dataset_metadata( - dataset_name="dataset_name" + dataset_name=dataset_metadata["contents_metadata"]["dataset_name"] ) expected_metadata = SDTMDatasetMetadata( name=dataset_metadata["contents_metadata"]["dataset_name"], @@ -221,7 +229,6 @@ def test_get_dataset_class(dataset_metadata, data, expected_class): ) class_name = data_service.get_dataset_class( df, - dataset_metadata.get("filename"), [SDTMDatasetMetadata(**dataset_metadata)], SDTMDatasetMetadata(**dataset_metadata), ) @@ -239,9 +246,7 @@ def test_get_dataset_class_without_standard_and_version(): first_record={"DOMAIN": "DM"}, filename="dm.xpt" ) with pytest.raises(Exception): - data_service.get_dataset_class( - df, "dm.xpt", [dataset_metadata], dataset_metadata - ) + data_service.get_dataset_class(df, [dataset_metadata], dataset_metadata) def test_get_dataset_class_associated_domains(): @@ -249,18 +254,18 @@ def test_get_dataset_class_associated_domains(): SDTMDatasetMetadata(**dataset) for dataset in [ { + "name": "APDM", "first_record": {"DOMAIN": "APDM", "APID": "AP001"}, "filename": "apdm.xpt", }, - {"first_record": {"DOMAIN": "DM"}, "filename": "dm.xpt"}, + {"name": "DM", "first_record": {"DOMAIN": "DM"}, "filename": "dm.xpt"}, ] ] ap_dataset = PandasDataset.from_dict({"DOMAIN": ["APDM"], "APID": ["test"]}) ce_dataset = PandasDataset.from_dict({"DOMAIN": ["DM"]}) - data_bundle_path = "cdisc/databundle" path_to_dataset_map: dict = { - os.path.join(data_bundle_path, "apdm.xpt"): ap_dataset, - os.path.join(data_bundle_path, "dm.xpt"): ce_dataset, + "APDM": ap_dataset, + "DM": ce_dataset, } with patch( "cdisc_rules_engine.services.data_services.LocalDataService.get_dataset", @@ -306,10 +311,8 @@ def test_get_dataset_class_associated_domains(): standard_version="3-4", library_metadata=library_metadata, ) - filepath = f"{data_bundle_path}/apdm.xpt" class_name = data_service.get_dataset_class( ap_dataset, - filepath, datasets, datasets[0], ) diff --git a/tests/unit/test_services/test_data_service/test_dummy_data_service.py b/tests/unit/test_services/test_data_service/test_dummy_data_service.py index f625306e8..24b45e051 100644 --- a/tests/unit/test_services/test_data_service/test_dummy_data_service.py +++ b/tests/unit/test_services/test_data_service/test_dummy_data_service.py @@ -50,7 +50,7 @@ def test_get_dataset(): data_service = DummyDataService( MagicMock(), MagicMock(), MagicMock(), data=datasets ) - dataset = data_service.get_dataset("ae.xpt") + dataset = data_service.get_dataset("AE") assert dataset["AESEQ"].to_list() == [ 1, 2, @@ -102,7 +102,7 @@ def test_get_dataset_metadata(): cache_mock = MagicMock() cache_mock.get_dataset.return_value = None data_service = DummyDataService(cache_mock, MagicMock(), MagicMock(), data=datasets) - metadata = data_service.get_dataset_metadata(dataset_name="ae.xpt") + metadata = data_service.get_dataset_metadata(dataset_name="AE") assert metadata["dataset_label"][0] == "ADVERSE EVENTS" assert metadata["dataset_name"][0] == "AE" assert metadata["dataset_size"][0] == 2000 @@ -131,7 +131,7 @@ def test_get_variables_metadata(): data_service = DummyDataService( MagicMock(), MagicMock(), MagicMock(), data=datasets ) - metadata = data_service.get_variables_metadata("/ae.xpt") + metadata = data_service.get_variables_metadata("AE") assert metadata["variable_name"].iloc[0] == "AESEQ" assert metadata["variable_label"].iloc[0] == "AE Sequence" assert metadata["variable_data_type"].iloc[0] == "integer" diff --git a/tests/unit/test_services/test_data_service/test_excel_data_service.py b/tests/unit/test_services/test_data_service/test_excel_data_service.py index bab2c0827..b93dee096 100644 --- a/tests/unit/test_services/test_data_service/test_excel_data_service.py +++ b/tests/unit/test_services/test_data_service/test_excel_data_service.py @@ -95,7 +95,7 @@ def test_get_dataset_metadata(expected_result): cache_mock, MagicMock(), MagicMock(), dataset_path=dataset_path ) metadata = data_service.get_dataset_metadata( - dataset_name=expected_result["dataset_location"] + dataset_name=expected_result["dataset_name"] ) assert metadata["dataset_label"][0] == expected_result["dataset_label"] assert metadata["dataset_name"][0] == expected_result["dataset_name"] @@ -106,7 +106,7 @@ def test_get_dataset_metadata(expected_result): @pytest.mark.parametrize( "dataset_name", - ("ecaa.xpt", "ecbb.xpt", "suppec.xpt"), + ("ECAA", "ECBB", "SUPPEC"), ) def test_get_variables_metadata(dataset_name): dataset_path = f"{os.path.dirname(__file__)}/../../../resources/test_datasets.xlsx" @@ -184,7 +184,7 @@ def test_na_value_preserved_not_converted_to_nan(): ) # Get the dataset - dataset = data_service.get_dataset(dataset_name="test.xpt") + dataset = data_service.get_dataset(dataset_name="TEST") # Assertions assert isinstance(dataset, PandasDataset) diff --git a/tests/unit/test_services/test_data_service/test_local_data_service.py b/tests/unit/test_services/test_data_service/test_local_data_service.py index 56a43b2b4..4c5eeffd9 100644 --- a/tests/unit/test_services/test_data_service/test_local_data_service.py +++ b/tests/unit/test_services/test_data_service/test_local_data_service.py @@ -15,7 +15,9 @@ def test_read_metadata(): """ dataset_path = f"{os.path.dirname(__file__)}/../../../resources/test_dataset.xpt" data_service = LocalDataService(MagicMock(), MagicMock(), MagicMock()) - metadata = data_service.read_metadata(dataset_path) + metadata = data_service._LocalDataService__read_metadata( # pyright: ignore[reportAttributeAccessIssue] + dataset_path + ) assert "file_metadata" in metadata assert metadata["file_metadata"].get("name") == "test_dataset.xpt" assert metadata["file_metadata"].get("file_size") == 823120 @@ -55,8 +57,11 @@ def test_get_dataset(dataset_implementation): config=ConfigService(), cache_service=mock_cache, dataset_implementation=dataset_implementation, + dataset_paths=[dataset_path], ) - data = data_service.get_dataset(dataset_name=dataset_path) + # Get the dataset name from metadata + dataset_name = list(data_service._datasets_metadata.keys())[0] + data = data_service.get_dataset(dataset_name=dataset_name) assert isinstance(data, dataset_implementation) @@ -74,8 +79,11 @@ def test_get_variables_metadata(dataset_implementation): config=ConfigService(), cache_service=mock_cache, dataset_implementation=dataset_implementation, + dataset_paths=[dataset_path], + ) + data = data_service.get_variables_metadata( + dataset_name="TEST_ADAM_DATASET", datasets=[] ) - data = data_service.get_variables_metadata(dataset_name=dataset_path, datasets=[]) assert isinstance(data, dataset_implementation) expected_keys = [ "variable_name", diff --git a/tests/unit/test_usdm_data.py b/tests/unit/test_usdm_data.py index c2461b0e9..2ae92f5e5 100644 --- a/tests/unit/test_usdm_data.py +++ b/tests/unit/test_usdm_data.py @@ -28,9 +28,8 @@ def test_list_dataset_metadata_with_valid_paths(self): os.path.join("tests", "resources", dataset_file), ], ) - expected_output = """[ - { - "domain": null, + expected_output = """{ + "domain": "null", "filename": "USDM_EliLilly_NCT03421379_Diabetes.json",""" self.assertEqual(result.exit_code, 0) self.assertIn(expected_output, result.output) @@ -58,8 +57,7 @@ def test_get_dataset(domain_name, record_count): data_service = USDMDataService.get_instance( config=ConfigService(), cache_service=mock_cache, dataset_path=dataset_path ) - dataset_name = os.path.join(dataset_path, "{}.json".format(domain_name)) - data = data_service.get_dataset(dataset_name=dataset_name) + data = data_service.get_dataset(dataset_name=domain_name) assert isinstance(data, PandasDataset) assert len(data) == record_count @@ -71,9 +69,7 @@ def test_get_raw_dataset_metadata(): data_service = USDMDataService.get_instance( config=ConfigService(), cache_service=cache, dataset_path=dataset_path ) - data = data_service.get_raw_dataset_metadata( - dataset_name=os.path.join(dataset_path, "Code.json") - ) + data = data_service.get_raw_dataset_metadata(dataset_name="Code") assert data.record_count == 117 @@ -136,9 +132,7 @@ def test_get_variables_metadata(): data_service = USDMDataService.get_instance( config=ConfigService(), cache_service=mock_cache, dataset_path=dataset_path ) - data = data_service.get_variables_metadata( - dataset_name=os.path.join(dataset_path, "StudyIdentifier.json") - ) + data = data_service.get_variables_metadata(dataset_name="StudyIdentifier") assert isinstance(data, PandasDataset) expected_keys = [ "variable_name", From 4b5a0819cbadca1bad6acee945a8a39fca78ae31 Mon Sep 17 00:00:00 2001 From: Gerry Campion Date: Tue, 14 Apr 2026 13:54:01 -0400 Subject: [PATCH 03/18] more test fixes --- .../content_metadata_dataset_builder.py | 2 +- .../dataset_metadata_define_dataset_builder.py | 10 +++++----- .../dataset_metadata_values_builder.py | 2 +- .../interfaces/data_service_interface.py | 1 - cdisc_rules_engine/operations/distinct.py | 2 +- cdisc_rules_engine/operations/extract_metadata.py | 2 +- cdisc_rules_engine/utilities/data_processor.py | 4 +++- cdisc_rules_engine/utilities/dataset_preprocessor.py | 2 +- cdisc_rules_engine/utilities/rule_processor.py | 4 +--- cdisc_rules_engine/utilities/sdtm_utilities.py | 2 +- .../test_content_metadata_dataset_builder.py | 9 +++++++-- .../test_data_service/test_dummy_data_service.py | 2 +- 12 files changed, 23 insertions(+), 19 deletions(-) diff --git a/cdisc_rules_engine/dataset_builders/content_metadata_dataset_builder.py b/cdisc_rules_engine/dataset_builders/content_metadata_dataset_builder.py index f79e05f65..1c325d9f5 100644 --- a/cdisc_rules_engine/dataset_builders/content_metadata_dataset_builder.py +++ b/cdisc_rules_engine/dataset_builders/content_metadata_dataset_builder.py @@ -14,7 +14,7 @@ def build(self): """ size_unit: str = self.rule_processor.get_size_unit_from_rule(self.rule) return self.data_service.get_dataset_metadata( - dataset_name=self.dataset_path, + dataset_name=self.dataset_metadata.name, size_unit=size_unit, datasets=self.datasets, ) diff --git a/cdisc_rules_engine/dataset_builders/dataset_metadata_define_dataset_builder.py b/cdisc_rules_engine/dataset_builders/dataset_metadata_define_dataset_builder.py index fd42ed2d9..20bdb6d4c 100644 --- a/cdisc_rules_engine/dataset_builders/dataset_metadata_define_dataset_builder.py +++ b/cdisc_rules_engine/dataset_builders/dataset_metadata_define_dataset_builder.py @@ -136,18 +136,18 @@ def _get_dataset_dataframe(self): logger.info(f"No datasets metadata is provided in {__name__}.") else: datasets = self.dataset_implementation() - for dataset in self.datasets: + for dataset_metadata in self.datasets: ds_metadata = None try: ds_metadata = self.data_service.get_dataset_metadata( - dataset_name=dataset.filename + dataset_name=dataset_metadata.name ) ds_metadata.data["dataset_domain"] = getattr( - dataset, "domain", None + dataset_metadata, "domain", None ) - if dataset.first_record: + if dataset_metadata.first_record: ds_metadata.data["dataset_columns"] = [ - list(dataset.first_record.keys()) + list(dataset_metadata.first_record.keys()) ] else: ds_metadata.data["dataset_columns"] = [[]] diff --git a/cdisc_rules_engine/dataset_builders/dataset_metadata_values_builder.py b/cdisc_rules_engine/dataset_builders/dataset_metadata_values_builder.py index 8fb48d9f3..01d9ae702 100644 --- a/cdisc_rules_engine/dataset_builders/dataset_metadata_values_builder.py +++ b/cdisc_rules_engine/dataset_builders/dataset_metadata_values_builder.py @@ -21,7 +21,7 @@ def build(self): """ size_unit: str = self.rule_processor.get_size_unit_from_rule(self.rule) dataset_metadata = self.data_service.get_dataset_metadata( - dataset_name=self.dataset_path, + dataset_name=self.dataset_metadata.name, size_unit=size_unit, datasets=self.datasets, ) diff --git a/cdisc_rules_engine/interfaces/data_service_interface.py b/cdisc_rules_engine/interfaces/data_service_interface.py index 8e3ab1778..03b150119 100644 --- a/cdisc_rules_engine/interfaces/data_service_interface.py +++ b/cdisc_rules_engine/interfaces/data_service_interface.py @@ -107,7 +107,6 @@ def read_data(self, file_path: str) -> IOBase: def get_dataset_class( self, dataset: DatasetInterface, - file_path: str, datasets: Iterable[SDTMDatasetMetadata], dataset_metadata: SDTMDatasetMetadata, ) -> Optional[str]: diff --git a/cdisc_rules_engine/operations/distinct.py b/cdisc_rules_engine/operations/distinct.py index 147c77426..a7485a4f6 100644 --- a/cdisc_rules_engine/operations/distinct.py +++ b/cdisc_rules_engine/operations/distinct.py @@ -71,7 +71,7 @@ def get_existing_column_names(group): def _get_referenced_datasets(self): referenced_datasets = {} for dataset_metadata in self.data_service.get_datasets(): - dataset = self.data_service.get_dataset(dataset_metadata.name) + dataset = self.data_service.get_dataset(dataset_name=dataset_metadata.name) referenced_datasets[dataset_metadata.name] = dataset return referenced_datasets diff --git a/cdisc_rules_engine/operations/extract_metadata.py b/cdisc_rules_engine/operations/extract_metadata.py index 5709f22d3..ff13163cf 100644 --- a/cdisc_rules_engine/operations/extract_metadata.py +++ b/cdisc_rules_engine/operations/extract_metadata.py @@ -7,7 +7,7 @@ class ExtractMetadata(BaseOperation): def _execute_operation(self): # get metadata metadata: pd.DataFrame = self.data_service.get_dataset_metadata( - dataset_name=self.params.dataset_path, datasets=self.params.datasets + dataset_name=self.params.domain, datasets=self.params.datasets ) # extract target value. Metadata df always has one row diff --git a/cdisc_rules_engine/utilities/data_processor.py b/cdisc_rules_engine/utilities/data_processor.py index c184db6a8..026df1761 100644 --- a/cdisc_rules_engine/utilities/data_processor.py +++ b/cdisc_rules_engine/utilities/data_processor.py @@ -112,7 +112,9 @@ def merge_on_relrec_record( if not dataset_metadata: return DatasetInterface() right_dataset: DatasetInterface = ( - dataset_preprocessor._data_service.get_dataset(dataset_metadata.name) + dataset_preprocessor._data_service.get_dataset( + dataset_name=dataset_metadata.name + ) ) variables_with_wildcards = { source: f"RELREC.{target}" diff --git a/cdisc_rules_engine/utilities/dataset_preprocessor.py b/cdisc_rules_engine/utilities/dataset_preprocessor.py index 28ec832cc..b8a54f725 100644 --- a/cdisc_rules_engine/utilities/dataset_preprocessor.py +++ b/cdisc_rules_engine/utilities/dataset_preprocessor.py @@ -127,7 +127,7 @@ def preprocess( # noqa # Try to download the dataset try: other_dataset: DatasetInterface = self._data_service.get_dataset( - dataset_metadata.name + dataset_name=dataset_metadata.name ) except Exception as e: raise PreprocessingError( diff --git a/cdisc_rules_engine/utilities/rule_processor.py b/cdisc_rules_engine/utilities/rule_processor.py index c71495ecd..5e452cad8 100644 --- a/cdisc_rules_engine/utilities/rule_processor.py +++ b/cdisc_rules_engine/utilities/rule_processor.py @@ -246,7 +246,6 @@ def rule_applies_to_class( ).data.variable_name class_name = self.data_service.get_dataset_class( variables, - dataset_metadata.full_path, datasets, dataset_metadata, ) @@ -260,7 +259,6 @@ def rule_applies_to_class( ).data.variable_name class_name = self.data_service.get_dataset_class( variables, - dataset_metadata.full_path, datasets, dataset_metadata, ) @@ -486,7 +484,7 @@ def _execute_operation( f"Core ID: {operation_params.core_id}" ) operation_params.dataframe = self.data_service.get_dataset( - dataset_metadata.name + dataset_name=dataset_metadata.name ) # call the operation diff --git a/cdisc_rules_engine/utilities/sdtm_utilities.py b/cdisc_rules_engine/utilities/sdtm_utilities.py index d53c3f133..858256354 100644 --- a/cdisc_rules_engine/utilities/sdtm_utilities.py +++ b/cdisc_rules_engine/utilities/sdtm_utilities.py @@ -139,7 +139,7 @@ def get_variables_metadata_from_standard( # noqa ) else: class_name = data_service._handle_custom_domains( - data_service.get_dataset(dataset_metadata.name), + data_service.get_dataset(dataset_name=dataset_metadata.name), dataset_metadata, dataset_path, datasets, diff --git a/tests/unit/test_dataset_builders/test_content_metadata_dataset_builder.py b/tests/unit/test_dataset_builders/test_content_metadata_dataset_builder.py index d54064b43..26e849f89 100644 --- a/tests/unit/test_dataset_builders/test_content_metadata_dataset_builder.py +++ b/tests/unit/test_dataset_builders/test_content_metadata_dataset_builder.py @@ -6,6 +6,7 @@ from cdisc_rules_engine.models.library_metadata_container import ( LibraryMetadataContainer, ) +from cdisc_rules_engine.models.sdtm_dataset_metadata import SDTMDatasetMetadata from cdisc_rules_engine.services.data_services import DummyDataService from cdisc_rules_engine.dummy_models.dummy_dataset import DummyDataset from cdisc_rules_engine.models.rule_conditions import ConditionCompositeFactory @@ -299,7 +300,9 @@ def test_ContentMetadataDatasetBuilder_split_datasets(conditions): data_processor=None, dataset_path=test_data["datasets"][0]["filename"], datasets=test_data.get("datasets", {}), - dataset_metadata=test_data["datasets"][0], + dataset_metadata=SDTMDatasetMetadata( + name="QSCG", + ), define_xml_path=None, standard="sdtmig", standard_version="3-4", @@ -326,7 +329,9 @@ def test_ContentMetadataDatasetBuilder_split_datasets(conditions): data_processor=None, dataset_path=test_data["datasets"][1]["filename"], datasets=test_data.get("datasets", {}), - dataset_metadata=test_data["datasets"][1], + dataset_metadata=SDTMDatasetMetadata( + name="QSPG", + ), define_xml_path=None, standard="sdtmig", standard_version="3-4", diff --git a/tests/unit/test_services/test_data_service/test_dummy_data_service.py b/tests/unit/test_services/test_data_service/test_dummy_data_service.py index 24b45e051..4c2e6723b 100644 --- a/tests/unit/test_services/test_data_service/test_dummy_data_service.py +++ b/tests/unit/test_services/test_data_service/test_dummy_data_service.py @@ -50,7 +50,7 @@ def test_get_dataset(): data_service = DummyDataService( MagicMock(), MagicMock(), MagicMock(), data=datasets ) - dataset = data_service.get_dataset("AE") + dataset = data_service.get_dataset(dataset_name="AE") assert dataset["AESEQ"].to_list() == [ 1, 2, From 16bee96055b0ab07915fc50d60aec9f4ad082d06 Mon Sep 17 00:00:00 2001 From: Gerry Campion Date: Tue, 14 Apr 2026 14:37:50 -0400 Subject: [PATCH 04/18] test suite fixes --- .../interfaces/data_service_interface.py | 11 +++++++++++ cdisc_rules_engine/operations/base_operation.py | 2 +- .../services/data_services/base_data_service.py | 4 ++-- cdisc_rules_engine/utilities/sdtm_utilities.py | 7 +++---- tests/unit/test_utilities/test_sdtm_utils.py | 10 +++++----- 5 files changed, 22 insertions(+), 12 deletions(-) diff --git a/cdisc_rules_engine/interfaces/data_service_interface.py b/cdisc_rules_engine/interfaces/data_service_interface.py index 03b150119..18916c599 100644 --- a/cdisc_rules_engine/interfaces/data_service_interface.py +++ b/cdisc_rules_engine/interfaces/data_service_interface.py @@ -131,6 +131,17 @@ def to_parquet(self, file_path: str) -> str: Converts a given file_path to parquet. Returns path to new file """ + @abstractmethod + def handle_custom_domains( + self, + dataset: DatasetInterface, + dataset_metadata: SDTMDatasetMetadata, + datasets: Iterable[SDTMDatasetMetadata], + ) -> str | None: + """ + Handles custom domains by returning the appropriate class name based on the dataset contents. + """ + @staticmethod @abstractmethod def is_valid_data(dataset_paths: Sequence[str]) -> bool: diff --git a/cdisc_rules_engine/operations/base_operation.py b/cdisc_rules_engine/operations/base_operation.py index 7d1647c05..2d4f1b761 100644 --- a/cdisc_rules_engine/operations/base_operation.py +++ b/cdisc_rules_engine/operations/base_operation.py @@ -230,7 +230,7 @@ def _get_variables_metadata_from_standard(self) -> List[dict]: library_metadata=self.library_metadata, data_service=self.data_service, dataset_metadata=self.data_service.get_raw_dataset_metadata( - dataset_name=self.params.dataset_path, datasets=self.params.datasets + dataset_name=self.params.domain, datasets=self.params.datasets ), datasets=self.params.datasets, dataset_path=self.params.dataset_path, diff --git a/cdisc_rules_engine/services/data_services/base_data_service.py b/cdisc_rules_engine/services/data_services/base_data_service.py index 21af8b738..c35cef665 100644 --- a/cdisc_rules_engine/services/data_services/base_data_service.py +++ b/cdisc_rules_engine/services/data_services/base_data_service.py @@ -189,7 +189,7 @@ def get_dataset_class( name = class_data.get("name") if name: return convert_library_class_name_to_ct_class(name) - return self._handle_custom_domains(dataset, dataset_metadata, datasets) + return self.handle_custom_domains(dataset, dataset_metadata, datasets) def get_data_structure( self, @@ -276,7 +276,7 @@ def _initialize_datasets_metadata(self, **kwargs) -> dict[str, SDTMDatasetMetada Dictionary mapping dataset name to SDTMDatasetMetadata """ - def _handle_custom_domains( + def handle_custom_domains( self, dataset: DatasetInterface, dataset_metadata: SDTMDatasetMetadata, diff --git a/cdisc_rules_engine/utilities/sdtm_utilities.py b/cdisc_rules_engine/utilities/sdtm_utilities.py index 858256354..8c091a68f 100644 --- a/cdisc_rules_engine/utilities/sdtm_utilities.py +++ b/cdisc_rules_engine/utilities/sdtm_utilities.py @@ -138,10 +138,9 @@ def get_variables_metadata_from_standard( # noqa IG_class_details.get("name") ) else: - class_name = data_service._handle_custom_domains( + class_name = data_service.handle_custom_domains( data_service.get_dataset(dataset_name=dataset_metadata.name), dataset_metadata, - dataset_path, datasets, ) model_class_details = get_class_metadata(model_details, class_name) @@ -382,8 +381,8 @@ def get_variables_metadata_from_standard_model( # noqa IG_class_details.get("name") ) else: - class_name = data_service._handle_custom_domains( - dataframe, dataset_metadata, dataset_path, datasets + class_name = data_service.handle_custom_domains( + dataframe, dataset_metadata, datasets ) if class_name in DETECTABLE_CLASSES: model_class_details = get_class_metadata(model_details, class_name) diff --git a/tests/unit/test_utilities/test_sdtm_utils.py b/tests/unit/test_utilities/test_sdtm_utils.py index 73a1da9f7..0b4ebc37e 100644 --- a/tests/unit/test_utilities/test_sdtm_utils.py +++ b/tests/unit/test_utilities/test_sdtm_utils.py @@ -29,7 +29,7 @@ def library_metadata(): def mock_data_service(): """Mock data service for tests that require it.""" mock_service = Mock() - mock_service._handle_custom_domains = Mock(return_value=None) + mock_service.handle_custom_domains = Mock(return_value=None) return mock_service @@ -225,7 +225,7 @@ def test_ap_domain_from_model(library_metadata, mock_data_service, mock_datasets def test_custom_domain_events_class(library_metadata, mock_data_service, mock_datasets): """Test custom domain detection and variable metadata retrieval for EVENTS class.""" dataset_metadata = SDTMDatasetMetadata(name="ZZ", first_record={"DOMAIN": "ZZ"}) - mock_data_service._handle_custom_domains = Mock(return_value="EVENTS") + mock_data_service.handle_custom_domains = Mock(return_value="EVENTS") variables = get_variables_metadata_from_standard( library_metadata, mock_data_service, @@ -233,7 +233,7 @@ def test_custom_domain_events_class(library_metadata, mock_data_service, mock_da "/path/to/zz.xpt", mock_datasets, ) - mock_data_service._handle_custom_domains.assert_called_once() + mock_data_service.handle_custom_domains.assert_called_once() assert any(var["name"] == "STUDYID" for var in variables) assert any(var["name"] == "DOMAIN" for var in variables) assert any(var["name"] == "ZZTERM" for var in variables) @@ -245,7 +245,7 @@ def test_custom_domain_findings_class( ): """Test custom domain detection and variable metadata retrieval for FINDINGS class.""" dataset_metadata = SDTMDatasetMetadata(name="XX", first_record={"DOMAIN": "XX"}) - mock_data_service._handle_custom_domains = Mock(return_value="FINDINGS") + mock_data_service.handle_custom_domains = Mock(return_value="FINDINGS") variables = get_variables_metadata_from_standard( library_metadata, mock_data_service, @@ -253,7 +253,7 @@ def test_custom_domain_findings_class( "/path/to/xx.xpt", mock_datasets, ) - mock_data_service._handle_custom_domains.assert_called_once() + mock_data_service.handle_custom_domains.assert_called_once() assert any(var["name"] == "STUDYID" for var in variables) assert any(var["name"] == "DOMAIN" for var in variables) assert any(var["name"] == "USUBJID" for var in variables) From c02661fec5561426cafcede6cd820c30a388c944 Mon Sep 17 00:00:00 2001 From: Gerry Campion Date: Wed, 15 Apr 2026 12:31:10 -0400 Subject: [PATCH 05/18] simplify dummy data service --- .../dummy_models/dummy_dataset.py | 9 ------ .../data_services/data_service_factory.py | 27 ++++++---------- .../data_services/dummy_data_service.py | 32 +++++++------------ cdisc_rules_engine/utilities/utils.py | 17 ++++++++++ .../test_contents_define_dataset_builder.py | 26 +++++++-------- tests/unit/test_dummy_dataset.py | 18 ++--------- 6 files changed, 53 insertions(+), 76 deletions(-) diff --git a/cdisc_rules_engine/dummy_models/dummy_dataset.py b/cdisc_rules_engine/dummy_models/dummy_dataset.py index 34288d1b1..042a5aa8c 100644 --- a/cdisc_rules_engine/dummy_models/dummy_dataset.py +++ b/cdisc_rules_engine/dummy_models/dummy_dataset.py @@ -45,14 +45,5 @@ def __init__(self, dataset_data: dict): self.record_count = len(self.data.index) - def get_metadata(self): - return { - "dataset_size": [self.file_size or 1000], - "dataset_name": [self.name or "test"], - "dataset_label": [self.label or "test"], - "filename": [self.filename], - "record_count": [self.record_count], - } - def __repr__(self): return asdict(self).__repr__() diff --git a/cdisc_rules_engine/services/data_services/data_service_factory.py b/cdisc_rules_engine/services/data_services/data_service_factory.py index b7bdf4f6b..7a9f107b1 100644 --- a/cdisc_rules_engine/services/data_services/data_service_factory.py +++ b/cdisc_rules_engine/services/data_services/data_service_factory.py @@ -1,6 +1,4 @@ -from typing import Iterable, List, Type - -from cdisc_rules_engine.dummy_models.dummy_dataset import DummyDataset +from typing import Iterable, Type from cdisc_rules_engine.interfaces import ( CacheServiceInterface, ConfigInterface, @@ -76,8 +74,15 @@ def get_data_service( ) elif DummyDataService.is_valid_data(dataset_paths, encoding=self.encoding): """Get dummy data service""" - return self.get_dummy_data_service( - data=DummyDataService.get_data(dataset_paths, encoding=self.encoding) + return self.get_service( + "dummy", + standard=self.standard, + standard_version=self.standard_version, + standard_substandard=self.standard_substandard, + library_metadata=self.library_metadata, + dataset_path=dataset_paths[0], + dataset_implementation=self.get_dataset_implementation(), + encoding=self.encoding, ) elif ExcelDataService.is_valid_data(dataset_paths): """Get Excel file to dataset data service""" @@ -106,18 +111,6 @@ def get_data_service( tables_csv_path=self.tables_csv_path, ) - def get_dummy_data_service(self, data: List[DummyDataset]) -> DataServiceInterface: - return self.get_service( - "dummy", - data=data, - standard=self.standard, - standard_version=self.standard_version, - standard_substandard=self.standard_substandard, - library_metadata=self.library_metadata, - dataset_implementation=self.get_dataset_implementation(), - encoding=self.encoding, - ) - def get_dataset_implementation(self): """ Gets the class that should be used to represent datasets for the rules engine. This class may be dependent on diff --git a/cdisc_rules_engine/services/data_services/dummy_data_service.py b/cdisc_rules_engine/services/data_services/dummy_data_service.py index 7953de5ec..a3a98b978 100644 --- a/cdisc_rules_engine/services/data_services/dummy_data_service.py +++ b/cdisc_rules_engine/services/data_services/dummy_data_service.py @@ -1,4 +1,3 @@ -from datetime import datetime from io import IOBase from typing import List, Optional, Sequence @@ -13,6 +12,7 @@ from cdisc_rules_engine.services.data_services import BaseDataService from cdisc_rules_engine.constants import DEFAULT_ENCODING from cdisc_rules_engine.models.dataset import PandasDataset +from cdisc_rules_engine.utilities.utils import convert_dataclass_to_superclass class DummyDataService(BaseDataService): @@ -27,7 +27,11 @@ def __init__( config: ConfigInterface, **kwargs, ): + self.encoding = kwargs.get("encoding") or DEFAULT_ENCODING + self.dataset_path: str | None = kwargs.get("dataset_path") self.data: List[DummyDataset] = kwargs.get("data") + if self.data is None and self.dataset_path is not None: + self.data = self.get_data() self.define_xml: str = kwargs.get("define_xml") super(DummyDataService, self).__init__( cache_service, reader_factory, config, **kwargs @@ -72,21 +76,10 @@ def _initialize_datasets_metadata(self, **kwargs) -> dict[str, SDTMDatasetMetada Returns: Dictionary mapping dataset name to SDTMDatasetMetadata """ - result = {} - for dataset in self.data: - dataset_metadata_dict: dict = dataset.get_metadata() - metadata = SDTMDatasetMetadata( - name=dataset_metadata_dict["dataset_name"][0], - first_record={"DOMAIN": dataset_metadata_dict["dataset_name"][0]}, - label=dataset_metadata_dict["dataset_label"][0], - modification_date=datetime.now().isoformat(), - filename=dataset_metadata_dict["filename"][0], - file_size=dataset_metadata_dict["dataset_size"][0], - full_path=dataset_metadata_dict["filename"][0], - record_count=dataset_metadata_dict["record_count"][0], - ) - result[metadata.name] = metadata - return result + return { + dataset.name: convert_dataclass_to_superclass(dataset, SDTMDatasetMetadata) + for dataset in self.data + } def get_variables_metadata(self, dataset_name: str, **params) -> PandasDataset: metadata_to_return = { @@ -170,11 +163,8 @@ def to_parquet(self, file_path: str) -> str: return len(df.index), temp_file.name return 0, "" - @staticmethod - def get_data(dataset_paths: Sequence[str], encoding: str = DEFAULT_ENCODING): - json = JSONReader(encoding=encoding or DEFAULT_ENCODING).from_file( - dataset_paths[0] - ) + def get_data(self): + json = JSONReader(encoding=self.encoding).from_file(self.dataset_path) return [DummyDataset(data) for data in json.get("datasets", [])] @staticmethod diff --git a/cdisc_rules_engine/utilities/utils.py b/cdisc_rules_engine/utilities/utils.py index df8ade3a6..00648c0d2 100644 --- a/cdisc_rules_engine/utilities/utils.py +++ b/cdisc_rules_engine/utilities/utils.py @@ -10,6 +10,7 @@ import re import ast import pandas as pd +from dataclasses import fields from datetime import datetime from typing import Callable, List, Optional, Union from uuid import UUID @@ -21,6 +22,22 @@ from cdisc_rules_engine.constants.adam_products import ADAM_PRODUCTS +def convert_dataclass_to_superclass[T](instance: object, superclass: type[T]) -> T: + """ + Convert a dataclass subclass instance to its superclass by copying all fields. + + Args: + instance: The subclass instance to convert + superclass: The target superclass type + + Returns: + A new instance of the superclass with fields copied from the subclass instance + """ + return superclass( + **{field.name: getattr(instance, field.name) for field in fields(superclass)} + ) + + def convert_file_size(size_in_bytes: int, desired_unit: str) -> float: """ Converts file size from bytes to any of the following units: diff --git a/tests/unit/test_dataset_builders/test_contents_define_dataset_builder.py b/tests/unit/test_dataset_builders/test_contents_define_dataset_builder.py index 865f39daa..63e02e8d9 100644 --- a/tests/unit/test_dataset_builders/test_contents_define_dataset_builder.py +++ b/tests/unit/test_dataset_builders/test_contents_define_dataset_builder.py @@ -458,7 +458,7 @@ "2012-12-05", "2012-12-05", ], - "dataset_size": [1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000], + "dataset_size": [0, 0, 0, 0, 0, 0, 0, 0], "dataset_name": [ "ECAA", "ECAA", @@ -555,7 +555,7 @@ "ECTRT": ["ZANOMALINE", "ZANOMALINE", "ZANOMALINE"], "ECDOSE": [5, 5, 5], "ECSTDTC": ["2013-12-01", "2012-12-02", "2012-12-03"], - "dataset_size": [1000, 1000, 1000], + "dataset_size": [0, 0, 0], "dataset_name": ["ECBB", "ECBB", "ECBB"], "dataset_label": [ "Exposure as Collected BB", @@ -683,17 +683,17 @@ "INVESTIGATOR", ], "dataset_size": [ - 1000, - 1000, - 1000, - 1000, - 1000, - 1000, - 1000, - 1000, - 1000, - 1000, - 1000, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, ], "dataset_name": [ "SUPPEC", diff --git a/tests/unit/test_dummy_dataset.py b/tests/unit/test_dummy_dataset.py index 39744a903..a30320eaf 100644 --- a/tests/unit/test_dummy_dataset.py +++ b/tests/unit/test_dummy_dataset.py @@ -11,19 +11,5 @@ def test_valid_dataset_data(): ] dataset = DummyDataset(dataset_data[0]) assert dataset.domain == "AE" - - -def test_get_dataset_metadata(): - dataset_data = [ - { - "domain": "AE", - "filename": "ae.xpt", - "label": "Adverse Events", - "records": {"AESEQ": [1, 2, 3, 4]}, - } - ] - dataset = DummyDataset(dataset_data[0]) - metadata = dataset.get_metadata() - assert "dataset_name" in metadata - assert metadata["dataset_name"] == ["AE"] - assert metadata["dataset_label"] == ["Adverse Events"] + assert dataset.name == "AE" + assert dataset.label == "Adverse Events" From a4f337c41ee6caa6a039d861f5064cf5462ddf23 Mon Sep 17 00:00:00 2001 From: Gerry Campion Date: Wed, 15 Apr 2026 16:41:05 -0400 Subject: [PATCH 06/18] fixed dataset name in reports --- .../constants/metadata_columns.py | 4 +- cdisc_rules_engine/models/actions.py | 25 +-- cdisc_rules_engine/rules_engine.py | 12 +- .../services/reporting/usdm_report_data.py | 11 +- .../utilities/sdtm_utilities.py | 6 +- tests/unit/test_actions.py | 6 +- .../test_contents_dataset_builder.py | 60 +++--- .../test_values_variables_metadata_builder.py | 40 ++-- tests/unit/test_rules_engine.py | 190 +++++++++--------- tests/unit/test_usdm_data.py | 6 +- 10 files changed, 184 insertions(+), 176 deletions(-) diff --git a/cdisc_rules_engine/constants/metadata_columns.py b/cdisc_rules_engine/constants/metadata_columns.py index 8d06e840e..9f171d0d8 100644 --- a/cdisc_rules_engine/constants/metadata_columns.py +++ b/cdisc_rules_engine/constants/metadata_columns.py @@ -1,3 +1,3 @@ -SOURCE_FILENAME = "source_filename" +SOURCE_DATASET_NAME = "source_dataset_name" SOURCE_ROW_NUMBER = "source_row_number" -METADATA_COLUMNS = {SOURCE_FILENAME, SOURCE_ROW_NUMBER} +METADATA_COLUMNS = {SOURCE_DATASET_NAME, SOURCE_ROW_NUMBER} diff --git a/cdisc_rules_engine/models/actions.py b/cdisc_rules_engine/models/actions.py index 63682ee4c..c03fd54df 100644 --- a/cdisc_rules_engine/models/actions.py +++ b/cdisc_rules_engine/models/actions.py @@ -1,12 +1,11 @@ from typing import List, Optional, Set, Hashable, Iterable -from os import path import pandas as pd from business_rules.actions import BaseActions, rule_action from business_rules.fields import FIELD_TEXT from cdisc_rules_engine.constants import NULL_FLAVORS from cdisc_rules_engine.constants.metadata_columns import ( - SOURCE_FILENAME, + SOURCE_DATASET_NAME, SOURCE_ROW_NUMBER, ) from cdisc_rules_engine.enums.sensitivity import Sensitivity @@ -452,14 +451,16 @@ def _generate_errors_by_group( return errors_list def _get_dataset_name(self, data: pd.DataFrame) -> str: - source_pathnames = data.get(SOURCE_FILENAME, []) - source_filenames = [ - path.basename(source_pathname) for source_pathname in source_pathnames - ] - source_filename_str = ", ".join( - sorted(set(source_filename or "" for source_filename in source_filenames)) + source_dataset_names = data.get(SOURCE_DATASET_NAME, []) + source_dataset_name_str = ", ".join( + sorted( + set( + source_dataset_name or "" + for source_dataset_name in source_dataset_names + ) + ) ) - return source_filename_str + return source_dataset_name_str def _create_error_object( self, df_row: pd.Series, data: pd.DataFrame @@ -471,15 +472,15 @@ def _create_error_object( json_path: Optional[pd.Series] = data.get("_path") instance_id: Optional[pd.Series] = data.get("id") source_row_number: Optional[pd.Series] = data.get(SOURCE_ROW_NUMBER) - source_filename: Optional[pd.Series] = data.get(SOURCE_FILENAME) + source_dataset_name: Optional[pd.Series] = data.get(SOURCE_DATASET_NAME) row_dict = df_row.to_dict() filtered_dict = {} for key, value in row_dict.items(): filtered_dict[key] = self._filter_null_values(value) error_object = ValidationErrorEntity( dataset=( - path.basename(source_filename[df_row.name]) - if isinstance(source_filename, pd.Series) + source_dataset_name[df_row.name] + if isinstance(source_dataset_name, pd.Series) else "" ), row=( diff --git a/cdisc_rules_engine/rules_engine.py b/cdisc_rules_engine/rules_engine.py index 33f371800..1471a9d0f 100644 --- a/cdisc_rules_engine/rules_engine.py +++ b/cdisc_rules_engine/rules_engine.py @@ -245,7 +245,7 @@ def validate_single_dataset( # No errors were generated, create success error container return [ ValidationErrorContainer( - dataset=dataset_metadata.filename, + dataset=dataset_metadata.name, domain=dataset_metadata.domain or dataset_metadata.rdomain, errors=[], ).to_representation() @@ -255,7 +255,7 @@ def validate_single_dataset( f"Skipped dataset {dataset_metadata.name}. Reason: {reason}" ) error_obj = FailedValidationEntity( - dataset=dataset_metadata.filename, + dataset=dataset_metadata.name, error=SkippedReason.OUTSIDE_SCOPE.value, message=reason, ) @@ -263,7 +263,7 @@ def validate_single_dataset( ValidationErrorContainer( status=ExecutionStatus.SKIPPED.value, message=reason, - dataset=dataset_metadata.filename, + dataset=dataset_metadata.name, domain=dataset_metadata.domain or dataset_metadata.rdomain or "", @@ -283,7 +283,7 @@ def validate_single_dataset( """ ) error_obj: ValidationErrorContainer = self.handle_validation_exceptions( - e, dataset_metadata.filename + e, dataset_metadata.name ) error_obj.domain = dataset_metadata.domain or dataset_metadata.rdomain or "" # this wrapping into a list is necessary to keep return type consistent @@ -408,7 +408,7 @@ def execute_rule( ) logger.info(f"Skipped dataset {dataset_metadata.name}. Reason: {reason}") error_obj = FailedValidationEntity( - dataset=dataset_metadata.filename, + dataset=dataset_metadata.name, error=SkippedReason.EMPTY_DATASET.value, message=reason, ) @@ -416,7 +416,7 @@ def execute_rule( ValidationErrorContainer( status=ExecutionStatus.SKIPPED.value, message=reason, - dataset=dataset_metadata.filename, + dataset=dataset_metadata.name, domain=dataset_metadata.domain or dataset_metadata.rdomain or "", errors=[error_obj], ).to_representation() diff --git a/cdisc_rules_engine/services/reporting/usdm_report_data.py b/cdisc_rules_engine/services/reporting/usdm_report_data.py index fce4ca59e..0fb94ad50 100644 --- a/cdisc_rules_engine/services/reporting/usdm_report_data.py +++ b/cdisc_rules_engine/services/reporting/usdm_report_data.py @@ -1,6 +1,5 @@ from datetime import datetime from typing import BinaryIO, Iterable -import os from cdisc_rules_engine.enums.default_file_paths import DefaultFilePaths from cdisc_rules_engine.enums.execution_status import ExecutionStatus @@ -80,7 +79,7 @@ def get_conformance_details_data( ReportMetadataItem( "JSON file name", 9, - os.path.basename(os.path.dirname(self._datasets[0].full_path)), + self._datasets[0].filename, ) ) conformance_details.append( @@ -126,7 +125,7 @@ def get_summary_data(self) -> list[dict]: ): summary_item = { "entity": result.get("entity") - or (result.get("dataset", "") or "").replace(".json", ""), + or (result.get("dataset", "") or ""), "core_id": validation_result.id, "cdisc_rule_id": validation_result.cdisc_rule_id, "message": result.get("message"), @@ -188,8 +187,7 @@ def _issue_details( "cdisc_rule_id": validation_result.cdisc_rule_id, "message": result.get("message"), "executability": validation_result.executability, - "entity": error.get("entity") - or error.get("dataset", "").replace(".json", ""), + "entity": error.get("entity") or error.get("dataset", ""), "instance_id": error.get("instance_id"), "path": error.get("path"), "attributes": variables, @@ -210,8 +208,7 @@ def _error_details(self, validation_result: RuleValidationResult, result: dict): "cdisc_rule_id": validation_result.cdisc_rule_id, "message": (f"{result.get('message')} - {error.get('error')}"), "executability": validation_result.executability, - "entity": error.get("entity") - or error.get("dataset", "").replace(".json", ""), + "entity": error.get("entity") or error.get("dataset", ""), "instance_id": "", "path": "", "attributes": "", diff --git a/cdisc_rules_engine/utilities/sdtm_utilities.py b/cdisc_rules_engine/utilities/sdtm_utilities.py index 8c091a68f..df342276c 100644 --- a/cdisc_rules_engine/utilities/sdtm_utilities.py +++ b/cdisc_rules_engine/utilities/sdtm_utilities.py @@ -7,7 +7,7 @@ SUPPLEMENTARY_DOMAINS, ) from cdisc_rules_engine.constants.metadata_columns import ( - SOURCE_FILENAME, + SOURCE_DATASET_NAME, SOURCE_ROW_NUMBER, ) from cdisc_rules_engine.interfaces.data_service_interface import DataServiceInterface @@ -561,8 +561,8 @@ def tag_source( ) -> DatasetInterface: """ For sdtm split datasets, - Adds source filename and row number to dataset + Adds source dataset name and row number to dataset """ - dataset[SOURCE_FILENAME] = dataset_metadata.filename + dataset[SOURCE_DATASET_NAME] = dataset_metadata.name dataset[SOURCE_ROW_NUMBER] = list(range(1, dataset.len() + 1)) return dataset diff --git a/tests/unit/test_actions.py b/tests/unit/test_actions.py index f025849f7..7a60e9938 100644 --- a/tests/unit/test_actions.py +++ b/tests/unit/test_actions.py @@ -11,7 +11,7 @@ from cdisc_rules_engine.utilities.sdtm_utilities import tag_source from cdisc_rules_engine.constants.metadata_columns import ( - SOURCE_FILENAME, + SOURCE_DATASET_NAME, SOURCE_ROW_NUMBER, ) @@ -394,7 +394,7 @@ def test_empty_sequential(): {"TVSEQ": [2, 4, 6, None, "", 8], "TV": [1, 3, 5, 7, 9, "8"]} ) variable = DatasetVariable(df) - dataset_metadata = SDTMDatasetMetadata(first_record={"DOMAIN": "TV"}, filename="tv") + dataset_metadata = SDTMDatasetMetadata(first_record={"DOMAIN": "TV"}, name="tv") action = COREActions( [], variable, @@ -478,7 +478,7 @@ def test_nan_handling_in_error_object(): "TVSEQ": [1, 2, 3, 4], } ) - df[SOURCE_FILENAME] = "test.xpt" + df[SOURCE_DATASET_NAME] = "test" df[SOURCE_ROW_NUMBER] = [1, 2, 3, 4] expected_nan_vals = [1.0, None, 3.0, None] diff --git a/tests/unit/test_dataset_builders/test_contents_dataset_builder.py b/tests/unit/test_dataset_builders/test_contents_dataset_builder.py index 3df542ff3..f84888135 100644 --- a/tests/unit/test_dataset_builders/test_contents_dataset_builder.py +++ b/tests/unit/test_dataset_builders/test_contents_dataset_builder.py @@ -1,6 +1,6 @@ import pytest from cdisc_rules_engine.constants.metadata_columns import ( - SOURCE_FILENAME, + SOURCE_DATASET_NAME, SOURCE_ROW_NUMBER, ) from cdisc_rules_engine.dataset_builders.contents_dataset_builder import ( @@ -305,35 +305,35 @@ def test_ContentDatasetBuilder_split_datasets(conditions): "26": "ALIVE", "27": "DEAD", }, - SOURCE_FILENAME: { - "0": "ss11.xpt", - "1": "ss11.xpt", - "2": "ss11.xpt", - "3": "ss11.xpt", - "4": "ss11.xpt", - "5": "ss11.xpt", - "6": "ss11.xpt", - "7": "ss11.xpt", - "8": "ss11.xpt", - "9": "ss11.xpt", - "10": "ss11.xpt", - "11": "ss11.xpt", - "12": "ss11.xpt", - "13": "ss11.xpt", - "14": "ss12.xpt", - "15": "ss12.xpt", - "16": "ss12.xpt", - "17": "ss12.xpt", - "18": "ss12.xpt", - "19": "ss12.xpt", - "20": "ss12.xpt", - "21": "ss12.xpt", - "22": "ss12.xpt", - "23": "ss12.xpt", - "24": "ss12.xpt", - "25": "ss12.xpt", - "26": "ss12.xpt", - "27": "ss12.xpt", + SOURCE_DATASET_NAME: { + "0": "SS11", + "1": "SS11", + "2": "SS11", + "3": "SS11", + "4": "SS11", + "5": "SS11", + "6": "SS11", + "7": "SS11", + "8": "SS11", + "9": "SS11", + "10": "SS11", + "11": "SS11", + "12": "SS11", + "13": "SS11", + "14": "SS12", + "15": "SS12", + "16": "SS12", + "17": "SS12", + "18": "SS12", + "19": "SS12", + "20": "SS12", + "21": "SS12", + "22": "SS12", + "23": "SS12", + "24": "SS12", + "25": "SS12", + "26": "SS12", + "27": "SS12", }, SOURCE_ROW_NUMBER: { "0": 1, diff --git a/tests/unit/test_dataset_builders/test_values_variables_metadata_builder.py b/tests/unit/test_dataset_builders/test_values_variables_metadata_builder.py index dfc43d35b..caf5798c2 100644 --- a/tests/unit/test_dataset_builders/test_values_variables_metadata_builder.py +++ b/tests/unit/test_dataset_builders/test_values_variables_metadata_builder.py @@ -337,25 +337,25 @@ def test_concat_with_split_datasets(): 4, 7, ], - "source_filename": [ - "ae1.xpt", - "ae1.xpt", - "ae1.xpt", - "ae1.xpt", - "ae1.xpt", - "ae1.xpt", - "ae1.xpt", - "ae1.xpt", - "ae1.xpt", - "ae2.xpt", - "ae2.xpt", - "ae2.xpt", - "ae2.xpt", - "ae2.xpt", - "ae2.xpt", - "ae2.xpt", - "ae2.xpt", - "ae2.xpt", + "source_dataset_name": [ + "AE1", + "AE1", + "AE1", + "AE1", + "AE1", + "AE1", + "AE1", + "AE1", + "AE1", + "AE2", + "AE2", + "AE2", + "AE2", + "AE2", + "AE2", + "AE2", + "AE2", + "AE2", ], "source_row_number": [ 1, @@ -388,7 +388,7 @@ def test_concat_with_split_datasets(): "row_number", "variable_name", "variable_value", - "source_filename", + "source_dataset_name", "source_row_number", ] for col in key_columns: diff --git a/tests/unit/test_rules_engine.py b/tests/unit/test_rules_engine.py index 600ed07d6..fb34233d0 100644 --- a/tests/unit/test_rules_engine.py +++ b/tests/unit/test_rules_engine.py @@ -102,12 +102,12 @@ def test_validate_rule_invalid_suffix( assert validation_result == [ { "executionStatus": ExecutionStatus.ISSUE_REPORTED.value, - "dataset": "bundle", + "dataset": "AE", "domain": "AE", "variables": ["AESTDY"], "message": "Suffix of AESTDY is equal to test.", "errors": [ - {"value": {"AESTDY": "valid-test"}, "dataset": "bundle", "row": 1} + {"value": {"AESTDY": "valid-test"}, "dataset": "AE", "row": 1} ], } ] @@ -150,12 +150,12 @@ def test_validate_rule_invalid_prefix( assert validation_result == [ { "executionStatus": ExecutionStatus.ISSUE_REPORTED.value, - "dataset": "bundle", + "dataset": "AE", "domain": "AE", "variables": ["AESTDY"], "message": "Prefix of AESTDY is equal to test.", "errors": [ - {"value": {"AESTDY": "test-valid"}, "dataset": "bundle", "row": 1} + {"value": {"AESTDY": "test-valid"}, "dataset": "AE", "row": 1} ], } ] @@ -262,20 +262,20 @@ def test_validate_rule_cross_dataset_check( assert validation_result == [ { "executionStatus": ExecutionStatus.ISSUE_REPORTED.value, - "dataset": "ec.xpt", + "dataset": "EC", "domain": "EC", "variables": ["ECSTDY"], "message": "Value of ECSTDY is equal to AESTDY.", "errors": [ { - "dataset": "ec.xpt", + "dataset": "EC", "row": 1, "value": {"ECSTDY": 4.0}, "USUBJID": "CDISC001", "SEQ": 1, }, { - "dataset": "ec.xpt", + "dataset": "EC", "row": 2, "value": {"ECSTDY": 5.0}, "USUBJID": "CDISC001", @@ -367,14 +367,14 @@ def test_validate_one_to_one_rel_across_datasets(dataset_rule_one_to_one_related assert validation_result == [ { "executionStatus": ExecutionStatus.ISSUE_REPORTED.value, - "dataset": "ec.xpt", + "dataset": "EC", "domain": "EC", "variables": ["VISITNUM"], "message": "VISITNUM is not one-to-one related to VISIT", "errors": [ - {"value": {"VISITNUM": 1}, "dataset": "ec.xpt", "row": 1}, - {"value": {"VISITNUM": 1}, "dataset": "ec.xpt", "row": 3}, - {"value": {"VISITNUM": 3}, "dataset": "ec.xpt", "row": 4}, + {"value": {"VISITNUM": 1}, "dataset": "EC", "row": 1}, + {"value": {"VISITNUM": 1}, "dataset": "EC", "row": 3}, + {"value": {"VISITNUM": 3}, "dataset": "EC", "row": 4}, ], } ] @@ -414,12 +414,12 @@ def test_validate_rule_single_dataset_check(dataset_rule_greater_than: dict): { "executionStatus": ExecutionStatus.ISSUE_REPORTED.value, "domain": "EC", - "dataset": "bundle", + "dataset": "EC", "variables": ["ECCOOLVAR"], "message": "Value for ECCOOLVAR greater than 30.", "errors": [ - {"value": {"ECCOOLVAR": 100}, "dataset": "bundle", "row": 2}, - {"value": {"ECCOOLVAR": 34}, "dataset": "bundle", "row": 4}, + {"value": {"ECCOOLVAR": 100}, "dataset": "EC", "row": 2}, + {"value": {"ECCOOLVAR": 34}, "dataset": "EC", "row": 4}, ], } ] @@ -460,11 +460,11 @@ def test_validate_rule_equal_length(dataset_rule_has_equal_length: dict): { "executionStatus": ExecutionStatus.ISSUE_REPORTED.value, "domain": "EC", - "dataset": "bundle", + "dataset": "EC", "variables": ["ECCOOLVAR"], "message": "Length of ECCOOLVAR is equal to 5.", "errors": [ - {"value": {"ECCOOLVAR": "equal"}, "dataset": "bundle", "row": 2} + {"value": {"ECCOOLVAR": "equal"}, "dataset": "EC", "row": 2} ], } ] @@ -509,11 +509,11 @@ def test_validate_is_contained_by_distinct(mock_rule_distinct_operation: dict): assert validation_result == [ { "executionStatus": ExecutionStatus.ISSUE_REPORTED.value, - "dataset": "ae.xpt", + "dataset": "AE", "domain": "AE", "variables": ["AESTDY"], "message": "Value for AESTDY not in DM.USUBJID", - "errors": [{"value": {"AESTDY": 5000}, "dataset": "ae.xpt", "row": 4}], + "errors": [{"value": {"AESTDY": 5000}, "dataset": "AE", "row": 4}], } ] @@ -553,13 +553,13 @@ def test_validate_rule_not_equal_length(dataset_rule_has_not_equal_length: dict) { "executionStatus": ExecutionStatus.ISSUE_REPORTED.value, "domain": "EC", - "dataset": "bundle", + "dataset": "EC", "variables": ["ECCOOLVAR"], "message": "Length of ECCOOLVAR is not equal to 5.", "errors": [ { "value": {"ECCOOLVAR": "first_string"}, - "dataset": "bundle", + "dataset": "EC", "row": 1, } ], @@ -596,14 +596,14 @@ def test_validate_rule_multiple_conditions(dataset_rule_multiple_conditions: dic { "executionStatus": ExecutionStatus.ISSUE_REPORTED.value, "domain": "EC", - "dataset": "bundle", + "dataset": "EC", "variables": ["ECCOOLVAR"], "message": ( "Length of ECCOOLVAR is not equal to 5 or ECCOOLVAR == cool." ), "errors": [ - {"value": {"ECCOOLVAR": "valid"}, "dataset": "bundle", "row": 2}, - {"value": {"ECCOOLVAR": "cool"}, "dataset": "bundle", "row": 3}, + {"value": {"ECCOOLVAR": "valid"}, "dataset": "EC", "row": 2}, + {"value": {"ECCOOLVAR": "cool"}, "dataset": "EC", "row": 3}, ], } ] @@ -637,13 +637,13 @@ def test_validate_record_rule_numbers_separated_by_dash_pattern(): assert validation_result == [ { "executionStatus": ExecutionStatus.ISSUE_REPORTED.value, - "dataset": "bundle", + "dataset": "AE", "domain": "AE", "variables": ["AESTDY"], "message": "Records have the following pattern: ^\\d+\\-\\d+$", "errors": [ - {"value": {"AESTDY": "5-5"}, "dataset": "bundle", "row": 1}, - {"value": {"AESTDY": "10-10"}, "dataset": "bundle", "row": 2}, + {"value": {"AESTDY": "5-5"}, "dataset": "AE", "row": 1}, + {"value": {"AESTDY": "10-10"}, "dataset": "AE", "row": 2}, ], } ] @@ -678,12 +678,12 @@ def test_validate_record_rule_semi_colon_delimited_pattern(): { "executionStatus": ExecutionStatus.ISSUE_REPORTED.value, "domain": "AE", - "dataset": "bundle", + "dataset": "AE", "variables": ["AESTDY"], "message": "Records have the following pattern: [^,]*;[^,]*", "errors": [ - {"value": {"AESTDY": "5;5"}, "dataset": "bundle", "row": 1}, - {"value": {"AESTDY": "alex;alex"}, "dataset": "bundle", "row": 2}, + {"value": {"AESTDY": "5;5"}, "dataset": "AE", "row": 1}, + {"value": {"AESTDY": "alex;alex"}, "dataset": "AE", "row": 2}, ], } ] @@ -719,13 +719,13 @@ def test_validate_record_rule_no_letters_numbers_underscores(): assert validation_result == [ { "executionStatus": ExecutionStatus.ISSUE_REPORTED.value, - "dataset": "bundle", + "dataset": "AE", "domain": "AE", "variables": ["AESTDY"], "message": "Records have the following pattern: ^((?![a-zA-Z0-9_]).)*$", "errors": [ - {"value": {"AESTDY": "[.*)]#@"}, "dataset": "bundle", "row": 1}, - {"value": {"AESTDY": "|>.§!"}, "dataset": "bundle", "row": 3}, + {"value": {"AESTDY": "[.*)]#@"}, "dataset": "AE", "row": 1}, + {"value": {"AESTDY": "|>.§!"}, "dataset": "AE", "row": 3}, ], } ] @@ -772,7 +772,7 @@ def test_validate_dataset_metadata( assert validation_result == [ { "domain": "EC", - "dataset": "bundle", + "dataset": "EC", "errors": [], "executionStatus": ExecutionStatus.SUCCESS.value, "message": None, @@ -824,12 +824,12 @@ def test_validate_dataset_metadata_wrong_metadata( assert validation_result == [ { "domain": "EC", - "dataset": "bundle", + "dataset": "EC", "executionStatus": ExecutionStatus.ISSUE_REPORTED.value, "variables": ["dataset_label", "dataset_name", "dataset_size"], "errors": [ { - "dataset": "bundle", + "dataset": "EC", "row": 1, "value": { "dataset_name": "AD", @@ -879,7 +879,7 @@ def test_validate_variable_metadata( assert validation_result == [ { "domain": "EC", - "dataset": "bundle", + "dataset": "EC", "errors": [], "executionStatus": ExecutionStatus.SUCCESS.value, "message": None, @@ -902,7 +902,7 @@ def test_validate_variable_metadata( assert validation_result == [ { "domain": "EC", - "dataset": "bundle", + "dataset": "EC", "errors": [], "executionStatus": ExecutionStatus.SUCCESS.value, "message": None, @@ -951,12 +951,12 @@ def test_validate_variable_metadata_wrong_metadata( assert validation_result == [ { "domain": "EC", - "dataset": "bundle", + "dataset": "EC", "variables": ["variable_name", "variable_label", "variable_data_type"], "executionStatus": ExecutionStatus.ISSUE_REPORTED.value, "errors": [ { - "dataset": "bundle", + "dataset": "EC", "row": 1, "value": { "variable_name": "longer than eight", @@ -965,7 +965,7 @@ def test_validate_variable_metadata_wrong_metadata( }, }, { - "dataset": "bundle", + "dataset": "EC", "row": 2, "value": { "variable_name": "longer than eight as well", @@ -1027,16 +1027,16 @@ def test_rule_with_domain_prefix_replacement(mock_get_dataset: MagicMock): assert validation_result == [ { "executionStatus": ExecutionStatus.ISSUE_REPORTED.value, - "dataset": "bundle", + "dataset": "AE", "domain": "AE", "variables": ["AESTDY"], "message": "Invalid AESTDY value", "errors": [ - {"dataset": "bundle", "row": 1, "value": {"AESTDY": 11}}, - {"dataset": "bundle", "row": 2, "value": {"AESTDY": 12}}, - {"dataset": "bundle", "row": 3, "value": {"AESTDY": 40}}, - {"dataset": "bundle", "row": 4, "value": {"AESTDY": 59}}, - {"dataset": "bundle", "row": 5, "value": {"AESTDY": 59}}, + {"dataset": "AE", "row": 1, "value": {"AESTDY": 11}}, + {"dataset": "AE", "row": 2, "value": {"AESTDY": 12}}, + {"dataset": "AE", "row": 3, "value": {"AESTDY": 40}}, + {"dataset": "AE", "row": 4, "value": {"AESTDY": 59}}, + {"dataset": "AE", "row": 5, "value": {"AESTDY": 59}}, ], } ] @@ -1068,7 +1068,7 @@ def test_rule_with_domain_prefix_replacement(mock_get_dataset: MagicMock): [ { "domain": "AE", - "dataset": "bundle", + "dataset": "AE", "errors": [], "executionStatus": ExecutionStatus.SUCCESS.value, "message": None, @@ -1154,12 +1154,12 @@ def test_validate_single_dataset(dataset_rule_equal_to_error_objects: dict): assert validation_result == [ { "domain": "AE", - "dataset": "bundle", + "dataset": "AE", "executionStatus": ExecutionStatus.ISSUE_REPORTED.value, "variables": ["AESTDY"], "errors": [ { - "dataset": "bundle", + "dataset": "AE", "row": 1, "value": { "AESTDY": "test", @@ -1168,7 +1168,7 @@ def test_validate_single_dataset(dataset_rule_equal_to_error_objects: dict): "SEQ": 1, }, { - "dataset": "bundle", + "dataset": "AE", "row": 4, "value": { "AESTDY": "test", @@ -1177,7 +1177,7 @@ def test_validate_single_dataset(dataset_rule_equal_to_error_objects: dict): "SEQ": 4, }, { - "dataset": "bundle", + "dataset": "AE", "row": 5, "value": { "AESTDY": "test", @@ -1240,12 +1240,12 @@ def test_validate_single_dataset_not_equal_to( assert validation_result == [ { "domain": "AE", - "dataset": "data_bundle", + "dataset": "AE", "executionStatus": ExecutionStatus.ISSUE_REPORTED.value, "variables": ["AESTDY"], "errors": [ { - "dataset": "data_bundle", + "dataset": "AE", "row": 2, "value": { "AESTDY": "alex", @@ -1254,7 +1254,7 @@ def test_validate_single_dataset_not_equal_to( "SEQ": 2, }, { - "dataset": "data_bundle", + "dataset": "AE", "row": 3, "value": { "AESTDY": "alex", @@ -1293,7 +1293,7 @@ def test_validate_single_dataset_not_equal_to( [ { "executionStatus": ExecutionStatus.ISSUE_REPORTED.value, - "dataset": "ae.xpt", + "dataset": "AE", "domain": "AE", "variables": ["dataset_label", "dataset_name", "dataset_location"], "message": "Dataset metadata does not correspond to Define XML", @@ -1304,7 +1304,7 @@ def test_validate_single_dataset_not_equal_to( "dataset_location": "ae.xpt", "dataset_label": "Adverse", }, - "dataset": "ae.xpt", + "dataset": "AE", "row": 1, }, ], @@ -1331,7 +1331,7 @@ def test_validate_single_dataset_not_equal_to( [ { "domain": "AE", - "dataset": "ae.xpt", + "dataset": "AE", "errors": [], "executionStatus": ExecutionStatus.SUCCESS.value, "message": None, @@ -1420,11 +1420,11 @@ def test_validate_dataset_metadata_against_define_xml( [ { "domain": "AE", - "dataset": "test", + "dataset": "AE", "executionStatus": ExecutionStatus.ISSUE_REPORTED.value, "variables": ["variable_size"], "errors": [ - {"dataset": "test", "row": 1, "value": {"variable_size": 30}} + {"dataset": "AE", "row": 1, "value": {"variable_size": 30}} ], "message": ( "Variable metadata variable_size " @@ -1463,11 +1463,11 @@ def test_validate_dataset_metadata_against_define_xml( [ { "domain": "AE", - "dataset": "test", + "dataset": "AE", "executionStatus": ExecutionStatus.ISSUE_REPORTED.value, "variables": ["variable_size"], "errors": [ - {"dataset": "test", "row": 1, "value": {"variable_size": 30}} + {"dataset": "AE", "row": 1, "value": {"variable_size": 30}} ], "message": ( "Variable metadata variable_size " @@ -1524,15 +1524,15 @@ def test_validate_variable_metadata_against_define_xml( [ { "domain": "AE", - "dataset": "ae_2.xpt", + "dataset": "AE_2", "executionStatus": ExecutionStatus.SKIPPED.value, "variables": [], - "message": "Rule skipped - doesn't apply to domain for rule id=MockRule, dataset=", + "message": "Rule skipped - doesn't apply to domain for rule id=MockRule, dataset=AE_2", "errors": [ { - "dataset": "ae_2.xpt", + "dataset": "AE_2", "error": "Outside scope", - "message": "Rule skipped - doesn't apply to domain for rule id=MockRule, dataset=", + "message": "Rule skipped - doesn't apply to domain for rule id=MockRule, dataset=AE_2", } ], } @@ -1544,24 +1544,24 @@ def test_validate_variable_metadata_against_define_xml( [ { "domain": "AE", - "dataset": "ae_1.xpt, ae_2.xpt", + "dataset": "AE_1, AE_2", "executionStatus": ExecutionStatus.ISSUE_REPORTED.value, "variables": ["AESTDY"], "errors": [ { - "dataset": "ae_2.xpt", + "dataset": "AE_2", "row": 1, "value": {"AESTDY": "test"}, "USUBJID": "1", }, { - "dataset": "ae_2.xpt", + "dataset": "AE_2", "row": 4, "value": {"AESTDY": "test"}, "USUBJID": "1", }, { - "dataset": "ae_1.xpt", + "dataset": "AE_1", "row": 4, "value": {"AESTDY": "test"}, "USUBJID": "2", @@ -1646,11 +1646,13 @@ def test_validate_split_dataset_contents( mock_async_get_datasets.return_value = [first_dataset_part, second_dataset_part] datasets = [ SDTMDatasetMetadata( + name="AE_2", first_record={"DOMAIN": "AE"}, filename="ae_2.xpt", full_path="CDISC01/test/ae_2.xpt", ), SDTMDatasetMetadata( + name="AE_1", first_record={"DOMAIN": "AE"}, filename="ae_1.xpt", full_path="CDISC01/test/ae_1.xpt", @@ -1724,10 +1726,16 @@ def test_validate_split_dataset_metadata( mock_get_dataset_metadata.return_value = second_dataset_part datasets = [ SDTMDatasetMetadata( - first_record={"DOMAIN": "EC"}, filename="ec_2.xpt", full_path="ec_2.xpt" + name="EC_2", + first_record={"DOMAIN": "EC"}, + filename="ec_2.xpt", + full_path="ec_2.xpt", ), SDTMDatasetMetadata( - first_record={"DOMAIN": "EC"}, filename="ec_1.xpt", full_path="ec_1.xpt" + name="EC_1", + first_record={"DOMAIN": "EC"}, + filename="ec_1.xpt", + full_path="ec_1.xpt", ), ] validation_result: List[dict] = RulesEngine( @@ -1742,11 +1750,11 @@ def test_validate_split_dataset_metadata( assert validation_result == [ { "domain": "EC", - "dataset": "ec_1.xpt", + "dataset": "EC_1", "executionStatus": ExecutionStatus.ISSUE_REPORTED.value, "errors": [ { - "dataset": "ec_1.xpt", + "dataset": "EC_1", "row": 1, "value": { "dataset_label": "EC Label", @@ -1798,11 +1806,13 @@ def test_validate_split_dataset_variables_metadata( ] datasets = [ SDTMDatasetMetadata( + name="EC_2", first_record={"DOMAIN": "EC"}, filename="ec_2.xpt", full_path="CDISC/test/ec_2.xpt", ), SDTMDatasetMetadata( + name="EC_1", first_record={"DOMAIN": "EC"}, filename="ec_1.xpt", full_path="CDISC/test/ec_1.xpt", @@ -1818,12 +1828,12 @@ def test_validate_split_dataset_variables_metadata( assert validation_result == [ { "domain": "EC", - "dataset": "ec_2.xpt", + "dataset": "EC_2", "executionStatus": ExecutionStatus.ISSUE_REPORTED.value, "variables": ["variable_name", "variable_label", "variable_data_type"], "errors": [ { - "dataset": "ec_2.xpt", + "dataset": "EC_2", "row": 1, "value": { "variable_label": ( @@ -1943,12 +1953,12 @@ def test_validate_record_in_parent_domain( { "executionStatus": ExecutionStatus.ISSUE_REPORTED.value, "domain": "EC", - "dataset": "ec.xpt", + "dataset": "EC", "variables": ["ECREASOC", "ECPRESP"], "message": "Dataset contents is wrong.", "errors": [ { - "dataset": "ec.xpt", + "dataset": "EC", "row": 4, "value": {"ECPRESP": "Y", "ECREASOC": "Some Value 1"}, "USUBJID": "CDISC005", @@ -2008,20 +2018,20 @@ def test_validate_additional_columns( assert validation_result == [ { "executionStatus": ExecutionStatus.ISSUE_REPORTED.value, - "dataset": "ts.xpt", + "dataset": "TS", "domain": "TS", "variables": ["TSVAL"], "message": "Inconsistencies found in enumerated TSVAL columns.", "errors": [ { "value": {"TSVAL": None}, - "dataset": "ts.xpt", + "dataset": "TS", "row": 2, "USUBJID": "1", }, { "value": {"TSVAL": None}, - "dataset": "ts.xpt", + "dataset": "TS", "row": 4, "USUBJID": "1", }, @@ -2114,7 +2124,7 @@ def test_validate_single_dataset_operation_dataset_larger_than_target_dataset( assert validation_result == [ { "executionStatus": ExecutionStatus.SUCCESS.value, - "dataset": "ie.xpt", + "dataset": "IE", "domain": "IE", "variables": [], "message": None, @@ -2189,7 +2199,7 @@ def test_validate_extract_metadata_operation( assert validation_result == [ { "executionStatus": ExecutionStatus.ISSUE_REPORTED.value, - "dataset": "suppec.xpt", + "dataset": "SUPPEC", "domain": "SUPPEC", "variables": [ "RDOMAIN", @@ -2199,21 +2209,21 @@ def test_validate_extract_metadata_operation( ), "errors": [ { - "dataset": "suppec.xpt", + "dataset": "SUPPEC", "row": 1, "value": { "RDOMAIN": "EC", }, }, { - "dataset": "suppec.xpt", + "dataset": "SUPPEC", "row": 2, "value": { "RDOMAIN": "EC", }, }, { - "dataset": "suppec.xpt", + "dataset": "SUPPEC", "row": 3, "value": { "RDOMAIN": "EC", @@ -2285,7 +2295,7 @@ def test_dataset_references_invalid_whodrug_terms( { "executionStatus": ExecutionStatus.ISSUE_REPORTED.value, "domain": "AE", - "dataset": "dataset_path", + "dataset": "AE", "variables": [ "AEINA", ], @@ -2294,14 +2304,14 @@ def test_dataset_references_invalid_whodrug_terms( ), "errors": [ { - "dataset": "dataset_path", + "dataset": "AE", "row": 3, "value": { "AEINA": "A01AC", }, }, { - "dataset": "dataset_path", + "dataset": "AE", "row": 4, "value": { "AEINA": "A01AD", @@ -2501,7 +2511,7 @@ def mock_cached_method(*args, **kwargs): assert result == [ { "executionStatus": ExecutionStatus.ISSUE_REPORTED.value, - "dataset": "dataset_path", + "dataset": "AE", "domain": "AE", "variables": [ "$column_order_from_dataset", @@ -2532,7 +2542,7 @@ def mock_cached_method(*args, **kwargs): ], "STUDYID": "TEST_STUDY", }, - "dataset": "dataset_path", + "dataset": "AE", } ], } diff --git a/tests/unit/test_usdm_data.py b/tests/unit/test_usdm_data.py index 2ae92f5e5..550c82f39 100644 --- a/tests/unit/test_usdm_data.py +++ b/tests/unit/test_usdm_data.py @@ -105,19 +105,19 @@ def test_validate_rule_single_dataset_check(dataset_rule_greater_than: dict): assert validation_result == [ { "executionStatus": ExecutionStatus.ISSUE_REPORTED.value, - "dataset": "USDM_EliLilly_NCT03421379_Diabetes.json", + "dataset": "EC", "domain": "EC", "variables": ["ECCOOLVAR"], "message": "Value for ECCOOLVAR greater than 30.", "errors": [ { "value": {"ECCOOLVAR": 100}, - "dataset": "USDM_EliLilly_NCT03421379_Diabetes.json", + "dataset": "EC", "row": 2, }, { "value": {"ECCOOLVAR": 34}, - "dataset": "USDM_EliLilly_NCT03421379_Diabetes.json", + "dataset": "EC", "row": 4, }, ], From bc781603788ff8d498737450e3efd335d5929c67 Mon Sep 17 00:00:00 2001 From: Gerry Campion Date: Wed, 15 Apr 2026 17:36:39 -0400 Subject: [PATCH 07/18] regression report fixes --- .../test_Issues/test_CoreIssue1345.py | 12 ++++++------ .../test_Issues/test_CoreIssue1348.py | 16 ++++++++-------- .../test_Issues/test_CoreIssue1421.py | 4 ++-- 3 files changed, 16 insertions(+), 16 deletions(-) diff --git a/tests/QARegressionTests/test_Issues/test_CoreIssue1345.py b/tests/QARegressionTests/test_Issues/test_CoreIssue1345.py index 2b66a4e3a..bb160fd3f 100644 --- a/tests/QARegressionTests/test_Issues/test_CoreIssue1345.py +++ b/tests/QARegressionTests/test_Issues/test_CoreIssue1345.py @@ -78,7 +78,7 @@ def test_engine_correctly_merges_datasets_and_flags_row_uniqueness_issues( dm_related_issues = [ issue for issue in json_report.get("Issue_Details", []) - if issue.get("dataset", "").lower() in {"dm.json", "suppdm.json"} + if issue.get("dataset", "") in {"DM", "SUPPDM"} ] assert not dm_related_issues, ( @@ -88,7 +88,7 @@ def test_engine_correctly_merges_datasets_and_flags_row_uniqueness_issues( dm_related_summary = [ s for s in json_report.get("Issue_Summary", []) - if s.get("dataset", "").lower() in {"dm.json", "suppdm.json"} + if s.get("dataset", "") in {"DM", "SUPPDM"} ] assert not dm_related_summary, ( @@ -99,7 +99,7 @@ def test_engine_correctly_merges_datasets_and_flags_row_uniqueness_issues( ec_detail_issues = [ i for i in json_report.get("Issue_Details", []) - if i.get("dataset", "").lower() == "ec.json" + if i.get("dataset", "") in {"EC"} ] assert ( @@ -112,7 +112,7 @@ def test_engine_correctly_merges_datasets_and_flags_row_uniqueness_issues( ec_summary_issues = [ s for s in json_report.get("Issue_Summary", []) - if s.get("dataset", "").lower() == "ec.json" + if s.get("dataset", "") in {"EC"} ] assert ( @@ -147,7 +147,7 @@ def test_engine_correctly_processes_relrec_when_supp_datasets_provided( relrec_issues = [ i for i in json_report.get("Issue_Details", []) - if i.get("dataset", "").lower() == "relrec.json" + if i.get("dataset", "") in {"RELREC"} ] assert ( len(relrec_issues) == 2 @@ -157,7 +157,7 @@ def test_engine_correctly_processes_relrec_when_supp_datasets_provided( ec_detail_issues = [ i for i in json_report.get("Issue_Details", []) - if i.get("dataset", "").lower() == "ec.json" + if i.get("dataset", "") in {"EC"} ] assert ( len(ec_detail_issues) == 2 diff --git a/tests/QARegressionTests/test_Issues/test_CoreIssue1348.py b/tests/QARegressionTests/test_Issues/test_CoreIssue1348.py index 0cbad8d0a..74c41a025 100644 --- a/tests/QARegressionTests/test_Issues/test_CoreIssue1348.py +++ b/tests/QARegressionTests/test_Issues/test_CoreIssue1348.py @@ -13,7 +13,7 @@ ) _summary = [ { - "dataset": "StudyVersion.xpt", + "dataset": "StudyVersion", "core_id": "CORE-000409", "message": _message, "issues": 7, @@ -24,7 +24,7 @@ "core_id": "CORE-000409", "message": _message, "executability": "fully executable", - "dataset": "StudyVersion.xpt", + "dataset": "StudyVersion", "USUBJID": "", "row": 1, "SEQ": "", @@ -49,7 +49,7 @@ "core_id": "CORE-000409", "message": _message, "executability": "fully executable", - "dataset": "StudyVersion.xpt", + "dataset": "StudyVersion", "USUBJID": "", "row": 2, "SEQ": "", @@ -67,7 +67,7 @@ "core_id": "CORE-000409", "message": _message, "executability": "fully executable", - "dataset": "StudyVersion.xpt", + "dataset": "StudyVersion", "USUBJID": "", "row": 3, "SEQ": "", @@ -92,7 +92,7 @@ "core_id": "CORE-000409", "message": _message, "executability": "fully executable", - "dataset": "StudyVersion.xpt", + "dataset": "StudyVersion", "USUBJID": "", "row": 4, "SEQ": "", @@ -110,7 +110,7 @@ "core_id": "CORE-000409", "message": _message, "executability": "fully executable", - "dataset": "StudyVersion.xpt", + "dataset": "StudyVersion", "USUBJID": "", "row": 5, "SEQ": "", @@ -135,7 +135,7 @@ "core_id": "CORE-000409", "message": _message, "executability": "fully executable", - "dataset": "StudyVersion.xpt", + "dataset": "StudyVersion", "USUBJID": "", "row": 6, "SEQ": "", @@ -153,7 +153,7 @@ "core_id": "CORE-000409", "message": _message, "executability": "fully executable", - "dataset": "StudyVersion.xpt", + "dataset": "StudyVersion", "USUBJID": "", "row": 7, "SEQ": "", diff --git a/tests/QARegressionTests/test_Issues/test_CoreIssue1421.py b/tests/QARegressionTests/test_Issues/test_CoreIssue1421.py index 675a6359d..009e7eca0 100644 --- a/tests/QARegressionTests/test_Issues/test_CoreIssue1421.py +++ b/tests/QARegressionTests/test_Issues/test_CoreIssue1421.py @@ -73,7 +73,7 @@ def test_validate_define_xml_against_lib_metadata(): dataset_column_values = [ cell.value for cell in dataset_column[1:] if cell.value is not None ] - assert sorted(set(dataset_column_values)) == ["dm.xpt", "suppec.xpt"] + assert sorted(set(dataset_column_values)) == ["DM", "SUPPEC"] core_id_column = sheet[issue_sheet_coreid_column] core_id_column_values = [ @@ -125,7 +125,7 @@ def test_validate_define_xml_against_lib_metadata(): for row in summary_values: assert row[2] == "Issue with codelist definition in the Define-XML document." datasets_in_summary = set(row[0] for row in summary_values if row[0] is not None) - assert datasets_in_summary == {"dm.xpt", "suppec.xpt"} + assert datasets_in_summary == {"DM", "SUPPEC"} # Delete the excel file if os.path.exists(excel_file_path): From b6ae02cca2917198d9a5029aa9351fc454f5347b Mon Sep 17 00:00:00 2001 From: Gerry Campion Date: Wed, 15 Apr 2026 17:44:38 -0400 Subject: [PATCH 08/18] fix rule editor test --- .github/test/selenium_test_editor.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/.github/test/selenium_test_editor.py b/.github/test/selenium_test_editor.py index 12b36b329..2844b505e 100644 --- a/.github/test/selenium_test_editor.py +++ b/.github/test/selenium_test_editor.py @@ -184,7 +184,7 @@ "DM": [ { "executionStatus": "success", - "dataset": "dm.xpt", + "dataset": "DM", "domain": "DM", "variables": [], "message": None, @@ -194,7 +194,7 @@ "FA": [ { "executionStatus": "issue reported", - "dataset": "fa.xpt", + "dataset": "FA", "domain": "FA", "variables": [ "$val_dy", @@ -215,7 +215,7 @@ "RFSTDTC": "2012-11-15", "FADTC": "2012-12-02", }, - "dataset": "fa.xpt", + "dataset": "FA", "row": 1, "USUBJID": "CDISC002", "SEQ": 1, @@ -227,7 +227,7 @@ "RFSTDTC": "2013-10-08", "FADTC": "2013-10-12", }, - "dataset": "fa.xpt", + "dataset": "FA", "row": 2, "USUBJID": "CDISC004", "SEQ": 2, @@ -239,7 +239,7 @@ "RFSTDTC": "2013-01-05", "FADTC": "2012-12-02", }, - "dataset": "fa.xpt", + "dataset": "FA", "row": 4, "USUBJID": "CDISC007", "SEQ": 4, @@ -251,7 +251,7 @@ "RFSTDTC": "2014-05-11", "FADTC": "2014-12-02", }, - "dataset": "fa.xpt", + "dataset": "FA", "row": 5, "USUBJID": "CDISC008", "SEQ": 5, @@ -262,7 +262,7 @@ "IE": [ { "executionStatus": "issue reported", - "dataset": "ie.xpt", + "dataset": "IE", "domain": "IE", "variables": [ "$val_dy", @@ -283,7 +283,7 @@ "RFSTDTC": "2022-03-20", "IEDTC": "2022-03-17", }, - "dataset": "ie.xpt", + "dataset": "IE", "row": 1, "USUBJID": "CDISC-TEST-001", "SEQ": 1, @@ -294,7 +294,7 @@ "LB": [ { "executionStatus": "issue reported", - "dataset": "lb.xpt", + "dataset": "LB", "domain": "LB", "variables": [ "$val_dy", @@ -315,7 +315,7 @@ "LBDTC": "2022-03-30", "LBDY": 2, }, - "dataset": "lb.xpt", + "dataset": "LB", "row": 1, "USUBJID": "CDISC-TEST-001", "SEQ": 1, From 039f678832e792a1c1436032608d9aba4270400c Mon Sep 17 00:00:00 2001 From: Gerry Campion Date: Wed, 15 Apr 2026 18:09:05 -0400 Subject: [PATCH 09/18] removed more dataset path --- cdisc_rules_engine/interfaces/data_service_interface.py | 2 -- .../services/data_services/base_data_service.py | 2 -- cdisc_rules_engine/utilities/rule_processor.py | 6 ++---- 3 files changed, 2 insertions(+), 8 deletions(-) diff --git a/cdisc_rules_engine/interfaces/data_service_interface.py b/cdisc_rules_engine/interfaces/data_service_interface.py index 18916c599..19a1510f2 100644 --- a/cdisc_rules_engine/interfaces/data_service_interface.py +++ b/cdisc_rules_engine/interfaces/data_service_interface.py @@ -117,8 +117,6 @@ def get_dataset_class( @abstractmethod def get_data_structure( self, - file_path: str, - datasets: Iterable[SDTMDatasetMetadata], dataset_metadata: SDTMDatasetMetadata, ) -> Optional[str]: """ diff --git a/cdisc_rules_engine/services/data_services/base_data_service.py b/cdisc_rules_engine/services/data_services/base_data_service.py index c35cef665..a47476868 100644 --- a/cdisc_rules_engine/services/data_services/base_data_service.py +++ b/cdisc_rules_engine/services/data_services/base_data_service.py @@ -193,8 +193,6 @@ def get_dataset_class( def get_data_structure( self, - file_path: str, - datasets: Iterable[SDTMDatasetMetadata], dataset_metadata: SDTMDatasetMetadata, ) -> Optional[str]: # TODO: look at defineXML if applicable for more accurate data structure detection diff --git a/cdisc_rules_engine/utilities/rule_processor.py b/cdisc_rules_engine/utilities/rule_processor.py index 5e452cad8..a3d9110f8 100644 --- a/cdisc_rules_engine/utilities/rule_processor.py +++ b/cdisc_rules_engine/utilities/rule_processor.py @@ -188,7 +188,7 @@ def _domain_matched_ap_or_supp( ) def rule_applies_to_data_structure( - self, rule, datasets, dataset_metadata: SDTMDatasetMetadata + self, rule, dataset_metadata: SDTMDatasetMetadata ): datastructures = rule.get("data_structures") or {} included_datastructures = datastructures.get("Include", []) @@ -201,8 +201,6 @@ def rule_applies_to_data_structure( if ALL_KEYWORD in included_datastructures: return True ds = self.data_service.get_data_structure( - dataset_metadata.full_path, - datasets, dataset_metadata, ) if ds and (ds not in included_datastructures): @@ -675,7 +673,7 @@ def is_suitable_for_validation( ) logger.info(f"is_suitable_for_validation. {reason}, result=False") return False, reason - if not self.rule_applies_to_data_structure(rule, datasets, dataset_metadata): + if not self.rule_applies_to_data_structure(rule, dataset_metadata): reason = ( f"Rule skipped - doesn't apply to data structure for " f"rule id={rule_id}, dataset={dataset_name}" From 85eef279628373ae8fa539a7ff5b3b77873c5bfc Mon Sep 17 00:00:00 2001 From: Gerry Campion Date: Wed, 15 Apr 2026 18:53:53 -0400 Subject: [PATCH 10/18] remove unused method --- cdisc_rules_engine/rules_engine.py | 14 -------------- 1 file changed, 14 deletions(-) diff --git a/cdisc_rules_engine/rules_engine.py b/cdisc_rules_engine/rules_engine.py index 1471a9d0f..2874d89e4 100644 --- a/cdisc_rules_engine/rules_engine.py +++ b/cdisc_rules_engine/rules_engine.py @@ -45,9 +45,6 @@ from cdisc_rules_engine.services import logger from cdisc_rules_engine.services.cache import CacheServiceFactory from cdisc_rules_engine.services.data_services import DataServiceFactory -from cdisc_rules_engine.services.define_xml.define_xml_reader_factory import ( - DefineXMLReaderFactory, -) from cdisc_rules_engine.utilities.jsonata_processor import JSONataProcessor from cdisc_rules_engine.utilities.data_processor import DataProcessor from cdisc_rules_engine.utilities.dataset_preprocessor import DatasetPreprocessor @@ -442,17 +439,6 @@ def execute_rule( ) return results - def get_define_xml_value_level_metadata( - self, dataset_path: str, domain_name: str - ) -> List[dict]: - """ - Gets Define XML variable metadata and returns it as dataframe. - """ - define_xml_reader = DefineXMLReaderFactory.get_define_xml_reader( - dataset_path, self.define_xml_path, self.data_service, self.cache - ) - return define_xml_reader.extract_value_level_metadata(domain_name=domain_name) - def handle_validation_exceptions( # noqa self, exception, filename: str ) -> ValidationErrorContainer: From 9b836b1a5e54b94988cfe5d87562dd6adb01d659 Mon Sep 17 00:00:00 2001 From: Gerry Campion Date: Wed, 15 Apr 2026 20:29:12 -0400 Subject: [PATCH 11/18] remove unnecessary dataset path params --- .../dataset_builders/base_dataset_builder.py | 28 +++++++++++++------ .../dataset_builder_factory.py | 1 - ...dataset_metadata_define_dataset_builder.py | 5 +++- .../operations/base_operation.py | 2 -- cdisc_rules_engine/rules_engine.py | 1 - .../define_xml/define_xml_reader_factory.py | 7 ++--- .../utilities/rule_processor.py | 4 +-- .../utilities/sdtm_utilities.py | 2 -- cdisc_rules_engine/utilities/utils.py | 4 --- .../test_base_dataset_builder.py | 10 +++---- .../test_content_metadata_dataset_builder.py | 2 -- .../test_contents_dataset_builder.py | 1 - .../test_contents_define_dataset_builder.py | 1 - ...ntents_define_variables_dataset_builder.py | 1 - ...est_contents_define_vlm_dataset_builder.py | 1 - ...dataset_metadata_define_dataset_builder.py | 1 - ..._define_variables_with_library_metadata.py | 1 - .../test_domain_presence_define_builder.py | 1 - .../test_json_schema_check_dataset_builder.py | 3 -- .../test_values_dataset_metadata_builder.py | 2 -- .../test_values_variables_metadata_builder.py | 2 -- ...with_define_and_library_dataset_builder.py | 7 +++-- ...a_with_library_metadata_dataset_builder.py | 2 -- tests/unit/test_utilities/test_sdtm_utils.py | 14 ---------- 24 files changed, 39 insertions(+), 64 deletions(-) diff --git a/cdisc_rules_engine/dataset_builders/base_dataset_builder.py b/cdisc_rules_engine/dataset_builders/base_dataset_builder.py index 1f6205a7d..0ed5364da 100644 --- a/cdisc_rules_engine/dataset_builders/base_dataset_builder.py +++ b/cdisc_rules_engine/dataset_builders/base_dataset_builder.py @@ -25,7 +25,6 @@ def __init__( cache_service, rule_processor: RuleProcessor, data_processor, - dataset_path, datasets: Iterable[SDTMDatasetMetadata], dataset_metadata: SDTMDatasetMetadata, define_xml_path, @@ -38,7 +37,6 @@ def __init__( self.cache = cache_service self.data_processor = data_processor self.rule_processor = rule_processor - self.dataset_path = dataset_path self.datasets = datasets self.dataset_metadata = dataset_metadata self.rule = rule @@ -128,7 +126,10 @@ def get_define_xml_item_group_metadata_for_dataset( """ define_xml_reader = DefineXMLReaderFactory.get_define_xml_reader( - self.dataset_path, self.define_xml_path, self.data_service, self.cache + self.dataset_metadata.full_path, + self.define_xml_path, + self.data_service, + self.cache, ) return define_xml_reader.extract_dataset_metadata( dataset_metadata["dataset_name"] @@ -151,7 +152,10 @@ def get_define_xml_item_group_metadata_for_domain(self, domain: str) -> List[dic """ define_xml_reader = DefineXMLReaderFactory.get_define_xml_reader( - self.dataset_path, self.define_xml_path, self.data_service, self.cache + self.dataset_metadata.full_path, + self.define_xml_path, + self.data_service, + self.cache, ) return define_xml_reader.extract_domain_metadata(domain) @@ -166,7 +170,10 @@ def get_define_xml_variables_metadata(self) -> List[dict]: | SUPPDM | DM | """ define_xml_reader = DefineXMLReaderFactory.get_define_xml_reader( - self.dataset_path, self.define_xml_path, self.data_service, self.cache + self.dataset_metadata.full_path, + self.define_xml_path, + self.data_service, + self.cache, ) domain = self.dataset_metadata.domain or self.dataset_metadata.rdomain return define_xml_reader.extract_variables_metadata( @@ -178,7 +185,10 @@ def get_define_xml_value_level_metadata(self) -> List[dict]: Gets Define XML value level metadata and returns it as dataframe. """ define_xml_reader = DefineXMLReaderFactory.get_define_xml_reader( - self.dataset_path, self.define_xml_path, self.data_service, self.cache + self.dataset_metadata.full_path, + self.define_xml_path, + self.data_service, + self.cache, ) return define_xml_reader.extract_value_level_metadata( domain_name=self.dataset_metadata.domain @@ -190,7 +200,10 @@ def add_row_number(dataframe: DatasetInterface) -> None: def get_define_metadata(self): define_xml_reader = DefineXMLReaderFactory.get_define_xml_reader( - self.dataset_path, self.define_xml_path, self.data_service, self.cache + self.dataset_metadata.full_path, + self.define_xml_path, + self.data_service, + self.cache, ) return define_xml_reader.read() @@ -209,7 +222,6 @@ def get_library_variables_metadata(self) -> DatasetInterface: data_service=self.data_service, datasets=self.datasets, dataset_metadata=self.dataset_metadata, - dataset_path=self.dataset_path, ) variables_metadata: dict = self.library_metadata.variables_metadata.get( domain, {} diff --git a/cdisc_rules_engine/dataset_builders/dataset_builder_factory.py b/cdisc_rules_engine/dataset_builders/dataset_builder_factory.py index a275cb70e..bbddcbdd5 100644 --- a/cdisc_rules_engine/dataset_builders/dataset_builder_factory.py +++ b/cdisc_rules_engine/dataset_builders/dataset_builder_factory.py @@ -106,7 +106,6 @@ def get_service( kwargs.get("cache_service"), kwargs.get("rule_processor"), kwargs.get("data_processor"), - kwargs.get("dataset_path"), kwargs.get("datasets"), kwargs.get("dataset_metadata", ""), kwargs.get("define_xml_path"), diff --git a/cdisc_rules_engine/dataset_builders/dataset_metadata_define_dataset_builder.py b/cdisc_rules_engine/dataset_builders/dataset_metadata_define_dataset_builder.py index 20bdb6d4c..4a23ba618 100644 --- a/cdisc_rules_engine/dataset_builders/dataset_metadata_define_dataset_builder.py +++ b/cdisc_rules_engine/dataset_builders/dataset_metadata_define_dataset_builder.py @@ -80,7 +80,10 @@ def _get_define_xml_dataframe(self): logger.info(f"No define_metadata is provided for {__name__}.") return self.dataset_implementation(columns=define_col_order) define_xml_reader = DefineXMLReaderFactory.get_define_xml_reader( - self.dataset_path, self.define_xml_path, self.data_service, self.cache + self.dataset_metadata.full_path, + self.define_xml_path, + self.data_service, + self.cache, ) enriched_metadata = [] for basic_metadata in define_metadata: diff --git a/cdisc_rules_engine/operations/base_operation.py b/cdisc_rules_engine/operations/base_operation.py index 2d4f1b761..684c16b3f 100644 --- a/cdisc_rules_engine/operations/base_operation.py +++ b/cdisc_rules_engine/operations/base_operation.py @@ -233,7 +233,6 @@ def _get_variables_metadata_from_standard(self) -> List[dict]: dataset_name=self.params.domain, datasets=self.params.datasets ), datasets=self.params.datasets, - dataset_path=self.params.dataset_path, ) def _get_variable_names_list(self, domain, dataframe): @@ -276,7 +275,6 @@ def _get_variables_metadata_from_standard_model(self, dataframe) -> List[dict]: return sdtm_utilities.get_variables_metadata_from_standard_model( dataframe=dataframe, datasets=self.params.datasets, - dataset_path=self.params.dataset_path, data_service=self.data_service, library_metadata=self.library_metadata, dataset_metadata=self.data_service.get_raw_dataset_metadata( diff --git a/cdisc_rules_engine/rules_engine.py b/cdisc_rules_engine/rules_engine.py index 2874d89e4..060265b25 100644 --- a/cdisc_rules_engine/rules_engine.py +++ b/cdisc_rules_engine/rules_engine.py @@ -301,7 +301,6 @@ def get_dataset_builder( rule_processor=self.rule_processor, dataset_metadata=dataset_metadata, datasets=datasets, - dataset_path=dataset_metadata.full_path, define_xml_path=self.define_xml_path, standard=self.standard, standard_version=self.standard_version, diff --git a/cdisc_rules_engine/services/define_xml/define_xml_reader_factory.py b/cdisc_rules_engine/services/define_xml/define_xml_reader_factory.py index 61156cfad..1e6dff126 100644 --- a/cdisc_rules_engine/services/define_xml/define_xml_reader_factory.py +++ b/cdisc_rules_engine/services/define_xml/define_xml_reader_factory.py @@ -1,4 +1,4 @@ -import os +from os.path import dirname, join from xml.etree import ElementTree from re import compile from typing import Union @@ -17,7 +17,6 @@ from cdisc_rules_engine.services.define_xml.base_define_xml_reader import ( BaseDefineXMLReader, ) -from cdisc_rules_engine.utilities.utils import get_directory_path class DefineXMLReaderFactory: @@ -107,9 +106,9 @@ def _from_namespace(cls, namespace: str) -> BaseDefineXMLReader: def get_define_xml_reader( cls, dataset_path: str, define_xml_path: str, data_service, cache ): - directory_path = get_directory_path(dataset_path) + directory_path = dirname(dataset_path) if define_xml_path is None: - define_xml_path: str = os.path.join( + define_xml_path: str = join( directory_path, DEFINE_XML_FILE_NAME, ) diff --git a/cdisc_rules_engine/utilities/rule_processor.py b/cdisc_rules_engine/utilities/rule_processor.py index a3d9110f8..3d6f148f4 100644 --- a/cdisc_rules_engine/utilities/rule_processor.py +++ b/cdisc_rules_engine/utilities/rule_processor.py @@ -1,6 +1,7 @@ import re import copy +from os.path import dirname from typing import Iterable, List, Optional, Union, Tuple from cdisc_rules_engine.enums.rule_types import RuleTypes from cdisc_rules_engine.interfaces.cache_service_interface import ( @@ -37,7 +38,6 @@ from cdisc_rules_engine.services import logger from cdisc_rules_engine.utilities.data_processor import DataProcessor from cdisc_rules_engine.utilities.utils import ( - get_directory_path, get_operations_cache_key, search_in_list, ) @@ -376,7 +376,7 @@ def perform_rule_operations( datasets=datasets, delimiter=operation.get("delimiter"), dictionary_term_type=operation.get("dictionary_term_type"), - directory_path=get_directory_path(dataset_metadata.full_path), + directory_path=dirname(dataset_metadata.full_path), domain=domain, domain_class=operation.get("domain_class"), external_dictionaries=external_dictionaries, diff --git a/cdisc_rules_engine/utilities/sdtm_utilities.py b/cdisc_rules_engine/utilities/sdtm_utilities.py index df342276c..d15a6c0e8 100644 --- a/cdisc_rules_engine/utilities/sdtm_utilities.py +++ b/cdisc_rules_engine/utilities/sdtm_utilities.py @@ -107,7 +107,6 @@ def get_variables_metadata_from_standard( # noqa library_metadata, data_service, dataset_metadata: SDTMDatasetMetadata, - dataset_path: str, datasets: Iterable[SDTMDatasetMetadata], ): add_AP = False @@ -343,7 +342,6 @@ def group_class_variables_by_role( def get_variables_metadata_from_standard_model( # noqa dataframe, datasets: Iterable[SDTMDatasetMetadata], - dataset_path: str, data_service: DataServiceInterface, library_metadata: LibraryMetadataContainer, dataset_metadata: SDTMDatasetMetadata, diff --git a/cdisc_rules_engine/utilities/utils.py b/cdisc_rules_engine/utilities/utils.py index 00648c0d2..16c3cda1a 100644 --- a/cdisc_rules_engine/utilities/utils.py +++ b/cdisc_rules_engine/utilities/utils.py @@ -265,10 +265,6 @@ def get_operations_cache_key( return key -def get_directory_path(dataset_path): - return os.path.dirname(dataset_path) - - def serialize_rule(rule: dict) -> dict: """ Converts rule "conditions" to dict. diff --git a/tests/unit/test_dataset_builders/test_base_dataset_builder.py b/tests/unit/test_dataset_builders/test_base_dataset_builder.py index 92936891b..29ba4b675 100644 --- a/tests/unit/test_dataset_builders/test_base_dataset_builder.py +++ b/tests/unit/test_dataset_builders/test_base_dataset_builder.py @@ -61,13 +61,12 @@ def create_mock_reader_with_metadata(item_group_defs, item_defs=None): return mock_reader -def create_builder_instance(dataset_metadata, dataset_path="/path/to/dataset.xpt"): +def create_builder_instance(dataset_metadata): """ Helper function to create a ConcreteDatasetBuilder instance. Args: dataset_metadata: SDTMDatasetMetadata instance - dataset_path: Path to the dataset file Returns: ConcreteDatasetBuilder instance @@ -78,7 +77,6 @@ def create_builder_instance(dataset_metadata, dataset_path="/path/to/dataset.xpt cache_service=MagicMock(), rule_processor=MagicMock(), data_processor=MagicMock(), - dataset_path=dataset_path, datasets=[dataset_metadata], dataset_metadata=dataset_metadata, define_xml_path="/path/to/define.xml", @@ -171,10 +169,11 @@ def test_get_define_xml_variables_metadata( filename=f"{dataset_name.lower()}.xpt", label=f"{dataset_name} Label", first_record=first_record, + full_path="/path/to/dataset.xpt", ) # Create builder instance - builder = create_builder_instance(dataset_metadata, "/path/to/dataset.xpt") + builder = create_builder_instance(dataset_metadata) # Call the method result = builder.get_define_xml_variables_metadata() @@ -221,10 +220,11 @@ def test_get_define_xml_variables_metadata_domain_not_found( filename="ae.xpt", label="Adverse Events", first_record={"DOMAIN": "AE"}, + full_path="/path/to/ae.xpt", ) # Create builder instance - builder = create_builder_instance(dataset_metadata, "/path/to/ae.xpt") + builder = create_builder_instance(dataset_metadata) # Verify that DomainNotFoundInDefineXMLError is raised with pytest.raises( diff --git a/tests/unit/test_dataset_builders/test_content_metadata_dataset_builder.py b/tests/unit/test_dataset_builders/test_content_metadata_dataset_builder.py index 26e849f89..0abf83017 100644 --- a/tests/unit/test_dataset_builders/test_content_metadata_dataset_builder.py +++ b/tests/unit/test_dataset_builders/test_content_metadata_dataset_builder.py @@ -298,7 +298,6 @@ def test_ContentMetadataDatasetBuilder_split_datasets(conditions): cache_service=None, rule_processor=processor, data_processor=None, - dataset_path=test_data["datasets"][0]["filename"], datasets=test_data.get("datasets", {}), dataset_metadata=SDTMDatasetMetadata( name="QSCG", @@ -327,7 +326,6 @@ def test_ContentMetadataDatasetBuilder_split_datasets(conditions): cache_service=None, rule_processor=processor, data_processor=None, - dataset_path=test_data["datasets"][1]["filename"], datasets=test_data.get("datasets", {}), dataset_metadata=SDTMDatasetMetadata( name="QSPG", diff --git a/tests/unit/test_dataset_builders/test_contents_dataset_builder.py b/tests/unit/test_dataset_builders/test_contents_dataset_builder.py index f84888135..b6d2ec81e 100644 --- a/tests/unit/test_dataset_builders/test_contents_dataset_builder.py +++ b/tests/unit/test_dataset_builders/test_contents_dataset_builder.py @@ -377,7 +377,6 @@ def test_ContentDatasetBuilder_split_datasets(conditions): cache_service=None, rule_processor=processor, data_processor=None, - dataset_path="", datasets=datasets, dataset_metadata=DummyDataset(test_data.get("datasets", {})[0]), define_xml_path=None, diff --git a/tests/unit/test_dataset_builders/test_contents_define_dataset_builder.py b/tests/unit/test_dataset_builders/test_contents_define_dataset_builder.py index 63e02e8d9..2dc7b98cf 100644 --- a/tests/unit/test_dataset_builders/test_contents_define_dataset_builder.py +++ b/tests/unit/test_dataset_builders/test_contents_define_dataset_builder.py @@ -829,7 +829,6 @@ def test_contents_define_dataset_builder(dataset_path): cache_service=None, rule_processor=RuleProcessor(mock_data_service, InMemoryCacheService()), data_processor=None, - dataset_path=dataset_path, datasets=[ SDTMDatasetMetadata(**dataset) for dataset in dataset_metadata.values() ], diff --git a/tests/unit/test_dataset_builders/test_contents_define_variables_dataset_builder.py b/tests/unit/test_dataset_builders/test_contents_define_variables_dataset_builder.py index 24b4cdbed..915dd0d98 100644 --- a/tests/unit/test_dataset_builders/test_contents_define_variables_dataset_builder.py +++ b/tests/unit/test_dataset_builders/test_contents_define_variables_dataset_builder.py @@ -102,7 +102,6 @@ def test_contents_define_variables_dataset_builder( cache_service=None, rule_processor=None, data_processor=None, - dataset_path=None, datasets=None, dataset_metadata=SDTMDatasetMetadata(name="TEST"), define_xml_path=None, diff --git a/tests/unit/test_dataset_builders/test_contents_define_vlm_dataset_builder.py b/tests/unit/test_dataset_builders/test_contents_define_vlm_dataset_builder.py index 739324010..f62413579 100644 --- a/tests/unit/test_dataset_builders/test_contents_define_vlm_dataset_builder.py +++ b/tests/unit/test_dataset_builders/test_contents_define_vlm_dataset_builder.py @@ -124,7 +124,6 @@ def test_contents_define_vlm_dataset_builder( cache_service=None, rule_processor=None, data_processor=None, - dataset_path=None, datasets=None, dataset_metadata=SDTMDatasetMetadata(name="TEST"), define_xml_path=None, diff --git a/tests/unit/test_dataset_builders/test_dataset_metadata_define_dataset_builder.py b/tests/unit/test_dataset_builders/test_dataset_metadata_define_dataset_builder.py index 8ea9110d0..005a095a8 100644 --- a/tests/unit/test_dataset_builders/test_dataset_metadata_define_dataset_builder.py +++ b/tests/unit/test_dataset_builders/test_dataset_metadata_define_dataset_builder.py @@ -124,7 +124,6 @@ def test_dataset_metadata_define_dataset_builder(dataset_path): cache_service=None, rule_processor=None, data_processor=None, - dataset_path=dataset_path, datasets=[data_metadata], dataset_metadata=SDTMDatasetMetadata(full_path=dataset_path), define_xml_path=None, diff --git a/tests/unit/test_dataset_builders/test_define_variables_with_library_metadata.py b/tests/unit/test_dataset_builders/test_define_variables_with_library_metadata.py index 8f9ca0f16..c57f1e59c 100644 --- a/tests/unit/test_dataset_builders/test_define_variables_with_library_metadata.py +++ b/tests/unit/test_dataset_builders/test_define_variables_with_library_metadata.py @@ -184,7 +184,6 @@ def test_define_variables_metadata_with_library_metadata_dataset_builder( cache_service=cache, rule_processor=None, data_processor=None, - dataset_path=test_define_file_path, datasets=[], dataset_metadata=DummyDataset( { diff --git a/tests/unit/test_dataset_builders/test_domain_presence_define_builder.py b/tests/unit/test_dataset_builders/test_domain_presence_define_builder.py index 3005e31cc..e8cb96ec0 100644 --- a/tests/unit/test_dataset_builders/test_domain_presence_define_builder.py +++ b/tests/unit/test_dataset_builders/test_domain_presence_define_builder.py @@ -326,7 +326,6 @@ def test_domain_list_with_define_dataset_builder( cache_service=None, rule_processor=None, data_processor=None, - dataset_path="ae.xpt", datasets=mock_datasets, dataset_metadata=SDTMDatasetMetadata(full_path="ae.xpt"), define_xml_path=None, diff --git a/tests/unit/test_dataset_builders/test_json_schema_check_dataset_builder.py b/tests/unit/test_dataset_builders/test_json_schema_check_dataset_builder.py index 04dad5d4a..4def032e0 100644 --- a/tests/unit/test_dataset_builders/test_json_schema_check_dataset_builder.py +++ b/tests/unit/test_dataset_builders/test_json_schema_check_dataset_builder.py @@ -25,7 +25,6 @@ def _make_builder(schema, instance): cache_service=MagicMock(), rule_processor=MagicMock(), data_processor=MagicMock(), - dataset_path="dummy.xpt", datasets=[], dataset_metadata=MagicMock(), define_xml_path=None, @@ -87,7 +86,6 @@ def test_json_schema_check_dataset_builder_valid(): cache_service=cache_service, rule_processor=MagicMock(), data_processor=MagicMock(), - dataset_path="dummy.xpt", datasets=[], dataset_metadata=MagicMock(name="test_dataset"), define_xml_path=None, @@ -144,7 +142,6 @@ def test_json_schema_check_dataset_builder_invalid(): cache_service=cache_service, rule_processor=MagicMock(), data_processor=MagicMock(), - dataset_path="dummy.xpt", datasets=[], dataset_metadata=dataset_metadata, define_xml_path=None, diff --git a/tests/unit/test_dataset_builders/test_values_dataset_metadata_builder.py b/tests/unit/test_dataset_builders/test_values_dataset_metadata_builder.py index 4d58b504a..ebf612664 100644 --- a/tests/unit/test_dataset_builders/test_values_dataset_metadata_builder.py +++ b/tests/unit/test_dataset_builders/test_values_dataset_metadata_builder.py @@ -60,7 +60,6 @@ def test_build_with_dataset_metadata(mock_build): cache_service=InMemoryCacheService(), rule_processor=rule_processor_mock, data_processor=None, - dataset_path="ae.xpt", datasets=[], dataset_metadata=SDTMDatasetMetadata( name="AE", first_record={"DOMAIN": "AE"} @@ -143,7 +142,6 @@ def test_build_split_datasets(mock_build): cache_service=InMemoryCacheService(), rule_processor=rule_processor_mock, data_processor=None, - dataset_path="", datasets=[], dataset_metadata=None, define_xml_path="", diff --git a/tests/unit/test_dataset_builders/test_values_variables_metadata_builder.py b/tests/unit/test_dataset_builders/test_values_variables_metadata_builder.py index caf5798c2..eac4d0372 100644 --- a/tests/unit/test_dataset_builders/test_values_variables_metadata_builder.py +++ b/tests/unit/test_dataset_builders/test_values_variables_metadata_builder.py @@ -59,7 +59,6 @@ def test_build_with_variable_metadata(mock_build): cache_service=InMemoryCacheService(), rule_processor=None, data_processor=None, - dataset_path="ae.xpt", datasets=[], dataset_metadata=SDTMDatasetMetadata( name="AE", first_record={"DOMAIN": "AE"} @@ -162,7 +161,6 @@ def test_concat_with_split_datasets(): cache_service=InMemoryCacheService(), rule_processor=None, data_processor=None, - dataset_path="ae.xpt", datasets=[], dataset_metadata=SDTMDatasetMetadata(name="AE", first_record={"DOMAIN": "AE"}), define_xml_path="", diff --git a/tests/unit/test_dataset_builders/test_variables_metadata_with_define_and_library_dataset_builder.py b/tests/unit/test_dataset_builders/test_variables_metadata_with_define_and_library_dataset_builder.py index 674e01176..85c7a58bd 100644 --- a/tests/unit/test_dataset_builders/test_variables_metadata_with_define_and_library_dataset_builder.py +++ b/tests/unit/test_dataset_builders/test_variables_metadata_with_define_and_library_dataset_builder.py @@ -140,9 +140,12 @@ def test_build_combined_metadata( cache_service=InMemoryCacheService(), rule_processor=None, data_processor=None, - dataset_path=str(test_define_file_path), datasets=[], - dataset_metadata=SDTMDatasetMetadata(name="AE", first_record={"DOMAIN": "AE"}), + dataset_metadata=SDTMDatasetMetadata( + name="AE", + first_record={"DOMAIN": "AE"}, + full_path=str(test_define_file_path), + ), define_xml_path=str(test_define_file_path), standard="sdtmig", standard_version="3-4", diff --git a/tests/unit/test_dataset_builders/test_variables_metadata_with_library_metadata_dataset_builder.py b/tests/unit/test_dataset_builders/test_variables_metadata_with_library_metadata_dataset_builder.py index ee6c8d947..1708785a4 100644 --- a/tests/unit/test_dataset_builders/test_variables_metadata_with_library_metadata_dataset_builder.py +++ b/tests/unit/test_dataset_builders/test_variables_metadata_with_library_metadata_dataset_builder.py @@ -130,7 +130,6 @@ def test_variable_metadata_with_library_metadata_dataset_builder( cache_service=cache, rule_processor=None, data_processor=None, - dataset_path=None, datasets=[], dataset_metadata=DummyDataset( { @@ -354,7 +353,6 @@ def test_variable_metadata_with_library_metadata_dataset_builder_variable_only_i cache_service=cache, rule_processor=None, data_processor=None, - dataset_path=None, datasets=[], dataset_metadata=DummyDataset( { diff --git a/tests/unit/test_utilities/test_sdtm_utils.py b/tests/unit/test_utilities/test_sdtm_utils.py index 0b4ebc37e..38905fde0 100644 --- a/tests/unit/test_utilities/test_sdtm_utils.py +++ b/tests/unit/test_utilities/test_sdtm_utils.py @@ -45,7 +45,6 @@ def test_standard_domain_ae(library_metadata, mock_data_service, mock_datasets): library_metadata, mock_data_service, dataset_metadata, - "/path/to/ae.xpt", mock_datasets, ) assert any(var["name"] == "STUDYID" for var in variables) @@ -59,7 +58,6 @@ def test_standard_domain_dm(library_metadata, mock_data_service, mock_datasets): library_metadata, mock_data_service, dataset_metadata, - "/path/to/dm.xpt", mock_datasets, ) assert any(var["name"] == "USUBJID" for var in variables) @@ -73,7 +71,6 @@ def test_findings_domain_lb(library_metadata, mock_data_service, mock_datasets): library_metadata, mock_data_service, dataset_metadata, - "/path/to/lb.xpt", mock_datasets, ) assert any(var["name"] == "STUDYID" for var in variables) @@ -88,7 +85,6 @@ def test_supp_domain(library_metadata, mock_data_service, mock_datasets): library_metadata, mock_data_service, dataset_metadata, - "/path/to/suppae.xpt", mock_datasets, ) assert any(var["name"] == "STUDYID" for var in variables) @@ -102,7 +98,6 @@ def test_sq_domain(library_metadata, mock_data_service, mock_datasets): library_metadata, mock_data_service, dataset_metadata, - "/path/to/sqae.xpt", mock_datasets, ) assert any(var["name"] == "STUDYID" for var in variables) @@ -116,7 +111,6 @@ def test_ap_domain(library_metadata, mock_data_service, mock_datasets): library_metadata, mock_data_service, dataset_metadata, - "/path/to/apdm.xpt", mock_datasets, ) assert any(var["name"] == "APID" for var in variables) @@ -134,7 +128,6 @@ def test_sqap_domain(library_metadata, mock_data_service, mock_datasets): library_metadata, mock_data_service, dataset_metadata, - "/path/to/sqapmh.xpt", mock_datasets, ) assert any(var["name"] == "APID" for var in variables) @@ -149,7 +142,6 @@ def test_findings_about_domain_fa(library_metadata, mock_data_service, mock_data library_metadata, mock_data_service, dataset_metadata, - "/path/to/fa.xpt", mock_datasets, ) assert any(var["name"] == "FATEST" for var in variables) @@ -163,7 +155,6 @@ def test_findings_domain_from_model(library_metadata, mock_data_service, mock_da variables = get_variables_metadata_from_standard_model( dataframe=mock_dataframe, datasets=mock_datasets, - dataset_path="/path/to/lb.xpt", data_service=mock_data_service, library_metadata=library_metadata, dataset_metadata=dataset_metadata, @@ -179,7 +170,6 @@ def test_supp_domain_from_model(library_metadata, mock_data_service, mock_datase variables = get_variables_metadata_from_standard_model( dataframe=mock_dataframe, datasets=mock_datasets, - dataset_path="/path/to/suppae.xpt", data_service=mock_data_service, library_metadata=library_metadata, dataset_metadata=dataset_metadata, @@ -195,7 +185,6 @@ def test_sqap_domain_from_model(library_metadata, mock_data_service, mock_datase variables = get_variables_metadata_from_standard_model( dataframe=mock_dataframe, datasets=mock_datasets, - dataset_path="/path/to/suppae.xpt", data_service=mock_data_service, library_metadata=library_metadata, dataset_metadata=dataset_metadata, @@ -211,7 +200,6 @@ def test_ap_domain_from_model(library_metadata, mock_data_service, mock_datasets variables = get_variables_metadata_from_standard_model( dataframe=mock_dataframe, datasets=mock_datasets, - dataset_path="/path/to/apdm.xpt", data_service=mock_data_service, library_metadata=library_metadata, dataset_metadata=dataset_metadata, @@ -230,7 +218,6 @@ def test_custom_domain_events_class(library_metadata, mock_data_service, mock_da library_metadata, mock_data_service, dataset_metadata, - "/path/to/zz.xpt", mock_datasets, ) mock_data_service.handle_custom_domains.assert_called_once() @@ -250,7 +237,6 @@ def test_custom_domain_findings_class( library_metadata, mock_data_service, dataset_metadata, - "/path/to/xx.xpt", mock_datasets, ) mock_data_service.handle_custom_domains.assert_called_once() From 6da3024ed761c6590369c8414f4a9617a14e7fe8 Mon Sep 17 00:00:00 2001 From: Gerry Campion Date: Thu, 16 Apr 2026 09:41:59 -0400 Subject: [PATCH 12/18] missed a dataset_path --- cdisc_rules_engine/operations/base_operation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cdisc_rules_engine/operations/base_operation.py b/cdisc_rules_engine/operations/base_operation.py index 684c16b3f..f0ee50924 100644 --- a/cdisc_rules_engine/operations/base_operation.py +++ b/cdisc_rules_engine/operations/base_operation.py @@ -278,7 +278,7 @@ def _get_variables_metadata_from_standard_model(self, dataframe) -> List[dict]: data_service=self.data_service, library_metadata=self.library_metadata, dataset_metadata=self.data_service.get_raw_dataset_metadata( - dataset_name=self.params.dataset_path, datasets=self.params.datasets + dataset_name=self.params.domain, datasets=self.params.datasets ), ) From 5429014cfc60c62d2570f8d01b717f90871d4145 Mon Sep 17 00:00:00 2001 From: Gerry Campion Date: Thu, 16 Apr 2026 22:13:37 -0400 Subject: [PATCH 13/18] remove extra datasets references --- .../dataset_builders/base_dataset_builder.py | 9 +- .../content_metadata_dataset_builder.py | 1 - .../contents_dataset_builder.py | 6 +- .../contents_define_dataset_builder.py | 1 - .../dataset_builder_factory.py | 1 - ...dataset_metadata_define_dataset_builder.py | 4 +- .../dataset_metadata_values_builder.py | 1 - .../domain_list_dataset_builder.py | 3 +- .../domain_list_with_define_builder.py | 4 +- .../variables_metadata_dataset_builder.py | 1 - ...riables_metadata_values_dataset_builder.py | 1 - ...with_define_and_library_dataset_builder.py | 1 - ...es_metadata_with_define_dataset_builder.py | 1 - ...ariables_metadata_with_library_metadata.py | 1 - .../dummy_models/dummy_dataset.py | 2 +- .../interfaces/data_service_interface.py | 2 - cdisc_rules_engine/models/operation_params.py | 4 +- .../operations/base_operation.py | 6 +- .../operations/dataset_names.py | 2 +- .../operations/day_data_validator.py | 4 +- .../operations/extract_metadata.py | 2 +- .../parent_library_model_column_order.py | 2 +- .../operations/related_domain_is_custom.py | 2 +- .../operations/study_domains.py | 4 +- .../operations/variable_count.py | 5 +- .../operations/variable_value_count.py | 6 +- cdisc_rules_engine/rules_engine.py | 26 +- .../data_services/base_data_service.py | 11 +- .../data_services/local_data_service.py | 13 +- .../services/reporting/report_factory.py | 8 +- .../utilities/dataset_preprocessor.py | 26 +- .../utilities/rule_processor.py | 23 +- .../utilities/sdtm_utilities.py | 7 +- scripts/run_validation.py | 11 +- tests/conftest.py | 1 - .../test_base_dataset_builder.py | 1 - .../test_content_metadata_dataset_builder.py | 2 - .../test_contents_dataset_builder.py | 4 +- .../test_contents_define_dataset_builder.py | 3 - ...ntents_define_variables_dataset_builder.py | 1 - ...est_contents_define_vlm_dataset_builder.py | 1 - ...dataset_metadata_define_dataset_builder.py | 1 - ..._define_variables_with_library_metadata.py | 1 - .../test_domain_presence_define_builder.py | 36 ++- .../test_json_schema_check_dataset_builder.py | 2 - .../test_values_dataset_metadata_builder.py | 2 - .../test_values_variables_metadata_builder.py | 2 - ...with_define_and_library_dataset_builder.py | 1 - ...a_with_library_metadata_dataset_builder.py | 2 - tests/unit/test_dataset_preprocessor.py | 262 ++++++++++-------- .../test_operations/test_dataset_names.py | 23 +- .../test_day_data_validator.py | 2 +- .../test_parent_library_model_column_order.py | 8 +- .../test_related_domain_is_custom.py | 11 +- .../test_operations/test_study_domains.py | 8 +- .../test_operations/test_variable_count.py | 2 +- .../test_variable_value_count.py | 2 +- tests/unit/test_rules_engine.py | 212 ++++++++------ .../test_data_service/test_data_service.py | 17 +- .../test_excel_data_service.py | 2 +- .../test_local_data_service.py | 4 +- .../test_reporting/test_report_factory.py | 1 - tests/unit/test_usdm_data.py | 1 - .../test_utilities/test_rule_processor.py | 92 +++--- tests/unit/test_utilities/test_sdtm_utils.py | 48 +--- 65 files changed, 484 insertions(+), 472 deletions(-) diff --git a/cdisc_rules_engine/dataset_builders/base_dataset_builder.py b/cdisc_rules_engine/dataset_builders/base_dataset_builder.py index 0ed5364da..04672819b 100644 --- a/cdisc_rules_engine/dataset_builders/base_dataset_builder.py +++ b/cdisc_rules_engine/dataset_builders/base_dataset_builder.py @@ -9,7 +9,7 @@ from cdisc_rules_engine.utilities.sdtm_utilities import ( tag_source, ) -from typing import List, Iterable, Optional +from typing import List, Optional from cdisc_rules_engine.utilities import sdtm_utilities from cdisc_rules_engine.utilities.rule_processor import RuleProcessor from cdisc_rules_engine.models.dataset.dataset_interface import DatasetInterface @@ -25,7 +25,6 @@ def __init__( cache_service, rule_processor: RuleProcessor, data_processor, - datasets: Iterable[SDTMDatasetMetadata], dataset_metadata: SDTMDatasetMetadata, define_xml_path, standard, @@ -37,7 +36,6 @@ def __init__( self.cache = cache_service self.data_processor = data_processor self.rule_processor = rule_processor - self.datasets = datasets self.dataset_metadata = dataset_metadata self.rule = rule self.define_xml_path = define_xml_path @@ -77,7 +75,7 @@ def get_dataset(self, **kwargs): dataset: DatasetInterface = self.data_service.concat_split_datasets( func_to_call=self.build_split_datasets, datasets_metadata=get_corresponding_datasets( - self.datasets, self.dataset_metadata + self.data_service.get_datasets(), self.dataset_metadata ), **kwargs, ) @@ -95,7 +93,7 @@ def get_dataset_contents(self, **kwargs): dataset: DatasetInterface = self.data_service.concat_split_datasets( func_to_call=self.data_service.get_dataset, datasets_metadata=get_corresponding_datasets( - self.datasets, self.dataset_metadata + self.data_service.get_datasets(), self.dataset_metadata ), **kwargs, ) @@ -220,7 +218,6 @@ def get_library_variables_metadata(self) -> DatasetInterface: variables: List[dict] = sdtm_utilities.get_variables_metadata_from_standard( library_metadata=self.library_metadata, data_service=self.data_service, - datasets=self.datasets, dataset_metadata=self.dataset_metadata, ) variables_metadata: dict = self.library_metadata.variables_metadata.get( diff --git a/cdisc_rules_engine/dataset_builders/content_metadata_dataset_builder.py b/cdisc_rules_engine/dataset_builders/content_metadata_dataset_builder.py index 1c325d9f5..9fa191320 100644 --- a/cdisc_rules_engine/dataset_builders/content_metadata_dataset_builder.py +++ b/cdisc_rules_engine/dataset_builders/content_metadata_dataset_builder.py @@ -16,5 +16,4 @@ def build(self): return self.data_service.get_dataset_metadata( dataset_name=self.dataset_metadata.name, size_unit=size_unit, - datasets=self.datasets, ) diff --git a/cdisc_rules_engine/dataset_builders/contents_dataset_builder.py b/cdisc_rules_engine/dataset_builders/contents_dataset_builder.py index 564729b0d..1f1c844e8 100644 --- a/cdisc_rules_engine/dataset_builders/contents_dataset_builder.py +++ b/cdisc_rules_engine/dataset_builders/contents_dataset_builder.py @@ -15,9 +15,7 @@ def build_split_datasets(self, dataset_name, **kwargs): """ Returns the contents of a file as a dataframe for evaluation. """ - return self.data_service.get_dataset( - dataset_name=dataset_name, datasets=self.datasets - ) + return self.data_service.get_dataset(dataset_name=dataset_name) def get_dataset(self, **kwargs): dataset = super().get_dataset(**kwargs) @@ -25,7 +23,7 @@ def get_dataset(self, **kwargs): [ dataset.record_count for dataset in get_corresponding_datasets( - self.datasets, self.dataset_metadata + self.data_service.get_datasets(), self.dataset_metadata ) ] ) diff --git a/cdisc_rules_engine/dataset_builders/contents_define_dataset_builder.py b/cdisc_rules_engine/dataset_builders/contents_define_dataset_builder.py index ed6905ffb..9c21c5e72 100644 --- a/cdisc_rules_engine/dataset_builders/contents_define_dataset_builder.py +++ b/cdisc_rules_engine/dataset_builders/contents_define_dataset_builder.py @@ -38,7 +38,6 @@ def build(self): dataset_metadata = self.data_service.get_dataset_metadata( dataset_name=self.dataset_metadata.name, size_unit=size_unit, - datasets=self.datasets, ).to_dict(orient="records")[0] # Build define xml dataframe define = self.get_define_xml_item_group_metadata_for_dataset(dataset_metadata) diff --git a/cdisc_rules_engine/dataset_builders/dataset_builder_factory.py b/cdisc_rules_engine/dataset_builders/dataset_builder_factory.py index bbddcbdd5..ee98f2d0f 100644 --- a/cdisc_rules_engine/dataset_builders/dataset_builder_factory.py +++ b/cdisc_rules_engine/dataset_builders/dataset_builder_factory.py @@ -106,7 +106,6 @@ def get_service( kwargs.get("cache_service"), kwargs.get("rule_processor"), kwargs.get("data_processor"), - kwargs.get("datasets"), kwargs.get("dataset_metadata", ""), kwargs.get("define_xml_path"), kwargs.get("standard"), diff --git a/cdisc_rules_engine/dataset_builders/dataset_metadata_define_dataset_builder.py b/cdisc_rules_engine/dataset_builders/dataset_metadata_define_dataset_builder.py index 4a23ba618..12c6e794c 100644 --- a/cdisc_rules_engine/dataset_builders/dataset_metadata_define_dataset_builder.py +++ b/cdisc_rules_engine/dataset_builders/dataset_metadata_define_dataset_builder.py @@ -134,12 +134,12 @@ def _get_dataset_dataframe(self): "ap_suffix", ] - if len(self.datasets) == 0: + if len(self.data_service.get_datasets()) == 0: dataset_df = self.dataset_implementation(columns=dataset_col_order) logger.info(f"No datasets metadata is provided in {__name__}.") else: datasets = self.dataset_implementation() - for dataset_metadata in self.datasets: + for dataset_metadata in self.data_service.get_datasets(): ds_metadata = None try: ds_metadata = self.data_service.get_dataset_metadata( diff --git a/cdisc_rules_engine/dataset_builders/dataset_metadata_values_builder.py b/cdisc_rules_engine/dataset_builders/dataset_metadata_values_builder.py index 01d9ae702..eb8206f78 100644 --- a/cdisc_rules_engine/dataset_builders/dataset_metadata_values_builder.py +++ b/cdisc_rules_engine/dataset_builders/dataset_metadata_values_builder.py @@ -23,7 +23,6 @@ def build(self): dataset_metadata = self.data_service.get_dataset_metadata( dataset_name=self.dataset_metadata.name, size_unit=size_unit, - datasets=self.datasets, ) dataset_metadata = dataset_metadata.to_dict(orient="records")[0] data_contents_long_df = super().build() diff --git a/cdisc_rules_engine/dataset_builders/domain_list_dataset_builder.py b/cdisc_rules_engine/dataset_builders/domain_list_dataset_builder.py index fd9b10982..eba882d84 100644 --- a/cdisc_rules_engine/dataset_builders/domain_list_dataset_builder.py +++ b/cdisc_rules_engine/dataset_builders/domain_list_dataset_builder.py @@ -14,5 +14,6 @@ def build(self): """ return self.dataset_implementation.from_records( - {ds.unsplit_name: ds.filename for ds in self.datasets}, index=[0] + {ds.unsplit_name: ds.filename for ds in self.data_service.get_datasets()}, + index=[0], ) diff --git a/cdisc_rules_engine/dataset_builders/domain_list_with_define_builder.py b/cdisc_rules_engine/dataset_builders/domain_list_with_define_builder.py index 8ec2741c3..af92d63ba 100644 --- a/cdisc_rules_engine/dataset_builders/domain_list_with_define_builder.py +++ b/cdisc_rules_engine/dataset_builders/domain_list_with_define_builder.py @@ -26,7 +26,9 @@ def build(self): 1 EC ec.xpt EC False 2 SE None SE True """ - domain_files = {ds.unsplit_name: ds.filename for ds in self.datasets} + domain_files = { + ds.unsplit_name: ds.filename for ds in self.data_service.get_datasets() + } all_define_metadata = self.get_define_metadata() records = [] for define_item in all_define_metadata: diff --git a/cdisc_rules_engine/dataset_builders/variables_metadata_dataset_builder.py b/cdisc_rules_engine/dataset_builders/variables_metadata_dataset_builder.py index 5f66170bc..d22f89ce2 100644 --- a/cdisc_rules_engine/dataset_builders/variables_metadata_dataset_builder.py +++ b/cdisc_rules_engine/dataset_builders/variables_metadata_dataset_builder.py @@ -15,6 +15,5 @@ def build(self): """ return self.data_service.get_variables_metadata( dataset_name=self.dataset_metadata.name, - datasets=self.datasets, drop_duplicates=True, ) diff --git a/cdisc_rules_engine/dataset_builders/variables_metadata_values_dataset_builder.py b/cdisc_rules_engine/dataset_builders/variables_metadata_values_dataset_builder.py index d4c4d66d3..7c279538b 100644 --- a/cdisc_rules_engine/dataset_builders/variables_metadata_values_dataset_builder.py +++ b/cdisc_rules_engine/dataset_builders/variables_metadata_values_dataset_builder.py @@ -23,7 +23,6 @@ def build(self): data_contents_long_df = super().build() variable_metadata = self.data_service.get_variables_metadata( dataset_name=self.dataset_metadata.name, - datasets=self.datasets, drop_duplicates=True, ) merged_df = data_contents_long_df.merge( diff --git a/cdisc_rules_engine/dataset_builders/variables_metadata_with_define_and_library_dataset_builder.py b/cdisc_rules_engine/dataset_builders/variables_metadata_with_define_and_library_dataset_builder.py index 82ff8934c..173b6a9da 100644 --- a/cdisc_rules_engine/dataset_builders/variables_metadata_with_define_and_library_dataset_builder.py +++ b/cdisc_rules_engine/dataset_builders/variables_metadata_with_define_and_library_dataset_builder.py @@ -43,7 +43,6 @@ def build(self): variable_metadata: List[dict] = self.get_define_xml_variables_metadata() content_metadata: DatasetInterface = self.data_service.get_variables_metadata( dataset_name=self.dataset_metadata.name, - datasets=self.datasets, drop_duplicates=True, ) define_metadata: DatasetInterface = self.dataset_implementation.from_records( diff --git a/cdisc_rules_engine/dataset_builders/variables_metadata_with_define_dataset_builder.py b/cdisc_rules_engine/dataset_builders/variables_metadata_with_define_dataset_builder.py index 21edb6c6a..b5a55fa83 100644 --- a/cdisc_rules_engine/dataset_builders/variables_metadata_with_define_dataset_builder.py +++ b/cdisc_rules_engine/dataset_builders/variables_metadata_with_define_dataset_builder.py @@ -37,7 +37,6 @@ def build(self): # get dataset metadata and execute the rule content_metadata: DatasetInterface = self.data_service.get_variables_metadata( dataset_name=self.dataset_metadata.name, - datasets=self.datasets, drop_duplicates=True, ) define_metadata: DatasetInterface = self.dataset_implementation.from_records( diff --git a/cdisc_rules_engine/dataset_builders/variables_metadata_with_library_metadata.py b/cdisc_rules_engine/dataset_builders/variables_metadata_with_library_metadata.py index 0aeed839f..c7d87e86c 100644 --- a/cdisc_rules_engine/dataset_builders/variables_metadata_with_library_metadata.py +++ b/cdisc_rules_engine/dataset_builders/variables_metadata_with_library_metadata.py @@ -27,7 +27,6 @@ def build(self): content_variables_metadata: DatasetInterface = ( self.data_service.get_variables_metadata( dataset_name=self.dataset_metadata.name, - datasets=self.datasets, drop_duplicates=True, ) ) diff --git a/cdisc_rules_engine/dummy_models/dummy_dataset.py b/cdisc_rules_engine/dummy_models/dummy_dataset.py index 042a5aa8c..36b58fa1c 100644 --- a/cdisc_rules_engine/dummy_models/dummy_dataset.py +++ b/cdisc_rules_engine/dummy_models/dummy_dataset.py @@ -7,7 +7,7 @@ class DummyDataset(SDTMDatasetMetadata): - def __init__(self, dataset_data: dict): + def __init__(self, dataset_data: dict | SDTMDatasetMetadata): # with XPT in test, we pass the dataset_data as an instance of SDTMDatasetMetadata if isinstance(dataset_data, SDTMDatasetMetadata): super().__init__( diff --git a/cdisc_rules_engine/interfaces/data_service_interface.py b/cdisc_rules_engine/interfaces/data_service_interface.py index 19a1510f2..db890ff15 100644 --- a/cdisc_rules_engine/interfaces/data_service_interface.py +++ b/cdisc_rules_engine/interfaces/data_service_interface.py @@ -107,7 +107,6 @@ def read_data(self, file_path: str) -> IOBase: def get_dataset_class( self, dataset: DatasetInterface, - datasets: Iterable[SDTMDatasetMetadata], dataset_metadata: SDTMDatasetMetadata, ) -> Optional[str]: """ @@ -134,7 +133,6 @@ def handle_custom_domains( self, dataset: DatasetInterface, dataset_metadata: SDTMDatasetMetadata, - datasets: Iterable[SDTMDatasetMetadata], ) -> str | None: """ Handles custom domains by returning the appropriate class name based on the dataset contents. diff --git a/cdisc_rules_engine/models/operation_params.py b/cdisc_rules_engine/models/operation_params.py index 3cc8b938a..6ba08542c 100644 --- a/cdisc_rules_engine/models/operation_params.py +++ b/cdisc_rules_engine/models/operation_params.py @@ -1,9 +1,8 @@ from dataclasses import dataclass -from typing import Iterable, List +from typing import List from cdisc_rules_engine.models.external_dictionaries_container import ( ExternalDictionariesContainer, ) -from cdisc_rules_engine.models.sdtm_dataset_metadata import SDTMDatasetMetadata import pandas as pd @@ -19,7 +18,6 @@ class OperationParams: core_id: str dataframe: pd.DataFrame dataset_path: str - datasets: Iterable[SDTMDatasetMetadata] domain: str directory_path: str operation_id: str diff --git a/cdisc_rules_engine/operations/base_operation.py b/cdisc_rules_engine/operations/base_operation.py index f0ee50924..97508ca30 100644 --- a/cdisc_rules_engine/operations/base_operation.py +++ b/cdisc_rules_engine/operations/base_operation.py @@ -230,9 +230,8 @@ def _get_variables_metadata_from_standard(self) -> List[dict]: library_metadata=self.library_metadata, data_service=self.data_service, dataset_metadata=self.data_service.get_raw_dataset_metadata( - dataset_name=self.params.domain, datasets=self.params.datasets + dataset_name=self.params.domain ), - datasets=self.params.datasets, ) def _get_variable_names_list(self, domain, dataframe): @@ -274,11 +273,10 @@ def _get_variables_metadata_from_standard_model(self, dataframe) -> List[dict]: return sdtm_utilities.get_variables_metadata_from_standard_model( dataframe=dataframe, - datasets=self.params.datasets, data_service=self.data_service, library_metadata=self.library_metadata, dataset_metadata=self.data_service.get_raw_dataset_metadata( - dataset_name=self.params.domain, datasets=self.params.datasets + dataset_name=self.params.domain ), ) diff --git a/cdisc_rules_engine/operations/dataset_names.py b/cdisc_rules_engine/operations/dataset_names.py index 85331f377..7f4216e1e 100644 --- a/cdisc_rules_engine/operations/dataset_names.py +++ b/cdisc_rules_engine/operations/dataset_names.py @@ -6,4 +6,4 @@ def _execute_operation(self): """ Returns a list of the dataset names in the study """ - return list({dataset.name for dataset in self.params.datasets}) + return list({dataset.name for dataset in self.data_service.get_datasets()}) diff --git a/cdisc_rules_engine/operations/day_data_validator.py b/cdisc_rules_engine/operations/day_data_validator.py index 54b0bfb5f..d09893b5a 100644 --- a/cdisc_rules_engine/operations/day_data_validator.py +++ b/cdisc_rules_engine/operations/day_data_validator.py @@ -12,7 +12,9 @@ def _execute_operation(self): ) # Always get RFSTDTC column from DM dataset. dm_datasets = [ - dataset for dataset in self.params.datasets if dataset.domain == "DM" + dataset + for dataset in self.data_service.get_datasets() + if dataset.domain == "DM" ] if not dm_datasets: raise DomainNotFoundError( diff --git a/cdisc_rules_engine/operations/extract_metadata.py b/cdisc_rules_engine/operations/extract_metadata.py index ff13163cf..225c88b1f 100644 --- a/cdisc_rules_engine/operations/extract_metadata.py +++ b/cdisc_rules_engine/operations/extract_metadata.py @@ -7,7 +7,7 @@ class ExtractMetadata(BaseOperation): def _execute_operation(self): # get metadata metadata: pd.DataFrame = self.data_service.get_dataset_metadata( - dataset_name=self.params.domain, datasets=self.params.datasets + dataset_name=self.params.domain ) # extract target value. Metadata df always has one row diff --git a/cdisc_rules_engine/operations/parent_library_model_column_order.py b/cdisc_rules_engine/operations/parent_library_model_column_order.py index b4ac54e31..0549bd64f 100644 --- a/cdisc_rules_engine/operations/parent_library_model_column_order.py +++ b/cdisc_rules_engine/operations/parent_library_model_column_order.py @@ -36,7 +36,7 @@ def _execute_operation(self): def _get_domain_to_datasets(self): domain_to_datasets = defaultdict(list) - for dataset in self.params.datasets: + for dataset in self.data_service.get_datasets(): domain_to_datasets[dataset.domain].append(dataset) return domain_to_datasets diff --git a/cdisc_rules_engine/operations/related_domain_is_custom.py b/cdisc_rules_engine/operations/related_domain_is_custom.py index 5b681d6f4..e313b7140 100644 --- a/cdisc_rules_engine/operations/related_domain_is_custom.py +++ b/cdisc_rules_engine/operations/related_domain_is_custom.py @@ -10,7 +10,7 @@ def _execute_operation(self): If no -> the domain is custom. """ - for ds in self.params.datasets: + for ds in self.data_service.get_datasets(): if ds.is_supp and self.params.domain.endswith(ds.rdomain): return is_custom_domain(self.library_metadata, ds.rdomain) return False diff --git a/cdisc_rules_engine/operations/study_domains.py b/cdisc_rules_engine/operations/study_domains.py index b931ccda1..4c2fd85b5 100644 --- a/cdisc_rules_engine/operations/study_domains.py +++ b/cdisc_rules_engine/operations/study_domains.py @@ -6,4 +6,6 @@ def _execute_operation(self): """ Returns a list of the domains in the study """ - return list({(dataset.domain or "") for dataset in self.params.datasets}) + return list( + {(dataset.domain or "") for dataset in self.data_service.get_datasets()} + ) diff --git a/cdisc_rules_engine/operations/variable_count.py b/cdisc_rules_engine/operations/variable_count.py index c3b9f8680..a14d60502 100644 --- a/cdisc_rules_engine/operations/variable_count.py +++ b/cdisc_rules_engine/operations/variable_count.py @@ -22,7 +22,10 @@ async def _get_all_study_variable_counts(self) -> dict: of times that value appears as a variable in the study. """ datasets_with_unique_domains = list( - {dataset.unsplit_name: dataset for dataset in self.params.datasets}.values() + { + dataset.unsplit_name: dataset + for dataset in self.data_service.get_datasets() + }.values() ) coroutines = [ self._get_dataset_variable_count(dataset) diff --git a/cdisc_rules_engine/operations/variable_value_count.py b/cdisc_rules_engine/operations/variable_value_count.py index 75e7b6b10..ce3887a59 100644 --- a/cdisc_rules_engine/operations/variable_value_count.py +++ b/cdisc_rules_engine/operations/variable_value_count.py @@ -22,7 +22,9 @@ async def _get_all_study_variable_value_counts(self) -> dict: of times that value appears in the study. """ datasets_with_unique_domains = list( - {dataset.domain: dataset for dataset in self.params.datasets}.values() + { + dataset.domain: dataset for dataset in self.data_service.get_datasets() + }.values() ) coroutines = [ self._get_dataset_variable_value_count(dataset) @@ -36,7 +38,7 @@ async def _get_dataset_variable_value_count( ) -> Counter: if dataset_metadata.is_split: corresponding_datasets = get_corresponding_datasets( - self.params.datasets, dataset_metadata + self.data_service.get_datasets(), dataset_metadata ) data: DatasetInterface = self.data_service.concat_split_datasets( self.data_service.get_dataset, corresponding_datasets diff --git a/cdisc_rules_engine/rules_engine.py b/cdisc_rules_engine/rules_engine.py index 060265b25..bc677ef04 100644 --- a/cdisc_rules_engine/rules_engine.py +++ b/cdisc_rules_engine/rules_engine.py @@ -1,5 +1,5 @@ from copy import deepcopy -from typing import Iterable, List, Union +from typing import List, Union from dateutil.parser._parser import ParserError import traceback @@ -123,7 +123,7 @@ def get_first_dataset_path(self) -> str | None: ): return self.data_service.dataset_paths[0] - def validate_single_rule(self, rule: dict, datasets: Iterable[SDTMDatasetMetadata]): + def validate_single_rule(self, rule: dict): results = {} rule["conditions"] = ConditionCompositeFactory.get_condition_composite( rule["conditions"] @@ -131,14 +131,13 @@ def validate_single_rule(self, rule: dict, datasets: Iterable[SDTMDatasetMetadat if rule.get("rule_type") == RuleTypes.JSONATA.value: results["json"] = self.validate_single_dataset( rule, - datasets, SDTMDatasetMetadata( name="json", full_path=self.get_first_dataset_path() ), ) else: total_errors = 0 - for dataset_metadata in datasets: + for dataset_metadata in self.data_service.get_datasets(): if ( self.max_errors_per_rule and not self.errors_per_dataset_flag @@ -155,7 +154,6 @@ def validate_single_rule(self, rule: dict, datasets: Iterable[SDTMDatasetMetadat continue # handling split datasets dataset_results = self.validate_single_dataset( rule, - datasets, dataset_metadata, ) if self.errors_per_dataset_flag and self.max_errors_per_rule: @@ -209,7 +207,6 @@ def _truncate_dataset_errors(self, dataset_results, rule, dataset_metadata): def validate_single_dataset( self, rule: dict, - datasets: Iterable[SDTMDatasetMetadata], dataset_metadata: SDTMDatasetMetadata, ) -> List[Union[dict, str]]: """ @@ -218,20 +215,18 @@ def validate_single_dataset( """ logger.info( f"Validating {dataset_metadata.name}. " - f"rule={rule}. dataset_path={dataset_metadata.full_path}. datasets={datasets}." + f"rule={rule}. dataset_path={dataset_metadata.full_path}. datasets={self.data_service.get_datasets()}." ) try: is_suitable, reason = self.rule_processor.is_suitable_for_validation( rule, dataset_metadata, - datasets, self.standard, - self.standard_substandard, self.use_case, ) if is_suitable: result: List[Union[dict, str]] = self.validate_rule( - rule, datasets, dataset_metadata + rule, dataset_metadata ) logger.info( f"Validated dataset {dataset_metadata.name}. Result = {result}" @@ -289,7 +284,6 @@ def validate_single_dataset( def get_dataset_builder( self, rule: dict, - datasets: Iterable[SDTMDatasetMetadata], dataset_metadata: SDTMDatasetMetadata, ): return builder_factory.get_service( @@ -300,7 +294,6 @@ def get_dataset_builder( data_processor=self.data_processor, rule_processor=self.rule_processor, dataset_metadata=dataset_metadata, - datasets=datasets, define_xml_path=self.define_xml_path, standard=self.standard, standard_version=self.standard_version, @@ -312,7 +305,6 @@ def get_dataset_builder( def validate_rule( self, rule: dict, - datasets: Iterable[SDTMDatasetMetadata], dataset_metadata: SDTMDatasetMetadata, ) -> List[Union[dict, str]]: """ @@ -320,7 +312,7 @@ def validate_rule( It defines a rule validator based on its type and calls it. """ kwargs = {} - builder = self.get_dataset_builder(rule, datasets, dataset_metadata) + builder = self.get_dataset_builder(rule, dataset_metadata) try: dataset = builder.get_dataset() except Exception as e: @@ -348,13 +340,12 @@ def validate_rule( kwargs["ct_packages"] = list(self.ct_packages) logger.info(f"Using dataset build by: {builder.__class__}") - return self.execute_rule(rule, dataset, datasets, dataset_metadata, **kwargs) + return self.execute_rule(rule, dataset, dataset_metadata, **kwargs) def execute_rule( self, rule: dict, dataset: DatasetInterface, - datasets: Iterable[SDTMDatasetMetadata], dataset_metadata: SDTMDatasetMetadata, value_level_metadata: List[dict] = None, variable_codelist_map: dict = None, @@ -383,12 +374,11 @@ def execute_rule( dataset_preprocessor = DatasetPreprocessor( dataset, dataset_metadata, self.data_service, self.cache ) - dataset = dataset_preprocessor.preprocess(rule_copy, datasets) + dataset = dataset_preprocessor.preprocess(rule_copy) dataset = self.rule_processor.perform_rule_operations( rule_copy, dataset, dataset_metadata, - datasets, standard=self.standard, standard_version=self.standard_version, standard_substandard=self.standard_substandard, diff --git a/cdisc_rules_engine/services/data_services/base_data_service.py b/cdisc_rules_engine/services/data_services/base_data_service.py index a47476868..b1566a270 100644 --- a/cdisc_rules_engine/services/data_services/base_data_service.py +++ b/cdisc_rules_engine/services/data_services/base_data_service.py @@ -178,7 +178,6 @@ def check_filepath(self, dataset_names: List[str]) -> List: def get_dataset_class( self, dataset: DatasetInterface, - datasets: Iterable[SDTMDatasetMetadata], dataset_metadata: SDTMDatasetMetadata, ) -> Optional[str]: if self.library_metadata.standard_metadata: @@ -189,7 +188,7 @@ def get_dataset_class( name = class_data.get("name") if name: return convert_library_class_name_to_ct_class(name) - return self.handle_custom_domains(dataset, dataset_metadata, datasets) + return self.handle_custom_domains(dataset, dataset_metadata) def get_data_structure( self, @@ -278,7 +277,6 @@ def handle_custom_domains( self, dataset: DatasetInterface, dataset_metadata: SDTMDatasetMetadata, - datasets: Iterable[SDTMDatasetMetadata], ): if self._contains_topic_variable(dataset, dataset_metadata.domain, "TERM"): return EVENTS @@ -291,14 +289,11 @@ def handle_custom_domains( return FINDINGS_ABOUT return FINDINGS if dataset_metadata.is_ap: - return self._get_associated_persons_inherit_class( - datasets, dataset_metadata - ) + return self._get_associated_persons_inherit_class(dataset_metadata) return None def _get_associated_persons_inherit_class( self, - datasets: Iterable[SDTMDatasetMetadata], dataset_metadata: SDTMDatasetMetadata, ): """ @@ -307,6 +302,7 @@ def _get_associated_persons_inherit_class( ap_suffix = dataset_metadata.ap_suffix if not ap_suffix: return None + datasets = self.get_datasets() if len(datasets) > 1: new_dataset_metadata: SDTMDatasetMetadata = search_in_list( datasets, lambda item: item.domain == ap_suffix @@ -319,7 +315,6 @@ def _get_associated_persons_inherit_class( raise ValueError("Filename for domain doesn't exist") return self.get_dataset_class( new_dataset, - datasets, new_dataset_metadata, ) else: diff --git a/cdisc_rules_engine/services/data_services/local_data_service.py b/cdisc_rules_engine/services/data_services/local_data_service.py index f34123972..fefb23b56 100644 --- a/cdisc_rules_engine/services/data_services/local_data_service.py +++ b/cdisc_rules_engine/services/data_services/local_data_service.py @@ -150,14 +150,12 @@ def get_dataset(self, dataset_name: str, **params) -> DatasetInterface: return df @cached_dataset(DatasetTypes.VARIABLES_METADATA.value) - def get_variables_metadata( - self, dataset_name: str, datasets: list, **params - ) -> DatasetInterface: + def get_variables_metadata(self, dataset_name: str, **params) -> DatasetInterface: """ Gets dataset from blob storage and returns metadata of a certain variable. """ metadata: dict = self.__read_metadata( - self._datasets_metadata[dataset_name].full_path, datasets=datasets + self._datasets_metadata[dataset_name].full_path ) contents_metadata: dict = metadata["contents_metadata"] metadata_to_return: VariableMetadataContainer = VariableMetadataContainer( @@ -194,7 +192,6 @@ def get_dataset_by_type( def __read_metadata( self, dataset_path: str, - datasets: Optional[Iterable[SDTMDatasetMetadata]] = None, ) -> dict: file_size = os.path.getsize(dataset_path) file_name = basename(dataset_path) @@ -203,8 +200,8 @@ def __read_metadata( "name": file_name, "file_size": file_size, } - if file_name.endswith(".parquet") and datasets: - for obj in datasets: + if file_name.endswith(".parquet") and self.get_datasets(): + for obj in self.get_datasets(): if obj.full_path == dataset_path: file_metadata = { "path": obj.original_path, @@ -255,7 +252,7 @@ def __get_dataset_metadata(self, dataset_path: str, **kwargs) -> Tuple[dict, dic Internal method that gets dataset metadata and converts file size if needed. """ - metadata: dict = self.__read_metadata(dataset_path, kwargs.get("datasets")) + metadata: dict = self.__read_metadata(dataset_path) file_metadata: dict = metadata["file_metadata"] size_unit: Optional[str] = kwargs.get("size_unit") if size_unit: # convert file size from bytes to desired unit if needed diff --git a/cdisc_rules_engine/services/reporting/report_factory.py b/cdisc_rules_engine/services/reporting/report_factory.py index ad388075e..6228b4ae3 100644 --- a/cdisc_rules_engine/services/reporting/report_factory.py +++ b/cdisc_rules_engine/services/reporting/report_factory.py @@ -1,10 +1,9 @@ -from typing import List, Type, Iterable +from typing import List, Type from cdisc_rules_engine.enums.report_types import ReportTypes from cdisc_rules_engine.interfaces import DataServiceInterface from cdisc_rules_engine.models.rule_validation_result import RuleValidationResult from cdisc_rules_engine.models.validation_args import Validation_args -from cdisc_rules_engine.models.sdtm_dataset_metadata import SDTMDatasetMetadata from cdisc_rules_engine.services.reporting.base_report_data import ( BaseReportData, ) @@ -31,15 +30,14 @@ class ReportFactory: def __init__( self, - datasets: Iterable[SDTMDatasetMetadata], results: List[RuleValidationResult], elapsed_time: float, args: Validation_args, data_service: DataServiceInterface, dictionary_versions, ): - self._datasets = datasets - self._dataset_paths = [dataset.full_path for dataset in datasets] + self._datasets = data_service.get_datasets() + self._dataset_paths = [dataset.full_path for dataset in self._datasets] self._results = results self._elapsed_time = elapsed_time self._args = args diff --git a/cdisc_rules_engine/utilities/dataset_preprocessor.py b/cdisc_rules_engine/utilities/dataset_preprocessor.py index b8a54f725..620f8b7d5 100644 --- a/cdisc_rules_engine/utilities/dataset_preprocessor.py +++ b/cdisc_rules_engine/utilities/dataset_preprocessor.py @@ -1,4 +1,4 @@ -from typing import Iterable, List, Union +from typing import List, Union from cdisc_rules_engine.models.dataset.dataset_interface import DatasetInterface from cdisc_rules_engine.models.sdtm_dataset_metadata import SDTMDatasetMetadata @@ -45,9 +45,7 @@ def __init__( self._data_service = data_service self._rule_processor = RuleProcessor(self._data_service, cache_service) - def preprocess( # noqa - self, rule: dict, datasets: Iterable[SDTMDatasetMetadata] - ) -> DatasetInterface: + def preprocess(self, rule: dict) -> DatasetInterface: # noqa """ Preprocesses the dataset by merging it with the datasets from the provided rule. @@ -78,7 +76,7 @@ def preprocess( # noqa ): dataset_metadatas: list[SDTMDatasetMetadata] = [ item - for item in datasets + for item in self._data_service.get_datasets() if (item.domain == self._dataset_metadata.rdomain) ] # find parent of other datasets @@ -87,7 +85,7 @@ def preprocess( # noqa or domain_name == self._dataset_metadata.name ): dataset_metadatas: list[SDTMDatasetMetadata] = ( - self._find_parent_dataset(datasets, domain_details) + self._find_parent_dataset(domain_details) ) else: if self._is_split_domain(domain_name): @@ -97,7 +95,7 @@ def preprocess( # noqa ) dataset_metadatas: list[SDTMDatasetMetadata] = [ item - for item in datasets + for item in self._data_service.get_datasets() if ( item.domain == domain_name or item.name == domain_name @@ -157,7 +155,6 @@ def preprocess( # noqa right_dataset=other_dataset, right_dataset_domain_name=dataset_metadata.domain, match_keys=domain_details.get("match_key"), - datasets=datasets, ) merged_domains.add(dataset_metadata.domain) else: @@ -166,7 +163,6 @@ def preprocess( # noqa left_dataset_domain_name=self._dataset_metadata.domain, right_dataset=other_dataset, right_dataset_domain_details=domain_details, - datasets=datasets, ) merged_domains.add( dataset_metadata.domain @@ -175,20 +171,18 @@ def preprocess( # noqa ) return result - def _find_parent_dataset( - self, datasets: Iterable[SDTMDatasetMetadata], domain_details: dict - ) -> SDTMDatasetMetadata: + def _find_parent_dataset(self, domain_details: dict) -> SDTMDatasetMetadata: matching_datasets = [] try: if "RDOMAIN" in self._dataset.columns: rdomain_column = self._dataset.data["RDOMAIN"] unique_domains = set(rdomain_column.unique()) - for dataset in datasets: + for dataset in self._data_service.get_datasets(): if dataset.domain in unique_domains: matching_datasets.append(dataset) else: match_keys = domain_details.get("match_key") - for dataset in datasets: + for dataset in self._data_service.get_datasets(): has_all_match_keys = all( match_key in dataset.first_record for match_key in match_keys ) @@ -217,7 +211,6 @@ def _child_merge_datasets( right_dataset: DatasetInterface, right_dataset_domain_name: str, match_keys: List[str], - datasets: Iterable[SDTMDatasetMetadata] = None, ) -> DatasetInterface: is_supplemental, rdomain_dataset = self._classify_dataset( left_dataset, self._dataset_metadata @@ -508,7 +501,6 @@ def _merge_datasets( # noqa left_dataset_domain_name: str, right_dataset: DatasetInterface, right_dataset_domain_details: dict, - datasets: List[dict], ) -> DatasetInterface: """ Merges datasets on their match keys. @@ -537,7 +529,7 @@ def _merge_datasets( # noqa left_dataset=left_dataset, left_dataset_domain_name=left_dataset_domain_name, relrec_dataset=right_dataset, - datasets=datasets, + datasets=self._data_service.get_datasets(), dataset_preprocessor=self, wildcard=right_dataset_domain_details.get("wildcard"), ) diff --git a/cdisc_rules_engine/utilities/rule_processor.py b/cdisc_rules_engine/utilities/rule_processor.py index 3d6f148f4..71fccd2a0 100644 --- a/cdisc_rules_engine/utilities/rule_processor.py +++ b/cdisc_rules_engine/utilities/rule_processor.py @@ -2,7 +2,7 @@ import copy from os.path import dirname -from typing import Iterable, List, Optional, Union, Tuple +from typing import List, Optional, Union, Tuple from cdisc_rules_engine.enums.rule_types import RuleTypes from cdisc_rules_engine.interfaces.cache_service_interface import ( CacheServiceInterface, @@ -212,7 +212,6 @@ def rule_applies_to_data_structure( def rule_applies_to_class( self, rule, - datasets: Iterable[SDTMDatasetMetadata], dataset_metadata: SDTMDatasetMetadata, ): """ @@ -240,11 +239,10 @@ def rule_applies_to_class( if ALL_KEYWORD in included_classes: return True variables = self.data_service.get_variables_metadata( - dataset_name=dataset_metadata.name, datasets=datasets + dataset_name=dataset_metadata.name ).data.variable_name class_name = self.data_service.get_dataset_class( variables, - datasets, dataset_metadata, ) if (class_name not in included_classes) and not ( @@ -253,11 +251,10 @@ def rule_applies_to_class( is_included = False if excluded_classes: variables = self.data_service.get_variables_metadata( - dataset_name=dataset_metadata.name, datasets=datasets + dataset_name=dataset_metadata.name ).data.variable_name class_name = self.data_service.get_dataset_class( variables, - datasets, dataset_metadata, ) if class_name and ( @@ -269,10 +266,8 @@ def rule_applies_to_class( def rule_applies_to_use_case( self, - dataset_metadata: SDTMDatasetMetadata, rule: dict, standard: str, - standard_substandard: str, use_case: str, ) -> bool: if standard.lower() != "tig": @@ -321,7 +316,6 @@ def perform_rule_operations( rule: dict, dataset: DatasetInterface, dataset_metadata: SDTMDatasetMetadata, - datasets: Iterable[SDTMDatasetMetadata], standard: str, standard_version: str, standard_substandard: str, @@ -373,7 +367,6 @@ def perform_rule_operations( define_xml_path=kwargs.get("define_xml_path"), dataframe=dataset_copy, dataset_path=dataset_metadata.full_path, - datasets=datasets, delimiter=operation.get("delimiter"), dictionary_term_type=operation.get("dictionary_term_type"), directory_path=dirname(dataset_metadata.full_path), @@ -464,7 +457,7 @@ def _execute_operation( ): # download other domain dataset_metadata: DatasetMetadata = search_in_list( - operation_params.datasets, + self.data_service.get_datasets(), lambda item: ( item.unsplit_name == operation_params.domain or ( @@ -576,7 +569,7 @@ def _preprocess_operation_params( if domain_details.is_supp: current_domain = domain_details.rdomain for param_name in vars(params_copy): - if param_name in ("datasets", "dataframe"): + if param_name in ("dataframe"): continue param_value = getattr(params_copy, param_name) updated_value = self._replace_wildcards_in_value( @@ -643,9 +636,7 @@ def is_suitable_for_validation( self, rule: dict, dataset_metadata: SDTMDatasetMetadata, - datasets: Iterable[SDTMDatasetMetadata], standard, - standard_substandard: str, use_case: str, ) -> Tuple[bool, str]: """Check if rule is suitable and return reason if not""" @@ -661,10 +652,8 @@ def is_suitable_for_validation( ): return self.log_suitable_for_validation(rule_id, dataset_name) if not self.rule_applies_to_use_case( - dataset_metadata, rule, standard, - standard_substandard, use_case, ): reason = ( @@ -687,7 +676,7 @@ def is_suitable_for_validation( ) logger.info(f"is_suitable_for_validation. {reason}, result=False") return False, reason - if not self.rule_applies_to_class(rule, datasets, dataset_metadata): + if not self.rule_applies_to_class(rule, dataset_metadata): reason = ( f"Rule skipped - doesn't apply to class for " f"rule id={rule_id}, dataset={dataset_name}" diff --git a/cdisc_rules_engine/utilities/sdtm_utilities.py b/cdisc_rules_engine/utilities/sdtm_utilities.py index d15a6c0e8..0e464490e 100644 --- a/cdisc_rules_engine/utilities/sdtm_utilities.py +++ b/cdisc_rules_engine/utilities/sdtm_utilities.py @@ -107,7 +107,6 @@ def get_variables_metadata_from_standard( # noqa library_metadata, data_service, dataset_metadata: SDTMDatasetMetadata, - datasets: Iterable[SDTMDatasetMetadata], ): add_AP = False domain = dataset_metadata.unsplit_name @@ -140,7 +139,6 @@ def get_variables_metadata_from_standard( # noqa class_name = data_service.handle_custom_domains( data_service.get_dataset(dataset_name=dataset_metadata.name), dataset_metadata, - datasets, ) model_class_details = get_class_metadata(model_details, class_name) # Both custom and standard General Observations pull from model @@ -341,7 +339,6 @@ def group_class_variables_by_role( def get_variables_metadata_from_standard_model( # noqa dataframe, - datasets: Iterable[SDTMDatasetMetadata], data_service: DataServiceInterface, library_metadata: LibraryMetadataContainer, dataset_metadata: SDTMDatasetMetadata, @@ -379,9 +376,7 @@ def get_variables_metadata_from_standard_model( # noqa IG_class_details.get("name") ) else: - class_name = data_service.handle_custom_domains( - dataframe, dataset_metadata, datasets - ) + class_name = data_service.handle_custom_domains(dataframe, dataset_metadata) if class_name in DETECTABLE_CLASSES: model_class_details = get_class_metadata(model_details, class_name) ( diff --git a/scripts/run_validation.py b/scripts/run_validation.py index 5f10dd3e6..3e476bd73 100644 --- a/scripts/run_validation.py +++ b/scripts/run_validation.py @@ -101,7 +101,7 @@ def validate_single_rule( errors_per_dataset_flag=per_dataset_flag, encoding=args.encoding, ) - results = engine.validate_single_rule(rule, datasets) + results = engine.validate_single_rule(rule) results = list(itertools.chain(*results.values())) if args.progress == ProgressParameterOptions.VERBOSE_OUTPUT.value: engine_logger.log(f"{rule['core_id']} validation complete") @@ -128,14 +128,14 @@ def initialize_logger(disabled, log_level): def _convert_datasets_to_parquet_if_needed( - data_service, datasets, created_files, large_dataset_validation: bool + data_service, created_files, large_dataset_validation: bool ): if not (large_dataset_validation and data_service.standard != "usdm"): return engine_logger.warning( "Large datasets must use parquet format, converting all datasets to parquet" ) - for dataset in datasets: + for dataset in data_service.get_datasets(): file_path = dataset.full_path if file_path.endswith(".parquet"): continue @@ -188,7 +188,6 @@ def run_validation(args: Validation_args): datasets = data_service.get_datasets() _convert_datasets_to_parquet_if_needed( data_service, - datasets, created_files, large_dataset_validation, ) @@ -223,7 +222,7 @@ def run_validation(args: Validation_args): elapsed_time = end - start engine_logger.info("Done Rule execution, creating reports") reporting_factory = ReportFactory( - datasets, results, elapsed_time, args, data_service, dictionary_versions + results, elapsed_time, args, data_service, dictionary_versions ) reporting_services: List[BaseReport] = reporting_factory.get_report_services() output_files = [] @@ -324,5 +323,5 @@ def run_single_rule_validation( engine.rule_processor = RuleProcessor(data_service, cache, library_metadata) engine.data_processor = DataProcessor(data_service, cache) rule = Rule.from_cdisc_metadata(rule) - results = engine.validate_single_rule(rule, datasets) + results = engine.validate_single_rule(rule) return results diff --git a/tests/conftest.py b/tests/conftest.py index c25c9684a..013b3f381 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1278,7 +1278,6 @@ def operation_params() -> OperationParams: domain="domain", dataset_path="dataset_path", directory_path="directory_path", - datasets=[], standard="standard", standard_version="standard_version", external_dictionaries=ExternalDictionariesContainer( diff --git a/tests/unit/test_dataset_builders/test_base_dataset_builder.py b/tests/unit/test_dataset_builders/test_base_dataset_builder.py index 29ba4b675..3660714db 100644 --- a/tests/unit/test_dataset_builders/test_base_dataset_builder.py +++ b/tests/unit/test_dataset_builders/test_base_dataset_builder.py @@ -77,7 +77,6 @@ def create_builder_instance(dataset_metadata): cache_service=MagicMock(), rule_processor=MagicMock(), data_processor=MagicMock(), - datasets=[dataset_metadata], dataset_metadata=dataset_metadata, define_xml_path="/path/to/define.xml", standard="sdtmig", diff --git a/tests/unit/test_dataset_builders/test_content_metadata_dataset_builder.py b/tests/unit/test_dataset_builders/test_content_metadata_dataset_builder.py index 0abf83017..bb1ef4d89 100644 --- a/tests/unit/test_dataset_builders/test_content_metadata_dataset_builder.py +++ b/tests/unit/test_dataset_builders/test_content_metadata_dataset_builder.py @@ -298,7 +298,6 @@ def test_ContentMetadataDatasetBuilder_split_datasets(conditions): cache_service=None, rule_processor=processor, data_processor=None, - datasets=test_data.get("datasets", {}), dataset_metadata=SDTMDatasetMetadata( name="QSCG", ), @@ -326,7 +325,6 @@ def test_ContentMetadataDatasetBuilder_split_datasets(conditions): cache_service=None, rule_processor=processor, data_processor=None, - datasets=test_data.get("datasets", {}), dataset_metadata=SDTMDatasetMetadata( name="QSPG", ), diff --git a/tests/unit/test_dataset_builders/test_contents_dataset_builder.py b/tests/unit/test_dataset_builders/test_contents_dataset_builder.py index b6d2ec81e..78af9b2f2 100644 --- a/tests/unit/test_dataset_builders/test_contents_dataset_builder.py +++ b/tests/unit/test_dataset_builders/test_contents_dataset_builder.py @@ -62,8 +62,7 @@ def test_ContentDatasetBuilder_split_datasets(conditions): "conditions": ConditionCompositeFactory.get_condition_composite(conditions), } processor = RuleProcessor(mock_data_service, InMemoryCacheService()) - data_metadata = test_data - datasets = [DummyDataset(data) for data in data_metadata.get("datasets", [])] + datasets = [DummyDataset(data) for data in test_data.get("datasets", [])] expected_output = { "STUDYID": { "0": "CDISCCORE01", @@ -377,7 +376,6 @@ def test_ContentDatasetBuilder_split_datasets(conditions): cache_service=None, rule_processor=processor, data_processor=None, - datasets=datasets, dataset_metadata=DummyDataset(test_data.get("datasets", {})[0]), define_xml_path=None, standard="sdtmig", diff --git a/tests/unit/test_dataset_builders/test_contents_define_dataset_builder.py b/tests/unit/test_dataset_builders/test_contents_define_dataset_builder.py index 2dc7b98cf..18e5da039 100644 --- a/tests/unit/test_dataset_builders/test_contents_define_dataset_builder.py +++ b/tests/unit/test_dataset_builders/test_contents_define_dataset_builder.py @@ -829,9 +829,6 @@ def test_contents_define_dataset_builder(dataset_path): cache_service=None, rule_processor=RuleProcessor(mock_data_service, InMemoryCacheService()), data_processor=None, - datasets=[ - SDTMDatasetMetadata(**dataset) for dataset in dataset_metadata.values() - ], dataset_metadata=SDTMDatasetMetadata(**dataset_metadata[dataset_path]), define_xml_path=None, standard="sdtmig", diff --git a/tests/unit/test_dataset_builders/test_contents_define_variables_dataset_builder.py b/tests/unit/test_dataset_builders/test_contents_define_variables_dataset_builder.py index 915dd0d98..53dcaf781 100644 --- a/tests/unit/test_dataset_builders/test_contents_define_variables_dataset_builder.py +++ b/tests/unit/test_dataset_builders/test_contents_define_variables_dataset_builder.py @@ -102,7 +102,6 @@ def test_contents_define_variables_dataset_builder( cache_service=None, rule_processor=None, data_processor=None, - datasets=None, dataset_metadata=SDTMDatasetMetadata(name="TEST"), define_xml_path=None, standard="sdtmig", diff --git a/tests/unit/test_dataset_builders/test_contents_define_vlm_dataset_builder.py b/tests/unit/test_dataset_builders/test_contents_define_vlm_dataset_builder.py index f62413579..967cee656 100644 --- a/tests/unit/test_dataset_builders/test_contents_define_vlm_dataset_builder.py +++ b/tests/unit/test_dataset_builders/test_contents_define_vlm_dataset_builder.py @@ -124,7 +124,6 @@ def test_contents_define_vlm_dataset_builder( cache_service=None, rule_processor=None, data_processor=None, - datasets=None, dataset_metadata=SDTMDatasetMetadata(name="TEST"), define_xml_path=None, standard="sdtmig", diff --git a/tests/unit/test_dataset_builders/test_dataset_metadata_define_dataset_builder.py b/tests/unit/test_dataset_builders/test_dataset_metadata_define_dataset_builder.py index 005a095a8..aff6c25e8 100644 --- a/tests/unit/test_dataset_builders/test_dataset_metadata_define_dataset_builder.py +++ b/tests/unit/test_dataset_builders/test_dataset_metadata_define_dataset_builder.py @@ -124,7 +124,6 @@ def test_dataset_metadata_define_dataset_builder(dataset_path): cache_service=None, rule_processor=None, data_processor=None, - datasets=[data_metadata], dataset_metadata=SDTMDatasetMetadata(full_path=dataset_path), define_xml_path=None, standard="sdtmig", diff --git a/tests/unit/test_dataset_builders/test_define_variables_with_library_metadata.py b/tests/unit/test_dataset_builders/test_define_variables_with_library_metadata.py index c57f1e59c..812e4ca04 100644 --- a/tests/unit/test_dataset_builders/test_define_variables_with_library_metadata.py +++ b/tests/unit/test_dataset_builders/test_define_variables_with_library_metadata.py @@ -184,7 +184,6 @@ def test_define_variables_metadata_with_library_metadata_dataset_builder( cache_service=cache, rule_processor=None, data_processor=None, - datasets=[], dataset_metadata=DummyDataset( { "filename": "ae.xpt", diff --git a/tests/unit/test_dataset_builders/test_domain_presence_define_builder.py b/tests/unit/test_dataset_builders/test_domain_presence_define_builder.py index e8cb96ec0..28278d9b8 100644 --- a/tests/unit/test_dataset_builders/test_domain_presence_define_builder.py +++ b/tests/unit/test_dataset_builders/test_domain_presence_define_builder.py @@ -68,10 +68,18 @@ [ ( [ - MagicMock(unsplit_name="AE", filename="ae.xpt"), - MagicMock(unsplit_name="DM", filename="dm.xpt"), - MagicMock(unsplit_name="SE", filename="se.xpt"), - MagicMock(unsplit_name="EC", filename="ec.xpt"), + SDTMDatasetMetadata( + name="AE", first_record={"DOMAIN": "AE"}, filename="ae.xpt" + ), + SDTMDatasetMetadata( + name="DM", first_record={"DOMAIN": "DM"}, filename="dm.xpt" + ), + SDTMDatasetMetadata( + name="SE", first_record={"DOMAIN": "SE"}, filename="se.xpt" + ), + SDTMDatasetMetadata( + name="EC", first_record={"DOMAIN": "EC"}, filename="ec.xpt" + ), ], define_metadata, pd.DataFrame( @@ -153,9 +161,15 @@ ), ( [ - MagicMock(unsplit_name="AE", filename="ae.xpt"), - MagicMock(unsplit_name="DM", filename="dm.xpt"), - MagicMock(unsplit_name="EC", filename="ec.xpt"), + SDTMDatasetMetadata( + name="AE", first_record={"DOMAIN": "AE"}, filename="ae.xpt" + ), + SDTMDatasetMetadata( + name="DM", first_record={"DOMAIN": "DM"}, filename="dm.xpt" + ), + SDTMDatasetMetadata( + name="EC", first_record={"DOMAIN": "EC"}, filename="ec.xpt" + ), ], define_metadata, pd.DataFrame( @@ -322,11 +336,15 @@ def test_domain_list_with_define_dataset_builder( ): builder = DomainListWithDefineDatasetBuilder( rule=None, - data_service=DummyDataService(MagicMock(), MagicMock(), MagicMock(), data=[]), + data_service=DummyDataService( + MagicMock(), + MagicMock(), + MagicMock(), + data=mock_datasets, + ), cache_service=None, rule_processor=None, data_processor=None, - datasets=mock_datasets, dataset_metadata=SDTMDatasetMetadata(full_path="ae.xpt"), define_xml_path=None, standard="sdtmig", diff --git a/tests/unit/test_dataset_builders/test_json_schema_check_dataset_builder.py b/tests/unit/test_dataset_builders/test_json_schema_check_dataset_builder.py index 4def032e0..dee39886f 100644 --- a/tests/unit/test_dataset_builders/test_json_schema_check_dataset_builder.py +++ b/tests/unit/test_dataset_builders/test_json_schema_check_dataset_builder.py @@ -86,7 +86,6 @@ def test_json_schema_check_dataset_builder_valid(): cache_service=cache_service, rule_processor=MagicMock(), data_processor=MagicMock(), - datasets=[], dataset_metadata=MagicMock(name="test_dataset"), define_xml_path=None, standard="USDM", @@ -142,7 +141,6 @@ def test_json_schema_check_dataset_builder_invalid(): cache_service=cache_service, rule_processor=MagicMock(), data_processor=MagicMock(), - datasets=[], dataset_metadata=dataset_metadata, define_xml_path=None, standard="USDM", diff --git a/tests/unit/test_dataset_builders/test_values_dataset_metadata_builder.py b/tests/unit/test_dataset_builders/test_values_dataset_metadata_builder.py index ebf612664..563563f63 100644 --- a/tests/unit/test_dataset_builders/test_values_dataset_metadata_builder.py +++ b/tests/unit/test_dataset_builders/test_values_dataset_metadata_builder.py @@ -60,7 +60,6 @@ def test_build_with_dataset_metadata(mock_build): cache_service=InMemoryCacheService(), rule_processor=rule_processor_mock, data_processor=None, - datasets=[], dataset_metadata=SDTMDatasetMetadata( name="AE", first_record={"DOMAIN": "AE"} ), @@ -142,7 +141,6 @@ def test_build_split_datasets(mock_build): cache_service=InMemoryCacheService(), rule_processor=rule_processor_mock, data_processor=None, - datasets=[], dataset_metadata=None, define_xml_path="", standard="", diff --git a/tests/unit/test_dataset_builders/test_values_variables_metadata_builder.py b/tests/unit/test_dataset_builders/test_values_variables_metadata_builder.py index eac4d0372..266d82bbc 100644 --- a/tests/unit/test_dataset_builders/test_values_variables_metadata_builder.py +++ b/tests/unit/test_dataset_builders/test_values_variables_metadata_builder.py @@ -59,7 +59,6 @@ def test_build_with_variable_metadata(mock_build): cache_service=InMemoryCacheService(), rule_processor=None, data_processor=None, - datasets=[], dataset_metadata=SDTMDatasetMetadata( name="AE", first_record={"DOMAIN": "AE"} ), @@ -161,7 +160,6 @@ def test_concat_with_split_datasets(): cache_service=InMemoryCacheService(), rule_processor=None, data_processor=None, - datasets=[], dataset_metadata=SDTMDatasetMetadata(name="AE", first_record={"DOMAIN": "AE"}), define_xml_path="", standard="", diff --git a/tests/unit/test_dataset_builders/test_variables_metadata_with_define_and_library_dataset_builder.py b/tests/unit/test_dataset_builders/test_variables_metadata_with_define_and_library_dataset_builder.py index 85c7a58bd..341ddb0f0 100644 --- a/tests/unit/test_dataset_builders/test_variables_metadata_with_define_and_library_dataset_builder.py +++ b/tests/unit/test_dataset_builders/test_variables_metadata_with_define_and_library_dataset_builder.py @@ -140,7 +140,6 @@ def test_build_combined_metadata( cache_service=InMemoryCacheService(), rule_processor=None, data_processor=None, - datasets=[], dataset_metadata=SDTMDatasetMetadata( name="AE", first_record={"DOMAIN": "AE"}, diff --git a/tests/unit/test_dataset_builders/test_variables_metadata_with_library_metadata_dataset_builder.py b/tests/unit/test_dataset_builders/test_variables_metadata_with_library_metadata_dataset_builder.py index 1708785a4..084f76f4c 100644 --- a/tests/unit/test_dataset_builders/test_variables_metadata_with_library_metadata_dataset_builder.py +++ b/tests/unit/test_dataset_builders/test_variables_metadata_with_library_metadata_dataset_builder.py @@ -130,7 +130,6 @@ def test_variable_metadata_with_library_metadata_dataset_builder( cache_service=cache, rule_processor=None, data_processor=None, - datasets=[], dataset_metadata=DummyDataset( { "filename": "ae.xpt", @@ -353,7 +352,6 @@ def test_variable_metadata_with_library_metadata_dataset_builder_variable_only_i cache_service=cache, rule_processor=None, data_processor=None, - datasets=[], dataset_metadata=DummyDataset( { "filename": "ae.xpt", diff --git a/tests/unit/test_dataset_preprocessor.py b/tests/unit/test_dataset_preprocessor.py index af3e5777e..c58789760 100644 --- a/tests/unit/test_dataset_preprocessor.py +++ b/tests/unit/test_dataset_preprocessor.py @@ -6,7 +6,9 @@ from cdisc_rules_engine.services.cache.in_memory_cache_service import ( InMemoryCacheService, ) -from cdisc_rules_engine.services.data_services import LocalDataService +from cdisc_rules_engine.services.data_services.dummy_data_service import ( + DummyDataService, +) from cdisc_rules_engine.utilities.data_processor import DataProcessor from cdisc_rules_engine.utilities.dataset_preprocessor import DatasetPreprocessor from cdisc_rules_engine.constants.rule_constants import ALL_KEYWORD @@ -38,8 +40,9 @@ def test_preprocess_no_datasets_in_rule(dataset_rule_equal_to_error_objects: dic } ) ) - datasets = [SDTMDatasetMetadata(name="AE")] - data_service = LocalDataService(MagicMock(), MagicMock(), MagicMock()) + data_service = DummyDataService( + MagicMock(), MagicMock(), MagicMock(), data=[SDTMDatasetMetadata(name="AE")] + ) preprocessor = DatasetPreprocessor( dataset, SDTMDatasetMetadata(name="AE", full_path="path"), @@ -47,12 +50,12 @@ def test_preprocess_no_datasets_in_rule(dataset_rule_equal_to_error_objects: dic InMemoryCacheService(), ) preprocessed_dataset: PandasDataset = preprocessor.preprocess( - dataset_rule_equal_to_error_objects, datasets + dataset_rule_equal_to_error_objects ) assert preprocessed_dataset.data.equals(dataset.data) -@patch("cdisc_rules_engine.services.data_services.LocalDataService.get_dataset") +@patch("cdisc_rules_engine.services.data_services.DummyDataService.get_dataset") def test_rdomain_supplemental_dataset_idvar_matching(mock_get_dataset: MagicMock): supp_data = { "USUBJID": ["CDISC001", "CDISC002"], @@ -93,7 +96,17 @@ def test_rdomain_supplemental_dataset_idvar_matching(mock_get_dataset: MagicMock } ), } - data_service = LocalDataService(MagicMock(), MagicMock(), MagicMock()) + data_service = DummyDataService( + MagicMock(), + MagicMock(), + MagicMock(), + data=[ + SDTMDatasetMetadata( + first_record={"DOMAIN": "LB"}, + filename="lb.xpt", + ) + ], + ) preprocessor = DatasetPreprocessor( supp_dataset, SDTMDatasetMetadata( @@ -104,13 +117,7 @@ def test_rdomain_supplemental_dataset_idvar_matching(mock_get_dataset: MagicMock data_service, InMemoryCacheService(), ) - datasets = [ - SDTMDatasetMetadata( - first_record={"DOMAIN": "LB"}, - filename="lb.xpt", - ) - ] - result = preprocessor.preprocess(rule, datasets) + result = preprocessor.preprocess(rule) assert len(result.data) == 2 assert "USUBJID" in result.data.columns assert "QVAL" in result.data.columns @@ -121,7 +128,7 @@ def test_rdomain_supplemental_dataset_idvar_matching(mock_get_dataset: MagicMock assert "Cholesterol" in matched_records["LBTEST"].values -@patch("cdisc_rules_engine.services.data_services.LocalDataService.get_dataset") +@patch("cdisc_rules_engine.services.data_services.DummyDataService.get_dataset") def test_rdomain_integer_idvar_matching(mock_get_dataset: MagicMock): supp_data = { "USUBJID": ["CDISC001", "CDISC002"], @@ -161,7 +168,17 @@ def test_rdomain_integer_idvar_matching(mock_get_dataset: MagicMock): } ), } - data_service = LocalDataService(MagicMock(), MagicMock(), MagicMock()) + data_service = DummyDataService( + MagicMock(), + MagicMock(), + MagicMock(), + data=[ + SDTMDatasetMetadata( + first_record={"DOMAIN": "VS"}, + filename="vs.xpt", + ) + ], + ) preprocessor = DatasetPreprocessor( supp_dataset, SDTMDatasetMetadata( @@ -172,13 +189,7 @@ def test_rdomain_integer_idvar_matching(mock_get_dataset: MagicMock): data_service, InMemoryCacheService(), ) - datasets = [ - SDTMDatasetMetadata( - first_record={"DOMAIN": "VS"}, - filename="vs.xpt", - ) - ] - result = preprocessor.preprocess(rule, datasets) + result = preprocessor.preprocess(rule) assert len(result.data) == 2 assert "QVAL" in result.data.columns assert "VSTEST" in result.data.columns @@ -188,7 +199,7 @@ def test_rdomain_integer_idvar_matching(mock_get_dataset: MagicMock): assert "Height" in matched_records["VSTEST"].values -@patch("cdisc_rules_engine.services.data_services.LocalDataService.get_dataset") +@patch("cdisc_rules_engine.services.data_services.DummyDataService.get_dataset") def test_rdomain_no_matches_found(mock_get_dataset: MagicMock): supp_data = { "USUBJID": ["CDISC001"], @@ -227,7 +238,17 @@ def test_rdomain_no_matches_found(mock_get_dataset: MagicMock): } ), } - data_service = LocalDataService(MagicMock(), MagicMock(), MagicMock()) + data_service = DummyDataService( + MagicMock(), + MagicMock(), + MagicMock(), + data=[ + SDTMDatasetMetadata( + first_record={"DOMAIN": "AE"}, + filename="ae.xpt", + ) + ], + ) preprocessor = DatasetPreprocessor( supp_dataset, SDTMDatasetMetadata( @@ -238,13 +259,7 @@ def test_rdomain_no_matches_found(mock_get_dataset: MagicMock): data_service, InMemoryCacheService(), ) - datasets = [ - SDTMDatasetMetadata( - first_record={"DOMAIN": "AE"}, - filename="ae.xpt", - ) - ] - result = preprocessor.preprocess(rule, datasets) + result = preprocessor.preprocess(rule) assert len(result.data) == 1 assert result.data.iloc[0]["USUBJID"] == "CDISC001" assert result.data.iloc[0]["QVAL"] == "AE999" @@ -252,7 +267,7 @@ def test_rdomain_no_matches_found(mock_get_dataset: MagicMock): assert pd.isna(result.data.iloc[0]["AETERM"]) -@patch("cdisc_rules_engine.services.data_services.LocalDataService.get_dataset") +@patch("cdisc_rules_engine.services.data_services.DummyDataService.get_dataset") def test_rdomain_combined_standard_and_idvar_matching(mock_get_dataset: MagicMock): supp_data = { "USUBJID": ["CDISC001", "CDISC002", "CDISC003"], @@ -294,7 +309,17 @@ def test_rdomain_combined_standard_and_idvar_matching(mock_get_dataset: MagicMoc } ), } - data_service = LocalDataService(MagicMock(), MagicMock(), MagicMock()) + data_service = DummyDataService( + MagicMock(), + MagicMock(), + MagicMock(), + data=[ + SDTMDatasetMetadata( + first_record={"DOMAIN": "LB"}, + filename="lb.xpt", + ) + ], + ) preprocessor = DatasetPreprocessor( supp_dataset, SDTMDatasetMetadata( @@ -305,13 +330,7 @@ def test_rdomain_combined_standard_and_idvar_matching(mock_get_dataset: MagicMoc data_service, InMemoryCacheService(), ) - datasets = [ - SDTMDatasetMetadata( - first_record={"DOMAIN": "LB"}, - filename="lb.xpt", - ) - ] - result = preprocessor.preprocess(rule, datasets) + result = preprocessor.preprocess(rule) assert len(result.data) == 3 assert "LBTEST" in result.data.columns matched_records = result.data[result.data["LBTEST"].notna()] @@ -507,7 +526,7 @@ def test_rdomain_combined_standard_and_idvar_matching(mock_get_dataset: MagicMoc ), ], ) -@patch("cdisc_rules_engine.services.data_services.LocalDataService.get_dataset") +@patch("cdisc_rules_engine.services.data_services.DummyDataService.get_dataset") def test_preprocess( mock_get_dataset: MagicMock, dataset_rule_equal_to: dict, @@ -630,7 +649,19 @@ def test_preprocess( for ds in dataset_rule_equal_to["datasets"]: ds["join_type"] = join_type - data_service = LocalDataService(MagicMock(), MagicMock(), MagicMock()) + data_service = DummyDataService( + MagicMock(), + MagicMock(), + MagicMock(), + data=[ + SDTMDatasetMetadata( + name="AE", first_record={"DOMAIN": "AE"}, filename="ae.xpt" + ), + SDTMDatasetMetadata( + name="TS", first_record={"DOMAIN": "TS"}, filename="ts.xpt" + ), + ], + ) preprocessor = DatasetPreprocessor( ec_dataset, SDTMDatasetMetadata( @@ -641,17 +672,7 @@ def test_preprocess( data_service, InMemoryCacheService(), ) - preprocessed_dataset: pd.DataFrame = preprocessor.preprocess( - dataset_rule_equal_to, - [ - SDTMDatasetMetadata( - name="AE", first_record={"DOMAIN": "AE"}, filename="ae.xpt" - ), - SDTMDatasetMetadata( - name="TS", first_record={"DOMAIN": "TS"}, filename="ts.xpt" - ), - ], - ) + preprocessed_dataset: pd.DataFrame = preprocessor.preprocess(dataset_rule_equal_to) assert preprocessed_dataset.data.equals(expected_dataset.data) @@ -750,7 +771,7 @@ def test_preprocess( ), ], ) -@patch("cdisc_rules_engine.services.data_services.LocalDataService.get_dataset") +@patch("cdisc_rules_engine.services.data_services.DummyDataService.get_dataset") def test_preprocess_relrec_dataset( mock_get_dataset: MagicMock, relrec: dict, expected: dict ): @@ -874,9 +895,15 @@ def test_preprocess_relrec_dataset( return_value=["--SEQ", "--STDY"] ) # execute operation - data_service = LocalDataService.get_instance( + data_service = DummyDataService.get_instance( cache_service=cache, config=ConfigService(), + data=[ + SDTMDatasetMetadata( + name="AE", first_record={"DOMAIN": "AE"}, filename="ae.xpt" + ), + SDTMDatasetMetadata(name="RELREC", filename="relrec.xpt"), + ], ) data_service.library_metadata = LibraryMetadataContainer() @@ -890,20 +917,12 @@ def test_preprocess_relrec_dataset( data_service, InMemoryCacheService(), ) - preprocessed_dataset: pd.DataFrame = preprocessor.preprocess( - relrec_rule, - [ - SDTMDatasetMetadata( - name="AE", first_record={"DOMAIN": "AE"}, filename="ae.xpt" - ), - SDTMDatasetMetadata(name="RELREC", filename="relrec.xpt"), - ], - ) + preprocessed_dataset: pd.DataFrame = preprocessor.preprocess(relrec_rule) expected_dataset = PandasDataset(pd.DataFrame.from_dict(expected)) assert preprocessed_dataset.data.equals(expected_dataset.data) -@patch("cdisc_rules_engine.services.data_services.LocalDataService.get_dataset") +@patch("cdisc_rules_engine.services.data_services.DummyDataService.get_dataset") def test_preprocess_with_merge_comparison( mock_get_dataset: MagicMock, dataset_rule_equal_to_compare_same_value: dict, @@ -954,7 +973,19 @@ def test_preprocess_with_merge_comparison( dataset_name ] - data_service = LocalDataService(MagicMock(), MagicMock(), MagicMock()) + data_service = DummyDataService( + MagicMock(), + MagicMock(), + MagicMock(), + data=[ + SDTMDatasetMetadata( + name="AE", first_record={"DOMAIN": "AE"}, filename="ae.xpt" + ), + SDTMDatasetMetadata( + name="EC", first_record={"DOMAIN": "EC"}, filename="ec.xpt" + ), + ], + ) preprocessor = DatasetPreprocessor( target_dataset, SDTMDatasetMetadata( @@ -966,15 +997,7 @@ def test_preprocess_with_merge_comparison( InMemoryCacheService(), ) result: pd.DataFrame = preprocessor.preprocess( - rule=dataset_rule_equal_to_compare_same_value, - datasets=[ - SDTMDatasetMetadata( - name="AE", first_record={"DOMAIN": "AE"}, filename="ae.xpt" - ), - SDTMDatasetMetadata( - name="EC", first_record={"DOMAIN": "EC"}, filename="ec.xpt" - ), - ], + rule=dataset_rule_equal_to_compare_same_value ) assert "NOTVISIT" in result assert result["NOTVISIT"].iloc[0] == 12 @@ -982,7 +1005,7 @@ def test_preprocess_with_merge_comparison( assert result["AE.VISIT"].iloc[0] == 24 -@patch("cdisc_rules_engine.services.data_services.LocalDataService.get_dataset") +@patch("cdisc_rules_engine.services.data_services.DummyDataService.get_dataset") def test_preprocess_supp_with_blank_idvar_idvarval(mock_get_dataset): """ Test preprocessing when SUPP dataset has blank IDVAR and IDVARVAL. @@ -1006,7 +1029,16 @@ def test_preprocess_supp_with_blank_idvar_idvarval(mock_get_dataset): supp_dataset = PandasDataset(pd.DataFrame(supp_data)) mock_get_dataset.return_value = supp_dataset - data_service = LocalDataService(MagicMock(), MagicMock(), MagicMock()) + data_service = DummyDataService( + MagicMock(), + MagicMock(), + MagicMock(), + data=[ + SDTMDatasetMetadata( + name="SUPPAE", first_record={"RDOMAIN": "AE"}, filename="suppae.xpt" + ) + ], + ) preprocessor = DatasetPreprocessor( main_dataset, SDTMDatasetMetadata(name="AE", first_record={"DOMAIN": "AE"}, full_path="path"), @@ -1033,12 +1065,9 @@ def test_preprocess_supp_with_blank_idvar_idvarval(mock_get_dataset): } ), } - datasets = [ - SDTMDatasetMetadata( - name="SUPPAE", first_record={"RDOMAIN": "AE"}, filename="suppae.xpt" - ) - ] - result = preprocessor.preprocess(rule, datasets) + result = preprocessor.preprocess( + rule, + ) assert len(result.data) == 2 assert "AESPID" in result.data.columns assert "QNAM" not in result.data.columns @@ -1053,7 +1082,7 @@ def test_preprocess_supp_with_blank_idvar_idvarval(mock_get_dataset): ) -@patch("cdisc_rules_engine.services.data_services.LocalDataService.get_dataset") +@patch("cdisc_rules_engine.services.data_services.DummyDataService.get_dataset") def test_preprocess_supp_wildcard_matches_all_supp_datasets( mock_get_dataset: MagicMock, ): @@ -1109,14 +1138,18 @@ def test_preprocess_supp_wildcard_matches_all_supp_datasets( } ), } - datasets = [ - SDTMDatasetMetadata( - name="SUPPAE", - first_record={"RDOMAIN": "AE"}, - filename="suppae.xpt", - ), - ] - data_service = LocalDataService(MagicMock(), MagicMock(), MagicMock()) + data_service = DummyDataService( + MagicMock(), + MagicMock(), + MagicMock(), + data=[ + SDTMDatasetMetadata( + name="SUPPAE", + first_record={"RDOMAIN": "AE"}, + filename="suppae.xpt", + ), + ], + ) preprocessor = DatasetPreprocessor( ae_dataset, SDTMDatasetMetadata( @@ -1128,7 +1161,7 @@ def test_preprocess_supp_wildcard_matches_all_supp_datasets( InMemoryCacheService(), ) - result = preprocessor.preprocess(rule_with_supp_wildcard, datasets) + result = preprocessor.preprocess(rule_with_supp_wildcard) assert len(result.data) == 2 assert "RDOMAIN" in result.data.columns assert "AESPID" in result.data.columns @@ -1137,7 +1170,7 @@ def test_preprocess_supp_wildcard_matches_all_supp_datasets( assert result.data.loc[1, "AESEV"] == "MILD" -@patch("cdisc_rules_engine.services.data_services.LocalDataService.get_dataset") +@patch("cdisc_rules_engine.services.data_services.DummyDataService.get_dataset") def test_preprocess_specific_suppae_dataset( mock_get_dataset: MagicMock, ): @@ -1188,15 +1221,19 @@ def test_preprocess_specific_suppae_dataset( } ), } - datasets = [ - SDTMDatasetMetadata( - name="SUPPAE", - first_record={"RDOMAIN": "AE"}, - filename="suppae.xpt", - ), - ] - data_service = LocalDataService(MagicMock(), MagicMock(), MagicMock()) + data_service = DummyDataService( + MagicMock(), + MagicMock(), + MagicMock(), + data=[ + SDTMDatasetMetadata( + name="SUPPAE", + first_record={"RDOMAIN": "AE"}, + filename="suppae.xpt", + ), + ], + ) preprocessor = DatasetPreprocessor( ae_dataset, SDTMDatasetMetadata(name="AE", first_record={"DOMAIN": "AE"}, full_path="path"), @@ -1204,7 +1241,7 @@ def test_preprocess_specific_suppae_dataset( InMemoryCacheService(), ) - result = preprocessor.preprocess(rule_with_specific_supp, datasets) + result = preprocessor.preprocess(rule_with_specific_supp) assert len(result.data) == 1 assert "AESPID" in result.data.columns @@ -1265,7 +1302,7 @@ def test_data_processor_groups_qnam_suppdm_qvals(suppdm_with_race): assert suppdm_df.loc[0, ["RACE1", "RACE2", "RACE3"]].notna().all() -@patch("cdisc_rules_engine.services.data_services.LocalDataService.get_dataset") +@patch("cdisc_rules_engine.services.data_services.DummyDataService.get_dataset") def test_dm_merged_with_suppdm_without_dupes( mock_get_dataset, suppdm_with_race, rule_with_specific_supp ): @@ -1293,14 +1330,16 @@ def test_dm_merged_with_suppdm_without_dupes( dm_ds = PandasDataset(pd.DataFrame(dm)) assert suppdm_with_race.data.shape[0] == 3 - data_service = LocalDataService(MagicMock(), MagicMock(), MagicMock()) + data_service = DummyDataService( + MagicMock(), MagicMock(), MagicMock(), data=[supp_dm_meta] + ) preprocessor = DatasetPreprocessor( dm_ds, dm_meta, data_service, InMemoryCacheService(), ) - result = preprocessor.preprocess(rule_with_specific_supp, [supp_dm_meta]) + result = preprocessor.preprocess(rule_with_specific_supp) assert result.data.shape[0] == 1 assert {"RACE1", "RACE2", "RACE3"}.issubset(set(result.columns)) assert result.data.loc[0, ["RACE1", "RACE2", "RACE3"]].notna().all() @@ -1342,12 +1381,15 @@ def test_relrec_processed_correctly_with_others(rule_with_specific_supp): preprocessor = DatasetPreprocessor( relrec_ds, relrec_meta, - LocalDataService(MagicMock(), MagicMock(), MagicMock()), + DummyDataService( + MagicMock(), + MagicMock(), + MagicMock(), + data=[ec_meta, supp_ec_meta, relrec_meta], + ), InMemoryCacheService(), ) - result = preprocessor.preprocess( - rule_with_specific_supp, [ec_meta, supp_ec_meta, relrec_meta] - ) + result = preprocessor.preprocess(rule_with_specific_supp) # relrec preprocessing did not change data assert result.data.shape[0] == 2 diff --git a/tests/unit/test_operations/test_dataset_names.py b/tests/unit/test_operations/test_dataset_names.py index 02ee8352a..a2e3edc9a 100644 --- a/tests/unit/test_operations/test_dataset_names.py +++ b/tests/unit/test_operations/test_dataset_names.py @@ -1,3 +1,5 @@ +from unittest.mock import MagicMock + from cdisc_rules_engine.config.config import ConfigService from cdisc_rules_engine.models.dataset.dask_dataset import DaskDataset from cdisc_rules_engine.models.dataset.pandas_dataset import PandasDataset @@ -5,9 +7,6 @@ from cdisc_rules_engine.models.operation_params import OperationParams from cdisc_rules_engine.models.sdtm_dataset_metadata import SDTMDatasetMetadata from cdisc_rules_engine.services.cache.cache_service_factory import CacheServiceFactory -from cdisc_rules_engine.services.data_services.data_service_factory import ( - DataServiceFactory, -) import pytest @@ -17,7 +16,6 @@ def test_get_study_domains_with_duplicates( ): config = ConfigService() cache = CacheServiceFactory(config).get_cache_service() - data_service = DataServiceFactory(config, cache).get_data_service() datasets = [ SDTMDatasetMetadata(**dataset) for dataset in [ @@ -27,9 +25,13 @@ def test_get_study_domains_with_duplicates( {"name": "TV", "filename": "tv.xpt", "first_record": {"DOMAIN": "TV"}}, ] ] - operation_params.datasets = datasets + mock_data_service = MagicMock() + mock_data_service.get_datasets.return_value = datasets result = DatasetNames( - operation_params, dataset_type.from_dict({"A": [1, 2, 3]}), cache, data_service + operation_params, + dataset_type.from_dict({"A": [1, 2, 3]}), + cache, + mock_data_service, ).execute() assert operation_params.operation_id in result for val in result[operation_params.operation_id]: @@ -42,7 +44,6 @@ def test_get_study_domains_with_missing_domains( ): config = ConfigService() cache = CacheServiceFactory(config).get_cache_service() - data_service = DataServiceFactory(config, cache).get_data_service() datasets = [ SDTMDatasetMetadata(**dataset) for dataset in [ @@ -52,9 +53,13 @@ def test_get_study_domains_with_missing_domains( {"name": "TV", "filename": "tv.xpt", "first_record": {"DOMAIN": "TV"}}, ] ] - operation_params.datasets = datasets + mock_data_service = MagicMock() + mock_data_service.get_datasets.return_value = datasets result = DatasetNames( - operation_params, dataset_type.from_dict({"A": [1, 2, 3]}), cache, data_service + operation_params, + dataset_type.from_dict({"A": [1, 2, 3]}), + cache, + mock_data_service, ).execute() assert operation_params.operation_id in result for val in result[operation_params.operation_id]: diff --git a/tests/unit/test_operations/test_day_data_validator.py b/tests/unit/test_operations/test_day_data_validator.py index 74b113a8b..e0b88cd8e 100644 --- a/tests/unit/test_operations/test_day_data_validator.py +++ b/tests/unit/test_operations/test_day_data_validator.py @@ -83,7 +83,7 @@ def test_day_data_calculation( mock_data_service.get_dataset.side_effect = lambda **kwargs: datasets_map.get( kwargs.get("dataset_name") ) - operation_params.datasets = datasets + mock_data_service.get_datasets.return_value = datasets operation_params.dataframe = PandasDataset.from_dict(data) operation_params.target = "values" result = DayDataValidator( diff --git a/tests/unit/test_operations/test_parent_library_model_column_order.py b/tests/unit/test_operations/test_parent_library_model_column_order.py index ff8cbb5b8..95a4a8900 100644 --- a/tests/unit/test_operations/test_parent_library_model_column_order.py +++ b/tests/unit/test_operations/test_parent_library_model_column_order.py @@ -140,7 +140,6 @@ def test_get_parent_column_order_from_library( operation_params.domain = "SUPPAE" operation_params.standard = "sdtmig" operation_params.standard_version = "3-4" - operation_params.datasets = datasets operation_params.dataset_path = "suppae.xpt" # save model metadata to cache @@ -157,6 +156,7 @@ def test_get_parent_column_order_from_library( standard_version="3-4", library_metadata=library_metadata, ) + data_service.get_datasets = lambda: datasets def mock_get_raw_metadata(dataset_name, **kwargs): if "ae" in dataset_name.lower(): @@ -351,9 +351,6 @@ def test_get_parent_findings_class_column_order_from_library( operation_params.domain = "SUPPAE" operation_params.standard = "sdtmig" operation_params.standard_version = "3-4" - operation_params.datasets = [ - SDTMDatasetMetadata(**dataset) for dataset in datasets - ] operation_params.dataset_path = "suppae.xpt" # save model metadata to cache @@ -371,6 +368,9 @@ def test_get_parent_findings_class_column_order_from_library( standard_version="3-4", library_metadata=library_metadata, ) + data_service.get_datasets = lambda: [ + SDTMDatasetMetadata(**dataset) for dataset in datasets + ] def mock_get_raw_metadata(dataset_name, **kwargs): if "ae" in dataset_name.lower(): diff --git a/tests/unit/test_operations/test_related_domain_is_custom.py b/tests/unit/test_operations/test_related_domain_is_custom.py index c01ea774e..897075568 100644 --- a/tests/unit/test_operations/test_related_domain_is_custom.py +++ b/tests/unit/test_operations/test_related_domain_is_custom.py @@ -1,3 +1,5 @@ +from unittest.mock import MagicMock + import pytest from cdisc_rules_engine.models.library_metadata_container import ( @@ -16,8 +18,7 @@ def __init__(self, name: str, is_supp: bool, rdomain: str): class DummyParams: - def __init__(self, datasets, domain: str): - self.datasets = datasets + def __init__(self, domain: str): self.domain = domain @@ -73,14 +74,16 @@ def test_related_domain_is_custom( library_metadata = LibraryMetadataContainer( standard_metadata={"dataset_names": standard_domains} ) - params = DummyParams(datasets=study_datasets, domain=domain) + params = DummyParams(domain=domain) + data_service = MagicMock() + data_service.get_datasets.return_value = study_datasets op = RelatedDomainIsCustom( params=params, library_metadata=library_metadata, original_dataset=None, cache_service=None, - data_service=None, + data_service=data_service, ) assert op._execute_operation() is expected diff --git a/tests/unit/test_operations/test_study_domains.py b/tests/unit/test_operations/test_study_domains.py index 08c89c4f8..68eab7164 100644 --- a/tests/unit/test_operations/test_study_domains.py +++ b/tests/unit/test_operations/test_study_domains.py @@ -24,7 +24,9 @@ def test_get_study_domains_with_duplicates( {"filename": "ae.xpt", "first_record": {"DOMAIN": "AE"}}, {"filename": "tv.xpt", "first_record": {"DOMAIN": "TV"}}, ] - operation_params.datasets = [SDTMDatasetMetadata(**dataset) for dataset in datasets] + data_service.get_datasets = lambda: [ + SDTMDatasetMetadata(**dataset) for dataset in datasets + ] result = StudyDomains( operation_params, dataset_type.from_dict({"A": [1, 2, 3]}), cache, data_service ).execute() @@ -46,7 +48,9 @@ def test_get_study_domains_with_missing_domains( {"filename": "ae.xpt", "first_record": {"DOMAIN": "AE"}}, {"filename": "tv.xpt", "first_record": {"DOMAIN": "TV"}}, ] - operation_params.datasets = [SDTMDatasetMetadata(**dataset) for dataset in datasets] + data_service.get_datasets = lambda: [ + SDTMDatasetMetadata(**dataset) for dataset in datasets + ] result = StudyDomains( operation_params, dataset_type.from_dict({"A": [1, 2, 3]}), cache, data_service ).execute() diff --git a/tests/unit/test_operations/test_variable_count.py b/tests/unit/test_operations/test_variable_count.py index c59a9d73c..9f64f0b5f 100644 --- a/tests/unit/test_operations/test_variable_count.py +++ b/tests/unit/test_operations/test_variable_count.py @@ -73,7 +73,7 @@ def test_variable_count( mock_data_service.concat_split_datasets.side_effect = lambda func, files: pd.concat( [func(f) for f in files] ) - operation_params.datasets = datasets + mock_data_service.get_datasets = lambda: datasets operation_params.target = target operation_params.original_target = target operation_params.dataset_path = dataset_path diff --git a/tests/unit/test_operations/test_variable_value_count.py b/tests/unit/test_operations/test_variable_value_count.py index 9e193be62..5cc2a8ce0 100644 --- a/tests/unit/test_operations/test_variable_value_count.py +++ b/tests/unit/test_operations/test_variable_value_count.py @@ -71,7 +71,7 @@ def test_variable_value_count( mock_data_service.concat_split_datasets.side_effect = ( lambda func, files: dataset_type().concat([func(f) for f in files]) ) - operation_params.datasets = datasets + mock_data_service.get_datasets = lambda: datasets operation_params.original_target = target operation_params.dataset_path = dataset_path result = VariableValueCount( diff --git a/tests/unit/test_rules_engine.py b/tests/unit/test_rules_engine.py index fb34233d0..e3b988ebd 100644 --- a/tests/unit/test_rules_engine.py +++ b/tests/unit/test_rules_engine.py @@ -91,7 +91,6 @@ def test_validate_rule_invalid_suffix( standard="sdtmig" ).validate_single_dataset( mock_ae_record_rule_equal_to_suffix, - [], SDTMDatasetMetadata( name="AE", first_record={"DOMAIN": "AE"}, @@ -139,7 +138,6 @@ def test_validate_rule_invalid_prefix( standard="sdtmig" ).validate_single_dataset( mock_record_rule_equal_to_string_prefix, - [], SDTMDatasetMetadata( name="AE", first_record={"DOMAIN": "AE"}, @@ -238,27 +236,33 @@ def test_validate_rule_cross_dataset_check( "AE": ae_dataset, "EC": ec_dataset, } - with patch( - "cdisc_rules_engine.services.data_services.LocalDataService.get_dataset", - side_effect=lambda dataset_name: path_to_dataset_map[dataset_name], + datasets = [ + SDTMDatasetMetadata( + name="EC", + first_record={"DOMAIN": "EC"}, + filename="ec.xpt", + full_path=os.path.join("path", "ec.xpt"), + ), + SDTMDatasetMetadata( + name="AE", + first_record={"DOMAIN": "AE"}, + filename="ae.xpt", + full_path=os.path.join("path", "ae.xpt"), + ), + ] + with ( + patch( + "cdisc_rules_engine.services.data_services.LocalDataService.get_dataset", + side_effect=lambda dataset_name: path_to_dataset_map[dataset_name], + ), + patch( + "cdisc_rules_engine.services.data_services.LocalDataService.get_datasets", + return_value=datasets, + ), ): - datasets = [ - SDTMDatasetMetadata( - name="EC", - first_record={"DOMAIN": "EC"}, - filename="ec.xpt", - full_path=os.path.join("path", "ec.xpt"), - ), - SDTMDatasetMetadata( - name="AE", - first_record={"DOMAIN": "AE"}, - filename="ae.xpt", - full_path=os.path.join("path", "ae.xpt"), - ), - ] validation_result: List[str] = RulesEngine( standard="sdtmig", standard_version="3-4" - ).validate_single_dataset(dataset_rule_equal_to, datasets, datasets[0]) + ).validate_single_dataset(dataset_rule_equal_to, datasets[0]) assert validation_result == [ { "executionStatus": ExecutionStatus.ISSUE_REPORTED.value, @@ -353,15 +357,20 @@ def test_validate_one_to_one_rel_across_datasets(dataset_rule_one_to_one_related "AE": ae_dataset, "EC": ec_dataset, } - with patch( - "cdisc_rules_engine.services.data_services.LocalDataService.get_dataset", - side_effect=lambda dataset_name: path_to_dataset_map[dataset_name], + with ( + patch( + "cdisc_rules_engine.services.data_services.LocalDataService.get_dataset", + side_effect=lambda dataset_name: path_to_dataset_map[dataset_name], + ), + patch( + "cdisc_rules_engine.services.data_services.LocalDataService.get_datasets", + return_value=datasets, + ), ): validation_result: List[dict] = RulesEngine( standard="sdtmig" ).validate_single_dataset( dataset_rule_one_to_one_related, - datasets, datasets[0], ) assert validation_result == [ @@ -402,7 +411,6 @@ def test_validate_rule_single_dataset_check(dataset_rule_greater_than: dict): standard="sdtmig" ).validate_single_dataset( dataset_rule_greater_than, - [], SDTMDatasetMetadata( name="EC", first_record={"DOMAIN": "EC"}, @@ -448,7 +456,6 @@ def test_validate_rule_equal_length(dataset_rule_has_equal_length: dict): standard="sdtmig" ).validate_single_dataset( dataset_rule_has_equal_length, - [], SDTMDatasetMetadata( name="EC", first_record={"DOMAIN": "EC"}, @@ -495,15 +502,20 @@ def test_validate_is_contained_by_distinct(mock_rule_distinct_operation: dict): "AE": ae_dataset, "DM": dm_dataset, } - with patch( - "cdisc_rules_engine.services.data_services.LocalDataService.get_dataset", - side_effect=lambda dataset_name: path_to_dataset_map[dataset_name], + with ( + patch( + "cdisc_rules_engine.services.data_services.LocalDataService.get_dataset", + side_effect=lambda dataset_name: path_to_dataset_map[dataset_name], + ), + patch( + "cdisc_rules_engine.services.data_services.LocalDataService.get_datasets", + return_value=datasets, + ), ): validation_result: List[dict] = RulesEngine( standard="sdtmig" ).validate_single_dataset( mock_rule_distinct_operation, - datasets, datasets[1], ) assert validation_result == [ @@ -541,7 +553,6 @@ def test_validate_rule_not_equal_length(dataset_rule_has_not_equal_length: dict) standard="sdtmig" ).validate_single_dataset( dataset_rule_has_not_equal_length, - [], SDTMDatasetMetadata( name="EC", first_record={"DOMAIN": "EC"}, @@ -584,7 +595,6 @@ def test_validate_rule_multiple_conditions(dataset_rule_multiple_conditions: dic standard="sdtmig" ).validate_single_dataset( dataset_rule_multiple_conditions, - [], SDTMDatasetMetadata( name="EC", first_record={"DOMAIN": "EC"}, @@ -626,7 +636,6 @@ def test_validate_record_rule_numbers_separated_by_dash_pattern(): standard="sdtmig" ).validate_single_dataset( rule, - [], SDTMDatasetMetadata( name="AE", first_record={"DOMAIN": "AE"}, @@ -666,7 +675,6 @@ def test_validate_record_rule_semi_colon_delimited_pattern(): standard="sdtmig" ).validate_single_dataset( rule, - [], SDTMDatasetMetadata( name="AE", first_record={"DOMAIN": "AE"}, @@ -708,7 +716,6 @@ def test_validate_record_rule_no_letters_numbers_underscores(): standard="sdtmig" ).validate_single_dataset( rule, - [], SDTMDatasetMetadata( name="AE", first_record={"DOMAIN": "AE"}, @@ -761,7 +768,6 @@ def test_validate_dataset_metadata( standard="sdtmig" ).validate_single_dataset( dataset_metadata_not_equal_to_rule, - [], SDTMDatasetMetadata( name="EC", first_record={"DOMAIN": "EC"}, @@ -813,7 +819,6 @@ def test_validate_dataset_metadata_wrong_metadata( standard="sdtmig" ).validate_single_dataset( dataset_metadata_not_equal_to_rule, - [], SDTMDatasetMetadata( name="EC", first_record={"DOMAIN": "EC"}, @@ -868,7 +873,6 @@ def test_validate_variable_metadata( standard="sdtmig" ).validate_single_dataset( variables_metadata_rule, - [], SDTMDatasetMetadata( name="EC", first_record={"DOMAIN": "EC"}, @@ -891,7 +895,6 @@ def test_validate_variable_metadata( standard="sdtmig" ).validate_single_dataset( variables_metadata_rule, - [], SDTMDatasetMetadata( name="EC", first_record={"DOMAIN": "EC"}, @@ -940,7 +943,6 @@ def test_validate_variable_metadata_wrong_metadata( standard="sdtmig" ).validate_single_dataset( variables_metadata_rule, - [], SDTMDatasetMetadata( name="EC", first_record={"DOMAIN": "EC"}, @@ -984,7 +986,10 @@ def test_validate_variable_metadata_wrong_metadata( @patch( "cdisc_rules_engine.services.data_services.LocalDataService.get_dataset", ) -def test_rule_with_domain_prefix_replacement(mock_get_dataset: MagicMock): +@patch("cdisc_rules_engine.services.data_services.LocalDataService.get_datasets") +def test_rule_with_domain_prefix_replacement( + mock_get_datasets: MagicMock, mock_get_dataset: MagicMock +): rule = { "core_id": "TEST1", "standards": [], @@ -1021,9 +1026,10 @@ def test_rule_with_domain_prefix_replacement(mock_get_dataset: MagicMock): filename="bundle", full_path="study/bundle", ) + mock_get_datasets.return_value = [dataset_metadata] validation_result: List[dict] = RulesEngine( standard="sdtmig" - ).validate_single_dataset(rule, [dataset_metadata], dataset_metadata) + ).validate_single_dataset(rule, dataset_metadata) assert validation_result == [ { "executionStatus": ExecutionStatus.ISSUE_REPORTED.value, @@ -1078,8 +1084,12 @@ def test_rule_with_domain_prefix_replacement(mock_get_dataset: MagicMock): ), ], ) +@patch("cdisc_rules_engine.services.data_services.LocalDataService.get_datasets") def test_validate_domain_presence( - domain_presence_rule: dict, datasets: List[str], expected_validation_result: list + mock_get_datasets: MagicMock, + domain_presence_rule: dict, + datasets: List[str], + expected_validation_result: list, ): """ Unit test for RulesEngine.validate_domain_presence. @@ -1092,11 +1102,11 @@ def test_validate_domain_presence( ) for dataset in datasets ] + mock_get_datasets.return_value = dataset_metadata actual_validation_result: List[dict] = RulesEngine( standard="sdtmig" ).validate_single_dataset( domain_presence_rule, - dataset_metadata, SDTMDatasetMetadata( name="AE", first_record={"DOMAIN": "AE"}, @@ -1132,23 +1142,28 @@ def test_validate_single_dataset(dataset_rule_equal_to_error_objects: dict): } ) ) - with patch( - "cdisc_rules_engine.services.data_services.LocalDataService.get_dataset", - return_value=df, + datasets = [ + SDTMDatasetMetadata( + name="AE", + first_record={"DOMAIN": "AE"}, + filename="bundle", + full_path="study/bundle", + ) + ] + with ( + patch( + "cdisc_rules_engine.services.data_services.LocalDataService.get_dataset", + return_value=df, + ), + patch( + "cdisc_rules_engine.services.data_services.LocalDataService.get_datasets", + return_value=datasets, + ), ): - datasets = [ - SDTMDatasetMetadata( - name="AE", - first_record={"DOMAIN": "AE"}, - filename="bundle", - full_path="study/bundle", - ) - ] validation_result: List[dict] = RulesEngine( standard="sdtmig" ).validate_single_dataset( dataset_rule_equal_to_error_objects, - datasets, datasets[0], ) assert validation_result == [ @@ -1220,21 +1235,26 @@ def test_validate_single_dataset_not_equal_to( } ) ) - with patch( - "cdisc_rules_engine.services.data_services.LocalDataService.get_dataset", - return_value=df, + dataset_metadata = SDTMDatasetMetadata( + name="AE", + first_record={"DOMAIN": "AE"}, + filename="data_bundle", + full_path="study/data_bundle", + ) + with ( + patch( + "cdisc_rules_engine.services.data_services.LocalDataService.get_dataset", + return_value=df, + ), + patch( + "cdisc_rules_engine.services.data_services.LocalDataService.get_datasets", + return_value=[dataset_metadata], + ), ): - dataset_metadata = SDTMDatasetMetadata( - name="AE", - first_record={"DOMAIN": "AE"}, - filename="data_bundle", - full_path="study/data_bundle", - ) validation_result: List[dict] = RulesEngine( standard="sdtmig" ).validate_single_dataset( dataset_rule_not_equal_to_error_objects, - [dataset_metadata], dataset_metadata, ) assert validation_result == [ @@ -1351,7 +1371,11 @@ def test_validate_single_dataset_not_equal_to( @patch( "cdisc_rules_engine.services.data_services.LocalDataService.get_dataset_metadata", ) +@patch( + "cdisc_rules_engine.services.data_services.LocalDataService.get_datasets", +) def test_validate_dataset_metadata_against_define_xml( + mock_get_datasets: MagicMock, mock_get_dataset_metadata: MagicMock, mock_get_define_xml_metadata_for_domain: MagicMock, mock_get_define_xml_reader: MagicMock, @@ -1377,11 +1401,11 @@ def test_validate_dataset_metadata_against_define_xml( filename="ae.xpt", original_path="ae.xpt", ) + mock_get_datasets.return_value = [dataset_metadata] validation_result: List[dict] = RulesEngine( standard="sdtmig" ).validate_single_dataset( define_xml_validation_rule, - [dataset_metadata], dataset_metadata, ) assert validation_result == expected_validation_result @@ -1485,7 +1509,9 @@ def test_validate_dataset_metadata_against_define_xml( @patch( "cdisc_rules_engine.services.data_services.LocalDataService.get_variables_metadata" ) +@patch("cdisc_rules_engine.services.data_services.LocalDataService.get_datasets") def test_validate_variable_metadata_against_define_xml( + mock_get_datasets: MagicMock, mock_get_variables_metadata: MagicMock, mock_get_define_xml_variables_metadata: MagicMock, define_xml_variable_validation_rule: dict, @@ -1505,12 +1531,12 @@ def test_validate_variable_metadata_against_define_xml( filename="test", full_path="CDISC01/test", ) + mock_get_datasets.return_value = [dataset_metadata] validation_result: List[dict] = RulesEngine( standard="sdtmig" ).validate_single_dataset( dataset_metadata=dataset_metadata, rule=define_xml_variable_validation_rule, - datasets=[dataset_metadata], ) assert validation_result == expected_validation_result @@ -1576,7 +1602,11 @@ def test_validate_variable_metadata_against_define_xml( @patch( "cdisc_rules_engine.services.data_services.LocalDataService._async_get_datasets", ) +@patch( + "cdisc_rules_engine.services.data_services.LocalDataService.get_datasets", +) def test_validate_split_dataset_contents( + mock_get_datasets: MagicMock, mock_async_get_datasets: MagicMock, dataset_rule_equal_to_error_objects: dict, include_split_datasets: bool, @@ -1658,12 +1688,12 @@ def test_validate_split_dataset_contents( full_path="CDISC01/test/ae_1.xpt", ), ] + mock_get_datasets.return_value = datasets validation_result: List[dict] = RulesEngine( standard="sdtmig" ).validate_single_dataset( dataset_metadata=datasets[0], rule=dataset_rule_equal_to_error_objects, - datasets=datasets, ) # check validation result assert validation_result == result @@ -1675,7 +1705,11 @@ def test_validate_split_dataset_contents( @patch( "cdisc_rules_engine.services.data_services.LocalDataService.get_dataset_metadata", ) +@patch( + "cdisc_rules_engine.services.data_services.LocalDataService.get_datasets", +) def test_validate_split_dataset_metadata( + mock_get_datasets: MagicMock, mock_get_dataset_metadata: MagicMock, mock_async_get_datasets: MagicMock, dataset_metadata_not_equal_to_rule: dict, @@ -1738,12 +1772,12 @@ def test_validate_split_dataset_metadata( full_path="ec_1.xpt", ), ] + mock_get_datasets.return_value = datasets validation_result: List[dict] = RulesEngine( standard="sdtmig" ).validate_single_dataset( dataset_metadata=datasets[1], rule=dataset_metadata_not_equal_to_rule, - datasets=datasets, ) # check validation result. # error is contained only in the second part of the dataset. @@ -1770,8 +1804,11 @@ def test_validate_split_dataset_metadata( @patch("cdisc_rules_engine.services.data_services.LocalDataService._async_get_datasets") +@patch("cdisc_rules_engine.services.data_services.LocalDataService.get_datasets") def test_validate_split_dataset_variables_metadata( - mock_async_get_datasets: MagicMock, variables_metadata_rule: dict + mock_get_datasets: MagicMock, + mock_async_get_datasets: MagicMock, + variables_metadata_rule: dict, ): """ Unit test for validating variables metadata of a split dataset. @@ -1818,11 +1855,11 @@ def test_validate_split_dataset_variables_metadata( full_path="CDISC/test/ec_1.xpt", ), ] + mock_get_datasets.return_value = datasets validation_result: List[dict] = RulesEngine( standard="sdtmig" ).validate_single_dataset( rule=variables_metadata_rule, - datasets=datasets, dataset_metadata=datasets[0], ) assert validation_result == [ @@ -1850,7 +1887,9 @@ def test_validate_split_dataset_variables_metadata( @patch("cdisc_rules_engine.services.data_services.LocalDataService.get_dataset_class") +@patch("cdisc_rules_engine.services.data_services.LocalDataService.get_datasets") def test_validate_record_in_parent_domain( + mock_get_datasets, mock_get_dataset_class, dataset_rule_record_in_parent_domain_equal_to: dict, ): @@ -1942,11 +1981,11 @@ def test_validate_record_in_parent_domain( full_path=os.path.join("path", "suppec.xpt"), ), ] + mock_get_datasets.return_value = datasets validation_result: List[str] = RulesEngine( standard="sdtmig", standard_version="3-4" ).validate_single_dataset( dataset_rule_record_in_parent_domain_equal_to, - datasets, datasets[0], ) assert validation_result == [ @@ -1970,8 +2009,11 @@ def test_validate_record_in_parent_domain( @patch("cdisc_rules_engine.services.data_services.LocalDataService.get_dataset_class") +@patch("cdisc_rules_engine.services.data_services.LocalDataService.get_datasets") def test_validate_additional_columns( - mock_get_dataset_class, dataset_rule_inconsistent_enumerated_columns: dict + mock_get_datasets, + mock_get_dataset_class, + dataset_rule_inconsistent_enumerated_columns: dict, ): """ Unit test for validating additional columns like TSVAL1, TSVAL2. @@ -2002,18 +2044,18 @@ def test_validate_additional_columns( "cdisc_rules_engine.services.data_services.LocalDataService.get_dataset", return_value=dataset, ): - datset_metadata = SDTMDatasetMetadata( + dataset_metadata = SDTMDatasetMetadata( name="TS", first_record={"DOMAIN": "TS"}, filename="ts.xpt", full_path="CDISC01/test/ts.xpt", ) + mock_get_datasets.return_value = [dataset_metadata] validation_result: List[dict] = RulesEngine( standard="sdtmig", standard_version="3-4" ).validate_single_dataset( rule=dataset_rule_inconsistent_enumerated_columns, - datasets=[datset_metadata], - dataset_metadata=datset_metadata, + dataset_metadata=dataset_metadata, ) assert validation_result == [ { @@ -2042,7 +2084,9 @@ def test_validate_additional_columns( @patch("cdisc_rules_engine.services.data_services.LocalDataService.get_dataset") @patch("cdisc_rules_engine.services.data_services.LocalDataService.get_dataset_class") +@patch("cdisc_rules_engine.services.data_services.LocalDataService.get_datasets") def test_validate_single_dataset_operation_dataset_larger_than_target_dataset( + mock_get_datasets: MagicMock, mock_get_dataset_class: MagicMock, mock_get_dataset: MagicMock, rule_distinct_operation_is_not_contained_by: dict, @@ -2114,11 +2158,11 @@ def test_validate_single_dataset_operation_dataset_larger_than_target_dataset( full_path=os.path.join("study_id", "data_bundle_id", "ti.xpt"), ), ] + mock_get_datasets.return_value = datasets validation_result: List[dict] = RulesEngine( standard="sdtmig", standard_version="3-4" ).validate_single_dataset( rule=rule_distinct_operation_is_not_contained_by, - datasets=datasets, dataset_metadata=datasets[0], ) assert validation_result == [ @@ -2137,7 +2181,9 @@ def test_validate_single_dataset_operation_dataset_larger_than_target_dataset( @patch( "cdisc_rules_engine.services.data_services.LocalDataService.get_dataset_metadata" ) +@patch("cdisc_rules_engine.services.data_services.LocalDataService.get_datasets") def test_validate_extract_metadata_operation( + mock_get_datasets: MagicMock, mock_get_dataset_metadata: MagicMock, mock_get_dataset: MagicMock, rule_equal_to_with_extract_metadata_operation: dict, @@ -2187,13 +2233,13 @@ def test_validate_extract_metadata_operation( filename="suppec.xpt", full_path="study_id/data_bundle_id/suppec.xpt", ) + mock_get_datasets.return_value = [dataset_metadata] # run validation validation_result: List[dict] = RulesEngine( standard="sdtmig" ).validate_single_dataset( rule=rule_equal_to_with_extract_metadata_operation, - datasets=[dataset_metadata], dataset_metadata=dataset_metadata, ) assert validation_result == [ @@ -2235,7 +2281,9 @@ def test_validate_extract_metadata_operation( @patch("cdisc_rules_engine.services.data_services.LocalDataService.get_dataset") +@patch("cdisc_rules_engine.services.data_services.LocalDataService.get_datasets") def test_dataset_references_invalid_whodrug_terms( + mock_get_datasets: MagicMock, mock_get_dataset: MagicMock, rule_dataset_references_invalid_whodrug_terms: dict, installed_whodrug_dictionaries: dict, @@ -2271,6 +2319,7 @@ def test_dataset_references_invalid_whodrug_terms( filename="dataset_path", full_path="dataset_path", ) + mock_get_datasets.return_value = [dataset_metadata] # run validation engine = RulesEngine( @@ -2288,7 +2337,6 @@ def test_dataset_references_invalid_whodrug_terms( ) validation_result: List[dict] = engine.validate_single_dataset( rule=rule_dataset_references_invalid_whodrug_terms, - datasets=[dataset_metadata], dataset_metadata=dataset_metadata, ) assert validation_result == [ @@ -2327,7 +2375,9 @@ def test_dataset_references_invalid_whodrug_terms( "cdisc_rules_engine.services.data_services.LocalDataService.get_variables_metadata" ) @patch("cdisc_rules_engine.services.data_services.LocalDataService.get_dataset_class") +@patch("cdisc_rules_engine.services.data_services.LocalDataService.get_datasets") def test_validate_variables_order_against_library_metadata( + mock_get_datasets: MagicMock, mock_get_dataset_class: MagicMock, mock_get_variables_metadata: MagicMock, mock_get_dataset: MagicMock, @@ -2488,6 +2538,7 @@ def test_validate_variables_order_against_library_metadata( filename="dataset_path", full_path="dataset_path", ) + mock_get_datasets.return_value = [dataset_metadata] # run validation engine = RulesEngine( cache=cache, @@ -2505,7 +2556,6 @@ def mock_cached_method(*args, **kwargs): ): result: List[dict] = engine.validate_single_dataset( rule_validate_columns_order_against_library_metadata, - [dataset_metadata], dataset_metadata, ) assert result == [ diff --git a/tests/unit/test_services/test_data_service/test_data_service.py b/tests/unit/test_services/test_data_service/test_data_service.py index f8e1e61b9..f2cb210d3 100644 --- a/tests/unit/test_services/test_data_service/test_data_service.py +++ b/tests/unit/test_services/test_data_service/test_data_service.py @@ -227,9 +227,9 @@ def test_get_dataset_class(dataset_metadata, data, expected_class): standard_version="3-4", library_metadata=library_metadata, ) + data_service.get_datasets = lambda: [SDTMDatasetMetadata(**dataset_metadata)] class_name = data_service.get_dataset_class( df, - [SDTMDatasetMetadata(**dataset_metadata)], SDTMDatasetMetadata(**dataset_metadata), ) assert class_name == expected_class @@ -267,10 +267,16 @@ def test_get_dataset_class_associated_domains(): "APDM": ap_dataset, "DM": ce_dataset, } - with patch( - "cdisc_rules_engine.services.data_services.LocalDataService.get_dataset", - return_value=ap_dataset, - side_effect=lambda dataset_name: path_to_dataset_map[dataset_name], + with ( + patch( + "cdisc_rules_engine.services.data_services.LocalDataService.get_dataset", + return_value=ap_dataset, + side_effect=lambda dataset_name: path_to_dataset_map[dataset_name], + ), + patch( + "cdisc_rules_engine.services.data_services.LocalDataService.get_datasets", + return_value=datasets, + ), ): library_metadata: LibraryMetadataContainer = get_library_metadata_from_cache( Validation_args( @@ -313,7 +319,6 @@ def test_get_dataset_class_associated_domains(): ) class_name = data_service.get_dataset_class( ap_dataset, - datasets, datasets[0], ) assert class_name == SPECIAL_PURPOSE diff --git a/tests/unit/test_services/test_data_service/test_excel_data_service.py b/tests/unit/test_services/test_data_service/test_excel_data_service.py index b93dee096..3070999fe 100644 --- a/tests/unit/test_services/test_data_service/test_excel_data_service.py +++ b/tests/unit/test_services/test_data_service/test_excel_data_service.py @@ -118,7 +118,7 @@ def test_get_variables_metadata(dataset_name): dataset_implementation=PandasDataset, dataset_path=dataset_path, ) - data = data_service.get_variables_metadata(dataset_name=dataset_name, datasets=[]) + data = data_service.get_variables_metadata(dataset_name=dataset_name) assert isinstance(data, PandasDataset) expected_keys = [ "variable_name", diff --git a/tests/unit/test_services/test_data_service/test_local_data_service.py b/tests/unit/test_services/test_data_service/test_local_data_service.py index 4c5eeffd9..07104ae48 100644 --- a/tests/unit/test_services/test_data_service/test_local_data_service.py +++ b/tests/unit/test_services/test_data_service/test_local_data_service.py @@ -81,9 +81,7 @@ def test_get_variables_metadata(dataset_implementation): dataset_implementation=dataset_implementation, dataset_paths=[dataset_path], ) - data = data_service.get_variables_metadata( - dataset_name="TEST_ADAM_DATASET", datasets=[] - ) + data = data_service.get_variables_metadata(dataset_name="TEST_ADAM_DATASET") assert isinstance(data, dataset_implementation) expected_keys = [ "variable_name", diff --git a/tests/unit/test_services/test_reporting/test_report_factory.py b/tests/unit/test_services/test_reporting/test_report_factory.py index b5eff0091..7c96680a0 100644 --- a/tests/unit/test_services/test_reporting/test_report_factory.py +++ b/tests/unit/test_services/test_reporting/test_report_factory.py @@ -11,7 +11,6 @@ def test_get_report_services(): Unit test for ReportFactory.get_report_services """ factory = ReportFactory( - datasets=[], results=[], elapsed_time=10.5, args=MagicMock( diff --git a/tests/unit/test_usdm_data.py b/tests/unit/test_usdm_data.py index 550c82f39..0123fbee1 100644 --- a/tests/unit/test_usdm_data.py +++ b/tests/unit/test_usdm_data.py @@ -93,7 +93,6 @@ def test_validate_rule_single_dataset_check(dataset_rule_greater_than: dict): standard="usdm", dataset_paths=[dataset_path] ).validate_single_dataset( dataset_rule_greater_than, - [], SDTMDatasetMetadata( name="EC", first_record={"DOMAIN": "EC"}, diff --git a/tests/unit/test_utilities/test_rule_processor.py b/tests/unit/test_utilities/test_rule_processor.py index a345e5785..bd089c448 100644 --- a/tests/unit/test_utilities/test_rule_processor.py +++ b/tests/unit/test_utilities/test_rule_processor.py @@ -362,6 +362,7 @@ def test_rule_applies_to_class( processor = RuleProcessor(mock_data_service, InMemoryCacheService()) dataset_mock = PandasDataset.from_dict(data) mock_data_service.get_dataset_class.return_value = class_name + mock_data_service.get_datasets.return_value = datasets with patch( "cdisc_rules_engine.services.data_services.LocalDataService.get_dataset", return_value=dataset_mock, @@ -369,7 +370,6 @@ def test_rule_applies_to_class( assert ( processor.rule_applies_to_class( rule_metadata, - datasets, SDTMDatasetMetadata(*datasets[0]), ) == outcome @@ -377,62 +377,47 @@ def test_rule_applies_to_class( @pytest.mark.parametrize( - "dataset_name, domain, rdomain, rule_use_case, use_case, standard, standard_substandard, outcome", + "rule_use_case, use_case, standard, outcome", [ # Basic use case tests - user provides "INDH" or "PROD" - ("AE", "AE", None, "INDH, PROD", "INDH", "tig", "SDTM", True), - ("AE", "AE", None, "INDH, PROD", "PROD", "tig", "SDTM", True), - ("CM", "CM", None, "INDH", "INDH", "tig", "SDTM", True), - ("TS", "TS", None, "INDH", "INDH", "tig", "SDTM", True), - ("ES", "ES", None, "PROD", "PROD", "tig", "SDTM", True), - ("ES", "ES", None, "PROD", "INDH", "tig", "SDTM", False), - ("BW", "BW", None, "NONCLIN", "NONCLIN", "tig", "SEND", True), - ("BW", "BW", None, "NONCLIN", "INDH", "tig", "SEND", False), + ("INDH, PROD", "INDH", "tig", True), + ("INDH, PROD", "PROD", "tig", True), + ("INDH", "INDH", "tig", True), + ("INDH", "INDH", "tig", True), + ("PROD", "PROD", "tig", True), + ("PROD", "INDH", "tig", False), + ("NONCLIN", "NONCLIN", "tig", True), + ("NONCLIN", "INDH", "tig", False), # Tests for ADaM datasets - ("ADSL", "ADSL", None, "ANALYSIS", "ANALYSIS", "tig", "ADAM", True), - ("ADAE", "ADAE", None, "ANALYSIS", "ANALYSIS", "tig", "ADAM", True), - ("ADAE", "ADAE", None, "ANALYSIS", "INDH", "tig", "ADAM", False), + ("ANALYSIS", "ANALYSIS", "tig", True), + ("ANALYSIS", "ANALYSIS", "tig", True), + ("ANALYSIS", "INDH", "tig", False), # Tests for supplementary datasets - ("SUPPAE", None, "AE", "INDH", "INDH", "tig", "SDTM", True), - ("SUPPQS", None, "QS", "INDH", "INDH", "tig", "SDTM", True), - ("SUPPEC", None, "EC", "INDH", "INDH", "tig", "SDTM", True), - ("SUPP--", None, "AE", "INDH", "INDH", "tig", "SDTM", True), - ("SUPPPT", None, "PT", "PROD", "PROD", "tig", "SDTM", True), + ("INDH", "INDH", "tig", True), + ("INDH", "INDH", "tig", True), + ("INDH", "INDH", "tig", True), + ("INDH", "INDH", "tig", True), + ("PROD", "PROD", "tig", True), # Tests for empty/None use cases in rule (should always return True) - ("AE", "AE", None, "", "INDH", "tig", "SDTM", True), - ("AE", "AE", None, None, "INDH", "tig", "SDTM", True), + ("", "INDH", "tig", True), + (None, "INDH", "tig", True), # Tests for non-TIG standard (should always return True) - ("AE", "AE", None, "INDH", "INDH", "sdtmig", "SDTM", True), - ("BW", "BW", None, "NONCLIN", "NONCLIN", "sendct", "SEND", True), + ("INDH", "INDH", "sdtmig", True), + ("NONCLIN", "NONCLIN", "sendct", True), # Test case mismatch - ("AE", "AE", None, "INDH, PROD", "SAFETY", "tig", "SDTM", False), + ("INDH, PROD", "SAFETY", "tig", False), ], ) def test_rule_applies_to_use_case( mock_data_service, - dataset_name, - domain, - rdomain, rule_use_case, standard, - standard_substandard, use_case, outcome, ): processor = RuleProcessor(mock_data_service, InMemoryCacheService()) rule = {"use_case": rule_use_case} - dataset_metadata = SDTMDatasetMetadata( - name=dataset_name, - first_record=( - {"DOMAIN": domain, "RDOMAIN": rdomain} if domain or rdomain else {} - ), - ) - assert ( - processor.rule_applies_to_use_case( - dataset_metadata, rule, standard, standard_substandard, use_case - ) - == outcome - ) + assert processor.rule_applies_to_use_case(rule, standard, use_case) == outcome @pytest.mark.parametrize("dataset_implementation", [PandasDataset, DaskDataset]) @@ -496,12 +481,12 @@ def test_perform_rule_operation(mock_data_service, dataset_implementation): ) ] mock_data_service.get_dataset.return_value = df + mock_data_service.get_datasets.return_value = datasets_metadata processor = RuleProcessor(mock_data_service, InMemoryCacheService()) result = processor.perform_rule_operations( rule, df, datasets_metadata[0], - datasets_metadata, standard="sdtmig", standard_version="3-1-2", standard_substandard=None, @@ -592,12 +577,12 @@ def test_perform_rule_operation_with_grouping( ] mock_data_service.get_dataset.return_value = df + mock_data_service.get_datasets.return_value = datasets_metadata processor = RuleProcessor(mock_data_service, InMemoryCacheService()) data = processor.perform_rule_operations( rule, df, datasets_metadata[0], - datasets_metadata, standard="sdtmig", standard_version="3-1-2", standard_substandard=None, @@ -709,12 +694,12 @@ def test_perform_rule_operation_with_multi_key_grouping( ] mock_data_service.get_dataset.return_value = df + mock_data_service.get_datasets.return_value = datasets_metadata processor = RuleProcessor(mock_data_service, InMemoryCacheService()) data = processor.perform_rule_operations( rule, df, datasets_metadata[0], - datasets_metadata, standard="sdtmig", standard_version="3-1-2", standard_substandard=None, @@ -766,12 +751,12 @@ def test_perform_rule_operation_with_null_operations( label="Adverse Events", ) ] + mock_data_service.get_datasets.return_value = datasets_metadata processor = RuleProcessor(mock_data_service, InMemoryCacheService()) new_data = processor.perform_rule_operations( rule, df, datasets_metadata[0], - datasets_metadata, standard="sdtmig", standard_version="3-1-2", standard_substandard=None, @@ -792,7 +777,6 @@ def test_preprocess_operation_params_wildcard_replacement(mock_data_service): domain="AE", dataset_path="test/ae.xpt", directory_path="test/", - datasets=[], standard="sdtmig", standard_version="3-4", grouping=["--SEQ", "--DTC", "USUBJID"], @@ -825,7 +809,6 @@ def test_preprocess_operation_params_supp_domain_uses_rdomain(mock_data_service) domain=None, dataset_path="test/suppae.xpt", directory_path="test/", - datasets=[], standard="sdtmig", standard_version="3-4", ) @@ -884,7 +867,14 @@ def test_perform_extract_metadata_operation( ], } ) - + datasets_metadata = [ + SDTMDatasetMetadata( + name="SUPPEC", + first_record={"RDOMAIN": "EC"}, + filename="suppec.xpt", + full_path="study/data_bundle/suppec.xpt", + ) + ] mock = MagicMock() mock.get_dataset.return_value = dataset mock.get_dataset_metadata.return_value = dataset_implementation.from_dict( @@ -894,20 +884,12 @@ def test_perform_extract_metadata_operation( ], } ) + mock.get_datasets.return_value = datasets_metadata processor = RuleProcessor(mock, InMemoryCacheService()) - datasets_metadata = [ - SDTMDatasetMetadata( - name="SUPPEC", - first_record={"RDOMAIN": "EC"}, - filename="suppec.xpt", - full_path="study/data_bundle/suppec.xpt", - ) - ] dataset_after_operation = processor.perform_rule_operations( rule=rule_equal_to_with_extract_metadata_operation, dataset=dataset, dataset_metadata=datasets_metadata[0], - datasets=datasets_metadata, standard="sdtmig", standard_version="3-1-2", standard_substandard=None, diff --git a/tests/unit/test_utilities/test_sdtm_utils.py b/tests/unit/test_utilities/test_sdtm_utils.py index 38905fde0..aee50aa8b 100644 --- a/tests/unit/test_utilities/test_sdtm_utils.py +++ b/tests/unit/test_utilities/test_sdtm_utils.py @@ -39,39 +39,34 @@ def mock_datasets(): return [] -def test_standard_domain_ae(library_metadata, mock_data_service, mock_datasets): +def test_standard_domain_ae(library_metadata, mock_data_service): dataset_metadata = SDTMDatasetMetadata(name="AE", first_record={"DOMAIN": "AE"}) variables = get_variables_metadata_from_standard( library_metadata, mock_data_service, dataset_metadata, - mock_datasets, ) assert any(var["name"] == "STUDYID" for var in variables) assert any(var["name"] == "AETERM" for var in variables) assert any(var["name"] == "AESTDTC" for var in variables) -def test_standard_domain_dm(library_metadata, mock_data_service, mock_datasets): +def test_standard_domain_dm(library_metadata, mock_data_service): dataset_metadata = SDTMDatasetMetadata(name="DM", first_record={"DOMAIN": "DM"}) variables = get_variables_metadata_from_standard( - library_metadata, - mock_data_service, - dataset_metadata, - mock_datasets, + library_metadata, mock_data_service, dataset_metadata ) assert any(var["name"] == "USUBJID" for var in variables) assert any(var["name"] == "AGE" for var in variables) assert any(var["name"] == "SEX" for var in variables) -def test_findings_domain_lb(library_metadata, mock_data_service, mock_datasets): +def test_findings_domain_lb(library_metadata, mock_data_service): dataset_metadata = SDTMDatasetMetadata(name="LB", first_record={"DOMAIN": "LB"}) variables = get_variables_metadata_from_standard( library_metadata, mock_data_service, dataset_metadata, - mock_datasets, ) assert any(var["name"] == "STUDYID" for var in variables) assert any(var["name"] == "USUBJID" for var in variables) @@ -79,39 +74,36 @@ def test_findings_domain_lb(library_metadata, mock_data_service, mock_datasets): assert any(var["name"] == "LBORRES" for var in variables) -def test_supp_domain(library_metadata, mock_data_service, mock_datasets): +def test_supp_domain(library_metadata, mock_data_service): dataset_metadata = SDTMDatasetMetadata(name="SUPPAE", first_record={"QNAM": "TEST"}) variables = get_variables_metadata_from_standard( library_metadata, mock_data_service, dataset_metadata, - mock_datasets, ) assert any(var["name"] == "STUDYID" for var in variables) assert any(var["name"] == "QNAM" for var in variables) assert any(var["name"] == "QLABEL" for var in variables) -def test_sq_domain(library_metadata, mock_data_service, mock_datasets): +def test_sq_domain(library_metadata, mock_data_service): dataset_metadata = SDTMDatasetMetadata(name="SQAE", first_record={"QNAM": "TEST"}) variables = get_variables_metadata_from_standard( library_metadata, mock_data_service, dataset_metadata, - mock_datasets, ) assert any(var["name"] == "STUDYID" for var in variables) assert any(var["name"] == "QNAM" for var in variables) assert any(var["name"] == "QLABEL" for var in variables) -def test_ap_domain(library_metadata, mock_data_service, mock_datasets): +def test_ap_domain(library_metadata, mock_data_service): dataset_metadata = SDTMDatasetMetadata(name="APDM", first_record={"APID": "001"}) variables = get_variables_metadata_from_standard( library_metadata, mock_data_service, dataset_metadata, - mock_datasets, ) assert any(var["name"] == "APID" for var in variables) assert not any(var["name"] == "USUBJID" for var in variables) @@ -120,7 +112,7 @@ def test_ap_domain(library_metadata, mock_data_service, mock_datasets): assert any(var["name"] == "DMDY" for var in variables) -def test_sqap_domain(library_metadata, mock_data_service, mock_datasets): +def test_sqap_domain(library_metadata, mock_data_service): dataset_metadata = SDTMDatasetMetadata( name="SQAPMH", first_record={"QNAM": "TEST", "RDOMAIN": "APMH"} ) @@ -128,33 +120,30 @@ def test_sqap_domain(library_metadata, mock_data_service, mock_datasets): library_metadata, mock_data_service, dataset_metadata, - mock_datasets, ) assert any(var["name"] == "APID" for var in variables) assert not any(var["name"] == "USUBJID" for var in variables) assert any(var["name"] == "RDOMAIN" for var in variables) -def test_findings_about_domain_fa(library_metadata, mock_data_service, mock_datasets): +def test_findings_about_domain_fa(library_metadata, mock_data_service): """Test Findings About domain includes FINDINGS class variables.""" dataset_metadata = SDTMDatasetMetadata(name="FA", first_record={"DOMAIN": "FA"}) variables = get_variables_metadata_from_standard( library_metadata, mock_data_service, dataset_metadata, - mock_datasets, ) assert any(var["name"] == "FATEST" for var in variables) assert any(var["name"] == "FAOBJ" for var in variables) # Tests for get_variables_metadata_from_standard_model -def test_findings_domain_from_model(library_metadata, mock_data_service, mock_datasets): +def test_findings_domain_from_model(library_metadata, mock_data_service): mock_dataframe = Mock() dataset_metadata = SDTMDatasetMetadata(name="LB", first_record={"DOMAIN": "LB"}) variables = get_variables_metadata_from_standard_model( dataframe=mock_dataframe, - datasets=mock_datasets, data_service=mock_data_service, library_metadata=library_metadata, dataset_metadata=dataset_metadata, @@ -163,13 +152,12 @@ def test_findings_domain_from_model(library_metadata, mock_data_service, mock_da assert any(var["name"] == "LBTEST" for var in variables) -def test_supp_domain_from_model(library_metadata, mock_data_service, mock_datasets): +def test_supp_domain_from_model(library_metadata, mock_data_service): """Test retrieving variables for SUPP domain from model.""" mock_dataframe = Mock() dataset_metadata = SDTMDatasetMetadata(name="SUPPAE", first_record={"QNAM": "TEST"}) variables = get_variables_metadata_from_standard_model( dataframe=mock_dataframe, - datasets=mock_datasets, data_service=mock_data_service, library_metadata=library_metadata, dataset_metadata=dataset_metadata, @@ -178,13 +166,12 @@ def test_supp_domain_from_model(library_metadata, mock_data_service, mock_datase assert any(var["name"] == "IDVAR" for var in variables) -def test_sqap_domain_from_model(library_metadata, mock_data_service, mock_datasets): +def test_sqap_domain_from_model(library_metadata, mock_data_service): """Test retrieving variables for SUPP domain from model.""" mock_dataframe = Mock() dataset_metadata = SDTMDatasetMetadata(name="SQAP", first_record={"QNAM": "TEST"}) variables = get_variables_metadata_from_standard_model( dataframe=mock_dataframe, - datasets=mock_datasets, data_service=mock_data_service, library_metadata=library_metadata, dataset_metadata=dataset_metadata, @@ -193,13 +180,12 @@ def test_sqap_domain_from_model(library_metadata, mock_data_service, mock_datase assert any(var["name"] == "APID" for var in variables) -def test_ap_domain_from_model(library_metadata, mock_data_service, mock_datasets): +def test_ap_domain_from_model(library_metadata, mock_data_service): """Test AP domain excludes USUBJID and includes APID.""" mock_dataframe = Mock() dataset_metadata = SDTMDatasetMetadata(name="APDM", first_record={"APID": "001"}) variables = get_variables_metadata_from_standard_model( dataframe=mock_dataframe, - datasets=mock_datasets, data_service=mock_data_service, library_metadata=library_metadata, dataset_metadata=dataset_metadata, @@ -210,7 +196,7 @@ def test_ap_domain_from_model(library_metadata, mock_data_service, mock_datasets assert any(var["name"] == "DMDY" for var in variables) -def test_custom_domain_events_class(library_metadata, mock_data_service, mock_datasets): +def test_custom_domain_events_class(library_metadata, mock_data_service): """Test custom domain detection and variable metadata retrieval for EVENTS class.""" dataset_metadata = SDTMDatasetMetadata(name="ZZ", first_record={"DOMAIN": "ZZ"}) mock_data_service.handle_custom_domains = Mock(return_value="EVENTS") @@ -218,7 +204,6 @@ def test_custom_domain_events_class(library_metadata, mock_data_service, mock_da library_metadata, mock_data_service, dataset_metadata, - mock_datasets, ) mock_data_service.handle_custom_domains.assert_called_once() assert any(var["name"] == "STUDYID" for var in variables) @@ -227,9 +212,7 @@ def test_custom_domain_events_class(library_metadata, mock_data_service, mock_da assert any(var["name"] == "ZZSEQ" for var in variables) -def test_custom_domain_findings_class( - library_metadata, mock_data_service, mock_datasets -): +def test_custom_domain_findings_class(library_metadata, mock_data_service): """Test custom domain detection and variable metadata retrieval for FINDINGS class.""" dataset_metadata = SDTMDatasetMetadata(name="XX", first_record={"DOMAIN": "XX"}) mock_data_service.handle_custom_domains = Mock(return_value="FINDINGS") @@ -237,7 +220,6 @@ def test_custom_domain_findings_class( library_metadata, mock_data_service, dataset_metadata, - mock_datasets, ) mock_data_service.handle_custom_domains.assert_called_once() assert any(var["name"] == "STUDYID" for var in variables) From befd6b9e28e6d9a1b783f0c42ea5771731fd610d Mon Sep 17 00:00:00 2001 From: Gerry Campion Date: Thu, 16 Apr 2026 22:38:18 -0400 Subject: [PATCH 14/18] fix merged test code --- .../test_variables_metadata_dataset_builder.py | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/tests/unit/test_dataset_builders/test_variables_metadata_dataset_builder.py b/tests/unit/test_dataset_builders/test_variables_metadata_dataset_builder.py index cb427d59d..058b12268 100644 --- a/tests/unit/test_dataset_builders/test_variables_metadata_dataset_builder.py +++ b/tests/unit/test_dataset_builders/test_variables_metadata_dataset_builder.py @@ -55,8 +55,6 @@ def test_variables_metadata_without_max_size(mock_get_vars, mock_get_ds): cache_service=InMemoryCacheService(), rule_processor=MagicMock(), data_processor=None, - dataset_path="/test/ae.xpt", - datasets=[], dataset_metadata=MagicMock(), define_xml_path=None, standard="sdtmig", @@ -116,8 +114,6 @@ def test_variables_metadata_with_max_size_in_operations(mock_get_vars, mock_get_ cache_service=InMemoryCacheService(), rule_processor=MagicMock(), data_processor=None, - dataset_path="/test/ae.xpt", - datasets=[], dataset_metadata=MagicMock(), define_xml_path=None, standard="sdtmig", @@ -182,8 +178,6 @@ def test_variables_metadata_with_max_size_in_output_variables( cache_service=InMemoryCacheService(), rule_processor=MagicMock(), data_processor=None, - dataset_path="/test/ae.xpt", - datasets=[], dataset_metadata=MagicMock(), define_xml_path=None, standard="sdtmig", @@ -244,8 +238,6 @@ def test_variables_metadata_with_max_size_in_conditions_dict( cache_service=InMemoryCacheService(), rule_processor=MagicMock(), data_processor=None, - dataset_path="/test/ae.xpt", - datasets=[], dataset_metadata=MagicMock(), define_xml_path=None, standard="sdtmig", @@ -303,8 +295,6 @@ def test_variables_metadata_handles_nulls(mock_get_vars, mock_get_ds): cache_service=InMemoryCacheService(), rule_processor=MagicMock(), data_processor=None, - dataset_path="/test/ae.xpt", - datasets=[], dataset_metadata=MagicMock(), define_xml_path=None, standard="sdtmig", @@ -365,8 +355,6 @@ def test_variables_metadata_handles_missing_columns(mock_get_vars, mock_get_ds): cache_service=InMemoryCacheService(), rule_processor=MagicMock(), data_processor=None, - dataset_path="/test/ae.xpt", - datasets=[], dataset_metadata=MagicMock(), define_xml_path=None, standard="sdtmig", From d2b13d9ea23208f094d96eaf4ab5ceca43fe443f Mon Sep 17 00:00:00 2001 From: Gerry Campion Date: Fri, 17 Apr 2026 13:30:06 -0400 Subject: [PATCH 15/18] refactor operation params --- cdisc_rules_engine/models/operation_params.py | 6 ++- .../operations/base_operation.py | 16 ++++-- .../operations/operations_factory.py | 4 +- .../utilities/rule_processor.py | 13 +++-- cdisc_rules_engine/utilities/utils.py | 51 ++++--------------- tests/conftest.py | 2 - .../test_define_variable_metadata.py | 9 +++- .../test_operations/test_get_xhtml_errors.py | 2 +- .../test_operations_factory.py | 2 +- .../test_parent_library_model_column_order.py | 2 - .../test_related_domain_is_custom.py | 2 +- .../test_operations/test_variable_count.py | 2 - .../test_operations/test_variable_names.py | 9 ---- .../test_variable_value_count.py | 2 - .../test_utilities/test_rule_processor.py | 4 -- 15 files changed, 42 insertions(+), 84 deletions(-) diff --git a/cdisc_rules_engine/models/operation_params.py b/cdisc_rules_engine/models/operation_params.py index 6ba08542c..df77c0501 100644 --- a/cdisc_rules_engine/models/operation_params.py +++ b/cdisc_rules_engine/models/operation_params.py @@ -6,6 +6,8 @@ import pandas as pd +from cdisc_rules_engine.models.sdtm_dataset_metadata import SDTMDatasetMetadata + @dataclass class OperationParams: @@ -17,9 +19,7 @@ class OperationParams: # Required parameters (no defaults) first core_id: str dataframe: pd.DataFrame - dataset_path: str domain: str - directory_path: str operation_id: str operation_name: str standard: str @@ -36,11 +36,13 @@ class OperationParams: ct_package_types: List[str] = None ct_version: str = None ct_package_type: str = None + dataframe_metadata: SDTMDatasetMetadata = None domain_class: str = None term_code: str = None term_value: str = None term_pref_term: str = None dictionary_term_type: str = None + evaluation_dataset_metadata: SDTMDatasetMetadata = None external_dictionaries: ExternalDictionariesContainer = None external_dictionary_term_variable: str = None external_dictionary_type: str = None diff --git a/cdisc_rules_engine/operations/base_operation.py b/cdisc_rules_engine/operations/base_operation.py index 97508ca30..be03eda09 100644 --- a/cdisc_rules_engine/operations/base_operation.py +++ b/cdisc_rules_engine/operations/base_operation.py @@ -1,4 +1,4 @@ -import os +from os.path import dirname, exists, join from cdisc_rules_engine.constants.define_xml_constants import DEFINE_XML_FILE_NAME from cdisc_rules_engine.models.operation_params import OperationParams @@ -42,7 +42,7 @@ class BaseOperation: def __init__( self, params: OperationParams, - original_dataset: DatasetInterface, + evaluation_dataset: DatasetInterface, cache_service: CacheServiceInterface, data_service: DataServiceInterface, library_metadata: LibraryMetadataContainer = LibraryMetadataContainer(), @@ -50,7 +50,7 @@ def __init__( self.params = params self.cache = cache_service self.data_service = data_service - self.evaluation_dataset = original_dataset + self.evaluation_dataset = evaluation_dataset self.library_metadata = library_metadata @abstractmethod @@ -304,9 +304,15 @@ def _get_define_contents(self): define_path = ( self.params.define_xml_path if self.params.define_xml_path - else os.path.join(self.params.directory_path, DEFINE_XML_FILE_NAME) + else join( + dirname( + self.params.evaluation_dataset_metadata.original_path + or self.params.evaluation_dataset_metadata.full_path + ), + DEFINE_XML_FILE_NAME, + ) ) - if not os.path.exists(define_path): + if not exists(define_path): raise FileNotFoundError(f"Define XML file {define_path} not found") define_contents = self.data_service.get_define_xml_contents( dataset_name=define_path diff --git a/cdisc_rules_engine/operations/operations_factory.py b/cdisc_rules_engine/operations/operations_factory.py index 1c01a086a..37aee00cd 100644 --- a/cdisc_rules_engine/operations/operations_factory.py +++ b/cdisc_rules_engine/operations/operations_factory.py @@ -163,7 +163,7 @@ def get_service( """Get instance of operation that matches operation specified in params""" required_args = { "operation_params", - "original_dataset", + "evaluation_dataset", "cache", "data_service", "library_metadata", @@ -176,7 +176,7 @@ def get_service( if name in self._operations_map: return self._operations_map.get(name)( kwargs.get("operation_params"), - kwargs.get("original_dataset"), + kwargs.get("evaluation_dataset"), kwargs.get("cache"), kwargs.get("data_service"), kwargs.get("library_metadata"), diff --git a/cdisc_rules_engine/utilities/rule_processor.py b/cdisc_rules_engine/utilities/rule_processor.py index 71fccd2a0..da08cc536 100644 --- a/cdisc_rules_engine/utilities/rule_processor.py +++ b/cdisc_rules_engine/utilities/rule_processor.py @@ -1,7 +1,6 @@ import re import copy -from os.path import dirname from typing import List, Optional, Union, Tuple from cdisc_rules_engine.enums.rule_types import RuleTypes from cdisc_rules_engine.interfaces.cache_service_interface import ( @@ -366,12 +365,12 @@ def perform_rule_operations( ct_version=operation.get("version"), define_xml_path=kwargs.get("define_xml_path"), dataframe=dataset_copy, - dataset_path=dataset_metadata.full_path, + dataframe_metadata=dataset_metadata, delimiter=operation.get("delimiter"), dictionary_term_type=operation.get("dictionary_term_type"), - directory_path=dirname(dataset_metadata.full_path), domain=domain, domain_class=operation.get("domain_class"), + evaluation_dataset_metadata=dataset_metadata, external_dictionaries=external_dictionaries, external_dictionary_term_variable=operation.get( "external_dictionary_term_variable" @@ -427,7 +426,7 @@ def perform_rule_operations( def _execute_operation( self, operation_params: OperationParams, - dataset: DatasetInterface, + evaluation_dataset: DatasetInterface, previous_operations: List[str] = [], ): """ @@ -438,12 +437,11 @@ def _execute_operation( # check cache cache_key = get_operations_cache_key( core_id=operation_params.core_id, - directory_path=operation_params.directory_path, operation_name=operation_params.operation_name, + evaluation_dataset_name=operation_params.evaluation_dataset_metadata.name, domain=operation_params.domain, grouping=";".join(operation_params.grouping), target_variable=operation_params.target, - dataset_path=operation_params.dataset_path, operation_id=operation_params.operation_id, ) if previous_operations: @@ -477,12 +475,13 @@ def _execute_operation( operation_params.dataframe = self.data_service.get_dataset( dataset_name=dataset_metadata.name ) + operation_params.dataframe_metadata = dataset_metadata # call the operation operation = operations_factory.get_service( operation_params.operation_name, operation_params=operation_params, - original_dataset=dataset, + evaluation_dataset=evaluation_dataset, cache=self.cache, data_service=self.data_service, library_metadata=self.library_metadata, diff --git a/cdisc_rules_engine/utilities/utils.py b/cdisc_rules_engine/utilities/utils.py index 16c3cda1a..8ee1c1e75 100644 --- a/cdisc_rules_engine/utilities/utils.py +++ b/cdisc_rules_engine/utilities/utils.py @@ -129,47 +129,9 @@ def is_valid_iso_date(date_to_validate: str) -> bool: return is_valid -def get_dataset_path( - study_id: str, data_bundle_id: str = None, filename: str = None -) -> str: - """ - Returns a path to dataset in the blob storage. - """ - path: str = study_id - if data_bundle_id: - path = os.path.join(path, data_bundle_id) - if filename: - path = os.path.join(path, filename) - return path - - DATASET_CACHE_KEY_TEMPLATE: str = "{dataset_path}_{dataset_type}" -def get_dataset_cache_key_from_study( - study_id: str, - data_bundle_id: str = None, - filename: str = None, - dataset_type: str = None, -) -> str: - """ - Creates a cache key for a dataset. - Usually, template of a dataset cache key is {dataset_path}_{dataset_type}. - Ex.: CDISC01/test/ae.xpt_contents or CDISC01/test/ae.xpt_metadata. - So, the function also builds the path. - - If dataset_type parameter is not passed, the returned key - can be used to clean several values with matching key pattern. - dataset_type param can be: contents, metadata, variables_metadata. - """ - dataset_path: str = get_dataset_path(study_id, data_bundle_id, filename) - if dataset_type: - dataset_path = DATASET_CACHE_KEY_TEMPLATE.format( - dataset_path=dataset_path, dataset_type=dataset_type - ) - return dataset_path - - def get_dataset_cache_key_from_path(dataset_path: str, dataset_type: str) -> str: return DATASET_CACHE_KEY_TEMPLATE.format( dataset_path=dataset_path, dataset_type=dataset_type @@ -246,19 +208,24 @@ def replace_pattern_in_list_of_strings( def get_operations_cache_key( core_id: str, - directory_path: str, operation_id: str, domain: str = None, operation_name: str = None, + evaluation_dataset_name: str = None, grouping: str = None, target_variable: str = None, - dataset_path: str = None, ) -> str: """ Creates the cache key for operations. """ - key = f"operations/{core_id}/{directory_path}/{operation_id}" - optional_items = [domain, operation_name, grouping, target_variable, dataset_path] + key = f"operations/{core_id}/{operation_id}" + optional_items = [ + domain, + operation_name, + evaluation_dataset_name, + grouping, + target_variable, + ] for item in optional_items: if item: key = f"{key}/{item}" diff --git a/tests/conftest.py b/tests/conftest.py index 013b3f381..861b197d0 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1276,8 +1276,6 @@ def operation_params() -> OperationParams: dataframe=PandasDataset.from_dict({}), target="target", domain="domain", - dataset_path="dataset_path", - directory_path="directory_path", standard="standard", standard_version="standard_version", external_dictionaries=ExternalDictionariesContainer( diff --git a/tests/unit/test_operations/test_define_variable_metadata.py b/tests/unit/test_operations/test_define_variable_metadata.py index 4ffb11a83..dd3dab36a 100644 --- a/tests/unit/test_operations/test_define_variable_metadata.py +++ b/tests/unit/test_operations/test_define_variable_metadata.py @@ -1,6 +1,7 @@ from cdisc_rules_engine.config.config import ConfigService from pathlib import Path from cdisc_rules_engine.models.dataset.dask_dataset import DaskDataset +from cdisc_rules_engine.models.sdtm_dataset_metadata import SDTMDatasetMetadata from cdisc_rules_engine.operations.define_variable_metadata import ( DefineVariableMetadata, ) @@ -23,7 +24,9 @@ def test_get_define_variable_metadata_variable_in_domain( cache = CacheServiceFactory(config).get_cache_service() data_service = DataServiceFactory(config, cache).get_data_service() resources_path: Path = Path(__file__).parent.parent.parent.joinpath("resources") - operation_params.directory_path = str(resources_path) + operation_params.evaluation_dataset_metadata = SDTMDatasetMetadata( + full_path=Path(resources_path, "ae.xpt") + ) operation_params.domain = "AE" operation_params.target = "--SER" operation_params.attribute_name = "define_variable_ccode" @@ -44,7 +47,9 @@ def test_get_define_variable_metadata_variable_not_in_domain( cache = CacheServiceFactory(config).get_cache_service() data_service = DataServiceFactory(config, cache).get_data_service() resources_path: Path = Path(__file__).parent.parent.parent.joinpath("resources") - operation_params.directory_path = str(resources_path) + operation_params.evaluation_dataset_metadata = SDTMDatasetMetadata( + full_path=Path(resources_path, "ae.xpt") + ) operation_params.domain = "AE" operation_params.target = "VERYFAKEVARIABLE" operation_params.attribute_name = "define_variable_ccode" diff --git a/tests/unit/test_operations/test_get_xhtml_errors.py b/tests/unit/test_operations/test_get_xhtml_errors.py index 04bf0392e..71e196132 100644 --- a/tests/unit/test_operations/test_get_xhtml_errors.py +++ b/tests/unit/test_operations/test_get_xhtml_errors.py @@ -89,7 +89,7 @@ def test_get_xhtml_errors( operation_params.namespace = namespace operation = GetXhtmlErrors( params=operation_params, - original_dataset=dataset, + evaluation_dataset=dataset, cache_service=MagicMock(), data_service=MagicMock(), ) diff --git a/tests/unit/test_operations/test_operations_factory.py b/tests/unit/test_operations/test_operations_factory.py index 0889aff9c..8a9352d69 100644 --- a/tests/unit/test_operations/test_operations_factory.py +++ b/tests/unit/test_operations/test_operations_factory.py @@ -29,7 +29,7 @@ def _execute_operation(self): operation_params=operation_params, cache=cache, data_service=data_service, - original_dataset=pd.DataFrame(), + evaluation_dataset=pd.DataFrame(), library_metadata=LibraryMetadataContainer(), ) assert isinstance(op, DummyOperation) diff --git a/tests/unit/test_operations/test_parent_library_model_column_order.py b/tests/unit/test_operations/test_parent_library_model_column_order.py index 95a4a8900..43bca1ece 100644 --- a/tests/unit/test_operations/test_parent_library_model_column_order.py +++ b/tests/unit/test_operations/test_parent_library_model_column_order.py @@ -140,7 +140,6 @@ def test_get_parent_column_order_from_library( operation_params.domain = "SUPPAE" operation_params.standard = "sdtmig" operation_params.standard_version = "3-4" - operation_params.dataset_path = "suppae.xpt" # save model metadata to cache cache = InMemoryCacheService.get_instance() @@ -351,7 +350,6 @@ def test_get_parent_findings_class_column_order_from_library( operation_params.domain = "SUPPAE" operation_params.standard = "sdtmig" operation_params.standard_version = "3-4" - operation_params.dataset_path = "suppae.xpt" # save model metadata to cache cache = InMemoryCacheService.get_instance() diff --git a/tests/unit/test_operations/test_related_domain_is_custom.py b/tests/unit/test_operations/test_related_domain_is_custom.py index 897075568..ffbe71871 100644 --- a/tests/unit/test_operations/test_related_domain_is_custom.py +++ b/tests/unit/test_operations/test_related_domain_is_custom.py @@ -81,7 +81,7 @@ def test_related_domain_is_custom( op = RelatedDomainIsCustom( params=params, library_metadata=library_metadata, - original_dataset=None, + evaluation_dataset=None, cache_service=None, data_service=data_service, ) diff --git a/tests/unit/test_operations/test_variable_count.py b/tests/unit/test_operations/test_variable_count.py index 9f64f0b5f..da62fe949 100644 --- a/tests/unit/test_operations/test_variable_count.py +++ b/tests/unit/test_operations/test_variable_count.py @@ -23,7 +23,6 @@ def test_variable_count( ): config = ConfigService() cache = CacheServiceFactory(config).get_cache_service() - dataset_path = os.path.join("study", "bundle", "blah") datasets_map = { "AE": dataset_type.from_dict( {"STUDYID": [4, 7, 9], "AESEQ": [1, 2, 3], "DOMAIN": [12, 6, 1]} @@ -76,7 +75,6 @@ def test_variable_count( mock_data_service.get_datasets = lambda: datasets operation_params.target = target operation_params.original_target = target - operation_params.dataset_path = dataset_path result = VariableCount( operation_params, datasets_map["AE"], cache, mock_data_service ).execute() diff --git a/tests/unit/test_operations/test_variable_names.py b/tests/unit/test_operations/test_variable_names.py index 9daa296da..e8401f969 100644 --- a/tests/unit/test_operations/test_variable_names.py +++ b/tests/unit/test_operations/test_variable_names.py @@ -44,18 +44,11 @@ def test_get_variable_names_for_given_standard( } }, ) - dataset_path = "study/bundle/blah" datasets_map = { "AE": dataset_type.from_dict({"STUDYID": [4, 7, 9], "DOMAIN": [12, 6, 1]}), "EX": dataset_type.from_dict({"STUDYID": [4, 8, 12], "DOMAIN": [12, 6, 1]}), "AE2": dataset_type.from_dict({"STUDYID": [4, 7, 9], "DOMAIN": [12, 6, 1]}), } - - datasets = [ - {"domain": "AE", "filename": "AE"}, - {"domain": "EX", "filename": "EX"}, - {"domain": "AE", "filename": "AE2"}, - ] mock_data_service.get_dataset.side_effect = lambda name: datasets_map.get( name.split("/")[-1] ) @@ -63,8 +56,6 @@ def test_get_variable_names_for_given_standard( [func(f) for f in files] ) operation_params.target = target - operation_params.datasets = datasets - operation_params.dataset_path = dataset_path operation_params.standard = standard operation_params.standard_version = standard_version result = VariableNames( diff --git a/tests/unit/test_operations/test_variable_value_count.py b/tests/unit/test_operations/test_variable_value_count.py index 5cc2a8ce0..8f48f4ce1 100644 --- a/tests/unit/test_operations/test_variable_value_count.py +++ b/tests/unit/test_operations/test_variable_value_count.py @@ -28,7 +28,6 @@ def test_variable_value_count( ): config = ConfigService() cache = CacheServiceFactory(config).get_cache_service() - dataset_path = os.path.join("study", "bundle", "blah") datasets_map = { "AE": dataset_type.from_dict( {"STUDYID": [4, 7, 9], "AESEQ": [1, 2, 3], "DOMAIN": [12, 6, 1]} @@ -73,7 +72,6 @@ def test_variable_value_count( ) mock_data_service.get_datasets = lambda: datasets operation_params.original_target = target - operation_params.dataset_path = dataset_path result = VariableValueCount( operation_params, datasets_map["AE"], diff --git a/tests/unit/test_utilities/test_rule_processor.py b/tests/unit/test_utilities/test_rule_processor.py index bd089c448..e0f0ed23a 100644 --- a/tests/unit/test_utilities/test_rule_processor.py +++ b/tests/unit/test_utilities/test_rule_processor.py @@ -775,8 +775,6 @@ def test_preprocess_operation_params_wildcard_replacement(mock_data_service): target="--SEQ", original_target="--SEQ", domain="AE", - dataset_path="test/ae.xpt", - directory_path="test/", standard="sdtmig", standard_version="3-4", grouping=["--SEQ", "--DTC", "USUBJID"], @@ -807,8 +805,6 @@ def test_preprocess_operation_params_supp_domain_uses_rdomain(mock_data_service) target="--SEQ", original_target="--SEQ", domain=None, - dataset_path="test/suppae.xpt", - directory_path="test/", standard="sdtmig", standard_version="3-4", ) From a49479454384f09f21e8f1c32f80c506e9e520f2 Mon Sep 17 00:00:00 2001 From: Gerry Campion Date: Mon, 20 Apr 2026 22:32:26 -0400 Subject: [PATCH 16/18] removed unneeded self.params.domain from operations --- cdisc_rules_engine/operations/extract_metadata.py | 5 +++-- cdisc_rules_engine/operations/variable_exists.py | 4 +--- cdisc_rules_engine/operations/variable_is_null.py | 2 +- tests/unit/test_operations/test_extract_metadata.py | 6 ++++++ 4 files changed, 11 insertions(+), 6 deletions(-) diff --git a/cdisc_rules_engine/operations/extract_metadata.py b/cdisc_rules_engine/operations/extract_metadata.py index 225c88b1f..876952f1f 100644 --- a/cdisc_rules_engine/operations/extract_metadata.py +++ b/cdisc_rules_engine/operations/extract_metadata.py @@ -1,13 +1,14 @@ import pandas as pd +from cdisc_rules_engine.models.dataset import DatasetInterface from cdisc_rules_engine.operations.base_operation import BaseOperation class ExtractMetadata(BaseOperation): def _execute_operation(self): # get metadata - metadata: pd.DataFrame = self.data_service.get_dataset_metadata( - dataset_name=self.params.domain + metadata: DatasetInterface = self.data_service.get_dataset_metadata( + dataset_name=self.params.dataframe_metadata.name ) # extract target value. Metadata df always has one row diff --git a/cdisc_rules_engine/operations/variable_exists.py b/cdisc_rules_engine/operations/variable_exists.py index 7c503068c..e0e047b81 100644 --- a/cdisc_rules_engine/operations/variable_exists.py +++ b/cdisc_rules_engine/operations/variable_exists.py @@ -3,6 +3,4 @@ class VariableExists(BaseOperation): def _execute_operation(self): - # get metadata - dataframe = self.data_service.get_dataset(dataset_name=self.params.domain) - return self.params.target in dataframe + return self.params.target in self.params.dataframe diff --git a/cdisc_rules_engine/operations/variable_is_null.py b/cdisc_rules_engine/operations/variable_is_null.py index 89a79707e..f6821f317 100644 --- a/cdisc_rules_engine/operations/variable_is_null.py +++ b/cdisc_rules_engine/operations/variable_is_null.py @@ -6,7 +6,7 @@ def _execute_operation(self): if self.params.source == "submission": if self.params.level == "row": raise ValueError("level: row may only be used with source: evaluation") - dataframe = self.data_service.get_dataset(dataset_name=self.params.domain) + dataframe = self.params.dataframe else: dataframe = self.evaluation_dataset diff --git a/tests/unit/test_operations/test_extract_metadata.py b/tests/unit/test_operations/test_extract_metadata.py index c8efd7d52..4bc5afe7c 100644 --- a/tests/unit/test_operations/test_extract_metadata.py +++ b/tests/unit/test_operations/test_extract_metadata.py @@ -32,6 +32,7 @@ def test_extract_metadata_get_dataset_name( ], } ) + operation_params.dataframe_metadata = SDTMDatasetMetadata(name="AE") operation_params.target = "dataset_name" cache = InMemoryCacheService.get_instance() # execute operation @@ -100,6 +101,9 @@ def test_extract_metadata_domain_suffix( operation_params.dataframe = dataset_type.from_dict( {"STUDYID": ["TEST_STUDY"], "DOMAIN": [domain_value]} ) + operation_params.dataframe_metadata = SDTMDatasetMetadata( + name=dataset_name, first_record=first_record + ) operation_params.target = "ap_suffix" cache = InMemoryCacheService.get_instance() operation = ExtractMetadata( @@ -122,6 +126,7 @@ def test_extract_metadata_domain_suffix_uses_domain( operation_params.dataframe = dataset_type.from_dict( {"STUDYID": ["TEST_STUDY"], "DOMAIN": ["APXX"]} ) + operation_params.dataframe_metadata = SDTMDatasetMetadata(name="APFA") operation_params.target = "ap_suffix" cache = InMemoryCacheService.get_instance() operation = ExtractMetadata( @@ -140,6 +145,7 @@ def test_extract_metadata_domain_suffix_empty_metadata( operation_params.dataframe = dataset_type.from_dict( {"STUDYID": ["TEST_STUDY"], "DOMAIN": ["APFA"]} ) + operation_params.dataframe_metadata = SDTMDatasetMetadata(name="APFA") operation_params.target = "ap_suffix" cache = InMemoryCacheService.get_instance() operation = ExtractMetadata( From a61a74d2f85fe990ce23fe752c1b9228b32666e1 Mon Sep 17 00:00:00 2001 From: Gerry Campion Date: Tue, 21 Apr 2026 18:53:37 -0400 Subject: [PATCH 17/18] more fixes for operations dataset metadata source --- .../operations/base_operation.py | 20 ++++++------- .../get_model_filtered_variables.py | 4 +-- .../operations/library_model_column_order.py | 4 ++- .../parent_library_model_column_order.py | 7 ++++- .../utilities/rule_processor.py | 2 +- tests/conftest.py | 4 ++- .../test_expected_variables.py | 28 ++++++++----------- .../test_get_dataset_filtered_variables.py | 28 +++---------------- .../test_get_model_filtered_variables.py | 10 +------ ...test_label_referenced_variable_metadata.py | 15 ++++------ .../test_library_column_order.py | 14 ++++------ .../test_library_model_column_order.py | 21 ++++---------- .../test_name_referenced_variable_metadata.py | 14 ++++------ .../test_parent_library_model_column_order.py | 10 ++++--- .../test_permissible_variables.py | 14 ++++------ .../test_required_variables.py | 14 ++++------ 16 files changed, 80 insertions(+), 129 deletions(-) diff --git a/cdisc_rules_engine/operations/base_operation.py b/cdisc_rules_engine/operations/base_operation.py index be03eda09..633726f69 100644 --- a/cdisc_rules_engine/operations/base_operation.py +++ b/cdisc_rules_engine/operations/base_operation.py @@ -229,23 +229,25 @@ def _get_variables_metadata_from_standard(self) -> List[dict]: return sdtm_utilities.get_variables_metadata_from_standard( library_metadata=self.library_metadata, data_service=self.data_service, - dataset_metadata=self.data_service.get_raw_dataset_metadata( - dataset_name=self.params.domain - ), + dataset_metadata=self.params.dataframe_metadata, ) - def _get_variable_names_list(self, domain, dataframe): + def _get_variable_names_list(self, dataset_metadata, dataframe): # get variables metadata from the standard model variables_metadata: List[dict] = ( - self._get_variables_metadata_from_standard_model(dataframe) + self._get_variables_metadata_from_standard_model( + dataset_metadata, dataframe + ) ) # create a list of variable names in accordance to the "ordinal" key variable_names_list = self._replace_variable_wildcards( - variables_metadata, domain + variables_metadata, dataset_metadata.wildcard_replacement ) return list(OrderedDict.fromkeys(variable_names_list)) - def _get_variables_metadata_from_standard_model(self, dataframe) -> List[dict]: + def _get_variables_metadata_from_standard_model( + self, dataset_metadata, dataframe + ) -> List[dict]: """ Gets variables metadata for the given class and domain from cache. The cache stores CDISC Library metadata. @@ -275,9 +277,7 @@ def _get_variables_metadata_from_standard_model(self, dataframe) -> List[dict]: dataframe=dataframe, data_service=self.data_service, library_metadata=self.library_metadata, - dataset_metadata=self.data_service.get_raw_dataset_metadata( - dataset_name=self.params.domain - ), + dataset_metadata=dataset_metadata, ) @staticmethod diff --git a/cdisc_rules_engine/operations/get_model_filtered_variables.py b/cdisc_rules_engine/operations/get_model_filtered_variables.py index faff0daca..a36c0ffc3 100644 --- a/cdisc_rules_engine/operations/get_model_filtered_variables.py +++ b/cdisc_rules_engine/operations/get_model_filtered_variables.py @@ -23,10 +23,10 @@ def _get_model_filtered_variables(self): key = self.params.key_name val = self.params.key_value model_variables: List[dict] = self._get_variables_metadata_from_standard_model( - self.params.dataframe + self.params.dataframe_metadata, self.params.dataframe ) filtered_model = [var for var in model_variables if var.get(key) == val] variable_names_list = self._replace_variable_wildcards( - filtered_model, self.params.domain + filtered_model, self.params.dataframe_metadata.wildcard_replacement ) return variable_names_list diff --git a/cdisc_rules_engine/operations/library_model_column_order.py b/cdisc_rules_engine/operations/library_model_column_order.py index d60fd58ed..91ccbaf97 100644 --- a/cdisc_rules_engine/operations/library_model_column_order.py +++ b/cdisc_rules_engine/operations/library_model_column_order.py @@ -16,4 +16,6 @@ def _execute_operation(self): The lists with column names are sorted in accordance to "ordinal" key of library metadata. """ - return self._get_variable_names_list(self.params.domain, self.params.dataframe) + return self._get_variable_names_list( + self.params.dataframe_metadata, self.params.dataframe + ) diff --git a/cdisc_rules_engine/operations/parent_library_model_column_order.py b/cdisc_rules_engine/operations/parent_library_model_column_order.py index 0549bd64f..f1b73b47a 100644 --- a/cdisc_rules_engine/operations/parent_library_model_column_order.py +++ b/cdisc_rules_engine/operations/parent_library_model_column_order.py @@ -50,4 +50,9 @@ def _get_parent_variable_names_list(self, domain_to_datasets: dict, rdomain: str parent_dataframe = self.data_service.get_dataset( dataset_name=parent_datasets[0].name ) - return self._get_variable_names_list(rdomain, parent_dataframe) + parent_dataframe_metadata = self.data_service.get_raw_dataset_metadata( + dataset_name=parent_datasets[0].name + ) + return self._get_variable_names_list( + parent_dataframe_metadata, parent_dataframe + ) diff --git a/cdisc_rules_engine/utilities/rule_processor.py b/cdisc_rules_engine/utilities/rule_processor.py index da08cc536..1395947b2 100644 --- a/cdisc_rules_engine/utilities/rule_processor.py +++ b/cdisc_rules_engine/utilities/rule_processor.py @@ -336,7 +336,7 @@ def perform_rule_operations( # change -- pattern to domain name original_target: str = operation.get("name") target: str = original_target - domain: str = operation.get("domain", dataset_metadata.name) + domain: str = operation.get("domain", dataset_metadata.unsplit_name) wildcard_replacement: str = operation.get( "domain", dataset_metadata.wildcard_replacement ) diff --git a/tests/conftest.py b/tests/conftest.py index 861b197d0..c2f4c1d8f 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1251,7 +1251,9 @@ def installed_meddra_dictionaries(request) -> dict: """ cache_service = InMemoryCacheService.get_instance() # install dictionaries and save to cache - local_data_service = LocalDataService.get_instance(cache_service=cache_service) + local_data_service = LocalDataService.get_instance( + config=ConfigService(), cache_service=cache_service + ) factory = MedDRATermsFactory(local_data_service) terms: dict = factory.install_terms(meddra_path) diff --git a/tests/unit/test_operations/test_expected_variables.py b/tests/unit/test_operations/test_expected_variables.py index 7d547a980..a3bdd53b1 100644 --- a/tests/unit/test_operations/test_expected_variables.py +++ b/tests/unit/test_operations/test_expected_variables.py @@ -6,7 +6,7 @@ ) import pandas as pd import pytest -from unittest.mock import Mock, patch +from unittest.mock import Mock from cdisc_rules_engine.constants.classes import GENERAL_OBSERVATIONS_CLASS from cdisc_rules_engine.enums.variable_roles import VariableRoles @@ -175,6 +175,9 @@ def test_get_expected_variables(operation_params: OperationParams, dataset_type) operation_params.domain = "AE" operation_params.standard = "sdtmig" operation_params.standard_version = "3-4" + operation_params.dataframe_metadata = SDTMDatasetMetadata( + first_record={"DOMAIN": "AE"} + ) # save model metadata to cache cache = InMemoryCacheService.get_instance() @@ -189,21 +192,14 @@ def test_get_expected_variables(operation_params: OperationParams, dataset_type) ) data_service.get_dataset_class = Mock(return_value=mock_dataset_class) - def mock_cached_method(*args, **kwargs): - return SDTMDatasetMetadata(first_record={"DOMAIN": "AE"}) - - with patch( - "cdisc_rules_engine.services.data_services.LocalDataService.get_raw_dataset_metadata", - side_effect=mock_cached_method, - ): - operation = ExpectedVariables( - operation_params, - operation_params.dataframe, - cache, - data_service, - library_metadata, - ) - result = operation.execute() + operation = ExpectedVariables( + operation_params, + operation_params.dataframe, + cache, + data_service, + library_metadata, + ) + result = operation.execute() variables = ["STUDYID", "DOMAIN", "AENEW", "TIMING_VAR"] expected = pd.Series( diff --git a/tests/unit/test_operations/test_get_dataset_filtered_variables.py b/tests/unit/test_operations/test_get_dataset_filtered_variables.py index dff836b6c..de018a965 100644 --- a/tests/unit/test_operations/test_get_dataset_filtered_variables.py +++ b/tests/unit/test_operations/test_get_dataset_filtered_variables.py @@ -503,7 +503,7 @@ def test_get_dataset_filtered_variables( operation_params.standard_version = "3-4" operation_params.key_name = key_name operation_params.key_value = key_value - operation_params.datasets = [SDTMDatasetMetadata(**dataset_metadata)] + operation_params.dataframe_metadata = SDTMDatasetMetadata(**dataset_metadata) cache = InMemoryCacheService.get_instance() library_metadata = LibraryMetadataContainer( @@ -525,11 +525,6 @@ def test_get_dataset_filtered_variables( else FINDINGS_ABOUT ) - def mock_get_raw_metadata(*args, **kwargs): - return SDTMDatasetMetadata(**dataset_metadata) - - data_service.get_raw_dataset_metadata = mock_get_raw_metadata - with patch.object( LocalDataService, "get_dataset_class", return_value=expected_class ): @@ -569,7 +564,7 @@ def test_get_dataset_filtered_variables_dask( operation_params.standard_version = "3-4" operation_params.key_name = "role" operation_params.key_value = "Timing" - operation_params.datasets = [SDTMDatasetMetadata(name="AE")] + operation_params.dataframe_metadata = SDTMDatasetMetadata(name="AE") model_metadata = { "datasets": [ @@ -685,11 +680,6 @@ def test_get_dataset_filtered_variables_dask( library_metadata=library_metadata, ) - def mock_get_raw_metadata(*args, **kwargs): - return SDTMDatasetMetadata(name="AE") - - data_service.get_raw_dataset_metadata = mock_get_raw_metadata - with patch.object(LocalDataService, "get_dataset_class", return_value=EVENTS): operation = GetDatasetFilteredVariables( operation_params, @@ -716,7 +706,7 @@ def test_get_dataset_filtered_variables_empty_dataset( operation_params.domain = "AE" operation_params.key_name = "role" operation_params.key_value = "Timing" - operation_params.datasets = [SDTMDatasetMetadata(name="AE")] + operation_params.dataframe_metadata = SDTMDatasetMetadata(name="AE") model_metadata = { "datasets": [ @@ -792,11 +782,6 @@ def test_get_dataset_filtered_variables_empty_dataset( library_metadata=library_metadata, ) - def mock_get_raw_metadata(*args, **kwargs): - return SDTMDatasetMetadata(name="AE") - - data_service.get_raw_dataset_metadata = mock_get_raw_metadata - with patch.object(LocalDataService, "get_dataset_class", return_value=EVENTS): operation = GetDatasetFilteredVariables( operation_params, @@ -822,7 +807,7 @@ def test_get_dataset_filtered_variables_invalid_key(operation_params: OperationP operation_params.domain = "AE" operation_params.key_name = "invalid_key" operation_params.key_value = "SomeValue" - operation_params.datasets = [SDTMDatasetMetadata(name="AE")] + operation_params.dataframe_metadata = SDTMDatasetMetadata(name="AE") model_metadata = { "datasets": [ @@ -904,11 +889,6 @@ def test_get_dataset_filtered_variables_invalid_key(operation_params: OperationP library_metadata=library_metadata, ) - def mock_get_raw_metadata(*args, **kwargs): - return SDTMDatasetMetadata(name="AE") - - data_service.get_raw_dataset_metadata = mock_get_raw_metadata - with patch.object(LocalDataService, "get_dataset_class", return_value=EVENTS): operation = GetDatasetFilteredVariables( operation_params, diff --git a/tests/unit/test_operations/test_get_model_filtered_variables.py b/tests/unit/test_operations/test_get_model_filtered_variables.py index 526605a9a..8b9965795 100644 --- a/tests/unit/test_operations/test_get_model_filtered_variables.py +++ b/tests/unit/test_operations/test_get_model_filtered_variables.py @@ -294,7 +294,7 @@ def test_get_model_filtered_variables( operation_params.standard_version = "3-4" operation_params.key_name = "role" operation_params.key_value = key_val - operation_params.datasets = [SDTMDatasetMetadata(**dataset_metadata)] + operation_params.dataframe_metadata = SDTMDatasetMetadata(**dataset_metadata) # save model metadata to cache cache = InMemoryCacheService.get_instance() library_metadata = LibraryMetadataContainer( @@ -314,15 +314,7 @@ def test_get_model_filtered_variables( if model_metadata["datasets"][0]["_links"]["parentClass"]["title"] == "Events" else FINDINGS_ABOUT ) - """ - this fuction replaces get_raw_dataset_metadata in LocalDataService to - prevent filtering into the decorator that checks cache - """ - def mock_get_raw_metadata(*args, **kwargs): - return SDTMDatasetMetadata(**dataset_metadata) - - data_service.get_raw_dataset_metadata = mock_get_raw_metadata with patch.object( LocalDataService, "get_dataset_class", return_value=expected_class ): diff --git a/tests/unit/test_operations/test_label_referenced_variable_metadata.py b/tests/unit/test_operations/test_label_referenced_variable_metadata.py index 58caa1c15..c96570043 100644 --- a/tests/unit/test_operations/test_label_referenced_variable_metadata.py +++ b/tests/unit/test_operations/test_label_referenced_variable_metadata.py @@ -14,7 +14,7 @@ from cdisc_rules_engine.services.data_services import LocalDataService from cdisc_rules_engine.models.dataset.pandas_dataset import PandasDataset import pytest -from unittest.mock import Mock, patch +from unittest.mock import Mock @pytest.mark.parametrize("dataset_type", [(PandasDataset)]) @@ -170,6 +170,10 @@ def test_get_label_referenced_variable_metadata( operation_params.standard_version = "3-4" operation_params.target = "AELABEL" operation_params.operation_id = "$label_referenced_variable" + operation_params.dataframe_metadata = SDTMDatasetMetadata( + first_record={"DOMAIN": "AE"} + ) + # save model metadata to cache cache = InMemoryCacheService.get_instance() @@ -191,14 +195,7 @@ def test_get_label_referenced_variable_metadata( library_metadata, ) - def mock_cached_method(*args, **kwargs): - return SDTMDatasetMetadata(first_record={"DOMAIN": "AE"}) - - with patch( - "cdisc_rules_engine.services.data_services.LocalDataService.get_raw_dataset_metadata", - side_effect=mock_cached_method, - ): - result: pd.DataFrame = operation.execute() + result: pd.DataFrame = operation.execute() expected_columns = [ "STUDYID", "AETERM", diff --git a/tests/unit/test_operations/test_library_column_order.py b/tests/unit/test_operations/test_library_column_order.py index b13f9d8e8..906346b1a 100644 --- a/tests/unit/test_operations/test_library_column_order.py +++ b/tests/unit/test_operations/test_library_column_order.py @@ -8,7 +8,7 @@ import pandas as pd import pytest -from unittest.mock import Mock, patch +from unittest.mock import Mock from cdisc_rules_engine.constants.classes import GENERAL_OBSERVATIONS_CLASS from cdisc_rules_engine.enums.variable_roles import VariableRoles from cdisc_rules_engine.models.operation_params import OperationParams @@ -314,6 +314,9 @@ def test_get_column_order_from_library( operation_params.domain = "AE" operation_params.standard = "sdtmig" operation_params.standard_version = "3-4" + operation_params.dataframe_metadata = SDTMDatasetMetadata( + first_record={"DOMAIN": "AE"} + ) # save model metadata to cache cache = InMemoryCacheService.get_instance() @@ -335,14 +338,7 @@ def test_get_column_order_from_library( library_metadata, ) - def mock_cached_method(*args, **kwargs): - return SDTMDatasetMetadata(first_record={"DOMAIN": "AE"}) - - with patch( - "cdisc_rules_engine.services.data_services.LocalDataService.get_raw_dataset_metadata", - side_effect=mock_cached_method, - ): - result = operation.execute() + result = operation.execute() variables: List[str] = [ "STUDYID", "DOMAIN", diff --git a/tests/unit/test_operations/test_library_model_column_order.py b/tests/unit/test_operations/test_library_model_column_order.py index bc69b7e43..2d48bed13 100644 --- a/tests/unit/test_operations/test_library_model_column_order.py +++ b/tests/unit/test_operations/test_library_model_column_order.py @@ -118,9 +118,9 @@ def test_get_column_order_from_library(operation_params: OperationParams, datase operation_params.domain = "AE" operation_params.standard = "sdtmig" operation_params.standard_version = "3-4" - operation_params.datasets = [ - SDTMDatasetMetadata(name="AE", first_record={"DOMAIN": "AE"}) - ] + operation_params.dataframe_metadata = SDTMDatasetMetadata( + name="AE", first_record={"DOMAIN": "AE"} + ) # save model metadata to cache cache = InMemoryCacheService.get_instance() @@ -138,11 +138,6 @@ def test_get_column_order_from_library(operation_params: OperationParams, datase library_metadata=library_metadata, ) - def mock_get_raw_metadata(*args, **kwargs): - return SDTMDatasetMetadata(name="AE", first_record={"DOMAIN": "AE"}) - - data_service.get_raw_dataset_metadata = mock_get_raw_metadata - operation = LibraryModelColumnOrder( operation_params, operation_params.dataframe, @@ -277,9 +272,9 @@ def test_get_findings_class_column_order_from_library( operation_params.domain = "AE" operation_params.standard = "sdtmig" operation_params.standard_version = "3-4" - operation_params.datasets = [ - SDTMDatasetMetadata(name="AE", first_record={"DOMAIN": "AE"}) - ] + operation_params.dataframe_metadata = SDTMDatasetMetadata( + name="AE", first_record={"DOMAIN": "AE"} + ) # save model metadata to cache cache = InMemoryCacheService.get_instance() @@ -296,10 +291,6 @@ def test_get_findings_class_column_order_from_library( library_metadata=library_metadata, ) - def mock_get_raw_metadata(*args, **kwargs): - return SDTMDatasetMetadata(name="AE", first_record={"DOMAIN": "AE"}) - - data_service.get_raw_dataset_metadata = mock_get_raw_metadata operation = LibraryModelColumnOrder( operation_params, operation_params.dataframe, diff --git a/tests/unit/test_operations/test_name_referenced_variable_metadata.py b/tests/unit/test_operations/test_name_referenced_variable_metadata.py index 51ba41363..7c45ee0ee 100644 --- a/tests/unit/test_operations/test_name_referenced_variable_metadata.py +++ b/tests/unit/test_operations/test_name_referenced_variable_metadata.py @@ -13,7 +13,7 @@ from cdisc_rules_engine.services.cache import InMemoryCacheService from cdisc_rules_engine.services.data_services import LocalDataService import pytest -from unittest.mock import Mock, patch +from unittest.mock import Mock @pytest.mark.parametrize("dataset_type", [(PandasDataset)]) @@ -170,6 +170,9 @@ def test_get_name_referenced_variable_metadata( operation_params.standard_version = "3-4" operation_params.target = "AEREF" operation_params.operation_id = "$name_referenced_variable" + operation_params.dataframe_metadata = SDTMDatasetMetadata( + first_record={"DOMAIN": "AE"} + ) # save model metadata to cache cache = InMemoryCacheService() library_metadata = LibraryMetadataContainer( @@ -190,14 +193,7 @@ def test_get_name_referenced_variable_metadata( library_metadata, ) - def mock_cached_method(*args, **kwargs): - return SDTMDatasetMetadata(first_record={"DOMAIN": "AE"}) - - with patch( - "cdisc_rules_engine.services.data_services.LocalDataService.get_raw_dataset_metadata", - side_effect=mock_cached_method, - ): - result = operation.execute() + result = operation.execute() expected_columns = [ "STUDYID", "AETERM", diff --git a/tests/unit/test_operations/test_parent_library_model_column_order.py b/tests/unit/test_operations/test_parent_library_model_column_order.py index 43bca1ece..bdd777d64 100644 --- a/tests/unit/test_operations/test_parent_library_model_column_order.py +++ b/tests/unit/test_operations/test_parent_library_model_column_order.py @@ -135,11 +135,12 @@ def test_get_parent_column_order_from_library( "cdisc_rules_engine.services.data_services.LocalDataService.get_dataset", side_effect=lambda dataset_name: path_to_dataset_map[dataset_name], ): - # Set evaluation_dataset instead of dataframe - operation_params.evaluation_dataset = data operation_params.domain = "SUPPAE" operation_params.standard = "sdtmig" operation_params.standard_version = "3-4" + operation_params.dataframe_metadata = SDTMDatasetMetadata( + first_record={"RDOMAIN": "AE"} + ) # save model metadata to cache cache = InMemoryCacheService.get_instance() @@ -345,11 +346,12 @@ def test_get_parent_findings_class_column_order_from_library( "cdisc_rules_engine.services.data_services.LocalDataService.get_dataset", side_effect=lambda dataset_name: path_to_dataset_map[dataset_name], ): - # Set evaluation_dataset instead of dataframe - operation_params.evaluation_dataset = data operation_params.domain = "SUPPAE" operation_params.standard = "sdtmig" operation_params.standard_version = "3-4" + operation_params.dataframe_metadata = SDTMDatasetMetadata( + first_record={"RDOMAIN": "AE"}, + ) # save model metadata to cache cache = InMemoryCacheService.get_instance() diff --git a/tests/unit/test_operations/test_permissible_variables.py b/tests/unit/test_operations/test_permissible_variables.py index be9838326..b5f7b8a38 100644 --- a/tests/unit/test_operations/test_permissible_variables.py +++ b/tests/unit/test_operations/test_permissible_variables.py @@ -7,7 +7,7 @@ import pandas as pd import pytest -from unittest.mock import Mock, patch +from unittest.mock import Mock from cdisc_rules_engine.constants.classes import GENERAL_OBSERVATIONS_CLASS from cdisc_rules_engine.enums.variable_roles import VariableRoles from cdisc_rules_engine.models.operation_params import OperationParams @@ -314,6 +314,9 @@ def test_get_permissible_variables( operation_params.domain = "AE" operation_params.standard = "sdtmig" operation_params.standard_version = "3-4" + operation_params.dataframe_metadata = SDTMDatasetMetadata( + first_record={"DOMAIN": "AE"} + ) cache = InMemoryCacheService library_metadata = LibraryMetadataContainer( standard_metadata=standard_metadata, model_metadata=model_metadata @@ -332,14 +335,7 @@ def test_get_permissible_variables( library_metadata, ) - def mock_cached_method(*args, **kwargs): - return SDTMDatasetMetadata(first_record={"DOMAIN": "AE"}) - - with patch( - "cdisc_rules_engine.services.data_services.LocalDataService.get_raw_dataset_metadata", - side_effect=mock_cached_method, - ): - result: pd.DataFrame = operation.execute() + result: pd.DataFrame = operation.execute() variables: List[str] = [ "STUDYID", "DOMAIN", diff --git a/tests/unit/test_operations/test_required_variables.py b/tests/unit/test_operations/test_required_variables.py index 3a92a945b..d4c3d11e6 100644 --- a/tests/unit/test_operations/test_required_variables.py +++ b/tests/unit/test_operations/test_required_variables.py @@ -8,7 +8,7 @@ import pandas as pd import pytest -from unittest.mock import Mock, patch +from unittest.mock import Mock from cdisc_rules_engine.constants.classes import GENERAL_OBSERVATIONS_CLASS from cdisc_rules_engine.enums.variable_roles import VariableRoles @@ -178,6 +178,9 @@ def test_get_required_variables(operation_params: OperationParams, dataset_type) operation_params.domain = "AE" operation_params.standard = "sdtmig" operation_params.standard_version = "3-4" + operation_params.dataframe_metadata = SDTMDatasetMetadata( + first_record={"DOMAIN": "AE"} + ) # save model metadata to cache cache = InMemoryCacheService.get_instance() @@ -199,14 +202,7 @@ def test_get_required_variables(operation_params: OperationParams, dataset_type) library_metadata, ) - def mock_cached_method(*args, **kwargs): - return SDTMDatasetMetadata(first_record={"DOMAIN": "AE"}) - - with patch( - "cdisc_rules_engine.services.data_services.LocalDataService.get_raw_dataset_metadata", - side_effect=mock_cached_method, - ): - result: pd.DataFrame = operation.execute() + result: pd.DataFrame = operation.execute() variables: List[str] = sorted(["AETEST"]) for result_array in result[operation_params.operation_id]: assert sorted(result_array) == variables From 6285fd385bd93db63a8f44c027ab43ae835be663 Mon Sep 17 00:00:00 2001 From: Gerry Campion Date: Tue, 21 Apr 2026 19:13:12 -0400 Subject: [PATCH 18/18] fixed test_contents_library_variables_dataset_builder --- .../test_contents_library_dataset_builder.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/tests/unit/test_dataset_builders/test_contents_library_dataset_builder.py b/tests/unit/test_dataset_builders/test_contents_library_dataset_builder.py index f9a566f9f..add665612 100644 --- a/tests/unit/test_dataset_builders/test_contents_library_dataset_builder.py +++ b/tests/unit/test_dataset_builders/test_contents_library_dataset_builder.py @@ -10,6 +10,7 @@ from cdisc_rules_engine.models.library_metadata_container import ( LibraryMetadataContainer, ) +from cdisc_rules_engine.models.sdtm_dataset_metadata import SDTMDatasetMetadata from cdisc_rules_engine.services.data_services import LocalDataService @@ -105,9 +106,7 @@ def test_contents_library_variables_dataset_builder( cache_service=None, rule_processor=None, data_processor=None, - dataset_path=None, - datasets=None, - dataset_metadata=None, + dataset_metadata=SDTMDatasetMetadata(name="TEST"), define_xml_path=None, standard="sdtmig", standard_version="3-4",