diff --git a/.github/test/selenium_test_editor.py b/.github/test/selenium_test_editor.py index 12b36b329..2844b505e 100644 --- a/.github/test/selenium_test_editor.py +++ b/.github/test/selenium_test_editor.py @@ -184,7 +184,7 @@ "DM": [ { "executionStatus": "success", - "dataset": "dm.xpt", + "dataset": "DM", "domain": "DM", "variables": [], "message": None, @@ -194,7 +194,7 @@ "FA": [ { "executionStatus": "issue reported", - "dataset": "fa.xpt", + "dataset": "FA", "domain": "FA", "variables": [ "$val_dy", @@ -215,7 +215,7 @@ "RFSTDTC": "2012-11-15", "FADTC": "2012-12-02", }, - "dataset": "fa.xpt", + "dataset": "FA", "row": 1, "USUBJID": "CDISC002", "SEQ": 1, @@ -227,7 +227,7 @@ "RFSTDTC": "2013-10-08", "FADTC": "2013-10-12", }, - "dataset": "fa.xpt", + "dataset": "FA", "row": 2, "USUBJID": "CDISC004", "SEQ": 2, @@ -239,7 +239,7 @@ "RFSTDTC": "2013-01-05", "FADTC": "2012-12-02", }, - "dataset": "fa.xpt", + "dataset": "FA", "row": 4, "USUBJID": "CDISC007", "SEQ": 4, @@ -251,7 +251,7 @@ "RFSTDTC": "2014-05-11", "FADTC": "2014-12-02", }, - "dataset": "fa.xpt", + "dataset": "FA", "row": 5, "USUBJID": "CDISC008", "SEQ": 5, @@ -262,7 +262,7 @@ "IE": [ { "executionStatus": "issue reported", - "dataset": "ie.xpt", + "dataset": "IE", "domain": "IE", "variables": [ "$val_dy", @@ -283,7 +283,7 @@ "RFSTDTC": "2022-03-20", "IEDTC": "2022-03-17", }, - "dataset": "ie.xpt", + "dataset": "IE", "row": 1, "USUBJID": "CDISC-TEST-001", "SEQ": 1, @@ -294,7 +294,7 @@ "LB": [ { "executionStatus": "issue reported", - "dataset": "lb.xpt", + "dataset": "LB", "domain": "LB", "variables": [ "$val_dy", @@ -315,7 +315,7 @@ "LBDTC": "2022-03-30", "LBDY": 2, }, - "dataset": "lb.xpt", + "dataset": "LB", "row": 1, "USUBJID": "CDISC-TEST-001", "SEQ": 1, diff --git a/cdisc_rules_engine/constants/metadata_columns.py b/cdisc_rules_engine/constants/metadata_columns.py index 8d06e840e..9f171d0d8 100644 --- a/cdisc_rules_engine/constants/metadata_columns.py +++ b/cdisc_rules_engine/constants/metadata_columns.py @@ -1,3 +1,3 @@ -SOURCE_FILENAME = "source_filename" +SOURCE_DATASET_NAME = "source_dataset_name" SOURCE_ROW_NUMBER = "source_row_number" -METADATA_COLUMNS = {SOURCE_FILENAME, SOURCE_ROW_NUMBER} +METADATA_COLUMNS = {SOURCE_DATASET_NAME, SOURCE_ROW_NUMBER} diff --git a/cdisc_rules_engine/dataset_builders/base_dataset_builder.py b/cdisc_rules_engine/dataset_builders/base_dataset_builder.py index 8aa2957d2..04672819b 100644 --- a/cdisc_rules_engine/dataset_builders/base_dataset_builder.py +++ b/cdisc_rules_engine/dataset_builders/base_dataset_builder.py @@ -9,7 +9,7 @@ from cdisc_rules_engine.utilities.sdtm_utilities import ( tag_source, ) -from typing import List, Iterable, Optional +from typing import List, Optional from cdisc_rules_engine.utilities import sdtm_utilities from cdisc_rules_engine.utilities.rule_processor import RuleProcessor from cdisc_rules_engine.models.dataset.dataset_interface import DatasetInterface @@ -25,8 +25,6 @@ def __init__( cache_service, rule_processor: RuleProcessor, data_processor, - dataset_path, - datasets: Iterable[SDTMDatasetMetadata], dataset_metadata: SDTMDatasetMetadata, define_xml_path, standard, @@ -38,8 +36,6 @@ def __init__( self.cache = cache_service self.data_processor = data_processor self.rule_processor = rule_processor - self.dataset_path = dataset_path - self.datasets = datasets self.dataset_metadata = dataset_metadata self.rule = rule self.define_xml_path = define_xml_path @@ -56,18 +52,20 @@ def build(self) -> DatasetInterface: """ pass - def build_split_datasets(self, dataset_name, **kwargs) -> DatasetInterface: + def build_split_datasets(self, dataset_name: str, **kwargs) -> DatasetInterface: """ Returns correct dataframe to operate on. - Default implementation that temporarily sets dataset_path to dataset_name and calls build(). + Default implementation that temporarily sets dataset_metadata and calls build(). """ - original_path = self.dataset_path + original_dataset_metadata = self.dataset_metadata try: - self.dataset_path = dataset_name + self.dataset_metadata = self.data_service.get_raw_dataset_metadata( + dataset_name=dataset_name + ) result = self.build(**kwargs) return result finally: - self.dataset_path = original_path + self.dataset_metadata = original_dataset_metadata def get_dataset(self, **kwargs): # If validating dataset content, ensure split datasets are handled. @@ -77,7 +75,7 @@ def get_dataset(self, **kwargs): dataset: DatasetInterface = self.data_service.concat_split_datasets( func_to_call=self.build_split_datasets, datasets_metadata=get_corresponding_datasets( - self.datasets, self.dataset_metadata + self.data_service.get_datasets(), self.dataset_metadata ), **kwargs, ) @@ -95,14 +93,14 @@ def get_dataset_contents(self, **kwargs): dataset: DatasetInterface = self.data_service.concat_split_datasets( func_to_call=self.data_service.get_dataset, datasets_metadata=get_corresponding_datasets( - self.datasets, self.dataset_metadata + self.data_service.get_datasets(), self.dataset_metadata ), **kwargs, ) else: # single dataset. the most common case dataset: DatasetInterface = self.data_service.get_dataset( - dataset_name=self.dataset_path + dataset_name=self.dataset_metadata.name ) dataset = tag_source(dataset, self.dataset_metadata) return dataset @@ -126,7 +124,10 @@ def get_define_xml_item_group_metadata_for_dataset( """ define_xml_reader = DefineXMLReaderFactory.get_define_xml_reader( - self.dataset_path, self.define_xml_path, self.data_service, self.cache + self.dataset_metadata.full_path, + self.define_xml_path, + self.data_service, + self.cache, ) return define_xml_reader.extract_dataset_metadata( dataset_metadata["dataset_name"] @@ -149,7 +150,10 @@ def get_define_xml_item_group_metadata_for_domain(self, domain: str) -> List[dic """ define_xml_reader = DefineXMLReaderFactory.get_define_xml_reader( - self.dataset_path, self.define_xml_path, self.data_service, self.cache + self.dataset_metadata.full_path, + self.define_xml_path, + self.data_service, + self.cache, ) return define_xml_reader.extract_domain_metadata(domain) @@ -164,7 +168,10 @@ def get_define_xml_variables_metadata(self) -> List[dict]: | SUPPDM | DM | """ define_xml_reader = DefineXMLReaderFactory.get_define_xml_reader( - self.dataset_path, self.define_xml_path, self.data_service, self.cache + self.dataset_metadata.full_path, + self.define_xml_path, + self.data_service, + self.cache, ) domain = self.dataset_metadata.domain or self.dataset_metadata.rdomain return define_xml_reader.extract_variables_metadata( @@ -176,7 +183,10 @@ def get_define_xml_value_level_metadata(self) -> List[dict]: Gets Define XML value level metadata and returns it as dataframe. """ define_xml_reader = DefineXMLReaderFactory.get_define_xml_reader( - self.dataset_path, self.define_xml_path, self.data_service, self.cache + self.dataset_metadata.full_path, + self.define_xml_path, + self.data_service, + self.cache, ) return define_xml_reader.extract_value_level_metadata( domain_name=self.dataset_metadata.domain @@ -188,7 +198,10 @@ def add_row_number(dataframe: DatasetInterface) -> None: def get_define_metadata(self): define_xml_reader = DefineXMLReaderFactory.get_define_xml_reader( - self.dataset_path, self.define_xml_path, self.data_service, self.cache + self.dataset_metadata.full_path, + self.define_xml_path, + self.data_service, + self.cache, ) return define_xml_reader.read() @@ -205,9 +218,7 @@ def get_library_variables_metadata(self) -> DatasetInterface: variables: List[dict] = sdtm_utilities.get_variables_metadata_from_standard( library_metadata=self.library_metadata, data_service=self.data_service, - datasets=self.datasets, dataset_metadata=self.dataset_metadata, - dataset_path=self.dataset_path, ) variables_metadata: dict = self.library_metadata.variables_metadata.get( domain, {} diff --git a/cdisc_rules_engine/dataset_builders/content_metadata_dataset_builder.py b/cdisc_rules_engine/dataset_builders/content_metadata_dataset_builder.py index f79e05f65..9fa191320 100644 --- a/cdisc_rules_engine/dataset_builders/content_metadata_dataset_builder.py +++ b/cdisc_rules_engine/dataset_builders/content_metadata_dataset_builder.py @@ -14,7 +14,6 @@ def build(self): """ size_unit: str = self.rule_processor.get_size_unit_from_rule(self.rule) return self.data_service.get_dataset_metadata( - dataset_name=self.dataset_path, + dataset_name=self.dataset_metadata.name, size_unit=size_unit, - datasets=self.datasets, ) diff --git a/cdisc_rules_engine/dataset_builders/contents_dataset_builder.py b/cdisc_rules_engine/dataset_builders/contents_dataset_builder.py index 426cbdacf..1f1c844e8 100644 --- a/cdisc_rules_engine/dataset_builders/contents_dataset_builder.py +++ b/cdisc_rules_engine/dataset_builders/contents_dataset_builder.py @@ -9,15 +9,13 @@ def build(self, **kwargs): """ Returns the contents of a file as a dataframe for evaluation. """ - return self.data_service.get_dataset(dataset_name=self.dataset_path) + return self.data_service.get_dataset(dataset_name=self.dataset_metadata.name) def build_split_datasets(self, dataset_name, **kwargs): """ Returns the contents of a file as a dataframe for evaluation. """ - return self.data_service.get_dataset( - dataset_name=dataset_name, datasets=self.datasets - ) + return self.data_service.get_dataset(dataset_name=dataset_name) def get_dataset(self, **kwargs): dataset = super().get_dataset(**kwargs) @@ -25,7 +23,7 @@ def get_dataset(self, **kwargs): [ dataset.record_count for dataset in get_corresponding_datasets( - self.datasets, self.dataset_metadata + self.data_service.get_datasets(), self.dataset_metadata ) ] ) diff --git a/cdisc_rules_engine/dataset_builders/contents_define_dataset_builder.py b/cdisc_rules_engine/dataset_builders/contents_define_dataset_builder.py index a64fa8d9a..9c21c5e72 100644 --- a/cdisc_rules_engine/dataset_builders/contents_define_dataset_builder.py +++ b/cdisc_rules_engine/dataset_builders/contents_define_dataset_builder.py @@ -31,12 +31,13 @@ def build(self): ..., """ data_contents_df = self.data_service.get_dataset( - dataset_name=self.dataset_path, datasets=self.datasets + dataset_name=self.dataset_metadata.name ) # Build dataset metadata dataframe size_unit: str = self.rule_processor.get_size_unit_from_rule(self.rule) dataset_metadata = self.data_service.get_dataset_metadata( - dataset_name=self.dataset_path, size_unit=size_unit, datasets=self.datasets + dataset_name=self.dataset_metadata.name, + size_unit=size_unit, ).to_dict(orient="records")[0] # Build define xml dataframe define = self.get_define_xml_item_group_metadata_for_dataset(dataset_metadata) diff --git a/cdisc_rules_engine/dataset_builders/contents_define_vlm_dataset_builder.py b/cdisc_rules_engine/dataset_builders/contents_define_vlm_dataset_builder.py index 3e12bdabb..fb2374b9c 100644 --- a/cdisc_rules_engine/dataset_builders/contents_define_vlm_dataset_builder.py +++ b/cdisc_rules_engine/dataset_builders/contents_define_vlm_dataset_builder.py @@ -28,7 +28,7 @@ def build(self): """ # get dataset contents and convert it from wide to long data_contents_df: DatasetInterface = self.data_service.get_dataset( - dataset_name=self.dataset_path + dataset_name=self.dataset_metadata.name ) self.add_row_number(data_contents_df) data_contents_long_df: DatasetInterface = ValuesDatasetBuilder.build(self) diff --git a/cdisc_rules_engine/dataset_builders/dataset_builder_factory.py b/cdisc_rules_engine/dataset_builders/dataset_builder_factory.py index f62d39572..faaaba9e0 100644 --- a/cdisc_rules_engine/dataset_builders/dataset_builder_factory.py +++ b/cdisc_rules_engine/dataset_builders/dataset_builder_factory.py @@ -110,8 +110,6 @@ def get_service( kwargs.get("cache_service"), kwargs.get("rule_processor"), kwargs.get("data_processor"), - kwargs.get("dataset_path"), - kwargs.get("datasets"), kwargs.get("dataset_metadata", ""), kwargs.get("define_xml_path"), kwargs.get("standard"), diff --git a/cdisc_rules_engine/dataset_builders/dataset_metadata_define_dataset_builder.py b/cdisc_rules_engine/dataset_builders/dataset_metadata_define_dataset_builder.py index fd42ed2d9..12c6e794c 100644 --- a/cdisc_rules_engine/dataset_builders/dataset_metadata_define_dataset_builder.py +++ b/cdisc_rules_engine/dataset_builders/dataset_metadata_define_dataset_builder.py @@ -80,7 +80,10 @@ def _get_define_xml_dataframe(self): logger.info(f"No define_metadata is provided for {__name__}.") return self.dataset_implementation(columns=define_col_order) define_xml_reader = DefineXMLReaderFactory.get_define_xml_reader( - self.dataset_path, self.define_xml_path, self.data_service, self.cache + self.dataset_metadata.full_path, + self.define_xml_path, + self.data_service, + self.cache, ) enriched_metadata = [] for basic_metadata in define_metadata: @@ -131,23 +134,23 @@ def _get_dataset_dataframe(self): "ap_suffix", ] - if len(self.datasets) == 0: + if len(self.data_service.get_datasets()) == 0: dataset_df = self.dataset_implementation(columns=dataset_col_order) logger.info(f"No datasets metadata is provided in {__name__}.") else: datasets = self.dataset_implementation() - for dataset in self.datasets: + for dataset_metadata in self.data_service.get_datasets(): ds_metadata = None try: ds_metadata = self.data_service.get_dataset_metadata( - dataset_name=dataset.filename + dataset_name=dataset_metadata.name ) ds_metadata.data["dataset_domain"] = getattr( - dataset, "domain", None + dataset_metadata, "domain", None ) - if dataset.first_record: + if dataset_metadata.first_record: ds_metadata.data["dataset_columns"] = [ - list(dataset.first_record.keys()) + list(dataset_metadata.first_record.keys()) ] else: ds_metadata.data["dataset_columns"] = [[]] diff --git a/cdisc_rules_engine/dataset_builders/dataset_metadata_values_builder.py b/cdisc_rules_engine/dataset_builders/dataset_metadata_values_builder.py index 8fb48d9f3..eb8206f78 100644 --- a/cdisc_rules_engine/dataset_builders/dataset_metadata_values_builder.py +++ b/cdisc_rules_engine/dataset_builders/dataset_metadata_values_builder.py @@ -21,9 +21,8 @@ def build(self): """ size_unit: str = self.rule_processor.get_size_unit_from_rule(self.rule) dataset_metadata = self.data_service.get_dataset_metadata( - dataset_name=self.dataset_path, + dataset_name=self.dataset_metadata.name, size_unit=size_unit, - datasets=self.datasets, ) dataset_metadata = dataset_metadata.to_dict(orient="records")[0] data_contents_long_df = super().build() diff --git a/cdisc_rules_engine/dataset_builders/domain_list_dataset_builder.py b/cdisc_rules_engine/dataset_builders/domain_list_dataset_builder.py index fd9b10982..eba882d84 100644 --- a/cdisc_rules_engine/dataset_builders/domain_list_dataset_builder.py +++ b/cdisc_rules_engine/dataset_builders/domain_list_dataset_builder.py @@ -14,5 +14,6 @@ def build(self): """ return self.dataset_implementation.from_records( - {ds.unsplit_name: ds.filename for ds in self.datasets}, index=[0] + {ds.unsplit_name: ds.filename for ds in self.data_service.get_datasets()}, + index=[0], ) diff --git a/cdisc_rules_engine/dataset_builders/domain_list_with_define_builder.py b/cdisc_rules_engine/dataset_builders/domain_list_with_define_builder.py index 8ec2741c3..af92d63ba 100644 --- a/cdisc_rules_engine/dataset_builders/domain_list_with_define_builder.py +++ b/cdisc_rules_engine/dataset_builders/domain_list_with_define_builder.py @@ -26,7 +26,9 @@ def build(self): 1 EC ec.xpt EC False 2 SE None SE True """ - domain_files = {ds.unsplit_name: ds.filename for ds in self.datasets} + domain_files = { + ds.unsplit_name: ds.filename for ds in self.data_service.get_datasets() + } all_define_metadata = self.get_define_metadata() records = [] for define_item in all_define_metadata: diff --git a/cdisc_rules_engine/dataset_builders/values_dataset_builder.py b/cdisc_rules_engine/dataset_builders/values_dataset_builder.py index 265911569..f2de4b051 100644 --- a/cdisc_rules_engine/dataset_builders/values_dataset_builder.py +++ b/cdisc_rules_engine/dataset_builders/values_dataset_builder.py @@ -14,7 +14,7 @@ def build(self): ..., """ data_contents_df: DatasetInterface = self.data_service.get_dataset( - dataset_name=self.dataset_path + dataset_name=self.dataset_metadata.name ) self.add_row_number(data_contents_df) values_df: DatasetInterface = data_contents_df.melt( diff --git a/cdisc_rules_engine/dataset_builders/variables_metadata_dataset_builder.py b/cdisc_rules_engine/dataset_builders/variables_metadata_dataset_builder.py index 0c412d37b..7c818bf00 100644 --- a/cdisc_rules_engine/dataset_builders/variables_metadata_dataset_builder.py +++ b/cdisc_rules_engine/dataset_builders/variables_metadata_dataset_builder.py @@ -17,7 +17,8 @@ def build(self): """ # Get basic variable metadata variables_metadata = self.data_service.get_variables_metadata( - dataset_name=self.dataset_path, datasets=self.datasets, drop_duplicates=True + dataset_name=self.dataset_metadata.name, + drop_duplicates=True, ) # Check if the rule requires variable_max_size @@ -120,7 +121,7 @@ def _add_variable_max_size(self, variables_metadata): This column contains the maximum length of actual data for each variable. """ # Get the dataset contents - dataset = self.data_service.get_dataset(dataset_name=self.dataset_path) + dataset = self.data_service.get_dataset(dataset_name=self.dataset_metadata.name) # Calculate max size for each variable max_sizes = {} diff --git a/cdisc_rules_engine/dataset_builders/variables_metadata_values_dataset_builder.py b/cdisc_rules_engine/dataset_builders/variables_metadata_values_dataset_builder.py index 088e23387..7c279538b 100644 --- a/cdisc_rules_engine/dataset_builders/variables_metadata_values_dataset_builder.py +++ b/cdisc_rules_engine/dataset_builders/variables_metadata_values_dataset_builder.py @@ -22,7 +22,8 @@ def build(self): """ data_contents_long_df = super().build() variable_metadata = self.data_service.get_variables_metadata( - dataset_name=self.dataset_path, datasets=self.datasets, drop_duplicates=True + dataset_name=self.dataset_metadata.name, + drop_duplicates=True, ) merged_df = data_contents_long_df.merge( variable_metadata._data, how="left", on="variable_name" diff --git a/cdisc_rules_engine/dataset_builders/variables_metadata_with_define_and_library_dataset_builder.py b/cdisc_rules_engine/dataset_builders/variables_metadata_with_define_and_library_dataset_builder.py index ee00d2cad..173b6a9da 100644 --- a/cdisc_rules_engine/dataset_builders/variables_metadata_with_define_and_library_dataset_builder.py +++ b/cdisc_rules_engine/dataset_builders/variables_metadata_with_define_and_library_dataset_builder.py @@ -42,7 +42,8 @@ def build(self): """ variable_metadata: List[dict] = self.get_define_xml_variables_metadata() content_metadata: DatasetInterface = self.data_service.get_variables_metadata( - dataset_name=self.dataset_path, datasets=self.datasets, drop_duplicates=True + dataset_name=self.dataset_metadata.name, + drop_duplicates=True, ) define_metadata: DatasetInterface = self.dataset_implementation.from_records( variable_metadata diff --git a/cdisc_rules_engine/dataset_builders/variables_metadata_with_define_dataset_builder.py b/cdisc_rules_engine/dataset_builders/variables_metadata_with_define_dataset_builder.py index 25237c09f..b5a55fa83 100644 --- a/cdisc_rules_engine/dataset_builders/variables_metadata_with_define_dataset_builder.py +++ b/cdisc_rules_engine/dataset_builders/variables_metadata_with_define_dataset_builder.py @@ -36,7 +36,8 @@ def build(self): variable_metadata: List[dict] = self.get_define_xml_variables_metadata() # get dataset metadata and execute the rule content_metadata: DatasetInterface = self.data_service.get_variables_metadata( - dataset_name=self.dataset_path, datasets=self.datasets, drop_duplicates=True + dataset_name=self.dataset_metadata.name, + drop_duplicates=True, ) define_metadata: DatasetInterface = self.dataset_implementation.from_records( variable_metadata diff --git a/cdisc_rules_engine/dataset_builders/variables_metadata_with_library_metadata.py b/cdisc_rules_engine/dataset_builders/variables_metadata_with_library_metadata.py index 81dbb9894..c7d87e86c 100644 --- a/cdisc_rules_engine/dataset_builders/variables_metadata_with_library_metadata.py +++ b/cdisc_rules_engine/dataset_builders/variables_metadata_with_library_metadata.py @@ -26,8 +26,7 @@ def build(self): # get dataset metadata and execute the rule content_variables_metadata: DatasetInterface = ( self.data_service.get_variables_metadata( - dataset_name=self.dataset_path, - datasets=self.datasets, + dataset_name=self.dataset_metadata.name, drop_duplicates=True, ) ) diff --git a/cdisc_rules_engine/dummy_models/dummy_dataset.py b/cdisc_rules_engine/dummy_models/dummy_dataset.py index 34288d1b1..36b58fa1c 100644 --- a/cdisc_rules_engine/dummy_models/dummy_dataset.py +++ b/cdisc_rules_engine/dummy_models/dummy_dataset.py @@ -7,7 +7,7 @@ class DummyDataset(SDTMDatasetMetadata): - def __init__(self, dataset_data: dict): + def __init__(self, dataset_data: dict | SDTMDatasetMetadata): # with XPT in test, we pass the dataset_data as an instance of SDTMDatasetMetadata if isinstance(dataset_data, SDTMDatasetMetadata): super().__init__( @@ -45,14 +45,5 @@ def __init__(self, dataset_data: dict): self.record_count = len(self.data.index) - def get_metadata(self): - return { - "dataset_size": [self.file_size or 1000], - "dataset_name": [self.name or "test"], - "dataset_label": [self.label or "test"], - "filename": [self.filename], - "record_count": [self.record_count], - } - def __repr__(self): return asdict(self).__repr__() diff --git a/cdisc_rules_engine/interfaces/data_service_interface.py b/cdisc_rules_engine/interfaces/data_service_interface.py index 8e3ab1778..db890ff15 100644 --- a/cdisc_rules_engine/interfaces/data_service_interface.py +++ b/cdisc_rules_engine/interfaces/data_service_interface.py @@ -107,8 +107,6 @@ def read_data(self, file_path: str) -> IOBase: def get_dataset_class( self, dataset: DatasetInterface, - file_path: str, - datasets: Iterable[SDTMDatasetMetadata], dataset_metadata: SDTMDatasetMetadata, ) -> Optional[str]: """ @@ -118,8 +116,6 @@ def get_dataset_class( @abstractmethod def get_data_structure( self, - file_path: str, - datasets: Iterable[SDTMDatasetMetadata], dataset_metadata: SDTMDatasetMetadata, ) -> Optional[str]: """ @@ -132,6 +128,16 @@ def to_parquet(self, file_path: str) -> str: Converts a given file_path to parquet. Returns path to new file """ + @abstractmethod + def handle_custom_domains( + self, + dataset: DatasetInterface, + dataset_metadata: SDTMDatasetMetadata, + ) -> str | None: + """ + Handles custom domains by returning the appropriate class name based on the dataset contents. + """ + @staticmethod @abstractmethod def is_valid_data(dataset_paths: Sequence[str]) -> bool: diff --git a/cdisc_rules_engine/models/actions.py b/cdisc_rules_engine/models/actions.py index 88a5d412f..091785b3a 100644 --- a/cdisc_rules_engine/models/actions.py +++ b/cdisc_rules_engine/models/actions.py @@ -1,12 +1,11 @@ from typing import List, Optional, Set, Hashable, Iterable -from os import path import pandas as pd from business_rules.actions import BaseActions, rule_action from business_rules.fields import FIELD_TEXT from cdisc_rules_engine.constants import NULL_FLAVORS from cdisc_rules_engine.constants.metadata_columns import ( - SOURCE_FILENAME, + SOURCE_DATASET_NAME, SOURCE_ROW_NUMBER, ) from cdisc_rules_engine.enums.sensitivity import Sensitivity @@ -452,14 +451,16 @@ def _generate_errors_by_group( return errors_list def _get_dataset_name(self, data: pd.DataFrame) -> str: - source_pathnames = data.get(SOURCE_FILENAME, []) - source_filenames = [ - path.basename(source_pathname) for source_pathname in source_pathnames - ] - source_filename_str = ", ".join( - sorted(set(source_filename or "" for source_filename in source_filenames)) + source_dataset_names = data.get(SOURCE_DATASET_NAME, []) + source_dataset_name_str = ", ".join( + sorted( + set( + source_dataset_name or "" + for source_dataset_name in source_dataset_names + ) + ) ) - return source_filename_str + return source_dataset_name_str def _create_error_object( self, df_row: pd.Series, data: pd.DataFrame @@ -471,15 +472,15 @@ def _create_error_object( json_path: Optional[pd.Series] = data.get("_path") instance_id: Optional[pd.Series] = data.get("id") source_row_number: Optional[pd.Series] = data.get(SOURCE_ROW_NUMBER) - source_filename: Optional[pd.Series] = data.get(SOURCE_FILENAME) + source_dataset_name: Optional[pd.Series] = data.get(SOURCE_DATASET_NAME) row_dict = df_row.to_dict() filtered_dict = {} for key, value in row_dict.items(): filtered_dict[key] = self._filter_null_values(value) error_object = ValidationErrorEntity( dataset=( - path.basename(source_filename[df_row.name]) - if isinstance(source_filename, pd.Series) + source_dataset_name[df_row.name] + if isinstance(source_dataset_name, pd.Series) else "" ), row=( diff --git a/cdisc_rules_engine/models/dataset_metadata.py b/cdisc_rules_engine/models/dataset_metadata.py index b4b8a2dfd..eae183ba4 100644 --- a/cdisc_rules_engine/models/dataset_metadata.py +++ b/cdisc_rules_engine/models/dataset_metadata.py @@ -1,6 +1,5 @@ from dataclasses import dataclass from typing import Union -from os.path import basename @dataclass @@ -18,7 +17,3 @@ class DatasetMetadata: full_path: Union[str, None] = None first_record: Union[dict, None] = None original_path: Union[str, None] = None - - @property - def data_service_identifier(self) -> str: - return basename(self.full_path) if self.full_path else self.filename diff --git a/cdisc_rules_engine/models/operation_params.py b/cdisc_rules_engine/models/operation_params.py index 3cc8b938a..df77c0501 100644 --- a/cdisc_rules_engine/models/operation_params.py +++ b/cdisc_rules_engine/models/operation_params.py @@ -1,12 +1,13 @@ from dataclasses import dataclass -from typing import Iterable, List +from typing import List from cdisc_rules_engine.models.external_dictionaries_container import ( ExternalDictionariesContainer, ) -from cdisc_rules_engine.models.sdtm_dataset_metadata import SDTMDatasetMetadata import pandas as pd +from cdisc_rules_engine.models.sdtm_dataset_metadata import SDTMDatasetMetadata + @dataclass class OperationParams: @@ -18,10 +19,7 @@ class OperationParams: # Required parameters (no defaults) first core_id: str dataframe: pd.DataFrame - dataset_path: str - datasets: Iterable[SDTMDatasetMetadata] domain: str - directory_path: str operation_id: str operation_name: str standard: str @@ -38,11 +36,13 @@ class OperationParams: ct_package_types: List[str] = None ct_version: str = None ct_package_type: str = None + dataframe_metadata: SDTMDatasetMetadata = None domain_class: str = None term_code: str = None term_value: str = None term_pref_term: str = None dictionary_term_type: str = None + evaluation_dataset_metadata: SDTMDatasetMetadata = None external_dictionaries: ExternalDictionariesContainer = None external_dictionary_term_variable: str = None external_dictionary_type: str = None diff --git a/cdisc_rules_engine/operations/base_operation.py b/cdisc_rules_engine/operations/base_operation.py index 7d1647c05..633726f69 100644 --- a/cdisc_rules_engine/operations/base_operation.py +++ b/cdisc_rules_engine/operations/base_operation.py @@ -1,4 +1,4 @@ -import os +from os.path import dirname, exists, join from cdisc_rules_engine.constants.define_xml_constants import DEFINE_XML_FILE_NAME from cdisc_rules_engine.models.operation_params import OperationParams @@ -42,7 +42,7 @@ class BaseOperation: def __init__( self, params: OperationParams, - original_dataset: DatasetInterface, + evaluation_dataset: DatasetInterface, cache_service: CacheServiceInterface, data_service: DataServiceInterface, library_metadata: LibraryMetadataContainer = LibraryMetadataContainer(), @@ -50,7 +50,7 @@ def __init__( self.params = params self.cache = cache_service self.data_service = data_service - self.evaluation_dataset = original_dataset + self.evaluation_dataset = evaluation_dataset self.library_metadata = library_metadata @abstractmethod @@ -229,25 +229,25 @@ def _get_variables_metadata_from_standard(self) -> List[dict]: return sdtm_utilities.get_variables_metadata_from_standard( library_metadata=self.library_metadata, data_service=self.data_service, - dataset_metadata=self.data_service.get_raw_dataset_metadata( - dataset_name=self.params.dataset_path, datasets=self.params.datasets - ), - datasets=self.params.datasets, - dataset_path=self.params.dataset_path, + dataset_metadata=self.params.dataframe_metadata, ) - def _get_variable_names_list(self, domain, dataframe): + def _get_variable_names_list(self, dataset_metadata, dataframe): # get variables metadata from the standard model variables_metadata: List[dict] = ( - self._get_variables_metadata_from_standard_model(dataframe) + self._get_variables_metadata_from_standard_model( + dataset_metadata, dataframe + ) ) # create a list of variable names in accordance to the "ordinal" key variable_names_list = self._replace_variable_wildcards( - variables_metadata, domain + variables_metadata, dataset_metadata.wildcard_replacement ) return list(OrderedDict.fromkeys(variable_names_list)) - def _get_variables_metadata_from_standard_model(self, dataframe) -> List[dict]: + def _get_variables_metadata_from_standard_model( + self, dataset_metadata, dataframe + ) -> List[dict]: """ Gets variables metadata for the given class and domain from cache. The cache stores CDISC Library metadata. @@ -275,13 +275,9 @@ def _get_variables_metadata_from_standard_model(self, dataframe) -> List[dict]: return sdtm_utilities.get_variables_metadata_from_standard_model( dataframe=dataframe, - datasets=self.params.datasets, - dataset_path=self.params.dataset_path, data_service=self.data_service, library_metadata=self.library_metadata, - dataset_metadata=self.data_service.get_raw_dataset_metadata( - dataset_name=self.params.dataset_path, datasets=self.params.datasets - ), + dataset_metadata=dataset_metadata, ) @staticmethod @@ -308,9 +304,15 @@ def _get_define_contents(self): define_path = ( self.params.define_xml_path if self.params.define_xml_path - else os.path.join(self.params.directory_path, DEFINE_XML_FILE_NAME) + else join( + dirname( + self.params.evaluation_dataset_metadata.original_path + or self.params.evaluation_dataset_metadata.full_path + ), + DEFINE_XML_FILE_NAME, + ) ) - if not os.path.exists(define_path): + if not exists(define_path): raise FileNotFoundError(f"Define XML file {define_path} not found") define_contents = self.data_service.get_define_xml_contents( dataset_name=define_path diff --git a/cdisc_rules_engine/operations/dataset_names.py b/cdisc_rules_engine/operations/dataset_names.py index 85331f377..7f4216e1e 100644 --- a/cdisc_rules_engine/operations/dataset_names.py +++ b/cdisc_rules_engine/operations/dataset_names.py @@ -6,4 +6,4 @@ def _execute_operation(self): """ Returns a list of the dataset names in the study """ - return list({dataset.name for dataset in self.params.datasets}) + return list({dataset.name for dataset in self.data_service.get_datasets()}) diff --git a/cdisc_rules_engine/operations/day_data_validator.py b/cdisc_rules_engine/operations/day_data_validator.py index 6a9b6269c..d09893b5a 100644 --- a/cdisc_rules_engine/operations/day_data_validator.py +++ b/cdisc_rules_engine/operations/day_data_validator.py @@ -12,7 +12,9 @@ def _execute_operation(self): ) # Always get RFSTDTC column from DM dataset. dm_datasets = [ - dataset for dataset in self.params.datasets if dataset.domain == "DM" + dataset + for dataset in self.data_service.get_datasets() + if dataset.domain == "DM" ] if not dm_datasets: raise DomainNotFoundError( @@ -23,9 +25,7 @@ def _execute_operation(self): self.data_service.get_dataset, dm_datasets ) else: - dm_data = self.data_service.get_dataset( - dataset_name=dm_datasets[0].full_path or dm_datasets[0].filename - ) + dm_data = self.data_service.get_dataset(dataset_name=dm_datasets[0].name) dm_data = tag_source(dm_data, dm_datasets[0]) new_dataset = self.evaluation_dataset.merge( diff --git a/cdisc_rules_engine/operations/distinct.py b/cdisc_rules_engine/operations/distinct.py index cecf00560..a7485a4f6 100644 --- a/cdisc_rules_engine/operations/distinct.py +++ b/cdisc_rules_engine/operations/distinct.py @@ -71,7 +71,7 @@ def get_existing_column_names(group): def _get_referenced_datasets(self): referenced_datasets = {} for dataset_metadata in self.data_service.get_datasets(): - dataset = self.data_service.get_dataset(dataset_metadata.filename) + dataset = self.data_service.get_dataset(dataset_name=dataset_metadata.name) referenced_datasets[dataset_metadata.name] = dataset return referenced_datasets diff --git a/cdisc_rules_engine/operations/domain_label.py b/cdisc_rules_engine/operations/domain_label.py index 69e8ce737..468fef10e 100644 --- a/cdisc_rules_engine/operations/domain_label.py +++ b/cdisc_rules_engine/operations/domain_label.py @@ -1,6 +1,6 @@ from cdisc_rules_engine.operations.base_operation import BaseOperation from cdisc_rules_engine.utilities.utils import ( - search_in_list_of_dicts, + search_in_list, ) @@ -12,7 +12,7 @@ def _execute_operation(self): standard_data = self.library_metadata.standard_metadata domain_details = None for c in standard_data.get("classes", []): - domain_details = search_in_list_of_dicts( + domain_details = search_in_list( c.get("datasets", []), lambda item: item["name"] == self.params.domain ) if domain_details: diff --git a/cdisc_rules_engine/operations/extract_metadata.py b/cdisc_rules_engine/operations/extract_metadata.py index 5709f22d3..876952f1f 100644 --- a/cdisc_rules_engine/operations/extract_metadata.py +++ b/cdisc_rules_engine/operations/extract_metadata.py @@ -1,13 +1,14 @@ import pandas as pd +from cdisc_rules_engine.models.dataset import DatasetInterface from cdisc_rules_engine.operations.base_operation import BaseOperation class ExtractMetadata(BaseOperation): def _execute_operation(self): # get metadata - metadata: pd.DataFrame = self.data_service.get_dataset_metadata( - dataset_name=self.params.dataset_path, datasets=self.params.datasets + metadata: DatasetInterface = self.data_service.get_dataset_metadata( + dataset_name=self.params.dataframe_metadata.name ) # extract target value. Metadata df always has one row diff --git a/cdisc_rules_engine/operations/get_model_filtered_variables.py b/cdisc_rules_engine/operations/get_model_filtered_variables.py index faff0daca..a36c0ffc3 100644 --- a/cdisc_rules_engine/operations/get_model_filtered_variables.py +++ b/cdisc_rules_engine/operations/get_model_filtered_variables.py @@ -23,10 +23,10 @@ def _get_model_filtered_variables(self): key = self.params.key_name val = self.params.key_value model_variables: List[dict] = self._get_variables_metadata_from_standard_model( - self.params.dataframe + self.params.dataframe_metadata, self.params.dataframe ) filtered_model = [var for var in model_variables if var.get(key) == val] variable_names_list = self._replace_variable_wildcards( - filtered_model, self.params.domain + filtered_model, self.params.dataframe_metadata.wildcard_replacement ) return variable_names_list diff --git a/cdisc_rules_engine/operations/library_model_column_order.py b/cdisc_rules_engine/operations/library_model_column_order.py index d60fd58ed..91ccbaf97 100644 --- a/cdisc_rules_engine/operations/library_model_column_order.py +++ b/cdisc_rules_engine/operations/library_model_column_order.py @@ -16,4 +16,6 @@ def _execute_operation(self): The lists with column names are sorted in accordance to "ordinal" key of library metadata. """ - return self._get_variable_names_list(self.params.domain, self.params.dataframe) + return self._get_variable_names_list( + self.params.dataframe_metadata, self.params.dataframe + ) diff --git a/cdisc_rules_engine/operations/operations_factory.py b/cdisc_rules_engine/operations/operations_factory.py index 1c01a086a..37aee00cd 100644 --- a/cdisc_rules_engine/operations/operations_factory.py +++ b/cdisc_rules_engine/operations/operations_factory.py @@ -163,7 +163,7 @@ def get_service( """Get instance of operation that matches operation specified in params""" required_args = { "operation_params", - "original_dataset", + "evaluation_dataset", "cache", "data_service", "library_metadata", @@ -176,7 +176,7 @@ def get_service( if name in self._operations_map: return self._operations_map.get(name)( kwargs.get("operation_params"), - kwargs.get("original_dataset"), + kwargs.get("evaluation_dataset"), kwargs.get("cache"), kwargs.get("data_service"), kwargs.get("library_metadata"), diff --git a/cdisc_rules_engine/operations/parent_library_model_column_order.py b/cdisc_rules_engine/operations/parent_library_model_column_order.py index 665da4a6a..f1b73b47a 100644 --- a/cdisc_rules_engine/operations/parent_library_model_column_order.py +++ b/cdisc_rules_engine/operations/parent_library_model_column_order.py @@ -36,7 +36,7 @@ def _execute_operation(self): def _get_domain_to_datasets(self): domain_to_datasets = defaultdict(list) - for dataset in self.params.datasets: + for dataset in self.data_service.get_datasets(): domain_to_datasets[dataset.domain].append(dataset) return domain_to_datasets @@ -48,6 +48,11 @@ def _get_parent_variable_names_list(self, domain_to_datasets: dict, rdomain: str f"{rdomain} but Domain not found in datasets" ) parent_dataframe = self.data_service.get_dataset( - dataset_name=parent_datasets[0].full_path + dataset_name=parent_datasets[0].name + ) + parent_dataframe_metadata = self.data_service.get_raw_dataset_metadata( + dataset_name=parent_datasets[0].name + ) + return self._get_variable_names_list( + parent_dataframe_metadata, parent_dataframe ) - return self._get_variable_names_list(rdomain, parent_dataframe) diff --git a/cdisc_rules_engine/operations/related_domain_is_custom.py b/cdisc_rules_engine/operations/related_domain_is_custom.py index 5b681d6f4..e313b7140 100644 --- a/cdisc_rules_engine/operations/related_domain_is_custom.py +++ b/cdisc_rules_engine/operations/related_domain_is_custom.py @@ -10,7 +10,7 @@ def _execute_operation(self): If no -> the domain is custom. """ - for ds in self.params.datasets: + for ds in self.data_service.get_datasets(): if ds.is_supp and self.params.domain.endswith(ds.rdomain): return is_custom_domain(self.library_metadata, ds.rdomain) return False diff --git a/cdisc_rules_engine/operations/study_domains.py b/cdisc_rules_engine/operations/study_domains.py index b931ccda1..4c2fd85b5 100644 --- a/cdisc_rules_engine/operations/study_domains.py +++ b/cdisc_rules_engine/operations/study_domains.py @@ -6,4 +6,6 @@ def _execute_operation(self): """ Returns a list of the domains in the study """ - return list({(dataset.domain or "") for dataset in self.params.datasets}) + return list( + {(dataset.domain or "") for dataset in self.data_service.get_datasets()} + ) diff --git a/cdisc_rules_engine/operations/variable_count.py b/cdisc_rules_engine/operations/variable_count.py index c9962e7bb..a14d60502 100644 --- a/cdisc_rules_engine/operations/variable_count.py +++ b/cdisc_rules_engine/operations/variable_count.py @@ -1,4 +1,3 @@ -import pandas as pd from cdisc_rules_engine.operations.base_operation import BaseOperation from cdisc_rules_engine.models.sdtm_dataset_metadata import SDTMDatasetMetadata import asyncio @@ -23,7 +22,10 @@ async def _get_all_study_variable_counts(self) -> dict: of times that value appears as a variable in the study. """ datasets_with_unique_domains = list( - {dataset.unsplit_name: dataset for dataset in self.params.datasets}.values() + { + dataset.unsplit_name: dataset + for dataset in self.data_service.get_datasets() + }.values() ) coroutines = [ self._get_dataset_variable_count(dataset) @@ -35,9 +37,7 @@ async def _get_all_study_variable_counts(self) -> dict: async def _get_dataset_variable_count( self, dataset: SDTMDatasetMetadata ) -> Counter: - data: pd.DataFrame = self.data_service.get_dataset( - dataset_name=dataset.full_path - ) + data = self.data_service.get_dataset(dataset_name=dataset.name) target_variable = BaseOperation._replace_variable_wildcard( self.params.original_target, dataset.wildcard_replacement ) diff --git a/cdisc_rules_engine/operations/variable_exists.py b/cdisc_rules_engine/operations/variable_exists.py index 9fffbacfb..e0e047b81 100644 --- a/cdisc_rules_engine/operations/variable_exists.py +++ b/cdisc_rules_engine/operations/variable_exists.py @@ -3,6 +3,4 @@ class VariableExists(BaseOperation): def _execute_operation(self): - # get metadata - dataframe = self.data_service.get_dataset(dataset_name=self.params.dataset_path) - return self.params.target in dataframe + return self.params.target in self.params.dataframe diff --git a/cdisc_rules_engine/operations/variable_is_null.py b/cdisc_rules_engine/operations/variable_is_null.py index 7e14bfcd4..f6821f317 100644 --- a/cdisc_rules_engine/operations/variable_is_null.py +++ b/cdisc_rules_engine/operations/variable_is_null.py @@ -6,9 +6,7 @@ def _execute_operation(self): if self.params.source == "submission": if self.params.level == "row": raise ValueError("level: row may only be used with source: evaluation") - dataframe = self.data_service.get_dataset( - dataset_name=self.params.dataset_path - ) + dataframe = self.params.dataframe else: dataframe = self.evaluation_dataset diff --git a/cdisc_rules_engine/operations/variable_value_count.py b/cdisc_rules_engine/operations/variable_value_count.py index f283a0e60..ce3887a59 100644 --- a/cdisc_rules_engine/operations/variable_value_count.py +++ b/cdisc_rules_engine/operations/variable_value_count.py @@ -2,7 +2,6 @@ from cdisc_rules_engine.models.sdtm_dataset_metadata import SDTMDatasetMetadata from cdisc_rules_engine.operations.base_operation import BaseOperation import asyncio -import os from collections import Counter from typing import List from cdisc_rules_engine.utilities.sdtm_utilities import get_corresponding_datasets @@ -23,7 +22,9 @@ async def _get_all_study_variable_value_counts(self) -> dict: of times that value appears in the study. """ datasets_with_unique_domains = list( - {dataset.domain: dataset for dataset in self.params.datasets}.values() + { + dataset.domain: dataset for dataset in self.data_service.get_datasets() + }.values() ) coroutines = [ self._get_dataset_variable_value_count(dataset) @@ -37,16 +38,14 @@ async def _get_dataset_variable_value_count( ) -> Counter: if dataset_metadata.is_split: corresponding_datasets = get_corresponding_datasets( - self.params.datasets, dataset_metadata + self.data_service.get_datasets(), dataset_metadata ) data: DatasetInterface = self.data_service.concat_split_datasets( self.data_service.get_dataset, corresponding_datasets ) else: data: DatasetInterface = self.data_service.get_dataset( - dataset_name=os.path.join( - self.params.directory_path, dataset_metadata.filename - ) + dataset_name=dataset_metadata.name ) data = tag_source(data, dataset_metadata) target_variable = BaseOperation._replace_variable_wildcard( diff --git a/cdisc_rules_engine/rules_engine.py b/cdisc_rules_engine/rules_engine.py index 33f371800..bc677ef04 100644 --- a/cdisc_rules_engine/rules_engine.py +++ b/cdisc_rules_engine/rules_engine.py @@ -1,5 +1,5 @@ from copy import deepcopy -from typing import Iterable, List, Union +from typing import List, Union from dateutil.parser._parser import ParserError import traceback @@ -45,9 +45,6 @@ from cdisc_rules_engine.services import logger from cdisc_rules_engine.services.cache import CacheServiceFactory from cdisc_rules_engine.services.data_services import DataServiceFactory -from cdisc_rules_engine.services.define_xml.define_xml_reader_factory import ( - DefineXMLReaderFactory, -) from cdisc_rules_engine.utilities.jsonata_processor import JSONataProcessor from cdisc_rules_engine.utilities.data_processor import DataProcessor from cdisc_rules_engine.utilities.dataset_preprocessor import DatasetPreprocessor @@ -126,7 +123,7 @@ def get_first_dataset_path(self) -> str | None: ): return self.data_service.dataset_paths[0] - def validate_single_rule(self, rule: dict, datasets: Iterable[SDTMDatasetMetadata]): + def validate_single_rule(self, rule: dict): results = {} rule["conditions"] = ConditionCompositeFactory.get_condition_composite( rule["conditions"] @@ -134,14 +131,13 @@ def validate_single_rule(self, rule: dict, datasets: Iterable[SDTMDatasetMetadat if rule.get("rule_type") == RuleTypes.JSONATA.value: results["json"] = self.validate_single_dataset( rule, - datasets, SDTMDatasetMetadata( name="json", full_path=self.get_first_dataset_path() ), ) else: total_errors = 0 - for dataset_metadata in datasets: + for dataset_metadata in self.data_service.get_datasets(): if ( self.max_errors_per_rule and not self.errors_per_dataset_flag @@ -158,7 +154,6 @@ def validate_single_rule(self, rule: dict, datasets: Iterable[SDTMDatasetMetadat continue # handling split datasets dataset_results = self.validate_single_dataset( rule, - datasets, dataset_metadata, ) if self.errors_per_dataset_flag and self.max_errors_per_rule: @@ -212,7 +207,6 @@ def _truncate_dataset_errors(self, dataset_results, rule, dataset_metadata): def validate_single_dataset( self, rule: dict, - datasets: Iterable[SDTMDatasetMetadata], dataset_metadata: SDTMDatasetMetadata, ) -> List[Union[dict, str]]: """ @@ -221,20 +215,18 @@ def validate_single_dataset( """ logger.info( f"Validating {dataset_metadata.name}. " - f"rule={rule}. dataset_path={dataset_metadata.full_path}. datasets={datasets}." + f"rule={rule}. dataset_path={dataset_metadata.full_path}. datasets={self.data_service.get_datasets()}." ) try: is_suitable, reason = self.rule_processor.is_suitable_for_validation( rule, dataset_metadata, - datasets, self.standard, - self.standard_substandard, self.use_case, ) if is_suitable: result: List[Union[dict, str]] = self.validate_rule( - rule, datasets, dataset_metadata + rule, dataset_metadata ) logger.info( f"Validated dataset {dataset_metadata.name}. Result = {result}" @@ -245,7 +237,7 @@ def validate_single_dataset( # No errors were generated, create success error container return [ ValidationErrorContainer( - dataset=dataset_metadata.filename, + dataset=dataset_metadata.name, domain=dataset_metadata.domain or dataset_metadata.rdomain, errors=[], ).to_representation() @@ -255,7 +247,7 @@ def validate_single_dataset( f"Skipped dataset {dataset_metadata.name}. Reason: {reason}" ) error_obj = FailedValidationEntity( - dataset=dataset_metadata.filename, + dataset=dataset_metadata.name, error=SkippedReason.OUTSIDE_SCOPE.value, message=reason, ) @@ -263,7 +255,7 @@ def validate_single_dataset( ValidationErrorContainer( status=ExecutionStatus.SKIPPED.value, message=reason, - dataset=dataset_metadata.filename, + dataset=dataset_metadata.name, domain=dataset_metadata.domain or dataset_metadata.rdomain or "", @@ -283,7 +275,7 @@ def validate_single_dataset( """ ) error_obj: ValidationErrorContainer = self.handle_validation_exceptions( - e, dataset_metadata.filename + e, dataset_metadata.name ) error_obj.domain = dataset_metadata.domain or dataset_metadata.rdomain or "" # this wrapping into a list is necessary to keep return type consistent @@ -292,7 +284,6 @@ def validate_single_dataset( def get_dataset_builder( self, rule: dict, - datasets: Iterable[SDTMDatasetMetadata], dataset_metadata: SDTMDatasetMetadata, ): return builder_factory.get_service( @@ -303,8 +294,6 @@ def get_dataset_builder( data_processor=self.data_processor, rule_processor=self.rule_processor, dataset_metadata=dataset_metadata, - datasets=datasets, - dataset_path=dataset_metadata.full_path, define_xml_path=self.define_xml_path, standard=self.standard, standard_version=self.standard_version, @@ -316,7 +305,6 @@ def get_dataset_builder( def validate_rule( self, rule: dict, - datasets: Iterable[SDTMDatasetMetadata], dataset_metadata: SDTMDatasetMetadata, ) -> List[Union[dict, str]]: """ @@ -324,7 +312,7 @@ def validate_rule( It defines a rule validator based on its type and calls it. """ kwargs = {} - builder = self.get_dataset_builder(rule, datasets, dataset_metadata) + builder = self.get_dataset_builder(rule, dataset_metadata) try: dataset = builder.get_dataset() except Exception as e: @@ -352,13 +340,12 @@ def validate_rule( kwargs["ct_packages"] = list(self.ct_packages) logger.info(f"Using dataset build by: {builder.__class__}") - return self.execute_rule(rule, dataset, datasets, dataset_metadata, **kwargs) + return self.execute_rule(rule, dataset, dataset_metadata, **kwargs) def execute_rule( self, rule: dict, dataset: DatasetInterface, - datasets: Iterable[SDTMDatasetMetadata], dataset_metadata: SDTMDatasetMetadata, value_level_metadata: List[dict] = None, variable_codelist_map: dict = None, @@ -387,12 +374,11 @@ def execute_rule( dataset_preprocessor = DatasetPreprocessor( dataset, dataset_metadata, self.data_service, self.cache ) - dataset = dataset_preprocessor.preprocess(rule_copy, datasets) + dataset = dataset_preprocessor.preprocess(rule_copy) dataset = self.rule_processor.perform_rule_operations( rule_copy, dataset, dataset_metadata, - datasets, standard=self.standard, standard_version=self.standard_version, standard_substandard=self.standard_substandard, @@ -408,7 +394,7 @@ def execute_rule( ) logger.info(f"Skipped dataset {dataset_metadata.name}. Reason: {reason}") error_obj = FailedValidationEntity( - dataset=dataset_metadata.filename, + dataset=dataset_metadata.name, error=SkippedReason.EMPTY_DATASET.value, message=reason, ) @@ -416,7 +402,7 @@ def execute_rule( ValidationErrorContainer( status=ExecutionStatus.SKIPPED.value, message=reason, - dataset=dataset_metadata.filename, + dataset=dataset_metadata.name, domain=dataset_metadata.domain or dataset_metadata.rdomain or "", errors=[error_obj], ).to_representation() @@ -442,17 +428,6 @@ def execute_rule( ) return results - def get_define_xml_value_level_metadata( - self, dataset_path: str, domain_name: str - ) -> List[dict]: - """ - Gets Define XML variable metadata and returns it as dataframe. - """ - define_xml_reader = DefineXMLReaderFactory.get_define_xml_reader( - dataset_path, self.define_xml_path, self.data_service, self.cache - ) - return define_xml_reader.extract_value_level_metadata(domain_name=domain_name) - def handle_validation_exceptions( # noqa self, exception, filename: str ) -> ValidationErrorContainer: diff --git a/cdisc_rules_engine/services/data_services/base_data_service.py b/cdisc_rules_engine/services/data_services/base_data_service.py index 8ac16f92e..b1566a270 100644 --- a/cdisc_rules_engine/services/data_services/base_data_service.py +++ b/cdisc_rules_engine/services/data_services/base_data_service.py @@ -1,5 +1,5 @@ import asyncio -from abc import ABC +from abc import ABC, abstractmethod from functools import wraps, partial from typing import Callable, List, Optional, Iterable, Iterator from concurrent.futures import ThreadPoolExecutor @@ -34,8 +34,7 @@ from cdisc_rules_engine.services.data_readers import DataReaderFactory from cdisc_rules_engine.utilities.utils import ( get_dataset_cache_key_from_path, - get_directory_path, - search_in_list_of_dicts, + search_in_list, replace_nan_values_in_df, ) from cdisc_rules_engine.utilities.sdtm_utilities import ( @@ -116,6 +115,10 @@ def __init__( self.dataset_implementation = kwargs.get( "dataset_implementation", PandasDataset ) + # Call the subclass implementation to populate metadata + self._datasets_metadata: dict[str, SDTMDatasetMetadata] = ( + self._initialize_datasets_metadata(**kwargs) + ) def get_dataset_by_type( self, dataset_name: str, dataset_type: str, **params @@ -152,7 +155,9 @@ def concat_split_datasets( # download datasets asynchronously datasets: Iterator[DatasetInterface] = self._async_get_datasets( func_to_call, - dataset_names=[dataset.full_path for dataset in datasets_metadata], + dataset_names=[ + dataset_metadata.name for dataset_metadata in datasets_metadata + ], **kwargs, ) full_dataset = self.dataset_implementation() @@ -173,8 +178,6 @@ def check_filepath(self, dataset_names: List[str]) -> List: def get_dataset_class( self, dataset: DatasetInterface, - file_path: str, - datasets: Iterable[SDTMDatasetMetadata], dataset_metadata: SDTMDatasetMetadata, ) -> Optional[str]: if self.library_metadata.standard_metadata: @@ -185,14 +188,10 @@ def get_dataset_class( name = class_data.get("name") if name: return convert_library_class_name_to_ct_class(name) - return self._handle_custom_domains( - dataset, dataset_metadata, file_path, datasets - ) + return self.handle_custom_domains(dataset, dataset_metadata) def get_data_structure( self, - file_path: str, - datasets: Iterable[SDTMDatasetMetadata], dataset_metadata: SDTMDatasetMetadata, ) -> Optional[str]: # TODO: look at defineXML if applicable for more accurate data structure detection @@ -231,12 +230,53 @@ def get_dataset_metadata( } return self.dataset_implementation.from_dict(metadata_to_return) - def _handle_custom_domains( + def get_raw_dataset_metadata( + self, dataset_name: str, **kwargs + ) -> SDTMDatasetMetadata: + """ + Returns dataset metadata from the metadata dictionary. + + Args: + dataset_name: Name or filename of the dataset + + Returns: + SDTMDatasetMetadata instance + + Raises: + KeyError: If dataset_name is not found in the metadata dictionary + """ + if dataset_name not in self._datasets_metadata: + raise KeyError( + f"Dataset '{dataset_name}' not found in metadata. " + f"Available datasets: {list(self._datasets_metadata.keys())}" + ) + return self._datasets_metadata[dataset_name] + + def get_datasets(self) -> List[SDTMDatasetMetadata]: + """ + Returns list of dataset metadata. + """ + return list(self._datasets_metadata.values()) + + @abstractmethod + def _initialize_datasets_metadata(self, **kwargs) -> dict[str, SDTMDatasetMetadata]: + """ + Initialize the dataset metadata dictionary. + + Subclasses must implement this method to populate the metadata dictionary + with their specific logic for reading and organizing dataset metadata. + + Args: + **kwargs: Additional keyword arguments passed from __init__ + + Returns: + Dictionary mapping dataset name to SDTMDatasetMetadata + """ + + def handle_custom_domains( self, dataset: DatasetInterface, dataset_metadata: SDTMDatasetMetadata, - file_path: str, - datasets: Iterable[SDTMDatasetMetadata], ): if self._contains_topic_variable(dataset, dataset_metadata.domain, "TERM"): return EVENTS @@ -249,15 +289,11 @@ def _handle_custom_domains( return FINDINGS_ABOUT return FINDINGS if dataset_metadata.is_ap: - return self._get_associated_persons_inherit_class( - file_path, datasets, dataset_metadata - ) + return self._get_associated_persons_inherit_class(dataset_metadata) return None def _get_associated_persons_inherit_class( self, - file_path, - datasets: Iterable[SDTMDatasetMetadata], dataset_metadata: SDTMDatasetMetadata, ): """ @@ -266,24 +302,20 @@ def _get_associated_persons_inherit_class( ap_suffix = dataset_metadata.ap_suffix if not ap_suffix: return None - directory_path = get_directory_path(file_path) + datasets = self.get_datasets() if len(datasets) > 1: - domain_details: SDTMDatasetMetadata = search_in_list_of_dicts( + new_dataset_metadata: SDTMDatasetMetadata = search_in_list( datasets, lambda item: item.domain == ap_suffix ) - if domain_details: - if domain_details.is_ap: + if new_dataset_metadata: + if new_dataset_metadata.is_ap: raise ValueError("Nested Associated Persons domain reference") - file_name = domain_details.filename - new_file_path = os.path.join(directory_path, file_name) - new_domain_dataset = self.get_dataset(dataset_name=new_file_path) + new_dataset = self.get_dataset(dataset_name=new_dataset_metadata.name) else: raise ValueError("Filename for domain doesn't exist") return self.get_dataset_class( - new_domain_dataset, - new_file_path, - datasets, - domain_details, + new_dataset, + new_dataset_metadata, ) else: return None diff --git a/cdisc_rules_engine/services/data_services/data_service_factory.py b/cdisc_rules_engine/services/data_services/data_service_factory.py index d0d5a08c7..eb9f49297 100644 --- a/cdisc_rules_engine/services/data_services/data_service_factory.py +++ b/cdisc_rules_engine/services/data_services/data_service_factory.py @@ -1,6 +1,4 @@ -from typing import Iterable, List, Type - -from cdisc_rules_engine.dummy_models.dummy_dataset import DummyDataset +from typing import Iterable, Type from cdisc_rules_engine.interfaces import ( CacheServiceInterface, ConfigInterface, @@ -76,8 +74,15 @@ def get_data_service( ) elif DummyDataService.is_valid_data(dataset_paths, encoding=self.encoding): """Get dummy data service""" - return self.get_dummy_data_service( - data=DummyDataService.get_data(dataset_paths, encoding=self.encoding) + return self.get_service( + "dummy", + standard=self.standard, + standard_version=self.standard_version, + standard_substandard=self.standard_substandard, + library_metadata=self.library_metadata, + dataset_path=dataset_paths[0], + dataset_implementation=self.get_dataset_implementation(), + encoding=self.encoding, ) elif ExcelDataService.is_valid_data(dataset_paths): """Get Excel file to dataset data service""" @@ -106,18 +111,6 @@ def get_data_service( datasets_csv_path=self.datasets_csv_path, ) - def get_dummy_data_service(self, data: List[DummyDataset]) -> DataServiceInterface: - return self.get_service( - "dummy", - data=data, - standard=self.standard, - standard_version=self.standard_version, - standard_substandard=self.standard_substandard, - library_metadata=self.library_metadata, - dataset_implementation=self.get_dataset_implementation(), - encoding=self.encoding, - ) - def get_dataset_implementation(self): """ Gets the class that should be used to represent datasets for the rules engine. This class may be dependent on diff --git a/cdisc_rules_engine/services/data_services/dummy_data_service.py b/cdisc_rules_engine/services/data_services/dummy_data_service.py index 9275e2262..a3a98b978 100644 --- a/cdisc_rules_engine/services/data_services/dummy_data_service.py +++ b/cdisc_rules_engine/services/data_services/dummy_data_service.py @@ -1,14 +1,9 @@ -from datetime import datetime from io import IOBase -from typing import List, Optional, Iterable, Sequence +from typing import List, Optional, Sequence -import os import pandas as pd import tempfile from cdisc_rules_engine.dummy_models.dummy_dataset import DummyDataset -from cdisc_rules_engine.exceptions.custom_exceptions import ( - DatasetNotFoundError, -) from cdisc_rules_engine.interfaces import CacheServiceInterface, ConfigInterface from cdisc_rules_engine.models.sdtm_dataset_metadata import SDTMDatasetMetadata from cdisc_rules_engine.models.dataset_types import DatasetTypes @@ -17,6 +12,7 @@ from cdisc_rules_engine.services.data_services import BaseDataService from cdisc_rules_engine.constants import DEFAULT_ENCODING from cdisc_rules_engine.models.dataset import PandasDataset +from cdisc_rules_engine.utilities.utils import convert_dataclass_to_superclass class DummyDataService(BaseDataService): @@ -31,11 +27,15 @@ def __init__( config: ConfigInterface, **kwargs, ): + self.encoding = kwargs.get("encoding") or DEFAULT_ENCODING + self.dataset_path: str | None = kwargs.get("dataset_path") + self.data: List[DummyDataset] = kwargs.get("data") + if self.data is None and self.dataset_path is not None: + self.data = self.get_data() + self.define_xml: str = kwargs.get("define_xml") super(DummyDataService, self).__init__( cache_service, reader_factory, config, **kwargs ) - self.data: List[DummyDataset] = kwargs.get("data") - self.define_xml: str = kwargs.get("define_xml") @classmethod def get_instance( @@ -53,15 +53,9 @@ def get_instance( **kwargs, ) - def check_dataset_exists(self, dataset_name): - dataset_name = dataset_name.replace("/", "") - if dataset_name not in self.data: - raise DatasetNotFoundError("dataset does not exist") - def get_dataset_data(self, dataset_name: str) -> Optional[DummyDataset]: - dataset_name = os.path.basename(dataset_name) for dataset in self.data: - if dataset.filename == dataset_name: + if dataset.name == dataset_name: return dataset return None @@ -75,20 +69,17 @@ def get_dataset(self, dataset_name: str, **params) -> PandasDataset: else: return PandasDataset.from_dict({}) - def get_raw_dataset_metadata( - self, dataset_name: str, **kwargs - ) -> SDTMDatasetMetadata: - dataset_metadata: dict = self.__get_dataset_metadata(dataset_name, **kwargs) - return SDTMDatasetMetadata( - name=dataset_metadata["dataset_name"][0], - first_record={"DOMAIN": dataset_metadata["dataset_name"][0]}, - label=dataset_metadata["dataset_label"][0], - modification_date=datetime.now().isoformat(), - filename=dataset_metadata["filename"][0], - file_size=dataset_metadata["dataset_size"][0], - full_path=dataset_metadata["filename"][0], - record_count=dataset_metadata["record_count"][0], - ) + def _initialize_datasets_metadata(self, **kwargs) -> dict[str, SDTMDatasetMetadata]: + """ + Initialize the dataset metadata by converting DummyDataset objects to SDTMDatasetMetadata. + + Returns: + Dictionary mapping dataset name to SDTMDatasetMetadata + """ + return { + dataset.name: convert_dataclass_to_superclass(dataset, SDTMDatasetMetadata) + for dataset in self.data + } def get_variables_metadata(self, dataset_name: str, **params) -> PandasDataset: metadata_to_return = { @@ -154,13 +145,6 @@ def get_file_matching_pattern(self, prefix: str, pattern: str) -> str: def read_data(self, file_path: str) -> IOBase: return open(file_path, "rb") - def __get_dataset_metadata(self, dataset_name: str, **kwargs) -> dict: - dataset: Optional[DummyDataset] = self.get_dataset_data(dataset_name) - metadata_to_return = {} - if dataset: - metadata_to_return: dict = dataset.get_metadata() - return metadata_to_return - def to_parquet(self, file_path: str) -> str: """ Save the dataset with full_path == file_path to a parquet file. @@ -179,14 +163,8 @@ def to_parquet(self, file_path: str) -> str: return len(df.index), temp_file.name return 0, "" - def get_datasets(self) -> Iterable[SDTMDatasetMetadata]: - return self.data - - @staticmethod - def get_data(dataset_paths: Sequence[str], encoding: str = DEFAULT_ENCODING): - json = JSONReader(encoding=encoding or DEFAULT_ENCODING).from_file( - dataset_paths[0] - ) + def get_data(self): + json = JSONReader(encoding=self.encoding).from_file(self.dataset_path) return [DummyDataset(data) for data in json.get("datasets", [])] @staticmethod diff --git a/cdisc_rules_engine/services/data_services/excel_data_service.py b/cdisc_rules_engine/services/data_services/excel_data_service.py index 3b3e2224a..1da5ec33e 100644 --- a/cdisc_rules_engine/services/data_services/excel_data_service.py +++ b/cdisc_rules_engine/services/data_services/excel_data_service.py @@ -35,10 +35,10 @@ def __init__( config: ConfigInterface, **kwargs, ): + self.dataset_path: str = kwargs.get("dataset_path", "") super(ExcelDataService, self).__init__( cache_service, reader_factory, config, **kwargs ) - self.dataset_path: str = kwargs.get("dataset_path", "") @classmethod def get_instance( @@ -75,8 +75,7 @@ def get_file_matching_pattern(self, prefix: str, pattern: str) -> str: return f return None - @cached_dataset(DatasetTypes.CONTENTS.value) - def get_dataset(self, dataset_name: str, **params) -> DatasetInterface: + def __get_dataset(self, sheet_name: str) -> DatasetInterface: dtype_mapping = { "Char": str, "Num": float, @@ -86,7 +85,7 @@ def get_dataset(self, dataset_name: str, **params) -> DatasetInterface: } header = pd.read_excel( self.dataset_path, - sheet_name=dataset_name, + sheet_name=sheet_name, header=None, nrows=3, na_values=[""], @@ -96,7 +95,7 @@ def get_dataset(self, dataset_name: str, **params) -> DatasetInterface: dtypes = {key: dtype_mapping.get(value, str) for key, value in dtypes.items()} dataframe = pd.read_excel( self.dataset_path, - sheet_name=dataset_name, + sheet_name=sheet_name, dtype=dtypes, skiprows=(1, 2, 3), na_values=[""], @@ -108,15 +107,21 @@ def get_dataset(self, dataset_name: str, **params) -> DatasetInterface: offending = [col for col in dataframe.columns if col != col.strip()] if offending: raise ExcelTestDataError( - f"Sheet '{dataset_name}' has column headers with leading/trailing whitespace: " + f"Sheet '{sheet_name}' has column headers with leading/trailing whitespace: " f"{[repr(c) for c in offending]}." ) dataset = PandasDataset(dataframe) return dataset - def _get_dataset_name( - self, metadata: pd.DataFrame, first_record: dict, dataset_filename: str - ) -> str: + @cached_dataset(DatasetTypes.CONTENTS.value) + def get_dataset(self, dataset_name: str, **params) -> DatasetInterface: + dataset_metadata = self._datasets_metadata.get(dataset_name) + if dataset_metadata is None: + return PandasDataset.from_dict({}) + sheet_name = dataset_metadata.filename + return self.__get_dataset(sheet_name) + + def _get_dataset_name(self, first_record: dict, dataset_filename: str) -> str: if self.standard == "usdm": return first_record.get("instanceType", dataset_filename.split(".")[0]) return dataset_filename.split(".")[0].upper() @@ -130,47 +135,85 @@ def _get_datasets_worksheet(self) -> pd.DataFrame: keep_default_na=False, ) - @cached_dataset(DatasetTypes.RAW_METADATA.value) - def get_raw_dataset_metadata( - self, - dataset_name: str, - **kwargs, - ) -> SDTMDatasetMetadata: + def _initialize_datasets_metadata(self, **kwargs) -> dict[str, SDTMDatasetMetadata]: """ - Returns dataset metadata as DatasetMetadata instance. + Initialize the dataset metadata by reading metadata for all datasets in the Excel file. + + Returns: + Dictionary mapping dataset name to SDTMDatasetMetadata """ - datasets_worksheet = self._get_datasets_worksheet() - metadata = datasets_worksheet[ - datasets_worksheet[ExcelDataSheets.DATASET_FILENAME_COLUMN.value] - == dataset_name - ] - dataset = self.get_dataset(dataset_name=dataset_name) - first_record = dataset.data.iloc[0].to_dict() if not dataset.empty else {} - return SDTMDatasetMetadata( - name=self._get_dataset_name(metadata, first_record, dataset_name), - first_record=first_record, - label=( - metadata[ExcelDataSheets.DATASET_LABEL_COLUMN.value].iloc[0] - if not metadata.empty - else "" - ), - modification_date=datetime.fromtimestamp( - os.path.getmtime(self.dataset_path) - ).isoformat(), - filename=dataset_name, - full_path=dataset_name, - file_size=0, - record_count=len(dataset), + result = {} + try: + datasets_worksheet = self._get_datasets_worksheet() + except ValueError as e: + # Pandas raises ValueError when sheet is not found + if "Worksheet named" in str(e): + try: + with pd.ExcelFile(self.dataset_path) as xl: + sheet_names = xl.sheet_names + available = ", ".join(repr(s) for s in sheet_names) or "(none)" + except Exception: + available = "(unable to read sheet names)" + raise ExcelTestDataError( + f"The workbook does not contain a '{ExcelDataSheets.DATASETS_SHEET_NAME.value}' sheet. " + f"Submitted sheet names: {available}." + ) from e + raise + + # Check for required columns + missing_cols = sorted( + set(ExcelDataSheets.DATASETS_SHEET_REQUIRED_COLUMNS.value) + - set(datasets_worksheet.columns) ) + if missing_cols: + raise ExcelTestDataError( + f"The '{ExcelDataSheets.DATASETS_SHEET_NAME.value}' sheet is missing a " + f"required {ExcelDataSheets.DATASETS_SHEET_REQUIRED_COLUMNS.value} column(s): " + f"{missing_cols}. Column headers are case-sensitive. " + ) + + for dataset_filename in datasets_worksheet[ + ExcelDataSheets.DATASET_FILENAME_COLUMN.value + ]: + dataset = self.__get_dataset(dataset_filename) + first_record = dataset.data.iloc[0].to_dict() if not dataset.empty else {} + metadata_row = datasets_worksheet[ + datasets_worksheet[ExcelDataSheets.DATASET_FILENAME_COLUMN.value] + == dataset_filename + ] + dataset_name = self._get_dataset_name(first_record, dataset_filename) + dataset_metadata = SDTMDatasetMetadata( + name=dataset_name, + first_record=first_record, + label=( + metadata_row[ExcelDataSheets.DATASET_LABEL_COLUMN.value].iloc[0] + if not metadata_row.empty + else "" + ), + modification_date=datetime.fromtimestamp( + os.path.getmtime(self.dataset_path) + ).isoformat(), + filename=dataset_filename, + full_path=dataset_filename, + file_size=0, + record_count=len(dataset), + ) + result[dataset_name] = dataset_metadata + return result @cached_dataset(DatasetTypes.VARIABLES_METADATA.value) def get_variables_metadata(self, dataset_name: str, **params) -> DatasetInterface: """ Gets dataset from blob storage and returns metadata of a certain variable. """ + # Get the sheet name from metadata + dataset_metadata = self._datasets_metadata.get(dataset_name) + if dataset_metadata is None: + return PandasDataset.from_dict({}) + dataframe = pd.read_excel( self.dataset_path, - sheet_name=dataset_name, + sheet_name=dataset_metadata.filename, header=None, nrows=4, na_values=[""], @@ -211,46 +254,6 @@ def get_define_xml_contents(self, dataset_name: str) -> bytes: def read_data(self, file_path: str) -> IOBase: return open(file_path, "rb") - def get_datasets(self) -> List[dict]: - try: - with pd.ExcelFile(self.dataset_path) as xl: - sheet_names = xl.sheet_names - if ExcelDataSheets.DATASETS_SHEET_NAME.value not in sheet_names: - available = ", ".join(repr(s) for s in sheet_names) or "(none)" - raise ExcelTestDataError( - f"The workbook does not contain a '{ExcelDataSheets.DATASETS_SHEET_NAME.value}' sheet. " - f"Submitted sheet names: {available}." - ) - worksheet = xl.parse( - ExcelDataSheets.DATASETS_SHEET_NAME.value, - na_values=[""], - keep_default_na=False, - ) - except ExcelTestDataError: - raise - except Exception as e: - raise ExcelTestDataError( - f"Cannot read the Excel file. Ensure it is a valid .xlsx workbook. " - f"Details: {e}" - ) from e - - missing_cols = sorted( - set(ExcelDataSheets.DATASETS_SHEET_REQUIRED_COLUMNS.value) - - set(worksheet.columns) - ) - if missing_cols: - raise ExcelTestDataError( - f"The '{ExcelDataSheets.DATASETS_SHEET_NAME.value}' sheet is missing a " - f"required {ExcelDataSheets.DATASETS_SHEET_REQUIRED_COLUMNS.value} column(s): " - f"{missing_cols}. Column headers are case-sensitive. " - ) - - datasets = [ - self.get_raw_dataset_metadata(dataset_name=fn) - for fn in worksheet[ExcelDataSheets.DATASET_FILENAME_COLUMN.value] - ] - return datasets - def to_parquet(self, file_path: str) -> str: """ Stub implementation to satisfy abstract interface requirements. diff --git a/cdisc_rules_engine/services/data_services/local_data_service.py b/cdisc_rules_engine/services/data_services/local_data_service.py index 3f0b2d8bc..1fe731e5f 100644 --- a/cdisc_rules_engine/services/data_services/local_data_service.py +++ b/cdisc_rules_engine/services/data_services/local_data_service.py @@ -43,13 +43,48 @@ def __init__( config: ConfigInterface, **kwargs, ): - super(LocalDataService, self).__init__( - cache_service, reader_factory, config, **kwargs - ) - self.dataset_paths: Iterable[str] = kwargs.get("dataset_paths", []) self.encoding: str = kwargs.get("encoding") self.variables_csv_path: str = kwargs.get("variables_csv_path") self.datasets_csv_path: str = kwargs.get("datasets_csv_path") + self.dataset_paths: Iterable[str] = kwargs.get("dataset_paths") or [] + super(LocalDataService, self).__init__( + cache_service, reader_factory, config, **kwargs + ) + + def _initialize_datasets_metadata(self, **kwargs) -> dict[str, SDTMDatasetMetadata]: + """ + Initialize the dataset metadata by reading metadata for all dataset paths. + + Args: + **kwargs: Keyword arguments including dataset_paths + + Returns: + Dictionary mapping dataset name to SDTMDatasetMetadata + """ + result = {} + for dataset_path in self.dataset_paths: + try: + file_metadata, contents_metadata = self.__get_dataset_metadata( + dataset_path + ) + metadata = SDTMDatasetMetadata( + name=contents_metadata["dataset_name"], + first_record=contents_metadata["first_record"], + label=contents_metadata["dataset_label"], + modification_date=contents_metadata["dataset_modification_date"], + filename=file_metadata["name"], + full_path=file_metadata["path"], + file_size=file_metadata["file_size"], + record_count=contents_metadata["dataset_length"], + ) + result[metadata.name] = metadata + except InvalidDatasetFormat: + raise + except Exception as e: + raise InvalidDatasetFormat( + f"Your data file could not be read: {dataset_path}." + ) from e + return result @classmethod def get_instance( @@ -64,8 +99,14 @@ def get_instance( runs multiple times with different encodings in the same process). """ encoding = kwargs.get("encoding") - if cls._instance is None or ( - encoding is not None and cls._instance.encoding != encoding + dataset_paths = kwargs.get("dataset_paths") + if ( + cls._instance is None + or (encoding is not None and cls._instance.encoding != encoding) + or ( + dataset_paths is not None + and cls._instance.dataset_paths != dataset_paths + ) ): service = cls( cache_service=cache_service, @@ -101,41 +142,21 @@ def get_file_matching_pattern(self, prefix: str, pattern: str) -> str: @cached_dataset(DatasetTypes.CONTENTS.value) def get_dataset(self, dataset_name: str, **params) -> DatasetInterface: + full_path = self._datasets_metadata[dataset_name].full_path reader = self._reader_factory.get_service( - basename(dataset_name).split(".")[1].upper() + basename(full_path).split(".")[1].upper() ) - df = reader.from_file(dataset_name) + df = reader.from_file(full_path) return df - @cached_dataset(DatasetTypes.RAW_METADATA.value) - def get_raw_dataset_metadata( - self, dataset_name: str, **kwargs - ) -> SDTMDatasetMetadata: - """ - Returns dataset metadata as DatasetMetadata instance. - """ - file_metadata, contents_metadata = self.__get_dataset_metadata( - dataset_name, **kwargs - ) - return SDTMDatasetMetadata( - name=contents_metadata["dataset_name"], - first_record=contents_metadata["first_record"], - label=contents_metadata["dataset_label"], - modification_date=contents_metadata["dataset_modification_date"], - filename=file_metadata["name"], - full_path=file_metadata["path"], - file_size=file_metadata["file_size"], - record_count=contents_metadata["dataset_length"], - ) - @cached_dataset(DatasetTypes.VARIABLES_METADATA.value) - def get_variables_metadata( - self, dataset_name: str, datasets: list, **params - ) -> DatasetInterface: + def get_variables_metadata(self, dataset_name: str, **params) -> DatasetInterface: """ Gets dataset from blob storage and returns metadata of a certain variable. """ - metadata: dict = self.read_metadata(dataset_name, datasets=datasets) + metadata: dict = self.__read_metadata( + self._datasets_metadata[dataset_name].full_path + ) contents_metadata: dict = metadata["contents_metadata"] metadata_to_return: VariableMetadataContainer = VariableMetadataContainer( contents_metadata @@ -168,19 +189,20 @@ def get_dataset_by_type( dataset_name=dataset_name, **params ) - def read_metadata( - self, file_path: str, datasets: Optional[Iterable[SDTMDatasetMetadata]] = None + def __read_metadata( + self, + dataset_path: str, ) -> dict: - file_size = os.path.getsize(file_path) - file_name = basename(file_path) + file_size = os.path.getsize(dataset_path) + file_name = basename(dataset_path) file_metadata = { - "path": file_path, + "path": dataset_path, "name": file_name, "file_size": file_size, } - if file_name.endswith(".parquet") and datasets: - for obj in datasets: - if obj.full_path == file_path: + if file_name.endswith(".parquet") and self.get_datasets(): + for obj in self.get_datasets(): + if obj.full_path == dataset_path: file_metadata = { "path": obj.original_path, "name": basename(obj.original_path), @@ -225,12 +247,12 @@ def read_metadata( def read_data(self, file_path: str) -> IOBase: return open(file_path, "rb") - def __get_dataset_metadata(self, dataset_name: str, **kwargs) -> Tuple[dict, dict]: + def __get_dataset_metadata(self, dataset_path: str, **kwargs) -> Tuple[dict, dict]: """ Internal method that gets dataset metadata and converts file size if needed. """ - metadata: dict = self.read_metadata(dataset_name, kwargs.get("datasets")) + metadata: dict = self.__read_metadata(dataset_path) file_metadata: dict = metadata["file_metadata"] size_unit: Optional[str] = kwargs.get("size_unit") if size_unit: # convert file size from bytes to desired unit if needed @@ -245,24 +267,6 @@ def to_parquet(self, file_path: str) -> str: ) return reader.to_parquet(file_path) - def get_datasets(self) -> List[dict]: - datasets = [] - for dataset_path in self.dataset_paths: - try: - dataset_metadata = self.get_raw_dataset_metadata( - dataset_name=dataset_path, - variables_csv_path=self.variables_csv_path, - datasets_csv_path=self.datasets_csv_path, - ) - datasets.append(dataset_metadata) - except InvalidDatasetFormat: - raise - except Exception as e: - raise InvalidDatasetFormat( - f"Your data file could not be read: {dataset_path}." - ) from e - return datasets - @staticmethod def is_valid_data(dataset_paths: List[str]) -> bool: for dataset_path in dataset_paths: diff --git a/cdisc_rules_engine/services/data_services/usdm_data_service.py b/cdisc_rules_engine/services/data_services/usdm_data_service.py index 30928a102..83052f2e6 100644 --- a/cdisc_rules_engine/services/data_services/usdm_data_service.py +++ b/cdisc_rules_engine/services/data_services/usdm_data_service.py @@ -71,28 +71,25 @@ def __init__( config: ConfigInterface, **kwargs, ): - super(USDMDataService, self).__init__( - cache_service, reader_factory, config, **kwargs - ) self.dataset_path: str = kwargs.get("dataset_path", "") self.encoding: str = kwargs.get("encoding") with open(os.path.join("resources", "schema", "USDM.yaml")) as entity_dict: self.entity_dict: dict = safe_load(entity_dict) - self.json = self._reader_factory.get_service("USDM").from_file( - self.dataset_path - ) + self.json = reader_factory.get_service("USDM").from_file(self.dataset_path) # Build the id lookup dict once for fast reference resolution self._id_lookup = self.__build_id_lookup(self.json) - self.dataset_content_index: dict = self.__get_datasets_content_index( - dataset_name="USDM_content_index", json=self.json - ) + self.dataset_content_index: List[dict] = self.__get_datasets_content_index() self._jsonpath_cache = {} + super(USDMDataService, self).__init__( + cache_service, reader_factory, config, **kwargs + ) + @classmethod def get_instance( cls, @@ -132,34 +129,40 @@ def get_file_matching_pattern(self, prefix: str, pattern: str) -> str: def get_dataset(self, dataset_name: str, **params) -> DatasetInterface: return self.__get_dataset(dataset_name) - @cached_dataset(DatasetTypes.RAW_METADATA.value) - def get_raw_dataset_metadata( - self, dataset_name: str, **kwargs - ) -> SDTMDatasetMetadata: + def _initialize_datasets_metadata(self, **kwargs) -> dict[str, SDTMDatasetMetadata]: """ - Returns dataset metadata as DatasetMetadata instance. + Initialize the dataset metadata by reading metadata for all datasets in the USDM JSON. + + Returns: + Dictionary mapping dataset name to SDTMDatasetMetadata """ - dataset = self.get_dataset(dataset_name=dataset_name) - domain = self.__get_domain_from_dataset_name(dataset_name) - return SDTMDatasetMetadata( - name=domain, - first_record={"DOMAIN": domain}, - label=domain, - modification_date=datetime.fromtimestamp( - os.path.getmtime(self.dataset_path) - ).isoformat(), - filename=basename(dataset_name), - full_path=dataset_name, - file_size=0, - record_count=len(dataset), - ) + result = {} + for dataset_info in self.dataset_content_index: + dataset_name = dataset_info.get("dataset_name") + if not dataset_name: + continue + dataset = self.__get_dataset(dataset_name) + metadata = SDTMDatasetMetadata( + name=dataset_name, + first_record={"DOMAIN": dataset_name}, + label=dataset_name, + modification_date=datetime.fromtimestamp( + os.path.getmtime(self.dataset_path) + ).isoformat(), + filename=basename(self.dataset_path), + full_path=self.dataset_path, + file_size=0, + record_count=len(dataset), + ) + result[dataset_name] = metadata + return result @cached_dataset(DatasetTypes.VARIABLES_METADATA.value) def get_variables_metadata(self, dataset_name: str, **params) -> DatasetInterface: """ Gets dataset from blob storage and returns metadata of a certain variable. """ - metadata: dict = self.read_metadata(dataset_name) + metadata: dict = self.__read_entity_metadata(dataset_name) contents_metadata: dict = metadata["contents_metadata"] metadata_to_return: VariableMetadataContainer = VariableMetadataContainer( contents_metadata @@ -177,7 +180,7 @@ def get_define_xml_contents(self, dataset_name: str) -> bytes: "Can't use 'get_define_xml_contents' in USDMDataService!" ) - def read_metadata(self, dataset_name: str) -> dict: + def __read_entity_metadata(self, dataset_name: str) -> dict: np_json_type_map: dict = {"O": "string", "float64": "float"} file_size = os.path.getsize(self.dataset_path) file_name = basename(self.dataset_path) @@ -217,18 +220,6 @@ def read_metadata(self, dataset_name: str) -> dict: def read_data(self, file_path: str) -> IOBase: return open(file_path, "rb") - def get_datasets(self) -> List[dict]: - datasets = [] - for dataset in self.dataset_content_index: - dataset_name = dataset.get("dataset_name") - if not dataset_name: - continue - dataset_metadata: SDTMDatasetMetadata = self.get_raw_dataset_metadata( - dataset_name=dataset_name - ) - datasets.append(dataset_metadata) - return datasets - def to_parquet(self, file_path: str) -> str: """ Stub implementation to satisfy abstract interface requirements. @@ -403,9 +394,8 @@ def __get_entity_name(self, value, parent: Any, _depth=0): else: return api_type - def __read_metadata( + def __read_node_metadata( self, - json, parent_node: DatumInContext, child_value, content_path: str, @@ -415,7 +405,7 @@ def __read_metadata( f"{parent_node.path}".endswith("Id") or f"{parent_node.path}".endswith("Ids") ): - definition = self.__find_definition(json, child_value) + definition = self.__find_definition(self.json, child_value) if definition: child_value = definition ty = "reference" @@ -430,26 +420,24 @@ def __read_metadata( def __get_full_path(node: DatumInContext): return f"{node.full_path}".replace(".[", "[") - @cached_dataset(DatasetTypes.CONTENTS.value) - def __get_datasets_content_index(self, dataset_name: str, json) -> List[dict]: + def __get_datasets_content_index(self) -> List[dict]: """ This is a bit convoluted because there is a bug in jsonpath_ng where this query does not return object values within an array """ metadata = [] - for node in parse("$..*").find(json): + for node in parse("$..*").find(self.json): if type(node.value) is list: for index, child in enumerate(node.value): - if metadatum := self.__read_metadata( - json, + if metadatum := self.__read_node_metadata( node, child, f"{USDMDataService.__get_full_path(node)}[{index}]", ): metadata.append(metadatum) else: - if metadatum := self.__read_metadata( - json, node, node.value, USDMDataService.__get_full_path(node) + if metadatum := self.__read_node_metadata( + node, node.value, USDMDataService.__get_full_path(node) ): metadata.append(metadatum) dataset_dict = {} @@ -463,19 +451,12 @@ def __get_datasets_content_index(self, dataset_name: str, json) -> List[dict]: ) return [ { - "dataset_name": self.__get_dataset_name_from_domain(key), - "domain": key, + "dataset_name": key, "content_paths": value, } for key, value in dataset_dict.items() ] - def __get_dataset_name_from_domain(self, domain_name: str) -> str: - return os.path.join(self.dataset_path, "{}.json".format(domain_name)) - - def __get_domain_from_dataset_name(self, dataset_name: str) -> str: - return basename(dataset_name).split(".")[0] - @staticmethod def is_valid_data(dataset_paths: Sequence[str], encoding: str = None): if ( diff --git a/cdisc_rules_engine/services/define_xml/define_xml_reader_factory.py b/cdisc_rules_engine/services/define_xml/define_xml_reader_factory.py index 61156cfad..1e6dff126 100644 --- a/cdisc_rules_engine/services/define_xml/define_xml_reader_factory.py +++ b/cdisc_rules_engine/services/define_xml/define_xml_reader_factory.py @@ -1,4 +1,4 @@ -import os +from os.path import dirname, join from xml.etree import ElementTree from re import compile from typing import Union @@ -17,7 +17,6 @@ from cdisc_rules_engine.services.define_xml.base_define_xml_reader import ( BaseDefineXMLReader, ) -from cdisc_rules_engine.utilities.utils import get_directory_path class DefineXMLReaderFactory: @@ -107,9 +106,9 @@ def _from_namespace(cls, namespace: str) -> BaseDefineXMLReader: def get_define_xml_reader( cls, dataset_path: str, define_xml_path: str, data_service, cache ): - directory_path = get_directory_path(dataset_path) + directory_path = dirname(dataset_path) if define_xml_path is None: - define_xml_path: str = os.path.join( + define_xml_path: str = join( directory_path, DEFINE_XML_FILE_NAME, ) diff --git a/cdisc_rules_engine/services/reporting/report_factory.py b/cdisc_rules_engine/services/reporting/report_factory.py index ad388075e..6228b4ae3 100644 --- a/cdisc_rules_engine/services/reporting/report_factory.py +++ b/cdisc_rules_engine/services/reporting/report_factory.py @@ -1,10 +1,9 @@ -from typing import List, Type, Iterable +from typing import List, Type from cdisc_rules_engine.enums.report_types import ReportTypes from cdisc_rules_engine.interfaces import DataServiceInterface from cdisc_rules_engine.models.rule_validation_result import RuleValidationResult from cdisc_rules_engine.models.validation_args import Validation_args -from cdisc_rules_engine.models.sdtm_dataset_metadata import SDTMDatasetMetadata from cdisc_rules_engine.services.reporting.base_report_data import ( BaseReportData, ) @@ -31,15 +30,14 @@ class ReportFactory: def __init__( self, - datasets: Iterable[SDTMDatasetMetadata], results: List[RuleValidationResult], elapsed_time: float, args: Validation_args, data_service: DataServiceInterface, dictionary_versions, ): - self._datasets = datasets - self._dataset_paths = [dataset.full_path for dataset in datasets] + self._datasets = data_service.get_datasets() + self._dataset_paths = [dataset.full_path for dataset in self._datasets] self._results = results self._elapsed_time = elapsed_time self._args = args diff --git a/cdisc_rules_engine/services/reporting/usdm_report_data.py b/cdisc_rules_engine/services/reporting/usdm_report_data.py index fce4ca59e..0fb94ad50 100644 --- a/cdisc_rules_engine/services/reporting/usdm_report_data.py +++ b/cdisc_rules_engine/services/reporting/usdm_report_data.py @@ -1,6 +1,5 @@ from datetime import datetime from typing import BinaryIO, Iterable -import os from cdisc_rules_engine.enums.default_file_paths import DefaultFilePaths from cdisc_rules_engine.enums.execution_status import ExecutionStatus @@ -80,7 +79,7 @@ def get_conformance_details_data( ReportMetadataItem( "JSON file name", 9, - os.path.basename(os.path.dirname(self._datasets[0].full_path)), + self._datasets[0].filename, ) ) conformance_details.append( @@ -126,7 +125,7 @@ def get_summary_data(self) -> list[dict]: ): summary_item = { "entity": result.get("entity") - or (result.get("dataset", "") or "").replace(".json", ""), + or (result.get("dataset", "") or ""), "core_id": validation_result.id, "cdisc_rule_id": validation_result.cdisc_rule_id, "message": result.get("message"), @@ -188,8 +187,7 @@ def _issue_details( "cdisc_rule_id": validation_result.cdisc_rule_id, "message": result.get("message"), "executability": validation_result.executability, - "entity": error.get("entity") - or error.get("dataset", "").replace(".json", ""), + "entity": error.get("entity") or error.get("dataset", ""), "instance_id": error.get("instance_id"), "path": error.get("path"), "attributes": variables, @@ -210,8 +208,7 @@ def _error_details(self, validation_result: RuleValidationResult, result: dict): "cdisc_rule_id": validation_result.cdisc_rule_id, "message": (f"{result.get('message')} - {error.get('error')}"), "executability": validation_result.executability, - "entity": error.get("entity") - or error.get("dataset", "").replace(".json", ""), + "entity": error.get("entity") or error.get("dataset", ""), "instance_id": "", "path": "", "attributes": "", diff --git a/cdisc_rules_engine/utilities/data_processor.py b/cdisc_rules_engine/utilities/data_processor.py index 5a928cdb0..026df1761 100644 --- a/cdisc_rules_engine/utilities/data_processor.py +++ b/cdisc_rules_engine/utilities/data_processor.py @@ -19,7 +19,7 @@ ) from cdisc_rules_engine.exceptions.custom_exceptions import PreprocessingError from cdisc_rules_engine.utilities.utils import ( - search_in_list_of_dicts, + search_in_list, custom_str_conversion, ) from cdisc_rules_engine.utilities.sdtm_utilities import add_variable_wildcards @@ -106,13 +106,15 @@ def merge_on_relrec_record( model_metadata = ( dataset_preprocessor._data_service.library_metadata.model_metadata ) - file_info: SDTMDatasetMetadata = search_in_list_of_dicts( + dataset_metadata: SDTMDatasetMetadata = search_in_list( datasets, lambda item: item.domain == relrec_row["RDOMAIN_RIGHT"] ) - if not file_info: + if not dataset_metadata: return DatasetInterface() - right_dataset: DatasetInterface = dataset_preprocessor._download_dataset( - file_info.filename + right_dataset: DatasetInterface = ( + dataset_preprocessor._data_service.get_dataset( + dataset_name=dataset_metadata.name + ) ) variables_with_wildcards = { source: f"RELREC.{target}" @@ -494,7 +496,7 @@ def column_metadata_equal_to_define_and_library( library_metadata: dict, rule: dict, ) -> bool: - define_variable_metadata: Optional[dict] = search_in_list_of_dicts( + define_variable_metadata: Optional[dict] = search_in_list( define_metadata, lambda item: item.get("define_variable_name") == column ) if not define_variable_metadata: diff --git a/cdisc_rules_engine/utilities/dataset_preprocessor.py b/cdisc_rules_engine/utilities/dataset_preprocessor.py index 0b34890a1..620f8b7d5 100644 --- a/cdisc_rules_engine/utilities/dataset_preprocessor.py +++ b/cdisc_rules_engine/utilities/dataset_preprocessor.py @@ -1,4 +1,4 @@ -from typing import Iterable, List, Union +from typing import List, Union from cdisc_rules_engine.models.dataset.dataset_interface import DatasetInterface from cdisc_rules_engine.models.sdtm_dataset_metadata import SDTMDatasetMetadata @@ -16,7 +16,6 @@ get_sided_match_keys, ) from cdisc_rules_engine.exceptions.custom_exceptions import PreprocessingError -import os import pandas as pd @@ -46,9 +45,7 @@ def __init__( self._data_service = data_service self._rule_processor = RuleProcessor(self._data_service, cache_service) - def preprocess( # noqa - self, rule: dict, datasets: Iterable[SDTMDatasetMetadata] - ) -> DatasetInterface: + def preprocess(self, rule: dict) -> DatasetInterface: # noqa """ Preprocesses the dataset by merging it with the datasets from the provided rule. @@ -66,7 +63,7 @@ def preprocess( # noqa is_child = bool(domain_details.get("child")) # download other datasets from blob storage and merge if is_child: - file_infos = [] + dataset_metadatas = [] # find parent of SUPP or SQAP dataset if ( (domain_name[:4] == "SUPP" or domain_name[:4] == "SQAP") @@ -77,9 +74,9 @@ def preprocess( # noqa domain_name == "SUPP--" or domain_name == self._dataset_metadata.name ): - file_infos: list[SDTMDatasetMetadata] = [ + dataset_metadatas: list[SDTMDatasetMetadata] = [ item - for item in datasets + for item in self._data_service.get_datasets() if (item.domain == self._dataset_metadata.rdomain) ] # find parent of other datasets @@ -87,8 +84,8 @@ def preprocess( # noqa domain_name == self._dataset_metadata.domain or domain_name == self._dataset_metadata.name ): - file_infos: list[SDTMDatasetMetadata] = self._find_parent_dataset( - datasets, domain_details + dataset_metadatas: list[SDTMDatasetMetadata] = ( + self._find_parent_dataset(domain_details) ) else: if self._is_split_domain(domain_name): @@ -96,9 +93,9 @@ def preprocess( # noqa target_domain_name: str = ( self._dataset_metadata.domain or self._dataset_metadata.name ) - file_infos: list[SDTMDatasetMetadata] = [ + dataset_metadatas: list[SDTMDatasetMetadata] = [ item - for item in datasets + for item in self._data_service.get_datasets() if ( item.domain == domain_name or item.name == domain_name @@ -111,7 +108,7 @@ def preprocess( # noqa ) ] - if not file_infos and not ( + if not dataset_metadatas and not ( (self._dataset_metadata.is_supp and domain_name == "SUPP--") or self._dataset_metadata.name == "RELREC" ): @@ -121,18 +118,18 @@ def preprocess( # noqa ) continue - for file_info in file_infos: - if file_info.domain in merged_domains: + for dataset_metadata in dataset_metadatas: + if dataset_metadata.domain in merged_domains: continue # Try to download the dataset try: - other_dataset: DatasetInterface = self._download_dataset( - file_info.data_service_identifier + other_dataset: DatasetInterface = self._data_service.get_dataset( + dataset_name=dataset_metadata.name ) except Exception as e: raise PreprocessingError( - f"Failed to download dataset '{file_info.data_service_identifier}' for preprocessing: {str(e)}" + f"Failed to download dataset '{dataset_metadata.name}' for preprocessing: {str(e)}" ) referenced_targets = set( @@ -156,38 +153,36 @@ def preprocess( # noqa left_dataset=result, left_dataset_domain_name=self._dataset_metadata.domain, right_dataset=other_dataset, - right_dataset_domain_name=file_info.domain, + right_dataset_domain_name=dataset_metadata.domain, match_keys=domain_details.get("match_key"), - datasets=datasets, ) - merged_domains.add(file_info.domain) + merged_domains.add(dataset_metadata.domain) else: result = self._merge_datasets( left_dataset=result, left_dataset_domain_name=self._dataset_metadata.domain, right_dataset=other_dataset, right_dataset_domain_details=domain_details, - datasets=datasets, ) merged_domains.add( - file_info.domain if file_info.domain else file_info.name + dataset_metadata.domain + if dataset_metadata.domain + else dataset_metadata.name ) return result - def _find_parent_dataset( - self, datasets: Iterable[SDTMDatasetMetadata], domain_details: dict - ) -> SDTMDatasetMetadata: + def _find_parent_dataset(self, domain_details: dict) -> SDTMDatasetMetadata: matching_datasets = [] try: if "RDOMAIN" in self._dataset.columns: rdomain_column = self._dataset.data["RDOMAIN"] unique_domains = set(rdomain_column.unique()) - for dataset in datasets: + for dataset in self._data_service.get_datasets(): if dataset.domain in unique_domains: matching_datasets.append(dataset) else: match_keys = domain_details.get("match_key") - for dataset in datasets: + for dataset in self._data_service.get_datasets(): has_all_match_keys = all( match_key in dataset.first_record for match_key in match_keys ) @@ -209,13 +204,6 @@ def _find_parent_dataset( def _is_split_domain(self, domain: str) -> bool: return domain == self._dataset_metadata.unsplit_name - def _download_dataset(self, filename: str) -> DatasetInterface: - return self._data_service.get_dataset( - dataset_name=os.path.join( - os.path.dirname(self._dataset_metadata.full_path), filename - ) - ) - def _child_merge_datasets( self, left_dataset: DatasetInterface, @@ -223,7 +211,6 @@ def _child_merge_datasets( right_dataset: DatasetInterface, right_dataset_domain_name: str, match_keys: List[str], - datasets: Iterable[SDTMDatasetMetadata] = None, ) -> DatasetInterface: is_supplemental, rdomain_dataset = self._classify_dataset( left_dataset, self._dataset_metadata @@ -514,7 +501,6 @@ def _merge_datasets( # noqa left_dataset_domain_name: str, right_dataset: DatasetInterface, right_dataset_domain_details: dict, - datasets: List[dict], ) -> DatasetInterface: """ Merges datasets on their match keys. @@ -543,7 +529,7 @@ def _merge_datasets( # noqa left_dataset=left_dataset, left_dataset_domain_name=left_dataset_domain_name, relrec_dataset=right_dataset, - datasets=datasets, + datasets=self._data_service.get_datasets(), dataset_preprocessor=self, wildcard=right_dataset_domain_details.get("wildcard"), ) diff --git a/cdisc_rules_engine/utilities/rule_processor.py b/cdisc_rules_engine/utilities/rule_processor.py index 0d289d107..1395947b2 100644 --- a/cdisc_rules_engine/utilities/rule_processor.py +++ b/cdisc_rules_engine/utilities/rule_processor.py @@ -1,8 +1,7 @@ import re import copy -import os -from typing import Iterable, List, Optional, Union, Tuple +from typing import List, Optional, Union, Tuple from cdisc_rules_engine.enums.rule_types import RuleTypes from cdisc_rules_engine.interfaces.cache_service_interface import ( CacheServiceInterface, @@ -38,9 +37,8 @@ from cdisc_rules_engine.services import logger from cdisc_rules_engine.utilities.data_processor import DataProcessor from cdisc_rules_engine.utilities.utils import ( - get_directory_path, get_operations_cache_key, - search_in_list_of_dicts, + search_in_list, ) from cdisc_rules_engine.models.external_dictionaries_container import ( ExternalDictionariesContainer, @@ -189,7 +187,7 @@ def _domain_matched_ap_or_supp( ) def rule_applies_to_data_structure( - self, rule, datasets, dataset_metadata: SDTMDatasetMetadata + self, rule, dataset_metadata: SDTMDatasetMetadata ): datastructures = rule.get("data_structures") or {} included_datastructures = datastructures.get("Include", []) @@ -202,8 +200,6 @@ def rule_applies_to_data_structure( if ALL_KEYWORD in included_datastructures: return True ds = self.data_service.get_data_structure( - dataset_metadata.full_path, - datasets, dataset_metadata, ) if ds and (ds not in included_datastructures): @@ -215,7 +211,6 @@ def rule_applies_to_data_structure( def rule_applies_to_class( self, rule, - datasets: Iterable[SDTMDatasetMetadata], dataset_metadata: SDTMDatasetMetadata, ): """ @@ -239,17 +234,14 @@ def rule_applies_to_class( excluded_classes = classes.get("Exclude", []) is_included = True is_excluded = False - dataset_name = dataset_metadata.full_path if included_classes: if ALL_KEYWORD in included_classes: return True variables = self.data_service.get_variables_metadata( - dataset_name=dataset_name, datasets=datasets + dataset_name=dataset_metadata.name ).data.variable_name class_name = self.data_service.get_dataset_class( variables, - dataset_metadata.full_path, - datasets, dataset_metadata, ) if (class_name not in included_classes) and not ( @@ -258,12 +250,10 @@ def rule_applies_to_class( is_included = False if excluded_classes: variables = self.data_service.get_variables_metadata( - dataset_name=dataset_name, datasets=datasets + dataset_name=dataset_metadata.name ).data.variable_name class_name = self.data_service.get_dataset_class( variables, - dataset_metadata.full_path, - datasets, dataset_metadata, ) if class_name and ( @@ -275,10 +265,8 @@ def rule_applies_to_class( def rule_applies_to_use_case( self, - dataset_metadata: SDTMDatasetMetadata, rule: dict, standard: str, - standard_substandard: str, use_case: str, ) -> bool: if standard.lower() != "tig": @@ -327,7 +315,6 @@ def perform_rule_operations( rule: dict, dataset: DatasetInterface, dataset_metadata: SDTMDatasetMetadata, - datasets: Iterable[SDTMDatasetMetadata], standard: str, standard_version: str, standard_substandard: str, @@ -378,13 +365,12 @@ def perform_rule_operations( ct_version=operation.get("version"), define_xml_path=kwargs.get("define_xml_path"), dataframe=dataset_copy, - dataset_path=dataset_metadata.full_path, - datasets=datasets, + dataframe_metadata=dataset_metadata, delimiter=operation.get("delimiter"), dictionary_term_type=operation.get("dictionary_term_type"), - directory_path=get_directory_path(dataset_metadata.full_path), domain=domain, domain_class=operation.get("domain_class"), + evaluation_dataset_metadata=dataset_metadata, external_dictionaries=external_dictionaries, external_dictionary_term_variable=operation.get( "external_dictionary_term_variable" @@ -440,7 +426,7 @@ def perform_rule_operations( def _execute_operation( self, operation_params: OperationParams, - dataset: DatasetInterface, + evaluation_dataset: DatasetInterface, previous_operations: List[str] = [], ): """ @@ -451,12 +437,11 @@ def _execute_operation( # check cache cache_key = get_operations_cache_key( core_id=operation_params.core_id, - directory_path=operation_params.directory_path, operation_name=operation_params.operation_name, + evaluation_dataset_name=operation_params.evaluation_dataset_metadata.name, domain=operation_params.domain, grouping=";".join(operation_params.grouping), target_variable=operation_params.target, - dataset_path=operation_params.dataset_path, operation_id=operation_params.operation_id, ) if previous_operations: @@ -469,8 +454,8 @@ def _execute_operation( operation_params.dataframe, operation_params.domain ): # download other domain - domain_details: dict = search_in_list_of_dicts( - operation_params.datasets, + dataset_metadata: DatasetMetadata = search_in_list( + self.data_service.get_datasets(), lambda item: ( item.unsplit_name == operation_params.domain or ( @@ -479,7 +464,7 @@ def _execute_operation( ) ), ) - if domain_details is None: + if dataset_metadata is None: raise DomainNotFoundError( f"Failed to execute rule operation. " f"Domain {operation_params.domain} does not exist. " @@ -487,19 +472,16 @@ def _execute_operation( f"Target: {operation_params.target}, " f"Core ID: {operation_params.core_id}" ) - file_path: str = os.path.join( - get_directory_path(operation_params.dataset_path), - domain_details.data_service_identifier, - ) operation_params.dataframe = self.data_service.get_dataset( - dataset_name=file_path + dataset_name=dataset_metadata.name ) + operation_params.dataframe_metadata = dataset_metadata # call the operation operation = operations_factory.get_service( operation_params.operation_name, operation_params=operation_params, - original_dataset=dataset, + evaluation_dataset=evaluation_dataset, cache=self.cache, data_service=self.data_service, library_metadata=self.library_metadata, @@ -586,7 +568,7 @@ def _preprocess_operation_params( if domain_details.is_supp: current_domain = domain_details.rdomain for param_name in vars(params_copy): - if param_name in ("datasets", "dataframe"): + if param_name in ("dataframe"): continue param_value = getattr(params_copy, param_name) updated_value = self._replace_wildcards_in_value( @@ -653,9 +635,7 @@ def is_suitable_for_validation( self, rule: dict, dataset_metadata: SDTMDatasetMetadata, - datasets: Iterable[SDTMDatasetMetadata], standard, - standard_substandard: str, use_case: str, ) -> Tuple[bool, str]: """Check if rule is suitable and return reason if not""" @@ -671,10 +651,8 @@ def is_suitable_for_validation( ): return self.log_suitable_for_validation(rule_id, dataset_name) if not self.rule_applies_to_use_case( - dataset_metadata, rule, standard, - standard_substandard, use_case, ): reason = ( @@ -683,7 +661,7 @@ def is_suitable_for_validation( ) logger.info(f"is_suitable_for_validation. {reason}, result=False") return False, reason - if not self.rule_applies_to_data_structure(rule, datasets, dataset_metadata): + if not self.rule_applies_to_data_structure(rule, dataset_metadata): reason = ( f"Rule skipped - doesn't apply to data structure for " f"rule id={rule_id}, dataset={dataset_name}" @@ -697,7 +675,7 @@ def is_suitable_for_validation( ) logger.info(f"is_suitable_for_validation. {reason}, result=False") return False, reason - if not self.rule_applies_to_class(rule, datasets, dataset_metadata): + if not self.rule_applies_to_class(rule, dataset_metadata): reason = ( f"Rule skipped - doesn't apply to class for " f"rule id={rule_id}, dataset={dataset_name}" diff --git a/cdisc_rules_engine/utilities/sdtm_utilities.py b/cdisc_rules_engine/utilities/sdtm_utilities.py index c00c0d6e2..0e464490e 100644 --- a/cdisc_rules_engine/utilities/sdtm_utilities.py +++ b/cdisc_rules_engine/utilities/sdtm_utilities.py @@ -7,14 +7,14 @@ SUPPLEMENTARY_DOMAINS, ) from cdisc_rules_engine.constants.metadata_columns import ( - SOURCE_FILENAME, + SOURCE_DATASET_NAME, SOURCE_ROW_NUMBER, ) from cdisc_rules_engine.interfaces.data_service_interface import DataServiceInterface from cdisc_rules_engine.models.dataset.dataset_interface import DatasetInterface from cdisc_rules_engine.models.dataset_metadata import DatasetMetadata from cdisc_rules_engine.utilities.utils import ( - search_in_list_of_dicts, + search_in_list, ) from cdisc_rules_engine.constants.classes import ( DETECTABLE_CLASSES, @@ -74,13 +74,13 @@ def get_class_and_dataset_metadata( """ for c in library_metadata.standard_metadata.get("classes", []): - dataset_details = search_in_list_of_dicts( + dataset_details = search_in_list( c.get("datasets", []), lambda item: item["name"] == dataset_name ) if dataset_details: return c, dataset_details for c in library_metadata.model_metadata.get("classes", []): - dataset_details = search_in_list_of_dicts( + dataset_details = search_in_list( c.get("datasets", []), lambda item: item["name"] == dataset_name ) if dataset_details: @@ -107,8 +107,6 @@ def get_variables_metadata_from_standard( # noqa library_metadata, data_service, dataset_metadata: SDTMDatasetMetadata, - dataset_path: str, - datasets: Iterable[SDTMDatasetMetadata], ): add_AP = False domain = dataset_metadata.unsplit_name @@ -138,11 +136,9 @@ def get_variables_metadata_from_standard( # noqa IG_class_details.get("name") ) else: - class_name = data_service._handle_custom_domains( - data_service.get_dataset(dataset_name=dataset_metadata.full_path), + class_name = data_service.handle_custom_domains( + data_service.get_dataset(dataset_name=dataset_metadata.name), dataset_metadata, - dataset_path, - datasets, ) model_class_details = get_class_metadata(model_details, class_name) # Both custom and standard General Observations pull from model @@ -306,7 +302,7 @@ def get_class_metadata( } """ - class_metadata: Optional[dict] = search_in_list_of_dicts( + class_metadata: Optional[dict] = search_in_list( model_details.get("classes", []), lambda item: convert_library_class_name_to_ct_class(item["name"]) == dataset_class, @@ -343,8 +339,6 @@ def group_class_variables_by_role( def get_variables_metadata_from_standard_model( # noqa dataframe, - datasets: Iterable[SDTMDatasetMetadata], - dataset_path: str, data_service: DataServiceInterface, library_metadata: LibraryMetadataContainer, dataset_metadata: SDTMDatasetMetadata, @@ -382,9 +376,7 @@ def get_variables_metadata_from_standard_model( # noqa IG_class_details.get("name") ) else: - class_name = data_service._handle_custom_domains( - dataframe, dataset_metadata, dataset_path, datasets - ) + class_name = data_service.handle_custom_domains(dataframe, dataset_metadata) if class_name in DETECTABLE_CLASSES: model_class_details = get_class_metadata(model_details, class_name) ( @@ -480,7 +472,7 @@ def get_variables_metadata_from_standard_model( # noqa def get_model_domain_metadata(model_details: dict, domain_name: str) -> dict: # Get domain metadata from model - domain_details: Optional[dict] = search_in_list_of_dicts( + domain_details: Optional[dict] = search_in_list( model_details.get("datasets", []), lambda item: item["name"] == domain_name ) @@ -562,8 +554,8 @@ def tag_source( ) -> DatasetInterface: """ For sdtm split datasets, - Adds source filename and row number to dataset + Adds source dataset name and row number to dataset """ - dataset[SOURCE_FILENAME] = dataset_metadata.filename + dataset[SOURCE_DATASET_NAME] = dataset_metadata.name dataset[SOURCE_ROW_NUMBER] = list(range(1, dataset.len() + 1)) return dataset diff --git a/cdisc_rules_engine/utilities/utils.py b/cdisc_rules_engine/utilities/utils.py index 6c7f15675..8ee1c1e75 100644 --- a/cdisc_rules_engine/utilities/utils.py +++ b/cdisc_rules_engine/utilities/utils.py @@ -10,6 +10,7 @@ import re import ast import pandas as pd +from dataclasses import fields from datetime import datetime from typing import Callable, List, Optional, Union from uuid import UUID @@ -21,6 +22,22 @@ from cdisc_rules_engine.constants.adam_products import ADAM_PRODUCTS +def convert_dataclass_to_superclass[T](instance: object, superclass: type[T]) -> T: + """ + Convert a dataclass subclass instance to its superclass by copying all fields. + + Args: + instance: The subclass instance to convert + superclass: The target superclass type + + Returns: + A new instance of the superclass with fields copied from the subclass instance + """ + return superclass( + **{field.name: getattr(instance, field.name) for field in fields(superclass)} + ) + + def convert_file_size(size_in_bytes: int, desired_unit: str) -> float: """ Converts file size from bytes to any of the following units: @@ -112,47 +129,9 @@ def is_valid_iso_date(date_to_validate: str) -> bool: return is_valid -def get_dataset_path( - study_id: str, data_bundle_id: str = None, filename: str = None -) -> str: - """ - Returns a path to dataset in the blob storage. - """ - path: str = study_id - if data_bundle_id: - path = os.path.join(path, data_bundle_id) - if filename: - path = os.path.join(path, filename) - return path - - DATASET_CACHE_KEY_TEMPLATE: str = "{dataset_path}_{dataset_type}" -def get_dataset_cache_key_from_study( - study_id: str, - data_bundle_id: str = None, - filename: str = None, - dataset_type: str = None, -) -> str: - """ - Creates a cache key for a dataset. - Usually, template of a dataset cache key is {dataset_path}_{dataset_type}. - Ex.: CDISC01/test/ae.xpt_contents or CDISC01/test/ae.xpt_metadata. - So, the function also builds the path. - - If dataset_type parameter is not passed, the returned key - can be used to clean several values with matching key pattern. - dataset_type param can be: contents, metadata, variables_metadata. - """ - dataset_path: str = get_dataset_path(study_id, data_bundle_id, filename) - if dataset_type: - dataset_path = DATASET_CACHE_KEY_TEMPLATE.format( - dataset_path=dataset_path, dataset_type=dataset_type - ) - return dataset_path - - def get_dataset_cache_key_from_path(dataset_path: str, dataset_type: str) -> str: return DATASET_CACHE_KEY_TEMPLATE.format( dataset_path=dataset_path, dataset_type=dataset_type @@ -229,29 +208,30 @@ def replace_pattern_in_list_of_strings( def get_operations_cache_key( core_id: str, - directory_path: str, operation_id: str, domain: str = None, operation_name: str = None, + evaluation_dataset_name: str = None, grouping: str = None, target_variable: str = None, - dataset_path: str = None, ) -> str: """ Creates the cache key for operations. """ - key = f"operations/{core_id}/{directory_path}/{operation_id}" - optional_items = [domain, operation_name, grouping, target_variable, dataset_path] + key = f"operations/{core_id}/{operation_id}" + optional_items = [ + domain, + operation_name, + evaluation_dataset_name, + grouping, + target_variable, + ] for item in optional_items: if item: key = f"{key}/{item}" return key -def get_directory_path(dataset_path): - return os.path.dirname(dataset_path) - - def serialize_rule(rule: dict) -> dict: """ Converts rule "conditions" to dict. @@ -320,9 +300,9 @@ def get_meddra_code_term_pairs_cache_key(meddra_path: str) -> str: return f"meddra_valid_code_term_pairs_{meddra_path}" -def get_item_index_by_condition( - list_of_dicts: List[dict], condition: Callable -) -> Optional[int]: +def get_item_index_by_condition[ + T +](list_of_dicts: List[T], condition: Callable[[T], bool]) -> Optional[int]: """ Uses linear search to return index of element in unsorted list which applies to the condition. @@ -332,9 +312,9 @@ def get_item_index_by_condition( return index -def search_in_list_of_dicts( - list_of_dicts: List[dict], condition: Callable -) -> Optional[dict]: +def search_in_list[ + T +](list_of_dicts: List[T], condition: Callable[[T], bool]) -> Optional[T]: """ Returns an element of unsorted list that applies to the condition. """ diff --git a/scripts/list_dataset_metadata_handler.py b/scripts/list_dataset_metadata_handler.py index 053713a5d..17c6f7b62 100644 --- a/scripts/list_dataset_metadata_handler.py +++ b/scripts/list_dataset_metadata_handler.py @@ -51,9 +51,7 @@ def list_dataset_metadata_handler(dataset_paths: Tuple[str]) -> List[dict]: raise ValueError(error_msg) cache_service = CacheServiceFactory(config).get_service() - data_service = DataServiceFactory(config, cache_service).get_service() - metadata: List[SDTMDatasetMetadata] = [ - data_service.get_raw_dataset_metadata(dataset_name=path) - for path in dataset_paths - ] - return DatasetMetadataSerializer(metadata).data + factory = DataServiceFactory(config, cache_service) + data_service = factory.get_data_service(dataset_paths=dataset_paths) + datasets_metadata: List[SDTMDatasetMetadata] = data_service.get_datasets() + return DatasetMetadataSerializer(datasets_metadata).data diff --git a/scripts/run_validation.py b/scripts/run_validation.py index 750edd1ab..e44d6aad2 100644 --- a/scripts/run_validation.py +++ b/scripts/run_validation.py @@ -101,7 +101,7 @@ def validate_single_rule( errors_per_dataset_flag=per_dataset_flag, encoding=args.encoding, ) - results = engine.validate_single_rule(rule, datasets) + results = engine.validate_single_rule(rule) results = list(itertools.chain(*results.values())) if args.progress == ProgressParameterOptions.VERBOSE_OUTPUT.value: engine_logger.log(f"{rule['core_id']} validation complete") @@ -128,14 +128,14 @@ def initialize_logger(disabled, log_level): def _convert_datasets_to_parquet_if_needed( - data_service, datasets, created_files, large_dataset_validation: bool + data_service, created_files, large_dataset_validation: bool ): if not (large_dataset_validation and data_service.standard != "usdm"): return engine_logger.warning( "Large datasets must use parquet format, converting all datasets to parquet" ) - for dataset in datasets: + for dataset in data_service.get_datasets(): file_path = dataset.full_path if file_path.endswith(".parquet"): continue @@ -188,7 +188,6 @@ def run_validation(args: Validation_args): datasets = data_service.get_datasets() _convert_datasets_to_parquet_if_needed( data_service, - datasets, created_files, large_dataset_validation, ) @@ -223,7 +222,7 @@ def run_validation(args: Validation_args): elapsed_time = end - start engine_logger.info("Done Rule execution, creating reports") reporting_factory = ReportFactory( - datasets, results, elapsed_time, args, data_service, dictionary_versions + results, elapsed_time, args, data_service, dictionary_versions ) reporting_services: List[BaseReport] = reporting_factory.get_report_services() output_files = [] @@ -324,5 +323,5 @@ def run_single_rule_validation( engine.rule_processor = RuleProcessor(data_service, cache, library_metadata) engine.data_processor = DataProcessor(data_service, cache) rule = Rule.from_cdisc_metadata(rule) - results = engine.validate_single_rule(rule, datasets) + results = engine.validate_single_rule(rule) return results diff --git a/tests/QARegressionTests/test_Issues/test_CoreIssue1345.py b/tests/QARegressionTests/test_Issues/test_CoreIssue1345.py index 2b66a4e3a..bb160fd3f 100644 --- a/tests/QARegressionTests/test_Issues/test_CoreIssue1345.py +++ b/tests/QARegressionTests/test_Issues/test_CoreIssue1345.py @@ -78,7 +78,7 @@ def test_engine_correctly_merges_datasets_and_flags_row_uniqueness_issues( dm_related_issues = [ issue for issue in json_report.get("Issue_Details", []) - if issue.get("dataset", "").lower() in {"dm.json", "suppdm.json"} + if issue.get("dataset", "") in {"DM", "SUPPDM"} ] assert not dm_related_issues, ( @@ -88,7 +88,7 @@ def test_engine_correctly_merges_datasets_and_flags_row_uniqueness_issues( dm_related_summary = [ s for s in json_report.get("Issue_Summary", []) - if s.get("dataset", "").lower() in {"dm.json", "suppdm.json"} + if s.get("dataset", "") in {"DM", "SUPPDM"} ] assert not dm_related_summary, ( @@ -99,7 +99,7 @@ def test_engine_correctly_merges_datasets_and_flags_row_uniqueness_issues( ec_detail_issues = [ i for i in json_report.get("Issue_Details", []) - if i.get("dataset", "").lower() == "ec.json" + if i.get("dataset", "") in {"EC"} ] assert ( @@ -112,7 +112,7 @@ def test_engine_correctly_merges_datasets_and_flags_row_uniqueness_issues( ec_summary_issues = [ s for s in json_report.get("Issue_Summary", []) - if s.get("dataset", "").lower() == "ec.json" + if s.get("dataset", "") in {"EC"} ] assert ( @@ -147,7 +147,7 @@ def test_engine_correctly_processes_relrec_when_supp_datasets_provided( relrec_issues = [ i for i in json_report.get("Issue_Details", []) - if i.get("dataset", "").lower() == "relrec.json" + if i.get("dataset", "") in {"RELREC"} ] assert ( len(relrec_issues) == 2 @@ -157,7 +157,7 @@ def test_engine_correctly_processes_relrec_when_supp_datasets_provided( ec_detail_issues = [ i for i in json_report.get("Issue_Details", []) - if i.get("dataset", "").lower() == "ec.json" + if i.get("dataset", "") in {"EC"} ] assert ( len(ec_detail_issues) == 2 diff --git a/tests/QARegressionTests/test_Issues/test_CoreIssue1348.py b/tests/QARegressionTests/test_Issues/test_CoreIssue1348.py index 0cbad8d0a..74c41a025 100644 --- a/tests/QARegressionTests/test_Issues/test_CoreIssue1348.py +++ b/tests/QARegressionTests/test_Issues/test_CoreIssue1348.py @@ -13,7 +13,7 @@ ) _summary = [ { - "dataset": "StudyVersion.xpt", + "dataset": "StudyVersion", "core_id": "CORE-000409", "message": _message, "issues": 7, @@ -24,7 +24,7 @@ "core_id": "CORE-000409", "message": _message, "executability": "fully executable", - "dataset": "StudyVersion.xpt", + "dataset": "StudyVersion", "USUBJID": "", "row": 1, "SEQ": "", @@ -49,7 +49,7 @@ "core_id": "CORE-000409", "message": _message, "executability": "fully executable", - "dataset": "StudyVersion.xpt", + "dataset": "StudyVersion", "USUBJID": "", "row": 2, "SEQ": "", @@ -67,7 +67,7 @@ "core_id": "CORE-000409", "message": _message, "executability": "fully executable", - "dataset": "StudyVersion.xpt", + "dataset": "StudyVersion", "USUBJID": "", "row": 3, "SEQ": "", @@ -92,7 +92,7 @@ "core_id": "CORE-000409", "message": _message, "executability": "fully executable", - "dataset": "StudyVersion.xpt", + "dataset": "StudyVersion", "USUBJID": "", "row": 4, "SEQ": "", @@ -110,7 +110,7 @@ "core_id": "CORE-000409", "message": _message, "executability": "fully executable", - "dataset": "StudyVersion.xpt", + "dataset": "StudyVersion", "USUBJID": "", "row": 5, "SEQ": "", @@ -135,7 +135,7 @@ "core_id": "CORE-000409", "message": _message, "executability": "fully executable", - "dataset": "StudyVersion.xpt", + "dataset": "StudyVersion", "USUBJID": "", "row": 6, "SEQ": "", @@ -153,7 +153,7 @@ "core_id": "CORE-000409", "message": _message, "executability": "fully executable", - "dataset": "StudyVersion.xpt", + "dataset": "StudyVersion", "USUBJID": "", "row": 7, "SEQ": "", diff --git a/tests/QARegressionTests/test_Issues/test_CoreIssue1421.py b/tests/QARegressionTests/test_Issues/test_CoreIssue1421.py index 675a6359d..009e7eca0 100644 --- a/tests/QARegressionTests/test_Issues/test_CoreIssue1421.py +++ b/tests/QARegressionTests/test_Issues/test_CoreIssue1421.py @@ -73,7 +73,7 @@ def test_validate_define_xml_against_lib_metadata(): dataset_column_values = [ cell.value for cell in dataset_column[1:] if cell.value is not None ] - assert sorted(set(dataset_column_values)) == ["dm.xpt", "suppec.xpt"] + assert sorted(set(dataset_column_values)) == ["DM", "SUPPEC"] core_id_column = sheet[issue_sheet_coreid_column] core_id_column_values = [ @@ -125,7 +125,7 @@ def test_validate_define_xml_against_lib_metadata(): for row in summary_values: assert row[2] == "Issue with codelist definition in the Define-XML document." datasets_in_summary = set(row[0] for row in summary_values if row[0] is not None) - assert datasets_in_summary == {"dm.xpt", "suppec.xpt"} + assert datasets_in_summary == {"DM", "SUPPEC"} # Delete the excel file if os.path.exists(excel_file_path): diff --git a/tests/conftest.py b/tests/conftest.py index c25c9684a..c2f4c1d8f 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1251,7 +1251,9 @@ def installed_meddra_dictionaries(request) -> dict: """ cache_service = InMemoryCacheService.get_instance() # install dictionaries and save to cache - local_data_service = LocalDataService.get_instance(cache_service=cache_service) + local_data_service = LocalDataService.get_instance( + config=ConfigService(), cache_service=cache_service + ) factory = MedDRATermsFactory(local_data_service) terms: dict = factory.install_terms(meddra_path) @@ -1276,9 +1278,6 @@ def operation_params() -> OperationParams: dataframe=PandasDataset.from_dict({}), target="target", domain="domain", - dataset_path="dataset_path", - directory_path="directory_path", - datasets=[], standard="standard", standard_version="standard_version", external_dictionaries=ExternalDictionariesContainer( diff --git a/tests/unit/test_actions.py b/tests/unit/test_actions.py index f025849f7..7a60e9938 100644 --- a/tests/unit/test_actions.py +++ b/tests/unit/test_actions.py @@ -11,7 +11,7 @@ from cdisc_rules_engine.utilities.sdtm_utilities import tag_source from cdisc_rules_engine.constants.metadata_columns import ( - SOURCE_FILENAME, + SOURCE_DATASET_NAME, SOURCE_ROW_NUMBER, ) @@ -394,7 +394,7 @@ def test_empty_sequential(): {"TVSEQ": [2, 4, 6, None, "", 8], "TV": [1, 3, 5, 7, 9, "8"]} ) variable = DatasetVariable(df) - dataset_metadata = SDTMDatasetMetadata(first_record={"DOMAIN": "TV"}, filename="tv") + dataset_metadata = SDTMDatasetMetadata(first_record={"DOMAIN": "TV"}, name="tv") action = COREActions( [], variable, @@ -478,7 +478,7 @@ def test_nan_handling_in_error_object(): "TVSEQ": [1, 2, 3, 4], } ) - df[SOURCE_FILENAME] = "test.xpt" + df[SOURCE_DATASET_NAME] = "test" df[SOURCE_ROW_NUMBER] = [1, 2, 3, 4] expected_nan_vals = [1.0, None, 3.0, None] diff --git a/tests/unit/test_dataset_builders/test_base_dataset_builder.py b/tests/unit/test_dataset_builders/test_base_dataset_builder.py index 92936891b..3660714db 100644 --- a/tests/unit/test_dataset_builders/test_base_dataset_builder.py +++ b/tests/unit/test_dataset_builders/test_base_dataset_builder.py @@ -61,13 +61,12 @@ def create_mock_reader_with_metadata(item_group_defs, item_defs=None): return mock_reader -def create_builder_instance(dataset_metadata, dataset_path="/path/to/dataset.xpt"): +def create_builder_instance(dataset_metadata): """ Helper function to create a ConcreteDatasetBuilder instance. Args: dataset_metadata: SDTMDatasetMetadata instance - dataset_path: Path to the dataset file Returns: ConcreteDatasetBuilder instance @@ -78,8 +77,6 @@ def create_builder_instance(dataset_metadata, dataset_path="/path/to/dataset.xpt cache_service=MagicMock(), rule_processor=MagicMock(), data_processor=MagicMock(), - dataset_path=dataset_path, - datasets=[dataset_metadata], dataset_metadata=dataset_metadata, define_xml_path="/path/to/define.xml", standard="sdtmig", @@ -171,10 +168,11 @@ def test_get_define_xml_variables_metadata( filename=f"{dataset_name.lower()}.xpt", label=f"{dataset_name} Label", first_record=first_record, + full_path="/path/to/dataset.xpt", ) # Create builder instance - builder = create_builder_instance(dataset_metadata, "/path/to/dataset.xpt") + builder = create_builder_instance(dataset_metadata) # Call the method result = builder.get_define_xml_variables_metadata() @@ -221,10 +219,11 @@ def test_get_define_xml_variables_metadata_domain_not_found( filename="ae.xpt", label="Adverse Events", first_record={"DOMAIN": "AE"}, + full_path="/path/to/ae.xpt", ) # Create builder instance - builder = create_builder_instance(dataset_metadata, "/path/to/ae.xpt") + builder = create_builder_instance(dataset_metadata) # Verify that DomainNotFoundInDefineXMLError is raised with pytest.raises( diff --git a/tests/unit/test_dataset_builders/test_content_metadata_dataset_builder.py b/tests/unit/test_dataset_builders/test_content_metadata_dataset_builder.py index d54064b43..bb1ef4d89 100644 --- a/tests/unit/test_dataset_builders/test_content_metadata_dataset_builder.py +++ b/tests/unit/test_dataset_builders/test_content_metadata_dataset_builder.py @@ -6,6 +6,7 @@ from cdisc_rules_engine.models.library_metadata_container import ( LibraryMetadataContainer, ) +from cdisc_rules_engine.models.sdtm_dataset_metadata import SDTMDatasetMetadata from cdisc_rules_engine.services.data_services import DummyDataService from cdisc_rules_engine.dummy_models.dummy_dataset import DummyDataset from cdisc_rules_engine.models.rule_conditions import ConditionCompositeFactory @@ -297,9 +298,9 @@ def test_ContentMetadataDatasetBuilder_split_datasets(conditions): cache_service=None, rule_processor=processor, data_processor=None, - dataset_path=test_data["datasets"][0]["filename"], - datasets=test_data.get("datasets", {}), - dataset_metadata=test_data["datasets"][0], + dataset_metadata=SDTMDatasetMetadata( + name="QSCG", + ), define_xml_path=None, standard="sdtmig", standard_version="3-4", @@ -324,9 +325,9 @@ def test_ContentMetadataDatasetBuilder_split_datasets(conditions): cache_service=None, rule_processor=processor, data_processor=None, - dataset_path=test_data["datasets"][1]["filename"], - datasets=test_data.get("datasets", {}), - dataset_metadata=test_data["datasets"][1], + dataset_metadata=SDTMDatasetMetadata( + name="QSPG", + ), define_xml_path=None, standard="sdtmig", standard_version="3-4", diff --git a/tests/unit/test_dataset_builders/test_contents_dataset_builder.py b/tests/unit/test_dataset_builders/test_contents_dataset_builder.py index 3df542ff3..78af9b2f2 100644 --- a/tests/unit/test_dataset_builders/test_contents_dataset_builder.py +++ b/tests/unit/test_dataset_builders/test_contents_dataset_builder.py @@ -1,6 +1,6 @@ import pytest from cdisc_rules_engine.constants.metadata_columns import ( - SOURCE_FILENAME, + SOURCE_DATASET_NAME, SOURCE_ROW_NUMBER, ) from cdisc_rules_engine.dataset_builders.contents_dataset_builder import ( @@ -62,8 +62,7 @@ def test_ContentDatasetBuilder_split_datasets(conditions): "conditions": ConditionCompositeFactory.get_condition_composite(conditions), } processor = RuleProcessor(mock_data_service, InMemoryCacheService()) - data_metadata = test_data - datasets = [DummyDataset(data) for data in data_metadata.get("datasets", [])] + datasets = [DummyDataset(data) for data in test_data.get("datasets", [])] expected_output = { "STUDYID": { "0": "CDISCCORE01", @@ -305,35 +304,35 @@ def test_ContentDatasetBuilder_split_datasets(conditions): "26": "ALIVE", "27": "DEAD", }, - SOURCE_FILENAME: { - "0": "ss11.xpt", - "1": "ss11.xpt", - "2": "ss11.xpt", - "3": "ss11.xpt", - "4": "ss11.xpt", - "5": "ss11.xpt", - "6": "ss11.xpt", - "7": "ss11.xpt", - "8": "ss11.xpt", - "9": "ss11.xpt", - "10": "ss11.xpt", - "11": "ss11.xpt", - "12": "ss11.xpt", - "13": "ss11.xpt", - "14": "ss12.xpt", - "15": "ss12.xpt", - "16": "ss12.xpt", - "17": "ss12.xpt", - "18": "ss12.xpt", - "19": "ss12.xpt", - "20": "ss12.xpt", - "21": "ss12.xpt", - "22": "ss12.xpt", - "23": "ss12.xpt", - "24": "ss12.xpt", - "25": "ss12.xpt", - "26": "ss12.xpt", - "27": "ss12.xpt", + SOURCE_DATASET_NAME: { + "0": "SS11", + "1": "SS11", + "2": "SS11", + "3": "SS11", + "4": "SS11", + "5": "SS11", + "6": "SS11", + "7": "SS11", + "8": "SS11", + "9": "SS11", + "10": "SS11", + "11": "SS11", + "12": "SS11", + "13": "SS11", + "14": "SS12", + "15": "SS12", + "16": "SS12", + "17": "SS12", + "18": "SS12", + "19": "SS12", + "20": "SS12", + "21": "SS12", + "22": "SS12", + "23": "SS12", + "24": "SS12", + "25": "SS12", + "26": "SS12", + "27": "SS12", }, SOURCE_ROW_NUMBER: { "0": 1, @@ -377,8 +376,6 @@ def test_ContentDatasetBuilder_split_datasets(conditions): cache_service=None, rule_processor=processor, data_processor=None, - dataset_path="", - datasets=datasets, dataset_metadata=DummyDataset(test_data.get("datasets", {})[0]), define_xml_path=None, standard="sdtmig", diff --git a/tests/unit/test_dataset_builders/test_contents_define_dataset_builder.py b/tests/unit/test_dataset_builders/test_contents_define_dataset_builder.py index 865f39daa..18e5da039 100644 --- a/tests/unit/test_dataset_builders/test_contents_define_dataset_builder.py +++ b/tests/unit/test_dataset_builders/test_contents_define_dataset_builder.py @@ -458,7 +458,7 @@ "2012-12-05", "2012-12-05", ], - "dataset_size": [1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000], + "dataset_size": [0, 0, 0, 0, 0, 0, 0, 0], "dataset_name": [ "ECAA", "ECAA", @@ -555,7 +555,7 @@ "ECTRT": ["ZANOMALINE", "ZANOMALINE", "ZANOMALINE"], "ECDOSE": [5, 5, 5], "ECSTDTC": ["2013-12-01", "2012-12-02", "2012-12-03"], - "dataset_size": [1000, 1000, 1000], + "dataset_size": [0, 0, 0], "dataset_name": ["ECBB", "ECBB", "ECBB"], "dataset_label": [ "Exposure as Collected BB", @@ -683,17 +683,17 @@ "INVESTIGATOR", ], "dataset_size": [ - 1000, - 1000, - 1000, - 1000, - 1000, - 1000, - 1000, - 1000, - 1000, - 1000, - 1000, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, ], "dataset_name": [ "SUPPEC", @@ -829,10 +829,6 @@ def test_contents_define_dataset_builder(dataset_path): cache_service=None, rule_processor=RuleProcessor(mock_data_service, InMemoryCacheService()), data_processor=None, - dataset_path=dataset_path, - datasets=[ - SDTMDatasetMetadata(**dataset) for dataset in dataset_metadata.values() - ], dataset_metadata=SDTMDatasetMetadata(**dataset_metadata[dataset_path]), define_xml_path=None, standard="sdtmig", diff --git a/tests/unit/test_dataset_builders/test_contents_define_variables_dataset_builder.py b/tests/unit/test_dataset_builders/test_contents_define_variables_dataset_builder.py index dec0fc4d3..53dcaf781 100644 --- a/tests/unit/test_dataset_builders/test_contents_define_variables_dataset_builder.py +++ b/tests/unit/test_dataset_builders/test_contents_define_variables_dataset_builder.py @@ -10,6 +10,7 @@ ) from cdisc_rules_engine.services.data_services import LocalDataService from cdisc_rules_engine.models.dataset import PandasDataset +from cdisc_rules_engine.models.sdtm_dataset_metadata import SDTMDatasetMetadata @pytest.mark.parametrize( @@ -101,9 +102,7 @@ def test_contents_define_variables_dataset_builder( cache_service=None, rule_processor=None, data_processor=None, - dataset_path=None, - datasets=None, - dataset_metadata=None, + dataset_metadata=SDTMDatasetMetadata(name="TEST"), define_xml_path=None, standard="sdtmig", standard_version="3-4", diff --git a/tests/unit/test_dataset_builders/test_contents_define_vlm_dataset_builder.py b/tests/unit/test_dataset_builders/test_contents_define_vlm_dataset_builder.py index e789c8a23..967cee656 100644 --- a/tests/unit/test_dataset_builders/test_contents_define_vlm_dataset_builder.py +++ b/tests/unit/test_dataset_builders/test_contents_define_vlm_dataset_builder.py @@ -8,6 +8,7 @@ ) from cdisc_rules_engine.services.data_services import LocalDataService from cdisc_rules_engine.models.dataset import PandasDataset, DaskDataset +from cdisc_rules_engine.models.sdtm_dataset_metadata import SDTMDatasetMetadata @pytest.mark.parametrize( @@ -123,9 +124,7 @@ def test_contents_define_vlm_dataset_builder( cache_service=None, rule_processor=None, data_processor=None, - dataset_path=None, - datasets=None, - dataset_metadata=None, + dataset_metadata=SDTMDatasetMetadata(name="TEST"), define_xml_path=None, standard="sdtmig", standard_version="3-4", diff --git a/tests/unit/test_dataset_builders/test_contents_library_dataset_builder.py b/tests/unit/test_dataset_builders/test_contents_library_dataset_builder.py index f9a566f9f..add665612 100644 --- a/tests/unit/test_dataset_builders/test_contents_library_dataset_builder.py +++ b/tests/unit/test_dataset_builders/test_contents_library_dataset_builder.py @@ -10,6 +10,7 @@ from cdisc_rules_engine.models.library_metadata_container import ( LibraryMetadataContainer, ) +from cdisc_rules_engine.models.sdtm_dataset_metadata import SDTMDatasetMetadata from cdisc_rules_engine.services.data_services import LocalDataService @@ -105,9 +106,7 @@ def test_contents_library_variables_dataset_builder( cache_service=None, rule_processor=None, data_processor=None, - dataset_path=None, - datasets=None, - dataset_metadata=None, + dataset_metadata=SDTMDatasetMetadata(name="TEST"), define_xml_path=None, standard="sdtmig", standard_version="3-4", diff --git a/tests/unit/test_dataset_builders/test_dataset_metadata_define_dataset_builder.py b/tests/unit/test_dataset_builders/test_dataset_metadata_define_dataset_builder.py index 8ea9110d0..aff6c25e8 100644 --- a/tests/unit/test_dataset_builders/test_dataset_metadata_define_dataset_builder.py +++ b/tests/unit/test_dataset_builders/test_dataset_metadata_define_dataset_builder.py @@ -124,8 +124,6 @@ def test_dataset_metadata_define_dataset_builder(dataset_path): cache_service=None, rule_processor=None, data_processor=None, - dataset_path=dataset_path, - datasets=[data_metadata], dataset_metadata=SDTMDatasetMetadata(full_path=dataset_path), define_xml_path=None, standard="sdtmig", diff --git a/tests/unit/test_dataset_builders/test_define_variables_with_library_metadata.py b/tests/unit/test_dataset_builders/test_define_variables_with_library_metadata.py index 8f9ca0f16..812e4ca04 100644 --- a/tests/unit/test_dataset_builders/test_define_variables_with_library_metadata.py +++ b/tests/unit/test_dataset_builders/test_define_variables_with_library_metadata.py @@ -184,8 +184,6 @@ def test_define_variables_metadata_with_library_metadata_dataset_builder( cache_service=cache, rule_processor=None, data_processor=None, - dataset_path=test_define_file_path, - datasets=[], dataset_metadata=DummyDataset( { "filename": "ae.xpt", diff --git a/tests/unit/test_dataset_builders/test_domain_presence_define_builder.py b/tests/unit/test_dataset_builders/test_domain_presence_define_builder.py index 3005e31cc..28278d9b8 100644 --- a/tests/unit/test_dataset_builders/test_domain_presence_define_builder.py +++ b/tests/unit/test_dataset_builders/test_domain_presence_define_builder.py @@ -68,10 +68,18 @@ [ ( [ - MagicMock(unsplit_name="AE", filename="ae.xpt"), - MagicMock(unsplit_name="DM", filename="dm.xpt"), - MagicMock(unsplit_name="SE", filename="se.xpt"), - MagicMock(unsplit_name="EC", filename="ec.xpt"), + SDTMDatasetMetadata( + name="AE", first_record={"DOMAIN": "AE"}, filename="ae.xpt" + ), + SDTMDatasetMetadata( + name="DM", first_record={"DOMAIN": "DM"}, filename="dm.xpt" + ), + SDTMDatasetMetadata( + name="SE", first_record={"DOMAIN": "SE"}, filename="se.xpt" + ), + SDTMDatasetMetadata( + name="EC", first_record={"DOMAIN": "EC"}, filename="ec.xpt" + ), ], define_metadata, pd.DataFrame( @@ -153,9 +161,15 @@ ), ( [ - MagicMock(unsplit_name="AE", filename="ae.xpt"), - MagicMock(unsplit_name="DM", filename="dm.xpt"), - MagicMock(unsplit_name="EC", filename="ec.xpt"), + SDTMDatasetMetadata( + name="AE", first_record={"DOMAIN": "AE"}, filename="ae.xpt" + ), + SDTMDatasetMetadata( + name="DM", first_record={"DOMAIN": "DM"}, filename="dm.xpt" + ), + SDTMDatasetMetadata( + name="EC", first_record={"DOMAIN": "EC"}, filename="ec.xpt" + ), ], define_metadata, pd.DataFrame( @@ -322,12 +336,15 @@ def test_domain_list_with_define_dataset_builder( ): builder = DomainListWithDefineDatasetBuilder( rule=None, - data_service=DummyDataService(MagicMock(), MagicMock(), MagicMock(), data=[]), + data_service=DummyDataService( + MagicMock(), + MagicMock(), + MagicMock(), + data=mock_datasets, + ), cache_service=None, rule_processor=None, data_processor=None, - dataset_path="ae.xpt", - datasets=mock_datasets, dataset_metadata=SDTMDatasetMetadata(full_path="ae.xpt"), define_xml_path=None, standard="sdtmig", diff --git a/tests/unit/test_dataset_builders/test_json_schema_check_dataset_builder.py b/tests/unit/test_dataset_builders/test_json_schema_check_dataset_builder.py index 04dad5d4a..dee39886f 100644 --- a/tests/unit/test_dataset_builders/test_json_schema_check_dataset_builder.py +++ b/tests/unit/test_dataset_builders/test_json_schema_check_dataset_builder.py @@ -25,7 +25,6 @@ def _make_builder(schema, instance): cache_service=MagicMock(), rule_processor=MagicMock(), data_processor=MagicMock(), - dataset_path="dummy.xpt", datasets=[], dataset_metadata=MagicMock(), define_xml_path=None, @@ -87,8 +86,6 @@ def test_json_schema_check_dataset_builder_valid(): cache_service=cache_service, rule_processor=MagicMock(), data_processor=MagicMock(), - dataset_path="dummy.xpt", - datasets=[], dataset_metadata=MagicMock(name="test_dataset"), define_xml_path=None, standard="USDM", @@ -144,8 +141,6 @@ def test_json_schema_check_dataset_builder_invalid(): cache_service=cache_service, rule_processor=MagicMock(), data_processor=MagicMock(), - dataset_path="dummy.xpt", - datasets=[], dataset_metadata=dataset_metadata, define_xml_path=None, standard="USDM", diff --git a/tests/unit/test_dataset_builders/test_values_dataset_metadata_builder.py b/tests/unit/test_dataset_builders/test_values_dataset_metadata_builder.py index c48c71b37..563563f63 100644 --- a/tests/unit/test_dataset_builders/test_values_dataset_metadata_builder.py +++ b/tests/unit/test_dataset_builders/test_values_dataset_metadata_builder.py @@ -60,8 +60,6 @@ def test_build_with_dataset_metadata(mock_build): cache_service=InMemoryCacheService(), rule_processor=rule_processor_mock, data_processor=None, - dataset_path="ae.xpt", - datasets=[], dataset_metadata=SDTMDatasetMetadata( name="AE", first_record={"DOMAIN": "AE"} ), @@ -111,6 +109,15 @@ def test_build_split_datasets(mock_build): data_service = LocalDataService(MagicMock(), MagicMock(), MagicMock()) original_get_metadata = data_service.get_dataset_metadata + data_service._datasets_metadata = { + "DM": SDTMDatasetMetadata( + name="DM", + label="Demographics", + full_path="/path/to/dm.xpt", + filename="dm.xpt", + ) + } + metadata_df = pd.DataFrame( [ { @@ -134,15 +141,13 @@ def test_build_split_datasets(mock_build): cache_service=InMemoryCacheService(), rule_processor=rule_processor_mock, data_processor=None, - dataset_path="", - datasets=[], dataset_metadata=None, define_xml_path="", standard="", standard_version="", standard_substandard=None, ) - result = builder.build_split_datasets("dm.xpt") + result = builder.build_split_datasets("DM") assert data_service.get_dataset_metadata.called expected_columns = { diff --git a/tests/unit/test_dataset_builders/test_values_variables_metadata_builder.py b/tests/unit/test_dataset_builders/test_values_variables_metadata_builder.py index 4230041dd..266d82bbc 100644 --- a/tests/unit/test_dataset_builders/test_values_variables_metadata_builder.py +++ b/tests/unit/test_dataset_builders/test_values_variables_metadata_builder.py @@ -59,8 +59,6 @@ def test_build_with_variable_metadata(mock_build): cache_service=InMemoryCacheService(), rule_processor=None, data_processor=None, - dataset_path="ae.xpt", - datasets=[], dataset_metadata=SDTMDatasetMetadata( name="AE", first_record={"DOMAIN": "AE"} ), @@ -133,9 +131,14 @@ def test_concat_with_split_datasets(): ) data_service = LocalDataService(MagicMock(), MagicMock(), MagicMock()) + # Set up metadata in the data service + data_service._datasets_metadata = { + "AE1": ae1_metadata, + "AE2": ae2_metadata, + } data_service.get_dataset = MagicMock( side_effect=lambda dataset_name, **kwargs: PandasDataset( - ae1_data if dataset_name == "ae1.xpt" else ae2_data + ae1_data if dataset_name == "AE1" else ae2_data ) ) metadata_df = pd.DataFrame.from_dict( @@ -157,8 +160,6 @@ def test_concat_with_split_datasets(): cache_service=InMemoryCacheService(), rule_processor=None, data_processor=None, - dataset_path="ae.xpt", - datasets=[], dataset_metadata=SDTMDatasetMetadata(name="AE", first_record={"DOMAIN": "AE"}), define_xml_path="", standard="", @@ -332,25 +333,25 @@ def test_concat_with_split_datasets(): 4, 7, ], - "source_filename": [ - "ae1.xpt", - "ae1.xpt", - "ae1.xpt", - "ae1.xpt", - "ae1.xpt", - "ae1.xpt", - "ae1.xpt", - "ae1.xpt", - "ae1.xpt", - "ae2.xpt", - "ae2.xpt", - "ae2.xpt", - "ae2.xpt", - "ae2.xpt", - "ae2.xpt", - "ae2.xpt", - "ae2.xpt", - "ae2.xpt", + "source_dataset_name": [ + "AE1", + "AE1", + "AE1", + "AE1", + "AE1", + "AE1", + "AE1", + "AE1", + "AE1", + "AE2", + "AE2", + "AE2", + "AE2", + "AE2", + "AE2", + "AE2", + "AE2", + "AE2", ], "source_row_number": [ 1, @@ -383,7 +384,7 @@ def test_concat_with_split_datasets(): "row_number", "variable_name", "variable_value", - "source_filename", + "source_dataset_name", "source_row_number", ] for col in key_columns: diff --git a/tests/unit/test_dataset_builders/test_variables_metadata_dataset_builder.py b/tests/unit/test_dataset_builders/test_variables_metadata_dataset_builder.py index cb427d59d..058b12268 100644 --- a/tests/unit/test_dataset_builders/test_variables_metadata_dataset_builder.py +++ b/tests/unit/test_dataset_builders/test_variables_metadata_dataset_builder.py @@ -55,8 +55,6 @@ def test_variables_metadata_without_max_size(mock_get_vars, mock_get_ds): cache_service=InMemoryCacheService(), rule_processor=MagicMock(), data_processor=None, - dataset_path="/test/ae.xpt", - datasets=[], dataset_metadata=MagicMock(), define_xml_path=None, standard="sdtmig", @@ -116,8 +114,6 @@ def test_variables_metadata_with_max_size_in_operations(mock_get_vars, mock_get_ cache_service=InMemoryCacheService(), rule_processor=MagicMock(), data_processor=None, - dataset_path="/test/ae.xpt", - datasets=[], dataset_metadata=MagicMock(), define_xml_path=None, standard="sdtmig", @@ -182,8 +178,6 @@ def test_variables_metadata_with_max_size_in_output_variables( cache_service=InMemoryCacheService(), rule_processor=MagicMock(), data_processor=None, - dataset_path="/test/ae.xpt", - datasets=[], dataset_metadata=MagicMock(), define_xml_path=None, standard="sdtmig", @@ -244,8 +238,6 @@ def test_variables_metadata_with_max_size_in_conditions_dict( cache_service=InMemoryCacheService(), rule_processor=MagicMock(), data_processor=None, - dataset_path="/test/ae.xpt", - datasets=[], dataset_metadata=MagicMock(), define_xml_path=None, standard="sdtmig", @@ -303,8 +295,6 @@ def test_variables_metadata_handles_nulls(mock_get_vars, mock_get_ds): cache_service=InMemoryCacheService(), rule_processor=MagicMock(), data_processor=None, - dataset_path="/test/ae.xpt", - datasets=[], dataset_metadata=MagicMock(), define_xml_path=None, standard="sdtmig", @@ -365,8 +355,6 @@ def test_variables_metadata_handles_missing_columns(mock_get_vars, mock_get_ds): cache_service=InMemoryCacheService(), rule_processor=MagicMock(), data_processor=None, - dataset_path="/test/ae.xpt", - datasets=[], dataset_metadata=MagicMock(), define_xml_path=None, standard="sdtmig", diff --git a/tests/unit/test_dataset_builders/test_variables_metadata_with_define_and_library_dataset_builder.py b/tests/unit/test_dataset_builders/test_variables_metadata_with_define_and_library_dataset_builder.py index 674e01176..341ddb0f0 100644 --- a/tests/unit/test_dataset_builders/test_variables_metadata_with_define_and_library_dataset_builder.py +++ b/tests/unit/test_dataset_builders/test_variables_metadata_with_define_and_library_dataset_builder.py @@ -140,9 +140,11 @@ def test_build_combined_metadata( cache_service=InMemoryCacheService(), rule_processor=None, data_processor=None, - dataset_path=str(test_define_file_path), - datasets=[], - dataset_metadata=SDTMDatasetMetadata(name="AE", first_record={"DOMAIN": "AE"}), + dataset_metadata=SDTMDatasetMetadata( + name="AE", + first_record={"DOMAIN": "AE"}, + full_path=str(test_define_file_path), + ), define_xml_path=str(test_define_file_path), standard="sdtmig", standard_version="3-4", diff --git a/tests/unit/test_dataset_builders/test_variables_metadata_with_library_metadata_dataset_builder.py b/tests/unit/test_dataset_builders/test_variables_metadata_with_library_metadata_dataset_builder.py index ee6c8d947..084f76f4c 100644 --- a/tests/unit/test_dataset_builders/test_variables_metadata_with_library_metadata_dataset_builder.py +++ b/tests/unit/test_dataset_builders/test_variables_metadata_with_library_metadata_dataset_builder.py @@ -130,8 +130,6 @@ def test_variable_metadata_with_library_metadata_dataset_builder( cache_service=cache, rule_processor=None, data_processor=None, - dataset_path=None, - datasets=[], dataset_metadata=DummyDataset( { "filename": "ae.xpt", @@ -354,8 +352,6 @@ def test_variable_metadata_with_library_metadata_dataset_builder_variable_only_i cache_service=cache, rule_processor=None, data_processor=None, - dataset_path=None, - datasets=[], dataset_metadata=DummyDataset( { "filename": "ae.xpt", diff --git a/tests/unit/test_dataset_preprocessor.py b/tests/unit/test_dataset_preprocessor.py index de6a1ce57..c58789760 100644 --- a/tests/unit/test_dataset_preprocessor.py +++ b/tests/unit/test_dataset_preprocessor.py @@ -6,7 +6,9 @@ from cdisc_rules_engine.services.cache.in_memory_cache_service import ( InMemoryCacheService, ) -from cdisc_rules_engine.services.data_services import LocalDataService +from cdisc_rules_engine.services.data_services.dummy_data_service import ( + DummyDataService, +) from cdisc_rules_engine.utilities.data_processor import DataProcessor from cdisc_rules_engine.utilities.dataset_preprocessor import DatasetPreprocessor from cdisc_rules_engine.constants.rule_constants import ALL_KEYWORD @@ -38,8 +40,9 @@ def test_preprocess_no_datasets_in_rule(dataset_rule_equal_to_error_objects: dic } ) ) - datasets = [SDTMDatasetMetadata(name="AE")] - data_service = LocalDataService(MagicMock(), MagicMock(), MagicMock()) + data_service = DummyDataService( + MagicMock(), MagicMock(), MagicMock(), data=[SDTMDatasetMetadata(name="AE")] + ) preprocessor = DatasetPreprocessor( dataset, SDTMDatasetMetadata(name="AE", full_path="path"), @@ -47,12 +50,12 @@ def test_preprocess_no_datasets_in_rule(dataset_rule_equal_to_error_objects: dic InMemoryCacheService(), ) preprocessed_dataset: PandasDataset = preprocessor.preprocess( - dataset_rule_equal_to_error_objects, datasets + dataset_rule_equal_to_error_objects ) assert preprocessed_dataset.data.equals(dataset.data) -@patch("cdisc_rules_engine.services.data_services.LocalDataService.get_dataset") +@patch("cdisc_rules_engine.services.data_services.DummyDataService.get_dataset") def test_rdomain_supplemental_dataset_idvar_matching(mock_get_dataset: MagicMock): supp_data = { "USUBJID": ["CDISC001", "CDISC002"], @@ -93,7 +96,17 @@ def test_rdomain_supplemental_dataset_idvar_matching(mock_get_dataset: MagicMock } ), } - data_service = LocalDataService(MagicMock(), MagicMock(), MagicMock()) + data_service = DummyDataService( + MagicMock(), + MagicMock(), + MagicMock(), + data=[ + SDTMDatasetMetadata( + first_record={"DOMAIN": "LB"}, + filename="lb.xpt", + ) + ], + ) preprocessor = DatasetPreprocessor( supp_dataset, SDTMDatasetMetadata( @@ -104,13 +117,7 @@ def test_rdomain_supplemental_dataset_idvar_matching(mock_get_dataset: MagicMock data_service, InMemoryCacheService(), ) - datasets = [ - SDTMDatasetMetadata( - first_record={"DOMAIN": "LB"}, - filename="lb.xpt", - ) - ] - result = preprocessor.preprocess(rule, datasets) + result = preprocessor.preprocess(rule) assert len(result.data) == 2 assert "USUBJID" in result.data.columns assert "QVAL" in result.data.columns @@ -121,7 +128,7 @@ def test_rdomain_supplemental_dataset_idvar_matching(mock_get_dataset: MagicMock assert "Cholesterol" in matched_records["LBTEST"].values -@patch("cdisc_rules_engine.services.data_services.LocalDataService.get_dataset") +@patch("cdisc_rules_engine.services.data_services.DummyDataService.get_dataset") def test_rdomain_integer_idvar_matching(mock_get_dataset: MagicMock): supp_data = { "USUBJID": ["CDISC001", "CDISC002"], @@ -161,7 +168,17 @@ def test_rdomain_integer_idvar_matching(mock_get_dataset: MagicMock): } ), } - data_service = LocalDataService(MagicMock(), MagicMock(), MagicMock()) + data_service = DummyDataService( + MagicMock(), + MagicMock(), + MagicMock(), + data=[ + SDTMDatasetMetadata( + first_record={"DOMAIN": "VS"}, + filename="vs.xpt", + ) + ], + ) preprocessor = DatasetPreprocessor( supp_dataset, SDTMDatasetMetadata( @@ -172,13 +189,7 @@ def test_rdomain_integer_idvar_matching(mock_get_dataset: MagicMock): data_service, InMemoryCacheService(), ) - datasets = [ - SDTMDatasetMetadata( - first_record={"DOMAIN": "VS"}, - filename="vs.xpt", - ) - ] - result = preprocessor.preprocess(rule, datasets) + result = preprocessor.preprocess(rule) assert len(result.data) == 2 assert "QVAL" in result.data.columns assert "VSTEST" in result.data.columns @@ -188,7 +199,7 @@ def test_rdomain_integer_idvar_matching(mock_get_dataset: MagicMock): assert "Height" in matched_records["VSTEST"].values -@patch("cdisc_rules_engine.services.data_services.LocalDataService.get_dataset") +@patch("cdisc_rules_engine.services.data_services.DummyDataService.get_dataset") def test_rdomain_no_matches_found(mock_get_dataset: MagicMock): supp_data = { "USUBJID": ["CDISC001"], @@ -227,7 +238,17 @@ def test_rdomain_no_matches_found(mock_get_dataset: MagicMock): } ), } - data_service = LocalDataService(MagicMock(), MagicMock(), MagicMock()) + data_service = DummyDataService( + MagicMock(), + MagicMock(), + MagicMock(), + data=[ + SDTMDatasetMetadata( + first_record={"DOMAIN": "AE"}, + filename="ae.xpt", + ) + ], + ) preprocessor = DatasetPreprocessor( supp_dataset, SDTMDatasetMetadata( @@ -238,13 +259,7 @@ def test_rdomain_no_matches_found(mock_get_dataset: MagicMock): data_service, InMemoryCacheService(), ) - datasets = [ - SDTMDatasetMetadata( - first_record={"DOMAIN": "AE"}, - filename="ae.xpt", - ) - ] - result = preprocessor.preprocess(rule, datasets) + result = preprocessor.preprocess(rule) assert len(result.data) == 1 assert result.data.iloc[0]["USUBJID"] == "CDISC001" assert result.data.iloc[0]["QVAL"] == "AE999" @@ -252,7 +267,7 @@ def test_rdomain_no_matches_found(mock_get_dataset: MagicMock): assert pd.isna(result.data.iloc[0]["AETERM"]) -@patch("cdisc_rules_engine.services.data_services.LocalDataService.get_dataset") +@patch("cdisc_rules_engine.services.data_services.DummyDataService.get_dataset") def test_rdomain_combined_standard_and_idvar_matching(mock_get_dataset: MagicMock): supp_data = { "USUBJID": ["CDISC001", "CDISC002", "CDISC003"], @@ -294,7 +309,17 @@ def test_rdomain_combined_standard_and_idvar_matching(mock_get_dataset: MagicMoc } ), } - data_service = LocalDataService(MagicMock(), MagicMock(), MagicMock()) + data_service = DummyDataService( + MagicMock(), + MagicMock(), + MagicMock(), + data=[ + SDTMDatasetMetadata( + first_record={"DOMAIN": "LB"}, + filename="lb.xpt", + ) + ], + ) preprocessor = DatasetPreprocessor( supp_dataset, SDTMDatasetMetadata( @@ -305,13 +330,7 @@ def test_rdomain_combined_standard_and_idvar_matching(mock_get_dataset: MagicMoc data_service, InMemoryCacheService(), ) - datasets = [ - SDTMDatasetMetadata( - first_record={"DOMAIN": "LB"}, - filename="lb.xpt", - ) - ] - result = preprocessor.preprocess(rule, datasets) + result = preprocessor.preprocess(rule) assert len(result.data) == 3 assert "LBTEST" in result.data.columns matched_records = result.data[result.data["LBTEST"].notna()] @@ -507,7 +526,7 @@ def test_rdomain_combined_standard_and_idvar_matching(mock_get_dataset: MagicMoc ), ], ) -@patch("cdisc_rules_engine.services.data_services.LocalDataService.get_dataset") +@patch("cdisc_rules_engine.services.data_services.DummyDataService.get_dataset") def test_preprocess( mock_get_dataset: MagicMock, dataset_rule_equal_to: dict, @@ -614,8 +633,8 @@ def test_preprocess( # mock blob storage call path_to_dataset_map: dict = { - os.path.join("path", "ae.xpt"): ae_dataset, - os.path.join("path", "ts.xpt"): ts_dataset, + "AE": ae_dataset, + "TS": ts_dataset, } mock_get_dataset.side_effect = lambda dataset_name: path_to_dataset_map[ dataset_name @@ -630,22 +649,30 @@ def test_preprocess( for ds in dataset_rule_equal_to["datasets"]: ds["join_type"] = join_type - data_service = LocalDataService(MagicMock(), MagicMock(), MagicMock()) + data_service = DummyDataService( + MagicMock(), + MagicMock(), + MagicMock(), + data=[ + SDTMDatasetMetadata( + name="AE", first_record={"DOMAIN": "AE"}, filename="ae.xpt" + ), + SDTMDatasetMetadata( + name="TS", first_record={"DOMAIN": "TS"}, filename="ts.xpt" + ), + ], + ) preprocessor = DatasetPreprocessor( ec_dataset, SDTMDatasetMetadata( - first_record={"DOMAIN": "EC"}, full_path=os.path.join("path", "ec.xpt") + name="EC", + first_record={"DOMAIN": "EC"}, + full_path=os.path.join("path", "ec.xpt"), ), data_service, InMemoryCacheService(), ) - preprocessed_dataset: pd.DataFrame = preprocessor.preprocess( - dataset_rule_equal_to, - [ - SDTMDatasetMetadata(first_record={"DOMAIN": "AE"}, filename="ae.xpt"), - SDTMDatasetMetadata(first_record={"DOMAIN": "TS"}, filename="ts.xpt"), - ], - ) + preprocessed_dataset: pd.DataFrame = preprocessor.preprocess(dataset_rule_equal_to) assert preprocessed_dataset.data.equals(expected_dataset.data) @@ -744,7 +771,7 @@ def test_preprocess( ), ], ) -@patch("cdisc_rules_engine.services.data_services.LocalDataService.get_dataset") +@patch("cdisc_rules_engine.services.data_services.DummyDataService.get_dataset") def test_preprocess_relrec_dataset( mock_get_dataset: MagicMock, relrec: dict, expected: dict ): @@ -854,8 +881,8 @@ def test_preprocess_relrec_dataset( # mock blob storage call path_to_dataset_map: dict = { - os.path.join("path", "ae.xpt"): ae_dataset, - os.path.join("path", "relrec.xpt"): relrec_dataset, + "AE": ae_dataset, + "RELREC": relrec_dataset, } mock_get_dataset.side_effect = lambda dataset_name: path_to_dataset_map[ dataset_name @@ -868,33 +895,34 @@ def test_preprocess_relrec_dataset( return_value=["--SEQ", "--STDY"] ) # execute operation - data_service = LocalDataService.get_instance( + data_service = DummyDataService.get_instance( cache_service=cache, config=ConfigService(), + data=[ + SDTMDatasetMetadata( + name="AE", first_record={"DOMAIN": "AE"}, filename="ae.xpt" + ), + SDTMDatasetMetadata(name="RELREC", filename="relrec.xpt"), + ], ) data_service.library_metadata = LibraryMetadataContainer() preprocessor = DatasetPreprocessor( ec_dataset, SDTMDatasetMetadata( + name="EC", first_record={"DOMAIN": "EC"}, full_path=os.path.join("path", "ec.xpt"), ), data_service, InMemoryCacheService(), ) - preprocessed_dataset: pd.DataFrame = preprocessor.preprocess( - relrec_rule, - [ - SDTMDatasetMetadata(first_record={"DOMAIN": "AE"}, filename="ae.xpt"), - SDTMDatasetMetadata(name="RELREC", filename="relrec.xpt"), - ], - ) + preprocessed_dataset: pd.DataFrame = preprocessor.preprocess(relrec_rule) expected_dataset = PandasDataset(pd.DataFrame.from_dict(expected)) assert preprocessed_dataset.data.equals(expected_dataset.data) -@patch("cdisc_rules_engine.services.data_services.LocalDataService.get_dataset") +@patch("cdisc_rules_engine.services.data_services.DummyDataService.get_dataset") def test_preprocess_with_merge_comparison( mock_get_dataset: MagicMock, dataset_rule_equal_to_compare_same_value: dict, @@ -938,17 +966,30 @@ def test_preprocess_with_merge_comparison( ) path_to_dataset_map: dict = { - os.path.join("study_id", "data_bundle_id", "ae.xpt"): match_dataset, - os.path.join("study_id", "data_bundle_id", "ec.xpt"): target_dataset, + "AE": match_dataset, + "EC": target_dataset, } mock_get_dataset.side_effect = lambda dataset_name: path_to_dataset_map[ dataset_name ] - data_service = LocalDataService(MagicMock(), MagicMock(), MagicMock()) + data_service = DummyDataService( + MagicMock(), + MagicMock(), + MagicMock(), + data=[ + SDTMDatasetMetadata( + name="AE", first_record={"DOMAIN": "AE"}, filename="ae.xpt" + ), + SDTMDatasetMetadata( + name="EC", first_record={"DOMAIN": "EC"}, filename="ec.xpt" + ), + ], + ) preprocessor = DatasetPreprocessor( target_dataset, SDTMDatasetMetadata( + name="EC", first_record={"DOMAIN": "EC"}, full_path=os.path.join("study_id", "data_bundle_id", "ec.xpt"), ), @@ -956,11 +997,7 @@ def test_preprocess_with_merge_comparison( InMemoryCacheService(), ) result: pd.DataFrame = preprocessor.preprocess( - rule=dataset_rule_equal_to_compare_same_value, - datasets=[ - SDTMDatasetMetadata(first_record={"DOMAIN": "AE"}, filename="ae.xpt"), - SDTMDatasetMetadata(first_record={"DOMAIN": "EC"}, filename="ec.xpt"), - ], + rule=dataset_rule_equal_to_compare_same_value ) assert "NOTVISIT" in result assert result["NOTVISIT"].iloc[0] == 12 @@ -968,7 +1005,7 @@ def test_preprocess_with_merge_comparison( assert result["AE.VISIT"].iloc[0] == 24 -@patch("cdisc_rules_engine.services.data_services.LocalDataService.get_dataset") +@patch("cdisc_rules_engine.services.data_services.DummyDataService.get_dataset") def test_preprocess_supp_with_blank_idvar_idvarval(mock_get_dataset): """ Test preprocessing when SUPP dataset has blank IDVAR and IDVARVAL. @@ -992,10 +1029,19 @@ def test_preprocess_supp_with_blank_idvar_idvarval(mock_get_dataset): supp_dataset = PandasDataset(pd.DataFrame(supp_data)) mock_get_dataset.return_value = supp_dataset - data_service = LocalDataService(MagicMock(), MagicMock(), MagicMock()) + data_service = DummyDataService( + MagicMock(), + MagicMock(), + MagicMock(), + data=[ + SDTMDatasetMetadata( + name="SUPPAE", first_record={"RDOMAIN": "AE"}, filename="suppae.xpt" + ) + ], + ) preprocessor = DatasetPreprocessor( main_dataset, - SDTMDatasetMetadata(first_record={"DOMAIN": "AE"}, full_path="path"), + SDTMDatasetMetadata(name="AE", first_record={"DOMAIN": "AE"}, full_path="path"), data_service, InMemoryCacheService(), ) @@ -1019,12 +1065,9 @@ def test_preprocess_supp_with_blank_idvar_idvarval(mock_get_dataset): } ), } - datasets = [ - SDTMDatasetMetadata( - name="SUPPAE", first_record={"RDOMAIN": "AE"}, filename="suppae.xpt" - ) - ] - result = preprocessor.preprocess(rule, datasets) + result = preprocessor.preprocess( + rule, + ) assert len(result.data) == 2 assert "AESPID" in result.data.columns assert "QNAM" not in result.data.columns @@ -1039,7 +1082,7 @@ def test_preprocess_supp_with_blank_idvar_idvarval(mock_get_dataset): ) -@patch("cdisc_rules_engine.services.data_services.LocalDataService.get_dataset") +@patch("cdisc_rules_engine.services.data_services.DummyDataService.get_dataset") def test_preprocess_supp_wildcard_matches_all_supp_datasets( mock_get_dataset: MagicMock, ): @@ -1095,24 +1138,30 @@ def test_preprocess_supp_wildcard_matches_all_supp_datasets( } ), } - datasets = [ - SDTMDatasetMetadata( - name="SUPPAE", - first_record={"RDOMAIN": "AE"}, - filename="suppae.xpt", - ), - ] - data_service = LocalDataService(MagicMock(), MagicMock(), MagicMock()) + data_service = DummyDataService( + MagicMock(), + MagicMock(), + MagicMock(), + data=[ + SDTMDatasetMetadata( + name="SUPPAE", + first_record={"RDOMAIN": "AE"}, + filename="suppae.xpt", + ), + ], + ) preprocessor = DatasetPreprocessor( ae_dataset, SDTMDatasetMetadata( - first_record={"DOMAIN": "AE"}, full_path=os.path.join("path", "ae.xpt") + name="AE", + first_record={"DOMAIN": "AE"}, + full_path=os.path.join("path", "ae.xpt"), ), data_service, InMemoryCacheService(), ) - result = preprocessor.preprocess(rule_with_supp_wildcard, datasets) + result = preprocessor.preprocess(rule_with_supp_wildcard) assert len(result.data) == 2 assert "RDOMAIN" in result.data.columns assert "AESPID" in result.data.columns @@ -1121,7 +1170,7 @@ def test_preprocess_supp_wildcard_matches_all_supp_datasets( assert result.data.loc[1, "AESEV"] == "MILD" -@patch("cdisc_rules_engine.services.data_services.LocalDataService.get_dataset") +@patch("cdisc_rules_engine.services.data_services.DummyDataService.get_dataset") def test_preprocess_specific_suppae_dataset( mock_get_dataset: MagicMock, ): @@ -1172,23 +1221,27 @@ def test_preprocess_specific_suppae_dataset( } ), } - datasets = [ - SDTMDatasetMetadata( - name="SUPPAE", - first_record={"RDOMAIN": "AE"}, - filename="suppae.xpt", - ), - ] - data_service = LocalDataService(MagicMock(), MagicMock(), MagicMock()) + data_service = DummyDataService( + MagicMock(), + MagicMock(), + MagicMock(), + data=[ + SDTMDatasetMetadata( + name="SUPPAE", + first_record={"RDOMAIN": "AE"}, + filename="suppae.xpt", + ), + ], + ) preprocessor = DatasetPreprocessor( ae_dataset, - SDTMDatasetMetadata(first_record={"DOMAIN": "AE"}, full_path="path"), + SDTMDatasetMetadata(name="AE", first_record={"DOMAIN": "AE"}, full_path="path"), data_service, InMemoryCacheService(), ) - result = preprocessor.preprocess(rule_with_specific_supp, datasets) + result = preprocessor.preprocess(rule_with_specific_supp) assert len(result.data) == 1 assert "AESPID" in result.data.columns @@ -1249,7 +1302,7 @@ def test_data_processor_groups_qnam_suppdm_qvals(suppdm_with_race): assert suppdm_df.loc[0, ["RACE1", "RACE2", "RACE3"]].notna().all() -@patch("cdisc_rules_engine.services.data_services.LocalDataService.get_dataset") +@patch("cdisc_rules_engine.services.data_services.DummyDataService.get_dataset") def test_dm_merged_with_suppdm_without_dupes( mock_get_dataset, suppdm_with_race, rule_with_specific_supp ): @@ -1277,14 +1330,16 @@ def test_dm_merged_with_suppdm_without_dupes( dm_ds = PandasDataset(pd.DataFrame(dm)) assert suppdm_with_race.data.shape[0] == 3 - data_service = LocalDataService(MagicMock(), MagicMock(), MagicMock()) + data_service = DummyDataService( + MagicMock(), MagicMock(), MagicMock(), data=[supp_dm_meta] + ) preprocessor = DatasetPreprocessor( dm_ds, dm_meta, data_service, InMemoryCacheService(), ) - result = preprocessor.preprocess(rule_with_specific_supp, [supp_dm_meta]) + result = preprocessor.preprocess(rule_with_specific_supp) assert result.data.shape[0] == 1 assert {"RACE1", "RACE2", "RACE3"}.issubset(set(result.columns)) assert result.data.loc[0, ["RACE1", "RACE2", "RACE3"]].notna().all() @@ -1326,12 +1381,15 @@ def test_relrec_processed_correctly_with_others(rule_with_specific_supp): preprocessor = DatasetPreprocessor( relrec_ds, relrec_meta, - LocalDataService(MagicMock(), MagicMock(), MagicMock()), + DummyDataService( + MagicMock(), + MagicMock(), + MagicMock(), + data=[ec_meta, supp_ec_meta, relrec_meta], + ), InMemoryCacheService(), ) - result = preprocessor.preprocess( - rule_with_specific_supp, [ec_meta, supp_ec_meta, relrec_meta] - ) + result = preprocessor.preprocess(rule_with_specific_supp) # relrec preprocessing did not change data assert result.data.shape[0] == 2 diff --git a/tests/unit/test_dummy_dataset.py b/tests/unit/test_dummy_dataset.py index 39744a903..a30320eaf 100644 --- a/tests/unit/test_dummy_dataset.py +++ b/tests/unit/test_dummy_dataset.py @@ -11,19 +11,5 @@ def test_valid_dataset_data(): ] dataset = DummyDataset(dataset_data[0]) assert dataset.domain == "AE" - - -def test_get_dataset_metadata(): - dataset_data = [ - { - "domain": "AE", - "filename": "ae.xpt", - "label": "Adverse Events", - "records": {"AESEQ": [1, 2, 3, 4]}, - } - ] - dataset = DummyDataset(dataset_data[0]) - metadata = dataset.get_metadata() - assert "dataset_name" in metadata - assert metadata["dataset_name"] == ["AE"] - assert metadata["dataset_label"] == ["Adverse Events"] + assert dataset.name == "AE" + assert dataset.label == "Adverse Events" diff --git a/tests/unit/test_operations/test_dataset_names.py b/tests/unit/test_operations/test_dataset_names.py index 02ee8352a..a2e3edc9a 100644 --- a/tests/unit/test_operations/test_dataset_names.py +++ b/tests/unit/test_operations/test_dataset_names.py @@ -1,3 +1,5 @@ +from unittest.mock import MagicMock + from cdisc_rules_engine.config.config import ConfigService from cdisc_rules_engine.models.dataset.dask_dataset import DaskDataset from cdisc_rules_engine.models.dataset.pandas_dataset import PandasDataset @@ -5,9 +7,6 @@ from cdisc_rules_engine.models.operation_params import OperationParams from cdisc_rules_engine.models.sdtm_dataset_metadata import SDTMDatasetMetadata from cdisc_rules_engine.services.cache.cache_service_factory import CacheServiceFactory -from cdisc_rules_engine.services.data_services.data_service_factory import ( - DataServiceFactory, -) import pytest @@ -17,7 +16,6 @@ def test_get_study_domains_with_duplicates( ): config = ConfigService() cache = CacheServiceFactory(config).get_cache_service() - data_service = DataServiceFactory(config, cache).get_data_service() datasets = [ SDTMDatasetMetadata(**dataset) for dataset in [ @@ -27,9 +25,13 @@ def test_get_study_domains_with_duplicates( {"name": "TV", "filename": "tv.xpt", "first_record": {"DOMAIN": "TV"}}, ] ] - operation_params.datasets = datasets + mock_data_service = MagicMock() + mock_data_service.get_datasets.return_value = datasets result = DatasetNames( - operation_params, dataset_type.from_dict({"A": [1, 2, 3]}), cache, data_service + operation_params, + dataset_type.from_dict({"A": [1, 2, 3]}), + cache, + mock_data_service, ).execute() assert operation_params.operation_id in result for val in result[operation_params.operation_id]: @@ -42,7 +44,6 @@ def test_get_study_domains_with_missing_domains( ): config = ConfigService() cache = CacheServiceFactory(config).get_cache_service() - data_service = DataServiceFactory(config, cache).get_data_service() datasets = [ SDTMDatasetMetadata(**dataset) for dataset in [ @@ -52,9 +53,13 @@ def test_get_study_domains_with_missing_domains( {"name": "TV", "filename": "tv.xpt", "first_record": {"DOMAIN": "TV"}}, ] ] - operation_params.datasets = datasets + mock_data_service = MagicMock() + mock_data_service.get_datasets.return_value = datasets result = DatasetNames( - operation_params, dataset_type.from_dict({"A": [1, 2, 3]}), cache, data_service + operation_params, + dataset_type.from_dict({"A": [1, 2, 3]}), + cache, + mock_data_service, ).execute() assert operation_params.operation_id in result for val in result[operation_params.operation_id]: diff --git a/tests/unit/test_operations/test_day_data_validator.py b/tests/unit/test_operations/test_day_data_validator.py index e1bc801be..e0b88cd8e 100644 --- a/tests/unit/test_operations/test_day_data_validator.py +++ b/tests/unit/test_operations/test_day_data_validator.py @@ -55,7 +55,7 @@ def test_day_data_calculation( config = ConfigService() cache = CacheServiceFactory(config).get_cache_service() datasets_map = { - "dm.xpt": dataset_type.from_dict( + "DM": dataset_type.from_dict( { "RFSTDTC": [ "1997-07-16T19:20:30", @@ -73,20 +73,17 @@ def test_day_data_calculation( datasets = [ SDTMDatasetMetadata( **{ + "name": "DM", "first_record": {"DOMAIN": "DM"}, "filename": "dm.xpt", "full_path": "/path/to/dm.xpt", } ) ] - mock_data_service.get_dataset.side_effect = ( - lambda *args, **kwargs: datasets_map.get( - args.split("/")[-1] - if args - else kwargs.get("dataset_name", "").split("/")[-1] - ) + mock_data_service.get_dataset.side_effect = lambda **kwargs: datasets_map.get( + kwargs.get("dataset_name") ) - operation_params.datasets = datasets + mock_data_service.get_datasets.return_value = datasets operation_params.dataframe = PandasDataset.from_dict(data) operation_params.target = "values" result = DayDataValidator( diff --git a/tests/unit/test_operations/test_define_variable_metadata.py b/tests/unit/test_operations/test_define_variable_metadata.py index 4ffb11a83..dd3dab36a 100644 --- a/tests/unit/test_operations/test_define_variable_metadata.py +++ b/tests/unit/test_operations/test_define_variable_metadata.py @@ -1,6 +1,7 @@ from cdisc_rules_engine.config.config import ConfigService from pathlib import Path from cdisc_rules_engine.models.dataset.dask_dataset import DaskDataset +from cdisc_rules_engine.models.sdtm_dataset_metadata import SDTMDatasetMetadata from cdisc_rules_engine.operations.define_variable_metadata import ( DefineVariableMetadata, ) @@ -23,7 +24,9 @@ def test_get_define_variable_metadata_variable_in_domain( cache = CacheServiceFactory(config).get_cache_service() data_service = DataServiceFactory(config, cache).get_data_service() resources_path: Path = Path(__file__).parent.parent.parent.joinpath("resources") - operation_params.directory_path = str(resources_path) + operation_params.evaluation_dataset_metadata = SDTMDatasetMetadata( + full_path=Path(resources_path, "ae.xpt") + ) operation_params.domain = "AE" operation_params.target = "--SER" operation_params.attribute_name = "define_variable_ccode" @@ -44,7 +47,9 @@ def test_get_define_variable_metadata_variable_not_in_domain( cache = CacheServiceFactory(config).get_cache_service() data_service = DataServiceFactory(config, cache).get_data_service() resources_path: Path = Path(__file__).parent.parent.parent.joinpath("resources") - operation_params.directory_path = str(resources_path) + operation_params.evaluation_dataset_metadata = SDTMDatasetMetadata( + full_path=Path(resources_path, "ae.xpt") + ) operation_params.domain = "AE" operation_params.target = "VERYFAKEVARIABLE" operation_params.attribute_name = "define_variable_ccode" diff --git a/tests/unit/test_operations/test_expected_variables.py b/tests/unit/test_operations/test_expected_variables.py index 7d547a980..a3bdd53b1 100644 --- a/tests/unit/test_operations/test_expected_variables.py +++ b/tests/unit/test_operations/test_expected_variables.py @@ -6,7 +6,7 @@ ) import pandas as pd import pytest -from unittest.mock import Mock, patch +from unittest.mock import Mock from cdisc_rules_engine.constants.classes import GENERAL_OBSERVATIONS_CLASS from cdisc_rules_engine.enums.variable_roles import VariableRoles @@ -175,6 +175,9 @@ def test_get_expected_variables(operation_params: OperationParams, dataset_type) operation_params.domain = "AE" operation_params.standard = "sdtmig" operation_params.standard_version = "3-4" + operation_params.dataframe_metadata = SDTMDatasetMetadata( + first_record={"DOMAIN": "AE"} + ) # save model metadata to cache cache = InMemoryCacheService.get_instance() @@ -189,21 +192,14 @@ def test_get_expected_variables(operation_params: OperationParams, dataset_type) ) data_service.get_dataset_class = Mock(return_value=mock_dataset_class) - def mock_cached_method(*args, **kwargs): - return SDTMDatasetMetadata(first_record={"DOMAIN": "AE"}) - - with patch( - "cdisc_rules_engine.services.data_services.LocalDataService.get_raw_dataset_metadata", - side_effect=mock_cached_method, - ): - operation = ExpectedVariables( - operation_params, - operation_params.dataframe, - cache, - data_service, - library_metadata, - ) - result = operation.execute() + operation = ExpectedVariables( + operation_params, + operation_params.dataframe, + cache, + data_service, + library_metadata, + ) + result = operation.execute() variables = ["STUDYID", "DOMAIN", "AENEW", "TIMING_VAR"] expected = pd.Series( diff --git a/tests/unit/test_operations/test_extract_metadata.py b/tests/unit/test_operations/test_extract_metadata.py index c8efd7d52..4bc5afe7c 100644 --- a/tests/unit/test_operations/test_extract_metadata.py +++ b/tests/unit/test_operations/test_extract_metadata.py @@ -32,6 +32,7 @@ def test_extract_metadata_get_dataset_name( ], } ) + operation_params.dataframe_metadata = SDTMDatasetMetadata(name="AE") operation_params.target = "dataset_name" cache = InMemoryCacheService.get_instance() # execute operation @@ -100,6 +101,9 @@ def test_extract_metadata_domain_suffix( operation_params.dataframe = dataset_type.from_dict( {"STUDYID": ["TEST_STUDY"], "DOMAIN": [domain_value]} ) + operation_params.dataframe_metadata = SDTMDatasetMetadata( + name=dataset_name, first_record=first_record + ) operation_params.target = "ap_suffix" cache = InMemoryCacheService.get_instance() operation = ExtractMetadata( @@ -122,6 +126,7 @@ def test_extract_metadata_domain_suffix_uses_domain( operation_params.dataframe = dataset_type.from_dict( {"STUDYID": ["TEST_STUDY"], "DOMAIN": ["APXX"]} ) + operation_params.dataframe_metadata = SDTMDatasetMetadata(name="APFA") operation_params.target = "ap_suffix" cache = InMemoryCacheService.get_instance() operation = ExtractMetadata( @@ -140,6 +145,7 @@ def test_extract_metadata_domain_suffix_empty_metadata( operation_params.dataframe = dataset_type.from_dict( {"STUDYID": ["TEST_STUDY"], "DOMAIN": ["APFA"]} ) + operation_params.dataframe_metadata = SDTMDatasetMetadata(name="APFA") operation_params.target = "ap_suffix" cache = InMemoryCacheService.get_instance() operation = ExtractMetadata( diff --git a/tests/unit/test_operations/test_get_dataset_filtered_variables.py b/tests/unit/test_operations/test_get_dataset_filtered_variables.py index dff836b6c..de018a965 100644 --- a/tests/unit/test_operations/test_get_dataset_filtered_variables.py +++ b/tests/unit/test_operations/test_get_dataset_filtered_variables.py @@ -503,7 +503,7 @@ def test_get_dataset_filtered_variables( operation_params.standard_version = "3-4" operation_params.key_name = key_name operation_params.key_value = key_value - operation_params.datasets = [SDTMDatasetMetadata(**dataset_metadata)] + operation_params.dataframe_metadata = SDTMDatasetMetadata(**dataset_metadata) cache = InMemoryCacheService.get_instance() library_metadata = LibraryMetadataContainer( @@ -525,11 +525,6 @@ def test_get_dataset_filtered_variables( else FINDINGS_ABOUT ) - def mock_get_raw_metadata(*args, **kwargs): - return SDTMDatasetMetadata(**dataset_metadata) - - data_service.get_raw_dataset_metadata = mock_get_raw_metadata - with patch.object( LocalDataService, "get_dataset_class", return_value=expected_class ): @@ -569,7 +564,7 @@ def test_get_dataset_filtered_variables_dask( operation_params.standard_version = "3-4" operation_params.key_name = "role" operation_params.key_value = "Timing" - operation_params.datasets = [SDTMDatasetMetadata(name="AE")] + operation_params.dataframe_metadata = SDTMDatasetMetadata(name="AE") model_metadata = { "datasets": [ @@ -685,11 +680,6 @@ def test_get_dataset_filtered_variables_dask( library_metadata=library_metadata, ) - def mock_get_raw_metadata(*args, **kwargs): - return SDTMDatasetMetadata(name="AE") - - data_service.get_raw_dataset_metadata = mock_get_raw_metadata - with patch.object(LocalDataService, "get_dataset_class", return_value=EVENTS): operation = GetDatasetFilteredVariables( operation_params, @@ -716,7 +706,7 @@ def test_get_dataset_filtered_variables_empty_dataset( operation_params.domain = "AE" operation_params.key_name = "role" operation_params.key_value = "Timing" - operation_params.datasets = [SDTMDatasetMetadata(name="AE")] + operation_params.dataframe_metadata = SDTMDatasetMetadata(name="AE") model_metadata = { "datasets": [ @@ -792,11 +782,6 @@ def test_get_dataset_filtered_variables_empty_dataset( library_metadata=library_metadata, ) - def mock_get_raw_metadata(*args, **kwargs): - return SDTMDatasetMetadata(name="AE") - - data_service.get_raw_dataset_metadata = mock_get_raw_metadata - with patch.object(LocalDataService, "get_dataset_class", return_value=EVENTS): operation = GetDatasetFilteredVariables( operation_params, @@ -822,7 +807,7 @@ def test_get_dataset_filtered_variables_invalid_key(operation_params: OperationP operation_params.domain = "AE" operation_params.key_name = "invalid_key" operation_params.key_value = "SomeValue" - operation_params.datasets = [SDTMDatasetMetadata(name="AE")] + operation_params.dataframe_metadata = SDTMDatasetMetadata(name="AE") model_metadata = { "datasets": [ @@ -904,11 +889,6 @@ def test_get_dataset_filtered_variables_invalid_key(operation_params: OperationP library_metadata=library_metadata, ) - def mock_get_raw_metadata(*args, **kwargs): - return SDTMDatasetMetadata(name="AE") - - data_service.get_raw_dataset_metadata = mock_get_raw_metadata - with patch.object(LocalDataService, "get_dataset_class", return_value=EVENTS): operation = GetDatasetFilteredVariables( operation_params, diff --git a/tests/unit/test_operations/test_get_model_filtered_variables.py b/tests/unit/test_operations/test_get_model_filtered_variables.py index 526605a9a..8b9965795 100644 --- a/tests/unit/test_operations/test_get_model_filtered_variables.py +++ b/tests/unit/test_operations/test_get_model_filtered_variables.py @@ -294,7 +294,7 @@ def test_get_model_filtered_variables( operation_params.standard_version = "3-4" operation_params.key_name = "role" operation_params.key_value = key_val - operation_params.datasets = [SDTMDatasetMetadata(**dataset_metadata)] + operation_params.dataframe_metadata = SDTMDatasetMetadata(**dataset_metadata) # save model metadata to cache cache = InMemoryCacheService.get_instance() library_metadata = LibraryMetadataContainer( @@ -314,15 +314,7 @@ def test_get_model_filtered_variables( if model_metadata["datasets"][0]["_links"]["parentClass"]["title"] == "Events" else FINDINGS_ABOUT ) - """ - this fuction replaces get_raw_dataset_metadata in LocalDataService to - prevent filtering into the decorator that checks cache - """ - def mock_get_raw_metadata(*args, **kwargs): - return SDTMDatasetMetadata(**dataset_metadata) - - data_service.get_raw_dataset_metadata = mock_get_raw_metadata with patch.object( LocalDataService, "get_dataset_class", return_value=expected_class ): diff --git a/tests/unit/test_operations/test_get_xhtml_errors.py b/tests/unit/test_operations/test_get_xhtml_errors.py index 04bf0392e..71e196132 100644 --- a/tests/unit/test_operations/test_get_xhtml_errors.py +++ b/tests/unit/test_operations/test_get_xhtml_errors.py @@ -89,7 +89,7 @@ def test_get_xhtml_errors( operation_params.namespace = namespace operation = GetXhtmlErrors( params=operation_params, - original_dataset=dataset, + evaluation_dataset=dataset, cache_service=MagicMock(), data_service=MagicMock(), ) diff --git a/tests/unit/test_operations/test_label_referenced_variable_metadata.py b/tests/unit/test_operations/test_label_referenced_variable_metadata.py index 58caa1c15..c96570043 100644 --- a/tests/unit/test_operations/test_label_referenced_variable_metadata.py +++ b/tests/unit/test_operations/test_label_referenced_variable_metadata.py @@ -14,7 +14,7 @@ from cdisc_rules_engine.services.data_services import LocalDataService from cdisc_rules_engine.models.dataset.pandas_dataset import PandasDataset import pytest -from unittest.mock import Mock, patch +from unittest.mock import Mock @pytest.mark.parametrize("dataset_type", [(PandasDataset)]) @@ -170,6 +170,10 @@ def test_get_label_referenced_variable_metadata( operation_params.standard_version = "3-4" operation_params.target = "AELABEL" operation_params.operation_id = "$label_referenced_variable" + operation_params.dataframe_metadata = SDTMDatasetMetadata( + first_record={"DOMAIN": "AE"} + ) + # save model metadata to cache cache = InMemoryCacheService.get_instance() @@ -191,14 +195,7 @@ def test_get_label_referenced_variable_metadata( library_metadata, ) - def mock_cached_method(*args, **kwargs): - return SDTMDatasetMetadata(first_record={"DOMAIN": "AE"}) - - with patch( - "cdisc_rules_engine.services.data_services.LocalDataService.get_raw_dataset_metadata", - side_effect=mock_cached_method, - ): - result: pd.DataFrame = operation.execute() + result: pd.DataFrame = operation.execute() expected_columns = [ "STUDYID", "AETERM", diff --git a/tests/unit/test_operations/test_library_column_order.py b/tests/unit/test_operations/test_library_column_order.py index b13f9d8e8..906346b1a 100644 --- a/tests/unit/test_operations/test_library_column_order.py +++ b/tests/unit/test_operations/test_library_column_order.py @@ -8,7 +8,7 @@ import pandas as pd import pytest -from unittest.mock import Mock, patch +from unittest.mock import Mock from cdisc_rules_engine.constants.classes import GENERAL_OBSERVATIONS_CLASS from cdisc_rules_engine.enums.variable_roles import VariableRoles from cdisc_rules_engine.models.operation_params import OperationParams @@ -314,6 +314,9 @@ def test_get_column_order_from_library( operation_params.domain = "AE" operation_params.standard = "sdtmig" operation_params.standard_version = "3-4" + operation_params.dataframe_metadata = SDTMDatasetMetadata( + first_record={"DOMAIN": "AE"} + ) # save model metadata to cache cache = InMemoryCacheService.get_instance() @@ -335,14 +338,7 @@ def test_get_column_order_from_library( library_metadata, ) - def mock_cached_method(*args, **kwargs): - return SDTMDatasetMetadata(first_record={"DOMAIN": "AE"}) - - with patch( - "cdisc_rules_engine.services.data_services.LocalDataService.get_raw_dataset_metadata", - side_effect=mock_cached_method, - ): - result = operation.execute() + result = operation.execute() variables: List[str] = [ "STUDYID", "DOMAIN", diff --git a/tests/unit/test_operations/test_library_model_column_order.py b/tests/unit/test_operations/test_library_model_column_order.py index bc69b7e43..2d48bed13 100644 --- a/tests/unit/test_operations/test_library_model_column_order.py +++ b/tests/unit/test_operations/test_library_model_column_order.py @@ -118,9 +118,9 @@ def test_get_column_order_from_library(operation_params: OperationParams, datase operation_params.domain = "AE" operation_params.standard = "sdtmig" operation_params.standard_version = "3-4" - operation_params.datasets = [ - SDTMDatasetMetadata(name="AE", first_record={"DOMAIN": "AE"}) - ] + operation_params.dataframe_metadata = SDTMDatasetMetadata( + name="AE", first_record={"DOMAIN": "AE"} + ) # save model metadata to cache cache = InMemoryCacheService.get_instance() @@ -138,11 +138,6 @@ def test_get_column_order_from_library(operation_params: OperationParams, datase library_metadata=library_metadata, ) - def mock_get_raw_metadata(*args, **kwargs): - return SDTMDatasetMetadata(name="AE", first_record={"DOMAIN": "AE"}) - - data_service.get_raw_dataset_metadata = mock_get_raw_metadata - operation = LibraryModelColumnOrder( operation_params, operation_params.dataframe, @@ -277,9 +272,9 @@ def test_get_findings_class_column_order_from_library( operation_params.domain = "AE" operation_params.standard = "sdtmig" operation_params.standard_version = "3-4" - operation_params.datasets = [ - SDTMDatasetMetadata(name="AE", first_record={"DOMAIN": "AE"}) - ] + operation_params.dataframe_metadata = SDTMDatasetMetadata( + name="AE", first_record={"DOMAIN": "AE"} + ) # save model metadata to cache cache = InMemoryCacheService.get_instance() @@ -296,10 +291,6 @@ def test_get_findings_class_column_order_from_library( library_metadata=library_metadata, ) - def mock_get_raw_metadata(*args, **kwargs): - return SDTMDatasetMetadata(name="AE", first_record={"DOMAIN": "AE"}) - - data_service.get_raw_dataset_metadata = mock_get_raw_metadata operation = LibraryModelColumnOrder( operation_params, operation_params.dataframe, diff --git a/tests/unit/test_operations/test_name_referenced_variable_metadata.py b/tests/unit/test_operations/test_name_referenced_variable_metadata.py index 51ba41363..7c45ee0ee 100644 --- a/tests/unit/test_operations/test_name_referenced_variable_metadata.py +++ b/tests/unit/test_operations/test_name_referenced_variable_metadata.py @@ -13,7 +13,7 @@ from cdisc_rules_engine.services.cache import InMemoryCacheService from cdisc_rules_engine.services.data_services import LocalDataService import pytest -from unittest.mock import Mock, patch +from unittest.mock import Mock @pytest.mark.parametrize("dataset_type", [(PandasDataset)]) @@ -170,6 +170,9 @@ def test_get_name_referenced_variable_metadata( operation_params.standard_version = "3-4" operation_params.target = "AEREF" operation_params.operation_id = "$name_referenced_variable" + operation_params.dataframe_metadata = SDTMDatasetMetadata( + first_record={"DOMAIN": "AE"} + ) # save model metadata to cache cache = InMemoryCacheService() library_metadata = LibraryMetadataContainer( @@ -190,14 +193,7 @@ def test_get_name_referenced_variable_metadata( library_metadata, ) - def mock_cached_method(*args, **kwargs): - return SDTMDatasetMetadata(first_record={"DOMAIN": "AE"}) - - with patch( - "cdisc_rules_engine.services.data_services.LocalDataService.get_raw_dataset_metadata", - side_effect=mock_cached_method, - ): - result = operation.execute() + result = operation.execute() expected_columns = [ "STUDYID", "AETERM", diff --git a/tests/unit/test_operations/test_operations_factory.py b/tests/unit/test_operations/test_operations_factory.py index 0889aff9c..8a9352d69 100644 --- a/tests/unit/test_operations/test_operations_factory.py +++ b/tests/unit/test_operations/test_operations_factory.py @@ -29,7 +29,7 @@ def _execute_operation(self): operation_params=operation_params, cache=cache, data_service=data_service, - original_dataset=pd.DataFrame(), + evaluation_dataset=pd.DataFrame(), library_metadata=LibraryMetadataContainer(), ) assert isinstance(op, DummyOperation) diff --git a/tests/unit/test_operations/test_parent_library_model_column_order.py b/tests/unit/test_operations/test_parent_library_model_column_order.py index 0f40379ec..bdd777d64 100644 --- a/tests/unit/test_operations/test_parent_library_model_column_order.py +++ b/tests/unit/test_operations/test_parent_library_model_column_order.py @@ -117,6 +117,7 @@ def test_get_parent_column_order_from_library( ): datasets: List[SDTMDatasetMetadata] = [ SDTMDatasetMetadata( + name="AE", first_record={"DOMAIN": "AE"}, filename="ae.xpt", full_path="ae.xpt", @@ -129,18 +130,17 @@ def test_get_parent_column_order_from_library( "AESEQ": [1, 2, 3], } ) - path_to_dataset_map: dict = {"ae.xpt": ae} + path_to_dataset_map: dict = {"AE": ae} with patch( "cdisc_rules_engine.services.data_services.LocalDataService.get_dataset", side_effect=lambda dataset_name: path_to_dataset_map[dataset_name], ): - # Set evaluation_dataset instead of dataframe - operation_params.evaluation_dataset = data operation_params.domain = "SUPPAE" operation_params.standard = "sdtmig" operation_params.standard_version = "3-4" - operation_params.datasets = datasets - operation_params.dataset_path = "suppae.xpt" + operation_params.dataframe_metadata = SDTMDatasetMetadata( + first_record={"RDOMAIN": "AE"} + ) # save model metadata to cache cache = InMemoryCacheService.get_instance() @@ -156,6 +156,7 @@ def test_get_parent_column_order_from_library( standard_version="3-4", library_metadata=library_metadata, ) + data_service.get_datasets = lambda: datasets def mock_get_raw_metadata(dataset_name, **kwargs): if "ae" in dataset_name.lower(): @@ -300,11 +301,13 @@ def test_get_parent_findings_class_column_order_from_library( ): datasets: List[dict] = [ { + "name": "AE", "first_record": {"DOMAIN": "AE"}, "filename": "ae.xpt", "full_path": "ae.xpt", }, { + "name": "EC", "first_record": {"DOMAIN": "EC"}, "filename": "ec.xpt", "full_path": "ec.xpt", @@ -336,22 +339,19 @@ def test_get_parent_findings_class_column_order_from_library( } ) path_to_dataset_map: dict = { - "ae.xpt": ae, - "ec.xpt": ec, + "AE": ae, + "EC": ec, } with patch( "cdisc_rules_engine.services.data_services.LocalDataService.get_dataset", side_effect=lambda dataset_name: path_to_dataset_map[dataset_name], ): - # Set evaluation_dataset instead of dataframe - operation_params.evaluation_dataset = data operation_params.domain = "SUPPAE" operation_params.standard = "sdtmig" operation_params.standard_version = "3-4" - operation_params.datasets = [ - SDTMDatasetMetadata(**dataset) for dataset in datasets - ] - operation_params.dataset_path = "suppae.xpt" + operation_params.dataframe_metadata = SDTMDatasetMetadata( + first_record={"RDOMAIN": "AE"}, + ) # save model metadata to cache cache = InMemoryCacheService.get_instance() @@ -368,6 +368,9 @@ def test_get_parent_findings_class_column_order_from_library( standard_version="3-4", library_metadata=library_metadata, ) + data_service.get_datasets = lambda: [ + SDTMDatasetMetadata(**dataset) for dataset in datasets + ] def mock_get_raw_metadata(dataset_name, **kwargs): if "ae" in dataset_name.lower(): diff --git a/tests/unit/test_operations/test_permissible_variables.py b/tests/unit/test_operations/test_permissible_variables.py index be9838326..b5f7b8a38 100644 --- a/tests/unit/test_operations/test_permissible_variables.py +++ b/tests/unit/test_operations/test_permissible_variables.py @@ -7,7 +7,7 @@ import pandas as pd import pytest -from unittest.mock import Mock, patch +from unittest.mock import Mock from cdisc_rules_engine.constants.classes import GENERAL_OBSERVATIONS_CLASS from cdisc_rules_engine.enums.variable_roles import VariableRoles from cdisc_rules_engine.models.operation_params import OperationParams @@ -314,6 +314,9 @@ def test_get_permissible_variables( operation_params.domain = "AE" operation_params.standard = "sdtmig" operation_params.standard_version = "3-4" + operation_params.dataframe_metadata = SDTMDatasetMetadata( + first_record={"DOMAIN": "AE"} + ) cache = InMemoryCacheService library_metadata = LibraryMetadataContainer( standard_metadata=standard_metadata, model_metadata=model_metadata @@ -332,14 +335,7 @@ def test_get_permissible_variables( library_metadata, ) - def mock_cached_method(*args, **kwargs): - return SDTMDatasetMetadata(first_record={"DOMAIN": "AE"}) - - with patch( - "cdisc_rules_engine.services.data_services.LocalDataService.get_raw_dataset_metadata", - side_effect=mock_cached_method, - ): - result: pd.DataFrame = operation.execute() + result: pd.DataFrame = operation.execute() variables: List[str] = [ "STUDYID", "DOMAIN", diff --git a/tests/unit/test_operations/test_related_domain_is_custom.py b/tests/unit/test_operations/test_related_domain_is_custom.py index c01ea774e..ffbe71871 100644 --- a/tests/unit/test_operations/test_related_domain_is_custom.py +++ b/tests/unit/test_operations/test_related_domain_is_custom.py @@ -1,3 +1,5 @@ +from unittest.mock import MagicMock + import pytest from cdisc_rules_engine.models.library_metadata_container import ( @@ -16,8 +18,7 @@ def __init__(self, name: str, is_supp: bool, rdomain: str): class DummyParams: - def __init__(self, datasets, domain: str): - self.datasets = datasets + def __init__(self, domain: str): self.domain = domain @@ -73,14 +74,16 @@ def test_related_domain_is_custom( library_metadata = LibraryMetadataContainer( standard_metadata={"dataset_names": standard_domains} ) - params = DummyParams(datasets=study_datasets, domain=domain) + params = DummyParams(domain=domain) + data_service = MagicMock() + data_service.get_datasets.return_value = study_datasets op = RelatedDomainIsCustom( params=params, library_metadata=library_metadata, - original_dataset=None, + evaluation_dataset=None, cache_service=None, - data_service=None, + data_service=data_service, ) assert op._execute_operation() is expected diff --git a/tests/unit/test_operations/test_required_variables.py b/tests/unit/test_operations/test_required_variables.py index 3a92a945b..d4c3d11e6 100644 --- a/tests/unit/test_operations/test_required_variables.py +++ b/tests/unit/test_operations/test_required_variables.py @@ -8,7 +8,7 @@ import pandas as pd import pytest -from unittest.mock import Mock, patch +from unittest.mock import Mock from cdisc_rules_engine.constants.classes import GENERAL_OBSERVATIONS_CLASS from cdisc_rules_engine.enums.variable_roles import VariableRoles @@ -178,6 +178,9 @@ def test_get_required_variables(operation_params: OperationParams, dataset_type) operation_params.domain = "AE" operation_params.standard = "sdtmig" operation_params.standard_version = "3-4" + operation_params.dataframe_metadata = SDTMDatasetMetadata( + first_record={"DOMAIN": "AE"} + ) # save model metadata to cache cache = InMemoryCacheService.get_instance() @@ -199,14 +202,7 @@ def test_get_required_variables(operation_params: OperationParams, dataset_type) library_metadata, ) - def mock_cached_method(*args, **kwargs): - return SDTMDatasetMetadata(first_record={"DOMAIN": "AE"}) - - with patch( - "cdisc_rules_engine.services.data_services.LocalDataService.get_raw_dataset_metadata", - side_effect=mock_cached_method, - ): - result: pd.DataFrame = operation.execute() + result: pd.DataFrame = operation.execute() variables: List[str] = sorted(["AETEST"]) for result_array in result[operation_params.operation_id]: assert sorted(result_array) == variables diff --git a/tests/unit/test_operations/test_study_domains.py b/tests/unit/test_operations/test_study_domains.py index 08c89c4f8..68eab7164 100644 --- a/tests/unit/test_operations/test_study_domains.py +++ b/tests/unit/test_operations/test_study_domains.py @@ -24,7 +24,9 @@ def test_get_study_domains_with_duplicates( {"filename": "ae.xpt", "first_record": {"DOMAIN": "AE"}}, {"filename": "tv.xpt", "first_record": {"DOMAIN": "TV"}}, ] - operation_params.datasets = [SDTMDatasetMetadata(**dataset) for dataset in datasets] + data_service.get_datasets = lambda: [ + SDTMDatasetMetadata(**dataset) for dataset in datasets + ] result = StudyDomains( operation_params, dataset_type.from_dict({"A": [1, 2, 3]}), cache, data_service ).execute() @@ -46,7 +48,9 @@ def test_get_study_domains_with_missing_domains( {"filename": "ae.xpt", "first_record": {"DOMAIN": "AE"}}, {"filename": "tv.xpt", "first_record": {"DOMAIN": "TV"}}, ] - operation_params.datasets = [SDTMDatasetMetadata(**dataset) for dataset in datasets] + data_service.get_datasets = lambda: [ + SDTMDatasetMetadata(**dataset) for dataset in datasets + ] result = StudyDomains( operation_params, dataset_type.from_dict({"A": [1, 2, 3]}), cache, data_service ).execute() diff --git a/tests/unit/test_operations/test_variable_count.py b/tests/unit/test_operations/test_variable_count.py index c59a9d73c..da62fe949 100644 --- a/tests/unit/test_operations/test_variable_count.py +++ b/tests/unit/test_operations/test_variable_count.py @@ -23,7 +23,6 @@ def test_variable_count( ): config = ConfigService() cache = CacheServiceFactory(config).get_cache_service() - dataset_path = os.path.join("study", "bundle", "blah") datasets_map = { "AE": dataset_type.from_dict( {"STUDYID": [4, 7, 9], "AESEQ": [1, 2, 3], "DOMAIN": [12, 6, 1]} @@ -73,10 +72,9 @@ def test_variable_count( mock_data_service.concat_split_datasets.side_effect = lambda func, files: pd.concat( [func(f) for f in files] ) - operation_params.datasets = datasets + mock_data_service.get_datasets = lambda: datasets operation_params.target = target operation_params.original_target = target - operation_params.dataset_path = dataset_path result = VariableCount( operation_params, datasets_map["AE"], cache, mock_data_service ).execute() diff --git a/tests/unit/test_operations/test_variable_names.py b/tests/unit/test_operations/test_variable_names.py index 9daa296da..e8401f969 100644 --- a/tests/unit/test_operations/test_variable_names.py +++ b/tests/unit/test_operations/test_variable_names.py @@ -44,18 +44,11 @@ def test_get_variable_names_for_given_standard( } }, ) - dataset_path = "study/bundle/blah" datasets_map = { "AE": dataset_type.from_dict({"STUDYID": [4, 7, 9], "DOMAIN": [12, 6, 1]}), "EX": dataset_type.from_dict({"STUDYID": [4, 8, 12], "DOMAIN": [12, 6, 1]}), "AE2": dataset_type.from_dict({"STUDYID": [4, 7, 9], "DOMAIN": [12, 6, 1]}), } - - datasets = [ - {"domain": "AE", "filename": "AE"}, - {"domain": "EX", "filename": "EX"}, - {"domain": "AE", "filename": "AE2"}, - ] mock_data_service.get_dataset.side_effect = lambda name: datasets_map.get( name.split("/")[-1] ) @@ -63,8 +56,6 @@ def test_get_variable_names_for_given_standard( [func(f) for f in files] ) operation_params.target = target - operation_params.datasets = datasets - operation_params.dataset_path = dataset_path operation_params.standard = standard operation_params.standard_version = standard_version result = VariableNames( diff --git a/tests/unit/test_operations/test_variable_value_count.py b/tests/unit/test_operations/test_variable_value_count.py index 9e193be62..8f48f4ce1 100644 --- a/tests/unit/test_operations/test_variable_value_count.py +++ b/tests/unit/test_operations/test_variable_value_count.py @@ -28,7 +28,6 @@ def test_variable_value_count( ): config = ConfigService() cache = CacheServiceFactory(config).get_cache_service() - dataset_path = os.path.join("study", "bundle", "blah") datasets_map = { "AE": dataset_type.from_dict( {"STUDYID": [4, 7, 9], "AESEQ": [1, 2, 3], "DOMAIN": [12, 6, 1]} @@ -71,9 +70,8 @@ def test_variable_value_count( mock_data_service.concat_split_datasets.side_effect = ( lambda func, files: dataset_type().concat([func(f) for f in files]) ) - operation_params.datasets = datasets + mock_data_service.get_datasets = lambda: datasets operation_params.original_target = target - operation_params.dataset_path = dataset_path result = VariableValueCount( operation_params, datasets_map["AE"], diff --git a/tests/unit/test_rules_engine.py b/tests/unit/test_rules_engine.py index 5de91eb06..e3b988ebd 100644 --- a/tests/unit/test_rules_engine.py +++ b/tests/unit/test_rules_engine.py @@ -91,7 +91,6 @@ def test_validate_rule_invalid_suffix( standard="sdtmig" ).validate_single_dataset( mock_ae_record_rule_equal_to_suffix, - [], SDTMDatasetMetadata( name="AE", first_record={"DOMAIN": "AE"}, @@ -102,12 +101,12 @@ def test_validate_rule_invalid_suffix( assert validation_result == [ { "executionStatus": ExecutionStatus.ISSUE_REPORTED.value, - "dataset": "bundle", + "dataset": "AE", "domain": "AE", "variables": ["AESTDY"], "message": "Suffix of AESTDY is equal to test.", "errors": [ - {"value": {"AESTDY": "valid-test"}, "dataset": "bundle", "row": 1} + {"value": {"AESTDY": "valid-test"}, "dataset": "AE", "row": 1} ], } ] @@ -139,7 +138,6 @@ def test_validate_rule_invalid_prefix( standard="sdtmig" ).validate_single_dataset( mock_record_rule_equal_to_string_prefix, - [], SDTMDatasetMetadata( name="AE", first_record={"DOMAIN": "AE"}, @@ -150,12 +148,12 @@ def test_validate_rule_invalid_prefix( assert validation_result == [ { "executionStatus": ExecutionStatus.ISSUE_REPORTED.value, - "dataset": "bundle", + "dataset": "AE", "domain": "AE", "variables": ["AESTDY"], "message": "Prefix of AESTDY is equal to test.", "errors": [ - {"value": {"AESTDY": "test-valid"}, "dataset": "bundle", "row": 1} + {"value": {"AESTDY": "test-valid"}, "dataset": "AE", "row": 1} ], } ] @@ -235,47 +233,53 @@ def test_validate_rule_cross_dataset_check( mock_get_dataset_class.return_value = None # mock blob storage call path_to_dataset_map: dict = { - os.path.join("path", "ae.xpt"): ae_dataset, - os.path.join("path", "ec.xpt"): ec_dataset, + "AE": ae_dataset, + "EC": ec_dataset, } - with patch( - "cdisc_rules_engine.services.data_services.LocalDataService.get_dataset", - side_effect=lambda dataset_name: path_to_dataset_map[dataset_name], + datasets = [ + SDTMDatasetMetadata( + name="EC", + first_record={"DOMAIN": "EC"}, + filename="ec.xpt", + full_path=os.path.join("path", "ec.xpt"), + ), + SDTMDatasetMetadata( + name="AE", + first_record={"DOMAIN": "AE"}, + filename="ae.xpt", + full_path=os.path.join("path", "ae.xpt"), + ), + ] + with ( + patch( + "cdisc_rules_engine.services.data_services.LocalDataService.get_dataset", + side_effect=lambda dataset_name: path_to_dataset_map[dataset_name], + ), + patch( + "cdisc_rules_engine.services.data_services.LocalDataService.get_datasets", + return_value=datasets, + ), ): - datasets = [ - SDTMDatasetMetadata( - name="EC", - first_record={"DOMAIN": "EC"}, - filename="ec.xpt", - full_path=os.path.join("path", "ec.xpt"), - ), - SDTMDatasetMetadata( - name="AE", - first_record={"DOMAIN": "AE"}, - filename="ae.xpt", - full_path=os.path.join("path", "ae.xpt"), - ), - ] validation_result: List[str] = RulesEngine( standard="sdtmig", standard_version="3-4" - ).validate_single_dataset(dataset_rule_equal_to, datasets, datasets[0]) + ).validate_single_dataset(dataset_rule_equal_to, datasets[0]) assert validation_result == [ { "executionStatus": ExecutionStatus.ISSUE_REPORTED.value, - "dataset": "ec.xpt", + "dataset": "EC", "domain": "EC", "variables": ["ECSTDY"], "message": "Value of ECSTDY is equal to AESTDY.", "errors": [ { - "dataset": "ec.xpt", + "dataset": "EC", "row": 1, "value": {"ECSTDY": 4.0}, "USUBJID": "CDISC001", "SEQ": 1, }, { - "dataset": "ec.xpt", + "dataset": "EC", "row": 2, "value": {"ECSTDY": 5.0}, "USUBJID": "CDISC001", @@ -350,31 +354,36 @@ def test_validate_one_to_one_rel_across_datasets(dataset_rule_one_to_one_related ) ) path_to_dataset_map: dict = { - os.path.join("path", "ae.xpt"): ae_dataset, - os.path.join("path", "ec.xpt"): ec_dataset, + "AE": ae_dataset, + "EC": ec_dataset, } - with patch( - "cdisc_rules_engine.services.data_services.LocalDataService.get_dataset", - side_effect=lambda dataset_name: path_to_dataset_map[dataset_name], + with ( + patch( + "cdisc_rules_engine.services.data_services.LocalDataService.get_dataset", + side_effect=lambda dataset_name: path_to_dataset_map[dataset_name], + ), + patch( + "cdisc_rules_engine.services.data_services.LocalDataService.get_datasets", + return_value=datasets, + ), ): validation_result: List[dict] = RulesEngine( standard="sdtmig" ).validate_single_dataset( dataset_rule_one_to_one_related, - datasets, datasets[0], ) assert validation_result == [ { "executionStatus": ExecutionStatus.ISSUE_REPORTED.value, - "dataset": "ec.xpt", + "dataset": "EC", "domain": "EC", "variables": ["VISITNUM"], "message": "VISITNUM is not one-to-one related to VISIT", "errors": [ - {"value": {"VISITNUM": 1}, "dataset": "ec.xpt", "row": 1}, - {"value": {"VISITNUM": 1}, "dataset": "ec.xpt", "row": 3}, - {"value": {"VISITNUM": 3}, "dataset": "ec.xpt", "row": 4}, + {"value": {"VISITNUM": 1}, "dataset": "EC", "row": 1}, + {"value": {"VISITNUM": 1}, "dataset": "EC", "row": 3}, + {"value": {"VISITNUM": 3}, "dataset": "EC", "row": 4}, ], } ] @@ -402,7 +411,6 @@ def test_validate_rule_single_dataset_check(dataset_rule_greater_than: dict): standard="sdtmig" ).validate_single_dataset( dataset_rule_greater_than, - [], SDTMDatasetMetadata( name="EC", first_record={"DOMAIN": "EC"}, @@ -414,12 +422,12 @@ def test_validate_rule_single_dataset_check(dataset_rule_greater_than: dict): { "executionStatus": ExecutionStatus.ISSUE_REPORTED.value, "domain": "EC", - "dataset": "bundle", + "dataset": "EC", "variables": ["ECCOOLVAR"], "message": "Value for ECCOOLVAR greater than 30.", "errors": [ - {"value": {"ECCOOLVAR": 100}, "dataset": "bundle", "row": 2}, - {"value": {"ECCOOLVAR": 34}, "dataset": "bundle", "row": 4}, + {"value": {"ECCOOLVAR": 100}, "dataset": "EC", "row": 2}, + {"value": {"ECCOOLVAR": 34}, "dataset": "EC", "row": 4}, ], } ] @@ -448,7 +456,6 @@ def test_validate_rule_equal_length(dataset_rule_has_equal_length: dict): standard="sdtmig" ).validate_single_dataset( dataset_rule_has_equal_length, - [], SDTMDatasetMetadata( name="EC", first_record={"DOMAIN": "EC"}, @@ -460,11 +467,11 @@ def test_validate_rule_equal_length(dataset_rule_has_equal_length: dict): { "executionStatus": ExecutionStatus.ISSUE_REPORTED.value, "domain": "EC", - "dataset": "bundle", + "dataset": "EC", "variables": ["ECCOOLVAR"], "message": "Length of ECCOOLVAR is equal to 5.", "errors": [ - {"value": {"ECCOOLVAR": "equal"}, "dataset": "bundle", "row": 2} + {"value": {"ECCOOLVAR": "equal"}, "dataset": "EC", "row": 2} ], } ] @@ -492,28 +499,33 @@ def test_validate_is_contained_by_distinct(mock_rule_distinct_operation: dict): ) path_to_dataset_map: dict = { - os.path.join("path", "ae.xpt"): ae_dataset, - os.path.join("path", "dm.xpt"): dm_dataset, + "AE": ae_dataset, + "DM": dm_dataset, } - with patch( - "cdisc_rules_engine.services.data_services.LocalDataService.get_dataset", - side_effect=lambda dataset_name: path_to_dataset_map[dataset_name], + with ( + patch( + "cdisc_rules_engine.services.data_services.LocalDataService.get_dataset", + side_effect=lambda dataset_name: path_to_dataset_map[dataset_name], + ), + patch( + "cdisc_rules_engine.services.data_services.LocalDataService.get_datasets", + return_value=datasets, + ), ): validation_result: List[dict] = RulesEngine( standard="sdtmig" ).validate_single_dataset( mock_rule_distinct_operation, - datasets, datasets[1], ) assert validation_result == [ { "executionStatus": ExecutionStatus.ISSUE_REPORTED.value, - "dataset": "ae.xpt", + "dataset": "AE", "domain": "AE", "variables": ["AESTDY"], "message": "Value for AESTDY not in DM.USUBJID", - "errors": [{"value": {"AESTDY": 5000}, "dataset": "ae.xpt", "row": 4}], + "errors": [{"value": {"AESTDY": 5000}, "dataset": "AE", "row": 4}], } ] @@ -541,7 +553,6 @@ def test_validate_rule_not_equal_length(dataset_rule_has_not_equal_length: dict) standard="sdtmig" ).validate_single_dataset( dataset_rule_has_not_equal_length, - [], SDTMDatasetMetadata( name="EC", first_record={"DOMAIN": "EC"}, @@ -553,13 +564,13 @@ def test_validate_rule_not_equal_length(dataset_rule_has_not_equal_length: dict) { "executionStatus": ExecutionStatus.ISSUE_REPORTED.value, "domain": "EC", - "dataset": "bundle", + "dataset": "EC", "variables": ["ECCOOLVAR"], "message": "Length of ECCOOLVAR is not equal to 5.", "errors": [ { "value": {"ECCOOLVAR": "first_string"}, - "dataset": "bundle", + "dataset": "EC", "row": 1, } ], @@ -584,7 +595,6 @@ def test_validate_rule_multiple_conditions(dataset_rule_multiple_conditions: dic standard="sdtmig" ).validate_single_dataset( dataset_rule_multiple_conditions, - [], SDTMDatasetMetadata( name="EC", first_record={"DOMAIN": "EC"}, @@ -596,14 +606,14 @@ def test_validate_rule_multiple_conditions(dataset_rule_multiple_conditions: dic { "executionStatus": ExecutionStatus.ISSUE_REPORTED.value, "domain": "EC", - "dataset": "bundle", + "dataset": "EC", "variables": ["ECCOOLVAR"], "message": ( "Length of ECCOOLVAR is not equal to 5 or ECCOOLVAR == cool." ), "errors": [ - {"value": {"ECCOOLVAR": "valid"}, "dataset": "bundle", "row": 2}, - {"value": {"ECCOOLVAR": "cool"}, "dataset": "bundle", "row": 3}, + {"value": {"ECCOOLVAR": "valid"}, "dataset": "EC", "row": 2}, + {"value": {"ECCOOLVAR": "cool"}, "dataset": "EC", "row": 3}, ], } ] @@ -626,7 +636,6 @@ def test_validate_record_rule_numbers_separated_by_dash_pattern(): standard="sdtmig" ).validate_single_dataset( rule, - [], SDTMDatasetMetadata( name="AE", first_record={"DOMAIN": "AE"}, @@ -637,13 +646,13 @@ def test_validate_record_rule_numbers_separated_by_dash_pattern(): assert validation_result == [ { "executionStatus": ExecutionStatus.ISSUE_REPORTED.value, - "dataset": "bundle", + "dataset": "AE", "domain": "AE", "variables": ["AESTDY"], "message": "Records have the following pattern: ^\\d+\\-\\d+$", "errors": [ - {"value": {"AESTDY": "5-5"}, "dataset": "bundle", "row": 1}, - {"value": {"AESTDY": "10-10"}, "dataset": "bundle", "row": 2}, + {"value": {"AESTDY": "5-5"}, "dataset": "AE", "row": 1}, + {"value": {"AESTDY": "10-10"}, "dataset": "AE", "row": 2}, ], } ] @@ -666,7 +675,6 @@ def test_validate_record_rule_semi_colon_delimited_pattern(): standard="sdtmig" ).validate_single_dataset( rule, - [], SDTMDatasetMetadata( name="AE", first_record={"DOMAIN": "AE"}, @@ -678,12 +686,12 @@ def test_validate_record_rule_semi_colon_delimited_pattern(): { "executionStatus": ExecutionStatus.ISSUE_REPORTED.value, "domain": "AE", - "dataset": "bundle", + "dataset": "AE", "variables": ["AESTDY"], "message": "Records have the following pattern: [^,]*;[^,]*", "errors": [ - {"value": {"AESTDY": "5;5"}, "dataset": "bundle", "row": 1}, - {"value": {"AESTDY": "alex;alex"}, "dataset": "bundle", "row": 2}, + {"value": {"AESTDY": "5;5"}, "dataset": "AE", "row": 1}, + {"value": {"AESTDY": "alex;alex"}, "dataset": "AE", "row": 2}, ], } ] @@ -708,7 +716,6 @@ def test_validate_record_rule_no_letters_numbers_underscores(): standard="sdtmig" ).validate_single_dataset( rule, - [], SDTMDatasetMetadata( name="AE", first_record={"DOMAIN": "AE"}, @@ -719,13 +726,13 @@ def test_validate_record_rule_no_letters_numbers_underscores(): assert validation_result == [ { "executionStatus": ExecutionStatus.ISSUE_REPORTED.value, - "dataset": "bundle", + "dataset": "AE", "domain": "AE", "variables": ["AESTDY"], "message": "Records have the following pattern: ^((?![a-zA-Z0-9_]).)*$", "errors": [ - {"value": {"AESTDY": "[.*)]#@"}, "dataset": "bundle", "row": 1}, - {"value": {"AESTDY": "|>.§!"}, "dataset": "bundle", "row": 3}, + {"value": {"AESTDY": "[.*)]#@"}, "dataset": "AE", "row": 1}, + {"value": {"AESTDY": "|>.§!"}, "dataset": "AE", "row": 3}, ], } ] @@ -761,7 +768,6 @@ def test_validate_dataset_metadata( standard="sdtmig" ).validate_single_dataset( dataset_metadata_not_equal_to_rule, - [], SDTMDatasetMetadata( name="EC", first_record={"DOMAIN": "EC"}, @@ -772,7 +778,7 @@ def test_validate_dataset_metadata( assert validation_result == [ { "domain": "EC", - "dataset": "bundle", + "dataset": "EC", "errors": [], "executionStatus": ExecutionStatus.SUCCESS.value, "message": None, @@ -813,7 +819,6 @@ def test_validate_dataset_metadata_wrong_metadata( standard="sdtmig" ).validate_single_dataset( dataset_metadata_not_equal_to_rule, - [], SDTMDatasetMetadata( name="EC", first_record={"DOMAIN": "EC"}, @@ -824,12 +829,12 @@ def test_validate_dataset_metadata_wrong_metadata( assert validation_result == [ { "domain": "EC", - "dataset": "bundle", + "dataset": "EC", "executionStatus": ExecutionStatus.ISSUE_REPORTED.value, "variables": ["dataset_label", "dataset_name", "dataset_size"], "errors": [ { - "dataset": "bundle", + "dataset": "EC", "row": 1, "value": { "dataset_name": "AD", @@ -868,7 +873,6 @@ def test_validate_variable_metadata( standard="sdtmig" ).validate_single_dataset( variables_metadata_rule, - [], SDTMDatasetMetadata( name="EC", first_record={"DOMAIN": "EC"}, @@ -879,7 +883,7 @@ def test_validate_variable_metadata( assert validation_result == [ { "domain": "EC", - "dataset": "bundle", + "dataset": "EC", "errors": [], "executionStatus": ExecutionStatus.SUCCESS.value, "message": None, @@ -891,7 +895,6 @@ def test_validate_variable_metadata( standard="sdtmig" ).validate_single_dataset( variables_metadata_rule, - [], SDTMDatasetMetadata( name="EC", first_record={"DOMAIN": "EC"}, @@ -902,7 +905,7 @@ def test_validate_variable_metadata( assert validation_result == [ { "domain": "EC", - "dataset": "bundle", + "dataset": "EC", "errors": [], "executionStatus": ExecutionStatus.SUCCESS.value, "message": None, @@ -940,7 +943,6 @@ def test_validate_variable_metadata_wrong_metadata( standard="sdtmig" ).validate_single_dataset( variables_metadata_rule, - [], SDTMDatasetMetadata( name="EC", first_record={"DOMAIN": "EC"}, @@ -951,12 +953,12 @@ def test_validate_variable_metadata_wrong_metadata( assert validation_result == [ { "domain": "EC", - "dataset": "bundle", + "dataset": "EC", "variables": ["variable_name", "variable_label", "variable_data_type"], "executionStatus": ExecutionStatus.ISSUE_REPORTED.value, "errors": [ { - "dataset": "bundle", + "dataset": "EC", "row": 1, "value": { "variable_name": "longer than eight", @@ -965,7 +967,7 @@ def test_validate_variable_metadata_wrong_metadata( }, }, { - "dataset": "bundle", + "dataset": "EC", "row": 2, "value": { "variable_name": "longer than eight as well", @@ -984,7 +986,10 @@ def test_validate_variable_metadata_wrong_metadata( @patch( "cdisc_rules_engine.services.data_services.LocalDataService.get_dataset", ) -def test_rule_with_domain_prefix_replacement(mock_get_dataset: MagicMock): +@patch("cdisc_rules_engine.services.data_services.LocalDataService.get_datasets") +def test_rule_with_domain_prefix_replacement( + mock_get_datasets: MagicMock, mock_get_dataset: MagicMock +): rule = { "core_id": "TEST1", "standards": [], @@ -1016,24 +1021,28 @@ def test_rule_with_domain_prefix_replacement(mock_get_dataset: MagicMock): df = PandasDataset(pd.DataFrame.from_dict({"AESTDY": [11, 12, 40, 59, 59]})) mock_get_dataset.return_value = df dataset_metadata = SDTMDatasetMetadata( - first_record={"DOMAIN": "AE"}, filename="bundle", full_path="study/bundle" + name="AE", + first_record={"DOMAIN": "AE"}, + filename="bundle", + full_path="study/bundle", ) + mock_get_datasets.return_value = [dataset_metadata] validation_result: List[dict] = RulesEngine( standard="sdtmig" - ).validate_single_dataset(rule, [dataset_metadata], dataset_metadata) + ).validate_single_dataset(rule, dataset_metadata) assert validation_result == [ { "executionStatus": ExecutionStatus.ISSUE_REPORTED.value, - "dataset": "bundle", + "dataset": "AE", "domain": "AE", "variables": ["AESTDY"], "message": "Invalid AESTDY value", "errors": [ - {"dataset": "bundle", "row": 1, "value": {"AESTDY": 11}}, - {"dataset": "bundle", "row": 2, "value": {"AESTDY": 12}}, - {"dataset": "bundle", "row": 3, "value": {"AESTDY": 40}}, - {"dataset": "bundle", "row": 4, "value": {"AESTDY": 59}}, - {"dataset": "bundle", "row": 5, "value": {"AESTDY": 59}}, + {"dataset": "AE", "row": 1, "value": {"AESTDY": 11}}, + {"dataset": "AE", "row": 2, "value": {"AESTDY": 12}}, + {"dataset": "AE", "row": 3, "value": {"AESTDY": 40}}, + {"dataset": "AE", "row": 4, "value": {"AESTDY": 59}}, + {"dataset": "AE", "row": 5, "value": {"AESTDY": 59}}, ], } ] @@ -1065,7 +1074,7 @@ def test_rule_with_domain_prefix_replacement(mock_get_dataset: MagicMock): [ { "domain": "AE", - "dataset": "bundle", + "dataset": "AE", "errors": [], "executionStatus": ExecutionStatus.SUCCESS.value, "message": None, @@ -1075,8 +1084,12 @@ def test_rule_with_domain_prefix_replacement(mock_get_dataset: MagicMock): ), ], ) +@patch("cdisc_rules_engine.services.data_services.LocalDataService.get_datasets") def test_validate_domain_presence( - domain_presence_rule: dict, datasets: List[str], expected_validation_result: list + mock_get_datasets: MagicMock, + domain_presence_rule: dict, + datasets: List[str], + expected_validation_result: list, ): """ Unit test for RulesEngine.validate_domain_presence. @@ -1089,11 +1102,11 @@ def test_validate_domain_presence( ) for dataset in datasets ] + mock_get_datasets.return_value = dataset_metadata actual_validation_result: List[dict] = RulesEngine( standard="sdtmig" ).validate_single_dataset( domain_presence_rule, - dataset_metadata, SDTMDatasetMetadata( name="AE", first_record={"DOMAIN": "AE"}, @@ -1129,33 +1142,39 @@ def test_validate_single_dataset(dataset_rule_equal_to_error_objects: dict): } ) ) - with patch( - "cdisc_rules_engine.services.data_services.LocalDataService.get_dataset", - return_value=df, + datasets = [ + SDTMDatasetMetadata( + name="AE", + first_record={"DOMAIN": "AE"}, + filename="bundle", + full_path="study/bundle", + ) + ] + with ( + patch( + "cdisc_rules_engine.services.data_services.LocalDataService.get_dataset", + return_value=df, + ), + patch( + "cdisc_rules_engine.services.data_services.LocalDataService.get_datasets", + return_value=datasets, + ), ): - datasets = [ - SDTMDatasetMetadata( - first_record={"DOMAIN": "AE"}, - filename="bundle", - full_path="study/bundle", - ) - ] validation_result: List[dict] = RulesEngine( standard="sdtmig" ).validate_single_dataset( dataset_rule_equal_to_error_objects, - datasets, datasets[0], ) assert validation_result == [ { "domain": "AE", - "dataset": "bundle", + "dataset": "AE", "executionStatus": ExecutionStatus.ISSUE_REPORTED.value, "variables": ["AESTDY"], "errors": [ { - "dataset": "bundle", + "dataset": "AE", "row": 1, "value": { "AESTDY": "test", @@ -1164,7 +1183,7 @@ def test_validate_single_dataset(dataset_rule_equal_to_error_objects: dict): "SEQ": 1, }, { - "dataset": "bundle", + "dataset": "AE", "row": 4, "value": { "AESTDY": "test", @@ -1173,7 +1192,7 @@ def test_validate_single_dataset(dataset_rule_equal_to_error_objects: dict): "SEQ": 4, }, { - "dataset": "bundle", + "dataset": "AE", "row": 5, "value": { "AESTDY": "test", @@ -1216,31 +1235,37 @@ def test_validate_single_dataset_not_equal_to( } ) ) - with patch( - "cdisc_rules_engine.services.data_services.LocalDataService.get_dataset", - return_value=df, + dataset_metadata = SDTMDatasetMetadata( + name="AE", + first_record={"DOMAIN": "AE"}, + filename="data_bundle", + full_path="study/data_bundle", + ) + with ( + patch( + "cdisc_rules_engine.services.data_services.LocalDataService.get_dataset", + return_value=df, + ), + patch( + "cdisc_rules_engine.services.data_services.LocalDataService.get_datasets", + return_value=[dataset_metadata], + ), ): - dataset_metadata = SDTMDatasetMetadata( - first_record={"DOMAIN": "AE"}, - filename="data_bundle", - full_path="study/data_bundle", - ) validation_result: List[dict] = RulesEngine( standard="sdtmig" ).validate_single_dataset( dataset_rule_not_equal_to_error_objects, - [dataset_metadata], dataset_metadata, ) assert validation_result == [ { "domain": "AE", - "dataset": "data_bundle", + "dataset": "AE", "executionStatus": ExecutionStatus.ISSUE_REPORTED.value, "variables": ["AESTDY"], "errors": [ { - "dataset": "data_bundle", + "dataset": "AE", "row": 2, "value": { "AESTDY": "alex", @@ -1249,7 +1274,7 @@ def test_validate_single_dataset_not_equal_to( "SEQ": 2, }, { - "dataset": "data_bundle", + "dataset": "AE", "row": 3, "value": { "AESTDY": "alex", @@ -1288,7 +1313,7 @@ def test_validate_single_dataset_not_equal_to( [ { "executionStatus": ExecutionStatus.ISSUE_REPORTED.value, - "dataset": "ae.xpt", + "dataset": "AE", "domain": "AE", "variables": ["dataset_label", "dataset_name", "dataset_location"], "message": "Dataset metadata does not correspond to Define XML", @@ -1299,7 +1324,7 @@ def test_validate_single_dataset_not_equal_to( "dataset_location": "ae.xpt", "dataset_label": "Adverse", }, - "dataset": "ae.xpt", + "dataset": "AE", "row": 1, }, ], @@ -1326,7 +1351,7 @@ def test_validate_single_dataset_not_equal_to( [ { "domain": "AE", - "dataset": "ae.xpt", + "dataset": "AE", "errors": [], "executionStatus": ExecutionStatus.SUCCESS.value, "message": None, @@ -1346,7 +1371,11 @@ def test_validate_single_dataset_not_equal_to( @patch( "cdisc_rules_engine.services.data_services.LocalDataService.get_dataset_metadata", ) +@patch( + "cdisc_rules_engine.services.data_services.LocalDataService.get_datasets", +) def test_validate_dataset_metadata_against_define_xml( + mock_get_datasets: MagicMock, mock_get_dataset_metadata: MagicMock, mock_get_define_xml_metadata_for_domain: MagicMock, mock_get_define_xml_reader: MagicMock, @@ -1372,11 +1401,11 @@ def test_validate_dataset_metadata_against_define_xml( filename="ae.xpt", original_path="ae.xpt", ) + mock_get_datasets.return_value = [dataset_metadata] validation_result: List[dict] = RulesEngine( standard="sdtmig" ).validate_single_dataset( define_xml_validation_rule, - [dataset_metadata], dataset_metadata, ) assert validation_result == expected_validation_result @@ -1415,11 +1444,11 @@ def test_validate_dataset_metadata_against_define_xml( [ { "domain": "AE", - "dataset": "test", + "dataset": "AE", "executionStatus": ExecutionStatus.ISSUE_REPORTED.value, "variables": ["variable_size"], "errors": [ - {"dataset": "test", "row": 1, "value": {"variable_size": 30}} + {"dataset": "AE", "row": 1, "value": {"variable_size": 30}} ], "message": ( "Variable metadata variable_size " @@ -1458,11 +1487,11 @@ def test_validate_dataset_metadata_against_define_xml( [ { "domain": "AE", - "dataset": "test", + "dataset": "AE", "executionStatus": ExecutionStatus.ISSUE_REPORTED.value, "variables": ["variable_size"], "errors": [ - {"dataset": "test", "row": 1, "value": {"variable_size": 30}} + {"dataset": "AE", "row": 1, "value": {"variable_size": 30}} ], "message": ( "Variable metadata variable_size " @@ -1480,7 +1509,9 @@ def test_validate_dataset_metadata_against_define_xml( @patch( "cdisc_rules_engine.services.data_services.LocalDataService.get_variables_metadata" ) +@patch("cdisc_rules_engine.services.data_services.LocalDataService.get_datasets") def test_validate_variable_metadata_against_define_xml( + mock_get_datasets: MagicMock, mock_get_variables_metadata: MagicMock, mock_get_define_xml_variables_metadata: MagicMock, define_xml_variable_validation_rule: dict, @@ -1500,12 +1531,12 @@ def test_validate_variable_metadata_against_define_xml( filename="test", full_path="CDISC01/test", ) + mock_get_datasets.return_value = [dataset_metadata] validation_result: List[dict] = RulesEngine( standard="sdtmig" ).validate_single_dataset( dataset_metadata=dataset_metadata, rule=define_xml_variable_validation_rule, - datasets=[dataset_metadata], ) assert validation_result == expected_validation_result @@ -1519,15 +1550,15 @@ def test_validate_variable_metadata_against_define_xml( [ { "domain": "AE", - "dataset": "ae_2.xpt", + "dataset": "AE_2", "executionStatus": ExecutionStatus.SKIPPED.value, "variables": [], - "message": "Rule skipped - doesn't apply to domain for rule id=MockRule, dataset=", + "message": "Rule skipped - doesn't apply to domain for rule id=MockRule, dataset=AE_2", "errors": [ { - "dataset": "ae_2.xpt", + "dataset": "AE_2", "error": "Outside scope", - "message": "Rule skipped - doesn't apply to domain for rule id=MockRule, dataset=", + "message": "Rule skipped - doesn't apply to domain for rule id=MockRule, dataset=AE_2", } ], } @@ -1539,24 +1570,24 @@ def test_validate_variable_metadata_against_define_xml( [ { "domain": "AE", - "dataset": "ae_1.xpt, ae_2.xpt", + "dataset": "AE_1, AE_2", "executionStatus": ExecutionStatus.ISSUE_REPORTED.value, "variables": ["AESTDY"], "errors": [ { - "dataset": "ae_2.xpt", + "dataset": "AE_2", "row": 1, "value": {"AESTDY": "test"}, "USUBJID": "1", }, { - "dataset": "ae_2.xpt", + "dataset": "AE_2", "row": 4, "value": {"AESTDY": "test"}, "USUBJID": "1", }, { - "dataset": "ae_1.xpt", + "dataset": "AE_1", "row": 4, "value": {"AESTDY": "test"}, "USUBJID": "2", @@ -1571,7 +1602,11 @@ def test_validate_variable_metadata_against_define_xml( @patch( "cdisc_rules_engine.services.data_services.LocalDataService._async_get_datasets", ) +@patch( + "cdisc_rules_engine.services.data_services.LocalDataService.get_datasets", +) def test_validate_split_dataset_contents( + mock_get_datasets: MagicMock, mock_async_get_datasets: MagicMock, dataset_rule_equal_to_error_objects: dict, include_split_datasets: bool, @@ -1641,22 +1676,24 @@ def test_validate_split_dataset_contents( mock_async_get_datasets.return_value = [first_dataset_part, second_dataset_part] datasets = [ SDTMDatasetMetadata( + name="AE_2", first_record={"DOMAIN": "AE"}, filename="ae_2.xpt", full_path="CDISC01/test/ae_2.xpt", ), SDTMDatasetMetadata( + name="AE_1", first_record={"DOMAIN": "AE"}, filename="ae_1.xpt", full_path="CDISC01/test/ae_1.xpt", ), ] + mock_get_datasets.return_value = datasets validation_result: List[dict] = RulesEngine( standard="sdtmig" ).validate_single_dataset( dataset_metadata=datasets[0], rule=dataset_rule_equal_to_error_objects, - datasets=datasets, ) # check validation result assert validation_result == result @@ -1668,7 +1705,11 @@ def test_validate_split_dataset_contents( @patch( "cdisc_rules_engine.services.data_services.LocalDataService.get_dataset_metadata", ) +@patch( + "cdisc_rules_engine.services.data_services.LocalDataService.get_datasets", +) def test_validate_split_dataset_metadata( + mock_get_datasets: MagicMock, mock_get_dataset_metadata: MagicMock, mock_async_get_datasets: MagicMock, dataset_metadata_not_equal_to_rule: dict, @@ -1719,29 +1760,35 @@ def test_validate_split_dataset_metadata( mock_get_dataset_metadata.return_value = second_dataset_part datasets = [ SDTMDatasetMetadata( - first_record={"DOMAIN": "EC"}, filename="ec_2.xpt", full_path="ec_2.xpt" + name="EC_2", + first_record={"DOMAIN": "EC"}, + filename="ec_2.xpt", + full_path="ec_2.xpt", ), SDTMDatasetMetadata( - first_record={"DOMAIN": "EC"}, filename="ec_1.xpt", full_path="ec_1.xpt" + name="EC_1", + first_record={"DOMAIN": "EC"}, + filename="ec_1.xpt", + full_path="ec_1.xpt", ), ] + mock_get_datasets.return_value = datasets validation_result: List[dict] = RulesEngine( standard="sdtmig" ).validate_single_dataset( dataset_metadata=datasets[1], rule=dataset_metadata_not_equal_to_rule, - datasets=datasets, ) # check validation result. # error is contained only in the second part of the dataset. assert validation_result == [ { "domain": "EC", - "dataset": "ec_1.xpt", + "dataset": "EC_1", "executionStatus": ExecutionStatus.ISSUE_REPORTED.value, "errors": [ { - "dataset": "ec_1.xpt", + "dataset": "EC_1", "row": 1, "value": { "dataset_label": "EC Label", @@ -1757,8 +1804,11 @@ def test_validate_split_dataset_metadata( @patch("cdisc_rules_engine.services.data_services.LocalDataService._async_get_datasets") +@patch("cdisc_rules_engine.services.data_services.LocalDataService.get_datasets") def test_validate_split_dataset_variables_metadata( - mock_async_get_datasets: MagicMock, variables_metadata_rule: dict + mock_get_datasets: MagicMock, + mock_async_get_datasets: MagicMock, + variables_metadata_rule: dict, ): """ Unit test for validating variables metadata of a split dataset. @@ -1793,32 +1843,34 @@ def test_validate_split_dataset_variables_metadata( ] datasets = [ SDTMDatasetMetadata( + name="EC_2", first_record={"DOMAIN": "EC"}, filename="ec_2.xpt", full_path="CDISC/test/ec_2.xpt", ), SDTMDatasetMetadata( + name="EC_1", first_record={"DOMAIN": "EC"}, filename="ec_1.xpt", full_path="CDISC/test/ec_1.xpt", ), ] + mock_get_datasets.return_value = datasets validation_result: List[dict] = RulesEngine( standard="sdtmig" ).validate_single_dataset( rule=variables_metadata_rule, - datasets=datasets, dataset_metadata=datasets[0], ) assert validation_result == [ { "domain": "EC", - "dataset": "ec_2.xpt", + "dataset": "EC_2", "executionStatus": ExecutionStatus.ISSUE_REPORTED.value, "variables": ["variable_name", "variable_label", "variable_data_type"], "errors": [ { - "dataset": "ec_2.xpt", + "dataset": "EC_2", "row": 1, "value": { "variable_label": ( @@ -1835,7 +1887,9 @@ def test_validate_split_dataset_variables_metadata( @patch("cdisc_rules_engine.services.data_services.LocalDataService.get_dataset_class") +@patch("cdisc_rules_engine.services.data_services.LocalDataService.get_datasets") def test_validate_record_in_parent_domain( + mock_get_datasets, mock_get_dataset_class, dataset_rule_record_in_parent_domain_equal_to: dict, ): @@ -1905,8 +1959,8 @@ def test_validate_record_in_parent_domain( ) ) path_to_dataset_map: dict = { - os.path.join("path", "ec.xpt"): ec_dataset, - os.path.join("path", "suppec.xpt"): suppec_dataset, + "EC": ec_dataset, + "SUPPEC": suppec_dataset, } mock_get_dataset_class.return_value = None with patch( @@ -1927,23 +1981,23 @@ def test_validate_record_in_parent_domain( full_path=os.path.join("path", "suppec.xpt"), ), ] + mock_get_datasets.return_value = datasets validation_result: List[str] = RulesEngine( standard="sdtmig", standard_version="3-4" ).validate_single_dataset( dataset_rule_record_in_parent_domain_equal_to, - datasets, datasets[0], ) assert validation_result == [ { "executionStatus": ExecutionStatus.ISSUE_REPORTED.value, "domain": "EC", - "dataset": "ec.xpt", + "dataset": "EC", "variables": ["ECREASOC", "ECPRESP"], "message": "Dataset contents is wrong.", "errors": [ { - "dataset": "ec.xpt", + "dataset": "EC", "row": 4, "value": {"ECPRESP": "Y", "ECREASOC": "Some Value 1"}, "USUBJID": "CDISC005", @@ -1955,8 +2009,11 @@ def test_validate_record_in_parent_domain( @patch("cdisc_rules_engine.services.data_services.LocalDataService.get_dataset_class") +@patch("cdisc_rules_engine.services.data_services.LocalDataService.get_datasets") def test_validate_additional_columns( - mock_get_dataset_class, dataset_rule_inconsistent_enumerated_columns: dict + mock_get_datasets, + mock_get_dataset_class, + dataset_rule_inconsistent_enumerated_columns: dict, ): """ Unit test for validating additional columns like TSVAL1, TSVAL2. @@ -1987,35 +2044,36 @@ def test_validate_additional_columns( "cdisc_rules_engine.services.data_services.LocalDataService.get_dataset", return_value=dataset, ): - datset_metadata = SDTMDatasetMetadata( + dataset_metadata = SDTMDatasetMetadata( + name="TS", first_record={"DOMAIN": "TS"}, filename="ts.xpt", full_path="CDISC01/test/ts.xpt", ) + mock_get_datasets.return_value = [dataset_metadata] validation_result: List[dict] = RulesEngine( standard="sdtmig", standard_version="3-4" ).validate_single_dataset( rule=dataset_rule_inconsistent_enumerated_columns, - datasets=[datset_metadata], - dataset_metadata=datset_metadata, + dataset_metadata=dataset_metadata, ) assert validation_result == [ { "executionStatus": ExecutionStatus.ISSUE_REPORTED.value, - "dataset": "ts.xpt", + "dataset": "TS", "domain": "TS", "variables": ["TSVAL"], "message": "Inconsistencies found in enumerated TSVAL columns.", "errors": [ { "value": {"TSVAL": None}, - "dataset": "ts.xpt", + "dataset": "TS", "row": 2, "USUBJID": "1", }, { "value": {"TSVAL": None}, - "dataset": "ts.xpt", + "dataset": "TS", "row": 4, "USUBJID": "1", }, @@ -2026,7 +2084,9 @@ def test_validate_additional_columns( @patch("cdisc_rules_engine.services.data_services.LocalDataService.get_dataset") @patch("cdisc_rules_engine.services.data_services.LocalDataService.get_dataset_class") +@patch("cdisc_rules_engine.services.data_services.LocalDataService.get_datasets") def test_validate_single_dataset_operation_dataset_larger_than_target_dataset( + mock_get_datasets: MagicMock, mock_get_dataset_class: MagicMock, mock_get_dataset: MagicMock, rule_distinct_operation_is_not_contained_by: dict, @@ -2077,8 +2137,8 @@ def test_validate_single_dataset_operation_dataset_larger_than_target_dataset( ) path_to_dataset_map: dict = { - os.path.join("study_id", "data_bundle_id", "ie.xpt"): target_dataset, - os.path.join("study_id", "data_bundle_id", "ti.xpt"): operation_result_dataset, + "IE": target_dataset, + "TI": operation_result_dataset, } mock_get_dataset.side_effect = lambda dataset_name: path_to_dataset_map[ dataset_name @@ -2098,17 +2158,17 @@ def test_validate_single_dataset_operation_dataset_larger_than_target_dataset( full_path=os.path.join("study_id", "data_bundle_id", "ti.xpt"), ), ] + mock_get_datasets.return_value = datasets validation_result: List[dict] = RulesEngine( standard="sdtmig", standard_version="3-4" ).validate_single_dataset( rule=rule_distinct_operation_is_not_contained_by, - datasets=datasets, dataset_metadata=datasets[0], ) assert validation_result == [ { "executionStatus": ExecutionStatus.SUCCESS.value, - "dataset": "ie.xpt", + "dataset": "IE", "domain": "IE", "variables": [], "message": None, @@ -2121,7 +2181,9 @@ def test_validate_single_dataset_operation_dataset_larger_than_target_dataset( @patch( "cdisc_rules_engine.services.data_services.LocalDataService.get_dataset_metadata" ) +@patch("cdisc_rules_engine.services.data_services.LocalDataService.get_datasets") def test_validate_extract_metadata_operation( + mock_get_datasets: MagicMock, mock_get_dataset_metadata: MagicMock, mock_get_dataset: MagicMock, rule_equal_to_with_extract_metadata_operation: dict, @@ -2171,19 +2233,19 @@ def test_validate_extract_metadata_operation( filename="suppec.xpt", full_path="study_id/data_bundle_id/suppec.xpt", ) + mock_get_datasets.return_value = [dataset_metadata] # run validation validation_result: List[dict] = RulesEngine( standard="sdtmig" ).validate_single_dataset( rule=rule_equal_to_with_extract_metadata_operation, - datasets=[dataset_metadata], dataset_metadata=dataset_metadata, ) assert validation_result == [ { "executionStatus": ExecutionStatus.ISSUE_REPORTED.value, - "dataset": "suppec.xpt", + "dataset": "SUPPEC", "domain": "SUPPEC", "variables": [ "RDOMAIN", @@ -2193,21 +2255,21 @@ def test_validate_extract_metadata_operation( ), "errors": [ { - "dataset": "suppec.xpt", + "dataset": "SUPPEC", "row": 1, "value": { "RDOMAIN": "EC", }, }, { - "dataset": "suppec.xpt", + "dataset": "SUPPEC", "row": 2, "value": { "RDOMAIN": "EC", }, }, { - "dataset": "suppec.xpt", + "dataset": "SUPPEC", "row": 3, "value": { "RDOMAIN": "EC", @@ -2219,7 +2281,9 @@ def test_validate_extract_metadata_operation( @patch("cdisc_rules_engine.services.data_services.LocalDataService.get_dataset") +@patch("cdisc_rules_engine.services.data_services.LocalDataService.get_datasets") def test_dataset_references_invalid_whodrug_terms( + mock_get_datasets: MagicMock, mock_get_dataset: MagicMock, rule_dataset_references_invalid_whodrug_terms: dict, installed_whodrug_dictionaries: dict, @@ -2250,8 +2314,12 @@ def test_dataset_references_invalid_whodrug_terms( {"classes": [{"name": "EVENTS", "datasets": [{"name": "AE"}]}]}, ) dataset_metadata = SDTMDatasetMetadata( - first_record={"DOMAIN": "AE"}, filename="dataset_path", full_path="dataset_path" + name="AE", + first_record={"DOMAIN": "AE"}, + filename="dataset_path", + full_path="dataset_path", ) + mock_get_datasets.return_value = [dataset_metadata] # run validation engine = RulesEngine( @@ -2269,14 +2337,13 @@ def test_dataset_references_invalid_whodrug_terms( ) validation_result: List[dict] = engine.validate_single_dataset( rule=rule_dataset_references_invalid_whodrug_terms, - datasets=[dataset_metadata], dataset_metadata=dataset_metadata, ) assert validation_result == [ { "executionStatus": ExecutionStatus.ISSUE_REPORTED.value, "domain": "AE", - "dataset": "dataset_path", + "dataset": "AE", "variables": [ "AEINA", ], @@ -2285,14 +2352,14 @@ def test_dataset_references_invalid_whodrug_terms( ), "errors": [ { - "dataset": "dataset_path", + "dataset": "AE", "row": 3, "value": { "AEINA": "A01AC", }, }, { - "dataset": "dataset_path", + "dataset": "AE", "row": 4, "value": { "AEINA": "A01AD", @@ -2308,7 +2375,9 @@ def test_dataset_references_invalid_whodrug_terms( "cdisc_rules_engine.services.data_services.LocalDataService.get_variables_metadata" ) @patch("cdisc_rules_engine.services.data_services.LocalDataService.get_dataset_class") +@patch("cdisc_rules_engine.services.data_services.LocalDataService.get_datasets") def test_validate_variables_order_against_library_metadata( + mock_get_datasets: MagicMock, mock_get_dataset_class: MagicMock, mock_get_variables_metadata: MagicMock, mock_get_dataset: MagicMock, @@ -2464,10 +2533,12 @@ def test_validate_variables_order_against_library_metadata( model_metadata=cache_data, standard_metadata=standard_data ) dataset_metadata = SDTMDatasetMetadata( + name="AE", first_record={"DOMAIN": "AE"}, filename="dataset_path", full_path="dataset_path", ) + mock_get_datasets.return_value = [dataset_metadata] # run validation engine = RulesEngine( cache=cache, @@ -2485,13 +2556,12 @@ def mock_cached_method(*args, **kwargs): ): result: List[dict] = engine.validate_single_dataset( rule_validate_columns_order_against_library_metadata, - [dataset_metadata], dataset_metadata, ) assert result == [ { "executionStatus": ExecutionStatus.ISSUE_REPORTED.value, - "dataset": "dataset_path", + "dataset": "AE", "domain": "AE", "variables": [ "$column_order_from_dataset", @@ -2522,7 +2592,7 @@ def mock_cached_method(*args, **kwargs): ], "STUDYID": "TEST_STUDY", }, - "dataset": "dataset_path", + "dataset": "AE", } ], } diff --git a/tests/unit/test_services/test_data_service/test_data_service.py b/tests/unit/test_services/test_data_service/test_data_service.py index 37da3e277..f2cb210d3 100644 --- a/tests/unit/test_services/test_data_service/test_data_service.py +++ b/tests/unit/test_services/test_data_service/test_data_service.py @@ -27,7 +27,9 @@ ) -@patch("cdisc_rules_engine.services.data_services.LocalDataService.read_metadata") +@patch( + "cdisc_rules_engine.services.data_services.LocalDataService._LocalDataService__read_metadata" +) def test_get_dataset_metadata(mock_read_metadata: MagicMock, dataset_metadata: dict): # mock file read mock_read_metadata.return_value = dataset_metadata @@ -36,9 +38,11 @@ def test_get_dataset_metadata(mock_read_metadata: MagicMock, dataset_metadata: d cache_mock = MagicMock() cache_mock.get = lambda cache_key: None - data_service = LocalDataService(cache_mock, MagicMock(), MagicMock()) + data_service = LocalDataService( + cache_mock, MagicMock(), MagicMock(), dataset_paths=["dataset_path"] + ) actual_metadata: PandasDataset = data_service.get_dataset_metadata( - dataset_name="dataset_name" + dataset_name=dataset_metadata["contents_metadata"]["dataset_name"] ) assert actual_metadata.equals( PandasDataset.from_dict( @@ -57,7 +61,9 @@ def test_get_dataset_metadata(mock_read_metadata: MagicMock, dataset_metadata: d ) -@patch("cdisc_rules_engine.services.data_services.LocalDataService.read_metadata") +@patch( + "cdisc_rules_engine.services.data_services.LocalDataService._LocalDataService__read_metadata" +) def test_get_raw_dataset_metadata( mock_read_metadata: MagicMock, dataset_metadata: dict ): @@ -68,9 +74,11 @@ def test_get_raw_dataset_metadata( cache_mock = MagicMock() cache_mock.get_dataset = lambda cache_key: None - data_service = LocalDataService(cache_mock, MagicMock(), MagicMock()) + data_service = LocalDataService( + cache_mock, MagicMock(), MagicMock(), dataset_paths=["dataset_path"] + ) actual_metadata: SDTMDatasetMetadata = data_service.get_raw_dataset_metadata( - dataset_name="dataset_name" + dataset_name=dataset_metadata["contents_metadata"]["dataset_name"] ) expected_metadata = SDTMDatasetMetadata( name=dataset_metadata["contents_metadata"]["dataset_name"], @@ -219,10 +227,9 @@ def test_get_dataset_class(dataset_metadata, data, expected_class): standard_version="3-4", library_metadata=library_metadata, ) + data_service.get_datasets = lambda: [SDTMDatasetMetadata(**dataset_metadata)] class_name = data_service.get_dataset_class( df, - dataset_metadata.get("filename"), - [SDTMDatasetMetadata(**dataset_metadata)], SDTMDatasetMetadata(**dataset_metadata), ) assert class_name == expected_class @@ -239,9 +246,7 @@ def test_get_dataset_class_without_standard_and_version(): first_record={"DOMAIN": "DM"}, filename="dm.xpt" ) with pytest.raises(Exception): - data_service.get_dataset_class( - df, "dm.xpt", [dataset_metadata], dataset_metadata - ) + data_service.get_dataset_class(df, [dataset_metadata], dataset_metadata) def test_get_dataset_class_associated_domains(): @@ -249,23 +254,29 @@ def test_get_dataset_class_associated_domains(): SDTMDatasetMetadata(**dataset) for dataset in [ { + "name": "APDM", "first_record": {"DOMAIN": "APDM", "APID": "AP001"}, "filename": "apdm.xpt", }, - {"first_record": {"DOMAIN": "DM"}, "filename": "dm.xpt"}, + {"name": "DM", "first_record": {"DOMAIN": "DM"}, "filename": "dm.xpt"}, ] ] ap_dataset = PandasDataset.from_dict({"DOMAIN": ["APDM"], "APID": ["test"]}) ce_dataset = PandasDataset.from_dict({"DOMAIN": ["DM"]}) - data_bundle_path = "cdisc/databundle" path_to_dataset_map: dict = { - os.path.join(data_bundle_path, "apdm.xpt"): ap_dataset, - os.path.join(data_bundle_path, "dm.xpt"): ce_dataset, + "APDM": ap_dataset, + "DM": ce_dataset, } - with patch( - "cdisc_rules_engine.services.data_services.LocalDataService.get_dataset", - return_value=ap_dataset, - side_effect=lambda dataset_name: path_to_dataset_map[dataset_name], + with ( + patch( + "cdisc_rules_engine.services.data_services.LocalDataService.get_dataset", + return_value=ap_dataset, + side_effect=lambda dataset_name: path_to_dataset_map[dataset_name], + ), + patch( + "cdisc_rules_engine.services.data_services.LocalDataService.get_datasets", + return_value=datasets, + ), ): library_metadata: LibraryMetadataContainer = get_library_metadata_from_cache( Validation_args( @@ -306,11 +317,8 @@ def test_get_dataset_class_associated_domains(): standard_version="3-4", library_metadata=library_metadata, ) - filepath = f"{data_bundle_path}/apdm.xpt" class_name = data_service.get_dataset_class( ap_dataset, - filepath, - datasets, datasets[0], ) assert class_name == SPECIAL_PURPOSE diff --git a/tests/unit/test_services/test_data_service/test_dummy_data_service.py b/tests/unit/test_services/test_data_service/test_dummy_data_service.py index f625306e8..4c2e6723b 100644 --- a/tests/unit/test_services/test_data_service/test_dummy_data_service.py +++ b/tests/unit/test_services/test_data_service/test_dummy_data_service.py @@ -50,7 +50,7 @@ def test_get_dataset(): data_service = DummyDataService( MagicMock(), MagicMock(), MagicMock(), data=datasets ) - dataset = data_service.get_dataset("ae.xpt") + dataset = data_service.get_dataset(dataset_name="AE") assert dataset["AESEQ"].to_list() == [ 1, 2, @@ -102,7 +102,7 @@ def test_get_dataset_metadata(): cache_mock = MagicMock() cache_mock.get_dataset.return_value = None data_service = DummyDataService(cache_mock, MagicMock(), MagicMock(), data=datasets) - metadata = data_service.get_dataset_metadata(dataset_name="ae.xpt") + metadata = data_service.get_dataset_metadata(dataset_name="AE") assert metadata["dataset_label"][0] == "ADVERSE EVENTS" assert metadata["dataset_name"][0] == "AE" assert metadata["dataset_size"][0] == 2000 @@ -131,7 +131,7 @@ def test_get_variables_metadata(): data_service = DummyDataService( MagicMock(), MagicMock(), MagicMock(), data=datasets ) - metadata = data_service.get_variables_metadata("/ae.xpt") + metadata = data_service.get_variables_metadata("AE") assert metadata["variable_name"].iloc[0] == "AESEQ" assert metadata["variable_label"].iloc[0] == "AE Sequence" assert metadata["variable_data_type"].iloc[0] == "integer" diff --git a/tests/unit/test_services/test_data_service/test_excel_data_service.py b/tests/unit/test_services/test_data_service/test_excel_data_service.py index 8fbf30226..3070999fe 100644 --- a/tests/unit/test_services/test_data_service/test_excel_data_service.py +++ b/tests/unit/test_services/test_data_service/test_excel_data_service.py @@ -50,14 +50,13 @@ def test_whitespace_get_dataset_raises(dataset_name): ) mock_cache = MagicMock() mock_cache.get_dataset.return_value = None - data_service = ExcelDataService.get_instance( - config=ConfigService(), - cache_service=mock_cache, - dataset_implementation=PandasDataset, - dataset_path=dataset_path, - ) with pytest.raises(ExcelTestDataError) as exc_info: - data_service.get_dataset(dataset_name=dataset_name) + ExcelDataService.get_instance( + config=ConfigService(), + cache_service=mock_cache, + dataset_implementation=PandasDataset, + dataset_path=dataset_path, + ) assert "leading/trailing whitespace" in str(exc_info.value.message) assert any(col in exc_info.value.message for col in ["STUDYID", "DOMAIN", "EXSEQ"]) @@ -96,7 +95,7 @@ def test_get_dataset_metadata(expected_result): cache_mock, MagicMock(), MagicMock(), dataset_path=dataset_path ) metadata = data_service.get_dataset_metadata( - dataset_name=expected_result["dataset_location"] + dataset_name=expected_result["dataset_name"] ) assert metadata["dataset_label"][0] == expected_result["dataset_label"] assert metadata["dataset_name"][0] == expected_result["dataset_name"] @@ -107,7 +106,7 @@ def test_get_dataset_metadata(expected_result): @pytest.mark.parametrize( "dataset_name", - ("ecaa.xpt", "ecbb.xpt", "suppec.xpt"), + ("ECAA", "ECBB", "SUPPEC"), ) def test_get_variables_metadata(dataset_name): dataset_path = f"{os.path.dirname(__file__)}/../../../resources/test_datasets.xlsx" @@ -119,7 +118,7 @@ def test_get_variables_metadata(dataset_name): dataset_implementation=PandasDataset, dataset_path=dataset_path, ) - data = data_service.get_variables_metadata(dataset_name=dataset_name, datasets=[]) + data = data_service.get_variables_metadata(dataset_name=dataset_name) assert isinstance(data, PandasDataset) expected_keys = [ "variable_name", @@ -185,7 +184,7 @@ def test_na_value_preserved_not_converted_to_nan(): ) # Get the dataset - dataset = data_service.get_dataset(dataset_name="test.xpt") + dataset = data_service.get_dataset(dataset_name="TEST") # Assertions assert isinstance(dataset, PandasDataset) @@ -213,7 +212,7 @@ def test_na_value_preserved_not_converted_to_nan(): def test_get_datasets_missing_datasets_sheet_raises_friendly_error(): """ When the workbook has no 'Datasets' sheet (e.g. tab named 'datasets' instead), - get_datasets() raises ExcelTestDataError with message that includes + initialization raises ExcelTestDataError with message that includes case-sensitive guidance. """ with tempfile.NamedTemporaryFile(suffix=".xlsx", delete=False) as tmp_file: @@ -238,12 +237,10 @@ def test_get_datasets_missing_datasets_sheet_raises_friendly_error(): mock_cache = MagicMock() mock_cache.get_dataset.return_value = None - data_service = ExcelDataService( - mock_cache, MagicMock(), MagicMock(), dataset_path=temp_path - ) - with pytest.raises(ExcelTestDataError) as exc_info: - data_service.get_datasets() + ExcelDataService( + mock_cache, MagicMock(), MagicMock(), dataset_path=temp_path + ) msg = str(exc_info.value) assert ExcelDataSheets.DATASETS_SHEET_NAME.value in msg @@ -254,7 +251,7 @@ def test_get_datasets_missing_datasets_sheet_raises_friendly_error(): def test_get_datasets_missing_label_column_raises_friendly_error(): """ When the 'Datasets' sheet exists but is missing the 'Label' column, - get_datasets() raises ExcelTestDataError with column names and + initialization raises ExcelTestDataError with column names and case-sensitive guidance. """ with tempfile.NamedTemporaryFile(suffix=".xlsx", delete=False) as tmp_file: @@ -280,12 +277,10 @@ def test_get_datasets_missing_label_column_raises_friendly_error(): mock_cache = MagicMock() mock_cache.get_dataset.return_value = None - data_service = ExcelDataService( - mock_cache, MagicMock(), MagicMock(), dataset_path=temp_path - ) - with pytest.raises(ExcelTestDataError) as exc_info: - data_service.get_datasets() + ExcelDataService( + mock_cache, MagicMock(), MagicMock(), dataset_path=temp_path + ) msg = str(exc_info.value) assert "Label" in msg diff --git a/tests/unit/test_services/test_data_service/test_local_data_service.py b/tests/unit/test_services/test_data_service/test_local_data_service.py index 1337897b5..07104ae48 100644 --- a/tests/unit/test_services/test_data_service/test_local_data_service.py +++ b/tests/unit/test_services/test_data_service/test_local_data_service.py @@ -15,7 +15,9 @@ def test_read_metadata(): """ dataset_path = f"{os.path.dirname(__file__)}/../../../resources/test_dataset.xpt" data_service = LocalDataService(MagicMock(), MagicMock(), MagicMock()) - metadata = data_service.read_metadata(dataset_path) + metadata = data_service._LocalDataService__read_metadata( # pyright: ignore[reportAttributeAccessIssue] + dataset_path + ) assert "file_metadata" in metadata assert metadata["file_metadata"].get("name") == "test_dataset.xpt" assert metadata["file_metadata"].get("file_size") == 823120 @@ -55,8 +57,11 @@ def test_get_dataset(dataset_implementation): config=ConfigService(), cache_service=mock_cache, dataset_implementation=dataset_implementation, + dataset_paths=[dataset_path], ) - data = data_service.get_dataset(dataset_name=dataset_path) + # Get the dataset name from metadata + dataset_name = list(data_service._datasets_metadata.keys())[0] + data = data_service.get_dataset(dataset_name=dataset_name) assert isinstance(data, dataset_implementation) @@ -74,8 +79,9 @@ def test_get_variables_metadata(dataset_implementation): config=ConfigService(), cache_service=mock_cache, dataset_implementation=dataset_implementation, + dataset_paths=[dataset_path], ) - data = data_service.get_variables_metadata(dataset_name=dataset_path, datasets=[]) + data = data_service.get_variables_metadata(dataset_name="TEST_ADAM_DATASET") assert isinstance(data, dataset_implementation) expected_keys = [ "variable_name", @@ -89,13 +95,12 @@ def test_get_variables_metadata(dataset_implementation): def test_get_datasets_raises_invalid_dataset_format_when_file_cannot_be_read(): - """get_datasets() raises InvalidDatasetFormat with user-friendly message when a file cannot be read.""" + """LocalDataService __init__ raises InvalidDatasetFormat with user-friendly message when a file cannot be read.""" mock_cache = MagicMock() mock_cache.get_dataset.return_value = None - data_service = LocalDataService( - mock_cache, MagicMock(), MagicMock(), dataset_paths=["/bad/path.xpt"] - ) with pytest.raises(InvalidDatasetFormat) as exc_info: - data_service.get_datasets() + LocalDataService( + mock_cache, MagicMock(), MagicMock(), dataset_paths=["/bad/path.xpt"] + ) assert "Your data file could not be read" in str(exc_info.value) assert "/bad/path.xpt" in str(exc_info.value) diff --git a/tests/unit/test_services/test_reporting/test_report_factory.py b/tests/unit/test_services/test_reporting/test_report_factory.py index b5eff0091..7c96680a0 100644 --- a/tests/unit/test_services/test_reporting/test_report_factory.py +++ b/tests/unit/test_services/test_reporting/test_report_factory.py @@ -11,7 +11,6 @@ def test_get_report_services(): Unit test for ReportFactory.get_report_services """ factory = ReportFactory( - datasets=[], results=[], elapsed_time=10.5, args=MagicMock( diff --git a/tests/unit/test_usdm_data.py b/tests/unit/test_usdm_data.py index c2461b0e9..0123fbee1 100644 --- a/tests/unit/test_usdm_data.py +++ b/tests/unit/test_usdm_data.py @@ -28,9 +28,8 @@ def test_list_dataset_metadata_with_valid_paths(self): os.path.join("tests", "resources", dataset_file), ], ) - expected_output = """[ - { - "domain": null, + expected_output = """{ + "domain": "null", "filename": "USDM_EliLilly_NCT03421379_Diabetes.json",""" self.assertEqual(result.exit_code, 0) self.assertIn(expected_output, result.output) @@ -58,8 +57,7 @@ def test_get_dataset(domain_name, record_count): data_service = USDMDataService.get_instance( config=ConfigService(), cache_service=mock_cache, dataset_path=dataset_path ) - dataset_name = os.path.join(dataset_path, "{}.json".format(domain_name)) - data = data_service.get_dataset(dataset_name=dataset_name) + data = data_service.get_dataset(dataset_name=domain_name) assert isinstance(data, PandasDataset) assert len(data) == record_count @@ -71,9 +69,7 @@ def test_get_raw_dataset_metadata(): data_service = USDMDataService.get_instance( config=ConfigService(), cache_service=cache, dataset_path=dataset_path ) - data = data_service.get_raw_dataset_metadata( - dataset_name=os.path.join(dataset_path, "Code.json") - ) + data = data_service.get_raw_dataset_metadata(dataset_name="Code") assert data.record_count == 117 @@ -97,7 +93,6 @@ def test_validate_rule_single_dataset_check(dataset_rule_greater_than: dict): standard="usdm", dataset_paths=[dataset_path] ).validate_single_dataset( dataset_rule_greater_than, - [], SDTMDatasetMetadata( name="EC", first_record={"DOMAIN": "EC"}, @@ -109,19 +104,19 @@ def test_validate_rule_single_dataset_check(dataset_rule_greater_than: dict): assert validation_result == [ { "executionStatus": ExecutionStatus.ISSUE_REPORTED.value, - "dataset": "USDM_EliLilly_NCT03421379_Diabetes.json", + "dataset": "EC", "domain": "EC", "variables": ["ECCOOLVAR"], "message": "Value for ECCOOLVAR greater than 30.", "errors": [ { "value": {"ECCOOLVAR": 100}, - "dataset": "USDM_EliLilly_NCT03421379_Diabetes.json", + "dataset": "EC", "row": 2, }, { "value": {"ECCOOLVAR": 34}, - "dataset": "USDM_EliLilly_NCT03421379_Diabetes.json", + "dataset": "EC", "row": 4, }, ], @@ -136,9 +131,7 @@ def test_get_variables_metadata(): data_service = USDMDataService.get_instance( config=ConfigService(), cache_service=mock_cache, dataset_path=dataset_path ) - data = data_service.get_variables_metadata( - dataset_name=os.path.join(dataset_path, "StudyIdentifier.json") - ) + data = data_service.get_variables_metadata(dataset_name="StudyIdentifier") assert isinstance(data, PandasDataset) expected_keys = [ "variable_name", diff --git a/tests/unit/test_utilities/test_rule_processor.py b/tests/unit/test_utilities/test_rule_processor.py index a345e5785..e0f0ed23a 100644 --- a/tests/unit/test_utilities/test_rule_processor.py +++ b/tests/unit/test_utilities/test_rule_processor.py @@ -362,6 +362,7 @@ def test_rule_applies_to_class( processor = RuleProcessor(mock_data_service, InMemoryCacheService()) dataset_mock = PandasDataset.from_dict(data) mock_data_service.get_dataset_class.return_value = class_name + mock_data_service.get_datasets.return_value = datasets with patch( "cdisc_rules_engine.services.data_services.LocalDataService.get_dataset", return_value=dataset_mock, @@ -369,7 +370,6 @@ def test_rule_applies_to_class( assert ( processor.rule_applies_to_class( rule_metadata, - datasets, SDTMDatasetMetadata(*datasets[0]), ) == outcome @@ -377,62 +377,47 @@ def test_rule_applies_to_class( @pytest.mark.parametrize( - "dataset_name, domain, rdomain, rule_use_case, use_case, standard, standard_substandard, outcome", + "rule_use_case, use_case, standard, outcome", [ # Basic use case tests - user provides "INDH" or "PROD" - ("AE", "AE", None, "INDH, PROD", "INDH", "tig", "SDTM", True), - ("AE", "AE", None, "INDH, PROD", "PROD", "tig", "SDTM", True), - ("CM", "CM", None, "INDH", "INDH", "tig", "SDTM", True), - ("TS", "TS", None, "INDH", "INDH", "tig", "SDTM", True), - ("ES", "ES", None, "PROD", "PROD", "tig", "SDTM", True), - ("ES", "ES", None, "PROD", "INDH", "tig", "SDTM", False), - ("BW", "BW", None, "NONCLIN", "NONCLIN", "tig", "SEND", True), - ("BW", "BW", None, "NONCLIN", "INDH", "tig", "SEND", False), + ("INDH, PROD", "INDH", "tig", True), + ("INDH, PROD", "PROD", "tig", True), + ("INDH", "INDH", "tig", True), + ("INDH", "INDH", "tig", True), + ("PROD", "PROD", "tig", True), + ("PROD", "INDH", "tig", False), + ("NONCLIN", "NONCLIN", "tig", True), + ("NONCLIN", "INDH", "tig", False), # Tests for ADaM datasets - ("ADSL", "ADSL", None, "ANALYSIS", "ANALYSIS", "tig", "ADAM", True), - ("ADAE", "ADAE", None, "ANALYSIS", "ANALYSIS", "tig", "ADAM", True), - ("ADAE", "ADAE", None, "ANALYSIS", "INDH", "tig", "ADAM", False), + ("ANALYSIS", "ANALYSIS", "tig", True), + ("ANALYSIS", "ANALYSIS", "tig", True), + ("ANALYSIS", "INDH", "tig", False), # Tests for supplementary datasets - ("SUPPAE", None, "AE", "INDH", "INDH", "tig", "SDTM", True), - ("SUPPQS", None, "QS", "INDH", "INDH", "tig", "SDTM", True), - ("SUPPEC", None, "EC", "INDH", "INDH", "tig", "SDTM", True), - ("SUPP--", None, "AE", "INDH", "INDH", "tig", "SDTM", True), - ("SUPPPT", None, "PT", "PROD", "PROD", "tig", "SDTM", True), + ("INDH", "INDH", "tig", True), + ("INDH", "INDH", "tig", True), + ("INDH", "INDH", "tig", True), + ("INDH", "INDH", "tig", True), + ("PROD", "PROD", "tig", True), # Tests for empty/None use cases in rule (should always return True) - ("AE", "AE", None, "", "INDH", "tig", "SDTM", True), - ("AE", "AE", None, None, "INDH", "tig", "SDTM", True), + ("", "INDH", "tig", True), + (None, "INDH", "tig", True), # Tests for non-TIG standard (should always return True) - ("AE", "AE", None, "INDH", "INDH", "sdtmig", "SDTM", True), - ("BW", "BW", None, "NONCLIN", "NONCLIN", "sendct", "SEND", True), + ("INDH", "INDH", "sdtmig", True), + ("NONCLIN", "NONCLIN", "sendct", True), # Test case mismatch - ("AE", "AE", None, "INDH, PROD", "SAFETY", "tig", "SDTM", False), + ("INDH, PROD", "SAFETY", "tig", False), ], ) def test_rule_applies_to_use_case( mock_data_service, - dataset_name, - domain, - rdomain, rule_use_case, standard, - standard_substandard, use_case, outcome, ): processor = RuleProcessor(mock_data_service, InMemoryCacheService()) rule = {"use_case": rule_use_case} - dataset_metadata = SDTMDatasetMetadata( - name=dataset_name, - first_record=( - {"DOMAIN": domain, "RDOMAIN": rdomain} if domain or rdomain else {} - ), - ) - assert ( - processor.rule_applies_to_use_case( - dataset_metadata, rule, standard, standard_substandard, use_case - ) - == outcome - ) + assert processor.rule_applies_to_use_case(rule, standard, use_case) == outcome @pytest.mark.parametrize("dataset_implementation", [PandasDataset, DaskDataset]) @@ -496,12 +481,12 @@ def test_perform_rule_operation(mock_data_service, dataset_implementation): ) ] mock_data_service.get_dataset.return_value = df + mock_data_service.get_datasets.return_value = datasets_metadata processor = RuleProcessor(mock_data_service, InMemoryCacheService()) result = processor.perform_rule_operations( rule, df, datasets_metadata[0], - datasets_metadata, standard="sdtmig", standard_version="3-1-2", standard_substandard=None, @@ -592,12 +577,12 @@ def test_perform_rule_operation_with_grouping( ] mock_data_service.get_dataset.return_value = df + mock_data_service.get_datasets.return_value = datasets_metadata processor = RuleProcessor(mock_data_service, InMemoryCacheService()) data = processor.perform_rule_operations( rule, df, datasets_metadata[0], - datasets_metadata, standard="sdtmig", standard_version="3-1-2", standard_substandard=None, @@ -709,12 +694,12 @@ def test_perform_rule_operation_with_multi_key_grouping( ] mock_data_service.get_dataset.return_value = df + mock_data_service.get_datasets.return_value = datasets_metadata processor = RuleProcessor(mock_data_service, InMemoryCacheService()) data = processor.perform_rule_operations( rule, df, datasets_metadata[0], - datasets_metadata, standard="sdtmig", standard_version="3-1-2", standard_substandard=None, @@ -766,12 +751,12 @@ def test_perform_rule_operation_with_null_operations( label="Adverse Events", ) ] + mock_data_service.get_datasets.return_value = datasets_metadata processor = RuleProcessor(mock_data_service, InMemoryCacheService()) new_data = processor.perform_rule_operations( rule, df, datasets_metadata[0], - datasets_metadata, standard="sdtmig", standard_version="3-1-2", standard_substandard=None, @@ -790,9 +775,6 @@ def test_preprocess_operation_params_wildcard_replacement(mock_data_service): target="--SEQ", original_target="--SEQ", domain="AE", - dataset_path="test/ae.xpt", - directory_path="test/", - datasets=[], standard="sdtmig", standard_version="3-4", grouping=["--SEQ", "--DTC", "USUBJID"], @@ -823,9 +805,6 @@ def test_preprocess_operation_params_supp_domain_uses_rdomain(mock_data_service) target="--SEQ", original_target="--SEQ", domain=None, - dataset_path="test/suppae.xpt", - directory_path="test/", - datasets=[], standard="sdtmig", standard_version="3-4", ) @@ -884,7 +863,14 @@ def test_perform_extract_metadata_operation( ], } ) - + datasets_metadata = [ + SDTMDatasetMetadata( + name="SUPPEC", + first_record={"RDOMAIN": "EC"}, + filename="suppec.xpt", + full_path="study/data_bundle/suppec.xpt", + ) + ] mock = MagicMock() mock.get_dataset.return_value = dataset mock.get_dataset_metadata.return_value = dataset_implementation.from_dict( @@ -894,20 +880,12 @@ def test_perform_extract_metadata_operation( ], } ) + mock.get_datasets.return_value = datasets_metadata processor = RuleProcessor(mock, InMemoryCacheService()) - datasets_metadata = [ - SDTMDatasetMetadata( - name="SUPPEC", - first_record={"RDOMAIN": "EC"}, - filename="suppec.xpt", - full_path="study/data_bundle/suppec.xpt", - ) - ] dataset_after_operation = processor.perform_rule_operations( rule=rule_equal_to_with_extract_metadata_operation, dataset=dataset, dataset_metadata=datasets_metadata[0], - datasets=datasets_metadata, standard="sdtmig", standard_version="3-1-2", standard_substandard=None, diff --git a/tests/unit/test_utilities/test_sdtm_utils.py b/tests/unit/test_utilities/test_sdtm_utils.py index 73a1da9f7..aee50aa8b 100644 --- a/tests/unit/test_utilities/test_sdtm_utils.py +++ b/tests/unit/test_utilities/test_sdtm_utils.py @@ -29,7 +29,7 @@ def library_metadata(): def mock_data_service(): """Mock data service for tests that require it.""" mock_service = Mock() - mock_service._handle_custom_domains = Mock(return_value=None) + mock_service.handle_custom_domains = Mock(return_value=None) return mock_service @@ -39,42 +39,34 @@ def mock_datasets(): return [] -def test_standard_domain_ae(library_metadata, mock_data_service, mock_datasets): +def test_standard_domain_ae(library_metadata, mock_data_service): dataset_metadata = SDTMDatasetMetadata(name="AE", first_record={"DOMAIN": "AE"}) variables = get_variables_metadata_from_standard( library_metadata, mock_data_service, dataset_metadata, - "/path/to/ae.xpt", - mock_datasets, ) assert any(var["name"] == "STUDYID" for var in variables) assert any(var["name"] == "AETERM" for var in variables) assert any(var["name"] == "AESTDTC" for var in variables) -def test_standard_domain_dm(library_metadata, mock_data_service, mock_datasets): +def test_standard_domain_dm(library_metadata, mock_data_service): dataset_metadata = SDTMDatasetMetadata(name="DM", first_record={"DOMAIN": "DM"}) variables = get_variables_metadata_from_standard( - library_metadata, - mock_data_service, - dataset_metadata, - "/path/to/dm.xpt", - mock_datasets, + library_metadata, mock_data_service, dataset_metadata ) assert any(var["name"] == "USUBJID" for var in variables) assert any(var["name"] == "AGE" for var in variables) assert any(var["name"] == "SEX" for var in variables) -def test_findings_domain_lb(library_metadata, mock_data_service, mock_datasets): +def test_findings_domain_lb(library_metadata, mock_data_service): dataset_metadata = SDTMDatasetMetadata(name="LB", first_record={"DOMAIN": "LB"}) variables = get_variables_metadata_from_standard( library_metadata, mock_data_service, dataset_metadata, - "/path/to/lb.xpt", - mock_datasets, ) assert any(var["name"] == "STUDYID" for var in variables) assert any(var["name"] == "USUBJID" for var in variables) @@ -82,42 +74,36 @@ def test_findings_domain_lb(library_metadata, mock_data_service, mock_datasets): assert any(var["name"] == "LBORRES" for var in variables) -def test_supp_domain(library_metadata, mock_data_service, mock_datasets): +def test_supp_domain(library_metadata, mock_data_service): dataset_metadata = SDTMDatasetMetadata(name="SUPPAE", first_record={"QNAM": "TEST"}) variables = get_variables_metadata_from_standard( library_metadata, mock_data_service, dataset_metadata, - "/path/to/suppae.xpt", - mock_datasets, ) assert any(var["name"] == "STUDYID" for var in variables) assert any(var["name"] == "QNAM" for var in variables) assert any(var["name"] == "QLABEL" for var in variables) -def test_sq_domain(library_metadata, mock_data_service, mock_datasets): +def test_sq_domain(library_metadata, mock_data_service): dataset_metadata = SDTMDatasetMetadata(name="SQAE", first_record={"QNAM": "TEST"}) variables = get_variables_metadata_from_standard( library_metadata, mock_data_service, dataset_metadata, - "/path/to/sqae.xpt", - mock_datasets, ) assert any(var["name"] == "STUDYID" for var in variables) assert any(var["name"] == "QNAM" for var in variables) assert any(var["name"] == "QLABEL" for var in variables) -def test_ap_domain(library_metadata, mock_data_service, mock_datasets): +def test_ap_domain(library_metadata, mock_data_service): dataset_metadata = SDTMDatasetMetadata(name="APDM", first_record={"APID": "001"}) variables = get_variables_metadata_from_standard( library_metadata, mock_data_service, dataset_metadata, - "/path/to/apdm.xpt", - mock_datasets, ) assert any(var["name"] == "APID" for var in variables) assert not any(var["name"] == "USUBJID" for var in variables) @@ -126,7 +112,7 @@ def test_ap_domain(library_metadata, mock_data_service, mock_datasets): assert any(var["name"] == "DMDY" for var in variables) -def test_sqap_domain(library_metadata, mock_data_service, mock_datasets): +def test_sqap_domain(library_metadata, mock_data_service): dataset_metadata = SDTMDatasetMetadata( name="SQAPMH", first_record={"QNAM": "TEST", "RDOMAIN": "APMH"} ) @@ -134,36 +120,30 @@ def test_sqap_domain(library_metadata, mock_data_service, mock_datasets): library_metadata, mock_data_service, dataset_metadata, - "/path/to/sqapmh.xpt", - mock_datasets, ) assert any(var["name"] == "APID" for var in variables) assert not any(var["name"] == "USUBJID" for var in variables) assert any(var["name"] == "RDOMAIN" for var in variables) -def test_findings_about_domain_fa(library_metadata, mock_data_service, mock_datasets): +def test_findings_about_domain_fa(library_metadata, mock_data_service): """Test Findings About domain includes FINDINGS class variables.""" dataset_metadata = SDTMDatasetMetadata(name="FA", first_record={"DOMAIN": "FA"}) variables = get_variables_metadata_from_standard( library_metadata, mock_data_service, dataset_metadata, - "/path/to/fa.xpt", - mock_datasets, ) assert any(var["name"] == "FATEST" for var in variables) assert any(var["name"] == "FAOBJ" for var in variables) # Tests for get_variables_metadata_from_standard_model -def test_findings_domain_from_model(library_metadata, mock_data_service, mock_datasets): +def test_findings_domain_from_model(library_metadata, mock_data_service): mock_dataframe = Mock() dataset_metadata = SDTMDatasetMetadata(name="LB", first_record={"DOMAIN": "LB"}) variables = get_variables_metadata_from_standard_model( dataframe=mock_dataframe, - datasets=mock_datasets, - dataset_path="/path/to/lb.xpt", data_service=mock_data_service, library_metadata=library_metadata, dataset_metadata=dataset_metadata, @@ -172,14 +152,12 @@ def test_findings_domain_from_model(library_metadata, mock_data_service, mock_da assert any(var["name"] == "LBTEST" for var in variables) -def test_supp_domain_from_model(library_metadata, mock_data_service, mock_datasets): +def test_supp_domain_from_model(library_metadata, mock_data_service): """Test retrieving variables for SUPP domain from model.""" mock_dataframe = Mock() dataset_metadata = SDTMDatasetMetadata(name="SUPPAE", first_record={"QNAM": "TEST"}) variables = get_variables_metadata_from_standard_model( dataframe=mock_dataframe, - datasets=mock_datasets, - dataset_path="/path/to/suppae.xpt", data_service=mock_data_service, library_metadata=library_metadata, dataset_metadata=dataset_metadata, @@ -188,14 +166,12 @@ def test_supp_domain_from_model(library_metadata, mock_data_service, mock_datase assert any(var["name"] == "IDVAR" for var in variables) -def test_sqap_domain_from_model(library_metadata, mock_data_service, mock_datasets): +def test_sqap_domain_from_model(library_metadata, mock_data_service): """Test retrieving variables for SUPP domain from model.""" mock_dataframe = Mock() dataset_metadata = SDTMDatasetMetadata(name="SQAP", first_record={"QNAM": "TEST"}) variables = get_variables_metadata_from_standard_model( dataframe=mock_dataframe, - datasets=mock_datasets, - dataset_path="/path/to/suppae.xpt", data_service=mock_data_service, library_metadata=library_metadata, dataset_metadata=dataset_metadata, @@ -204,14 +180,12 @@ def test_sqap_domain_from_model(library_metadata, mock_data_service, mock_datase assert any(var["name"] == "APID" for var in variables) -def test_ap_domain_from_model(library_metadata, mock_data_service, mock_datasets): +def test_ap_domain_from_model(library_metadata, mock_data_service): """Test AP domain excludes USUBJID and includes APID.""" mock_dataframe = Mock() dataset_metadata = SDTMDatasetMetadata(name="APDM", first_record={"APID": "001"}) variables = get_variables_metadata_from_standard_model( dataframe=mock_dataframe, - datasets=mock_datasets, - dataset_path="/path/to/apdm.xpt", data_service=mock_data_service, library_metadata=library_metadata, dataset_metadata=dataset_metadata, @@ -222,38 +196,32 @@ def test_ap_domain_from_model(library_metadata, mock_data_service, mock_datasets assert any(var["name"] == "DMDY" for var in variables) -def test_custom_domain_events_class(library_metadata, mock_data_service, mock_datasets): +def test_custom_domain_events_class(library_metadata, mock_data_service): """Test custom domain detection and variable metadata retrieval for EVENTS class.""" dataset_metadata = SDTMDatasetMetadata(name="ZZ", first_record={"DOMAIN": "ZZ"}) - mock_data_service._handle_custom_domains = Mock(return_value="EVENTS") + mock_data_service.handle_custom_domains = Mock(return_value="EVENTS") variables = get_variables_metadata_from_standard( library_metadata, mock_data_service, dataset_metadata, - "/path/to/zz.xpt", - mock_datasets, ) - mock_data_service._handle_custom_domains.assert_called_once() + mock_data_service.handle_custom_domains.assert_called_once() assert any(var["name"] == "STUDYID" for var in variables) assert any(var["name"] == "DOMAIN" for var in variables) assert any(var["name"] == "ZZTERM" for var in variables) assert any(var["name"] == "ZZSEQ" for var in variables) -def test_custom_domain_findings_class( - library_metadata, mock_data_service, mock_datasets -): +def test_custom_domain_findings_class(library_metadata, mock_data_service): """Test custom domain detection and variable metadata retrieval for FINDINGS class.""" dataset_metadata = SDTMDatasetMetadata(name="XX", first_record={"DOMAIN": "XX"}) - mock_data_service._handle_custom_domains = Mock(return_value="FINDINGS") + mock_data_service.handle_custom_domains = Mock(return_value="FINDINGS") variables = get_variables_metadata_from_standard( library_metadata, mock_data_service, dataset_metadata, - "/path/to/xx.xpt", - mock_datasets, ) - mock_data_service._handle_custom_domains.assert_called_once() + mock_data_service.handle_custom_domains.assert_called_once() assert any(var["name"] == "STUDYID" for var in variables) assert any(var["name"] == "DOMAIN" for var in variables) assert any(var["name"] == "USUBJID" for var in variables)