From 5fcd3de28e7f318fffb31b9e8eabe91dde45a1ef Mon Sep 17 00:00:00 2001 From: Samuel Johnson Date: Mon, 16 Mar 2026 19:31:30 -0400 Subject: [PATCH 1/6] error handling and removed if conditional --- .../utilities/data_processor.py | 23 +++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/cdisc_rules_engine/utilities/data_processor.py b/cdisc_rules_engine/utilities/data_processor.py index 04d6dbe46..9e4f7c3ee 100644 --- a/cdisc_rules_engine/utilities/data_processor.py +++ b/cdisc_rules_engine/utilities/data_processor.py @@ -17,6 +17,7 @@ DataServiceFactory, DummyDataService, ) +from cdisc_rules_engine.exceptions.custom_exceptions import PreprocessingError from cdisc_rules_engine.utilities.utils import ( search_in_list_of_dicts, ) @@ -211,6 +212,9 @@ def merge_pivot_supp_dataset( for key in static_keys if key in left_dataset.columns and key in right_dataset.columns ] + DataProcessor._validate_merge_key_overlap( + left_dataset, right_dataset, common_keys + ) if not is_blank: common_keys.append(dynamic_key) current_supp = right_dataset.rename(columns={"IDVARVAL": dynamic_key}) @@ -287,6 +291,9 @@ def _merge_supp_with_multiple_idvars( for key in static_keys if key in result_dataset.columns and key in group_data.columns ] + DataProcessor._validate_merge_key_overlap( + result_dataset, group_data, common_keys + ) common_keys.append(idvar_value) agg_dict = { @@ -480,3 +487,19 @@ def column_metadata_equal_to_define_and_library( @staticmethod def is_dummy_data(data_service: DataServiceInterface) -> bool: return isinstance(data_service, DummyDataService) + + @staticmethod + def _validate_merge_key_overlap( + left_dataset: DatasetInterface, + right_dataset: DatasetInterface, + common_keys: List[str], + ): + for key in common_keys: + left_values = set(left_dataset[key].dropna().unique()) + right_values = set(right_dataset[key].dropna().unique()) + if left_values and right_values and left_values.isdisjoint(right_values): + raise PreprocessingError( + f"SUPP merge key '{key}' has no overlapping values between " + f"parent dataset and SUPP dataset. " + f"Parent values: {left_values}, SUPP values: {right_values}." + ) From 7772a526427f80a746cfa6a49512512ba7fa763c Mon Sep 17 00:00:00 2001 From: Samuel Johnson Date: Mon, 16 Mar 2026 19:45:29 -0400 Subject: [PATCH 2/6] added test --- tests/unit/test_dataset_preprocessor.py | 65 ++++++++++++++++++++++++- 1 file changed, 64 insertions(+), 1 deletion(-) diff --git a/tests/unit/test_dataset_preprocessor.py b/tests/unit/test_dataset_preprocessor.py index 2b238ce9d..65fe8b519 100644 --- a/tests/unit/test_dataset_preprocessor.py +++ b/tests/unit/test_dataset_preprocessor.py @@ -17,7 +17,7 @@ from cdisc_rules_engine.models.library_metadata_container import ( LibraryMetadataContainer, ) - +from cdisc_rules_engine.exceptions.custom_exceptions import PreprocessingError from cdisc_rules_engine.models.dataset import PandasDataset @@ -1291,6 +1291,69 @@ def test_dm_merged_with_suppdm_without_dupes( assert result.data.loc[0, ["RACE1", "RACE2", "RACE3"]].notna().all() +@patch("cdisc_rules_engine.services.data_services.LocalDataService.get_dataset") +def test_preprocess_suppae_mismatched_studyid_raises_key_error(mock_get_dataset): + ae_dataset = PandasDataset( + pd.DataFrame( + { + "STUDYID": ["CDISC-PILOT-01"], + "DOMAIN": ["AE"], + "USUBJID": ["S001"], + "AESEQ": [1], + "AETERM": ["Headache"], + } + ) + ) + suppae_dataset = PandasDataset( + pd.DataFrame( + { + "STUDYID": ["COMPLETELY-DIFFERENT-STUDY"], + "RDOMAIN": ["AE"], + "USUBJID": ["S001"], + "IDVAR": ["AESEQ"], + "IDVARVAL": ["1"], + "QNAM": ["TEST"], + "QLABEL": ["Test"], + "QVAL": ["A"], + } + ) + ) + + mock_get_dataset.return_value = suppae_dataset + rule = { + "core_id": "TestRule", + "datasets": [{"domain_name": "SUPPAE", "match_key": ["USUBJID"]}], + "conditions": ConditionCompositeFactory.get_condition_composite( + { + "all": [ + { + "name": "get_dataset", + "operator": "equal_to", + "value": {"target": "TEST", "comparator": "A"}, + } + ] + } + ), + } + datasets = [ + SDTMDatasetMetadata( + name="SUPPAE", first_record={"RDOMAIN": "AE"}, filename="suppae.xpt" + ) + ] + data_service = LocalDataService(MagicMock(), MagicMock(), MagicMock()) + preprocessor = DatasetPreprocessor( + ae_dataset, + SDTMDatasetMetadata(first_record={"DOMAIN": "AE"}, full_path="path"), + data_service, + InMemoryCacheService(), + ) + + with pytest.raises( + PreprocessingError, match="SUPP merge key 'STUDYID' has no overlapping values" + ): + preprocessor.preprocess(rule, datasets) + + def test_relrec_processed_correctly_with_others(rule_with_specific_supp): ec_meta = SDTMDatasetMetadata( name="EC", From 4a5fa2c6cc00979c2876f421a30a178cd4567a46 Mon Sep 17 00:00:00 2001 From: Samuel Johnson Date: Thu, 9 Apr 2026 13:38:10 -0400 Subject: [PATCH 3/6] tables to datasets.csv --- README.md | 2 +- cdisc_rules_engine/models/validation_args.py | 2 +- .../services/csv_metadata_reader.py | 20 ++--- .../data_services/data_service_factory.py | 6 +- .../data_services/local_data_service.py | 6 +- core.py | 42 ++++----- env.example | 8 +- scripts/run_validation.py | 2 +- .../datasets/{tables.csv => datasets.csv} | 0 tests/unit/test_csv_reader.py | 90 +++++++++---------- 10 files changed, 92 insertions(+), 86 deletions(-) rename tests/resources/CoreIssue1558/datasets/{tables.csv => datasets.csv} (100%) diff --git a/README.md b/README.md index e311d5466..6a648dded 100644 --- a/README.md +++ b/README.md @@ -247,7 +247,7 @@ This will show the list of validation options. -ft, --filetype TEXT File extension to filter datasets. Has higher priority than --dataset-path parameter. -vcp, --variables-csv-path Path to variables.csv. Used when multiple dataset paths are provided and refer to different folders. Not required if variables.txt exists in all -dp directories. - -tcp, --tables-csv-path Path to tables.csv. Required when multiple dataset paths are provided and refer to different folders. + -dcp, --datasets-csv-path Path to datasets.csv. Required when multiple dataset paths are provided and refer to different folders. --help Show this message and exit. ``` diff --git a/cdisc_rules_engine/models/validation_args.py b/cdisc_rules_engine/models/validation_args.py index 9bbb5b12b..dd05a3e3d 100644 --- a/cdisc_rules_engine/models/validation_args.py +++ b/cdisc_rules_engine/models/validation_args.py @@ -29,6 +29,6 @@ "max_errors_per_rule", "encoding", "variables_csv_path", - "tables_csv_path", + "datasets_csv_path", ], ) diff --git a/cdisc_rules_engine/services/csv_metadata_reader.py b/cdisc_rules_engine/services/csv_metadata_reader.py index beca70647..ccc26c76e 100644 --- a/cdisc_rules_engine/services/csv_metadata_reader.py +++ b/cdisc_rules_engine/services/csv_metadata_reader.py @@ -14,7 +14,7 @@ def __init__( file_name: str, encoding: str = DEFAULT_ENCODING, variables_csv_path: str = None, - tables_csv_path: str = None, + datasets_csv_path: str = None, **kwargs, ): self.file_path = file_path @@ -25,10 +25,10 @@ def __init__( if variables_csv_path else Path(self.file_path).parent / "variables.csv" ) - self.tables_csv_path = ( - Path(tables_csv_path) - if tables_csv_path - else Path(self.file_path).parent / "tables.csv" + self.datasets_csv_path = ( + Path(datasets_csv_path) + if datasets_csv_path + else Path(self.file_path).parent / "datasets.csv" ) def read(self) -> dict: @@ -111,11 +111,11 @@ def __get_variable_metadata( def __dataset_label(self) -> dict: logger = logging.getLogger("validator") - if not self.tables_csv_path.exists(): + if not self.datasets_csv_path.exists(): return {} try: - tables_df = pd.read_csv(self.tables_csv_path, encoding=self.encoding) + datasets_df = pd.read_csv(self.datasets_csv_path, encoding=self.encoding) except (UnicodeDecodeError, UnicodeError) as e: logger.error( f"\n Error reading CSV from: {self.file_path}" @@ -127,15 +127,15 @@ def __dataset_label(self) -> dict: logger.error("Error reading CSV file %s. %s", self.file_path, e) return {} - if "Filename" not in tables_df.columns or "Label" not in tables_df.columns: + if "Filename" not in datasets_df.columns or "Label" not in datasets_df.columns: return {} - tables_df["dataset"] = tables_df["Filename"].apply( + datasets_df["dataset"] = datasets_df["Filename"].apply( lambda x: Path(str(x)).stem.lower() ) current_dataset = Path(self.file_name).stem.lower() - match = tables_df[tables_df["dataset"] == current_dataset] + match = datasets_df[datasets_df["dataset"] == current_dataset] if match.empty: return {} diff --git a/cdisc_rules_engine/services/data_services/data_service_factory.py b/cdisc_rules_engine/services/data_services/data_service_factory.py index b7bdf4f6b..d0d5a08c7 100644 --- a/cdisc_rules_engine/services/data_services/data_service_factory.py +++ b/cdisc_rules_engine/services/data_services/data_service_factory.py @@ -39,7 +39,7 @@ def __init__( max_dataset_size: int = 0, encoding: str = None, variables_csv_path: str = None, - tables_csv_path=None, + datasets_csv_path=None, ): if config.getValue("DATA_SERVICE_TYPE"): self.data_service_name = config.getValue("DATA_SERVICE_TYPE") @@ -56,7 +56,7 @@ def __init__( self.max_dataset_size = max_dataset_size self.encoding = encoding self.variables_csv_path = variables_csv_path - self.tables_csv_path = tables_csv_path + self.datasets_csv_path = datasets_csv_path self.dataset_size_threshold = self.config.get_dataset_size_threshold() def get_data_service( @@ -103,7 +103,7 @@ def get_data_service( dataset_implementation=self.get_dataset_implementation(), encoding=self.encoding, variables_csv_path=self.variables_csv_path, - tables_csv_path=self.tables_csv_path, + datasets_csv_path=self.datasets_csv_path, ) def get_dummy_data_service(self, data: List[DummyDataset]) -> DataServiceInterface: diff --git a/cdisc_rules_engine/services/data_services/local_data_service.py b/cdisc_rules_engine/services/data_services/local_data_service.py index b5dfa8819..3f0b2d8bc 100644 --- a/cdisc_rules_engine/services/data_services/local_data_service.py +++ b/cdisc_rules_engine/services/data_services/local_data_service.py @@ -49,7 +49,7 @@ def __init__( self.dataset_paths: Iterable[str] = kwargs.get("dataset_paths", []) self.encoding: str = kwargs.get("encoding") self.variables_csv_path: str = kwargs.get("variables_csv_path") - self.tables_csv_path: str = kwargs.get("tables_csv_path") + self.datasets_csv_path: str = kwargs.get("datasets_csv_path") @classmethod def get_instance( @@ -215,7 +215,7 @@ def read_metadata( file_name, encoding=self.encoding, variables_csv_path=self.variables_csv_path, - tables_csv_path=self.tables_csv_path, + datasets_csv_path=self.datasets_csv_path, ).read() return { "file_metadata": file_metadata, @@ -252,7 +252,7 @@ def get_datasets(self) -> List[dict]: dataset_metadata = self.get_raw_dataset_metadata( dataset_name=dataset_path, variables_csv_path=self.variables_csv_path, - tables_csv_path=self.tables_csv_path, + datasets_csv_path=self.datasets_csv_path, ) datasets.append(dataset_metadata) except InvalidDatasetFormat: diff --git a/core.py b/core.py index e6add635f..309b9b082 100644 --- a/core.py +++ b/core.py @@ -110,38 +110,40 @@ def _validate_csv_data_paths( dataset_paths: list[str], encoding: str = DEFAULT_ENCODING ) -> list[str]: """ - Filters dataset paths based on tables.csv content. + Filters dataset paths based on datasets.csv content. - Raises InvalidCSVFile error if there are no proper tables.csv files in provided path. + Raises InvalidCSVFile error if there are no proper datasets.csv files in provided path. - Keeps only datasets listed in tables.csv (Filename column). - Always excludes tables.csv and variables.csv from result. + Keeps only datasets listed in datasets.csv (Filename column). + Always excludes datasets.csv and variables.csv from result. """ import pandas as pd paths = [Path(p) for p in dataset_paths] - tables_path = list({p for p in paths if p.name.lower() == "tables.csv"}) - if len(tables_path) > 1: - raise InvalidCSVFile("There is more than one tables.csv file in provided path.") - elif len(tables_path) == 0: - raise InvalidCSVFile("There is no tables.csv file in provided path.") + datasets_path = list({p for p in paths if p.name.lower() == "datasets.csv"}) + if len(datasets_path) > 1: + raise InvalidCSVFile( + "There is more than one datasets.csv file in provided path." + ) + elif len(datasets_path) == 0: + raise InvalidCSVFile("There is no datasets.csv file in provided path.") else: - tables_path = tables_path[0] + datasets_path = datasets_path[0] dataset_files = [ - p for p in paths if p.name.lower() not in ("tables.csv", "variables.csv") + p for p in paths if p.name.lower() not in ("datasets.csv", "variables.csv") ] - tables_df = pd.read_csv(tables_path, encoding=encoding) + datasets_df = pd.read_csv(datasets_path, encoding=encoding) - if "Filename" not in tables_df.columns or "Label" not in tables_df.columns: + if "Filename" not in datasets_df.columns or "Label" not in datasets_df.columns: raise InvalidCSVFile( "Metadata files is malformed. One of [Filename, Label] columns is missing." ) allowed_datasets = { - Path(str(name)).stem.lower() for name in tables_df["Filename"].dropna() + Path(str(name)).stem.lower() for name in datasets_df["Filename"].dropna() } filtered = { @@ -235,7 +237,7 @@ def _validate_dataset_paths( [ str(p) for p in dp_path.parent.glob("*") - if p.is_file() and p.name in {"tables.csv", "variables.csv"} + if p.is_file() and p.name in {"datasets.csv", "variables.csv"} ] ) try: @@ -539,10 +541,10 @@ def load_custom_dotenv_from_data_options(ctx, param, value): help="Path to variables.csv", ) @click.option( - "-tcp", - "--tables-csv-path", + "-dcp", + "--datasets-csv-path", required=False, - help="Path to tables.csv", + help="Path to datasets.csv", ) @click.pass_context def validate( # noqa @@ -583,7 +585,7 @@ def validate( # noqa max_errors_per_rule: tuple[int, bool], encoding: str, variables_csv_path: str, - tables_csv_path: str, + datasets_csv_path: str, ): """ Validate data using CDISC Rules Engine @@ -692,7 +694,7 @@ def validate( # noqa max_errors_per_rule, encoding, variables_csv_path, - tables_csv_path, + datasets_csv_path, ) ) diff --git a/env.example b/env.example index df2051558..3e87d175c 100644 --- a/env.example +++ b/env.example @@ -2,5 +2,9 @@ CDISC_LIBRARY_API_KEY=your_api_key_here DATASET_SIZE_THRESHOLD=10485760 # max dataset size in bytes to force dask implementation MAX_REPORT_ROWS = 10 # integer for maximum number of issues per excel sheet (plus headers) in result report. Defaults to 10000. MAX_ERRORS_PER_RULE = (10, True) # Tuple for maximum number of errors to report per rule during a validation run. Also has a per dataset flag described as second bool value in readme. example value -DEFINE_XML -DATA_DIR \ No newline at end of file +DEFINE_XML = define.xml path +CT = controlled terminology package +PRODUCT= standard +VERSION= version, denoted with a dash i.e. 3-4 +SUBSTANDARD= TIG substandard +USE_CASE= TIG use case \ No newline at end of file diff --git a/scripts/run_validation.py b/scripts/run_validation.py index 5f10dd3e6..750edd1ab 100644 --- a/scripts/run_validation.py +++ b/scripts/run_validation.py @@ -176,7 +176,7 @@ def run_validation(args: Validation_args): library_metadata=library_metadata, encoding=args.encoding, variables_csv_path=args.variables_csv_path, - tables_csv_path=args.tables_csv_path, + datasets_csv_path=args.datasets_csv_path, ).get_data_service(args.dataset_paths) # install dictionaries if needed dictionary_versions = fill_cache_with_dictionaries( diff --git a/tests/resources/CoreIssue1558/datasets/tables.csv b/tests/resources/CoreIssue1558/datasets/datasets.csv similarity index 100% rename from tests/resources/CoreIssue1558/datasets/tables.csv rename to tests/resources/CoreIssue1558/datasets/datasets.csv diff --git a/tests/unit/test_csv_reader.py b/tests/unit/test_csv_reader.py index 2b6cb6f5b..13d31caaf 100644 --- a/tests/unit/test_csv_reader.py +++ b/tests/unit/test_csv_reader.py @@ -17,31 +17,31 @@ DEFAULT_ENCODING = "utf-8" -def test_no_tables_csv_raises_error(): +def test_no_datasets_csv_raises_error(): paths = ["/data/variables.csv", "/data/dm.csv"] with pytest.raises(InvalidCSVFile): _validate_csv_data_paths(paths) -class TestTablesCsvMissingFilenameColumn: +class TestDatasetsCsvMissingFilenameColumn: def test_non_csv_files_excluded_when_no_filename_col(self, tmp_path): - tables_csv = tmp_path / "tables.csv" - pd.DataFrame({"Name": ["dm"]}).to_csv(tables_csv, index=False) + datasets_csv = tmp_path / "datasets.csv" + pd.DataFrame({"Name": ["dm"]}).to_csv(datasets_csv, index=False) dm = tmp_path / "dm.csv" readme = tmp_path / "readme.txt" dm.touch() readme.touch() with pytest.raises(InvalidCSVFile): - _validate_csv_data_paths([str(tables_csv), str(dm), str(readme)]) + _validate_csv_data_paths([str(datasets_csv), str(dm), str(readme)]) -class TestTablesCsvWithFilenameColumn: +class TestDatasetsCsvWithFilenameColumn: def test_keeps_only_allowed_datasets(self, tmp_path): - tables_csv = tmp_path / "tables.csv" + datasets_csv = tmp_path / "datasets.csv" pd.DataFrame( {"Filename": ["dm.csv", "customers.csv"], "Label": ["test1", "test2"]} - ).to_csv(tables_csv, index=False) + ).to_csv(datasets_csv, index=False) dm = tmp_path / "dm.csv" customers = tmp_path / "customers.csv" @@ -50,128 +50,128 @@ def test_keeps_only_allowed_datasets(self, tmp_path): f.touch() result = _validate_csv_data_paths( - [str(tables_csv), str(dm), str(customers), str(orders)] + [str(datasets_csv), str(dm), str(customers), str(orders)] ) assert sorted(result) == sorted([str(dm), str(customers)]) assert str(orders) not in result def test_variables_csv_excluded_even_if_listed(self, tmp_path): - tables_csv = tmp_path / "tables.csv" + datasets_csv = tmp_path / "datasets.csv" pd.DataFrame( {"Filename": ["variables.csv", "dm.csv"], "Label": ["test1", "test2"]} - ).to_csv(tables_csv, index=False) + ).to_csv(datasets_csv, index=False) variables = tmp_path / "variables.csv" dm = tmp_path / "dm.csv" variables.touch() dm.touch() - result = _validate_csv_data_paths([str(tables_csv), str(variables), str(dm)]) + result = _validate_csv_data_paths([str(datasets_csv), str(variables), str(dm)]) assert str(variables) not in result assert str(dm) in result def test_filename_with_path_prefix_uses_stem_matching(self, tmp_path): """Filename 'subdir/dm.csv' -> stem 'dm' -> matches 'dm.csv' on disk.""" - tables_csv = tmp_path / "tables.csv" + datasets_csv = tmp_path / "datasets.csv" pd.DataFrame({"Filename": ["subdir/dm.csv"], "Label": ["test1"]}).to_csv( - tables_csv, index=False + datasets_csv, index=False ) dm = tmp_path / "dm.csv" dm.touch() - result = _validate_csv_data_paths([str(tables_csv), str(dm)]) + result = _validate_csv_data_paths([str(datasets_csv), str(dm)]) assert str(dm) in result def test_nan_filenames_are_ignored(self, tmp_path): - tables_csv = tmp_path / "tables.csv" + datasets_csv = tmp_path / "datasets.csv" pd.DataFrame({"Filename": ["dm.csv", None], "Label": ["test1", None]}).to_csv( - tables_csv, index=False + datasets_csv, index=False ) dm = tmp_path / "dm.csv" unknown = tmp_path / "unknown.csv" dm.touch() unknown.touch() - result = _validate_csv_data_paths([str(tables_csv), str(dm), str(unknown)]) + result = _validate_csv_data_paths([str(datasets_csv), str(dm), str(unknown)]) assert str(dm) in result assert str(unknown) not in result def test_no_matching_files_returns_empty(self, tmp_path): - tables_csv = tmp_path / "tables.csv" + datasets_csv = tmp_path / "datasets.csv" pd.DataFrame({"Filename": ["nonexistent.csv"], "Label": ["test1"]}).to_csv( - tables_csv, index=False + datasets_csv, index=False ) dm = tmp_path / "dm.csv" dm.touch() - assert _validate_csv_data_paths([str(tables_csv), str(dm)]) == [] + assert _validate_csv_data_paths([str(datasets_csv), str(dm)]) == [] def test_non_csv_files_excluded_even_if_stem_matches(self, tmp_path): - tables_csv = tmp_path / "tables.csv" + datasets_csv = tmp_path / "datasets.csv" pd.DataFrame({"Filename": ["dm.csv"], "Label": ["test1"]}).to_csv( - tables_csv, index=False + datasets_csv, index=False ) dm_xlsx = tmp_path / "dm.xlsx" dm_xlsx.touch() - assert _validate_csv_data_paths([str(tables_csv), str(dm_xlsx)]) == [] + assert _validate_csv_data_paths([str(datasets_csv), str(dm_xlsx)]) == [] def test_encoding_is_forwarded_to_read_csv(self, tmp_path): - tables_csv = tmp_path / "tables.csv" + datasets_csv = tmp_path / "datasets.csv" pd.DataFrame({"Filename": ["dm.csv"], "Label": ["test1"]}).to_csv( - tables_csv, index=False + datasets_csv, index=False ) (tmp_path / "dm.csv").touch() with patch("pandas.read_csv", wraps=pd.read_csv) as mock_read: _validate_csv_data_paths( - [str(tables_csv), str(tmp_path / "dm.csv")], encoding="latin-1" + [str(datasets_csv), str(tmp_path / "dm.csv")], encoding="latin-1" ) - mock_read.assert_called_once_with(tables_csv, encoding="latin-1") + mock_read.assert_called_once_with(datasets_csv, encoding="latin-1") class TestEdgeCases: - def test_only_tables_csv_in_input(self, tmp_path): - tables_csv = tmp_path / "tables.csv" + def test_only_datasets_csv_in_input(self, tmp_path): + datasets_csv = tmp_path / "datasets.csv" pd.DataFrame({"Filename": ["dm.csv"], "Label": ["test1"]}).to_csv( - tables_csv, index=False + datasets_csv, index=False ) - assert _validate_csv_data_paths([str(tables_csv)]) == [] + assert _validate_csv_data_paths([str(datasets_csv)]) == [] def test_only_variables_csv_in_input(self): with pytest.raises(InvalidCSVFile): _validate_csv_data_paths(["/data/variables.csv"]) def test_empty_filename_column_returns_empty(self, tmp_path): - tables_csv = tmp_path / "tables.csv" + datasets_csv = tmp_path / "datasets.csv" pd.DataFrame( {"Filename": pd.Series([], dtype=str), "Label": pd.Series([], dtype=str)} - ).to_csv(tables_csv, index=False) + ).to_csv(datasets_csv, index=False) dm = tmp_path / "dm.csv" dm.touch() - assert _validate_csv_data_paths([str(tables_csv), str(dm)]) == [] + assert _validate_csv_data_paths([str(datasets_csv), str(dm)]) == [] def test_all_filename_values_nan_returns_empty(self, tmp_path): - tables_csv = tmp_path / "tables.csv" + datasets_csv = tmp_path / "datasets.csv" pd.DataFrame({"Filename": [None, None], "Label": [None, None]}).to_csv( - tables_csv, index=False + datasets_csv, index=False ) dm = tmp_path / "dm.csv" dm.touch() - assert _validate_csv_data_paths([str(tables_csv), str(dm)]) == [] + assert _validate_csv_data_paths([str(datasets_csv), str(dm)]) == [] def test_duplicate_paths_removed(self, tmp_path): """The function does not deduplicate; duplicates in -> duplicates out.""" - tables_csv = tmp_path / "tables.csv" + datasets_csv = tmp_path / "datasets.csv" pd.DataFrame({"Filename": ["dm.csv"], "Label": ["test1"]}).to_csv( - tables_csv, index=False + datasets_csv, index=False ) dm = tmp_path / "dm.csv" dm.touch() - paths = [str(tables_csv), str(dm), str(dm)] + paths = [str(datasets_csv), str(dm), str(dm)] result = _validate_csv_data_paths(paths) assert result.count(str(dm)) == 1 @@ -194,7 +194,7 @@ def test_duplicate_paths_removed(self, tmp_path): """ ) -TABLES_CSV = textwrap.dedent( +DATASETS_CSV = textwrap.dedent( """\ Filename,Label patients.csv,Patient Dataset @@ -336,14 +336,14 @@ def test_returns_partial_meta_when_no_variables_file(self, caplog): assert result["first_record"] == {"age": "30", "id": "1", "name": "Alice"} assert "No variables file found" in caplog.text - def test_dataset_label_added_when_tables_csv_present(self): + def test_dataset_label_added_when_datasets_csv_present(self): _write(self._variables_path(), VARIABLES_CSV) - _write(Path(self.tmpdir) / "tables.csv", TABLES_CSV) + _write(Path(self.tmpdir) / "datasets.csv", DATASETS_CSV) reader = DatasetCSVMetadataReader(str(self.data_path), "patients.csv") result = reader.read() assert result.get("dataset_label") == "Patient Dataset" - def test_no_dataset_label_when_tables_csv_absent(self): + def test_no_dataset_label_when_datasets_csv_absent(self): _write(self._variables_path(), VARIABLES_CSV) reader = DatasetCSVMetadataReader(str(self.data_path), "patients.csv") result = reader.read() From cca456075935b8290da93ad28809eb03daeed465 Mon Sep 17 00:00:00 2001 From: Samuel Johnson Date: Fri, 17 Apr 2026 09:51:41 -0400 Subject: [PATCH 4/6] data_processor main --- .../utilities/data_processor.py | 22 ------------------- 1 file changed, 22 deletions(-) diff --git a/cdisc_rules_engine/utilities/data_processor.py b/cdisc_rules_engine/utilities/data_processor.py index 885917ca5..5a928cdb0 100644 --- a/cdisc_rules_engine/utilities/data_processor.py +++ b/cdisc_rules_engine/utilities/data_processor.py @@ -223,9 +223,6 @@ def merge_pivot_supp_dataset( for key in static_keys if key in left_dataset.columns and key in right_dataset.columns ] - DataProcessor._validate_merge_key_overlap( - left_dataset, right_dataset, common_keys - ) if not is_blank: left_dataset[temp_key] = left_dataset[dynamic_key] @@ -314,9 +311,6 @@ def _merge_supp_with_multiple_idvars( for key in static_keys if key in result_dataset.columns and key in group_data.columns ] - DataProcessor._validate_merge_key_overlap( - result_dataset, group_data, common_keys - ) common_keys.append(idvar_value) agg_dict = { @@ -516,19 +510,3 @@ def column_metadata_equal_to_define_and_library( @staticmethod def is_dummy_data(data_service: DataServiceInterface) -> bool: return isinstance(data_service, DummyDataService) - - @staticmethod - def _validate_merge_key_overlap( - left_dataset: DatasetInterface, - right_dataset: DatasetInterface, - common_keys: List[str], - ): - for key in common_keys: - left_values = set(left_dataset[key].dropna().unique()) - right_values = set(right_dataset[key].dropna().unique()) - if left_values and right_values and left_values.isdisjoint(right_values): - raise PreprocessingError( - f"SUPP merge key '{key}' has no overlapping values between " - f"parent dataset and SUPP dataset. " - f"Parent values: {left_values}, SUPP values: {right_values}." - ) From d3197b053dece12c7f8aedc8f2147fbcf758f217 Mon Sep 17 00:00:00 2001 From: Samuel Johnson Date: Tue, 21 Apr 2026 10:42:10 -0400 Subject: [PATCH 5/6] metadata filenames --- README.md | 6 +-- .../services/csv_metadata_reader.py | 4 +- core.py | 20 +++++----- tests/unit/test_csv_reader.py | 38 +++++++++---------- 4 files changed, 34 insertions(+), 34 deletions(-) diff --git a/README.md b/README.md index 6a648dded..4340249dc 100644 --- a/README.md +++ b/README.md @@ -245,9 +245,9 @@ This will show the list of validation options. -jcf, --jsonata-custom-functions Pair containing a variable name and a Path to directory containing a set of custom JSONata functions. Can be specified multiple times -e, --encoding TEXT File encoding for reading datasets. If not specified, defaults to utf-8. Supported encodings: utf-8, utf-16, utf-32, cp1252, latin-1, etc. -ft, --filetype TEXT File extension to filter datasets. Has higher priority than --dataset-path parameter. - -vcp, --variables-csv-path Path to variables.csv. Used when multiple dataset paths are provided and refer to different folders. - Not required if variables.txt exists in all -dp directories. - -dcp, --datasets-csv-path Path to datasets.csv. Required when multiple dataset paths are provided and refer to different folders. + -vcp, --variables-csv-path Path to _variables.csv. Used when multiple dataset paths are provided and refer to different folders. + Not required if _variables.txt exists in all -dp directories. + -dcp, --datasets-csv-path Path to _datasets.csv. Required when multiple dataset paths are provided and refer to different folders. --help Show this message and exit. ``` diff --git a/cdisc_rules_engine/services/csv_metadata_reader.py b/cdisc_rules_engine/services/csv_metadata_reader.py index ccc26c76e..16427e76c 100644 --- a/cdisc_rules_engine/services/csv_metadata_reader.py +++ b/cdisc_rules_engine/services/csv_metadata_reader.py @@ -23,12 +23,12 @@ def __init__( self.variables_csv_path = ( Path(variables_csv_path) if variables_csv_path - else Path(self.file_path).parent / "variables.csv" + else Path(self.file_path).parent / "_variables.csv" ) self.datasets_csv_path = ( Path(datasets_csv_path) if datasets_csv_path - else Path(self.file_path).parent / "datasets.csv" + else Path(self.file_path).parent / "_datasets.csv" ) def read(self) -> dict: diff --git a/core.py b/core.py index 309b9b082..ea1ff2ce6 100644 --- a/core.py +++ b/core.py @@ -110,29 +110,29 @@ def _validate_csv_data_paths( dataset_paths: list[str], encoding: str = DEFAULT_ENCODING ) -> list[str]: """ - Filters dataset paths based on datasets.csv content. + Filters dataset paths based on _datasets.csv content. - Raises InvalidCSVFile error if there are no proper datasets.csv files in provided path. + Raises InvalidCSVFile error if there are no proper _datasets.csv files in provided path. - Keeps only datasets listed in datasets.csv (Filename column). - Always excludes datasets.csv and variables.csv from result. + Keeps only datasets listed in _datasets.csv (Filename column). + Always excludes _datasets.csv and _variables.csv from result. """ import pandas as pd paths = [Path(p) for p in dataset_paths] - datasets_path = list({p for p in paths if p.name.lower() == "datasets.csv"}) + datasets_path = list({p for p in paths if p.name.lower() == "_datasets.csv"}) if len(datasets_path) > 1: raise InvalidCSVFile( - "There is more than one datasets.csv file in provided path." + "There is more than one _datasets.csv file in provided path." ) elif len(datasets_path) == 0: - raise InvalidCSVFile("There is no datasets.csv file in provided path.") + raise InvalidCSVFile("There is no _datasets.csv file in provided path.") else: datasets_path = datasets_path[0] dataset_files = [ - p for p in paths if p.name.lower() not in ("datasets.csv", "variables.csv") + p for p in paths if p.name.lower() not in ("_datasets.csv", "_variables.csv") ] datasets_df = pd.read_csv(datasets_path, encoding=encoding) @@ -538,13 +538,13 @@ def load_custom_dotenv_from_data_options(ctx, param, value): "-vcp", "--variables-csv-path", required=False, - help="Path to variables.csv", + help="Path to _variables.csv", ) @click.option( "-dcp", "--datasets-csv-path", required=False, - help="Path to datasets.csv", + help="Path to _datasets.csv", ) @click.pass_context def validate( # noqa diff --git a/tests/unit/test_csv_reader.py b/tests/unit/test_csv_reader.py index 13d31caaf..5cb7f6cdc 100644 --- a/tests/unit/test_csv_reader.py +++ b/tests/unit/test_csv_reader.py @@ -18,14 +18,14 @@ def test_no_datasets_csv_raises_error(): - paths = ["/data/variables.csv", "/data/dm.csv"] + paths = ["/data/_variables.csv", "/data/dm.csv"] with pytest.raises(InvalidCSVFile): _validate_csv_data_paths(paths) class TestDatasetsCsvMissingFilenameColumn: def test_non_csv_files_excluded_when_no_filename_col(self, tmp_path): - datasets_csv = tmp_path / "datasets.csv" + datasets_csv = tmp_path / "_datasets.csv" pd.DataFrame({"Name": ["dm"]}).to_csv(datasets_csv, index=False) dm = tmp_path / "dm.csv" @@ -38,7 +38,7 @@ def test_non_csv_files_excluded_when_no_filename_col(self, tmp_path): class TestDatasetsCsvWithFilenameColumn: def test_keeps_only_allowed_datasets(self, tmp_path): - datasets_csv = tmp_path / "datasets.csv" + datasets_csv = tmp_path / "_datasets.csv" pd.DataFrame( {"Filename": ["dm.csv", "customers.csv"], "Label": ["test1", "test2"]} ).to_csv(datasets_csv, index=False) @@ -56,11 +56,11 @@ def test_keeps_only_allowed_datasets(self, tmp_path): assert str(orders) not in result def test_variables_csv_excluded_even_if_listed(self, tmp_path): - datasets_csv = tmp_path / "datasets.csv" + datasets_csv = tmp_path / "_datasets.csv" pd.DataFrame( - {"Filename": ["variables.csv", "dm.csv"], "Label": ["test1", "test2"]} + {"Filename": ["_variables.csv", "dm.csv"], "Label": ["test1", "test2"]} ).to_csv(datasets_csv, index=False) - variables = tmp_path / "variables.csv" + variables = tmp_path / "_variables.csv" dm = tmp_path / "dm.csv" variables.touch() dm.touch() @@ -71,7 +71,7 @@ def test_variables_csv_excluded_even_if_listed(self, tmp_path): def test_filename_with_path_prefix_uses_stem_matching(self, tmp_path): """Filename 'subdir/dm.csv' -> stem 'dm' -> matches 'dm.csv' on disk.""" - datasets_csv = tmp_path / "datasets.csv" + datasets_csv = tmp_path / "_datasets.csv" pd.DataFrame({"Filename": ["subdir/dm.csv"], "Label": ["test1"]}).to_csv( datasets_csv, index=False ) @@ -82,7 +82,7 @@ def test_filename_with_path_prefix_uses_stem_matching(self, tmp_path): assert str(dm) in result def test_nan_filenames_are_ignored(self, tmp_path): - datasets_csv = tmp_path / "datasets.csv" + datasets_csv = tmp_path / "_datasets.csv" pd.DataFrame({"Filename": ["dm.csv", None], "Label": ["test1", None]}).to_csv( datasets_csv, index=False ) @@ -96,7 +96,7 @@ def test_nan_filenames_are_ignored(self, tmp_path): assert str(unknown) not in result def test_no_matching_files_returns_empty(self, tmp_path): - datasets_csv = tmp_path / "datasets.csv" + datasets_csv = tmp_path / "_datasets.csv" pd.DataFrame({"Filename": ["nonexistent.csv"], "Label": ["test1"]}).to_csv( datasets_csv, index=False ) @@ -106,7 +106,7 @@ def test_no_matching_files_returns_empty(self, tmp_path): assert _validate_csv_data_paths([str(datasets_csv), str(dm)]) == [] def test_non_csv_files_excluded_even_if_stem_matches(self, tmp_path): - datasets_csv = tmp_path / "datasets.csv" + datasets_csv = tmp_path / "_datasets.csv" pd.DataFrame({"Filename": ["dm.csv"], "Label": ["test1"]}).to_csv( datasets_csv, index=False ) @@ -117,7 +117,7 @@ def test_non_csv_files_excluded_even_if_stem_matches(self, tmp_path): assert _validate_csv_data_paths([str(datasets_csv), str(dm_xlsx)]) == [] def test_encoding_is_forwarded_to_read_csv(self, tmp_path): - datasets_csv = tmp_path / "datasets.csv" + datasets_csv = tmp_path / "_datasets.csv" pd.DataFrame({"Filename": ["dm.csv"], "Label": ["test1"]}).to_csv( datasets_csv, index=False ) @@ -132,7 +132,7 @@ def test_encoding_is_forwarded_to_read_csv(self, tmp_path): class TestEdgeCases: def test_only_datasets_csv_in_input(self, tmp_path): - datasets_csv = tmp_path / "datasets.csv" + datasets_csv = tmp_path / "_datasets.csv" pd.DataFrame({"Filename": ["dm.csv"], "Label": ["test1"]}).to_csv( datasets_csv, index=False ) @@ -140,10 +140,10 @@ def test_only_datasets_csv_in_input(self, tmp_path): def test_only_variables_csv_in_input(self): with pytest.raises(InvalidCSVFile): - _validate_csv_data_paths(["/data/variables.csv"]) + _validate_csv_data_paths(["/data/_variables.csv"]) def test_empty_filename_column_returns_empty(self, tmp_path): - datasets_csv = tmp_path / "datasets.csv" + datasets_csv = tmp_path / "_datasets.csv" pd.DataFrame( {"Filename": pd.Series([], dtype=str), "Label": pd.Series([], dtype=str)} ).to_csv(datasets_csv, index=False) @@ -153,7 +153,7 @@ def test_empty_filename_column_returns_empty(self, tmp_path): assert _validate_csv_data_paths([str(datasets_csv), str(dm)]) == [] def test_all_filename_values_nan_returns_empty(self, tmp_path): - datasets_csv = tmp_path / "datasets.csv" + datasets_csv = tmp_path / "_datasets.csv" pd.DataFrame({"Filename": [None, None], "Label": [None, None]}).to_csv( datasets_csv, index=False ) @@ -164,7 +164,7 @@ def test_all_filename_values_nan_returns_empty(self, tmp_path): def test_duplicate_paths_removed(self, tmp_path): """The function does not deduplicate; duplicates in -> duplicates out.""" - datasets_csv = tmp_path / "datasets.csv" + datasets_csv = tmp_path / "_datasets.csv" pd.DataFrame({"Filename": ["dm.csv"], "Label": ["test1"]}).to_csv( datasets_csv, index=False ) @@ -215,7 +215,7 @@ def setup_method(self): _write(self.data_path, DATA_CSV) def _variables_path(self): - return Path(self.tmpdir) / "variables.csv" + return Path(self.tmpdir) / "_variables.csv" def test_returns_dict_with_expected_keys(self): _write(self._variables_path(), VARIABLES_CSV) @@ -315,7 +315,7 @@ def test_variable_name_to_size_map_with_nan_length(self): assert sizes["id"] is None def test_dataset_name_lookup_is_case_insensitive(self): - """File name with mixed case should still match variables.csv entry.""" + """File name with mixed case should still match _variables.csv entry.""" variables_upper = VARIABLES_CSV.replace("patients.csv", "PATIENTS.CSV") _write(self._variables_path(), variables_upper) reader = DatasetCSVMetadataReader(str(self.data_path), "PATIENTS.CSV") @@ -338,7 +338,7 @@ def test_returns_partial_meta_when_no_variables_file(self, caplog): def test_dataset_label_added_when_datasets_csv_present(self): _write(self._variables_path(), VARIABLES_CSV) - _write(Path(self.tmpdir) / "datasets.csv", DATASETS_CSV) + _write(Path(self.tmpdir) / "_datasets.csv", DATASETS_CSV) reader = DatasetCSVMetadataReader(str(self.data_path), "patients.csv") result = reader.read() assert result.get("dataset_label") == "Patient Dataset" From 0343ce96032ef58e7de49cd3b61d0d1e625a97cd Mon Sep 17 00:00:00 2001 From: Samuel Johnson Date: Tue, 21 Apr 2026 14:33:13 -0400 Subject: [PATCH 6/6] regression test --- .../resources/CoreIssue1558/datasets/_datasets.csv | 4 ++++ .../resources/CoreIssue1558/datasets/_variables.csv | 13 +++++++++++++ tests/resources/CoreIssue1558/datasets/datasets.csv | 4 ---- .../resources/CoreIssue1558/datasets/variables.csv | 13 ------------- 4 files changed, 17 insertions(+), 17 deletions(-) create mode 100644 tests/resources/CoreIssue1558/datasets/_datasets.csv create mode 100644 tests/resources/CoreIssue1558/datasets/_variables.csv delete mode 100644 tests/resources/CoreIssue1558/datasets/datasets.csv delete mode 100644 tests/resources/CoreIssue1558/datasets/variables.csv diff --git a/tests/resources/CoreIssue1558/datasets/_datasets.csv b/tests/resources/CoreIssue1558/datasets/_datasets.csv new file mode 100644 index 000000000..b9c464bf2 --- /dev/null +++ b/tests/resources/CoreIssue1558/datasets/_datasets.csv @@ -0,0 +1,4 @@ +Filename,Label +pp,Pharmacokinetics Parameters +dm,Demographics +lb,Some Description \ No newline at end of file diff --git a/tests/resources/CoreIssue1558/datasets/_variables.csv b/tests/resources/CoreIssue1558/datasets/_variables.csv new file mode 100644 index 000000000..4954c971a --- /dev/null +++ b/tests/resources/CoreIssue1558/datasets/_variables.csv @@ -0,0 +1,13 @@ +dataset,variable,label,type,length +dm,STUDYID,Study Identifier,Char,200 +dm,DOMAIN,Domain Abbreviation,Char,2 +dm,USUBJID,Unique Subject Identifier,Char,200 +dm,SUBJID,Subject Identifier for the Study,Char,40 +dm,RFSTDTC,Subject Reference Start Date/Time,Char,20 +pp,STUDYID,Study Identifier,Char,200 +pp,DOMAIN,Domain Abbreviation,Char,2 +pp,USUBJID,Unique Subject Identifier,Char,200 +pp,PPSEQ,Sequence Number,Num,8 +pp,PPGRPID,Group ID,Char,40 +pp,PPTESTCD,Parameter Short Name,Char,8 +pp,PPTEST,Parameter Name,Char,40 \ No newline at end of file diff --git a/tests/resources/CoreIssue1558/datasets/datasets.csv b/tests/resources/CoreIssue1558/datasets/datasets.csv deleted file mode 100644 index 316b3b84c..000000000 --- a/tests/resources/CoreIssue1558/datasets/datasets.csv +++ /dev/null @@ -1,4 +0,0 @@ -Filename,Label -pp.xpt,Pharmacokinetics Parameters -dm.xpt,Demographics -lb.xpt,Some Description \ No newline at end of file diff --git a/tests/resources/CoreIssue1558/datasets/variables.csv b/tests/resources/CoreIssue1558/datasets/variables.csv deleted file mode 100644 index 07b372de5..000000000 --- a/tests/resources/CoreIssue1558/datasets/variables.csv +++ /dev/null @@ -1,13 +0,0 @@ -dataset,variable,label,type,length -dm.xpt,STUDYID,Study Identifier,Char,200 -dm.xpt,DOMAIN,Domain Abbreviation,Char,2 -dm.xpt,USUBJID,Unique Subject Identifier,Char,200 -dm.xpt,SUBJID,Subject Identifier for the Study,Char,40 -dm.xpt,RFSTDTC,Subject Reference Start Date/Time,Char,20 -pp.xpt,STUDYID,Study Identifier,Char,200 -pp.xpt,DOMAIN,Domain Abbreviation,Char,2 -pp.xpt,USUBJID,Unique Subject Identifier,Char,200 -pp.xpt,PPSEQ,Sequence Number,Num,8 -pp.xpt,PPGRPID,Group ID,Char,40 -pp.xpt,PPTESTCD,Parameter Short Name,Char,8 -pp.xpt,PPTEST,Parameter Name,Char,40 \ No newline at end of file