diff --git a/README.md b/README.md index e311d5466..4340249dc 100644 --- a/README.md +++ b/README.md @@ -245,9 +245,9 @@ This will show the list of validation options. -jcf, --jsonata-custom-functions Pair containing a variable name and a Path to directory containing a set of custom JSONata functions. Can be specified multiple times -e, --encoding TEXT File encoding for reading datasets. If not specified, defaults to utf-8. Supported encodings: utf-8, utf-16, utf-32, cp1252, latin-1, etc. -ft, --filetype TEXT File extension to filter datasets. Has higher priority than --dataset-path parameter. - -vcp, --variables-csv-path Path to variables.csv. Used when multiple dataset paths are provided and refer to different folders. - Not required if variables.txt exists in all -dp directories. - -tcp, --tables-csv-path Path to tables.csv. Required when multiple dataset paths are provided and refer to different folders. + -vcp, --variables-csv-path Path to _variables.csv. Used when multiple dataset paths are provided and refer to different folders. + Not required if _variables.csv exists in all -dp directories. + -dcp, --datasets-csv-path Path to _datasets.csv. Required when multiple dataset paths are provided and refer to different folders. --help Show this message and exit. 
``` diff --git a/cdisc_rules_engine/models/validation_args.py b/cdisc_rules_engine/models/validation_args.py index 9bbb5b12b..dd05a3e3d 100644 --- a/cdisc_rules_engine/models/validation_args.py +++ b/cdisc_rules_engine/models/validation_args.py @@ -29,6 +29,6 @@ "max_errors_per_rule", "encoding", "variables_csv_path", - "tables_csv_path", + "datasets_csv_path", ], ) diff --git a/cdisc_rules_engine/services/csv_metadata_reader.py b/cdisc_rules_engine/services/csv_metadata_reader.py index beca70647..16427e76c 100644 --- a/cdisc_rules_engine/services/csv_metadata_reader.py +++ b/cdisc_rules_engine/services/csv_metadata_reader.py @@ -14,7 +14,7 @@ def __init__( file_name: str, encoding: str = DEFAULT_ENCODING, variables_csv_path: str = None, - tables_csv_path: str = None, + datasets_csv_path: str = None, **kwargs, ): self.file_path = file_path @@ -23,12 +23,12 @@ def __init__( self.variables_csv_path = ( Path(variables_csv_path) if variables_csv_path - else Path(self.file_path).parent / "variables.csv" + else Path(self.file_path).parent / "_variables.csv" ) - self.tables_csv_path = ( - Path(tables_csv_path) - if tables_csv_path - else Path(self.file_path).parent / "tables.csv" + self.datasets_csv_path = ( + Path(datasets_csv_path) + if datasets_csv_path + else Path(self.file_path).parent / "_datasets.csv" ) def read(self) -> dict: @@ -111,11 +111,11 @@ def __get_variable_metadata( def __dataset_label(self) -> dict: logger = logging.getLogger("validator") - if not self.tables_csv_path.exists(): + if not self.datasets_csv_path.exists(): return {} try: - tables_df = pd.read_csv(self.tables_csv_path, encoding=self.encoding) + datasets_df = pd.read_csv(self.datasets_csv_path, encoding=self.encoding) except (UnicodeDecodeError, UnicodeError) as e: logger.error( f"\n Error reading CSV from: {self.file_path}" @@ -127,15 +127,15 @@ def __dataset_label(self) -> dict: logger.error("Error reading CSV file %s. 
%s", self.file_path, e) return {} - if "Filename" not in tables_df.columns or "Label" not in tables_df.columns: + if "Filename" not in datasets_df.columns or "Label" not in datasets_df.columns: return {} - tables_df["dataset"] = tables_df["Filename"].apply( + datasets_df["dataset"] = datasets_df["Filename"].apply( lambda x: Path(str(x)).stem.lower() ) current_dataset = Path(self.file_name).stem.lower() - match = tables_df[tables_df["dataset"] == current_dataset] + match = datasets_df[datasets_df["dataset"] == current_dataset] if match.empty: return {} diff --git a/cdisc_rules_engine/services/data_services/data_service_factory.py b/cdisc_rules_engine/services/data_services/data_service_factory.py index b7bdf4f6b..d0d5a08c7 100644 --- a/cdisc_rules_engine/services/data_services/data_service_factory.py +++ b/cdisc_rules_engine/services/data_services/data_service_factory.py @@ -39,7 +39,7 @@ def __init__( max_dataset_size: int = 0, encoding: str = None, variables_csv_path: str = None, - tables_csv_path=None, + datasets_csv_path=None, ): if config.getValue("DATA_SERVICE_TYPE"): self.data_service_name = config.getValue("DATA_SERVICE_TYPE") @@ -56,7 +56,7 @@ def __init__( self.max_dataset_size = max_dataset_size self.encoding = encoding self.variables_csv_path = variables_csv_path - self.tables_csv_path = tables_csv_path + self.datasets_csv_path = datasets_csv_path self.dataset_size_threshold = self.config.get_dataset_size_threshold() def get_data_service( @@ -103,7 +103,7 @@ def get_data_service( dataset_implementation=self.get_dataset_implementation(), encoding=self.encoding, variables_csv_path=self.variables_csv_path, - tables_csv_path=self.tables_csv_path, + datasets_csv_path=self.datasets_csv_path, ) def get_dummy_data_service(self, data: List[DummyDataset]) -> DataServiceInterface: diff --git a/cdisc_rules_engine/services/data_services/local_data_service.py b/cdisc_rules_engine/services/data_services/local_data_service.py index b5dfa8819..3f0b2d8bc 100644 --- 
a/cdisc_rules_engine/services/data_services/local_data_service.py +++ b/cdisc_rules_engine/services/data_services/local_data_service.py @@ -49,7 +49,7 @@ def __init__( self.dataset_paths: Iterable[str] = kwargs.get("dataset_paths", []) self.encoding: str = kwargs.get("encoding") self.variables_csv_path: str = kwargs.get("variables_csv_path") - self.tables_csv_path: str = kwargs.get("tables_csv_path") + self.datasets_csv_path: str = kwargs.get("datasets_csv_path") @classmethod def get_instance( @@ -215,7 +215,7 @@ def read_metadata( file_name, encoding=self.encoding, variables_csv_path=self.variables_csv_path, - tables_csv_path=self.tables_csv_path, + datasets_csv_path=self.datasets_csv_path, ).read() return { "file_metadata": file_metadata, @@ -252,7 +252,7 @@ def get_datasets(self) -> List[dict]: dataset_metadata = self.get_raw_dataset_metadata( dataset_name=dataset_path, variables_csv_path=self.variables_csv_path, - tables_csv_path=self.tables_csv_path, + datasets_csv_path=self.datasets_csv_path, ) datasets.append(dataset_metadata) except InvalidDatasetFormat: diff --git a/core.py b/core.py index e6add635f..ea1ff2ce6 100644 --- a/core.py +++ b/core.py @@ -110,38 +110,40 @@ def _validate_csv_data_paths( dataset_paths: list[str], encoding: str = DEFAULT_ENCODING ) -> list[str]: """ - Filters dataset paths based on tables.csv content. + Filters dataset paths based on _datasets.csv content. - Raises InvalidCSVFile error if there are no proper tables.csv files in provided path. + Raises InvalidCSVFile error if there are no proper _datasets.csv files in provided path. - Keeps only datasets listed in tables.csv (Filename column). - Always excludes tables.csv and variables.csv from result. + Keeps only datasets listed in _datasets.csv (Filename column). + Always excludes _datasets.csv and _variables.csv from result. 
""" import pandas as pd paths = [Path(p) for p in dataset_paths] - tables_path = list({p for p in paths if p.name.lower() == "tables.csv"}) - if len(tables_path) > 1: - raise InvalidCSVFile("There is more than one tables.csv file in provided path.") - elif len(tables_path) == 0: - raise InvalidCSVFile("There is no tables.csv file in provided path.") + datasets_path = list({p for p in paths if p.name.lower() == "_datasets.csv"}) + if len(datasets_path) > 1: + raise InvalidCSVFile( + "There is more than one _datasets.csv file in provided path." + ) + elif len(datasets_path) == 0: + raise InvalidCSVFile("There is no _datasets.csv file in provided path.") else: - tables_path = tables_path[0] + datasets_path = datasets_path[0] dataset_files = [ - p for p in paths if p.name.lower() not in ("tables.csv", "variables.csv") + p for p in paths if p.name.lower() not in ("_datasets.csv", "_variables.csv") ] - tables_df = pd.read_csv(tables_path, encoding=encoding) + datasets_df = pd.read_csv(datasets_path, encoding=encoding) - if "Filename" not in tables_df.columns or "Label" not in tables_df.columns: + if "Filename" not in datasets_df.columns or "Label" not in datasets_df.columns: raise InvalidCSVFile( "Metadata files is malformed. One of [Filename, Label] columns is missing." 
) allowed_datasets = { - Path(str(name)).stem.lower() for name in tables_df["Filename"].dropna() + Path(str(name)).stem.lower() for name in datasets_df["Filename"].dropna() } filtered = { @@ -235,7 +237,7 @@ def _validate_dataset_paths( [ str(p) for p in dp_path.parent.glob("*") - if p.is_file() and p.name in {"tables.csv", "variables.csv"} + if p.is_file() and p.name in {"_datasets.csv", "_variables.csv"} ] ) try: @@ -536,13 +538,13 @@ def load_custom_dotenv_from_data_options(ctx, param, value): "-vcp", "--variables-csv-path", required=False, - help="Path to variables.csv", + help="Path to _variables.csv", ) @click.option( - "-tcp", - "--tables-csv-path", + "-dcp", + "--datasets-csv-path", required=False, - help="Path to tables.csv", + help="Path to _datasets.csv", ) @click.pass_context def validate( # noqa @@ -583,7 +585,7 @@ def validate( # noqa max_errors_per_rule: tuple[int, bool], encoding: str, variables_csv_path: str, - tables_csv_path: str, + datasets_csv_path: str, ): """ Validate data using CDISC Rules Engine @@ -692,7 +694,7 @@ def validate( # noqa max_errors_per_rule, encoding, variables_csv_path, - tables_csv_path, + datasets_csv_path, ) ) diff --git a/env.example b/env.example index df2051558..3e87d175c 100644 --- a/env.example +++ b/env.example @@ -2,5 +2,9 @@ CDISC_LIBRARY_API_KEY=your_api_key_here DATASET_SIZE_THRESHOLD=10485760 # max dataset size in bytes to force dask implementation MAX_REPORT_ROWS = 10 # integer for maximum number of issues per excel sheet (plus headers) in result report. Defaults to 10000. MAX_ERRORS_PER_RULE = (10, True) # Tuple for maximum number of errors to report per rule during a validation run. Also has a per dataset flag described as second bool value in readme. example value -DEFINE_XML -DATA_DIR \ No newline at end of file +DEFINE_XML = define.xml path +CT = controlled terminology package +PRODUCT= standard +VERSION= version, denoted with a dash i.e. 
3-4 +SUBSTANDARD= TIG substandard +USE_CASE= TIG use case \ No newline at end of file diff --git a/scripts/run_validation.py b/scripts/run_validation.py index 5f10dd3e6..750edd1ab 100644 --- a/scripts/run_validation.py +++ b/scripts/run_validation.py @@ -176,7 +176,7 @@ def run_validation(args: Validation_args): library_metadata=library_metadata, encoding=args.encoding, variables_csv_path=args.variables_csv_path, - tables_csv_path=args.tables_csv_path, + datasets_csv_path=args.datasets_csv_path, ).get_data_service(args.dataset_paths) # install dictionaries if needed dictionary_versions = fill_cache_with_dictionaries( diff --git a/tests/resources/CoreIssue1558/datasets/_datasets.csv b/tests/resources/CoreIssue1558/datasets/_datasets.csv new file mode 100644 index 000000000..b9c464bf2 --- /dev/null +++ b/tests/resources/CoreIssue1558/datasets/_datasets.csv @@ -0,0 +1,4 @@ +Filename,Label +pp,Pharmacokinetics Parameters +dm,Demographics +lb,Some Description \ No newline at end of file diff --git a/tests/resources/CoreIssue1558/datasets/_variables.csv b/tests/resources/CoreIssue1558/datasets/_variables.csv new file mode 100644 index 000000000..4954c971a --- /dev/null +++ b/tests/resources/CoreIssue1558/datasets/_variables.csv @@ -0,0 +1,13 @@ +dataset,variable,label,type,length +dm,STUDYID,Study Identifier,Char,200 +dm,DOMAIN,Domain Abbreviation,Char,2 +dm,USUBJID,Unique Subject Identifier,Char,200 +dm,SUBJID,Subject Identifier for the Study,Char,40 +dm,RFSTDTC,Subject Reference Start Date/Time,Char,20 +pp,STUDYID,Study Identifier,Char,200 +pp,DOMAIN,Domain Abbreviation,Char,2 +pp,USUBJID,Unique Subject Identifier,Char,200 +pp,PPSEQ,Sequence Number,Num,8 +pp,PPGRPID,Group ID,Char,40 +pp,PPTESTCD,Parameter Short Name,Char,8 +pp,PPTEST,Parameter Name,Char,40 \ No newline at end of file diff --git a/tests/resources/CoreIssue1558/datasets/tables.csv b/tests/resources/CoreIssue1558/datasets/tables.csv deleted file mode 100644 index 316b3b84c..000000000 --- 
a/tests/resources/CoreIssue1558/datasets/tables.csv +++ /dev/null @@ -1,4 +0,0 @@ -Filename,Label -pp.xpt,Pharmacokinetics Parameters -dm.xpt,Demographics -lb.xpt,Some Description \ No newline at end of file diff --git a/tests/resources/CoreIssue1558/datasets/variables.csv b/tests/resources/CoreIssue1558/datasets/variables.csv deleted file mode 100644 index 07b372de5..000000000 --- a/tests/resources/CoreIssue1558/datasets/variables.csv +++ /dev/null @@ -1,13 +0,0 @@ -dataset,variable,label,type,length -dm.xpt,STUDYID,Study Identifier,Char,200 -dm.xpt,DOMAIN,Domain Abbreviation,Char,2 -dm.xpt,USUBJID,Unique Subject Identifier,Char,200 -dm.xpt,SUBJID,Subject Identifier for the Study,Char,40 -dm.xpt,RFSTDTC,Subject Reference Start Date/Time,Char,20 -pp.xpt,STUDYID,Study Identifier,Char,200 -pp.xpt,DOMAIN,Domain Abbreviation,Char,2 -pp.xpt,USUBJID,Unique Subject Identifier,Char,200 -pp.xpt,PPSEQ,Sequence Number,Num,8 -pp.xpt,PPGRPID,Group ID,Char,40 -pp.xpt,PPTESTCD,Parameter Short Name,Char,8 -pp.xpt,PPTEST,Parameter Name,Char,40 \ No newline at end of file diff --git a/tests/unit/test_csv_reader.py b/tests/unit/test_csv_reader.py index 2b6cb6f5b..5cb7f6cdc 100644 --- a/tests/unit/test_csv_reader.py +++ b/tests/unit/test_csv_reader.py @@ -17,31 +17,31 @@ DEFAULT_ENCODING = "utf-8" -def test_no_tables_csv_raises_error(): - paths = ["/data/variables.csv", "/data/dm.csv"] +def test_no_datasets_csv_raises_error(): + paths = ["/data/_variables.csv", "/data/dm.csv"] with pytest.raises(InvalidCSVFile): _validate_csv_data_paths(paths) -class TestTablesCsvMissingFilenameColumn: +class TestDatasetsCsvMissingFilenameColumn: def test_non_csv_files_excluded_when_no_filename_col(self, tmp_path): - tables_csv = tmp_path / "tables.csv" - pd.DataFrame({"Name": ["dm"]}).to_csv(tables_csv, index=False) + datasets_csv = tmp_path / "_datasets.csv" + pd.DataFrame({"Name": ["dm"]}).to_csv(datasets_csv, index=False) dm = tmp_path / "dm.csv" readme = tmp_path / "readme.txt" dm.touch() 
readme.touch() with pytest.raises(InvalidCSVFile): - _validate_csv_data_paths([str(tables_csv), str(dm), str(readme)]) + _validate_csv_data_paths([str(datasets_csv), str(dm), str(readme)]) -class TestTablesCsvWithFilenameColumn: +class TestDatasetsCsvWithFilenameColumn: def test_keeps_only_allowed_datasets(self, tmp_path): - tables_csv = tmp_path / "tables.csv" + datasets_csv = tmp_path / "_datasets.csv" pd.DataFrame( {"Filename": ["dm.csv", "customers.csv"], "Label": ["test1", "test2"]} - ).to_csv(tables_csv, index=False) + ).to_csv(datasets_csv, index=False) dm = tmp_path / "dm.csv" customers = tmp_path / "customers.csv" @@ -50,128 +50,128 @@ def test_keeps_only_allowed_datasets(self, tmp_path): f.touch() result = _validate_csv_data_paths( - [str(tables_csv), str(dm), str(customers), str(orders)] + [str(datasets_csv), str(dm), str(customers), str(orders)] ) assert sorted(result) == sorted([str(dm), str(customers)]) assert str(orders) not in result def test_variables_csv_excluded_even_if_listed(self, tmp_path): - tables_csv = tmp_path / "tables.csv" + datasets_csv = tmp_path / "_datasets.csv" pd.DataFrame( - {"Filename": ["variables.csv", "dm.csv"], "Label": ["test1", "test2"]} - ).to_csv(tables_csv, index=False) - variables = tmp_path / "variables.csv" + {"Filename": ["_variables.csv", "dm.csv"], "Label": ["test1", "test2"]} + ).to_csv(datasets_csv, index=False) + variables = tmp_path / "_variables.csv" dm = tmp_path / "dm.csv" variables.touch() dm.touch() - result = _validate_csv_data_paths([str(tables_csv), str(variables), str(dm)]) + result = _validate_csv_data_paths([str(datasets_csv), str(variables), str(dm)]) assert str(variables) not in result assert str(dm) in result def test_filename_with_path_prefix_uses_stem_matching(self, tmp_path): """Filename 'subdir/dm.csv' -> stem 'dm' -> matches 'dm.csv' on disk.""" - tables_csv = tmp_path / "tables.csv" + datasets_csv = tmp_path / "_datasets.csv" pd.DataFrame({"Filename": ["subdir/dm.csv"], "Label": 
["test1"]}).to_csv( - tables_csv, index=False + datasets_csv, index=False ) dm = tmp_path / "dm.csv" dm.touch() - result = _validate_csv_data_paths([str(tables_csv), str(dm)]) + result = _validate_csv_data_paths([str(datasets_csv), str(dm)]) assert str(dm) in result def test_nan_filenames_are_ignored(self, tmp_path): - tables_csv = tmp_path / "tables.csv" + datasets_csv = tmp_path / "_datasets.csv" pd.DataFrame({"Filename": ["dm.csv", None], "Label": ["test1", None]}).to_csv( - tables_csv, index=False + datasets_csv, index=False ) dm = tmp_path / "dm.csv" unknown = tmp_path / "unknown.csv" dm.touch() unknown.touch() - result = _validate_csv_data_paths([str(tables_csv), str(dm), str(unknown)]) + result = _validate_csv_data_paths([str(datasets_csv), str(dm), str(unknown)]) assert str(dm) in result assert str(unknown) not in result def test_no_matching_files_returns_empty(self, tmp_path): - tables_csv = tmp_path / "tables.csv" + datasets_csv = tmp_path / "_datasets.csv" pd.DataFrame({"Filename": ["nonexistent.csv"], "Label": ["test1"]}).to_csv( - tables_csv, index=False + datasets_csv, index=False ) dm = tmp_path / "dm.csv" dm.touch() - assert _validate_csv_data_paths([str(tables_csv), str(dm)]) == [] + assert _validate_csv_data_paths([str(datasets_csv), str(dm)]) == [] def test_non_csv_files_excluded_even_if_stem_matches(self, tmp_path): - tables_csv = tmp_path / "tables.csv" + datasets_csv = tmp_path / "_datasets.csv" pd.DataFrame({"Filename": ["dm.csv"], "Label": ["test1"]}).to_csv( - tables_csv, index=False + datasets_csv, index=False ) dm_xlsx = tmp_path / "dm.xlsx" dm_xlsx.touch() - assert _validate_csv_data_paths([str(tables_csv), str(dm_xlsx)]) == [] + assert _validate_csv_data_paths([str(datasets_csv), str(dm_xlsx)]) == [] def test_encoding_is_forwarded_to_read_csv(self, tmp_path): - tables_csv = tmp_path / "tables.csv" + datasets_csv = tmp_path / "_datasets.csv" pd.DataFrame({"Filename": ["dm.csv"], "Label": ["test1"]}).to_csv( - tables_csv, index=False + 
datasets_csv, index=False ) (tmp_path / "dm.csv").touch() with patch("pandas.read_csv", wraps=pd.read_csv) as mock_read: _validate_csv_data_paths( - [str(tables_csv), str(tmp_path / "dm.csv")], encoding="latin-1" + [str(datasets_csv), str(tmp_path / "dm.csv")], encoding="latin-1" ) - mock_read.assert_called_once_with(tables_csv, encoding="latin-1") + mock_read.assert_called_once_with(datasets_csv, encoding="latin-1") class TestEdgeCases: - def test_only_tables_csv_in_input(self, tmp_path): - tables_csv = tmp_path / "tables.csv" + def test_only_datasets_csv_in_input(self, tmp_path): + datasets_csv = tmp_path / "_datasets.csv" pd.DataFrame({"Filename": ["dm.csv"], "Label": ["test1"]}).to_csv( - tables_csv, index=False + datasets_csv, index=False ) - assert _validate_csv_data_paths([str(tables_csv)]) == [] + assert _validate_csv_data_paths([str(datasets_csv)]) == [] def test_only_variables_csv_in_input(self): with pytest.raises(InvalidCSVFile): - _validate_csv_data_paths(["/data/variables.csv"]) + _validate_csv_data_paths(["/data/_variables.csv"]) def test_empty_filename_column_returns_empty(self, tmp_path): - tables_csv = tmp_path / "tables.csv" + datasets_csv = tmp_path / "_datasets.csv" pd.DataFrame( {"Filename": pd.Series([], dtype=str), "Label": pd.Series([], dtype=str)} - ).to_csv(tables_csv, index=False) + ).to_csv(datasets_csv, index=False) dm = tmp_path / "dm.csv" dm.touch() - assert _validate_csv_data_paths([str(tables_csv), str(dm)]) == [] + assert _validate_csv_data_paths([str(datasets_csv), str(dm)]) == [] def test_all_filename_values_nan_returns_empty(self, tmp_path): - tables_csv = tmp_path / "tables.csv" + datasets_csv = tmp_path / "_datasets.csv" pd.DataFrame({"Filename": [None, None], "Label": [None, None]}).to_csv( - tables_csv, index=False + datasets_csv, index=False ) dm = tmp_path / "dm.csv" dm.touch() - assert _validate_csv_data_paths([str(tables_csv), str(dm)]) == [] + assert _validate_csv_data_paths([str(datasets_csv), str(dm)]) == [] def 
test_duplicate_paths_removed(self, tmp_path): """The function does not deduplicate; duplicates in -> duplicates out.""" - tables_csv = tmp_path / "tables.csv" + datasets_csv = tmp_path / "_datasets.csv" pd.DataFrame({"Filename": ["dm.csv"], "Label": ["test1"]}).to_csv( - tables_csv, index=False + datasets_csv, index=False ) dm = tmp_path / "dm.csv" dm.touch() - paths = [str(tables_csv), str(dm), str(dm)] + paths = [str(datasets_csv), str(dm), str(dm)] result = _validate_csv_data_paths(paths) assert result.count(str(dm)) == 1 @@ -194,7 +194,7 @@ def test_duplicate_paths_removed(self, tmp_path): """ ) -TABLES_CSV = textwrap.dedent( +DATASETS_CSV = textwrap.dedent( """\ Filename,Label patients.csv,Patient Dataset @@ -215,7 +215,7 @@ def setup_method(self): _write(self.data_path, DATA_CSV) def _variables_path(self): - return Path(self.tmpdir) / "variables.csv" + return Path(self.tmpdir) / "_variables.csv" def test_returns_dict_with_expected_keys(self): _write(self._variables_path(), VARIABLES_CSV) @@ -315,7 +315,7 @@ def test_variable_name_to_size_map_with_nan_length(self): assert sizes["id"] is None def test_dataset_name_lookup_is_case_insensitive(self): - """File name with mixed case should still match variables.csv entry.""" + """File name with mixed case should still match _variables.csv entry.""" variables_upper = VARIABLES_CSV.replace("patients.csv", "PATIENTS.CSV") _write(self._variables_path(), variables_upper) reader = DatasetCSVMetadataReader(str(self.data_path), "PATIENTS.CSV") @@ -336,14 +336,14 @@ def test_returns_partial_meta_when_no_variables_file(self, caplog): assert result["first_record"] == {"age": "30", "id": "1", "name": "Alice"} assert "No variables file found" in caplog.text - def test_dataset_label_added_when_tables_csv_present(self): + def test_dataset_label_added_when_datasets_csv_present(self): _write(self._variables_path(), VARIABLES_CSV) - _write(Path(self.tmpdir) / "tables.csv", TABLES_CSV) + _write(Path(self.tmpdir) / "_datasets.csv", 
DATASETS_CSV) reader = DatasetCSVMetadataReader(str(self.data_path), "patients.csv") result = reader.read() assert result.get("dataset_label") == "Patient Dataset" - def test_no_dataset_label_when_tables_csv_absent(self): + def test_no_dataset_label_when_datasets_csv_absent(self): _write(self._variables_path(), VARIABLES_CSV) reader = DatasetCSVMetadataReader(str(self.data_path), "patients.csv") result = reader.read()