From 83e1531850e763feddb526f2749b5691d48bd015 Mon Sep 17 00:00:00 2001 From: PGijsbers Date: Tue, 20 Jan 2026 12:35:18 +0100 Subject: [PATCH 01/13] Use the correct path to the cache directory for the task --- openml/tasks/functions.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/openml/tasks/functions.py b/openml/tasks/functions.py index 3df2861c0..2bf1a40f4 100644 --- a/openml/tasks/functions.py +++ b/openml/tasks/functions.py @@ -415,9 +415,10 @@ def get_task( if not isinstance(task_id, int): raise TypeError(f"Task id should be integer, is {type(task_id)}") - cache_key_dir = openml.utils._create_cache_directory_for_id(TASKS_CACHE_DIR_NAME, task_id) - tid_cache_dir = cache_key_dir / str(task_id) - tid_cache_dir_existed = tid_cache_dir.exists() + task_cache_directory = openml.utils._create_cache_directory_for_id( + TASKS_CACHE_DIR_NAME, task_id + ) + task_cache_directory_existed = task_cache_directory.exists() try: task = _get_task_description(task_id) dataset = get_dataset(task.dataset_id, **get_dataset_kwargs) @@ -431,8 +432,8 @@ def get_task( if download_splits and isinstance(task, OpenMLSupervisedTask): task.download_split() except Exception as e: - if not tid_cache_dir_existed: - openml.utils._remove_cache_dir_for_id(TASKS_CACHE_DIR_NAME, tid_cache_dir) + if not task_cache_directory_existed: + openml.utils._remove_cache_dir_for_id(TASKS_CACHE_DIR_NAME, task_cache_directory) raise e return task From f90036debbf81fc3fd6452263d9b80e786ac2806 Mon Sep 17 00:00:00 2001 From: PGijsbers Date: Wed, 21 Jan 2026 16:50:09 +0100 Subject: [PATCH 02/13] Push configuration of test server URL exclusively to config.py --- openml/cli.py | 2 +- openml/config.py | 4 +++- openml/testing.py | 2 +- tests/conftest.py | 2 +- tests/test_datasets/test_dataset_functions.py | 14 +++++--------- tests/test_flows/test_flow_functions.py | 15 +++++---------- tests/test_openml/test_config.py | 2 +- tests/test_runs/test_run_functions.py | 9 +++------ 
tests/test_tasks/test_task_functions.py | 12 ++++-------- 9 files changed, 24 insertions(+), 38 deletions(-) diff --git a/openml/cli.py b/openml/cli.py index 0afb089c2..18192a7db 100644 --- a/openml/cli.py +++ b/openml/cli.py @@ -109,7 +109,7 @@ def check_server(server: str) -> str: def replace_shorthand(server: str) -> str: if server == "test": - return "https://test.openml.org/api/v1/xml" + return f"{config.TEST_SERVER_URL}/api/v1/xml" if server == "production": return "https://www.openml.org/api/v1/xml" return server diff --git a/openml/config.py b/openml/config.py index e6104fd7f..5b2d69067 100644 --- a/openml/config.py +++ b/openml/config.py @@ -27,6 +27,8 @@ OPENML_SKIP_PARQUET_ENV_VAR = "OPENML_SKIP_PARQUET" _TEST_SERVER_NORMAL_USER_KEY = "normaluser" +TEST_SERVER_URL = "https://test.openml.org" + class _Config(TypedDict): apikey: str @@ -213,7 +215,7 @@ class ConfigurationForExamples: _last_used_server = None _last_used_key = None _start_last_called = False - _test_server = "https://test.openml.org/api/v1/xml" + _test_server = f"{TEST_SERVER_URL}/api/v1/xml" _test_apikey = _TEST_SERVER_NORMAL_USER_KEY @classmethod diff --git a/openml/testing.py b/openml/testing.py index 8d3bbbd5b..9ee555a91 100644 --- a/openml/testing.py +++ b/openml/testing.py @@ -47,7 +47,7 @@ class TestBase(unittest.TestCase): "user": [], } flow_name_tracker: ClassVar[list[str]] = [] - test_server = "https://test.openml.org/api/v1/xml" + test_server = f"{openml.config.TEST_SERVER_URL}/api/v1/xml" admin_key = "abc" user_key = openml.config._TEST_SERVER_NORMAL_USER_KEY diff --git a/tests/conftest.py b/tests/conftest.py index bd974f3f3..29366ce37 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -277,7 +277,7 @@ def with_server(request): openml.config.apikey = None yield return - openml.config.server = "https://test.openml.org/api/v1/xml" + openml.config.server = f"{openml.config.TEST_SERVER_URL}/api/v1/xml" openml.config.apikey = TestBase.user_key yield diff --git 
a/tests/test_datasets/test_dataset_functions.py b/tests/test_datasets/test_dataset_functions.py index c41664ba7..74faa73ea 100644 --- a/tests/test_datasets/test_dataset_functions.py +++ b/tests/test_datasets/test_dataset_functions.py @@ -1730,7 +1730,6 @@ def test_delete_dataset(self): @mock.patch.object(requests.Session, "delete") def test_delete_dataset_not_owned(mock_delete, test_files_directory, test_api_key): - openml.config.start_using_configuration_for_example() content_file = ( test_files_directory / "mock_responses" / "datasets" / "data_delete_not_owned.xml" ) @@ -1745,14 +1744,13 @@ def test_delete_dataset_not_owned(mock_delete, test_files_directory, test_api_ke ): openml.datasets.delete_dataset(40_000) - dataset_url = "https://test.openml.org/api/v1/xml/data/40000" + dataset_url = f"{openml.config.TEST_SERVER_URL}/api/v1/xml/data/40000" assert dataset_url == mock_delete.call_args.args[0] assert test_api_key == mock_delete.call_args.kwargs.get("params", {}).get("api_key") @mock.patch.object(requests.Session, "delete") def test_delete_dataset_with_run(mock_delete, test_files_directory, test_api_key): - openml.config.start_using_configuration_for_example() content_file = ( test_files_directory / "mock_responses" / "datasets" / "data_delete_has_tasks.xml" ) @@ -1767,14 +1765,13 @@ def test_delete_dataset_with_run(mock_delete, test_files_directory, test_api_key ): openml.datasets.delete_dataset(40_000) - dataset_url = "https://test.openml.org/api/v1/xml/data/40000" + dataset_url = f"{openml.config.TEST_SERVER_URL}/api/v1/xml/data/40000" assert dataset_url == mock_delete.call_args.args[0] assert test_api_key == mock_delete.call_args.kwargs.get("params", {}).get("api_key") @mock.patch.object(requests.Session, "delete") def test_delete_dataset_success(mock_delete, test_files_directory, test_api_key): - openml.config.start_using_configuration_for_example() content_file = ( test_files_directory / "mock_responses" / "datasets" / "data_delete_successful.xml" ) @@ 
-1786,14 +1783,13 @@ def test_delete_dataset_success(mock_delete, test_files_directory, test_api_key) success = openml.datasets.delete_dataset(40000) assert success - dataset_url = "https://test.openml.org/api/v1/xml/data/40000" + dataset_url = f"{openml.config.TEST_SERVER_URL}/api/v1/xml/data/40000" assert dataset_url == mock_delete.call_args.args[0] assert test_api_key == mock_delete.call_args.kwargs.get("params", {}).get("api_key") @mock.patch.object(requests.Session, "delete") def test_delete_unknown_dataset(mock_delete, test_files_directory, test_api_key): - openml.config.start_using_configuration_for_example() content_file = ( test_files_directory / "mock_responses" / "datasets" / "data_delete_not_exist.xml" ) @@ -1808,7 +1804,7 @@ def test_delete_unknown_dataset(mock_delete, test_files_directory, test_api_key) ): openml.datasets.delete_dataset(9_999_999) - dataset_url = "https://test.openml.org/api/v1/xml/data/9999999" + dataset_url = f"{openml.config.TEST_SERVER_URL}/api/v1/xml/data/9999999" assert dataset_url == mock_delete.call_args.args[0] assert test_api_key == mock_delete.call_args.kwargs.get("params", {}).get("api_key") @@ -2010,7 +2006,7 @@ def test_get_dataset_parquet(requests_mock, test_files_directory): test_files_directory / "mock_responses" / "datasets" / "data_description_61.xml" ) # While the mocked example is from production, unit tests by default connect to the test server. 
- requests_mock.get("https://test.openml.org/api/v1/xml/data/61", text=content_file.read_text()) + requests_mock.get(f"{openml.config.TEST_SERVER_URL}/api/v1/xml/data/61", text=content_file.read_text()) dataset = openml.datasets.get_dataset(61, download_data=True) assert dataset._parquet_url is not None assert dataset.parquet_file is not None diff --git a/tests/test_flows/test_flow_functions.py b/tests/test_flows/test_flow_functions.py index 2339b27c8..790686d94 100644 --- a/tests/test_flows/test_flow_functions.py +++ b/tests/test_flows/test_flow_functions.py @@ -453,7 +453,6 @@ def test_delete_flow(self): @mock.patch.object(requests.Session, "delete") def test_delete_flow_not_owned(mock_delete, test_files_directory, test_api_key): - openml.config.start_using_configuration_for_example() content_file = test_files_directory / "mock_responses" / "flows" / "flow_delete_not_owned.xml" mock_delete.return_value = create_request_response( status_code=412, @@ -466,14 +465,13 @@ def test_delete_flow_not_owned(mock_delete, test_files_directory, test_api_key): ): openml.flows.delete_flow(40_000) - flow_url = "https://test.openml.org/api/v1/xml/flow/40000" + flow_url = f"{openml.config.TEST_SERVER_URL}/api/v1/xml/flow/40000" assert flow_url == mock_delete.call_args.args[0] assert test_api_key == mock_delete.call_args.kwargs.get("params", {}).get("api_key") @mock.patch.object(requests.Session, "delete") def test_delete_flow_with_run(mock_delete, test_files_directory, test_api_key): - openml.config.start_using_configuration_for_example() content_file = test_files_directory / "mock_responses" / "flows" / "flow_delete_has_runs.xml" mock_delete.return_value = create_request_response( status_code=412, @@ -486,14 +484,13 @@ def test_delete_flow_with_run(mock_delete, test_files_directory, test_api_key): ): openml.flows.delete_flow(40_000) - flow_url = "https://test.openml.org/api/v1/xml/flow/40000" + flow_url = f"{openml.config.TEST_SERVER_URL}/api/v1/xml/flow/40000" assert flow_url == 
mock_delete.call_args.args[0] assert test_api_key == mock_delete.call_args.kwargs.get("params", {}).get("api_key") @mock.patch.object(requests.Session, "delete") def test_delete_subflow(mock_delete, test_files_directory, test_api_key): - openml.config.start_using_configuration_for_example() content_file = test_files_directory / "mock_responses" / "flows" / "flow_delete_is_subflow.xml" mock_delete.return_value = create_request_response( status_code=412, @@ -506,14 +503,13 @@ def test_delete_subflow(mock_delete, test_files_directory, test_api_key): ): openml.flows.delete_flow(40_000) - flow_url = "https://test.openml.org/api/v1/xml/flow/40000" + flow_url = f"{openml.config.TEST_SERVER_URL}/api/v1/xml/flow/40000" assert flow_url == mock_delete.call_args.args[0] assert test_api_key == mock_delete.call_args.kwargs.get("params", {}).get("api_key") @mock.patch.object(requests.Session, "delete") def test_delete_flow_success(mock_delete, test_files_directory, test_api_key): - openml.config.start_using_configuration_for_example() content_file = test_files_directory / "mock_responses" / "flows" / "flow_delete_successful.xml" mock_delete.return_value = create_request_response( status_code=200, @@ -523,7 +519,7 @@ def test_delete_flow_success(mock_delete, test_files_directory, test_api_key): success = openml.flows.delete_flow(33364) assert success - flow_url = "https://test.openml.org/api/v1/xml/flow/33364" + flow_url = f"{openml.config.TEST_SERVER_URL}/api/v1/xml/flow/33364" assert flow_url == mock_delete.call_args.args[0] assert test_api_key == mock_delete.call_args.kwargs.get("params", {}).get("api_key") @@ -531,7 +527,6 @@ def test_delete_flow_success(mock_delete, test_files_directory, test_api_key): @mock.patch.object(requests.Session, "delete") @pytest.mark.xfail(reason="failures_issue_1544", strict=False) def test_delete_unknown_flow(mock_delete, test_files_directory, test_api_key): - openml.config.start_using_configuration_for_example() content_file = 
test_files_directory / "mock_responses" / "flows" / "flow_delete_not_exist.xml" mock_delete.return_value = create_request_response( status_code=412, @@ -544,6 +539,6 @@ def test_delete_unknown_flow(mock_delete, test_files_directory, test_api_key): ): openml.flows.delete_flow(9_999_999) - flow_url = "https://test.openml.org/api/v1/xml/flow/9999999" + flow_url = f"{openml.config.TEST_SERVER_URL}/api/v1/xml/flow/9999999" assert flow_url == mock_delete.call_args.args[0] assert test_api_key == mock_delete.call_args.kwargs.get("params", {}).get("api_key") diff --git a/tests/test_openml/test_config.py b/tests/test_openml/test_config.py index 7ef223504..3ff7bd55e 100644 --- a/tests/test_openml/test_config.py +++ b/tests/test_openml/test_config.py @@ -78,7 +78,7 @@ def test_get_config_as_dict(self): config = openml.config.get_config_as_dict() _config = {} _config["apikey"] = TestBase.user_key - _config["server"] = "https://test.openml.org/api/v1/xml" + _config["server"] = f"{openml.config.TEST_SERVER_URL}/api/v1/xml" _config["cachedir"] = self.workdir _config["avoid_duplicate_runs"] = False _config["connection_n_retries"] = 20 diff --git a/tests/test_runs/test_run_functions.py b/tests/test_runs/test_run_functions.py index 8f2c505b7..b8bd6abd7 100644 --- a/tests/test_runs/test_run_functions.py +++ b/tests/test_runs/test_run_functions.py @@ -1813,7 +1813,6 @@ def test_initialize_model_from_run_nonstrict(self): @mock.patch.object(requests.Session, "delete") def test_delete_run_not_owned(mock_delete, test_files_directory, test_api_key): - openml.config.start_using_configuration_for_example() content_file = test_files_directory / "mock_responses" / "runs" / "run_delete_not_owned.xml" mock_delete.return_value = create_request_response( status_code=412, @@ -1826,14 +1825,13 @@ def test_delete_run_not_owned(mock_delete, test_files_directory, test_api_key): ): openml.runs.delete_run(40_000) - run_url = "https://test.openml.org/api/v1/xml/run/40000" + run_url = 
f"{openml.config.TEST_SERVER_URL}/api/v1/xml/run/40000" assert run_url == mock_delete.call_args.args[0] assert test_api_key == mock_delete.call_args.kwargs.get("params", {}).get("api_key") @mock.patch.object(requests.Session, "delete") def test_delete_run_success(mock_delete, test_files_directory, test_api_key): - openml.config.start_using_configuration_for_example() content_file = test_files_directory / "mock_responses" / "runs" / "run_delete_successful.xml" mock_delete.return_value = create_request_response( status_code=200, @@ -1843,14 +1841,13 @@ def test_delete_run_success(mock_delete, test_files_directory, test_api_key): success = openml.runs.delete_run(10591880) assert success - run_url = "https://test.openml.org/api/v1/xml/run/10591880" + run_url = f"{openml.config.TEST_SERVER_URL}/api/v1/xml/run/10591880" assert run_url == mock_delete.call_args.args[0] assert test_api_key == mock_delete.call_args.kwargs.get("params", {}).get("api_key") @mock.patch.object(requests.Session, "delete") def test_delete_unknown_run(mock_delete, test_files_directory, test_api_key): - openml.config.start_using_configuration_for_example() content_file = test_files_directory / "mock_responses" / "runs" / "run_delete_not_exist.xml" mock_delete.return_value = create_request_response( status_code=412, @@ -1863,7 +1860,7 @@ def test_delete_unknown_run(mock_delete, test_files_directory, test_api_key): ): openml.runs.delete_run(9_999_999) - run_url = "https://test.openml.org/api/v1/xml/run/9999999" + run_url = f"{openml.config.TEST_SERVER_URL}/api/v1/xml/run/9999999" assert run_url == mock_delete.call_args.args[0] assert test_api_key == mock_delete.call_args.kwargs.get("params", {}).get("api_key") diff --git a/tests/test_tasks/test_task_functions.py b/tests/test_tasks/test_task_functions.py index d44717177..af143a26b 100644 --- a/tests/test_tasks/test_task_functions.py +++ b/tests/test_tasks/test_task_functions.py @@ -244,7 +244,6 @@ def test_deletion_of_cache_dir(self): 
@mock.patch.object(requests.Session, "delete") def test_delete_task_not_owned(mock_delete, test_files_directory, test_api_key): - openml.config.start_using_configuration_for_example() content_file = test_files_directory / "mock_responses" / "tasks" / "task_delete_not_owned.xml" mock_delete.return_value = create_request_response( status_code=412, @@ -257,14 +256,13 @@ def test_delete_task_not_owned(mock_delete, test_files_directory, test_api_key): ): openml.tasks.delete_task(1) - task_url = "https://test.openml.org/api/v1/xml/task/1" + task_url = f"{openml.config.TEST_SERVER_URL}/api/v1/xml/task/1" assert task_url == mock_delete.call_args.args[0] assert test_api_key == mock_delete.call_args.kwargs.get("params", {}).get("api_key") @mock.patch.object(requests.Session, "delete") def test_delete_task_with_run(mock_delete, test_files_directory, test_api_key): - openml.config.start_using_configuration_for_example() content_file = test_files_directory / "mock_responses" / "tasks" / "task_delete_has_runs.xml" mock_delete.return_value = create_request_response( status_code=412, @@ -277,14 +275,13 @@ def test_delete_task_with_run(mock_delete, test_files_directory, test_api_key): ): openml.tasks.delete_task(3496) - task_url = "https://test.openml.org/api/v1/xml/task/3496" + task_url = f"{openml.config.TEST_SERVER_URL}/api/v1/xml/task/3496" assert task_url == mock_delete.call_args.args[0] assert test_api_key == mock_delete.call_args.kwargs.get("params", {}).get("api_key") @mock.patch.object(requests.Session, "delete") def test_delete_success(mock_delete, test_files_directory, test_api_key): - openml.config.start_using_configuration_for_example() content_file = test_files_directory / "mock_responses" / "tasks" / "task_delete_successful.xml" mock_delete.return_value = create_request_response( status_code=200, @@ -294,14 +291,13 @@ def test_delete_success(mock_delete, test_files_directory, test_api_key): success = openml.tasks.delete_task(361323) assert success - task_url = 
"https://test.openml.org/api/v1/xml/task/361323" + task_url = f"{openml.config.TEST_SERVER_URL}/api/v1/xml/task/361323" assert task_url == mock_delete.call_args.args[0] assert test_api_key == mock_delete.call_args.kwargs.get("params", {}).get("api_key") @mock.patch.object(requests.Session, "delete") def test_delete_unknown_task(mock_delete, test_files_directory, test_api_key): - openml.config.start_using_configuration_for_example() content_file = test_files_directory / "mock_responses" / "tasks" / "task_delete_not_exist.xml" mock_delete.return_value = create_request_response( status_code=412, @@ -314,6 +310,6 @@ def test_delete_unknown_task(mock_delete, test_files_directory, test_api_key): ): openml.tasks.delete_task(9_999_999) - task_url = "https://test.openml.org/api/v1/xml/task/9999999" + task_url = f"{openml.config.TEST_SERVER_URL}/api/v1/xml/task/9999999" assert task_url == mock_delete.call_args.args[0] assert test_api_key == mock_delete.call_args.kwargs.get("params", {}).get("api_key") From 3a257abea627f9a37d00feb7766cf1a49b82dbd5 Mon Sep 17 00:00:00 2001 From: PGijsbers Date: Wed, 28 Jan 2026 12:12:02 +0100 Subject: [PATCH 03/13] Update the test to use a dataset which does not have a parquet file Locally, MinIO already has more parquet files than on the test server. --- tests/test_datasets/test_dataset_functions.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_datasets/test_dataset_functions.py b/tests/test_datasets/test_dataset_functions.py index 74faa73ea..fe5939d7a 100644 --- a/tests/test_datasets/test_dataset_functions.py +++ b/tests/test_datasets/test_dataset_functions.py @@ -886,7 +886,7 @@ def test_create_invalid_dataset(self): @pytest.mark.uses_test_server() def test_get_online_dataset_arff(self): - dataset_id = 100 # Australian + dataset_id = 128 # iris -- one of the few datasets with parquet file # lazy loading not used as arff file is checked. 
dataset = openml.datasets.get_dataset(dataset_id, download_data=True) decoder = arff.ArffDecoder() From 3b79017a48da3ba9f002de813160ff60cb2159db Mon Sep 17 00:00:00 2001 From: PGijsbers Date: Wed, 28 Jan 2026 12:32:30 +0100 Subject: [PATCH 04/13] Replace hard-coded cache directory by configured one --- tests/test_datasets/test_dataset_functions.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_datasets/test_dataset_functions.py b/tests/test_datasets/test_dataset_functions.py index fe5939d7a..9df7e3879 100644 --- a/tests/test_datasets/test_dataset_functions.py +++ b/tests/test_datasets/test_dataset_functions.py @@ -527,7 +527,7 @@ def test_deletion_of_cache_dir(self): def test_deletion_of_cache_dir_faulty_download(self, patch): patch.side_effect = Exception("Boom!") self.assertRaisesRegex(Exception, "Boom!", openml.datasets.get_dataset, dataset_id=1) - datasets_cache_dir = os.path.join(self.workdir, "org", "openml", "test", "datasets") + datasets_cache_dir = os.path.join(openml.config.get_cache_directory(), "datasets") assert len(os.listdir(datasets_cache_dir)) == 0 @pytest.mark.uses_test_server() From f524d756964ecb03e77f9e932022a446bd1c5a35 Mon Sep 17 00:00:00 2001 From: PGijsbers Date: Wed, 28 Jan 2026 16:04:26 +0100 Subject: [PATCH 05/13] Update test to use dataset file that is already in cache Note that the previous strategy didn't work anymore if the server returned a parquet file, which is the case for the new local setup.
--- tests/test_datasets/test_dataset_functions.py | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/tests/test_datasets/test_dataset_functions.py b/tests/test_datasets/test_dataset_functions.py index 9df7e3879..27d3075fd 100644 --- a/tests/test_datasets/test_dataset_functions.py +++ b/tests/test_datasets/test_dataset_functions.py @@ -532,14 +532,7 @@ def test_deletion_of_cache_dir_faulty_download(self, patch): @pytest.mark.uses_test_server() def test_publish_dataset(self): - # lazy loading not possible as we need the arff-file. - openml.datasets.get_dataset(3, download_data=True) - file_path = os.path.join( - openml.config.get_cache_directory(), - "datasets", - "3", - "dataset.arff", - ) + arff_file_path = Path(__file__).parent.parent / "files" / "org" / "openml" / "test" / "datasets" / "2" / "dataset.arff" dataset = OpenMLDataset( "anneal", "test", @@ -547,7 +540,7 @@ def test_publish_dataset(self): version=1, licence="public", default_target_attribute="class", - data_file=file_path, + data_file=arff_file_path, ) dataset.publish() TestBase._mark_entity_for_removal("data", dataset.dataset_id) From a5601e3dc849ac4c8759c14292960d624d774ff0 Mon Sep 17 00:00:00 2001 From: PGijsbers Date: Thu, 29 Jan 2026 10:05:28 +0100 Subject: [PATCH 06/13] relax assumptions on local file structure --- tests/test_datasets/test_dataset_functions.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/test_datasets/test_dataset_functions.py b/tests/test_datasets/test_dataset_functions.py index 27d3075fd..49b13e4b8 100644 --- a/tests/test_datasets/test_dataset_functions.py +++ b/tests/test_datasets/test_dataset_functions.py @@ -1457,8 +1457,9 @@ def test_data_edit_critical_field(self): raise e time.sleep(10) # Delete the cache dir to get the newer version of the dataset + shutil.rmtree( - os.path.join(self.workdir, "org", "openml", "test", "datasets", str(did)), + os.path.join(openml.config.get_cache_directory(), "datasets", 
str(did)), ) @pytest.mark.uses_test_server() @@ -1892,9 +1893,8 @@ def _dataset_features_is_downloaded(did: int): def _dataset_data_file_is_downloaded(did: int): - parquet_present = _dataset_file_is_downloaded(did, "dataset.pq") - arff_present = _dataset_file_is_downloaded(did, "dataset.arff") - return parquet_present or arff_present + cache_directory = Path(openml.config.get_cache_directory()) / "datasets" / str(did) + return any(f.suffix in (".pq", ".arff") for f in cache_directory.iterdir()) def _assert_datasets_retrieved_successfully( From d862be2de5ddc4d551efad22dff1fdefb7db3854 Mon Sep 17 00:00:00 2001 From: PGijsbers Date: Thu, 29 Jan 2026 10:47:51 +0100 Subject: [PATCH 07/13] Do not use static cache directory --- tests/test_tasks/test_task_functions.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/tests/test_tasks/test_task_functions.py b/tests/test_tasks/test_task_functions.py index af143a26b..08811add5 100644 --- a/tests/test_tasks/test_task_functions.py +++ b/tests/test_tasks/test_task_functions.py @@ -156,13 +156,13 @@ def test_get_task(self): task = openml.tasks.get_task(1, download_data=True) # anneal; crossvalidation assert isinstance(task, OpenMLTask) assert os.path.exists( - os.path.join(self.workdir, "org", "openml", "test", "tasks", "1", "task.xml") + os.path.join(openml.config.get_cache_directory(), "tasks", "1", "task.xml") ) assert not os.path.exists( - os.path.join(self.workdir, "org", "openml", "test", "tasks", "1", "datasplits.arff") + os.path.join(openml.config.get_cache_directory(), "tasks", "1", "datasplits.arff") ) assert os.path.exists( - os.path.join(self.workdir, "org", "openml", "test", "datasets", "1", "dataset.arff") + os.path.join(openml.config.get_cache_directory(), "datasets", "1", "dataset_1.pq") ) @pytest.mark.uses_test_server() @@ -170,21 +170,21 @@ def test_get_task_lazy(self): task = openml.tasks.get_task(2, download_data=False) # anneal; crossvalidation assert isinstance(task, 
OpenMLTask) assert os.path.exists( - os.path.join(self.workdir, "org", "openml", "test", "tasks", "2", "task.xml") + os.path.join(openml.config.get_cache_directory(), "tasks", "2", "task.xml") ) assert task.class_labels == ["1", "2", "3", "4", "5", "U"] assert not os.path.exists( - os.path.join(self.workdir, "org", "openml", "test", "tasks", "2", "datasplits.arff") + os.path.join(openml.config.get_cache_directory(), "tasks", "2", "datasplits.arff") ) # Since the download_data=False is propagated to get_dataset assert not os.path.exists( - os.path.join(self.workdir, "org", "openml", "test", "datasets", "2", "dataset.arff") + os.path.join(openml.config.get_cache_directory(), "datasets", "2", "dataset.arff") ) task.download_split() assert os.path.exists( - os.path.join(self.workdir, "org", "openml", "test", "tasks", "2", "datasplits.arff") + os.path.join(openml.config.get_cache_directory(), "tasks", "2", "datasplits.arff") ) @mock.patch("openml.tasks.functions.get_dataset") @@ -228,7 +228,7 @@ def test_download_split(self): split = task.download_split() assert type(split) == OpenMLSplit assert os.path.exists( - os.path.join(self.workdir, "org", "openml", "test", "tasks", "1", "datasplits.arff") + os.path.join(openml.config.get_cache_directory(), "tasks", "1", "datasplits.arff") ) def test_deletion_of_cache_dir(self): From 16699e6871f6b242fbd4fae1e2893dc78930bf1e Mon Sep 17 00:00:00 2001 From: PGijsbers Date: Thu, 29 Jan 2026 11:18:40 +0100 Subject: [PATCH 08/13] Update expected number to match initial server state This means it is not reliant on the evaluation engine processing the dataset. Interestingly, the database state purposely seems to keep the last task's dataset in preparation explicitly (by having processing marked as done but having no dataset_status entry).
--- tests/test_tasks/test_task_functions.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tests/test_tasks/test_task_functions.py b/tests/test_tasks/test_task_functions.py index 08811add5..6951bf36f 100644 --- a/tests/test_tasks/test_task_functions.py +++ b/tests/test_tasks/test_task_functions.py @@ -96,7 +96,9 @@ def test_list_tasks_empty(self): @pytest.mark.uses_test_server() def test_list_tasks_by_tag(self): - num_basic_tasks = 100 # number is flexible, check server if fails + # Server starts with 99 active tasks with the tag, and one 'in_preparation', + # so depending on the processing of the last dataset, there may be 99 or 100 matches. + num_basic_tasks = 99 tasks = openml.tasks.list_tasks(tag="OpenML100") assert len(tasks) >= num_basic_tasks for task in tasks.to_dict(orient="index").values(): From 775dcf722f95aa0f78b4dbef16fe8177cec2a6f0 Mon Sep 17 00:00:00 2001 From: PGijsbers Date: Fri, 30 Jan 2026 10:30:18 +0100 Subject: [PATCH 09/13] Add symlink to regular test cache directory --- tests/files/localhost:8080 | 1 + 1 file changed, 1 insertion(+) create mode 120000 tests/files/localhost:8080 diff --git a/tests/files/localhost:8080 b/tests/files/localhost:8080 new file mode 120000 index 000000000..5a469fa32 --- /dev/null +++ b/tests/files/localhost:8080 @@ -0,0 +1 @@ +/Users/pietergijsbers/repositories/openml-python/tests/files/org/openml/test \ No newline at end of file From 319cb355c7b4488f83e223e3a9b0d9d20e080771 Mon Sep 17 00:00:00 2001 From: PGijsbers Date: Fri, 30 Jan 2026 10:47:29 +0100 Subject: [PATCH 10/13] Skip test for 1.8 since expected results differ too much --- tests/test_runs/test_run_functions.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tests/test_runs/test_run_functions.py b/tests/test_runs/test_run_functions.py index b8bd6abd7..dda940e4d 100644 --- a/tests/test_runs/test_run_functions.py +++ b/tests/test_runs/test_run_functions.py @@ -1870,6 +1870,10 @@ def test_delete_unknown_run(mock_delete, 
test_files_directory, test_api_key): Version(sklearn.__version__) < Version("0.21"), reason="couldn't perform local tests successfully w/o bloating RAM", ) +@unittest.skipIf( + Version(sklearn.__version__) >= Version("1.8"), + reason="predictions differ significantly", + ) @mock.patch("openml_sklearn.SklearnExtension._prevent_optimize_n_jobs") @pytest.mark.uses_test_server() def test__run_task_get_arffcontent_2(parallel_mock): From a680ebe1648ec2bd549259eab164c62e66bb7151 Mon Sep 17 00:00:00 2001 From: PGijsbers Date: Fri, 30 Jan 2026 11:08:50 +0100 Subject: [PATCH 11/13] Simplify path to static cache directory --- tests/test_datasets/test_dataset_functions.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_datasets/test_dataset_functions.py b/tests/test_datasets/test_dataset_functions.py index 49b13e4b8..2654721bd 100644 --- a/tests/test_datasets/test_dataset_functions.py +++ b/tests/test_datasets/test_dataset_functions.py @@ -532,7 +532,7 @@ def test_deletion_of_cache_dir_faulty_download(self, patch): @pytest.mark.uses_test_server() def test_publish_dataset(self): - arff_file_path = Path(__file__).parent.parent / "files" / "org" / "openml" / "test" / "datasets" / "2" / "dataset.arff" + arff_file_path = self.static_cache_dir / "org" / "openml" / "test" / "datasets" / "2" / "dataset.arff" dataset = OpenMLDataset( "anneal", "test", From b161b3b8ce5d92d31f4564ae60cb836ae5793d57 Mon Sep 17 00:00:00 2001 From: PGijsbers Date: Fri, 30 Jan 2026 11:26:24 +0100 Subject: [PATCH 12/13] Update symbolic link to be relative --- tests/files/localhost:8080 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/files/localhost:8080 b/tests/files/localhost:8080 index 5a469fa32..334c709ef 120000 --- a/tests/files/localhost:8080 +++ b/tests/files/localhost:8080 @@ -1 +1 @@ -/Users/pietergijsbers/repositories/openml-python/tests/files/org/openml/test \ No newline at end of file +org/openml/test \ No newline at end of file From 
0b989d151e45899c0cba0f7981938b293668ad82 Mon Sep 17 00:00:00 2001 From: PGijsbers Date: Fri, 30 Jan 2026 11:27:52 +0100 Subject: [PATCH 13/13] Fix typo --- tests/test_datasets/test_dataset_functions.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_datasets/test_dataset_functions.py b/tests/test_datasets/test_dataset_functions.py index 2654721bd..d8a9d80b9 100644 --- a/tests/test_datasets/test_dataset_functions.py +++ b/tests/test_datasets/test_dataset_functions.py @@ -879,7 +879,7 @@ def test_create_invalid_dataset(self): @pytest.mark.uses_test_server() def test_get_online_dataset_arff(self): - dataset_id = 128 # iris -- one of the few datasets with parquet file + dataset_id = 128 # iris -- one of the few datasets without parquet file # lazy loading not used as arff file is checked. dataset = openml.datasets.get_dataset(dataset_id, download_data=True) decoder = arff.ArffDecoder()