From 24a80fed8803de640088a2b38912d804851d1555 Mon Sep 17 00:00:00 2001
From: Jos van der Velde
Date: Sat, 20 Sep 2025 20:17:54 +0200
Subject: [PATCH 1/2] Do not use test api_key for production calls inside the unittests

---
 openml/testing.py                             |  5 +++++
 tests/test_datasets/test_dataset.py           |  4 ++--
 tests/test_datasets/test_dataset_functions.py | 16 +++++++-------
 .../test_evaluation_functions.py              | 20 +++++++++---------
 tests/test_flows/test_flow.py                 |  6 +++---
 tests/test_flows/test_flow_functions.py       | 19 +++++++++--------
 tests/test_runs/test_run_functions.py         | 21 ++++++++++---------
 tests/test_setups/test_setup_functions.py     |  4 ++--
 tests/test_study/test_study_functions.py      | 12 +++++------
 tests/test_tasks/test_clustering_task.py      |  4 ++--
 10 files changed, 59 insertions(+), 52 deletions(-)

diff --git a/openml/testing.py b/openml/testing.py
index 547405df0..d6d2866c5 100644
--- a/openml/testing.py
+++ b/openml/testing.py
@@ -51,6 +51,7 @@ class TestBase(unittest.TestCase):
 
     # amueller's read/write key that he will throw away later
     apikey = "610344db6388d9ba34f6db45a3cf71de"
+    # creating logger for tracking files uploaded to test server
     logger = logging.getLogger("unit_tests_published_entities")
     logger.setLevel(logging.DEBUG)
 
@@ -108,6 +109,10 @@ def setUp(self, n_levels: int = 1, tmpdir_suffix: str = "") -> None:
         self.connection_n_retries = openml.config.connection_n_retries
         openml.config.set_retry_policy("robot", n_retries=20)
 
+    def use_production_server(self):
+        openml.config.server = self.production_server
+        openml.config.apikey = None
+
     def tearDown(self) -> None:
         """Tear down the test"""
         os.chdir(self.cwd)
diff --git a/tests/test_datasets/test_dataset.py b/tests/test_datasets/test_dataset.py
index c48086a72..86a4d3f57 100644
--- a/tests/test_datasets/test_dataset.py
+++ b/tests/test_datasets/test_dataset.py
@@ -24,7 +24,7 @@ class OpenMLDatasetTest(TestBase):
 
     def setUp(self):
         super().setUp()
-        openml.config.server = self.production_server
+        self.use_production_server()
 
         # Load dataset id 2 - dataset 2 is interesting because it contains
         # missing values, categorical features etc.
@@ -344,7 +344,7 @@ class OpenMLDatasetTestSparse(TestBase):
 
     def setUp(self):
         super().setUp()
-        openml.config.server = self.production_server
+        self.use_production_server()
 
         self.sparse_dataset = openml.datasets.get_dataset(4136, download_data=False)
 
diff --git a/tests/test_datasets/test_dataset_functions.py b/tests/test_datasets/test_dataset_functions.py
index 1c06cc4b5..4145b86ad 100644
--- a/tests/test_datasets/test_dataset_functions.py
+++ b/tests/test_datasets/test_dataset_functions.py
@@ -139,7 +139,7 @@ def test_list_datasets_empty(self):
     @pytest.mark.production()
     def test_check_datasets_active(self):
         # Have to test on live because there is no deactivated dataset on the test server.
-        openml.config.server = self.production_server
+        self.use_production_server()
         active = openml.datasets.check_datasets_active(
             [2, 17, 79],
             raise_error_if_not_exist=False,
@@ -176,7 +176,7 @@ def test_illegal_length_tag(self):
     @pytest.mark.production()
     def test__name_to_id_with_deactivated(self):
         """Check that an activated dataset is returned if an earlier deactivated one exists."""
-        openml.config.server = self.production_server
+        self.use_production_server()
         # /d/1 was deactivated
         assert openml.datasets.functions._name_to_id("anneal") == 2
         openml.config.server = self.test_server
@@ -184,19 +184,19 @@ def test__name_to_id_with_multiple_active(self):
         """With multiple active datasets, retrieve the least recent active."""
-        openml.config.server = self.production_server
+        self.use_production_server()
         assert openml.datasets.functions._name_to_id("iris") == 61
 
     @pytest.mark.production()
     def test__name_to_id_with_version(self):
         """With multiple active datasets, retrieve the least recent active."""
-        openml.config.server = self.production_server
+        self.use_production_server()
         assert openml.datasets.functions._name_to_id("iris", version=3) == 969
 
     @pytest.mark.production()
     def test__name_to_id_with_multiple_active_error(self):
         """With multiple active datasets, retrieve the least recent active."""
-        openml.config.server = self.production_server
+        self.use_production_server()
         self.assertRaisesRegex(
             ValueError,
             "Multiple active datasets exist with name 'iris'.",
@@ -272,12 +272,12 @@ def test_get_dataset_uint8_dtype(self):
     @pytest.mark.production()
     def test_get_dataset_cannot_access_private_data(self):
         # Issue324 Properly handle private datasets when trying to access them
-        openml.config.server = self.production_server
+        self.use_production_server()
         self.assertRaises(OpenMLPrivateDatasetError, openml.datasets.get_dataset, 45)
 
     @pytest.mark.skip("Need to find dataset name of private dataset")
     def test_dataset_by_name_cannot_access_private_data(self):
-        openml.config.server = self.production_server
+        self.use_production_server()
         self.assertRaises(OpenMLPrivateDatasetError, openml.datasets.get_dataset, "NAME_GOES_HERE")
 
     def test_get_dataset_lazy_all_functions(self):
@@ -1501,7 +1501,7 @@ def test_data_fork(self):
     @pytest.mark.production()
     def test_list_datasets_with_high_size_parameter(self):
         # Testing on prod since concurrent deletion of uploded datasets make the test fail
-        openml.config.server = self.production_server
+        self.use_production_server()
 
         datasets_a = openml.datasets.list_datasets()
         datasets_b = openml.datasets.list_datasets(size=np.inf)
diff --git a/tests/test_evaluations/test_evaluation_functions.py b/tests/test_evaluations/test_evaluation_functions.py
index 37b0ce7c8..ffd3d9f78 100644
--- a/tests/test_evaluations/test_evaluation_functions.py
+++ b/tests/test_evaluations/test_evaluation_functions.py
@@ -52,7 +52,7 @@ def _check_list_evaluation_setups(self, **kwargs):
 
     @pytest.mark.production()
     def test_evaluation_list_filter_task(self):
-        openml.config.server = self.production_server
+        self.use_production_server()
 
         task_id = 7312
 
@@ -72,7 +72,7 @@ def test_evaluation_list_filter_task(self):
 
     @pytest.mark.production()
     def test_evaluation_list_filter_uploader_ID_16(self):
-        openml.config.server = self.production_server
+        self.use_production_server()
         uploader_id = 16
 
         evaluations = openml.evaluations.list_evaluations(
@@ -87,7 +87,7 @@ def test_evaluation_list_filter_uploader_ID_16(self):
 
     @pytest.mark.production()
     def test_evaluation_list_filter_uploader_ID_10(self):
-        openml.config.server = self.production_server
+        self.use_production_server()
         setup_id = 10
 
         evaluations = openml.evaluations.list_evaluations(
@@ -106,7 +106,7 @@ def test_evaluation_list_filter_uploader_ID_10(self):
 
     @pytest.mark.production()
     def test_evaluation_list_filter_flow(self):
-        openml.config.server = self.production_server
+        self.use_production_server()
 
         flow_id = 100
 
@@ -126,7 +126,7 @@ def test_evaluation_list_filter_flow(self):
 
     @pytest.mark.production()
     def test_evaluation_list_filter_run(self):
-        openml.config.server = self.production_server
+        self.use_production_server()
 
         run_id = 12
 
@@ -146,7 +146,7 @@ def test_evaluation_list_filter_run(self):
 
     @pytest.mark.production()
     def test_evaluation_list_limit(self):
-        openml.config.server = self.production_server
+        self.use_production_server()
 
         evaluations = openml.evaluations.list_evaluations(
             "predictive_accuracy",
@@ -164,7 +164,7 @@ def test_list_evaluations_empty(self):
 
     @pytest.mark.production()
     def test_evaluation_list_per_fold(self):
-        openml.config.server = self.production_server
+        self.use_production_server()
         size = 1000
         task_ids = [6]
         uploader_ids = [1]
@@ -202,7 +202,7 @@ def test_evaluation_list_per_fold(self):
 
     @pytest.mark.production()
     def test_evaluation_list_sort(self):
-        openml.config.server = self.production_server
+        self.use_production_server()
         size = 10
         task_id = 6
         # Get all evaluations of the task
@@ -239,7 +239,7 @@ def test_list_evaluation_measures(self):
 
     @pytest.mark.production()
     def test_list_evaluations_setups_filter_flow(self):
-        openml.config.server = self.production_server
+        self.use_production_server()
         flow_id = [405]
         size = 100
         evals = self._check_list_evaluation_setups(flows=flow_id, size=size)
@@ -257,7 +257,7 @@ def test_list_evaluations_setups_filter_flow(self):
 
     @pytest.mark.production()
     def test_list_evaluations_setups_filter_task(self):
-        openml.config.server = self.production_server
+        self.use_production_server()
         task_id = [6]
         size = 121
         self._check_list_evaluation_setups(tasks=task_id, size=size)
diff --git a/tests/test_flows/test_flow.py b/tests/test_flows/test_flow.py
index e6407a51c..0b034c3b4 100644
--- a/tests/test_flows/test_flow.py
+++ b/tests/test_flows/test_flow.py
@@ -48,7 +48,7 @@ def tearDown(self):
     def test_get_flow(self):
         # We need to use the production server here because 4024 is not the
         # test server
-        openml.config.server = self.production_server
+        self.use_production_server()
         flow = openml.flows.get_flow(4024)
         assert isinstance(flow, openml.OpenMLFlow)
 
@@ -82,7 +82,7 @@ def test_get_structure(self):
         # also responsible for testing: flow.get_subflow
         # We need to use the production server here because 4024 is not the
         # test server
-        openml.config.server = self.production_server
+        self.use_production_server()
         flow = openml.flows.get_flow(4024)
         flow_structure_name = flow.get_structure("name")
 
@@ -558,7 +558,7 @@ def test_extract_tags(self):
 
     @pytest.mark.production()
     def test_download_non_scikit_learn_flows(self):
-        openml.config.server = self.production_server
+        self.use_production_server()
 
         flow = openml.flows.get_flow(6742)
         assert isinstance(flow, openml.OpenMLFlow)
diff --git a/tests/test_flows/test_flow_functions.py b/tests/test_flows/test_flow_functions.py
index 4a9b03fd7..ef4759e54 100644
--- a/tests/test_flows/test_flow_functions.py
+++ b/tests/test_flows/test_flow_functions.py
@@ -48,7 +48,7 @@ def _check_flow(self, flow):
 
     @pytest.mark.production()
     def test_list_flows(self):
-        openml.config.server = self.production_server
+        self.use_production_server()
         # We can only perform a smoke test here because we test on dynamic
         # data from the internet...
         flows = openml.flows.list_flows()
         # 3000 as the number of flows on openml.org
         assert len(flows) >= 1500
         for flow in flows.to_dict(orient="index").values():
             self._check_flow(flow)
@@ -59,7 +59,7 @@ def test_list_flows(self):
 
     @pytest.mark.production()
     def test_list_flows_output_format(self):
-        openml.config.server = self.production_server
+        self.use_production_server()
         # We can only perform a smoke test here because we test on dynamic
         # data from the internet...
         flows = openml.flows.list_flows()
@@ -68,13 +68,14 @@ def test_list_flows_output_format(self):
 
     @pytest.mark.production()
     def test_list_flows_empty(self):
+        self.use_production_server()
         openml.config.server = self.production_server
         flows = openml.flows.list_flows(tag="NoOneEverUsesThisTag123")
         assert flows.empty
 
     @pytest.mark.production()
     def test_list_flows_by_tag(self):
-        openml.config.server = self.production_server
+        self.use_production_server()
         flows = openml.flows.list_flows(tag="weka")
         assert len(flows) >= 5
         for flow in flows.to_dict(orient="index").values():
@@ -82,7 +83,7 @@ def test_list_flows_by_tag(self):
 
     @pytest.mark.production()
     def test_list_flows_paginate(self):
-        openml.config.server = self.production_server
+        self.use_production_server()
         size = 10
         maximum = 100
         for i in range(0, maximum, size):
@@ -302,7 +303,7 @@ def test_sklearn_to_flow_list_of_lists(self):
     def test_get_flow1(self):
         # Regression test for issue #305
        # Basically, this checks that a flow without an external version can be loaded
-        openml.config.server = self.production_server
+        self.use_production_server()
         flow = openml.flows.get_flow(1)
         assert flow.external_version is None
 
@@ -335,7 +336,7 @@ def test_get_flow_reinstantiate_model_no_extension(self):
     )
     @pytest.mark.production()
     def test_get_flow_with_reinstantiate_strict_with_wrong_version_raises_exception(self):
-        openml.config.server = self.production_server
+        self.use_production_server()
         flow = 8175
         expected = "Trying to deserialize a model with dependency sklearn==0.19.1 not satisfied."
         self.assertRaisesRegex(
@@ -356,7 +357,7 @@ def test_get_flow_with_reinstantiate_strict_with_wrong_version_raises_exception(
     )
     @pytest.mark.production()
     def test_get_flow_reinstantiate_flow_not_strict_post_1(self):
-        openml.config.server = self.production_server
+        self.use_production_server()
         flow = openml.flows.get_flow(flow_id=19190, reinstantiate=True, strict_version=False)
         assert flow.flow_id is None
         assert "sklearn==1.0.0" not in flow.dependencies
@@ -370,7 +371,7 @@ def test_get_flow_reinstantiate_flow_not_strict_post_1(self):
     )
     @pytest.mark.production()
     def test_get_flow_reinstantiate_flow_not_strict_023_and_024(self):
-        openml.config.server = self.production_server
+        self.use_production_server()
         flow = openml.flows.get_flow(flow_id=18587, reinstantiate=True, strict_version=False)
         assert flow.flow_id is None
         assert "sklearn==0.23.1" not in flow.dependencies
@@ -382,7 +383,7 @@ def test_get_flow_reinstantiate_flow_not_strict_023_and_024(self):
     )
     @pytest.mark.production()
     def test_get_flow_reinstantiate_flow_not_strict_pre_023(self):
-        openml.config.server = self.production_server
+        self.use_production_server()
         flow = openml.flows.get_flow(flow_id=8175, reinstantiate=True, strict_version=False)
         assert flow.flow_id is None
         assert "sklearn==0.19.1" not in flow.dependencies
diff --git a/tests/test_runs/test_run_functions.py b/tests/test_runs/test_run_functions.py
index 7dff05cfc..b02acdf51 100644
--- a/tests/test_runs/test_run_functions.py
+++ b/tests/test_runs/test_run_functions.py
@@ -1083,7 +1083,7 @@ def test_local_run_metric_score(self):
 
     @pytest.mark.production()
     def test_online_run_metric_score(self):
-        openml.config.server = self.production_server
+        self.use_production_server()
 
         # important to use binary classification task,
         # due to assertions
@@ -1388,7 +1388,7 @@ def test__create_trace_from_arff(self):
     @pytest.mark.production()
     def test_get_run(self):
         # this run is not available on test
-        openml.config.server = self.production_server
+        self.use_production_server()
         run = openml.runs.get_run(473351)
         assert run.dataset_id == 357
         assert run.evaluations["f_measure"] == 0.841225
@@ -1424,7 +1424,7 @@ def _check_run(self, run):
     @pytest.mark.production()
     def test_get_runs_list(self):
         # TODO: comes from live, no such lists on test
-        openml.config.server = self.production_server
+        self.use_production_server()
         runs = openml.runs.list_runs(id=[2], display_errors=True)
         assert len(runs) == 1
         for run in runs.to_dict(orient="index").values():
@@ -1437,7 +1437,7 @@ def test_list_runs_empty(self):
     @pytest.mark.production()
     def test_get_runs_list_by_task(self):
         # TODO: comes from live, no such lists on test
-        openml.config.server = self.production_server
+        self.use_production_server()
         task_ids = [20]
         runs = openml.runs.list_runs(task=task_ids)
         assert len(runs) >= 590
@@ -1456,7 +1456,7 @@ def test_get_runs_list_by_task(self):
     @pytest.mark.production()
     def test_get_runs_list_by_uploader(self):
         # TODO: comes from live, no such lists on test
-        openml.config.server = self.production_server
+        self.use_production_server()
 
         # 29 is Dominik Kirchhoff
         uploader_ids = [29]
@@ -1478,7 +1478,7 @@ def test_get_runs_list_by_uploader(self):
     @pytest.mark.production()
     def test_get_runs_list_by_flow(self):
         # TODO: comes from live, no such lists on test
-        openml.config.server = self.production_server
+        self.use_production_server()
         flow_ids = [1154]
         runs = openml.runs.list_runs(flow=flow_ids)
         assert len(runs) >= 1
@@ -1497,7 +1497,7 @@ def test_get_runs_list_by_flow(self):
     @pytest.mark.production()
     def test_get_runs_pagination(self):
         # TODO: comes from live, no such lists on test
-        openml.config.server = self.production_server
+        self.use_production_server()
         uploader_ids = [1]
         size = 10
         max = 100
@@ -1510,7 +1510,7 @@ def test_get_runs_pagination(self):
     @pytest.mark.production()
     def test_get_runs_list_by_filters(self):
         # TODO: comes from live, no such lists on test
-        openml.config.server = self.production_server
+        self.use_production_server()
         ids = [505212, 6100]
         tasks = [2974, 339]
         uploaders_1 = [1, 2]
@@ -1548,7 +1548,8 @@ def test_get_runs_list_by_filters(self):
     def test_get_runs_list_by_tag(self):
         # TODO: comes from live, no such lists on test
         # Unit test works on production server only
-        openml.config.server = self.production_server
+
+        self.use_production_server()
         runs = openml.runs.list_runs(tag="curves")
         assert len(runs) >= 1
 
@@ -1663,7 +1664,7 @@ def test_run_flow_on_task_downloaded_flow(self):
     @pytest.mark.production()
     def test_format_prediction_non_supervised(self):
         # non-supervised tasks don't exist on the test server
-        openml.config.server = self.production_server
+        self.use_production_server()
         clustering = openml.tasks.get_task(126033, download_data=False)
         ignored_input = [0] * 5
         with pytest.raises(
diff --git a/tests/test_setups/test_setup_functions.py b/tests/test_setups/test_setup_functions.py
index b805ca9d3..6fd11638f 100644
--- a/tests/test_setups/test_setup_functions.py
+++ b/tests/test_setups/test_setup_functions.py
@@ -134,7 +134,7 @@ def test_get_setup(self):
 
     @pytest.mark.production()
     def test_setup_list_filter_flow(self):
-        openml.config.server = self.production_server
+        self.use_production_server()
 
         flow_id = 5873
 
@@ -153,7 +153,7 @@ def test_list_setups_empty(self):
 
     @pytest.mark.production()
     def test_list_setups_output_format(self):
-        openml.config.server = self.production_server
+        self.use_production_server()
         flow_id = 6794
         setups = openml.setups.list_setups(flow=flow_id, size=10)
         assert isinstance(setups, dict)
diff --git a/tests/test_study/test_study_functions.py b/tests/test_study/test_study_functions.py
index 22f5b0d03..40026592f 100644
--- a/tests/test_study/test_study_functions.py
+++ b/tests/test_study/test_study_functions.py
@@ -14,7 +14,7 @@ class TestStudyFunctions(TestBase):
 
     @pytest.mark.production()
     def test_get_study_old(self):
-        openml.config.server = self.production_server
+        self.use_production_server()
 
         study = openml.study.get_study(34)
         assert len(study.data) == 105
@@ -25,7 +25,7 @@ def test_get_study_old(self):
 
     @pytest.mark.production()
     def test_get_study_new(self):
-        openml.config.server = self.production_server
+        self.use_production_server()
 
         study = openml.study.get_study(123)
         assert len(study.data) == 299
@@ -36,7 +36,7 @@ def test_get_study_new(self):
 
     @pytest.mark.production()
     def test_get_openml100(self):
-        openml.config.server = self.production_server
+        self.use_production_server()
 
         study = openml.study.get_study("OpenML100", "tasks")
         assert isinstance(study, openml.study.OpenMLBenchmarkSuite)
@@ -46,7 +46,7 @@ def test_get_openml100(self):
 
     @pytest.mark.production()
     def test_get_study_error(self):
-        openml.config.server = self.production_server
+        self.use_production_server()
 
         with pytest.raises(
             ValueError, match="Unexpected entity type 'task' reported by the server, expected 'run'"
@@ -55,7 +55,7 @@ def test_get_study_error(self):
 
     @pytest.mark.production()
     def test_get_suite(self):
-        openml.config.server = self.production_server
+        self.use_production_server()
 
         study = openml.study.get_suite(99)
         assert len(study.data) == 72
@@ -66,7 +66,7 @@ def test_get_suite(self):
 
     @pytest.mark.production()
     def test_get_suite_error(self):
-        openml.config.server = self.production_server
+        self.use_production_server()
 
         with pytest.raises(
             ValueError, match="Unexpected entity type 'run' reported by the server, expected 'task'"
diff --git a/tests/test_tasks/test_clustering_task.py b/tests/test_tasks/test_clustering_task.py
index bc0876228..dcc024388 100644
--- a/tests/test_tasks/test_clustering_task.py
+++ b/tests/test_tasks/test_clustering_task.py
@@ -23,14 +23,14 @@ def setUp(self, n_levels: int = 1):
     @pytest.mark.production()
     def test_get_dataset(self):
         # no clustering tasks on test server
-        openml.config.server = self.production_server
+        self.use_production_server()
         task = openml.tasks.get_task(self.task_id)
         task.get_dataset()
 
     @pytest.mark.production()
     def test_download_task(self):
         # no clustering tasks on test server
-        openml.config.server = self.production_server
+        self.use_production_server()
         task = super().test_download_task()
         assert task.task_id == self.task_id
         assert task.task_type_id == TaskType.CLUSTERING

From 69875d70d351081e11b17460df88272f2c3f544b Mon Sep 17 00:00:00 2001
From: Jos van der Velde
Date: Sat, 20 Sep 2025 20:21:20 +0200
Subject: [PATCH 2/2] Precommit checks

---
 openml/testing.py | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/openml/testing.py b/openml/testing.py
index d6d2866c5..2003bb1b9 100644
--- a/openml/testing.py
+++ b/openml/testing.py
@@ -51,7 +51,6 @@ class TestBase(unittest.TestCase):
 
     # amueller's read/write key that he will throw away later
     apikey = "610344db6388d9ba34f6db45a3cf71de"
-    # creating logger for tracking files uploaded to test server
     logger = logging.getLogger("unit_tests_published_entities")
     logger.setLevel(logging.DEBUG)
 
@@ -109,9 +108,14 @@ def setUp(self, n_levels: int = 1, tmpdir_suffix: str = "") -> None:
         self.connection_n_retries = openml.config.connection_n_retries
         openml.config.set_retry_policy("robot", n_retries=20)
 
-    def use_production_server(self):
+    def use_production_server(self) -> None:
+        """
+        Use the production server for the OpenML API calls.
+
+        Please use this sparingly - it is better to use the test server.
+        """
         openml.config.server = self.production_server
-        openml.config.apikey = None
+        openml.config.apikey = ""
 
     def tearDown(self) -> None:
         """Tear down the test"""
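
For reference, a minimal sketch of how a production-backed test reads once this series is applied. The test class and method names below are illustrative and not part of the patch; use_production_server(), the pytest.mark.production marker, and openml.flows.get_flow(4024) are taken from the diff above.

    import pytest

    import openml
    from openml.testing import TestBase


    class IllustrativeProductionTest(TestBase):  # illustrative class, not part of the patch
        @pytest.mark.production()
        def test_get_flow_from_production(self):
            # Point openml.config at the production server and clear the test-server
            # API key, so the read-only call below never sends test credentials
            # to production.
            self.use_production_server()
            flow = openml.flows.get_flow(4024)
            assert isinstance(flow, openml.OpenMLFlow)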