9 changes: 9 additions & 0 deletions openml/testing.py
@@ -108,6 +108,15 @@ def setUp(self, n_levels: int = 1, tmpdir_suffix: str = "") -> None:
self.connection_n_retries = openml.config.connection_n_retries
openml.config.set_retry_policy("robot", n_retries=20)

def use_production_server(self) -> None:
"""
Use the production server for the OpenML API calls.

Please use this sparingly - it is better to use the test server.
"""
openml.config.server = self.production_server
openml.config.apikey = ""

def tearDown(self) -> None:
"""Tear down the test"""
os.chdir(self.cwd)
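For context, a minimal sketch of how the new `use_production_server` helper above is meant to be called from a test, mirroring the pattern used by the updated test modules below. The class name, dataset id, and assertion are illustrative only, and it assumes `TestBase` is importable from `openml.testing` as the existing test modules do:

```python
import pytest

import openml
from openml.testing import TestBase


class ExampleProductionTest(TestBase):
    """Illustrative only: shows the call pattern introduced by this PR."""

    @pytest.mark.production()
    def test_fetch_from_production(self):
        # Route this test's API calls to the production server; the helper
        # also clears the API key, so nothing can be written there.
        self.use_production_server()

        # Read-only lookup against production data (dataset 61 is "iris",
        # the same id/name pair the existing _name_to_id tests rely on).
        dataset = openml.datasets.get_dataset(61, download_data=False)
        assert dataset.name == "iris"
```

Clearing the API key inside the helper also means a production-server test cannot accidentally upload anything, which is the main risk the docstring warns about.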
4 changes: 2 additions & 2 deletions tests/test_datasets/test_dataset.py
@@ -24,7 +24,7 @@ class OpenMLDatasetTest(TestBase):

def setUp(self):
super().setUp()
openml.config.server = self.production_server
self.use_production_server()

# Load dataset id 2 - dataset 2 is interesting because it contains
# missing values, categorical features etc.
@@ -344,7 +344,7 @@ class OpenMLDatasetTestSparse(TestBase):

def setUp(self):
super().setUp()
openml.config.server = self.production_server
self.use_production_server()

self.sparse_dataset = openml.datasets.get_dataset(4136, download_data=False)

16 changes: 8 additions & 8 deletions tests/test_datasets/test_dataset_functions.py
@@ -139,7 +139,7 @@ def test_list_datasets_empty(self):
@pytest.mark.production()
def test_check_datasets_active(self):
# Have to test on live because there is no deactivated dataset on the test server.
openml.config.server = self.production_server
self.use_production_server()
active = openml.datasets.check_datasets_active(
[2, 17, 79],
raise_error_if_not_exist=False,
@@ -176,27 +176,27 @@ def test_illegal_length_tag(self):
@pytest.mark.production()
def test__name_to_id_with_deactivated(self):
"""Check that an activated dataset is returned if an earlier deactivated one exists."""
openml.config.server = self.production_server
self.use_production_server()
# /d/1 was deactivated
assert openml.datasets.functions._name_to_id("anneal") == 2
openml.config.server = self.test_server

@pytest.mark.production()
def test__name_to_id_with_multiple_active(self):
"""With multiple active datasets, retrieve the least recent active."""
openml.config.server = self.production_server
self.use_production_server()
assert openml.datasets.functions._name_to_id("iris") == 61

@pytest.mark.production()
def test__name_to_id_with_version(self):
"""With multiple active datasets, retrieve the least recent active."""
openml.config.server = self.production_server
self.use_production_server()
assert openml.datasets.functions._name_to_id("iris", version=3) == 969

@pytest.mark.production()
def test__name_to_id_with_multiple_active_error(self):
"""With multiple active datasets, retrieve the least recent active."""
openml.config.server = self.production_server
self.use_production_server()
self.assertRaisesRegex(
ValueError,
"Multiple active datasets exist with name 'iris'.",
@@ -272,12 +272,12 @@ def test_get_dataset_uint8_dtype(self):
@pytest.mark.production()
def test_get_dataset_cannot_access_private_data(self):
# Issue324 Properly handle private datasets when trying to access them
openml.config.server = self.production_server
self.use_production_server()
self.assertRaises(OpenMLPrivateDatasetError, openml.datasets.get_dataset, 45)

@pytest.mark.skip("Need to find dataset name of private dataset")
def test_dataset_by_name_cannot_access_private_data(self):
openml.config.server = self.production_server
self.use_production_server()
self.assertRaises(OpenMLPrivateDatasetError, openml.datasets.get_dataset, "NAME_GOES_HERE")

def test_get_dataset_lazy_all_functions(self):
Expand Down Expand Up @@ -1501,7 +1501,7 @@ def test_data_fork(self):
@pytest.mark.production()
def test_list_datasets_with_high_size_parameter(self):
# Testing on prod since concurrent deletion of uploaded datasets makes the test fail
openml.config.server = self.production_server
self.use_production_server()

datasets_a = openml.datasets.list_datasets()
datasets_b = openml.datasets.list_datasets(size=np.inf)
20 changes: 10 additions & 10 deletions tests/test_evaluations/test_evaluation_functions.py
@@ -52,7 +52,7 @@ def _check_list_evaluation_setups(self, **kwargs):

@pytest.mark.production()
def test_evaluation_list_filter_task(self):
openml.config.server = self.production_server
self.use_production_server()

task_id = 7312

@@ -72,7 +72,7 @@ def test_evaluation_list_filter_task(self):

@pytest.mark.production()
def test_evaluation_list_filter_uploader_ID_16(self):
openml.config.server = self.production_server
self.use_production_server()

uploader_id = 16
evaluations = openml.evaluations.list_evaluations(
@@ -87,7 +87,7 @@ def test_evaluation_list_filter_uploader_ID_16(self):

@pytest.mark.production()
def test_evaluation_list_filter_uploader_ID_10(self):
openml.config.server = self.production_server
self.use_production_server()

setup_id = 10
evaluations = openml.evaluations.list_evaluations(
@@ -106,7 +106,7 @@ def test_evaluation_list_filter_uploader_ID_10(self):

@pytest.mark.production()
def test_evaluation_list_filter_flow(self):
openml.config.server = self.production_server
self.use_production_server()

flow_id = 100

@@ -126,7 +126,7 @@ def test_evaluation_list_filter_flow(self):

@pytest.mark.production()
def test_evaluation_list_filter_run(self):
openml.config.server = self.production_server
self.use_production_server()

run_id = 12

@@ -146,7 +146,7 @@ def test_evaluation_list_filter_run(self):

@pytest.mark.production()
def test_evaluation_list_limit(self):
openml.config.server = self.production_server
self.use_production_server()

evaluations = openml.evaluations.list_evaluations(
"predictive_accuracy",
@@ -164,7 +164,7 @@ def test_list_evaluations_empty(self):

@pytest.mark.production()
def test_evaluation_list_per_fold(self):
openml.config.server = self.production_server
self.use_production_server()
size = 1000
task_ids = [6]
uploader_ids = [1]
@@ -202,7 +202,7 @@ def test_evaluation_list_per_fold(self):

@pytest.mark.production()
def test_evaluation_list_sort(self):
openml.config.server = self.production_server
self.use_production_server()
size = 10
task_id = 6
# Get all evaluations of the task
@@ -239,7 +239,7 @@ def test_list_evaluation_measures(self):

@pytest.mark.production()
def test_list_evaluations_setups_filter_flow(self):
openml.config.server = self.production_server
self.use_production_server()
flow_id = [405]
size = 100
evals = self._check_list_evaluation_setups(flows=flow_id, size=size)
@@ -257,7 +257,7 @@ def test_list_evaluations_setups_filter_flow(self):

@pytest.mark.production()
def test_list_evaluations_setups_filter_task(self):
openml.config.server = self.production_server
self.use_production_server()
task_id = [6]
size = 121
self._check_list_evaluation_setups(tasks=task_id, size=size)
6 changes: 3 additions & 3 deletions tests/test_flows/test_flow.py
@@ -48,7 +48,7 @@ def tearDown(self):
def test_get_flow(self):
# We need to use the production server here because 4024 is not on the
# test server
openml.config.server = self.production_server
self.use_production_server()

flow = openml.flows.get_flow(4024)
assert isinstance(flow, openml.OpenMLFlow)
@@ -82,7 +82,7 @@ def test_get_structure(self):
# also responsible for testing: flow.get_subflow
# We need to use the production server here because 4024 is not on the
# test server
openml.config.server = self.production_server
self.use_production_server()

flow = openml.flows.get_flow(4024)
flow_structure_name = flow.get_structure("name")
@@ -558,7 +558,7 @@ def test_extract_tags(self):

@pytest.mark.production()
def test_download_non_scikit_learn_flows(self):
openml.config.server = self.production_server
self.use_production_server()

flow = openml.flows.get_flow(6742)
assert isinstance(flow, openml.OpenMLFlow)
19 changes: 10 additions & 9 deletions tests/test_flows/test_flow_functions.py
@@ -48,7 +48,7 @@ def _check_flow(self, flow):

@pytest.mark.production()
def test_list_flows(self):
openml.config.server = self.production_server
self.use_production_server()
# We can only perform a smoke test here because we test on dynamic
# data from the internet...
flows = openml.flows.list_flows()
@@ -59,7 +59,7 @@ def test_list_flows_output_format(self):

@pytest.mark.production()
def test_list_flows_output_format(self):
openml.config.server = self.production_server
self.use_production_server()
# We can only perform a smoke test here because we test on dynamic
# data from the internet...
flows = openml.flows.list_flows()
@@ -68,21 +68,22 @@ def test_list_flows_empty(self):

@pytest.mark.production()
def test_list_flows_empty(self):
self.use_production_server()
openml.config.server = self.production_server
flows = openml.flows.list_flows(tag="NoOneEverUsesThisTag123")
assert flows.empty

@pytest.mark.production()
def test_list_flows_by_tag(self):
openml.config.server = self.production_server
self.use_production_server()
flows = openml.flows.list_flows(tag="weka")
assert len(flows) >= 5
for flow in flows.to_dict(orient="index").values():
self._check_flow(flow)

@pytest.mark.production()
def test_list_flows_paginate(self):
openml.config.server = self.production_server
self.use_production_server()
size = 10
maximum = 100
for i in range(0, maximum, size):
@@ -302,7 +303,7 @@ def test_sklearn_to_flow_list_of_lists(self):
def test_get_flow1(self):
# Regression test for issue #305
# Basically, this checks that a flow without an external version can be loaded
openml.config.server = self.production_server
self.use_production_server()
flow = openml.flows.get_flow(1)
assert flow.external_version is None

@@ -335,7 +336,7 @@ def test_get_flow_reinstantiate_model_no_extension(self):
)
@pytest.mark.production()
def test_get_flow_with_reinstantiate_strict_with_wrong_version_raises_exception(self):
openml.config.server = self.production_server
self.use_production_server()
flow = 8175
expected = "Trying to deserialize a model with dependency sklearn==0.19.1 not satisfied."
self.assertRaisesRegex(
@@ -356,7 +357,7 @@ def test_get_flow_reinstantiate_flow_not_strict_post_1(self):
)
@pytest.mark.production()
def test_get_flow_reinstantiate_flow_not_strict_post_1(self):
openml.config.server = self.production_server
self.use_production_server()
flow = openml.flows.get_flow(flow_id=19190, reinstantiate=True, strict_version=False)
assert flow.flow_id is None
assert "sklearn==1.0.0" not in flow.dependencies
@@ -370,7 +371,7 @@ def test_get_flow_reinstantiate_flow_not_strict_023_and_024(self):
)
@pytest.mark.production()
def test_get_flow_reinstantiate_flow_not_strict_023_and_024(self):
openml.config.server = self.production_server
self.use_production_server()
flow = openml.flows.get_flow(flow_id=18587, reinstantiate=True, strict_version=False)
assert flow.flow_id is None
assert "sklearn==0.23.1" not in flow.dependencies
@@ -382,7 +383,7 @@ def test_get_flow_reinstantiate_flow_not_strict_pre_023(self):
)
@pytest.mark.production()
def test_get_flow_reinstantiate_flow_not_strict_pre_023(self):
openml.config.server = self.production_server
self.use_production_server()
flow = openml.flows.get_flow(flow_id=8175, reinstantiate=True, strict_version=False)
assert flow.flow_id is None
assert "sklearn==0.19.1" not in flow.dependencies
21 changes: 11 additions & 10 deletions tests/test_runs/test_run_functions.py
@@ -1083,7 +1083,7 @@ def test_local_run_metric_score(self):

@pytest.mark.production()
def test_online_run_metric_score(self):
openml.config.server = self.production_server
self.use_production_server()

# important to use binary classification task,
# due to assertions
@@ -1388,7 +1388,7 @@ def test__create_trace_from_arff(self):
@pytest.mark.production()
def test_get_run(self):
# this run is not available on test
openml.config.server = self.production_server
self.use_production_server()
run = openml.runs.get_run(473351)
assert run.dataset_id == 357
assert run.evaluations["f_measure"] == 0.841225
@@ -1424,7 +1424,7 @@ def _check_run(self, run):
@pytest.mark.production()
def test_get_runs_list(self):
# TODO: comes from live, no such lists on test
openml.config.server = self.production_server
self.use_production_server()
runs = openml.runs.list_runs(id=[2], display_errors=True)
assert len(runs) == 1
for run in runs.to_dict(orient="index").values():
@@ -1437,7 +1437,7 @@ def test_list_runs_empty(self):
@pytest.mark.production()
def test_get_runs_list_by_task(self):
# TODO: comes from live, no such lists on test
openml.config.server = self.production_server
self.use_production_server()
task_ids = [20]
runs = openml.runs.list_runs(task=task_ids)
assert len(runs) >= 590
@@ -1456,7 +1456,7 @@ def test_get_runs_list_by_task(self):
@pytest.mark.production()
def test_get_runs_list_by_uploader(self):
# TODO: comes from live, no such lists on test
openml.config.server = self.production_server
self.use_production_server()
# 29 is Dominik Kirchhoff
uploader_ids = [29]

@@ -1478,7 +1478,7 @@ def test_get_runs_list_by_uploader(self):
@pytest.mark.production()
def test_get_runs_list_by_flow(self):
# TODO: comes from live, no such lists on test
openml.config.server = self.production_server
self.use_production_server()
flow_ids = [1154]
runs = openml.runs.list_runs(flow=flow_ids)
assert len(runs) >= 1
@@ -1497,7 +1497,7 @@ def test_get_runs_pagination(self):
@pytest.mark.production()
def test_get_runs_pagination(self):
# TODO: comes from live, no such lists on test
openml.config.server = self.production_server
self.use_production_server()
uploader_ids = [1]
size = 10
max = 100
@@ -1510,7 +1510,7 @@ def test_get_runs_list_by_filters(self):
@pytest.mark.production()
def test_get_runs_list_by_filters(self):
# TODO: comes from live, no such lists on test
openml.config.server = self.production_server
self.use_production_server()
ids = [505212, 6100]
tasks = [2974, 339]
uploaders_1 = [1, 2]
@@ -1548,7 +1548,7 @@ def test_get_runs_list_by_tag(self):
def test_get_runs_list_by_tag(self):
# TODO: comes from live, no such lists on test
# Unit test works on production server only
openml.config.server = self.production_server

self.use_production_server()
runs = openml.runs.list_runs(tag="curves")
assert len(runs) >= 1

@@ -1663,7 +1664,7 @@ def test_run_flow_on_task_downloaded_flow(self):
@pytest.mark.production()
def test_format_prediction_non_supervised(self):
# non-supervised tasks don't exist on the test server
openml.config.server = self.production_server
self.use_production_server()
clustering = openml.tasks.get_task(126033, download_data=False)
ignored_input = [0] * 5
with pytest.raises(
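All of the switched tests carry the `@pytest.mark.production()` marker. As a hedged sketch (the repository may already register this marker in its own pytest or packaging configuration), this is how such a custom marker is conventionally registered so the production-only tests can be deselected during local runs:

```python
# conftest.py (illustrative sketch, not taken from this PR)


def pytest_configure(config):
    # Register the "production" marker so pytest does not warn about an
    # unknown marker, and so tests hitting the production server can be
    # skipped with:  pytest -m "not production"
    config.addinivalue_line(
        "markers",
        "production: tests that talk to the production OpenML server",
    )
```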