From ec57d8e5ab9f6a09856e0863fed7386971649b85 Mon Sep 17 00:00:00 2001 From: geetu040 Date: Tue, 6 Jan 2026 12:49:07 +0500 Subject: [PATCH 1/3] add marker --- tests/test_datasets/test_dataset.py | 9 ++- tests/test_datasets/test_dataset_functions.py | 67 ++++++++++++++++--- .../test_evaluation_functions.py | 2 + tests/test_flows/test_flow.py | 13 ++-- tests/test_flows/test_flow_functions.py | 7 +- tests/test_openml/test_api_calls.py | 3 + tests/test_runs/test_run.py | 11 +-- tests/test_runs/test_run_functions.py | 50 ++++++++------ tests/test_setups/test_setup_functions.py | 11 +-- tests/test_study/test_study_functions.py | 5 ++ tests/test_tasks/test_classification_task.py | 7 +- tests/test_tasks/test_clustering_task.py | 2 + tests/test_tasks/test_learning_curve_task.py | 6 +- tests/test_tasks/test_regression_task.py | 4 +- tests/test_tasks/test_supervised_task.py | 1 + tests/test_tasks/test_task.py | 3 +- tests/test_tasks/test_task_functions.py | 16 ++++- tests/test_tasks/test_task_methods.py | 2 + tests/test_utils/test_utils.py | 20 +++--- 19 files changed, 170 insertions(+), 69 deletions(-) diff --git a/tests/test_datasets/test_dataset.py b/tests/test_datasets/test_dataset.py index 66e9b8554..6dc4c7d5d 100644 --- a/tests/test_datasets/test_dataset.py +++ b/tests/test_datasets/test_dataset.py @@ -278,6 +278,7 @@ def test_equality_comparison(self): self.assertNotEqual(self.titanic, "Wrong_object") +@pytest.mark.uses_test_server() def test_tagging(): dataset = openml.datasets.get_dataset(125, download_data=False) @@ -294,7 +295,7 @@ def test_tagging(): datasets = openml.datasets.list_datasets(tag=tag) assert datasets.empty -@pytest.mark.xfail(reason="failures_issue_1544") +@pytest.mark.uses_test_server() def test_get_feature_with_ontology_data_id_11(): # test on car dataset, which has built-in ontology references dataset = openml.datasets.get_dataset(11) @@ -303,6 +304,7 @@ def test_get_feature_with_ontology_data_id_11(): assert len(dataset.features[2].ontologies) >= 1 assert len(dataset.features[3].ontologies) >= 1 +@pytest.mark.uses_test_server() def test_add_remove_ontology_to_dataset(): did = 1 feature_index = 1 @@ -310,6 +312,7 @@ def test_add_remove_ontology_to_dataset(): openml.datasets.functions.data_feature_add_ontology(did, feature_index, ontology) openml.datasets.functions.data_feature_remove_ontology(did, feature_index, ontology) +@pytest.mark.uses_test_server() def test_add_same_ontology_multiple_features(): did = 1 ontology = "https://www.openml.org/unittest/" + str(time()) @@ -318,6 +321,7 @@ def test_add_same_ontology_multiple_features(): openml.datasets.functions.data_feature_add_ontology(did, i, ontology) +@pytest.mark.uses_test_server() def test_add_illegal_long_ontology(): did = 1 ontology = "http://www.google.com/" + ("a" * 257) @@ -329,6 +333,7 @@ def test_add_illegal_long_ontology(): +@pytest.mark.uses_test_server() def test_add_illegal_url_ontology(): did = 1 ontology = "not_a_url" + str(time()) @@ -400,6 +405,7 @@ def test_get_sparse_categorical_data_id_395(self): assert len(feature.nominal_values) == 25 +@pytest.mark.uses_test_server() def test__read_features(mocker, workdir, static_cache_dir): """Test we read the features from the xml if no cache pickle is available. 
This test also does some simple checks to verify that the features are read correctly @@ -431,6 +437,7 @@ def test__read_features(mocker, workdir, static_cache_dir): assert pickle_mock.dump.call_count == 1 +@pytest.mark.uses_test_server() def test__read_qualities(static_cache_dir, workdir, mocker): """Test we read the qualities from the xml if no cache pickle is available. This test also does some minor checks to ensure that the qualities are read correctly. diff --git a/tests/test_datasets/test_dataset_functions.py b/tests/test_datasets/test_dataset_functions.py index f8cb1943c..c41664ba7 100644 --- a/tests/test_datasets/test_dataset_functions.py +++ b/tests/test_datasets/test_dataset_functions.py @@ -107,6 +107,7 @@ def _check_datasets(self, datasets): for did in datasets: self._check_dataset(datasets[did]) + @pytest.mark.uses_test_server() def test_tag_untag_dataset(self): tag = "test_tag_%d" % random.randint(1, 1000000) all_tags = _tag_entity("data", 1, tag) @@ -114,10 +115,12 @@ def test_tag_untag_dataset(self): all_tags = _tag_entity("data", 1, tag, untag=True) assert tag not in all_tags + @pytest.mark.uses_test_server() def test_list_datasets_length(self): datasets = openml.datasets.list_datasets() assert len(datasets) >= 100 + @pytest.mark.uses_test_server() def test_list_datasets_paginate(self): size = 10 max = 100 @@ -132,6 +135,7 @@ def test_list_datasets_paginate(self): categories=["in_preparation", "active", "deactivated"], ) + @pytest.mark.uses_test_server() def test_list_datasets_empty(self): datasets = openml.datasets.list_datasets(tag="NoOneWouldUseThisTagAnyway") assert datasets.empty @@ -155,6 +159,7 @@ def test_check_datasets_active(self): ) openml.config.server = self.test_server + @pytest.mark.uses_test_server() def test_illegal_character_tag(self): dataset = openml.datasets.get_dataset(1) tag = "illegal_tag&" @@ -164,6 +169,7 @@ def test_illegal_character_tag(self): except openml.exceptions.OpenMLServerException as e: assert e.code == 477 + @pytest.mark.uses_test_server() def test_illegal_length_tag(self): dataset = openml.datasets.get_dataset(1) tag = "a" * 65 @@ -205,6 +211,7 @@ def test__name_to_id_with_multiple_active_error(self): error_if_multiple=True, ) + @pytest.mark.uses_test_server() def test__name_to_id_name_does_not_exist(self): """With multiple active datasets, retrieve the least recent active.""" self.assertRaisesRegex( @@ -214,6 +221,7 @@ def test__name_to_id_name_does_not_exist(self): dataset_name="does_not_exist", ) + @pytest.mark.uses_test_server() def test__name_to_id_version_does_not_exist(self): """With multiple active datasets, retrieve the least recent active.""" self.assertRaisesRegex( @@ -224,6 +232,7 @@ def test__name_to_id_version_does_not_exist(self): version=100000, ) + @pytest.mark.uses_test_server() def test_get_datasets_by_name(self): # did 1 and 2 on the test server: dids = ["anneal", "kr-vs-kp"] @@ -231,6 +240,7 @@ def test_get_datasets_by_name(self): assert len(datasets) == 2 _assert_datasets_retrieved_successfully([1, 2]) + @pytest.mark.uses_test_server() def test_get_datasets_by_mixed(self): # did 1 and 2 on the test server: dids = ["anneal", 2] @@ -238,12 +248,14 @@ def test_get_datasets_by_mixed(self): assert len(datasets) == 2 _assert_datasets_retrieved_successfully([1, 2]) + @pytest.mark.uses_test_server() def test_get_datasets(self): dids = [1, 2] datasets = openml.datasets.get_datasets(dids) assert len(datasets) == 2 _assert_datasets_retrieved_successfully([1, 2]) + @pytest.mark.uses_test_server() def 
test_get_dataset_by_name(self): dataset = openml.datasets.get_dataset("anneal") assert type(dataset) == OpenMLDataset @@ -262,6 +274,7 @@ def test_get_dataset_download_all_files(self): # test_get_dataset_lazy raise NotImplementedError + @pytest.mark.uses_test_server() def test_get_dataset_uint8_dtype(self): dataset = openml.datasets.get_dataset(1) assert type(dataset) == OpenMLDataset @@ -280,7 +293,7 @@ def test_dataset_by_name_cannot_access_private_data(self): self.use_production_server() self.assertRaises(OpenMLPrivateDatasetError, openml.datasets.get_dataset, "NAME_GOES_HERE") - @pytest.mark.xfail(reason="failures_issue_1544", strict=False) + @pytest.mark.uses_test_server() def test_get_dataset_lazy_all_functions(self): """Test that all expected functionality is available without downloading the dataset.""" dataset = openml.datasets.get_dataset(1) @@ -310,24 +323,28 @@ def ensure_absence_of_real_data(): assert classes == ["1", "2", "3", "4", "5", "U"] ensure_absence_of_real_data() + @pytest.mark.uses_test_server() def test_get_dataset_sparse(self): dataset = openml.datasets.get_dataset(102) X, *_ = dataset.get_data() assert isinstance(X, pd.DataFrame) assert all(isinstance(col, pd.SparseDtype) for col in X.dtypes) + @pytest.mark.uses_test_server() def test_download_rowid(self): # Smoke test which checks that the dataset has the row-id set correctly did = 44 dataset = openml.datasets.get_dataset(did) assert dataset.row_id_attribute == "Counter" + @pytest.mark.uses_test_server() def test__get_dataset_description(self): description = _get_dataset_description(self.workdir, 2) assert isinstance(description, dict) description_xml_path = os.path.join(self.workdir, "description.xml") assert os.path.exists(description_xml_path) + @pytest.mark.uses_test_server() def test__getarff_path_dataset_arff(self): openml.config.set_root_cache_directory(self.static_cache_dir) description = _get_dataset_description(self.workdir, 2) @@ -391,6 +408,7 @@ def test__download_minio_file_works_with_bucket_subdirectory(self): @mock.patch("openml._api_calls._download_minio_file") + @pytest.mark.uses_test_server() def test__get_dataset_parquet_is_cached(self, patch): openml.config.set_root_cache_directory(self.static_cache_dir) patch.side_effect = RuntimeError( @@ -431,18 +449,21 @@ def test__getarff_md5_issue(self): openml.config.connection_n_retries = n + @pytest.mark.uses_test_server() def test__get_dataset_features(self): features_file = _get_dataset_features_file(self.workdir, 2) assert isinstance(features_file, Path) features_xml_path = self.workdir / "features.xml" assert features_xml_path.exists() + @pytest.mark.uses_test_server() def test__get_dataset_qualities(self): qualities = _get_dataset_qualities_file(self.workdir, 2) assert isinstance(qualities, Path) qualities_xml_path = self.workdir / "qualities.xml" assert qualities_xml_path.exists() + @pytest.mark.uses_test_server() def test_get_dataset_force_refresh_cache(self): did_cache_dir = _create_cache_directory_for_id( DATASETS_CACHE_DIR_NAME, @@ -465,6 +486,7 @@ def test_get_dataset_force_refresh_cache(self): did_cache_dir, ) + @pytest.mark.uses_test_server() def test_get_dataset_force_refresh_cache_clean_start(self): did_cache_dir = _create_cache_directory_for_id( DATASETS_CACHE_DIR_NAME, @@ -501,12 +523,14 @@ def test_deletion_of_cache_dir(self): # get_dataset_description is the only data guaranteed to be downloaded @mock.patch("openml.datasets.functions._get_dataset_description") + @pytest.mark.uses_test_server() def 
test_deletion_of_cache_dir_faulty_download(self, patch): patch.side_effect = Exception("Boom!") self.assertRaisesRegex(Exception, "Boom!", openml.datasets.get_dataset, dataset_id=1) datasets_cache_dir = os.path.join(self.workdir, "org", "openml", "test", "datasets") assert len(os.listdir(datasets_cache_dir)) == 0 + @pytest.mark.uses_test_server() def test_publish_dataset(self): # lazy loading not possible as we need the arff-file. openml.datasets.get_dataset(3, download_data=True) @@ -532,6 +556,7 @@ def test_publish_dataset(self): ) assert isinstance(dataset.dataset_id, int) + @pytest.mark.uses_test_server() def test__retrieve_class_labels(self): openml.config.set_root_cache_directory(self.static_cache_dir) labels = openml.datasets.get_dataset(2).retrieve_class_labels() @@ -548,6 +573,7 @@ def test__retrieve_class_labels(self): labels = custom_ds.retrieve_class_labels(target_name=custom_ds.features[31].name) assert labels == ["COIL", "SHEET"] + @pytest.mark.uses_test_server() def test_upload_dataset_with_url(self): dataset = OpenMLDataset( f"{self._get_sentinel()}-UploadTestWithURL", @@ -574,6 +600,7 @@ def _assert_status_of_dataset(self, *, did: int, status: str): assert result[did]["status"] == status @pytest.mark.flaky() + @pytest.mark.uses_test_server() def test_data_status(self): dataset = OpenMLDataset( f"{self._get_sentinel()}-UploadTestWithURL", @@ -665,7 +692,7 @@ def test_attributes_arff_from_df_unknown_dtype(self): with pytest.raises(ValueError, match=err_msg): attributes_arff_from_df(df) - @pytest.mark.xfail(reason="failures_issue_1544", strict=False) + @pytest.mark.uses_test_server() def test_create_dataset_numpy(self): data = np.array([[1, 2, 3], [1.2, 2.5, 3.8], [2, 5, 8], [0, 1, 0]]).T @@ -699,6 +726,7 @@ def test_create_dataset_numpy(self): ), "Uploaded arff does not match original one" assert _get_online_dataset_format(dataset.id) == "arff", "Wrong format for dataset" + @pytest.mark.uses_test_server() def test_create_dataset_list(self): data = [ ["a", "sunny", 85.0, 85.0, "FALSE", "no"], @@ -753,7 +781,7 @@ def test_create_dataset_list(self): ), "Uploaded ARFF does not match original one" assert _get_online_dataset_format(dataset.id) == "arff", "Wrong format for dataset" - @pytest.mark.xfail(reason="failures_issue_1544", strict=False) + @pytest.mark.uses_test_server() def test_create_dataset_sparse(self): # test the scipy.sparse.coo_matrix sparse_data = scipy.sparse.coo_matrix( @@ -856,6 +884,7 @@ def test_create_invalid_dataset(self): param["data"] = data[0] self.assertRaises(ValueError, create_dataset, **param) + @pytest.mark.uses_test_server() def test_get_online_dataset_arff(self): dataset_id = 100 # Australian # lazy loading not used as arff file is checked. 
@@ -871,7 +900,7 @@ def test_get_online_dataset_arff(self): return_type=arff.DENSE if d_format == "arff" else arff.COO, ), "ARFF files are not equal" - @pytest.mark.xfail(reason="failures_issue_1544", strict=False) + @pytest.mark.uses_test_server() def test_topic_api_error(self): # Check server exception when non-admin accessses apis self.assertRaisesRegex( @@ -890,6 +919,7 @@ def test_topic_api_error(self): topic="business", ) + @pytest.mark.uses_test_server() def test_get_online_dataset_format(self): # Phoneme dataset dataset_id = 77 @@ -899,7 +929,7 @@ def test_get_online_dataset_format(self): dataset_id ), "The format of the ARFF files is different" - @pytest.mark.xfail(reason="failures_issue_1544", strict=False) + @pytest.mark.uses_test_server() def test_create_dataset_pandas(self): data = [ ["a", "sunny", 85.0, 85.0, "FALSE", "no"], @@ -1124,7 +1154,7 @@ def test_ignore_attributes_dataset(self): paper_url=paper_url, ) - @pytest.mark.xfail(reason="failures_issue_1544", strict=False) + @pytest.mark.uses_test_server() def test_publish_fetch_ignore_attribute(self): """Test to upload and retrieve dataset and check ignore_attributes""" data = [ @@ -1243,7 +1273,7 @@ def test_create_dataset_row_id_attribute_error(self): paper_url=paper_url, ) - @pytest.mark.xfail(reason="failures_issue_1544", strict=False) + @pytest.mark.uses_test_server() def test_create_dataset_row_id_attribute_inference(self): # meta-information name = f"{self._get_sentinel()}-pandas_testing_dataset" @@ -1334,11 +1364,13 @@ def test_create_dataset_attributes_auto_without_df(self): paper_url=paper_url, ) + @pytest.mark.uses_test_server() def test_list_qualities(self): qualities = openml.datasets.list_qualities() assert isinstance(qualities, list) is True assert all(isinstance(q, str) for q in qualities) is True + @pytest.mark.uses_test_server() def test_get_dataset_cache_format_pickle(self): dataset = openml.datasets.get_dataset(1) dataset.get_data() @@ -1354,6 +1386,7 @@ def test_get_dataset_cache_format_pickle(self): assert len(categorical) == X.shape[1] assert len(attribute_names) == X.shape[1] + @pytest.mark.uses_test_server() def test_get_dataset_cache_format_feather(self): # This test crashed due to using the parquet file by default, which is downloaded # from minio. However, there is a mismatch between OpenML test server and minio IDs. 
@@ -1386,6 +1419,7 @@ def test_get_dataset_cache_format_feather(self): assert len(categorical) == X.shape[1] assert len(attribute_names) == X.shape[1] + @pytest.mark.uses_test_server() def test_data_edit_non_critical_field(self): # Case 1 # All users can edit non-critical fields of datasets @@ -1407,7 +1441,7 @@ def test_data_edit_non_critical_field(self): edited_dataset = openml.datasets.get_dataset(did) assert edited_dataset.description == desc - @pytest.mark.xfail(reason="failures_issue_1544", strict=False) + @pytest.mark.uses_test_server() def test_data_edit_critical_field(self): # Case 2 # only owners (or admin) can edit all critical fields of datasets @@ -1434,6 +1468,7 @@ def test_data_edit_critical_field(self): os.path.join(self.workdir, "org", "openml", "test", "datasets", str(did)), ) + @pytest.mark.uses_test_server() def test_data_edit_requires_field(self): # Check server exception when no field to edit is provided self.assertRaisesRegex( @@ -1446,6 +1481,7 @@ def test_data_edit_requires_field(self): data_id=64, # blood-transfusion-service-center ) + @pytest.mark.uses_test_server() def test_data_edit_requires_valid_dataset(self): # Check server exception when unknown dataset is provided self.assertRaisesRegex( @@ -1456,7 +1492,7 @@ def test_data_edit_requires_valid_dataset(self): description="xor operation dataset", ) - @pytest.mark.xfail(reason="failures_issue_1544", strict=False) + @pytest.mark.uses_test_server() def test_data_edit_cannot_edit_critical_field_if_dataset_has_task(self): # Need to own a dataset to be able to edit meta-data # Will be creating a forked version of an existing dataset to allow the unit test user @@ -1483,6 +1519,7 @@ def test_data_edit_cannot_edit_critical_field_if_dataset_has_task(self): default_target_attribute="y", ) + @pytest.mark.uses_test_server() def test_edit_data_user_cannot_edit_critical_field_of_other_users_dataset(self): # Check server exception when a non-owner or non-admin tries to edit critical fields self.assertRaisesRegex( @@ -1494,6 +1531,7 @@ def test_edit_data_user_cannot_edit_critical_field_of_other_users_dataset(self): default_target_attribute="y", ) + @pytest.mark.uses_test_server() def test_data_fork(self): did = 1 result = fork_dataset(did) @@ -1785,6 +1823,7 @@ def all_datasets(): return openml.datasets.list_datasets() +@pytest.mark.uses_test_server() def test_list_datasets(all_datasets: pd.DataFrame): # We can only perform a smoke test here because we test on dynamic # data from the internet... 
@@ -1793,42 +1832,49 @@ def test_list_datasets(all_datasets: pd.DataFrame): _assert_datasets_have_id_and_valid_status(all_datasets) +@pytest.mark.uses_test_server() def test_list_datasets_by_tag(all_datasets: pd.DataFrame): tag_datasets = openml.datasets.list_datasets(tag="study_14") assert 0 < len(tag_datasets) < len(all_datasets) _assert_datasets_have_id_and_valid_status(tag_datasets) +@pytest.mark.uses_test_server() def test_list_datasets_by_size(): datasets = openml.datasets.list_datasets(size=5) assert len(datasets) == 5 _assert_datasets_have_id_and_valid_status(datasets) +@pytest.mark.uses_test_server() def test_list_datasets_by_number_instances(all_datasets: pd.DataFrame): small_datasets = openml.datasets.list_datasets(number_instances="5..100") assert 0 < len(small_datasets) <= len(all_datasets) _assert_datasets_have_id_and_valid_status(small_datasets) +@pytest.mark.uses_test_server() def test_list_datasets_by_number_features(all_datasets: pd.DataFrame): wide_datasets = openml.datasets.list_datasets(number_features="50..100") assert 8 <= len(wide_datasets) < len(all_datasets) _assert_datasets_have_id_and_valid_status(wide_datasets) +@pytest.mark.uses_test_server() def test_list_datasets_by_number_classes(all_datasets: pd.DataFrame): five_class_datasets = openml.datasets.list_datasets(number_classes="5") assert 3 <= len(five_class_datasets) < len(all_datasets) _assert_datasets_have_id_and_valid_status(five_class_datasets) +@pytest.mark.uses_test_server() def test_list_datasets_by_number_missing_values(all_datasets: pd.DataFrame): na_datasets = openml.datasets.list_datasets(number_missing_values="5..100") assert 5 <= len(na_datasets) < len(all_datasets) _assert_datasets_have_id_and_valid_status(na_datasets) +@pytest.mark.uses_test_server() def test_list_datasets_combined_filters(all_datasets: pd.DataFrame): combined_filter_datasets = openml.datasets.list_datasets( tag="study_14", @@ -1901,6 +1947,7 @@ def isolate_for_test(): ("with_data", "with_qualities", "with_features"), itertools.product([True, False], repeat=3), ) +@pytest.mark.uses_test_server() def test_get_dataset_lazy_behavior( isolate_for_test, with_data: bool, with_qualities: bool, with_features: bool ): @@ -1927,6 +1974,7 @@ def test_get_dataset_lazy_behavior( ) +@pytest.mark.uses_test_server() def test_get_dataset_with_invalid_id() -> None: INVALID_ID = 123819023109238 # Well, at some point this will probably be valid... with pytest.raises(OpenMLServerNoResult, match="Unknown dataset") as e: @@ -1954,6 +2002,7 @@ def test_read_features_from_xml_with_whitespace() -> None: assert dict[1].nominal_values == [" - 50000.", " 50000+."] +@pytest.mark.uses_test_server() def test_get_dataset_parquet(requests_mock, test_files_directory): # Parquet functionality is disabled on the test server # There is no parquet-copy of the test server yet. 
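Note: this patch marks each server-dependent test individually, which keeps the selection explicit per test. Where every test in a module needs the test server, pytest's module-level pytestmark would be a more compact alternative; a minimal sketch of that option (an illustration only, not what this patch does):

    # Hypothetical alternative: one module-level marker applies to every test
    # in the file, instead of decorating each function/method individually.
    import pytest

    pytestmark = pytest.mark.uses_test_server
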
diff --git a/tests/test_evaluations/test_evaluation_functions.py b/tests/test_evaluations/test_evaluation_functions.py index ffd3d9f78..7009217d6 100644 --- a/tests/test_evaluations/test_evaluation_functions.py +++ b/tests/test_evaluations/test_evaluation_functions.py @@ -155,6 +155,7 @@ def test_evaluation_list_limit(self): ) assert len(evaluations) == 100 + @pytest.mark.uses_test_server() def test_list_evaluations_empty(self): evaluations = openml.evaluations.list_evaluations("unexisting_measure") if len(evaluations) > 0: @@ -232,6 +233,7 @@ def test_evaluation_list_sort(self): test_output = sorted(unsorted_output, reverse=True) assert test_output[:size] == sorted_output + @pytest.mark.uses_test_server() def test_list_evaluation_measures(self): measures = openml.evaluations.list_evaluation_measures() assert isinstance(measures, list) is True diff --git a/tests/test_flows/test_flow.py b/tests/test_flows/test_flow.py index da719d058..99cee6f87 100644 --- a/tests/test_flows/test_flow.py +++ b/tests/test_flows/test_flow.py @@ -102,6 +102,7 @@ def test_get_structure(self): subflow = flow.get_subflow(structure) assert subflow.flow_id == sub_flow_id + @pytest.mark.uses_test_server() def test_tagging(self): flows = openml.flows.list_flows(size=1) flow_id = flows["id"].iloc[0] @@ -119,6 +120,7 @@ def test_tagging(self): flows = openml.flows.list_flows(tag=tag) assert len(flows) == 0 + @pytest.mark.uses_test_server() def test_from_xml_to_xml(self): # Get the raw xml thing # TODO maybe get this via get_flow(), which would have to be refactored @@ -178,7 +180,7 @@ def test_to_xml_from_xml(self): assert new_flow is not flow @pytest.mark.sklearn() - @pytest.mark.xfail(reason="failures_issue_1544", strict=False) + @pytest.mark.uses_test_server() def test_publish_flow(self): flow = openml.OpenMLFlow( name="sklearn.dummy.DummyClassifier", @@ -220,7 +222,7 @@ def test_publish_existing_flow(self, flow_exists_mock): ) @pytest.mark.sklearn() - @pytest.mark.xfail(reason="failures_issue_1544", strict=False) + @pytest.mark.uses_test_server() def test_publish_flow_with_similar_components(self): clf = sklearn.ensemble.VotingClassifier( [("lr", sklearn.linear_model.LogisticRegression(solver="lbfgs"))], @@ -271,7 +273,7 @@ def test_publish_flow_with_similar_components(self): TestBase.logger.info(f"collected from {__file__.split('/')[-1]}: {flow3.flow_id}") @pytest.mark.sklearn() - @pytest.mark.xfail(reason="failures_issue_1544", strict=False) + @pytest.mark.uses_test_server() def test_semi_legal_flow(self): # TODO: Test if parameters are set correctly! # should not throw error as it contains two differentiable forms of @@ -363,6 +365,7 @@ def test_illegal_flow(self): ) self.assertRaises(ValueError, self.extension.model_to_flow, illegal) + @pytest.mark.uses_test_server() def test_nonexisting_flow_exists(self): def get_sentinel(): # Create a unique prefix for the flow. 
Necessary because the flow @@ -380,7 +383,7 @@ def get_sentinel(): assert not flow_id @pytest.mark.sklearn() - @pytest.mark.xfail(reason="failures_issue_1544", strict=False) + @pytest.mark.uses_test_server() def test_existing_flow_exists(self): # create a flow nb = sklearn.naive_bayes.GaussianNB() @@ -421,7 +424,7 @@ def test_existing_flow_exists(self): assert downloaded_flow_id == flow.flow_id @pytest.mark.sklearn() - @pytest.mark.xfail(reason="failures_issue_1544", strict=False) + @pytest.mark.uses_test_server() def test_sklearn_to_upload_to_flow(self): iris = sklearn.datasets.load_iris() X = iris.data diff --git a/tests/test_flows/test_flow_functions.py b/tests/test_flows/test_flow_functions.py index 0be65ceac..46bc36a94 100644 --- a/tests/test_flows/test_flow_functions.py +++ b/tests/test_flows/test_flow_functions.py @@ -274,12 +274,12 @@ def test_are_flows_equal_ignore_if_older(self): assert_flows_equal(flow, flow, ignore_parameter_values_on_older_children=None) @pytest.mark.sklearn() - @pytest.mark.xfail(reason="failures_issue_1544", strict=False) @unittest.skipIf( Version(sklearn.__version__) < Version("0.20"), reason="OrdinalEncoder introduced in 0.20. " "No known models with list of lists parameters in older versions.", ) + @pytest.mark.uses_test_server() def test_sklearn_to_flow_list_of_lists(self): from sklearn.preprocessing import OrdinalEncoder @@ -308,6 +308,7 @@ def test_get_flow1(self): assert flow.external_version is None @pytest.mark.sklearn() + @pytest.mark.uses_test_server() def test_get_flow_reinstantiate_model(self): model = ensemble.RandomForestClassifier(n_estimators=33) extension = openml.extensions.get_extension_by_model(model) @@ -319,6 +320,7 @@ def test_get_flow_reinstantiate_model(self): downloaded_flow = openml.flows.get_flow(flow.flow_id, reinstantiate=True) assert isinstance(downloaded_flow.model, sklearn.ensemble.RandomForestClassifier) + @pytest.mark.uses_test_server() def test_get_flow_reinstantiate_model_no_extension(self): # Flow 10 is a WEKA flow self.assertRaisesRegex( @@ -389,7 +391,7 @@ def test_get_flow_reinstantiate_flow_not_strict_pre_023(self): assert "sklearn==0.19.1" not in flow.dependencies @pytest.mark.sklearn() - @pytest.mark.xfail(reason="failures_issue_1544", strict=False) + @pytest.mark.uses_test_server() def test_get_flow_id(self): if self.long_version: list_all = openml.utils._list_all @@ -424,6 +426,7 @@ def test_get_flow_id(self): pytest.skip(reason="Not sure why there should only be one version of this flow.") assert flow_ids_exact_version_True == flow_ids_exact_version_False + @pytest.mark.uses_test_server() def test_delete_flow(self): flow = openml.OpenMLFlow( name="sklearn.dummy.DummyClassifier", diff --git a/tests/test_openml/test_api_calls.py b/tests/test_openml/test_api_calls.py index da6857b6e..a295259ef 100644 --- a/tests/test_openml/test_api_calls.py +++ b/tests/test_openml/test_api_calls.py @@ -15,12 +15,14 @@ class TestConfig(openml.testing.TestBase): + @pytest.mark.uses_test_server() def test_too_long_uri(self): with pytest.raises(openml.exceptions.OpenMLServerError, match="URI too long!"): openml.datasets.list_datasets(data_id=list(range(10000))) @unittest.mock.patch("time.sleep") @unittest.mock.patch("requests.Session") + @pytest.mark.uses_test_server() def test_retry_on_database_error(self, Session_class_mock, _): response_mock = unittest.mock.Mock() response_mock.text = ( @@ -115,6 +117,7 @@ def test_download_minio_failure(mock_minio, tmp_path: Path) -> None: ("task/42", "delete"), # 460 ], ) 
+@pytest.mark.uses_test_server() def test_authentication_endpoints_requiring_api_key_show_relevant_help_link( endpoint: str, method: str, diff --git a/tests/test_runs/test_run.py b/tests/test_runs/test_run.py index 71651d431..1a66b76c0 100644 --- a/tests/test_runs/test_run.py +++ b/tests/test_runs/test_run.py @@ -25,6 +25,7 @@ class TestRun(TestBase): # Splitting not helpful, these test's don't rely on the server and take # less than 1 seconds + @pytest.mark.uses_test_server() def test_tagging(self): runs = openml.runs.list_runs(size=1) assert not runs.empty, "Test server state is incorrect" @@ -118,7 +119,7 @@ def _check_array(array, type_): assert run_prime_trace_content is None @pytest.mark.sklearn() - @pytest.mark.xfail(reason="failures_issue_1544", strict=False) + @pytest.mark.uses_test_server() def test_to_from_filesystem_vanilla(self): model = Pipeline( [ @@ -154,7 +155,7 @@ def test_to_from_filesystem_vanilla(self): @pytest.mark.sklearn() @pytest.mark.flaky() - @pytest.mark.xfail(reason="failures_issue_1544", strict=False) + @pytest.mark.uses_test_server() def test_to_from_filesystem_search(self): model = Pipeline( [ @@ -189,7 +190,7 @@ def test_to_from_filesystem_search(self): ) @pytest.mark.sklearn() - @pytest.mark.xfail(reason="failures_issue_1544", strict=False) + @pytest.mark.uses_test_server() def test_to_from_filesystem_no_model(self): model = Pipeline( [("imputer", SimpleImputer(strategy="mean")), ("classifier", DummyClassifier())], @@ -295,7 +296,7 @@ def assert_run_prediction_data(task, run, model): assert_method(y_test, saved_y_test) @pytest.mark.sklearn() - @pytest.mark.xfail(reason="failures_issue_1544", strict=False) + @pytest.mark.uses_test_server() def test_publish_with_local_loaded_flow(self): """ Publish a run tied to a local flow after it has first been saved to @@ -339,7 +340,7 @@ def test_publish_with_local_loaded_flow(self): openml.runs.get_run(loaded_run.run_id) @pytest.mark.sklearn() - @pytest.mark.xfail(reason="failures_issue_1544", strict=False) + @pytest.mark.uses_test_server() def test_offline_and_online_run_identical(self): extension = SklearnExtension() diff --git a/tests/test_runs/test_run_functions.py b/tests/test_runs/test_run_functions.py index 305d859d9..98b97045a 100644 --- a/tests/test_runs/test_run_functions.py +++ b/tests/test_runs/test_run_functions.py @@ -398,6 +398,7 @@ def _check_sample_evaluations( assert evaluation < max_time_allowed @pytest.mark.sklearn() + @pytest.mark.uses_test_server() def test_run_regression_on_classif_task(self): task_id = 259 # collins; crossvalidation; has numeric targets @@ -413,8 +414,8 @@ def test_run_regression_on_classif_task(self): task=task, ) - @pytest.mark.xfail(reason="failures_issue_1544", strict=False) @pytest.mark.sklearn() + @pytest.mark.uses_test_server() def test_check_erronous_sklearn_flow_fails(self): task_id = 115 # diabetes; crossvalidation task = openml.tasks.get_task(task_id) @@ -882,8 +883,8 @@ def test_run_and_upload_maskedarrays(self): ########################################################################## - @pytest.mark.xfail(reason="failures_issue_1544", strict=False) @pytest.mark.sklearn() + @pytest.mark.uses_test_server() def test_learning_curve_task_1(self): task_id = 801 # diabates dataset num_test_instances = 6144 # for learning curve @@ -907,8 +908,8 @@ def test_learning_curve_task_1(self): ) self._check_sample_evaluations(run.sample_evaluations, num_repeats, num_folds, num_samples) - @pytest.mark.xfail(reason="failures_issue_1544", strict=False) @pytest.mark.sklearn() + 
@pytest.mark.uses_test_server() def test_learning_curve_task_2(self): task_id = 801 # diabates dataset num_test_instances = 6144 # for learning curve @@ -944,12 +945,12 @@ def test_learning_curve_task_2(self): ) self._check_sample_evaluations(run.sample_evaluations, num_repeats, num_folds, num_samples) - @pytest.mark.xfail(reason="failures_issue_1544", strict=False) @pytest.mark.sklearn() @unittest.skipIf( Version(sklearn.__version__) < Version("0.21"), reason="Pipelines don't support indexing (used for the assert check)", ) + @pytest.mark.uses_test_server() def test_initialize_cv_from_run(self): randomsearch = Pipeline( [ @@ -1023,8 +1024,8 @@ def _test_local_evaluations(self, run): assert alt_scores[idx] >= 0 assert alt_scores[idx] <= 1 - @pytest.mark.xfail(reason="failures_issue_1544", strict=False) @pytest.mark.sklearn() + @pytest.mark.uses_test_server() def test_local_run_swapped_parameter_order_model(self): clf = DecisionTreeClassifier() australian_task = 595 # Australian; crossvalidation @@ -1039,12 +1040,12 @@ def test_local_run_swapped_parameter_order_model(self): self._test_local_evaluations(run) - @pytest.mark.xfail(reason="failures_issue_1544", strict=False) @pytest.mark.sklearn() @unittest.skipIf( Version(sklearn.__version__) < Version("0.20"), reason="SimpleImputer doesn't handle mixed type DataFrame as input", ) + @pytest.mark.uses_test_server() def test_local_run_swapped_parameter_order_flow(self): # construct sci-kit learn classifier clf = Pipeline( @@ -1068,12 +1069,12 @@ def test_local_run_swapped_parameter_order_flow(self): self._test_local_evaluations(run) - @pytest.mark.xfail(reason="failures_issue_1544", strict=False) @pytest.mark.sklearn() @unittest.skipIf( Version(sklearn.__version__) < Version("0.20"), reason="SimpleImputer doesn't handle mixed type DataFrame as input", ) + @pytest.mark.uses_test_server() def test_local_run_metric_score(self): # construct sci-kit learn classifier clf = Pipeline( @@ -1106,12 +1107,12 @@ def test_online_run_metric_score(self): self._test_local_evaluations(run) - @pytest.mark.xfail(reason="failures_issue_1544", strict=False) @pytest.mark.sklearn() @unittest.skipIf( Version(sklearn.__version__) < Version("0.20"), reason="SimpleImputer doesn't handle mixed type DataFrame as input", ) + @pytest.mark.uses_test_server() def test_initialize_model_from_run(self): clf = sklearn.pipeline.Pipeline( steps=[ @@ -1168,12 +1169,12 @@ def test_initialize_model_from_run(self): assert flowS.components["Imputer"].parameters["strategy"] == '"most_frequent"' assert flowS.components["VarianceThreshold"].parameters["threshold"] == "0.05" - @pytest.mark.xfail(reason="failures_issue_1544", strict=False) @pytest.mark.sklearn() @unittest.skipIf( Version(sklearn.__version__) < Version("0.20"), reason="SimpleImputer doesn't handle mixed type DataFrame as input", ) + @pytest.mark.uses_test_server() def test__run_exists(self): # would be better to not sentinel these clfs, # so we do not have to perform the actual runs @@ -1228,8 +1229,8 @@ def test__run_exists(self): run_ids = run_exists(task.task_id, setup_exists) assert run_ids, (run_ids, clf) - @pytest.mark.xfail(reason="failures_issue_1544", strict=False) @pytest.mark.sklearn() + @pytest.mark.uses_test_server() def test_run_with_illegal_flow_id(self): # check the case where the user adds an illegal flow id to a # non-existing flo @@ -1248,8 +1249,8 @@ def test_run_with_illegal_flow_id(self): avoid_duplicate_runs=True, ) - @pytest.mark.xfail(reason="failures_issue_1544", strict=False) @pytest.mark.sklearn() + 
@pytest.mark.uses_test_server() def test_run_with_illegal_flow_id_after_load(self): # Same as `test_run_with_illegal_flow_id`, but test this error is also # caught if the run is stored to and loaded from disk first. @@ -1281,6 +1282,7 @@ def test_run_with_illegal_flow_id_after_load(self): TestBase.logger.info(f"collected from test_run_functions: {loaded_run.run_id}") @pytest.mark.sklearn() + @pytest.mark.uses_test_server() def test_run_with_illegal_flow_id_1(self): # Check the case where the user adds an illegal flow id to an existing # flow. Comes to a different value error than the previous test @@ -1305,8 +1307,8 @@ def test_run_with_illegal_flow_id_1(self): avoid_duplicate_runs=True, ) - @pytest.mark.xfail(reason="failures_issue_1544", strict=False) @pytest.mark.sklearn() + @pytest.mark.uses_test_server() def test_run_with_illegal_flow_id_1_after_load(self): # Same as `test_run_with_illegal_flow_id_1`, but test this error is # also caught if the run is stored to and loaded from disk first. @@ -1344,12 +1346,12 @@ def test_run_with_illegal_flow_id_1_after_load(self): loaded_run.publish, ) - @pytest.mark.xfail(reason="failures_issue_1544", strict=False) @pytest.mark.sklearn() @unittest.skipIf( Version(sklearn.__version__) < Version("0.20"), reason="OneHotEncoder cannot handle mixed type DataFrame as input", ) + @pytest.mark.uses_test_server() def test__run_task_get_arffcontent(self): task = openml.tasks.get_task(7) # kr-vs-kp; crossvalidation num_instances = 3196 @@ -1450,6 +1452,7 @@ def test_get_runs_list(self): for run in runs.to_dict(orient="index").values(): self._check_run(run) + @pytest.mark.uses_test_server() def test_list_runs_empty(self): runs = openml.runs.list_runs(task=[0]) assert runs.empty @@ -1572,12 +1575,12 @@ def test_get_runs_list_by_tag(self): runs = openml.runs.list_runs(tag="curves", size=2) assert len(runs) >= 1 - @pytest.mark.xfail(reason="failures_issue_1544", strict=False) @pytest.mark.sklearn() @unittest.skipIf( Version(sklearn.__version__) < Version("0.20"), reason="columntransformer introduction in 0.20.0", ) + @pytest.mark.uses_test_server() def test_run_on_dataset_with_missing_labels_dataframe(self): # Check that _run_task_get_arffcontent works when one of the class # labels only declared in the arff file, but is not present in the @@ -1609,12 +1612,12 @@ def test_run_on_dataset_with_missing_labels_dataframe(self): # repeat, fold, row_id, 6 confidences, prediction and correct label assert len(row) == 12 - @pytest.mark.xfail(reason="failures_issue_1544", strict=False) @pytest.mark.sklearn() @unittest.skipIf( Version(sklearn.__version__) < Version("0.20"), reason="columntransformer introduction in 0.20.0", ) + @pytest.mark.uses_test_server() def test_run_on_dataset_with_missing_labels_array(self): # Check that _run_task_get_arffcontent works when one of the class # labels only declared in the arff file, but is not present in the @@ -1653,6 +1656,7 @@ def test_run_on_dataset_with_missing_labels_array(self): # repeat, fold, row_id, 6 confidences, prediction and correct label assert len(row) == 12 + @pytest.mark.uses_test_server() def test_get_cached_run(self): openml.config.set_root_cache_directory(self.static_cache_dir) openml.runs.functions._get_cached_run(1) @@ -1662,8 +1666,8 @@ def test_get_uncached_run(self): with pytest.raises(openml.exceptions.OpenMLCacheException): openml.runs.functions._get_cached_run(10) - @pytest.mark.xfail(reason="failures_issue_1544", strict=False) @pytest.mark.sklearn() + @pytest.mark.uses_test_server() def 
test_run_flow_on_task_downloaded_flow(self): model = sklearn.ensemble.RandomForestClassifier(n_estimators=33) flow = self.extension.model_to_flow(model) @@ -1694,6 +1698,7 @@ def test_format_prediction_non_supervised(self): ): format_prediction(clustering, *ignored_input) + @pytest.mark.uses_test_server() def test_format_prediction_classification_no_probabilities(self): classification = openml.tasks.get_task( self.TEST_SERVER_TASK_SIMPLE["task_id"], @@ -1703,7 +1708,7 @@ def test_format_prediction_classification_no_probabilities(self): with pytest.raises(ValueError, match="`proba` is required for classification task"): format_prediction(classification, *ignored_input, proba=None) - @pytest.mark.xfail(reason="failures_issue_1544", strict=False) + @pytest.mark.uses_test_server() def test_format_prediction_classification_incomplete_probabilities(self): classification = openml.tasks.get_task( self.TEST_SERVER_TASK_SIMPLE["task_id"], @@ -1714,6 +1719,7 @@ def test_format_prediction_classification_incomplete_probabilities(self): with pytest.raises(ValueError, match="Each class should have a predicted probability"): format_prediction(classification, *ignored_input, proba=incomplete_probabilities) + @pytest.mark.uses_test_server() def test_format_prediction_task_without_classlabels_set(self): classification = openml.tasks.get_task( self.TEST_SERVER_TASK_SIMPLE["task_id"], @@ -1724,7 +1730,7 @@ def test_format_prediction_task_without_classlabels_set(self): with pytest.raises(ValueError, match="The classification task must have class labels set"): format_prediction(classification, *ignored_input, proba={}) - @pytest.mark.xfail(reason="failures_issue_1544", strict=False) + @pytest.mark.uses_test_server() def test_format_prediction_task_learning_curve_sample_not_set(self): learning_curve = openml.tasks.get_task(801, download_data=False) # diabetes;crossvalidation probabilities = {c: 0.2 for c in learning_curve.class_labels} @@ -1732,7 +1738,7 @@ def test_format_prediction_task_learning_curve_sample_not_set(self): with pytest.raises(ValueError, match="`sample` can not be none for LearningCurveTask"): format_prediction(learning_curve, *ignored_input, sample=None, proba=probabilities) - @pytest.mark.xfail(reason="failures_issue_1544", strict=False) + @pytest.mark.uses_test_server() def test_format_prediction_task_regression(self): task_meta_data = self.TEST_SERVER_TASK_REGRESSION["task_meta_data"] _task_id = check_task_existence(**task_meta_data) @@ -1762,12 +1768,12 @@ def test_format_prediction_task_regression(self): - @pytest.mark.xfail(reason="failures_issue_1544", strict=False) @unittest.skipIf( Version(sklearn.__version__) < Version("0.20"), reason="SimpleImputer doesn't handle mixed type DataFrame as input", ) @pytest.mark.sklearn() + @pytest.mark.uses_test_server() def test_delete_run(self): rs = np.random.randint(1, 2**31 - 1) clf = sklearn.pipeline.Pipeline( @@ -1863,12 +1869,12 @@ def test_delete_unknown_run(mock_delete, test_files_directory, test_api_key): @pytest.mark.sklearn() -@pytest.mark.xfail(reason="failures_issue_1544", strict=False) @unittest.skipIf( Version(sklearn.__version__) < Version("0.21"), reason="couldn't perform local tests successfully w/o bloating RAM", ) @mock.patch("openml_sklearn.SklearnExtension._prevent_optimize_n_jobs") +@pytest.mark.uses_test_server() def test__run_task_get_arffcontent_2(parallel_mock): """Tests if a run executed in parallel is collated correctly.""" task = openml.tasks.get_task(7) # Supervised Classification on kr-vs-kp @@ -1940,7 +1946,6 @@ def 
test__run_task_get_arffcontent_2(parallel_mock): ) -@pytest.mark.xfail(reason="failures_issue_1544", strict=False) @pytest.mark.sklearn() @unittest.skipIf( Version(sklearn.__version__) < Version("0.21"), @@ -1960,6 +1965,7 @@ def test__run_task_get_arffcontent_2(parallel_mock): (-1, "threading", 10), # the threading backend does preserve mocks even with parallelizing ] ) +@pytest.mark.uses_test_server() def test_joblib_backends(parallel_mock, n_jobs, backend, call_count): """Tests evaluation of a run using various joblib backends and n_jobs.""" if backend is None: diff --git a/tests/test_setups/test_setup_functions.py b/tests/test_setups/test_setup_functions.py index a3b698a37..a0469f9a5 100644 --- a/tests/test_setups/test_setup_functions.py +++ b/tests/test_setups/test_setup_functions.py @@ -34,8 +34,8 @@ def setUp(self): self.extension = SklearnExtension() super().setUp() - @pytest.mark.xfail(reason="failures_issue_1544", strict=False) @pytest.mark.sklearn() + @pytest.mark.uses_test_server() def test_nonexisting_setup_exists(self): # first publish a non-existing flow sentinel = get_sentinel() @@ -82,8 +82,8 @@ def _existing_setup_exists(self, classif): setup_id = openml.setups.setup_exists(flow) assert setup_id == run.setup_id - @pytest.mark.xfail(reason="failures_issue_1544", strict=False) @pytest.mark.sklearn() + @pytest.mark.uses_test_server() def test_existing_setup_exists_1(self): def side_effect(self): self.var_smoothing = 1e-9 @@ -98,14 +98,14 @@ def side_effect(self): nb = sklearn.naive_bayes.GaussianNB() self._existing_setup_exists(nb) - @pytest.mark.xfail(reason="failures_issue_1544", strict=False) @pytest.mark.sklearn() + @pytest.mark.uses_test_server() def test_exisiting_setup_exists_2(self): # Check a flow with one hyperparameter self._existing_setup_exists(sklearn.naive_bayes.GaussianNB()) - @pytest.mark.xfail(reason="failures_issue_1544", strict=False) @pytest.mark.sklearn() + @pytest.mark.uses_test_server() def test_existing_setup_exists_3(self): # Check a flow with many hyperparameters self._existing_setup_exists( @@ -147,6 +147,7 @@ def test_setup_list_filter_flow(self): for setup_id in setups: assert setups[setup_id].flow_id == flow_id + @pytest.mark.uses_test_server() def test_list_setups_empty(self): setups = openml.setups.list_setups(setup=[0]) if len(setups) > 0: @@ -167,6 +168,7 @@ def test_list_setups_output_format(self): assert isinstance(setups, pd.DataFrame) assert len(setups) == 10 + @pytest.mark.uses_test_server() def test_setuplist_offset(self): size = 10 setups = openml.setups.list_setups(offset=0, size=size) @@ -178,6 +180,7 @@ def test_setuplist_offset(self): assert len(all) == size * 2 + @pytest.mark.uses_test_server() def test_get_cached_setup(self): openml.config.set_root_cache_directory(self.static_cache_dir) openml.setups.functions._get_cached_setup(1) diff --git a/tests/test_study/test_study_functions.py b/tests/test_study/test_study_functions.py index 40026592f..839e74cf3 100644 --- a/tests/test_study/test_study_functions.py +++ b/tests/test_study/test_study_functions.py @@ -73,6 +73,7 @@ def test_get_suite_error(self): ): openml.study.get_suite(123) + @pytest.mark.uses_test_server() def test_publish_benchmark_suite(self): fixture_alias = None fixture_name = "unit tested benchmark suite" @@ -141,13 +142,16 @@ def _test_publish_empty_study_is_allowed(self, explicit: bool): assert study_downloaded.main_entity_type == "run" assert study_downloaded.runs is None + @pytest.mark.uses_test_server() def test_publish_empty_study_explicit(self): 
self._test_publish_empty_study_is_allowed(explicit=True) + @pytest.mark.uses_test_server() def test_publish_empty_study_implicit(self): self._test_publish_empty_study_is_allowed(explicit=False) @pytest.mark.flaky() + @pytest.mark.uses_test_server() def test_publish_study(self): # get some random runs to attach run_list = openml.evaluations.list_evaluations("predictive_accuracy", size=10) @@ -217,6 +221,7 @@ def test_publish_study(self): res = openml.study.delete_study(study.id) assert res + @pytest.mark.uses_test_server() def test_study_attach_illegal(self): run_list = openml.runs.list_runs(size=10) assert len(run_list) == 10 diff --git a/tests/test_tasks/test_classification_task.py b/tests/test_tasks/test_classification_task.py index 5528cabf2..fed0c0a00 100644 --- a/tests/test_tasks/test_classification_task.py +++ b/tests/test_tasks/test_classification_task.py @@ -18,7 +18,7 @@ def setUp(self, n_levels: int = 1): self.task_type = TaskType.SUPERVISED_CLASSIFICATION self.estimation_procedure = 5 - @pytest.mark.xfail(reason="failures_issue_1544", strict=False) + @pytest.mark.uses_test_server() def test_download_task(self): task = super().test_download_task() assert task.task_id == self.task_id @@ -26,14 +26,13 @@ def test_download_task(self): assert task.dataset_id == 20 assert task.estimation_procedure_id == self.estimation_procedure - @pytest.mark.xfail(reason="failures_issue_1544", strict=False) + @pytest.mark.uses_test_server() def test_class_labels(self): task = get_task(self.task_id) assert task.class_labels == ["tested_negative", "tested_positive"] -@pytest.mark.xfail(reason="failures_issue_1544", strict=False) -@pytest.mark.server() +@pytest.mark.uses_test_server() def test_get_X_and_Y(): task = get_task(119) X, Y = task.get_X_and_y() diff --git a/tests/test_tasks/test_clustering_task.py b/tests/test_tasks/test_clustering_task.py index dcc024388..2bbb015c6 100644 --- a/tests/test_tasks/test_clustering_task.py +++ b/tests/test_tasks/test_clustering_task.py @@ -28,6 +28,7 @@ def test_get_dataset(self): task.get_dataset() @pytest.mark.production() + @pytest.mark.uses_test_server() def test_download_task(self): # no clustering tasks on test server self.use_production_server() @@ -36,6 +37,7 @@ def test_download_task(self): assert task.task_type_id == TaskType.CLUSTERING assert task.dataset_id == 36 + @pytest.mark.uses_test_server() def test_upload_task(self): compatible_datasets = self._get_compatible_rand_dataset() for i in range(100): diff --git a/tests/test_tasks/test_learning_curve_task.py b/tests/test_tasks/test_learning_curve_task.py index 5f4b3e0ab..fbcbfe9bf 100644 --- a/tests/test_tasks/test_learning_curve_task.py +++ b/tests/test_tasks/test_learning_curve_task.py @@ -18,7 +18,7 @@ def setUp(self, n_levels: int = 1): self.task_type = TaskType.LEARNING_CURVE self.estimation_procedure = 13 - @pytest.mark.xfail(reason="failures_issue_1544", strict=False) + @pytest.mark.uses_test_server() def test_get_X_and_Y(self): X, Y = super().test_get_X_and_Y() assert X.shape == (768, 8) @@ -27,14 +27,14 @@ def test_get_X_and_Y(self): assert isinstance(Y, pd.Series) assert pd.api.types.is_categorical_dtype(Y) - @pytest.mark.xfail(reason="failures_issue_1544", strict=False) + @pytest.mark.uses_test_server() def test_download_task(self): task = super().test_download_task() assert task.task_id == self.task_id assert task.task_type_id == TaskType.LEARNING_CURVE assert task.dataset_id == 20 - @pytest.mark.xfail(reason="failures_issue_1544", strict=False) + @pytest.mark.uses_test_server() def 
test_class_labels(self): task = get_task(self.task_id) assert task.class_labels == ["tested_negative", "tested_positive"] diff --git a/tests/test_tasks/test_regression_task.py b/tests/test_tasks/test_regression_task.py index 0cd2d96e2..a834cdf0f 100644 --- a/tests/test_tasks/test_regression_task.py +++ b/tests/test_tasks/test_regression_task.py @@ -49,7 +49,7 @@ def setUp(self, n_levels: int = 1): self.task_type = TaskType.SUPERVISED_REGRESSION - @pytest.mark.xfail(reason="failures_issue_1544", strict=False) + @pytest.mark.uses_test_server() def test_get_X_and_Y(self): X, Y = super().test_get_X_and_Y() assert X.shape == (194, 32) @@ -58,7 +58,7 @@ def test_get_X_and_Y(self): assert isinstance(Y, pd.Series) assert pd.api.types.is_numeric_dtype(Y) - @pytest.mark.xfail(reason="failures_issue_1544", strict=False) + @pytest.mark.uses_test_server() def test_download_task(self): task = super().test_download_task() assert task.task_id == self.task_id diff --git a/tests/test_tasks/test_supervised_task.py b/tests/test_tasks/test_supervised_task.py index e5a17a72b..3f7b06ee4 100644 --- a/tests/test_tasks/test_supervised_task.py +++ b/tests/test_tasks/test_supervised_task.py @@ -28,6 +28,7 @@ def setUpClass(cls): def setUp(self, n_levels: int = 1): super().setUp() + @pytest.mark.uses_test_server() def test_get_X_and_Y(self) -> tuple[pd.DataFrame, pd.Series]: task = get_task(self.task_id) X, Y = task.get_X_and_y() diff --git a/tests/test_tasks/test_task.py b/tests/test_tasks/test_task.py index 67f715d2b..b77782847 100644 --- a/tests/test_tasks/test_task.py +++ b/tests/test_tasks/test_task.py @@ -32,10 +32,11 @@ def setUpClass(cls): def setUp(self, n_levels: int = 1): super().setUp() + @pytest.mark.uses_test_server() def test_download_task(self): return get_task(self.task_id) - @pytest.mark.xfail(reason="failures_issue_1544", strict=False) + @pytest.mark.uses_test_server() def test_upload_task(self): # We don't know if the task in question already exists, so we try a few times. 
Checking # beforehand would not be an option because a concurrent unit test could potentially diff --git a/tests/test_tasks/test_task_functions.py b/tests/test_tasks/test_task_functions.py index 110459711..a46e597bc 100644 --- a/tests/test_tasks/test_task_functions.py +++ b/tests/test_tasks/test_task_functions.py @@ -26,6 +26,7 @@ def setUp(self): def tearDown(self): super().tearDown() + @pytest.mark.uses_test_server() def test__get_cached_tasks(self): openml.config.set_root_cache_directory(self.static_cache_dir) tasks = openml.tasks.functions._get_cached_tasks() @@ -33,6 +34,7 @@ def test__get_cached_tasks(self): assert len(tasks) == 3 assert isinstance(next(iter(tasks.values())), OpenMLTask) + @pytest.mark.uses_test_server() def test__get_cached_task(self): openml.config.set_root_cache_directory(self.static_cache_dir) task = openml.tasks.functions._get_cached_task(1) @@ -47,6 +49,7 @@ def test__get_cached_task_not_cached(self): 2, ) + @pytest.mark.uses_test_server() def test__get_estimation_procedure_list(self): estimation_procedures = openml.tasks.functions._get_estimation_procedure_list() assert isinstance(estimation_procedures, list) @@ -69,6 +72,7 @@ def _check_task(self, task): assert isinstance(task["status"], str) assert task["status"] in ["in_preparation", "active", "deactivated"] + @pytest.mark.uses_test_server() def test_list_tasks_by_type(self): num_curves_tasks = 198 # number is flexible, check server if fails ttid = TaskType.LEARNING_CURVE @@ -78,15 +82,18 @@ def test_list_tasks_by_type(self): assert ttid == task["ttid"] self._check_task(task) + @pytest.mark.uses_test_server() def test_list_tasks_length(self): ttid = TaskType.LEARNING_CURVE tasks = openml.tasks.list_tasks(task_type=ttid) assert len(tasks) > 100 + @pytest.mark.uses_test_server() def test_list_tasks_empty(self): tasks = openml.tasks.list_tasks(tag="NoOneWillEverUseThisTag") assert tasks.empty + @pytest.mark.uses_test_server() def test_list_tasks_by_tag(self): num_basic_tasks = 100 # number is flexible, check server if fails tasks = openml.tasks.list_tasks(tag="OpenML100") @@ -94,12 +101,14 @@ def test_list_tasks_by_tag(self): for task in tasks.to_dict(orient="index").values(): self._check_task(task) + @pytest.mark.uses_test_server() def test_list_tasks(self): tasks = openml.tasks.list_tasks() assert len(tasks) >= 900 for task in tasks.to_dict(orient="index").values(): self._check_task(task) + @pytest.mark.uses_test_server() def test_list_tasks_paginate(self): size = 10 max = 100 @@ -109,6 +118,7 @@ def test_list_tasks_paginate(self): for task in tasks.to_dict(orient="index").values(): self._check_task(task) + @pytest.mark.uses_test_server() def test_list_tasks_per_type_paginate(self): size = 40 max = 100 @@ -125,6 +135,7 @@ def test_list_tasks_per_type_paginate(self): assert j == task["ttid"] self._check_task(task) + @pytest.mark.uses_test_server() def test__get_task(self): openml.config.set_root_cache_directory(self.static_cache_dir) openml.tasks.get_task(1882) @@ -139,6 +150,7 @@ def test__get_task_live(self): # https://github.com/openml/openml-python/issues/378 openml.tasks.get_task(34536) + @pytest.mark.uses_test_server() def test_get_task(self): task = openml.tasks.get_task(1, download_data=True) # anneal; crossvalidation assert isinstance(task, OpenMLTask) @@ -152,7 +164,7 @@ def test_get_task(self): os.path.join(self.workdir, "org", "openml", "test", "datasets", "1", "dataset.arff") ) - @pytest.mark.xfail(reason="failures_issue_1544", strict=False) + @pytest.mark.uses_test_server() def 
test_get_task_lazy(self): task = openml.tasks.get_task(2, download_data=False) # anneal; crossvalidation assert isinstance(task, OpenMLTask) @@ -193,6 +205,7 @@ def assert_and_raise(*args, **kwargs): # Now the file should no longer exist assert not os.path.exists(os.path.join(os.getcwd(), "tasks", "1", "tasks.xml")) + @pytest.mark.uses_test_server() def test_get_task_with_cache(self): openml.config.set_root_cache_directory(self.static_cache_dir) task = openml.tasks.get_task(1) @@ -208,6 +221,7 @@ def test_get_task_different_types(self): # Issue 538, get_task failing with clustering task. openml.tasks.functions.get_task(126033) + @pytest.mark.uses_test_server() def test_download_split(self): task = openml.tasks.get_task(1) # anneal; crossvalidation split = task.download_split() diff --git a/tests/test_tasks/test_task_methods.py b/tests/test_tasks/test_task_methods.py index 540c43de0..6b8804b9f 100644 --- a/tests/test_tasks/test_task_methods.py +++ b/tests/test_tasks/test_task_methods.py @@ -16,6 +16,7 @@ def setUp(self): def tearDown(self): super().tearDown() + @pytest.mark.uses_test_server() def test_tagging(self): task = openml.tasks.get_task(1) # anneal; crossvalidation # tags can be at most 64 alphanumeric (+ underscore) chars @@ -31,6 +32,7 @@ def test_tagging(self): tasks = openml.tasks.list_tasks(tag=tag) assert len(tasks) == 0 + @pytest.mark.uses_test_server() def test_get_train_and_test_split_indices(self): openml.config.set_root_cache_directory(self.static_cache_dir) task = openml.tasks.get_task(1882) diff --git a/tests/test_utils/test_utils.py b/tests/test_utils/test_utils.py index 35be84903..a1cdb55ea 100644 --- a/tests/test_utils/test_utils.py +++ b/tests/test_utils/test_utils.py @@ -48,18 +48,18 @@ def _mocked_perform_api_call(call, request_method): return openml._api_calls._download_text_file(url) -@pytest.mark.server() +@pytest.mark.uses_test_server() def test_list_all(): openml.utils._list_all(listing_call=openml.tasks.functions._list_tasks) -@pytest.mark.server() +@pytest.mark.uses_test_server() def test_list_all_for_tasks(min_number_tasks_on_test_server): tasks = openml.tasks.list_tasks(size=min_number_tasks_on_test_server) assert min_number_tasks_on_test_server == len(tasks) -@pytest.mark.server() +@pytest.mark.uses_test_server() def test_list_all_with_multiple_batches(min_number_tasks_on_test_server): # By setting the batch size one lower than the minimum we guarantee at least two # batches and at the same time do as few batches (roundtrips) as possible. 
@@ -72,7 +72,7 @@ def test_list_all_with_multiple_batches(min_number_tasks_on_test_server): assert min_number_tasks_on_test_server <= sum(len(batch) for batch in batches) -@pytest.mark.server() +@pytest.mark.uses_test_server() def test_list_all_for_datasets(min_number_datasets_on_test_server): datasets = openml.datasets.list_datasets( size=min_number_datasets_on_test_server, @@ -83,29 +83,29 @@ def test_list_all_for_datasets(min_number_datasets_on_test_server): _check_dataset(dataset) -@pytest.mark.server() +@pytest.mark.uses_test_server() def test_list_all_for_flows(min_number_flows_on_test_server): flows = openml.flows.list_flows(size=min_number_flows_on_test_server) assert min_number_flows_on_test_server == len(flows) -@pytest.mark.server() @pytest.mark.flaky() # Other tests might need to upload runs first +@pytest.mark.uses_test_server() def test_list_all_for_setups(min_number_setups_on_test_server): # TODO apparently list_setups function does not support kwargs setups = openml.setups.list_setups(size=min_number_setups_on_test_server) assert min_number_setups_on_test_server == len(setups) -@pytest.mark.server() @pytest.mark.flaky() # Other tests might need to upload runs first +@pytest.mark.uses_test_server() def test_list_all_for_runs(min_number_runs_on_test_server): runs = openml.runs.list_runs(size=min_number_runs_on_test_server) assert min_number_runs_on_test_server == len(runs) -@pytest.mark.server() @pytest.mark.flaky() # Other tests might need to upload runs first +@pytest.mark.uses_test_server() def test_list_all_for_evaluations(min_number_evaluations_on_test_server): # TODO apparently list_evaluations function does not support kwargs evaluations = openml.evaluations.list_evaluations( @@ -115,8 +115,8 @@ def test_list_all_for_evaluations(min_number_evaluations_on_test_server): assert min_number_evaluations_on_test_server == len(evaluations) -@pytest.mark.server() @unittest.mock.patch("openml._api_calls._perform_api_call", side_effect=_mocked_perform_api_call) +@pytest.mark.uses_test_server() def test_list_all_few_results_available(_perform_api_call): datasets = openml.datasets.list_datasets(size=1000, data_name="iris", data_version=1) assert len(datasets) == 1, "only one iris dataset version 1 should be present" @@ -141,7 +141,7 @@ def test__create_cache_directory(config_mock, tmp_path): openml.utils._create_cache_directory("ghi") -@pytest.mark.server() +@pytest.mark.uses_test_server() def test_correct_test_server_download_state(): """This test verifies that the test server downloads the data from the correct source. 
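Note on marker registration: for `-m` selection on uses_test_server to work without PytestUnknownMarkWarning, the marker has to be declared in the project's pytest configuration alongside the existing markers (sklearn, production, flaky, ...). Where exactly openml-python registers its markers is not shown in this series; a minimal conftest.py sketch, assuming programmatic registration:

    # conftest.py -- hypothetical sketch; the project may instead declare the
    # marker in pyproject.toml / setup.cfg under its pytest markers list.
    def pytest_configure(config):
        # Registering the marker documents it and silences
        # PytestUnknownMarkWarning for tests decorated with it.
        config.addinivalue_line(
            "markers",
            "uses_test_server: test requires the OpenML test server",
        )
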
From f5d268073c639019840e71d9cb7f1404bd0ae5a4 Mon Sep 17 00:00:00 2001
From: geetu040
Date: Tue, 6 Jan 2026 22:32:11 +0500
Subject: [PATCH 2/3] deselect tests requiring the test server

---
 .github/workflows/test.yml | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index 850abdfe7..d65cc3796 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -98,9 +98,9 @@ jobs:
         fi

         if [ "${{ matrix.sklearn-only }}" = "true" ]; then
-          marks="sklearn and not production"
+          marks="sklearn and not production and not uses_test_server"
         else
-          marks="not production"
+          marks="not production and not uses_test_server"
         fi

         pytest -n 4 --durations=20 --dist load -sv $codecov -o log_cli=true -m "$marks"
@@ -113,9 +113,9 @@ jobs:
         fi

         if [ "${{ matrix.sklearn-only }}" = "true" ]; then
-          marks="sklearn and production"
+          marks="sklearn and production and not uses_test_server"
         else
-          marks="production"
+          marks="production and not uses_test_server"
         fi

         pytest -n 4 --durations=20 --dist load -sv $codecov -o log_cli=true -m "$marks"
@@ -123,7 +123,7 @@
     - name: Run tests on Windows
       if: matrix.os == 'windows-latest'
      run: | # we need a separate step because of the bash-specific if-statement in the previous one.
-        pytest -n 4 --durations=20 --dist load -sv --reruns 5 --reruns-delay 1
+        pytest -n 4 --durations=20 --dist load -sv --reruns 5 --reruns-delay 1 -m "not uses_test_server"

     - name: Check for files left behind by test
       if: matrix.os != 'windows-latest' && always()

From 6c331a9577f8fd9493231c81a08c589b39f25e75 Mon Sep 17 00:00:00 2001
From: geetu040
Date: Tue, 6 Jan 2026 22:46:07 +0500
Subject: [PATCH 3/3] add remaining missing markers

---
 tests/test_runs/test_run_functions.py   | 15 +++++++--------
 tests/test_tasks/test_task_functions.py |  2 +-
 2 files changed, 8 insertions(+), 9 deletions(-)

diff --git a/tests/test_runs/test_run_functions.py b/tests/test_runs/test_run_functions.py
index 98b97045a..db54151d1 100644
--- a/tests/test_runs/test_run_functions.py
+++ b/tests/test_runs/test_run_functions.py
@@ -627,8 +627,8 @@ def _run_and_upload_regression(
             sentinel=sentinel,
         )

-    @pytest.mark.skip(reason="failures_issue_1544")
     @pytest.mark.sklearn()
+    @pytest.mark.uses_test_server()
     def test_run_and_upload_logistic_regression(self):
         lr = LogisticRegression(solver="lbfgs", max_iter=1000)
         task_id = self.TEST_SERVER_TASK_SIMPLE["task_id"]
         n_missing_vals = self.TEST_SERVER_TASK_SIMPLE["n_missing_vals"]
         n_test_obs = self.TEST_SERVER_TASK_SIMPLE["n_test_obs"]
         self._run_and_upload_classification(lr, task_id, n_missing_vals, n_test_obs, "62501")

-    @pytest.mark.skip(reason="failures_issue_1544")
     @pytest.mark.sklearn()
+    @pytest.mark.uses_test_server()
     def test_run_and_upload_linear_regression(self):
         lr = LinearRegression()
         task_id = self.TEST_SERVER_TASK_REGRESSION["task_id"]
@@ -667,8 +667,8 @@ def test_run_and_upload_linear_regression(self):
         n_missing_vals = self.TEST_SERVER_TASK_REGRESSION["n_missing_vals"]
         n_test_obs = self.TEST_SERVER_TASK_REGRESSION["n_test_obs"]
         self._run_and_upload_regression(lr, task_id, n_missing_vals, n_test_obs, "62501")

-    @pytest.mark.skip(reason="failures_issue_1544")
     @pytest.mark.sklearn()
+    @pytest.mark.uses_test_server()
     def test_run_and_upload_pipeline_dummy_pipeline(self):
         pipeline1 = Pipeline(
             steps=[
@@ -681,12 +681,12 @@ def test_run_and_upload_pipeline_dummy_pipeline(self):
         n_test_obs = self.TEST_SERVER_TASK_SIMPLE["n_test_obs"]
         self._run_and_upload_classification(pipeline1, task_id, n_missing_vals, n_test_obs, "62501")

-    @pytest.mark.skip(reason="failures_issue_1544")
     @pytest.mark.sklearn()
     @unittest.skipIf(
         Version(sklearn.__version__) < Version("0.20"),
         reason="columntransformer introduction in 0.20.0",
     )
+    @pytest.mark.uses_test_server()
     def test_run_and_upload_column_transformer_pipeline(self):
         import sklearn.compose
         import sklearn.impute
@@ -746,7 +746,6 @@ def get_ct_cf(nominal_indices, numeric_indices):
             sentinel=sentinel,
         )

-    @pytest.mark.skip(reason="failures_issue_1544")
     @pytest.mark.sklearn()
     @unittest.skip("https://github.com/openml/OpenML/issues/1180")
     @unittest.skipIf(
         Version(sklearn.__version__) < Version("0.20"),
         reason="columntransformer introduction in 0.20.0",
     )
@@ -799,8 +798,8 @@ def test_run_and_upload_knn_pipeline(self, warnings_mock):
                 call_count += 1
         assert call_count == 3

-    @pytest.mark.skip(reason="failures_issue_1544")
     @pytest.mark.sklearn()
+    @pytest.mark.uses_test_server()
     def test_run_and_upload_gridsearch(self):
         estimator_name = (
             "base_estimator" if Version(sklearn.__version__) < Version("1.4") else "estimator"
         )
@@ -823,7 +822,7 @@ def test_run_and_upload_gridsearch(self):
         assert len(run.trace.trace_iterations) == 9

     @pytest.mark.sklearn()
-    @pytest.mark.skip(reason="failures_issue_1544")
+    @pytest.mark.uses_test_server()
     def test_run_and_upload_randomsearch(self):
         randomsearch = RandomizedSearchCV(
             RandomForestClassifier(n_estimators=5),
@@ -855,8 +854,8 @@ def test_run_and_upload_randomsearch(self):
         trace = openml.runs.get_run_trace(run.run_id)
         assert len(trace.trace_iterations) == 5

-    @pytest.mark.skip(reason="failures_issue_1544")
     @pytest.mark.sklearn()
+    @pytest.mark.uses_test_server()
     def test_run_and_upload_maskedarrays(self):
         # This testcase is important for 2 reasons:
         # 1) it verifies the correct handling of masked arrays (not all
diff --git a/tests/test_tasks/test_task_functions.py b/tests/test_tasks/test_task_functions.py
index a46e597bc..3a2b9ea0a 100644
--- a/tests/test_tasks/test_task_functions.py
+++ b/tests/test_tasks/test_task_functions.py
@@ -187,7 +187,7 @@ def test_get_task_lazy(self):
         )

     @mock.patch("openml.tasks.functions.get_dataset")
-    @pytest.mark.xfail(reason="failures_issue_1544")
+    @pytest.mark.uses_test_server()
     def test_removal_upon_download_failure(self, get_dataset):
         class WeirdException(Exception):
             pass