From dd130fddbdfa956428db5c794f77816b47323af2 Mon Sep 17 00:00:00 2001
From: PGijsbers
Date: Wed, 31 Dec 2025 10:20:44 +0200
Subject: [PATCH 1/2] Remove xfail markers since the tests pass with updated
 test server state
---
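Reviewer note (below the --- fold, so not part of the commit message): a quick
refresher on the marker semantics, since this patch only deletes markers.
Under pytest, an xfail-marked test that passes is reported as XPASS rather
than PASSED (or fails outright when xfail_strict is enabled), so leaving the
markers in place would hide these now-passing tests. A minimal sketch, with a
hypothetical test name:

    import pytest

    @pytest.mark.xfail(reason="failures_issue_1544")
    def test_example():
        # With the marker: reported as XFAIL on failure, XPASS on success.
        # Without it: plain FAILED/PASSED reporting is restored.
        assert True

Dropping the markers therefore restores normal reporting now that the updated
test server state makes these tests pass again.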
tests/test_runs/test_run.py | 5 ----
tests/test_runs/test_run_functions.py | 28 -----------------------
tests/test_setups/test_setup_functions.py | 3 ---
3 files changed, 36 deletions(-)
diff --git a/tests/test_runs/test_run.py b/tests/test_runs/test_run.py
index 088856450..034b731aa 100644
--- a/tests/test_runs/test_run.py
+++ b/tests/test_runs/test_run.py
@@ -118,7 +118,6 @@ def _check_array(array, type_):
assert run_prime_trace_content is None
@pytest.mark.sklearn()
- @pytest.mark.xfail(reason="failures_issue_1544")
def test_to_from_filesystem_vanilla(self):
model = Pipeline(
[
@@ -154,7 +153,6 @@ def test_to_from_filesystem_vanilla(self):
@pytest.mark.sklearn()
@pytest.mark.flaky()
- @pytest.mark.xfail(reason="failures_issue_1544")
def test_to_from_filesystem_search(self):
model = Pipeline(
[
@@ -189,7 +187,6 @@ def test_to_from_filesystem_search(self):
)
@pytest.mark.sklearn()
- @pytest.mark.xfail(reason="failures_issue_1544")
def test_to_from_filesystem_no_model(self):
model = Pipeline(
[("imputer", SimpleImputer(strategy="mean")), ("classifier", DummyClassifier())],
@@ -295,7 +292,6 @@ def assert_run_prediction_data(task, run, model):
assert_method(y_test, saved_y_test)
@pytest.mark.sklearn()
- @pytest.mark.xfail(reason="failures_issue_1544")
def test_publish_with_local_loaded_flow(self):
"""
Publish a run tied to a local flow after it has first been saved to
@@ -339,7 +335,6 @@ def test_publish_with_local_loaded_flow(self):
openml.runs.get_run(loaded_run.run_id)
@pytest.mark.sklearn()
- @pytest.mark.xfail(reason="failures_issue_1544")
def test_offline_and_online_run_identical(self):
extension = SklearnExtension()
diff --git a/tests/test_runs/test_run_functions.py b/tests/test_runs/test_run_functions.py
index 3bb4b0a0c..94ffa5001 100644
--- a/tests/test_runs/test_run_functions.py
+++ b/tests/test_runs/test_run_functions.py
@@ -398,7 +398,6 @@ def _check_sample_evaluations(
assert evaluation < max_time_allowed
@pytest.mark.sklearn()
- @pytest.mark.xfail(reason="failures_issue_1544")
def test_run_regression_on_classif_task(self):
task_id = 259 # collins; crossvalidation; has numeric targets
@@ -415,7 +414,6 @@ def test_run_regression_on_classif_task(self):
)
@pytest.mark.sklearn()
- @pytest.mark.xfail(reason="failures_issue_1544")
def test_check_erronous_sklearn_flow_fails(self):
task_id = 115 # diabetes; crossvalidation
task = openml.tasks.get_task(task_id)
@@ -628,7 +626,6 @@ def _run_and_upload_regression(
)
@pytest.mark.sklearn()
- @pytest.mark.xfail(reason="failures_issue_1544")
def test_run_and_upload_logistic_regression(self):
lr = LogisticRegression(solver="lbfgs", max_iter=1000)
task_id = self.TEST_SERVER_TASK_SIMPLE["task_id"]
@@ -637,7 +634,6 @@ def test_run_and_upload_logistic_regression(self):
self._run_and_upload_classification(lr, task_id, n_missing_vals, n_test_obs, "62501")
@pytest.mark.sklearn()
- @pytest.mark.xfail(reason="failures_issue_1544")
def test_run_and_upload_linear_regression(self):
lr = LinearRegression()
task_id = self.TEST_SERVER_TASK_REGRESSION["task_id"]
@@ -668,7 +664,6 @@ def test_run_and_upload_linear_regression(self):
self._run_and_upload_regression(lr, task_id, n_missing_vals, n_test_obs, "62501")
@pytest.mark.sklearn()
- @pytest.mark.xfail(reason="failures_issue_1544")
def test_run_and_upload_pipeline_dummy_pipeline(self):
pipeline1 = Pipeline(
steps=[
@@ -682,7 +677,6 @@ def test_run_and_upload_pipeline_dummy_pipeline(self):
self._run_and_upload_classification(pipeline1, task_id, n_missing_vals, n_test_obs, "62501")
@pytest.mark.sklearn()
- @pytest.mark.xfail(reason="failures_issue_1544")
@unittest.skipIf(
Version(sklearn.__version__) < Version("0.20"),
reason="columntransformer introduction in 0.20.0",
@@ -799,7 +793,6 @@ def test_run_and_upload_knn_pipeline(self, warnings_mock):
assert call_count == 3
@pytest.mark.sklearn()
- @pytest.mark.xfail(reason="failures_issue_1544")
def test_run_and_upload_gridsearch(self):
estimator_name = (
"base_estimator" if Version(sklearn.__version__) < Version("1.4") else "estimator"
@@ -822,7 +815,6 @@ def test_run_and_upload_gridsearch(self):
assert len(run.trace.trace_iterations) == 9
@pytest.mark.sklearn()
- @pytest.mark.xfail(reason="failures_issue_1544")
def test_run_and_upload_randomsearch(self):
randomsearch = RandomizedSearchCV(
RandomForestClassifier(n_estimators=5),
@@ -855,7 +847,6 @@ def test_run_and_upload_randomsearch(self):
assert len(trace.trace_iterations) == 5
@pytest.mark.sklearn()
- @pytest.mark.xfail(reason="failures_issue_1544")
def test_run_and_upload_maskedarrays(self):
# This testcase is important for 2 reasons:
# 1) it verifies the correct handling of masked arrays (not all
@@ -883,7 +874,6 @@ def test_run_and_upload_maskedarrays(self):
##########################################################################
@pytest.mark.sklearn()
- @pytest.mark.xfail(reason="failures_issue_1544")
def test_learning_curve_task_1(self):
task_id = 801 # diabetes dataset
num_test_instances = 6144 # for learning curve
@@ -908,7 +898,6 @@ def test_learning_curve_task_1(self):
self._check_sample_evaluations(run.sample_evaluations, num_repeats, num_folds, num_samples)
@pytest.mark.sklearn()
- @pytest.mark.xfail(reason="failures_issue_1544")
def test_learning_curve_task_2(self):
task_id = 801 # diabetes dataset
num_test_instances = 6144 # for learning curve
@@ -945,7 +934,6 @@ def test_learning_curve_task_2(self):
self._check_sample_evaluations(run.sample_evaluations, num_repeats, num_folds, num_samples)
@pytest.mark.sklearn()
- @pytest.mark.xfail(reason="failures_issue_1544")
@unittest.skipIf(
Version(sklearn.__version__) < Version("0.21"),
reason="Pipelines don't support indexing (used for the assert check)",
@@ -1024,7 +1012,6 @@ def _test_local_evaluations(self, run):
assert alt_scores[idx] <= 1
@pytest.mark.sklearn()
- @pytest.mark.xfail(reason="failures_issue_1544")
def test_local_run_swapped_parameter_order_model(self):
clf = DecisionTreeClassifier()
australian_task = 595 # Australian; crossvalidation
@@ -1040,7 +1027,6 @@ def test_local_run_swapped_parameter_order_model(self):
self._test_local_evaluations(run)
@pytest.mark.sklearn()
- @pytest.mark.xfail(reason="failures_issue_1544")
@unittest.skipIf(
Version(sklearn.__version__) < Version("0.20"),
reason="SimpleImputer doesn't handle mixed type DataFrame as input",
@@ -1069,7 +1055,6 @@ def test_local_run_swapped_parameter_order_flow(self):
self._test_local_evaluations(run)
@pytest.mark.sklearn()
- @pytest.mark.xfail(reason="failures_issue_1544")
@unittest.skipIf(
Version(sklearn.__version__) < Version("0.20"),
reason="SimpleImputer doesn't handle mixed type DataFrame as input",
@@ -1107,7 +1092,6 @@ def test_online_run_metric_score(self):
self._test_local_evaluations(run)
@pytest.mark.sklearn()
- @pytest.mark.xfail(reason="failures_issue_1544")
@unittest.skipIf(
Version(sklearn.__version__) < Version("0.20"),
reason="SimpleImputer doesn't handle mixed type DataFrame as input",
@@ -1173,7 +1157,6 @@ def test_initialize_model_from_run(self):
Version(sklearn.__version__) < Version("0.20"),
reason="SimpleImputer doesn't handle mixed type DataFrame as input",
)
- @pytest.mark.xfail(reason="failures_issue_1544")
def test__run_exists(self):
# would be better to not sentinel these clfs,
# so we do not have to perform the actual runs
@@ -1229,7 +1212,6 @@ def test__run_exists(self):
assert run_ids, (run_ids, clf)
@pytest.mark.sklearn()
- @pytest.mark.xfail(reason="failures_issue_1544")
def test_run_with_illegal_flow_id(self):
# check the case where the user adds an illegal flow id to a
# non-existing flow
@@ -1249,7 +1231,6 @@ def test_run_with_illegal_flow_id(self):
)
@pytest.mark.sklearn()
- @pytest.mark.xfail(reason="failures_issue_1544")
def test_run_with_illegal_flow_id_after_load(self):
# Same as `test_run_with_illegal_flow_id`, but test this error is also
# caught if the run is stored to and loaded from disk first.
@@ -1281,7 +1262,6 @@ def test_run_with_illegal_flow_id_after_load(self):
TestBase.logger.info(f"collected from test_run_functions: {loaded_run.run_id}")
@pytest.mark.sklearn()
- @pytest.mark.xfail(reason="failures_issue_1544")
def test_run_with_illegal_flow_id_1(self):
# Check the case where the user adds an illegal flow id to an existing
# flow. Comes to a different value error than the previous test
@@ -1307,7 +1287,6 @@ def test_run_with_illegal_flow_id_1(self):
)
@pytest.mark.sklearn()
- @pytest.mark.xfail(reason="failures_issue_1544")
def test_run_with_illegal_flow_id_1_after_load(self):
# Same as `test_run_with_illegal_flow_id_1`, but test this error is
# also caught if the run is stored to and loaded from disk first.
@@ -1346,7 +1325,6 @@ def test_run_with_illegal_flow_id_1_after_load(self):
)
@pytest.mark.sklearn()
- @pytest.mark.xfail(reason="failures_issue_1544")
@unittest.skipIf(
Version(sklearn.__version__) < Version("0.20"),
reason="OneHotEncoder cannot handle mixed type DataFrame as input",
@@ -1574,7 +1552,6 @@ def test_get_runs_list_by_tag(self):
assert len(runs) >= 1
@pytest.mark.sklearn()
- @pytest.mark.xfail(reason="failures_issue_1544")
@unittest.skipIf(
Version(sklearn.__version__) < Version("0.20"),
reason="columntransformer introduction in 0.20.0",
@@ -1611,7 +1588,6 @@ def test_run_on_dataset_with_missing_labels_dataframe(self):
assert len(row) == 12
@pytest.mark.sklearn()
- @pytest.mark.xfail(reason="failures_issue_1544")
@unittest.skipIf(
Version(sklearn.__version__) < Version("0.20"),
reason="columntransformer introduction in 0.20.0",
@@ -1664,7 +1640,6 @@ def test_get_uncached_run(self):
openml.runs.functions._get_cached_run(10)
@pytest.mark.sklearn()
- @pytest.mark.xfail(reason="failures_issue_1544")
def test_run_flow_on_task_downloaded_flow(self):
model = sklearn.ensemble.RandomForestClassifier(n_estimators=33)
flow = self.extension.model_to_flow(model)
@@ -1765,7 +1740,6 @@ def test_format_prediction_task_regression(self):
reason="SimpleImputer doesn't handle mixed type DataFrame as input",
)
@pytest.mark.sklearn()
- @pytest.mark.xfail(reason="failures_issue_1544")
def test_delete_run(self):
rs = np.random.randint(1, 2**31 - 1)
clf = sklearn.pipeline.Pipeline(
@@ -1861,7 +1835,6 @@ def test_delete_unknown_run(mock_delete, test_files_directory, test_api_key):
@pytest.mark.sklearn()
-@pytest.mark.xfail(reason="failures_issue_1544")
@unittest.skipIf(
Version(sklearn.__version__) < Version("0.21"),
reason="couldn't perform local tests successfully w/o bloating RAM",
@@ -1957,7 +1930,6 @@ def test__run_task_get_arffcontent_2(parallel_mock):
(-1, "threading", 10), # the threading backend does preserve mocks even with parallelizing
]
)
-@pytest.mark.xfail(reason="failures_issue_1544")
def test_joblib_backends(parallel_mock, n_jobs, backend, call_count):
"""Tests evaluation of a run using various joblib backends and n_jobs."""
if backend is None:
diff --git a/tests/test_setups/test_setup_functions.py b/tests/test_setups/test_setup_functions.py
index 18d7f5cc6..42af5362b 100644
--- a/tests/test_setups/test_setup_functions.py
+++ b/tests/test_setups/test_setup_functions.py
@@ -82,7 +82,6 @@ def _existing_setup_exists(self, classif):
assert setup_id == run.setup_id
@pytest.mark.sklearn()
- @pytest.mark.xfail(reason="failures_issue_1544")
def test_existing_setup_exists_1(self):
def side_effect(self):
self.var_smoothing = 1e-9
@@ -98,13 +97,11 @@ def side_effect(self):
self._existing_setup_exists(nb)
@pytest.mark.sklearn()
- @pytest.mark.xfail(reason="failures_issue_1544")
def test_exisiting_setup_exists_2(self):
# Check a flow with one hyperparameter
self._existing_setup_exists(sklearn.naive_bayes.GaussianNB())
@pytest.mark.sklearn()
- @pytest.mark.xfail(reason="failures_issue_1544")
def test_existing_setup_exists_3(self):
# Check a flow with many hyperparameters
self._existing_setup_exists(
From bb5a2c68d6043bc7a99ed035510432e621cc9689 Mon Sep 17 00:00:00 2001
From: PGijsbers
Date: Wed, 31 Dec 2025 10:21:03 +0200
Subject: [PATCH 2/2] Mark test as xfail since it cannot pass due to test
 server state
---
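Reviewer note (below the --- fold, so not part of the commit message): unlike
the markers removed in patch 1/2, this one is added because the test depends
on server-side ontology data that the test server currently lacks. A minimal
sketch of the pattern, with a hypothetical test name:

    import pytest

    @pytest.mark.xfail(reason="failures_issue_1544")
    def test_needs_server_state():
        # Reported as XFAIL while the server state is broken; will show up
        # as XPASS (and can then be un-marked) once the state is restored.
        ...

Marking rather than skipping keeps the test running, so it signals as soon as
it starts passing again.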
tests/test_datasets/test_dataset.py | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/tests/test_datasets/test_dataset.py b/tests/test_datasets/test_dataset.py
index 86a4d3f57..66e9b8554 100644
--- a/tests/test_datasets/test_dataset.py
+++ b/tests/test_datasets/test_dataset.py
@@ -294,6 +294,7 @@ def test_tagging():
datasets = openml.datasets.list_datasets(tag=tag)
assert datasets.empty
+@pytest.mark.xfail(reason="failures_issue_1544")
def test_get_feature_with_ontology_data_id_11():
# test on car dataset, which has built-in ontology references
dataset = openml.datasets.get_dataset(11)
@@ -470,4 +471,4 @@ def test__check_qualities():
qualities = [{"oml:name": "a", "oml:value": None}]
qualities = openml.datasets.dataset._check_qualities(qualities)
- assert qualities["a"] != qualities["a"]
\ No newline at end of file
+ assert qualities["a"] != qualities["a"]