From 8db2d821eba3c77ffa4e3a77f115da58a81d50f4 Mon Sep 17 00:00:00 2001
From: PGijsbers
Date: Thu, 19 Jun 2025 19:50:31 +0200
Subject: [PATCH 1/7] Don't actually execute the test body
---
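Notes: a bare `return` at the top keeps the test collected (the decorators, including the version-gated skipIf, still apply) while short-circuiting everything below it; since it sits above the docstring here, that docstring becomes dead code, which is fine for a throwaway commit. An explicit pytest.skip is an alternative that shows up in the test report; a minimal sketch with hypothetical stand-in helpers:

    import pytest


    def _cheap_setup():            # hypothetical stand-in for openml.tasks.get_task(7)
        return list(range(10))


    def _suspect_remainder(data):  # hypothetical stand-in for the parallel run
        return sum(data)


    def test_bisect_by_early_exit():
        data = _cheap_setup()      # already cleared: runs fine
        pytest.skip("bisecting: everything below is still under suspicion")
        assert _suspect_remainder(data) == 45  # not reached yet
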
tests/test_runs/test_run_functions.py | 1 +
1 file changed, 1 insertion(+)
diff --git a/tests/test_runs/test_run_functions.py b/tests/test_runs/test_run_functions.py
index 725421d4f..04be16e1a 100644
--- a/tests/test_runs/test_run_functions.py
+++ b/tests/test_runs/test_run_functions.py
@@ -1753,6 +1753,7 @@ def test_format_prediction_task_regression(self):
)
@mock.patch("openml_sklearn.SklearnExtension._prevent_optimize_n_jobs")
def test__run_task_get_arffcontent_2(self, parallel_mock):
+ return
"""Tests if a run executed in parallel is collated correctly."""
task = openml.tasks.get_task(7) # Supervised Classification on kr-vs-kp
x, y = task.get_X_and_y()
From 0a506adf26802c0a38f8b7a7ce1d61ddfe6b8cbe Mon Sep 17 00:00:00 2001
From: PGijsbers
Date: Thu, 19 Jun 2025 23:18:12 +0200
Subject: [PATCH 2/7] Only do setup
---
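Notes: the early exit now sits directly before the parallel_backend block, confirming the setup up to that point is harmless. joblib's parallel_backend context manager overrides the backend and worker count for every joblib.Parallel call issued inside the block, including calls buried in library code such as openml's fold runner. A minimal sketch, independent of OpenML:

    from joblib import Parallel, delayed, parallel_backend


    def square(i: int) -> int:
        return i * i


    if __name__ == "__main__":
        with parallel_backend("threading", n_jobs=2):
            # Parallel() with no arguments picks up the active backend and n_jobs.
            print(Parallel()(delayed(square)(i) for i in range(4)))  # [0, 1, 4, 9]
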
tests/test_runs/test_run_functions.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/tests/test_runs/test_run_functions.py b/tests/test_runs/test_run_functions.py
index 04be16e1a..db484ba88 100644
--- a/tests/test_runs/test_run_functions.py
+++ b/tests/test_runs/test_run_functions.py
@@ -1753,7 +1753,6 @@ def test_format_prediction_task_regression(self):
)
@mock.patch("openml_sklearn.SklearnExtension._prevent_optimize_n_jobs")
def test__run_task_get_arffcontent_2(self, parallel_mock):
- return
"""Tests if a run executed in parallel is collated correctly."""
task = openml.tasks.get_task(7) # Supervised Classification on kr-vs-kp
x, y = task.get_X_and_y()
@@ -1780,6 +1779,7 @@ def test__run_task_get_arffcontent_2(self, parallel_mock):
)
n_jobs = 2
backend = "loky" if Version(joblib.__version__) > Version("0.11") else "multiprocessing"
+ return
with parallel_backend(backend, n_jobs=n_jobs):
res = openml.runs.functions._run_task_get_arffcontent(
extension=self.extension,
From 5699299253b18d728ab198c520e5b05b4bad06af Mon Sep 17 00:00:00 2001
From: PGijsbers
Date: Fri, 20 Jun 2025 09:44:19 +0200
Subject: [PATCH 3/7] Get task but not data
---
tests/test_runs/test_run_functions.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/tests/test_runs/test_run_functions.py b/tests/test_runs/test_run_functions.py
index db484ba88..2c9548add 100644
--- a/tests/test_runs/test_run_functions.py
+++ b/tests/test_runs/test_run_functions.py
@@ -1755,6 +1755,7 @@ def test_format_prediction_task_regression(self):
def test__run_task_get_arffcontent_2(self, parallel_mock):
"""Tests if a run executed in parallel is collated correctly."""
task = openml.tasks.get_task(7) # Supervised Classification on kr-vs-kp
+ return
x, y = task.get_X_and_y()
num_instances = x.shape[0]
line_length = 6 + len(task.class_labels)
@@ -1779,7 +1780,6 @@ def test__run_task_get_arffcontent_2(self, parallel_mock):
)
n_jobs = 2
backend = "loky" if Version(joblib.__version__) > Version("0.11") else "multiprocessing"
- return
with parallel_backend(backend, n_jobs=n_jobs):
res = openml.runs.functions._run_task_get_arffcontent(
extension=self.extension,
From d52d65968e481184e60e79563e3d598200c23723 Mon Sep 17 00:00:00 2001
From: PGijsbers
Date: Fri, 20 Jun 2025 10:02:24 +0200
Subject: [PATCH 4/7] Also get data
---
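Notes: the exit advances past task.get_X_and_y(). The surrounding hunk also shows the version-gating pattern the test uses twice: the SGD loss name is chosen by sklearn version, and (further down) the joblib backend by joblib version. A stand-alone illustration; the helper names are mine, and Version is assumed to come from packaging.version, matching the comparisons in the test:

    from packaging.version import Version

    import joblib
    import sklearn


    def pick_sgd_loss() -> str:
        # sklearn renamed the "log" loss to "log_loss"; the old name is gone in 1.3.
        return "log" if Version(sklearn.__version__) < Version("1.3") else "log_loss"


    def pick_joblib_backend() -> str:
        # "loky" superseded "multiprocessing" as joblib's process backend after 0.11.
        return "loky" if Version(joblib.__version__) > Version("0.11") else "multiprocessing"
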
tests/test_runs/test_run_functions.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/tests/test_runs/test_run_functions.py b/tests/test_runs/test_run_functions.py
index 2c9548add..816c433c3 100644
--- a/tests/test_runs/test_run_functions.py
+++ b/tests/test_runs/test_run_functions.py
@@ -1755,8 +1755,8 @@ def test_format_prediction_task_regression(self):
def test__run_task_get_arffcontent_2(self, parallel_mock):
"""Tests if a run executed in parallel is collated correctly."""
task = openml.tasks.get_task(7) # Supervised Classification on kr-vs-kp
- return
x, y = task.get_X_and_y()
+ return
num_instances = x.shape[0]
line_length = 6 + len(task.class_labels)
loss = "log" if Version(sklearn.__version__) < Version("1.3") else "log_loss"
From 78111f577ffc80c6839a1a1086493e5872c25234 Mon Sep 17 00:00:00 2001
From: PGijsbers
Date: Fri, 20 Jun 2025 10:17:44 +0200
Subject: [PATCH 5/7] Execute full test
---
tests/test_runs/test_run_functions.py | 1 -
1 file changed, 1 deletion(-)
diff --git a/tests/test_runs/test_run_functions.py b/tests/test_runs/test_run_functions.py
index 816c433c3..725421d4f 100644
--- a/tests/test_runs/test_run_functions.py
+++ b/tests/test_runs/test_run_functions.py
@@ -1756,7 +1756,6 @@ def test__run_task_get_arffcontent_2(self, parallel_mock):
"""Tests if a run executed in parallel is collated correctly."""
task = openml.tasks.get_task(7) # Supervised Classification on kr-vs-kp
x, y = task.get_X_and_y()
- return
num_instances = x.shape[0]
line_length = 6 + len(task.class_labels)
loss = "log" if Version(sklearn.__version__) < Version("1.3") else "log_loss"
From ae6e55db20165a3e8f2698f18ea809871ede46fc Mon Sep 17 00:00:00 2001
From: PGijsbers
Date: Fri, 20 Jun 2025 17:31:00 +0200
Subject: [PATCH 6/7] Convert from unittest to pytest
---
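Notes: beyond moving the test out of the class, the conversion leans on the behaviour described in the test's long comment: a mock.patch applied in the parent process does not carry over into freshly spawned joblib workers, while the threading backend shares the parent's (patched) modules. A stand-alone sketch of that mechanism, using only joblib, mock, and the standard library:

    from unittest import mock

    from joblib import Parallel, delayed, parallel_backend


    def cwd() -> str:
        import os
        return os.getcwd()


    if __name__ == "__main__":
        with mock.patch("os.getcwd", return_value="PATCHED"):
            with parallel_backend("threading", n_jobs=2):
                # Threads share the parent's patched os module.
                print(Parallel()(delayed(cwd)() for _ in range(2)))  # ['PATCHED', 'PATCHED']
            with parallel_backend("loky", n_jobs=2):
                # Fresh worker processes re-import os; the parent's mock is invisible.
                print(Parallel()(delayed(cwd)() for _ in range(2)))  # real directories
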
tests/test_runs/test_run_functions.py | 150 +++++++++++++-------------
1 file changed, 77 insertions(+), 73 deletions(-)
diff --git a/tests/test_runs/test_run_functions.py b/tests/test_runs/test_run_functions.py
index 725421d4f..687be7bb6 100644
--- a/tests/test_runs/test_run_functions.py
+++ b/tests/test_runs/test_run_functions.py
@@ -1746,79 +1746,6 @@ def test_format_prediction_task_regression(self):
res = format_prediction(regression, *ignored_input)
self.assertListEqual(res, [0] * 5)
- @pytest.mark.sklearn()
- @unittest.skipIf(
- Version(sklearn.__version__) < Version("0.21"),
- reason="couldn't perform local tests successfully w/o bloating RAM",
- )
- @mock.patch("openml_sklearn.SklearnExtension._prevent_optimize_n_jobs")
- def test__run_task_get_arffcontent_2(self, parallel_mock):
- """Tests if a run executed in parallel is collated correctly."""
- task = openml.tasks.get_task(7) # Supervised Classification on kr-vs-kp
- x, y = task.get_X_and_y()
- num_instances = x.shape[0]
- line_length = 6 + len(task.class_labels)
- loss = "log" if Version(sklearn.__version__) < Version("1.3") else "log_loss"
- clf = sklearn.pipeline.Pipeline(
- [
- (
- "cat_handling",
- ColumnTransformer(
- transformers=[
- (
- "cat",
- OneHotEncoder(handle_unknown="ignore"),
- x.select_dtypes(include=["object", "category"]).columns,
- )
- ],
- remainder="passthrough",
- ),
- ),
- ("clf", SGDClassifier(loss=loss, random_state=1)),
- ]
- )
- n_jobs = 2
- backend = "loky" if Version(joblib.__version__) > Version("0.11") else "multiprocessing"
- with parallel_backend(backend, n_jobs=n_jobs):
- res = openml.runs.functions._run_task_get_arffcontent(
- extension=self.extension,
- model=clf,
- task=task,
- add_local_measures=True,
- n_jobs=n_jobs,
- )
- # This unit test will fail if joblib is unable to distribute successfully since the
- # function _run_model_on_fold is being mocked out. However, for a new spawned worker, it
- # is not and the mock call_count should remain 0 while the subsequent check of actual
- # results should also hold, only on successful distribution of tasks to workers.
- # The _prevent_optimize_n_jobs() is a function executed within the _run_model_on_fold()
- # block and mocking this function doesn't affect rest of the pipeline, but is adequately
- # indicative if _run_model_on_fold() is being called or not.
- assert parallel_mock.call_count == 0
- assert isinstance(res[0], list)
- assert len(res[0]) == num_instances
- assert len(res[0][0]) == line_length
- assert len(res[2]) == 7
- assert len(res[3]) == 7
- expected_scores = [
- 0.9625,
- 0.953125,
- 0.965625,
- 0.9125,
- 0.98125,
- 0.975,
- 0.9247648902821317,
- 0.9404388714733543,
- 0.9780564263322884,
- 0.9623824451410659,
- ]
- scores = [v for k, v in res[2]["predictive_accuracy"][0].items()]
- np.testing.assert_array_almost_equal(
- scores,
- expected_scores,
- decimal=2,
- err_msg="Observed performance scores deviate from expected ones.",
- )
@pytest.mark.sklearn()
@unittest.skipIf(
@@ -1993,3 +1920,80 @@ def test_delete_unknown_run(mock_delete, test_files_directory, test_api_key):
run_url = "https://test.openml.org/api/v1/xml/run/9999999"
assert run_url == mock_delete.call_args.args[0]
assert test_api_key == mock_delete.call_args.kwargs.get("params", {}).get("api_key")
+
+
+@pytest.mark.sklearn()
+@pytest.mark.skipif(
+    Version(sklearn.__version__) < Version("0.21"),
+    reason="couldn't perform local tests successfully w/o bloating RAM",
+)
+@mock.patch("openml_sklearn.SklearnExtension._prevent_optimize_n_jobs")
+def test__run_task_get_arffcontent_2(parallel_mock):
+ """Tests if a run executed in parallel is collated correctly."""
+ task = openml.tasks.get_task(7) # Supervised Classification on kr-vs-kp
+ x, y = task.get_X_and_y()
+ num_instances = x.shape[0]
+ line_length = 6 + len(task.class_labels)
+ loss = "log" if Version(sklearn.__version__) < Version("1.3") else "log_loss"
+ clf = sklearn.pipeline.Pipeline(
+ [
+ (
+ "cat_handling",
+ ColumnTransformer(
+ transformers=[
+ (
+ "cat",
+ OneHotEncoder(handle_unknown="ignore"),
+ x.select_dtypes(include=["object", "category"]).columns,
+ )
+ ],
+ remainder="passthrough",
+ ),
+ ),
+ ("clf", SGDClassifier(loss=loss, random_state=1)),
+ ]
+ )
+ n_jobs = 2
+ backend = "loky" if Version(joblib.__version__) > Version("0.11") else "multiprocessing"
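+    # This module-level pytest test cannot use TestBase's self.extension, so build one locally.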
+ from openml_sklearn import SklearnExtension
+ extension = SklearnExtension()
+ with parallel_backend(backend, n_jobs=n_jobs):
+ res = openml.runs.functions._run_task_get_arffcontent(
+ extension=extension,
+ model=clf,
+ task=task,
+ add_local_measures=True,
+ n_jobs=n_jobs,
+ )
+    # _prevent_optimize_n_jobs() is executed inside _run_model_on_fold(), and
+    # mocking it leaves the rest of the pipeline untouched, so its call_count
+    # indicates whether _run_model_on_fold() ran in this process. A mock.patch
+    # applied in the parent process does not carry over into newly spawned
+    # joblib workers, so on a successful distribution of the folds the
+    # call_count stays 0 while the result checks below still hold; if joblib
+    # failed to distribute the work, the folds would run here and the count
+    # would be non-zero.
+ assert parallel_mock.call_count == 0
+ assert isinstance(res[0], list)
+ assert len(res[0]) == num_instances
+ assert len(res[0][0]) == line_length
+ assert len(res[2]) == 7
+ assert len(res[3]) == 7
+ expected_scores = [
+ 0.9625,
+ 0.953125,
+ 0.965625,
+ 0.9125,
+ 0.98125,
+ 0.975,
+ 0.9247648902821317,
+ 0.9404388714733543,
+ 0.9780564263322884,
+ 0.9623824451410659,
+ ]
+    scores = list(res[2]["predictive_accuracy"][0].values())
+ np.testing.assert_array_almost_equal(
+ scores,
+ expected_scores,
+ decimal=2,
+ err_msg="Observed performance scores deviate from expected ones.",
+ )
From 0223c360659c4f2408be7b833f10036ffb59f7d7 Mon Sep 17 00:00:00 2001
From: PGijsbers
Date: Fri, 20 Jun 2025 17:49:32 +0200
Subject: [PATCH 7/7] Convert from unittest to pytest, parametrize outside of
test
---
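Notes: the for-loop over (n_jobs, backend, call_count) becomes one test per case. Two details the decorator stack relies on: pytest passes parametrized values by keyword while mock.patch appends its mock positionally, so the mock binds to the first parameter (parallel_mock); and every parametrized case gets a fresh mock, which is why the expected call counts are per-case rather than cumulative as in the old loop. A minimal sketch of the same stacking, with a hypothetical patch target:

    from unittest import mock

    import pytest


    @mock.patch("os.getcwd")
    @pytest.mark.parametrize(("calls", "expected"), [(1, 1), (3, 3)])
    def test_mock_binds_first(getcwd_mock, calls, expected):
        import os

        for _ in range(calls):
            os.getcwd()
        assert getcwd_mock.call_count == expected
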
tests/test_runs/test_run_functions.py | 163 ++++++++++++++------------
1 file changed, 87 insertions(+), 76 deletions(-)
diff --git a/tests/test_runs/test_run_functions.py b/tests/test_runs/test_run_functions.py
index 687be7bb6..0517c8c9c 100644
--- a/tests/test_runs/test_run_functions.py
+++ b/tests/test_runs/test_run_functions.py
@@ -1747,82 +1747,6 @@ def test_format_prediction_task_regression(self):
self.assertListEqual(res, [0] * 5)
- @pytest.mark.sklearn()
- @unittest.skipIf(
- Version(sklearn.__version__) < Version("0.21"),
- reason="couldn't perform local tests successfully w/o bloating RAM",
- )
- @mock.patch("openml_sklearn.SklearnExtension._prevent_optimize_n_jobs")
- def test_joblib_backends(self, parallel_mock):
- """Tests evaluation of a run using various joblib backends and n_jobs."""
- task = openml.tasks.get_task(7) # Supervised Classification on kr-vs-kp
- x, y = task.get_X_and_y()
- num_instances = x.shape[0]
- line_length = 6 + len(task.class_labels)
-
- backend_choice = (
- "loky" if Version(joblib.__version__) > Version("0.11") else "multiprocessing"
- )
- for n_jobs, backend, call_count in [
- (1, backend_choice, 10),
- (2, backend_choice, 10),
- (-1, backend_choice, 10),
- (1, "threading", 20),
- (-1, "threading", 30),
- (1, "sequential", 40),
- ]:
- clf = sklearn.model_selection.RandomizedSearchCV(
- estimator=sklearn.pipeline.Pipeline(
- [
- (
- "cat_handling",
- ColumnTransformer(
- transformers=[
- (
- "cat",
- OneHotEncoder(handle_unknown="ignore"),
- x.select_dtypes(include=["object", "category"]).columns,
- )
- ],
- remainder="passthrough",
- ),
- ),
- ("clf", sklearn.ensemble.RandomForestClassifier(n_estimators=5)),
- ]
- ),
- param_distributions={
- "clf__max_depth": [3, None],
- "clf__max_features": [1, 2, 3, 4],
- "clf__min_samples_split": [2, 3, 4, 5, 6, 7, 8, 9, 10],
- "clf__min_samples_leaf": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
- "clf__bootstrap": [True, False],
- "clf__criterion": ["gini", "entropy"],
- },
- random_state=1,
- cv=sklearn.model_selection.StratifiedKFold(
- n_splits=2,
- shuffle=True,
- random_state=1,
- ),
- n_iter=5,
- n_jobs=n_jobs,
- )
- with parallel_backend(backend, n_jobs=n_jobs):
- res = openml.runs.functions._run_task_get_arffcontent(
- extension=self.extension,
- model=clf,
- task=task,
- add_local_measures=True,
- n_jobs=n_jobs,
- )
- assert type(res[0]) == list
- assert len(res[0]) == num_instances
- assert len(res[0][0]) == line_length
- # usercpu_time_millis_* not recorded when n_jobs > 1
- # *_time_millis_* not recorded when n_jobs = -1
- assert len(res[2]["predictive_accuracy"][0]) == 10
- assert len(res[3]["predictive_accuracy"][0]) == 10
- assert parallel_mock.call_count == call_count
@unittest.skipIf(
Version(sklearn.__version__) < Version("0.20"),
@@ -1997,3 +1921,90 @@ def test__run_task_get_arffcontent_2(parallel_mock):
decimal=2,
err_msg="Observed performance scores deviate from expected ones.",
)
+
+
+@pytest.mark.sklearn()
+@pytest.mark.skipif(
+    Version(sklearn.__version__) < Version("0.21"),
+    reason="couldn't perform local tests successfully w/o bloating RAM",
+)
+@mock.patch("openml_sklearn.SklearnExtension._prevent_optimize_n_jobs")
+@pytest.mark.parametrize(
+ ("n_jobs", "backend", "call_count"),
+ [
+ # `None` picks the backend based on joblib version (loky or multiprocessing) and
+ # spawns multiple processes if n_jobs != 1, which means the mock is not applied.
+ (2, None, 0),
+ (-1, None, 0),
+ (1, None, 10), # with n_jobs=1 the mock *is* applied, since there is no new subprocess
+ (1, "sequential", 10),
+ (1, "threading", 10),
+        (-1, "threading", 10),  # the threading backend preserves mocks even when parallelizing
+ ]
+)
+def test_joblib_backends(parallel_mock, n_jobs, backend, call_count):
+ """Tests evaluation of a run using various joblib backends and n_jobs."""
+ if backend is None:
+ backend = (
+ "loky" if Version(joblib.__version__) > Version("0.11") else "multiprocessing"
+ )
+
+ task = openml.tasks.get_task(7) # Supervised Classification on kr-vs-kp
+ x, y = task.get_X_and_y()
+ num_instances = x.shape[0]
+ line_length = 6 + len(task.class_labels)
+
+ clf = sklearn.model_selection.RandomizedSearchCV(
+ estimator=sklearn.pipeline.Pipeline(
+ [
+ (
+ "cat_handling",
+ ColumnTransformer(
+ transformers=[
+ (
+ "cat",
+ OneHotEncoder(handle_unknown="ignore"),
+ x.select_dtypes(include=["object", "category"]).columns,
+ )
+ ],
+ remainder="passthrough",
+ ),
+ ),
+ ("clf", sklearn.ensemble.RandomForestClassifier(n_estimators=5)),
+ ]
+ ),
+ param_distributions={
+ "clf__max_depth": [3, None],
+ "clf__max_features": [1, 2, 3, 4],
+ "clf__min_samples_split": [2, 3, 4, 5, 6, 7, 8, 9, 10],
+ "clf__min_samples_leaf": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
+ "clf__bootstrap": [True, False],
+ "clf__criterion": ["gini", "entropy"],
+ },
+ random_state=1,
+ cv=sklearn.model_selection.StratifiedKFold(
+ n_splits=2,
+ shuffle=True,
+ random_state=1,
+ ),
+ n_iter=5,
+ n_jobs=n_jobs,
+ )
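+    # This module-level pytest test cannot use TestBase's self.extension, so build one locally.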
+ from openml_sklearn import SklearnExtension
+ extension = SklearnExtension()
+ with parallel_backend(backend, n_jobs=n_jobs):
+ res = openml.runs.functions._run_task_get_arffcontent(
+ extension=extension,
+ model=clf,
+ task=task,
+ add_local_measures=True,
+ n_jobs=n_jobs,
+ )
+    assert isinstance(res[0], list)
+ assert len(res[0]) == num_instances
+ assert len(res[0][0]) == line_length
+ # usercpu_time_millis_* not recorded when n_jobs > 1
+ # *_time_millis_* not recorded when n_jobs = -1
+ assert len(res[2]["predictive_accuracy"][0]) == 10
+ assert len(res[3]["predictive_accuracy"][0]) == 10
+ assert parallel_mock.call_count == call_count