From 7a6f845a1bc35cec0408a508ff768e0be4b7c6bc Mon Sep 17 00:00:00 2001 From: eddiebergman Date: Wed, 16 Oct 2024 14:14:07 +0200 Subject: [PATCH 01/23] refactor: Remove "array" format --- examples/20_basic/simple_datasets_tutorial.py | 6 +- .../task_manual_iteration_tutorial.py | 8 +- openml/datasets/functions.py | 152 ++----------- openml/evaluations/evaluation.py | 20 ++ openml/evaluations/functions.py | 148 +++++-------- openml/flows/functions.py | 122 +---------- openml/runs/functions.py | 104 ++------- openml/setups/functions.py | 173 +++++---------- openml/setups/setup.py | 23 ++ openml/study/functions.py | 204 +++--------------- openml/tasks/functions.py | 64 +----- openml/tasks/task.py | 73 +------ openml/utils.py | 71 ++---- tests/test_datasets/test_dataset.py | 129 +---------- tests/test_datasets/test_dataset_functions.py | 3 +- .../test_sklearn_extension.py | 114 ++++++---- tests/test_runs/test_run_functions.py | 12 +- tests/test_setups/test_setup_functions.py | 7 - tests/test_utils/test_utils.py | 14 +- 19 files changed, 368 insertions(+), 1079 deletions(-) diff --git a/examples/20_basic/simple_datasets_tutorial.py b/examples/20_basic/simple_datasets_tutorial.py index b90d53660..9b18aab14 100644 --- a/examples/20_basic/simple_datasets_tutorial.py +++ b/examples/20_basic/simple_datasets_tutorial.py @@ -19,7 +19,7 @@ # List datasets # ============= -datasets_df = openml.datasets.list_datasets(output_format="dataframe") +datasets_df = openml.datasets.list_datasets() print(datasets_df.head(n=10)) ############################################################################ @@ -48,7 +48,7 @@ # attribute_names - the names of the features for the examples (X) and # target feature (y) X, y, categorical_indicator, attribute_names = dataset.get_data( - dataset_format="dataframe", target=dataset.default_target_attribute + target=dataset.default_target_attribute ) ############################################################################ @@ -63,9 +63,9 @@ # Visualize the dataset # ===================== +import matplotlib.pyplot as plt import pandas as pd import seaborn as sns -import matplotlib.pyplot as plt sns.set_style("darkgrid") diff --git a/examples/30_extended/task_manual_iteration_tutorial.py b/examples/30_extended/task_manual_iteration_tutorial.py index 676a742a1..dda40de50 100644 --- a/examples/30_extended/task_manual_iteration_tutorial.py +++ b/examples/30_extended/task_manual_iteration_tutorial.py @@ -68,7 +68,7 @@ #################################################################################################### # And then split the data based on this: -X, y = task.get_X_and_y(dataset_format="dataframe") +X, y = task.get_X_and_y() X_train = X.iloc[train_indices] y_train = y.iloc[train_indices] X_test = X.iloc[test_indices] @@ -88,7 +88,7 @@ task_id = 3 task = openml.tasks.get_task(task_id) -X, y = task.get_X_and_y(dataset_format="dataframe") +X, y = task.get_X_and_y() n_repeats, n_folds, n_samples = task.get_split_dimensions() print( "Task {}: number of repeats: {}, number of folds: {}, number of samples {}.".format( @@ -132,7 +132,7 @@ task_id = 1767 task = openml.tasks.get_task(task_id) -X, y = task.get_X_and_y(dataset_format="dataframe") +X, y = task.get_X_and_y() n_repeats, n_folds, n_samples = task.get_split_dimensions() print( "Task {}: number of repeats: {}, number of folds: {}, number of samples {}.".format( @@ -176,7 +176,7 @@ task_id = 1702 task = openml.tasks.get_task(task_id) -X, y = task.get_X_and_y(dataset_format="dataframe") +X, y = task.get_X_and_y() 
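####################################################################################################
# Editorial sketch, not part of the original patch: with the "array" format removed,
# ``get_X_and_y()`` always returns pandas objects. Code that previously requested
# ``dataset_format="array"`` can opt back into numpy explicitly, as the removed warning suggested:

X, y = task.get_X_and_y()                      # pandas DataFrame / Series
X_array = X.to_numpy()                         # equivalent of the old "array" output
y_array = y.to_numpy() if y is not None else None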
n_repeats, n_folds, n_samples = task.get_split_dimensions() print( "Task {}: number of repeats: {}, number of folds: {}, number of samples {}.".format( diff --git a/openml/datasets/functions.py b/openml/datasets/functions.py index 0901171d6..5ce095814 100644 --- a/openml/datasets/functions.py +++ b/openml/datasets/functions.py @@ -7,7 +7,7 @@ from collections import OrderedDict from pathlib import Path from pyexpat import ExpatError -from typing import TYPE_CHECKING, Any, overload +from typing import TYPE_CHECKING, Any from typing_extensions import Literal import arff @@ -75,54 +75,16 @@ def list_qualities() -> list[str]: return qualities["oml:data_qualities_list"]["oml:quality"] -@overload -def list_datasets( - data_id: list[int] | None = ..., - offset: int | None = ..., - size: int | None = ..., - status: str | None = ..., - tag: str | None = ..., - *, - output_format: Literal["dataframe"], - **kwargs: Any, -) -> pd.DataFrame: ... - - -@overload -def list_datasets( - data_id: list[int] | None, - offset: int | None, - size: int | None, - status: str | None, - tag: str | None, - output_format: Literal["dataframe"], - **kwargs: Any, -) -> pd.DataFrame: ... - - -@overload -def list_datasets( - data_id: list[int] | None = ..., - offset: int | None = ..., - size: int | None = ..., - status: str | None = ..., - tag: str | None = ..., - output_format: Literal["dict"] = "dict", - **kwargs: Any, -) -> pd.DataFrame: ... - - def list_datasets( data_id: list[int] | None = None, offset: int | None = None, size: int | None = None, status: str | None = None, tag: str | None = None, - output_format: Literal["dataframe", "dict"] = "dict", **kwargs: Any, -) -> dict | pd.DataFrame: - """ - Return a list of all dataset which are on OpenML. +) -> pd.DataFrame: + """Return a dataframe of all dataset which are on OpenML. + Supports large amount of results. Parameters @@ -139,10 +101,6 @@ def list_datasets( default active datasets are returned, but also datasets from another status can be requested. tag : str, optional - output_format: str, optional (default='dict') - The parameter decides the format of the output. - - If 'dict' the output is a dict of dict - - If 'dataframe' the output is a pandas DataFrame kwargs : dict, optional Legal filter operators (keys in the dict): data_name, data_version, number_instances, @@ -150,76 +108,29 @@ def list_datasets( Returns ------- - datasets : dict of dicts, or dataframe - - If output_format='dict' - A mapping from dataset ID to dict. - - Every dataset is represented by a dictionary containing - the following information: - - dataset id - - name - - format - - status - If qualities are calculated for the dataset, some of - these are also returned. - - - If output_format='dataframe' - Each row maps to a dataset - Each column contains the following information: - - dataset id - - name - - format - - status - If qualities are calculated for the dataset, some of - these are also included as columns. + datasets: dataframe + Each row maps to a dataset + Each column contains the following information: + - dataset id + - name + - format + - status + If qualities are calculated for the dataset, some of + these are also included as columns. """ - if output_format not in ["dataframe", "dict"]: - raise ValueError( - "Invalid output format selected. " "Only 'dict' or 'dataframe' applicable.", - ) - - # TODO: [0.15] - if output_format == "dict": - msg = ( - "Support for `output_format` of 'dict' will be removed in 0.15 " - "and pandas dataframes will be returned instead. 
To ensure your code " - "will continue to work, use `output_format`='dataframe'." - ) - warnings.warn(msg, category=FutureWarning, stacklevel=2) - - return openml.utils._list_all( # type: ignore - data_id=data_id, - list_output_format=output_format, # type: ignore + batches = openml.utils._list_all( listing_call=_list_datasets, + data_id=data_id, offset=offset, size=size, status=status, tag=tag, **kwargs, ) + return pd.concat(batches, ignore_index=True) -@overload -def _list_datasets( - data_id: list | None = ..., - output_format: Literal["dict"] = "dict", - **kwargs: Any, -) -> dict: ... - - -@overload -def _list_datasets( - data_id: list | None = ..., - output_format: Literal["dataframe"] = "dataframe", - **kwargs: Any, -) -> pd.DataFrame: ... - - -def _list_datasets( - data_id: list | None = None, - output_format: Literal["dict", "dataframe"] = "dict", - **kwargs: Any, -) -> dict | pd.DataFrame: +def _list_datasets(data_id: list[int] | None = None, **kwargs: Any) -> pd.DataFrame: """ Perform api call to return a list of all datasets. @@ -232,10 +143,6 @@ def _list_datasets( data_id : list, optional - output_format: str, optional (default='dict') - The parameter decides the format of the output. - - If 'dict' the output is a dict of dict - - If 'dataframe' the output is a pandas DataFrame kwargs : dict, optional Legal filter operators (keys in the dict): tag, status, limit, offset, data_name, data_version, number_instances, @@ -243,7 +150,7 @@ def _list_datasets( Returns ------- - datasets : dict of dicts, or dataframe + datasets : dataframe """ api_call = "data/list" @@ -252,21 +159,10 @@ def _list_datasets( api_call += f"/{operator}/{value}" if data_id is not None: api_call += "/data_id/{}".format(",".join([str(int(i)) for i in data_id])) - return __list_datasets(api_call=api_call, output_format=output_format) - - -@overload -def __list_datasets(api_call: str, output_format: Literal["dict"] = "dict") -> dict: ... - + return __list_datasets(api_call=api_call) -@overload -def __list_datasets(api_call: str, output_format: Literal["dataframe"]) -> pd.DataFrame: ... - -def __list_datasets( - api_call: str, - output_format: Literal["dict", "dataframe"] = "dict", -) -> dict | pd.DataFrame: +def __list_datasets(api_call: str) -> pd.DataFrame: xml_string = openml._api_calls._perform_api_call(api_call, "get") datasets_dict = xmltodict.parse(xml_string, force_list=("oml:dataset",)) @@ -295,10 +191,7 @@ def __list_datasets( dataset[quality["@name"]] = float(quality["#text"]) datasets[dataset["did"]] = dataset - if output_format == "dataframe": - datasets = pd.DataFrame.from_dict(datasets, orient="index") - - return datasets + return pd.DataFrame.from_dict(datasets, orient="index") def _expand_parameter(parameter: str | list[str] | None) -> list[str]: @@ -1493,8 +1386,7 @@ def _get_online_dataset_arff(dataset_id: int) -> str | None: def _get_online_dataset_format(dataset_id: int) -> str: - """Get the dataset format for a given dataset id - from the OpenML website. + """Get the dataset format for a given dataset id from the OpenML website. 
Parameters ---------- diff --git a/openml/evaluations/evaluation.py b/openml/evaluations/evaluation.py index 3cf732f25..70fab9f28 100644 --- a/openml/evaluations/evaluation.py +++ b/openml/evaluations/evaluation.py @@ -8,6 +8,8 @@ import openml.tasks +# TODO(eddiebergman): A lot of this class is automatically +# handled by a dataclass class OpenMLEvaluation: """ Contains all meta-information about a run / evaluation combination, @@ -78,6 +80,24 @@ def __init__( # noqa: PLR0913 self.values = values self.array_data = array_data + def _to_dict(self) -> dict: + return { + "run_id": self.run_id, + "task_id": self.task_id, + "setup_id": self.setup_id, + "flow_id": self.flow_id, + "flow_name": self.flow_name, + "data_id": self.data_id, + "data_name": self.data_name, + "function": self.function, + "upload_time": self.upload_time, + "uploader": self.uploader, + "uploader_name": self.uploader_name, + "value": self.value, + "values": self.values, + "array_data": self.array_data, + } + def __repr__(self) -> str: header = "OpenML Evaluation" header = "{}\n{}\n".format(header, "=" * len(header)) diff --git a/openml/evaluations/functions.py b/openml/evaluations/functions.py index a39096a58..3b4735250 100644 --- a/openml/evaluations/functions.py +++ b/openml/evaluations/functions.py @@ -3,7 +3,7 @@ from __future__ import annotations import json -import warnings +from itertools import chain from typing import Any from typing_extensions import Literal, overload @@ -31,8 +31,8 @@ def list_evaluations( study: int | None = ..., per_fold: bool | None = ..., sort_order: str | None = ..., - output_format: Literal["dict", "object"] = "dict", -) -> dict: ... + output_format: Literal["object"] = "object", +) -> dict[int, OpenMLEvaluation]: ... @overload @@ -66,10 +66,10 @@ def list_evaluations( study: int | None = None, per_fold: bool | None = None, sort_order: str | None = None, - output_format: Literal["object", "dict", "dataframe"] = "object", -) -> dict | pd.DataFrame: - """ - List all run-evaluation pairs matching all of the given filters. + output_format: Literal["object", "dataframe"] = "object", +) -> dict[int, OpenMLEvaluation] | pd.DataFrame: + """List all run-evaluation pairs matching all of the given filters. + (Supports large amount of results) Parameters @@ -105,33 +105,20 @@ def list_evaluations( output_format: str, optional (default='object') The parameter decides the format of the output. - If 'object' the output is a dict of OpenMLEvaluation objects - - If 'dict' the output is a dict of dict - If 'dataframe' the output is a pandas DataFrame Returns ------- dict or dataframe """ - if output_format not in ["dataframe", "dict", "object"]: - raise ValueError( - "Invalid output format selected. Only 'object', 'dataframe', or 'dict' applicable.", - ) - - # TODO: [0.15] - if output_format == "dict": - msg = ( - "Support for `output_format` of 'dict' will be removed in 0.15. " - "To ensure your code will continue to work, " - "use `output_format`='dataframe' or `output_format`='object'." - ) - warnings.warn(msg, category=FutureWarning, stacklevel=2) + if output_format not in ("dataframe", "object"): + raise ValueError("Invalid output format. 
Only 'object', 'dataframe'.") per_fold_str = None if per_fold is not None: per_fold_str = str(per_fold).lower() - return openml.utils._list_all( # type: ignore - list_output_format=output_format, # type: ignore + eval_collection: list[list[OpenMLEvaluation]] = openml.utils._list_all( listing_call=_list_evaluations, function=function, offset=offset, @@ -146,6 +133,13 @@ def list_evaluations( sort_order=sort_order, per_fold=per_fold_str, ) + flattened = list(chain.from_iterable(eval_collection)) + + if output_format == "dataframe": + records = [item._to_dict() for item in flattened] + return pd.DataFrame.from_records(records, index="run_id") + + return {e.run_id: e for e in flattened} def _list_evaluations( @@ -157,9 +151,8 @@ def _list_evaluations( uploaders: list | None = None, study: int | None = None, sort_order: str | None = None, - output_format: Literal["object", "dict", "dataframe"] = "object", **kwargs: Any, -) -> dict | pd.DataFrame: +) -> list[OpenMLEvaluation]: """ Perform API call ``/evaluation/function{function}/{filters}`` @@ -190,17 +183,9 @@ def _list_evaluations( sort_order : str, optional order of sorting evaluations, ascending ("asc") or descending ("desc") - output_format: str, optional (default='dict') - The parameter decides the format of the output. - - If 'dict' the output is a dict of dict - The parameter decides the format of the output. - - If 'dict' the output is a dict of dict - - If 'dataframe' the output is a pandas DataFrame - - If 'dataframe' the output is a pandas DataFrame - Returns ------- - dict of objects, or dataframe + list of OpenMLEvaluation objects """ api_call = f"evaluation/list/function/{function}" if kwargs is not None: @@ -221,13 +206,10 @@ def _list_evaluations( if sort_order is not None: api_call += f"/sort_order/{sort_order}" - return __list_evaluations(api_call, output_format=output_format) + return __list_evaluations(api_call) -def __list_evaluations( - api_call: str, - output_format: Literal["object", "dict", "dataframe"] = "object", -) -> dict | pd.DataFrame: +def __list_evaluations(api_call: str) -> list[OpenMLEvaluation]: """Helper function to parse API calls which are lists of runs""" xml_string = openml._api_calls._perform_api_call(api_call, "get") evals_dict = xmltodict.parse(xml_string, force_list=("oml:evaluation",)) @@ -241,29 +223,24 @@ def __list_evaluations( evals_dict["oml:evaluations"], ) - evals: dict[int, dict | OpenMLEvaluation] = {} uploader_ids = list( {eval_["oml:uploader"] for eval_ in evals_dict["oml:evaluations"]["oml:evaluation"]}, ) api_users = "user/list/user_id/" + ",".join(uploader_ids) xml_string_user = openml._api_calls._perform_api_call(api_users, "get") + users = xmltodict.parse(xml_string_user, force_list=("oml:user",)) user_dict = {user["oml:id"]: user["oml:username"] for user in users["oml:users"]["oml:user"]} + + evals = [] for eval_ in evals_dict["oml:evaluations"]["oml:evaluation"]: run_id = int(eval_["oml:run_id"]) - - value = None - if "oml:value" in eval_: - value = float(eval_["oml:value"]) - - values = None - if "oml:values" in eval_: - values = json.loads(eval_["oml:values"]) - + value = float(eval_["oml:value"]) if "oml:value" in eval_ else None + values = json.loads(eval_["oml:values"]) if eval_["oml:values"] is not None else None array_data = eval_.get("oml:array_data") - if output_format == "object": - evals[run_id] = OpenMLEvaluation( + evals.append( + OpenMLEvaluation( run_id=run_id, task_id=int(eval_["oml:task_id"]), setup_id=int(eval_["oml:setup_id"]), @@ -279,28 +256,7 @@ def 
__list_evaluations( values=values, array_data=array_data, ) - else: - # for output_format in ['dict', 'dataframe'] - evals[run_id] = { - "run_id": int(eval_["oml:run_id"]), - "task_id": int(eval_["oml:task_id"]), - "setup_id": int(eval_["oml:setup_id"]), - "flow_id": int(eval_["oml:flow_id"]), - "flow_name": eval_["oml:flow_name"], - "data_id": int(eval_["oml:data_id"]), - "data_name": eval_["oml:data_name"], - "function": eval_["oml:function"], - "upload_time": eval_["oml:upload_time"], - "uploader": int(eval_["oml:uploader"]), - "uploader_name": user_dict[eval_["oml:uploader"]], - "value": value, - "values": values, - "array_data": array_data, - } - - if output_format == "dataframe": - rows = list(evals.values()) - return pd.DataFrame.from_records(rows, columns=rows[0].keys()) # type: ignore + ) return evals @@ -321,9 +277,11 @@ def list_evaluation_measures() -> list[str]: qualities = xmltodict.parse(xml_string, force_list=("oml:measures")) # Minimalistic check if the XML is useful if "oml:evaluation_measures" not in qualities: - raise ValueError("Error in return XML, does not contain " '"oml:evaluation_measures"') + raise ValueError('Error in return XML, does not contain "oml:evaluation_measures"') + if not isinstance(qualities["oml:evaluation_measures"]["oml:measures"][0]["oml:measure"], list): - raise TypeError("Error in return XML, does not contain " '"oml:measure" as a list') + raise TypeError('Error in return XML, does not contain "oml:measure" as a list') + return qualities["oml:evaluation_measures"]["oml:measures"][0]["oml:measure"] @@ -343,14 +301,13 @@ def list_estimation_procedures() -> list[str]: # Minimalistic check if the XML is useful if "oml:estimationprocedures" not in api_results: - raise ValueError("Error in return XML, does not contain " '"oml:estimationprocedures"') + raise ValueError('Error in return XML, does not contain "oml:estimationprocedures"') + if "oml:estimationprocedure" not in api_results["oml:estimationprocedures"]: - raise ValueError("Error in return XML, does not contain " '"oml:estimationprocedure"') + raise ValueError('Error in return XML, does not contain "oml:estimationprocedure"') if not isinstance(api_results["oml:estimationprocedures"]["oml:estimationprocedure"], list): - raise TypeError( - "Error in return XML, does not contain " '"oml:estimationprocedure" as a list', - ) + raise TypeError('Error in return XML, does not contain "oml:estimationprocedure" as a list') return [ prod["oml:name"] @@ -370,11 +327,9 @@ def list_evaluations_setups( tag: str | None = None, per_fold: bool | None = None, sort_order: str | None = None, - output_format: str = "dataframe", parameters_in_separate_columns: bool = False, # noqa: FBT001, FBT002 -) -> dict | pd.DataFrame: - """ - List all run-evaluation pairs matching all of the given filters +) -> pd.DataFrame: + """List all run-evaluation pairs matching all of the given filters and their hyperparameter settings. Parameters @@ -400,23 +355,16 @@ def list_evaluations_setups( per_fold : bool, optional sort_order : str, optional order of sorting evaluations, ascending ("asc") or descending ("desc") - output_format: str, optional (default='dataframe') - The parameter decides the format of the output. - - If 'dict' the output is a dict of dict - - If 'dataframe' the output is a pandas DataFrame parameters_in_separate_columns: bool, optional (default= False) Returns hyperparameters in separate columns if set to True. 
Valid only for a single flow - Returns ------- - dict or dataframe with hyperparameter settings as a list of tuples. + dataframe with hyperparameter settings as a list of tuples. """ if parameters_in_separate_columns and (flows is None or len(flows) != 1): - raise ValueError( - "Can set parameters_in_separate_columns to true " "only for single flow_id", - ) + raise ValueError("Can set parameters_in_separate_columns to true only for single flow_id") # List evaluations evals = list_evaluations( @@ -439,11 +387,14 @@ def list_evaluations_setups( _df = pd.DataFrame() if len(evals) != 0: N = 100 # size of section - length = len(evals["setup_id"].unique()) # length of the array we want to split + uniq = np.asarray(evals["setup_id"].unique()) + length = len(uniq) + # array_split - allows indices_or_sections to not equally divide the array # array_split -length % N sub-arrays of size length//N + 1 and the rest of size length//N. - uniq = np.asarray(evals["setup_id"].unique()) - setup_chunks = np.array_split(uniq, ((length - 1) // N) + 1) + split_size = ((length - 1) // N) + 1 + setup_chunks = np.array_split(uniq, split_size) + setup_data = pd.DataFrame() for _setups in setup_chunks: result = openml.setups.list_setups(setup=_setups, output_format="dataframe") @@ -471,7 +422,4 @@ def list_evaluations_setups( axis=1, ) - if output_format == "dataframe": - return _df - - return _df.to_dict(orient="index") + return _df diff --git a/openml/flows/functions.py b/openml/flows/functions.py index 3d056ac60..e43e18c17 100644 --- a/openml/flows/functions.py +++ b/openml/flows/functions.py @@ -3,10 +3,8 @@ import os import re -import warnings from collections import OrderedDict -from typing import Any, Dict, overload -from typing_extensions import Literal +from typing import Any, Dict import dateutil.parser import pandas as pd @@ -133,44 +131,12 @@ def _get_flow_description(flow_id: int) -> OpenMLFlow: return _create_flow_from_xml(flow_xml) -@overload -def list_flows( - offset: int | None = ..., - size: int | None = ..., - tag: str | None = ..., - output_format: Literal["dict"] = "dict", - **kwargs: Any, -) -> dict: ... - - -@overload -def list_flows( - offset: int | None = ..., - size: int | None = ..., - tag: str | None = ..., - *, - output_format: Literal["dataframe"], - **kwargs: Any, -) -> pd.DataFrame: ... - - -@overload -def list_flows( - offset: int | None, - size: int | None, - tag: str | None, - output_format: Literal["dataframe"], - **kwargs: Any, -) -> pd.DataFrame: ... - - def list_flows( offset: int | None = None, size: int | None = None, tag: str | None = None, - output_format: Literal["dict", "dataframe"] = "dict", **kwargs: Any, -) -> dict | pd.DataFrame: +) -> pd.DataFrame: """ Return a list of all flows which are on OpenML. (Supports large amount of results) @@ -183,29 +149,12 @@ def list_flows( the maximum number of flows to return tag : str, optional the tag to include - output_format: str, optional (default='dict') - The parameter decides the format of the output. - - If 'dict' the output is a dict of dict - - If 'dataframe' the output is a pandas DataFrame kwargs: dict, optional Legal filter operators: uploader. Returns ------- - flows : dict of dicts, or dataframe - - If output_format='dict' - A mapping from flow_id to a dict giving a brief overview of the - respective flow. 
- Every flow is represented by a dictionary containing - the following information: - - flow id - - full name - - name - - version - - external version - - uploader - - - If output_format='dataframe' + flows : dataframe Each row maps to a dataset Each column contains the following information: - flow id @@ -215,61 +164,28 @@ def list_flows( - external version - uploader """ - if output_format not in ["dataframe", "dict"]: - raise ValueError( - "Invalid output format selected. " "Only 'dict' or 'dataframe' applicable.", - ) - - # TODO: [0.15] - if output_format == "dict": - msg = ( - "Support for `output_format` of 'dict' will be removed in 0.15 " - "and pandas dataframes will be returned instead. To ensure your code " - "will continue to work, use `output_format`='dataframe'." - ) - warnings.warn(msg, category=FutureWarning, stacklevel=2) - - return openml.utils._list_all( - list_output_format=output_format, + batches = openml.utils._list_all( listing_call=_list_flows, offset=offset, size=size, tag=tag, **kwargs, ) + return pd.concat(batches, ignore_index=True) -@overload -def _list_flows(output_format: Literal["dict"] = ..., **kwargs: Any) -> dict: ... - - -@overload -def _list_flows(*, output_format: Literal["dataframe"], **kwargs: Any) -> pd.DataFrame: ... - - -@overload -def _list_flows(output_format: Literal["dataframe"], **kwargs: Any) -> pd.DataFrame: ... - - -def _list_flows( - output_format: Literal["dict", "dataframe"] = "dict", **kwargs: Any -) -> dict | pd.DataFrame: +def _list_flows(**kwargs: Any) -> pd.DataFrame: """ Perform the api call that return a list of all flows. Parameters ---------- - output_format: str, optional (default='dict') - The parameter decides the format of the output. - - If 'dict' the output is a dict of dict - - If 'dataframe' the output is a pandas DataFrame - kwargs: dict, optional Legal filter operators: uploader, tag, limit, offset. Returns ------- - flows : dict, or dataframe + flows : dataframe """ api_call = "flow/list" @@ -277,7 +193,7 @@ def _list_flows( for operator, value in kwargs.items(): api_call += f"/{operator}/{value}" - return __list_flows(api_call=api_call, output_format=output_format) + return __list_flows(api_call=api_call) def flow_exists(name: str, external_version: str) -> int | bool: @@ -378,23 +294,12 @@ def get_flow_id( raise ValueError("exact_version should be False if model is None!") return flow_exists(name=flow_name, external_version=external_version) - flows = list_flows(output_format="dataframe") - assert isinstance(flows, pd.DataFrame) # Make mypy happy + flows = list_flows() flows = flows.query(f'name == "{flow_name}"') return flows["id"].to_list() # type: ignore[no-any-return] -@overload -def __list_flows(api_call: str, output_format: Literal["dict"] = "dict") -> dict: ... - - -@overload -def __list_flows(api_call: str, output_format: Literal["dataframe"]) -> pd.DataFrame: ... - - -def __list_flows( - api_call: str, output_format: Literal["dict", "dataframe"] = "dict" -) -> dict | pd.DataFrame: +def __list_flows(api_call: str) -> pd.DataFrame: """Retrieve information about flows from OpenML API and parse it to a dictionary or a Pandas DataFrame. @@ -402,8 +307,6 @@ def __list_flows( ---------- api_call: str Retrieves the information about flows. - output_format: str in {"dict", "dataframe"} - The output format. 
Returns ------- @@ -431,10 +334,7 @@ def __list_flows( } flows[fid] = flow - if output_format == "dataframe": - flows = pd.DataFrame.from_dict(flows, orient="index") - - return flows + return pd.DataFrame.from_dict(flows, orient="index") def _check_flow_for_server_id(flow: OpenMLFlow) -> None: diff --git a/openml/runs/functions.py b/openml/runs/functions.py index c6af4a481..cff649c1d 100644 --- a/openml/runs/functions.py +++ b/openml/runs/functions.py @@ -7,7 +7,6 @@ from collections import OrderedDict from pathlib import Path from typing import TYPE_CHECKING, Any -from typing_extensions import Literal import numpy as np import pandas as pd @@ -65,7 +64,6 @@ def run_model_on_task( # noqa: PLR0913 add_local_measures: bool = True, # noqa: FBT001, FBT002 upload_flow: bool = False, # noqa: FBT001, FBT002 return_flow: bool = False, # noqa: FBT001, FBT002 - dataset_format: Literal["array", "dataframe"] = "dataframe", n_jobs: int | None = None, ) -> OpenMLRun | tuple[OpenMLRun, OpenMLFlow]: """Run the model on the dataset defined by the task. @@ -94,9 +92,6 @@ def run_model_on_task( # noqa: PLR0913 If False, do not upload the flow to OpenML. return_flow : bool (default=False) If True, returns the OpenMLFlow generated from the model in addition to the OpenMLRun. - dataset_format : str (default='dataframe') - If 'array', the dataset is passed to the model as a numpy array. - If 'dataframe', the dataset is passed to the model as a pandas dataframe. n_jobs : int (default=None) The number of processes/threads to distribute the evaluation asynchronously. If `None` or `1`, then the evaluation is treated as synchronous and processed sequentially. @@ -170,7 +165,6 @@ def get_task_and_type_conversion(_task: int | str | OpenMLTask) -> OpenMLTask: seed=seed, add_local_measures=add_local_measures, upload_flow=upload_flow, - dataset_format=dataset_format, n_jobs=n_jobs, ) if return_flow: @@ -186,7 +180,6 @@ def run_flow_on_task( # noqa: C901, PLR0912, PLR0915, PLR0913 seed: int | None = None, add_local_measures: bool = True, # noqa: FBT001, FBT002 upload_flow: bool = False, # noqa: FBT001, FBT002 - dataset_format: Literal["array", "dataframe"] = "dataframe", n_jobs: int | None = None, ) -> OpenMLRun: """Run the model provided by the flow on the dataset defined by task. @@ -216,9 +209,6 @@ def run_flow_on_task( # noqa: C901, PLR0912, PLR0915, PLR0913 upload_flow : bool (default=False) If True, upload the flow to OpenML if it does not exist yet. If False, do not upload the flow to OpenML. - dataset_format : str (default='dataframe') - If 'array', the dataset is passed to the model as a numpy array. - If 'dataframe', the dataset is passed to the model as a pandas dataframe. n_jobs : int (default=None) The number of processes/threads to distribute the evaluation asynchronously. If `None` or `1`, then the evaluation is treated as synchronous and processed sequentially. 
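(Editorial sketch, not part of the patch.) With ``dataset_format`` removed from
``run_model_on_task`` and ``run_flow_on_task``, callers simply drop the argument; the model now
always receives pandas data, so estimators that cannot handle categorical columns need an explicit
encoding step. The task id and pipeline below are illustrative assumptions, not taken from the
patch:

    from sklearn.compose import make_column_selector, make_column_transformer
    from sklearn.pipeline import make_pipeline
    from sklearn.preprocessing import OneHotEncoder
    from sklearn.tree import DecisionTreeClassifier

    import openml

    task = openml.tasks.get_task(3)  # any supervised classification task
    clf = make_pipeline(
        make_column_transformer(
            (OneHotEncoder(handle_unknown="ignore"), make_column_selector(dtype_include="category")),
            remainder="passthrough",
        ),
        DecisionTreeClassifier(),
    )
    # Previously: run_model_on_task(clf, task, dataset_format="dataframe", upload_flow=False)
    run = openml.runs.run_model_on_task(clf, task, upload_flow=False)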
@@ -306,7 +296,6 @@ def run_flow_on_task( # noqa: C901, PLR0912, PLR0915, PLR0913 task=task, extension=flow.extension, add_local_measures=add_local_measures, - dataset_format=dataset_format, n_jobs=n_jobs, ) @@ -468,13 +457,12 @@ def run_exists(task_id: int, setup_id: int) -> set[int]: return set() -def _run_task_get_arffcontent( # noqa: PLR0915, PLR0912, PLR0913, C901 +def _run_task_get_arffcontent( # noqa: PLR0915, PLR0912, C901 *, model: Any, task: OpenMLTask, extension: Extension, add_local_measures: bool, - dataset_format: Literal["array", "dataframe"], n_jobs: int | None = None, ) -> tuple[ list[list], @@ -495,8 +483,6 @@ def _run_task_get_arffcontent( # noqa: PLR0915, PLR0912, PLR0913, C901 The OpenML extension object. add_local_measures : bool Whether to compute additional local evaluation measures. - dataset_format : str - The format in which to download the dataset. n_jobs : int Number of jobs to run in parallel. If None, use 1 core by default. If -1, use all available cores. @@ -560,7 +546,6 @@ def _run_task_get_arffcontent( # noqa: PLR0915, PLR0912, PLR0913, C901 rep_no=rep_no, sample_no=sample_no, task=task, - dataset_format=dataset_format, configuration=_config, ) for _n_fit, rep_no, fold_no, sample_no in jobs @@ -704,7 +689,6 @@ def _run_task_get_arffcontent_parallel_helper( # noqa: PLR0913 rep_no: int, sample_no: int, task: OpenMLTask, - dataset_format: Literal["array", "dataframe"], configuration: _Config | None = None, ) -> tuple[ np.ndarray, @@ -730,8 +714,6 @@ def _run_task_get_arffcontent_parallel_helper( # noqa: PLR0913 Sample number to be run. task : OpenMLTask The task object from OpenML. - dataset_format : str - The dataset format to be used. configuration : _Config Hyperparameters to configure the model. @@ -755,24 +737,15 @@ def _run_task_get_arffcontent_parallel_helper( # noqa: PLR0913 ) if isinstance(task, OpenMLSupervisedTask): - x, y = task.get_X_and_y(dataset_format=dataset_format) - if isinstance(x, pd.DataFrame): - assert isinstance(y, (pd.Series, pd.DataFrame)) - train_x = x.iloc[train_indices] - train_y = y.iloc[train_indices] - test_x = x.iloc[test_indices] - test_y = y.iloc[test_indices] - else: - # TODO(eddiebergman): Complains spmatrix doesn't support __getitem__ for typing - assert y is not None - train_x = x[train_indices] # type: ignore - train_y = y[train_indices] - test_x = x[test_indices] # type: ignore - test_y = y[test_indices] + x, y = task.get_X_and_y() + assert isinstance(y, (pd.Series, pd.DataFrame)) + train_x = x.iloc[train_indices] + train_y = y.iloc[train_indices] + test_x = x.iloc[test_indices] + test_y = y.iloc[test_indices] elif isinstance(task, OpenMLClusteringTask): - x = task.get_X(dataset_format=dataset_format) - # TODO(eddiebergman): Complains spmatrix doesn't support __getitem__ for typing - train_x = x.iloc[train_indices] if isinstance(x, pd.DataFrame) else x[train_indices] # type: ignore + x = task.get_X() + train_x = x.iloc[train_indices] train_y = None test_x = None test_y = None @@ -793,8 +766,7 @@ def _run_task_get_arffcontent_parallel_helper( # noqa: PLR0913 model=model, task=task, X_train=train_x, - # TODO(eddiebergman): Likely should not be ignored - y_train=train_y, # type: ignore + y_train=train_y, rep_no=rep_no, fold_no=fold_no, X_test=test_x, @@ -1050,8 +1022,6 @@ def _get_cached_run(run_id: int) -> OpenMLRun: raise OpenMLCacheException(f"Run file for run id {run_id} not cached") from e -# TODO(eddiebergman): Could overload, likely too large an annoying to do -# nvm, will be deprecated in 0.15 def list_runs( # 
noqa: PLR0913 offset: int | None = None, size: int | None = None, @@ -1063,11 +1033,10 @@ def list_runs( # noqa: PLR0913 tag: str | None = None, study: int | None = None, display_errors: bool = False, # noqa: FBT001, FBT002 - output_format: Literal["dict", "dataframe"] = "dict", **kwargs: Any, -) -> dict | pd.DataFrame: - """ - List all runs matching all of the given filters. +) -> pd.DataFrame: + """List all runs matching all of the given filters. + (Supports large amount of results) Parameters @@ -1095,31 +1064,13 @@ def list_runs( # noqa: PLR0913 Whether to list runs which have an error (for example a missing prediction file). - output_format: str, optional (default='dict') - The parameter decides the format of the output. - - If 'dict' the output is a dict of dict - - If 'dataframe' the output is a pandas DataFrame - kwargs : dict, optional Legal filter operators: task_type. Returns ------- - dict of dicts, or dataframe + dataframe """ - if output_format not in ["dataframe", "dict"]: - raise ValueError("Invalid output format selected. Only 'dict' or 'dataframe' applicable.") - - # TODO: [0.15] - if output_format == "dict": - msg = ( - "Support for `output_format` of 'dict' will be removed in 0.15 " - "and pandas dataframes will be returned instead. To ensure your code " - "will continue to work, use `output_format`='dataframe'." - ) - warnings.warn(msg, category=FutureWarning, stacklevel=2) - - # TODO(eddiebergman): Do we really need this runtime type validation? if id is not None and (not isinstance(id, list)): raise TypeError("id must be of type list.") if task is not None and (not isinstance(task, list)): @@ -1131,8 +1082,7 @@ def list_runs( # noqa: PLR0913 if uploader is not None and (not isinstance(uploader, list)): raise TypeError("uploader must be of type list.") - return openml.utils._list_all( # type: ignore - list_output_format=output_format, # type: ignore + batches = openml.utils._list_all( listing_call=_list_runs, offset=offset, size=size, @@ -1146,6 +1096,7 @@ def list_runs( # noqa: PLR0913 display_errors=display_errors, **kwargs, ) + return pd.concat(batches, ignore_index=True) def _list_runs( # noqa: PLR0913 @@ -1156,9 +1107,8 @@ def _list_runs( # noqa: PLR0913 uploader: list | None = None, study: int | None = None, display_errors: bool = False, # noqa: FBT002, FBT001 - output_format: Literal["dict", "dataframe"] = "dict", **kwargs: Any, -) -> dict | pd.DataFrame: +) -> pd.DataFrame: """ Perform API call `/run/list/{filters}' ` @@ -1186,18 +1136,12 @@ def _list_runs( # noqa: PLR0913 Whether to list runs which have an error (for example a missing prediction file). - output_format: str, optional (default='dict') - The parameter decides the format of the output. - - If 'dict' the output is a dict of dict - - If 'dataframe' the output is a pandas DataFrame - kwargs : dict, optional Legal filter operators: task_type. Returns ------- - dict, or dataframe - List of found runs. + dataframe of found runs. 
""" api_call = "run/list" if kwargs is not None: @@ -1217,12 +1161,10 @@ def _list_runs( # noqa: PLR0913 api_call += "/study/%d" % study if display_errors: api_call += "/show_errors/true" - return __list_runs(api_call=api_call, output_format=output_format) + return __list_runs(api_call=api_call) -def __list_runs( - api_call: str, output_format: Literal["dict", "dataframe"] = "dict" -) -> dict | pd.DataFrame: +def __list_runs(api_call: str) -> pd.DataFrame: """Helper function to parse API calls which are lists of runs""" xml_string = openml._api_calls._perform_api_call(api_call, "get") runs_dict = xmltodict.parse(xml_string, force_list=("oml:run",)) @@ -1257,11 +1199,7 @@ def __list_runs( } for r in runs_dict["oml:runs"]["oml:run"] } - - if output_format == "dataframe": - runs = pd.DataFrame.from_dict(runs, orient="index") - - return runs + return pd.DataFrame.from_dict(runs, orient="index") def format_prediction( # noqa: PLR0913 diff --git a/openml/setups/functions.py b/openml/setups/functions.py index 0bcd2b4e2..383e2e16e 100644 --- a/openml/setups/functions.py +++ b/openml/setups/functions.py @@ -1,8 +1,8 @@ # License: BSD 3-Clause from __future__ import annotations -import warnings from collections import OrderedDict +from itertools import chain from pathlib import Path from typing import Any, Iterable from typing_extensions import Literal @@ -125,7 +125,7 @@ def get_setup(setup_id: int) -> OpenMLSetup: fh.write(setup_xml) result_dict = xmltodict.parse(setup_xml) - return _create_setup_from_xml(result_dict, output_format="object") # type: ignore + return _create_setup_from_xml(result_dict) def list_setups( # noqa: PLR0913 @@ -134,8 +134,8 @@ def list_setups( # noqa: PLR0913 flow: int | None = None, tag: str | None = None, setup: Iterable[int] | None = None, - output_format: Literal["object", "dict", "dataframe"] = "object", -) -> dict | pd.DataFrame: + output_format: Literal["object", "dataframe"] = "object", +) -> dict[int, OpenMLSetup] | pd.DataFrame: """ List all setups matching all of the given filters. @@ -148,30 +148,19 @@ def list_setups( # noqa: PLR0913 setup : Iterable[int], optional output_format: str, optional (default='object') The parameter decides the format of the output. - - If 'dict' the output is a dict of dict - If 'dataframe' the output is a pandas DataFrame Returns ------- dict or dataframe """ - if output_format not in ["dataframe", "dict", "object"]: + if output_format not in ["dataframe", "object"]: raise ValueError( - "Invalid output format selected. " "Only 'dict', 'object', or 'dataframe' applicable.", + "Invalid output format selected. Only 'object', or 'dataframe' applicable.", ) - # TODO: [0.15] - if output_format == "dict": - msg = ( - "Support for `output_format` of 'dict' will be removed in 0.15. " - "To ensure your code will continue to work, " - "use `output_format`='dataframe' or `output_format`='object'." 
- ) - warnings.warn(msg, category=FutureWarning, stacklevel=2) - batch_size = 1000 # batch size for setups is lower - return openml.utils._list_all( # type: ignore - list_output_format=output_format, # type: ignore + batches = openml.utils._list_all( listing_call=_list_setups, offset=offset, size=size, @@ -180,15 +169,15 @@ def list_setups( # noqa: PLR0913 setup=setup, batch_size=batch_size, ) + flattened = list(chain.from_iterable(batches)) + if output_format == "object": + return {setup.setup_id: setup for setup in flattened} + return pd.DataFrame.from_records([setup._to_dict() for setup in flattened], index="setup_id") -def _list_setups( - setup: Iterable[int] | None = None, - output_format: Literal["dict", "dataframe", "object"] = "object", - **kwargs: Any, -) -> dict[int, dict] | pd.DataFrame | dict[int, OpenMLSetup]: - """ - Perform API call `/setup/list/{filters}` + +def _list_setups(setup: Iterable[int] | None = None, **kwargs: Any) -> list[OpenMLSetup]: + """Perform API call `/setup/list/{filters}` Parameters ---------- @@ -197,18 +186,12 @@ def _list_setups( setup : list(int), optional - output_format: str, optional (default='dict') - The parameter decides the format of the output. - - If 'dict' the output is a dict of dict - - If 'dataframe' the output is a pandas DataFrame - - If 'object' the output is a dict of OpenMLSetup objects - kwargs: dict, optional Legal filter operators: flow, setup, limit, offset, tag. Returns ------- - dict or dataframe or list[OpenMLSetup] + The setups that match the filters, going from id to the OpenMLSetup object. """ api_call = "setup/list" if setup is not None: @@ -217,12 +200,10 @@ def _list_setups( for operator, value in kwargs.items(): api_call += f"/{operator}/{value}" - return __list_setups(api_call=api_call, output_format=output_format) + return __list_setups(api_call=api_call) -def __list_setups( - api_call: str, output_format: Literal["dict", "dataframe", "object"] = "object" -) -> dict[int, dict] | pd.DataFrame | dict[int, OpenMLSetup]: +def __list_setups(api_call: str) -> list[OpenMLSetup]: """Helper function to parse API calls which are lists of setups""" xml_string = openml._api_calls._perform_api_call(api_call, "get") setups_dict = xmltodict.parse(xml_string, force_list=("oml:setup",)) @@ -247,22 +228,10 @@ def __list_setups( assert isinstance(setups_dict["oml:setups"]["oml:setup"], list), type(setups_dict["oml:setups"]) - setups = {} - for setup_ in setups_dict["oml:setups"]["oml:setup"]: - # making it a dict to give it the right format - current = _create_setup_from_xml( - {"oml:setup_parameters": setup_}, - output_format=output_format, - ) - if output_format == "object": - setups[current.setup_id] = current # type: ignore - else: - setups[current["setup_id"]] = current # type: ignore - - if output_format == "dataframe": - setups = pd.DataFrame.from_dict(setups, orient="index") - - return setups + return [ + _create_setup_from_xml({"oml:setup_parameters": setup_}) + for setup_ in setups_dict["oml:setups"]["oml:setup"] + ] def initialize_model(setup_id: int) -> Any: @@ -297,9 +266,7 @@ def initialize_model(setup_id: int) -> Any: return flow.extension.flow_to_model(flow) -def _to_dict( - flow_id: int, openml_parameter_settings: list[OpenMLParameter] | list[dict[str, Any]] -) -> OrderedDict: +def _to_dict(flow_id: int, openml_parameter_settings: list[dict[str, Any]]) -> OrderedDict: """Convert a flow ID and a list of OpenML parameter settings to a dictionary representation that can be serialized to XML. 
@@ -307,7 +274,7 @@ def _to_dict( ---------- flow_id : int ID of the flow. - openml_parameter_settings : List[OpenMLParameter] + openml_parameter_settings : list[dict[str, Any]] A list of OpenML parameter settings. Returns @@ -325,77 +292,41 @@ def _to_dict( return xml -def _create_setup_from_xml( - result_dict: dict, output_format: Literal["dict", "dataframe", "object"] = "object" -) -> OpenMLSetup | dict[str, int | dict[int, Any] | None]: +def _create_setup_from_xml(result_dict: dict) -> OpenMLSetup: """Turns an API xml result into a OpenMLSetup object (or dict)""" - if output_format in ["dataframe", "dict"]: - _output_format: Literal["dict", "object"] = "dict" - elif output_format == "object": - _output_format = "object" - else: - raise ValueError( - f"Invalid output format selected: {output_format}" - "Only 'dict', 'object', or 'dataframe' applicable.", - ) - setup_id = int(result_dict["oml:setup_parameters"]["oml:setup_id"]) flow_id = int(result_dict["oml:setup_parameters"]["oml:flow_id"]) + if "oml:parameter" not in result_dict["oml:setup_parameters"]: - parameters = None + return OpenMLSetup(setup_id, flow_id, parameters=None) + + xml_parameters = result_dict["oml:setup_parameters"]["oml:parameter"] + if isinstance(xml_parameters, dict): + parameters = { + int(xml_parameters["oml:id"]): _create_setup_parameter_from_xml(xml_parameters), + } + elif isinstance(xml_parameters, list): + parameters = { + int(xml_parameter["oml:id"]): _create_setup_parameter_from_xml(xml_parameter) + for xml_parameter in xml_parameters + } else: - parameters = {} - # basically all others - xml_parameters = result_dict["oml:setup_parameters"]["oml:parameter"] - if isinstance(xml_parameters, dict): - oml_id = int(xml_parameters["oml:id"]) - parameters[oml_id] = _create_setup_parameter_from_xml( - result_dict=xml_parameters, - output_format=_output_format, - ) - elif isinstance(xml_parameters, list): - for xml_parameter in xml_parameters: - oml_id = int(xml_parameter["oml:id"]) - parameters[oml_id] = _create_setup_parameter_from_xml( - result_dict=xml_parameter, - output_format=_output_format, - ) - else: - raise ValueError( - "Expected None, list or dict, received " - f"something else: {type(xml_parameters)!s}", - ) - - if _output_format in ["dataframe", "dict"]: - return {"setup_id": setup_id, "flow_id": flow_id, "parameters": parameters} + raise ValueError( + f"Expected None, list or dict, received something else: {type(xml_parameters)!s}", + ) + return OpenMLSetup(setup_id, flow_id, parameters) -def _create_setup_parameter_from_xml( - result_dict: dict[str, str], output_format: Literal["object", "dict"] = "object" -) -> dict[str, int | str] | OpenMLParameter: +def _create_setup_parameter_from_xml(result_dict: dict[str, str]) -> OpenMLParameter: """Create an OpenMLParameter object or a dictionary from an API xml result.""" - if output_format == "object": - return OpenMLParameter( - input_id=int(result_dict["oml:id"]), - flow_id=int(result_dict["oml:flow_id"]), - flow_name=result_dict["oml:flow_name"], - full_name=result_dict["oml:full_name"], - parameter_name=result_dict["oml:parameter_name"], - data_type=result_dict["oml:data_type"], - default_value=result_dict["oml:default_value"], - value=result_dict["oml:value"], - ) - - # FIXME: likely we want to crash here if unknown output_format but not backwards compatible - # output_format == "dict" case, - return { - "input_id": int(result_dict["oml:id"]), - "flow_id": int(result_dict["oml:flow_id"]), - "flow_name": result_dict["oml:flow_name"], - "full_name": 
result_dict["oml:full_name"], - "parameter_name": result_dict["oml:parameter_name"], - "data_type": result_dict["oml:data_type"], - "default_value": result_dict["oml:default_value"], - "value": result_dict["oml:value"], - } + return OpenMLParameter( + input_id=int(result_dict["oml:id"]), + flow_id=int(result_dict["oml:flow_id"]), + flow_name=result_dict["oml:flow_name"], + full_name=result_dict["oml:full_name"], + parameter_name=result_dict["oml:parameter_name"], + data_type=result_dict["oml:data_type"], + default_value=result_dict["oml:default_value"], + value=result_dict["oml:value"], + ) diff --git a/openml/setups/setup.py b/openml/setups/setup.py index e8dc059e7..83b258dd7 100644 --- a/openml/setups/setup.py +++ b/openml/setups/setup.py @@ -7,6 +7,7 @@ import openml.flows +# TODO(eddiebergman): All functionality handled by dataclass class OpenMLSetup: """Setup object (a.k.a. Configuration). @@ -34,6 +35,15 @@ def __init__(self, setup_id: int, flow_id: int, parameters: dict[int, Any] | Non self.flow_id = flow_id self.parameters = parameters + def _to_dict(self) -> dict[str, Any]: + return { + "setup_id": self.setup_id, + "flow_id": self.flow_id, + "parameters": {p.input_id: p._to_dict() for p in self.parameters.values()} + if self.parameters is not None + else None, + } + def __repr__(self) -> str: header = "OpenML Setup" header = "{}\n{}\n".format(header, "=" * len(header)) @@ -57,6 +67,7 @@ def __repr__(self) -> str: return header + body +# TODO(eddiebergman): All functionality handled by dataclass class OpenMLParameter: """Parameter object (used in setup). @@ -102,6 +113,18 @@ def __init__( # noqa: PLR0913 self.default_value = default_value self.value = value + def _to_dict(self) -> dict[str, Any]: + return { + "id": self.id, + "flow_id": self.flow_id, + "flow_name": self.flow_name, + "full_name": self.full_name, + "parameter_name": self.parameter_name, + "data_type": self.data_type, + "default_value": self.default_value, + "value": self.value, + } + def __repr__(self) -> str: header = "OpenML Parameter" header = "{}\n{}\n".format(header, "=" * len(header)) diff --git a/openml/study/functions.py b/openml/study/functions.py index 7fdc6f636..f7db1e0aa 100644 --- a/openml/study/functions.py +++ b/openml/study/functions.py @@ -3,8 +3,7 @@ from __future__ import annotations import warnings -from typing import TYPE_CHECKING, Any, overload -from typing_extensions import Literal +from typing import TYPE_CHECKING, Any import pandas as pd import xmltodict @@ -433,33 +432,12 @@ def detach_from_study(study_id: int, run_ids: list[int]) -> int: return int(result["oml:linked_entities"]) -@overload -def list_suites( - offset: int | None = ..., - size: int | None = ..., - status: str | None = ..., - uploader: list[int] | None = ..., - output_format: Literal["dict"] = "dict", -) -> dict: ... - - -@overload -def list_suites( - offset: int | None = ..., - size: int | None = ..., - status: str | None = ..., - uploader: list[int] | None = ..., - output_format: Literal["dataframe"] = "dataframe", -) -> pd.DataFrame: ... - - def list_suites( offset: int | None = None, size: int | None = None, status: str | None = None, uploader: list[int] | None = None, - output_format: Literal["dict", "dataframe"] = "dict", -) -> dict | pd.DataFrame: +) -> pd.DataFrame: """ Return a list of all suites which are on OpenML. @@ -474,49 +452,20 @@ def list_suites( suites are returned. uploader : list (int), optional Result filter. Will only return suites created by these users. 
- output_format: str, optional (default='dict') - The parameter decides the format of the output. - - If 'dict' the output is a dict of dict - - If 'dataframe' the output is a pandas DataFrame Returns ------- - datasets : dict of dicts, or dataframe - - If output_format='dict' - Every suite is represented by a dictionary containing the following information: - - id - - alias (optional) - - name - - main_entity_type - - status - - creator - - creation_date - - - If output_format='dataframe' - Every row is represented by a dictionary containing the following information: - - id - - alias (optional) - - name - - main_entity_type - - status - - creator - - creation_date + datasets : dataframe + Every row is represented by a dictionary containing the following information: + - id + - alias (optional) + - name + - main_entity_type + - status + - creator + - creation_date """ - if output_format not in ["dataframe", "dict"]: - raise ValueError( - "Invalid output format selected. " "Only 'dict' or 'dataframe' applicable.", - ) - # TODO: [0.15] - if output_format == "dict": - msg = ( - "Support for `output_format` of 'dict' will be removed in 0.15 " - "and pandas dataframes will be returned instead. To ensure your code " - "will continue to work, use `output_format`='dataframe'." - ) - warnings.warn(msg, category=FutureWarning, stacklevel=2) - - return openml.utils._list_all( # type: ignore - list_output_format=output_format, # type: ignore + batches = openml.utils._list_all( listing_call=_list_studies, offset=offset, size=size, @@ -524,28 +473,7 @@ def list_suites( status=status, uploader=uploader, ) - - -@overload -def list_studies( - offset: int | None = ..., - size: int | None = ..., - status: str | None = ..., - uploader: list[str] | None = ..., - benchmark_suite: int | None = ..., - output_format: Literal["dict"] = "dict", -) -> dict: ... - - -@overload -def list_studies( - offset: int | None = ..., - size: int | None = ..., - status: str | None = ..., - uploader: list[str] | None = ..., - benchmark_suite: int | None = ..., - output_format: Literal["dataframe"] = "dataframe", -) -> pd.DataFrame: ... + return pd.concat(batches, ignore_index=True) def list_studies( @@ -554,8 +482,7 @@ def list_studies( status: str | None = None, uploader: list[str] | None = None, benchmark_suite: int | None = None, - output_format: Literal["dict", "dataframe"] = "dict", -) -> dict | pd.DataFrame: +) -> pd.DataFrame: """ Return a list of all studies which are on OpenML. @@ -571,55 +498,23 @@ def list_studies( uploader : list (int), optional Result filter. Will only return studies created by these users. benchmark_suite : int, optional - output_format: str, optional (default='dict') - The parameter decides the format of the output. - - If 'dict' the output is a dict of dict - - If 'dataframe' the output is a pandas DataFrame Returns ------- - datasets : dict of dicts, or dataframe - - If output_format='dict' - Every dataset is represented by a dictionary containing - the following information: - - id - - alias (optional) - - name - - benchmark_suite (optional) - - status - - creator - - creation_date - If qualities are calculated for the dataset, some of - these are also returned. - - - If output_format='dataframe' - Every dataset is represented by a dictionary containing - the following information: - - id - - alias (optional) - - name - - benchmark_suite (optional) - - status - - creator - - creation_date - If qualities are calculated for the dataset, some of - these are also returned. 
+ datasets : dataframe + Every dataset is represented by a dictionary containing + the following information: + - id + - alias (optional) + - name + - benchmark_suite (optional) + - status + - creator + - creation_date + If qualities are calculated for the dataset, some of + these are also returned. """ - if output_format not in ["dataframe", "dict"]: - raise ValueError( - "Invalid output format selected. " "Only 'dict' or 'dataframe' applicable.", - ) - # TODO: [0.15] - if output_format == "dict": - msg = ( - "Support for `output_format` of 'dict' will be removed in 0.15 " - "and pandas dataframes will be returned instead. To ensure your code " - "will continue to work, use `output_format`='dataframe'." - ) - warnings.warn(msg, category=FutureWarning, stacklevel=2) - - return openml.utils._list_all( # type: ignore - list_output_format=output_format, # type: ignore + batches = openml.utils._list_all( listing_call=_list_studies, offset=offset, size=size, @@ -628,54 +523,30 @@ def list_studies( uploader=uploader, benchmark_suite=benchmark_suite, ) + return pd.concat(batches, ignore_index=True) -@overload -def _list_studies(output_format: Literal["dict"] = "dict", **kwargs: Any) -> dict: ... - - -@overload -def _list_studies(output_format: Literal["dataframe"], **kwargs: Any) -> pd.DataFrame: ... - - -def _list_studies( - output_format: Literal["dict", "dataframe"] = "dict", **kwargs: Any -) -> dict | pd.DataFrame: - """ - Perform api call to return a list of studies. +def _list_studies(**kwargs: Any) -> pd.DataFrame: + """Perform api call to return a list of studies. Parameters ---------- - output_format: str, optional (default='dict') - The parameter decides the format of the output. - - If 'dict' the output is a dict of dict - - If 'dataframe' the output is a pandas DataFrame kwargs : dict, optional Legal filter operators (keys in the dict): status, limit, offset, main_entity_type, uploader Returns ------- - studies : dict of dicts + studies : dataframe """ api_call = "study/list" if kwargs is not None: for operator, value in kwargs.items(): api_call += f"/{operator}/{value}" - return __list_studies(api_call=api_call, output_format=output_format) - - -@overload -def __list_studies(api_call: str, output_format: Literal["dict"] = "dict") -> dict: ... - - -@overload -def __list_studies(api_call: str, output_format: Literal["dataframe"]) -> pd.DataFrame: ... + return __list_studies(api_call=api_call) -def __list_studies( - api_call: str, output_format: Literal["dict", "dataframe"] = "dict" -) -> dict | pd.DataFrame: +def __list_studies(api_call: str) -> pd.DataFrame: """Retrieves the list of OpenML studies and returns it in a dictionary or a Pandas DataFrame. @@ -683,13 +554,10 @@ def __list_studies( ---------- api_call : str The API call for retrieving the list of OpenML studies. - output_format : str in {"dict", "dataframe"} - Format of the output, either 'object' for a dictionary - or 'dataframe' for a Pandas DataFrame. Returns ------- - Union[Dict, pd.DataFrame] + pd.DataFrame A dictionary or Pandas DataFrame of OpenML studies, depending on the value of 'output_format'. 
""" @@ -725,6 +593,4 @@ def __list_studies( current_study["id"] = int(current_study["id"]) studies[study_id] = current_study - if output_format == "dataframe": - studies = pd.DataFrame.from_dict(studies, orient="index") - return studies + return pd.DataFrame.from_dict(studies, orient="index") diff --git a/openml/tasks/functions.py b/openml/tasks/functions.py index 54030422d..6877658f7 100644 --- a/openml/tasks/functions.py +++ b/openml/tasks/functions.py @@ -5,7 +5,6 @@ import re import warnings from typing import Any -from typing_extensions import Literal import pandas as pd import xmltodict @@ -131,9 +130,8 @@ def list_tasks( offset: int | None = None, size: int | None = None, tag: str | None = None, - output_format: Literal["dict", "dataframe"] = "dict", **kwargs: Any, -) -> dict | pd.DataFrame: +) -> pd.DataFrame: """ Return a number of tasks having the given tag and task_type @@ -150,10 +148,6 @@ def list_tasks( the maximum number of tasks to show tag : str, optional the tag to include - output_format: str, optional (default='dict') - The parameter decides the format of the output. - - If 'dict' the output is a dict of dict - - If 'dataframe' the output is a pandas DataFrame kwargs: dict, optional Legal filter operators: data_tag, status, data_id, data_name, number_instances, number_features, @@ -161,31 +155,13 @@ def list_tasks( Returns ------- - dict - All tasks having the given task_type and the give tag. Every task is - represented by a dictionary containing the following information: - task id, dataset id, task_type and status. If qualities are calculated - for the associated dataset, some of these are also returned. dataframe All tasks having the given task_type and the give tag. Every task is represented by a row in the data frame containing the following information as columns: task id, dataset id, task_type and status. If qualities are calculated for the associated dataset, some of these are also returned. """ - if output_format not in ["dataframe", "dict"]: - raise ValueError( - "Invalid output format selected. " "Only 'dict' or 'dataframe' applicable.", - ) - # TODO: [0.15] - if output_format == "dict": - msg = ( - "Support for `output_format` of 'dict' will be removed in 0.15 " - "and pandas dataframes will be returned instead. To ensure your code " - "will continue to work, use `output_format`='dataframe'." - ) - warnings.warn(msg, category=FutureWarning, stacklevel=2) - return openml.utils._list_all( # type: ignore - list_output_format=output_format, # type: ignore + batches = openml.utils._list_all( listing_call=_list_tasks, task_type=task_type, offset=offset, @@ -193,13 +169,10 @@ def list_tasks( tag=tag, **kwargs, ) + return pd.concat(batches, ignore_index=True) -def _list_tasks( - task_type: TaskType | None = None, - output_format: Literal["dict", "dataframe"] = "dict", - **kwargs: Any, -) -> dict | pd.DataFrame: +def _list_tasks(task_type: TaskType | None = None, **kwargs: Any) -> pd.DataFrame: """ Perform the api call to return a number of tasks having the given filters. @@ -210,10 +183,6 @@ def _list_tasks( type when used as a filter in list tasks call. task_type : TaskType, optional Refers to the type of task. - output_format: str, optional (default='dict') - The parameter decides the format of the output. 
- - If 'dict' the output is a dict of dict - - If 'dataframe' the output is a pandas DataFrame kwargs: dict, optional Legal filter operators: tag, task_id (list), data_tag, status, limit, offset, data_id, data_name, number_instances, number_features, @@ -221,7 +190,7 @@ def _list_tasks( Returns ------- - dict or dataframe + dataframe """ api_call = "task/list" if task_type is not None: @@ -232,27 +201,20 @@ def _list_tasks( value = ",".join([str(int(i)) for i in value]) # noqa: PLW2901 api_call += f"/{operator}/{value}" - return __list_tasks(api_call=api_call, output_format=output_format) + return __list_tasks(api_call=api_call) -# TODO(eddiebergman): overload todefine type returned -def __list_tasks( # noqa: PLR0912, C901 - api_call: str, - output_format: Literal["dict", "dataframe"] = "dict", -) -> dict | pd.DataFrame: - """Returns a dictionary or a Pandas DataFrame with information about OpenML tasks. +def __list_tasks(api_call: str) -> pd.DataFrame: # noqa: C901, PLR0912 + """Returns a Pandas DataFrame with information about OpenML tasks. Parameters ---------- api_call : str The API call specifying which tasks to return. - output_format : str in {"dict", "dataframe"} - Output format for the returned object. Returns ------- - Union[Dict, pd.DataFrame] - A dictionary or a Pandas DataFrame with information about OpenML tasks. + A Pandas DataFrame with information about OpenML tasks. Raises ------ @@ -339,13 +301,9 @@ def __list_tasks( # noqa: PLR0912, C901 else: warnings.warn(f"Could not find key {e} in {task_}!", RuntimeWarning, stacklevel=2) - if output_format == "dataframe": - tasks = pd.DataFrame.from_dict(tasks, orient="index") - - return tasks + return pd.DataFrame.from_dict(tasks, orient="index") -# TODO(eddiebergman): Maybe since this isn't public api, we can make it keyword only? def get_tasks( task_ids: list[int], download_data: bool | None = None, @@ -590,7 +548,7 @@ def create_task( task_type_id=task_type, task_type="None", # TODO: refactor to get task type string from ID. data_set_id=dataset_id, - target_name=target_name, + target_name=target_name, # type: ignore estimation_procedure_id=estimation_procedure_id, evaluation_measure=evaluation_measure, **kwargs, diff --git a/openml/tasks/task.py b/openml/tasks/task.py index e7d19bdce..395b52482 100644 --- a/openml/tasks/task.py +++ b/openml/tasks/task.py @@ -8,7 +8,7 @@ from enum import Enum from pathlib import Path from typing import TYPE_CHECKING, Any, Sequence -from typing_extensions import Literal, TypedDict, overload +from typing_extensions import TypedDict import openml._api_calls import openml.config @@ -21,7 +21,6 @@ if TYPE_CHECKING: import numpy as np import pandas as pd - import scipy.sparse # TODO(eddiebergman): Should use `auto()` but might be too late if these numbers are used @@ -277,52 +276,14 @@ def __init__( # noqa: PLR0913 self.target_name = target_name - @overload - def get_X_and_y( - self, dataset_format: Literal["array"] = "array" - ) -> tuple[ - np.ndarray | scipy.sparse.spmatrix, - np.ndarray | None, - ]: ... - - @overload - def get_X_and_y( - self, dataset_format: Literal["dataframe"] - ) -> tuple[ - pd.DataFrame, - pd.Series | pd.DataFrame | None, - ]: ... - - # TODO(eddiebergman): Do all OpenMLSupervisedTask have a `y`? 
- def get_X_and_y( - self, dataset_format: Literal["dataframe", "array"] = "array" - ) -> tuple[ - np.ndarray | pd.DataFrame | scipy.sparse.spmatrix, - np.ndarray | pd.Series | pd.DataFrame | None, - ]: + def get_X_and_y(self) -> tuple[pd.DataFrame, pd.Series | pd.DataFrame | None]: """Get data associated with the current task. - Parameters - ---------- - dataset_format : str - Data structure of the returned data. See :meth:`openml.datasets.OpenMLDataset.get_data` - for possible options. - Returns ------- tuple - X and y """ - # TODO: [0.15] - if dataset_format == "array": - warnings.warn( - "Support for `dataset_format='array'` will be removed in 0.15," - "start using `dataset_format='dataframe' to ensure your code " - "will continue to work. You can use the dataframe's `to_numpy` " - "function to continue using numpy arrays.", - category=FutureWarning, - stacklevel=2, - ) dataset = self.get_dataset() if self.task_type_id not in ( TaskType.SUPERVISED_CLASSIFICATION, @@ -331,10 +292,7 @@ def get_X_and_y( ): raise NotImplementedError(self.task_type) - X, y, _, _ = dataset.get_data( - dataset_format=dataset_format, - target=self.target_name, - ) + X, y, _, _ = dataset.get_data(target=self.target_name) return X, y def _to_dict(self) -> dict[str, dict]: @@ -536,34 +494,15 @@ def __init__( # noqa: PLR0913 self.target_name = target_name - @overload - def get_X( - self, - dataset_format: Literal["array"] = "array", - ) -> np.ndarray | scipy.sparse.spmatrix: ... - - @overload - def get_X(self, dataset_format: Literal["dataframe"]) -> pd.DataFrame: ... - - def get_X( - self, - dataset_format: Literal["array", "dataframe"] = "array", - ) -> np.ndarray | pd.DataFrame | scipy.sparse.spmatrix: + def get_X(self) -> pd.DataFrame: """Get data associated with the current task. - Parameters - ---------- - dataset_format : str - Data structure of the returned data. See :meth:`openml.datasets.OpenMLDataset.get_data` - for possible options. - Returns ------- - tuple - X and y - + The X data as a dataframe """ dataset = self.get_dataset() - data, *_ = dataset.get_data(dataset_format=dataset_format, target=None) + data, *_ = dataset.get_data(target=None) return data def _to_dict(self) -> dict[str, dict[str, int | str | list[dict[str, Any]]]]: diff --git a/openml/utils.py b/openml/utils.py index 66c4df800..aecf860d8 100644 --- a/openml/utils.py +++ b/openml/utils.py @@ -6,11 +6,10 @@ import warnings from functools import wraps from pathlib import Path -from typing import TYPE_CHECKING, Any, Callable, Mapping, TypeVar, overload +from typing import TYPE_CHECKING, Any, Callable, Mapping, Sized, TypeVar, overload from typing_extensions import Literal, ParamSpec import numpy as np -import pandas as pd import xmltodict from minio.helpers import ProgressType from tqdm import tqdm @@ -27,6 +26,7 @@ P = ParamSpec("P") R = TypeVar("R") + _SizedT = TypeVar("_SizedT", bound=Sized) @overload @@ -237,39 +237,11 @@ def _delete_entity(entity_type: str, entity_id: int) -> bool: raise -@overload -def _list_all( - listing_call: Callable[P, Any], - list_output_format: Literal["dict"] = ..., - *args: P.args, - **filters: P.kwargs, -) -> dict: ... - - -@overload -def _list_all( - listing_call: Callable[P, Any], - list_output_format: Literal["object"], - *args: P.args, - **filters: P.kwargs, -) -> dict: ... - - -@overload -def _list_all( - listing_call: Callable[P, Any], - list_output_format: Literal["dataframe"], - *args: P.args, - **filters: P.kwargs, -) -> pd.DataFrame: ... 
- - -def _list_all( # noqa: C901, PLR0912 - listing_call: Callable[P, Any], - list_output_format: Literal["dict", "dataframe", "object"] = "dict", +def _list_all( # noqa: C901 + listing_call: Callable[P, _SizedT], *args: P.args, **filters: P.kwargs, -) -> dict | pd.DataFrame: +) -> list[_SizedT]: """Helper to handle paged listing requests. Example usage: @@ -280,11 +252,6 @@ def _list_all( # noqa: C901, PLR0912 ---------- listing_call : callable Call listing, e.g. list_evaluations. - list_output_format : str, optional (default='dict') - The parameter decides the format of the output. - - If 'dict' the output is a dict of dict - - If 'dataframe' the output is a pandas DataFrame - - If 'object' the output is a dict of objects (only for some `listing_call`) *args : Variable length argument list Any required arguments for the listing call. **filters : Arbitrary keyword arguments @@ -294,12 +261,12 @@ def _list_all( # noqa: C901, PLR0912 Returns ------- - dict or dataframe + List of types returned from the listing call """ # eliminate filters that have a None value active_filters = {key: value for key, value in filters.items() if value is not None} page = 0 - result = pd.DataFrame() if list_output_format == "dataframe" else {} + results: list[_SizedT] = [] # Default batch size per paging. # This one can be set in filters (batch_size), but should not be @@ -313,6 +280,8 @@ def _list_all( # noqa: C901, PLR0912 if (LIMIT is not None) and (not isinstance(LIMIT, int)) and (not np.isinf(LIMIT)): raise ValueError(f"'limit' should be an integer or inf but got {LIMIT}") + # If our batch size is larger than the limit, we should only + # request one batch of size of LIMIT if LIMIT is not None and BATCH_SIZE_ORIG > LIMIT: BATCH_SIZE_ORIG = LIMIT @@ -326,24 +295,17 @@ def _list_all( # noqa: C901, PLR0912 current_offset = offset + BATCH_SIZE_ORIG * page new_batch = listing_call( *args, - output_format=list_output_format, # type: ignore **{**active_filters, "limit": batch_size, "offset": current_offset}, # type: ignore ) except openml.exceptions.OpenMLServerNoResult: - # we want to return an empty dict in this case # NOTE: This above statement may not actually happen, but we could just return here # to enforce it... break - if list_output_format == "dataframe": - if len(result) == 0: - result = new_batch - else: - result = pd.concat([result, new_batch], ignore_index=True) - else: - # For output_format = 'dict' (or catch all) - result.update(new_batch) + results.append(new_batch) + # If the batch is less than our requested batch_size, that's the last batch + # and we can bail out. 
if len(new_batch) < batch_size: break @@ -352,14 +314,15 @@ def _list_all( # noqa: C901, PLR0912 # check if the number of required results has been achieved # always do a 'bigger than' check, # in case of bugs to prevent infinite loops - if len(result) >= LIMIT: + n_recieved = sum(len(result) for result in results) + if n_recieved >= LIMIT: break # check if there are enough results to fulfill a batch - if LIMIT - len(result) < BATCH_SIZE_ORIG: - batch_size = LIMIT - len(result) + if LIMIT - n_recieved < BATCH_SIZE_ORIG: + batch_size = LIMIT - n_recieved - return result + return results def _get_cache_dir_for_key(key: str) -> Path: diff --git a/tests/test_datasets/test_dataset.py b/tests/test_datasets/test_dataset.py index 4598b8985..102429b41 100644 --- a/tests/test_datasets/test_dataset.py +++ b/tests/test_datasets/test_dataset.py @@ -95,25 +95,8 @@ def test__unpack_categories_with_nan_likes(self): self.assertListEqual(list(clean_series.values), expected_values) self.assertListEqual(list(clean_series.cat.categories.values), list("ab")) - def test_get_data_array(self): - # Basic usage - rval, _, categorical, attribute_names = self.dataset.get_data(dataset_format="array") - assert isinstance(rval, np.ndarray) - assert rval.dtype == np.float32 - assert rval.shape == (898, 39) - assert len(categorical) == 39 - assert all(isinstance(cat, bool) for cat in categorical) - assert len(attribute_names) == 39 - assert all(isinstance(att, str) for att in attribute_names) - assert _ is None - - # check that an error is raised when the dataset contains string - err_msg = "PyOpenML cannot handle string when returning numpy arrays" - with pytest.raises(PyOpenMLError, match=err_msg): - self.titanic.get_data(dataset_format="array") - def test_get_data_pandas(self): - data, _, _, _ = self.titanic.get_data(dataset_format="dataframe") + data, _, _, _ = self.titanic.get_data() assert isinstance(data, pd.DataFrame) assert data.shape[1] == len(self.titanic.features) assert data.shape[0] == 1309 @@ -137,7 +120,6 @@ def test_get_data_pandas(self): assert data[col_name].dtype.name == col_dtype[col_name] X, y, _, _ = self.titanic.get_data( - dataset_format="dataframe", target=self.titanic.default_target_attribute, ) assert isinstance(X, pd.DataFrame) @@ -160,12 +142,6 @@ def test_get_data_boolean_pandas(self): assert data["c"].dtype.name == "category" assert set(data["c"].cat.categories) == {True, False} - def test_get_data_no_str_data_for_nparrays(self): - # check that an error is raised when the dataset contains string - err_msg = "PyOpenML cannot handle string when returning numpy arrays" - with pytest.raises(PyOpenMLError, match=err_msg): - self.titanic.get_data(dataset_format="array") - def _check_expected_type(self, dtype, is_cat, col): if is_cat: expected_type = "category" @@ -193,16 +169,6 @@ def test_get_data_with_rowid(self): assert rval.shape == (898, 38) assert len(categorical) == 38 - def test_get_data_with_target_array(self): - X, y, _, attribute_names = self.dataset.get_data(dataset_format="array", target="class") - assert isinstance(X, np.ndarray) - assert X.dtype == np.float32 - assert X.shape == (898, 38) - assert y.dtype in [np.int32, np.int64] - assert y.shape == (898,) - assert len(attribute_names) == 38 - assert "class" not in attribute_names - @pytest.mark.skip("https://github.com/openml/openml-python/issues/1157") def test_get_data_with_target_pandas(self): X, y, categorical, attribute_names = self.dataset.get_data(target="class") @@ -247,13 +213,8 @@ def 
test_get_data_with_nonexisting_class(self): # This class is using the anneal dataset with labels [1, 2, 3, 4, 5, 'U']. However, # label 4 does not exist and we test that the features 5 and 'U' are correctly mapped to # indices 4 and 5, and that nothing is mapped to index 3. - _, y, _, _ = self.dataset.get_data("class", dataset_format="dataframe") + _, y, _, _ = self.dataset.get_data("class") assert list(y.dtype.categories) == ["1", "2", "3", "4", "5", "U"] - _, y, _, _ = self.dataset.get_data("class", dataset_format="array") - assert np.min(y) == 0 - assert np.max(y) == 5 - # Check that no label is mapped to 3, since it is reserved for label '4'. - assert np.sum(y == 3) == 0 def test_get_data_corrupt_pickle(self): # Lazy loaded dataset, populate cache. @@ -345,21 +306,20 @@ def test_get_feature_with_ontology_data_id_11(self): def test_add_remove_ontology_to_dataset(self): did = 1 feature_index = 1 - ontology = 'https://www.openml.org/unittest/' + str(time()) + ontology = "https://www.openml.org/unittest/" + str(time()) openml.datasets.functions.data_feature_add_ontology(did, feature_index, ontology) openml.datasets.functions.data_feature_remove_ontology(did, feature_index, ontology) def test_add_same_ontology_multiple_features(self): did = 1 - ontology = 'https://www.openml.org/unittest/' + str(time()) + ontology = "https://www.openml.org/unittest/" + str(time()) for i in range(3): openml.datasets.functions.data_feature_add_ontology(did, i, ontology) - def test_add_illegal_long_ontology(self): did = 1 - ontology = 'http://www.google.com/' + ('a' * 257) + ontology = "http://www.google.com/" + ("a" * 257) try: openml.datasets.functions.data_feature_add_ontology(did, 1, ontology) assert False @@ -368,13 +328,14 @@ def test_add_illegal_long_ontology(self): def test_add_illegal_url_ontology(self): did = 1 - ontology = 'not_a_url' + str(time()) + ontology = "not_a_url" + str(time()) try: openml.datasets.functions.data_feature_add_ontology(did, 1, ontology) assert False except openml.exceptions.OpenMLServerException as e: assert e.code == 1106 + @pytest.mark.production() class OpenMLDatasetTestSparse(TestBase): _multiprocess_can_split_ = True @@ -385,28 +346,8 @@ def setUp(self): self.sparse_dataset = openml.datasets.get_dataset(4136, download_data=False) - def test_get_sparse_dataset_array_with_target(self): - X, y, _, attribute_names = self.sparse_dataset.get_data( - dataset_format="array", - target="class", - ) - - assert sparse.issparse(X) - assert X.dtype == np.float32 - assert X.shape == (600, 20000) - - assert isinstance(y, np.ndarray) - assert y.dtype in [np.int32, np.int64] - assert y.shape == (600,) - - assert len(attribute_names) == 20000 - assert "class" not in attribute_names - def test_get_sparse_dataset_dataframe_with_target(self): - X, y, _, attribute_names = self.sparse_dataset.get_data( - dataset_format="dataframe", - target="class", - ) + X, y, _, attribute_names = self.sparse_dataset.get_data(target="class") assert isinstance(X, pd.DataFrame) assert isinstance(X.dtypes[0], pd.SparseDtype) assert X.shape == (600, 20000) @@ -418,18 +359,6 @@ def test_get_sparse_dataset_dataframe_with_target(self): assert len(attribute_names) == 20000 assert "class" not in attribute_names - def test_get_sparse_dataset_array(self): - rval, _, categorical, attribute_names = self.sparse_dataset.get_data(dataset_format="array") - assert sparse.issparse(rval) - assert rval.dtype == np.float32 - assert rval.shape == (600, 20001) - - assert len(categorical) == 20001 - assert all(isinstance(cat, bool) 
for cat in categorical) - - assert len(attribute_names) == 20001 - assert all(isinstance(att, str) for att in attribute_names) - def test_get_sparse_dataset_dataframe(self): rval, *_ = self.sparse_dataset.get_data() assert isinstance(rval, pd.DataFrame) @@ -439,52 +368,12 @@ def test_get_sparse_dataset_dataframe(self): ) assert rval.shape == (600, 20001) - def test_get_sparse_dataset_with_rowid(self): - self.sparse_dataset.row_id_attribute = ["V256"] - rval, _, categorical, _ = self.sparse_dataset.get_data( - dataset_format="array", - include_row_id=True, - ) - assert sparse.issparse(rval) - assert rval.dtype == np.float32 - assert rval.shape == (600, 20001) - assert len(categorical) == 20001 - - rval, _, categorical, _ = self.sparse_dataset.get_data( - dataset_format="array", - include_row_id=False, - ) - assert sparse.issparse(rval) - assert rval.dtype == np.float32 - assert rval.shape == (600, 20000) - assert len(categorical) == 20000 - - def test_get_sparse_dataset_with_ignore_attributes(self): - self.sparse_dataset.ignore_attribute = ["V256"] - rval, _, categorical, _ = self.sparse_dataset.get_data( - dataset_format="array", - include_ignore_attribute=True, - ) - assert sparse.issparse(rval) - assert rval.dtype == np.float32 - assert rval.shape == (600, 20001) - - assert len(categorical) == 20001 - rval, _, categorical, _ = self.sparse_dataset.get_data( - dataset_format="array", - include_ignore_attribute=False, - ) - assert sparse.issparse(rval) - assert rval.dtype == np.float32 - assert rval.shape == (600, 20000) - assert len(categorical) == 20000 - def test_get_sparse_dataset_rowid_and_ignore_and_target(self): # TODO: re-add row_id and ignore attributes self.sparse_dataset.ignore_attribute = ["V256"] self.sparse_dataset.row_id_attribute = ["V512"] + # TODO(eddiebergman): Will break from dataset_format removal X, y, categorical, _ = self.sparse_dataset.get_data( - dataset_format="array", target="class", include_row_id=False, include_ignore_attribute=False, diff --git a/tests/test_datasets/test_dataset_functions.py b/tests/test_datasets/test_dataset_functions.py index a15100070..3995f1b98 100644 --- a/tests/test_datasets/test_dataset_functions.py +++ b/tests/test_datasets/test_dataset_functions.py @@ -308,7 +308,8 @@ def ensure_absence_of_real_data(): def test_get_dataset_sparse(self): dataset = openml.datasets.get_dataset(102) - X, *_ = dataset.get_data(dataset_format="array") + # TODO(eddiebergman): Will break from dataset_format removal + X, *_ = dataset.get_data() assert isinstance(X, scipy.sparse.csr_matrix) def test_download_rowid(self): diff --git a/tests/test_extensions/test_sklearn_extension/test_sklearn_extension.py b/tests/test_extensions/test_sklearn_extension/test_sklearn_extension.py index e181aaa15..242f6793c 100644 --- a/tests/test_extensions/test_sklearn_extension/test_sklearn_extension.py +++ b/tests/test_extensions/test_sklearn_extension/test_sklearn_extension.py @@ -261,7 +261,7 @@ def test_serialize_model(self): ("min_samples_split", "2"), ("min_weight_fraction_leaf", "0.0"), ("presort", presort_val), - ('monotonic_cst', 'null'), + ("monotonic_cst", "null"), ("random_state", "null"), ("splitter", '"best"'), ), @@ -331,21 +331,23 @@ def test_serialize_model_clustering(self): n_init = '"auto"' algorithm = '"auto"' if sklearn_version < Version("1.1") else '"lloyd"' - fixture_parameters = OrderedDict([ - ("algorithm", algorithm), - ("copy_x", "true"), - ("init", '"k-means++"'), - ("max_iter", "300"), - ("n_clusters", "8"), - ("n_init", n_init), - ("n_jobs", 
n_jobs_val), - ("precompute_distances", precomp_val), - ("random_state", "null"), - ("tol", "0.0001"), - ("verbose", "0"), - ]) - - if sklearn_version >= Version("1.0" ): + fixture_parameters = OrderedDict( + [ + ("algorithm", algorithm), + ("copy_x", "true"), + ("init", '"k-means++"'), + ("max_iter", "300"), + ("n_clusters", "8"), + ("n_init", n_init), + ("n_jobs", n_jobs_val), + ("precompute_distances", precomp_val), + ("random_state", "null"), + ("tol", "0.0001"), + ("verbose", "0"), + ] + ) + + if sklearn_version >= Version("1.0"): fixture_parameters.pop("n_jobs") fixture_parameters.pop("precompute_distances") @@ -369,7 +371,9 @@ def test_serialize_model_clustering(self): @pytest.mark.sklearn() def test_serialize_model_with_subcomponent(self): - estimator_name = "base_estimator" if Version(sklearn.__version__) < Version("1.4") else "estimator" + estimator_name = ( + "base_estimator" if Version(sklearn.__version__) < Version("1.4") else "estimator" + ) estimator_param = {estimator_name: sklearn.tree.DecisionTreeClassifier()} model = sklearn.ensemble.AdaBoostClassifier( n_estimators=100, @@ -428,8 +432,7 @@ def test_serialize_model_with_subcomponent(self): serialization.components[estimator_name].class_name == fixture_subcomponent_class_name ) assert ( - serialization.components[estimator_name].description - == fixture_subcomponent_description + serialization.components[estimator_name].description == fixture_subcomponent_description ) self.assertDictEqual(structure, fixture_structure) @@ -702,7 +705,9 @@ def test_serialize_column_transformer_pipeline(self): reason="Pipeline processing behaviour updated", ) def test_serialize_feature_union(self): - sparse_parameter = "sparse" if Version(sklearn.__version__) < Version("1.4") else "sparse_output" + sparse_parameter = ( + "sparse" if Version(sklearn.__version__) < Version("1.4") else "sparse_output" + ) ohe_params = {sparse_parameter: False} if Version(sklearn.__version__) >= Version("0.20"): ohe_params["categories"] = "auto" @@ -719,7 +724,9 @@ def test_serialize_feature_union(self): ) structure = serialization.get_structure("name") # OneHotEncoder was moved to _encoders module in 0.20 - module_name_encoder = "_encoders" if Version(sklearn.__version__) >= Version("0.20") else "data" + module_name_encoder = ( + "_encoders" if Version(sklearn.__version__) >= Version("0.20") else "data" + ) scaler_name = "data" if Version(sklearn.__version__) < Version("0.22") else "_data" fixture_name = ( "sklearn.pipeline.FeatureUnion(" @@ -765,7 +772,9 @@ def test_serialize_feature_union(self): @pytest.mark.sklearn() def test_serialize_feature_union_switched_names(self): - ohe_params = {"categories": "auto"} if Version(sklearn.__version__) >= Version("0.20") else {} + ohe_params = ( + {"categories": "auto"} if Version(sklearn.__version__) >= Version("0.20") else {} + ) ohe = sklearn.preprocessing.OneHotEncoder(**ohe_params) scaler = sklearn.preprocessing.StandardScaler() fu1 = sklearn.pipeline.FeatureUnion(transformer_list=[("ohe", ohe), ("scaler", scaler)]) @@ -787,7 +796,9 @@ def test_serialize_feature_union_switched_names(self): ) # OneHotEncoder was moved to _encoders module in 0.20 - module_name_encoder = "_encoders" if Version(sklearn.__version__) >= Version("0.20") else "data" + module_name_encoder = ( + "_encoders" if Version(sklearn.__version__) >= Version("0.20") else "data" + ) scaler_name = "data" if Version(sklearn.__version__) < Version("0.22") else "_data" assert ( fu1_serialization.name == "sklearn.pipeline.FeatureUnion(" @@ -836,7 +847,9 @@ 
def test_serialize_complex_flow(self): ) structure = serialized.get_structure("name") # OneHotEncoder was moved to _encoders module in 0.20 - module_name_encoder = "_encoders" if Version(sklearn.__version__) >= Version("0.20") else "data" + module_name_encoder = ( + "_encoders" if Version(sklearn.__version__) >= Version("0.20") else "data" + ) ohe_name = "sklearn.preprocessing.%s.OneHotEncoder" % module_name_encoder scaler_name = "sklearn.preprocessing.{}.StandardScaler".format( "data" if Version(sklearn.__version__) < Version("0.22") else "_data", @@ -1294,7 +1307,9 @@ def test_paralizable_check(self): # using this param distribution should not raise an exception legal_param_dist = {"n_estimators": [2, 3, 4]} - estimator_name = "base_estimator" if Version(sklearn.__version__) < Version("1.4") else "estimator" + estimator_name = ( + "base_estimator" if Version(sklearn.__version__) < Version("1.4") else "estimator" + ) legal_models = [ sklearn.ensemble.RandomForestClassifier(), sklearn.ensemble.RandomForestClassifier(n_jobs=5), @@ -1506,7 +1521,9 @@ def test_deserialize_complex_with_defaults(self): pipe_adjusted = sklearn.clone(pipe_orig) impute_strategy = "median" if Version(sklearn.__version__) < Version("0.23") else "mean" sparse = Version(sklearn.__version__) >= Version("0.23") - sparse_parameter = "sparse" if Version(sklearn.__version__) < Version("1.4") else "sparse_output" + sparse_parameter = ( + "sparse" if Version(sklearn.__version__) < Version("1.4") else "sparse_output" + ) estimator_name = ( "base_estimator" if Version(sklearn.__version__) < Version("1.2") else "estimator" ) @@ -1532,7 +1549,9 @@ def test_deserialize_complex_with_defaults(self): @pytest.mark.sklearn() def test_openml_param_name_to_sklearn(self): scaler = sklearn.preprocessing.StandardScaler(with_mean=False) - estimator_name = "base_estimator" if Version(sklearn.__version__) < Version("1.4") else "estimator" + estimator_name = ( + "base_estimator" if Version(sklearn.__version__) < Version("1.4") else "estimator" + ) boosting = sklearn.ensemble.AdaBoostClassifier( **{estimator_name: sklearn.tree.DecisionTreeClassifier()}, ) @@ -1569,17 +1588,21 @@ def test_openml_param_name_to_sklearn(self): def test_obtain_parameter_values_flow_not_from_server(self): model = sklearn.linear_model.LogisticRegression(solver="lbfgs") flow = self.extension.model_to_flow(model) - logistic_name = "logistic" if Version(sklearn.__version__) < Version("0.22") else "_logistic" + logistic_name = ( + "logistic" if Version(sklearn.__version__) < Version("0.22") else "_logistic" + ) msg = f"Flow sklearn.linear_model.{logistic_name}.LogisticRegression has no flow_id!" 
with pytest.raises(ValueError, match=msg): self.extension.obtain_parameter_values(flow) - estimator_name = "base_estimator" if Version(sklearn.__version__) < Version("1.4") else "estimator" + estimator_name = ( + "base_estimator" if Version(sklearn.__version__) < Version("1.4") else "estimator" + ) model = sklearn.ensemble.AdaBoostClassifier( **{ estimator_name: sklearn.linear_model.LogisticRegression( - solver="lbfgs", + solver="lbfgs", ), } ) @@ -1650,7 +1673,7 @@ def test_run_model_on_task(self): ("dummy", sklearn.dummy.DummyClassifier()), ], ) - openml.runs.run_model_on_task(pipe, task, dataset_format="array") + openml.runs.run_model_on_task(pipe, task) @pytest.mark.sklearn() def test_seed_model(self): @@ -1751,7 +1774,9 @@ def test_run_model_on_fold_classification_1_array(self): assert np.any(y_hat_proba.iloc[:, i].to_numpy() != np.zeros(y_test.shape)) # check user defined measures - fold_evaluations: dict[str, dict[int, dict[int, float]]] = collections.defaultdict(lambda: collections.defaultdict(dict)) + fold_evaluations: dict[str, dict[int, dict[int, float]]] = collections.defaultdict( + lambda: collections.defaultdict(dict) + ) for measure in user_defined_measures: fold_evaluations[measure][0][0] = user_defined_measures[measure] @@ -1778,7 +1803,7 @@ def test_run_model_on_fold_classification_1_dataframe(self): task = openml.tasks.get_task(1) # anneal; crossvalidation # diff test_run_model_on_fold_classification_1_array() - X, y = task.get_X_and_y(dataset_format="dataframe") + X, y = task.get_X_and_y() train_indices, test_indices = task.get_train_test_split_indices(repeat=0, fold=0, sample=0) X_train = X.iloc[train_indices] y_train = y.iloc[train_indices] @@ -1786,7 +1811,9 @@ def test_run_model_on_fold_classification_1_dataframe(self): y_test = y.iloc[test_indices] # Helper functions to return required columns for ColumnTransformer - sparse = {"sparse" if Version(sklearn.__version__) < Version("1.4") else "sparse_output": False} + sparse = { + "sparse" if Version(sklearn.__version__) < Version("1.4") else "sparse_output": False + } cat_imp = make_pipeline( SimpleImputer(strategy="most_frequent"), OneHotEncoder(handle_unknown="ignore", **sparse), @@ -1825,7 +1852,9 @@ def test_run_model_on_fold_classification_1_dataframe(self): assert np.any(y_hat_proba.iloc[:, i].to_numpy() != np.zeros(y_test.shape)) # check user defined measures - fold_evaluations: dict[str, dict[int, dict[int, float]]] = collections.defaultdict(lambda: collections.defaultdict(dict)) + fold_evaluations: dict[str, dict[int, dict[int, float]]] = collections.defaultdict( + lambda: collections.defaultdict(dict) + ) for measure in user_defined_measures: fold_evaluations[measure][0][0] = user_defined_measures[measure] @@ -1878,7 +1907,9 @@ def test_run_model_on_fold_classification_2(self): assert np.any(y_hat_proba.to_numpy()[:, i] != np.zeros(y_test.shape)) # check user defined measures - fold_evaluations: dict[str, dict[int, dict[int, float]]] = collections.defaultdict(lambda: collections.defaultdict(dict)) + fold_evaluations: dict[str, dict[int, dict[int, float]]] = collections.defaultdict( + lambda: collections.defaultdict(dict) + ) for measure in user_defined_measures: fold_evaluations[measure][0][0] = user_defined_measures[measure] @@ -2001,7 +2032,9 @@ def test_run_model_on_fold_regression(self): assert y_hat_proba is None # check user defined measures - fold_evaluations: dict[str, dict[int, dict[int, float]]] = collections.defaultdict(lambda: collections.defaultdict(dict)) + fold_evaluations: dict[str, dict[int, 
dict[int, float]]] = collections.defaultdict( + lambda: collections.defaultdict(dict) + ) for measure in user_defined_measures: fold_evaluations[measure][0][0] = user_defined_measures[measure] @@ -2023,7 +2056,7 @@ def test_run_model_on_fold_clustering(self): openml.config.server = self.production_server task = openml.tasks.get_task(126033) - X = task.get_X(dataset_format="array") + X = task.get_X() pipeline = sklearn.pipeline.Pipeline( steps=[("imp", SimpleImputer()), ("clf", sklearn.cluster.KMeans())], @@ -2045,7 +2078,9 @@ def test_run_model_on_fold_clustering(self): assert y_hat_proba is None # check user defined measures - fold_evaluations: dict[str, dict[int, dict[int, float]]] = collections.defaultdict(lambda: collections.defaultdict(dict)) + fold_evaluations: dict[str, dict[int, dict[int, float]]] = collections.defaultdict( + lambda: collections.defaultdict(dict) + ) for measure in user_defined_measures: fold_evaluations[measure][0][0] = user_defined_measures[measure] @@ -2186,7 +2221,6 @@ def test_run_on_model_with_empty_steps(self): X, y, categorical_ind, feature_names = dataset.get_data( target=dataset.default_target_attribute, - dataset_format="array", ) categorical_ind = np.array(categorical_ind) (cat_idx,) = np.where(categorical_ind) diff --git a/tests/test_runs/test_run_functions.py b/tests/test_runs/test_run_functions.py index d43a8bab5..960a72d6d 100644 --- a/tests/test_runs/test_run_functions.py +++ b/tests/test_runs/test_run_functions.py @@ -413,7 +413,6 @@ def test_run_regression_on_classif_task(self): model=clf, task=task, avoid_duplicate_runs=False, - dataset_format="array", ) @pytest.mark.sklearn() @@ -1355,7 +1354,6 @@ def test__run_task_get_arffcontent(self): model=clf, task=task, add_local_measures=True, - dataset_format="dataframe", ) arff_datacontent, trace, fold_evaluations, _ = res # predictions @@ -1601,7 +1599,6 @@ def test_run_on_dataset_with_missing_labels_dataframe(self): task=task, extension=self.extension, add_local_measures=True, - dataset_format="dataframe", ) # 2 folds, 5 repeats; keep in mind that this task comes from the test # server, the task on the live server is different @@ -1645,7 +1642,6 @@ def test_run_on_dataset_with_missing_labels_array(self): task=task, extension=self.extension, add_local_measures=True, - dataset_format="array", # diff test_run_on_dataset_with_missing_labels_dataframe() ) # 2 folds, 5 repeats; keep in mind that this task comes from the test # server, the task on the live server is different @@ -1767,7 +1763,7 @@ def test_format_prediction_task_regression(self): def test__run_task_get_arffcontent_2(self, parallel_mock): """Tests if a run executed in parallel is collated correctly.""" task = openml.tasks.get_task(7) # Supervised Classification on kr-vs-kp - x, y = task.get_X_and_y(dataset_format="dataframe") + x, y = task.get_X_and_y() num_instances = x.shape[0] line_length = 6 + len(task.class_labels) loss = "log" if Version(sklearn.__version__) < Version("1.3") else "log_loss" @@ -1780,7 +1776,7 @@ def test__run_task_get_arffcontent_2(self, parallel_mock): model=clf, task=task, add_local_measures=True, - dataset_format="array", # "dataframe" would require handling of categoricals + # dataset_format="array", # "dataframe" would require handling of categoricals n_jobs=n_jobs, ) # This unit test will fail if joblib is unable to distribute successfully since the @@ -1825,7 +1821,7 @@ def test__run_task_get_arffcontent_2(self, parallel_mock): def test_joblib_backends(self, parallel_mock): """Tests evaluation of a run using 
various joblib backends and n_jobs.""" task = openml.tasks.get_task(7) # Supervised Classification on kr-vs-kp - x, y = task.get_X_and_y(dataset_format="dataframe") + x, y = task.get_X_and_y() num_instances = x.shape[0] line_length = 6 + len(task.class_labels) @@ -1865,7 +1861,7 @@ def test_joblib_backends(self, parallel_mock): model=clf, task=task, add_local_measures=True, - dataset_format="array", # "dataframe" would require handling of categoricals + # dataset_format="array", # "dataframe" would require handling of categoricals n_jobs=n_jobs, ) assert type(res[0]) == list diff --git a/tests/test_setups/test_setup_functions.py b/tests/test_setups/test_setup_functions.py index 9e357f6aa..06cf821c2 100644 --- a/tests/test_setups/test_setup_functions.py +++ b/tests/test_setups/test_setup_functions.py @@ -164,13 +164,6 @@ def test_list_setups_output_format(self): assert isinstance(setups, pd.DataFrame) assert len(setups) == 10 - # TODO: [0.15] Remove section as `dict` is no longer supported. - with pytest.warns(FutureWarning): - setups = openml.setups.list_setups(flow=flow_id, output_format="dict", size=10) - assert isinstance(setups, Dict) - assert isinstance(setups[next(iter(setups.keys()))], Dict) - assert len(setups) == 10 - def test_setuplist_offset(self): size = 10 setups = openml.setups.list_setups(offset=0, size=size) diff --git a/tests/test_utils/test_utils.py b/tests/test_utils/test_utils.py index cae947917..4f71f63c3 100644 --- a/tests/test_utils/test_utils.py +++ b/tests/test_utils/test_utils.py @@ -5,6 +5,7 @@ import pytest import shutil import openml +from itertools import chain from openml.testing import _check_dataset @@ -83,10 +84,6 @@ def _mocked_perform_api_call(call, request_method): @pytest.mark.server() def test_list_all(): openml.utils._list_all(listing_call=openml.tasks.functions._list_tasks) - openml.utils._list_all( - listing_call=openml.tasks.functions._list_tasks, - list_output_format="dataframe", - ) @pytest.mark.server() @@ -104,12 +101,13 @@ def test_list_all_with_multiple_batches(min_number_tasks_on_test_server): # By setting the batch size one lower than the minimum we guarantee at least two # batches and at the same time do as few batches (roundtrips) as possible. 
batch_size = min_number_tasks_on_test_server - 1 - res = openml.utils._list_all( + batches = openml.utils._list_all( listing_call=openml.tasks.functions._list_tasks, - list_output_format="dataframe", batch_size=batch_size, ) - assert min_number_tasks_on_test_server <= len(res) + assert len(batches) >= 2 + flattened = list(chain(*batches)) + assert min_number_tasks_on_test_server <= len(flattened) @pytest.mark.server() @@ -202,4 +200,4 @@ def test_correct_test_server_download_state(): """ task = openml.tasks.get_task(119) dataset = task.get_dataset() - assert len(dataset.features) == dataset.get_data(dataset_format="dataframe")[0].shape[1] \ No newline at end of file + assert len(dataset.features) == dataset.get_data()[0].shape[1] \ No newline at end of file From 6296f3778a2e6bf2d74967b35255a9d0b71198d8 Mon Sep 17 00:00:00 2001 From: eddiebergman Date: Wed, 16 Oct 2024 18:08:47 +0200 Subject: [PATCH 02/23] refactor: Explicitly name parameter to listing functions --- examples/40_paper/2018_ida_strang_example.py | 14 ++--- openml/datasets/functions.py | 56 +++++++++++++------ openml/evaluations/functions.py | 25 +++++---- openml/flows/functions.py | 22 ++++---- openml/runs/functions.py | 50 +++++++++-------- openml/setups/functions.py | 38 +++++++------ openml/study/functions.py | 25 +++++---- openml/tasks/functions.py | 57 ++++++++++++++------ openml/testing.py | 4 +- openml/utils.py | 43 +++++++-------- tests/test_tasks/test_task.py | 2 +- tests/test_utils/test_utils.py | 1 - 12 files changed, 204 insertions(+), 133 deletions(-) diff --git a/examples/40_paper/2018_ida_strang_example.py b/examples/40_paper/2018_ida_strang_example.py index 8b225125b..8b116507c 100644 --- a/examples/40_paper/2018_ida_strang_example.py +++ b/examples/40_paper/2018_ida_strang_example.py @@ -17,8 +17,8 @@ # License: BSD 3-Clause import matplotlib.pyplot as plt + import openml -import pandas as pd ############################################################################## # A basic step for each data-mining or machine learning task is to determine @@ -47,13 +47,16 @@ # Downloads all evaluation records related to this study evaluations = openml.evaluations.list_evaluations( - measure, size=None, flows=flow_ids, study=study_id, output_format="dataframe" + measure, + size=None, + flows=flow_ids, + study=study_id, ) # gives us a table with columns data_id, flow1_value, flow2_value evaluations = evaluations.pivot(index="data_id", columns="flow_id", values="value").dropna() # downloads all data qualities (for scatter plot) data_qualities = openml.datasets.list_datasets( - data_id=list(evaluations.index.values), output_format="dataframe" + data_id=list(evaluations.index.values), ) # removes irrelevant data qualities data_qualities = data_qualities[meta_features] @@ -86,10 +89,9 @@ def determine_class(val_lin, val_nonlin): if val_lin < val_nonlin: return class_values[0] - elif val_nonlin < val_lin: + if val_nonlin < val_lin: return class_values[1] - else: - return class_values[2] + return class_values[2] evaluations["class"] = evaluations.apply( diff --git a/openml/datasets/functions.py b/openml/datasets/functions.py index 5ce095814..a328f9762 100644 --- a/openml/datasets/functions.py +++ b/openml/datasets/functions.py @@ -5,6 +5,7 @@ import logging import warnings from collections import OrderedDict +from functools import partial from pathlib import Path from pyexpat import ExpatError from typing import TYPE_CHECKING, Any @@ -81,7 +82,12 @@ def list_datasets( size: int | None = None, status: str | None = None, tag: 
str | None = None, - **kwargs: Any, + data_name: str | None = None, + data_version: int | None = None, + number_instances: int | None = None, + number_features: int | None = None, + number_classes: int | None = None, + number_missing_values: int | None = None, ) -> pd.DataFrame: """Return a dataframe of all dataset which are on OpenML. @@ -101,10 +107,12 @@ def list_datasets( default active datasets are returned, but also datasets from another status can be requested. tag : str, optional - kwargs : dict, optional - Legal filter operators (keys in the dict): - data_name, data_version, number_instances, - number_features, number_classes, number_missing_values. + data_name : str, optional + data_version : int, optional + number_instances : int, optional + number_features : int, optional + number_classes : int, optional + number_missing_values : int, optional Returns ------- @@ -118,19 +126,29 @@ def list_datasets( If qualities are calculated for the dataset, some of these are also included as columns. """ - batches = openml.utils._list_all( - listing_call=_list_datasets, + listing_call = partial( + _list_datasets, data_id=data_id, - offset=offset, - size=size, status=status, tag=tag, - **kwargs, + data_name=data_name, + data_version=data_version, + number_instances=number_instances, + number_features=number_features, + number_classes=number_classes, + number_missing_values=number_missing_values, ) - return pd.concat(batches, ignore_index=True) + batches = openml.utils._list_all(listing_call, offset=offset, limit=size) + return pd.concat(batches) -def _list_datasets(data_id: list[int] | None = None, **kwargs: Any) -> pd.DataFrame: +def _list_datasets( + limit: int, + offset: int, + *, + data_id: list[int] | None = None, + **kwargs: Any, +) -> pd.DataFrame: """ Perform api call to return a list of all datasets. @@ -141,6 +159,10 @@ def _list_datasets(data_id: list[int] | None = None, **kwargs: Any) -> pd.DataFr display_errors is also separated from the kwargs since it has a default value. + limit : int + The maximum number of datasets to show. + offset : int + The number of datasets to skip, starting from the first. data_id : list, optional kwargs : dict, optional @@ -152,7 +174,7 @@ def _list_datasets(data_id: list[int] | None = None, **kwargs: Any) -> pd.DataFr ------- datasets : dataframe """ - api_call = "data/list" + api_call = f"data/list/list/{limit}/offset/{offset}" if kwargs is not None: for operator, value in kwargs.items(): @@ -242,12 +264,13 @@ def check_datasets_active( dict A dictionary with items {did: bool} """ - datasets = list_datasets(status="all", data_id=dataset_ids, output_format="dataframe") - missing = set(dataset_ids) - set(datasets.get("did", [])) + datasets = list_datasets(status="all", data_id=dataset_ids) + missing = set(dataset_ids) - set(datasets.index) if raise_error_if_not_exist and missing: missing_str = ", ".join(str(did) for did in missing) raise ValueError(f"Could not find dataset(s) {missing_str} in OpenML dataset list.") - return dict(datasets["status"] == "active") + mask = datasets["status"] == "active" + return dict(mask) def _name_to_id( @@ -285,7 +308,6 @@ def _name_to_id( data_name=dataset_name, status=status, data_version=version, - output_format="dataframe", ) if error_if_multiple and len(candidates) > 1: msg = f"Multiple active datasets exist with name '{dataset_name}'." 
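A minimal usage sketch of the reworked dataset-listing API after this change, assuming the signatures shown above; the "active"/"iris"/50 filter values are illustrative only:

    from functools import partial

    import pandas as pd

    import openml

    # Public call: filters are explicit keyword arguments now and the result is
    # always a pandas DataFrame; there is no output_format switch anymore.
    # ("active", "iris" and 50 are placeholder filter values for illustration.)
    datasets = openml.datasets.list_datasets(status="active", data_name="iris", size=50)
    print(datasets.head())

    # Internally, list_datasets binds its filters up front with functools.partial so
    # that the generic pager only has to supply (limit, offset); each batch comes
    # back as a DataFrame and the caller concatenates the batches.
    listing_call = partial(openml.datasets.functions._list_datasets, status="active")
    batches = openml.utils._list_all(listing_call, offset=0, limit=50)
    datasets = pd.concat(batches)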
diff --git a/openml/evaluations/functions.py b/openml/evaluations/functions.py index 3b4735250..0adab61ac 100644 --- a/openml/evaluations/functions.py +++ b/openml/evaluations/functions.py @@ -3,6 +3,7 @@ from __future__ import annotations import json +from functools import partial from itertools import chain from typing import Any from typing_extensions import Literal, overload @@ -56,7 +57,7 @@ def list_evaluations( def list_evaluations( function: str, offset: int | None = None, - size: int | None = 10000, + size: int | None = None, tasks: list[str | int] | None = None, setups: list[str | int] | None = None, flows: list[str | int] | None = None, @@ -118,11 +119,9 @@ def list_evaluations( if per_fold is not None: per_fold_str = str(per_fold).lower() - eval_collection: list[list[OpenMLEvaluation]] = openml.utils._list_all( - listing_call=_list_evaluations, + listing_call = partial( + _list_evaluations, function=function, - offset=offset, - size=size, tasks=tasks, setups=setups, flows=flows, @@ -133,8 +132,9 @@ def list_evaluations( sort_order=sort_order, per_fold=per_fold_str, ) - flattened = list(chain.from_iterable(eval_collection)) + eval_collection = openml.utils._list_all(listing_call, offset=offset, limit=size) + flattened = list(chain.from_iterable(eval_collection)) if output_format == "dataframe": records = [item._to_dict() for item in flattened] return pd.DataFrame.from_records(records, index="run_id") @@ -143,6 +143,9 @@ def list_evaluations( def _list_evaluations( + limit: int, + offset: int, + *, function: str, tasks: list | None = None, setups: list | None = None, @@ -161,6 +164,10 @@ def _list_evaluations( The arguments that are lists are separated from the single value ones which are put into the kwargs. + limit : int + the number of evaluations to return + offset : int + the number of evaluations to skip, starting from the first function : str the evaluation function. e.g., predictive_accuracy @@ -178,7 +185,7 @@ def _list_evaluations( study : int, optional kwargs: dict, optional - Legal filter operators: tag, limit, offset. + Legal filter operators: tag, per_fold sort_order : str, optional order of sorting evaluations, ascending ("asc") or descending ("desc") @@ -187,7 +194,7 @@ def _list_evaluations( ------- list of OpenMLEvaluation objects """ - api_call = f"evaluation/list/function/{function}" + api_call = f"evaluation/list/function/{function}/limit/{limit}/offset/{offset}" if kwargs is not None: for operator, value in kwargs.items(): api_call += f"/{operator}/{value}" @@ -202,7 +209,7 @@ def _list_evaluations( if uploaders is not None: api_call += "/uploader/{}".format(",".join([str(int(i)) for i in uploaders])) if study is not None: - api_call += "/study/%d" % study + api_call += f"/study/{study}" if sort_order is not None: api_call += f"/sort_order/{sort_order}" diff --git a/openml/flows/functions.py b/openml/flows/functions.py index e43e18c17..f1f1da6fb 100644 --- a/openml/flows/functions.py +++ b/openml/flows/functions.py @@ -4,6 +4,7 @@ import os import re from collections import OrderedDict +from functools import partial from typing import Any, Dict import dateutil.parser @@ -135,7 +136,7 @@ def list_flows( offset: int | None = None, size: int | None = None, tag: str | None = None, - **kwargs: Any, + uploader: str | None = None, ) -> pd.DataFrame: """ Return a list of all flows which are on OpenML. 
@@ -164,30 +165,29 @@ def list_flows( - external version - uploader """ - batches = openml.utils._list_all( - listing_call=_list_flows, - offset=offset, - size=size, - tag=tag, - **kwargs, - ) + listing_call = partial(_list_flows, tag=tag, uploader=uploader) + batches = openml.utils._list_all(listing_call, offset=offset, limit=size) return pd.concat(batches, ignore_index=True) -def _list_flows(**kwargs: Any) -> pd.DataFrame: +def _list_flows(limit: int, offset: int, **kwargs: Any) -> pd.DataFrame: """ Perform the api call that return a list of all flows. Parameters ---------- + limit : int + the maximum number of flows to return + offset : int + the number of flows to skip, starting from the first kwargs: dict, optional - Legal filter operators: uploader, tag, limit, offset. + Legal filter operators: uploader, tag Returns ------- flows : dataframe """ - api_call = "flow/list" + api_call = f"flow/list/limit/{limit}/offset/{offset}" if kwargs is not None: for operator, value in kwargs.items(): diff --git a/openml/runs/functions.py b/openml/runs/functions.py index cff649c1d..67a035825 100644 --- a/openml/runs/functions.py +++ b/openml/runs/functions.py @@ -5,6 +5,7 @@ import time import warnings from collections import OrderedDict +from functools import partial from pathlib import Path from typing import TYPE_CHECKING, Any @@ -447,8 +448,7 @@ def run_exists(task_id: int, setup_id: int) -> set[int]: return set() try: - result = list_runs(task=[task_id], setup=[setup_id], output_format="dataframe") - assert isinstance(result, pd.DataFrame) # TODO(eddiebergman): Remove once #1299 + result = list_runs(task=[task_id], setup=[setup_id]) return set() if result.empty else set(result["run_id"]) except OpenMLServerException as exception: # error code implies no results. The run does not exist yet @@ -1033,10 +1033,10 @@ def list_runs( # noqa: PLR0913 tag: str | None = None, study: int | None = None, display_errors: bool = False, # noqa: FBT001, FBT002 - **kwargs: Any, + task_type: TaskType | int | None = None, ) -> pd.DataFrame: - """List all runs matching all of the given filters. - + """ + List all runs matching all of the given filters. (Supports large amount of results) Parameters @@ -1064,8 +1064,7 @@ def list_runs( # noqa: PLR0913 Whether to list runs which have an error (for example a missing prediction file). - kwargs : dict, optional - Legal filter operators: task_type. 
+ task_type : str, optional Returns ------- @@ -1082,10 +1081,8 @@ def list_runs( # noqa: PLR0913 if uploader is not None and (not isinstance(uploader, list)): raise TypeError("uploader must be of type list.") - batches = openml.utils._list_all( - listing_call=_list_runs, - offset=offset, - size=size, + listing_call = partial( + _list_runs, id=id, task=task, setup=setup, @@ -1094,20 +1091,25 @@ def list_runs( # noqa: PLR0913 tag=tag, study=study, display_errors=display_errors, - **kwargs, + task_type=task_type, ) - return pd.concat(batches, ignore_index=True) + batches = openml.utils._list_all(listing_call, offset=offset, limit=size) + return pd.concat(batches) def _list_runs( # noqa: PLR0913 + limit: int, + offset: int, + *, id: list | None = None, # noqa: A002 task: list | None = None, setup: list | None = None, flow: list | None = None, uploader: list | None = None, study: int | None = None, - display_errors: bool = False, # noqa: FBT002, FBT001 - **kwargs: Any, + tag: str | None = None, + display_errors: bool = False, + task_type: TaskType | int | None = None, ) -> pd.DataFrame: """ Perform API call `/run/list/{filters}' @@ -1128,6 +1130,8 @@ def _list_runs( # noqa: PLR0913 flow : list, optional + tag: str, optional + uploader : list, optional study : int, optional @@ -1136,17 +1140,14 @@ def _list_runs( # noqa: PLR0913 Whether to list runs which have an error (for example a missing prediction file). - kwargs : dict, optional - Legal filter operators: task_type. + task_type : str, optional Returns ------- - dataframe of found runs. + dict, or dataframe + List of found runs. """ - api_call = "run/list" - if kwargs is not None: - for operator, value in kwargs.items(): - api_call += f"/{operator}/{value}" + api_call = f"run/list/limit/{limit}/offset/{offset}" if id is not None: api_call += "/run/{}".format(",".join([str(int(i)) for i in id])) if task is not None: @@ -1161,6 +1162,11 @@ def _list_runs( # noqa: PLR0913 api_call += "/study/%d" % study if display_errors: api_call += "/show_errors/true" + if tag is not None: + api_call += f"/tag/{tag}" + if task_type is not None: + tvalue = task_type.value if isinstance(task_type, TaskType) else task_type + api_call += f"/task_type/{tvalue}" return __list_runs(api_call=api_call) diff --git a/openml/setups/functions.py b/openml/setups/functions.py index 383e2e16e..0c3095501 100644 --- a/openml/setups/functions.py +++ b/openml/setups/functions.py @@ -2,6 +2,7 @@ from __future__ import annotations from collections import OrderedDict +from functools import partial from itertools import chain from pathlib import Path from typing import Any, Iterable @@ -149,6 +150,7 @@ def list_setups( # noqa: PLR0913 output_format: str, optional (default='object') The parameter decides the format of the output. - If 'dataframe' the output is a pandas DataFrame + - If 'object' the output is a dictionary of OpenMLSetup objects Returns ------- @@ -159,15 +161,12 @@ def list_setups( # noqa: PLR0913 "Invalid output format selected. 
Only 'object', or 'dataframe' applicable.", ) - batch_size = 1000 # batch size for setups is lower + listing_call = partial(_list_setups, flow=flow, tag=tag, setup=setup) batches = openml.utils._list_all( - listing_call=_list_setups, + listing_call, + batch_size=1_000, # batch size for setups is lower offset=offset, - size=size, - flow=flow, - tag=tag, - setup=setup, - batch_size=batch_size, + limit=size, ) flattened = list(chain.from_iterable(batches)) if output_format == "object": @@ -176,7 +175,14 @@ def list_setups( # noqa: PLR0913 return pd.DataFrame.from_records([setup._to_dict() for setup in flattened], index="setup_id") -def _list_setups(setup: Iterable[int] | None = None, **kwargs: Any) -> list[OpenMLSetup]: +def _list_setups( + limit: int, + offset: int, + *, + setup: Iterable[int] | None = None, + flow: int | None = None, + tag: str | None = None, +) -> list[OpenMLSetup]: """Perform API call `/setup/list/{filters}` Parameters @@ -184,21 +190,23 @@ def _list_setups(setup: Iterable[int] | None = None, **kwargs: Any) -> list[Open The setup argument that is a list is separated from the single value filters which are put into the kwargs. + limit : int + listing_offset : int setup : list(int), optional - - kwargs: dict, optional - Legal filter operators: flow, setup, limit, offset, tag. + flow : int, optional + tag : str, optional Returns ------- The setups that match the filters, going from id to the OpenMLSetup object. """ - api_call = "setup/list" + api_call = f"setup/list/offset/{offset}/limit/{limit}" if setup is not None: api_call += "/setup/{}".format(",".join([str(int(i)) for i in setup])) - if kwargs is not None: - for operator, value in kwargs.items(): - api_call += f"/{operator}/{value}" + if flow is not None: + api_call += f"/flow/{flow}" + if tag is not None: + api_call += f"/tag/{tag}" return __list_setups(api_call=api_call) diff --git a/openml/study/functions.py b/openml/study/functions.py index f7db1e0aa..59f255b47 100644 --- a/openml/study/functions.py +++ b/openml/study/functions.py @@ -3,6 +3,7 @@ from __future__ import annotations import warnings +from functools import partial from typing import TYPE_CHECKING, Any import pandas as pd @@ -465,14 +466,13 @@ def list_suites( - creator - creation_date """ - batches = openml.utils._list_all( - listing_call=_list_studies, - offset=offset, - size=size, + listing_call = partial( + _list_studies, main_entity_type="task", status=status, uploader=uploader, ) + batches = openml.utils._list_all(listing_call, limit=size, offset=offset) return pd.concat(batches, ignore_index=True) @@ -514,32 +514,35 @@ def list_studies( If qualities are calculated for the dataset, some of these are also returned. """ - batches = openml.utils._list_all( - listing_call=_list_studies, - offset=offset, - size=size, + listing_call = partial( + _list_studies, main_entity_type="run", status=status, uploader=uploader, benchmark_suite=benchmark_suite, ) + batches = openml.utils._list_all(listing_call, offset=offset, limit=size) return pd.concat(batches, ignore_index=True) -def _list_studies(**kwargs: Any) -> pd.DataFrame: +def _list_studies(limit: int, offset: int, **kwargs: Any) -> pd.DataFrame: """Perform api call to return a list of studies. Parameters ---------- + limit: int + The maximum number of studies to return. + offset: int + The number of studies to skip, starting from the first. 
kwargs : dict, optional Legal filter operators (keys in the dict): - status, limit, offset, main_entity_type, uploader + status, main_entity_type, uploader, benchmark_suite Returns ------- studies : dataframe """ - api_call = "study/list" + api_call = f"study/list/limit/{limit}/offset/{offset}" if kwargs is not None: for operator, value in kwargs.items(): api_call += f"/{operator}/{value}" diff --git a/openml/tasks/functions.py b/openml/tasks/functions.py index 6877658f7..1f4ec71f8 100644 --- a/openml/tasks/functions.py +++ b/openml/tasks/functions.py @@ -4,6 +4,7 @@ import os import re import warnings +from functools import partial from typing import Any import pandas as pd @@ -125,12 +126,18 @@ def _get_estimation_procedure_list() -> list[dict[str, Any]]: return procs -def list_tasks( +def list_tasks( # noqa: PLR0913 task_type: TaskType | None = None, offset: int | None = None, size: int | None = None, tag: str | None = None, - **kwargs: Any, + data_tag: str | None = None, + status: str | None = None, + data_name: str | None = None, + number_instances: int | None = None, + number_features: int | None = None, + number_classes: int | None = None, + number_missing_values: int | None = None, ) -> pd.DataFrame: """ Return a number of tasks having the given tag and task_type @@ -140,18 +147,22 @@ def list_tasks( Filter task_type is separated from the other filters because it is used as task_type in the task description, but it is named type when used as a filter in list tasks call. - task_type : TaskType, optional - Refers to the type of task. offset : int, optional the number of tasks to skip, starting from the first + task_type : TaskType, optional + Refers to the type of task. size : int, optional the maximum number of tasks to show tag : str, optional the tag to include - kwargs: dict, optional - Legal filter operators: data_tag, status, data_id, data_name, - number_instances, number_features, - number_classes, number_missing_values. + data_tag : str, optional + the tag of the dataset + status : str, optional + data_name : str, optional + number_instances : int, optional + number_features : int, optional + number_classes : int, optional + number_missing_values : int, optional Returns ------- @@ -161,18 +172,29 @@ def list_tasks( as columns: task id, dataset id, task_type and status. If qualities are calculated for the associated dataset, some of these are also returned. """ - batches = openml.utils._list_all( - listing_call=_list_tasks, - task_type=task_type, - offset=offset, + listing_call = partial( + _list_tasks, size=size, + task_type=task_type, tag=tag, - **kwargs, + data_tag=data_tag, + status=status, + data_name=data_name, + number_instances=number_instances, + number_features=number_features, + number_classes=number_classes, + number_missing_values=number_missing_values, ) + batches = openml.utils._list_all(listing_call, offset=offset, limit=size) return pd.concat(batches, ignore_index=True) -def _list_tasks(task_type: TaskType | None = None, **kwargs: Any) -> pd.DataFrame: +def _list_tasks( + limit: int, + offset: int, + task_type: TaskType | int | None = None, + **kwargs: Any, +) -> pd.DataFrame: """ Perform the api call to return a number of tasks having the given filters. @@ -181,6 +203,8 @@ def _list_tasks(task_type: TaskType | None = None, **kwargs: Any) -> pd.DataFram Filter task_type is separated from the other filters because it is used as task_type in the task description, but it is named type when used as a filter in list tasks call. 
+ limit: int + offset: int task_type : TaskType, optional Refers to the type of task. kwargs: dict, optional @@ -192,9 +216,10 @@ def _list_tasks(task_type: TaskType | None = None, **kwargs: Any) -> pd.DataFram ------- dataframe """ - api_call = "task/list" + api_call = f"task/list/limit/{limit}/offset/{offset}" if task_type is not None: - api_call += "/type/%d" % task_type.value + tvalue = task_type.value if isinstance(task_type, TaskType) else task_type + api_call += f"/type/{tvalue}" if kwargs is not None: for operator, value in kwargs.items(): if operator == "task_id": diff --git a/openml/testing.py b/openml/testing.py index 9016ff6a9..3a3c19c7e 100644 --- a/openml/testing.py +++ b/openml/testing.py @@ -12,7 +12,6 @@ from pathlib import Path from typing import ClassVar -import pandas as pd import requests import openml @@ -286,8 +285,7 @@ def check_task_existence( int, None """ return_val = None - tasks = openml.tasks.list_tasks(task_type=task_type, output_format="dataframe") - assert isinstance(tasks, pd.DataFrame) + tasks = openml.tasks.list_tasks(task_type=task_type) if len(tasks) == 0: return None tasks = tasks.loc[tasks["did"] == dataset_id] diff --git a/openml/utils.py b/openml/utils.py index aecf860d8..52e3a3675 100644 --- a/openml/utils.py +++ b/openml/utils.py @@ -238,9 +238,11 @@ def _delete_entity(entity_type: str, entity_id: int) -> bool: def _list_all( # noqa: C901 - listing_call: Callable[P, _SizedT], - *args: P.args, - **filters: P.kwargs, + listing_call: Callable[[int, int], _SizedT], + *, + limit: int | None = None, + offset: int | None = None, + batch_size: int | None = 10_000, ) -> list[_SizedT]: """Helper to handle paged listing requests. @@ -251,32 +253,35 @@ def _list_all( # noqa: C901 Parameters ---------- listing_call : callable - Call listing, e.g. list_evaluations. - *args : Variable length argument list - Any required arguments for the listing call. - **filters : Arbitrary keyword arguments - Any filters that can be applied to the listing function. - additionally, the batch_size can be specified. This is - useful for testing purposes. + Call listing, e.g. list_evaluations. Takes two positional + arguments: batch_size and offset. + batch_size : int, optional + The batch size to use for the listing call. + offset : int, optional + The initial offset to use for the listing call. + limit : int, optional + The total size of the listing. If not provided, the function will + request the first batch and then continue until no more results are + returned Returns ------- - List of types returned from the listing call + List of types returned from type of the listing call """ - # eliminate filters that have a None value - active_filters = {key: value for key, value in filters.items() if value is not None} page = 0 results: list[_SizedT] = [] + LIMIT = limit + BATCH_SIZE_ORIG = batch_size + offset = offset if offset is not None else 0 + batch_size = batch_size if batch_size is not None else 10_000 + # Default batch size per paging. # This one can be set in filters (batch_size), but should not be # changed afterwards. The derived batch_size can be changed. 
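# (Sketch, not part of the hunk above: how the reworked pager is meant to be
#  driven. `_list_datasets` is the private helper from openml/datasets/functions.py;
#  passing `data_name` as a server-side filter here is an assumption based on the
#  public `list_datasets()` wrapper shown later in this series.)
from functools import partial

import pandas as pd

import openml
from openml.datasets.functions import _list_datasets

# Filters are bound up front; `_list_all` only supplies (batch_size, offset).
listing_call = partial(_list_datasets, data_name="iris")
batches = openml.utils._list_all(listing_call, offset=0, limit=500, batch_size=100)
datasets = pd.concat(batches) if batches else pd.DataFrame()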
- BATCH_SIZE_ORIG = active_filters.pop("batch_size", 10000) if not isinstance(BATCH_SIZE_ORIG, int): raise ValueError(f"'batch_size' should be an integer but got {BATCH_SIZE_ORIG}") - # max number of results to be shown - LIMIT: int | float | None = active_filters.pop("size", None) # type: ignore if (LIMIT is not None) and (not isinstance(LIMIT, int)) and (not np.isinf(LIMIT)): raise ValueError(f"'limit' should be an integer or inf but got {LIMIT}") @@ -285,7 +290,6 @@ def _list_all( # noqa: C901 if LIMIT is not None and BATCH_SIZE_ORIG > LIMIT: BATCH_SIZE_ORIG = LIMIT - offset = active_filters.pop("offset", 0) if not isinstance(offset, int): raise ValueError(f"'offset' should be an integer but got {offset}") @@ -293,10 +297,7 @@ def _list_all( # noqa: C901 while True: try: current_offset = offset + BATCH_SIZE_ORIG * page - new_batch = listing_call( - *args, - **{**active_filters, "limit": batch_size, "offset": current_offset}, # type: ignore - ) + new_batch = listing_call(batch_size, current_offset) except openml.exceptions.OpenMLServerNoResult: # NOTE: This above statement may not actually happen, but we could just return here # to enforce it... diff --git a/tests/test_tasks/test_task.py b/tests/test_tasks/test_task.py index ec5a8caf5..311ffd365 100644 --- a/tests/test_tasks/test_task.py +++ b/tests/test_tasks/test_task.py @@ -71,7 +71,7 @@ def test_upload_task(self): ) def _get_compatible_rand_dataset(self) -> list: - active_datasets = list_datasets(status="active", output_format="dataframe") + active_datasets = list_datasets(status="active") # depending on the task type, find either datasets # with only symbolic features or datasets with only diff --git a/tests/test_utils/test_utils.py b/tests/test_utils/test_utils.py index 4f71f63c3..38d497e2d 100644 --- a/tests/test_utils/test_utils.py +++ b/tests/test_utils/test_utils.py @@ -115,7 +115,6 @@ def test_list_all_for_datasets(min_number_datasets_on_test_server): datasets = openml.datasets.list_datasets( batch_size=100, size=min_number_datasets_on_test_server, - output_format="dataframe", ) assert min_number_datasets_on_test_server == len(datasets) From a31dafc3ccb5832172944f5c5c6e8c10a0d4042e Mon Sep 17 00:00:00 2001 From: eddiebergman Date: Thu, 17 Oct 2024 17:20:37 +0200 Subject: [PATCH 03/23] fix: Don't double call list --- openml/datasets/functions.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/openml/datasets/functions.py b/openml/datasets/functions.py index a328f9762..522e9d84d 100644 --- a/openml/datasets/functions.py +++ b/openml/datasets/functions.py @@ -174,7 +174,7 @@ def _list_datasets( ------- datasets : dataframe """ - api_call = f"data/list/list/{limit}/offset/{offset}" + api_call = f"data/list/{limit}/offset/{offset}" if kwargs is not None: for operator, value in kwargs.items(): From 3bd924ca40783631ec9bdab819823ebf6ca7c79a Mon Sep 17 00:00:00 2001 From: eddiebergman Date: Mon, 4 Nov 2024 17:14:54 +0100 Subject: [PATCH 04/23] update... 
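A minimal illustration of the user-facing effect of this commit, using only calls that already appear in the touched examples and tests (a sketch, not part of the patch itself):

import openml

# Listings come back as pandas DataFrames; the output_format keyword disappears
# from these call sites.
datasets = openml.datasets.list_datasets(size=100, status="active")
print(datasets[["did", "name", "NumberOfInstances"]].head())

tasks = openml.tasks.list_tasks(tag="OpenML100")
print(tasks[["tid", "did"]].head())

evals = openml.evaluations.list_evaluations(function="predictive_accuracy", size=10)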
--- README.md | 2 +- examples/20_basic/simple_datasets_tutorial.py | 2 +- .../simple_flows_and_runs_tutorial.py | 4 +- examples/30_extended/datasets_tutorial.py | 17 +- .../30_extended/fetch_evaluations_tutorial.py | 10 +- .../30_extended/flows_and_runs_tutorial.py | 8 +- .../plot_svm_hyperparameters_tutorial.py | 4 +- examples/30_extended/study_tutorial.py | 10 +- examples/30_extended/suites_tutorial.py | 11 +- examples/30_extended/tasks_tutorial.py | 20 +- .../40_paper/2015_neurips_feurer_example.py | 7 +- examples/40_paper/2018_kdd_rijn_example.py | 13 +- .../40_paper/2018_neurips_perrone_example.py | 12 +- openml/datasets/dataset.py | 225 ++++++------------ openml/datasets/functions.py | 32 ++- openml/evaluations/functions.py | 65 ++--- openml/flows/functions.py | 5 +- openml/runs/functions.py | 3 + openml/setups/functions.py | 7 +- openml/setups/setup.py | 2 +- openml/study/functions.py | 13 +- openml/tasks/functions.py | 15 +- openml/tasks/task.py | 4 +- tests/test_datasets/test_dataset.py | 22 +- tests/test_datasets/test_dataset_functions.py | 68 +++--- .../test_evaluation_functions.py | 2 - .../test_evaluations_example.py | 3 +- .../test_sklearn_extension.py | 2 +- tests/test_flows/test_flow.py | 26 +- tests/test_flows/test_flow_functions.py | 18 +- tests/test_openml/test_api_calls.py | 2 +- tests/test_runs/test_run.py | 8 +- tests/test_runs/test_run_functions.py | 38 ++- tests/test_setups/test_setup_functions.py | 7 +- tests/test_study/test_study_functions.py | 14 +- tests/test_tasks/test_task_functions.py | 27 +-- tests/test_tasks/test_task_methods.py | 6 +- tests/test_utils/test_utils.py | 19 +- 38 files changed, 319 insertions(+), 434 deletions(-) diff --git a/README.md b/README.md index 0bad7ac66..081bf7923 100644 --- a/README.md +++ b/README.md @@ -5,7 +5,7 @@
    [README.md header hunk, markup stripped during extraction: the "OpenML Logo" image line is the one being changed; the "OpenML-Python" title and "Python Logo" badge remain as unchanged context.]
    diff --git a/examples/20_basic/simple_datasets_tutorial.py b/examples/20_basic/simple_datasets_tutorial.py index 9b18aab14..fed997d58 100644 --- a/examples/20_basic/simple_datasets_tutorial.py +++ b/examples/20_basic/simple_datasets_tutorial.py @@ -48,7 +48,7 @@ # attribute_names - the names of the features for the examples (X) and # target feature (y) X, y, categorical_indicator, attribute_names = dataset.get_data( - target=dataset.default_target_attribute + target_names=dataset.default_target_attribute ) ############################################################################ diff --git a/examples/20_basic/simple_flows_and_runs_tutorial.py b/examples/20_basic/simple_flows_and_runs_tutorial.py index eec6d7e8b..63a3a6d85 100644 --- a/examples/20_basic/simple_flows_and_runs_tutorial.py +++ b/examples/20_basic/simple_flows_and_runs_tutorial.py @@ -7,9 +7,9 @@ # License: BSD 3-Clause -import openml from sklearn import ensemble, neighbors +import openml ############################################################################ # .. warning:: @@ -23,7 +23,7 @@ # NOTE: We are using dataset "diabetes" from the test server: https://test.openml.org/d/20 dataset = openml.datasets.get_dataset(dataset_id="diabetes", version=1) X, y, categorical_indicator, attribute_names = dataset.get_data( - target=dataset.default_target_attribute + target_names=dataset.default_target_attribute ) clf = neighbors.KNeighborsClassifier(n_neighbors=3) clf.fit(X, y) diff --git a/examples/30_extended/datasets_tutorial.py b/examples/30_extended/datasets_tutorial.py index 606455dd8..2841c68f7 100644 --- a/examples/30_extended/datasets_tutorial.py +++ b/examples/30_extended/datasets_tutorial.py @@ -8,29 +8,24 @@ # License: BSD 3-Clauses -import openml import pandas as pd + +import openml from openml.datasets import edit_dataset, fork_dataset, get_dataset ############################################################################ # Exercise 0 # ********** # -# * List datasets -# -# * Use the output_format parameter to select output type -# * Default gives 'dict' (other option: 'dataframe', see below) -# -# Note: list_datasets will return a pandas dataframe by default from 0.15. When using -# openml-python 0.14, `list_datasets` will warn you to use output_format='dataframe'. -datalist = openml.datasets.list_datasets(output_format="dataframe") +# * List datasets and return a dataframe +datalist = openml.datasets.list_datasets() datalist = datalist[["did", "name", "NumberOfInstances", "NumberOfFeatures", "NumberOfClasses"]] print(f"First 10 of {len(datalist)} datasets...") datalist.head(n=10) # The same can be done with lesser lines of code -openml_df = openml.datasets.list_datasets(output_format="dataframe") +openml_df = openml.datasets.list_datasets() openml_df.head(n=10) ############################################################################ @@ -73,7 +68,7 @@ # dataset. In particular, many datasets for supervised problems have a set # `default_target_attribute` which may help identify the target variable. 
X, y, categorical_indicator, attribute_names = dataset.get_data( - target=dataset.default_target_attribute + target_names=dataset.default_target_attribute ) print(X.head()) print(X.info()) diff --git a/examples/30_extended/fetch_evaluations_tutorial.py b/examples/30_extended/fetch_evaluations_tutorial.py index 86302e2d1..4ef2d3e5f 100644 --- a/examples/30_extended/fetch_evaluations_tutorial.py +++ b/examples/30_extended/fetch_evaluations_tutorial.py @@ -32,9 +32,7 @@ # Required filters can be applied to retrieve results from runs as required. # We shall retrieve a small set (only 10 entries) to test the listing function for evaluations -openml.evaluations.list_evaluations( - function="predictive_accuracy", size=10, output_format="dataframe" -) +openml.evaluations.list_evaluations(function="predictive_accuracy", size=10) # Using other evaluation metrics, 'precision' in this case evals = openml.evaluations.list_evaluations( @@ -62,9 +60,7 @@ # Note that we now filter the evaluations based on another parameter 'task'. metric = "predictive_accuracy" -evals = openml.evaluations.list_evaluations( - function=metric, tasks=[task_id], output_format="dataframe" -) +evals = openml.evaluations.list_evaluations(function=metric, tasks=[task_id]) # Displaying the first 10 rows print(evals.head(n=10)) # Sorting the evaluations in decreasing order of the metric chosen @@ -94,7 +90,7 @@ def plot_cdf(values, metric="predictive_accuracy"): plt.minorticks_on() plt.grid(visible=True, which="minor", linestyle="--") plt.axvline(max_val, linestyle="--", color="gray") - plt.text(max_val, 0, "%.3f" % max_val, fontsize=9) + plt.text(max_val, 0, f"{max_val:.3f}", fontsize=9) plt.show() diff --git a/examples/30_extended/flows_and_runs_tutorial.py b/examples/30_extended/flows_and_runs_tutorial.py index b7c000101..bbe94c1dc 100644 --- a/examples/30_extended/flows_and_runs_tutorial.py +++ b/examples/30_extended/flows_and_runs_tutorial.py @@ -7,9 +7,9 @@ # License: BSD 3-Clause -import openml -from sklearn import compose, ensemble, impute, neighbors, preprocessing, pipeline, tree +from sklearn import compose, ensemble, impute, neighbors, pipeline, preprocessing, tree +import openml ############################################################################ # We'll use the test server for the rest of this tutorial. @@ -27,7 +27,7 @@ # NOTE: We are using dataset 68 from the test server: https://test.openml.org/d/68 dataset = openml.datasets.get_dataset(dataset_id="eeg-eye-state", version=1) X, y, categorical_indicator, attribute_names = dataset.get_data( - target=dataset.default_target_attribute + target_names=dataset.default_target_attribute ) clf = neighbors.KNeighborsClassifier(n_neighbors=1) clf.fit(X, y) @@ -38,7 +38,7 @@ # * e.g. 
categorical features -> do feature encoding dataset = openml.datasets.get_dataset(dataset_id="credit-g", version=1) X, y, categorical_indicator, attribute_names = dataset.get_data( - target=dataset.default_target_attribute + target_names=dataset.default_target_attribute ) print(f"Categorical features: {categorical_indicator}") transformer = compose.ColumnTransformer( diff --git a/examples/30_extended/plot_svm_hyperparameters_tutorial.py b/examples/30_extended/plot_svm_hyperparameters_tutorial.py index e366c56df..491507d16 100644 --- a/examples/30_extended/plot_svm_hyperparameters_tutorial.py +++ b/examples/30_extended/plot_svm_hyperparameters_tutorial.py @@ -6,9 +6,10 @@ # License: BSD 3-Clause -import openml import numpy as np +import openml + #################################################################################################### # First step - obtaining the data # =============================== @@ -22,7 +23,6 @@ function="predictive_accuracy", flows=[8353], tasks=[6], - output_format="dataframe", # Using this flag incorporates the hyperparameters into the returned dataframe. Otherwise, # the dataframe would contain a field ``paramaters`` containing an unparsed dictionary. parameters_in_separate_columns=True, diff --git a/examples/30_extended/study_tutorial.py b/examples/30_extended/study_tutorial.py index 8715dfb4a..d1f6ccbd0 100644 --- a/examples/30_extended/study_tutorial.py +++ b/examples/30_extended/study_tutorial.py @@ -17,16 +17,11 @@ import openml - ############################################################################ # Listing studies # *************** -# -# * Use the output_format parameter to select output type -# * Default gives ``dict``, but we'll use ``dataframe`` to obtain an -# easier-to-work-with data structure -studies = openml.study.list_studies(output_format="dataframe", status="all") +studies = openml.study.list_studies(status="all") print(studies.head(n=10)) @@ -52,7 +47,6 @@ # the evaluations available for the conducted runs: evaluations = openml.evaluations.list_evaluations( function="predictive_accuracy", - output_format="dataframe", study=study.study_id, ) print(evaluations.head()) @@ -81,7 +75,7 @@ # To verify # https://test.openml.org/api/v1/study/1 suite = openml.study.get_suite("OpenML100") -print(all([t_id in suite.tasks for t_id in tasks])) +print(all(t_id in suite.tasks for t_id in tasks)) run_ids = [] for task_id in tasks: diff --git a/examples/30_extended/suites_tutorial.py b/examples/30_extended/suites_tutorial.py index 935d4c529..19f5cdc1a 100644 --- a/examples/30_extended/suites_tutorial.py +++ b/examples/30_extended/suites_tutorial.py @@ -19,16 +19,11 @@ import openml - ############################################################################ # Listing suites # ************** -# -# * Use the output_format parameter to select output type -# * Default gives ``dict``, but we'll use ``dataframe`` to obtain an -# easier-to-work-with data structure -suites = openml.study.list_suites(output_format="dataframe", status="all") +suites = openml.study.list_suites(status="all") print(suites.head(n=10)) ############################################################################ @@ -51,7 +46,7 @@ ############################################################################ # And we can use the task listing functionality to learn more about them: -tasks = openml.tasks.list_tasks(output_format="dataframe") +tasks = openml.tasks.list_tasks() # Using ``@`` in `pd.DataFrame.query < # 
https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.query.html>`_ @@ -76,7 +71,7 @@ # We'll take a random subset of at least ten tasks of all available tasks on # the test server: -all_tasks = list(openml.tasks.list_tasks(output_format="dataframe")["tid"]) +all_tasks = list(openml.tasks.list_tasks()["tid"]) task_ids_for_suite = sorted(np.random.choice(all_tasks, replace=False, size=20)) # The study needs a machine-readable and unique alias. To obtain this, diff --git a/examples/30_extended/tasks_tutorial.py b/examples/30_extended/tasks_tutorial.py index 19a7e542c..63821c7a2 100644 --- a/examples/30_extended/tasks_tutorial.py +++ b/examples/30_extended/tasks_tutorial.py @@ -9,7 +9,6 @@ import openml from openml.tasks import TaskType -import pandas as pd ############################################################################ # @@ -30,14 +29,11 @@ # ^^^^^^^^^^^^^ # # We will start by simply listing only *supervised classification* tasks. -# **openml.tasks.list_tasks()** returns a dictionary of dictionaries by default, but we -# request a +# **openml.tasks.list_tasks()** getting a # `pandas dataframe `_ -# instead to have better visualization capabilities and easier access: +# to have good visualization capabilities and easier access: -tasks = openml.tasks.list_tasks( - task_type=TaskType.SUPERVISED_CLASSIFICATION, output_format="dataframe" -) +tasks = openml.tasks.list_tasks(task_type=TaskType.SUPERVISED_CLASSIFICATION) print(tasks.columns) print(f"First 5 of {len(tasks)} tasks:") print(tasks.head()) @@ -71,21 +67,21 @@ # # Similar to listing tasks by task type, we can list tasks by tags: -tasks = openml.tasks.list_tasks(tag="OpenML100", output_format="dataframe") +tasks = openml.tasks.list_tasks(tag="OpenML100") print(f"First 5 of {len(tasks)} tasks:") print(tasks.head()) ############################################################################ # Furthermore, we can list tasks based on the dataset id: -tasks = openml.tasks.list_tasks(data_id=1471, output_format="dataframe") +tasks = openml.tasks.list_tasks(data_id=1471) print(f"First 5 of {len(tasks)} tasks:") print(tasks.head()) ############################################################################ # In addition, a size limit and an offset can be applied both separately and simultaneously: -tasks = openml.tasks.list_tasks(size=10, offset=50, output_format="dataframe") +tasks = openml.tasks.list_tasks(size=10, offset=50) print(tasks) ############################################################################ @@ -101,7 +97,7 @@ # Finally, it is also possible to list all tasks on OpenML with: ############################################################################ -tasks = openml.tasks.list_tasks(output_format="dataframe") +tasks = openml.tasks.list_tasks() print(len(tasks)) ############################################################################ @@ -195,7 +191,7 @@ # Error code for 'task already exists' if e.code == 614: # Lookup task - tasks = openml.tasks.list_tasks(data_id=128, output_format="dataframe") + tasks = openml.tasks.list_tasks(data_id=128) tasks = tasks.query( 'task_type == "Supervised Classification" ' 'and estimation_procedure == "10-fold Crossvalidation" ' diff --git a/examples/40_paper/2015_neurips_feurer_example.py b/examples/40_paper/2015_neurips_feurer_example.py index 3960c3852..f0413d27a 100644 --- a/examples/40_paper/2015_neurips_feurer_example.py +++ b/examples/40_paper/2015_neurips_feurer_example.py @@ -13,12 +13,10 @@ | Matthias Feurer, Aaron Klein, Katharina 
Eggensperger, Jost Springenberg, Manuel Blum and Frank Hutter | In *Advances in Neural Information Processing Systems 28*, 2015 | Available at https://papers.nips.cc/paper/5872-efficient-and-robust-automated-machine-learning.pdf -""" # noqa F401 +""" # License: BSD 3-Clause -import pandas as pd - import openml #################################################################################################### @@ -60,7 +58,6 @@ tasks = openml.tasks.list_tasks( task_type=openml.tasks.TaskType.SUPERVISED_CLASSIFICATION, status="all", - output_format="dataframe", ) # Query only those with holdout as the resampling startegy. @@ -68,7 +65,7 @@ task_ids = [] for did in dataset_ids: - tasks_ = list(tasks.query("did == {}".format(did)).tid) + tasks_ = list(tasks.query(f"did == {did}").tid) if len(tasks_) >= 1: # if there are multiple task, take the one with lowest ID (oldest). task_id = min(tasks_) else: diff --git a/examples/40_paper/2018_kdd_rijn_example.py b/examples/40_paper/2018_kdd_rijn_example.py index 7ec60fe53..2230ade2d 100644 --- a/examples/40_paper/2018_kdd_rijn_example.py +++ b/examples/40_paper/2018_kdd_rijn_example.py @@ -32,17 +32,18 @@ import sys -if sys.platform == "win32": # noqa +if sys.platform == "win32": print( "The pyrfr library (requirement of fanova) can currently not be installed on Windows systems" ) - exit() + sys.exit() # DEPRECATED EXAMPLE -- Avoid running this code in our CI/CD pipeline print("This example is deprecated, remove this code to use it manually.") -exit() +sys.exit() import json + import fanova import matplotlib.pyplot as plt import pandas as pd @@ -50,7 +51,6 @@ import openml - ############################################################################## # With the advent of automated machine learning, automated hyperparameter # optimization methods are by now routinely used in data mining. However, this @@ -81,7 +81,7 @@ # important when it is put on a log-scale. All these simplifications can be # addressed by defining a ConfigSpace. 
For a more elaborated example that uses # this, please see: -# https://github.com/janvanrijn/openml-pimp/blob/d0a14f3eb480f2a90008889f00041bdccc7b9265/examples/plot/plot_fanova_aggregates.py # noqa F401 +# https://github.com/janvanrijn/openml-pimp/blob/d0a14f3eb480f2a90008889f00041bdccc7b9265/examples/plot/plot_fanova_aggregates.py suite = openml.study.get_suite("OpenML100") flow_id = 7707 @@ -107,7 +107,6 @@ flows=[flow_id], tasks=[task_id], size=limit_per_task, - output_format="dataframe", ) performance_column = "value" @@ -122,7 +121,7 @@ [ dict( **{name: json.loads(value) for name, value in setup["parameters"].items()}, - **{performance_column: setup[performance_column]} + **{performance_column: setup[performance_column]}, ) for _, setup in evals.iterrows() ] diff --git a/examples/40_paper/2018_neurips_perrone_example.py b/examples/40_paper/2018_neurips_perrone_example.py index 0d72846ac..7c29ede8d 100644 --- a/examples/40_paper/2018_neurips_perrone_example.py +++ b/examples/40_paper/2018_neurips_perrone_example.py @@ -27,16 +27,17 @@ # License: BSD 3-Clause -import openml import numpy as np import pandas as pd from matplotlib import pyplot as plt -from sklearn.pipeline import Pipeline -from sklearn.impute import SimpleImputer from sklearn.compose import ColumnTransformer +from sklearn.ensemble import RandomForestRegressor +from sklearn.impute import SimpleImputer from sklearn.metrics import mean_squared_error +from sklearn.pipeline import Pipeline from sklearn.preprocessing import OneHotEncoder -from sklearn.ensemble import RandomForestRegressor + +import openml flow_type = "svm" # this example will use the smaller svm flow evaluations ############################################################################ @@ -94,7 +95,6 @@ def fetch_evaluations(run_full=False, flow_type="svm", metric="area_under_roc_cu tasks=task_ids, flows=[flow_id], uploaders=[2702], - output_format="dataframe", parameters_in_separate_columns=True, ) return eval_df, task_ids, flow_id @@ -206,7 +206,7 @@ def list_categorical_attributes(flow_type="svm"): model.fit(X, y) y_pred = model.predict(X) -print("Training RMSE : {:.5}".format(mean_squared_error(y, y_pred))) +print(f"Training RMSE : {mean_squared_error(y, y_pred):.5}") ############################################################################# diff --git a/openml/datasets/dataset.py b/openml/datasets/dataset.py index b00c458e3..12ce262e7 100644 --- a/openml/datasets/dataset.py +++ b/openml/datasets/dataset.py @@ -17,13 +17,28 @@ import xmltodict from openml.base import OpenMLBase -from openml.exceptions import PyOpenMLError from .data_feature import OpenMLDataFeature logger = logging.getLogger(__name__) +def _ensure_dataframe( + data: pd.DataFrame | pd.Series | np.ndarray | scipy.sparse.spmatrix, + attribute_names: list | None = None, +) -> pd.DataFrame: + if isinstance(data, pd.DataFrame): + return data + if scipy.sparse.issparse(data): + return pd.DataFrame.sparse.from_spmatrix(data, columns=attribute_names) + if isinstance(data, np.ndarray): + return pd.DataFrame(data, columns=attribute_names) # type: ignore + if isinstance(data, pd.Series): + return data.to_frame() + + raise TypeError(f"Data type {type(data)} not supported.") + + class OpenMLDataset(OpenMLBase): """Dataset object. 
@@ -575,13 +590,17 @@ def _cache_compressed_file_from_file( return data, categorical, attribute_names - def _parse_data_from_file(self, data_file: Path) -> tuple[list[str], list[bool], pd.DataFrame]: + def _parse_data_from_file( + self, + data_file: Path, + ) -> tuple[list[str], list[bool], pd.DataFrame | scipy.sparse.csr_matrix]: if data_file.suffix == ".arff": data, categorical, attribute_names = self._parse_data_from_arff(data_file) elif data_file.suffix == ".pq": attribute_names, categorical, data = self._parse_data_from_pq(data_file) else: raise ValueError(f"Unknown file type for file '{data_file}'.") + return attribute_names, categorical, data def _parse_data_from_pq(self, data_file: Path) -> tuple[list[str], list[bool], pd.DataFrame]: @@ -593,7 +612,7 @@ def _parse_data_from_pq(self, data_file: Path) -> tuple[list[str], list[bool], p attribute_names = list(data.columns) return attribute_names, categorical, data - def _load_data(self) -> tuple[pd.DataFrame | scipy.sparse.csr_matrix, list[bool], list[str]]: # noqa: PLR0912, C901 + def _load_data(self) -> tuple[pd.DataFrame, list[bool], list[str]]: # noqa: PLR0912, C901, PLR0915 """Load data from compressed format or arff. Download data if not present on disk.""" need_to_create_pickle = self.cache_format == "pickle" and self.data_pickle_file is None need_to_create_feather = self.cache_format == "feather" and self.data_feather_file is None @@ -604,7 +623,8 @@ def _load_data(self) -> tuple[pd.DataFrame | scipy.sparse.csr_matrix, list[bool] file_to_load = self.data_file if self.parquet_file is None else self.parquet_file assert file_to_load is not None - return self._cache_compressed_file_from_file(Path(file_to_load)) + data, cats, attrs = self._cache_compressed_file_from_file(Path(file_to_load)) + return _ensure_dataframe(data, attrs), cats, attrs # helper variable to help identify where errors occur fpath = self.data_feather_file if self.cache_format == "feather" else self.data_pickle_file @@ -616,12 +636,13 @@ def _load_data(self) -> tuple[pd.DataFrame | scipy.sparse.csr_matrix, list[bool] data = pd.read_feather(self.data_feather_file) fpath = self.feather_attribute_file - with open(self.feather_attribute_file, "rb") as fh: # noqa: PTH123 + with self.feather_attribute_file.open("rb") as fh: categorical, attribute_names = pickle.load(fh) # noqa: S301 else: assert self.data_pickle_file is not None - with open(self.data_pickle_file, "rb") as fh: # noqa: PTH123 + with self.data_pickle_file.open("rb") as fh: data, categorical, attribute_names = pickle.load(fh) # noqa: S301 + except FileNotFoundError as e: raise ValueError( f"Cannot find file for dataset {self.name} at location '{fpath}'." 
@@ -660,7 +681,7 @@ def _load_data(self) -> tuple[pd.DataFrame | scipy.sparse.csr_matrix, list[bool] file_to_load = self.data_file if self.parquet_file is None else self.parquet_file assert file_to_load is not None attr, cat, df = self._parse_data_from_file(Path(file_to_load)) - return df, cat, attr + return _ensure_dataframe(df), cat, attr data_up_to_date = isinstance(data, pd.DataFrame) or scipy.sparse.issparse(data) if self.cache_format == "pickle" and not data_up_to_date: @@ -668,79 +689,9 @@ def _load_data(self) -> tuple[pd.DataFrame | scipy.sparse.csr_matrix, list[bool] file_to_load = self.data_file if self.parquet_file is None else self.parquet_file assert file_to_load is not None - return self._cache_compressed_file_from_file(Path(file_to_load)) - return data, categorical, attribute_names + data, cats, attrs = self._cache_compressed_file_from_file(Path(file_to_load)) - # TODO(eddiebergman): Can type this better with overload - # TODO(eddiebergman): Could also techinically use scipy.sparse.sparray - @staticmethod - def _convert_array_format( - data: pd.DataFrame | pd.Series | np.ndarray | scipy.sparse.spmatrix, - array_format: Literal["array", "dataframe"], - attribute_names: list | None = None, - ) -> pd.DataFrame | pd.Series | np.ndarray | scipy.sparse.spmatrix: - """Convert a dataset to a given array format. - - Converts to numpy array if data is non-sparse. - Converts to a sparse dataframe if data is sparse. - - Parameters - ---------- - array_format : str {'array', 'dataframe'} - Desired data type of the output - - If array_format='array' - If data is non-sparse - Converts to numpy-array - Enforces numeric encoding of categorical columns - Missing values are represented as NaN in the numpy-array - else returns data as is - - If array_format='dataframe' - If data is sparse - Works only on sparse data - Converts sparse data to sparse dataframe - else returns data as is - - """ - if array_format == "array" and not isinstance(data, scipy.sparse.spmatrix): - # We encode the categories such that they are integer to be able - # to make a conversion to numeric for backward compatibility - def _encode_if_category(column: pd.Series | np.ndarray) -> pd.Series | np.ndarray: - if column.dtype.name == "category": - column = column.cat.codes.astype(np.float32) - mask_nan = column == -1 - column[mask_nan] = np.nan - return column - - if isinstance(data, pd.DataFrame): - columns = { - column_name: _encode_if_category(data.loc[:, column_name]) - for column_name in data.columns - } - data = pd.DataFrame(columns) - else: - data = _encode_if_category(data) - - try: - # TODO(eddiebergman): float32? - return_array = np.asarray(data, dtype=np.float32) - except ValueError as e: - raise PyOpenMLError( - "PyOpenML cannot handle string when returning numpy" - ' arrays. Use dataset_format="dataframe".', - ) from e - - return return_array - - if array_format == "dataframe": - if scipy.sparse.issparse(data): - data = pd.DataFrame.sparse.from_spmatrix(data, columns=attribute_names) - else: - data_type = "sparse-data" if scipy.sparse.issparse(data) else "non-sparse data" - logger.warning( - f"Cannot convert {data_type} ({type(data)}) to '{array_format}'." 
- " Returning input data.", - ) - return data + return _ensure_dataframe(data, attribute_names), categorical, attribute_names @staticmethod def _unpack_categories(series: pd.Series, categories: list) -> pd.Series: @@ -761,19 +712,13 @@ def valid_category(cat: Any) -> bool: raw_cat = pd.Categorical(col, ordered=True, categories=filtered_categories) return pd.Series(raw_cat, index=series.index, name=series.name) - def get_data( # noqa: C901, PLR0912, PLR0915 + def get_data( # noqa: C901, PLR0912 self, target: list[str] | str | None = None, include_row_id: bool = False, # noqa: FBT001, FBT002 include_ignore_attribute: bool = False, # noqa: FBT001, FBT002 - dataset_format: Literal["array", "dataframe"] = "dataframe", - ) -> tuple[ - np.ndarray | pd.DataFrame | scipy.sparse.csr_matrix, - np.ndarray | pd.DataFrame | None, - list[bool], - list[str], - ]: - """Returns dataset content as dataframes or sparse matrices. + ) -> tuple[pd.DataFrame, pd.Series | None, list[bool], list[str]]: + """Returns dataset content as dataframes. Parameters ---------- @@ -785,35 +730,20 @@ def get_data( # noqa: C901, PLR0912, PLR0915 include_ignore_attribute : boolean (default=False) Whether to include columns that are marked as "ignore" on the server in the dataset. - dataset_format : string (default='dataframe') - The format of returned dataset. - If ``array``, the returned dataset will be a NumPy array or a SciPy sparse - matrix. Support for ``array`` will be removed in 0.15. - If ``dataframe``, the returned dataset will be a Pandas DataFrame. Returns ------- - X : ndarray, dataframe, or sparse matrix, shape (n_samples, n_columns) - Dataset - y : ndarray or pd.Series, shape (n_samples, ) or None + X : dataframe, shape (n_samples, n_columns) + Dataset, may have sparse dtypes in the columns if required. + y : pd.Series, shape (n_samples, ) or None Target column - categorical_indicator : boolean ndarray + categorical_indicator : list[bool] Mask that indicate categorical features. - attribute_names : List[str] + attribute_names : list[str] List of attribute names. """ - # TODO: [0.15] - if dataset_format == "array": - warnings.warn( - "Support for `dataset_format='array'` will be removed in 0.15," - "start using `dataset_format='dataframe' to ensure your code " - "will continue to work. 
You can use the dataframe's `to_numpy` " - "function to continue using numpy arrays.", - category=FutureWarning, - stacklevel=2, - ) - data, categorical, attribute_names = self._load_data() + data, categorical_mask, attribute_names = self._load_data() to_exclude = [] if not include_row_id and self.row_id_attribute is not None: @@ -831,54 +761,43 @@ def get_data( # noqa: C901, PLR0912, PLR0915 if len(to_exclude) > 0: logger.info(f"Going to remove the following attributes: {to_exclude}") keep = np.array([column not in to_exclude for column in attribute_names]) - data = data.loc[:, keep] if isinstance(data, pd.DataFrame) else data[:, keep] - - categorical = [cat for cat, k in zip(categorical, keep) if k] + data = data.drop(columns=to_exclude) + categorical_mask = [cat for cat, k in zip(categorical_mask, keep) if k] attribute_names = [att for att, k in zip(attribute_names, keep) if k] if target is None: - data = self._convert_array_format(data, dataset_format, attribute_names) # type: ignore - targets = None + return data, None, categorical_mask, attribute_names + + if isinstance(target, str): + target_names = target.split(",") if "," in target else [target] else: - if isinstance(target, str): - target = target.split(",") if "," in target else [target] - targets = np.array([column in target for column in attribute_names]) - target_names = [column for column in attribute_names if column in target] - if np.sum(targets) > 1: - raise NotImplementedError( - "Number of requested targets %d is not implemented." % np.sum(targets), - ) - target_categorical = [ - cat for cat, column in zip(categorical, attribute_names) if column in target - ] - target_dtype = int if target_categorical[0] else float - - if isinstance(data, pd.DataFrame): - x = data.iloc[:, ~targets] - y = data.iloc[:, targets] - else: - x = data[:, ~targets] - y = data[:, targets].astype(target_dtype) # type: ignore - - categorical = [cat for cat, t in zip(categorical, targets) if not t] - attribute_names = [att for att, k in zip(attribute_names, targets) if not k] - - x = self._convert_array_format(x, dataset_format, attribute_names) # type: ignore - if dataset_format == "array" and scipy.sparse.issparse(y): - # scikit-learn requires dense representation of targets - y = np.asarray(y.todense()).astype(target_dtype) - # dense representation of single column sparse arrays become a 2-d array - # need to flatten it to a 1-d array for _convert_array_format() - y = y.squeeze() - y = self._convert_array_format(y, dataset_format, target_names) - y = y.astype(target_dtype) if isinstance(y, np.ndarray) else y - if len(y.shape) > 1 and y.shape[1] == 1: - # single column targets should be 1-d for both `array` and `dataframe` formats - assert isinstance(y, (np.ndarray, pd.DataFrame, pd.Series)) - y = y.squeeze() - data, targets = x, y - - return data, targets, categorical, attribute_names # type: ignore + target_names = target + + # All the assumptions below for the target are dependant on the number of targets being 1 + n_targets = len(target_names) + if n_targets > 1: + raise NotImplementedError(f"Number of targets {n_targets} not implemented.") + + target_name = target_names[0] + x = data.drop(columns=[target_name]) + y = data[target_name].squeeze() + + if isinstance(y.dtype, pd.SparseDtype): + y = y.sparse.to_dense() + + # Since it was sparsified, the y column may not be of the correct dtype, hence we check + # if it was categorical and convert it to and integer if needs be. 
+ category_names = data.columns[categorical_mask] + if target_name in category_names: + y = y.astype(int) + + # Finally, remove the target from the list of attributes and categorical mask + target_index = attribute_names.index(target_name) + categorical_mask.pop(target_index) + attribute_names.remove(target_name) + + assert isinstance(y, pd.Series) + return x, y, categorical_mask, attribute_names def _load_features(self) -> None: """Load the features metadata from the server and store it in the dataset object.""" diff --git a/openml/datasets/functions.py b/openml/datasets/functions.py index 522e9d84d..d9f8d36a0 100644 --- a/openml/datasets/functions.py +++ b/openml/datasets/functions.py @@ -84,10 +84,10 @@ def list_datasets( tag: str | None = None, data_name: str | None = None, data_version: int | None = None, - number_instances: int | None = None, - number_features: int | None = None, - number_classes: int | None = None, - number_missing_values: int | None = None, + number_instances: int | str | None = None, + number_features: int | str | None = None, + number_classes: int | str | None = None, + number_missing_values: int | str | None = None, ) -> pd.DataFrame: """Return a dataframe of all dataset which are on OpenML. @@ -109,10 +109,10 @@ def list_datasets( tag : str, optional data_name : str, optional data_version : int, optional - number_instances : int, optional - number_features : int, optional - number_classes : int, optional - number_missing_values : int, optional + number_instances : int | str, optional + number_features : int | str, optional + number_classes : int | str, optional + number_missing_values : int | str, optional Returns ------- @@ -139,6 +139,9 @@ def list_datasets( number_missing_values=number_missing_values, ) batches = openml.utils._list_all(listing_call, offset=offset, limit=size) + if len(batches) == 0: + return pd.DataFrame() + return pd.concat(batches) @@ -174,11 +177,12 @@ def _list_datasets( ------- datasets : dataframe """ - api_call = f"data/list/{limit}/offset/{offset}" + api_call = f"data/list/limit/{limit}/offset/{offset}" if kwargs is not None: for operator, value in kwargs.items(): - api_call += f"/{operator}/{value}" + if value is not None: + api_call += f"/{operator}/{value}" if data_id is not None: api_call += "/data_id/{}".format(",".join([str(int(i)) for i in data_id])) return __list_datasets(api_call=api_call) @@ -213,7 +217,13 @@ def __list_datasets(api_call: str) -> pd.DataFrame: dataset[quality["@name"]] = float(quality["#text"]) datasets[dataset["did"]] = dataset - return pd.DataFrame.from_dict(datasets, orient="index") + return pd.DataFrame.from_dict(datasets, orient="index").astype( + { + "did": int, + "version": int, + "status": pd.CategoricalDtype(["active", "deactivated", "in_preparation"]), + } + ) def _expand_parameter(parameter: str | list[str] | None) -> list[str]: diff --git a/openml/evaluations/functions.py b/openml/evaluations/functions.py index 0adab61ac..bee326438 100644 --- a/openml/evaluations/functions.py +++ b/openml/evaluations/functions.py @@ -21,37 +21,37 @@ @overload def list_evaluations( function: str, - offset: int | None = ..., - size: int | None = ..., - tasks: list[str | int] | None = ..., - setups: list[str | int] | None = ..., - flows: list[str | int] | None = ..., - runs: list[str | int] | None = ..., - uploaders: list[str | int] | None = ..., - tag: str | None = ..., - study: int | None = ..., - per_fold: bool | None = ..., - sort_order: str | None = ..., - output_format: Literal["object"] = "object", -) -> 
dict[int, OpenMLEvaluation]: ... + offset: int | None = None, + size: int | None = None, + tasks: list[str | int] | None = None, + setups: list[str | int] | None = None, + flows: list[str | int] | None = None, + runs: list[str | int] | None = None, + uploaders: list[str | int] | None = None, + tag: str | None = None, + study: int | None = None, + per_fold: bool | None = None, + sort_order: str | None = None, + output_format: Literal["dataframe"] = ..., +) -> pd.DataFrame: ... @overload def list_evaluations( function: str, - offset: int | None = ..., - size: int | None = ..., - tasks: list[str | int] | None = ..., - setups: list[str | int] | None = ..., - flows: list[str | int] | None = ..., - runs: list[str | int] | None = ..., - uploaders: list[str | int] | None = ..., - tag: str | None = ..., - study: int | None = ..., - per_fold: bool | None = ..., - sort_order: str | None = ..., - output_format: Literal["dataframe"] = ..., -) -> pd.DataFrame: ... + offset: int | None = None, + size: int | None = None, + tasks: list[str | int] | None = None, + setups: list[str | int] | None = None, + flows: list[str | int] | None = None, + runs: list[str | int] | None = None, + uploaders: list[str | int] | None = None, + tag: str | None = None, + study: int | None = None, + per_fold: bool | None = None, + sort_order: str | None = None, + output_format: Literal["object"] = "object", +) -> dict[int, OpenMLEvaluation]: ... def list_evaluations( @@ -137,12 +137,12 @@ def list_evaluations( flattened = list(chain.from_iterable(eval_collection)) if output_format == "dataframe": records = [item._to_dict() for item in flattened] - return pd.DataFrame.from_records(records, index="run_id") + return pd.DataFrame.from_records(records) # No index... return {e.run_id: e for e in flattened} -def _list_evaluations( +def _list_evaluations( # noqa: C901 limit: int, offset: int, *, @@ -197,7 +197,8 @@ def _list_evaluations( api_call = f"evaluation/list/function/{function}/limit/{limit}/offset/{offset}" if kwargs is not None: for operator, value in kwargs.items(): - api_call += f"/{operator}/{value}" + if value is not None: + api_call += f"/{operator}/{value}" if tasks is not None: api_call += "/task/{}".format(",".join([str(int(i)) for i in tasks])) if setups is not None: @@ -243,7 +244,7 @@ def __list_evaluations(api_call: str) -> list[OpenMLEvaluation]: for eval_ in evals_dict["oml:evaluations"]["oml:evaluation"]: run_id = int(eval_["oml:run_id"]) value = float(eval_["oml:value"]) if "oml:value" in eval_ else None - values = json.loads(eval_["oml:values"]) if eval_["oml:values"] is not None else None + values = json.loads(eval_["oml:values"]) if eval_.get("oml:values", None) else None array_data = eval_.get("oml:array_data") evals.append( @@ -408,10 +409,10 @@ def list_evaluations_setups( assert isinstance(result, pd.DataFrame) result = result.drop("flow_id", axis=1) # concat resulting setup chunks into single datframe - setup_data = pd.concat([setup_data, result], ignore_index=True) + setup_data = pd.concat([setup_data, result]) parameters = [] - # Convert parameters of setup into list of tuples of (hyperparameter, value) + # Convert parameters of setup into dict of (hyperparameter, value) for parameter_dict in setup_data["parameters"]: if parameter_dict is not None: parameters.append( diff --git a/openml/flows/functions.py b/openml/flows/functions.py index f1f1da6fb..7cb93386b 100644 --- a/openml/flows/functions.py +++ b/openml/flows/functions.py @@ -167,7 +167,10 @@ def list_flows( """ listing_call = 
partial(_list_flows, tag=tag, uploader=uploader) batches = openml.utils._list_all(listing_call, offset=offset, limit=size) - return pd.concat(batches, ignore_index=True) + if len(batches) == 0: + return pd.DataFrame() + + return pd.concat(batches) def _list_flows(limit: int, offset: int, **kwargs: Any) -> pd.DataFrame: diff --git a/openml/runs/functions.py b/openml/runs/functions.py index 67a035825..ab852a029 100644 --- a/openml/runs/functions.py +++ b/openml/runs/functions.py @@ -1094,6 +1094,9 @@ def list_runs( # noqa: PLR0913 task_type=task_type, ) batches = openml.utils._list_all(listing_call, offset=offset, limit=size) + if len(batches) == 0: + return pd.DataFrame() + return pd.concat(batches) diff --git a/openml/setups/functions.py b/openml/setups/functions.py index 0c3095501..1b7ffb329 100644 --- a/openml/setups/functions.py +++ b/openml/setups/functions.py @@ -90,7 +90,7 @@ def _get_cached_setup(setup_id: int) -> OpenMLSetup: setup_file = setup_cache_dir / "description.xml" with setup_file.open(encoding="utf8") as fh: setup_xml = xmltodict.parse(fh.read()) - return _create_setup_from_xml(setup_xml, output_format="object") # type: ignore + return _create_setup_from_xml(setup_xml) except OSError as e: raise openml.exceptions.OpenMLCacheException( @@ -120,7 +120,7 @@ def get_setup(setup_id: int) -> OpenMLSetup: try: return _get_cached_setup(setup_id) except openml.exceptions.OpenMLCacheException: - url_suffix = "/setup/%d" % setup_id + url_suffix = f"/setup/{setup_id}" setup_xml = openml._api_calls._perform_api_call(url_suffix, "get") with setup_file.open("w", encoding="utf8") as fh: fh.write(setup_xml) @@ -172,7 +172,8 @@ def list_setups( # noqa: PLR0913 if output_format == "object": return {setup.setup_id: setup for setup in flattened} - return pd.DataFrame.from_records([setup._to_dict() for setup in flattened], index="setup_id") + records = [setup._to_dict() for setup in flattened] + return pd.DataFrame.from_records(records, index="setup_id") def _list_setups( diff --git a/openml/setups/setup.py b/openml/setups/setup.py index 83b258dd7..a98e832c0 100644 --- a/openml/setups/setup.py +++ b/openml/setups/setup.py @@ -39,7 +39,7 @@ def _to_dict(self) -> dict[str, Any]: return { "setup_id": self.setup_id, "flow_id": self.flow_id, - "parameters": {p.input_id: p._to_dict() for p in self.parameters.values()} + "parameters": {p.id: p._to_dict() for p in self.parameters.values()} if self.parameters is not None else None, } diff --git a/openml/study/functions.py b/openml/study/functions.py index 59f255b47..fb71272a1 100644 --- a/openml/study/functions.py +++ b/openml/study/functions.py @@ -473,7 +473,10 @@ def list_suites( uploader=uploader, ) batches = openml.utils._list_all(listing_call, limit=size, offset=offset) - return pd.concat(batches, ignore_index=True) + if len(batches) == 0: + return pd.DataFrame() + + return pd.concat(batches) def list_studies( @@ -522,7 +525,10 @@ def list_studies( benchmark_suite=benchmark_suite, ) batches = openml.utils._list_all(listing_call, offset=offset, limit=size) - return pd.concat(batches, ignore_index=True) + if len(batches) == 0: + return pd.DataFrame() + + return pd.concat(batches) def _list_studies(limit: int, offset: int, **kwargs: Any) -> pd.DataFrame: @@ -561,8 +567,7 @@ def __list_studies(api_call: str) -> pd.DataFrame: Returns ------- pd.DataFrame - A dictionary or Pandas DataFrame of OpenML studies, - depending on the value of 'output_format'. 
+ A Pandas DataFrame of OpenML studies """ xml_string = openml._api_calls._perform_api_call(api_call, "get") study_dict = xmltodict.parse(xml_string, force_list=("oml:study",)) diff --git a/openml/tasks/functions.py b/openml/tasks/functions.py index 1f4ec71f8..497e35c69 100644 --- a/openml/tasks/functions.py +++ b/openml/tasks/functions.py @@ -134,6 +134,7 @@ def list_tasks( # noqa: PLR0913 data_tag: str | None = None, status: str | None = None, data_name: str | None = None, + data_id: int | None = None, number_instances: int | None = None, number_features: int | None = None, number_classes: int | None = None, @@ -157,6 +158,7 @@ def list_tasks( # noqa: PLR0913 the tag to include data_tag : str, optional the tag of the dataset + data_id : int, optional status : str, optional data_name : str, optional number_instances : int, optional @@ -179,6 +181,7 @@ def list_tasks( # noqa: PLR0913 tag=tag, data_tag=data_tag, status=status, + data_id=data_id, data_name=data_name, number_instances=number_instances, number_features=number_features, @@ -186,7 +189,10 @@ def list_tasks( # noqa: PLR0913 number_missing_values=number_missing_values, ) batches = openml.utils._list_all(listing_call, offset=offset, limit=size) - return pd.concat(batches, ignore_index=True) + if len(batches) == 0: + return pd.DataFrame() + + return pd.concat(batches) def _list_tasks( @@ -222,9 +228,10 @@ def _list_tasks( api_call += f"/type/{tvalue}" if kwargs is not None: for operator, value in kwargs.items(): - if operator == "task_id": - value = ",".join([str(int(i)) for i in value]) # noqa: PLW2901 - api_call += f"/{operator}/{value}" + if value is not None: + if operator == "task_id": + value = ",".join([str(int(i)) for i in value]) # noqa: PLW2901 + api_call += f"/{operator}/{value}" return __list_tasks(api_call=api_call) diff --git a/openml/tasks/task.py b/openml/tasks/task.py index 395b52482..c3079db91 100644 --- a/openml/tasks/task.py +++ b/openml/tasks/task.py @@ -292,7 +292,7 @@ def get_X_and_y(self) -> tuple[pd.DataFrame, pd.Series | pd.DataFrame | None]: ): raise NotImplementedError(self.task_type) - X, y, _, _ = dataset.get_data(target=self.target_name) + X, y, _, _ = dataset.get_data(target_names=self.target_name) return X, y def _to_dict(self) -> dict[str, dict]: @@ -502,7 +502,7 @@ def get_X(self) -> pd.DataFrame: The X data as a dataframe """ dataset = self.get_dataset() - data, *_ = dataset.get_data(target=None) + data, *_ = dataset.get_data(target_names=None) return data def _to_dict(self) -> dict[str, dict[str, int | str | list[dict[str, Any]]]]: diff --git a/tests/test_datasets/test_dataset.py b/tests/test_datasets/test_dataset.py index 102429b41..0675c21d2 100644 --- a/tests/test_datasets/test_dataset.py +++ b/tests/test_datasets/test_dataset.py @@ -120,7 +120,7 @@ def test_get_data_pandas(self): assert data[col_name].dtype.name == col_dtype[col_name] X, y, _, _ = self.titanic.get_data( - target=self.titanic.default_target_attribute, + target_names=self.titanic.default_target_attribute, ) assert isinstance(X, pd.DataFrame) assert isinstance(y, pd.Series) @@ -171,7 +171,7 @@ def test_get_data_with_rowid(self): @pytest.mark.skip("https://github.com/openml/openml-python/issues/1157") def test_get_data_with_target_pandas(self): - X, y, categorical, attribute_names = self.dataset.get_data(target="class") + X, y, categorical, attribute_names = self.dataset.get_data(target_names="class") assert isinstance(X, pd.DataFrame) for dtype, is_cat, col in zip(X.dtypes, categorical, X): self._check_expected_type(dtype, 
is_cat, X[col]) @@ -187,7 +187,7 @@ def test_get_data_with_target_pandas(self): def test_get_data_rowid_and_ignore_and_target(self): self.dataset.ignore_attribute = ["condition"] self.dataset.row_id_attribute = ["hardness"] - X, y, categorical, names = self.dataset.get_data(target="class") + X, y, categorical, names = self.dataset.get_data(target_names="class") assert X.shape == (898, 36) assert len(categorical) == 36 cats = [True] * 3 + [False, True, True, False] + [True] * 23 + [False] * 3 + [True] * 3 @@ -273,7 +273,8 @@ def test_lazy_loading_metadata(self): def test_equality_comparison(self): self.assertEqual(self.iris, self.iris) self.assertNotEqual(self.iris, self.titanic) - self.assertNotEqual(self.titanic, 'Wrong_object') + self.assertNotEqual(self.titanic, "Wrong_object") + class OpenMLDatasetTestOnTestServer(TestBase): def setUp(self): @@ -285,14 +286,14 @@ def test_tagging(self): # tags can be at most 64 alphanumeric (+ underscore) chars unique_indicator = str(time()).replace(".", "") tag = f"test_tag_OpenMLDatasetTestOnTestServer_{unique_indicator}" - datasets = openml.datasets.list_datasets(tag=tag, output_format="dataframe") + datasets = openml.datasets.list_datasets(tag=tag) assert datasets.empty self.dataset.push_tag(tag) - datasets = openml.datasets.list_datasets(tag=tag, output_format="dataframe") + datasets = openml.datasets.list_datasets(tag=tag) assert len(datasets) == 1 assert 125 in datasets["did"] self.dataset.remove_tag(tag) - datasets = openml.datasets.list_datasets(tag=tag, output_format="dataframe") + datasets = openml.datasets.list_datasets(tag=tag) assert datasets.empty def test_get_feature_with_ontology_data_id_11(self): @@ -347,7 +348,7 @@ def setUp(self): self.sparse_dataset = openml.datasets.get_dataset(4136, download_data=False) def test_get_sparse_dataset_dataframe_with_target(self): - X, y, _, attribute_names = self.sparse_dataset.get_data(target="class") + X, y, _, attribute_names = self.sparse_dataset.get_data(target_names="class") assert isinstance(X, pd.DataFrame) assert isinstance(X.dtypes[0], pd.SparseDtype) assert X.shape == (600, 20000) @@ -374,12 +375,11 @@ def test_get_sparse_dataset_rowid_and_ignore_and_target(self): self.sparse_dataset.row_id_attribute = ["V512"] # TODO(eddiebergman): Will break from dataset_format removal X, y, categorical, _ = self.sparse_dataset.get_data( - target="class", + target_names="class", include_row_id=False, include_ignore_attribute=False, ) - assert sparse.issparse(X) - assert X.dtype == np.float32 + assert all(dtype == pd.SparseDtype(np.float32, fill_value=0.0) for dtype in X.dtypes) assert y.dtype in [np.int32, np.int64] assert X.shape == (600, 19998) diff --git a/tests/test_datasets/test_dataset_functions.py b/tests/test_datasets/test_dataset_functions.py index 3995f1b98..d6b85689a 100644 --- a/tests/test_datasets/test_dataset_functions.py +++ b/tests/test_datasets/test_dataset_functions.py @@ -113,9 +113,8 @@ def test_tag_untag_dataset(self): all_tags = _tag_entity("data", 1, tag, untag=True) assert tag not in all_tags - def test_list_datasets_output_format(self): - datasets = openml.datasets.list_datasets(output_format="dataframe") - assert isinstance(datasets, pd.DataFrame) + def test_list_datasets_length(self): + datasets = openml.datasets.list_datasets() assert len(datasets) >= 100 def test_list_datasets_paginate(self): @@ -123,14 +122,18 @@ def test_list_datasets_paginate(self): max = 100 for i in range(0, max, size): datasets = openml.datasets.list_datasets(offset=i, size=size) - assert size == 
len(datasets) - self._check_datasets(datasets) + assert len(datasets) == size + assert len(datasets.columns) >= 2 + # Maybe index? + assert "did" in datasets.columns + assert datasets["did"].dtype == int + assert "status" in datasets.columns + assert datasets["status"].dtype == pd.CategoricalDtype( + categories=["in_preparation", "active", "deactivated"], + ) def test_list_datasets_empty(self): - datasets = openml.datasets.list_datasets( - tag="NoOneWouldUseThisTagAnyway", - output_format="dataframe", - ) + datasets = openml.datasets.list_datasets(tag="NoOneWouldUseThisTagAnyway") assert datasets.empty @pytest.mark.production() @@ -308,9 +311,9 @@ def ensure_absence_of_real_data(): def test_get_dataset_sparse(self): dataset = openml.datasets.get_dataset(102) - # TODO(eddiebergman): Will break from dataset_format removal X, *_ = dataset.get_data() - assert isinstance(X, scipy.sparse.csr_matrix) + assert isinstance(X, pd.DataFrame) + assert all(isinstance(col, pd.SparseDtype) for col in X.dtypes) def test_download_rowid(self): # Smoke test which checks that the dataset has the row-id set correctly @@ -570,11 +573,7 @@ def test_upload_dataset_with_url(self): def _assert_status_of_dataset(self, *, did: int, status: str): """Asserts there is exactly one dataset with id `did` and its current status is `status`""" # need to use listing fn, as this is immune to cache - result = openml.datasets.list_datasets( - data_id=[did], - status="all", - output_format="dataframe", - ) + result = openml.datasets.list_datasets(data_id=[did], status="all") result = result.to_dict(orient="index") # I think we should drop the test that one result is returned, # the server should never return multiple results? @@ -1522,8 +1521,8 @@ def test_list_datasets_with_high_size_parameter(self): # Testing on prod since concurrent deletion of uploded datasets make the test fail openml.config.server = self.production_server - datasets_a = openml.datasets.list_datasets(output_format="dataframe") - datasets_b = openml.datasets.list_datasets(output_format="dataframe", size=np.inf) + datasets_a = openml.datasets.list_datasets() + datasets_b = openml.datasets.list_datasets(size=np.inf) # Reverting to test server openml.config.server = self.test_server @@ -1792,7 +1791,7 @@ def _assert_datasets_have_id_and_valid_status(datasets: pd.DataFrame): @pytest.fixture(scope="module") def all_datasets(): - return openml.datasets.list_datasets(output_format="dataframe") + return openml.datasets.list_datasets() def test_list_datasets(all_datasets: pd.DataFrame): @@ -1804,49 +1803,37 @@ def test_list_datasets(all_datasets: pd.DataFrame): def test_list_datasets_by_tag(all_datasets: pd.DataFrame): - tag_datasets = openml.datasets.list_datasets(tag="study_14", output_format="dataframe") + tag_datasets = openml.datasets.list_datasets(tag="study_14") assert 0 < len(tag_datasets) < len(all_datasets) _assert_datasets_have_id_and_valid_status(tag_datasets) def test_list_datasets_by_size(): - datasets = openml.datasets.list_datasets(size=5, output_format="dataframe") + datasets = openml.datasets.list_datasets(size=5) assert len(datasets) == 5 _assert_datasets_have_id_and_valid_status(datasets) def test_list_datasets_by_number_instances(all_datasets: pd.DataFrame): - small_datasets = openml.datasets.list_datasets( - number_instances="5..100", - output_format="dataframe", - ) + small_datasets = openml.datasets.list_datasets(number_instances="5..100") assert 0 < len(small_datasets) <= len(all_datasets) 
_assert_datasets_have_id_and_valid_status(small_datasets) def test_list_datasets_by_number_features(all_datasets: pd.DataFrame): - wide_datasets = openml.datasets.list_datasets( - number_features="50..100", - output_format="dataframe", - ) + wide_datasets = openml.datasets.list_datasets(number_features="50..100") assert 8 <= len(wide_datasets) < len(all_datasets) _assert_datasets_have_id_and_valid_status(wide_datasets) def test_list_datasets_by_number_classes(all_datasets: pd.DataFrame): - five_class_datasets = openml.datasets.list_datasets( - number_classes="5", - output_format="dataframe", - ) + five_class_datasets = openml.datasets.list_datasets(number_classes="5") assert 3 <= len(five_class_datasets) < len(all_datasets) _assert_datasets_have_id_and_valid_status(five_class_datasets) def test_list_datasets_by_number_missing_values(all_datasets: pd.DataFrame): - na_datasets = openml.datasets.list_datasets( - number_missing_values="5..100", - output_format="dataframe", - ) + na_datasets = openml.datasets.list_datasets(number_missing_values="5..100") assert 5 <= len(na_datasets) < len(all_datasets) _assert_datasets_have_id_and_valid_status(na_datasets) @@ -1856,7 +1843,6 @@ def test_list_datasets_combined_filters(all_datasets: pd.DataFrame): tag="study_14", number_instances="100..1000", number_missing_values="800..1000", - output_format="dataframe", ) assert 1 <= len(combined_filter_datasets) < len(all_datasets) _assert_datasets_have_id_and_valid_status(combined_filter_datasets) @@ -1956,8 +1942,12 @@ def test_get_dataset_with_invalid_id() -> None: openml.datasets.get_dataset(INVALID_ID) assert e.value.code == 111 + def test_read_features_from_xml_with_whitespace() -> None: from openml.datasets.dataset import _read_features - features_file = Path(__file__).parent.parent / "files" / "misc" / "features_with_whitespaces.xml" + + features_file = ( + Path(__file__).parent.parent / "files" / "misc" / "features_with_whitespaces.xml" + ) dict = _read_features(features_file) assert dict[1].nominal_values == [" - 50000.", " 50000+."] diff --git a/tests/test_evaluations/test_evaluation_functions.py b/tests/test_evaluations/test_evaluation_functions.py index 7af01384f..37b0ce7c8 100644 --- a/tests/test_evaluations/test_evaluation_functions.py +++ b/tests/test_evaluations/test_evaluation_functions.py @@ -17,7 +17,6 @@ def _check_list_evaluation_setups(self, **kwargs): "predictive_accuracy", **kwargs, sort_order="desc", - output_format="dataframe", ) evals = openml.evaluations.list_evaluations( "predictive_accuracy", @@ -250,7 +249,6 @@ def test_list_evaluations_setups_filter_flow(self): flows=flow_id, size=size, sort_order="desc", - output_format="dataframe", parameters_in_separate_columns=True, ) columns = list(evals_cols.columns) diff --git a/tests/test_evaluations/test_evaluations_example.py b/tests/test_evaluations/test_evaluations_example.py index bf5b03f3f..aabf40bda 100644 --- a/tests/test_evaluations/test_evaluations_example.py +++ b/tests/test_evaluations/test_evaluations_example.py @@ -8,6 +8,7 @@ class TestEvaluationsExample(unittest.TestCase): def test_example_python_paper(self): # Example script which will appear in the upcoming OpenML-Python paper # This test ensures that the example will keep running! + # import matplotlib.pyplot as plt import numpy as np @@ -18,7 +19,7 @@ def test_example_python_paper(self): "predictive_accuracy", flows=[8353], tasks=[6], - output_format="dataframe", + # output_format="dataframe", ... 
This was removed parameters_in_separate_columns=True, ) # Choose an SVM flow, for example 8353, and a task. diff --git a/tests/test_extensions/test_sklearn_extension/test_sklearn_extension.py b/tests/test_extensions/test_sklearn_extension/test_sklearn_extension.py index 242f6793c..aeae743c2 100644 --- a/tests/test_extensions/test_sklearn_extension/test_sklearn_extension.py +++ b/tests/test_extensions/test_sklearn_extension/test_sklearn_extension.py @@ -2220,7 +2220,7 @@ def test_run_on_model_with_empty_steps(self): task = openml.tasks.get_task(59) # mfeat-pixel; crossvalidation X, y, categorical_ind, feature_names = dataset.get_data( - target=dataset.default_target_attribute, + target_names=dataset.default_target_attribute, ) categorical_ind = np.array(categorical_ind) (cat_idx,) = np.where(categorical_ind) diff --git a/tests/test_flows/test_flow.py b/tests/test_flows/test_flow.py index dafbeaf3c..dcf074c8f 100644 --- a/tests/test_flows/test_flow.py +++ b/tests/test_flows/test_flow.py @@ -101,20 +101,20 @@ def test_get_structure(self): assert subflow.flow_id == sub_flow_id def test_tagging(self): - flows = openml.flows.list_flows(size=1, output_format="dataframe") + flows = openml.flows.list_flows(size=1) flow_id = flows["id"].iloc[0] flow = openml.flows.get_flow(flow_id) # tags can be at most 64 alphanumeric (+ underscore) chars unique_indicator = str(time.time()).replace(".", "") tag = f"test_tag_TestFlow_{unique_indicator}" - flows = openml.flows.list_flows(tag=tag, output_format="dataframe") + flows = openml.flows.list_flows(tag=tag) assert len(flows) == 0 flow.push_tag(tag) - flows = openml.flows.list_flows(tag=tag, output_format="dataframe") + flows = openml.flows.list_flows(tag=tag) assert len(flows) == 1 assert flow_id in flows["id"] flow.remove_tag(tag) - flows = openml.flows.list_flows(tag=tag, output_format="dataframe") + flows = openml.flows.list_flows(tag=tag) assert len(flows) == 0 def test_from_xml_to_xml(self): @@ -156,7 +156,9 @@ def test_from_xml_to_xml(self): @pytest.mark.sklearn() def test_to_xml_from_xml(self): scaler = sklearn.preprocessing.StandardScaler(with_mean=False) - estimator_name = "base_estimator" if Version(sklearn.__version__) < Version("1.4") else "estimator" + estimator_name = ( + "base_estimator" if Version(sklearn.__version__) < Version("1.4") else "estimator" + ) boosting = sklearn.ensemble.AdaBoostClassifier( **{estimator_name: sklearn.tree.DecisionTreeClassifier()}, ) @@ -269,12 +271,14 @@ def test_semi_legal_flow(self): # TODO: Test if parameters are set correctly! 
# should not throw error as it contains two differentiable forms of # Bagging i.e., Bagging(Bagging(J48)) and Bagging(J48) - estimator_name = "base_estimator" if Version(sklearn.__version__) < Version("1.4") else "estimator" + estimator_name = ( + "base_estimator" if Version(sklearn.__version__) < Version("1.4") else "estimator" + ) semi_legal = sklearn.ensemble.BaggingClassifier( **{ estimator_name: sklearn.ensemble.BaggingClassifier( **{ - estimator_name:sklearn.tree.DecisionTreeClassifier(), + estimator_name: sklearn.tree.DecisionTreeClassifier(), } ) } @@ -428,7 +432,9 @@ def test_sklearn_to_upload_to_flow(self): percentile=30, ) fu = sklearn.pipeline.FeatureUnion(transformer_list=[("pca", pca), ("fs", fs)]) - estimator_name = "base_estimator" if Version(sklearn.__version__) < Version("1.4") else "estimator" + estimator_name = ( + "base_estimator" if Version(sklearn.__version__) < Version("1.4") else "estimator" + ) boosting = sklearn.ensemble.AdaBoostClassifier( **{estimator_name: sklearn.tree.DecisionTreeClassifier()}, ) @@ -499,7 +505,9 @@ def test_sklearn_to_upload_to_flow(self): assert new_flow is not flow # OneHotEncoder was moved to _encoders module in 0.20 - module_name_encoder = "_encoders" if Version(sklearn.__version__) >= Version("0.20") else "data" + module_name_encoder = ( + "_encoders" if Version(sklearn.__version__) >= Version("0.20") else "data" + ) if Version(sklearn.__version__) < Version("0.22"): fixture_name = ( f"{sentinel}sklearn.model_selection._search.RandomizedSearchCV(" diff --git a/tests/test_flows/test_flow_functions.py b/tests/test_flows/test_flow_functions.py index b3d5be1a6..a25c2d740 100644 --- a/tests/test_flows/test_flow_functions.py +++ b/tests/test_flows/test_flow_functions.py @@ -51,7 +51,7 @@ def test_list_flows(self): openml.config.server = self.production_server # We can only perform a smoke test here because we test on dynamic # data from the internet... - flows = openml.flows.list_flows(output_format="dataframe") + flows = openml.flows.list_flows() # 3000 as the number of flows on openml.org assert len(flows) >= 1500 for flow in flows.to_dict(orient="index").values(): @@ -62,20 +62,20 @@ def test_list_flows_output_format(self): openml.config.server = self.production_server # We can only perform a smoke test here because we test on dynamic # data from the internet... 
- flows = openml.flows.list_flows(output_format="dataframe") + flows = openml.flows.list_flows() assert isinstance(flows, pd.DataFrame) assert len(flows) >= 1500 @pytest.mark.production() def test_list_flows_empty(self): openml.config.server = self.production_server - flows = openml.flows.list_flows(tag="NoOneEverUsesThisTag123", output_format="dataframe") + flows = openml.flows.list_flows(tag="NoOneEverUsesThisTag123") assert flows.empty @pytest.mark.production() def test_list_flows_by_tag(self): openml.config.server = self.production_server - flows = openml.flows.list_flows(tag="weka", output_format="dataframe") + flows = openml.flows.list_flows(tag="weka") assert len(flows) >= 5 for flow in flows.to_dict(orient="index").values(): self._check_flow(flow) @@ -86,7 +86,7 @@ def test_list_flows_paginate(self): size = 10 maximum = 100 for i in range(0, maximum, size): - flows = openml.flows.list_flows(offset=i, size=size, output_format="dataframe") + flows = openml.flows.list_flows(offset=i, size=size) assert size >= len(flows) for flow in flows.to_dict(orient="index").values(): self._check_flow(flow) @@ -199,14 +199,18 @@ def test_are_flows_equal_ignore_parameter_values(self): new_flow.parameters["a"] = 7 with pytest.raises(ValueError) as excinfo: openml.flows.functions.assert_flows_equal(flow, new_flow) - assert str(paramaters) in str(excinfo.value) and str(new_flow.parameters) in str(excinfo.value) + assert str(paramaters) in str(excinfo.value) and str(new_flow.parameters) in str( + excinfo.value + ) openml.flows.functions.assert_flows_equal(flow, new_flow, ignore_parameter_values=True) del new_flow.parameters["a"] with pytest.raises(ValueError) as excinfo: openml.flows.functions.assert_flows_equal(flow, new_flow) - assert str(paramaters) in str(excinfo.value) and str(new_flow.parameters) in str(excinfo.value) + assert str(paramaters) in str(excinfo.value) and str(new_flow.parameters) in str( + excinfo.value + ) self.assertRaisesRegex( ValueError, diff --git a/tests/test_openml/test_api_calls.py b/tests/test_openml/test_api_calls.py index 37cf6591d..4968fb363 100644 --- a/tests/test_openml/test_api_calls.py +++ b/tests/test_openml/test_api_calls.py @@ -16,7 +16,7 @@ class TestConfig(openml.testing.TestBase): def test_too_long_uri(self): with pytest.raises(openml.exceptions.OpenMLServerError, match="URI too long!"): - openml.datasets.list_datasets(data_id=list(range(10000)), output_format="dataframe") + openml.datasets.list_datasets(data_id=list(range(10000))) @unittest.mock.patch("time.sleep") @unittest.mock.patch("requests.Session") diff --git a/tests/test_runs/test_run.py b/tests/test_runs/test_run.py index ce46b6548..6769b2140 100644 --- a/tests/test_runs/test_run.py +++ b/tests/test_runs/test_run.py @@ -26,21 +26,21 @@ class TestRun(TestBase): # less than 1 seconds def test_tagging(self): - runs = openml.runs.list_runs(size=1, output_format="dataframe") + runs = openml.runs.list_runs(size=1) assert not runs.empty, "Test server state is incorrect" run_id = runs["run_id"].iloc[0] run = openml.runs.get_run(run_id) # tags can be at most 64 alphanumeric (+ underscore) chars unique_indicator = str(time()).replace(".", "") tag = f"test_tag_TestRun_{unique_indicator}" - runs = openml.runs.list_runs(tag=tag, output_format="dataframe") + runs = openml.runs.list_runs(tag=tag) assert len(runs) == 0 run.push_tag(tag) - runs = openml.runs.list_runs(tag=tag, output_format="dataframe") + runs = openml.runs.list_runs(tag=tag) assert len(runs) == 1 assert run_id in runs["run_id"] run.remove_tag(tag) - 
runs = openml.runs.list_runs(tag=tag, output_format="dataframe") + runs = openml.runs.list_runs(tag=tag) assert len(runs) == 0 @staticmethod diff --git a/tests/test_runs/test_run_functions.py b/tests/test_runs/test_run_functions.py index 960a72d6d..7ce5bf71e 100644 --- a/tests/test_runs/test_run_functions.py +++ b/tests/test_runs/test_run_functions.py @@ -1434,25 +1434,21 @@ def _check_run(self, run): def test_get_runs_list(self): # TODO: comes from live, no such lists on test openml.config.server = self.production_server - runs = openml.runs.list_runs(id=[2], show_errors=True, output_format="dataframe") + runs = openml.runs.list_runs(id=[2], display_errors=True) assert len(runs) == 1 for run in runs.to_dict(orient="index").values(): self._check_run(run) def test_list_runs_empty(self): - runs = openml.runs.list_runs(task=[0], output_format="dataframe") + runs = openml.runs.list_runs(task=[0]) assert runs.empty - def test_list_runs_output_format(self): - runs = openml.runs.list_runs(size=1000, output_format="dataframe") - assert isinstance(runs, pd.DataFrame) - @pytest.mark.production() def test_get_runs_list_by_task(self): # TODO: comes from live, no such lists on test openml.config.server = self.production_server task_ids = [20] - runs = openml.runs.list_runs(task=task_ids, output_format="dataframe") + runs = openml.runs.list_runs(task=task_ids) assert len(runs) >= 590 for run in runs.to_dict(orient="index").values(): assert run["task_id"] in task_ids @@ -1460,7 +1456,7 @@ def test_get_runs_list_by_task(self): num_runs = len(runs) task_ids.append(21) - runs = openml.runs.list_runs(task=task_ids, output_format="dataframe") + runs = openml.runs.list_runs(task=task_ids) assert len(runs) >= num_runs + 1 for run in runs.to_dict(orient="index").values(): assert run["task_id"] in task_ids @@ -1473,7 +1469,7 @@ def test_get_runs_list_by_uploader(self): # 29 is Dominik Kirchhoff uploader_ids = [29] - runs = openml.runs.list_runs(uploader=uploader_ids, output_format="dataframe") + runs = openml.runs.list_runs(uploader=uploader_ids) assert len(runs) >= 2 for run in runs.to_dict(orient="index").values(): assert run["uploader"] in uploader_ids @@ -1482,7 +1478,7 @@ def test_get_runs_list_by_uploader(self): uploader_ids.append(274) - runs = openml.runs.list_runs(uploader=uploader_ids, output_format="dataframe") + runs = openml.runs.list_runs(uploader=uploader_ids) assert len(runs) >= num_runs + 1 for run in runs.to_dict(orient="index").values(): assert run["uploader"] in uploader_ids @@ -1493,7 +1489,7 @@ def test_get_runs_list_by_flow(self): # TODO: comes from live, no such lists on test openml.config.server = self.production_server flow_ids = [1154] - runs = openml.runs.list_runs(flow=flow_ids, output_format="dataframe") + runs = openml.runs.list_runs(flow=flow_ids) assert len(runs) >= 1 for run in runs.to_dict(orient="index").values(): assert run["flow_id"] in flow_ids @@ -1501,7 +1497,7 @@ def test_get_runs_list_by_flow(self): num_runs = len(runs) flow_ids.append(1069) - runs = openml.runs.list_runs(flow=flow_ids, output_format="dataframe") + runs = openml.runs.list_runs(flow=flow_ids) assert len(runs) >= num_runs + 1 for run in runs.to_dict(orient="index").values(): assert run["flow_id"] in flow_ids @@ -1515,12 +1511,7 @@ def test_get_runs_pagination(self): size = 10 max = 100 for i in range(0, max, size): - runs = openml.runs.list_runs( - offset=i, - size=size, - uploader=uploader_ids, - output_format="dataframe", - ) + runs = openml.runs.list_runs(offset=i, size=size, uploader=uploader_ids) 
assert size >= len(runs) for run in runs.to_dict(orient="index").values(): assert run["uploader"] in uploader_ids @@ -1543,23 +1534,22 @@ def test_get_runs_list_by_filters(self): # self.assertRaises(openml.exceptions.OpenMLServerError, # openml.runs.list_runs) - runs = openml.runs.list_runs(id=ids, output_format="dataframe") + runs = openml.runs.list_runs(id=ids) assert len(runs) == 2 - runs = openml.runs.list_runs(task=tasks, output_format="dataframe") + runs = openml.runs.list_runs(task=tasks) assert len(runs) >= 2 - runs = openml.runs.list_runs(uploader=uploaders_2, output_format="dataframe") + runs = openml.runs.list_runs(uploader=uploaders_2) assert len(runs) >= 10 - runs = openml.runs.list_runs(flow=flows, output_format="dataframe") + runs = openml.runs.list_runs(flow=flows) assert len(runs) >= 100 runs = openml.runs.list_runs( id=ids, task=tasks, uploader=uploaders_1, - output_format="dataframe", ) assert len(runs) == 2 @@ -1568,7 +1558,7 @@ def test_get_runs_list_by_tag(self): # TODO: comes from live, no such lists on test # Unit test works on production server only openml.config.server = self.production_server - runs = openml.runs.list_runs(tag="curves", output_format="dataframe") + runs = openml.runs.list_runs(tag="curves") assert len(runs) >= 1 @pytest.mark.sklearn() diff --git a/tests/test_setups/test_setup_functions.py b/tests/test_setups/test_setup_functions.py index 06cf821c2..88338663a 100644 --- a/tests/test_setups/test_setup_functions.py +++ b/tests/test_setups/test_setup_functions.py @@ -4,7 +4,6 @@ import hashlib import time import unittest.mock -from typing import Dict import pandas as pd import pytest @@ -155,12 +154,12 @@ def test_list_setups_empty(self): def test_list_setups_output_format(self): openml.config.server = self.production_server flow_id = 6794 - setups = openml.setups.list_setups(flow=flow_id, output_format="object", size=10) - assert isinstance(setups, Dict) + setups = openml.setups.list_setups(flow=flow_id, size=10) + assert isinstance(setups, dict) assert isinstance(setups[next(iter(setups.keys()))], openml.setups.setup.OpenMLSetup) assert len(setups) == 10 - setups = openml.setups.list_setups(flow=flow_id, output_format="dataframe", size=10) + setups = openml.setups.list_setups(flow=flow_id, size=10, output_format="dataframe") assert isinstance(setups, pd.DataFrame) assert len(setups) == 10 diff --git a/tests/test_study/test_study_functions.py b/tests/test_study/test_study_functions.py index d01a1dcf4..d13f770c8 100644 --- a/tests/test_study/test_study_functions.py +++ b/tests/test_study/test_study_functions.py @@ -1,7 +1,6 @@ # License: BSD 3-Clause from __future__ import annotations -import pandas as pd import pytest import unittest @@ -236,13 +235,15 @@ def test_study_attach_illegal(self): study_original = openml.study.get_study(study.id) with pytest.raises( - openml.exceptions.OpenMLServerException, match="Problem attaching entities." + openml.exceptions.OpenMLServerException, + match="Problem attaching entities.", ): # run id does not exists openml.study.attach_to_study(study.id, [0]) with pytest.raises( - openml.exceptions.OpenMLServerException, match="Problem attaching entities." 
+ openml.exceptions.OpenMLServerException, + match="Problem attaching entities.", ): # some runs already attached openml.study.attach_to_study(study.id, list(run_list_more.keys())) @@ -251,11 +252,6 @@ def test_study_attach_illegal(self): @unittest.skip("It is unclear when we can expect the test to pass or fail.") def test_study_list(self): - study_list = openml.study.list_studies(status="in_preparation", output_format="dataframe") + study_list = openml.study.list_studies(status="in_preparation") # might fail if server is recently reset assert len(study_list) >= 2 - - @unittest.skip("It is unclear when we can expect the test to pass or fail.") - def test_study_list_output_format(self): - study_list = openml.study.list_studies(status="in_preparation", output_format="dataframe") - assert isinstance(study_list, pd.DataFrame) diff --git a/tests/test_tasks/test_task_functions.py b/tests/test_tasks/test_task_functions.py index 046184791..856352ac2 100644 --- a/tests/test_tasks/test_task_functions.py +++ b/tests/test_tasks/test_task_functions.py @@ -57,7 +57,7 @@ def test__get_estimation_procedure_list(self): def test_list_clustering_task(self): # as shown by #383, clustering tasks can give list/dict casting problems openml.config.server = self.production_server - openml.tasks.list_tasks(task_type=TaskType.CLUSTERING, size=10, output_format="dataframe") + openml.tasks.list_tasks(task_type=TaskType.CLUSTERING, size=10) # the expected outcome is that it doesn't crash. No assertions. def _check_task(self, task): @@ -72,34 +72,30 @@ def _check_task(self, task): def test_list_tasks_by_type(self): num_curves_tasks = 198 # number is flexible, check server if fails ttid = TaskType.LEARNING_CURVE - tasks = openml.tasks.list_tasks(task_type=ttid, output_format="dataframe") + tasks = openml.tasks.list_tasks(task_type=ttid) assert len(tasks) >= num_curves_tasks for task in tasks.to_dict(orient="index").values(): assert ttid == task["ttid"] self._check_task(task) - def test_list_tasks_output_format(self): + def test_list_tasks_length(self): ttid = TaskType.LEARNING_CURVE - tasks = openml.tasks.list_tasks(task_type=ttid, output_format="dataframe") - assert isinstance(tasks, pd.DataFrame) + tasks = openml.tasks.list_tasks(task_type=ttid) assert len(tasks) > 100 def test_list_tasks_empty(self): - tasks = cast( - pd.DataFrame, - openml.tasks.list_tasks(tag="NoOneWillEverUseThisTag", output_format="dataframe"), - ) + tasks = openml.tasks.list_tasks(tag="NoOneWillEverUseThisTag") assert tasks.empty def test_list_tasks_by_tag(self): num_basic_tasks = 100 # number is flexible, check server if fails - tasks = openml.tasks.list_tasks(tag="OpenML100", output_format="dataframe") + tasks = openml.tasks.list_tasks(tag="OpenML100") assert len(tasks) >= num_basic_tasks for task in tasks.to_dict(orient="index").values(): self._check_task(task) def test_list_tasks(self): - tasks = openml.tasks.list_tasks(output_format="dataframe") + tasks = openml.tasks.list_tasks() assert len(tasks) >= 900 for task in tasks.to_dict(orient="index").values(): self._check_task(task) @@ -108,7 +104,7 @@ def test_list_tasks_paginate(self): size = 10 max = 100 for i in range(0, max, size): - tasks = openml.tasks.list_tasks(offset=i, size=size, output_format="dataframe") + tasks = openml.tasks.list_tasks(offset=i, size=size) assert size >= len(tasks) for task in tasks.to_dict(orient="index").values(): self._check_task(task) @@ -123,12 +119,7 @@ def test_list_tasks_per_type_paginate(self): ] for j in task_types: for i in range(0, max, size): - tasks 
= openml.tasks.list_tasks( - task_type=j, - offset=i, - size=size, - output_format="dataframe", - ) + tasks = openml.tasks.list_tasks(task_type=j, offset=i, size=size) assert size >= len(tasks) for task in tasks.to_dict(orient="index").values(): assert j == task["ttid"] diff --git a/tests/test_tasks/test_task_methods.py b/tests/test_tasks/test_task_methods.py index 552fbe949..4480c2cbc 100644 --- a/tests/test_tasks/test_task_methods.py +++ b/tests/test_tasks/test_task_methods.py @@ -20,14 +20,14 @@ def test_tagging(self): # tags can be at most 64 alphanumeric (+ underscore) chars unique_indicator = str(time()).replace(".", "") tag = f"test_tag_OpenMLTaskMethodsTest_{unique_indicator}" - tasks = openml.tasks.list_tasks(tag=tag, output_format="dataframe") + tasks = openml.tasks.list_tasks(tag=tag) assert len(tasks) == 0 task.push_tag(tag) - tasks = openml.tasks.list_tasks(tag=tag, output_format="dataframe") + tasks = openml.tasks.list_tasks(tag=tag) assert len(tasks) == 1 assert 1 in tasks["tid"] task.remove_tag(tag) - tasks = openml.tasks.list_tasks(tag=tag, output_format="dataframe") + tasks = openml.tasks.list_tasks(tag=tag) assert len(tasks) == 0 def test_get_train_and_test_split_indices(self): diff --git a/tests/test_utils/test_utils.py b/tests/test_utils/test_utils.py index 38d497e2d..51504dedc 100644 --- a/tests/test_utils/test_utils.py +++ b/tests/test_utils/test_utils.py @@ -88,11 +88,7 @@ def test_list_all(): @pytest.mark.server() def test_list_all_for_tasks(min_number_tasks_on_test_server): - tasks = openml.tasks.list_tasks( - batch_size=1000, - size=min_number_tasks_on_test_server, - output_format="dataframe", - ) + tasks = openml.tasks.list_tasks(size=min_number_tasks_on_test_server) assert min_number_tasks_on_test_server == len(tasks) @@ -124,11 +120,7 @@ def test_list_all_for_datasets(min_number_datasets_on_test_server): @pytest.mark.server() def test_list_all_for_flows(min_number_flows_on_test_server): - flows = openml.flows.list_flows( - batch_size=25, - size=min_number_flows_on_test_server, - output_format="dataframe", - ) + flows = openml.flows.list_flows(size=min_number_flows_on_test_server) assert min_number_flows_on_test_server == len(flows) @@ -161,12 +153,7 @@ def test_list_all_for_evaluations(min_number_evaluations_on_test_server): @pytest.mark.server() @unittest.mock.patch("openml._api_calls._perform_api_call", side_effect=_mocked_perform_api_call) def test_list_all_few_results_available(_perform_api_call): - datasets = openml.datasets.list_datasets( - size=1000, - data_name="iris", - data_version=1, - output_format="dataframe", - ) + datasets = openml.datasets.list_datasets(size=1000, data_name="iris", data_version=1) assert len(datasets) == 1, "only one iris dataset version 1 should be present" assert _perform_api_call.call_count == 1, "expect just one call to get one dataset" From 2c8212287a9a76704f475473f6ee0b3cb03b885e Mon Sep 17 00:00:00 2001 From: SubhadityaMukherjee Date: Tue, 1 Apr 2025 15:05:22 +0200 Subject: [PATCH 05/23] rename target_names back to target --- examples/20_basic/simple_datasets_tutorial.py | 2 +- examples/20_basic/simple_flows_and_runs_tutorial.py | 2 +- examples/30_extended/datasets_tutorial.py | 2 +- examples/30_extended/flows_and_runs_tutorial.py | 4 ++-- openml/tasks/task.py | 4 ++-- tests/test_datasets/test_dataset.py | 10 +++++----- .../test_sklearn_extension/test_sklearn_extension.py | 2 +- 7 files changed, 13 insertions(+), 13 deletions(-) diff --git a/examples/20_basic/simple_datasets_tutorial.py 
b/examples/20_basic/simple_datasets_tutorial.py index fed997d58..9b18aab14 100644 --- a/examples/20_basic/simple_datasets_tutorial.py +++ b/examples/20_basic/simple_datasets_tutorial.py @@ -48,7 +48,7 @@ # attribute_names - the names of the features for the examples (X) and # target feature (y) X, y, categorical_indicator, attribute_names = dataset.get_data( - target_names=dataset.default_target_attribute + target=dataset.default_target_attribute ) ############################################################################ diff --git a/examples/20_basic/simple_flows_and_runs_tutorial.py b/examples/20_basic/simple_flows_and_runs_tutorial.py index 63a3a6d85..f7d7a49d1 100644 --- a/examples/20_basic/simple_flows_and_runs_tutorial.py +++ b/examples/20_basic/simple_flows_and_runs_tutorial.py @@ -23,7 +23,7 @@ # NOTE: We are using dataset "diabetes" from the test server: https://test.openml.org/d/20 dataset = openml.datasets.get_dataset(dataset_id="diabetes", version=1) X, y, categorical_indicator, attribute_names = dataset.get_data( - target_names=dataset.default_target_attribute + target=dataset.default_target_attribute ) clf = neighbors.KNeighborsClassifier(n_neighbors=3) clf.fit(X, y) diff --git a/examples/30_extended/datasets_tutorial.py b/examples/30_extended/datasets_tutorial.py index 2841c68f7..77a46d8b0 100644 --- a/examples/30_extended/datasets_tutorial.py +++ b/examples/30_extended/datasets_tutorial.py @@ -68,7 +68,7 @@ # dataset. In particular, many datasets for supervised problems have a set # `default_target_attribute` which may help identify the target variable. X, y, categorical_indicator, attribute_names = dataset.get_data( - target_names=dataset.default_target_attribute + target=dataset.default_target_attribute ) print(X.head()) print(X.info()) diff --git a/examples/30_extended/flows_and_runs_tutorial.py b/examples/30_extended/flows_and_runs_tutorial.py index bbe94c1dc..afd398feb 100644 --- a/examples/30_extended/flows_and_runs_tutorial.py +++ b/examples/30_extended/flows_and_runs_tutorial.py @@ -27,7 +27,7 @@ # NOTE: We are using dataset 68 from the test server: https://test.openml.org/d/68 dataset = openml.datasets.get_dataset(dataset_id="eeg-eye-state", version=1) X, y, categorical_indicator, attribute_names = dataset.get_data( - target_names=dataset.default_target_attribute + target=dataset.default_target_attribute ) clf = neighbors.KNeighborsClassifier(n_neighbors=1) clf.fit(X, y) @@ -38,7 +38,7 @@ # * e.g. 
categorical features -> do feature encoding dataset = openml.datasets.get_dataset(dataset_id="credit-g", version=1) X, y, categorical_indicator, attribute_names = dataset.get_data( - target_names=dataset.default_target_attribute + target=dataset.default_target_attribute ) print(f"Categorical features: {categorical_indicator}") transformer = compose.ColumnTransformer( diff --git a/openml/tasks/task.py b/openml/tasks/task.py index c3079db91..395b52482 100644 --- a/openml/tasks/task.py +++ b/openml/tasks/task.py @@ -292,7 +292,7 @@ def get_X_and_y(self) -> tuple[pd.DataFrame, pd.Series | pd.DataFrame | None]: ): raise NotImplementedError(self.task_type) - X, y, _, _ = dataset.get_data(target_names=self.target_name) + X, y, _, _ = dataset.get_data(target=self.target_name) return X, y def _to_dict(self) -> dict[str, dict]: @@ -502,7 +502,7 @@ def get_X(self) -> pd.DataFrame: The X data as a dataframe """ dataset = self.get_dataset() - data, *_ = dataset.get_data(target_names=None) + data, *_ = dataset.get_data(target=None) return data def _to_dict(self) -> dict[str, dict[str, int | str | list[dict[str, Any]]]]: diff --git a/tests/test_datasets/test_dataset.py b/tests/test_datasets/test_dataset.py index 0675c21d2..8afc1fb91 100644 --- a/tests/test_datasets/test_dataset.py +++ b/tests/test_datasets/test_dataset.py @@ -120,7 +120,7 @@ def test_get_data_pandas(self): assert data[col_name].dtype.name == col_dtype[col_name] X, y, _, _ = self.titanic.get_data( - target_names=self.titanic.default_target_attribute, + target=self.titanic.default_target_attribute, ) assert isinstance(X, pd.DataFrame) assert isinstance(y, pd.Series) @@ -171,7 +171,7 @@ def test_get_data_with_rowid(self): @pytest.mark.skip("https://github.com/openml/openml-python/issues/1157") def test_get_data_with_target_pandas(self): - X, y, categorical, attribute_names = self.dataset.get_data(target_names="class") + X, y, categorical, attribute_names = self.dataset.get_data(target="class") assert isinstance(X, pd.DataFrame) for dtype, is_cat, col in zip(X.dtypes, categorical, X): self._check_expected_type(dtype, is_cat, X[col]) @@ -187,7 +187,7 @@ def test_get_data_with_target_pandas(self): def test_get_data_rowid_and_ignore_and_target(self): self.dataset.ignore_attribute = ["condition"] self.dataset.row_id_attribute = ["hardness"] - X, y, categorical, names = self.dataset.get_data(target_names="class") + X, y, categorical, names = self.dataset.get_data(target="class") assert X.shape == (898, 36) assert len(categorical) == 36 cats = [True] * 3 + [False, True, True, False] + [True] * 23 + [False] * 3 + [True] * 3 @@ -348,7 +348,7 @@ def setUp(self): self.sparse_dataset = openml.datasets.get_dataset(4136, download_data=False) def test_get_sparse_dataset_dataframe_with_target(self): - X, y, _, attribute_names = self.sparse_dataset.get_data(target_names="class") + X, y, _, attribute_names = self.sparse_dataset.get_data(target="class") assert isinstance(X, pd.DataFrame) assert isinstance(X.dtypes[0], pd.SparseDtype) assert X.shape == (600, 20000) @@ -375,7 +375,7 @@ def test_get_sparse_dataset_rowid_and_ignore_and_target(self): self.sparse_dataset.row_id_attribute = ["V512"] # TODO(eddiebergman): Will break from dataset_format removal X, y, categorical, _ = self.sparse_dataset.get_data( - target_names="class", + target="class", include_row_id=False, include_ignore_attribute=False, ) diff --git a/tests/test_extensions/test_sklearn_extension/test_sklearn_extension.py b/tests/test_extensions/test_sklearn_extension/test_sklearn_extension.py 
index aeae743c2..242f6793c 100644 --- a/tests/test_extensions/test_sklearn_extension/test_sklearn_extension.py +++ b/tests/test_extensions/test_sklearn_extension/test_sklearn_extension.py @@ -2220,7 +2220,7 @@ def test_run_on_model_with_empty_steps(self): task = openml.tasks.get_task(59) # mfeat-pixel; crossvalidation X, y, categorical_ind, feature_names = dataset.get_data( - target_names=dataset.default_target_attribute, + target=dataset.default_target_attribute, ) categorical_ind = np.array(categorical_ind) (cat_idx,) = np.where(categorical_ind) From ce5788108dbb6047ccbf73a261ea8ed04d4c910f Mon Sep 17 00:00:00 2001 From: LennartPurucker Date: Mon, 16 Jun 2025 10:33:31 +0200 Subject: [PATCH 06/23] fix: resolve kdd_rijn example to be without dataframe parameter and remove wrong merge files --- examples/40_paper/2018_kdd_rijn_example.py | 14 +- .../2018_kdd_rijn_example_BACKUP_34122.py | 271 ------------------ .../2018_kdd_rijn_example_BACKUP_39115.py | 271 ------------------ .../2018_kdd_rijn_example_BASE_34122.py | 189 ------------ .../2018_kdd_rijn_example_BASE_39115.py | 189 ------------ .../2018_kdd_rijn_example_LOCAL_34122.py | 188 ------------ .../2018_kdd_rijn_example_LOCAL_39115.py | 188 ------------ .../2018_kdd_rijn_example_REMOTE_34122.py | 188 ------------ .../2018_kdd_rijn_example_REMOTE_39115.py | 188 ------------ 9 files changed, 11 insertions(+), 1675 deletions(-) delete mode 100644 examples/40_paper/2018_kdd_rijn_example_BACKUP_34122.py delete mode 100644 examples/40_paper/2018_kdd_rijn_example_BACKUP_39115.py delete mode 100644 examples/40_paper/2018_kdd_rijn_example_BASE_34122.py delete mode 100644 examples/40_paper/2018_kdd_rijn_example_BASE_39115.py delete mode 100644 examples/40_paper/2018_kdd_rijn_example_LOCAL_34122.py delete mode 100644 examples/40_paper/2018_kdd_rijn_example_LOCAL_39115.py delete mode 100644 examples/40_paper/2018_kdd_rijn_example_REMOTE_34122.py delete mode 100644 examples/40_paper/2018_kdd_rijn_example_REMOTE_39115.py diff --git a/examples/40_paper/2018_kdd_rijn_example.py b/examples/40_paper/2018_kdd_rijn_example.py index f38466502..f98f5de55 100644 --- a/examples/40_paper/2018_kdd_rijn_example.py +++ b/examples/40_paper/2018_kdd_rijn_example.py @@ -106,13 +106,19 @@ size=limit_per_task, ) - + performance_column = "value" + # make a DataFrame consisting of all hyperparameters (which is a dict in setup['parameters']) and the performance + # value (in setup['value']). The following line looks a bit complicated, but combines 2 tasks: a) combine + # hyperparameters and performance data in a single dict, b) cast hyperparameter values to the appropriate format + # Note that the ``json.loads(...)`` requires the content to be in JSON format, which is only the case for + # scikit-learn setups (and even there some legacy setups might violate this requirement). It will work for the + # setups that belong to the flows embedded in this example though. 
try: setups_evals = pd.DataFrame( [ dict( **{name: json.loads(value) for name, value in setup["parameters"].items()}, - **{performance_column: setup[performance_column]} + **{performance_column: setup[performance_column]}, ) for _, setup in evals.iterrows() ] @@ -152,7 +158,9 @@ fanova_results.append( { "hyperparameter": pname.split(".")[-1], - "fanova": evaluator.quantify_importance([idx])[(idx,)]["individual importance"], + "fanova": evaluator.quantify_importance([idx])[(idx,)][ + "individual importance" + ], } ) except RuntimeError as e: diff --git a/examples/40_paper/2018_kdd_rijn_example_BACKUP_34122.py b/examples/40_paper/2018_kdd_rijn_example_BACKUP_34122.py deleted file mode 100644 index bf3b7ac83..000000000 --- a/examples/40_paper/2018_kdd_rijn_example_BACKUP_34122.py +++ /dev/null @@ -1,271 +0,0 @@ -""" -van Rijn and Hutter (2018) -========================== - -A tutorial on how to reproduce the paper *Hyperparameter Importance Across Datasets*. - -Example Deprecation Warning! -~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -This example is not supported anymore by the OpenML-Python developers. The example is kept for reference purposes but not tested anymore. - -Publication -~~~~~~~~~~~ - -| Hyperparameter importance across datasets -| Jan N. van Rijn and Frank Hutter -| In *Proceedings of the 24th ACM SIGKDD International Conference on Knowledge Discovery & Data Mining*, 2018 -| Available at https://dl.acm.org/doi/10.1145/3219819.3220058 - -Requirements -~~~~~~~~~~~~ - -This is a Unix-only tutorial, as the requirements can not be satisfied on a Windows machine (Untested on other -systems). - -The following Python packages are required: - -pip install openml[examples,docs] fanova ConfigSpace<1.0 -""" - -# License: BSD 3-Clause - -import sys - -if sys.platform == "win32": - print( - "The pyrfr library (requirement of fanova) can currently not be installed on Windows systems" - ) - sys.exit() - -# DEPRECATED EXAMPLE -- Avoid running this code in our CI/CD pipeline -<<<<<<< HEAD -print("This example is deprecated, remove this code to use it manually.") -sys.exit() - -import json - -import fanova -import matplotlib.pyplot as plt -import pandas as pd -import seaborn as sns - -import openml - -############################################################################## -# With the advent of automated machine learning, automated hyperparameter -# optimization methods are by now routinely used in data mining. However, this -# progress is not yet matched by equal progress on automatic analyses that -# yield information beyond performance-optimizing hyperparameter settings. -# In this example, we aim to answer the following two questions: Given an -# algorithm, what are generally its most important hyperparameters? -# -# This work is carried out on the OpenML-100 benchmark suite, which can be -# obtained by ``openml.study.get_suite('OpenML100')``. In this example, we -# conduct the experiment on the Support Vector Machine (``flow_id=7707``) -# with specific kernel (we will perform a post-process filter operation for -# this). We should set some other experimental parameters (number of results -# per task, evaluation measure and the number of trees of the internal -# functional Anova) before the fun can begin. 
-# -# Note that we simplify the example in several ways: -# -# 1) We only consider numerical hyperparameters -# 2) We consider all hyperparameters that are numerical (in reality, some -# hyperparameters might be inactive (e.g., ``degree``) or irrelevant -# (e.g., ``random_state``) -# 3) We assume all hyperparameters to be on uniform scale -# -# Any difference in conclusion between the actual paper and the presented -# results is most likely due to one of these simplifications. For example, -# the hyperparameter C looks rather insignificant, whereas it is quite -# important when it is put on a log-scale. All these simplifications can be -# addressed by defining a ConfigSpace. For a more elaborated example that uses -# this, please see: -# https://github.com/janvanrijn/openml-pimp/blob/d0a14f3eb480f2a90008889f00041bdccc7b9265/examples/plot/plot_fanova_aggregates.py -======= -print("This example is deprecated, remove the `if False` in this code to use it manually.") -if False: - import json - import fanova - import matplotlib.pyplot as plt - import pandas as pd - import seaborn as sns - - import openml - - - ############################################################################## - # With the advent of automated machine learning, automated hyperparameter - # optimization methods are by now routinely used in data mining. However, this - # progress is not yet matched by equal progress on automatic analyses that - # yield information beyond performance-optimizing hyperparameter settings. - # In this example, we aim to answer the following two questions: Given an - # algorithm, what are generally its most important hyperparameters? - # - # This work is carried out on the OpenML-100 benchmark suite, which can be - # obtained by ``openml.study.get_suite('OpenML100')``. In this example, we - # conduct the experiment on the Support Vector Machine (``flow_id=7707``) - # with specific kernel (we will perform a post-process filter operation for - # this). We should set some other experimental parameters (number of results - # per task, evaluation measure and the number of trees of the internal - # functional Anova) before the fun can begin. - # - # Note that we simplify the example in several ways: - # - # 1) We only consider numerical hyperparameters - # 2) We consider all hyperparameters that are numerical (in reality, some - # hyperparameters might be inactive (e.g., ``degree``) or irrelevant - # (e.g., ``random_state``) - # 3) We assume all hyperparameters to be on uniform scale - # - # Any difference in conclusion between the actual paper and the presented - # results is most likely due to one of these simplifications. For example, - # the hyperparameter C looks rather insignificant, whereas it is quite - # important when it is put on a log-scale. All these simplifications can be - # addressed by defining a ConfigSpace. For a more elaborated example that uses - # this, please see: - # https://github.com/janvanrijn/openml-pimp/blob/d0a14f3eb480f2a90008889f00041bdccc7b9265/examples/plot/plot_fanova_aggregates.py # noqa F401 ->>>>>>> develop - - suite = openml.study.get_suite("OpenML100") - flow_id = 7707 - parameter_filters = {"sklearn.svm.classes.SVC(17)_kernel": "sigmoid"} - evaluation_measure = "predictive_accuracy" - limit_per_task = 500 - limit_nr_tasks = 15 - n_trees = 16 - -<<<<<<< HEAD -fanova_results = [] -# we will obtain all results from OpenML per task. 
Practice has shown that this places the bottleneck on the -# communication with OpenML, and for iterated experimenting it is better to cache the results in a local file. -for idx, task_id in enumerate(suite.tasks): - if limit_nr_tasks is not None and idx >= limit_nr_tasks: - continue - print( - "Starting with task %d (%d/%d)" - % (task_id, idx + 1, len(suite.tasks) if limit_nr_tasks is None else limit_nr_tasks) - ) - # note that we explicitly only include tasks from the benchmark suite that was specified (as per the for-loop) - evals = openml.evaluations.list_evaluations_setups( - evaluation_measure, - flows=[flow_id], - tasks=[task_id], - size=limit_per_task, - ) - - performance_column = "value" - # make a DataFrame consisting of all hyperparameters (which is a dict in setup['parameters']) and the performance - # value (in setup['value']). The following line looks a bit complicated, but combines 2 tasks: a) combine - # hyperparameters and performance data in a single dict, b) cast hyperparameter values to the appropriate format - # Note that the ``json.loads(...)`` requires the content to be in JSON format, which is only the case for - # scikit-learn setups (and even there some legacy setups might violate this requirement). It will work for the - # setups that belong to the flows embedded in this example though. - try: - setups_evals = pd.DataFrame( - [ - dict( - **{name: json.loads(value) for name, value in setup["parameters"].items()}, - **{performance_column: setup[performance_column]}, - ) - for _, setup in evals.iterrows() - ] -======= - fanova_results = [] - # we will obtain all results from OpenML per task. Practice has shown that this places the bottleneck on the - # communication with OpenML, and for iterated experimenting it is better to cache the results in a local file. - for idx, task_id in enumerate(suite.tasks): - if limit_nr_tasks is not None and idx >= limit_nr_tasks: - continue - print( - "Starting with task %d (%d/%d)" - % (task_id, idx + 1, len(suite.tasks) if limit_nr_tasks is None else limit_nr_tasks) - ) - # note that we explicitly only include tasks from the benchmark suite that was specified (as per the for-loop) - evals = openml.evaluations.list_evaluations_setups( - evaluation_measure, - flows=[flow_id], - tasks=[task_id], - size=limit_per_task, - output_format="dataframe", ->>>>>>> develop - ) - - performance_column = "value" - # make a DataFrame consisting of all hyperparameters (which is a dict in setup['parameters']) and the performance - # value (in setup['value']). The following line looks a bit complicated, but combines 2 tasks: a) combine - # hyperparameters and performance data in a single dict, b) cast hyperparameter values to the appropriate format - # Note that the ``json.loads(...)`` requires the content to be in JSON format, which is only the case for - # scikit-learn setups (and even there some legacy setups might violate this requirement). It will work for the - # setups that belong to the flows embedded in this example though. 
- try: - setups_evals = pd.DataFrame( - [ - dict( - **{name: json.loads(value) for name, value in setup["parameters"].items()}, - **{performance_column: setup[performance_column]} - ) - for _, setup in evals.iterrows() - ] - ) - except json.decoder.JSONDecodeError as e: - print("Task %d error: %s" % (task_id, e)) - continue - # apply our filters, to have only the setups that comply to the hyperparameters we want - for filter_key, filter_value in parameter_filters.items(): - setups_evals = setups_evals[setups_evals[filter_key] == filter_value] - # in this simplified example, we only display numerical and float hyperparameters. For categorical hyperparameters, - # the fanova library needs to be informed by using a configspace object. - setups_evals = setups_evals.select_dtypes(include=["int64", "float64"]) - # drop rows with unique values. These are by definition not an interesting hyperparameter, e.g., ``axis``, - # ``verbose``. - setups_evals = setups_evals[ - [ - c - for c in list(setups_evals) - if len(setups_evals[c].unique()) > 1 or c == performance_column - ] - ] - # We are done with processing ``setups_evals``. Note that we still might have some irrelevant hyperparameters, e.g., - # ``random_state``. We have dropped some relevant hyperparameters, i.e., several categoricals. Let's check it out: - - # determine x values to pass to fanova library - parameter_names = [ - pname for pname in setups_evals.columns.to_numpy() if pname != performance_column - ] - evaluator = fanova.fanova.fANOVA( - X=setups_evals[parameter_names].to_numpy(), - Y=setups_evals[performance_column].to_numpy(), - n_trees=n_trees, - ) - for idx, pname in enumerate(parameter_names): - try: - fanova_results.append( - { - "hyperparameter": pname.split(".")[-1], - "fanova": evaluator.quantify_importance([idx])[(idx,)]["individual importance"], - } - ) - except RuntimeError as e: - # functional ANOVA sometimes crashes with a RuntimeError, e.g., on tasks where the performance is constant - # for all configurations (there is no variance). We will skip these tasks (like the authors did in the - # paper). - print("Task %d error: %s" % (task_id, e)) - continue - - # transform ``fanova_results`` from a list of dicts into a DataFrame - fanova_results = pd.DataFrame(fanova_results) - - ############################################################################## - # make the boxplot of the variance contribution. Obviously, we can also use - # this data to make the Nemenyi plot, but this relies on the rather complex - # ``Orange`` dependency (``pip install Orange3``). For the complete example, - # the reader is referred to the more elaborate script (referred to earlier) - fig, ax = plt.subplots() - sns.boxplot(x="hyperparameter", y="fanova", data=fanova_results, ax=ax) - ax.set_xticklabels(ax.get_xticklabels(), rotation=45, ha="right") - ax.set_ylabel("Variance Contribution") - ax.set_xlabel(None) - plt.tight_layout() - plt.show() diff --git a/examples/40_paper/2018_kdd_rijn_example_BACKUP_39115.py b/examples/40_paper/2018_kdd_rijn_example_BACKUP_39115.py deleted file mode 100644 index bf3b7ac83..000000000 --- a/examples/40_paper/2018_kdd_rijn_example_BACKUP_39115.py +++ /dev/null @@ -1,271 +0,0 @@ -""" -van Rijn and Hutter (2018) -========================== - -A tutorial on how to reproduce the paper *Hyperparameter Importance Across Datasets*. - -Example Deprecation Warning! -~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -This example is not supported anymore by the OpenML-Python developers. 
The example is kept for reference purposes but not tested anymore. - -Publication -~~~~~~~~~~~ - -| Hyperparameter importance across datasets -| Jan N. van Rijn and Frank Hutter -| In *Proceedings of the 24th ACM SIGKDD International Conference on Knowledge Discovery & Data Mining*, 2018 -| Available at https://dl.acm.org/doi/10.1145/3219819.3220058 - -Requirements -~~~~~~~~~~~~ - -This is a Unix-only tutorial, as the requirements can not be satisfied on a Windows machine (Untested on other -systems). - -The following Python packages are required: - -pip install openml[examples,docs] fanova ConfigSpace<1.0 -""" - -# License: BSD 3-Clause - -import sys - -if sys.platform == "win32": - print( - "The pyrfr library (requirement of fanova) can currently not be installed on Windows systems" - ) - sys.exit() - -# DEPRECATED EXAMPLE -- Avoid running this code in our CI/CD pipeline -<<<<<<< HEAD -print("This example is deprecated, remove this code to use it manually.") -sys.exit() - -import json - -import fanova -import matplotlib.pyplot as plt -import pandas as pd -import seaborn as sns - -import openml - -############################################################################## -# With the advent of automated machine learning, automated hyperparameter -# optimization methods are by now routinely used in data mining. However, this -# progress is not yet matched by equal progress on automatic analyses that -# yield information beyond performance-optimizing hyperparameter settings. -# In this example, we aim to answer the following two questions: Given an -# algorithm, what are generally its most important hyperparameters? -# -# This work is carried out on the OpenML-100 benchmark suite, which can be -# obtained by ``openml.study.get_suite('OpenML100')``. In this example, we -# conduct the experiment on the Support Vector Machine (``flow_id=7707``) -# with specific kernel (we will perform a post-process filter operation for -# this). We should set some other experimental parameters (number of results -# per task, evaluation measure and the number of trees of the internal -# functional Anova) before the fun can begin. -# -# Note that we simplify the example in several ways: -# -# 1) We only consider numerical hyperparameters -# 2) We consider all hyperparameters that are numerical (in reality, some -# hyperparameters might be inactive (e.g., ``degree``) or irrelevant -# (e.g., ``random_state``) -# 3) We assume all hyperparameters to be on uniform scale -# -# Any difference in conclusion between the actual paper and the presented -# results is most likely due to one of these simplifications. For example, -# the hyperparameter C looks rather insignificant, whereas it is quite -# important when it is put on a log-scale. All these simplifications can be -# addressed by defining a ConfigSpace. For a more elaborated example that uses -# this, please see: -# https://github.com/janvanrijn/openml-pimp/blob/d0a14f3eb480f2a90008889f00041bdccc7b9265/examples/plot/plot_fanova_aggregates.py -======= -print("This example is deprecated, remove the `if False` in this code to use it manually.") -if False: - import json - import fanova - import matplotlib.pyplot as plt - import pandas as pd - import seaborn as sns - - import openml - - - ############################################################################## - # With the advent of automated machine learning, automated hyperparameter - # optimization methods are by now routinely used in data mining. 
However, this - # progress is not yet matched by equal progress on automatic analyses that - # yield information beyond performance-optimizing hyperparameter settings. - # In this example, we aim to answer the following two questions: Given an - # algorithm, what are generally its most important hyperparameters? - # - # This work is carried out on the OpenML-100 benchmark suite, which can be - # obtained by ``openml.study.get_suite('OpenML100')``. In this example, we - # conduct the experiment on the Support Vector Machine (``flow_id=7707``) - # with specific kernel (we will perform a post-process filter operation for - # this). We should set some other experimental parameters (number of results - # per task, evaluation measure and the number of trees of the internal - # functional Anova) before the fun can begin. - # - # Note that we simplify the example in several ways: - # - # 1) We only consider numerical hyperparameters - # 2) We consider all hyperparameters that are numerical (in reality, some - # hyperparameters might be inactive (e.g., ``degree``) or irrelevant - # (e.g., ``random_state``) - # 3) We assume all hyperparameters to be on uniform scale - # - # Any difference in conclusion between the actual paper and the presented - # results is most likely due to one of these simplifications. For example, - # the hyperparameter C looks rather insignificant, whereas it is quite - # important when it is put on a log-scale. All these simplifications can be - # addressed by defining a ConfigSpace. For a more elaborated example that uses - # this, please see: - # https://github.com/janvanrijn/openml-pimp/blob/d0a14f3eb480f2a90008889f00041bdccc7b9265/examples/plot/plot_fanova_aggregates.py # noqa F401 ->>>>>>> develop - - suite = openml.study.get_suite("OpenML100") - flow_id = 7707 - parameter_filters = {"sklearn.svm.classes.SVC(17)_kernel": "sigmoid"} - evaluation_measure = "predictive_accuracy" - limit_per_task = 500 - limit_nr_tasks = 15 - n_trees = 16 - -<<<<<<< HEAD -fanova_results = [] -# we will obtain all results from OpenML per task. Practice has shown that this places the bottleneck on the -# communication with OpenML, and for iterated experimenting it is better to cache the results in a local file. -for idx, task_id in enumerate(suite.tasks): - if limit_nr_tasks is not None and idx >= limit_nr_tasks: - continue - print( - "Starting with task %d (%d/%d)" - % (task_id, idx + 1, len(suite.tasks) if limit_nr_tasks is None else limit_nr_tasks) - ) - # note that we explicitly only include tasks from the benchmark suite that was specified (as per the for-loop) - evals = openml.evaluations.list_evaluations_setups( - evaluation_measure, - flows=[flow_id], - tasks=[task_id], - size=limit_per_task, - ) - - performance_column = "value" - # make a DataFrame consisting of all hyperparameters (which is a dict in setup['parameters']) and the performance - # value (in setup['value']). The following line looks a bit complicated, but combines 2 tasks: a) combine - # hyperparameters and performance data in a single dict, b) cast hyperparameter values to the appropriate format - # Note that the ``json.loads(...)`` requires the content to be in JSON format, which is only the case for - # scikit-learn setups (and even there some legacy setups might violate this requirement). It will work for the - # setups that belong to the flows embedded in this example though. 
- try: - setups_evals = pd.DataFrame( - [ - dict( - **{name: json.loads(value) for name, value in setup["parameters"].items()}, - **{performance_column: setup[performance_column]}, - ) - for _, setup in evals.iterrows() - ] -======= - fanova_results = [] - # we will obtain all results from OpenML per task. Practice has shown that this places the bottleneck on the - # communication with OpenML, and for iterated experimenting it is better to cache the results in a local file. - for idx, task_id in enumerate(suite.tasks): - if limit_nr_tasks is not None and idx >= limit_nr_tasks: - continue - print( - "Starting with task %d (%d/%d)" - % (task_id, idx + 1, len(suite.tasks) if limit_nr_tasks is None else limit_nr_tasks) - ) - # note that we explicitly only include tasks from the benchmark suite that was specified (as per the for-loop) - evals = openml.evaluations.list_evaluations_setups( - evaluation_measure, - flows=[flow_id], - tasks=[task_id], - size=limit_per_task, - output_format="dataframe", ->>>>>>> develop - ) - - performance_column = "value" - # make a DataFrame consisting of all hyperparameters (which is a dict in setup['parameters']) and the performance - # value (in setup['value']). The following line looks a bit complicated, but combines 2 tasks: a) combine - # hyperparameters and performance data in a single dict, b) cast hyperparameter values to the appropriate format - # Note that the ``json.loads(...)`` requires the content to be in JSON format, which is only the case for - # scikit-learn setups (and even there some legacy setups might violate this requirement). It will work for the - # setups that belong to the flows embedded in this example though. - try: - setups_evals = pd.DataFrame( - [ - dict( - **{name: json.loads(value) for name, value in setup["parameters"].items()}, - **{performance_column: setup[performance_column]} - ) - for _, setup in evals.iterrows() - ] - ) - except json.decoder.JSONDecodeError as e: - print("Task %d error: %s" % (task_id, e)) - continue - # apply our filters, to have only the setups that comply to the hyperparameters we want - for filter_key, filter_value in parameter_filters.items(): - setups_evals = setups_evals[setups_evals[filter_key] == filter_value] - # in this simplified example, we only display numerical and float hyperparameters. For categorical hyperparameters, - # the fanova library needs to be informed by using a configspace object. - setups_evals = setups_evals.select_dtypes(include=["int64", "float64"]) - # drop rows with unique values. These are by definition not an interesting hyperparameter, e.g., ``axis``, - # ``verbose``. - setups_evals = setups_evals[ - [ - c - for c in list(setups_evals) - if len(setups_evals[c].unique()) > 1 or c == performance_column - ] - ] - # We are done with processing ``setups_evals``. Note that we still might have some irrelevant hyperparameters, e.g., - # ``random_state``. We have dropped some relevant hyperparameters, i.e., several categoricals. 
Let's check it out: - - # determine x values to pass to fanova library - parameter_names = [ - pname for pname in setups_evals.columns.to_numpy() if pname != performance_column - ] - evaluator = fanova.fanova.fANOVA( - X=setups_evals[parameter_names].to_numpy(), - Y=setups_evals[performance_column].to_numpy(), - n_trees=n_trees, - ) - for idx, pname in enumerate(parameter_names): - try: - fanova_results.append( - { - "hyperparameter": pname.split(".")[-1], - "fanova": evaluator.quantify_importance([idx])[(idx,)]["individual importance"], - } - ) - except RuntimeError as e: - # functional ANOVA sometimes crashes with a RuntimeError, e.g., on tasks where the performance is constant - # for all configurations (there is no variance). We will skip these tasks (like the authors did in the - # paper). - print("Task %d error: %s" % (task_id, e)) - continue - - # transform ``fanova_results`` from a list of dicts into a DataFrame - fanova_results = pd.DataFrame(fanova_results) - - ############################################################################## - # make the boxplot of the variance contribution. Obviously, we can also use - # this data to make the Nemenyi plot, but this relies on the rather complex - # ``Orange`` dependency (``pip install Orange3``). For the complete example, - # the reader is referred to the more elaborate script (referred to earlier) - fig, ax = plt.subplots() - sns.boxplot(x="hyperparameter", y="fanova", data=fanova_results, ax=ax) - ax.set_xticklabels(ax.get_xticklabels(), rotation=45, ha="right") - ax.set_ylabel("Variance Contribution") - ax.set_xlabel(None) - plt.tight_layout() - plt.show() diff --git a/examples/40_paper/2018_kdd_rijn_example_BASE_34122.py b/examples/40_paper/2018_kdd_rijn_example_BASE_34122.py deleted file mode 100644 index 7ec60fe53..000000000 --- a/examples/40_paper/2018_kdd_rijn_example_BASE_34122.py +++ /dev/null @@ -1,189 +0,0 @@ -""" -van Rijn and Hutter (2018) -========================== - -A tutorial on how to reproduce the paper *Hyperparameter Importance Across Datasets*. - -Example Deprecation Warning! -~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -This example is not supported anymore by the OpenML-Python developers. The example is kept for reference purposes but not tested anymore. - -Publication -~~~~~~~~~~~ - -| Hyperparameter importance across datasets -| Jan N. van Rijn and Frank Hutter -| In *Proceedings of the 24th ACM SIGKDD International Conference on Knowledge Discovery & Data Mining*, 2018 -| Available at https://dl.acm.org/doi/10.1145/3219819.3220058 - -Requirements -~~~~~~~~~~~~ - -This is a Unix-only tutorial, as the requirements can not be satisfied on a Windows machine (Untested on other -systems). - -The following Python packages are required: - -pip install openml[examples,docs] fanova ConfigSpace<1.0 -""" - -# License: BSD 3-Clause - -import sys - -if sys.platform == "win32": # noqa - print( - "The pyrfr library (requirement of fanova) can currently not be installed on Windows systems" - ) - exit() - -# DEPRECATED EXAMPLE -- Avoid running this code in our CI/CD pipeline -print("This example is deprecated, remove this code to use it manually.") -exit() - -import json -import fanova -import matplotlib.pyplot as plt -import pandas as pd -import seaborn as sns - -import openml - - -############################################################################## -# With the advent of automated machine learning, automated hyperparameter -# optimization methods are by now routinely used in data mining. 
However, this -# progress is not yet matched by equal progress on automatic analyses that -# yield information beyond performance-optimizing hyperparameter settings. -# In this example, we aim to answer the following two questions: Given an -# algorithm, what are generally its most important hyperparameters? -# -# This work is carried out on the OpenML-100 benchmark suite, which can be -# obtained by ``openml.study.get_suite('OpenML100')``. In this example, we -# conduct the experiment on the Support Vector Machine (``flow_id=7707``) -# with specific kernel (we will perform a post-process filter operation for -# this). We should set some other experimental parameters (number of results -# per task, evaluation measure and the number of trees of the internal -# functional Anova) before the fun can begin. -# -# Note that we simplify the example in several ways: -# -# 1) We only consider numerical hyperparameters -# 2) We consider all hyperparameters that are numerical (in reality, some -# hyperparameters might be inactive (e.g., ``degree``) or irrelevant -# (e.g., ``random_state``) -# 3) We assume all hyperparameters to be on uniform scale -# -# Any difference in conclusion between the actual paper and the presented -# results is most likely due to one of these simplifications. For example, -# the hyperparameter C looks rather insignificant, whereas it is quite -# important when it is put on a log-scale. All these simplifications can be -# addressed by defining a ConfigSpace. For a more elaborated example that uses -# this, please see: -# https://github.com/janvanrijn/openml-pimp/blob/d0a14f3eb480f2a90008889f00041bdccc7b9265/examples/plot/plot_fanova_aggregates.py # noqa F401 - -suite = openml.study.get_suite("OpenML100") -flow_id = 7707 -parameter_filters = {"sklearn.svm.classes.SVC(17)_kernel": "sigmoid"} -evaluation_measure = "predictive_accuracy" -limit_per_task = 500 -limit_nr_tasks = 15 -n_trees = 16 - -fanova_results = [] -# we will obtain all results from OpenML per task. Practice has shown that this places the bottleneck on the -# communication with OpenML, and for iterated experimenting it is better to cache the results in a local file. -for idx, task_id in enumerate(suite.tasks): - if limit_nr_tasks is not None and idx >= limit_nr_tasks: - continue - print( - "Starting with task %d (%d/%d)" - % (task_id, idx + 1, len(suite.tasks) if limit_nr_tasks is None else limit_nr_tasks) - ) - # note that we explicitly only include tasks from the benchmark suite that was specified (as per the for-loop) - evals = openml.evaluations.list_evaluations_setups( - evaluation_measure, - flows=[flow_id], - tasks=[task_id], - size=limit_per_task, - output_format="dataframe", - ) - - performance_column = "value" - # make a DataFrame consisting of all hyperparameters (which is a dict in setup['parameters']) and the performance - # value (in setup['value']). The following line looks a bit complicated, but combines 2 tasks: a) combine - # hyperparameters and performance data in a single dict, b) cast hyperparameter values to the appropriate format - # Note that the ``json.loads(...)`` requires the content to be in JSON format, which is only the case for - # scikit-learn setups (and even there some legacy setups might violate this requirement). It will work for the - # setups that belong to the flows embedded in this example though. 
- try: - setups_evals = pd.DataFrame( - [ - dict( - **{name: json.loads(value) for name, value in setup["parameters"].items()}, - **{performance_column: setup[performance_column]} - ) - for _, setup in evals.iterrows() - ] - ) - except json.decoder.JSONDecodeError as e: - print("Task %d error: %s" % (task_id, e)) - continue - # apply our filters, to have only the setups that comply to the hyperparameters we want - for filter_key, filter_value in parameter_filters.items(): - setups_evals = setups_evals[setups_evals[filter_key] == filter_value] - # in this simplified example, we only display numerical and float hyperparameters. For categorical hyperparameters, - # the fanova library needs to be informed by using a configspace object. - setups_evals = setups_evals.select_dtypes(include=["int64", "float64"]) - # drop rows with unique values. These are by definition not an interesting hyperparameter, e.g., ``axis``, - # ``verbose``. - setups_evals = setups_evals[ - [ - c - for c in list(setups_evals) - if len(setups_evals[c].unique()) > 1 or c == performance_column - ] - ] - # We are done with processing ``setups_evals``. Note that we still might have some irrelevant hyperparameters, e.g., - # ``random_state``. We have dropped some relevant hyperparameters, i.e., several categoricals. Let's check it out: - - # determine x values to pass to fanova library - parameter_names = [ - pname for pname in setups_evals.columns.to_numpy() if pname != performance_column - ] - evaluator = fanova.fanova.fANOVA( - X=setups_evals[parameter_names].to_numpy(), - Y=setups_evals[performance_column].to_numpy(), - n_trees=n_trees, - ) - for idx, pname in enumerate(parameter_names): - try: - fanova_results.append( - { - "hyperparameter": pname.split(".")[-1], - "fanova": evaluator.quantify_importance([idx])[(idx,)]["individual importance"], - } - ) - except RuntimeError as e: - # functional ANOVA sometimes crashes with a RuntimeError, e.g., on tasks where the performance is constant - # for all configurations (there is no variance). We will skip these tasks (like the authors did in the - # paper). - print("Task %d error: %s" % (task_id, e)) - continue - -# transform ``fanova_results`` from a list of dicts into a DataFrame -fanova_results = pd.DataFrame(fanova_results) - -############################################################################## -# make the boxplot of the variance contribution. Obviously, we can also use -# this data to make the Nemenyi plot, but this relies on the rather complex -# ``Orange`` dependency (``pip install Orange3``). For the complete example, -# the reader is referred to the more elaborate script (referred to earlier) -fig, ax = plt.subplots() -sns.boxplot(x="hyperparameter", y="fanova", data=fanova_results, ax=ax) -ax.set_xticklabels(ax.get_xticklabels(), rotation=45, ha="right") -ax.set_ylabel("Variance Contribution") -ax.set_xlabel(None) -plt.tight_layout() -plt.show() diff --git a/examples/40_paper/2018_kdd_rijn_example_BASE_39115.py b/examples/40_paper/2018_kdd_rijn_example_BASE_39115.py deleted file mode 100644 index 7ec60fe53..000000000 --- a/examples/40_paper/2018_kdd_rijn_example_BASE_39115.py +++ /dev/null @@ -1,189 +0,0 @@ -""" -van Rijn and Hutter (2018) -========================== - -A tutorial on how to reproduce the paper *Hyperparameter Importance Across Datasets*. - -Example Deprecation Warning! -~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -This example is not supported anymore by the OpenML-Python developers. The example is kept for reference purposes but not tested anymore. 
- -Publication -~~~~~~~~~~~ - -| Hyperparameter importance across datasets -| Jan N. van Rijn and Frank Hutter -| In *Proceedings of the 24th ACM SIGKDD International Conference on Knowledge Discovery & Data Mining*, 2018 -| Available at https://dl.acm.org/doi/10.1145/3219819.3220058 - -Requirements -~~~~~~~~~~~~ - -This is a Unix-only tutorial, as the requirements can not be satisfied on a Windows machine (Untested on other -systems). - -The following Python packages are required: - -pip install openml[examples,docs] fanova ConfigSpace<1.0 -""" - -# License: BSD 3-Clause - -import sys - -if sys.platform == "win32": # noqa - print( - "The pyrfr library (requirement of fanova) can currently not be installed on Windows systems" - ) - exit() - -# DEPRECATED EXAMPLE -- Avoid running this code in our CI/CD pipeline -print("This example is deprecated, remove this code to use it manually.") -exit() - -import json -import fanova -import matplotlib.pyplot as plt -import pandas as pd -import seaborn as sns - -import openml - - -############################################################################## -# With the advent of automated machine learning, automated hyperparameter -# optimization methods are by now routinely used in data mining. However, this -# progress is not yet matched by equal progress on automatic analyses that -# yield information beyond performance-optimizing hyperparameter settings. -# In this example, we aim to answer the following two questions: Given an -# algorithm, what are generally its most important hyperparameters? -# -# This work is carried out on the OpenML-100 benchmark suite, which can be -# obtained by ``openml.study.get_suite('OpenML100')``. In this example, we -# conduct the experiment on the Support Vector Machine (``flow_id=7707``) -# with specific kernel (we will perform a post-process filter operation for -# this). We should set some other experimental parameters (number of results -# per task, evaluation measure and the number of trees of the internal -# functional Anova) before the fun can begin. -# -# Note that we simplify the example in several ways: -# -# 1) We only consider numerical hyperparameters -# 2) We consider all hyperparameters that are numerical (in reality, some -# hyperparameters might be inactive (e.g., ``degree``) or irrelevant -# (e.g., ``random_state``) -# 3) We assume all hyperparameters to be on uniform scale -# -# Any difference in conclusion between the actual paper and the presented -# results is most likely due to one of these simplifications. For example, -# the hyperparameter C looks rather insignificant, whereas it is quite -# important when it is put on a log-scale. All these simplifications can be -# addressed by defining a ConfigSpace. For a more elaborated example that uses -# this, please see: -# https://github.com/janvanrijn/openml-pimp/blob/d0a14f3eb480f2a90008889f00041bdccc7b9265/examples/plot/plot_fanova_aggregates.py # noqa F401 - -suite = openml.study.get_suite("OpenML100") -flow_id = 7707 -parameter_filters = {"sklearn.svm.classes.SVC(17)_kernel": "sigmoid"} -evaluation_measure = "predictive_accuracy" -limit_per_task = 500 -limit_nr_tasks = 15 -n_trees = 16 - -fanova_results = [] -# we will obtain all results from OpenML per task. Practice has shown that this places the bottleneck on the -# communication with OpenML, and for iterated experimenting it is better to cache the results in a local file. 
-for idx, task_id in enumerate(suite.tasks): - if limit_nr_tasks is not None and idx >= limit_nr_tasks: - continue - print( - "Starting with task %d (%d/%d)" - % (task_id, idx + 1, len(suite.tasks) if limit_nr_tasks is None else limit_nr_tasks) - ) - # note that we explicitly only include tasks from the benchmark suite that was specified (as per the for-loop) - evals = openml.evaluations.list_evaluations_setups( - evaluation_measure, - flows=[flow_id], - tasks=[task_id], - size=limit_per_task, - output_format="dataframe", - ) - - performance_column = "value" - # make a DataFrame consisting of all hyperparameters (which is a dict in setup['parameters']) and the performance - # value (in setup['value']). The following line looks a bit complicated, but combines 2 tasks: a) combine - # hyperparameters and performance data in a single dict, b) cast hyperparameter values to the appropriate format - # Note that the ``json.loads(...)`` requires the content to be in JSON format, which is only the case for - # scikit-learn setups (and even there some legacy setups might violate this requirement). It will work for the - # setups that belong to the flows embedded in this example though. - try: - setups_evals = pd.DataFrame( - [ - dict( - **{name: json.loads(value) for name, value in setup["parameters"].items()}, - **{performance_column: setup[performance_column]} - ) - for _, setup in evals.iterrows() - ] - ) - except json.decoder.JSONDecodeError as e: - print("Task %d error: %s" % (task_id, e)) - continue - # apply our filters, to have only the setups that comply to the hyperparameters we want - for filter_key, filter_value in parameter_filters.items(): - setups_evals = setups_evals[setups_evals[filter_key] == filter_value] - # in this simplified example, we only display numerical and float hyperparameters. For categorical hyperparameters, - # the fanova library needs to be informed by using a configspace object. - setups_evals = setups_evals.select_dtypes(include=["int64", "float64"]) - # drop rows with unique values. These are by definition not an interesting hyperparameter, e.g., ``axis``, - # ``verbose``. - setups_evals = setups_evals[ - [ - c - for c in list(setups_evals) - if len(setups_evals[c].unique()) > 1 or c == performance_column - ] - ] - # We are done with processing ``setups_evals``. Note that we still might have some irrelevant hyperparameters, e.g., - # ``random_state``. We have dropped some relevant hyperparameters, i.e., several categoricals. Let's check it out: - - # determine x values to pass to fanova library - parameter_names = [ - pname for pname in setups_evals.columns.to_numpy() if pname != performance_column - ] - evaluator = fanova.fanova.fANOVA( - X=setups_evals[parameter_names].to_numpy(), - Y=setups_evals[performance_column].to_numpy(), - n_trees=n_trees, - ) - for idx, pname in enumerate(parameter_names): - try: - fanova_results.append( - { - "hyperparameter": pname.split(".")[-1], - "fanova": evaluator.quantify_importance([idx])[(idx,)]["individual importance"], - } - ) - except RuntimeError as e: - # functional ANOVA sometimes crashes with a RuntimeError, e.g., on tasks where the performance is constant - # for all configurations (there is no variance). We will skip these tasks (like the authors did in the - # paper). 
- print("Task %d error: %s" % (task_id, e)) - continue - -# transform ``fanova_results`` from a list of dicts into a DataFrame -fanova_results = pd.DataFrame(fanova_results) - -############################################################################## -# make the boxplot of the variance contribution. Obviously, we can also use -# this data to make the Nemenyi plot, but this relies on the rather complex -# ``Orange`` dependency (``pip install Orange3``). For the complete example, -# the reader is referred to the more elaborate script (referred to earlier) -fig, ax = plt.subplots() -sns.boxplot(x="hyperparameter", y="fanova", data=fanova_results, ax=ax) -ax.set_xticklabels(ax.get_xticklabels(), rotation=45, ha="right") -ax.set_ylabel("Variance Contribution") -ax.set_xlabel(None) -plt.tight_layout() -plt.show() diff --git a/examples/40_paper/2018_kdd_rijn_example_LOCAL_34122.py b/examples/40_paper/2018_kdd_rijn_example_LOCAL_34122.py deleted file mode 100644 index 2230ade2d..000000000 --- a/examples/40_paper/2018_kdd_rijn_example_LOCAL_34122.py +++ /dev/null @@ -1,188 +0,0 @@ -""" -van Rijn and Hutter (2018) -========================== - -A tutorial on how to reproduce the paper *Hyperparameter Importance Across Datasets*. - -Example Deprecation Warning! -~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -This example is not supported anymore by the OpenML-Python developers. The example is kept for reference purposes but not tested anymore. - -Publication -~~~~~~~~~~~ - -| Hyperparameter importance across datasets -| Jan N. van Rijn and Frank Hutter -| In *Proceedings of the 24th ACM SIGKDD International Conference on Knowledge Discovery & Data Mining*, 2018 -| Available at https://dl.acm.org/doi/10.1145/3219819.3220058 - -Requirements -~~~~~~~~~~~~ - -This is a Unix-only tutorial, as the requirements can not be satisfied on a Windows machine (Untested on other -systems). - -The following Python packages are required: - -pip install openml[examples,docs] fanova ConfigSpace<1.0 -""" - -# License: BSD 3-Clause - -import sys - -if sys.platform == "win32": - print( - "The pyrfr library (requirement of fanova) can currently not be installed on Windows systems" - ) - sys.exit() - -# DEPRECATED EXAMPLE -- Avoid running this code in our CI/CD pipeline -print("This example is deprecated, remove this code to use it manually.") -sys.exit() - -import json - -import fanova -import matplotlib.pyplot as plt -import pandas as pd -import seaborn as sns - -import openml - -############################################################################## -# With the advent of automated machine learning, automated hyperparameter -# optimization methods are by now routinely used in data mining. However, this -# progress is not yet matched by equal progress on automatic analyses that -# yield information beyond performance-optimizing hyperparameter settings. -# In this example, we aim to answer the following two questions: Given an -# algorithm, what are generally its most important hyperparameters? -# -# This work is carried out on the OpenML-100 benchmark suite, which can be -# obtained by ``openml.study.get_suite('OpenML100')``. In this example, we -# conduct the experiment on the Support Vector Machine (``flow_id=7707``) -# with specific kernel (we will perform a post-process filter operation for -# this). We should set some other experimental parameters (number of results -# per task, evaluation measure and the number of trees of the internal -# functional Anova) before the fun can begin. 
-# -# Note that we simplify the example in several ways: -# -# 1) We only consider numerical hyperparameters -# 2) We consider all hyperparameters that are numerical (in reality, some -# hyperparameters might be inactive (e.g., ``degree``) or irrelevant -# (e.g., ``random_state``) -# 3) We assume all hyperparameters to be on uniform scale -# -# Any difference in conclusion between the actual paper and the presented -# results is most likely due to one of these simplifications. For example, -# the hyperparameter C looks rather insignificant, whereas it is quite -# important when it is put on a log-scale. All these simplifications can be -# addressed by defining a ConfigSpace. For a more elaborated example that uses -# this, please see: -# https://github.com/janvanrijn/openml-pimp/blob/d0a14f3eb480f2a90008889f00041bdccc7b9265/examples/plot/plot_fanova_aggregates.py - -suite = openml.study.get_suite("OpenML100") -flow_id = 7707 -parameter_filters = {"sklearn.svm.classes.SVC(17)_kernel": "sigmoid"} -evaluation_measure = "predictive_accuracy" -limit_per_task = 500 -limit_nr_tasks = 15 -n_trees = 16 - -fanova_results = [] -# we will obtain all results from OpenML per task. Practice has shown that this places the bottleneck on the -# communication with OpenML, and for iterated experimenting it is better to cache the results in a local file. -for idx, task_id in enumerate(suite.tasks): - if limit_nr_tasks is not None and idx >= limit_nr_tasks: - continue - print( - "Starting with task %d (%d/%d)" - % (task_id, idx + 1, len(suite.tasks) if limit_nr_tasks is None else limit_nr_tasks) - ) - # note that we explicitly only include tasks from the benchmark suite that was specified (as per the for-loop) - evals = openml.evaluations.list_evaluations_setups( - evaluation_measure, - flows=[flow_id], - tasks=[task_id], - size=limit_per_task, - ) - - performance_column = "value" - # make a DataFrame consisting of all hyperparameters (which is a dict in setup['parameters']) and the performance - # value (in setup['value']). The following line looks a bit complicated, but combines 2 tasks: a) combine - # hyperparameters and performance data in a single dict, b) cast hyperparameter values to the appropriate format - # Note that the ``json.loads(...)`` requires the content to be in JSON format, which is only the case for - # scikit-learn setups (and even there some legacy setups might violate this requirement). It will work for the - # setups that belong to the flows embedded in this example though. - try: - setups_evals = pd.DataFrame( - [ - dict( - **{name: json.loads(value) for name, value in setup["parameters"].items()}, - **{performance_column: setup[performance_column]}, - ) - for _, setup in evals.iterrows() - ] - ) - except json.decoder.JSONDecodeError as e: - print("Task %d error: %s" % (task_id, e)) - continue - # apply our filters, to have only the setups that comply to the hyperparameters we want - for filter_key, filter_value in parameter_filters.items(): - setups_evals = setups_evals[setups_evals[filter_key] == filter_value] - # in this simplified example, we only display numerical and float hyperparameters. For categorical hyperparameters, - # the fanova library needs to be informed by using a configspace object. - setups_evals = setups_evals.select_dtypes(include=["int64", "float64"]) - # drop rows with unique values. These are by definition not an interesting hyperparameter, e.g., ``axis``, - # ``verbose``. 
- setups_evals = setups_evals[ - [ - c - for c in list(setups_evals) - if len(setups_evals[c].unique()) > 1 or c == performance_column - ] - ] - # We are done with processing ``setups_evals``. Note that we still might have some irrelevant hyperparameters, e.g., - # ``random_state``. We have dropped some relevant hyperparameters, i.e., several categoricals. Let's check it out: - - # determine x values to pass to fanova library - parameter_names = [ - pname for pname in setups_evals.columns.to_numpy() if pname != performance_column - ] - evaluator = fanova.fanova.fANOVA( - X=setups_evals[parameter_names].to_numpy(), - Y=setups_evals[performance_column].to_numpy(), - n_trees=n_trees, - ) - for idx, pname in enumerate(parameter_names): - try: - fanova_results.append( - { - "hyperparameter": pname.split(".")[-1], - "fanova": evaluator.quantify_importance([idx])[(idx,)]["individual importance"], - } - ) - except RuntimeError as e: - # functional ANOVA sometimes crashes with a RuntimeError, e.g., on tasks where the performance is constant - # for all configurations (there is no variance). We will skip these tasks (like the authors did in the - # paper). - print("Task %d error: %s" % (task_id, e)) - continue - -# transform ``fanova_results`` from a list of dicts into a DataFrame -fanova_results = pd.DataFrame(fanova_results) - -############################################################################## -# make the boxplot of the variance contribution. Obviously, we can also use -# this data to make the Nemenyi plot, but this relies on the rather complex -# ``Orange`` dependency (``pip install Orange3``). For the complete example, -# the reader is referred to the more elaborate script (referred to earlier) -fig, ax = plt.subplots() -sns.boxplot(x="hyperparameter", y="fanova", data=fanova_results, ax=ax) -ax.set_xticklabels(ax.get_xticklabels(), rotation=45, ha="right") -ax.set_ylabel("Variance Contribution") -ax.set_xlabel(None) -plt.tight_layout() -plt.show() diff --git a/examples/40_paper/2018_kdd_rijn_example_LOCAL_39115.py b/examples/40_paper/2018_kdd_rijn_example_LOCAL_39115.py deleted file mode 100644 index 2230ade2d..000000000 --- a/examples/40_paper/2018_kdd_rijn_example_LOCAL_39115.py +++ /dev/null @@ -1,188 +0,0 @@ -""" -van Rijn and Hutter (2018) -========================== - -A tutorial on how to reproduce the paper *Hyperparameter Importance Across Datasets*. - -Example Deprecation Warning! -~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -This example is not supported anymore by the OpenML-Python developers. The example is kept for reference purposes but not tested anymore. - -Publication -~~~~~~~~~~~ - -| Hyperparameter importance across datasets -| Jan N. van Rijn and Frank Hutter -| In *Proceedings of the 24th ACM SIGKDD International Conference on Knowledge Discovery & Data Mining*, 2018 -| Available at https://dl.acm.org/doi/10.1145/3219819.3220058 - -Requirements -~~~~~~~~~~~~ - -This is a Unix-only tutorial, as the requirements can not be satisfied on a Windows machine (Untested on other -systems). 
- -The following Python packages are required: - -pip install openml[examples,docs] fanova ConfigSpace<1.0 -""" - -# License: BSD 3-Clause - -import sys - -if sys.platform == "win32": - print( - "The pyrfr library (requirement of fanova) can currently not be installed on Windows systems" - ) - sys.exit() - -# DEPRECATED EXAMPLE -- Avoid running this code in our CI/CD pipeline -print("This example is deprecated, remove this code to use it manually.") -sys.exit() - -import json - -import fanova -import matplotlib.pyplot as plt -import pandas as pd -import seaborn as sns - -import openml - -############################################################################## -# With the advent of automated machine learning, automated hyperparameter -# optimization methods are by now routinely used in data mining. However, this -# progress is not yet matched by equal progress on automatic analyses that -# yield information beyond performance-optimizing hyperparameter settings. -# In this example, we aim to answer the following two questions: Given an -# algorithm, what are generally its most important hyperparameters? -# -# This work is carried out on the OpenML-100 benchmark suite, which can be -# obtained by ``openml.study.get_suite('OpenML100')``. In this example, we -# conduct the experiment on the Support Vector Machine (``flow_id=7707``) -# with specific kernel (we will perform a post-process filter operation for -# this). We should set some other experimental parameters (number of results -# per task, evaluation measure and the number of trees of the internal -# functional Anova) before the fun can begin. -# -# Note that we simplify the example in several ways: -# -# 1) We only consider numerical hyperparameters -# 2) We consider all hyperparameters that are numerical (in reality, some -# hyperparameters might be inactive (e.g., ``degree``) or irrelevant -# (e.g., ``random_state``) -# 3) We assume all hyperparameters to be on uniform scale -# -# Any difference in conclusion between the actual paper and the presented -# results is most likely due to one of these simplifications. For example, -# the hyperparameter C looks rather insignificant, whereas it is quite -# important when it is put on a log-scale. All these simplifications can be -# addressed by defining a ConfigSpace. For a more elaborated example that uses -# this, please see: -# https://github.com/janvanrijn/openml-pimp/blob/d0a14f3eb480f2a90008889f00041bdccc7b9265/examples/plot/plot_fanova_aggregates.py - -suite = openml.study.get_suite("OpenML100") -flow_id = 7707 -parameter_filters = {"sklearn.svm.classes.SVC(17)_kernel": "sigmoid"} -evaluation_measure = "predictive_accuracy" -limit_per_task = 500 -limit_nr_tasks = 15 -n_trees = 16 - -fanova_results = [] -# we will obtain all results from OpenML per task. Practice has shown that this places the bottleneck on the -# communication with OpenML, and for iterated experimenting it is better to cache the results in a local file. 
-for idx, task_id in enumerate(suite.tasks): - if limit_nr_tasks is not None and idx >= limit_nr_tasks: - continue - print( - "Starting with task %d (%d/%d)" - % (task_id, idx + 1, len(suite.tasks) if limit_nr_tasks is None else limit_nr_tasks) - ) - # note that we explicitly only include tasks from the benchmark suite that was specified (as per the for-loop) - evals = openml.evaluations.list_evaluations_setups( - evaluation_measure, - flows=[flow_id], - tasks=[task_id], - size=limit_per_task, - ) - - performance_column = "value" - # make a DataFrame consisting of all hyperparameters (which is a dict in setup['parameters']) and the performance - # value (in setup['value']). The following line looks a bit complicated, but combines 2 tasks: a) combine - # hyperparameters and performance data in a single dict, b) cast hyperparameter values to the appropriate format - # Note that the ``json.loads(...)`` requires the content to be in JSON format, which is only the case for - # scikit-learn setups (and even there some legacy setups might violate this requirement). It will work for the - # setups that belong to the flows embedded in this example though. - try: - setups_evals = pd.DataFrame( - [ - dict( - **{name: json.loads(value) for name, value in setup["parameters"].items()}, - **{performance_column: setup[performance_column]}, - ) - for _, setup in evals.iterrows() - ] - ) - except json.decoder.JSONDecodeError as e: - print("Task %d error: %s" % (task_id, e)) - continue - # apply our filters, to have only the setups that comply to the hyperparameters we want - for filter_key, filter_value in parameter_filters.items(): - setups_evals = setups_evals[setups_evals[filter_key] == filter_value] - # in this simplified example, we only display numerical and float hyperparameters. For categorical hyperparameters, - # the fanova library needs to be informed by using a configspace object. - setups_evals = setups_evals.select_dtypes(include=["int64", "float64"]) - # drop rows with unique values. These are by definition not an interesting hyperparameter, e.g., ``axis``, - # ``verbose``. - setups_evals = setups_evals[ - [ - c - for c in list(setups_evals) - if len(setups_evals[c].unique()) > 1 or c == performance_column - ] - ] - # We are done with processing ``setups_evals``. Note that we still might have some irrelevant hyperparameters, e.g., - # ``random_state``. We have dropped some relevant hyperparameters, i.e., several categoricals. Let's check it out: - - # determine x values to pass to fanova library - parameter_names = [ - pname for pname in setups_evals.columns.to_numpy() if pname != performance_column - ] - evaluator = fanova.fanova.fANOVA( - X=setups_evals[parameter_names].to_numpy(), - Y=setups_evals[performance_column].to_numpy(), - n_trees=n_trees, - ) - for idx, pname in enumerate(parameter_names): - try: - fanova_results.append( - { - "hyperparameter": pname.split(".")[-1], - "fanova": evaluator.quantify_importance([idx])[(idx,)]["individual importance"], - } - ) - except RuntimeError as e: - # functional ANOVA sometimes crashes with a RuntimeError, e.g., on tasks where the performance is constant - # for all configurations (there is no variance). We will skip these tasks (like the authors did in the - # paper). 
- print("Task %d error: %s" % (task_id, e)) - continue - -# transform ``fanova_results`` from a list of dicts into a DataFrame -fanova_results = pd.DataFrame(fanova_results) - -############################################################################## -# make the boxplot of the variance contribution. Obviously, we can also use -# this data to make the Nemenyi plot, but this relies on the rather complex -# ``Orange`` dependency (``pip install Orange3``). For the complete example, -# the reader is referred to the more elaborate script (referred to earlier) -fig, ax = plt.subplots() -sns.boxplot(x="hyperparameter", y="fanova", data=fanova_results, ax=ax) -ax.set_xticklabels(ax.get_xticklabels(), rotation=45, ha="right") -ax.set_ylabel("Variance Contribution") -ax.set_xlabel(None) -plt.tight_layout() -plt.show() diff --git a/examples/40_paper/2018_kdd_rijn_example_REMOTE_34122.py b/examples/40_paper/2018_kdd_rijn_example_REMOTE_34122.py deleted file mode 100644 index 6522013e3..000000000 --- a/examples/40_paper/2018_kdd_rijn_example_REMOTE_34122.py +++ /dev/null @@ -1,188 +0,0 @@ -""" -van Rijn and Hutter (2018) -========================== - -A tutorial on how to reproduce the paper *Hyperparameter Importance Across Datasets*. - -Example Deprecation Warning! -~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -This example is not supported anymore by the OpenML-Python developers. The example is kept for reference purposes but not tested anymore. - -Publication -~~~~~~~~~~~ - -| Hyperparameter importance across datasets -| Jan N. van Rijn and Frank Hutter -| In *Proceedings of the 24th ACM SIGKDD International Conference on Knowledge Discovery & Data Mining*, 2018 -| Available at https://dl.acm.org/doi/10.1145/3219819.3220058 - -Requirements -~~~~~~~~~~~~ - -This is a Unix-only tutorial, as the requirements can not be satisfied on a Windows machine (Untested on other -systems). - -The following Python packages are required: - -pip install openml[examples,docs] fanova ConfigSpace<1.0 -""" - -# License: BSD 3-Clause - -import sys - -if sys.platform == "win32": # noqa - print( - "The pyrfr library (requirement of fanova) can currently not be installed on Windows systems" - ) - exit() - -# DEPRECATED EXAMPLE -- Avoid running this code in our CI/CD pipeline -print("This example is deprecated, remove the `if False` in this code to use it manually.") -if False: - import json - import fanova - import matplotlib.pyplot as plt - import pandas as pd - import seaborn as sns - - import openml - - - ############################################################################## - # With the advent of automated machine learning, automated hyperparameter - # optimization methods are by now routinely used in data mining. However, this - # progress is not yet matched by equal progress on automatic analyses that - # yield information beyond performance-optimizing hyperparameter settings. - # In this example, we aim to answer the following two questions: Given an - # algorithm, what are generally its most important hyperparameters? - # - # This work is carried out on the OpenML-100 benchmark suite, which can be - # obtained by ``openml.study.get_suite('OpenML100')``. In this example, we - # conduct the experiment on the Support Vector Machine (``flow_id=7707``) - # with specific kernel (we will perform a post-process filter operation for - # this). 
We should set some other experimental parameters (number of results - # per task, evaluation measure and the number of trees of the internal - # functional Anova) before the fun can begin. - # - # Note that we simplify the example in several ways: - # - # 1) We only consider numerical hyperparameters - # 2) We consider all hyperparameters that are numerical (in reality, some - # hyperparameters might be inactive (e.g., ``degree``) or irrelevant - # (e.g., ``random_state``) - # 3) We assume all hyperparameters to be on uniform scale - # - # Any difference in conclusion between the actual paper and the presented - # results is most likely due to one of these simplifications. For example, - # the hyperparameter C looks rather insignificant, whereas it is quite - # important when it is put on a log-scale. All these simplifications can be - # addressed by defining a ConfigSpace. For a more elaborated example that uses - # this, please see: - # https://github.com/janvanrijn/openml-pimp/blob/d0a14f3eb480f2a90008889f00041bdccc7b9265/examples/plot/plot_fanova_aggregates.py # noqa F401 - - suite = openml.study.get_suite("OpenML100") - flow_id = 7707 - parameter_filters = {"sklearn.svm.classes.SVC(17)_kernel": "sigmoid"} - evaluation_measure = "predictive_accuracy" - limit_per_task = 500 - limit_nr_tasks = 15 - n_trees = 16 - - fanova_results = [] - # we will obtain all results from OpenML per task. Practice has shown that this places the bottleneck on the - # communication with OpenML, and for iterated experimenting it is better to cache the results in a local file. - for idx, task_id in enumerate(suite.tasks): - if limit_nr_tasks is not None and idx >= limit_nr_tasks: - continue - print( - "Starting with task %d (%d/%d)" - % (task_id, idx + 1, len(suite.tasks) if limit_nr_tasks is None else limit_nr_tasks) - ) - # note that we explicitly only include tasks from the benchmark suite that was specified (as per the for-loop) - evals = openml.evaluations.list_evaluations_setups( - evaluation_measure, - flows=[flow_id], - tasks=[task_id], - size=limit_per_task, - output_format="dataframe", - ) - - performance_column = "value" - # make a DataFrame consisting of all hyperparameters (which is a dict in setup['parameters']) and the performance - # value (in setup['value']). The following line looks a bit complicated, but combines 2 tasks: a) combine - # hyperparameters and performance data in a single dict, b) cast hyperparameter values to the appropriate format - # Note that the ``json.loads(...)`` requires the content to be in JSON format, which is only the case for - # scikit-learn setups (and even there some legacy setups might violate this requirement). It will work for the - # setups that belong to the flows embedded in this example though. - try: - setups_evals = pd.DataFrame( - [ - dict( - **{name: json.loads(value) for name, value in setup["parameters"].items()}, - **{performance_column: setup[performance_column]} - ) - for _, setup in evals.iterrows() - ] - ) - except json.decoder.JSONDecodeError as e: - print("Task %d error: %s" % (task_id, e)) - continue - # apply our filters, to have only the setups that comply to the hyperparameters we want - for filter_key, filter_value in parameter_filters.items(): - setups_evals = setups_evals[setups_evals[filter_key] == filter_value] - # in this simplified example, we only display numerical and float hyperparameters. For categorical hyperparameters, - # the fanova library needs to be informed by using a configspace object. 
- setups_evals = setups_evals.select_dtypes(include=["int64", "float64"]) - # drop rows with unique values. These are by definition not an interesting hyperparameter, e.g., ``axis``, - # ``verbose``. - setups_evals = setups_evals[ - [ - c - for c in list(setups_evals) - if len(setups_evals[c].unique()) > 1 or c == performance_column - ] - ] - # We are done with processing ``setups_evals``. Note that we still might have some irrelevant hyperparameters, e.g., - # ``random_state``. We have dropped some relevant hyperparameters, i.e., several categoricals. Let's check it out: - - # determine x values to pass to fanova library - parameter_names = [ - pname for pname in setups_evals.columns.to_numpy() if pname != performance_column - ] - evaluator = fanova.fanova.fANOVA( - X=setups_evals[parameter_names].to_numpy(), - Y=setups_evals[performance_column].to_numpy(), - n_trees=n_trees, - ) - for idx, pname in enumerate(parameter_names): - try: - fanova_results.append( - { - "hyperparameter": pname.split(".")[-1], - "fanova": evaluator.quantify_importance([idx])[(idx,)]["individual importance"], - } - ) - except RuntimeError as e: - # functional ANOVA sometimes crashes with a RuntimeError, e.g., on tasks where the performance is constant - # for all configurations (there is no variance). We will skip these tasks (like the authors did in the - # paper). - print("Task %d error: %s" % (task_id, e)) - continue - - # transform ``fanova_results`` from a list of dicts into a DataFrame - fanova_results = pd.DataFrame(fanova_results) - - ############################################################################## - # make the boxplot of the variance contribution. Obviously, we can also use - # this data to make the Nemenyi plot, but this relies on the rather complex - # ``Orange`` dependency (``pip install Orange3``). For the complete example, - # the reader is referred to the more elaborate script (referred to earlier) - fig, ax = plt.subplots() - sns.boxplot(x="hyperparameter", y="fanova", data=fanova_results, ax=ax) - ax.set_xticklabels(ax.get_xticklabels(), rotation=45, ha="right") - ax.set_ylabel("Variance Contribution") - ax.set_xlabel(None) - plt.tight_layout() - plt.show() diff --git a/examples/40_paper/2018_kdd_rijn_example_REMOTE_39115.py b/examples/40_paper/2018_kdd_rijn_example_REMOTE_39115.py deleted file mode 100644 index 6522013e3..000000000 --- a/examples/40_paper/2018_kdd_rijn_example_REMOTE_39115.py +++ /dev/null @@ -1,188 +0,0 @@ -""" -van Rijn and Hutter (2018) -========================== - -A tutorial on how to reproduce the paper *Hyperparameter Importance Across Datasets*. - -Example Deprecation Warning! -~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -This example is not supported anymore by the OpenML-Python developers. The example is kept for reference purposes but not tested anymore. - -Publication -~~~~~~~~~~~ - -| Hyperparameter importance across datasets -| Jan N. van Rijn and Frank Hutter -| In *Proceedings of the 24th ACM SIGKDD International Conference on Knowledge Discovery & Data Mining*, 2018 -| Available at https://dl.acm.org/doi/10.1145/3219819.3220058 - -Requirements -~~~~~~~~~~~~ - -This is a Unix-only tutorial, as the requirements can not be satisfied on a Windows machine (Untested on other -systems). 
- -The following Python packages are required: - -pip install openml[examples,docs] fanova ConfigSpace<1.0 -""" - -# License: BSD 3-Clause - -import sys - -if sys.platform == "win32": # noqa - print( - "The pyrfr library (requirement of fanova) can currently not be installed on Windows systems" - ) - exit() - -# DEPRECATED EXAMPLE -- Avoid running this code in our CI/CD pipeline -print("This example is deprecated, remove the `if False` in this code to use it manually.") -if False: - import json - import fanova - import matplotlib.pyplot as plt - import pandas as pd - import seaborn as sns - - import openml - - - ############################################################################## - # With the advent of automated machine learning, automated hyperparameter - # optimization methods are by now routinely used in data mining. However, this - # progress is not yet matched by equal progress on automatic analyses that - # yield information beyond performance-optimizing hyperparameter settings. - # In this example, we aim to answer the following two questions: Given an - # algorithm, what are generally its most important hyperparameters? - # - # This work is carried out on the OpenML-100 benchmark suite, which can be - # obtained by ``openml.study.get_suite('OpenML100')``. In this example, we - # conduct the experiment on the Support Vector Machine (``flow_id=7707``) - # with specific kernel (we will perform a post-process filter operation for - # this). We should set some other experimental parameters (number of results - # per task, evaluation measure and the number of trees of the internal - # functional Anova) before the fun can begin. - # - # Note that we simplify the example in several ways: - # - # 1) We only consider numerical hyperparameters - # 2) We consider all hyperparameters that are numerical (in reality, some - # hyperparameters might be inactive (e.g., ``degree``) or irrelevant - # (e.g., ``random_state``) - # 3) We assume all hyperparameters to be on uniform scale - # - # Any difference in conclusion between the actual paper and the presented - # results is most likely due to one of these simplifications. For example, - # the hyperparameter C looks rather insignificant, whereas it is quite - # important when it is put on a log-scale. All these simplifications can be - # addressed by defining a ConfigSpace. For a more elaborated example that uses - # this, please see: - # https://github.com/janvanrijn/openml-pimp/blob/d0a14f3eb480f2a90008889f00041bdccc7b9265/examples/plot/plot_fanova_aggregates.py # noqa F401 - - suite = openml.study.get_suite("OpenML100") - flow_id = 7707 - parameter_filters = {"sklearn.svm.classes.SVC(17)_kernel": "sigmoid"} - evaluation_measure = "predictive_accuracy" - limit_per_task = 500 - limit_nr_tasks = 15 - n_trees = 16 - - fanova_results = [] - # we will obtain all results from OpenML per task. Practice has shown that this places the bottleneck on the - # communication with OpenML, and for iterated experimenting it is better to cache the results in a local file. 
- for idx, task_id in enumerate(suite.tasks): - if limit_nr_tasks is not None and idx >= limit_nr_tasks: - continue - print( - "Starting with task %d (%d/%d)" - % (task_id, idx + 1, len(suite.tasks) if limit_nr_tasks is None else limit_nr_tasks) - ) - # note that we explicitly only include tasks from the benchmark suite that was specified (as per the for-loop) - evals = openml.evaluations.list_evaluations_setups( - evaluation_measure, - flows=[flow_id], - tasks=[task_id], - size=limit_per_task, - output_format="dataframe", - ) - - performance_column = "value" - # make a DataFrame consisting of all hyperparameters (which is a dict in setup['parameters']) and the performance - # value (in setup['value']). The following line looks a bit complicated, but combines 2 tasks: a) combine - # hyperparameters and performance data in a single dict, b) cast hyperparameter values to the appropriate format - # Note that the ``json.loads(...)`` requires the content to be in JSON format, which is only the case for - # scikit-learn setups (and even there some legacy setups might violate this requirement). It will work for the - # setups that belong to the flows embedded in this example though. - try: - setups_evals = pd.DataFrame( - [ - dict( - **{name: json.loads(value) for name, value in setup["parameters"].items()}, - **{performance_column: setup[performance_column]} - ) - for _, setup in evals.iterrows() - ] - ) - except json.decoder.JSONDecodeError as e: - print("Task %d error: %s" % (task_id, e)) - continue - # apply our filters, to have only the setups that comply to the hyperparameters we want - for filter_key, filter_value in parameter_filters.items(): - setups_evals = setups_evals[setups_evals[filter_key] == filter_value] - # in this simplified example, we only display numerical and float hyperparameters. For categorical hyperparameters, - # the fanova library needs to be informed by using a configspace object. - setups_evals = setups_evals.select_dtypes(include=["int64", "float64"]) - # drop rows with unique values. These are by definition not an interesting hyperparameter, e.g., ``axis``, - # ``verbose``. - setups_evals = setups_evals[ - [ - c - for c in list(setups_evals) - if len(setups_evals[c].unique()) > 1 or c == performance_column - ] - ] - # We are done with processing ``setups_evals``. Note that we still might have some irrelevant hyperparameters, e.g., - # ``random_state``. We have dropped some relevant hyperparameters, i.e., several categoricals. Let's check it out: - - # determine x values to pass to fanova library - parameter_names = [ - pname for pname in setups_evals.columns.to_numpy() if pname != performance_column - ] - evaluator = fanova.fanova.fANOVA( - X=setups_evals[parameter_names].to_numpy(), - Y=setups_evals[performance_column].to_numpy(), - n_trees=n_trees, - ) - for idx, pname in enumerate(parameter_names): - try: - fanova_results.append( - { - "hyperparameter": pname.split(".")[-1], - "fanova": evaluator.quantify_importance([idx])[(idx,)]["individual importance"], - } - ) - except RuntimeError as e: - # functional ANOVA sometimes crashes with a RuntimeError, e.g., on tasks where the performance is constant - # for all configurations (there is no variance). We will skip these tasks (like the authors did in the - # paper). 
- print("Task %d error: %s" % (task_id, e)) - continue - - # transform ``fanova_results`` from a list of dicts into a DataFrame - fanova_results = pd.DataFrame(fanova_results) - - ############################################################################## - # make the boxplot of the variance contribution. Obviously, we can also use - # this data to make the Nemenyi plot, but this relies on the rather complex - # ``Orange`` dependency (``pip install Orange3``). For the complete example, - # the reader is referred to the more elaborate script (referred to earlier) - fig, ax = plt.subplots() - sns.boxplot(x="hyperparameter", y="fanova", data=fanova_results, ax=ax) - ax.set_xticklabels(ax.get_xticklabels(), rotation=45, ha="right") - ax.set_ylabel("Variance Contribution") - ax.set_xlabel(None) - plt.tight_layout() - plt.show() From ddda71113de125ef7bd5ecf8b22354127bf23421 Mon Sep 17 00:00:00 2001 From: LennartPurucker Date: Mon, 16 Jun 2025 10:47:45 +0200 Subject: [PATCH 07/23] fix: ensure dtypes as in original code --- openml/datasets/dataset.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/openml/datasets/dataset.py b/openml/datasets/dataset.py index 91254ebd7..3712c99ba 100644 --- a/openml/datasets/dataset.py +++ b/openml/datasets/dataset.py @@ -463,7 +463,7 @@ def _parse_data_from_arff( # noqa: C901, PLR0912, PLR0915 data = self._get_arff(self.format) except OSError as e: logger.critical( - f"Please check that the data file {arff_file_path} is " "there and can be read.", + f"Please check that the data file {arff_file_path} is there and can be read.", ) raise e @@ -716,7 +716,7 @@ def valid_category(cat: Any) -> bool: raw_cat = pd.Categorical(col, ordered=True, categories=filtered_categories) return pd.Series(raw_cat, index=series.index, name=series.name) - def get_data( # noqa: C901, PLR0912 + def get_data( # noqa: C901 self, target: list[str] | str | None = None, include_row_id: bool = False, # noqa: FBT001, FBT002 @@ -790,10 +790,10 @@ def get_data( # noqa: C901, PLR0912 y = y.sparse.to_dense() # Since it was sparsified, the y column may not be of the correct dtype, hence we check - # if it was categorical and convert it to and integer if needs be. + # if it was categorical and convert it to an integer if needs be. category_names = data.columns[categorical_mask] - if target_name in category_names: - y = y.astype(int) + target_dtype = int if target_name in category_names else float + y = y.astype(target_dtype) # Finally, remove the target from the list of attributes and categorical mask target_index = attribute_names.index(target_name) From 063a8e6841fe0ece37b365ff07dfce74aaf3ad5a Mon Sep 17 00:00:00 2001 From: LennartPurucker Date: Mon, 16 Jun 2025 12:03:05 +0200 Subject: [PATCH 08/23] fix: remove incorrect parsing of sparse pandas --- openml/datasets/dataset.py | 9 --------- tests/test_datasets/test_dataset.py | 4 ++-- 2 files changed, 2 insertions(+), 11 deletions(-) diff --git a/openml/datasets/dataset.py b/openml/datasets/dataset.py index 3712c99ba..fa83d2b8a 100644 --- a/openml/datasets/dataset.py +++ b/openml/datasets/dataset.py @@ -786,15 +786,6 @@ def get_data( # noqa: C901 x = data.drop(columns=[target_name]) y = data[target_name].squeeze() - if isinstance(y.dtype, pd.SparseDtype): - y = y.sparse.to_dense() - - # Since it was sparsified, the y column may not be of the correct dtype, hence we check - # if it was categorical and convert it to an integer if needs be. 
- category_names = data.columns[categorical_mask] - target_dtype = int if target_name in category_names else float - y = y.astype(target_dtype) - # Finally, remove the target from the list of attributes and categorical mask target_index = attribute_names.index(target_name) categorical_mask.pop(target_index) diff --git a/tests/test_datasets/test_dataset.py b/tests/test_datasets/test_dataset.py index 8afc1fb91..d132c4233 100644 --- a/tests/test_datasets/test_dataset.py +++ b/tests/test_datasets/test_dataset.py @@ -373,14 +373,14 @@ def test_get_sparse_dataset_rowid_and_ignore_and_target(self): # TODO: re-add row_id and ignore attributes self.sparse_dataset.ignore_attribute = ["V256"] self.sparse_dataset.row_id_attribute = ["V512"] - # TODO(eddiebergman): Will break from dataset_format removal X, y, categorical, _ = self.sparse_dataset.get_data( target="class", include_row_id=False, include_ignore_attribute=False, ) assert all(dtype == pd.SparseDtype(np.float32, fill_value=0.0) for dtype in X.dtypes) - assert y.dtype in [np.int32, np.int64] + # array format returned dense, but now we only return sparse and let the user handle it. + assert isinstance(y.dtypes, pd.SparseDtype) assert X.shape == (600, 19998) assert len(categorical) == 19998 From 2e6c4c770074a03264670ef74a031f33c5ff073b Mon Sep 17 00:00:00 2001 From: LennartPurucker Date: Mon, 16 Jun 2025 12:03:48 +0200 Subject: [PATCH 09/23] fix: make sklearn tests work with pandas --- .../test_evaluations_example.py | 1 - .../test_sklearn_extension.py | 78 ++++++++++++------- 2 files changed, 51 insertions(+), 28 deletions(-) diff --git a/tests/test_evaluations/test_evaluations_example.py b/tests/test_evaluations/test_evaluations_example.py index ba9b46216..a9ad7e8c1 100644 --- a/tests/test_evaluations/test_evaluations_example.py +++ b/tests/test_evaluations/test_evaluations_example.py @@ -24,7 +24,6 @@ def test_example_python_paper(self): "predictive_accuracy", flows=[8353], tasks=[6], - # output_format="dataframe", ... This was removed parameters_in_separate_columns=True, ) # Choose an SVM flow, for example 8353, and a task. diff --git a/tests/test_extensions/test_sklearn_extension/test_sklearn_extension.py b/tests/test_extensions/test_sklearn_extension/test_sklearn_extension.py index 242f6793c..dec8d9e00 100644 --- a/tests/test_extensions/test_sklearn_extension/test_sklearn_extension.py +++ b/tests/test_extensions/test_sklearn_extension/test_sklearn_extension.py @@ -61,6 +61,28 @@ def __init__(self, boolean, integer, floating_point_value): def fit(self, X, y): pass +def _get_sklearn_preprocessing(): + from sklearn.compose import ColumnTransformer, make_column_selector + from sklearn.preprocessing import OrdinalEncoder + + return [ + ( + "cat_handling", + ColumnTransformer( + transformers=[ + ( + "cat", + OrdinalEncoder( + handle_unknown="use_encoded_value", unknown_value=np.nan + ), + make_column_selector(dtype_include=["object", "category"]), + ) + ], + remainder="passthrough", + ), + ), + ("imp", SimpleImputer())] + class TestSklearnExtensionFlowFunctions(TestBase): # Splitting not helpful, these test's don't rely on the server and take less @@ -735,7 +757,7 @@ def test_serialize_feature_union(self): ) fixture_structure = { fixture_name: [], - f"sklearn.preprocessing.{module_name_encoder}." 
"OneHotEncoder": ["ohe"], + f"sklearn.preprocessing.{module_name_encoder}.OneHotEncoder": ["ohe"], f"sklearn.preprocessing.{scaler_name}.StandardScaler": ["scaler"], } assert serialization.name == fixture_name @@ -862,13 +884,13 @@ def test_serialize_complex_flow(self): weight_name, tree_name, ) - pipeline_name = "sklearn.pipeline.Pipeline(ohe={},scaler={}," "boosting={})".format( + pipeline_name = "sklearn.pipeline.Pipeline(ohe={},scaler={},boosting={})".format( ohe_name, scaler_name, boosting_name, ) fixture_name = ( - "sklearn.model_selection._search.RandomizedSearchCV" "(estimator=%s)" % pipeline_name + "sklearn.model_selection._search.RandomizedSearchCV(estimator=%s)" % pipeline_name ) fixture_structure = { ohe_name: ["estimator", "ohe"], @@ -1235,7 +1257,7 @@ def test_error_on_adding_component_multiple_times_to_flow(self): fu = sklearn.pipeline.FeatureUnion((("pca1", pca), ("pca2", pca2))) fixture = ( - "Found a second occurence of component .*.PCA when trying " "to serialize FeatureUnion" + "Found a second occurence of component .*.PCA when trying to serialize FeatureUnion" ) with pytest.raises(ValueError, match=fixture): self.extension.model_to_flow(fu) @@ -1737,13 +1759,13 @@ def test_run_model_on_fold_classification_1_array(self): X, y = task.get_X_and_y() train_indices, test_indices = task.get_train_test_split_indices(repeat=0, fold=0, sample=0) - X_train = X[train_indices] - y_train = y[train_indices] - X_test = X[test_indices] - y_test = y[test_indices] + X_train = X.iloc[train_indices] + y_train = y.iloc[train_indices] + X_test = X.iloc[test_indices] + y_test = y.iloc[test_indices] pipeline = sklearn.pipeline.Pipeline( - steps=[("imp", SimpleImputer()), ("clf", sklearn.tree.DecisionTreeClassifier())], + steps=[*_get_sklearn_preprocessing(), ("clf", sklearn.tree.DecisionTreeClassifier())], ) # TODO add some mocking here to actually test the innards of this function, too! res = self.extension._run_model_on_fold( @@ -1875,14 +1897,16 @@ def test_run_model_on_fold_classification_2(self): X, y = task.get_X_and_y() train_indices, test_indices = task.get_train_test_split_indices(repeat=0, fold=0, sample=0) - X_train = X[train_indices] - y_train = y[train_indices] - X_test = X[test_indices] - y_test = y[test_indices] + X_train = X.iloc[train_indices] + y_train = y.iloc[train_indices] + X_test = X.iloc[test_indices] + y_test = y.iloc[test_indices] pipeline = sklearn.model_selection.GridSearchCV( - sklearn.tree.DecisionTreeClassifier(), - {"max_depth": [1, 2]}, + sklearn.pipeline.Pipeline( + steps=[*_get_sklearn_preprocessing(), ("clf", sklearn.tree.DecisionTreeClassifier())], + ), + {"clf__max_depth": [1, 2]}, ) # TODO add some mocking here to actually test the innards of this function, too! 
res = self.extension._run_model_on_fold( @@ -1931,7 +1955,7 @@ class HardNaiveBayes(sklearn.naive_bayes.GaussianNB): # class for testing a naive bayes classifier that does not allow soft # predictions def predict_proba(*args, **kwargs): - raise AttributeError("predict_proba is not available when " "probability=False") + raise AttributeError("predict_proba is not available when probability=False") # task 1 (test server) is important: it is a task with an unused class tasks = [ @@ -1950,17 +1974,17 @@ def predict_proba(*args, **kwargs): fold=0, sample=0, ) - X_train = X[train_indices] - y_train = y[train_indices] - X_test = X[test_indices] + X_train = X.iloc[train_indices] + y_train = y.iloc[train_indices] + X_test = X.iloc[test_indices] clf1 = sklearn.pipeline.Pipeline( steps=[ - ("imputer", SimpleImputer()), + *_get_sklearn_preprocessing(), ("estimator", sklearn.naive_bayes.GaussianNB()), ], ) clf2 = sklearn.pipeline.Pipeline( - steps=[("imputer", SimpleImputer()), ("estimator", HardNaiveBayes())], + steps=[*_get_sklearn_preprocessing(), ("estimator", HardNaiveBayes())], ) pred_1, proba_1, _, _ = self.extension._run_model_on_fold( @@ -2005,10 +2029,10 @@ def test_run_model_on_fold_regression(self): X, y = task.get_X_and_y() train_indices, test_indices = task.get_train_test_split_indices(repeat=0, fold=0, sample=0) - X_train = X[train_indices] - y_train = y[train_indices] - X_test = X[test_indices] - y_test = y[test_indices] + X_train = X.iloc[train_indices] + y_train = y.iloc[train_indices] + X_test = X.iloc[test_indices] + y_test = y.iloc[test_indices] pipeline = sklearn.pipeline.Pipeline( steps=[("imp", SimpleImputer()), ("clf", sklearn.tree.DecisionTreeRegressor())], @@ -2059,7 +2083,7 @@ def test_run_model_on_fold_clustering(self): X = task.get_X() pipeline = sklearn.pipeline.Pipeline( - steps=[("imp", SimpleImputer()), ("clf", sklearn.cluster.KMeans())], + steps=[*_get_sklearn_preprocessing(), ("clf", sklearn.cluster.KMeans())], ) # TODO add some mocking here to actually test the innards of this function, too! 
res = self.extension._run_model_on_fold( @@ -2115,7 +2139,7 @@ def test__extract_trace_data(self): X, y = task.get_X_and_y() with warnings.catch_warnings(): warnings.simplefilter("ignore") - clf.fit(X[train], y[train]) + clf.fit(X.iloc[train], y.iloc[train]) # check num layers of MLP assert clf.best_estimator_.hidden_layer_sizes in param_grid["hidden_layer_sizes"] From 12dedb046814742535428fc0dd030c13c7ee1724 Mon Sep 17 00:00:00 2001 From: LennartPurucker Date: Mon, 16 Jun 2025 14:52:31 +0200 Subject: [PATCH 10/23] fix: fix listing calls and test for utils --- openml/datasets/functions.py | 9 +++++++-- openml/evaluations/functions.py | 6 +++++- openml/flows/functions.py | 18 ++++++++++++------ openml/runs/functions.py | 20 +++++++++++--------- openml/setups/functions.py | 12 ++++++++---- openml/study/functions.py | 11 ++++++++--- openml/tasks/functions.py | 7 +++++-- openml/utils.py | 5 +++-- tests/test_utils/test_utils.py | 12 ++++-------- 9 files changed, 63 insertions(+), 37 deletions(-) diff --git a/openml/datasets/functions.py b/openml/datasets/functions.py index b2b5c6ea5..ea7266c62 100644 --- a/openml/datasets/functions.py +++ b/openml/datasets/functions.py @@ -73,7 +73,7 @@ def list_qualities() -> list[str]: raise ValueError('Error in return XML, does not contain "oml:data_qualities_list"') if not isinstance(qualities["oml:data_qualities_list"]["oml:quality"], list): - raise TypeError("Error in return XML, does not contain " '"oml:quality" as a list') + raise TypeError('Error in return XML, does not contain "oml:quality" as a list') return qualities["oml:data_qualities_list"]["oml:quality"] @@ -179,7 +179,12 @@ def _list_datasets( ------- datasets : dataframe """ - api_call = f"data/list/limit/{limit}/offset/{offset}" + api_call = "data/list" + + if limit is not None: + api_call += f"/limit/{limit}" + if offset is not None: + api_call += f"/offset/{offset}" if kwargs is not None: for operator, value in kwargs.items(): diff --git a/openml/evaluations/functions.py b/openml/evaluations/functions.py index bee326438..f44fe3a93 100644 --- a/openml/evaluations/functions.py +++ b/openml/evaluations/functions.py @@ -194,7 +194,11 @@ def _list_evaluations( # noqa: C901 ------- list of OpenMLEvaluation objects """ - api_call = f"evaluation/list/function/{function}/limit/{limit}/offset/{offset}" + api_call = f"evaluation/list/function/{function}" + if limit is not None: + api_call += f"/limit/{limit}" + if offset is not None: + api_call += f"/offset/{offset}" if kwargs is not None: for operator, value in kwargs.items(): if value is not None: diff --git a/openml/flows/functions.py b/openml/flows/functions.py index 7cb93386b..9906958e5 100644 --- a/openml/flows/functions.py +++ b/openml/flows/functions.py @@ -66,7 +66,7 @@ def _get_cached_flow(fid: int) -> OpenMLFlow: return _create_flow_from_xml(fh.read()) except OSError as e: openml.utils._remove_cache_dir_for_id(FLOWS_CACHE_DIR_NAME, fid_cache_dir) - raise OpenMLCacheException("Flow file for fid %d not " "cached" % fid) from e + raise OpenMLCacheException("Flow file for fid %d not cached" % fid) from e @openml.utils.thread_safe_if_oslo_installed @@ -190,11 +190,17 @@ def _list_flows(limit: int, offset: int, **kwargs: Any) -> pd.DataFrame: ------- flows : dataframe """ - api_call = f"flow/list/limit/{limit}/offset/{offset}" + api_call = "flow/list" + + if limit is not None: + api_call += f"/limit/{limit}" + if offset is not None: + api_call += f"/offset/{offset}" if kwargs is not None: for operator, value in kwargs.items(): - api_call += 
f"/{operator}/{value}" + if value is not None: + api_call += f"/{operator}/{value}" return __list_flows(api_call=api_call) @@ -417,11 +423,11 @@ def assert_flows_equal( # noqa: C901, PLR0912, PLR0913, PLR0915 for name in set(attr1.keys()).union(attr2.keys()): if name not in attr1: raise ValueError( - f"Component {name} only available in " "argument2, but not in argument1.", + f"Component {name} only available in argument2, but not in argument1.", ) if name not in attr2: raise ValueError( - f"Component {name} only available in " "argument2, but not in argument1.", + f"Component {name} only available in argument2, but not in argument1.", ) assert_flows_equal( attr1[name], @@ -482,7 +488,7 @@ def assert_flows_equal( # noqa: C901, PLR0912, PLR0913, PLR0915 params2 = set(flow2.parameters_meta_info) if params1 != params2: raise ValueError( - "Parameter list in meta info for parameters differ " "in the two flows.", + "Parameter list in meta info for parameters differ in the two flows.", ) # iterating over the parameter's meta info list for param in params1: diff --git a/openml/runs/functions.py b/openml/runs/functions.py index 92a102ec2..e66af7b15 100644 --- a/openml/runs/functions.py +++ b/openml/runs/functions.py @@ -250,8 +250,7 @@ def run_flow_on_task( # noqa: C901, PLR0912, PLR0915, PLR0913 if isinstance(flow.flow_id, int) and flow_id != flow.flow_id: if flow_id is not False: raise PyOpenMLError( - "Local flow_id does not match server flow_id: " - f"'{flow.flow_id}' vs '{flow_id}'", + f"Local flow_id does not match server flow_id: '{flow.flow_id}' vs '{flow_id}'", ) raise PyOpenMLError( "Flow does not exist on the server, but 'flow.flow_id' is not None." @@ -283,8 +282,7 @@ def run_flow_on_task( # noqa: C901, PLR0912, PLR0915, PLR0913 if flow.extension.check_if_model_fitted(flow.model): warnings.warn( - "The model is already fitted!" - " This might cause inconsistency in comparison of results.", + "The model is already fitted! 
This might cause inconsistency in comparison of results.", RuntimeWarning, stacklevel=2, ) @@ -860,7 +858,7 @@ def obtain_field(xml_obj, fieldname, from_server, cast=None): # type: ignore if not from_server: return None - raise AttributeError("Run XML does not contain required (server) " "field: ", fieldname) + raise AttributeError("Run XML does not contain required (server) field: ", fieldname) run = xmltodict.parse(xml, force_list=["oml:file", "oml:evaluation", "oml:parameter_setting"])[ "oml:run" @@ -920,7 +918,7 @@ def obtain_field(xml_obj, fieldname, from_server, cast=None): # type: ignore sample_evaluations: dict[str, dict[int, dict[int, dict[int, float | Any]]]] = {} if "oml:output_data" not in run: if from_server: - raise ValueError("Run does not contain output_data " "(OpenML server error?)") + raise ValueError("Run does not contain output_data (OpenML server error?)") predictions_url = None else: output_data = run["oml:output_data"] @@ -972,7 +970,7 @@ def obtain_field(xml_obj, fieldname, from_server, cast=None): # type: ignore evaluations[key] = value if "description" not in files and from_server is True: - raise ValueError("No description file for run %d in run " "description XML" % run_id) + raise ValueError("No description file for run %d in run description XML" % run_id) if "predictions" not in files and from_server is True: task = openml.tasks.get_task(task_id) @@ -1100,7 +1098,7 @@ def list_runs( # noqa: PLR0913 return pd.concat(batches) -def _list_runs( # noqa: PLR0913 +def _list_runs( # noqa: PLR0913, C901 limit: int, offset: int, *, @@ -1150,7 +1148,11 @@ def _list_runs( # noqa: PLR0913 dict, or dataframe List of found runs. """ - api_call = f"run/list/limit/{limit}/offset/{offset}" + api_call = "run/list" + if limit is not None: + api_call += f"/limit/{limit}" + if offset is not None: + api_call += f"/offset/{offset}" if id is not None: api_call += "/run/{}".format(",".join([str(int(i)) for i in id])) if task is not None: diff --git a/openml/setups/functions.py b/openml/setups/functions.py index 2ab18e428..cc71418df 100644 --- a/openml/setups/functions.py +++ b/openml/setups/functions.py @@ -192,7 +192,7 @@ def _list_setups( filters which are put into the kwargs. limit : int - listing_offset : int + offset : int setup : list(int), optional flow : int, optional tag : str, optional @@ -201,7 +201,11 @@ def _list_setups( ------- The setups that match the filters, going from id to the OpenMLSetup object. 
""" - api_call = f"setup/list/offset/{offset}/limit/{limit}" + api_call = "setup/list" + if limit is not None: + api_call += f"/limit/{limit}" + if offset is not None: + api_call += f"/offset/{offset}" if setup is not None: api_call += "/setup/{}".format(",".join([str(int(i)) for i in setup])) if flow is not None: @@ -220,12 +224,12 @@ def __list_setups(api_call: str) -> list[OpenMLSetup]: # Minimalistic check if the XML is useful if "oml:setups" not in setups_dict: raise ValueError( - 'Error in return XML, does not contain "oml:setups":' f" {setups_dict!s}", + f'Error in return XML, does not contain "oml:setups": {setups_dict!s}', ) if "@xmlns:oml" not in setups_dict["oml:setups"]: raise ValueError( - "Error in return XML, does not contain " f'"oml:setups"/@xmlns:oml: {setups_dict!s}', + f'Error in return XML, does not contain "oml:setups"/@xmlns:oml: {setups_dict!s}', ) if setups_dict["oml:setups"]["@xmlns:oml"] != openml_uri: diff --git a/openml/study/functions.py b/openml/study/functions.py index fb71272a1..4e16879d7 100644 --- a/openml/study/functions.py +++ b/openml/study/functions.py @@ -298,7 +298,7 @@ def update_study_status(study_id: int, status: str) -> None: """ legal_status = {"active", "deactivated"} if status not in legal_status: - raise ValueError("Illegal status value. " f"Legal values: {legal_status}") + raise ValueError(f"Illegal status value. Legal values: {legal_status}") data = {"study_id": study_id, "status": status} # type: openml._api_calls.DATA_TYPE result_xml = openml._api_calls._perform_api_call("study/status/update", "post", data=data) result = xmltodict.parse(result_xml) @@ -548,10 +548,15 @@ def _list_studies(limit: int, offset: int, **kwargs: Any) -> pd.DataFrame: ------- studies : dataframe """ - api_call = f"study/list/limit/{limit}/offset/{offset}" + api_call = "study/list" + if limit is not None: + api_call += f"/limit/{limit}" + if offset is not None: + api_call += f"/offset/{offset}" if kwargs is not None: for operator, value in kwargs.items(): - api_call += f"/{operator}/{value}" + if value is not None: + api_call += f"/{operator}/{value}" return __list_studies(api_call=api_call) diff --git a/openml/tasks/functions.py b/openml/tasks/functions.py index 497e35c69..25156f2e5 100644 --- a/openml/tasks/functions.py +++ b/openml/tasks/functions.py @@ -176,7 +176,6 @@ def list_tasks( # noqa: PLR0913 """ listing_call = partial( _list_tasks, - size=size, task_type=task_type, tag=tag, data_tag=data_tag, @@ -222,7 +221,11 @@ def _list_tasks( ------- dataframe """ - api_call = f"task/list/limit/{limit}/offset/{offset}" + api_call = "task/list" + if limit is not None: + api_call += f"/limit/{limit}" + if offset is not None: + api_call += f"/offset/{offset}" if task_type is not None: tvalue = task_type.value if isinstance(task_type, TaskType) else task_type api_call += f"/type/{tvalue}" diff --git a/openml/utils.py b/openml/utils.py index d06d626a1..cef9f9d49 100644 --- a/openml/utils.py +++ b/openml/utils.py @@ -271,11 +271,12 @@ def _list_all( # noqa: C901 page = 0 results: list[_SizedT] = [] - LIMIT = limit - BATCH_SIZE_ORIG = batch_size offset = offset if offset is not None else 0 batch_size = batch_size if batch_size is not None else 10_000 + LIMIT = limit + BATCH_SIZE_ORIG = batch_size + # Default batch size per paging. # This one can be set in filters (batch_size), but should not be # changed afterwards. The derived batch_size can be changed. 
diff --git a/tests/test_utils/test_utils.py b/tests/test_utils/test_utils.py index 2b451d49b..3b4a34b57 100644 --- a/tests/test_utils/test_utils.py +++ b/tests/test_utils/test_utils.py @@ -3,9 +3,7 @@ import os import unittest.mock import pytest -import shutil import openml -from itertools import chain from openml.testing import _check_dataset @@ -35,7 +33,7 @@ def min_number_setups_on_test_server() -> int: @pytest.fixture() def min_number_runs_on_test_server() -> int: - """After a reset at least 50 runs are on the test server""" + """After a reset at least 21 runs are on the test server""" return 21 @@ -71,14 +69,12 @@ def test_list_all_with_multiple_batches(min_number_tasks_on_test_server): batch_size=batch_size, ) assert len(batches) >= 2 - flattened = list(chain(*batches)) - assert min_number_tasks_on_test_server <= len(flattened) + assert min_number_tasks_on_test_server <= sum(len(batch) for batch in batches) @pytest.mark.server() def test_list_all_for_datasets(min_number_datasets_on_test_server): datasets = openml.datasets.list_datasets( - batch_size=100, size=min_number_datasets_on_test_server, ) @@ -104,7 +100,7 @@ def test_list_all_for_setups(min_number_setups_on_test_server): @pytest.mark.server() @pytest.mark.flaky() # Other tests might need to upload runs first def test_list_all_for_runs(min_number_runs_on_test_server): - runs = openml.runs.list_runs(batch_size=25, size=min_number_runs_on_test_server) + runs = openml.runs.list_runs(size=min_number_runs_on_test_server) assert min_number_runs_on_test_server == len(runs) @@ -155,4 +151,4 @@ def test_correct_test_server_download_state(): """ task = openml.tasks.get_task(119) dataset = task.get_dataset() - assert len(dataset.features) == dataset.get_data()[0].shape[1] \ No newline at end of file + assert len(dataset.features) == dataset.get_data()[0].shape[1] From 6517f6abef063b2ec0190f5074b35a4a2d664926 Mon Sep 17 00:00:00 2001 From: LennartPurucker Date: Mon, 16 Jun 2025 16:05:41 +0200 Subject: [PATCH 11/23] fix/maint: update and fix tests for new dataframe default --- .../test_sklearn_extension.py | 41 ++++++++------- tests/test_runs/test_run.py | 32 ++++++++++-- tests/test_runs/test_run_functions.py | 51 +++++++++++++------ tests/test_tasks/test_classification_task.py | 8 +-- tests/test_tasks/test_learning_curve_task.py | 8 +-- tests/test_tasks/test_regression_task.py | 8 +-- tests/test_tasks/test_supervised_task.py | 4 +- 7 files changed, 101 insertions(+), 51 deletions(-) diff --git a/tests/test_extensions/test_sklearn_extension/test_sklearn_extension.py b/tests/test_extensions/test_sklearn_extension/test_sklearn_extension.py index dec8d9e00..d2aa3d47d 100644 --- a/tests/test_extensions/test_sklearn_extension/test_sklearn_extension.py +++ b/tests/test_extensions/test_sklearn_extension/test_sklearn_extension.py @@ -61,27 +61,31 @@ def __init__(self, boolean, integer, floating_point_value): def fit(self, X, y): pass + +def _cat_col_selector(X): + return X.select_dtypes(include=["object", "category"]).columns + + def _get_sklearn_preprocessing(): - from sklearn.compose import ColumnTransformer, make_column_selector + from sklearn.compose import ColumnTransformer from sklearn.preprocessing import OrdinalEncoder return [ - ( - "cat_handling", - ColumnTransformer( - transformers=[ - ( - "cat", - OrdinalEncoder( - handle_unknown="use_encoded_value", unknown_value=np.nan - ), - make_column_selector(dtype_include=["object", "category"]), - ) - ], - remainder="passthrough", - ), + ( + "cat_handling", + ColumnTransformer( + 
transformers=[ + ( + "cat", + OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=np.nan), + _cat_col_selector, + ) + ], + remainder="passthrough", ), - ("imp", SimpleImputer())] + ), + ("imp", SimpleImputer()), + ] class TestSklearnExtensionFlowFunctions(TestBase): @@ -1904,7 +1908,10 @@ def test_run_model_on_fold_classification_2(self): pipeline = sklearn.model_selection.GridSearchCV( sklearn.pipeline.Pipeline( - steps=[*_get_sklearn_preprocessing(), ("clf", sklearn.tree.DecisionTreeClassifier())], + steps=[ + *_get_sklearn_preprocessing(), + ("clf", sklearn.tree.DecisionTreeClassifier()), + ], ), {"clf__max_depth": [1, 2]}, ) diff --git a/tests/test_runs/test_run.py b/tests/test_runs/test_run.py index 6769b2140..9e2173007 100644 --- a/tests/test_runs/test_run.py +++ b/tests/test_runs/test_run.py @@ -204,17 +204,42 @@ def test_to_from_filesystem_no_model(self): with self.assertRaises(ValueError, msg="Could not find model.pkl"): openml.runs.OpenMLRun.from_filesystem(cache_path) + @staticmethod + def _cat_col_selector(X): + return X.select_dtypes(include=["object", "category"]).columns + @staticmethod def _get_models_tasks_for_tests(): + from sklearn.compose import ColumnTransformer + from sklearn.preprocessing import OrdinalEncoder + + basic_preprocessing = [ + ( + "cat_handling", + ColumnTransformer( + transformers=[ + ( + "cat", + OrdinalEncoder( + handle_unknown="use_encoded_value", unknown_value=np.nan + ), + TestRun._cat_col_selector, + ) + ], + remainder="passthrough", + ), + ), + ("imp", SimpleImputer()), + ] model_clf = Pipeline( [ - ("imputer", SimpleImputer(strategy="mean")), + *basic_preprocessing, ("classifier", DummyClassifier(strategy="prior")), ], ) model_reg = Pipeline( [ - ("imputer", SimpleImputer(strategy="mean")), + *basic_preprocessing, ( "regressor", # LR because dummy does not produce enough float-like values @@ -263,9 +288,8 @@ def assert_run_prediction_data(task, run, model): assert_method = np.testing.assert_array_almost_equal if task.task_type == "Supervised Classification": - y_pred = np.take(task.class_labels, y_pred) - y_test = np.take(task.class_labels, y_test) assert_method = np.testing.assert_array_equal + y_test = y_test.values # Assert correctness assert_method(y_pred, saved_y_pred) diff --git a/tests/test_runs/test_run_functions.py b/tests/test_runs/test_run_functions.py index 77d783514..22bf3f8c2 100644 --- a/tests/test_runs/test_run_functions.py +++ b/tests/test_runs/test_run_functions.py @@ -26,9 +26,10 @@ from sklearn.model_selection._search import BaseSearchCV from sklearn.naive_bayes import GaussianNB from sklearn.pipeline import Pipeline, make_pipeline -from sklearn.preprocessing import OneHotEncoder, StandardScaler +from sklearn.preprocessing import OneHotEncoder, StandardScaler, OrdinalEncoder from sklearn.svm import SVC from sklearn.tree import DecisionTreeClassifier +from sklearn.compose import ColumnTransformer import openml import openml._api_calls @@ -130,9 +131,9 @@ def _wait_for_processed_run(self, run_id, max_waiting_time_seconds): time.sleep(10) continue - assert ( - len(run.evaluations) > 0 - ), "Expect not-None evaluations to always contain elements." + assert len(run.evaluations) > 0, ( + "Expect not-None evaluations to always contain elements." 
+ ) return raise RuntimeError( @@ -306,7 +307,7 @@ def _remove_random_state(flow): flow_server = self.extension.model_to_flow(clf_server) if flow.class_name not in classes_without_random_state: - error_msg = "Flow class %s (id=%d) does not have a random " "state parameter" % ( + error_msg = "Flow class %s (id=%d) does not have a random state parameter" % ( flow.class_name, flow.flow_id, ) @@ -479,7 +480,7 @@ def determine_grid_size(param_grid): grid_iterations += determine_grid_size(sub_grid) return grid_iterations else: - raise TypeError("Param Grid should be of type list " "(GridSearch only) or dict") + raise TypeError("Param Grid should be of type list (GridSearch only) or dict") run = self._perform_run( task_id, @@ -1286,7 +1287,7 @@ def test_run_with_illegal_flow_id_1(self): flow_new = self.extension.model_to_flow(clf) flow_new.flow_id = -1 - expected_message_regex = "Local flow_id does not match server flow_id: " "'-1' vs '[0-9]+'" + expected_message_regex = "Local flow_id does not match server flow_id: '-1' vs '[0-9]+'" with pytest.raises(openml.exceptions.PyOpenMLError, match=expected_message_regex): openml.runs.run_flow_on_task( task=task, @@ -1326,7 +1327,7 @@ def test_run_with_illegal_flow_id_1_after_load(self): run.to_filesystem(cache_path) loaded_run = openml.runs.OpenMLRun.from_filesystem(cache_path) - expected_message_regex = "Local flow_id does not match server flow_id: " "'-1' vs '[0-9]+'" + expected_message_regex = "Local flow_id does not match server flow_id: '-1' vs '[0-9]+'" self.assertRaisesRegex( openml.exceptions.PyOpenMLError, expected_message_regex, @@ -1827,14 +1828,33 @@ def test_joblib_backends(self, parallel_mock): (1, "sequential", 40), ]: clf = sklearn.model_selection.RandomizedSearchCV( - estimator=sklearn.ensemble.RandomForestClassifier(n_estimators=5), + estimator=sklearn.pipeline.Pipeline( + [ + ( + "cat_handling", + ColumnTransformer( + transformers=[ + ( + "cat", + OrdinalEncoder( + handle_unknown="use_encoded_value", unknown_value=-1 + ), + x.select_dtypes(include=["object", "category"]).columns, + ) + ], + remainder="passthrough", + ), + ), + ("clf", sklearn.ensemble.RandomForestClassifier(n_estimators=5)), + ] + ), param_distributions={ - "max_depth": [3, None], - "max_features": [1, 2, 3, 4], - "min_samples_split": [2, 3, 4, 5, 6, 7, 8, 9, 10], - "min_samples_leaf": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], - "bootstrap": [True, False], - "criterion": ["gini", "entropy"], + "clf__max_depth": [3, None], + "clf__max_features": [1, 2, 3, 4], + "clf__min_samples_split": [2, 3, 4, 5, 6, 7, 8, 9, 10], + "clf__min_samples_leaf": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], + "clf__bootstrap": [True, False], + "clf__criterion": ["gini", "entropy"], }, random_state=1, cv=sklearn.model_selection.StratifiedKFold( @@ -1851,7 +1871,6 @@ def test_joblib_backends(self, parallel_mock): model=clf, task=task, add_local_measures=True, - # dataset_format="array", # "dataframe" would require handling of categoricals n_jobs=n_jobs, ) assert type(res[0]) == list diff --git a/tests/test_tasks/test_classification_task.py b/tests/test_tasks/test_classification_task.py index 661e8eced..bb4545154 100644 --- a/tests/test_tasks/test_classification_task.py +++ b/tests/test_tasks/test_classification_task.py @@ -1,7 +1,7 @@ # License: BSD 3-Clause from __future__ import annotations -import numpy as np +import pandas as pd from openml.tasks import TaskType, get_task @@ -20,10 +20,10 @@ def setUp(self, n_levels: int = 1): def test_get_X_and_Y(self): X, Y = super().test_get_X_and_Y() assert X.shape == 
(768, 8) - assert isinstance(X, np.ndarray) + assert isinstance(X, pd.DataFrame) assert Y.shape == (768,) - assert isinstance(Y, np.ndarray) - assert Y.dtype == int + assert isinstance(Y, pd.Series) + assert pd.api.types.is_categorical_dtype(Y) def test_download_task(self): task = super().test_download_task() diff --git a/tests/test_tasks/test_learning_curve_task.py b/tests/test_tasks/test_learning_curve_task.py index 0e781c8ff..885f80a27 100644 --- a/tests/test_tasks/test_learning_curve_task.py +++ b/tests/test_tasks/test_learning_curve_task.py @@ -1,7 +1,7 @@ # License: BSD 3-Clause from __future__ import annotations -import numpy as np +import pandas as pd from openml.tasks import TaskType, get_task @@ -20,10 +20,10 @@ def setUp(self, n_levels: int = 1): def test_get_X_and_Y(self): X, Y = super().test_get_X_and_Y() assert X.shape == (768, 8) - assert isinstance(X, np.ndarray) + assert isinstance(X, pd.DataFrame) assert Y.shape == (768,) - assert isinstance(Y, np.ndarray) - assert Y.dtype == int + assert isinstance(Y, pd.Series) + assert pd.api.types.is_categorical_dtype(Y) def test_download_task(self): task = super().test_download_task() diff --git a/tests/test_tasks/test_regression_task.py b/tests/test_tasks/test_regression_task.py index 29a8254df..36decc534 100644 --- a/tests/test_tasks/test_regression_task.py +++ b/tests/test_tasks/test_regression_task.py @@ -3,7 +3,7 @@ import ast -import numpy as np +import pandas as pd import openml from openml.exceptions import OpenMLServerException @@ -51,10 +51,10 @@ def setUp(self, n_levels: int = 1): def test_get_X_and_Y(self): X, Y = super().test_get_X_and_Y() assert X.shape == (194, 32) - assert isinstance(X, np.ndarray) + assert isinstance(X, pd.DataFrame) assert Y.shape == (194,) - assert isinstance(Y, np.ndarray) - assert Y.dtype == float + assert isinstance(Y, pd.Series) + assert pd.api.types.is_numeric_dtype(Y) def test_download_task(self): task = super().test_download_task() diff --git a/tests/test_tasks/test_supervised_task.py b/tests/test_tasks/test_supervised_task.py index 00ce1f276..9c90b7e03 100644 --- a/tests/test_tasks/test_supervised_task.py +++ b/tests/test_tasks/test_supervised_task.py @@ -3,7 +3,7 @@ import unittest -import numpy as np +import pandas as pd from openml.tasks import get_task @@ -27,7 +27,7 @@ def setUpClass(cls): def setUp(self, n_levels: int = 1): super().setUp() - def test_get_X_and_Y(self) -> tuple[np.ndarray, np.ndarray]: + def test_get_X_and_Y(self) -> tuple[pd.DataFrame, pd.Series]: task = get_task(self.task_id) X, Y = task.get_X_and_y() return X, Y From 466022e38a67f9d7978faf293a03311b5bb127a6 Mon Sep 17 00:00:00 2001 From: LennartPurucker Date: Mon, 16 Jun 2025 16:55:28 +0200 Subject: [PATCH 12/23] fix/maint: resolve tests that used old default format --- openml/extensions/sklearn/extension.py | 29 ++++++++++++----- tests/conftest.py | 7 ++++- tests/test_runs/test_run_functions.py | 40 +++++++++++++++++------- tests/test_study/test_study_functions.py | 13 ++++---- 4 files changed, 63 insertions(+), 26 deletions(-) diff --git a/openml/extensions/sklearn/extension.py b/openml/extensions/sklearn/extension.py index 2d40d03b8..fc8697e84 100644 --- a/openml/extensions/sklearn/extension.py +++ b/openml/extensions/sklearn/extension.py @@ -1144,7 +1144,7 @@ def _get_fn_arguments_with_defaults(self, fn_name: Callable) -> tuple[dict, set] optional_params[param] = default_val return optional_params, required_params - def _deserialize_model( + def _deserialize_model( # noqa: C901 self, flow: OpenMLFlow, 
keep_defaults: bool, # noqa: FBT001 @@ -1219,6 +1219,20 @@ def _deserialize_model( if param not in components: del parameter_dict[param] + if not strict_version: + # Ignore incompatible parameters + allowed_parameter = list(inspect.signature(model_class.__init__).parameters) + for p in list(parameter_dict.keys()): + if p not in allowed_parameter: + warnings.warn( + f"While deserializing in a non-strict way, parameter {p} is not " + f"allowed for {model_class.__name__} likely due to a version mismatch. " + "We ignore the parameter.", + UserWarning, + stacklevel=2, + ) + del parameter_dict[p] + return model_class(**parameter_dict) def _check_dependencies( @@ -1254,8 +1268,7 @@ def _check_dependencies( else: raise NotImplementedError(f"operation '{operation}' is not supported") message = ( - "Trying to deserialize a model with dependency " - f"{dependency_string} not satisfied." + f"Trying to deserialize a model with dependency {dependency_string} not satisfied." ) if not check: if strict_version: @@ -1497,7 +1510,7 @@ def _prevent_optimize_n_jobs(self, model): ) if len(n_jobs_vals) > 0: raise PyOpenMLError( - "openml-python should not be used to " "optimize the n_jobs parameter.", + "openml-python should not be used to optimize the n_jobs parameter.", ) ################################################################################################ @@ -1555,7 +1568,7 @@ def _seed_current_object(current_value): if current_value is not None: raise ValueError( - "Models should be seeded with int or None (this should never " "happen). ", + "Models should be seeded with int or None (this should never happen). ", ) return True @@ -1780,10 +1793,10 @@ def _prediction_to_probabilities( # to handle the case when dataset is numpy and categories are encoded # however the class labels stored in task are still categories if isinstance(y_train, np.ndarray) and isinstance( - cast(List, task.class_labels)[0], + cast("List", task.class_labels)[0], str, ): - model_classes = [cast(List[str], task.class_labels)[i] for i in model_classes] + model_classes = [cast("List[str]", task.class_labels)[i] for i in model_classes] modelpredict_start_cputime = time.process_time() modelpredict_start_walltime = time.time() @@ -2006,7 +2019,7 @@ def is_subcomponent_specification(values): # (mixed)). OpenML replaces the subcomponent by an # OpenMLFlow object. if len(subcomponent) < 2 or len(subcomponent) > 3: - raise ValueError("Component reference should be " "size {2,3}. ") + raise ValueError("Component reference should be size {2,3}. ") subcomponent_identifier = subcomponent[0] subcomponent_flow = subcomponent[1] diff --git a/tests/conftest.py b/tests/conftest.py index 79ee2bbd3..b523117c1 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -23,6 +23,10 @@ # License: BSD 3-Clause from __future__ import annotations +import multiprocessing + +multiprocessing.set_start_method("spawn", force=True) + from collections.abc import Iterator import logging import os @@ -33,6 +37,7 @@ import openml from openml.testing import TestBase + # creating logger for unit test file deletion status logger = logging.getLogger("unit_tests") logger.setLevel(logging.DEBUG) @@ -170,7 +175,7 @@ def pytest_sessionfinish() -> None: # Delete any test dirs that remain # In edge cases due to a mixture of pytest parametrization and oslo concurrency, # some file lock are created after leaving the test. This removes these files! 
- test_files_dir=Path(__file__).parent.parent / "openml" + test_files_dir = Path(__file__).parent.parent / "openml" for f in test_files_dir.glob("tests.*"): if f.is_dir(): shutil.rmtree(f) diff --git a/tests/test_runs/test_run_functions.py b/tests/test_runs/test_run_functions.py index 22bf3f8c2..c529b6c2b 100644 --- a/tests/test_runs/test_run_functions.py +++ b/tests/test_runs/test_run_functions.py @@ -272,7 +272,7 @@ def _remove_random_state(flow): task = openml.tasks.get_task(task_id) X, y = task.get_X_and_y() - assert np.count_nonzero(np.isnan(X)) == n_missing_vals + assert X.isna().sum().sum() == n_missing_vals run = openml.runs.run_flow_on_task( flow=flow, task=task, @@ -401,7 +401,7 @@ def _check_sample_evaluations( @pytest.mark.sklearn() def test_run_regression_on_classif_task(self): - task_id = 115 # diabetes; crossvalidation + task_id = 259 # collins; crossvalidation; has numeric targets clf = LinearRegression() task = openml.tasks.get_task(task_id) @@ -1758,7 +1758,26 @@ def test__run_task_get_arffcontent_2(self, parallel_mock): num_instances = x.shape[0] line_length = 6 + len(task.class_labels) loss = "log" if Version(sklearn.__version__) < Version("1.3") else "log_loss" - clf = SGDClassifier(loss=loss, random_state=1) + clf = sklearn.pipeline.Pipeline( + [ + ( + "cat_handling", + ColumnTransformer( + transformers=[ + ( + "cat", + OrdinalEncoder( + handle_unknown="use_encoded_value", unknown_value=-1 + ), + x.select_dtypes(include=["object", "category"]).columns, + ) + ], + remainder="passthrough", + ), + ), + ("clf", SGDClassifier(loss=loss, random_state=1)), + ] + ) n_jobs = 2 backend = "loky" if Version(joblib.__version__) > Version("0.11") else "multiprocessing" with parallel_backend(backend, n_jobs=n_jobs): @@ -1767,7 +1786,6 @@ def test__run_task_get_arffcontent_2(self, parallel_mock): model=clf, task=task, add_local_measures=True, - # dataset_format="array", # "dataframe" would require handling of categoricals n_jobs=n_jobs, ) # This unit test will fail if joblib is unable to distribute successfully since the @@ -1784,16 +1802,16 @@ def test__run_task_get_arffcontent_2(self, parallel_mock): assert len(res[2]) == 7 assert len(res[3]) == 7 expected_scores = [ - 0.965625, 0.94375, - 0.946875, - 0.953125, + 0.95625, + 0.959375, + 0.96875, + 0.96875, 0.96875, - 0.965625, - 0.9435736677115988, 0.9467084639498433, - 0.9749216300940439, - 0.9655172413793104, + 0.9373040752351097, + 0.9561128526645768, + 0.9467084639498433 ] scores = [v for k, v in res[2]["predictive_accuracy"][0].items()] np.testing.assert_array_almost_equal( diff --git a/tests/test_study/test_study_functions.py b/tests/test_study/test_study_functions.py index d13f770c8..8652d5547 100644 --- a/tests/test_study/test_study_functions.py +++ b/tests/test_study/test_study_functions.py @@ -183,20 +183,21 @@ def test_publish_study(self): self.assertSetEqual(set(study_downloaded.tasks), set(fixt_task_ids)) # test whether the list run function also handles study data fine - run_ids = openml.runs.list_runs(study=study.id) - self.assertSetEqual(set(run_ids), set(study_downloaded.runs)) + run_ids = openml.runs.list_runs(study=study.id) # returns DF + self.assertSetEqual(set(run_ids["run_id"]), set(study_downloaded.runs)) # test whether the list evaluation function also handles study data fine - run_ids = openml.evaluations.list_evaluations( + run_ids = openml.evaluations.list_evaluations( # returns list of objects "predictive_accuracy", size=None, study=study.id, + output_format="object", # making the default explicit ) 
self.assertSetEqual(set(run_ids), set(study_downloaded.runs)) # attach more runs, since we fetch 11 here, at least one is non-overlapping run_list_additional = openml.runs.list_runs(size=11, offset=10) - run_list_additional = set(run_list_additional) - set(run_ids) + run_list_additional = set(run_list_additional["run_id"]) - set(run_ids) openml.study.attach_to_study(study.id, list(run_list_additional)) study_downloaded = openml.study.get_study(study.id) # verify again @@ -227,7 +228,7 @@ def test_study_attach_illegal(self): benchmark_suite=None, name="study with illegal runs", description="none", - run_ids=list(run_list.keys()), + run_ids=list(run_list["run_id"]), ) study.publish() TestBase._mark_entity_for_removal("study", study.id) @@ -246,7 +247,7 @@ def test_study_attach_illegal(self): match="Problem attaching entities.", ): # some runs already attached - openml.study.attach_to_study(study.id, list(run_list_more.keys())) + openml.study.attach_to_study(study.id, list(run_list_more["run_id"])) study_downloaded = openml.study.get_study(study.id) self.assertListEqual(study_original.runs, study_downloaded.runs) From bd120f5df435e72f5fd892bf461147263df20e60 Mon Sep 17 00:00:00 2001 From: LennartPurucker Date: Mon, 16 Jun 2025 17:26:21 +0200 Subject: [PATCH 13/23] fix: remove OrdinalEncoder --- .../test_sklearn_extension/test_sklearn_extension.py | 3 +-- tests/test_runs/test_run.py | 6 ++---- tests/test_runs/test_run_functions.py | 10 +++------- 3 files changed, 6 insertions(+), 13 deletions(-) diff --git a/tests/test_extensions/test_sklearn_extension/test_sklearn_extension.py b/tests/test_extensions/test_sklearn_extension/test_sklearn_extension.py index d2aa3d47d..7dfc1b404 100644 --- a/tests/test_extensions/test_sklearn_extension/test_sklearn_extension.py +++ b/tests/test_extensions/test_sklearn_extension/test_sklearn_extension.py @@ -68,7 +68,6 @@ def _cat_col_selector(X): def _get_sklearn_preprocessing(): from sklearn.compose import ColumnTransformer - from sklearn.preprocessing import OrdinalEncoder return [ ( @@ -77,7 +76,7 @@ def _get_sklearn_preprocessing(): transformers=[ ( "cat", - OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=np.nan), + OneHotEncoder(handle_unknown="ignore"), _cat_col_selector, ) ], diff --git a/tests/test_runs/test_run.py b/tests/test_runs/test_run.py index 9e2173007..58a0dddf5 100644 --- a/tests/test_runs/test_run.py +++ b/tests/test_runs/test_run.py @@ -211,7 +211,7 @@ def _cat_col_selector(X): @staticmethod def _get_models_tasks_for_tests(): from sklearn.compose import ColumnTransformer - from sklearn.preprocessing import OrdinalEncoder + from sklearn.preprocessing import OneHotEncoder basic_preprocessing = [ ( @@ -220,9 +220,7 @@ def _get_models_tasks_for_tests(): transformers=[ ( "cat", - OrdinalEncoder( - handle_unknown="use_encoded_value", unknown_value=np.nan - ), + OneHotEncoder(handle_unknown="ignore"), TestRun._cat_col_selector, ) ], diff --git a/tests/test_runs/test_run_functions.py b/tests/test_runs/test_run_functions.py index c529b6c2b..1562623d5 100644 --- a/tests/test_runs/test_run_functions.py +++ b/tests/test_runs/test_run_functions.py @@ -26,7 +26,7 @@ from sklearn.model_selection._search import BaseSearchCV from sklearn.naive_bayes import GaussianNB from sklearn.pipeline import Pipeline, make_pipeline -from sklearn.preprocessing import OneHotEncoder, StandardScaler, OrdinalEncoder +from sklearn.preprocessing import OneHotEncoder, StandardScaler from sklearn.svm import SVC from sklearn.tree import DecisionTreeClassifier from 
sklearn.compose import ColumnTransformer @@ -1766,9 +1766,7 @@ def test__run_task_get_arffcontent_2(self, parallel_mock): transformers=[ ( "cat", - OrdinalEncoder( - handle_unknown="use_encoded_value", unknown_value=-1 - ), + OneHotEncoder(handle_unknown="ignore"), x.select_dtypes(include=["object", "category"]).columns, ) ], @@ -1854,9 +1852,7 @@ def test_joblib_backends(self, parallel_mock): transformers=[ ( "cat", - OrdinalEncoder( - handle_unknown="use_encoded_value", unknown_value=-1 - ), + OneHotEncoder(handle_unknown="ignore"), x.select_dtypes(include=["object", "category"]).columns, ) ], From de597b54a2d381e748c381b9b986e6b362f4d1c9 Mon Sep 17 00:00:00 2001 From: LennartPurucker Date: Mon, 16 Jun 2025 17:32:14 +0200 Subject: [PATCH 14/23] fix: update test to new assert with onehot --- tests/test_runs/test_run_functions.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/tests/test_runs/test_run_functions.py b/tests/test_runs/test_run_functions.py index 1562623d5..7235075c0 100644 --- a/tests/test_runs/test_run_functions.py +++ b/tests/test_runs/test_run_functions.py @@ -1800,16 +1800,16 @@ def test__run_task_get_arffcontent_2(self, parallel_mock): assert len(res[2]) == 7 assert len(res[3]) == 7 expected_scores = [ - 0.94375, - 0.95625, - 0.959375, - 0.96875, - 0.96875, - 0.96875, - 0.9467084639498433, - 0.9373040752351097, - 0.9561128526645768, - 0.9467084639498433 + 0.9625, + 0.953125, + 0.965625, + 0.9125, + 0.98125, + 0.975, + 0.9247648902821317, + 0.9404388714733543, + 0.9780564263322884, + 0.9623824451410659, ] scores = [v for k, v in res[2]["predictive_accuracy"][0].items()] np.testing.assert_array_almost_equal( From 32e6fbf55def28e14e8538e8a054d4423d9d5a9c Mon Sep 17 00:00:00 2001 From: LennartPurucker Date: Mon, 16 Jun 2025 17:52:36 +0200 Subject: [PATCH 15/23] fix/maint: update examples --- examples/30_extended/study_tutorial.py | 1 + examples/40_paper/2018_kdd_rijn_example.py | 3 ++- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/examples/30_extended/study_tutorial.py b/examples/30_extended/study_tutorial.py index d1f6ccbd0..c0874b944 100644 --- a/examples/30_extended/study_tutorial.py +++ b/examples/30_extended/study_tutorial.py @@ -48,6 +48,7 @@ evaluations = openml.evaluations.list_evaluations( function="predictive_accuracy", study=study.study_id, + output_format="dataframe", ) print(evaluations.head()) diff --git a/examples/40_paper/2018_kdd_rijn_example.py b/examples/40_paper/2018_kdd_rijn_example.py index f98f5de55..751f53470 100644 --- a/examples/40_paper/2018_kdd_rijn_example.py +++ b/examples/40_paper/2018_kdd_rijn_example.py @@ -1,5 +1,6 @@ """ -This example is deprecated! You will need to manually remove checks in this code to make it run. +This example is deprecated! You will need to manually remove adapt this code to make it run. +We deprecated this example in our CI as it requires fanova as a dependency. However, fanova is not supported in all Python versions used in our CI/CD. 
van Rijn and Hutter (2018) ========================== From bae06ca55408ebf1ed297015325e67a8f0dd1d7a Mon Sep 17 00:00:00 2001 From: LennartPurucker Date: Mon, 16 Jun 2025 17:58:29 +0200 Subject: [PATCH 16/23] fix: example revert --- examples/30_extended/fetch_evaluations_tutorial.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/examples/30_extended/fetch_evaluations_tutorial.py b/examples/30_extended/fetch_evaluations_tutorial.py index 4ef2d3e5f..6c8a88ec8 100644 --- a/examples/30_extended/fetch_evaluations_tutorial.py +++ b/examples/30_extended/fetch_evaluations_tutorial.py @@ -60,7 +60,9 @@ # Note that we now filter the evaluations based on another parameter 'task'. metric = "predictive_accuracy" -evals = openml.evaluations.list_evaluations(function=metric, tasks=[task_id]) +evals = openml.evaluations.list_evaluations( + function=metric, tasks=[task_id], output_format="dataframe" +) # Displaying the first 10 rows print(evals.head(n=10)) # Sorting the evaluations in decreasing order of the metric chosen @@ -158,7 +160,10 @@ def plot_flow_compare(evaluations, top_n=10, metric="predictive_accuracy"): # List evaluations in descending order based on predictive_accuracy with # hyperparameters evals_setups = openml.evaluations.list_evaluations_setups( - function="predictive_accuracy", tasks=[31], size=100, sort_order="desc" + function="predictive_accuracy", + tasks=[31], + size=100, + sort_order="desc", ) "" From 78b1888cd17b00a68082be316bb667f1dbf35be6 Mon Sep 17 00:00:00 2001 From: LennartPurucker Date: Mon, 16 Jun 2025 18:13:27 +0200 Subject: [PATCH 17/23] fix: add impute for tests to work with older sklearn version --- .../test_sklearn_extension.py | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/tests/test_extensions/test_sklearn_extension/test_sklearn_extension.py b/tests/test_extensions/test_sklearn_extension/test_sklearn_extension.py index 7dfc1b404..706a67aa6 100644 --- a/tests/test_extensions/test_sklearn_extension/test_sklearn_extension.py +++ b/tests/test_extensions/test_sklearn_extension/test_sklearn_extension.py @@ -76,7 +76,18 @@ def _get_sklearn_preprocessing(): transformers=[ ( "cat", - OneHotEncoder(handle_unknown="ignore"), + sklearn.pipeline.Pipeline( + [ + ( + "cat_si", + SimpleImputer( + strategy="constant", + fill_value="missing", + ), + ), + ("cat_ohe", OneHotEncoder(handle_unknown="ignore")), + ], + ), _cat_col_selector, ) ], From 22b6b526f2b6569cdb199a8c4c04bfacf4aef6b1 Mon Sep 17 00:00:00 2001 From: LennartPurucker Date: Mon, 16 Jun 2025 18:24:48 +0200 Subject: [PATCH 18/23] fix: make examples work --- examples/40_paper/2018_ida_strang_example.py | 1 + examples/40_paper/2018_neurips_perrone_example.py | 14 ++++++++++++-- 2 files changed, 13 insertions(+), 2 deletions(-) diff --git a/examples/40_paper/2018_ida_strang_example.py b/examples/40_paper/2018_ida_strang_example.py index 8b116507c..d9fdc78a7 100644 --- a/examples/40_paper/2018_ida_strang_example.py +++ b/examples/40_paper/2018_ida_strang_example.py @@ -51,6 +51,7 @@ size=None, flows=flow_ids, study=study_id, + output_format="dataframe", ) # gives us a table with columns data_id, flow1_value, flow2_value evaluations = evaluations.pivot(index="data_id", columns="flow_id", values="value").dropna() diff --git a/examples/40_paper/2018_neurips_perrone_example.py b/examples/40_paper/2018_neurips_perrone_example.py index 7c29ede8d..91768e010 100644 --- a/examples/40_paper/2018_neurips_perrone_example.py +++ b/examples/40_paper/2018_neurips_perrone_example.py 
@@ -181,8 +181,18 @@ def list_categorical_attributes(flow_type="svm"): num_imputer = SimpleImputer(missing_values=np.nan, strategy="constant", fill_value=-1) # Creating the one-hot encoder for numerical representation of categorical columns -enc = OneHotEncoder(handle_unknown="ignore") - +enc = Pipeline( + [ + ( + "cat_si", + SimpleImputer( + strategy="constant", + fill_value="missing", + ), + ), + ("cat_ohe", OneHotEncoder(handle_unknown="ignore")), + ], +) # Combining column transformers ct = ColumnTransformer([("cat", enc, cat_cols), ("num", num_imputer, num_cols)]) From 232b37c3561e4ccfda2da3c9738e6c16e051dbc3 Mon Sep 17 00:00:00 2001 From: Lennart Purucker Date: Tue, 17 Jun 2025 15:03:22 +0200 Subject: [PATCH 19/23] Update openml/utils.py Co-authored-by: Pieter Gijsbers --- openml/utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/openml/utils.py b/openml/utils.py index cef9f9d49..e0580d236 100644 --- a/openml/utils.py +++ b/openml/utils.py @@ -316,8 +316,8 @@ def _list_all( # noqa: C901 # check if the number of required results has been achieved # always do a 'bigger than' check, # in case of bugs to prevent infinite loops - n_recieved = sum(len(result) for result in results) - if n_recieved >= LIMIT: + n_received = sum(len(result) for result in results) + if n_received >= LIMIT: break # check if there are enough results to fulfill a batch From f14fce61345e27604d3fb990b78fd9416c5eae77 Mon Sep 17 00:00:00 2001 From: Lennart Purucker Date: Tue, 17 Jun 2025 15:03:33 +0200 Subject: [PATCH 20/23] Update openml/utils.py Co-authored-by: Pieter Gijsbers --- openml/utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/openml/utils.py b/openml/utils.py index e0580d236..7e72e7aee 100644 --- a/openml/utils.py +++ b/openml/utils.py @@ -321,8 +321,8 @@ def _list_all( # noqa: C901 break # check if there are enough results to fulfill a batch - if LIMIT - n_recieved < BATCH_SIZE_ORIG: - batch_size = LIMIT - n_recieved + if LIMIT - n_received < BATCH_SIZE_ORIG: + batch_size = LIMIT - n_received return results From 7fb5eb2efe96034f7ff133133b2d762398c40a11 Mon Sep 17 00:00:00 2001 From: Lennart Purucker Date: Tue, 17 Jun 2025 15:03:51 +0200 Subject: [PATCH 21/23] Update openml/setups/setup.py Co-authored-by: Pieter Gijsbers --- openml/setups/setup.py | 1 - 1 file changed, 1 deletion(-) diff --git a/openml/setups/setup.py b/openml/setups/setup.py index a98e832c0..d7d1f8915 100644 --- a/openml/setups/setup.py +++ b/openml/setups/setup.py @@ -7,7 +7,6 @@ import openml.flows -# TODO(eddiebergman): All functionality handled by dataclass class OpenMLSetup: """Setup object (a.k.a. Configuration). From f45530f987dc736671df1332964aa9bcca8044b5 Mon Sep 17 00:00:00 2001 From: Lennart Purucker Date: Tue, 17 Jun 2025 15:03:57 +0200 Subject: [PATCH 22/23] Update openml/setups/setup.py Co-authored-by: Pieter Gijsbers --- openml/setups/setup.py | 1 - 1 file changed, 1 deletion(-) diff --git a/openml/setups/setup.py b/openml/setups/setup.py index d7d1f8915..c3d8149e7 100644 --- a/openml/setups/setup.py +++ b/openml/setups/setup.py @@ -66,7 +66,6 @@ def __repr__(self) -> str: return header + body -# TODO(eddiebergman): All functionality handled by dataclass class OpenMLParameter: """Parameter object (used in setup). 
From 7fb31cea7a991e35843a285d7730ff6136fde399 Mon Sep 17 00:00:00 2001 From: LennartPurucker Date: Tue, 17 Jun 2025 15:05:18 +0200 Subject: [PATCH 23/23] remove comment we do not understand --- tests/test_datasets/test_dataset_functions.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/test_datasets/test_dataset_functions.py b/tests/test_datasets/test_dataset_functions.py index b0c5f891e..fb29009a3 100644 --- a/tests/test_datasets/test_dataset_functions.py +++ b/tests/test_datasets/test_dataset_functions.py @@ -124,7 +124,6 @@ def test_list_datasets_paginate(self): datasets = openml.datasets.list_datasets(offset=i, size=size) assert len(datasets) == size assert len(datasets.columns) >= 2 - # Maybe index? assert "did" in datasets.columns assert datasets["did"].dtype == int assert "status" in datasets.columns
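
A minimal usage sketch of the client API this patch series converges on, assuming the full series is applied: listing calls return pandas DataFrames, task data comes back as a DataFrame/Series pair, and scikit-learn pipelines handle categorical columns explicitly. The task id and the selector helper below are illustrative only and are not part of any patch above.

import openml
import sklearn.pipeline
import sklearn.tree
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

# Listing now always returns a DataFrame; paging is controlled via size/offset.
datasets = openml.datasets.list_datasets(size=100)
print(datasets.head())

# Task data is pandas-only, so splits use .iloc.
task = openml.tasks.get_task(31)  # illustrative task id
X, y = task.get_X_and_y()
train_idx, test_idx = task.get_train_test_split_indices(repeat=0, fold=0, sample=0)
X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]


def _cat_cols(frame):
    # Named selector mirroring the pattern used in the test suite above.
    return frame.select_dtypes(include=["object", "category"]).columns


clf = sklearn.pipeline.Pipeline(
    steps=[
        (
            "cat_handling",
            ColumnTransformer(
                transformers=[("cat", OneHotEncoder(handle_unknown="ignore"), _cat_cols)],
                remainder="passthrough",
            ),
        ),
        ("imp", SimpleImputer()),
        ("clf", sklearn.tree.DecisionTreeClassifier()),
    ],
)
clf.fit(X_train, y_train)

This mirrors the _get_sklearn_preprocessing helper added to the test suite, with the imputation step kept after the column transformer as in those tests.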