From 7a6f845a1bc35cec0408a508ff768e0be4b7c6bc Mon Sep 17 00:00:00 2001 From: eddiebergman Date: Wed, 16 Oct 2024 14:14:07 +0200 Subject: [PATCH 01/23] refactor: Remove "array" format --- examples/20_basic/simple_datasets_tutorial.py | 6 +- .../task_manual_iteration_tutorial.py | 8 +- openml/datasets/functions.py | 152 ++----------- openml/evaluations/evaluation.py | 20 ++ openml/evaluations/functions.py | 148 +++++-------- openml/flows/functions.py | 122 +---------- openml/runs/functions.py | 104 ++------- openml/setups/functions.py | 173 +++++---------- openml/setups/setup.py | 23 ++ openml/study/functions.py | 204 +++--------------- openml/tasks/functions.py | 64 +----- openml/tasks/task.py | 73 +------ openml/utils.py | 71 ++---- tests/test_datasets/test_dataset.py | 129 +---------- tests/test_datasets/test_dataset_functions.py | 3 +- .../test_sklearn_extension.py | 114 ++++++---- tests/test_runs/test_run_functions.py | 12 +- tests/test_setups/test_setup_functions.py | 7 - tests/test_utils/test_utils.py | 14 +- 19 files changed, 368 insertions(+), 1079 deletions(-) diff --git a/examples/20_basic/simple_datasets_tutorial.py b/examples/20_basic/simple_datasets_tutorial.py index b90d53660..9b18aab14 100644 --- a/examples/20_basic/simple_datasets_tutorial.py +++ b/examples/20_basic/simple_datasets_tutorial.py @@ -19,7 +19,7 @@ # List datasets # ============= -datasets_df = openml.datasets.list_datasets(output_format="dataframe") +datasets_df = openml.datasets.list_datasets() print(datasets_df.head(n=10)) ############################################################################ @@ -48,7 +48,7 @@ # attribute_names - the names of the features for the examples (X) and # target feature (y) X, y, categorical_indicator, attribute_names = dataset.get_data( - dataset_format="dataframe", target=dataset.default_target_attribute + target=dataset.default_target_attribute ) ############################################################################ @@ -63,9 +63,9 @@ # Visualize the dataset # ===================== +import matplotlib.pyplot as plt import pandas as pd import seaborn as sns -import matplotlib.pyplot as plt sns.set_style("darkgrid") diff --git a/examples/30_extended/task_manual_iteration_tutorial.py b/examples/30_extended/task_manual_iteration_tutorial.py index 676a742a1..dda40de50 100644 --- a/examples/30_extended/task_manual_iteration_tutorial.py +++ b/examples/30_extended/task_manual_iteration_tutorial.py @@ -68,7 +68,7 @@ #################################################################################################### # And then split the data based on this: -X, y = task.get_X_and_y(dataset_format="dataframe") +X, y = task.get_X_and_y() X_train = X.iloc[train_indices] y_train = y.iloc[train_indices] X_test = X.iloc[test_indices] @@ -88,7 +88,7 @@ task_id = 3 task = openml.tasks.get_task(task_id) -X, y = task.get_X_and_y(dataset_format="dataframe") +X, y = task.get_X_and_y() n_repeats, n_folds, n_samples = task.get_split_dimensions() print( "Task {}: number of repeats: {}, number of folds: {}, number of samples {}.".format( @@ -132,7 +132,7 @@ task_id = 1767 task = openml.tasks.get_task(task_id) -X, y = task.get_X_and_y(dataset_format="dataframe") +X, y = task.get_X_and_y() n_repeats, n_folds, n_samples = task.get_split_dimensions() print( "Task {}: number of repeats: {}, number of folds: {}, number of samples {}.".format( @@ -176,7 +176,7 @@ task_id = 1702 task = openml.tasks.get_task(task_id) -X, y = task.get_X_and_y(dataset_format="dataframe") +X, y = task.get_X_and_y() 
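####################################################################################################
# Editorial sketch, not part of the original patch: with the "array" format removed,
# ``get_X_and_y()`` always returns pandas objects. Code that previously requested
# ``dataset_format="array"`` can opt back into numpy explicitly, as the removed warning suggested:

X, y = task.get_X_and_y()                      # pandas DataFrame / Series
X_array = X.to_numpy()                         # equivalent of the old "array" output
y_array = y.to_numpy() if y is not None else None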
n_repeats, n_folds, n_samples = task.get_split_dimensions() print( "Task {}: number of repeats: {}, number of folds: {}, number of samples {}.".format( diff --git a/openml/datasets/functions.py b/openml/datasets/functions.py index 0901171d6..5ce095814 100644 --- a/openml/datasets/functions.py +++ b/openml/datasets/functions.py @@ -7,7 +7,7 @@ from collections import OrderedDict from pathlib import Path from pyexpat import ExpatError -from typing import TYPE_CHECKING, Any, overload +from typing import TYPE_CHECKING, Any from typing_extensions import Literal import arff @@ -75,54 +75,16 @@ def list_qualities() -> list[str]: return qualities["oml:data_qualities_list"]["oml:quality"] -@overload -def list_datasets( - data_id: list[int] | None = ..., - offset: int | None = ..., - size: int | None = ..., - status: str | None = ..., - tag: str | None = ..., - *, - output_format: Literal["dataframe"], - **kwargs: Any, -) -> pd.DataFrame: ... - - -@overload -def list_datasets( - data_id: list[int] | None, - offset: int | None, - size: int | None, - status: str | None, - tag: str | None, - output_format: Literal["dataframe"], - **kwargs: Any, -) -> pd.DataFrame: ... - - -@overload -def list_datasets( - data_id: list[int] | None = ..., - offset: int | None = ..., - size: int | None = ..., - status: str | None = ..., - tag: str | None = ..., - output_format: Literal["dict"] = "dict", - **kwargs: Any, -) -> pd.DataFrame: ... - - def list_datasets( data_id: list[int] | None = None, offset: int | None = None, size: int | None = None, status: str | None = None, tag: str | None = None, - output_format: Literal["dataframe", "dict"] = "dict", **kwargs: Any, -) -> dict | pd.DataFrame: - """ - Return a list of all dataset which are on OpenML. +) -> pd.DataFrame: + """Return a dataframe of all dataset which are on OpenML. + Supports large amount of results. Parameters @@ -139,10 +101,6 @@ def list_datasets( default active datasets are returned, but also datasets from another status can be requested. tag : str, optional - output_format: str, optional (default='dict') - The parameter decides the format of the output. - - If 'dict' the output is a dict of dict - - If 'dataframe' the output is a pandas DataFrame kwargs : dict, optional Legal filter operators (keys in the dict): data_name, data_version, number_instances, @@ -150,76 +108,29 @@ def list_datasets( Returns ------- - datasets : dict of dicts, or dataframe - - If output_format='dict' - A mapping from dataset ID to dict. - - Every dataset is represented by a dictionary containing - the following information: - - dataset id - - name - - format - - status - If qualities are calculated for the dataset, some of - these are also returned. - - - If output_format='dataframe' - Each row maps to a dataset - Each column contains the following information: - - dataset id - - name - - format - - status - If qualities are calculated for the dataset, some of - these are also included as columns. + datasets: dataframe + Each row maps to a dataset + Each column contains the following information: + - dataset id + - name + - format + - status + If qualities are calculated for the dataset, some of + these are also included as columns. """ - if output_format not in ["dataframe", "dict"]: - raise ValueError( - "Invalid output format selected. " "Only 'dict' or 'dataframe' applicable.", - ) - - # TODO: [0.15] - if output_format == "dict": - msg = ( - "Support for `output_format` of 'dict' will be removed in 0.15 " - "and pandas dataframes will be returned instead. 
To ensure your code " - "will continue to work, use `output_format`='dataframe'." - ) - warnings.warn(msg, category=FutureWarning, stacklevel=2) - - return openml.utils._list_all( # type: ignore - data_id=data_id, - list_output_format=output_format, # type: ignore + batches = openml.utils._list_all( listing_call=_list_datasets, + data_id=data_id, offset=offset, size=size, status=status, tag=tag, **kwargs, ) + return pd.concat(batches, ignore_index=True) -@overload -def _list_datasets( - data_id: list | None = ..., - output_format: Literal["dict"] = "dict", - **kwargs: Any, -) -> dict: ... - - -@overload -def _list_datasets( - data_id: list | None = ..., - output_format: Literal["dataframe"] = "dataframe", - **kwargs: Any, -) -> pd.DataFrame: ... - - -def _list_datasets( - data_id: list | None = None, - output_format: Literal["dict", "dataframe"] = "dict", - **kwargs: Any, -) -> dict | pd.DataFrame: +def _list_datasets(data_id: list[int] | None = None, **kwargs: Any) -> pd.DataFrame: """ Perform api call to return a list of all datasets. @@ -232,10 +143,6 @@ def _list_datasets( data_id : list, optional - output_format: str, optional (default='dict') - The parameter decides the format of the output. - - If 'dict' the output is a dict of dict - - If 'dataframe' the output is a pandas DataFrame kwargs : dict, optional Legal filter operators (keys in the dict): tag, status, limit, offset, data_name, data_version, number_instances, @@ -243,7 +150,7 @@ def _list_datasets( Returns ------- - datasets : dict of dicts, or dataframe + datasets : dataframe """ api_call = "data/list" @@ -252,21 +159,10 @@ def _list_datasets( api_call += f"/{operator}/{value}" if data_id is not None: api_call += "/data_id/{}".format(",".join([str(int(i)) for i in data_id])) - return __list_datasets(api_call=api_call, output_format=output_format) - - -@overload -def __list_datasets(api_call: str, output_format: Literal["dict"] = "dict") -> dict: ... - + return __list_datasets(api_call=api_call) -@overload -def __list_datasets(api_call: str, output_format: Literal["dataframe"]) -> pd.DataFrame: ... - -def __list_datasets( - api_call: str, - output_format: Literal["dict", "dataframe"] = "dict", -) -> dict | pd.DataFrame: +def __list_datasets(api_call: str) -> pd.DataFrame: xml_string = openml._api_calls._perform_api_call(api_call, "get") datasets_dict = xmltodict.parse(xml_string, force_list=("oml:dataset",)) @@ -295,10 +191,7 @@ def __list_datasets( dataset[quality["@name"]] = float(quality["#text"]) datasets[dataset["did"]] = dataset - if output_format == "dataframe": - datasets = pd.DataFrame.from_dict(datasets, orient="index") - - return datasets + return pd.DataFrame.from_dict(datasets, orient="index") def _expand_parameter(parameter: str | list[str] | None) -> list[str]: @@ -1493,8 +1386,7 @@ def _get_online_dataset_arff(dataset_id: int) -> str | None: def _get_online_dataset_format(dataset_id: int) -> str: - """Get the dataset format for a given dataset id - from the OpenML website. + """Get the dataset format for a given dataset id from the OpenML website. 
Parameters ---------- diff --git a/openml/evaluations/evaluation.py b/openml/evaluations/evaluation.py index 3cf732f25..70fab9f28 100644 --- a/openml/evaluations/evaluation.py +++ b/openml/evaluations/evaluation.py @@ -8,6 +8,8 @@ import openml.tasks +# TODO(eddiebergman): A lot of this class is automatically +# handled by a dataclass class OpenMLEvaluation: """ Contains all meta-information about a run / evaluation combination, @@ -78,6 +80,24 @@ def __init__( # noqa: PLR0913 self.values = values self.array_data = array_data + def _to_dict(self) -> dict: + return { + "run_id": self.run_id, + "task_id": self.task_id, + "setup_id": self.setup_id, + "flow_id": self.flow_id, + "flow_name": self.flow_name, + "data_id": self.data_id, + "data_name": self.data_name, + "function": self.function, + "upload_time": self.upload_time, + "uploader": self.uploader, + "uploader_name": self.uploader_name, + "value": self.value, + "values": self.values, + "array_data": self.array_data, + } + def __repr__(self) -> str: header = "OpenML Evaluation" header = "{}\n{}\n".format(header, "=" * len(header)) diff --git a/openml/evaluations/functions.py b/openml/evaluations/functions.py index a39096a58..3b4735250 100644 --- a/openml/evaluations/functions.py +++ b/openml/evaluations/functions.py @@ -3,7 +3,7 @@ from __future__ import annotations import json -import warnings +from itertools import chain from typing import Any from typing_extensions import Literal, overload @@ -31,8 +31,8 @@ def list_evaluations( study: int | None = ..., per_fold: bool | None = ..., sort_order: str | None = ..., - output_format: Literal["dict", "object"] = "dict", -) -> dict: ... + output_format: Literal["object"] = "object", +) -> dict[int, OpenMLEvaluation]: ... @overload @@ -66,10 +66,10 @@ def list_evaluations( study: int | None = None, per_fold: bool | None = None, sort_order: str | None = None, - output_format: Literal["object", "dict", "dataframe"] = "object", -) -> dict | pd.DataFrame: - """ - List all run-evaluation pairs matching all of the given filters. + output_format: Literal["object", "dataframe"] = "object", +) -> dict[int, OpenMLEvaluation] | pd.DataFrame: + """List all run-evaluation pairs matching all of the given filters. + (Supports large amount of results) Parameters @@ -105,33 +105,20 @@ def list_evaluations( output_format: str, optional (default='object') The parameter decides the format of the output. - If 'object' the output is a dict of OpenMLEvaluation objects - - If 'dict' the output is a dict of dict - If 'dataframe' the output is a pandas DataFrame Returns ------- dict or dataframe """ - if output_format not in ["dataframe", "dict", "object"]: - raise ValueError( - "Invalid output format selected. Only 'object', 'dataframe', or 'dict' applicable.", - ) - - # TODO: [0.15] - if output_format == "dict": - msg = ( - "Support for `output_format` of 'dict' will be removed in 0.15. " - "To ensure your code will continue to work, " - "use `output_format`='dataframe' or `output_format`='object'." - ) - warnings.warn(msg, category=FutureWarning, stacklevel=2) + if output_format not in ("dataframe", "object"): + raise ValueError("Invalid output format. 
Only 'object', 'dataframe'.") per_fold_str = None if per_fold is not None: per_fold_str = str(per_fold).lower() - return openml.utils._list_all( # type: ignore - list_output_format=output_format, # type: ignore + eval_collection: list[list[OpenMLEvaluation]] = openml.utils._list_all( listing_call=_list_evaluations, function=function, offset=offset, @@ -146,6 +133,13 @@ def list_evaluations( sort_order=sort_order, per_fold=per_fold_str, ) + flattened = list(chain.from_iterable(eval_collection)) + + if output_format == "dataframe": + records = [item._to_dict() for item in flattened] + return pd.DataFrame.from_records(records, index="run_id") + + return {e.run_id: e for e in flattened} def _list_evaluations( @@ -157,9 +151,8 @@ def _list_evaluations( uploaders: list | None = None, study: int | None = None, sort_order: str | None = None, - output_format: Literal["object", "dict", "dataframe"] = "object", **kwargs: Any, -) -> dict | pd.DataFrame: +) -> list[OpenMLEvaluation]: """ Perform API call ``/evaluation/function{function}/{filters}`` @@ -190,17 +183,9 @@ def _list_evaluations( sort_order : str, optional order of sorting evaluations, ascending ("asc") or descending ("desc") - output_format: str, optional (default='dict') - The parameter decides the format of the output. - - If 'dict' the output is a dict of dict - The parameter decides the format of the output. - - If 'dict' the output is a dict of dict - - If 'dataframe' the output is a pandas DataFrame - - If 'dataframe' the output is a pandas DataFrame - Returns ------- - dict of objects, or dataframe + list of OpenMLEvaluation objects """ api_call = f"evaluation/list/function/{function}" if kwargs is not None: @@ -221,13 +206,10 @@ def _list_evaluations( if sort_order is not None: api_call += f"/sort_order/{sort_order}" - return __list_evaluations(api_call, output_format=output_format) + return __list_evaluations(api_call) -def __list_evaluations( - api_call: str, - output_format: Literal["object", "dict", "dataframe"] = "object", -) -> dict | pd.DataFrame: +def __list_evaluations(api_call: str) -> list[OpenMLEvaluation]: """Helper function to parse API calls which are lists of runs""" xml_string = openml._api_calls._perform_api_call(api_call, "get") evals_dict = xmltodict.parse(xml_string, force_list=("oml:evaluation",)) @@ -241,29 +223,24 @@ def __list_evaluations( evals_dict["oml:evaluations"], ) - evals: dict[int, dict | OpenMLEvaluation] = {} uploader_ids = list( {eval_["oml:uploader"] for eval_ in evals_dict["oml:evaluations"]["oml:evaluation"]}, ) api_users = "user/list/user_id/" + ",".join(uploader_ids) xml_string_user = openml._api_calls._perform_api_call(api_users, "get") + users = xmltodict.parse(xml_string_user, force_list=("oml:user",)) user_dict = {user["oml:id"]: user["oml:username"] for user in users["oml:users"]["oml:user"]} + + evals = [] for eval_ in evals_dict["oml:evaluations"]["oml:evaluation"]: run_id = int(eval_["oml:run_id"]) - - value = None - if "oml:value" in eval_: - value = float(eval_["oml:value"]) - - values = None - if "oml:values" in eval_: - values = json.loads(eval_["oml:values"]) - + value = float(eval_["oml:value"]) if "oml:value" in eval_ else None + values = json.loads(eval_["oml:values"]) if eval_["oml:values"] is not None else None array_data = eval_.get("oml:array_data") - if output_format == "object": - evals[run_id] = OpenMLEvaluation( + evals.append( + OpenMLEvaluation( run_id=run_id, task_id=int(eval_["oml:task_id"]), setup_id=int(eval_["oml:setup_id"]), @@ -279,28 +256,7 @@ def 
__list_evaluations( values=values, array_data=array_data, ) - else: - # for output_format in ['dict', 'dataframe'] - evals[run_id] = { - "run_id": int(eval_["oml:run_id"]), - "task_id": int(eval_["oml:task_id"]), - "setup_id": int(eval_["oml:setup_id"]), - "flow_id": int(eval_["oml:flow_id"]), - "flow_name": eval_["oml:flow_name"], - "data_id": int(eval_["oml:data_id"]), - "data_name": eval_["oml:data_name"], - "function": eval_["oml:function"], - "upload_time": eval_["oml:upload_time"], - "uploader": int(eval_["oml:uploader"]), - "uploader_name": user_dict[eval_["oml:uploader"]], - "value": value, - "values": values, - "array_data": array_data, - } - - if output_format == "dataframe": - rows = list(evals.values()) - return pd.DataFrame.from_records(rows, columns=rows[0].keys()) # type: ignore + ) return evals @@ -321,9 +277,11 @@ def list_evaluation_measures() -> list[str]: qualities = xmltodict.parse(xml_string, force_list=("oml:measures")) # Minimalistic check if the XML is useful if "oml:evaluation_measures" not in qualities: - raise ValueError("Error in return XML, does not contain " '"oml:evaluation_measures"') + raise ValueError('Error in return XML, does not contain "oml:evaluation_measures"') + if not isinstance(qualities["oml:evaluation_measures"]["oml:measures"][0]["oml:measure"], list): - raise TypeError("Error in return XML, does not contain " '"oml:measure" as a list') + raise TypeError('Error in return XML, does not contain "oml:measure" as a list') + return qualities["oml:evaluation_measures"]["oml:measures"][0]["oml:measure"] @@ -343,14 +301,13 @@ def list_estimation_procedures() -> list[str]: # Minimalistic check if the XML is useful if "oml:estimationprocedures" not in api_results: - raise ValueError("Error in return XML, does not contain " '"oml:estimationprocedures"') + raise ValueError('Error in return XML, does not contain "oml:estimationprocedures"') + if "oml:estimationprocedure" not in api_results["oml:estimationprocedures"]: - raise ValueError("Error in return XML, does not contain " '"oml:estimationprocedure"') + raise ValueError('Error in return XML, does not contain "oml:estimationprocedure"') if not isinstance(api_results["oml:estimationprocedures"]["oml:estimationprocedure"], list): - raise TypeError( - "Error in return XML, does not contain " '"oml:estimationprocedure" as a list', - ) + raise TypeError('Error in return XML, does not contain "oml:estimationprocedure" as a list') return [ prod["oml:name"] @@ -370,11 +327,9 @@ def list_evaluations_setups( tag: str | None = None, per_fold: bool | None = None, sort_order: str | None = None, - output_format: str = "dataframe", parameters_in_separate_columns: bool = False, # noqa: FBT001, FBT002 -) -> dict | pd.DataFrame: - """ - List all run-evaluation pairs matching all of the given filters +) -> pd.DataFrame: + """List all run-evaluation pairs matching all of the given filters and their hyperparameter settings. Parameters @@ -400,23 +355,16 @@ def list_evaluations_setups( per_fold : bool, optional sort_order : str, optional order of sorting evaluations, ascending ("asc") or descending ("desc") - output_format: str, optional (default='dataframe') - The parameter decides the format of the output. - - If 'dict' the output is a dict of dict - - If 'dataframe' the output is a pandas DataFrame parameters_in_separate_columns: bool, optional (default= False) Returns hyperparameters in separate columns if set to True. 
Valid only for a single flow - Returns ------- - dict or dataframe with hyperparameter settings as a list of tuples. + dataframe with hyperparameter settings as a list of tuples. """ if parameters_in_separate_columns and (flows is None or len(flows) != 1): - raise ValueError( - "Can set parameters_in_separate_columns to true " "only for single flow_id", - ) + raise ValueError("Can set parameters_in_separate_columns to true only for single flow_id") # List evaluations evals = list_evaluations( @@ -439,11 +387,14 @@ def list_evaluations_setups( _df = pd.DataFrame() if len(evals) != 0: N = 100 # size of section - length = len(evals["setup_id"].unique()) # length of the array we want to split + uniq = np.asarray(evals["setup_id"].unique()) + length = len(uniq) + # array_split - allows indices_or_sections to not equally divide the array # array_split -length % N sub-arrays of size length//N + 1 and the rest of size length//N. - uniq = np.asarray(evals["setup_id"].unique()) - setup_chunks = np.array_split(uniq, ((length - 1) // N) + 1) + split_size = ((length - 1) // N) + 1 + setup_chunks = np.array_split(uniq, split_size) + setup_data = pd.DataFrame() for _setups in setup_chunks: result = openml.setups.list_setups(setup=_setups, output_format="dataframe") @@ -471,7 +422,4 @@ def list_evaluations_setups( axis=1, ) - if output_format == "dataframe": - return _df - - return _df.to_dict(orient="index") + return _df diff --git a/openml/flows/functions.py b/openml/flows/functions.py index 3d056ac60..e43e18c17 100644 --- a/openml/flows/functions.py +++ b/openml/flows/functions.py @@ -3,10 +3,8 @@ import os import re -import warnings from collections import OrderedDict -from typing import Any, Dict, overload -from typing_extensions import Literal +from typing import Any, Dict import dateutil.parser import pandas as pd @@ -133,44 +131,12 @@ def _get_flow_description(flow_id: int) -> OpenMLFlow: return _create_flow_from_xml(flow_xml) -@overload -def list_flows( - offset: int | None = ..., - size: int | None = ..., - tag: str | None = ..., - output_format: Literal["dict"] = "dict", - **kwargs: Any, -) -> dict: ... - - -@overload -def list_flows( - offset: int | None = ..., - size: int | None = ..., - tag: str | None = ..., - *, - output_format: Literal["dataframe"], - **kwargs: Any, -) -> pd.DataFrame: ... - - -@overload -def list_flows( - offset: int | None, - size: int | None, - tag: str | None, - output_format: Literal["dataframe"], - **kwargs: Any, -) -> pd.DataFrame: ... - - def list_flows( offset: int | None = None, size: int | None = None, tag: str | None = None, - output_format: Literal["dict", "dataframe"] = "dict", **kwargs: Any, -) -> dict | pd.DataFrame: +) -> pd.DataFrame: """ Return a list of all flows which are on OpenML. (Supports large amount of results) @@ -183,29 +149,12 @@ def list_flows( the maximum number of flows to return tag : str, optional the tag to include - output_format: str, optional (default='dict') - The parameter decides the format of the output. - - If 'dict' the output is a dict of dict - - If 'dataframe' the output is a pandas DataFrame kwargs: dict, optional Legal filter operators: uploader. Returns ------- - flows : dict of dicts, or dataframe - - If output_format='dict' - A mapping from flow_id to a dict giving a brief overview of the - respective flow. 
- Every flow is represented by a dictionary containing - the following information: - - flow id - - full name - - name - - version - - external version - - uploader - - - If output_format='dataframe' + flows : dataframe Each row maps to a dataset Each column contains the following information: - flow id @@ -215,61 +164,28 @@ def list_flows( - external version - uploader """ - if output_format not in ["dataframe", "dict"]: - raise ValueError( - "Invalid output format selected. " "Only 'dict' or 'dataframe' applicable.", - ) - - # TODO: [0.15] - if output_format == "dict": - msg = ( - "Support for `output_format` of 'dict' will be removed in 0.15 " - "and pandas dataframes will be returned instead. To ensure your code " - "will continue to work, use `output_format`='dataframe'." - ) - warnings.warn(msg, category=FutureWarning, stacklevel=2) - - return openml.utils._list_all( - list_output_format=output_format, + batches = openml.utils._list_all( listing_call=_list_flows, offset=offset, size=size, tag=tag, **kwargs, ) + return pd.concat(batches, ignore_index=True) -@overload -def _list_flows(output_format: Literal["dict"] = ..., **kwargs: Any) -> dict: ... - - -@overload -def _list_flows(*, output_format: Literal["dataframe"], **kwargs: Any) -> pd.DataFrame: ... - - -@overload -def _list_flows(output_format: Literal["dataframe"], **kwargs: Any) -> pd.DataFrame: ... - - -def _list_flows( - output_format: Literal["dict", "dataframe"] = "dict", **kwargs: Any -) -> dict | pd.DataFrame: +def _list_flows(**kwargs: Any) -> pd.DataFrame: """ Perform the api call that return a list of all flows. Parameters ---------- - output_format: str, optional (default='dict') - The parameter decides the format of the output. - - If 'dict' the output is a dict of dict - - If 'dataframe' the output is a pandas DataFrame - kwargs: dict, optional Legal filter operators: uploader, tag, limit, offset. Returns ------- - flows : dict, or dataframe + flows : dataframe """ api_call = "flow/list" @@ -277,7 +193,7 @@ def _list_flows( for operator, value in kwargs.items(): api_call += f"/{operator}/{value}" - return __list_flows(api_call=api_call, output_format=output_format) + return __list_flows(api_call=api_call) def flow_exists(name: str, external_version: str) -> int | bool: @@ -378,23 +294,12 @@ def get_flow_id( raise ValueError("exact_version should be False if model is None!") return flow_exists(name=flow_name, external_version=external_version) - flows = list_flows(output_format="dataframe") - assert isinstance(flows, pd.DataFrame) # Make mypy happy + flows = list_flows() flows = flows.query(f'name == "{flow_name}"') return flows["id"].to_list() # type: ignore[no-any-return] -@overload -def __list_flows(api_call: str, output_format: Literal["dict"] = "dict") -> dict: ... - - -@overload -def __list_flows(api_call: str, output_format: Literal["dataframe"]) -> pd.DataFrame: ... - - -def __list_flows( - api_call: str, output_format: Literal["dict", "dataframe"] = "dict" -) -> dict | pd.DataFrame: +def __list_flows(api_call: str) -> pd.DataFrame: """Retrieve information about flows from OpenML API and parse it to a dictionary or a Pandas DataFrame. @@ -402,8 +307,6 @@ def __list_flows( ---------- api_call: str Retrieves the information about flows. - output_format: str in {"dict", "dataframe"} - The output format. 
Returns ------- @@ -431,10 +334,7 @@ def __list_flows( } flows[fid] = flow - if output_format == "dataframe": - flows = pd.DataFrame.from_dict(flows, orient="index") - - return flows + return pd.DataFrame.from_dict(flows, orient="index") def _check_flow_for_server_id(flow: OpenMLFlow) -> None: diff --git a/openml/runs/functions.py b/openml/runs/functions.py index c6af4a481..cff649c1d 100644 --- a/openml/runs/functions.py +++ b/openml/runs/functions.py @@ -7,7 +7,6 @@ from collections import OrderedDict from pathlib import Path from typing import TYPE_CHECKING, Any -from typing_extensions import Literal import numpy as np import pandas as pd @@ -65,7 +64,6 @@ def run_model_on_task( # noqa: PLR0913 add_local_measures: bool = True, # noqa: FBT001, FBT002 upload_flow: bool = False, # noqa: FBT001, FBT002 return_flow: bool = False, # noqa: FBT001, FBT002 - dataset_format: Literal["array", "dataframe"] = "dataframe", n_jobs: int | None = None, ) -> OpenMLRun | tuple[OpenMLRun, OpenMLFlow]: """Run the model on the dataset defined by the task. @@ -94,9 +92,6 @@ def run_model_on_task( # noqa: PLR0913 If False, do not upload the flow to OpenML. return_flow : bool (default=False) If True, returns the OpenMLFlow generated from the model in addition to the OpenMLRun. - dataset_format : str (default='dataframe') - If 'array', the dataset is passed to the model as a numpy array. - If 'dataframe', the dataset is passed to the model as a pandas dataframe. n_jobs : int (default=None) The number of processes/threads to distribute the evaluation asynchronously. If `None` or `1`, then the evaluation is treated as synchronous and processed sequentially. @@ -170,7 +165,6 @@ def get_task_and_type_conversion(_task: int | str | OpenMLTask) -> OpenMLTask: seed=seed, add_local_measures=add_local_measures, upload_flow=upload_flow, - dataset_format=dataset_format, n_jobs=n_jobs, ) if return_flow: @@ -186,7 +180,6 @@ def run_flow_on_task( # noqa: C901, PLR0912, PLR0915, PLR0913 seed: int | None = None, add_local_measures: bool = True, # noqa: FBT001, FBT002 upload_flow: bool = False, # noqa: FBT001, FBT002 - dataset_format: Literal["array", "dataframe"] = "dataframe", n_jobs: int | None = None, ) -> OpenMLRun: """Run the model provided by the flow on the dataset defined by task. @@ -216,9 +209,6 @@ def run_flow_on_task( # noqa: C901, PLR0912, PLR0915, PLR0913 upload_flow : bool (default=False) If True, upload the flow to OpenML if it does not exist yet. If False, do not upload the flow to OpenML. - dataset_format : str (default='dataframe') - If 'array', the dataset is passed to the model as a numpy array. - If 'dataframe', the dataset is passed to the model as a pandas dataframe. n_jobs : int (default=None) The number of processes/threads to distribute the evaluation asynchronously. If `None` or `1`, then the evaluation is treated as synchronous and processed sequentially. 
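(Editorial sketch, not part of the patch.) With ``dataset_format`` removed from
``run_model_on_task`` and ``run_flow_on_task``, callers simply drop the argument; the model now
always receives pandas data, so estimators that cannot handle categorical columns need an explicit
encoding step. The task id and pipeline below are illustrative assumptions, not taken from the
patch:

    from sklearn.compose import make_column_selector, make_column_transformer
    from sklearn.pipeline import make_pipeline
    from sklearn.preprocessing import OneHotEncoder
    from sklearn.tree import DecisionTreeClassifier

    import openml

    task = openml.tasks.get_task(3)  # any supervised classification task
    clf = make_pipeline(
        make_column_transformer(
            (OneHotEncoder(handle_unknown="ignore"), make_column_selector(dtype_include="category")),
            remainder="passthrough",
        ),
        DecisionTreeClassifier(),
    )
    # Previously: run_model_on_task(clf, task, dataset_format="dataframe", upload_flow=False)
    run = openml.runs.run_model_on_task(clf, task, upload_flow=False)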
@@ -306,7 +296,6 @@ def run_flow_on_task( # noqa: C901, PLR0912, PLR0915, PLR0913 task=task, extension=flow.extension, add_local_measures=add_local_measures, - dataset_format=dataset_format, n_jobs=n_jobs, ) @@ -468,13 +457,12 @@ def run_exists(task_id: int, setup_id: int) -> set[int]: return set() -def _run_task_get_arffcontent( # noqa: PLR0915, PLR0912, PLR0913, C901 +def _run_task_get_arffcontent( # noqa: PLR0915, PLR0912, C901 *, model: Any, task: OpenMLTask, extension: Extension, add_local_measures: bool, - dataset_format: Literal["array", "dataframe"], n_jobs: int | None = None, ) -> tuple[ list[list], @@ -495,8 +483,6 @@ def _run_task_get_arffcontent( # noqa: PLR0915, PLR0912, PLR0913, C901 The OpenML extension object. add_local_measures : bool Whether to compute additional local evaluation measures. - dataset_format : str - The format in which to download the dataset. n_jobs : int Number of jobs to run in parallel. If None, use 1 core by default. If -1, use all available cores. @@ -560,7 +546,6 @@ def _run_task_get_arffcontent( # noqa: PLR0915, PLR0912, PLR0913, C901 rep_no=rep_no, sample_no=sample_no, task=task, - dataset_format=dataset_format, configuration=_config, ) for _n_fit, rep_no, fold_no, sample_no in jobs @@ -704,7 +689,6 @@ def _run_task_get_arffcontent_parallel_helper( # noqa: PLR0913 rep_no: int, sample_no: int, task: OpenMLTask, - dataset_format: Literal["array", "dataframe"], configuration: _Config | None = None, ) -> tuple[ np.ndarray, @@ -730,8 +714,6 @@ def _run_task_get_arffcontent_parallel_helper( # noqa: PLR0913 Sample number to be run. task : OpenMLTask The task object from OpenML. - dataset_format : str - The dataset format to be used. configuration : _Config Hyperparameters to configure the model. @@ -755,24 +737,15 @@ def _run_task_get_arffcontent_parallel_helper( # noqa: PLR0913 ) if isinstance(task, OpenMLSupervisedTask): - x, y = task.get_X_and_y(dataset_format=dataset_format) - if isinstance(x, pd.DataFrame): - assert isinstance(y, (pd.Series, pd.DataFrame)) - train_x = x.iloc[train_indices] - train_y = y.iloc[train_indices] - test_x = x.iloc[test_indices] - test_y = y.iloc[test_indices] - else: - # TODO(eddiebergman): Complains spmatrix doesn't support __getitem__ for typing - assert y is not None - train_x = x[train_indices] # type: ignore - train_y = y[train_indices] - test_x = x[test_indices] # type: ignore - test_y = y[test_indices] + x, y = task.get_X_and_y() + assert isinstance(y, (pd.Series, pd.DataFrame)) + train_x = x.iloc[train_indices] + train_y = y.iloc[train_indices] + test_x = x.iloc[test_indices] + test_y = y.iloc[test_indices] elif isinstance(task, OpenMLClusteringTask): - x = task.get_X(dataset_format=dataset_format) - # TODO(eddiebergman): Complains spmatrix doesn't support __getitem__ for typing - train_x = x.iloc[train_indices] if isinstance(x, pd.DataFrame) else x[train_indices] # type: ignore + x = task.get_X() + train_x = x.iloc[train_indices] train_y = None test_x = None test_y = None @@ -793,8 +766,7 @@ def _run_task_get_arffcontent_parallel_helper( # noqa: PLR0913 model=model, task=task, X_train=train_x, - # TODO(eddiebergman): Likely should not be ignored - y_train=train_y, # type: ignore + y_train=train_y, rep_no=rep_no, fold_no=fold_no, X_test=test_x, @@ -1050,8 +1022,6 @@ def _get_cached_run(run_id: int) -> OpenMLRun: raise OpenMLCacheException(f"Run file for run id {run_id} not cached") from e -# TODO(eddiebergman): Could overload, likely too large an annoying to do -# nvm, will be deprecated in 0.15 def list_runs( # 
noqa: PLR0913 offset: int | None = None, size: int | None = None, @@ -1063,11 +1033,10 @@ def list_runs( # noqa: PLR0913 tag: str | None = None, study: int | None = None, display_errors: bool = False, # noqa: FBT001, FBT002 - output_format: Literal["dict", "dataframe"] = "dict", **kwargs: Any, -) -> dict | pd.DataFrame: - """ - List all runs matching all of the given filters. +) -> pd.DataFrame: + """List all runs matching all of the given filters. + (Supports large amount of results) Parameters @@ -1095,31 +1064,13 @@ def list_runs( # noqa: PLR0913 Whether to list runs which have an error (for example a missing prediction file). - output_format: str, optional (default='dict') - The parameter decides the format of the output. - - If 'dict' the output is a dict of dict - - If 'dataframe' the output is a pandas DataFrame - kwargs : dict, optional Legal filter operators: task_type. Returns ------- - dict of dicts, or dataframe + dataframe """ - if output_format not in ["dataframe", "dict"]: - raise ValueError("Invalid output format selected. Only 'dict' or 'dataframe' applicable.") - - # TODO: [0.15] - if output_format == "dict": - msg = ( - "Support for `output_format` of 'dict' will be removed in 0.15 " - "and pandas dataframes will be returned instead. To ensure your code " - "will continue to work, use `output_format`='dataframe'." - ) - warnings.warn(msg, category=FutureWarning, stacklevel=2) - - # TODO(eddiebergman): Do we really need this runtime type validation? if id is not None and (not isinstance(id, list)): raise TypeError("id must be of type list.") if task is not None and (not isinstance(task, list)): @@ -1131,8 +1082,7 @@ def list_runs( # noqa: PLR0913 if uploader is not None and (not isinstance(uploader, list)): raise TypeError("uploader must be of type list.") - return openml.utils._list_all( # type: ignore - list_output_format=output_format, # type: ignore + batches = openml.utils._list_all( listing_call=_list_runs, offset=offset, size=size, @@ -1146,6 +1096,7 @@ def list_runs( # noqa: PLR0913 display_errors=display_errors, **kwargs, ) + return pd.concat(batches, ignore_index=True) def _list_runs( # noqa: PLR0913 @@ -1156,9 +1107,8 @@ def _list_runs( # noqa: PLR0913 uploader: list | None = None, study: int | None = None, display_errors: bool = False, # noqa: FBT002, FBT001 - output_format: Literal["dict", "dataframe"] = "dict", **kwargs: Any, -) -> dict | pd.DataFrame: +) -> pd.DataFrame: """ Perform API call `/run/list/{filters}' ` @@ -1186,18 +1136,12 @@ def _list_runs( # noqa: PLR0913 Whether to list runs which have an error (for example a missing prediction file). - output_format: str, optional (default='dict') - The parameter decides the format of the output. - - If 'dict' the output is a dict of dict - - If 'dataframe' the output is a pandas DataFrame - kwargs : dict, optional Legal filter operators: task_type. Returns ------- - dict, or dataframe - List of found runs. + dataframe of found runs. 
""" api_call = "run/list" if kwargs is not None: @@ -1217,12 +1161,10 @@ def _list_runs( # noqa: PLR0913 api_call += "/study/%d" % study if display_errors: api_call += "/show_errors/true" - return __list_runs(api_call=api_call, output_format=output_format) + return __list_runs(api_call=api_call) -def __list_runs( - api_call: str, output_format: Literal["dict", "dataframe"] = "dict" -) -> dict | pd.DataFrame: +def __list_runs(api_call: str) -> pd.DataFrame: """Helper function to parse API calls which are lists of runs""" xml_string = openml._api_calls._perform_api_call(api_call, "get") runs_dict = xmltodict.parse(xml_string, force_list=("oml:run",)) @@ -1257,11 +1199,7 @@ def __list_runs( } for r in runs_dict["oml:runs"]["oml:run"] } - - if output_format == "dataframe": - runs = pd.DataFrame.from_dict(runs, orient="index") - - return runs + return pd.DataFrame.from_dict(runs, orient="index") def format_prediction( # noqa: PLR0913 diff --git a/openml/setups/functions.py b/openml/setups/functions.py index 0bcd2b4e2..383e2e16e 100644 --- a/openml/setups/functions.py +++ b/openml/setups/functions.py @@ -1,8 +1,8 @@ # License: BSD 3-Clause from __future__ import annotations -import warnings from collections import OrderedDict +from itertools import chain from pathlib import Path from typing import Any, Iterable from typing_extensions import Literal @@ -125,7 +125,7 @@ def get_setup(setup_id: int) -> OpenMLSetup: fh.write(setup_xml) result_dict = xmltodict.parse(setup_xml) - return _create_setup_from_xml(result_dict, output_format="object") # type: ignore + return _create_setup_from_xml(result_dict) def list_setups( # noqa: PLR0913 @@ -134,8 +134,8 @@ def list_setups( # noqa: PLR0913 flow: int | None = None, tag: str | None = None, setup: Iterable[int] | None = None, - output_format: Literal["object", "dict", "dataframe"] = "object", -) -> dict | pd.DataFrame: + output_format: Literal["object", "dataframe"] = "object", +) -> dict[int, OpenMLSetup] | pd.DataFrame: """ List all setups matching all of the given filters. @@ -148,30 +148,19 @@ def list_setups( # noqa: PLR0913 setup : Iterable[int], optional output_format: str, optional (default='object') The parameter decides the format of the output. - - If 'dict' the output is a dict of dict - If 'dataframe' the output is a pandas DataFrame Returns ------- dict or dataframe """ - if output_format not in ["dataframe", "dict", "object"]: + if output_format not in ["dataframe", "object"]: raise ValueError( - "Invalid output format selected. " "Only 'dict', 'object', or 'dataframe' applicable.", + "Invalid output format selected. Only 'object', or 'dataframe' applicable.", ) - # TODO: [0.15] - if output_format == "dict": - msg = ( - "Support for `output_format` of 'dict' will be removed in 0.15. " - "To ensure your code will continue to work, " - "use `output_format`='dataframe' or `output_format`='object'." 
- ) - warnings.warn(msg, category=FutureWarning, stacklevel=2) - batch_size = 1000 # batch size for setups is lower - return openml.utils._list_all( # type: ignore - list_output_format=output_format, # type: ignore + batches = openml.utils._list_all( listing_call=_list_setups, offset=offset, size=size, @@ -180,15 +169,15 @@ def list_setups( # noqa: PLR0913 setup=setup, batch_size=batch_size, ) + flattened = list(chain.from_iterable(batches)) + if output_format == "object": + return {setup.setup_id: setup for setup in flattened} + return pd.DataFrame.from_records([setup._to_dict() for setup in flattened], index="setup_id") -def _list_setups( - setup: Iterable[int] | None = None, - output_format: Literal["dict", "dataframe", "object"] = "object", - **kwargs: Any, -) -> dict[int, dict] | pd.DataFrame | dict[int, OpenMLSetup]: - """ - Perform API call `/setup/list/{filters}` + +def _list_setups(setup: Iterable[int] | None = None, **kwargs: Any) -> list[OpenMLSetup]: + """Perform API call `/setup/list/{filters}` Parameters ---------- @@ -197,18 +186,12 @@ def _list_setups( setup : list(int), optional - output_format: str, optional (default='dict') - The parameter decides the format of the output. - - If 'dict' the output is a dict of dict - - If 'dataframe' the output is a pandas DataFrame - - If 'object' the output is a dict of OpenMLSetup objects - kwargs: dict, optional Legal filter operators: flow, setup, limit, offset, tag. Returns ------- - dict or dataframe or list[OpenMLSetup] + The setups that match the filters, going from id to the OpenMLSetup object. """ api_call = "setup/list" if setup is not None: @@ -217,12 +200,10 @@ def _list_setups( for operator, value in kwargs.items(): api_call += f"/{operator}/{value}" - return __list_setups(api_call=api_call, output_format=output_format) + return __list_setups(api_call=api_call) -def __list_setups( - api_call: str, output_format: Literal["dict", "dataframe", "object"] = "object" -) -> dict[int, dict] | pd.DataFrame | dict[int, OpenMLSetup]: +def __list_setups(api_call: str) -> list[OpenMLSetup]: """Helper function to parse API calls which are lists of setups""" xml_string = openml._api_calls._perform_api_call(api_call, "get") setups_dict = xmltodict.parse(xml_string, force_list=("oml:setup",)) @@ -247,22 +228,10 @@ def __list_setups( assert isinstance(setups_dict["oml:setups"]["oml:setup"], list), type(setups_dict["oml:setups"]) - setups = {} - for setup_ in setups_dict["oml:setups"]["oml:setup"]: - # making it a dict to give it the right format - current = _create_setup_from_xml( - {"oml:setup_parameters": setup_}, - output_format=output_format, - ) - if output_format == "object": - setups[current.setup_id] = current # type: ignore - else: - setups[current["setup_id"]] = current # type: ignore - - if output_format == "dataframe": - setups = pd.DataFrame.from_dict(setups, orient="index") - - return setups + return [ + _create_setup_from_xml({"oml:setup_parameters": setup_}) + for setup_ in setups_dict["oml:setups"]["oml:setup"] + ] def initialize_model(setup_id: int) -> Any: @@ -297,9 +266,7 @@ def initialize_model(setup_id: int) -> Any: return flow.extension.flow_to_model(flow) -def _to_dict( - flow_id: int, openml_parameter_settings: list[OpenMLParameter] | list[dict[str, Any]] -) -> OrderedDict: +def _to_dict(flow_id: int, openml_parameter_settings: list[dict[str, Any]]) -> OrderedDict: """Convert a flow ID and a list of OpenML parameter settings to a dictionary representation that can be serialized to XML. 
@@ -307,7 +274,7 @@ def _to_dict( ---------- flow_id : int ID of the flow. - openml_parameter_settings : List[OpenMLParameter] + openml_parameter_settings : list[dict[str, Any]] A list of OpenML parameter settings. Returns @@ -325,77 +292,41 @@ def _to_dict( return xml -def _create_setup_from_xml( - result_dict: dict, output_format: Literal["dict", "dataframe", "object"] = "object" -) -> OpenMLSetup | dict[str, int | dict[int, Any] | None]: +def _create_setup_from_xml(result_dict: dict) -> OpenMLSetup: """Turns an API xml result into a OpenMLSetup object (or dict)""" - if output_format in ["dataframe", "dict"]: - _output_format: Literal["dict", "object"] = "dict" - elif output_format == "object": - _output_format = "object" - else: - raise ValueError( - f"Invalid output format selected: {output_format}" - "Only 'dict', 'object', or 'dataframe' applicable.", - ) - setup_id = int(result_dict["oml:setup_parameters"]["oml:setup_id"]) flow_id = int(result_dict["oml:setup_parameters"]["oml:flow_id"]) + if "oml:parameter" not in result_dict["oml:setup_parameters"]: - parameters = None + return OpenMLSetup(setup_id, flow_id, parameters=None) + + xml_parameters = result_dict["oml:setup_parameters"]["oml:parameter"] + if isinstance(xml_parameters, dict): + parameters = { + int(xml_parameters["oml:id"]): _create_setup_parameter_from_xml(xml_parameters), + } + elif isinstance(xml_parameters, list): + parameters = { + int(xml_parameter["oml:id"]): _create_setup_parameter_from_xml(xml_parameter) + for xml_parameter in xml_parameters + } else: - parameters = {} - # basically all others - xml_parameters = result_dict["oml:setup_parameters"]["oml:parameter"] - if isinstance(xml_parameters, dict): - oml_id = int(xml_parameters["oml:id"]) - parameters[oml_id] = _create_setup_parameter_from_xml( - result_dict=xml_parameters, - output_format=_output_format, - ) - elif isinstance(xml_parameters, list): - for xml_parameter in xml_parameters: - oml_id = int(xml_parameter["oml:id"]) - parameters[oml_id] = _create_setup_parameter_from_xml( - result_dict=xml_parameter, - output_format=_output_format, - ) - else: - raise ValueError( - "Expected None, list or dict, received " - f"something else: {type(xml_parameters)!s}", - ) - - if _output_format in ["dataframe", "dict"]: - return {"setup_id": setup_id, "flow_id": flow_id, "parameters": parameters} + raise ValueError( + f"Expected None, list or dict, received something else: {type(xml_parameters)!s}", + ) + return OpenMLSetup(setup_id, flow_id, parameters) -def _create_setup_parameter_from_xml( - result_dict: dict[str, str], output_format: Literal["object", "dict"] = "object" -) -> dict[str, int | str] | OpenMLParameter: +def _create_setup_parameter_from_xml(result_dict: dict[str, str]) -> OpenMLParameter: """Create an OpenMLParameter object or a dictionary from an API xml result.""" - if output_format == "object": - return OpenMLParameter( - input_id=int(result_dict["oml:id"]), - flow_id=int(result_dict["oml:flow_id"]), - flow_name=result_dict["oml:flow_name"], - full_name=result_dict["oml:full_name"], - parameter_name=result_dict["oml:parameter_name"], - data_type=result_dict["oml:data_type"], - default_value=result_dict["oml:default_value"], - value=result_dict["oml:value"], - ) - - # FIXME: likely we want to crash here if unknown output_format but not backwards compatible - # output_format == "dict" case, - return { - "input_id": int(result_dict["oml:id"]), - "flow_id": int(result_dict["oml:flow_id"]), - "flow_name": result_dict["oml:flow_name"], - "full_name": 
result_dict["oml:full_name"], - "parameter_name": result_dict["oml:parameter_name"], - "data_type": result_dict["oml:data_type"], - "default_value": result_dict["oml:default_value"], - "value": result_dict["oml:value"], - } + return OpenMLParameter( + input_id=int(result_dict["oml:id"]), + flow_id=int(result_dict["oml:flow_id"]), + flow_name=result_dict["oml:flow_name"], + full_name=result_dict["oml:full_name"], + parameter_name=result_dict["oml:parameter_name"], + data_type=result_dict["oml:data_type"], + default_value=result_dict["oml:default_value"], + value=result_dict["oml:value"], + ) diff --git a/openml/setups/setup.py b/openml/setups/setup.py index e8dc059e7..83b258dd7 100644 --- a/openml/setups/setup.py +++ b/openml/setups/setup.py @@ -7,6 +7,7 @@ import openml.flows +# TODO(eddiebergman): All functionality handled by dataclass class OpenMLSetup: """Setup object (a.k.a. Configuration). @@ -34,6 +35,15 @@ def __init__(self, setup_id: int, flow_id: int, parameters: dict[int, Any] | Non self.flow_id = flow_id self.parameters = parameters + def _to_dict(self) -> dict[str, Any]: + return { + "setup_id": self.setup_id, + "flow_id": self.flow_id, + "parameters": {p.input_id: p._to_dict() for p in self.parameters.values()} + if self.parameters is not None + else None, + } + def __repr__(self) -> str: header = "OpenML Setup" header = "{}\n{}\n".format(header, "=" * len(header)) @@ -57,6 +67,7 @@ def __repr__(self) -> str: return header + body +# TODO(eddiebergman): All functionality handled by dataclass class OpenMLParameter: """Parameter object (used in setup). @@ -102,6 +113,18 @@ def __init__( # noqa: PLR0913 self.default_value = default_value self.value = value + def _to_dict(self) -> dict[str, Any]: + return { + "id": self.id, + "flow_id": self.flow_id, + "flow_name": self.flow_name, + "full_name": self.full_name, + "parameter_name": self.parameter_name, + "data_type": self.data_type, + "default_value": self.default_value, + "value": self.value, + } + def __repr__(self) -> str: header = "OpenML Parameter" header = "{}\n{}\n".format(header, "=" * len(header)) diff --git a/openml/study/functions.py b/openml/study/functions.py index 7fdc6f636..f7db1e0aa 100644 --- a/openml/study/functions.py +++ b/openml/study/functions.py @@ -3,8 +3,7 @@ from __future__ import annotations import warnings -from typing import TYPE_CHECKING, Any, overload -from typing_extensions import Literal +from typing import TYPE_CHECKING, Any import pandas as pd import xmltodict @@ -433,33 +432,12 @@ def detach_from_study(study_id: int, run_ids: list[int]) -> int: return int(result["oml:linked_entities"]) -@overload -def list_suites( - offset: int | None = ..., - size: int | None = ..., - status: str | None = ..., - uploader: list[int] | None = ..., - output_format: Literal["dict"] = "dict", -) -> dict: ... - - -@overload -def list_suites( - offset: int | None = ..., - size: int | None = ..., - status: str | None = ..., - uploader: list[int] | None = ..., - output_format: Literal["dataframe"] = "dataframe", -) -> pd.DataFrame: ... - - def list_suites( offset: int | None = None, size: int | None = None, status: str | None = None, uploader: list[int] | None = None, - output_format: Literal["dict", "dataframe"] = "dict", -) -> dict | pd.DataFrame: +) -> pd.DataFrame: """ Return a list of all suites which are on OpenML. @@ -474,49 +452,20 @@ def list_suites( suites are returned. uploader : list (int), optional Result filter. Will only return suites created by these users. 
- output_format: str, optional (default='dict') - The parameter decides the format of the output. - - If 'dict' the output is a dict of dict - - If 'dataframe' the output is a pandas DataFrame Returns ------- - datasets : dict of dicts, or dataframe - - If output_format='dict' - Every suite is represented by a dictionary containing the following information: - - id - - alias (optional) - - name - - main_entity_type - - status - - creator - - creation_date - - - If output_format='dataframe' - Every row is represented by a dictionary containing the following information: - - id - - alias (optional) - - name - - main_entity_type - - status - - creator - - creation_date + datasets : dataframe + Every row is represented by a dictionary containing the following information: + - id + - alias (optional) + - name + - main_entity_type + - status + - creator + - creation_date """ - if output_format not in ["dataframe", "dict"]: - raise ValueError( - "Invalid output format selected. " "Only 'dict' or 'dataframe' applicable.", - ) - # TODO: [0.15] - if output_format == "dict": - msg = ( - "Support for `output_format` of 'dict' will be removed in 0.15 " - "and pandas dataframes will be returned instead. To ensure your code " - "will continue to work, use `output_format`='dataframe'." - ) - warnings.warn(msg, category=FutureWarning, stacklevel=2) - - return openml.utils._list_all( # type: ignore - list_output_format=output_format, # type: ignore + batches = openml.utils._list_all( listing_call=_list_studies, offset=offset, size=size, @@ -524,28 +473,7 @@ def list_suites( status=status, uploader=uploader, ) - - -@overload -def list_studies( - offset: int | None = ..., - size: int | None = ..., - status: str | None = ..., - uploader: list[str] | None = ..., - benchmark_suite: int | None = ..., - output_format: Literal["dict"] = "dict", -) -> dict: ... - - -@overload -def list_studies( - offset: int | None = ..., - size: int | None = ..., - status: str | None = ..., - uploader: list[str] | None = ..., - benchmark_suite: int | None = ..., - output_format: Literal["dataframe"] = "dataframe", -) -> pd.DataFrame: ... + return pd.concat(batches, ignore_index=True) def list_studies( @@ -554,8 +482,7 @@ def list_studies( status: str | None = None, uploader: list[str] | None = None, benchmark_suite: int | None = None, - output_format: Literal["dict", "dataframe"] = "dict", -) -> dict | pd.DataFrame: +) -> pd.DataFrame: """ Return a list of all studies which are on OpenML. @@ -571,55 +498,23 @@ def list_studies( uploader : list (int), optional Result filter. Will only return studies created by these users. benchmark_suite : int, optional - output_format: str, optional (default='dict') - The parameter decides the format of the output. - - If 'dict' the output is a dict of dict - - If 'dataframe' the output is a pandas DataFrame Returns ------- - datasets : dict of dicts, or dataframe - - If output_format='dict' - Every dataset is represented by a dictionary containing - the following information: - - id - - alias (optional) - - name - - benchmark_suite (optional) - - status - - creator - - creation_date - If qualities are calculated for the dataset, some of - these are also returned. - - - If output_format='dataframe' - Every dataset is represented by a dictionary containing - the following information: - - id - - alias (optional) - - name - - benchmark_suite (optional) - - status - - creator - - creation_date - If qualities are calculated for the dataset, some of - these are also returned. 
+ datasets : dataframe + Every dataset is represented by a dictionary containing + the following information: + - id + - alias (optional) + - name + - benchmark_suite (optional) + - status + - creator + - creation_date + If qualities are calculated for the dataset, some of + these are also returned. """ - if output_format not in ["dataframe", "dict"]: - raise ValueError( - "Invalid output format selected. " "Only 'dict' or 'dataframe' applicable.", - ) - # TODO: [0.15] - if output_format == "dict": - msg = ( - "Support for `output_format` of 'dict' will be removed in 0.15 " - "and pandas dataframes will be returned instead. To ensure your code " - "will continue to work, use `output_format`='dataframe'." - ) - warnings.warn(msg, category=FutureWarning, stacklevel=2) - - return openml.utils._list_all( # type: ignore - list_output_format=output_format, # type: ignore + batches = openml.utils._list_all( listing_call=_list_studies, offset=offset, size=size, @@ -628,54 +523,30 @@ def list_studies( uploader=uploader, benchmark_suite=benchmark_suite, ) + return pd.concat(batches, ignore_index=True) -@overload -def _list_studies(output_format: Literal["dict"] = "dict", **kwargs: Any) -> dict: ... - - -@overload -def _list_studies(output_format: Literal["dataframe"], **kwargs: Any) -> pd.DataFrame: ... - - -def _list_studies( - output_format: Literal["dict", "dataframe"] = "dict", **kwargs: Any -) -> dict | pd.DataFrame: - """ - Perform api call to return a list of studies. +def _list_studies(**kwargs: Any) -> pd.DataFrame: + """Perform api call to return a list of studies. Parameters ---------- - output_format: str, optional (default='dict') - The parameter decides the format of the output. - - If 'dict' the output is a dict of dict - - If 'dataframe' the output is a pandas DataFrame kwargs : dict, optional Legal filter operators (keys in the dict): status, limit, offset, main_entity_type, uploader Returns ------- - studies : dict of dicts + studies : dataframe """ api_call = "study/list" if kwargs is not None: for operator, value in kwargs.items(): api_call += f"/{operator}/{value}" - return __list_studies(api_call=api_call, output_format=output_format) - - -@overload -def __list_studies(api_call: str, output_format: Literal["dict"] = "dict") -> dict: ... - - -@overload -def __list_studies(api_call: str, output_format: Literal["dataframe"]) -> pd.DataFrame: ... + return __list_studies(api_call=api_call) -def __list_studies( - api_call: str, output_format: Literal["dict", "dataframe"] = "dict" -) -> dict | pd.DataFrame: +def __list_studies(api_call: str) -> pd.DataFrame: """Retrieves the list of OpenML studies and returns it in a dictionary or a Pandas DataFrame. @@ -683,13 +554,10 @@ def __list_studies( ---------- api_call : str The API call for retrieving the list of OpenML studies. - output_format : str in {"dict", "dataframe"} - Format of the output, either 'object' for a dictionary - or 'dataframe' for a Pandas DataFrame. Returns ------- - Union[Dict, pd.DataFrame] + pd.DataFrame A dictionary or Pandas DataFrame of OpenML studies, depending on the value of 'output_format'. 
""" @@ -725,6 +593,4 @@ def __list_studies( current_study["id"] = int(current_study["id"]) studies[study_id] = current_study - if output_format == "dataframe": - studies = pd.DataFrame.from_dict(studies, orient="index") - return studies + return pd.DataFrame.from_dict(studies, orient="index") diff --git a/openml/tasks/functions.py b/openml/tasks/functions.py index 54030422d..6877658f7 100644 --- a/openml/tasks/functions.py +++ b/openml/tasks/functions.py @@ -5,7 +5,6 @@ import re import warnings from typing import Any -from typing_extensions import Literal import pandas as pd import xmltodict @@ -131,9 +130,8 @@ def list_tasks( offset: int | None = None, size: int | None = None, tag: str | None = None, - output_format: Literal["dict", "dataframe"] = "dict", **kwargs: Any, -) -> dict | pd.DataFrame: +) -> pd.DataFrame: """ Return a number of tasks having the given tag and task_type @@ -150,10 +148,6 @@ def list_tasks( the maximum number of tasks to show tag : str, optional the tag to include - output_format: str, optional (default='dict') - The parameter decides the format of the output. - - If 'dict' the output is a dict of dict - - If 'dataframe' the output is a pandas DataFrame kwargs: dict, optional Legal filter operators: data_tag, status, data_id, data_name, number_instances, number_features, @@ -161,31 +155,13 @@ def list_tasks( Returns ------- - dict - All tasks having the given task_type and the give tag. Every task is - represented by a dictionary containing the following information: - task id, dataset id, task_type and status. If qualities are calculated - for the associated dataset, some of these are also returned. dataframe All tasks having the given task_type and the give tag. Every task is represented by a row in the data frame containing the following information as columns: task id, dataset id, task_type and status. If qualities are calculated for the associated dataset, some of these are also returned. """ - if output_format not in ["dataframe", "dict"]: - raise ValueError( - "Invalid output format selected. " "Only 'dict' or 'dataframe' applicable.", - ) - # TODO: [0.15] - if output_format == "dict": - msg = ( - "Support for `output_format` of 'dict' will be removed in 0.15 " - "and pandas dataframes will be returned instead. To ensure your code " - "will continue to work, use `output_format`='dataframe'." - ) - warnings.warn(msg, category=FutureWarning, stacklevel=2) - return openml.utils._list_all( # type: ignore - list_output_format=output_format, # type: ignore + batches = openml.utils._list_all( listing_call=_list_tasks, task_type=task_type, offset=offset, @@ -193,13 +169,10 @@ def list_tasks( tag=tag, **kwargs, ) + return pd.concat(batches, ignore_index=True) -def _list_tasks( - task_type: TaskType | None = None, - output_format: Literal["dict", "dataframe"] = "dict", - **kwargs: Any, -) -> dict | pd.DataFrame: +def _list_tasks(task_type: TaskType | None = None, **kwargs: Any) -> pd.DataFrame: """ Perform the api call to return a number of tasks having the given filters. @@ -210,10 +183,6 @@ def _list_tasks( type when used as a filter in list tasks call. task_type : TaskType, optional Refers to the type of task. - output_format: str, optional (default='dict') - The parameter decides the format of the output. 
- - If 'dict' the output is a dict of dict - - If 'dataframe' the output is a pandas DataFrame kwargs: dict, optional Legal filter operators: tag, task_id (list), data_tag, status, limit, offset, data_id, data_name, number_instances, number_features, @@ -221,7 +190,7 @@ def _list_tasks( Returns ------- - dict or dataframe + dataframe """ api_call = "task/list" if task_type is not None: @@ -232,27 +201,20 @@ def _list_tasks( value = ",".join([str(int(i)) for i in value]) # noqa: PLW2901 api_call += f"/{operator}/{value}" - return __list_tasks(api_call=api_call, output_format=output_format) + return __list_tasks(api_call=api_call) -# TODO(eddiebergman): overload todefine type returned -def __list_tasks( # noqa: PLR0912, C901 - api_call: str, - output_format: Literal["dict", "dataframe"] = "dict", -) -> dict | pd.DataFrame: - """Returns a dictionary or a Pandas DataFrame with information about OpenML tasks. +def __list_tasks(api_call: str) -> pd.DataFrame: # noqa: C901, PLR0912 + """Returns a Pandas DataFrame with information about OpenML tasks. Parameters ---------- api_call : str The API call specifying which tasks to return. - output_format : str in {"dict", "dataframe"} - Output format for the returned object. Returns ------- - Union[Dict, pd.DataFrame] - A dictionary or a Pandas DataFrame with information about OpenML tasks. + A Pandas DataFrame with information about OpenML tasks. Raises ------ @@ -339,13 +301,9 @@ def __list_tasks( # noqa: PLR0912, C901 else: warnings.warn(f"Could not find key {e} in {task_}!", RuntimeWarning, stacklevel=2) - if output_format == "dataframe": - tasks = pd.DataFrame.from_dict(tasks, orient="index") - - return tasks + return pd.DataFrame.from_dict(tasks, orient="index") -# TODO(eddiebergman): Maybe since this isn't public api, we can make it keyword only? def get_tasks( task_ids: list[int], download_data: bool | None = None, @@ -590,7 +548,7 @@ def create_task( task_type_id=task_type, task_type="None", # TODO: refactor to get task type string from ID. data_set_id=dataset_id, - target_name=target_name, + target_name=target_name, # type: ignore estimation_procedure_id=estimation_procedure_id, evaluation_measure=evaluation_measure, **kwargs, diff --git a/openml/tasks/task.py b/openml/tasks/task.py index e7d19bdce..395b52482 100644 --- a/openml/tasks/task.py +++ b/openml/tasks/task.py @@ -8,7 +8,7 @@ from enum import Enum from pathlib import Path from typing import TYPE_CHECKING, Any, Sequence -from typing_extensions import Literal, TypedDict, overload +from typing_extensions import TypedDict import openml._api_calls import openml.config @@ -21,7 +21,6 @@ if TYPE_CHECKING: import numpy as np import pandas as pd - import scipy.sparse # TODO(eddiebergman): Should use `auto()` but might be too late if these numbers are used @@ -277,52 +276,14 @@ def __init__( # noqa: PLR0913 self.target_name = target_name - @overload - def get_X_and_y( - self, dataset_format: Literal["array"] = "array" - ) -> tuple[ - np.ndarray | scipy.sparse.spmatrix, - np.ndarray | None, - ]: ... - - @overload - def get_X_and_y( - self, dataset_format: Literal["dataframe"] - ) -> tuple[ - pd.DataFrame, - pd.Series | pd.DataFrame | None, - ]: ... - - # TODO(eddiebergman): Do all OpenMLSupervisedTask have a `y`? 
- def get_X_and_y( - self, dataset_format: Literal["dataframe", "array"] = "array" - ) -> tuple[ - np.ndarray | pd.DataFrame | scipy.sparse.spmatrix, - np.ndarray | pd.Series | pd.DataFrame | None, - ]: + def get_X_and_y(self) -> tuple[pd.DataFrame, pd.Series | pd.DataFrame | None]: """Get data associated with the current task. - Parameters - ---------- - dataset_format : str - Data structure of the returned data. See :meth:`openml.datasets.OpenMLDataset.get_data` - for possible options. - Returns ------- tuple - X and y """ - # TODO: [0.15] - if dataset_format == "array": - warnings.warn( - "Support for `dataset_format='array'` will be removed in 0.15," - "start using `dataset_format='dataframe' to ensure your code " - "will continue to work. You can use the dataframe's `to_numpy` " - "function to continue using numpy arrays.", - category=FutureWarning, - stacklevel=2, - ) dataset = self.get_dataset() if self.task_type_id not in ( TaskType.SUPERVISED_CLASSIFICATION, @@ -331,10 +292,7 @@ def get_X_and_y( ): raise NotImplementedError(self.task_type) - X, y, _, _ = dataset.get_data( - dataset_format=dataset_format, - target=self.target_name, - ) + X, y, _, _ = dataset.get_data(target=self.target_name) return X, y def _to_dict(self) -> dict[str, dict]: @@ -536,34 +494,15 @@ def __init__( # noqa: PLR0913 self.target_name = target_name - @overload - def get_X( - self, - dataset_format: Literal["array"] = "array", - ) -> np.ndarray | scipy.sparse.spmatrix: ... - - @overload - def get_X(self, dataset_format: Literal["dataframe"]) -> pd.DataFrame: ... - - def get_X( - self, - dataset_format: Literal["array", "dataframe"] = "array", - ) -> np.ndarray | pd.DataFrame | scipy.sparse.spmatrix: + def get_X(self) -> pd.DataFrame: """Get data associated with the current task. - Parameters - ---------- - dataset_format : str - Data structure of the returned data. See :meth:`openml.datasets.OpenMLDataset.get_data` - for possible options. - Returns ------- - tuple - X and y - + The X data as a dataframe """ dataset = self.get_dataset() - data, *_ = dataset.get_data(dataset_format=dataset_format, target=None) + data, *_ = dataset.get_data(target=None) return data def _to_dict(self) -> dict[str, dict[str, int | str | list[dict[str, Any]]]]: diff --git a/openml/utils.py b/openml/utils.py index 66c4df800..aecf860d8 100644 --- a/openml/utils.py +++ b/openml/utils.py @@ -6,11 +6,10 @@ import warnings from functools import wraps from pathlib import Path -from typing import TYPE_CHECKING, Any, Callable, Mapping, TypeVar, overload +from typing import TYPE_CHECKING, Any, Callable, Mapping, Sized, TypeVar, overload from typing_extensions import Literal, ParamSpec import numpy as np -import pandas as pd import xmltodict from minio.helpers import ProgressType from tqdm import tqdm @@ -27,6 +26,7 @@ P = ParamSpec("P") R = TypeVar("R") + _SizedT = TypeVar("_SizedT", bound=Sized) @overload @@ -237,39 +237,11 @@ def _delete_entity(entity_type: str, entity_id: int) -> bool: raise -@overload -def _list_all( - listing_call: Callable[P, Any], - list_output_format: Literal["dict"] = ..., - *args: P.args, - **filters: P.kwargs, -) -> dict: ... - - -@overload -def _list_all( - listing_call: Callable[P, Any], - list_output_format: Literal["object"], - *args: P.args, - **filters: P.kwargs, -) -> dict: ... - - -@overload -def _list_all( - listing_call: Callable[P, Any], - list_output_format: Literal["dataframe"], - *args: P.args, - **filters: P.kwargs, -) -> pd.DataFrame: ... 
- - -def _list_all( # noqa: C901, PLR0912 - listing_call: Callable[P, Any], - list_output_format: Literal["dict", "dataframe", "object"] = "dict", +def _list_all( # noqa: C901 + listing_call: Callable[P, _SizedT], *args: P.args, **filters: P.kwargs, -) -> dict | pd.DataFrame: +) -> list[_SizedT]: """Helper to handle paged listing requests. Example usage: @@ -280,11 +252,6 @@ def _list_all( # noqa: C901, PLR0912 ---------- listing_call : callable Call listing, e.g. list_evaluations. - list_output_format : str, optional (default='dict') - The parameter decides the format of the output. - - If 'dict' the output is a dict of dict - - If 'dataframe' the output is a pandas DataFrame - - If 'object' the output is a dict of objects (only for some `listing_call`) *args : Variable length argument list Any required arguments for the listing call. **filters : Arbitrary keyword arguments @@ -294,12 +261,12 @@ def _list_all( # noqa: C901, PLR0912 Returns ------- - dict or dataframe + List of types returned from the listing call """ # eliminate filters that have a None value active_filters = {key: value for key, value in filters.items() if value is not None} page = 0 - result = pd.DataFrame() if list_output_format == "dataframe" else {} + results: list[_SizedT] = [] # Default batch size per paging. # This one can be set in filters (batch_size), but should not be @@ -313,6 +280,8 @@ def _list_all( # noqa: C901, PLR0912 if (LIMIT is not None) and (not isinstance(LIMIT, int)) and (not np.isinf(LIMIT)): raise ValueError(f"'limit' should be an integer or inf but got {LIMIT}") + # If our batch size is larger than the limit, we should only + # request one batch of size of LIMIT if LIMIT is not None and BATCH_SIZE_ORIG > LIMIT: BATCH_SIZE_ORIG = LIMIT @@ -326,24 +295,17 @@ def _list_all( # noqa: C901, PLR0912 current_offset = offset + BATCH_SIZE_ORIG * page new_batch = listing_call( *args, - output_format=list_output_format, # type: ignore **{**active_filters, "limit": batch_size, "offset": current_offset}, # type: ignore ) except openml.exceptions.OpenMLServerNoResult: - # we want to return an empty dict in this case # NOTE: This above statement may not actually happen, but we could just return here # to enforce it... break - if list_output_format == "dataframe": - if len(result) == 0: - result = new_batch - else: - result = pd.concat([result, new_batch], ignore_index=True) - else: - # For output_format = 'dict' (or catch all) - result.update(new_batch) + results.append(new_batch) + # If the batch is less than our requested batch_size, that's the last batch + # and we can bail out. 
if len(new_batch) < batch_size: break @@ -352,14 +314,15 @@ def _list_all( # noqa: C901, PLR0912 # check if the number of required results has been achieved # always do a 'bigger than' check, # in case of bugs to prevent infinite loops - if len(result) >= LIMIT: + n_recieved = sum(len(result) for result in results) + if n_recieved >= LIMIT: break # check if there are enough results to fulfill a batch - if LIMIT - len(result) < BATCH_SIZE_ORIG: - batch_size = LIMIT - len(result) + if LIMIT - n_recieved < BATCH_SIZE_ORIG: + batch_size = LIMIT - n_recieved - return result + return results def _get_cache_dir_for_key(key: str) -> Path: diff --git a/tests/test_datasets/test_dataset.py b/tests/test_datasets/test_dataset.py index 4598b8985..102429b41 100644 --- a/tests/test_datasets/test_dataset.py +++ b/tests/test_datasets/test_dataset.py @@ -95,25 +95,8 @@ def test__unpack_categories_with_nan_likes(self): self.assertListEqual(list(clean_series.values), expected_values) self.assertListEqual(list(clean_series.cat.categories.values), list("ab")) - def test_get_data_array(self): - # Basic usage - rval, _, categorical, attribute_names = self.dataset.get_data(dataset_format="array") - assert isinstance(rval, np.ndarray) - assert rval.dtype == np.float32 - assert rval.shape == (898, 39) - assert len(categorical) == 39 - assert all(isinstance(cat, bool) for cat in categorical) - assert len(attribute_names) == 39 - assert all(isinstance(att, str) for att in attribute_names) - assert _ is None - - # check that an error is raised when the dataset contains string - err_msg = "PyOpenML cannot handle string when returning numpy arrays" - with pytest.raises(PyOpenMLError, match=err_msg): - self.titanic.get_data(dataset_format="array") - def test_get_data_pandas(self): - data, _, _, _ = self.titanic.get_data(dataset_format="dataframe") + data, _, _, _ = self.titanic.get_data() assert isinstance(data, pd.DataFrame) assert data.shape[1] == len(self.titanic.features) assert data.shape[0] == 1309 @@ -137,7 +120,6 @@ def test_get_data_pandas(self): assert data[col_name].dtype.name == col_dtype[col_name] X, y, _, _ = self.titanic.get_data( - dataset_format="dataframe", target=self.titanic.default_target_attribute, ) assert isinstance(X, pd.DataFrame) @@ -160,12 +142,6 @@ def test_get_data_boolean_pandas(self): assert data["c"].dtype.name == "category" assert set(data["c"].cat.categories) == {True, False} - def test_get_data_no_str_data_for_nparrays(self): - # check that an error is raised when the dataset contains string - err_msg = "PyOpenML cannot handle string when returning numpy arrays" - with pytest.raises(PyOpenMLError, match=err_msg): - self.titanic.get_data(dataset_format="array") - def _check_expected_type(self, dtype, is_cat, col): if is_cat: expected_type = "category" @@ -193,16 +169,6 @@ def test_get_data_with_rowid(self): assert rval.shape == (898, 38) assert len(categorical) == 38 - def test_get_data_with_target_array(self): - X, y, _, attribute_names = self.dataset.get_data(dataset_format="array", target="class") - assert isinstance(X, np.ndarray) - assert X.dtype == np.float32 - assert X.shape == (898, 38) - assert y.dtype in [np.int32, np.int64] - assert y.shape == (898,) - assert len(attribute_names) == 38 - assert "class" not in attribute_names - @pytest.mark.skip("https://github.com/openml/openml-python/issues/1157") def test_get_data_with_target_pandas(self): X, y, categorical, attribute_names = self.dataset.get_data(target="class") @@ -247,13 +213,8 @@ def 
test_get_data_with_nonexisting_class(self): # This class is using the anneal dataset with labels [1, 2, 3, 4, 5, 'U']. However, # label 4 does not exist and we test that the features 5 and 'U' are correctly mapped to # indices 4 and 5, and that nothing is mapped to index 3. - _, y, _, _ = self.dataset.get_data("class", dataset_format="dataframe") + _, y, _, _ = self.dataset.get_data("class") assert list(y.dtype.categories) == ["1", "2", "3", "4", "5", "U"] - _, y, _, _ = self.dataset.get_data("class", dataset_format="array") - assert np.min(y) == 0 - assert np.max(y) == 5 - # Check that no label is mapped to 3, since it is reserved for label '4'. - assert np.sum(y == 3) == 0 def test_get_data_corrupt_pickle(self): # Lazy loaded dataset, populate cache. @@ -345,21 +306,20 @@ def test_get_feature_with_ontology_data_id_11(self): def test_add_remove_ontology_to_dataset(self): did = 1 feature_index = 1 - ontology = 'https://www.openml.org/unittest/' + str(time()) + ontology = "https://www.openml.org/unittest/" + str(time()) openml.datasets.functions.data_feature_add_ontology(did, feature_index, ontology) openml.datasets.functions.data_feature_remove_ontology(did, feature_index, ontology) def test_add_same_ontology_multiple_features(self): did = 1 - ontology = 'https://www.openml.org/unittest/' + str(time()) + ontology = "https://www.openml.org/unittest/" + str(time()) for i in range(3): openml.datasets.functions.data_feature_add_ontology(did, i, ontology) - def test_add_illegal_long_ontology(self): did = 1 - ontology = 'http://www.google.com/' + ('a' * 257) + ontology = "http://www.google.com/" + ("a" * 257) try: openml.datasets.functions.data_feature_add_ontology(did, 1, ontology) assert False @@ -368,13 +328,14 @@ def test_add_illegal_long_ontology(self): def test_add_illegal_url_ontology(self): did = 1 - ontology = 'not_a_url' + str(time()) + ontology = "not_a_url" + str(time()) try: openml.datasets.functions.data_feature_add_ontology(did, 1, ontology) assert False except openml.exceptions.OpenMLServerException as e: assert e.code == 1106 + @pytest.mark.production() class OpenMLDatasetTestSparse(TestBase): _multiprocess_can_split_ = True @@ -385,28 +346,8 @@ def setUp(self): self.sparse_dataset = openml.datasets.get_dataset(4136, download_data=False) - def test_get_sparse_dataset_array_with_target(self): - X, y, _, attribute_names = self.sparse_dataset.get_data( - dataset_format="array", - target="class", - ) - - assert sparse.issparse(X) - assert X.dtype == np.float32 - assert X.shape == (600, 20000) - - assert isinstance(y, np.ndarray) - assert y.dtype in [np.int32, np.int64] - assert y.shape == (600,) - - assert len(attribute_names) == 20000 - assert "class" not in attribute_names - def test_get_sparse_dataset_dataframe_with_target(self): - X, y, _, attribute_names = self.sparse_dataset.get_data( - dataset_format="dataframe", - target="class", - ) + X, y, _, attribute_names = self.sparse_dataset.get_data(target="class") assert isinstance(X, pd.DataFrame) assert isinstance(X.dtypes[0], pd.SparseDtype) assert X.shape == (600, 20000) @@ -418,18 +359,6 @@ def test_get_sparse_dataset_dataframe_with_target(self): assert len(attribute_names) == 20000 assert "class" not in attribute_names - def test_get_sparse_dataset_array(self): - rval, _, categorical, attribute_names = self.sparse_dataset.get_data(dataset_format="array") - assert sparse.issparse(rval) - assert rval.dtype == np.float32 - assert rval.shape == (600, 20001) - - assert len(categorical) == 20001 - assert all(isinstance(cat, bool) 
for cat in categorical) - - assert len(attribute_names) == 20001 - assert all(isinstance(att, str) for att in attribute_names) - def test_get_sparse_dataset_dataframe(self): rval, *_ = self.sparse_dataset.get_data() assert isinstance(rval, pd.DataFrame) @@ -439,52 +368,12 @@ def test_get_sparse_dataset_dataframe(self): ) assert rval.shape == (600, 20001) - def test_get_sparse_dataset_with_rowid(self): - self.sparse_dataset.row_id_attribute = ["V256"] - rval, _, categorical, _ = self.sparse_dataset.get_data( - dataset_format="array", - include_row_id=True, - ) - assert sparse.issparse(rval) - assert rval.dtype == np.float32 - assert rval.shape == (600, 20001) - assert len(categorical) == 20001 - - rval, _, categorical, _ = self.sparse_dataset.get_data( - dataset_format="array", - include_row_id=False, - ) - assert sparse.issparse(rval) - assert rval.dtype == np.float32 - assert rval.shape == (600, 20000) - assert len(categorical) == 20000 - - def test_get_sparse_dataset_with_ignore_attributes(self): - self.sparse_dataset.ignore_attribute = ["V256"] - rval, _, categorical, _ = self.sparse_dataset.get_data( - dataset_format="array", - include_ignore_attribute=True, - ) - assert sparse.issparse(rval) - assert rval.dtype == np.float32 - assert rval.shape == (600, 20001) - - assert len(categorical) == 20001 - rval, _, categorical, _ = self.sparse_dataset.get_data( - dataset_format="array", - include_ignore_attribute=False, - ) - assert sparse.issparse(rval) - assert rval.dtype == np.float32 - assert rval.shape == (600, 20000) - assert len(categorical) == 20000 - def test_get_sparse_dataset_rowid_and_ignore_and_target(self): # TODO: re-add row_id and ignore attributes self.sparse_dataset.ignore_attribute = ["V256"] self.sparse_dataset.row_id_attribute = ["V512"] + # TODO(eddiebergman): Will break from dataset_format removal X, y, categorical, _ = self.sparse_dataset.get_data( - dataset_format="array", target="class", include_row_id=False, include_ignore_attribute=False, diff --git a/tests/test_datasets/test_dataset_functions.py b/tests/test_datasets/test_dataset_functions.py index a15100070..3995f1b98 100644 --- a/tests/test_datasets/test_dataset_functions.py +++ b/tests/test_datasets/test_dataset_functions.py @@ -308,7 +308,8 @@ def ensure_absence_of_real_data(): def test_get_dataset_sparse(self): dataset = openml.datasets.get_dataset(102) - X, *_ = dataset.get_data(dataset_format="array") + # TODO(eddiebergman): Will break from dataset_format removal + X, *_ = dataset.get_data() assert isinstance(X, scipy.sparse.csr_matrix) def test_download_rowid(self): diff --git a/tests/test_extensions/test_sklearn_extension/test_sklearn_extension.py b/tests/test_extensions/test_sklearn_extension/test_sklearn_extension.py index e181aaa15..242f6793c 100644 --- a/tests/test_extensions/test_sklearn_extension/test_sklearn_extension.py +++ b/tests/test_extensions/test_sklearn_extension/test_sklearn_extension.py @@ -261,7 +261,7 @@ def test_serialize_model(self): ("min_samples_split", "2"), ("min_weight_fraction_leaf", "0.0"), ("presort", presort_val), - ('monotonic_cst', 'null'), + ("monotonic_cst", "null"), ("random_state", "null"), ("splitter", '"best"'), ), @@ -331,21 +331,23 @@ def test_serialize_model_clustering(self): n_init = '"auto"' algorithm = '"auto"' if sklearn_version < Version("1.1") else '"lloyd"' - fixture_parameters = OrderedDict([ - ("algorithm", algorithm), - ("copy_x", "true"), - ("init", '"k-means++"'), - ("max_iter", "300"), - ("n_clusters", "8"), - ("n_init", n_init), - ("n_jobs", 
n_jobs_val), - ("precompute_distances", precomp_val), - ("random_state", "null"), - ("tol", "0.0001"), - ("verbose", "0"), - ]) - - if sklearn_version >= Version("1.0" ): + fixture_parameters = OrderedDict( + [ + ("algorithm", algorithm), + ("copy_x", "true"), + ("init", '"k-means++"'), + ("max_iter", "300"), + ("n_clusters", "8"), + ("n_init", n_init), + ("n_jobs", n_jobs_val), + ("precompute_distances", precomp_val), + ("random_state", "null"), + ("tol", "0.0001"), + ("verbose", "0"), + ] + ) + + if sklearn_version >= Version("1.0"): fixture_parameters.pop("n_jobs") fixture_parameters.pop("precompute_distances") @@ -369,7 +371,9 @@ def test_serialize_model_clustering(self): @pytest.mark.sklearn() def test_serialize_model_with_subcomponent(self): - estimator_name = "base_estimator" if Version(sklearn.__version__) < Version("1.4") else "estimator" + estimator_name = ( + "base_estimator" if Version(sklearn.__version__) < Version("1.4") else "estimator" + ) estimator_param = {estimator_name: sklearn.tree.DecisionTreeClassifier()} model = sklearn.ensemble.AdaBoostClassifier( n_estimators=100, @@ -428,8 +432,7 @@ def test_serialize_model_with_subcomponent(self): serialization.components[estimator_name].class_name == fixture_subcomponent_class_name ) assert ( - serialization.components[estimator_name].description - == fixture_subcomponent_description + serialization.components[estimator_name].description == fixture_subcomponent_description ) self.assertDictEqual(structure, fixture_structure) @@ -702,7 +705,9 @@ def test_serialize_column_transformer_pipeline(self): reason="Pipeline processing behaviour updated", ) def test_serialize_feature_union(self): - sparse_parameter = "sparse" if Version(sklearn.__version__) < Version("1.4") else "sparse_output" + sparse_parameter = ( + "sparse" if Version(sklearn.__version__) < Version("1.4") else "sparse_output" + ) ohe_params = {sparse_parameter: False} if Version(sklearn.__version__) >= Version("0.20"): ohe_params["categories"] = "auto" @@ -719,7 +724,9 @@ def test_serialize_feature_union(self): ) structure = serialization.get_structure("name") # OneHotEncoder was moved to _encoders module in 0.20 - module_name_encoder = "_encoders" if Version(sklearn.__version__) >= Version("0.20") else "data" + module_name_encoder = ( + "_encoders" if Version(sklearn.__version__) >= Version("0.20") else "data" + ) scaler_name = "data" if Version(sklearn.__version__) < Version("0.22") else "_data" fixture_name = ( "sklearn.pipeline.FeatureUnion(" @@ -765,7 +772,9 @@ def test_serialize_feature_union(self): @pytest.mark.sklearn() def test_serialize_feature_union_switched_names(self): - ohe_params = {"categories": "auto"} if Version(sklearn.__version__) >= Version("0.20") else {} + ohe_params = ( + {"categories": "auto"} if Version(sklearn.__version__) >= Version("0.20") else {} + ) ohe = sklearn.preprocessing.OneHotEncoder(**ohe_params) scaler = sklearn.preprocessing.StandardScaler() fu1 = sklearn.pipeline.FeatureUnion(transformer_list=[("ohe", ohe), ("scaler", scaler)]) @@ -787,7 +796,9 @@ def test_serialize_feature_union_switched_names(self): ) # OneHotEncoder was moved to _encoders module in 0.20 - module_name_encoder = "_encoders" if Version(sklearn.__version__) >= Version("0.20") else "data" + module_name_encoder = ( + "_encoders" if Version(sklearn.__version__) >= Version("0.20") else "data" + ) scaler_name = "data" if Version(sklearn.__version__) < Version("0.22") else "_data" assert ( fu1_serialization.name == "sklearn.pipeline.FeatureUnion(" @@ -836,7 +847,9 @@ 
def test_serialize_complex_flow(self): ) structure = serialized.get_structure("name") # OneHotEncoder was moved to _encoders module in 0.20 - module_name_encoder = "_encoders" if Version(sklearn.__version__) >= Version("0.20") else "data" + module_name_encoder = ( + "_encoders" if Version(sklearn.__version__) >= Version("0.20") else "data" + ) ohe_name = "sklearn.preprocessing.%s.OneHotEncoder" % module_name_encoder scaler_name = "sklearn.preprocessing.{}.StandardScaler".format( "data" if Version(sklearn.__version__) < Version("0.22") else "_data", @@ -1294,7 +1307,9 @@ def test_paralizable_check(self): # using this param distribution should not raise an exception legal_param_dist = {"n_estimators": [2, 3, 4]} - estimator_name = "base_estimator" if Version(sklearn.__version__) < Version("1.4") else "estimator" + estimator_name = ( + "base_estimator" if Version(sklearn.__version__) < Version("1.4") else "estimator" + ) legal_models = [ sklearn.ensemble.RandomForestClassifier(), sklearn.ensemble.RandomForestClassifier(n_jobs=5), @@ -1506,7 +1521,9 @@ def test_deserialize_complex_with_defaults(self): pipe_adjusted = sklearn.clone(pipe_orig) impute_strategy = "median" if Version(sklearn.__version__) < Version("0.23") else "mean" sparse = Version(sklearn.__version__) >= Version("0.23") - sparse_parameter = "sparse" if Version(sklearn.__version__) < Version("1.4") else "sparse_output" + sparse_parameter = ( + "sparse" if Version(sklearn.__version__) < Version("1.4") else "sparse_output" + ) estimator_name = ( "base_estimator" if Version(sklearn.__version__) < Version("1.2") else "estimator" ) @@ -1532,7 +1549,9 @@ def test_deserialize_complex_with_defaults(self): @pytest.mark.sklearn() def test_openml_param_name_to_sklearn(self): scaler = sklearn.preprocessing.StandardScaler(with_mean=False) - estimator_name = "base_estimator" if Version(sklearn.__version__) < Version("1.4") else "estimator" + estimator_name = ( + "base_estimator" if Version(sklearn.__version__) < Version("1.4") else "estimator" + ) boosting = sklearn.ensemble.AdaBoostClassifier( **{estimator_name: sklearn.tree.DecisionTreeClassifier()}, ) @@ -1569,17 +1588,21 @@ def test_openml_param_name_to_sklearn(self): def test_obtain_parameter_values_flow_not_from_server(self): model = sklearn.linear_model.LogisticRegression(solver="lbfgs") flow = self.extension.model_to_flow(model) - logistic_name = "logistic" if Version(sklearn.__version__) < Version("0.22") else "_logistic" + logistic_name = ( + "logistic" if Version(sklearn.__version__) < Version("0.22") else "_logistic" + ) msg = f"Flow sklearn.linear_model.{logistic_name}.LogisticRegression has no flow_id!" 
with pytest.raises(ValueError, match=msg): self.extension.obtain_parameter_values(flow) - estimator_name = "base_estimator" if Version(sklearn.__version__) < Version("1.4") else "estimator" + estimator_name = ( + "base_estimator" if Version(sklearn.__version__) < Version("1.4") else "estimator" + ) model = sklearn.ensemble.AdaBoostClassifier( **{ estimator_name: sklearn.linear_model.LogisticRegression( - solver="lbfgs", + solver="lbfgs", ), } ) @@ -1650,7 +1673,7 @@ def test_run_model_on_task(self): ("dummy", sklearn.dummy.DummyClassifier()), ], ) - openml.runs.run_model_on_task(pipe, task, dataset_format="array") + openml.runs.run_model_on_task(pipe, task) @pytest.mark.sklearn() def test_seed_model(self): @@ -1751,7 +1774,9 @@ def test_run_model_on_fold_classification_1_array(self): assert np.any(y_hat_proba.iloc[:, i].to_numpy() != np.zeros(y_test.shape)) # check user defined measures - fold_evaluations: dict[str, dict[int, dict[int, float]]] = collections.defaultdict(lambda: collections.defaultdict(dict)) + fold_evaluations: dict[str, dict[int, dict[int, float]]] = collections.defaultdict( + lambda: collections.defaultdict(dict) + ) for measure in user_defined_measures: fold_evaluations[measure][0][0] = user_defined_measures[measure] @@ -1778,7 +1803,7 @@ def test_run_model_on_fold_classification_1_dataframe(self): task = openml.tasks.get_task(1) # anneal; crossvalidation # diff test_run_model_on_fold_classification_1_array() - X, y = task.get_X_and_y(dataset_format="dataframe") + X, y = task.get_X_and_y() train_indices, test_indices = task.get_train_test_split_indices(repeat=0, fold=0, sample=0) X_train = X.iloc[train_indices] y_train = y.iloc[train_indices] @@ -1786,7 +1811,9 @@ def test_run_model_on_fold_classification_1_dataframe(self): y_test = y.iloc[test_indices] # Helper functions to return required columns for ColumnTransformer - sparse = {"sparse" if Version(sklearn.__version__) < Version("1.4") else "sparse_output": False} + sparse = { + "sparse" if Version(sklearn.__version__) < Version("1.4") else "sparse_output": False + } cat_imp = make_pipeline( SimpleImputer(strategy="most_frequent"), OneHotEncoder(handle_unknown="ignore", **sparse), @@ -1825,7 +1852,9 @@ def test_run_model_on_fold_classification_1_dataframe(self): assert np.any(y_hat_proba.iloc[:, i].to_numpy() != np.zeros(y_test.shape)) # check user defined measures - fold_evaluations: dict[str, dict[int, dict[int, float]]] = collections.defaultdict(lambda: collections.defaultdict(dict)) + fold_evaluations: dict[str, dict[int, dict[int, float]]] = collections.defaultdict( + lambda: collections.defaultdict(dict) + ) for measure in user_defined_measures: fold_evaluations[measure][0][0] = user_defined_measures[measure] @@ -1878,7 +1907,9 @@ def test_run_model_on_fold_classification_2(self): assert np.any(y_hat_proba.to_numpy()[:, i] != np.zeros(y_test.shape)) # check user defined measures - fold_evaluations: dict[str, dict[int, dict[int, float]]] = collections.defaultdict(lambda: collections.defaultdict(dict)) + fold_evaluations: dict[str, dict[int, dict[int, float]]] = collections.defaultdict( + lambda: collections.defaultdict(dict) + ) for measure in user_defined_measures: fold_evaluations[measure][0][0] = user_defined_measures[measure] @@ -2001,7 +2032,9 @@ def test_run_model_on_fold_regression(self): assert y_hat_proba is None # check user defined measures - fold_evaluations: dict[str, dict[int, dict[int, float]]] = collections.defaultdict(lambda: collections.defaultdict(dict)) + fold_evaluations: dict[str, dict[int, 
dict[int, float]]] = collections.defaultdict( + lambda: collections.defaultdict(dict) + ) for measure in user_defined_measures: fold_evaluations[measure][0][0] = user_defined_measures[measure] @@ -2023,7 +2056,7 @@ def test_run_model_on_fold_clustering(self): openml.config.server = self.production_server task = openml.tasks.get_task(126033) - X = task.get_X(dataset_format="array") + X = task.get_X() pipeline = sklearn.pipeline.Pipeline( steps=[("imp", SimpleImputer()), ("clf", sklearn.cluster.KMeans())], @@ -2045,7 +2078,9 @@ def test_run_model_on_fold_clustering(self): assert y_hat_proba is None # check user defined measures - fold_evaluations: dict[str, dict[int, dict[int, float]]] = collections.defaultdict(lambda: collections.defaultdict(dict)) + fold_evaluations: dict[str, dict[int, dict[int, float]]] = collections.defaultdict( + lambda: collections.defaultdict(dict) + ) for measure in user_defined_measures: fold_evaluations[measure][0][0] = user_defined_measures[measure] @@ -2186,7 +2221,6 @@ def test_run_on_model_with_empty_steps(self): X, y, categorical_ind, feature_names = dataset.get_data( target=dataset.default_target_attribute, - dataset_format="array", ) categorical_ind = np.array(categorical_ind) (cat_idx,) = np.where(categorical_ind) diff --git a/tests/test_runs/test_run_functions.py b/tests/test_runs/test_run_functions.py index d43a8bab5..960a72d6d 100644 --- a/tests/test_runs/test_run_functions.py +++ b/tests/test_runs/test_run_functions.py @@ -413,7 +413,6 @@ def test_run_regression_on_classif_task(self): model=clf, task=task, avoid_duplicate_runs=False, - dataset_format="array", ) @pytest.mark.sklearn() @@ -1355,7 +1354,6 @@ def test__run_task_get_arffcontent(self): model=clf, task=task, add_local_measures=True, - dataset_format="dataframe", ) arff_datacontent, trace, fold_evaluations, _ = res # predictions @@ -1601,7 +1599,6 @@ def test_run_on_dataset_with_missing_labels_dataframe(self): task=task, extension=self.extension, add_local_measures=True, - dataset_format="dataframe", ) # 2 folds, 5 repeats; keep in mind that this task comes from the test # server, the task on the live server is different @@ -1645,7 +1642,6 @@ def test_run_on_dataset_with_missing_labels_array(self): task=task, extension=self.extension, add_local_measures=True, - dataset_format="array", # diff test_run_on_dataset_with_missing_labels_dataframe() ) # 2 folds, 5 repeats; keep in mind that this task comes from the test # server, the task on the live server is different @@ -1767,7 +1763,7 @@ def test_format_prediction_task_regression(self): def test__run_task_get_arffcontent_2(self, parallel_mock): """Tests if a run executed in parallel is collated correctly.""" task = openml.tasks.get_task(7) # Supervised Classification on kr-vs-kp - x, y = task.get_X_and_y(dataset_format="dataframe") + x, y = task.get_X_and_y() num_instances = x.shape[0] line_length = 6 + len(task.class_labels) loss = "log" if Version(sklearn.__version__) < Version("1.3") else "log_loss" @@ -1780,7 +1776,7 @@ def test__run_task_get_arffcontent_2(self, parallel_mock): model=clf, task=task, add_local_measures=True, - dataset_format="array", # "dataframe" would require handling of categoricals + # dataset_format="array", # "dataframe" would require handling of categoricals n_jobs=n_jobs, ) # This unit test will fail if joblib is unable to distribute successfully since the @@ -1825,7 +1821,7 @@ def test__run_task_get_arffcontent_2(self, parallel_mock): def test_joblib_backends(self, parallel_mock): """Tests evaluation of a run using 
various joblib backends and n_jobs.""" task = openml.tasks.get_task(7) # Supervised Classification on kr-vs-kp - x, y = task.get_X_and_y(dataset_format="dataframe") + x, y = task.get_X_and_y() num_instances = x.shape[0] line_length = 6 + len(task.class_labels) @@ -1865,7 +1861,7 @@ def test_joblib_backends(self, parallel_mock): model=clf, task=task, add_local_measures=True, - dataset_format="array", # "dataframe" would require handling of categoricals + # dataset_format="array", # "dataframe" would require handling of categoricals n_jobs=n_jobs, ) assert type(res[0]) == list diff --git a/tests/test_setups/test_setup_functions.py b/tests/test_setups/test_setup_functions.py index 9e357f6aa..06cf821c2 100644 --- a/tests/test_setups/test_setup_functions.py +++ b/tests/test_setups/test_setup_functions.py @@ -164,13 +164,6 @@ def test_list_setups_output_format(self): assert isinstance(setups, pd.DataFrame) assert len(setups) == 10 - # TODO: [0.15] Remove section as `dict` is no longer supported. - with pytest.warns(FutureWarning): - setups = openml.setups.list_setups(flow=flow_id, output_format="dict", size=10) - assert isinstance(setups, Dict) - assert isinstance(setups[next(iter(setups.keys()))], Dict) - assert len(setups) == 10 - def test_setuplist_offset(self): size = 10 setups = openml.setups.list_setups(offset=0, size=size) diff --git a/tests/test_utils/test_utils.py b/tests/test_utils/test_utils.py index cae947917..4f71f63c3 100644 --- a/tests/test_utils/test_utils.py +++ b/tests/test_utils/test_utils.py @@ -5,6 +5,7 @@ import pytest import shutil import openml +from itertools import chain from openml.testing import _check_dataset @@ -83,10 +84,6 @@ def _mocked_perform_api_call(call, request_method): @pytest.mark.server() def test_list_all(): openml.utils._list_all(listing_call=openml.tasks.functions._list_tasks) - openml.utils._list_all( - listing_call=openml.tasks.functions._list_tasks, - list_output_format="dataframe", - ) @pytest.mark.server() @@ -104,12 +101,13 @@ def test_list_all_with_multiple_batches(min_number_tasks_on_test_server): # By setting the batch size one lower than the minimum we guarantee at least two # batches and at the same time do as few batches (roundtrips) as possible. 
batch_size = min_number_tasks_on_test_server - 1 - res = openml.utils._list_all( + batches = openml.utils._list_all( listing_call=openml.tasks.functions._list_tasks, - list_output_format="dataframe", batch_size=batch_size, ) - assert min_number_tasks_on_test_server <= len(res) + assert len(batches) >= 2 + flattened = list(chain(*batches)) + assert min_number_tasks_on_test_server <= len(flattened) @pytest.mark.server() @@ -202,4 +200,4 @@ def test_correct_test_server_download_state(): """ task = openml.tasks.get_task(119) dataset = task.get_dataset() - assert len(dataset.features) == dataset.get_data(dataset_format="dataframe")[0].shape[1] \ No newline at end of file + assert len(dataset.features) == dataset.get_data()[0].shape[1] \ No newline at end of file From 6296f3778a2e6bf2d74967b35255a9d0b71198d8 Mon Sep 17 00:00:00 2001 From: eddiebergman Date: Wed, 16 Oct 2024 18:08:47 +0200 Subject: [PATCH 02/23] refactor: Explicitly name parameter to listing functions --- examples/40_paper/2018_ida_strang_example.py | 14 ++--- openml/datasets/functions.py | 56 +++++++++++++------ openml/evaluations/functions.py | 25 +++++---- openml/flows/functions.py | 22 ++++---- openml/runs/functions.py | 50 +++++++++-------- openml/setups/functions.py | 38 +++++++------ openml/study/functions.py | 25 +++++---- openml/tasks/functions.py | 57 ++++++++++++++------ openml/testing.py | 4 +- openml/utils.py | 43 +++++++-------- tests/test_tasks/test_task.py | 2 +- tests/test_utils/test_utils.py | 1 - 12 files changed, 204 insertions(+), 133 deletions(-) diff --git a/examples/40_paper/2018_ida_strang_example.py b/examples/40_paper/2018_ida_strang_example.py index 8b225125b..8b116507c 100644 --- a/examples/40_paper/2018_ida_strang_example.py +++ b/examples/40_paper/2018_ida_strang_example.py @@ -17,8 +17,8 @@ # License: BSD 3-Clause import matplotlib.pyplot as plt + import openml -import pandas as pd ############################################################################## # A basic step for each data-mining or machine learning task is to determine @@ -47,13 +47,16 @@ # Downloads all evaluation records related to this study evaluations = openml.evaluations.list_evaluations( - measure, size=None, flows=flow_ids, study=study_id, output_format="dataframe" + measure, + size=None, + flows=flow_ids, + study=study_id, ) # gives us a table with columns data_id, flow1_value, flow2_value evaluations = evaluations.pivot(index="data_id", columns="flow_id", values="value").dropna() # downloads all data qualities (for scatter plot) data_qualities = openml.datasets.list_datasets( - data_id=list(evaluations.index.values), output_format="dataframe" + data_id=list(evaluations.index.values), ) # removes irrelevant data qualities data_qualities = data_qualities[meta_features] @@ -86,10 +89,9 @@ def determine_class(val_lin, val_nonlin): if val_lin < val_nonlin: return class_values[0] - elif val_nonlin < val_lin: + if val_nonlin < val_lin: return class_values[1] - else: - return class_values[2] + return class_values[2] evaluations["class"] = evaluations.apply( diff --git a/openml/datasets/functions.py b/openml/datasets/functions.py index 5ce095814..a328f9762 100644 --- a/openml/datasets/functions.py +++ b/openml/datasets/functions.py @@ -5,6 +5,7 @@ import logging import warnings from collections import OrderedDict +from functools import partial from pathlib import Path from pyexpat import ExpatError from typing import TYPE_CHECKING, Any @@ -81,7 +82,12 @@ def list_datasets( size: int | None = None, status: str | None = None, tag: 
str | None = None, - **kwargs: Any, + data_name: str | None = None, + data_version: int | None = None, + number_instances: int | None = None, + number_features: int | None = None, + number_classes: int | None = None, + number_missing_values: int | None = None, ) -> pd.DataFrame: """Return a dataframe of all dataset which are on OpenML. @@ -101,10 +107,12 @@ def list_datasets( default active datasets are returned, but also datasets from another status can be requested. tag : str, optional - kwargs : dict, optional - Legal filter operators (keys in the dict): - data_name, data_version, number_instances, - number_features, number_classes, number_missing_values. + data_name : str, optional + data_version : int, optional + number_instances : int, optional + number_features : int, optional + number_classes : int, optional + number_missing_values : int, optional Returns ------- @@ -118,19 +126,29 @@ def list_datasets( If qualities are calculated for the dataset, some of these are also included as columns. """ - batches = openml.utils._list_all( - listing_call=_list_datasets, + listing_call = partial( + _list_datasets, data_id=data_id, - offset=offset, - size=size, status=status, tag=tag, - **kwargs, + data_name=data_name, + data_version=data_version, + number_instances=number_instances, + number_features=number_features, + number_classes=number_classes, + number_missing_values=number_missing_values, ) - return pd.concat(batches, ignore_index=True) + batches = openml.utils._list_all(listing_call, offset=offset, limit=size) + return pd.concat(batches) -def _list_datasets(data_id: list[int] | None = None, **kwargs: Any) -> pd.DataFrame: +def _list_datasets( + limit: int, + offset: int, + *, + data_id: list[int] | None = None, + **kwargs: Any, +) -> pd.DataFrame: """ Perform api call to return a list of all datasets. @@ -141,6 +159,10 @@ def _list_datasets(data_id: list[int] | None = None, **kwargs: Any) -> pd.DataFr display_errors is also separated from the kwargs since it has a default value. + limit : int + The maximum number of datasets to show. + offset : int + The number of datasets to skip, starting from the first. data_id : list, optional kwargs : dict, optional @@ -152,7 +174,7 @@ def _list_datasets(data_id: list[int] | None = None, **kwargs: Any) -> pd.DataFr ------- datasets : dataframe """ - api_call = "data/list" + api_call = f"data/list/list/{limit}/offset/{offset}" if kwargs is not None: for operator, value in kwargs.items(): @@ -242,12 +264,13 @@ def check_datasets_active( dict A dictionary with items {did: bool} """ - datasets = list_datasets(status="all", data_id=dataset_ids, output_format="dataframe") - missing = set(dataset_ids) - set(datasets.get("did", [])) + datasets = list_datasets(status="all", data_id=dataset_ids) + missing = set(dataset_ids) - set(datasets.index) if raise_error_if_not_exist and missing: missing_str = ", ".join(str(did) for did in missing) raise ValueError(f"Could not find dataset(s) {missing_str} in OpenML dataset list.") - return dict(datasets["status"] == "active") + mask = datasets["status"] == "active" + return dict(mask) def _name_to_id( @@ -285,7 +308,6 @@ def _name_to_id( data_name=dataset_name, status=status, data_version=version, - output_format="dataframe", ) if error_if_multiple and len(candidates) > 1: msg = f"Multiple active datasets exist with name '{dataset_name}'." 
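A minimal usage sketch of the reworked dataset-listing API after this change, assuming the signatures shown above; the "active"/"iris"/50 filter values are illustrative only:

    from functools import partial

    import pandas as pd

    import openml

    # Public call: filters are explicit keyword arguments now and the result is
    # always a pandas DataFrame; there is no output_format switch anymore.
    # ("active", "iris" and 50 are placeholder filter values for illustration.)
    datasets = openml.datasets.list_datasets(status="active", data_name="iris", size=50)
    print(datasets.head())

    # Internally, list_datasets binds its filters up front with functools.partial so
    # that the generic pager only has to supply (limit, offset); each batch comes
    # back as a DataFrame and the caller concatenates the batches.
    listing_call = partial(openml.datasets.functions._list_datasets, status="active")
    batches = openml.utils._list_all(listing_call, offset=0, limit=50)
    datasets = pd.concat(batches)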
diff --git a/openml/evaluations/functions.py b/openml/evaluations/functions.py index 3b4735250..0adab61ac 100644 --- a/openml/evaluations/functions.py +++ b/openml/evaluations/functions.py @@ -3,6 +3,7 @@ from __future__ import annotations import json +from functools import partial from itertools import chain from typing import Any from typing_extensions import Literal, overload @@ -56,7 +57,7 @@ def list_evaluations( def list_evaluations( function: str, offset: int | None = None, - size: int | None = 10000, + size: int | None = None, tasks: list[str | int] | None = None, setups: list[str | int] | None = None, flows: list[str | int] | None = None, @@ -118,11 +119,9 @@ def list_evaluations( if per_fold is not None: per_fold_str = str(per_fold).lower() - eval_collection: list[list[OpenMLEvaluation]] = openml.utils._list_all( - listing_call=_list_evaluations, + listing_call = partial( + _list_evaluations, function=function, - offset=offset, - size=size, tasks=tasks, setups=setups, flows=flows, @@ -133,8 +132,9 @@ def list_evaluations( sort_order=sort_order, per_fold=per_fold_str, ) - flattened = list(chain.from_iterable(eval_collection)) + eval_collection = openml.utils._list_all(listing_call, offset=offset, limit=size) + flattened = list(chain.from_iterable(eval_collection)) if output_format == "dataframe": records = [item._to_dict() for item in flattened] return pd.DataFrame.from_records(records, index="run_id") @@ -143,6 +143,9 @@ def list_evaluations( def _list_evaluations( + limit: int, + offset: int, + *, function: str, tasks: list | None = None, setups: list | None = None, @@ -161,6 +164,10 @@ def _list_evaluations( The arguments that are lists are separated from the single value ones which are put into the kwargs. + limit : int + the number of evaluations to return + offset : int + the number of evaluations to skip, starting from the first function : str the evaluation function. e.g., predictive_accuracy @@ -178,7 +185,7 @@ def _list_evaluations( study : int, optional kwargs: dict, optional - Legal filter operators: tag, limit, offset. + Legal filter operators: tag, per_fold sort_order : str, optional order of sorting evaluations, ascending ("asc") or descending ("desc") @@ -187,7 +194,7 @@ def _list_evaluations( ------- list of OpenMLEvaluation objects """ - api_call = f"evaluation/list/function/{function}" + api_call = f"evaluation/list/function/{function}/limit/{limit}/offset/{offset}" if kwargs is not None: for operator, value in kwargs.items(): api_call += f"/{operator}/{value}" @@ -202,7 +209,7 @@ def _list_evaluations( if uploaders is not None: api_call += "/uploader/{}".format(",".join([str(int(i)) for i in uploaders])) if study is not None: - api_call += "/study/%d" % study + api_call += f"/study/{study}" if sort_order is not None: api_call += f"/sort_order/{sort_order}" diff --git a/openml/flows/functions.py b/openml/flows/functions.py index e43e18c17..f1f1da6fb 100644 --- a/openml/flows/functions.py +++ b/openml/flows/functions.py @@ -4,6 +4,7 @@ import os import re from collections import OrderedDict +from functools import partial from typing import Any, Dict import dateutil.parser @@ -135,7 +136,7 @@ def list_flows( offset: int | None = None, size: int | None = None, tag: str | None = None, - **kwargs: Any, + uploader: str | None = None, ) -> pd.DataFrame: """ Return a list of all flows which are on OpenML. 
@@ -164,30 +165,29 @@ def list_flows( - external version - uploader """ - batches = openml.utils._list_all( - listing_call=_list_flows, - offset=offset, - size=size, - tag=tag, - **kwargs, - ) + listing_call = partial(_list_flows, tag=tag, uploader=uploader) + batches = openml.utils._list_all(listing_call, offset=offset, limit=size) return pd.concat(batches, ignore_index=True) -def _list_flows(**kwargs: Any) -> pd.DataFrame: +def _list_flows(limit: int, offset: int, **kwargs: Any) -> pd.DataFrame: """ Perform the api call that return a list of all flows. Parameters ---------- + limit : int + the maximum number of flows to return + offset : int + the number of flows to skip, starting from the first kwargs: dict, optional - Legal filter operators: uploader, tag, limit, offset. + Legal filter operators: uploader, tag Returns ------- flows : dataframe """ - api_call = "flow/list" + api_call = f"flow/list/limit/{limit}/offset/{offset}" if kwargs is not None: for operator, value in kwargs.items(): diff --git a/openml/runs/functions.py b/openml/runs/functions.py index cff649c1d..67a035825 100644 --- a/openml/runs/functions.py +++ b/openml/runs/functions.py @@ -5,6 +5,7 @@ import time import warnings from collections import OrderedDict +from functools import partial from pathlib import Path from typing import TYPE_CHECKING, Any @@ -447,8 +448,7 @@ def run_exists(task_id: int, setup_id: int) -> set[int]: return set() try: - result = list_runs(task=[task_id], setup=[setup_id], output_format="dataframe") - assert isinstance(result, pd.DataFrame) # TODO(eddiebergman): Remove once #1299 + result = list_runs(task=[task_id], setup=[setup_id]) return set() if result.empty else set(result["run_id"]) except OpenMLServerException as exception: # error code implies no results. The run does not exist yet @@ -1033,10 +1033,10 @@ def list_runs( # noqa: PLR0913 tag: str | None = None, study: int | None = None, display_errors: bool = False, # noqa: FBT001, FBT002 - **kwargs: Any, + task_type: TaskType | int | None = None, ) -> pd.DataFrame: - """List all runs matching all of the given filters. - + """ + List all runs matching all of the given filters. (Supports large amount of results) Parameters @@ -1064,8 +1064,7 @@ def list_runs( # noqa: PLR0913 Whether to list runs which have an error (for example a missing prediction file). - kwargs : dict, optional - Legal filter operators: task_type. 
+ task_type : str, optional Returns ------- @@ -1082,10 +1081,8 @@ def list_runs( # noqa: PLR0913 if uploader is not None and (not isinstance(uploader, list)): raise TypeError("uploader must be of type list.") - batches = openml.utils._list_all( - listing_call=_list_runs, - offset=offset, - size=size, + listing_call = partial( + _list_runs, id=id, task=task, setup=setup, @@ -1094,20 +1091,25 @@ def list_runs( # noqa: PLR0913 tag=tag, study=study, display_errors=display_errors, - **kwargs, + task_type=task_type, ) - return pd.concat(batches, ignore_index=True) + batches = openml.utils._list_all(listing_call, offset=offset, limit=size) + return pd.concat(batches) def _list_runs( # noqa: PLR0913 + limit: int, + offset: int, + *, id: list | None = None, # noqa: A002 task: list | None = None, setup: list | None = None, flow: list | None = None, uploader: list | None = None, study: int | None = None, - display_errors: bool = False, # noqa: FBT002, FBT001 - **kwargs: Any, + tag: str | None = None, + display_errors: bool = False, + task_type: TaskType | int | None = None, ) -> pd.DataFrame: """ Perform API call `/run/list/{filters}' @@ -1128,6 +1130,8 @@ def _list_runs( # noqa: PLR0913 flow : list, optional + tag: str, optional + uploader : list, optional study : int, optional @@ -1136,17 +1140,14 @@ def _list_runs( # noqa: PLR0913 Whether to list runs which have an error (for example a missing prediction file). - kwargs : dict, optional - Legal filter operators: task_type. + task_type : str, optional Returns ------- - dataframe of found runs. + dict, or dataframe + List of found runs. """ - api_call = "run/list" - if kwargs is not None: - for operator, value in kwargs.items(): - api_call += f"/{operator}/{value}" + api_call = f"run/list/limit/{limit}/offset/{offset}" if id is not None: api_call += "/run/{}".format(",".join([str(int(i)) for i in id])) if task is not None: @@ -1161,6 +1162,11 @@ def _list_runs( # noqa: PLR0913 api_call += "/study/%d" % study if display_errors: api_call += "/show_errors/true" + if tag is not None: + api_call += f"/tag/{tag}" + if task_type is not None: + tvalue = task_type.value if isinstance(task_type, TaskType) else task_type + api_call += f"/task_type/{tvalue}" return __list_runs(api_call=api_call) diff --git a/openml/setups/functions.py b/openml/setups/functions.py index 383e2e16e..0c3095501 100644 --- a/openml/setups/functions.py +++ b/openml/setups/functions.py @@ -2,6 +2,7 @@ from __future__ import annotations from collections import OrderedDict +from functools import partial from itertools import chain from pathlib import Path from typing import Any, Iterable @@ -149,6 +150,7 @@ def list_setups( # noqa: PLR0913 output_format: str, optional (default='object') The parameter decides the format of the output. - If 'dataframe' the output is a pandas DataFrame + - If 'object' the output is a dictionary of OpenMLSetup objects Returns ------- @@ -159,15 +161,12 @@ def list_setups( # noqa: PLR0913 "Invalid output format selected. 
Only 'object', or 'dataframe' applicable.", ) - batch_size = 1000 # batch size for setups is lower + listing_call = partial(_list_setups, flow=flow, tag=tag, setup=setup) batches = openml.utils._list_all( - listing_call=_list_setups, + listing_call, + batch_size=1_000, # batch size for setups is lower offset=offset, - size=size, - flow=flow, - tag=tag, - setup=setup, - batch_size=batch_size, + limit=size, ) flattened = list(chain.from_iterable(batches)) if output_format == "object": @@ -176,7 +175,14 @@ def list_setups( # noqa: PLR0913 return pd.DataFrame.from_records([setup._to_dict() for setup in flattened], index="setup_id") -def _list_setups(setup: Iterable[int] | None = None, **kwargs: Any) -> list[OpenMLSetup]: +def _list_setups( + limit: int, + offset: int, + *, + setup: Iterable[int] | None = None, + flow: int | None = None, + tag: str | None = None, +) -> list[OpenMLSetup]: """Perform API call `/setup/list/{filters}` Parameters @@ -184,21 +190,23 @@ def _list_setups(setup: Iterable[int] | None = None, **kwargs: Any) -> list[Open The setup argument that is a list is separated from the single value filters which are put into the kwargs. + limit : int + listing_offset : int setup : list(int), optional - - kwargs: dict, optional - Legal filter operators: flow, setup, limit, offset, tag. + flow : int, optional + tag : str, optional Returns ------- The setups that match the filters, going from id to the OpenMLSetup object. """ - api_call = "setup/list" + api_call = f"setup/list/offset/{offset}/limit/{limit}" if setup is not None: api_call += "/setup/{}".format(",".join([str(int(i)) for i in setup])) - if kwargs is not None: - for operator, value in kwargs.items(): - api_call += f"/{operator}/{value}" + if flow is not None: + api_call += f"/flow/{flow}" + if tag is not None: + api_call += f"/tag/{tag}" return __list_setups(api_call=api_call) diff --git a/openml/study/functions.py b/openml/study/functions.py index f7db1e0aa..59f255b47 100644 --- a/openml/study/functions.py +++ b/openml/study/functions.py @@ -3,6 +3,7 @@ from __future__ import annotations import warnings +from functools import partial from typing import TYPE_CHECKING, Any import pandas as pd @@ -465,14 +466,13 @@ def list_suites( - creator - creation_date """ - batches = openml.utils._list_all( - listing_call=_list_studies, - offset=offset, - size=size, + listing_call = partial( + _list_studies, main_entity_type="task", status=status, uploader=uploader, ) + batches = openml.utils._list_all(listing_call, limit=size, offset=offset) return pd.concat(batches, ignore_index=True) @@ -514,32 +514,35 @@ def list_studies( If qualities are calculated for the dataset, some of these are also returned. """ - batches = openml.utils._list_all( - listing_call=_list_studies, - offset=offset, - size=size, + listing_call = partial( + _list_studies, main_entity_type="run", status=status, uploader=uploader, benchmark_suite=benchmark_suite, ) + batches = openml.utils._list_all(listing_call, offset=offset, limit=size) return pd.concat(batches, ignore_index=True) -def _list_studies(**kwargs: Any) -> pd.DataFrame: +def _list_studies(limit: int, offset: int, **kwargs: Any) -> pd.DataFrame: """Perform api call to return a list of studies. Parameters ---------- + limit: int + The maximum number of studies to return. + offset: int + The number of studies to skip, starting from the first. 
kwargs : dict, optional Legal filter operators (keys in the dict): - status, limit, offset, main_entity_type, uploader + status, main_entity_type, uploader, benchmark_suite Returns ------- studies : dataframe """ - api_call = "study/list" + api_call = f"study/list/limit/{limit}/offset/{offset}" if kwargs is not None: for operator, value in kwargs.items(): api_call += f"/{operator}/{value}" diff --git a/openml/tasks/functions.py b/openml/tasks/functions.py index 6877658f7..1f4ec71f8 100644 --- a/openml/tasks/functions.py +++ b/openml/tasks/functions.py @@ -4,6 +4,7 @@ import os import re import warnings +from functools import partial from typing import Any import pandas as pd @@ -125,12 +126,18 @@ def _get_estimation_procedure_list() -> list[dict[str, Any]]: return procs -def list_tasks( +def list_tasks( # noqa: PLR0913 task_type: TaskType | None = None, offset: int | None = None, size: int | None = None, tag: str | None = None, - **kwargs: Any, + data_tag: str | None = None, + status: str | None = None, + data_name: str | None = None, + number_instances: int | None = None, + number_features: int | None = None, + number_classes: int | None = None, + number_missing_values: int | None = None, ) -> pd.DataFrame: """ Return a number of tasks having the given tag and task_type @@ -140,18 +147,22 @@ def list_tasks( Filter task_type is separated from the other filters because it is used as task_type in the task description, but it is named type when used as a filter in list tasks call. - task_type : TaskType, optional - Refers to the type of task. offset : int, optional the number of tasks to skip, starting from the first + task_type : TaskType, optional + Refers to the type of task. size : int, optional the maximum number of tasks to show tag : str, optional the tag to include - kwargs: dict, optional - Legal filter operators: data_tag, status, data_id, data_name, - number_instances, number_features, - number_classes, number_missing_values. + data_tag : str, optional + the tag of the dataset + status : str, optional + data_name : str, optional + number_instances : int, optional + number_features : int, optional + number_classes : int, optional + number_missing_values : int, optional Returns ------- @@ -161,18 +172,29 @@ def list_tasks( as columns: task id, dataset id, task_type and status. If qualities are calculated for the associated dataset, some of these are also returned. """ - batches = openml.utils._list_all( - listing_call=_list_tasks, - task_type=task_type, - offset=offset, + listing_call = partial( + _list_tasks, size=size, + task_type=task_type, tag=tag, - **kwargs, + data_tag=data_tag, + status=status, + data_name=data_name, + number_instances=number_instances, + number_features=number_features, + number_classes=number_classes, + number_missing_values=number_missing_values, ) + batches = openml.utils._list_all(listing_call, offset=offset, limit=size) return pd.concat(batches, ignore_index=True) -def _list_tasks(task_type: TaskType | None = None, **kwargs: Any) -> pd.DataFrame: +def _list_tasks( + limit: int, + offset: int, + task_type: TaskType | int | None = None, + **kwargs: Any, +) -> pd.DataFrame: """ Perform the api call to return a number of tasks having the given filters. @@ -181,6 +203,8 @@ def _list_tasks(task_type: TaskType | None = None, **kwargs: Any) -> pd.DataFram Filter task_type is separated from the other filters because it is used as task_type in the task description, but it is named type when used as a filter in list tasks call. 
+ limit: int + offset: int task_type : TaskType, optional Refers to the type of task. kwargs: dict, optional @@ -192,9 +216,10 @@ def _list_tasks(task_type: TaskType | None = None, **kwargs: Any) -> pd.DataFram ------- dataframe """ - api_call = "task/list" + api_call = f"task/list/limit/{limit}/offset/{offset}" if task_type is not None: - api_call += "/type/%d" % task_type.value + tvalue = task_type.value if isinstance(task_type, TaskType) else task_type + api_call += f"/type/{tvalue}" if kwargs is not None: for operator, value in kwargs.items(): if operator == "task_id": diff --git a/openml/testing.py b/openml/testing.py index 9016ff6a9..3a3c19c7e 100644 --- a/openml/testing.py +++ b/openml/testing.py @@ -12,7 +12,6 @@ from pathlib import Path from typing import ClassVar -import pandas as pd import requests import openml @@ -286,8 +285,7 @@ def check_task_existence( int, None """ return_val = None - tasks = openml.tasks.list_tasks(task_type=task_type, output_format="dataframe") - assert isinstance(tasks, pd.DataFrame) + tasks = openml.tasks.list_tasks(task_type=task_type) if len(tasks) == 0: return None tasks = tasks.loc[tasks["did"] == dataset_id] diff --git a/openml/utils.py b/openml/utils.py index aecf860d8..52e3a3675 100644 --- a/openml/utils.py +++ b/openml/utils.py @@ -238,9 +238,11 @@ def _delete_entity(entity_type: str, entity_id: int) -> bool: def _list_all( # noqa: C901 - listing_call: Callable[P, _SizedT], - *args: P.args, - **filters: P.kwargs, + listing_call: Callable[[int, int], _SizedT], + *, + limit: int | None = None, + offset: int | None = None, + batch_size: int | None = 10_000, ) -> list[_SizedT]: """Helper to handle paged listing requests. @@ -251,32 +253,35 @@ def _list_all( # noqa: C901 Parameters ---------- listing_call : callable - Call listing, e.g. list_evaluations. - *args : Variable length argument list - Any required arguments for the listing call. - **filters : Arbitrary keyword arguments - Any filters that can be applied to the listing function. - additionally, the batch_size can be specified. This is - useful for testing purposes. + Call listing, e.g. list_evaluations. Takes two positional + arguments: batch_size and offset. + batch_size : int, optional + The batch size to use for the listing call. + offset : int, optional + The initial offset to use for the listing call. + limit : int, optional + The total size of the listing. If not provided, the function will + request the first batch and then continue until no more results are + returned Returns ------- - List of types returned from the listing call + List of types returned from type of the listing call """ - # eliminate filters that have a None value - active_filters = {key: value for key, value in filters.items() if value is not None} page = 0 results: list[_SizedT] = [] + LIMIT = limit + BATCH_SIZE_ORIG = batch_size + offset = offset if offset is not None else 0 + batch_size = batch_size if batch_size is not None else 10_000 + # Default batch size per paging. # This one can be set in filters (batch_size), but should not be # changed afterwards. The derived batch_size can be changed. 
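# (Sketch, not part of the hunk above: how the reworked pager is meant to be
#  driven. `_list_datasets` is the private helper from openml/datasets/functions.py;
#  passing `data_name` as a server-side filter here is an assumption based on the
#  public `list_datasets()` wrapper shown later in this series.)
from functools import partial

import pandas as pd

import openml
from openml.datasets.functions import _list_datasets

# Filters are bound up front; `_list_all` only supplies (batch_size, offset).
listing_call = partial(_list_datasets, data_name="iris")
batches = openml.utils._list_all(listing_call, offset=0, limit=500, batch_size=100)
datasets = pd.concat(batches) if batches else pd.DataFrame()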
- BATCH_SIZE_ORIG = active_filters.pop("batch_size", 10000) if not isinstance(BATCH_SIZE_ORIG, int): raise ValueError(f"'batch_size' should be an integer but got {BATCH_SIZE_ORIG}") - # max number of results to be shown - LIMIT: int | float | None = active_filters.pop("size", None) # type: ignore if (LIMIT is not None) and (not isinstance(LIMIT, int)) and (not np.isinf(LIMIT)): raise ValueError(f"'limit' should be an integer or inf but got {LIMIT}") @@ -285,7 +290,6 @@ def _list_all( # noqa: C901 if LIMIT is not None and BATCH_SIZE_ORIG > LIMIT: BATCH_SIZE_ORIG = LIMIT - offset = active_filters.pop("offset", 0) if not isinstance(offset, int): raise ValueError(f"'offset' should be an integer but got {offset}") @@ -293,10 +297,7 @@ def _list_all( # noqa: C901 while True: try: current_offset = offset + BATCH_SIZE_ORIG * page - new_batch = listing_call( - *args, - **{**active_filters, "limit": batch_size, "offset": current_offset}, # type: ignore - ) + new_batch = listing_call(batch_size, current_offset) except openml.exceptions.OpenMLServerNoResult: # NOTE: This above statement may not actually happen, but we could just return here # to enforce it... diff --git a/tests/test_tasks/test_task.py b/tests/test_tasks/test_task.py index ec5a8caf5..311ffd365 100644 --- a/tests/test_tasks/test_task.py +++ b/tests/test_tasks/test_task.py @@ -71,7 +71,7 @@ def test_upload_task(self): ) def _get_compatible_rand_dataset(self) -> list: - active_datasets = list_datasets(status="active", output_format="dataframe") + active_datasets = list_datasets(status="active") # depending on the task type, find either datasets # with only symbolic features or datasets with only diff --git a/tests/test_utils/test_utils.py b/tests/test_utils/test_utils.py index 4f71f63c3..38d497e2d 100644 --- a/tests/test_utils/test_utils.py +++ b/tests/test_utils/test_utils.py @@ -115,7 +115,6 @@ def test_list_all_for_datasets(min_number_datasets_on_test_server): datasets = openml.datasets.list_datasets( batch_size=100, size=min_number_datasets_on_test_server, - output_format="dataframe", ) assert min_number_datasets_on_test_server == len(datasets) From a31dafc3ccb5832172944f5c5c6e8c10a0d4042e Mon Sep 17 00:00:00 2001 From: eddiebergman Date: Thu, 17 Oct 2024 17:20:37 +0200 Subject: [PATCH 03/23] fix: Don't double call list --- openml/datasets/functions.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/openml/datasets/functions.py b/openml/datasets/functions.py index a328f9762..522e9d84d 100644 --- a/openml/datasets/functions.py +++ b/openml/datasets/functions.py @@ -174,7 +174,7 @@ def _list_datasets( ------- datasets : dataframe """ - api_call = f"data/list/list/{limit}/offset/{offset}" + api_call = f"data/list/{limit}/offset/{offset}" if kwargs is not None: for operator, value in kwargs.items(): From 3bd924ca40783631ec9bdab819823ebf6ca7c79a Mon Sep 17 00:00:00 2001 From: eddiebergman Date: Mon, 4 Nov 2024 17:14:54 +0100 Subject: [PATCH 04/23] update... 
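A minimal illustration of the user-facing effect of this commit, using only calls that already appear in the touched examples and tests (a sketch, not part of the patch itself):

import openml

# Listings come back as pandas DataFrames; the output_format keyword disappears
# from these call sites.
datasets = openml.datasets.list_datasets(size=100, status="active")
print(datasets[["did", "name", "NumberOfInstances"]].head())

tasks = openml.tasks.list_tasks(tag="OpenML100")
print(tasks[["tid", "did"]].head())

evals = openml.evaluations.list_evaluations(function="predictive_accuracy", size=10)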
--- README.md | 2 +- examples/20_basic/simple_datasets_tutorial.py | 2 +- .../simple_flows_and_runs_tutorial.py | 4 +- examples/30_extended/datasets_tutorial.py | 17 +- .../30_extended/fetch_evaluations_tutorial.py | 10 +- .../30_extended/flows_and_runs_tutorial.py | 8 +- .../plot_svm_hyperparameters_tutorial.py | 4 +- examples/30_extended/study_tutorial.py | 10 +- examples/30_extended/suites_tutorial.py | 11 +- examples/30_extended/tasks_tutorial.py | 20 +- .../40_paper/2015_neurips_feurer_example.py | 7 +- examples/40_paper/2018_kdd_rijn_example.py | 13 +- .../40_paper/2018_neurips_perrone_example.py | 12 +- openml/datasets/dataset.py | 225 ++++++------------ openml/datasets/functions.py | 32 ++- openml/evaluations/functions.py | 65 ++--- openml/flows/functions.py | 5 +- openml/runs/functions.py | 3 + openml/setups/functions.py | 7 +- openml/setups/setup.py | 2 +- openml/study/functions.py | 13 +- openml/tasks/functions.py | 15 +- openml/tasks/task.py | 4 +- tests/test_datasets/test_dataset.py | 22 +- tests/test_datasets/test_dataset_functions.py | 68 +++--- .../test_evaluation_functions.py | 2 - .../test_evaluations_example.py | 3 +- .../test_sklearn_extension.py | 2 +- tests/test_flows/test_flow.py | 26 +- tests/test_flows/test_flow_functions.py | 18 +- tests/test_openml/test_api_calls.py | 2 +- tests/test_runs/test_run.py | 8 +- tests/test_runs/test_run_functions.py | 38 ++- tests/test_setups/test_setup_functions.py | 7 +- tests/test_study/test_study_functions.py | 14 +- tests/test_tasks/test_task_functions.py | 27 +-- tests/test_tasks/test_task_methods.py | 6 +- tests/test_utils/test_utils.py | 19 +- 38 files changed, 319 insertions(+), 434 deletions(-) diff --git a/README.md b/README.md index 0bad7ac66..081bf7923 100644 --- a/README.md +++ b/README.md @@ -5,7 +5,7 @@
    [README.md header hunk, markup stripped during extraction: the "OpenML Logo" image line is the one being changed; the "OpenML-Python" title and "Python Logo" badge remain as unchanged context.]
    diff --git a/examples/20_basic/simple_datasets_tutorial.py b/examples/20_basic/simple_datasets_tutorial.py index 9b18aab14..fed997d58 100644 --- a/examples/20_basic/simple_datasets_tutorial.py +++ b/examples/20_basic/simple_datasets_tutorial.py @@ -48,7 +48,7 @@ # attribute_names - the names of the features for the examples (X) and # target feature (y) X, y, categorical_indicator, attribute_names = dataset.get_data( - target=dataset.default_target_attribute + target_names=dataset.default_target_attribute ) ############################################################################ diff --git a/examples/20_basic/simple_flows_and_runs_tutorial.py b/examples/20_basic/simple_flows_and_runs_tutorial.py index eec6d7e8b..63a3a6d85 100644 --- a/examples/20_basic/simple_flows_and_runs_tutorial.py +++ b/examples/20_basic/simple_flows_and_runs_tutorial.py @@ -7,9 +7,9 @@ # License: BSD 3-Clause -import openml from sklearn import ensemble, neighbors +import openml ############################################################################ # .. warning:: @@ -23,7 +23,7 @@ # NOTE: We are using dataset "diabetes" from the test server: https://test.openml.org/d/20 dataset = openml.datasets.get_dataset(dataset_id="diabetes", version=1) X, y, categorical_indicator, attribute_names = dataset.get_data( - target=dataset.default_target_attribute + target_names=dataset.default_target_attribute ) clf = neighbors.KNeighborsClassifier(n_neighbors=3) clf.fit(X, y) diff --git a/examples/30_extended/datasets_tutorial.py b/examples/30_extended/datasets_tutorial.py index 606455dd8..2841c68f7 100644 --- a/examples/30_extended/datasets_tutorial.py +++ b/examples/30_extended/datasets_tutorial.py @@ -8,29 +8,24 @@ # License: BSD 3-Clauses -import openml import pandas as pd + +import openml from openml.datasets import edit_dataset, fork_dataset, get_dataset ############################################################################ # Exercise 0 # ********** # -# * List datasets -# -# * Use the output_format parameter to select output type -# * Default gives 'dict' (other option: 'dataframe', see below) -# -# Note: list_datasets will return a pandas dataframe by default from 0.15. When using -# openml-python 0.14, `list_datasets` will warn you to use output_format='dataframe'. -datalist = openml.datasets.list_datasets(output_format="dataframe") +# * List datasets and return a dataframe +datalist = openml.datasets.list_datasets() datalist = datalist[["did", "name", "NumberOfInstances", "NumberOfFeatures", "NumberOfClasses"]] print(f"First 10 of {len(datalist)} datasets...") datalist.head(n=10) # The same can be done with lesser lines of code -openml_df = openml.datasets.list_datasets(output_format="dataframe") +openml_df = openml.datasets.list_datasets() openml_df.head(n=10) ############################################################################ @@ -73,7 +68,7 @@ # dataset. In particular, many datasets for supervised problems have a set # `default_target_attribute` which may help identify the target variable. 
X, y, categorical_indicator, attribute_names = dataset.get_data( - target=dataset.default_target_attribute + target_names=dataset.default_target_attribute ) print(X.head()) print(X.info()) diff --git a/examples/30_extended/fetch_evaluations_tutorial.py b/examples/30_extended/fetch_evaluations_tutorial.py index 86302e2d1..4ef2d3e5f 100644 --- a/examples/30_extended/fetch_evaluations_tutorial.py +++ b/examples/30_extended/fetch_evaluations_tutorial.py @@ -32,9 +32,7 @@ # Required filters can be applied to retrieve results from runs as required. # We shall retrieve a small set (only 10 entries) to test the listing function for evaluations -openml.evaluations.list_evaluations( - function="predictive_accuracy", size=10, output_format="dataframe" -) +openml.evaluations.list_evaluations(function="predictive_accuracy", size=10) # Using other evaluation metrics, 'precision' in this case evals = openml.evaluations.list_evaluations( @@ -62,9 +60,7 @@ # Note that we now filter the evaluations based on another parameter 'task'. metric = "predictive_accuracy" -evals = openml.evaluations.list_evaluations( - function=metric, tasks=[task_id], output_format="dataframe" -) +evals = openml.evaluations.list_evaluations(function=metric, tasks=[task_id]) # Displaying the first 10 rows print(evals.head(n=10)) # Sorting the evaluations in decreasing order of the metric chosen @@ -94,7 +90,7 @@ def plot_cdf(values, metric="predictive_accuracy"): plt.minorticks_on() plt.grid(visible=True, which="minor", linestyle="--") plt.axvline(max_val, linestyle="--", color="gray") - plt.text(max_val, 0, "%.3f" % max_val, fontsize=9) + plt.text(max_val, 0, f"{max_val:.3f}", fontsize=9) plt.show() diff --git a/examples/30_extended/flows_and_runs_tutorial.py b/examples/30_extended/flows_and_runs_tutorial.py index b7c000101..bbe94c1dc 100644 --- a/examples/30_extended/flows_and_runs_tutorial.py +++ b/examples/30_extended/flows_and_runs_tutorial.py @@ -7,9 +7,9 @@ # License: BSD 3-Clause -import openml -from sklearn import compose, ensemble, impute, neighbors, preprocessing, pipeline, tree +from sklearn import compose, ensemble, impute, neighbors, pipeline, preprocessing, tree +import openml ############################################################################ # We'll use the test server for the rest of this tutorial. @@ -27,7 +27,7 @@ # NOTE: We are using dataset 68 from the test server: https://test.openml.org/d/68 dataset = openml.datasets.get_dataset(dataset_id="eeg-eye-state", version=1) X, y, categorical_indicator, attribute_names = dataset.get_data( - target=dataset.default_target_attribute + target_names=dataset.default_target_attribute ) clf = neighbors.KNeighborsClassifier(n_neighbors=1) clf.fit(X, y) @@ -38,7 +38,7 @@ # * e.g. 
categorical features -> do feature encoding dataset = openml.datasets.get_dataset(dataset_id="credit-g", version=1) X, y, categorical_indicator, attribute_names = dataset.get_data( - target=dataset.default_target_attribute + target_names=dataset.default_target_attribute ) print(f"Categorical features: {categorical_indicator}") transformer = compose.ColumnTransformer( diff --git a/examples/30_extended/plot_svm_hyperparameters_tutorial.py b/examples/30_extended/plot_svm_hyperparameters_tutorial.py index e366c56df..491507d16 100644 --- a/examples/30_extended/plot_svm_hyperparameters_tutorial.py +++ b/examples/30_extended/plot_svm_hyperparameters_tutorial.py @@ -6,9 +6,10 @@ # License: BSD 3-Clause -import openml import numpy as np +import openml + #################################################################################################### # First step - obtaining the data # =============================== @@ -22,7 +23,6 @@ function="predictive_accuracy", flows=[8353], tasks=[6], - output_format="dataframe", # Using this flag incorporates the hyperparameters into the returned dataframe. Otherwise, # the dataframe would contain a field ``paramaters`` containing an unparsed dictionary. parameters_in_separate_columns=True, diff --git a/examples/30_extended/study_tutorial.py b/examples/30_extended/study_tutorial.py index 8715dfb4a..d1f6ccbd0 100644 --- a/examples/30_extended/study_tutorial.py +++ b/examples/30_extended/study_tutorial.py @@ -17,16 +17,11 @@ import openml - ############################################################################ # Listing studies # *************** -# -# * Use the output_format parameter to select output type -# * Default gives ``dict``, but we'll use ``dataframe`` to obtain an -# easier-to-work-with data structure -studies = openml.study.list_studies(output_format="dataframe", status="all") +studies = openml.study.list_studies(status="all") print(studies.head(n=10)) @@ -52,7 +47,6 @@ # the evaluations available for the conducted runs: evaluations = openml.evaluations.list_evaluations( function="predictive_accuracy", - output_format="dataframe", study=study.study_id, ) print(evaluations.head()) @@ -81,7 +75,7 @@ # To verify # https://test.openml.org/api/v1/study/1 suite = openml.study.get_suite("OpenML100") -print(all([t_id in suite.tasks for t_id in tasks])) +print(all(t_id in suite.tasks for t_id in tasks)) run_ids = [] for task_id in tasks: diff --git a/examples/30_extended/suites_tutorial.py b/examples/30_extended/suites_tutorial.py index 935d4c529..19f5cdc1a 100644 --- a/examples/30_extended/suites_tutorial.py +++ b/examples/30_extended/suites_tutorial.py @@ -19,16 +19,11 @@ import openml - ############################################################################ # Listing suites # ************** -# -# * Use the output_format parameter to select output type -# * Default gives ``dict``, but we'll use ``dataframe`` to obtain an -# easier-to-work-with data structure -suites = openml.study.list_suites(output_format="dataframe", status="all") +suites = openml.study.list_suites(status="all") print(suites.head(n=10)) ############################################################################ @@ -51,7 +46,7 @@ ############################################################################ # And we can use the task listing functionality to learn more about them: -tasks = openml.tasks.list_tasks(output_format="dataframe") +tasks = openml.tasks.list_tasks() # Using ``@`` in `pd.DataFrame.query < # 
https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.query.html>`_ @@ -76,7 +71,7 @@ # We'll take a random subset of at least ten tasks of all available tasks on # the test server: -all_tasks = list(openml.tasks.list_tasks(output_format="dataframe")["tid"]) +all_tasks = list(openml.tasks.list_tasks()["tid"]) task_ids_for_suite = sorted(np.random.choice(all_tasks, replace=False, size=20)) # The study needs a machine-readable and unique alias. To obtain this, diff --git a/examples/30_extended/tasks_tutorial.py b/examples/30_extended/tasks_tutorial.py index 19a7e542c..63821c7a2 100644 --- a/examples/30_extended/tasks_tutorial.py +++ b/examples/30_extended/tasks_tutorial.py @@ -9,7 +9,6 @@ import openml from openml.tasks import TaskType -import pandas as pd ############################################################################ # @@ -30,14 +29,11 @@ # ^^^^^^^^^^^^^ # # We will start by simply listing only *supervised classification* tasks. -# **openml.tasks.list_tasks()** returns a dictionary of dictionaries by default, but we -# request a +# **openml.tasks.list_tasks()** getting a # `pandas dataframe `_ -# instead to have better visualization capabilities and easier access: +# to have good visualization capabilities and easier access: -tasks = openml.tasks.list_tasks( - task_type=TaskType.SUPERVISED_CLASSIFICATION, output_format="dataframe" -) +tasks = openml.tasks.list_tasks(task_type=TaskType.SUPERVISED_CLASSIFICATION) print(tasks.columns) print(f"First 5 of {len(tasks)} tasks:") print(tasks.head()) @@ -71,21 +67,21 @@ # # Similar to listing tasks by task type, we can list tasks by tags: -tasks = openml.tasks.list_tasks(tag="OpenML100", output_format="dataframe") +tasks = openml.tasks.list_tasks(tag="OpenML100") print(f"First 5 of {len(tasks)} tasks:") print(tasks.head()) ############################################################################ # Furthermore, we can list tasks based on the dataset id: -tasks = openml.tasks.list_tasks(data_id=1471, output_format="dataframe") +tasks = openml.tasks.list_tasks(data_id=1471) print(f"First 5 of {len(tasks)} tasks:") print(tasks.head()) ############################################################################ # In addition, a size limit and an offset can be applied both separately and simultaneously: -tasks = openml.tasks.list_tasks(size=10, offset=50, output_format="dataframe") +tasks = openml.tasks.list_tasks(size=10, offset=50) print(tasks) ############################################################################ @@ -101,7 +97,7 @@ # Finally, it is also possible to list all tasks on OpenML with: ############################################################################ -tasks = openml.tasks.list_tasks(output_format="dataframe") +tasks = openml.tasks.list_tasks() print(len(tasks)) ############################################################################ @@ -195,7 +191,7 @@ # Error code for 'task already exists' if e.code == 614: # Lookup task - tasks = openml.tasks.list_tasks(data_id=128, output_format="dataframe") + tasks = openml.tasks.list_tasks(data_id=128) tasks = tasks.query( 'task_type == "Supervised Classification" ' 'and estimation_procedure == "10-fold Crossvalidation" ' diff --git a/examples/40_paper/2015_neurips_feurer_example.py b/examples/40_paper/2015_neurips_feurer_example.py index 3960c3852..f0413d27a 100644 --- a/examples/40_paper/2015_neurips_feurer_example.py +++ b/examples/40_paper/2015_neurips_feurer_example.py @@ -13,12 +13,10 @@ | Matthias Feurer, Aaron Klein, Katharina 
Eggensperger, Jost Springenberg, Manuel Blum and Frank Hutter | In *Advances in Neural Information Processing Systems 28*, 2015 | Available at https://papers.nips.cc/paper/5872-efficient-and-robust-automated-machine-learning.pdf -""" # noqa F401 +""" # License: BSD 3-Clause -import pandas as pd - import openml #################################################################################################### @@ -60,7 +58,6 @@ tasks = openml.tasks.list_tasks( task_type=openml.tasks.TaskType.SUPERVISED_CLASSIFICATION, status="all", - output_format="dataframe", ) # Query only those with holdout as the resampling startegy. @@ -68,7 +65,7 @@ task_ids = [] for did in dataset_ids: - tasks_ = list(tasks.query("did == {}".format(did)).tid) + tasks_ = list(tasks.query(f"did == {did}").tid) if len(tasks_) >= 1: # if there are multiple task, take the one with lowest ID (oldest). task_id = min(tasks_) else: diff --git a/examples/40_paper/2018_kdd_rijn_example.py b/examples/40_paper/2018_kdd_rijn_example.py index 7ec60fe53..2230ade2d 100644 --- a/examples/40_paper/2018_kdd_rijn_example.py +++ b/examples/40_paper/2018_kdd_rijn_example.py @@ -32,17 +32,18 @@ import sys -if sys.platform == "win32": # noqa +if sys.platform == "win32": print( "The pyrfr library (requirement of fanova) can currently not be installed on Windows systems" ) - exit() + sys.exit() # DEPRECATED EXAMPLE -- Avoid running this code in our CI/CD pipeline print("This example is deprecated, remove this code to use it manually.") -exit() +sys.exit() import json + import fanova import matplotlib.pyplot as plt import pandas as pd @@ -50,7 +51,6 @@ import openml - ############################################################################## # With the advent of automated machine learning, automated hyperparameter # optimization methods are by now routinely used in data mining. However, this @@ -81,7 +81,7 @@ # important when it is put on a log-scale. All these simplifications can be # addressed by defining a ConfigSpace. 
For a more elaborated example that uses # this, please see: -# https://github.com/janvanrijn/openml-pimp/blob/d0a14f3eb480f2a90008889f00041bdccc7b9265/examples/plot/plot_fanova_aggregates.py # noqa F401 +# https://github.com/janvanrijn/openml-pimp/blob/d0a14f3eb480f2a90008889f00041bdccc7b9265/examples/plot/plot_fanova_aggregates.py suite = openml.study.get_suite("OpenML100") flow_id = 7707 @@ -107,7 +107,6 @@ flows=[flow_id], tasks=[task_id], size=limit_per_task, - output_format="dataframe", ) performance_column = "value" @@ -122,7 +121,7 @@ [ dict( **{name: json.loads(value) for name, value in setup["parameters"].items()}, - **{performance_column: setup[performance_column]} + **{performance_column: setup[performance_column]}, ) for _, setup in evals.iterrows() ] diff --git a/examples/40_paper/2018_neurips_perrone_example.py b/examples/40_paper/2018_neurips_perrone_example.py index 0d72846ac..7c29ede8d 100644 --- a/examples/40_paper/2018_neurips_perrone_example.py +++ b/examples/40_paper/2018_neurips_perrone_example.py @@ -27,16 +27,17 @@ # License: BSD 3-Clause -import openml import numpy as np import pandas as pd from matplotlib import pyplot as plt -from sklearn.pipeline import Pipeline -from sklearn.impute import SimpleImputer from sklearn.compose import ColumnTransformer +from sklearn.ensemble import RandomForestRegressor +from sklearn.impute import SimpleImputer from sklearn.metrics import mean_squared_error +from sklearn.pipeline import Pipeline from sklearn.preprocessing import OneHotEncoder -from sklearn.ensemble import RandomForestRegressor + +import openml flow_type = "svm" # this example will use the smaller svm flow evaluations ############################################################################ @@ -94,7 +95,6 @@ def fetch_evaluations(run_full=False, flow_type="svm", metric="area_under_roc_cu tasks=task_ids, flows=[flow_id], uploaders=[2702], - output_format="dataframe", parameters_in_separate_columns=True, ) return eval_df, task_ids, flow_id @@ -206,7 +206,7 @@ def list_categorical_attributes(flow_type="svm"): model.fit(X, y) y_pred = model.predict(X) -print("Training RMSE : {:.5}".format(mean_squared_error(y, y_pred))) +print(f"Training RMSE : {mean_squared_error(y, y_pred):.5}") ############################################################################# diff --git a/openml/datasets/dataset.py b/openml/datasets/dataset.py index b00c458e3..12ce262e7 100644 --- a/openml/datasets/dataset.py +++ b/openml/datasets/dataset.py @@ -17,13 +17,28 @@ import xmltodict from openml.base import OpenMLBase -from openml.exceptions import PyOpenMLError from .data_feature import OpenMLDataFeature logger = logging.getLogger(__name__) +def _ensure_dataframe( + data: pd.DataFrame | pd.Series | np.ndarray | scipy.sparse.spmatrix, + attribute_names: list | None = None, +) -> pd.DataFrame: + if isinstance(data, pd.DataFrame): + return data + if scipy.sparse.issparse(data): + return pd.DataFrame.sparse.from_spmatrix(data, columns=attribute_names) + if isinstance(data, np.ndarray): + return pd.DataFrame(data, columns=attribute_names) # type: ignore + if isinstance(data, pd.Series): + return data.to_frame() + + raise TypeError(f"Data type {type(data)} not supported.") + + class OpenMLDataset(OpenMLBase): """Dataset object. 
@@ -575,13 +590,17 @@ def _cache_compressed_file_from_file( return data, categorical, attribute_names - def _parse_data_from_file(self, data_file: Path) -> tuple[list[str], list[bool], pd.DataFrame]: + def _parse_data_from_file( + self, + data_file: Path, + ) -> tuple[list[str], list[bool], pd.DataFrame | scipy.sparse.csr_matrix]: if data_file.suffix == ".arff": data, categorical, attribute_names = self._parse_data_from_arff(data_file) elif data_file.suffix == ".pq": attribute_names, categorical, data = self._parse_data_from_pq(data_file) else: raise ValueError(f"Unknown file type for file '{data_file}'.") + return attribute_names, categorical, data def _parse_data_from_pq(self, data_file: Path) -> tuple[list[str], list[bool], pd.DataFrame]: @@ -593,7 +612,7 @@ def _parse_data_from_pq(self, data_file: Path) -> tuple[list[str], list[bool], p attribute_names = list(data.columns) return attribute_names, categorical, data - def _load_data(self) -> tuple[pd.DataFrame | scipy.sparse.csr_matrix, list[bool], list[str]]: # noqa: PLR0912, C901 + def _load_data(self) -> tuple[pd.DataFrame, list[bool], list[str]]: # noqa: PLR0912, C901, PLR0915 """Load data from compressed format or arff. Download data if not present on disk.""" need_to_create_pickle = self.cache_format == "pickle" and self.data_pickle_file is None need_to_create_feather = self.cache_format == "feather" and self.data_feather_file is None @@ -604,7 +623,8 @@ def _load_data(self) -> tuple[pd.DataFrame | scipy.sparse.csr_matrix, list[bool] file_to_load = self.data_file if self.parquet_file is None else self.parquet_file assert file_to_load is not None - return self._cache_compressed_file_from_file(Path(file_to_load)) + data, cats, attrs = self._cache_compressed_file_from_file(Path(file_to_load)) + return _ensure_dataframe(data, attrs), cats, attrs # helper variable to help identify where errors occur fpath = self.data_feather_file if self.cache_format == "feather" else self.data_pickle_file @@ -616,12 +636,13 @@ def _load_data(self) -> tuple[pd.DataFrame | scipy.sparse.csr_matrix, list[bool] data = pd.read_feather(self.data_feather_file) fpath = self.feather_attribute_file - with open(self.feather_attribute_file, "rb") as fh: # noqa: PTH123 + with self.feather_attribute_file.open("rb") as fh: categorical, attribute_names = pickle.load(fh) # noqa: S301 else: assert self.data_pickle_file is not None - with open(self.data_pickle_file, "rb") as fh: # noqa: PTH123 + with self.data_pickle_file.open("rb") as fh: data, categorical, attribute_names = pickle.load(fh) # noqa: S301 + except FileNotFoundError as e: raise ValueError( f"Cannot find file for dataset {self.name} at location '{fpath}'." 
@@ -660,7 +681,7 @@ def _load_data(self) -> tuple[pd.DataFrame | scipy.sparse.csr_matrix, list[bool] file_to_load = self.data_file if self.parquet_file is None else self.parquet_file assert file_to_load is not None attr, cat, df = self._parse_data_from_file(Path(file_to_load)) - return df, cat, attr + return _ensure_dataframe(df), cat, attr data_up_to_date = isinstance(data, pd.DataFrame) or scipy.sparse.issparse(data) if self.cache_format == "pickle" and not data_up_to_date: @@ -668,79 +689,9 @@ def _load_data(self) -> tuple[pd.DataFrame | scipy.sparse.csr_matrix, list[bool] file_to_load = self.data_file if self.parquet_file is None else self.parquet_file assert file_to_load is not None - return self._cache_compressed_file_from_file(Path(file_to_load)) - return data, categorical, attribute_names + data, cats, attrs = self._cache_compressed_file_from_file(Path(file_to_load)) - # TODO(eddiebergman): Can type this better with overload - # TODO(eddiebergman): Could also techinically use scipy.sparse.sparray - @staticmethod - def _convert_array_format( - data: pd.DataFrame | pd.Series | np.ndarray | scipy.sparse.spmatrix, - array_format: Literal["array", "dataframe"], - attribute_names: list | None = None, - ) -> pd.DataFrame | pd.Series | np.ndarray | scipy.sparse.spmatrix: - """Convert a dataset to a given array format. - - Converts to numpy array if data is non-sparse. - Converts to a sparse dataframe if data is sparse. - - Parameters - ---------- - array_format : str {'array', 'dataframe'} - Desired data type of the output - - If array_format='array' - If data is non-sparse - Converts to numpy-array - Enforces numeric encoding of categorical columns - Missing values are represented as NaN in the numpy-array - else returns data as is - - If array_format='dataframe' - If data is sparse - Works only on sparse data - Converts sparse data to sparse dataframe - else returns data as is - - """ - if array_format == "array" and not isinstance(data, scipy.sparse.spmatrix): - # We encode the categories such that they are integer to be able - # to make a conversion to numeric for backward compatibility - def _encode_if_category(column: pd.Series | np.ndarray) -> pd.Series | np.ndarray: - if column.dtype.name == "category": - column = column.cat.codes.astype(np.float32) - mask_nan = column == -1 - column[mask_nan] = np.nan - return column - - if isinstance(data, pd.DataFrame): - columns = { - column_name: _encode_if_category(data.loc[:, column_name]) - for column_name in data.columns - } - data = pd.DataFrame(columns) - else: - data = _encode_if_category(data) - - try: - # TODO(eddiebergman): float32? - return_array = np.asarray(data, dtype=np.float32) - except ValueError as e: - raise PyOpenMLError( - "PyOpenML cannot handle string when returning numpy" - ' arrays. Use dataset_format="dataframe".', - ) from e - - return return_array - - if array_format == "dataframe": - if scipy.sparse.issparse(data): - data = pd.DataFrame.sparse.from_spmatrix(data, columns=attribute_names) - else: - data_type = "sparse-data" if scipy.sparse.issparse(data) else "non-sparse data" - logger.warning( - f"Cannot convert {data_type} ({type(data)}) to '{array_format}'." 
- " Returning input data.", - ) - return data + return _ensure_dataframe(data, attribute_names), categorical, attribute_names @staticmethod def _unpack_categories(series: pd.Series, categories: list) -> pd.Series: @@ -761,19 +712,13 @@ def valid_category(cat: Any) -> bool: raw_cat = pd.Categorical(col, ordered=True, categories=filtered_categories) return pd.Series(raw_cat, index=series.index, name=series.name) - def get_data( # noqa: C901, PLR0912, PLR0915 + def get_data( # noqa: C901, PLR0912 self, target: list[str] | str | None = None, include_row_id: bool = False, # noqa: FBT001, FBT002 include_ignore_attribute: bool = False, # noqa: FBT001, FBT002 - dataset_format: Literal["array", "dataframe"] = "dataframe", - ) -> tuple[ - np.ndarray | pd.DataFrame | scipy.sparse.csr_matrix, - np.ndarray | pd.DataFrame | None, - list[bool], - list[str], - ]: - """Returns dataset content as dataframes or sparse matrices. + ) -> tuple[pd.DataFrame, pd.Series | None, list[bool], list[str]]: + """Returns dataset content as dataframes. Parameters ---------- @@ -785,35 +730,20 @@ def get_data( # noqa: C901, PLR0912, PLR0915 include_ignore_attribute : boolean (default=False) Whether to include columns that are marked as "ignore" on the server in the dataset. - dataset_format : string (default='dataframe') - The format of returned dataset. - If ``array``, the returned dataset will be a NumPy array or a SciPy sparse - matrix. Support for ``array`` will be removed in 0.15. - If ``dataframe``, the returned dataset will be a Pandas DataFrame. Returns ------- - X : ndarray, dataframe, or sparse matrix, shape (n_samples, n_columns) - Dataset - y : ndarray or pd.Series, shape (n_samples, ) or None + X : dataframe, shape (n_samples, n_columns) + Dataset, may have sparse dtypes in the columns if required. + y : pd.Series, shape (n_samples, ) or None Target column - categorical_indicator : boolean ndarray + categorical_indicator : list[bool] Mask that indicate categorical features. - attribute_names : List[str] + attribute_names : list[str] List of attribute names. """ - # TODO: [0.15] - if dataset_format == "array": - warnings.warn( - "Support for `dataset_format='array'` will be removed in 0.15," - "start using `dataset_format='dataframe' to ensure your code " - "will continue to work. 
You can use the dataframe's `to_numpy` " - "function to continue using numpy arrays.", - category=FutureWarning, - stacklevel=2, - ) - data, categorical, attribute_names = self._load_data() + data, categorical_mask, attribute_names = self._load_data() to_exclude = [] if not include_row_id and self.row_id_attribute is not None: @@ -831,54 +761,43 @@ def get_data( # noqa: C901, PLR0912, PLR0915 if len(to_exclude) > 0: logger.info(f"Going to remove the following attributes: {to_exclude}") keep = np.array([column not in to_exclude for column in attribute_names]) - data = data.loc[:, keep] if isinstance(data, pd.DataFrame) else data[:, keep] - - categorical = [cat for cat, k in zip(categorical, keep) if k] + data = data.drop(columns=to_exclude) + categorical_mask = [cat for cat, k in zip(categorical_mask, keep) if k] attribute_names = [att for att, k in zip(attribute_names, keep) if k] if target is None: - data = self._convert_array_format(data, dataset_format, attribute_names) # type: ignore - targets = None + return data, None, categorical_mask, attribute_names + + if isinstance(target, str): + target_names = target.split(",") if "," in target else [target] else: - if isinstance(target, str): - target = target.split(",") if "," in target else [target] - targets = np.array([column in target for column in attribute_names]) - target_names = [column for column in attribute_names if column in target] - if np.sum(targets) > 1: - raise NotImplementedError( - "Number of requested targets %d is not implemented." % np.sum(targets), - ) - target_categorical = [ - cat for cat, column in zip(categorical, attribute_names) if column in target - ] - target_dtype = int if target_categorical[0] else float - - if isinstance(data, pd.DataFrame): - x = data.iloc[:, ~targets] - y = data.iloc[:, targets] - else: - x = data[:, ~targets] - y = data[:, targets].astype(target_dtype) # type: ignore - - categorical = [cat for cat, t in zip(categorical, targets) if not t] - attribute_names = [att for att, k in zip(attribute_names, targets) if not k] - - x = self._convert_array_format(x, dataset_format, attribute_names) # type: ignore - if dataset_format == "array" and scipy.sparse.issparse(y): - # scikit-learn requires dense representation of targets - y = np.asarray(y.todense()).astype(target_dtype) - # dense representation of single column sparse arrays become a 2-d array - # need to flatten it to a 1-d array for _convert_array_format() - y = y.squeeze() - y = self._convert_array_format(y, dataset_format, target_names) - y = y.astype(target_dtype) if isinstance(y, np.ndarray) else y - if len(y.shape) > 1 and y.shape[1] == 1: - # single column targets should be 1-d for both `array` and `dataframe` formats - assert isinstance(y, (np.ndarray, pd.DataFrame, pd.Series)) - y = y.squeeze() - data, targets = x, y - - return data, targets, categorical, attribute_names # type: ignore + target_names = target + + # All the assumptions below for the target are dependant on the number of targets being 1 + n_targets = len(target_names) + if n_targets > 1: + raise NotImplementedError(f"Number of targets {n_targets} not implemented.") + + target_name = target_names[0] + x = data.drop(columns=[target_name]) + y = data[target_name].squeeze() + + if isinstance(y.dtype, pd.SparseDtype): + y = y.sparse.to_dense() + + # Since it was sparsified, the y column may not be of the correct dtype, hence we check + # if it was categorical and convert it to and integer if needs be. 
+ category_names = data.columns[categorical_mask] + if target_name in category_names: + y = y.astype(int) + + # Finally, remove the target from the list of attributes and categorical mask + target_index = attribute_names.index(target_name) + categorical_mask.pop(target_index) + attribute_names.remove(target_name) + + assert isinstance(y, pd.Series) + return x, y, categorical_mask, attribute_names def _load_features(self) -> None: """Load the features metadata from the server and store it in the dataset object.""" diff --git a/openml/datasets/functions.py b/openml/datasets/functions.py index 522e9d84d..d9f8d36a0 100644 --- a/openml/datasets/functions.py +++ b/openml/datasets/functions.py @@ -84,10 +84,10 @@ def list_datasets( tag: str | None = None, data_name: str | None = None, data_version: int | None = None, - number_instances: int | None = None, - number_features: int | None = None, - number_classes: int | None = None, - number_missing_values: int | None = None, + number_instances: int | str | None = None, + number_features: int | str | None = None, + number_classes: int | str | None = None, + number_missing_values: int | str | None = None, ) -> pd.DataFrame: """Return a dataframe of all dataset which are on OpenML. @@ -109,10 +109,10 @@ def list_datasets( tag : str, optional data_name : str, optional data_version : int, optional - number_instances : int, optional - number_features : int, optional - number_classes : int, optional - number_missing_values : int, optional + number_instances : int | str, optional + number_features : int | str, optional + number_classes : int | str, optional + number_missing_values : int | str, optional Returns ------- @@ -139,6 +139,9 @@ def list_datasets( number_missing_values=number_missing_values, ) batches = openml.utils._list_all(listing_call, offset=offset, limit=size) + if len(batches) == 0: + return pd.DataFrame() + return pd.concat(batches) @@ -174,11 +177,12 @@ def _list_datasets( ------- datasets : dataframe """ - api_call = f"data/list/{limit}/offset/{offset}" + api_call = f"data/list/limit/{limit}/offset/{offset}" if kwargs is not None: for operator, value in kwargs.items(): - api_call += f"/{operator}/{value}" + if value is not None: + api_call += f"/{operator}/{value}" if data_id is not None: api_call += "/data_id/{}".format(",".join([str(int(i)) for i in data_id])) return __list_datasets(api_call=api_call) @@ -213,7 +217,13 @@ def __list_datasets(api_call: str) -> pd.DataFrame: dataset[quality["@name"]] = float(quality["#text"]) datasets[dataset["did"]] = dataset - return pd.DataFrame.from_dict(datasets, orient="index") + return pd.DataFrame.from_dict(datasets, orient="index").astype( + { + "did": int, + "version": int, + "status": pd.CategoricalDtype(["active", "deactivated", "in_preparation"]), + } + ) def _expand_parameter(parameter: str | list[str] | None) -> list[str]: diff --git a/openml/evaluations/functions.py b/openml/evaluations/functions.py index 0adab61ac..bee326438 100644 --- a/openml/evaluations/functions.py +++ b/openml/evaluations/functions.py @@ -21,37 +21,37 @@ @overload def list_evaluations( function: str, - offset: int | None = ..., - size: int | None = ..., - tasks: list[str | int] | None = ..., - setups: list[str | int] | None = ..., - flows: list[str | int] | None = ..., - runs: list[str | int] | None = ..., - uploaders: list[str | int] | None = ..., - tag: str | None = ..., - study: int | None = ..., - per_fold: bool | None = ..., - sort_order: str | None = ..., - output_format: Literal["object"] = "object", -) -> 
dict[int, OpenMLEvaluation]: ... + offset: int | None = None, + size: int | None = None, + tasks: list[str | int] | None = None, + setups: list[str | int] | None = None, + flows: list[str | int] | None = None, + runs: list[str | int] | None = None, + uploaders: list[str | int] | None = None, + tag: str | None = None, + study: int | None = None, + per_fold: bool | None = None, + sort_order: str | None = None, + output_format: Literal["dataframe"] = ..., +) -> pd.DataFrame: ... @overload def list_evaluations( function: str, - offset: int | None = ..., - size: int | None = ..., - tasks: list[str | int] | None = ..., - setups: list[str | int] | None = ..., - flows: list[str | int] | None = ..., - runs: list[str | int] | None = ..., - uploaders: list[str | int] | None = ..., - tag: str | None = ..., - study: int | None = ..., - per_fold: bool | None = ..., - sort_order: str | None = ..., - output_format: Literal["dataframe"] = ..., -) -> pd.DataFrame: ... + offset: int | None = None, + size: int | None = None, + tasks: list[str | int] | None = None, + setups: list[str | int] | None = None, + flows: list[str | int] | None = None, + runs: list[str | int] | None = None, + uploaders: list[str | int] | None = None, + tag: str | None = None, + study: int | None = None, + per_fold: bool | None = None, + sort_order: str | None = None, + output_format: Literal["object"] = "object", +) -> dict[int, OpenMLEvaluation]: ... def list_evaluations( @@ -137,12 +137,12 @@ def list_evaluations( flattened = list(chain.from_iterable(eval_collection)) if output_format == "dataframe": records = [item._to_dict() for item in flattened] - return pd.DataFrame.from_records(records, index="run_id") + return pd.DataFrame.from_records(records) # No index... return {e.run_id: e for e in flattened} -def _list_evaluations( +def _list_evaluations( # noqa: C901 limit: int, offset: int, *, @@ -197,7 +197,8 @@ def _list_evaluations( api_call = f"evaluation/list/function/{function}/limit/{limit}/offset/{offset}" if kwargs is not None: for operator, value in kwargs.items(): - api_call += f"/{operator}/{value}" + if value is not None: + api_call += f"/{operator}/{value}" if tasks is not None: api_call += "/task/{}".format(",".join([str(int(i)) for i in tasks])) if setups is not None: @@ -243,7 +244,7 @@ def __list_evaluations(api_call: str) -> list[OpenMLEvaluation]: for eval_ in evals_dict["oml:evaluations"]["oml:evaluation"]: run_id = int(eval_["oml:run_id"]) value = float(eval_["oml:value"]) if "oml:value" in eval_ else None - values = json.loads(eval_["oml:values"]) if eval_["oml:values"] is not None else None + values = json.loads(eval_["oml:values"]) if eval_.get("oml:values", None) else None array_data = eval_.get("oml:array_data") evals.append( @@ -408,10 +409,10 @@ def list_evaluations_setups( assert isinstance(result, pd.DataFrame) result = result.drop("flow_id", axis=1) # concat resulting setup chunks into single datframe - setup_data = pd.concat([setup_data, result], ignore_index=True) + setup_data = pd.concat([setup_data, result]) parameters = [] - # Convert parameters of setup into list of tuples of (hyperparameter, value) + # Convert parameters of setup into dict of (hyperparameter, value) for parameter_dict in setup_data["parameters"]: if parameter_dict is not None: parameters.append( diff --git a/openml/flows/functions.py b/openml/flows/functions.py index f1f1da6fb..7cb93386b 100644 --- a/openml/flows/functions.py +++ b/openml/flows/functions.py @@ -167,7 +167,10 @@ def list_flows( """ listing_call = 
partial(_list_flows, tag=tag, uploader=uploader) batches = openml.utils._list_all(listing_call, offset=offset, limit=size) - return pd.concat(batches, ignore_index=True) + if len(batches) == 0: + return pd.DataFrame() + + return pd.concat(batches) def _list_flows(limit: int, offset: int, **kwargs: Any) -> pd.DataFrame: diff --git a/openml/runs/functions.py b/openml/runs/functions.py index 67a035825..ab852a029 100644 --- a/openml/runs/functions.py +++ b/openml/runs/functions.py @@ -1094,6 +1094,9 @@ def list_runs( # noqa: PLR0913 task_type=task_type, ) batches = openml.utils._list_all(listing_call, offset=offset, limit=size) + if len(batches) == 0: + return pd.DataFrame() + return pd.concat(batches) diff --git a/openml/setups/functions.py b/openml/setups/functions.py index 0c3095501..1b7ffb329 100644 --- a/openml/setups/functions.py +++ b/openml/setups/functions.py @@ -90,7 +90,7 @@ def _get_cached_setup(setup_id: int) -> OpenMLSetup: setup_file = setup_cache_dir / "description.xml" with setup_file.open(encoding="utf8") as fh: setup_xml = xmltodict.parse(fh.read()) - return _create_setup_from_xml(setup_xml, output_format="object") # type: ignore + return _create_setup_from_xml(setup_xml) except OSError as e: raise openml.exceptions.OpenMLCacheException( @@ -120,7 +120,7 @@ def get_setup(setup_id: int) -> OpenMLSetup: try: return _get_cached_setup(setup_id) except openml.exceptions.OpenMLCacheException: - url_suffix = "/setup/%d" % setup_id + url_suffix = f"/setup/{setup_id}" setup_xml = openml._api_calls._perform_api_call(url_suffix, "get") with setup_file.open("w", encoding="utf8") as fh: fh.write(setup_xml) @@ -172,7 +172,8 @@ def list_setups( # noqa: PLR0913 if output_format == "object": return {setup.setup_id: setup for setup in flattened} - return pd.DataFrame.from_records([setup._to_dict() for setup in flattened], index="setup_id") + records = [setup._to_dict() for setup in flattened] + return pd.DataFrame.from_records(records, index="setup_id") def _list_setups( diff --git a/openml/setups/setup.py b/openml/setups/setup.py index 83b258dd7..a98e832c0 100644 --- a/openml/setups/setup.py +++ b/openml/setups/setup.py @@ -39,7 +39,7 @@ def _to_dict(self) -> dict[str, Any]: return { "setup_id": self.setup_id, "flow_id": self.flow_id, - "parameters": {p.input_id: p._to_dict() for p in self.parameters.values()} + "parameters": {p.id: p._to_dict() for p in self.parameters.values()} if self.parameters is not None else None, } diff --git a/openml/study/functions.py b/openml/study/functions.py index 59f255b47..fb71272a1 100644 --- a/openml/study/functions.py +++ b/openml/study/functions.py @@ -473,7 +473,10 @@ def list_suites( uploader=uploader, ) batches = openml.utils._list_all(listing_call, limit=size, offset=offset) - return pd.concat(batches, ignore_index=True) + if len(batches) == 0: + return pd.DataFrame() + + return pd.concat(batches) def list_studies( @@ -522,7 +525,10 @@ def list_studies( benchmark_suite=benchmark_suite, ) batches = openml.utils._list_all(listing_call, offset=offset, limit=size) - return pd.concat(batches, ignore_index=True) + if len(batches) == 0: + return pd.DataFrame() + + return pd.concat(batches) def _list_studies(limit: int, offset: int, **kwargs: Any) -> pd.DataFrame: @@ -561,8 +567,7 @@ def __list_studies(api_call: str) -> pd.DataFrame: Returns ------- pd.DataFrame - A dictionary or Pandas DataFrame of OpenML studies, - depending on the value of 'output_format'. 
+ A Pandas DataFrame of OpenML studies """ xml_string = openml._api_calls._perform_api_call(api_call, "get") study_dict = xmltodict.parse(xml_string, force_list=("oml:study",)) diff --git a/openml/tasks/functions.py b/openml/tasks/functions.py index 1f4ec71f8..497e35c69 100644 --- a/openml/tasks/functions.py +++ b/openml/tasks/functions.py @@ -134,6 +134,7 @@ def list_tasks( # noqa: PLR0913 data_tag: str | None = None, status: str | None = None, data_name: str | None = None, + data_id: int | None = None, number_instances: int | None = None, number_features: int | None = None, number_classes: int | None = None, @@ -157,6 +158,7 @@ def list_tasks( # noqa: PLR0913 the tag to include data_tag : str, optional the tag of the dataset + data_id : int, optional status : str, optional data_name : str, optional number_instances : int, optional @@ -179,6 +181,7 @@ def list_tasks( # noqa: PLR0913 tag=tag, data_tag=data_tag, status=status, + data_id=data_id, data_name=data_name, number_instances=number_instances, number_features=number_features, @@ -186,7 +189,10 @@ def list_tasks( # noqa: PLR0913 number_missing_values=number_missing_values, ) batches = openml.utils._list_all(listing_call, offset=offset, limit=size) - return pd.concat(batches, ignore_index=True) + if len(batches) == 0: + return pd.DataFrame() + + return pd.concat(batches) def _list_tasks( @@ -222,9 +228,10 @@ def _list_tasks( api_call += f"/type/{tvalue}" if kwargs is not None: for operator, value in kwargs.items(): - if operator == "task_id": - value = ",".join([str(int(i)) for i in value]) # noqa: PLW2901 - api_call += f"/{operator}/{value}" + if value is not None: + if operator == "task_id": + value = ",".join([str(int(i)) for i in value]) # noqa: PLW2901 + api_call += f"/{operator}/{value}" return __list_tasks(api_call=api_call) diff --git a/openml/tasks/task.py b/openml/tasks/task.py index 395b52482..c3079db91 100644 --- a/openml/tasks/task.py +++ b/openml/tasks/task.py @@ -292,7 +292,7 @@ def get_X_and_y(self) -> tuple[pd.DataFrame, pd.Series | pd.DataFrame | None]: ): raise NotImplementedError(self.task_type) - X, y, _, _ = dataset.get_data(target=self.target_name) + X, y, _, _ = dataset.get_data(target_names=self.target_name) return X, y def _to_dict(self) -> dict[str, dict]: @@ -502,7 +502,7 @@ def get_X(self) -> pd.DataFrame: The X data as a dataframe """ dataset = self.get_dataset() - data, *_ = dataset.get_data(target=None) + data, *_ = dataset.get_data(target_names=None) return data def _to_dict(self) -> dict[str, dict[str, int | str | list[dict[str, Any]]]]: diff --git a/tests/test_datasets/test_dataset.py b/tests/test_datasets/test_dataset.py index 102429b41..0675c21d2 100644 --- a/tests/test_datasets/test_dataset.py +++ b/tests/test_datasets/test_dataset.py @@ -120,7 +120,7 @@ def test_get_data_pandas(self): assert data[col_name].dtype.name == col_dtype[col_name] X, y, _, _ = self.titanic.get_data( - target=self.titanic.default_target_attribute, + target_names=self.titanic.default_target_attribute, ) assert isinstance(X, pd.DataFrame) assert isinstance(y, pd.Series) @@ -171,7 +171,7 @@ def test_get_data_with_rowid(self): @pytest.mark.skip("https://github.com/openml/openml-python/issues/1157") def test_get_data_with_target_pandas(self): - X, y, categorical, attribute_names = self.dataset.get_data(target="class") + X, y, categorical, attribute_names = self.dataset.get_data(target_names="class") assert isinstance(X, pd.DataFrame) for dtype, is_cat, col in zip(X.dtypes, categorical, X): self._check_expected_type(dtype, 
is_cat, X[col]) @@ -187,7 +187,7 @@ def test_get_data_with_target_pandas(self): def test_get_data_rowid_and_ignore_and_target(self): self.dataset.ignore_attribute = ["condition"] self.dataset.row_id_attribute = ["hardness"] - X, y, categorical, names = self.dataset.get_data(target="class") + X, y, categorical, names = self.dataset.get_data(target_names="class") assert X.shape == (898, 36) assert len(categorical) == 36 cats = [True] * 3 + [False, True, True, False] + [True] * 23 + [False] * 3 + [True] * 3 @@ -273,7 +273,8 @@ def test_lazy_loading_metadata(self): def test_equality_comparison(self): self.assertEqual(self.iris, self.iris) self.assertNotEqual(self.iris, self.titanic) - self.assertNotEqual(self.titanic, 'Wrong_object') + self.assertNotEqual(self.titanic, "Wrong_object") + class OpenMLDatasetTestOnTestServer(TestBase): def setUp(self): @@ -285,14 +286,14 @@ def test_tagging(self): # tags can be at most 64 alphanumeric (+ underscore) chars unique_indicator = str(time()).replace(".", "") tag = f"test_tag_OpenMLDatasetTestOnTestServer_{unique_indicator}" - datasets = openml.datasets.list_datasets(tag=tag, output_format="dataframe") + datasets = openml.datasets.list_datasets(tag=tag) assert datasets.empty self.dataset.push_tag(tag) - datasets = openml.datasets.list_datasets(tag=tag, output_format="dataframe") + datasets = openml.datasets.list_datasets(tag=tag) assert len(datasets) == 1 assert 125 in datasets["did"] self.dataset.remove_tag(tag) - datasets = openml.datasets.list_datasets(tag=tag, output_format="dataframe") + datasets = openml.datasets.list_datasets(tag=tag) assert datasets.empty def test_get_feature_with_ontology_data_id_11(self): @@ -347,7 +348,7 @@ def setUp(self): self.sparse_dataset = openml.datasets.get_dataset(4136, download_data=False) def test_get_sparse_dataset_dataframe_with_target(self): - X, y, _, attribute_names = self.sparse_dataset.get_data(target="class") + X, y, _, attribute_names = self.sparse_dataset.get_data(target_names="class") assert isinstance(X, pd.DataFrame) assert isinstance(X.dtypes[0], pd.SparseDtype) assert X.shape == (600, 20000) @@ -374,12 +375,11 @@ def test_get_sparse_dataset_rowid_and_ignore_and_target(self): self.sparse_dataset.row_id_attribute = ["V512"] # TODO(eddiebergman): Will break from dataset_format removal X, y, categorical, _ = self.sparse_dataset.get_data( - target="class", + target_names="class", include_row_id=False, include_ignore_attribute=False, ) - assert sparse.issparse(X) - assert X.dtype == np.float32 + assert all(dtype == pd.SparseDtype(np.float32, fill_value=0.0) for dtype in X.dtypes) assert y.dtype in [np.int32, np.int64] assert X.shape == (600, 19998) diff --git a/tests/test_datasets/test_dataset_functions.py b/tests/test_datasets/test_dataset_functions.py index 3995f1b98..d6b85689a 100644 --- a/tests/test_datasets/test_dataset_functions.py +++ b/tests/test_datasets/test_dataset_functions.py @@ -113,9 +113,8 @@ def test_tag_untag_dataset(self): all_tags = _tag_entity("data", 1, tag, untag=True) assert tag not in all_tags - def test_list_datasets_output_format(self): - datasets = openml.datasets.list_datasets(output_format="dataframe") - assert isinstance(datasets, pd.DataFrame) + def test_list_datasets_length(self): + datasets = openml.datasets.list_datasets() assert len(datasets) >= 100 def test_list_datasets_paginate(self): @@ -123,14 +122,18 @@ def test_list_datasets_paginate(self): max = 100 for i in range(0, max, size): datasets = openml.datasets.list_datasets(offset=i, size=size) - assert size == 
len(datasets) - self._check_datasets(datasets) + assert len(datasets) == size + assert len(datasets.columns) >= 2 + # Maybe index? + assert "did" in datasets.columns + assert datasets["did"].dtype == int + assert "status" in datasets.columns + assert datasets["status"].dtype == pd.CategoricalDtype( + categories=["in_preparation", "active", "deactivated"], + ) def test_list_datasets_empty(self): - datasets = openml.datasets.list_datasets( - tag="NoOneWouldUseThisTagAnyway", - output_format="dataframe", - ) + datasets = openml.datasets.list_datasets(tag="NoOneWouldUseThisTagAnyway") assert datasets.empty @pytest.mark.production() @@ -308,9 +311,9 @@ def ensure_absence_of_real_data(): def test_get_dataset_sparse(self): dataset = openml.datasets.get_dataset(102) - # TODO(eddiebergman): Will break from dataset_format removal X, *_ = dataset.get_data() - assert isinstance(X, scipy.sparse.csr_matrix) + assert isinstance(X, pd.DataFrame) + assert all(isinstance(col, pd.SparseDtype) for col in X.dtypes) def test_download_rowid(self): # Smoke test which checks that the dataset has the row-id set correctly @@ -570,11 +573,7 @@ def test_upload_dataset_with_url(self): def _assert_status_of_dataset(self, *, did: int, status: str): """Asserts there is exactly one dataset with id `did` and its current status is `status`""" # need to use listing fn, as this is immune to cache - result = openml.datasets.list_datasets( - data_id=[did], - status="all", - output_format="dataframe", - ) + result = openml.datasets.list_datasets(data_id=[did], status="all") result = result.to_dict(orient="index") # I think we should drop the test that one result is returned, # the server should never return multiple results? @@ -1522,8 +1521,8 @@ def test_list_datasets_with_high_size_parameter(self): # Testing on prod since concurrent deletion of uploded datasets make the test fail openml.config.server = self.production_server - datasets_a = openml.datasets.list_datasets(output_format="dataframe") - datasets_b = openml.datasets.list_datasets(output_format="dataframe", size=np.inf) + datasets_a = openml.datasets.list_datasets() + datasets_b = openml.datasets.list_datasets(size=np.inf) # Reverting to test server openml.config.server = self.test_server @@ -1792,7 +1791,7 @@ def _assert_datasets_have_id_and_valid_status(datasets: pd.DataFrame): @pytest.fixture(scope="module") def all_datasets(): - return openml.datasets.list_datasets(output_format="dataframe") + return openml.datasets.list_datasets() def test_list_datasets(all_datasets: pd.DataFrame): @@ -1804,49 +1803,37 @@ def test_list_datasets(all_datasets: pd.DataFrame): def test_list_datasets_by_tag(all_datasets: pd.DataFrame): - tag_datasets = openml.datasets.list_datasets(tag="study_14", output_format="dataframe") + tag_datasets = openml.datasets.list_datasets(tag="study_14") assert 0 < len(tag_datasets) < len(all_datasets) _assert_datasets_have_id_and_valid_status(tag_datasets) def test_list_datasets_by_size(): - datasets = openml.datasets.list_datasets(size=5, output_format="dataframe") + datasets = openml.datasets.list_datasets(size=5) assert len(datasets) == 5 _assert_datasets_have_id_and_valid_status(datasets) def test_list_datasets_by_number_instances(all_datasets: pd.DataFrame): - small_datasets = openml.datasets.list_datasets( - number_instances="5..100", - output_format="dataframe", - ) + small_datasets = openml.datasets.list_datasets(number_instances="5..100") assert 0 < len(small_datasets) <= len(all_datasets) 
_assert_datasets_have_id_and_valid_status(small_datasets) def test_list_datasets_by_number_features(all_datasets: pd.DataFrame): - wide_datasets = openml.datasets.list_datasets( - number_features="50..100", - output_format="dataframe", - ) + wide_datasets = openml.datasets.list_datasets(number_features="50..100") assert 8 <= len(wide_datasets) < len(all_datasets) _assert_datasets_have_id_and_valid_status(wide_datasets) def test_list_datasets_by_number_classes(all_datasets: pd.DataFrame): - five_class_datasets = openml.datasets.list_datasets( - number_classes="5", - output_format="dataframe", - ) + five_class_datasets = openml.datasets.list_datasets(number_classes="5") assert 3 <= len(five_class_datasets) < len(all_datasets) _assert_datasets_have_id_and_valid_status(five_class_datasets) def test_list_datasets_by_number_missing_values(all_datasets: pd.DataFrame): - na_datasets = openml.datasets.list_datasets( - number_missing_values="5..100", - output_format="dataframe", - ) + na_datasets = openml.datasets.list_datasets(number_missing_values="5..100") assert 5 <= len(na_datasets) < len(all_datasets) _assert_datasets_have_id_and_valid_status(na_datasets) @@ -1856,7 +1843,6 @@ def test_list_datasets_combined_filters(all_datasets: pd.DataFrame): tag="study_14", number_instances="100..1000", number_missing_values="800..1000", - output_format="dataframe", ) assert 1 <= len(combined_filter_datasets) < len(all_datasets) _assert_datasets_have_id_and_valid_status(combined_filter_datasets) @@ -1956,8 +1942,12 @@ def test_get_dataset_with_invalid_id() -> None: openml.datasets.get_dataset(INVALID_ID) assert e.value.code == 111 + def test_read_features_from_xml_with_whitespace() -> None: from openml.datasets.dataset import _read_features - features_file = Path(__file__).parent.parent / "files" / "misc" / "features_with_whitespaces.xml" + + features_file = ( + Path(__file__).parent.parent / "files" / "misc" / "features_with_whitespaces.xml" + ) dict = _read_features(features_file) assert dict[1].nominal_values == [" - 50000.", " 50000+."] diff --git a/tests/test_evaluations/test_evaluation_functions.py b/tests/test_evaluations/test_evaluation_functions.py index 7af01384f..37b0ce7c8 100644 --- a/tests/test_evaluations/test_evaluation_functions.py +++ b/tests/test_evaluations/test_evaluation_functions.py @@ -17,7 +17,6 @@ def _check_list_evaluation_setups(self, **kwargs): "predictive_accuracy", **kwargs, sort_order="desc", - output_format="dataframe", ) evals = openml.evaluations.list_evaluations( "predictive_accuracy", @@ -250,7 +249,6 @@ def test_list_evaluations_setups_filter_flow(self): flows=flow_id, size=size, sort_order="desc", - output_format="dataframe", parameters_in_separate_columns=True, ) columns = list(evals_cols.columns) diff --git a/tests/test_evaluations/test_evaluations_example.py b/tests/test_evaluations/test_evaluations_example.py index bf5b03f3f..aabf40bda 100644 --- a/tests/test_evaluations/test_evaluations_example.py +++ b/tests/test_evaluations/test_evaluations_example.py @@ -8,6 +8,7 @@ class TestEvaluationsExample(unittest.TestCase): def test_example_python_paper(self): # Example script which will appear in the upcoming OpenML-Python paper # This test ensures that the example will keep running! + # import matplotlib.pyplot as plt import numpy as np @@ -18,7 +19,7 @@ def test_example_python_paper(self): "predictive_accuracy", flows=[8353], tasks=[6], - output_format="dataframe", + # output_format="dataframe", ... 
This was removed parameters_in_separate_columns=True, ) # Choose an SVM flow, for example 8353, and a task. diff --git a/tests/test_extensions/test_sklearn_extension/test_sklearn_extension.py b/tests/test_extensions/test_sklearn_extension/test_sklearn_extension.py index 242f6793c..aeae743c2 100644 --- a/tests/test_extensions/test_sklearn_extension/test_sklearn_extension.py +++ b/tests/test_extensions/test_sklearn_extension/test_sklearn_extension.py @@ -2220,7 +2220,7 @@ def test_run_on_model_with_empty_steps(self): task = openml.tasks.get_task(59) # mfeat-pixel; crossvalidation X, y, categorical_ind, feature_names = dataset.get_data( - target=dataset.default_target_attribute, + target_names=dataset.default_target_attribute, ) categorical_ind = np.array(categorical_ind) (cat_idx,) = np.where(categorical_ind) diff --git a/tests/test_flows/test_flow.py b/tests/test_flows/test_flow.py index dafbeaf3c..dcf074c8f 100644 --- a/tests/test_flows/test_flow.py +++ b/tests/test_flows/test_flow.py @@ -101,20 +101,20 @@ def test_get_structure(self): assert subflow.flow_id == sub_flow_id def test_tagging(self): - flows = openml.flows.list_flows(size=1, output_format="dataframe") + flows = openml.flows.list_flows(size=1) flow_id = flows["id"].iloc[0] flow = openml.flows.get_flow(flow_id) # tags can be at most 64 alphanumeric (+ underscore) chars unique_indicator = str(time.time()).replace(".", "") tag = f"test_tag_TestFlow_{unique_indicator}" - flows = openml.flows.list_flows(tag=tag, output_format="dataframe") + flows = openml.flows.list_flows(tag=tag) assert len(flows) == 0 flow.push_tag(tag) - flows = openml.flows.list_flows(tag=tag, output_format="dataframe") + flows = openml.flows.list_flows(tag=tag) assert len(flows) == 1 assert flow_id in flows["id"] flow.remove_tag(tag) - flows = openml.flows.list_flows(tag=tag, output_format="dataframe") + flows = openml.flows.list_flows(tag=tag) assert len(flows) == 0 def test_from_xml_to_xml(self): @@ -156,7 +156,9 @@ def test_from_xml_to_xml(self): @pytest.mark.sklearn() def test_to_xml_from_xml(self): scaler = sklearn.preprocessing.StandardScaler(with_mean=False) - estimator_name = "base_estimator" if Version(sklearn.__version__) < Version("1.4") else "estimator" + estimator_name = ( + "base_estimator" if Version(sklearn.__version__) < Version("1.4") else "estimator" + ) boosting = sklearn.ensemble.AdaBoostClassifier( **{estimator_name: sklearn.tree.DecisionTreeClassifier()}, ) @@ -269,12 +271,14 @@ def test_semi_legal_flow(self): # TODO: Test if parameters are set correctly! 
# should not throw error as it contains two differentiable forms of # Bagging i.e., Bagging(Bagging(J48)) and Bagging(J48) - estimator_name = "base_estimator" if Version(sklearn.__version__) < Version("1.4") else "estimator" + estimator_name = ( + "base_estimator" if Version(sklearn.__version__) < Version("1.4") else "estimator" + ) semi_legal = sklearn.ensemble.BaggingClassifier( **{ estimator_name: sklearn.ensemble.BaggingClassifier( **{ - estimator_name:sklearn.tree.DecisionTreeClassifier(), + estimator_name: sklearn.tree.DecisionTreeClassifier(), } ) } @@ -428,7 +432,9 @@ def test_sklearn_to_upload_to_flow(self): percentile=30, ) fu = sklearn.pipeline.FeatureUnion(transformer_list=[("pca", pca), ("fs", fs)]) - estimator_name = "base_estimator" if Version(sklearn.__version__) < Version("1.4") else "estimator" + estimator_name = ( + "base_estimator" if Version(sklearn.__version__) < Version("1.4") else "estimator" + ) boosting = sklearn.ensemble.AdaBoostClassifier( **{estimator_name: sklearn.tree.DecisionTreeClassifier()}, ) @@ -499,7 +505,9 @@ def test_sklearn_to_upload_to_flow(self): assert new_flow is not flow # OneHotEncoder was moved to _encoders module in 0.20 - module_name_encoder = "_encoders" if Version(sklearn.__version__) >= Version("0.20") else "data" + module_name_encoder = ( + "_encoders" if Version(sklearn.__version__) >= Version("0.20") else "data" + ) if Version(sklearn.__version__) < Version("0.22"): fixture_name = ( f"{sentinel}sklearn.model_selection._search.RandomizedSearchCV(" diff --git a/tests/test_flows/test_flow_functions.py b/tests/test_flows/test_flow_functions.py index b3d5be1a6..a25c2d740 100644 --- a/tests/test_flows/test_flow_functions.py +++ b/tests/test_flows/test_flow_functions.py @@ -51,7 +51,7 @@ def test_list_flows(self): openml.config.server = self.production_server # We can only perform a smoke test here because we test on dynamic # data from the internet... - flows = openml.flows.list_flows(output_format="dataframe") + flows = openml.flows.list_flows() # 3000 as the number of flows on openml.org assert len(flows) >= 1500 for flow in flows.to_dict(orient="index").values(): @@ -62,20 +62,20 @@ def test_list_flows_output_format(self): openml.config.server = self.production_server # We can only perform a smoke test here because we test on dynamic # data from the internet... 
- flows = openml.flows.list_flows(output_format="dataframe") + flows = openml.flows.list_flows() assert isinstance(flows, pd.DataFrame) assert len(flows) >= 1500 @pytest.mark.production() def test_list_flows_empty(self): openml.config.server = self.production_server - flows = openml.flows.list_flows(tag="NoOneEverUsesThisTag123", output_format="dataframe") + flows = openml.flows.list_flows(tag="NoOneEverUsesThisTag123") assert flows.empty @pytest.mark.production() def test_list_flows_by_tag(self): openml.config.server = self.production_server - flows = openml.flows.list_flows(tag="weka", output_format="dataframe") + flows = openml.flows.list_flows(tag="weka") assert len(flows) >= 5 for flow in flows.to_dict(orient="index").values(): self._check_flow(flow) @@ -86,7 +86,7 @@ def test_list_flows_paginate(self): size = 10 maximum = 100 for i in range(0, maximum, size): - flows = openml.flows.list_flows(offset=i, size=size, output_format="dataframe") + flows = openml.flows.list_flows(offset=i, size=size) assert size >= len(flows) for flow in flows.to_dict(orient="index").values(): self._check_flow(flow) @@ -199,14 +199,18 @@ def test_are_flows_equal_ignore_parameter_values(self): new_flow.parameters["a"] = 7 with pytest.raises(ValueError) as excinfo: openml.flows.functions.assert_flows_equal(flow, new_flow) - assert str(paramaters) in str(excinfo.value) and str(new_flow.parameters) in str(excinfo.value) + assert str(paramaters) in str(excinfo.value) and str(new_flow.parameters) in str( + excinfo.value + ) openml.flows.functions.assert_flows_equal(flow, new_flow, ignore_parameter_values=True) del new_flow.parameters["a"] with pytest.raises(ValueError) as excinfo: openml.flows.functions.assert_flows_equal(flow, new_flow) - assert str(paramaters) in str(excinfo.value) and str(new_flow.parameters) in str(excinfo.value) + assert str(paramaters) in str(excinfo.value) and str(new_flow.parameters) in str( + excinfo.value + ) self.assertRaisesRegex( ValueError, diff --git a/tests/test_openml/test_api_calls.py b/tests/test_openml/test_api_calls.py index 37cf6591d..4968fb363 100644 --- a/tests/test_openml/test_api_calls.py +++ b/tests/test_openml/test_api_calls.py @@ -16,7 +16,7 @@ class TestConfig(openml.testing.TestBase): def test_too_long_uri(self): with pytest.raises(openml.exceptions.OpenMLServerError, match="URI too long!"): - openml.datasets.list_datasets(data_id=list(range(10000)), output_format="dataframe") + openml.datasets.list_datasets(data_id=list(range(10000))) @unittest.mock.patch("time.sleep") @unittest.mock.patch("requests.Session") diff --git a/tests/test_runs/test_run.py b/tests/test_runs/test_run.py index ce46b6548..6769b2140 100644 --- a/tests/test_runs/test_run.py +++ b/tests/test_runs/test_run.py @@ -26,21 +26,21 @@ class TestRun(TestBase): # less than 1 seconds def test_tagging(self): - runs = openml.runs.list_runs(size=1, output_format="dataframe") + runs = openml.runs.list_runs(size=1) assert not runs.empty, "Test server state is incorrect" run_id = runs["run_id"].iloc[0] run = openml.runs.get_run(run_id) # tags can be at most 64 alphanumeric (+ underscore) chars unique_indicator = str(time()).replace(".", "") tag = f"test_tag_TestRun_{unique_indicator}" - runs = openml.runs.list_runs(tag=tag, output_format="dataframe") + runs = openml.runs.list_runs(tag=tag) assert len(runs) == 0 run.push_tag(tag) - runs = openml.runs.list_runs(tag=tag, output_format="dataframe") + runs = openml.runs.list_runs(tag=tag) assert len(runs) == 1 assert run_id in runs["run_id"] run.remove_tag(tag) - 
runs = openml.runs.list_runs(tag=tag, output_format="dataframe") + runs = openml.runs.list_runs(tag=tag) assert len(runs) == 0 @staticmethod diff --git a/tests/test_runs/test_run_functions.py b/tests/test_runs/test_run_functions.py index 960a72d6d..7ce5bf71e 100644 --- a/tests/test_runs/test_run_functions.py +++ b/tests/test_runs/test_run_functions.py @@ -1434,25 +1434,21 @@ def _check_run(self, run): def test_get_runs_list(self): # TODO: comes from live, no such lists on test openml.config.server = self.production_server - runs = openml.runs.list_runs(id=[2], show_errors=True, output_format="dataframe") + runs = openml.runs.list_runs(id=[2], display_errors=True) assert len(runs) == 1 for run in runs.to_dict(orient="index").values(): self._check_run(run) def test_list_runs_empty(self): - runs = openml.runs.list_runs(task=[0], output_format="dataframe") + runs = openml.runs.list_runs(task=[0]) assert runs.empty - def test_list_runs_output_format(self): - runs = openml.runs.list_runs(size=1000, output_format="dataframe") - assert isinstance(runs, pd.DataFrame) - @pytest.mark.production() def test_get_runs_list_by_task(self): # TODO: comes from live, no such lists on test openml.config.server = self.production_server task_ids = [20] - runs = openml.runs.list_runs(task=task_ids, output_format="dataframe") + runs = openml.runs.list_runs(task=task_ids) assert len(runs) >= 590 for run in runs.to_dict(orient="index").values(): assert run["task_id"] in task_ids @@ -1460,7 +1456,7 @@ def test_get_runs_list_by_task(self): num_runs = len(runs) task_ids.append(21) - runs = openml.runs.list_runs(task=task_ids, output_format="dataframe") + runs = openml.runs.list_runs(task=task_ids) assert len(runs) >= num_runs + 1 for run in runs.to_dict(orient="index").values(): assert run["task_id"] in task_ids @@ -1473,7 +1469,7 @@ def test_get_runs_list_by_uploader(self): # 29 is Dominik Kirchhoff uploader_ids = [29] - runs = openml.runs.list_runs(uploader=uploader_ids, output_format="dataframe") + runs = openml.runs.list_runs(uploader=uploader_ids) assert len(runs) >= 2 for run in runs.to_dict(orient="index").values(): assert run["uploader"] in uploader_ids @@ -1482,7 +1478,7 @@ def test_get_runs_list_by_uploader(self): uploader_ids.append(274) - runs = openml.runs.list_runs(uploader=uploader_ids, output_format="dataframe") + runs = openml.runs.list_runs(uploader=uploader_ids) assert len(runs) >= num_runs + 1 for run in runs.to_dict(orient="index").values(): assert run["uploader"] in uploader_ids @@ -1493,7 +1489,7 @@ def test_get_runs_list_by_flow(self): # TODO: comes from live, no such lists on test openml.config.server = self.production_server flow_ids = [1154] - runs = openml.runs.list_runs(flow=flow_ids, output_format="dataframe") + runs = openml.runs.list_runs(flow=flow_ids) assert len(runs) >= 1 for run in runs.to_dict(orient="index").values(): assert run["flow_id"] in flow_ids @@ -1501,7 +1497,7 @@ def test_get_runs_list_by_flow(self): num_runs = len(runs) flow_ids.append(1069) - runs = openml.runs.list_runs(flow=flow_ids, output_format="dataframe") + runs = openml.runs.list_runs(flow=flow_ids) assert len(runs) >= num_runs + 1 for run in runs.to_dict(orient="index").values(): assert run["flow_id"] in flow_ids @@ -1515,12 +1511,7 @@ def test_get_runs_pagination(self): size = 10 max = 100 for i in range(0, max, size): - runs = openml.runs.list_runs( - offset=i, - size=size, - uploader=uploader_ids, - output_format="dataframe", - ) + runs = openml.runs.list_runs(offset=i, size=size, uploader=uploader_ids) 
assert size >= len(runs) for run in runs.to_dict(orient="index").values(): assert run["uploader"] in uploader_ids @@ -1543,23 +1534,22 @@ def test_get_runs_list_by_filters(self): # self.assertRaises(openml.exceptions.OpenMLServerError, # openml.runs.list_runs) - runs = openml.runs.list_runs(id=ids, output_format="dataframe") + runs = openml.runs.list_runs(id=ids) assert len(runs) == 2 - runs = openml.runs.list_runs(task=tasks, output_format="dataframe") + runs = openml.runs.list_runs(task=tasks) assert len(runs) >= 2 - runs = openml.runs.list_runs(uploader=uploaders_2, output_format="dataframe") + runs = openml.runs.list_runs(uploader=uploaders_2) assert len(runs) >= 10 - runs = openml.runs.list_runs(flow=flows, output_format="dataframe") + runs = openml.runs.list_runs(flow=flows) assert len(runs) >= 100 runs = openml.runs.list_runs( id=ids, task=tasks, uploader=uploaders_1, - output_format="dataframe", ) assert len(runs) == 2 @@ -1568,7 +1558,7 @@ def test_get_runs_list_by_tag(self): # TODO: comes from live, no such lists on test # Unit test works on production server only openml.config.server = self.production_server - runs = openml.runs.list_runs(tag="curves", output_format="dataframe") + runs = openml.runs.list_runs(tag="curves") assert len(runs) >= 1 @pytest.mark.sklearn() diff --git a/tests/test_setups/test_setup_functions.py b/tests/test_setups/test_setup_functions.py index 06cf821c2..88338663a 100644 --- a/tests/test_setups/test_setup_functions.py +++ b/tests/test_setups/test_setup_functions.py @@ -4,7 +4,6 @@ import hashlib import time import unittest.mock -from typing import Dict import pandas as pd import pytest @@ -155,12 +154,12 @@ def test_list_setups_empty(self): def test_list_setups_output_format(self): openml.config.server = self.production_server flow_id = 6794 - setups = openml.setups.list_setups(flow=flow_id, output_format="object", size=10) - assert isinstance(setups, Dict) + setups = openml.setups.list_setups(flow=flow_id, size=10) + assert isinstance(setups, dict) assert isinstance(setups[next(iter(setups.keys()))], openml.setups.setup.OpenMLSetup) assert len(setups) == 10 - setups = openml.setups.list_setups(flow=flow_id, output_format="dataframe", size=10) + setups = openml.setups.list_setups(flow=flow_id, size=10, output_format="dataframe") assert isinstance(setups, pd.DataFrame) assert len(setups) == 10 diff --git a/tests/test_study/test_study_functions.py b/tests/test_study/test_study_functions.py index d01a1dcf4..d13f770c8 100644 --- a/tests/test_study/test_study_functions.py +++ b/tests/test_study/test_study_functions.py @@ -1,7 +1,6 @@ # License: BSD 3-Clause from __future__ import annotations -import pandas as pd import pytest import unittest @@ -236,13 +235,15 @@ def test_study_attach_illegal(self): study_original = openml.study.get_study(study.id) with pytest.raises( - openml.exceptions.OpenMLServerException, match="Problem attaching entities." + openml.exceptions.OpenMLServerException, + match="Problem attaching entities.", ): # run id does not exists openml.study.attach_to_study(study.id, [0]) with pytest.raises( - openml.exceptions.OpenMLServerException, match="Problem attaching entities." 
+ openml.exceptions.OpenMLServerException, + match="Problem attaching entities.", ): # some runs already attached openml.study.attach_to_study(study.id, list(run_list_more.keys())) @@ -251,11 +252,6 @@ def test_study_attach_illegal(self): @unittest.skip("It is unclear when we can expect the test to pass or fail.") def test_study_list(self): - study_list = openml.study.list_studies(status="in_preparation", output_format="dataframe") + study_list = openml.study.list_studies(status="in_preparation") # might fail if server is recently reset assert len(study_list) >= 2 - - @unittest.skip("It is unclear when we can expect the test to pass or fail.") - def test_study_list_output_format(self): - study_list = openml.study.list_studies(status="in_preparation", output_format="dataframe") - assert isinstance(study_list, pd.DataFrame) diff --git a/tests/test_tasks/test_task_functions.py b/tests/test_tasks/test_task_functions.py index 046184791..856352ac2 100644 --- a/tests/test_tasks/test_task_functions.py +++ b/tests/test_tasks/test_task_functions.py @@ -57,7 +57,7 @@ def test__get_estimation_procedure_list(self): def test_list_clustering_task(self): # as shown by #383, clustering tasks can give list/dict casting problems openml.config.server = self.production_server - openml.tasks.list_tasks(task_type=TaskType.CLUSTERING, size=10, output_format="dataframe") + openml.tasks.list_tasks(task_type=TaskType.CLUSTERING, size=10) # the expected outcome is that it doesn't crash. No assertions. def _check_task(self, task): @@ -72,34 +72,30 @@ def _check_task(self, task): def test_list_tasks_by_type(self): num_curves_tasks = 198 # number is flexible, check server if fails ttid = TaskType.LEARNING_CURVE - tasks = openml.tasks.list_tasks(task_type=ttid, output_format="dataframe") + tasks = openml.tasks.list_tasks(task_type=ttid) assert len(tasks) >= num_curves_tasks for task in tasks.to_dict(orient="index").values(): assert ttid == task["ttid"] self._check_task(task) - def test_list_tasks_output_format(self): + def test_list_tasks_length(self): ttid = TaskType.LEARNING_CURVE - tasks = openml.tasks.list_tasks(task_type=ttid, output_format="dataframe") - assert isinstance(tasks, pd.DataFrame) + tasks = openml.tasks.list_tasks(task_type=ttid) assert len(tasks) > 100 def test_list_tasks_empty(self): - tasks = cast( - pd.DataFrame, - openml.tasks.list_tasks(tag="NoOneWillEverUseThisTag", output_format="dataframe"), - ) + tasks = openml.tasks.list_tasks(tag="NoOneWillEverUseThisTag") assert tasks.empty def test_list_tasks_by_tag(self): num_basic_tasks = 100 # number is flexible, check server if fails - tasks = openml.tasks.list_tasks(tag="OpenML100", output_format="dataframe") + tasks = openml.tasks.list_tasks(tag="OpenML100") assert len(tasks) >= num_basic_tasks for task in tasks.to_dict(orient="index").values(): self._check_task(task) def test_list_tasks(self): - tasks = openml.tasks.list_tasks(output_format="dataframe") + tasks = openml.tasks.list_tasks() assert len(tasks) >= 900 for task in tasks.to_dict(orient="index").values(): self._check_task(task) @@ -108,7 +104,7 @@ def test_list_tasks_paginate(self): size = 10 max = 100 for i in range(0, max, size): - tasks = openml.tasks.list_tasks(offset=i, size=size, output_format="dataframe") + tasks = openml.tasks.list_tasks(offset=i, size=size) assert size >= len(tasks) for task in tasks.to_dict(orient="index").values(): self._check_task(task) @@ -123,12 +119,7 @@ def test_list_tasks_per_type_paginate(self): ] for j in task_types: for i in range(0, max, size): - tasks 
= openml.tasks.list_tasks( - task_type=j, - offset=i, - size=size, - output_format="dataframe", - ) + tasks = openml.tasks.list_tasks(task_type=j, offset=i, size=size) assert size >= len(tasks) for task in tasks.to_dict(orient="index").values(): assert j == task["ttid"] diff --git a/tests/test_tasks/test_task_methods.py b/tests/test_tasks/test_task_methods.py index 552fbe949..4480c2cbc 100644 --- a/tests/test_tasks/test_task_methods.py +++ b/tests/test_tasks/test_task_methods.py @@ -20,14 +20,14 @@ def test_tagging(self): # tags can be at most 64 alphanumeric (+ underscore) chars unique_indicator = str(time()).replace(".", "") tag = f"test_tag_OpenMLTaskMethodsTest_{unique_indicator}" - tasks = openml.tasks.list_tasks(tag=tag, output_format="dataframe") + tasks = openml.tasks.list_tasks(tag=tag) assert len(tasks) == 0 task.push_tag(tag) - tasks = openml.tasks.list_tasks(tag=tag, output_format="dataframe") + tasks = openml.tasks.list_tasks(tag=tag) assert len(tasks) == 1 assert 1 in tasks["tid"] task.remove_tag(tag) - tasks = openml.tasks.list_tasks(tag=tag, output_format="dataframe") + tasks = openml.tasks.list_tasks(tag=tag) assert len(tasks) == 0 def test_get_train_and_test_split_indices(self): diff --git a/tests/test_utils/test_utils.py b/tests/test_utils/test_utils.py index 38d497e2d..51504dedc 100644 --- a/tests/test_utils/test_utils.py +++ b/tests/test_utils/test_utils.py @@ -88,11 +88,7 @@ def test_list_all(): @pytest.mark.server() def test_list_all_for_tasks(min_number_tasks_on_test_server): - tasks = openml.tasks.list_tasks( - batch_size=1000, - size=min_number_tasks_on_test_server, - output_format="dataframe", - ) + tasks = openml.tasks.list_tasks(size=min_number_tasks_on_test_server) assert min_number_tasks_on_test_server == len(tasks) @@ -124,11 +120,7 @@ def test_list_all_for_datasets(min_number_datasets_on_test_server): @pytest.mark.server() def test_list_all_for_flows(min_number_flows_on_test_server): - flows = openml.flows.list_flows( - batch_size=25, - size=min_number_flows_on_test_server, - output_format="dataframe", - ) + flows = openml.flows.list_flows(size=min_number_flows_on_test_server) assert min_number_flows_on_test_server == len(flows) @@ -161,12 +153,7 @@ def test_list_all_for_evaluations(min_number_evaluations_on_test_server): @pytest.mark.server() @unittest.mock.patch("openml._api_calls._perform_api_call", side_effect=_mocked_perform_api_call) def test_list_all_few_results_available(_perform_api_call): - datasets = openml.datasets.list_datasets( - size=1000, - data_name="iris", - data_version=1, - output_format="dataframe", - ) + datasets = openml.datasets.list_datasets(size=1000, data_name="iris", data_version=1) assert len(datasets) == 1, "only one iris dataset version 1 should be present" assert _perform_api_call.call_count == 1, "expect just one call to get one dataset" From 2c8212287a9a76704f475473f6ee0b3cb03b885e Mon Sep 17 00:00:00 2001 From: SubhadityaMukherjee Date: Tue, 1 Apr 2025 15:05:22 +0200 Subject: [PATCH 05/23] rename target_names back to target --- examples/20_basic/simple_datasets_tutorial.py | 2 +- examples/20_basic/simple_flows_and_runs_tutorial.py | 2 +- examples/30_extended/datasets_tutorial.py | 2 +- examples/30_extended/flows_and_runs_tutorial.py | 4 ++-- openml/tasks/task.py | 4 ++-- tests/test_datasets/test_dataset.py | 10 +++++----- .../test_sklearn_extension/test_sklearn_extension.py | 2 +- 7 files changed, 13 insertions(+), 13 deletions(-) diff --git a/examples/20_basic/simple_datasets_tutorial.py 
b/examples/20_basic/simple_datasets_tutorial.py index fed997d58..9b18aab14 100644 --- a/examples/20_basic/simple_datasets_tutorial.py +++ b/examples/20_basic/simple_datasets_tutorial.py @@ -48,7 +48,7 @@ # attribute_names - the names of the features for the examples (X) and # target feature (y) X, y, categorical_indicator, attribute_names = dataset.get_data( - target_names=dataset.default_target_attribute + target=dataset.default_target_attribute ) ############################################################################ diff --git a/examples/20_basic/simple_flows_and_runs_tutorial.py b/examples/20_basic/simple_flows_and_runs_tutorial.py index 63a3a6d85..f7d7a49d1 100644 --- a/examples/20_basic/simple_flows_and_runs_tutorial.py +++ b/examples/20_basic/simple_flows_and_runs_tutorial.py @@ -23,7 +23,7 @@ # NOTE: We are using dataset "diabetes" from the test server: https://test.openml.org/d/20 dataset = openml.datasets.get_dataset(dataset_id="diabetes", version=1) X, y, categorical_indicator, attribute_names = dataset.get_data( - target_names=dataset.default_target_attribute + target=dataset.default_target_attribute ) clf = neighbors.KNeighborsClassifier(n_neighbors=3) clf.fit(X, y) diff --git a/examples/30_extended/datasets_tutorial.py b/examples/30_extended/datasets_tutorial.py index 2841c68f7..77a46d8b0 100644 --- a/examples/30_extended/datasets_tutorial.py +++ b/examples/30_extended/datasets_tutorial.py @@ -68,7 +68,7 @@ # dataset. In particular, many datasets for supervised problems have a set # `default_target_attribute` which may help identify the target variable. X, y, categorical_indicator, attribute_names = dataset.get_data( - target_names=dataset.default_target_attribute + target=dataset.default_target_attribute ) print(X.head()) print(X.info()) diff --git a/examples/30_extended/flows_and_runs_tutorial.py b/examples/30_extended/flows_and_runs_tutorial.py index bbe94c1dc..afd398feb 100644 --- a/examples/30_extended/flows_and_runs_tutorial.py +++ b/examples/30_extended/flows_and_runs_tutorial.py @@ -27,7 +27,7 @@ # NOTE: We are using dataset 68 from the test server: https://test.openml.org/d/68 dataset = openml.datasets.get_dataset(dataset_id="eeg-eye-state", version=1) X, y, categorical_indicator, attribute_names = dataset.get_data( - target_names=dataset.default_target_attribute + target=dataset.default_target_attribute ) clf = neighbors.KNeighborsClassifier(n_neighbors=1) clf.fit(X, y) @@ -38,7 +38,7 @@ # * e.g. 
categorical features -> do feature encoding dataset = openml.datasets.get_dataset(dataset_id="credit-g", version=1) X, y, categorical_indicator, attribute_names = dataset.get_data( - target_names=dataset.default_target_attribute + target=dataset.default_target_attribute ) print(f"Categorical features: {categorical_indicator}") transformer = compose.ColumnTransformer( diff --git a/openml/tasks/task.py b/openml/tasks/task.py index c3079db91..395b52482 100644 --- a/openml/tasks/task.py +++ b/openml/tasks/task.py @@ -292,7 +292,7 @@ def get_X_and_y(self) -> tuple[pd.DataFrame, pd.Series | pd.DataFrame | None]: ): raise NotImplementedError(self.task_type) - X, y, _, _ = dataset.get_data(target_names=self.target_name) + X, y, _, _ = dataset.get_data(target=self.target_name) return X, y def _to_dict(self) -> dict[str, dict]: @@ -502,7 +502,7 @@ def get_X(self) -> pd.DataFrame: The X data as a dataframe """ dataset = self.get_dataset() - data, *_ = dataset.get_data(target_names=None) + data, *_ = dataset.get_data(target=None) return data def _to_dict(self) -> dict[str, dict[str, int | str | list[dict[str, Any]]]]: diff --git a/tests/test_datasets/test_dataset.py b/tests/test_datasets/test_dataset.py index 0675c21d2..8afc1fb91 100644 --- a/tests/test_datasets/test_dataset.py +++ b/tests/test_datasets/test_dataset.py @@ -120,7 +120,7 @@ def test_get_data_pandas(self): assert data[col_name].dtype.name == col_dtype[col_name] X, y, _, _ = self.titanic.get_data( - target_names=self.titanic.default_target_attribute, + target=self.titanic.default_target_attribute, ) assert isinstance(X, pd.DataFrame) assert isinstance(y, pd.Series) @@ -171,7 +171,7 @@ def test_get_data_with_rowid(self): @pytest.mark.skip("https://github.com/openml/openml-python/issues/1157") def test_get_data_with_target_pandas(self): - X, y, categorical, attribute_names = self.dataset.get_data(target_names="class") + X, y, categorical, attribute_names = self.dataset.get_data(target="class") assert isinstance(X, pd.DataFrame) for dtype, is_cat, col in zip(X.dtypes, categorical, X): self._check_expected_type(dtype, is_cat, X[col]) @@ -187,7 +187,7 @@ def test_get_data_with_target_pandas(self): def test_get_data_rowid_and_ignore_and_target(self): self.dataset.ignore_attribute = ["condition"] self.dataset.row_id_attribute = ["hardness"] - X, y, categorical, names = self.dataset.get_data(target_names="class") + X, y, categorical, names = self.dataset.get_data(target="class") assert X.shape == (898, 36) assert len(categorical) == 36 cats = [True] * 3 + [False, True, True, False] + [True] * 23 + [False] * 3 + [True] * 3 @@ -348,7 +348,7 @@ def setUp(self): self.sparse_dataset = openml.datasets.get_dataset(4136, download_data=False) def test_get_sparse_dataset_dataframe_with_target(self): - X, y, _, attribute_names = self.sparse_dataset.get_data(target_names="class") + X, y, _, attribute_names = self.sparse_dataset.get_data(target="class") assert isinstance(X, pd.DataFrame) assert isinstance(X.dtypes[0], pd.SparseDtype) assert X.shape == (600, 20000) @@ -375,7 +375,7 @@ def test_get_sparse_dataset_rowid_and_ignore_and_target(self): self.sparse_dataset.row_id_attribute = ["V512"] # TODO(eddiebergman): Will break from dataset_format removal X, y, categorical, _ = self.sparse_dataset.get_data( - target_names="class", + target="class", include_row_id=False, include_ignore_attribute=False, ) diff --git a/tests/test_extensions/test_sklearn_extension/test_sklearn_extension.py b/tests/test_extensions/test_sklearn_extension/test_sklearn_extension.py 
index aeae743c2..242f6793c 100644 --- a/tests/test_extensions/test_sklearn_extension/test_sklearn_extension.py +++ b/tests/test_extensions/test_sklearn_extension/test_sklearn_extension.py @@ -2220,7 +2220,7 @@ def test_run_on_model_with_empty_steps(self): task = openml.tasks.get_task(59) # mfeat-pixel; crossvalidation X, y, categorical_ind, feature_names = dataset.get_data( - target_names=dataset.default_target_attribute, + target=dataset.default_target_attribute, ) categorical_ind = np.array(categorical_ind) (cat_idx,) = np.where(categorical_ind) From ce5788108dbb6047ccbf73a261ea8ed04d4c910f Mon Sep 17 00:00:00 2001 From: LennartPurucker Date: Mon, 16 Jun 2025 10:33:31 +0200 Subject: [PATCH 06/23] fix: resolve kdd_rijn example to be without dataframe parameter and remove wrong merge files --- examples/40_paper/2018_kdd_rijn_example.py | 14 +- .../2018_kdd_rijn_example_BACKUP_34122.py | 271 ------------------ .../2018_kdd_rijn_example_BACKUP_39115.py | 271 ------------------ .../2018_kdd_rijn_example_BASE_34122.py | 189 ------------ .../2018_kdd_rijn_example_BASE_39115.py | 189 ------------ .../2018_kdd_rijn_example_LOCAL_34122.py | 188 ------------ .../2018_kdd_rijn_example_LOCAL_39115.py | 188 ------------ .../2018_kdd_rijn_example_REMOTE_34122.py | 188 ------------ .../2018_kdd_rijn_example_REMOTE_39115.py | 188 ------------ 9 files changed, 11 insertions(+), 1675 deletions(-) delete mode 100644 examples/40_paper/2018_kdd_rijn_example_BACKUP_34122.py delete mode 100644 examples/40_paper/2018_kdd_rijn_example_BACKUP_39115.py delete mode 100644 examples/40_paper/2018_kdd_rijn_example_BASE_34122.py delete mode 100644 examples/40_paper/2018_kdd_rijn_example_BASE_39115.py delete mode 100644 examples/40_paper/2018_kdd_rijn_example_LOCAL_34122.py delete mode 100644 examples/40_paper/2018_kdd_rijn_example_LOCAL_39115.py delete mode 100644 examples/40_paper/2018_kdd_rijn_example_REMOTE_34122.py delete mode 100644 examples/40_paper/2018_kdd_rijn_example_REMOTE_39115.py diff --git a/examples/40_paper/2018_kdd_rijn_example.py b/examples/40_paper/2018_kdd_rijn_example.py index f38466502..f98f5de55 100644 --- a/examples/40_paper/2018_kdd_rijn_example.py +++ b/examples/40_paper/2018_kdd_rijn_example.py @@ -106,13 +106,19 @@ size=limit_per_task, ) - + performance_column = "value" + # make a DataFrame consisting of all hyperparameters (which is a dict in setup['parameters']) and the performance + # value (in setup['value']). The following line looks a bit complicated, but combines 2 tasks: a) combine + # hyperparameters and performance data in a single dict, b) cast hyperparameter values to the appropriate format + # Note that the ``json.loads(...)`` requires the content to be in JSON format, which is only the case for + # scikit-learn setups (and even there some legacy setups might violate this requirement). It will work for the + # setups that belong to the flows embedded in this example though. 
try: setups_evals = pd.DataFrame( [ dict( **{name: json.loads(value) for name, value in setup["parameters"].items()}, - **{performance_column: setup[performance_column]} + **{performance_column: setup[performance_column]}, ) for _, setup in evals.iterrows() ] @@ -152,7 +158,9 @@ fanova_results.append( { "hyperparameter": pname.split(".")[-1], - "fanova": evaluator.quantify_importance([idx])[(idx,)]["individual importance"], + "fanova": evaluator.quantify_importance([idx])[(idx,)][ + "individual importance" + ], } ) except RuntimeError as e: diff --git a/examples/40_paper/2018_kdd_rijn_example_BACKUP_34122.py b/examples/40_paper/2018_kdd_rijn_example_BACKUP_34122.py deleted file mode 100644 index bf3b7ac83..000000000 --- a/examples/40_paper/2018_kdd_rijn_example_BACKUP_34122.py +++ /dev/null @@ -1,271 +0,0 @@ -""" -van Rijn and Hutter (2018) -========================== - -A tutorial on how to reproduce the paper *Hyperparameter Importance Across Datasets*. - -Example Deprecation Warning! -~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -This example is not supported anymore by the OpenML-Python developers. The example is kept for reference purposes but not tested anymore. - -Publication -~~~~~~~~~~~ - -| Hyperparameter importance across datasets -| Jan N. van Rijn and Frank Hutter -| In *Proceedings of the 24th ACM SIGKDD International Conference on Knowledge Discovery & Data Mining*, 2018 -| Available at https://dl.acm.org/doi/10.1145/3219819.3220058 - -Requirements -~~~~~~~~~~~~ - -This is a Unix-only tutorial, as the requirements can not be satisfied on a Windows machine (Untested on other -systems). - -The following Python packages are required: - -pip install openml[examples,docs] fanova ConfigSpace<1.0 -""" - -# License: BSD 3-Clause - -import sys - -if sys.platform == "win32": - print( - "The pyrfr library (requirement of fanova) can currently not be installed on Windows systems" - ) - sys.exit() - -# DEPRECATED EXAMPLE -- Avoid running this code in our CI/CD pipeline -<<<<<<< HEAD -print("This example is deprecated, remove this code to use it manually.") -sys.exit() - -import json - -import fanova -import matplotlib.pyplot as plt -import pandas as pd -import seaborn as sns - -import openml - -############################################################################## -# With the advent of automated machine learning, automated hyperparameter -# optimization methods are by now routinely used in data mining. However, this -# progress is not yet matched by equal progress on automatic analyses that -# yield information beyond performance-optimizing hyperparameter settings. -# In this example, we aim to answer the following two questions: Given an -# algorithm, what are generally its most important hyperparameters? -# -# This work is carried out on the OpenML-100 benchmark suite, which can be -# obtained by ``openml.study.get_suite('OpenML100')``. In this example, we -# conduct the experiment on the Support Vector Machine (``flow_id=7707``) -# with specific kernel (we will perform a post-process filter operation for -# this). We should set some other experimental parameters (number of results -# per task, evaluation measure and the number of trees of the internal -# functional Anova) before the fun can begin. 
-# -# Note that we simplify the example in several ways: -# -# 1) We only consider numerical hyperparameters -# 2) We consider all hyperparameters that are numerical (in reality, some -# hyperparameters might be inactive (e.g., ``degree``) or irrelevant -# (e.g., ``random_state``) -# 3) We assume all hyperparameters to be on uniform scale -# -# Any difference in conclusion between the actual paper and the presented -# results is most likely due to one of these simplifications. For example, -# the hyperparameter C looks rather insignificant, whereas it is quite -# important when it is put on a log-scale. All these simplifications can be -# addressed by defining a ConfigSpace. For a more elaborated example that uses -# this, please see: -# https://github.com/janvanrijn/openml-pimp/blob/d0a14f3eb480f2a90008889f00041bdccc7b9265/examples/plot/plot_fanova_aggregates.py -======= -print("This example is deprecated, remove the `if False` in this code to use it manually.") -if False: - import json - import fanova - import matplotlib.pyplot as plt - import pandas as pd - import seaborn as sns - - import openml - - - ############################################################################## - # With the advent of automated machine learning, automated hyperparameter - # optimization methods are by now routinely used in data mining. However, this - # progress is not yet matched by equal progress on automatic analyses that - # yield information beyond performance-optimizing hyperparameter settings. - # In this example, we aim to answer the following two questions: Given an - # algorithm, what are generally its most important hyperparameters? - # - # This work is carried out on the OpenML-100 benchmark suite, which can be - # obtained by ``openml.study.get_suite('OpenML100')``. In this example, we - # conduct the experiment on the Support Vector Machine (``flow_id=7707``) - # with specific kernel (we will perform a post-process filter operation for - # this). We should set some other experimental parameters (number of results - # per task, evaluation measure and the number of trees of the internal - # functional Anova) before the fun can begin. - # - # Note that we simplify the example in several ways: - # - # 1) We only consider numerical hyperparameters - # 2) We consider all hyperparameters that are numerical (in reality, some - # hyperparameters might be inactive (e.g., ``degree``) or irrelevant - # (e.g., ``random_state``) - # 3) We assume all hyperparameters to be on uniform scale - # - # Any difference in conclusion between the actual paper and the presented - # results is most likely due to one of these simplifications. For example, - # the hyperparameter C looks rather insignificant, whereas it is quite - # important when it is put on a log-scale. All these simplifications can be - # addressed by defining a ConfigSpace. For a more elaborated example that uses - # this, please see: - # https://github.com/janvanrijn/openml-pimp/blob/d0a14f3eb480f2a90008889f00041bdccc7b9265/examples/plot/plot_fanova_aggregates.py # noqa F401 ->>>>>>> develop - - suite = openml.study.get_suite("OpenML100") - flow_id = 7707 - parameter_filters = {"sklearn.svm.classes.SVC(17)_kernel": "sigmoid"} - evaluation_measure = "predictive_accuracy" - limit_per_task = 500 - limit_nr_tasks = 15 - n_trees = 16 - -<<<<<<< HEAD -fanova_results = [] -# we will obtain all results from OpenML per task. 
Practice has shown that this places the bottleneck on the -# communication with OpenML, and for iterated experimenting it is better to cache the results in a local file. -for idx, task_id in enumerate(suite.tasks): - if limit_nr_tasks is not None and idx >= limit_nr_tasks: - continue - print( - "Starting with task %d (%d/%d)" - % (task_id, idx + 1, len(suite.tasks) if limit_nr_tasks is None else limit_nr_tasks) - ) - # note that we explicitly only include tasks from the benchmark suite that was specified (as per the for-loop) - evals = openml.evaluations.list_evaluations_setups( - evaluation_measure, - flows=[flow_id], - tasks=[task_id], - size=limit_per_task, - ) - - performance_column = "value" - # make a DataFrame consisting of all hyperparameters (which is a dict in setup['parameters']) and the performance - # value (in setup['value']). The following line looks a bit complicated, but combines 2 tasks: a) combine - # hyperparameters and performance data in a single dict, b) cast hyperparameter values to the appropriate format - # Note that the ``json.loads(...)`` requires the content to be in JSON format, which is only the case for - # scikit-learn setups (and even there some legacy setups might violate this requirement). It will work for the - # setups that belong to the flows embedded in this example though. - try: - setups_evals = pd.DataFrame( - [ - dict( - **{name: json.loads(value) for name, value in setup["parameters"].items()}, - **{performance_column: setup[performance_column]}, - ) - for _, setup in evals.iterrows() - ] -======= - fanova_results = [] - # we will obtain all results from OpenML per task. Practice has shown that this places the bottleneck on the - # communication with OpenML, and for iterated experimenting it is better to cache the results in a local file. - for idx, task_id in enumerate(suite.tasks): - if limit_nr_tasks is not None and idx >= limit_nr_tasks: - continue - print( - "Starting with task %d (%d/%d)" - % (task_id, idx + 1, len(suite.tasks) if limit_nr_tasks is None else limit_nr_tasks) - ) - # note that we explicitly only include tasks from the benchmark suite that was specified (as per the for-loop) - evals = openml.evaluations.list_evaluations_setups( - evaluation_measure, - flows=[flow_id], - tasks=[task_id], - size=limit_per_task, - output_format="dataframe", ->>>>>>> develop - ) - - performance_column = "value" - # make a DataFrame consisting of all hyperparameters (which is a dict in setup['parameters']) and the performance - # value (in setup['value']). The following line looks a bit complicated, but combines 2 tasks: a) combine - # hyperparameters and performance data in a single dict, b) cast hyperparameter values to the appropriate format - # Note that the ``json.loads(...)`` requires the content to be in JSON format, which is only the case for - # scikit-learn setups (and even there some legacy setups might violate this requirement). It will work for the - # setups that belong to the flows embedded in this example though. 
- try: - setups_evals = pd.DataFrame( - [ - dict( - **{name: json.loads(value) for name, value in setup["parameters"].items()}, - **{performance_column: setup[performance_column]} - ) - for _, setup in evals.iterrows() - ] - ) - except json.decoder.JSONDecodeError as e: - print("Task %d error: %s" % (task_id, e)) - continue - # apply our filters, to have only the setups that comply to the hyperparameters we want - for filter_key, filter_value in parameter_filters.items(): - setups_evals = setups_evals[setups_evals[filter_key] == filter_value] - # in this simplified example, we only display numerical and float hyperparameters. For categorical hyperparameters, - # the fanova library needs to be informed by using a configspace object. - setups_evals = setups_evals.select_dtypes(include=["int64", "float64"]) - # drop rows with unique values. These are by definition not an interesting hyperparameter, e.g., ``axis``, - # ``verbose``. - setups_evals = setups_evals[ - [ - c - for c in list(setups_evals) - if len(setups_evals[c].unique()) > 1 or c == performance_column - ] - ] - # We are done with processing ``setups_evals``. Note that we still might have some irrelevant hyperparameters, e.g., - # ``random_state``. We have dropped some relevant hyperparameters, i.e., several categoricals. Let's check it out: - - # determine x values to pass to fanova library - parameter_names = [ - pname for pname in setups_evals.columns.to_numpy() if pname != performance_column - ] - evaluator = fanova.fanova.fANOVA( - X=setups_evals[parameter_names].to_numpy(), - Y=setups_evals[performance_column].to_numpy(), - n_trees=n_trees, - ) - for idx, pname in enumerate(parameter_names): - try: - fanova_results.append( - { - "hyperparameter": pname.split(".")[-1], - "fanova": evaluator.quantify_importance([idx])[(idx,)]["individual importance"], - } - ) - except RuntimeError as e: - # functional ANOVA sometimes crashes with a RuntimeError, e.g., on tasks where the performance is constant - # for all configurations (there is no variance). We will skip these tasks (like the authors did in the - # paper). - print("Task %d error: %s" % (task_id, e)) - continue - - # transform ``fanova_results`` from a list of dicts into a DataFrame - fanova_results = pd.DataFrame(fanova_results) - - ############################################################################## - # make the boxplot of the variance contribution. Obviously, we can also use - # this data to make the Nemenyi plot, but this relies on the rather complex - # ``Orange`` dependency (``pip install Orange3``). For the complete example, - # the reader is referred to the more elaborate script (referred to earlier) - fig, ax = plt.subplots() - sns.boxplot(x="hyperparameter", y="fanova", data=fanova_results, ax=ax) - ax.set_xticklabels(ax.get_xticklabels(), rotation=45, ha="right") - ax.set_ylabel("Variance Contribution") - ax.set_xlabel(None) - plt.tight_layout() - plt.show() diff --git a/examples/40_paper/2018_kdd_rijn_example_BACKUP_39115.py b/examples/40_paper/2018_kdd_rijn_example_BACKUP_39115.py deleted file mode 100644 index bf3b7ac83..000000000 --- a/examples/40_paper/2018_kdd_rijn_example_BACKUP_39115.py +++ /dev/null @@ -1,271 +0,0 @@ -""" -van Rijn and Hutter (2018) -========================== - -A tutorial on how to reproduce the paper *Hyperparameter Importance Across Datasets*. - -Example Deprecation Warning! -~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -This example is not supported anymore by the OpenML-Python developers. 
The example is kept for reference purposes but not tested anymore. - -Publication -~~~~~~~~~~~ - -| Hyperparameter importance across datasets -| Jan N. van Rijn and Frank Hutter -| In *Proceedings of the 24th ACM SIGKDD International Conference on Knowledge Discovery & Data Mining*, 2018 -| Available at https://dl.acm.org/doi/10.1145/3219819.3220058 - -Requirements -~~~~~~~~~~~~ - -This is a Unix-only tutorial, as the requirements can not be satisfied on a Windows machine (Untested on other -systems). - -The following Python packages are required: - -pip install openml[examples,docs] fanova ConfigSpace<1.0 -""" - -# License: BSD 3-Clause - -import sys - -if sys.platform == "win32": - print( - "The pyrfr library (requirement of fanova) can currently not be installed on Windows systems" - ) - sys.exit() - -# DEPRECATED EXAMPLE -- Avoid running this code in our CI/CD pipeline -<<<<<<< HEAD -print("This example is deprecated, remove this code to use it manually.") -sys.exit() - -import json - -import fanova -import matplotlib.pyplot as plt -import pandas as pd -import seaborn as sns - -import openml - -############################################################################## -# With the advent of automated machine learning, automated hyperparameter -# optimization methods are by now routinely used in data mining. However, this -# progress is not yet matched by equal progress on automatic analyses that -# yield information beyond performance-optimizing hyperparameter settings. -# In this example, we aim to answer the following two questions: Given an -# algorithm, what are generally its most important hyperparameters? -# -# This work is carried out on the OpenML-100 benchmark suite, which can be -# obtained by ``openml.study.get_suite('OpenML100')``. In this example, we -# conduct the experiment on the Support Vector Machine (``flow_id=7707``) -# with specific kernel (we will perform a post-process filter operation for -# this). We should set some other experimental parameters (number of results -# per task, evaluation measure and the number of trees of the internal -# functional Anova) before the fun can begin. -# -# Note that we simplify the example in several ways: -# -# 1) We only consider numerical hyperparameters -# 2) We consider all hyperparameters that are numerical (in reality, some -# hyperparameters might be inactive (e.g., ``degree``) or irrelevant -# (e.g., ``random_state``) -# 3) We assume all hyperparameters to be on uniform scale -# -# Any difference in conclusion between the actual paper and the presented -# results is most likely due to one of these simplifications. For example, -# the hyperparameter C looks rather insignificant, whereas it is quite -# important when it is put on a log-scale. All these simplifications can be -# addressed by defining a ConfigSpace. For a more elaborated example that uses -# this, please see: -# https://github.com/janvanrijn/openml-pimp/blob/d0a14f3eb480f2a90008889f00041bdccc7b9265/examples/plot/plot_fanova_aggregates.py -======= -print("This example is deprecated, remove the `if False` in this code to use it manually.") -if False: - import json - import fanova - import matplotlib.pyplot as plt - import pandas as pd - import seaborn as sns - - import openml - - - ############################################################################## - # With the advent of automated machine learning, automated hyperparameter - # optimization methods are by now routinely used in data mining. 
However, this - # progress is not yet matched by equal progress on automatic analyses that - # yield information beyond performance-optimizing hyperparameter settings. - # In this example, we aim to answer the following two questions: Given an - # algorithm, what are generally its most important hyperparameters? - # - # This work is carried out on the OpenML-100 benchmark suite, which can be - # obtained by ``openml.study.get_suite('OpenML100')``. In this example, we - # conduct the experiment on the Support Vector Machine (``flow_id=7707``) - # with specific kernel (we will perform a post-process filter operation for - # this). We should set some other experimental parameters (number of results - # per task, evaluation measure and the number of trees of the internal - # functional Anova) before the fun can begin. - # - # Note that we simplify the example in several ways: - # - # 1) We only consider numerical hyperparameters - # 2) We consider all hyperparameters that are numerical (in reality, some - # hyperparameters might be inactive (e.g., ``degree``) or irrelevant - # (e.g., ``random_state``) - # 3) We assume all hyperparameters to be on uniform scale - # - # Any difference in conclusion between the actual paper and the presented - # results is most likely due to one of these simplifications. For example, - # the hyperparameter C looks rather insignificant, whereas it is quite - # important when it is put on a log-scale. All these simplifications can be - # addressed by defining a ConfigSpace. For a more elaborated example that uses - # this, please see: - # https://github.com/janvanrijn/openml-pimp/blob/d0a14f3eb480f2a90008889f00041bdccc7b9265/examples/plot/plot_fanova_aggregates.py # noqa F401 ->>>>>>> develop - - suite = openml.study.get_suite("OpenML100") - flow_id = 7707 - parameter_filters = {"sklearn.svm.classes.SVC(17)_kernel": "sigmoid"} - evaluation_measure = "predictive_accuracy" - limit_per_task = 500 - limit_nr_tasks = 15 - n_trees = 16 - -<<<<<<< HEAD -fanova_results = [] -# we will obtain all results from OpenML per task. Practice has shown that this places the bottleneck on the -# communication with OpenML, and for iterated experimenting it is better to cache the results in a local file. -for idx, task_id in enumerate(suite.tasks): - if limit_nr_tasks is not None and idx >= limit_nr_tasks: - continue - print( - "Starting with task %d (%d/%d)" - % (task_id, idx + 1, len(suite.tasks) if limit_nr_tasks is None else limit_nr_tasks) - ) - # note that we explicitly only include tasks from the benchmark suite that was specified (as per the for-loop) - evals = openml.evaluations.list_evaluations_setups( - evaluation_measure, - flows=[flow_id], - tasks=[task_id], - size=limit_per_task, - ) - - performance_column = "value" - # make a DataFrame consisting of all hyperparameters (which is a dict in setup['parameters']) and the performance - # value (in setup['value']). The following line looks a bit complicated, but combines 2 tasks: a) combine - # hyperparameters and performance data in a single dict, b) cast hyperparameter values to the appropriate format - # Note that the ``json.loads(...)`` requires the content to be in JSON format, which is only the case for - # scikit-learn setups (and even there some legacy setups might violate this requirement). It will work for the - # setups that belong to the flows embedded in this example though. 
- try: - setups_evals = pd.DataFrame( - [ - dict( - **{name: json.loads(value) for name, value in setup["parameters"].items()}, - **{performance_column: setup[performance_column]}, - ) - for _, setup in evals.iterrows() - ] -======= - fanova_results = [] - # we will obtain all results from OpenML per task. Practice has shown that this places the bottleneck on the - # communication with OpenML, and for iterated experimenting it is better to cache the results in a local file. - for idx, task_id in enumerate(suite.tasks): - if limit_nr_tasks is not None and idx >= limit_nr_tasks: - continue - print( - "Starting with task %d (%d/%d)" - % (task_id, idx + 1, len(suite.tasks) if limit_nr_tasks is None else limit_nr_tasks) - ) - # note that we explicitly only include tasks from the benchmark suite that was specified (as per the for-loop) - evals = openml.evaluations.list_evaluations_setups( - evaluation_measure, - flows=[flow_id], - tasks=[task_id], - size=limit_per_task, - output_format="dataframe", ->>>>>>> develop - ) - - performance_column = "value" - # make a DataFrame consisting of all hyperparameters (which is a dict in setup['parameters']) and the performance - # value (in setup['value']). The following line looks a bit complicated, but combines 2 tasks: a) combine - # hyperparameters and performance data in a single dict, b) cast hyperparameter values to the appropriate format - # Note that the ``json.loads(...)`` requires the content to be in JSON format, which is only the case for - # scikit-learn setups (and even there some legacy setups might violate this requirement). It will work for the - # setups that belong to the flows embedded in this example though. - try: - setups_evals = pd.DataFrame( - [ - dict( - **{name: json.loads(value) for name, value in setup["parameters"].items()}, - **{performance_column: setup[performance_column]} - ) - for _, setup in evals.iterrows() - ] - ) - except json.decoder.JSONDecodeError as e: - print("Task %d error: %s" % (task_id, e)) - continue - # apply our filters, to have only the setups that comply to the hyperparameters we want - for filter_key, filter_value in parameter_filters.items(): - setups_evals = setups_evals[setups_evals[filter_key] == filter_value] - # in this simplified example, we only display numerical and float hyperparameters. For categorical hyperparameters, - # the fanova library needs to be informed by using a configspace object. - setups_evals = setups_evals.select_dtypes(include=["int64", "float64"]) - # drop rows with unique values. These are by definition not an interesting hyperparameter, e.g., ``axis``, - # ``verbose``. - setups_evals = setups_evals[ - [ - c - for c in list(setups_evals) - if len(setups_evals[c].unique()) > 1 or c == performance_column - ] - ] - # We are done with processing ``setups_evals``. Note that we still might have some irrelevant hyperparameters, e.g., - # ``random_state``. We have dropped some relevant hyperparameters, i.e., several categoricals. 
Let's check it out: - - # determine x values to pass to fanova library - parameter_names = [ - pname for pname in setups_evals.columns.to_numpy() if pname != performance_column - ] - evaluator = fanova.fanova.fANOVA( - X=setups_evals[parameter_names].to_numpy(), - Y=setups_evals[performance_column].to_numpy(), - n_trees=n_trees, - ) - for idx, pname in enumerate(parameter_names): - try: - fanova_results.append( - { - "hyperparameter": pname.split(".")[-1], - "fanova": evaluator.quantify_importance([idx])[(idx,)]["individual importance"], - } - ) - except RuntimeError as e: - # functional ANOVA sometimes crashes with a RuntimeError, e.g., on tasks where the performance is constant - # for all configurations (there is no variance). We will skip these tasks (like the authors did in the - # paper). - print("Task %d error: %s" % (task_id, e)) - continue - - # transform ``fanova_results`` from a list of dicts into a DataFrame - fanova_results = pd.DataFrame(fanova_results) - - ############################################################################## - # make the boxplot of the variance contribution. Obviously, we can also use - # this data to make the Nemenyi plot, but this relies on the rather complex - # ``Orange`` dependency (``pip install Orange3``). For the complete example, - # the reader is referred to the more elaborate script (referred to earlier) - fig, ax = plt.subplots() - sns.boxplot(x="hyperparameter", y="fanova", data=fanova_results, ax=ax) - ax.set_xticklabels(ax.get_xticklabels(), rotation=45, ha="right") - ax.set_ylabel("Variance Contribution") - ax.set_xlabel(None) - plt.tight_layout() - plt.show() diff --git a/examples/40_paper/2018_kdd_rijn_example_BASE_34122.py b/examples/40_paper/2018_kdd_rijn_example_BASE_34122.py deleted file mode 100644 index 7ec60fe53..000000000 --- a/examples/40_paper/2018_kdd_rijn_example_BASE_34122.py +++ /dev/null @@ -1,189 +0,0 @@ -""" -van Rijn and Hutter (2018) -========================== - -A tutorial on how to reproduce the paper *Hyperparameter Importance Across Datasets*. - -Example Deprecation Warning! -~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -This example is not supported anymore by the OpenML-Python developers. The example is kept for reference purposes but not tested anymore. - -Publication -~~~~~~~~~~~ - -| Hyperparameter importance across datasets -| Jan N. van Rijn and Frank Hutter -| In *Proceedings of the 24th ACM SIGKDD International Conference on Knowledge Discovery & Data Mining*, 2018 -| Available at https://dl.acm.org/doi/10.1145/3219819.3220058 - -Requirements -~~~~~~~~~~~~ - -This is a Unix-only tutorial, as the requirements can not be satisfied on a Windows machine (Untested on other -systems). - -The following Python packages are required: - -pip install openml[examples,docs] fanova ConfigSpace<1.0 -""" - -# License: BSD 3-Clause - -import sys - -if sys.platform == "win32": # noqa - print( - "The pyrfr library (requirement of fanova) can currently not be installed on Windows systems" - ) - exit() - -# DEPRECATED EXAMPLE -- Avoid running this code in our CI/CD pipeline -print("This example is deprecated, remove this code to use it manually.") -exit() - -import json -import fanova -import matplotlib.pyplot as plt -import pandas as pd -import seaborn as sns - -import openml - - -############################################################################## -# With the advent of automated machine learning, automated hyperparameter -# optimization methods are by now routinely used in data mining. 
However, this -# progress is not yet matched by equal progress on automatic analyses that -# yield information beyond performance-optimizing hyperparameter settings. -# In this example, we aim to answer the following two questions: Given an -# algorithm, what are generally its most important hyperparameters? -# -# This work is carried out on the OpenML-100 benchmark suite, which can be -# obtained by ``openml.study.get_suite('OpenML100')``. In this example, we -# conduct the experiment on the Support Vector Machine (``flow_id=7707``) -# with specific kernel (we will perform a post-process filter operation for -# this). We should set some other experimental parameters (number of results -# per task, evaluation measure and the number of trees of the internal -# functional Anova) before the fun can begin. -# -# Note that we simplify the example in several ways: -# -# 1) We only consider numerical hyperparameters -# 2) We consider all hyperparameters that are numerical (in reality, some -# hyperparameters might be inactive (e.g., ``degree``) or irrelevant -# (e.g., ``random_state``) -# 3) We assume all hyperparameters to be on uniform scale -# -# Any difference in conclusion between the actual paper and the presented -# results is most likely due to one of these simplifications. For example, -# the hyperparameter C looks rather insignificant, whereas it is quite -# important when it is put on a log-scale. All these simplifications can be -# addressed by defining a ConfigSpace. For a more elaborated example that uses -# this, please see: -# https://github.com/janvanrijn/openml-pimp/blob/d0a14f3eb480f2a90008889f00041bdccc7b9265/examples/plot/plot_fanova_aggregates.py # noqa F401 - -suite = openml.study.get_suite("OpenML100") -flow_id = 7707 -parameter_filters = {"sklearn.svm.classes.SVC(17)_kernel": "sigmoid"} -evaluation_measure = "predictive_accuracy" -limit_per_task = 500 -limit_nr_tasks = 15 -n_trees = 16 - -fanova_results = [] -# we will obtain all results from OpenML per task. Practice has shown that this places the bottleneck on the -# communication with OpenML, and for iterated experimenting it is better to cache the results in a local file. -for idx, task_id in enumerate(suite.tasks): - if limit_nr_tasks is not None and idx >= limit_nr_tasks: - continue - print( - "Starting with task %d (%d/%d)" - % (task_id, idx + 1, len(suite.tasks) if limit_nr_tasks is None else limit_nr_tasks) - ) - # note that we explicitly only include tasks from the benchmark suite that was specified (as per the for-loop) - evals = openml.evaluations.list_evaluations_setups( - evaluation_measure, - flows=[flow_id], - tasks=[task_id], - size=limit_per_task, - output_format="dataframe", - ) - - performance_column = "value" - # make a DataFrame consisting of all hyperparameters (which is a dict in setup['parameters']) and the performance - # value (in setup['value']). The following line looks a bit complicated, but combines 2 tasks: a) combine - # hyperparameters and performance data in a single dict, b) cast hyperparameter values to the appropriate format - # Note that the ``json.loads(...)`` requires the content to be in JSON format, which is only the case for - # scikit-learn setups (and even there some legacy setups might violate this requirement). It will work for the - # setups that belong to the flows embedded in this example though. 
- try: - setups_evals = pd.DataFrame( - [ - dict( - **{name: json.loads(value) for name, value in setup["parameters"].items()}, - **{performance_column: setup[performance_column]} - ) - for _, setup in evals.iterrows() - ] - ) - except json.decoder.JSONDecodeError as e: - print("Task %d error: %s" % (task_id, e)) - continue - # apply our filters, to have only the setups that comply to the hyperparameters we want - for filter_key, filter_value in parameter_filters.items(): - setups_evals = setups_evals[setups_evals[filter_key] == filter_value] - # in this simplified example, we only display numerical and float hyperparameters. For categorical hyperparameters, - # the fanova library needs to be informed by using a configspace object. - setups_evals = setups_evals.select_dtypes(include=["int64", "float64"]) - # drop rows with unique values. These are by definition not an interesting hyperparameter, e.g., ``axis``, - # ``verbose``. - setups_evals = setups_evals[ - [ - c - for c in list(setups_evals) - if len(setups_evals[c].unique()) > 1 or c == performance_column - ] - ] - # We are done with processing ``setups_evals``. Note that we still might have some irrelevant hyperparameters, e.g., - # ``random_state``. We have dropped some relevant hyperparameters, i.e., several categoricals. Let's check it out: - - # determine x values to pass to fanova library - parameter_names = [ - pname for pname in setups_evals.columns.to_numpy() if pname != performance_column - ] - evaluator = fanova.fanova.fANOVA( - X=setups_evals[parameter_names].to_numpy(), - Y=setups_evals[performance_column].to_numpy(), - n_trees=n_trees, - ) - for idx, pname in enumerate(parameter_names): - try: - fanova_results.append( - { - "hyperparameter": pname.split(".")[-1], - "fanova": evaluator.quantify_importance([idx])[(idx,)]["individual importance"], - } - ) - except RuntimeError as e: - # functional ANOVA sometimes crashes with a RuntimeError, e.g., on tasks where the performance is constant - # for all configurations (there is no variance). We will skip these tasks (like the authors did in the - # paper). - print("Task %d error: %s" % (task_id, e)) - continue - -# transform ``fanova_results`` from a list of dicts into a DataFrame -fanova_results = pd.DataFrame(fanova_results) - -############################################################################## -# make the boxplot of the variance contribution. Obviously, we can also use -# this data to make the Nemenyi plot, but this relies on the rather complex -# ``Orange`` dependency (``pip install Orange3``). For the complete example, -# the reader is referred to the more elaborate script (referred to earlier) -fig, ax = plt.subplots() -sns.boxplot(x="hyperparameter", y="fanova", data=fanova_results, ax=ax) -ax.set_xticklabels(ax.get_xticklabels(), rotation=45, ha="right") -ax.set_ylabel("Variance Contribution") -ax.set_xlabel(None) -plt.tight_layout() -plt.show() diff --git a/examples/40_paper/2018_kdd_rijn_example_BASE_39115.py b/examples/40_paper/2018_kdd_rijn_example_BASE_39115.py deleted file mode 100644 index 7ec60fe53..000000000 --- a/examples/40_paper/2018_kdd_rijn_example_BASE_39115.py +++ /dev/null @@ -1,189 +0,0 @@ -""" -van Rijn and Hutter (2018) -========================== - -A tutorial on how to reproduce the paper *Hyperparameter Importance Across Datasets*. - -Example Deprecation Warning! -~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -This example is not supported anymore by the OpenML-Python developers. The example is kept for reference purposes but not tested anymore. 
- -Publication -~~~~~~~~~~~ - -| Hyperparameter importance across datasets -| Jan N. van Rijn and Frank Hutter -| In *Proceedings of the 24th ACM SIGKDD International Conference on Knowledge Discovery & Data Mining*, 2018 -| Available at https://dl.acm.org/doi/10.1145/3219819.3220058 - -Requirements -~~~~~~~~~~~~ - -This is a Unix-only tutorial, as the requirements can not be satisfied on a Windows machine (Untested on other -systems). - -The following Python packages are required: - -pip install openml[examples,docs] fanova ConfigSpace<1.0 -""" - -# License: BSD 3-Clause - -import sys - -if sys.platform == "win32": # noqa - print( - "The pyrfr library (requirement of fanova) can currently not be installed on Windows systems" - ) - exit() - -# DEPRECATED EXAMPLE -- Avoid running this code in our CI/CD pipeline -print("This example is deprecated, remove this code to use it manually.") -exit() - -import json -import fanova -import matplotlib.pyplot as plt -import pandas as pd -import seaborn as sns - -import openml - - -############################################################################## -# With the advent of automated machine learning, automated hyperparameter -# optimization methods are by now routinely used in data mining. However, this -# progress is not yet matched by equal progress on automatic analyses that -# yield information beyond performance-optimizing hyperparameter settings. -# In this example, we aim to answer the following two questions: Given an -# algorithm, what are generally its most important hyperparameters? -# -# This work is carried out on the OpenML-100 benchmark suite, which can be -# obtained by ``openml.study.get_suite('OpenML100')``. In this example, we -# conduct the experiment on the Support Vector Machine (``flow_id=7707``) -# with specific kernel (we will perform a post-process filter operation for -# this). We should set some other experimental parameters (number of results -# per task, evaluation measure and the number of trees of the internal -# functional Anova) before the fun can begin. -# -# Note that we simplify the example in several ways: -# -# 1) We only consider numerical hyperparameters -# 2) We consider all hyperparameters that are numerical (in reality, some -# hyperparameters might be inactive (e.g., ``degree``) or irrelevant -# (e.g., ``random_state``) -# 3) We assume all hyperparameters to be on uniform scale -# -# Any difference in conclusion between the actual paper and the presented -# results is most likely due to one of these simplifications. For example, -# the hyperparameter C looks rather insignificant, whereas it is quite -# important when it is put on a log-scale. All these simplifications can be -# addressed by defining a ConfigSpace. For a more elaborated example that uses -# this, please see: -# https://github.com/janvanrijn/openml-pimp/blob/d0a14f3eb480f2a90008889f00041bdccc7b9265/examples/plot/plot_fanova_aggregates.py # noqa F401 - -suite = openml.study.get_suite("OpenML100") -flow_id = 7707 -parameter_filters = {"sklearn.svm.classes.SVC(17)_kernel": "sigmoid"} -evaluation_measure = "predictive_accuracy" -limit_per_task = 500 -limit_nr_tasks = 15 -n_trees = 16 - -fanova_results = [] -# we will obtain all results from OpenML per task. Practice has shown that this places the bottleneck on the -# communication with OpenML, and for iterated experimenting it is better to cache the results in a local file. 
-for idx, task_id in enumerate(suite.tasks): - if limit_nr_tasks is not None and idx >= limit_nr_tasks: - continue - print( - "Starting with task %d (%d/%d)" - % (task_id, idx + 1, len(suite.tasks) if limit_nr_tasks is None else limit_nr_tasks) - ) - # note that we explicitly only include tasks from the benchmark suite that was specified (as per the for-loop) - evals = openml.evaluations.list_evaluations_setups( - evaluation_measure, - flows=[flow_id], - tasks=[task_id], - size=limit_per_task, - output_format="dataframe", - ) - - performance_column = "value" - # make a DataFrame consisting of all hyperparameters (which is a dict in setup['parameters']) and the performance - # value (in setup['value']). The following line looks a bit complicated, but combines 2 tasks: a) combine - # hyperparameters and performance data in a single dict, b) cast hyperparameter values to the appropriate format - # Note that the ``json.loads(...)`` requires the content to be in JSON format, which is only the case for - # scikit-learn setups (and even there some legacy setups might violate this requirement). It will work for the - # setups that belong to the flows embedded in this example though. - try: - setups_evals = pd.DataFrame( - [ - dict( - **{name: json.loads(value) for name, value in setup["parameters"].items()}, - **{performance_column: setup[performance_column]} - ) - for _, setup in evals.iterrows() - ] - ) - except json.decoder.JSONDecodeError as e: - print("Task %d error: %s" % (task_id, e)) - continue - # apply our filters, to have only the setups that comply to the hyperparameters we want - for filter_key, filter_value in parameter_filters.items(): - setups_evals = setups_evals[setups_evals[filter_key] == filter_value] - # in this simplified example, we only display numerical and float hyperparameters. For categorical hyperparameters, - # the fanova library needs to be informed by using a configspace object. - setups_evals = setups_evals.select_dtypes(include=["int64", "float64"]) - # drop rows with unique values. These are by definition not an interesting hyperparameter, e.g., ``axis``, - # ``verbose``. - setups_evals = setups_evals[ - [ - c - for c in list(setups_evals) - if len(setups_evals[c].unique()) > 1 or c == performance_column - ] - ] - # We are done with processing ``setups_evals``. Note that we still might have some irrelevant hyperparameters, e.g., - # ``random_state``. We have dropped some relevant hyperparameters, i.e., several categoricals. Let's check it out: - - # determine x values to pass to fanova library - parameter_names = [ - pname for pname in setups_evals.columns.to_numpy() if pname != performance_column - ] - evaluator = fanova.fanova.fANOVA( - X=setups_evals[parameter_names].to_numpy(), - Y=setups_evals[performance_column].to_numpy(), - n_trees=n_trees, - ) - for idx, pname in enumerate(parameter_names): - try: - fanova_results.append( - { - "hyperparameter": pname.split(".")[-1], - "fanova": evaluator.quantify_importance([idx])[(idx,)]["individual importance"], - } - ) - except RuntimeError as e: - # functional ANOVA sometimes crashes with a RuntimeError, e.g., on tasks where the performance is constant - # for all configurations (there is no variance). We will skip these tasks (like the authors did in the - # paper). 
- print("Task %d error: %s" % (task_id, e)) - continue - -# transform ``fanova_results`` from a list of dicts into a DataFrame -fanova_results = pd.DataFrame(fanova_results) - -############################################################################## -# make the boxplot of the variance contribution. Obviously, we can also use -# this data to make the Nemenyi plot, but this relies on the rather complex -# ``Orange`` dependency (``pip install Orange3``). For the complete example, -# the reader is referred to the more elaborate script (referred to earlier) -fig, ax = plt.subplots() -sns.boxplot(x="hyperparameter", y="fanova", data=fanova_results, ax=ax) -ax.set_xticklabels(ax.get_xticklabels(), rotation=45, ha="right") -ax.set_ylabel("Variance Contribution") -ax.set_xlabel(None) -plt.tight_layout() -plt.show() diff --git a/examples/40_paper/2018_kdd_rijn_example_LOCAL_34122.py b/examples/40_paper/2018_kdd_rijn_example_LOCAL_34122.py deleted file mode 100644 index 2230ade2d..000000000 --- a/examples/40_paper/2018_kdd_rijn_example_LOCAL_34122.py +++ /dev/null @@ -1,188 +0,0 @@ -""" -van Rijn and Hutter (2018) -========================== - -A tutorial on how to reproduce the paper *Hyperparameter Importance Across Datasets*. - -Example Deprecation Warning! -~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -This example is not supported anymore by the OpenML-Python developers. The example is kept for reference purposes but not tested anymore. - -Publication -~~~~~~~~~~~ - -| Hyperparameter importance across datasets -| Jan N. van Rijn and Frank Hutter -| In *Proceedings of the 24th ACM SIGKDD International Conference on Knowledge Discovery & Data Mining*, 2018 -| Available at https://dl.acm.org/doi/10.1145/3219819.3220058 - -Requirements -~~~~~~~~~~~~ - -This is a Unix-only tutorial, as the requirements can not be satisfied on a Windows machine (Untested on other -systems). - -The following Python packages are required: - -pip install openml[examples,docs] fanova ConfigSpace<1.0 -""" - -# License: BSD 3-Clause - -import sys - -if sys.platform == "win32": - print( - "The pyrfr library (requirement of fanova) can currently not be installed on Windows systems" - ) - sys.exit() - -# DEPRECATED EXAMPLE -- Avoid running this code in our CI/CD pipeline -print("This example is deprecated, remove this code to use it manually.") -sys.exit() - -import json - -import fanova -import matplotlib.pyplot as plt -import pandas as pd -import seaborn as sns - -import openml - -############################################################################## -# With the advent of automated machine learning, automated hyperparameter -# optimization methods are by now routinely used in data mining. However, this -# progress is not yet matched by equal progress on automatic analyses that -# yield information beyond performance-optimizing hyperparameter settings. -# In this example, we aim to answer the following two questions: Given an -# algorithm, what are generally its most important hyperparameters? -# -# This work is carried out on the OpenML-100 benchmark suite, which can be -# obtained by ``openml.study.get_suite('OpenML100')``. In this example, we -# conduct the experiment on the Support Vector Machine (``flow_id=7707``) -# with specific kernel (we will perform a post-process filter operation for -# this). We should set some other experimental parameters (number of results -# per task, evaluation measure and the number of trees of the internal -# functional Anova) before the fun can begin. 
-# -# Note that we simplify the example in several ways: -# -# 1) We only consider numerical hyperparameters -# 2) We consider all hyperparameters that are numerical (in reality, some -# hyperparameters might be inactive (e.g., ``degree``) or irrelevant -# (e.g., ``random_state``) -# 3) We assume all hyperparameters to be on uniform scale -# -# Any difference in conclusion between the actual paper and the presented -# results is most likely due to one of these simplifications. For example, -# the hyperparameter C looks rather insignificant, whereas it is quite -# important when it is put on a log-scale. All these simplifications can be -# addressed by defining a ConfigSpace. For a more elaborated example that uses -# this, please see: -# https://github.com/janvanrijn/openml-pimp/blob/d0a14f3eb480f2a90008889f00041bdccc7b9265/examples/plot/plot_fanova_aggregates.py - -suite = openml.study.get_suite("OpenML100") -flow_id = 7707 -parameter_filters = {"sklearn.svm.classes.SVC(17)_kernel": "sigmoid"} -evaluation_measure = "predictive_accuracy" -limit_per_task = 500 -limit_nr_tasks = 15 -n_trees = 16 - -fanova_results = [] -# we will obtain all results from OpenML per task. Practice has shown that this places the bottleneck on the -# communication with OpenML, and for iterated experimenting it is better to cache the results in a local file. -for idx, task_id in enumerate(suite.tasks): - if limit_nr_tasks is not None and idx >= limit_nr_tasks: - continue - print( - "Starting with task %d (%d/%d)" - % (task_id, idx + 1, len(suite.tasks) if limit_nr_tasks is None else limit_nr_tasks) - ) - # note that we explicitly only include tasks from the benchmark suite that was specified (as per the for-loop) - evals = openml.evaluations.list_evaluations_setups( - evaluation_measure, - flows=[flow_id], - tasks=[task_id], - size=limit_per_task, - ) - - performance_column = "value" - # make a DataFrame consisting of all hyperparameters (which is a dict in setup['parameters']) and the performance - # value (in setup['value']). The following line looks a bit complicated, but combines 2 tasks: a) combine - # hyperparameters and performance data in a single dict, b) cast hyperparameter values to the appropriate format - # Note that the ``json.loads(...)`` requires the content to be in JSON format, which is only the case for - # scikit-learn setups (and even there some legacy setups might violate this requirement). It will work for the - # setups that belong to the flows embedded in this example though. - try: - setups_evals = pd.DataFrame( - [ - dict( - **{name: json.loads(value) for name, value in setup["parameters"].items()}, - **{performance_column: setup[performance_column]}, - ) - for _, setup in evals.iterrows() - ] - ) - except json.decoder.JSONDecodeError as e: - print("Task %d error: %s" % (task_id, e)) - continue - # apply our filters, to have only the setups that comply to the hyperparameters we want - for filter_key, filter_value in parameter_filters.items(): - setups_evals = setups_evals[setups_evals[filter_key] == filter_value] - # in this simplified example, we only display numerical and float hyperparameters. For categorical hyperparameters, - # the fanova library needs to be informed by using a configspace object. - setups_evals = setups_evals.select_dtypes(include=["int64", "float64"]) - # drop rows with unique values. These are by definition not an interesting hyperparameter, e.g., ``axis``, - # ``verbose``. 
- setups_evals = setups_evals[ - [ - c - for c in list(setups_evals) - if len(setups_evals[c].unique()) > 1 or c == performance_column - ] - ] - # We are done with processing ``setups_evals``. Note that we still might have some irrelevant hyperparameters, e.g., - # ``random_state``. We have dropped some relevant hyperparameters, i.e., several categoricals. Let's check it out: - - # determine x values to pass to fanova library - parameter_names = [ - pname for pname in setups_evals.columns.to_numpy() if pname != performance_column - ] - evaluator = fanova.fanova.fANOVA( - X=setups_evals[parameter_names].to_numpy(), - Y=setups_evals[performance_column].to_numpy(), - n_trees=n_trees, - ) - for idx, pname in enumerate(parameter_names): - try: - fanova_results.append( - { - "hyperparameter": pname.split(".")[-1], - "fanova": evaluator.quantify_importance([idx])[(idx,)]["individual importance"], - } - ) - except RuntimeError as e: - # functional ANOVA sometimes crashes with a RuntimeError, e.g., on tasks where the performance is constant - # for all configurations (there is no variance). We will skip these tasks (like the authors did in the - # paper). - print("Task %d error: %s" % (task_id, e)) - continue - -# transform ``fanova_results`` from a list of dicts into a DataFrame -fanova_results = pd.DataFrame(fanova_results) - -############################################################################## -# make the boxplot of the variance contribution. Obviously, we can also use -# this data to make the Nemenyi plot, but this relies on the rather complex -# ``Orange`` dependency (``pip install Orange3``). For the complete example, -# the reader is referred to the more elaborate script (referred to earlier) -fig, ax = plt.subplots() -sns.boxplot(x="hyperparameter", y="fanova", data=fanova_results, ax=ax) -ax.set_xticklabels(ax.get_xticklabels(), rotation=45, ha="right") -ax.set_ylabel("Variance Contribution") -ax.set_xlabel(None) -plt.tight_layout() -plt.show() diff --git a/examples/40_paper/2018_kdd_rijn_example_LOCAL_39115.py b/examples/40_paper/2018_kdd_rijn_example_LOCAL_39115.py deleted file mode 100644 index 2230ade2d..000000000 --- a/examples/40_paper/2018_kdd_rijn_example_LOCAL_39115.py +++ /dev/null @@ -1,188 +0,0 @@ -""" -van Rijn and Hutter (2018) -========================== - -A tutorial on how to reproduce the paper *Hyperparameter Importance Across Datasets*. - -Example Deprecation Warning! -~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -This example is not supported anymore by the OpenML-Python developers. The example is kept for reference purposes but not tested anymore. - -Publication -~~~~~~~~~~~ - -| Hyperparameter importance across datasets -| Jan N. van Rijn and Frank Hutter -| In *Proceedings of the 24th ACM SIGKDD International Conference on Knowledge Discovery & Data Mining*, 2018 -| Available at https://dl.acm.org/doi/10.1145/3219819.3220058 - -Requirements -~~~~~~~~~~~~ - -This is a Unix-only tutorial, as the requirements can not be satisfied on a Windows machine (Untested on other -systems). 
- -The following Python packages are required: - -pip install openml[examples,docs] fanova ConfigSpace<1.0 -""" - -# License: BSD 3-Clause - -import sys - -if sys.platform == "win32": - print( - "The pyrfr library (requirement of fanova) can currently not be installed on Windows systems" - ) - sys.exit() - -# DEPRECATED EXAMPLE -- Avoid running this code in our CI/CD pipeline -print("This example is deprecated, remove this code to use it manually.") -sys.exit() - -import json - -import fanova -import matplotlib.pyplot as plt -import pandas as pd -import seaborn as sns - -import openml - -############################################################################## -# With the advent of automated machine learning, automated hyperparameter -# optimization methods are by now routinely used in data mining. However, this -# progress is not yet matched by equal progress on automatic analyses that -# yield information beyond performance-optimizing hyperparameter settings. -# In this example, we aim to answer the following two questions: Given an -# algorithm, what are generally its most important hyperparameters? -# -# This work is carried out on the OpenML-100 benchmark suite, which can be -# obtained by ``openml.study.get_suite('OpenML100')``. In this example, we -# conduct the experiment on the Support Vector Machine (``flow_id=7707``) -# with specific kernel (we will perform a post-process filter operation for -# this). We should set some other experimental parameters (number of results -# per task, evaluation measure and the number of trees of the internal -# functional Anova) before the fun can begin. -# -# Note that we simplify the example in several ways: -# -# 1) We only consider numerical hyperparameters -# 2) We consider all hyperparameters that are numerical (in reality, some -# hyperparameters might be inactive (e.g., ``degree``) or irrelevant -# (e.g., ``random_state``) -# 3) We assume all hyperparameters to be on uniform scale -# -# Any difference in conclusion between the actual paper and the presented -# results is most likely due to one of these simplifications. For example, -# the hyperparameter C looks rather insignificant, whereas it is quite -# important when it is put on a log-scale. All these simplifications can be -# addressed by defining a ConfigSpace. For a more elaborated example that uses -# this, please see: -# https://github.com/janvanrijn/openml-pimp/blob/d0a14f3eb480f2a90008889f00041bdccc7b9265/examples/plot/plot_fanova_aggregates.py - -suite = openml.study.get_suite("OpenML100") -flow_id = 7707 -parameter_filters = {"sklearn.svm.classes.SVC(17)_kernel": "sigmoid"} -evaluation_measure = "predictive_accuracy" -limit_per_task = 500 -limit_nr_tasks = 15 -n_trees = 16 - -fanova_results = [] -# we will obtain all results from OpenML per task. Practice has shown that this places the bottleneck on the -# communication with OpenML, and for iterated experimenting it is better to cache the results in a local file. 
-for idx, task_id in enumerate(suite.tasks): - if limit_nr_tasks is not None and idx >= limit_nr_tasks: - continue - print( - "Starting with task %d (%d/%d)" - % (task_id, idx + 1, len(suite.tasks) if limit_nr_tasks is None else limit_nr_tasks) - ) - # note that we explicitly only include tasks from the benchmark suite that was specified (as per the for-loop) - evals = openml.evaluations.list_evaluations_setups( - evaluation_measure, - flows=[flow_id], - tasks=[task_id], - size=limit_per_task, - ) - - performance_column = "value" - # make a DataFrame consisting of all hyperparameters (which is a dict in setup['parameters']) and the performance - # value (in setup['value']). The following line looks a bit complicated, but combines 2 tasks: a) combine - # hyperparameters and performance data in a single dict, b) cast hyperparameter values to the appropriate format - # Note that the ``json.loads(...)`` requires the content to be in JSON format, which is only the case for - # scikit-learn setups (and even there some legacy setups might violate this requirement). It will work for the - # setups that belong to the flows embedded in this example though. - try: - setups_evals = pd.DataFrame( - [ - dict( - **{name: json.loads(value) for name, value in setup["parameters"].items()}, - **{performance_column: setup[performance_column]}, - ) - for _, setup in evals.iterrows() - ] - ) - except json.decoder.JSONDecodeError as e: - print("Task %d error: %s" % (task_id, e)) - continue - # apply our filters, to have only the setups that comply to the hyperparameters we want - for filter_key, filter_value in parameter_filters.items(): - setups_evals = setups_evals[setups_evals[filter_key] == filter_value] - # in this simplified example, we only display numerical and float hyperparameters. For categorical hyperparameters, - # the fanova library needs to be informed by using a configspace object. - setups_evals = setups_evals.select_dtypes(include=["int64", "float64"]) - # drop rows with unique values. These are by definition not an interesting hyperparameter, e.g., ``axis``, - # ``verbose``. - setups_evals = setups_evals[ - [ - c - for c in list(setups_evals) - if len(setups_evals[c].unique()) > 1 or c == performance_column - ] - ] - # We are done with processing ``setups_evals``. Note that we still might have some irrelevant hyperparameters, e.g., - # ``random_state``. We have dropped some relevant hyperparameters, i.e., several categoricals. Let's check it out: - - # determine x values to pass to fanova library - parameter_names = [ - pname for pname in setups_evals.columns.to_numpy() if pname != performance_column - ] - evaluator = fanova.fanova.fANOVA( - X=setups_evals[parameter_names].to_numpy(), - Y=setups_evals[performance_column].to_numpy(), - n_trees=n_trees, - ) - for idx, pname in enumerate(parameter_names): - try: - fanova_results.append( - { - "hyperparameter": pname.split(".")[-1], - "fanova": evaluator.quantify_importance([idx])[(idx,)]["individual importance"], - } - ) - except RuntimeError as e: - # functional ANOVA sometimes crashes with a RuntimeError, e.g., on tasks where the performance is constant - # for all configurations (there is no variance). We will skip these tasks (like the authors did in the - # paper). 
- print("Task %d error: %s" % (task_id, e)) - continue - -# transform ``fanova_results`` from a list of dicts into a DataFrame -fanova_results = pd.DataFrame(fanova_results) - -############################################################################## -# make the boxplot of the variance contribution. Obviously, we can also use -# this data to make the Nemenyi plot, but this relies on the rather complex -# ``Orange`` dependency (``pip install Orange3``). For the complete example, -# the reader is referred to the more elaborate script (referred to earlier) -fig, ax = plt.subplots() -sns.boxplot(x="hyperparameter", y="fanova", data=fanova_results, ax=ax) -ax.set_xticklabels(ax.get_xticklabels(), rotation=45, ha="right") -ax.set_ylabel("Variance Contribution") -ax.set_xlabel(None) -plt.tight_layout() -plt.show() diff --git a/examples/40_paper/2018_kdd_rijn_example_REMOTE_34122.py b/examples/40_paper/2018_kdd_rijn_example_REMOTE_34122.py deleted file mode 100644 index 6522013e3..000000000 --- a/examples/40_paper/2018_kdd_rijn_example_REMOTE_34122.py +++ /dev/null @@ -1,188 +0,0 @@ -""" -van Rijn and Hutter (2018) -========================== - -A tutorial on how to reproduce the paper *Hyperparameter Importance Across Datasets*. - -Example Deprecation Warning! -~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -This example is not supported anymore by the OpenML-Python developers. The example is kept for reference purposes but not tested anymore. - -Publication -~~~~~~~~~~~ - -| Hyperparameter importance across datasets -| Jan N. van Rijn and Frank Hutter -| In *Proceedings of the 24th ACM SIGKDD International Conference on Knowledge Discovery & Data Mining*, 2018 -| Available at https://dl.acm.org/doi/10.1145/3219819.3220058 - -Requirements -~~~~~~~~~~~~ - -This is a Unix-only tutorial, as the requirements can not be satisfied on a Windows machine (Untested on other -systems). - -The following Python packages are required: - -pip install openml[examples,docs] fanova ConfigSpace<1.0 -""" - -# License: BSD 3-Clause - -import sys - -if sys.platform == "win32": # noqa - print( - "The pyrfr library (requirement of fanova) can currently not be installed on Windows systems" - ) - exit() - -# DEPRECATED EXAMPLE -- Avoid running this code in our CI/CD pipeline -print("This example is deprecated, remove the `if False` in this code to use it manually.") -if False: - import json - import fanova - import matplotlib.pyplot as plt - import pandas as pd - import seaborn as sns - - import openml - - - ############################################################################## - # With the advent of automated machine learning, automated hyperparameter - # optimization methods are by now routinely used in data mining. However, this - # progress is not yet matched by equal progress on automatic analyses that - # yield information beyond performance-optimizing hyperparameter settings. - # In this example, we aim to answer the following two questions: Given an - # algorithm, what are generally its most important hyperparameters? - # - # This work is carried out on the OpenML-100 benchmark suite, which can be - # obtained by ``openml.study.get_suite('OpenML100')``. In this example, we - # conduct the experiment on the Support Vector Machine (``flow_id=7707``) - # with specific kernel (we will perform a post-process filter operation for - # this). 
We should set some other experimental parameters (number of results - # per task, evaluation measure and the number of trees of the internal - # functional Anova) before the fun can begin. - # - # Note that we simplify the example in several ways: - # - # 1) We only consider numerical hyperparameters - # 2) We consider all hyperparameters that are numerical (in reality, some - # hyperparameters might be inactive (e.g., ``degree``) or irrelevant - # (e.g., ``random_state``) - # 3) We assume all hyperparameters to be on uniform scale - # - # Any difference in conclusion between the actual paper and the presented - # results is most likely due to one of these simplifications. For example, - # the hyperparameter C looks rather insignificant, whereas it is quite - # important when it is put on a log-scale. All these simplifications can be - # addressed by defining a ConfigSpace. For a more elaborated example that uses - # this, please see: - # https://github.com/janvanrijn/openml-pimp/blob/d0a14f3eb480f2a90008889f00041bdccc7b9265/examples/plot/plot_fanova_aggregates.py # noqa F401 - - suite = openml.study.get_suite("OpenML100") - flow_id = 7707 - parameter_filters = {"sklearn.svm.classes.SVC(17)_kernel": "sigmoid"} - evaluation_measure = "predictive_accuracy" - limit_per_task = 500 - limit_nr_tasks = 15 - n_trees = 16 - - fanova_results = [] - # we will obtain all results from OpenML per task. Practice has shown that this places the bottleneck on the - # communication with OpenML, and for iterated experimenting it is better to cache the results in a local file. - for idx, task_id in enumerate(suite.tasks): - if limit_nr_tasks is not None and idx >= limit_nr_tasks: - continue - print( - "Starting with task %d (%d/%d)" - % (task_id, idx + 1, len(suite.tasks) if limit_nr_tasks is None else limit_nr_tasks) - ) - # note that we explicitly only include tasks from the benchmark suite that was specified (as per the for-loop) - evals = openml.evaluations.list_evaluations_setups( - evaluation_measure, - flows=[flow_id], - tasks=[task_id], - size=limit_per_task, - output_format="dataframe", - ) - - performance_column = "value" - # make a DataFrame consisting of all hyperparameters (which is a dict in setup['parameters']) and the performance - # value (in setup['value']). The following line looks a bit complicated, but combines 2 tasks: a) combine - # hyperparameters and performance data in a single dict, b) cast hyperparameter values to the appropriate format - # Note that the ``json.loads(...)`` requires the content to be in JSON format, which is only the case for - # scikit-learn setups (and even there some legacy setups might violate this requirement). It will work for the - # setups that belong to the flows embedded in this example though. - try: - setups_evals = pd.DataFrame( - [ - dict( - **{name: json.loads(value) for name, value in setup["parameters"].items()}, - **{performance_column: setup[performance_column]} - ) - for _, setup in evals.iterrows() - ] - ) - except json.decoder.JSONDecodeError as e: - print("Task %d error: %s" % (task_id, e)) - continue - # apply our filters, to have only the setups that comply to the hyperparameters we want - for filter_key, filter_value in parameter_filters.items(): - setups_evals = setups_evals[setups_evals[filter_key] == filter_value] - # in this simplified example, we only display numerical and float hyperparameters. For categorical hyperparameters, - # the fanova library needs to be informed by using a configspace object. 
- setups_evals = setups_evals.select_dtypes(include=["int64", "float64"]) - # drop rows with unique values. These are by definition not an interesting hyperparameter, e.g., ``axis``, - # ``verbose``. - setups_evals = setups_evals[ - [ - c - for c in list(setups_evals) - if len(setups_evals[c].unique()) > 1 or c == performance_column - ] - ] - # We are done with processing ``setups_evals``. Note that we still might have some irrelevant hyperparameters, e.g., - # ``random_state``. We have dropped some relevant hyperparameters, i.e., several categoricals. Let's check it out: - - # determine x values to pass to fanova library - parameter_names = [ - pname for pname in setups_evals.columns.to_numpy() if pname != performance_column - ] - evaluator = fanova.fanova.fANOVA( - X=setups_evals[parameter_names].to_numpy(), - Y=setups_evals[performance_column].to_numpy(), - n_trees=n_trees, - ) - for idx, pname in enumerate(parameter_names): - try: - fanova_results.append( - { - "hyperparameter": pname.split(".")[-1], - "fanova": evaluator.quantify_importance([idx])[(idx,)]["individual importance"], - } - ) - except RuntimeError as e: - # functional ANOVA sometimes crashes with a RuntimeError, e.g., on tasks where the performance is constant - # for all configurations (there is no variance). We will skip these tasks (like the authors did in the - # paper). - print("Task %d error: %s" % (task_id, e)) - continue - - # transform ``fanova_results`` from a list of dicts into a DataFrame - fanova_results = pd.DataFrame(fanova_results) - - ############################################################################## - # make the boxplot of the variance contribution. Obviously, we can also use - # this data to make the Nemenyi plot, but this relies on the rather complex - # ``Orange`` dependency (``pip install Orange3``). For the complete example, - # the reader is referred to the more elaborate script (referred to earlier) - fig, ax = plt.subplots() - sns.boxplot(x="hyperparameter", y="fanova", data=fanova_results, ax=ax) - ax.set_xticklabels(ax.get_xticklabels(), rotation=45, ha="right") - ax.set_ylabel("Variance Contribution") - ax.set_xlabel(None) - plt.tight_layout() - plt.show() diff --git a/examples/40_paper/2018_kdd_rijn_example_REMOTE_39115.py b/examples/40_paper/2018_kdd_rijn_example_REMOTE_39115.py deleted file mode 100644 index 6522013e3..000000000 --- a/examples/40_paper/2018_kdd_rijn_example_REMOTE_39115.py +++ /dev/null @@ -1,188 +0,0 @@ -""" -van Rijn and Hutter (2018) -========================== - -A tutorial on how to reproduce the paper *Hyperparameter Importance Across Datasets*. - -Example Deprecation Warning! -~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -This example is not supported anymore by the OpenML-Python developers. The example is kept for reference purposes but not tested anymore. - -Publication -~~~~~~~~~~~ - -| Hyperparameter importance across datasets -| Jan N. van Rijn and Frank Hutter -| In *Proceedings of the 24th ACM SIGKDD International Conference on Knowledge Discovery & Data Mining*, 2018 -| Available at https://dl.acm.org/doi/10.1145/3219819.3220058 - -Requirements -~~~~~~~~~~~~ - -This is a Unix-only tutorial, as the requirements can not be satisfied on a Windows machine (Untested on other -systems). 
- -The following Python packages are required: - -pip install openml[examples,docs] fanova ConfigSpace<1.0 -""" - -# License: BSD 3-Clause - -import sys - -if sys.platform == "win32": # noqa - print( - "The pyrfr library (requirement of fanova) can currently not be installed on Windows systems" - ) - exit() - -# DEPRECATED EXAMPLE -- Avoid running this code in our CI/CD pipeline -print("This example is deprecated, remove the `if False` in this code to use it manually.") -if False: - import json - import fanova - import matplotlib.pyplot as plt - import pandas as pd - import seaborn as sns - - import openml - - - ############################################################################## - # With the advent of automated machine learning, automated hyperparameter - # optimization methods are by now routinely used in data mining. However, this - # progress is not yet matched by equal progress on automatic analyses that - # yield information beyond performance-optimizing hyperparameter settings. - # In this example, we aim to answer the following two questions: Given an - # algorithm, what are generally its most important hyperparameters? - # - # This work is carried out on the OpenML-100 benchmark suite, which can be - # obtained by ``openml.study.get_suite('OpenML100')``. In this example, we - # conduct the experiment on the Support Vector Machine (``flow_id=7707``) - # with specific kernel (we will perform a post-process filter operation for - # this). We should set some other experimental parameters (number of results - # per task, evaluation measure and the number of trees of the internal - # functional Anova) before the fun can begin. - # - # Note that we simplify the example in several ways: - # - # 1) We only consider numerical hyperparameters - # 2) We consider all hyperparameters that are numerical (in reality, some - # hyperparameters might be inactive (e.g., ``degree``) or irrelevant - # (e.g., ``random_state``) - # 3) We assume all hyperparameters to be on uniform scale - # - # Any difference in conclusion between the actual paper and the presented - # results is most likely due to one of these simplifications. For example, - # the hyperparameter C looks rather insignificant, whereas it is quite - # important when it is put on a log-scale. All these simplifications can be - # addressed by defining a ConfigSpace. For a more elaborated example that uses - # this, please see: - # https://github.com/janvanrijn/openml-pimp/blob/d0a14f3eb480f2a90008889f00041bdccc7b9265/examples/plot/plot_fanova_aggregates.py # noqa F401 - - suite = openml.study.get_suite("OpenML100") - flow_id = 7707 - parameter_filters = {"sklearn.svm.classes.SVC(17)_kernel": "sigmoid"} - evaluation_measure = "predictive_accuracy" - limit_per_task = 500 - limit_nr_tasks = 15 - n_trees = 16 - - fanova_results = [] - # we will obtain all results from OpenML per task. Practice has shown that this places the bottleneck on the - # communication with OpenML, and for iterated experimenting it is better to cache the results in a local file. 
- for idx, task_id in enumerate(suite.tasks): - if limit_nr_tasks is not None and idx >= limit_nr_tasks: - continue - print( - "Starting with task %d (%d/%d)" - % (task_id, idx + 1, len(suite.tasks) if limit_nr_tasks is None else limit_nr_tasks) - ) - # note that we explicitly only include tasks from the benchmark suite that was specified (as per the for-loop) - evals = openml.evaluations.list_evaluations_setups( - evaluation_measure, - flows=[flow_id], - tasks=[task_id], - size=limit_per_task, - output_format="dataframe", - ) - - performance_column = "value" - # make a DataFrame consisting of all hyperparameters (which is a dict in setup['parameters']) and the performance - # value (in setup['value']). The following line looks a bit complicated, but combines 2 tasks: a) combine - # hyperparameters and performance data in a single dict, b) cast hyperparameter values to the appropriate format - # Note that the ``json.loads(...)`` requires the content to be in JSON format, which is only the case for - # scikit-learn setups (and even there some legacy setups might violate this requirement). It will work for the - # setups that belong to the flows embedded in this example though. - try: - setups_evals = pd.DataFrame( - [ - dict( - **{name: json.loads(value) for name, value in setup["parameters"].items()}, - **{performance_column: setup[performance_column]} - ) - for _, setup in evals.iterrows() - ] - ) - except json.decoder.JSONDecodeError as e: - print("Task %d error: %s" % (task_id, e)) - continue - # apply our filters, to have only the setups that comply to the hyperparameters we want - for filter_key, filter_value in parameter_filters.items(): - setups_evals = setups_evals[setups_evals[filter_key] == filter_value] - # in this simplified example, we only display numerical and float hyperparameters. For categorical hyperparameters, - # the fanova library needs to be informed by using a configspace object. - setups_evals = setups_evals.select_dtypes(include=["int64", "float64"]) - # drop rows with unique values. These are by definition not an interesting hyperparameter, e.g., ``axis``, - # ``verbose``. - setups_evals = setups_evals[ - [ - c - for c in list(setups_evals) - if len(setups_evals[c].unique()) > 1 or c == performance_column - ] - ] - # We are done with processing ``setups_evals``. Note that we still might have some irrelevant hyperparameters, e.g., - # ``random_state``. We have dropped some relevant hyperparameters, i.e., several categoricals. Let's check it out: - - # determine x values to pass to fanova library - parameter_names = [ - pname for pname in setups_evals.columns.to_numpy() if pname != performance_column - ] - evaluator = fanova.fanova.fANOVA( - X=setups_evals[parameter_names].to_numpy(), - Y=setups_evals[performance_column].to_numpy(), - n_trees=n_trees, - ) - for idx, pname in enumerate(parameter_names): - try: - fanova_results.append( - { - "hyperparameter": pname.split(".")[-1], - "fanova": evaluator.quantify_importance([idx])[(idx,)]["individual importance"], - } - ) - except RuntimeError as e: - # functional ANOVA sometimes crashes with a RuntimeError, e.g., on tasks where the performance is constant - # for all configurations (there is no variance). We will skip these tasks (like the authors did in the - # paper). 
- print("Task %d error: %s" % (task_id, e)) - continue - - # transform ``fanova_results`` from a list of dicts into a DataFrame - fanova_results = pd.DataFrame(fanova_results) - - ############################################################################## - # make the boxplot of the variance contribution. Obviously, we can also use - # this data to make the Nemenyi plot, but this relies on the rather complex - # ``Orange`` dependency (``pip install Orange3``). For the complete example, - # the reader is referred to the more elaborate script (referred to earlier) - fig, ax = plt.subplots() - sns.boxplot(x="hyperparameter", y="fanova", data=fanova_results, ax=ax) - ax.set_xticklabels(ax.get_xticklabels(), rotation=45, ha="right") - ax.set_ylabel("Variance Contribution") - ax.set_xlabel(None) - plt.tight_layout() - plt.show() From ddda71113de125ef7bd5ecf8b22354127bf23421 Mon Sep 17 00:00:00 2001 From: LennartPurucker Date: Mon, 16 Jun 2025 10:47:45 +0200 Subject: [PATCH 07/23] fix: ensure dtypes as in original code --- openml/datasets/dataset.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/openml/datasets/dataset.py b/openml/datasets/dataset.py index 91254ebd7..3712c99ba 100644 --- a/openml/datasets/dataset.py +++ b/openml/datasets/dataset.py @@ -463,7 +463,7 @@ def _parse_data_from_arff( # noqa: C901, PLR0912, PLR0915 data = self._get_arff(self.format) except OSError as e: logger.critical( - f"Please check that the data file {arff_file_path} is " "there and can be read.", + f"Please check that the data file {arff_file_path} is there and can be read.", ) raise e @@ -716,7 +716,7 @@ def valid_category(cat: Any) -> bool: raw_cat = pd.Categorical(col, ordered=True, categories=filtered_categories) return pd.Series(raw_cat, index=series.index, name=series.name) - def get_data( # noqa: C901, PLR0912 + def get_data( # noqa: C901 self, target: list[str] | str | None = None, include_row_id: bool = False, # noqa: FBT001, FBT002 @@ -790,10 +790,10 @@ def get_data( # noqa: C901, PLR0912 y = y.sparse.to_dense() # Since it was sparsified, the y column may not be of the correct dtype, hence we check - # if it was categorical and convert it to and integer if needs be. + # if it was categorical and convert it to an integer if needs be. category_names = data.columns[categorical_mask] - if target_name in category_names: - y = y.astype(int) + target_dtype = int if target_name in category_names else float + y = y.astype(target_dtype) # Finally, remove the target from the list of attributes and categorical mask target_index = attribute_names.index(target_name) From 063a8e6841fe0ece37b365ff07dfce74aaf3ad5a Mon Sep 17 00:00:00 2001 From: LennartPurucker Date: Mon, 16 Jun 2025 12:03:05 +0200 Subject: [PATCH 08/23] fix: remove incorrect parsing of sparse pandas --- openml/datasets/dataset.py | 9 --------- tests/test_datasets/test_dataset.py | 4 ++-- 2 files changed, 2 insertions(+), 11 deletions(-) diff --git a/openml/datasets/dataset.py b/openml/datasets/dataset.py index 3712c99ba..fa83d2b8a 100644 --- a/openml/datasets/dataset.py +++ b/openml/datasets/dataset.py @@ -786,15 +786,6 @@ def get_data( # noqa: C901 x = data.drop(columns=[target_name]) y = data[target_name].squeeze() - if isinstance(y.dtype, pd.SparseDtype): - y = y.sparse.to_dense() - - # Since it was sparsified, the y column may not be of the correct dtype, hence we check - # if it was categorical and convert it to an integer if needs be. 
- category_names = data.columns[categorical_mask] - target_dtype = int if target_name in category_names else float - y = y.astype(target_dtype) - # Finally, remove the target from the list of attributes and categorical mask target_index = attribute_names.index(target_name) categorical_mask.pop(target_index) diff --git a/tests/test_datasets/test_dataset.py b/tests/test_datasets/test_dataset.py index 8afc1fb91..d132c4233 100644 --- a/tests/test_datasets/test_dataset.py +++ b/tests/test_datasets/test_dataset.py @@ -373,14 +373,14 @@ def test_get_sparse_dataset_rowid_and_ignore_and_target(self): # TODO: re-add row_id and ignore attributes self.sparse_dataset.ignore_attribute = ["V256"] self.sparse_dataset.row_id_attribute = ["V512"] - # TODO(eddiebergman): Will break from dataset_format removal X, y, categorical, _ = self.sparse_dataset.get_data( target="class", include_row_id=False, include_ignore_attribute=False, ) assert all(dtype == pd.SparseDtype(np.float32, fill_value=0.0) for dtype in X.dtypes) - assert y.dtype in [np.int32, np.int64] + # array format returned dense, but now we only return sparse and let the user handle it. + assert isinstance(y.dtypes, pd.SparseDtype) assert X.shape == (600, 19998) assert len(categorical) == 19998 From 2e6c4c770074a03264670ef74a031f33c5ff073b Mon Sep 17 00:00:00 2001 From: LennartPurucker Date: Mon, 16 Jun 2025 12:03:48 +0200 Subject: [PATCH 09/23] fix: make sklearn tests work with pandas --- .../test_evaluations_example.py | 1 - .../test_sklearn_extension.py | 78 ++++++++++++------- 2 files changed, 51 insertions(+), 28 deletions(-) diff --git a/tests/test_evaluations/test_evaluations_example.py b/tests/test_evaluations/test_evaluations_example.py index ba9b46216..a9ad7e8c1 100644 --- a/tests/test_evaluations/test_evaluations_example.py +++ b/tests/test_evaluations/test_evaluations_example.py @@ -24,7 +24,6 @@ def test_example_python_paper(self): "predictive_accuracy", flows=[8353], tasks=[6], - # output_format="dataframe", ... This was removed parameters_in_separate_columns=True, ) # Choose an SVM flow, for example 8353, and a task. diff --git a/tests/test_extensions/test_sklearn_extension/test_sklearn_extension.py b/tests/test_extensions/test_sklearn_extension/test_sklearn_extension.py index 242f6793c..dec8d9e00 100644 --- a/tests/test_extensions/test_sklearn_extension/test_sklearn_extension.py +++ b/tests/test_extensions/test_sklearn_extension/test_sklearn_extension.py @@ -61,6 +61,28 @@ def __init__(self, boolean, integer, floating_point_value): def fit(self, X, y): pass +def _get_sklearn_preprocessing(): + from sklearn.compose import ColumnTransformer, make_column_selector + from sklearn.preprocessing import OrdinalEncoder + + return [ + ( + "cat_handling", + ColumnTransformer( + transformers=[ + ( + "cat", + OrdinalEncoder( + handle_unknown="use_encoded_value", unknown_value=np.nan + ), + make_column_selector(dtype_include=["object", "category"]), + ) + ], + remainder="passthrough", + ), + ), + ("imp", SimpleImputer())] + class TestSklearnExtensionFlowFunctions(TestBase): # Splitting not helpful, these test's don't rely on the server and take less @@ -735,7 +757,7 @@ def test_serialize_feature_union(self): ) fixture_structure = { fixture_name: [], - f"sklearn.preprocessing.{module_name_encoder}." 
"OneHotEncoder": ["ohe"], + f"sklearn.preprocessing.{module_name_encoder}.OneHotEncoder": ["ohe"], f"sklearn.preprocessing.{scaler_name}.StandardScaler": ["scaler"], } assert serialization.name == fixture_name @@ -862,13 +884,13 @@ def test_serialize_complex_flow(self): weight_name, tree_name, ) - pipeline_name = "sklearn.pipeline.Pipeline(ohe={},scaler={}," "boosting={})".format( + pipeline_name = "sklearn.pipeline.Pipeline(ohe={},scaler={},boosting={})".format( ohe_name, scaler_name, boosting_name, ) fixture_name = ( - "sklearn.model_selection._search.RandomizedSearchCV" "(estimator=%s)" % pipeline_name + "sklearn.model_selection._search.RandomizedSearchCV(estimator=%s)" % pipeline_name ) fixture_structure = { ohe_name: ["estimator", "ohe"], @@ -1235,7 +1257,7 @@ def test_error_on_adding_component_multiple_times_to_flow(self): fu = sklearn.pipeline.FeatureUnion((("pca1", pca), ("pca2", pca2))) fixture = ( - "Found a second occurence of component .*.PCA when trying " "to serialize FeatureUnion" + "Found a second occurence of component .*.PCA when trying to serialize FeatureUnion" ) with pytest.raises(ValueError, match=fixture): self.extension.model_to_flow(fu) @@ -1737,13 +1759,13 @@ def test_run_model_on_fold_classification_1_array(self): X, y = task.get_X_and_y() train_indices, test_indices = task.get_train_test_split_indices(repeat=0, fold=0, sample=0) - X_train = X[train_indices] - y_train = y[train_indices] - X_test = X[test_indices] - y_test = y[test_indices] + X_train = X.iloc[train_indices] + y_train = y.iloc[train_indices] + X_test = X.iloc[test_indices] + y_test = y.iloc[test_indices] pipeline = sklearn.pipeline.Pipeline( - steps=[("imp", SimpleImputer()), ("clf", sklearn.tree.DecisionTreeClassifier())], + steps=[*_get_sklearn_preprocessing(), ("clf", sklearn.tree.DecisionTreeClassifier())], ) # TODO add some mocking here to actually test the innards of this function, too! res = self.extension._run_model_on_fold( @@ -1875,14 +1897,16 @@ def test_run_model_on_fold_classification_2(self): X, y = task.get_X_and_y() train_indices, test_indices = task.get_train_test_split_indices(repeat=0, fold=0, sample=0) - X_train = X[train_indices] - y_train = y[train_indices] - X_test = X[test_indices] - y_test = y[test_indices] + X_train = X.iloc[train_indices] + y_train = y.iloc[train_indices] + X_test = X.iloc[test_indices] + y_test = y.iloc[test_indices] pipeline = sklearn.model_selection.GridSearchCV( - sklearn.tree.DecisionTreeClassifier(), - {"max_depth": [1, 2]}, + sklearn.pipeline.Pipeline( + steps=[*_get_sklearn_preprocessing(), ("clf", sklearn.tree.DecisionTreeClassifier())], + ), + {"clf__max_depth": [1, 2]}, ) # TODO add some mocking here to actually test the innards of this function, too! 
res = self.extension._run_model_on_fold( @@ -1931,7 +1955,7 @@ class HardNaiveBayes(sklearn.naive_bayes.GaussianNB): # class for testing a naive bayes classifier that does not allow soft # predictions def predict_proba(*args, **kwargs): - raise AttributeError("predict_proba is not available when " "probability=False") + raise AttributeError("predict_proba is not available when probability=False") # task 1 (test server) is important: it is a task with an unused class tasks = [ @@ -1950,17 +1974,17 @@ def predict_proba(*args, **kwargs): fold=0, sample=0, ) - X_train = X[train_indices] - y_train = y[train_indices] - X_test = X[test_indices] + X_train = X.iloc[train_indices] + y_train = y.iloc[train_indices] + X_test = X.iloc[test_indices] clf1 = sklearn.pipeline.Pipeline( steps=[ - ("imputer", SimpleImputer()), + *_get_sklearn_preprocessing(), ("estimator", sklearn.naive_bayes.GaussianNB()), ], ) clf2 = sklearn.pipeline.Pipeline( - steps=[("imputer", SimpleImputer()), ("estimator", HardNaiveBayes())], + steps=[*_get_sklearn_preprocessing(), ("estimator", HardNaiveBayes())], ) pred_1, proba_1, _, _ = self.extension._run_model_on_fold( @@ -2005,10 +2029,10 @@ def test_run_model_on_fold_regression(self): X, y = task.get_X_and_y() train_indices, test_indices = task.get_train_test_split_indices(repeat=0, fold=0, sample=0) - X_train = X[train_indices] - y_train = y[train_indices] - X_test = X[test_indices] - y_test = y[test_indices] + X_train = X.iloc[train_indices] + y_train = y.iloc[train_indices] + X_test = X.iloc[test_indices] + y_test = y.iloc[test_indices] pipeline = sklearn.pipeline.Pipeline( steps=[("imp", SimpleImputer()), ("clf", sklearn.tree.DecisionTreeRegressor())], @@ -2059,7 +2083,7 @@ def test_run_model_on_fold_clustering(self): X = task.get_X() pipeline = sklearn.pipeline.Pipeline( - steps=[("imp", SimpleImputer()), ("clf", sklearn.cluster.KMeans())], + steps=[*_get_sklearn_preprocessing(), ("clf", sklearn.cluster.KMeans())], ) # TODO add some mocking here to actually test the innards of this function, too! 
res = self.extension._run_model_on_fold( @@ -2115,7 +2139,7 @@ def test__extract_trace_data(self): X, y = task.get_X_and_y() with warnings.catch_warnings(): warnings.simplefilter("ignore") - clf.fit(X[train], y[train]) + clf.fit(X.iloc[train], y.iloc[train]) # check num layers of MLP assert clf.best_estimator_.hidden_layer_sizes in param_grid["hidden_layer_sizes"] From 12dedb046814742535428fc0dd030c13c7ee1724 Mon Sep 17 00:00:00 2001 From: LennartPurucker Date: Mon, 16 Jun 2025 14:52:31 +0200 Subject: [PATCH 10/23] fix: fix listing calls and test for utils --- openml/datasets/functions.py | 9 +++++++-- openml/evaluations/functions.py | 6 +++++- openml/flows/functions.py | 18 ++++++++++++------ openml/runs/functions.py | 20 +++++++++++--------- openml/setups/functions.py | 12 ++++++++---- openml/study/functions.py | 11 ++++++++--- openml/tasks/functions.py | 7 +++++-- openml/utils.py | 5 +++-- tests/test_utils/test_utils.py | 12 ++++-------- 9 files changed, 63 insertions(+), 37 deletions(-) diff --git a/openml/datasets/functions.py b/openml/datasets/functions.py index b2b5c6ea5..ea7266c62 100644 --- a/openml/datasets/functions.py +++ b/openml/datasets/functions.py @@ -73,7 +73,7 @@ def list_qualities() -> list[str]: raise ValueError('Error in return XML, does not contain "oml:data_qualities_list"') if not isinstance(qualities["oml:data_qualities_list"]["oml:quality"], list): - raise TypeError("Error in return XML, does not contain " '"oml:quality" as a list') + raise TypeError('Error in return XML, does not contain "oml:quality" as a list') return qualities["oml:data_qualities_list"]["oml:quality"] @@ -179,7 +179,12 @@ def _list_datasets( ------- datasets : dataframe """ - api_call = f"data/list/limit/{limit}/offset/{offset}" + api_call = "data/list" + + if limit is not None: + api_call += f"/limit/{limit}" + if offset is not None: + api_call += f"/offset/{offset}" if kwargs is not None: for operator, value in kwargs.items(): diff --git a/openml/evaluations/functions.py b/openml/evaluations/functions.py index bee326438..f44fe3a93 100644 --- a/openml/evaluations/functions.py +++ b/openml/evaluations/functions.py @@ -194,7 +194,11 @@ def _list_evaluations( # noqa: C901 ------- list of OpenMLEvaluation objects """ - api_call = f"evaluation/list/function/{function}/limit/{limit}/offset/{offset}" + api_call = f"evaluation/list/function/{function}" + if limit is not None: + api_call += f"/limit/{limit}" + if offset is not None: + api_call += f"/offset/{offset}" if kwargs is not None: for operator, value in kwargs.items(): if value is not None: diff --git a/openml/flows/functions.py b/openml/flows/functions.py index 7cb93386b..9906958e5 100644 --- a/openml/flows/functions.py +++ b/openml/flows/functions.py @@ -66,7 +66,7 @@ def _get_cached_flow(fid: int) -> OpenMLFlow: return _create_flow_from_xml(fh.read()) except OSError as e: openml.utils._remove_cache_dir_for_id(FLOWS_CACHE_DIR_NAME, fid_cache_dir) - raise OpenMLCacheException("Flow file for fid %d not " "cached" % fid) from e + raise OpenMLCacheException("Flow file for fid %d not cached" % fid) from e @openml.utils.thread_safe_if_oslo_installed @@ -190,11 +190,17 @@ def _list_flows(limit: int, offset: int, **kwargs: Any) -> pd.DataFrame: ------- flows : dataframe """ - api_call = f"flow/list/limit/{limit}/offset/{offset}" + api_call = "flow/list" + + if limit is not None: + api_call += f"/limit/{limit}" + if offset is not None: + api_call += f"/offset/{offset}" if kwargs is not None: for operator, value in kwargs.items(): - api_call += 
f"/{operator}/{value}" + if value is not None: + api_call += f"/{operator}/{value}" return __list_flows(api_call=api_call) @@ -417,11 +423,11 @@ def assert_flows_equal( # noqa: C901, PLR0912, PLR0913, PLR0915 for name in set(attr1.keys()).union(attr2.keys()): if name not in attr1: raise ValueError( - f"Component {name} only available in " "argument2, but not in argument1.", + f"Component {name} only available in argument2, but not in argument1.", ) if name not in attr2: raise ValueError( - f"Component {name} only available in " "argument2, but not in argument1.", + f"Component {name} only available in argument2, but not in argument1.", ) assert_flows_equal( attr1[name], @@ -482,7 +488,7 @@ def assert_flows_equal( # noqa: C901, PLR0912, PLR0913, PLR0915 params2 = set(flow2.parameters_meta_info) if params1 != params2: raise ValueError( - "Parameter list in meta info for parameters differ " "in the two flows.", + "Parameter list in meta info for parameters differ in the two flows.", ) # iterating over the parameter's meta info list for param in params1: diff --git a/openml/runs/functions.py b/openml/runs/functions.py index 92a102ec2..e66af7b15 100644 --- a/openml/runs/functions.py +++ b/openml/runs/functions.py @@ -250,8 +250,7 @@ def run_flow_on_task( # noqa: C901, PLR0912, PLR0915, PLR0913 if isinstance(flow.flow_id, int) and flow_id != flow.flow_id: if flow_id is not False: raise PyOpenMLError( - "Local flow_id does not match server flow_id: " - f"'{flow.flow_id}' vs '{flow_id}'", + f"Local flow_id does not match server flow_id: '{flow.flow_id}' vs '{flow_id}'", ) raise PyOpenMLError( "Flow does not exist on the server, but 'flow.flow_id' is not None." @@ -283,8 +282,7 @@ def run_flow_on_task( # noqa: C901, PLR0912, PLR0915, PLR0913 if flow.extension.check_if_model_fitted(flow.model): warnings.warn( - "The model is already fitted!" - " This might cause inconsistency in comparison of results.", + "The model is already fitted! 
This might cause inconsistency in comparison of results.", RuntimeWarning, stacklevel=2, ) @@ -860,7 +858,7 @@ def obtain_field(xml_obj, fieldname, from_server, cast=None): # type: ignore if not from_server: return None - raise AttributeError("Run XML does not contain required (server) " "field: ", fieldname) + raise AttributeError("Run XML does not contain required (server) field: ", fieldname) run = xmltodict.parse(xml, force_list=["oml:file", "oml:evaluation", "oml:parameter_setting"])[ "oml:run" @@ -920,7 +918,7 @@ def obtain_field(xml_obj, fieldname, from_server, cast=None): # type: ignore sample_evaluations: dict[str, dict[int, dict[int, dict[int, float | Any]]]] = {} if "oml:output_data" not in run: if from_server: - raise ValueError("Run does not contain output_data " "(OpenML server error?)") + raise ValueError("Run does not contain output_data (OpenML server error?)") predictions_url = None else: output_data = run["oml:output_data"] @@ -972,7 +970,7 @@ def obtain_field(xml_obj, fieldname, from_server, cast=None): # type: ignore evaluations[key] = value if "description" not in files and from_server is True: - raise ValueError("No description file for run %d in run " "description XML" % run_id) + raise ValueError("No description file for run %d in run description XML" % run_id) if "predictions" not in files and from_server is True: task = openml.tasks.get_task(task_id) @@ -1100,7 +1098,7 @@ def list_runs( # noqa: PLR0913 return pd.concat(batches) -def _list_runs( # noqa: PLR0913 +def _list_runs( # noqa: PLR0913, C901 limit: int, offset: int, *, @@ -1150,7 +1148,11 @@ def _list_runs( # noqa: PLR0913 dict, or dataframe List of found runs. """ - api_call = f"run/list/limit/{limit}/offset/{offset}" + api_call = "run/list" + if limit is not None: + api_call += f"/limit/{limit}" + if offset is not None: + api_call += f"/offset/{offset}" if id is not None: api_call += "/run/{}".format(",".join([str(int(i)) for i in id])) if task is not None: diff --git a/openml/setups/functions.py b/openml/setups/functions.py index 2ab18e428..cc71418df 100644 --- a/openml/setups/functions.py +++ b/openml/setups/functions.py @@ -192,7 +192,7 @@ def _list_setups( filters which are put into the kwargs. limit : int - listing_offset : int + offset : int setup : list(int), optional flow : int, optional tag : str, optional @@ -201,7 +201,11 @@ def _list_setups( ------- The setups that match the filters, going from id to the OpenMLSetup object. 
""" - api_call = f"setup/list/offset/{offset}/limit/{limit}" + api_call = "setup/list" + if limit is not None: + api_call += f"/limit/{limit}" + if offset is not None: + api_call += f"/offset/{offset}" if setup is not None: api_call += "/setup/{}".format(",".join([str(int(i)) for i in setup])) if flow is not None: @@ -220,12 +224,12 @@ def __list_setups(api_call: str) -> list[OpenMLSetup]: # Minimalistic check if the XML is useful if "oml:setups" not in setups_dict: raise ValueError( - 'Error in return XML, does not contain "oml:setups":' f" {setups_dict!s}", + f'Error in return XML, does not contain "oml:setups": {setups_dict!s}', ) if "@xmlns:oml" not in setups_dict["oml:setups"]: raise ValueError( - "Error in return XML, does not contain " f'"oml:setups"/@xmlns:oml: {setups_dict!s}', + f'Error in return XML, does not contain "oml:setups"/@xmlns:oml: {setups_dict!s}', ) if setups_dict["oml:setups"]["@xmlns:oml"] != openml_uri: diff --git a/openml/study/functions.py b/openml/study/functions.py index fb71272a1..4e16879d7 100644 --- a/openml/study/functions.py +++ b/openml/study/functions.py @@ -298,7 +298,7 @@ def update_study_status(study_id: int, status: str) -> None: """ legal_status = {"active", "deactivated"} if status not in legal_status: - raise ValueError("Illegal status value. " f"Legal values: {legal_status}") + raise ValueError(f"Illegal status value. Legal values: {legal_status}") data = {"study_id": study_id, "status": status} # type: openml._api_calls.DATA_TYPE result_xml = openml._api_calls._perform_api_call("study/status/update", "post", data=data) result = xmltodict.parse(result_xml) @@ -548,10 +548,15 @@ def _list_studies(limit: int, offset: int, **kwargs: Any) -> pd.DataFrame: ------- studies : dataframe """ - api_call = f"study/list/limit/{limit}/offset/{offset}" + api_call = "study/list" + if limit is not None: + api_call += f"/limit/{limit}" + if offset is not None: + api_call += f"/offset/{offset}" if kwargs is not None: for operator, value in kwargs.items(): - api_call += f"/{operator}/{value}" + if value is not None: + api_call += f"/{operator}/{value}" return __list_studies(api_call=api_call) diff --git a/openml/tasks/functions.py b/openml/tasks/functions.py index 497e35c69..25156f2e5 100644 --- a/openml/tasks/functions.py +++ b/openml/tasks/functions.py @@ -176,7 +176,6 @@ def list_tasks( # noqa: PLR0913 """ listing_call = partial( _list_tasks, - size=size, task_type=task_type, tag=tag, data_tag=data_tag, @@ -222,7 +221,11 @@ def _list_tasks( ------- dataframe """ - api_call = f"task/list/limit/{limit}/offset/{offset}" + api_call = "task/list" + if limit is not None: + api_call += f"/limit/{limit}" + if offset is not None: + api_call += f"/offset/{offset}" if task_type is not None: tvalue = task_type.value if isinstance(task_type, TaskType) else task_type api_call += f"/type/{tvalue}" diff --git a/openml/utils.py b/openml/utils.py index d06d626a1..cef9f9d49 100644 --- a/openml/utils.py +++ b/openml/utils.py @@ -271,11 +271,12 @@ def _list_all( # noqa: C901 page = 0 results: list[_SizedT] = [] - LIMIT = limit - BATCH_SIZE_ORIG = batch_size offset = offset if offset is not None else 0 batch_size = batch_size if batch_size is not None else 10_000 + LIMIT = limit + BATCH_SIZE_ORIG = batch_size + # Default batch size per paging. # This one can be set in filters (batch_size), but should not be # changed afterwards. The derived batch_size can be changed. 
diff --git a/tests/test_utils/test_utils.py b/tests/test_utils/test_utils.py index 2b451d49b..3b4a34b57 100644 --- a/tests/test_utils/test_utils.py +++ b/tests/test_utils/test_utils.py @@ -3,9 +3,7 @@ import os import unittest.mock import pytest -import shutil import openml -from itertools import chain from openml.testing import _check_dataset @@ -35,7 +33,7 @@ def min_number_setups_on_test_server() -> int: @pytest.fixture() def min_number_runs_on_test_server() -> int: - """After a reset at least 50 runs are on the test server""" + """After a reset at least 21 runs are on the test server""" return 21 @@ -71,14 +69,12 @@ def test_list_all_with_multiple_batches(min_number_tasks_on_test_server): batch_size=batch_size, ) assert len(batches) >= 2 - flattened = list(chain(*batches)) - assert min_number_tasks_on_test_server <= len(flattened) + assert min_number_tasks_on_test_server <= sum(len(batch) for batch in batches) @pytest.mark.server() def test_list_all_for_datasets(min_number_datasets_on_test_server): datasets = openml.datasets.list_datasets( - batch_size=100, size=min_number_datasets_on_test_server, ) @@ -104,7 +100,7 @@ def test_list_all_for_setups(min_number_setups_on_test_server): @pytest.mark.server() @pytest.mark.flaky() # Other tests might need to upload runs first def test_list_all_for_runs(min_number_runs_on_test_server): - runs = openml.runs.list_runs(batch_size=25, size=min_number_runs_on_test_server) + runs = openml.runs.list_runs(size=min_number_runs_on_test_server) assert min_number_runs_on_test_server == len(runs) @@ -155,4 +151,4 @@ def test_correct_test_server_download_state(): """ task = openml.tasks.get_task(119) dataset = task.get_dataset() - assert len(dataset.features) == dataset.get_data()[0].shape[1] \ No newline at end of file + assert len(dataset.features) == dataset.get_data()[0].shape[1] From 6517f6abef063b2ec0190f5074b35a4a2d664926 Mon Sep 17 00:00:00 2001 From: LennartPurucker Date: Mon, 16 Jun 2025 16:05:41 +0200 Subject: [PATCH 11/23] fix/maint: update and fix tests for new dataframe default --- .../test_sklearn_extension.py | 41 ++++++++------- tests/test_runs/test_run.py | 32 ++++++++++-- tests/test_runs/test_run_functions.py | 51 +++++++++++++------ tests/test_tasks/test_classification_task.py | 8 +-- tests/test_tasks/test_learning_curve_task.py | 8 +-- tests/test_tasks/test_regression_task.py | 8 +-- tests/test_tasks/test_supervised_task.py | 4 +- 7 files changed, 101 insertions(+), 51 deletions(-) diff --git a/tests/test_extensions/test_sklearn_extension/test_sklearn_extension.py b/tests/test_extensions/test_sklearn_extension/test_sklearn_extension.py index dec8d9e00..d2aa3d47d 100644 --- a/tests/test_extensions/test_sklearn_extension/test_sklearn_extension.py +++ b/tests/test_extensions/test_sklearn_extension/test_sklearn_extension.py @@ -61,27 +61,31 @@ def __init__(self, boolean, integer, floating_point_value): def fit(self, X, y): pass + +def _cat_col_selector(X): + return X.select_dtypes(include=["object", "category"]).columns + + def _get_sklearn_preprocessing(): - from sklearn.compose import ColumnTransformer, make_column_selector + from sklearn.compose import ColumnTransformer from sklearn.preprocessing import OrdinalEncoder return [ - ( - "cat_handling", - ColumnTransformer( - transformers=[ - ( - "cat", - OrdinalEncoder( - handle_unknown="use_encoded_value", unknown_value=np.nan - ), - make_column_selector(dtype_include=["object", "category"]), - ) - ], - remainder="passthrough", - ), + ( + "cat_handling", + ColumnTransformer( + 
transformers=[ + ( + "cat", + OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=np.nan), + _cat_col_selector, + ) + ], + remainder="passthrough", ), - ("imp", SimpleImputer())] + ), + ("imp", SimpleImputer()), + ] class TestSklearnExtensionFlowFunctions(TestBase): @@ -1904,7 +1908,10 @@ def test_run_model_on_fold_classification_2(self): pipeline = sklearn.model_selection.GridSearchCV( sklearn.pipeline.Pipeline( - steps=[*_get_sklearn_preprocessing(), ("clf", sklearn.tree.DecisionTreeClassifier())], + steps=[ + *_get_sklearn_preprocessing(), + ("clf", sklearn.tree.DecisionTreeClassifier()), + ], ), {"clf__max_depth": [1, 2]}, ) diff --git a/tests/test_runs/test_run.py b/tests/test_runs/test_run.py index 6769b2140..9e2173007 100644 --- a/tests/test_runs/test_run.py +++ b/tests/test_runs/test_run.py @@ -204,17 +204,42 @@ def test_to_from_filesystem_no_model(self): with self.assertRaises(ValueError, msg="Could not find model.pkl"): openml.runs.OpenMLRun.from_filesystem(cache_path) + @staticmethod + def _cat_col_selector(X): + return X.select_dtypes(include=["object", "category"]).columns + @staticmethod def _get_models_tasks_for_tests(): + from sklearn.compose import ColumnTransformer + from sklearn.preprocessing import OrdinalEncoder + + basic_preprocessing = [ + ( + "cat_handling", + ColumnTransformer( + transformers=[ + ( + "cat", + OrdinalEncoder( + handle_unknown="use_encoded_value", unknown_value=np.nan + ), + TestRun._cat_col_selector, + ) + ], + remainder="passthrough", + ), + ), + ("imp", SimpleImputer()), + ] model_clf = Pipeline( [ - ("imputer", SimpleImputer(strategy="mean")), + *basic_preprocessing, ("classifier", DummyClassifier(strategy="prior")), ], ) model_reg = Pipeline( [ - ("imputer", SimpleImputer(strategy="mean")), + *basic_preprocessing, ( "regressor", # LR because dummy does not produce enough float-like values @@ -263,9 +288,8 @@ def assert_run_prediction_data(task, run, model): assert_method = np.testing.assert_array_almost_equal if task.task_type == "Supervised Classification": - y_pred = np.take(task.class_labels, y_pred) - y_test = np.take(task.class_labels, y_test) assert_method = np.testing.assert_array_equal + y_test = y_test.values # Assert correctness assert_method(y_pred, saved_y_pred) diff --git a/tests/test_runs/test_run_functions.py b/tests/test_runs/test_run_functions.py index 77d783514..22bf3f8c2 100644 --- a/tests/test_runs/test_run_functions.py +++ b/tests/test_runs/test_run_functions.py @@ -26,9 +26,10 @@ from sklearn.model_selection._search import BaseSearchCV from sklearn.naive_bayes import GaussianNB from sklearn.pipeline import Pipeline, make_pipeline -from sklearn.preprocessing import OneHotEncoder, StandardScaler +from sklearn.preprocessing import OneHotEncoder, StandardScaler, OrdinalEncoder from sklearn.svm import SVC from sklearn.tree import DecisionTreeClassifier +from sklearn.compose import ColumnTransformer import openml import openml._api_calls @@ -130,9 +131,9 @@ def _wait_for_processed_run(self, run_id, max_waiting_time_seconds): time.sleep(10) continue - assert ( - len(run.evaluations) > 0 - ), "Expect not-None evaluations to always contain elements." + assert len(run.evaluations) > 0, ( + "Expect not-None evaluations to always contain elements." 
+ ) return raise RuntimeError( @@ -306,7 +307,7 @@ def _remove_random_state(flow): flow_server = self.extension.model_to_flow(clf_server) if flow.class_name not in classes_without_random_state: - error_msg = "Flow class %s (id=%d) does not have a random " "state parameter" % ( + error_msg = "Flow class %s (id=%d) does not have a random state parameter" % ( flow.class_name, flow.flow_id, ) @@ -479,7 +480,7 @@ def determine_grid_size(param_grid): grid_iterations += determine_grid_size(sub_grid) return grid_iterations else: - raise TypeError("Param Grid should be of type list " "(GridSearch only) or dict") + raise TypeError("Param Grid should be of type list (GridSearch only) or dict") run = self._perform_run( task_id, @@ -1286,7 +1287,7 @@ def test_run_with_illegal_flow_id_1(self): flow_new = self.extension.model_to_flow(clf) flow_new.flow_id = -1 - expected_message_regex = "Local flow_id does not match server flow_id: " "'-1' vs '[0-9]+'" + expected_message_regex = "Local flow_id does not match server flow_id: '-1' vs '[0-9]+'" with pytest.raises(openml.exceptions.PyOpenMLError, match=expected_message_regex): openml.runs.run_flow_on_task( task=task, @@ -1326,7 +1327,7 @@ def test_run_with_illegal_flow_id_1_after_load(self): run.to_filesystem(cache_path) loaded_run = openml.runs.OpenMLRun.from_filesystem(cache_path) - expected_message_regex = "Local flow_id does not match server flow_id: " "'-1' vs '[0-9]+'" + expected_message_regex = "Local flow_id does not match server flow_id: '-1' vs '[0-9]+'" self.assertRaisesRegex( openml.exceptions.PyOpenMLError, expected_message_regex, @@ -1827,14 +1828,33 @@ def test_joblib_backends(self, parallel_mock): (1, "sequential", 40), ]: clf = sklearn.model_selection.RandomizedSearchCV( - estimator=sklearn.ensemble.RandomForestClassifier(n_estimators=5), + estimator=sklearn.pipeline.Pipeline( + [ + ( + "cat_handling", + ColumnTransformer( + transformers=[ + ( + "cat", + OrdinalEncoder( + handle_unknown="use_encoded_value", unknown_value=-1 + ), + x.select_dtypes(include=["object", "category"]).columns, + ) + ], + remainder="passthrough", + ), + ), + ("clf", sklearn.ensemble.RandomForestClassifier(n_estimators=5)), + ] + ), param_distributions={ - "max_depth": [3, None], - "max_features": [1, 2, 3, 4], - "min_samples_split": [2, 3, 4, 5, 6, 7, 8, 9, 10], - "min_samples_leaf": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], - "bootstrap": [True, False], - "criterion": ["gini", "entropy"], + "clf__max_depth": [3, None], + "clf__max_features": [1, 2, 3, 4], + "clf__min_samples_split": [2, 3, 4, 5, 6, 7, 8, 9, 10], + "clf__min_samples_leaf": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], + "clf__bootstrap": [True, False], + "clf__criterion": ["gini", "entropy"], }, random_state=1, cv=sklearn.model_selection.StratifiedKFold( @@ -1851,7 +1871,6 @@ def test_joblib_backends(self, parallel_mock): model=clf, task=task, add_local_measures=True, - # dataset_format="array", # "dataframe" would require handling of categoricals n_jobs=n_jobs, ) assert type(res[0]) == list diff --git a/tests/test_tasks/test_classification_task.py b/tests/test_tasks/test_classification_task.py index 661e8eced..bb4545154 100644 --- a/tests/test_tasks/test_classification_task.py +++ b/tests/test_tasks/test_classification_task.py @@ -1,7 +1,7 @@ # License: BSD 3-Clause from __future__ import annotations -import numpy as np +import pandas as pd from openml.tasks import TaskType, get_task @@ -20,10 +20,10 @@ def setUp(self, n_levels: int = 1): def test_get_X_and_Y(self): X, Y = super().test_get_X_and_Y() assert X.shape == 
(768, 8) - assert isinstance(X, np.ndarray) + assert isinstance(X, pd.DataFrame) assert Y.shape == (768,) - assert isinstance(Y, np.ndarray) - assert Y.dtype == int + assert isinstance(Y, pd.Series) + assert pd.api.types.is_categorical_dtype(Y) def test_download_task(self): task = super().test_download_task() diff --git a/tests/test_tasks/test_learning_curve_task.py b/tests/test_tasks/test_learning_curve_task.py index 0e781c8ff..885f80a27 100644 --- a/tests/test_tasks/test_learning_curve_task.py +++ b/tests/test_tasks/test_learning_curve_task.py @@ -1,7 +1,7 @@ # License: BSD 3-Clause from __future__ import annotations -import numpy as np +import pandas as pd from openml.tasks import TaskType, get_task @@ -20,10 +20,10 @@ def setUp(self, n_levels: int = 1): def test_get_X_and_Y(self): X, Y = super().test_get_X_and_Y() assert X.shape == (768, 8) - assert isinstance(X, np.ndarray) + assert isinstance(X, pd.DataFrame) assert Y.shape == (768,) - assert isinstance(Y, np.ndarray) - assert Y.dtype == int + assert isinstance(Y, pd.Series) + assert pd.api.types.is_categorical_dtype(Y) def test_download_task(self): task = super().test_download_task() diff --git a/tests/test_tasks/test_regression_task.py b/tests/test_tasks/test_regression_task.py index 29a8254df..36decc534 100644 --- a/tests/test_tasks/test_regression_task.py +++ b/tests/test_tasks/test_regression_task.py @@ -3,7 +3,7 @@ import ast -import numpy as np +import pandas as pd import openml from openml.exceptions import OpenMLServerException @@ -51,10 +51,10 @@ def setUp(self, n_levels: int = 1): def test_get_X_and_Y(self): X, Y = super().test_get_X_and_Y() assert X.shape == (194, 32) - assert isinstance(X, np.ndarray) + assert isinstance(X, pd.DataFrame) assert Y.shape == (194,) - assert isinstance(Y, np.ndarray) - assert Y.dtype == float + assert isinstance(Y, pd.Series) + assert pd.api.types.is_numeric_dtype(Y) def test_download_task(self): task = super().test_download_task() diff --git a/tests/test_tasks/test_supervised_task.py b/tests/test_tasks/test_supervised_task.py index 00ce1f276..9c90b7e03 100644 --- a/tests/test_tasks/test_supervised_task.py +++ b/tests/test_tasks/test_supervised_task.py @@ -3,7 +3,7 @@ import unittest -import numpy as np +import pandas as pd from openml.tasks import get_task @@ -27,7 +27,7 @@ def setUpClass(cls): def setUp(self, n_levels: int = 1): super().setUp() - def test_get_X_and_Y(self) -> tuple[np.ndarray, np.ndarray]: + def test_get_X_and_Y(self) -> tuple[pd.DataFrame, pd.Series]: task = get_task(self.task_id) X, Y = task.get_X_and_y() return X, Y From 466022e38a67f9d7978faf293a03311b5bb127a6 Mon Sep 17 00:00:00 2001 From: LennartPurucker Date: Mon, 16 Jun 2025 16:55:28 +0200 Subject: [PATCH 12/23] fix/maint: resolve tests that used old default format --- openml/extensions/sklearn/extension.py | 29 ++++++++++++----- tests/conftest.py | 7 ++++- tests/test_runs/test_run_functions.py | 40 +++++++++++++++++------- tests/test_study/test_study_functions.py | 13 ++++---- 4 files changed, 63 insertions(+), 26 deletions(-) diff --git a/openml/extensions/sklearn/extension.py b/openml/extensions/sklearn/extension.py index 2d40d03b8..fc8697e84 100644 --- a/openml/extensions/sklearn/extension.py +++ b/openml/extensions/sklearn/extension.py @@ -1144,7 +1144,7 @@ def _get_fn_arguments_with_defaults(self, fn_name: Callable) -> tuple[dict, set] optional_params[param] = default_val return optional_params, required_params - def _deserialize_model( + def _deserialize_model( # noqa: C901 self, flow: OpenMLFlow, 
keep_defaults: bool, # noqa: FBT001 @@ -1219,6 +1219,20 @@ def _deserialize_model( if param not in components: del parameter_dict[param] + if not strict_version: + # Ignore incompatible parameters + allowed_parameter = list(inspect.signature(model_class.__init__).parameters) + for p in list(parameter_dict.keys()): + if p not in allowed_parameter: + warnings.warn( + f"While deserializing in a non-strict way, parameter {p} is not " + f"allowed for {model_class.__name__} likely due to a version mismatch. " + "We ignore the parameter.", + UserWarning, + stacklevel=2, + ) + del parameter_dict[p] + return model_class(**parameter_dict) def _check_dependencies( @@ -1254,8 +1268,7 @@ def _check_dependencies( else: raise NotImplementedError(f"operation '{operation}' is not supported") message = ( - "Trying to deserialize a model with dependency " - f"{dependency_string} not satisfied." + f"Trying to deserialize a model with dependency {dependency_string} not satisfied." ) if not check: if strict_version: @@ -1497,7 +1510,7 @@ def _prevent_optimize_n_jobs(self, model): ) if len(n_jobs_vals) > 0: raise PyOpenMLError( - "openml-python should not be used to " "optimize the n_jobs parameter.", + "openml-python should not be used to optimize the n_jobs parameter.", ) ################################################################################################ @@ -1555,7 +1568,7 @@ def _seed_current_object(current_value): if current_value is not None: raise ValueError( - "Models should be seeded with int or None (this should never " "happen). ", + "Models should be seeded with int or None (this should never happen). ", ) return True @@ -1780,10 +1793,10 @@ def _prediction_to_probabilities( # to handle the case when dataset is numpy and categories are encoded # however the class labels stored in task are still categories if isinstance(y_train, np.ndarray) and isinstance( - cast(List, task.class_labels)[0], + cast("List", task.class_labels)[0], str, ): - model_classes = [cast(List[str], task.class_labels)[i] for i in model_classes] + model_classes = [cast("List[str]", task.class_labels)[i] for i in model_classes] modelpredict_start_cputime = time.process_time() modelpredict_start_walltime = time.time() @@ -2006,7 +2019,7 @@ def is_subcomponent_specification(values): # (mixed)). OpenML replaces the subcomponent by an # OpenMLFlow object. if len(subcomponent) < 2 or len(subcomponent) > 3: - raise ValueError("Component reference should be " "size {2,3}. ") + raise ValueError("Component reference should be size {2,3}. ") subcomponent_identifier = subcomponent[0] subcomponent_flow = subcomponent[1] diff --git a/tests/conftest.py b/tests/conftest.py index 79ee2bbd3..b523117c1 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -23,6 +23,10 @@ # License: BSD 3-Clause from __future__ import annotations +import multiprocessing + +multiprocessing.set_start_method("spawn", force=True) + from collections.abc import Iterator import logging import os @@ -33,6 +37,7 @@ import openml from openml.testing import TestBase + # creating logger for unit test file deletion status logger = logging.getLogger("unit_tests") logger.setLevel(logging.DEBUG) @@ -170,7 +175,7 @@ def pytest_sessionfinish() -> None: # Delete any test dirs that remain # In edge cases due to a mixture of pytest parametrization and oslo concurrency, # some file lock are created after leaving the test. This removes these files! 
- test_files_dir=Path(__file__).parent.parent / "openml" + test_files_dir = Path(__file__).parent.parent / "openml" for f in test_files_dir.glob("tests.*"): if f.is_dir(): shutil.rmtree(f) diff --git a/tests/test_runs/test_run_functions.py b/tests/test_runs/test_run_functions.py index 22bf3f8c2..c529b6c2b 100644 --- a/tests/test_runs/test_run_functions.py +++ b/tests/test_runs/test_run_functions.py @@ -272,7 +272,7 @@ def _remove_random_state(flow): task = openml.tasks.get_task(task_id) X, y = task.get_X_and_y() - assert np.count_nonzero(np.isnan(X)) == n_missing_vals + assert X.isna().sum().sum() == n_missing_vals run = openml.runs.run_flow_on_task( flow=flow, task=task, @@ -401,7 +401,7 @@ def _check_sample_evaluations( @pytest.mark.sklearn() def test_run_regression_on_classif_task(self): - task_id = 115 # diabetes; crossvalidation + task_id = 259 # collins; crossvalidation; has numeric targets clf = LinearRegression() task = openml.tasks.get_task(task_id) @@ -1758,7 +1758,26 @@ def test__run_task_get_arffcontent_2(self, parallel_mock): num_instances = x.shape[0] line_length = 6 + len(task.class_labels) loss = "log" if Version(sklearn.__version__) < Version("1.3") else "log_loss" - clf = SGDClassifier(loss=loss, random_state=1) + clf = sklearn.pipeline.Pipeline( + [ + ( + "cat_handling", + ColumnTransformer( + transformers=[ + ( + "cat", + OrdinalEncoder( + handle_unknown="use_encoded_value", unknown_value=-1 + ), + x.select_dtypes(include=["object", "category"]).columns, + ) + ], + remainder="passthrough", + ), + ), + ("clf", SGDClassifier(loss=loss, random_state=1)), + ] + ) n_jobs = 2 backend = "loky" if Version(joblib.__version__) > Version("0.11") else "multiprocessing" with parallel_backend(backend, n_jobs=n_jobs): @@ -1767,7 +1786,6 @@ def test__run_task_get_arffcontent_2(self, parallel_mock): model=clf, task=task, add_local_measures=True, - # dataset_format="array", # "dataframe" would require handling of categoricals n_jobs=n_jobs, ) # This unit test will fail if joblib is unable to distribute successfully since the @@ -1784,16 +1802,16 @@ def test__run_task_get_arffcontent_2(self, parallel_mock): assert len(res[2]) == 7 assert len(res[3]) == 7 expected_scores = [ - 0.965625, 0.94375, - 0.946875, - 0.953125, + 0.95625, + 0.959375, + 0.96875, + 0.96875, 0.96875, - 0.965625, - 0.9435736677115988, 0.9467084639498433, - 0.9749216300940439, - 0.9655172413793104, + 0.9373040752351097, + 0.9561128526645768, + 0.9467084639498433 ] scores = [v for k, v in res[2]["predictive_accuracy"][0].items()] np.testing.assert_array_almost_equal( diff --git a/tests/test_study/test_study_functions.py b/tests/test_study/test_study_functions.py index d13f770c8..8652d5547 100644 --- a/tests/test_study/test_study_functions.py +++ b/tests/test_study/test_study_functions.py @@ -183,20 +183,21 @@ def test_publish_study(self): self.assertSetEqual(set(study_downloaded.tasks), set(fixt_task_ids)) # test whether the list run function also handles study data fine - run_ids = openml.runs.list_runs(study=study.id) - self.assertSetEqual(set(run_ids), set(study_downloaded.runs)) + run_ids = openml.runs.list_runs(study=study.id) # returns DF + self.assertSetEqual(set(run_ids["run_id"]), set(study_downloaded.runs)) # test whether the list evaluation function also handles study data fine - run_ids = openml.evaluations.list_evaluations( + run_ids = openml.evaluations.list_evaluations( # returns list of objects "predictive_accuracy", size=None, study=study.id, + output_format="object", # making the default explicit ) 
self.assertSetEqual(set(run_ids), set(study_downloaded.runs)) # attach more runs, since we fetch 11 here, at least one is non-overlapping run_list_additional = openml.runs.list_runs(size=11, offset=10) - run_list_additional = set(run_list_additional) - set(run_ids) + run_list_additional = set(run_list_additional["run_id"]) - set(run_ids) openml.study.attach_to_study(study.id, list(run_list_additional)) study_downloaded = openml.study.get_study(study.id) # verify again @@ -227,7 +228,7 @@ def test_study_attach_illegal(self): benchmark_suite=None, name="study with illegal runs", description="none", - run_ids=list(run_list.keys()), + run_ids=list(run_list["run_id"]), ) study.publish() TestBase._mark_entity_for_removal("study", study.id) @@ -246,7 +247,7 @@ def test_study_attach_illegal(self): match="Problem attaching entities.", ): # some runs already attached - openml.study.attach_to_study(study.id, list(run_list_more.keys())) + openml.study.attach_to_study(study.id, list(run_list_more["run_id"])) study_downloaded = openml.study.get_study(study.id) self.assertListEqual(study_original.runs, study_downloaded.runs) From bd120f5df435e72f5fd892bf461147263df20e60 Mon Sep 17 00:00:00 2001 From: LennartPurucker Date: Mon, 16 Jun 2025 17:26:21 +0200 Subject: [PATCH 13/23] fix: remove OrdinalEncoder --- .../test_sklearn_extension/test_sklearn_extension.py | 3 +-- tests/test_runs/test_run.py | 6 ++---- tests/test_runs/test_run_functions.py | 10 +++------- 3 files changed, 6 insertions(+), 13 deletions(-) diff --git a/tests/test_extensions/test_sklearn_extension/test_sklearn_extension.py b/tests/test_extensions/test_sklearn_extension/test_sklearn_extension.py index d2aa3d47d..7dfc1b404 100644 --- a/tests/test_extensions/test_sklearn_extension/test_sklearn_extension.py +++ b/tests/test_extensions/test_sklearn_extension/test_sklearn_extension.py @@ -68,7 +68,6 @@ def _cat_col_selector(X): def _get_sklearn_preprocessing(): from sklearn.compose import ColumnTransformer - from sklearn.preprocessing import OrdinalEncoder return [ ( @@ -77,7 +76,7 @@ def _get_sklearn_preprocessing(): transformers=[ ( "cat", - OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=np.nan), + OneHotEncoder(handle_unknown="ignore"), _cat_col_selector, ) ], diff --git a/tests/test_runs/test_run.py b/tests/test_runs/test_run.py index 9e2173007..58a0dddf5 100644 --- a/tests/test_runs/test_run.py +++ b/tests/test_runs/test_run.py @@ -211,7 +211,7 @@ def _cat_col_selector(X): @staticmethod def _get_models_tasks_for_tests(): from sklearn.compose import ColumnTransformer - from sklearn.preprocessing import OrdinalEncoder + from sklearn.preprocessing import OneHotEncoder basic_preprocessing = [ ( @@ -220,9 +220,7 @@ def _get_models_tasks_for_tests(): transformers=[ ( "cat", - OrdinalEncoder( - handle_unknown="use_encoded_value", unknown_value=np.nan - ), + OneHotEncoder(handle_unknown="ignore"), TestRun._cat_col_selector, ) ], diff --git a/tests/test_runs/test_run_functions.py b/tests/test_runs/test_run_functions.py index c529b6c2b..1562623d5 100644 --- a/tests/test_runs/test_run_functions.py +++ b/tests/test_runs/test_run_functions.py @@ -26,7 +26,7 @@ from sklearn.model_selection._search import BaseSearchCV from sklearn.naive_bayes import GaussianNB from sklearn.pipeline import Pipeline, make_pipeline -from sklearn.preprocessing import OneHotEncoder, StandardScaler, OrdinalEncoder +from sklearn.preprocessing import OneHotEncoder, StandardScaler from sklearn.svm import SVC from sklearn.tree import DecisionTreeClassifier from 
sklearn.compose import ColumnTransformer @@ -1766,9 +1766,7 @@ def test__run_task_get_arffcontent_2(self, parallel_mock): transformers=[ ( "cat", - OrdinalEncoder( - handle_unknown="use_encoded_value", unknown_value=-1 - ), + OneHotEncoder(handle_unknown="ignore"), x.select_dtypes(include=["object", "category"]).columns, ) ], @@ -1854,9 +1852,7 @@ def test_joblib_backends(self, parallel_mock): transformers=[ ( "cat", - OrdinalEncoder( - handle_unknown="use_encoded_value", unknown_value=-1 - ), + OneHotEncoder(handle_unknown="ignore"), x.select_dtypes(include=["object", "category"]).columns, ) ], From de597b54a2d381e748c381b9b986e6b362f4d1c9 Mon Sep 17 00:00:00 2001 From: LennartPurucker Date: Mon, 16 Jun 2025 17:32:14 +0200 Subject: [PATCH 14/23] fix: update test to new assert with onehot --- tests/test_runs/test_run_functions.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/tests/test_runs/test_run_functions.py b/tests/test_runs/test_run_functions.py index 1562623d5..7235075c0 100644 --- a/tests/test_runs/test_run_functions.py +++ b/tests/test_runs/test_run_functions.py @@ -1800,16 +1800,16 @@ def test__run_task_get_arffcontent_2(self, parallel_mock): assert len(res[2]) == 7 assert len(res[3]) == 7 expected_scores = [ - 0.94375, - 0.95625, - 0.959375, - 0.96875, - 0.96875, - 0.96875, - 0.9467084639498433, - 0.9373040752351097, - 0.9561128526645768, - 0.9467084639498433 + 0.9625, + 0.953125, + 0.965625, + 0.9125, + 0.98125, + 0.975, + 0.9247648902821317, + 0.9404388714733543, + 0.9780564263322884, + 0.9623824451410659, ] scores = [v for k, v in res[2]["predictive_accuracy"][0].items()] np.testing.assert_array_almost_equal( From 32e6fbf55def28e14e8538e8a054d4423d9d5a9c Mon Sep 17 00:00:00 2001 From: LennartPurucker Date: Mon, 16 Jun 2025 17:52:36 +0200 Subject: [PATCH 15/23] fix/maint: update examples --- examples/30_extended/study_tutorial.py | 1 + examples/40_paper/2018_kdd_rijn_example.py | 3 ++- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/examples/30_extended/study_tutorial.py b/examples/30_extended/study_tutorial.py index d1f6ccbd0..c0874b944 100644 --- a/examples/30_extended/study_tutorial.py +++ b/examples/30_extended/study_tutorial.py @@ -48,6 +48,7 @@ evaluations = openml.evaluations.list_evaluations( function="predictive_accuracy", study=study.study_id, + output_format="dataframe", ) print(evaluations.head()) diff --git a/examples/40_paper/2018_kdd_rijn_example.py b/examples/40_paper/2018_kdd_rijn_example.py index f98f5de55..751f53470 100644 --- a/examples/40_paper/2018_kdd_rijn_example.py +++ b/examples/40_paper/2018_kdd_rijn_example.py @@ -1,5 +1,6 @@ """ -This example is deprecated! You will need to manually remove checks in this code to make it run. +This example is deprecated! You will need to manually remove adapt this code to make it run. +We deprecated this example in our CI as it requires fanova as a dependency. However, fanova is not supported in all Python versions used in our CI/CD. 
van Rijn and Hutter (2018) ========================== From bae06ca55408ebf1ed297015325e67a8f0dd1d7a Mon Sep 17 00:00:00 2001 From: LennartPurucker Date: Mon, 16 Jun 2025 17:58:29 +0200 Subject: [PATCH 16/23] fix: example revert --- examples/30_extended/fetch_evaluations_tutorial.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/examples/30_extended/fetch_evaluations_tutorial.py b/examples/30_extended/fetch_evaluations_tutorial.py index 4ef2d3e5f..6c8a88ec8 100644 --- a/examples/30_extended/fetch_evaluations_tutorial.py +++ b/examples/30_extended/fetch_evaluations_tutorial.py @@ -60,7 +60,9 @@ # Note that we now filter the evaluations based on another parameter 'task'. metric = "predictive_accuracy" -evals = openml.evaluations.list_evaluations(function=metric, tasks=[task_id]) +evals = openml.evaluations.list_evaluations( + function=metric, tasks=[task_id], output_format="dataframe" +) # Displaying the first 10 rows print(evals.head(n=10)) # Sorting the evaluations in decreasing order of the metric chosen @@ -158,7 +160,10 @@ def plot_flow_compare(evaluations, top_n=10, metric="predictive_accuracy"): # List evaluations in descending order based on predictive_accuracy with # hyperparameters evals_setups = openml.evaluations.list_evaluations_setups( - function="predictive_accuracy", tasks=[31], size=100, sort_order="desc" + function="predictive_accuracy", + tasks=[31], + size=100, + sort_order="desc", ) "" From 78b1888cd17b00a68082be316bb667f1dbf35be6 Mon Sep 17 00:00:00 2001 From: LennartPurucker Date: Mon, 16 Jun 2025 18:13:27 +0200 Subject: [PATCH 17/23] fix: add impute for tests to work with older sklearn version --- .../test_sklearn_extension.py | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/tests/test_extensions/test_sklearn_extension/test_sklearn_extension.py b/tests/test_extensions/test_sklearn_extension/test_sklearn_extension.py index 7dfc1b404..706a67aa6 100644 --- a/tests/test_extensions/test_sklearn_extension/test_sklearn_extension.py +++ b/tests/test_extensions/test_sklearn_extension/test_sklearn_extension.py @@ -76,7 +76,18 @@ def _get_sklearn_preprocessing(): transformers=[ ( "cat", - OneHotEncoder(handle_unknown="ignore"), + sklearn.pipeline.Pipeline( + [ + ( + "cat_si", + SimpleImputer( + strategy="constant", + fill_value="missing", + ), + ), + ("cat_ohe", OneHotEncoder(handle_unknown="ignore")), + ], + ), _cat_col_selector, ) ], From 22b6b526f2b6569cdb199a8c4c04bfacf4aef6b1 Mon Sep 17 00:00:00 2001 From: LennartPurucker Date: Mon, 16 Jun 2025 18:24:48 +0200 Subject: [PATCH 18/23] fix: make examples work --- examples/40_paper/2018_ida_strang_example.py | 1 + examples/40_paper/2018_neurips_perrone_example.py | 14 ++++++++++++-- 2 files changed, 13 insertions(+), 2 deletions(-) diff --git a/examples/40_paper/2018_ida_strang_example.py b/examples/40_paper/2018_ida_strang_example.py index 8b116507c..d9fdc78a7 100644 --- a/examples/40_paper/2018_ida_strang_example.py +++ b/examples/40_paper/2018_ida_strang_example.py @@ -51,6 +51,7 @@ size=None, flows=flow_ids, study=study_id, + output_format="dataframe", ) # gives us a table with columns data_id, flow1_value, flow2_value evaluations = evaluations.pivot(index="data_id", columns="flow_id", values="value").dropna() diff --git a/examples/40_paper/2018_neurips_perrone_example.py b/examples/40_paper/2018_neurips_perrone_example.py index 7c29ede8d..91768e010 100644 --- a/examples/40_paper/2018_neurips_perrone_example.py +++ b/examples/40_paper/2018_neurips_perrone_example.py 
@@ -181,8 +181,18 @@ def list_categorical_attributes(flow_type="svm"): num_imputer = SimpleImputer(missing_values=np.nan, strategy="constant", fill_value=-1) # Creating the one-hot encoder for numerical representation of categorical columns -enc = OneHotEncoder(handle_unknown="ignore") - +enc = Pipeline( + [ + ( + "cat_si", + SimpleImputer( + strategy="constant", + fill_value="missing", + ), + ), + ("cat_ohe", OneHotEncoder(handle_unknown="ignore")), + ], +) # Combining column transformers ct = ColumnTransformer([("cat", enc, cat_cols), ("num", num_imputer, num_cols)]) From 232b37c3561e4ccfda2da3c9738e6c16e051dbc3 Mon Sep 17 00:00:00 2001 From: Lennart Purucker Date: Tue, 17 Jun 2025 15:03:22 +0200 Subject: [PATCH 19/23] Update openml/utils.py Co-authored-by: Pieter Gijsbers --- openml/utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/openml/utils.py b/openml/utils.py index cef9f9d49..e0580d236 100644 --- a/openml/utils.py +++ b/openml/utils.py @@ -316,8 +316,8 @@ def _list_all( # noqa: C901 # check if the number of required results has been achieved # always do a 'bigger than' check, # in case of bugs to prevent infinite loops - n_recieved = sum(len(result) for result in results) - if n_recieved >= LIMIT: + n_received = sum(len(result) for result in results) + if n_received >= LIMIT: break # check if there are enough results to fulfill a batch From f14fce61345e27604d3fb990b78fd9416c5eae77 Mon Sep 17 00:00:00 2001 From: Lennart Purucker Date: Tue, 17 Jun 2025 15:03:33 +0200 Subject: [PATCH 20/23] Update openml/utils.py Co-authored-by: Pieter Gijsbers --- openml/utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/openml/utils.py b/openml/utils.py index e0580d236..7e72e7aee 100644 --- a/openml/utils.py +++ b/openml/utils.py @@ -321,8 +321,8 @@ def _list_all( # noqa: C901 break # check if there are enough results to fulfill a batch - if LIMIT - n_recieved < BATCH_SIZE_ORIG: - batch_size = LIMIT - n_recieved + if LIMIT - n_received < BATCH_SIZE_ORIG: + batch_size = LIMIT - n_received return results From 7fb5eb2efe96034f7ff133133b2d762398c40a11 Mon Sep 17 00:00:00 2001 From: Lennart Purucker Date: Tue, 17 Jun 2025 15:03:51 +0200 Subject: [PATCH 21/23] Update openml/setups/setup.py Co-authored-by: Pieter Gijsbers --- openml/setups/setup.py | 1 - 1 file changed, 1 deletion(-) diff --git a/openml/setups/setup.py b/openml/setups/setup.py index a98e832c0..d7d1f8915 100644 --- a/openml/setups/setup.py +++ b/openml/setups/setup.py @@ -7,7 +7,6 @@ import openml.flows -# TODO(eddiebergman): All functionality handled by dataclass class OpenMLSetup: """Setup object (a.k.a. Configuration). From f45530f987dc736671df1332964aa9bcca8044b5 Mon Sep 17 00:00:00 2001 From: Lennart Purucker Date: Tue, 17 Jun 2025 15:03:57 +0200 Subject: [PATCH 22/23] Update openml/setups/setup.py Co-authored-by: Pieter Gijsbers --- openml/setups/setup.py | 1 - 1 file changed, 1 deletion(-) diff --git a/openml/setups/setup.py b/openml/setups/setup.py index d7d1f8915..c3d8149e7 100644 --- a/openml/setups/setup.py +++ b/openml/setups/setup.py @@ -66,7 +66,6 @@ def __repr__(self) -> str: return header + body -# TODO(eddiebergman): All functionality handled by dataclass class OpenMLParameter: """Parameter object (used in setup). 
From 7fb31cea7a991e35843a285d7730ff6136fde399 Mon Sep 17 00:00:00 2001 From: LennartPurucker Date: Tue, 17 Jun 2025 15:05:18 +0200 Subject: [PATCH 23/23] remove comment we do not understand --- tests/test_datasets/test_dataset_functions.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/test_datasets/test_dataset_functions.py b/tests/test_datasets/test_dataset_functions.py index b0c5f891e..fb29009a3 100644 --- a/tests/test_datasets/test_dataset_functions.py +++ b/tests/test_datasets/test_dataset_functions.py @@ -124,7 +124,6 @@ def test_list_datasets_paginate(self): datasets = openml.datasets.list_datasets(offset=i, size=size) assert len(datasets) == size assert len(datasets.columns) >= 2 - # Maybe index? assert "did" in datasets.columns assert datasets["did"].dtype == int assert "status" in datasets.columns
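
A minimal usage sketch of the client API this patch series converges on, assuming the full series is applied: listing calls return pandas DataFrames, task data comes back as a DataFrame/Series pair, and scikit-learn pipelines handle categorical columns explicitly. The task id and the selector helper below are illustrative only and are not part of any patch above.

import openml
import sklearn.pipeline
import sklearn.tree
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

# Listing now always returns a DataFrame; paging is controlled via size/offset.
datasets = openml.datasets.list_datasets(size=100)
print(datasets.head())

# Task data is pandas-only, so splits use .iloc.
task = openml.tasks.get_task(31)  # illustrative task id
X, y = task.get_X_and_y()
train_idx, test_idx = task.get_train_test_split_indices(repeat=0, fold=0, sample=0)
X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]


def _cat_cols(frame):
    # Named selector mirroring the pattern used in the test suite above.
    return frame.select_dtypes(include=["object", "category"]).columns


clf = sklearn.pipeline.Pipeline(
    steps=[
        (
            "cat_handling",
            ColumnTransformer(
                transformers=[("cat", OneHotEncoder(handle_unknown="ignore"), _cat_cols)],
                remainder="passthrough",
            ),
        ),
        ("imp", SimpleImputer()),
        ("clf", sklearn.tree.DecisionTreeClassifier()),
    ],
)
clf.fit(X_train, y_train)

This mirrors the _get_sklearn_preprocessing helper added to the test suite, with the imputation step kept after the column transformer as in those tests.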