From e7b825ea0925268921b8a5715c29c5beb67c36e2 Mon Sep 17 00:00:00 2001
From: PGijsbers
Date: Thu, 19 Jun 2025 11:49:59 +0200
Subject: [PATCH 1/8] Delete the extension

---
 openml/extensions/sklearn/__init__.py  |   43 -
 openml/extensions/sklearn/extension.py | 2270 ------------------------
 2 files changed, 2313 deletions(-)
 delete mode 100644 openml/extensions/sklearn/__init__.py
 delete mode 100644 openml/extensions/sklearn/extension.py

diff --git a/openml/extensions/sklearn/__init__.py b/openml/extensions/sklearn/__init__.py
deleted file mode 100644
index 9c1c6cba6..000000000
--- a/openml/extensions/sklearn/__init__.py
+++ /dev/null
@@ -1,43 +0,0 @@
-# License: BSD 3-Clause
-from __future__ import annotations
-
-from typing import TYPE_CHECKING
-
-from openml.extensions import register_extension
-
-from .extension import SklearnExtension
-
-if TYPE_CHECKING:
-    import pandas as pd
-
-__all__ = ["SklearnExtension"]
-
-register_extension(SklearnExtension)
-
-
-def cont(X: pd.DataFrame) -> pd.Series:
-    """Returns True for all non-categorical columns, False for the rest.
-
-    This is a helper function for OpenML datasets encoded as DataFrames simplifying the handling
-    of mixed data types. To build sklearn models on mixed data types, a ColumnTransformer is
-    required to process each type of columns separately.
-    This function allows transformations meant for continuous/numeric columns to access the
-    continuous/numeric columns given the dataset as DataFrame.
-
-
-def cat(X: pd.DataFrame) -> pd.Series:
-    """Returns True for all categorical columns, False for the rest.
-
-    This is a helper function for OpenML datasets encoded as DataFrames simplifying the handling
-    of mixed data types. To build sklearn models on mixed data types, a ColumnTransformer is
-    required to process each type of columns separately.
-    This function allows transformations meant for categorical columns to access the
-    categorical columns given the dataset as DataFrame.
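[Editor's note: a minimal usage sketch of the two helpers deleted above, as they worked before removal. The toy DataFrame and pipeline are illustrative only; ColumnTransformer accepts callables such as cont/cat as column selectors.]

import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.tree import DecisionTreeClassifier

from openml.extensions.sklearn import cat, cont  # pre-deletion import path

X = pd.DataFrame({"age": [23.0, 45.0], "job": pd.Categorical(["a", "b"])})
preprocess = ColumnTransformer([
    ("num", StandardScaler(), cont),  # cont(X) -> boolean mask of numeric columns
    ("nom", OneHotEncoder(), cat),    # cat(X) -> boolean mask of categorical columns
])
clf = make_pipeline(preprocess, DecisionTreeClassifier())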
- """ - if not hasattr(X, "dtypes"): - raise AttributeError("Not a Pandas DataFrame with 'dtypes' as attribute!") - return X.dtypes == "category" diff --git a/openml/extensions/sklearn/extension.py b/openml/extensions/sklearn/extension.py deleted file mode 100644 index 0c7588cdd..000000000 --- a/openml/extensions/sklearn/extension.py +++ /dev/null @@ -1,2270 +0,0 @@ -# License: BSD 3-Clause -from __future__ import annotations - -import contextlib -import copy -import importlib -import inspect -import json -import logging -import re -import sys -import time -import traceback -import warnings -from collections import OrderedDict -from json.decoder import JSONDecodeError -from re import IGNORECASE -from typing import Any, Callable, List, Sized, cast - -import numpy as np -import pandas as pd -import scipy.sparse -import scipy.stats -import sklearn.base -import sklearn.model_selection -import sklearn.pipeline -from packaging.version import Version - -import openml -from openml.exceptions import PyOpenMLError -from openml.extensions import Extension -from openml.flows import OpenMLFlow -from openml.runs.trace import PREFIX, OpenMLRunTrace, OpenMLTraceIteration -from openml.tasks import ( - OpenMLClassificationTask, - OpenMLClusteringTask, - OpenMLLearningCurveTask, - OpenMLRegressionTask, - OpenMLSupervisedTask, - OpenMLTask, -) - -logger = logging.getLogger(__name__) - - -DEPENDENCIES_PATTERN = re.compile( - r"^(?P[\w\-]+)((?P==|>=|>)" - r"(?P(\d+\.)?(\d+\.)?(\d+)?(dev)?[0-9]*))?$", -) - -# NOTE(eddiebergman): This was imported before but became deprecated, -# as a result I just enumerated them manually by copy-ing and pasting, -# recommended solution in Numpy 2.0 guide was to explicitly list them. -SIMPLE_NUMPY_TYPES = [ - np.int8, - np.int16, - np.int32, - np.int64, - np.longlong, - np.uint8, - np.uint16, - np.uint32, - np.uint64, - np.ulonglong, - np.float16, - np.float32, - np.float64, - np.longdouble, - np.complex64, - np.complex128, - np.clongdouble, -] -SIMPLE_TYPES = (bool, int, float, str, *SIMPLE_NUMPY_TYPES) - -SKLEARN_PIPELINE_STRING_COMPONENTS = ("drop", "passthrough") -COMPONENT_REFERENCE = "component_reference" -COMPOSITION_STEP_CONSTANT = "composition_step_constant" - - -class SklearnExtension(Extension): - """Connect scikit-learn to OpenML-Python. - The estimators which use this extension must be scikit-learn compatible, - i.e needs to be a subclass of sklearn.base.BaseEstimator". - """ - - ################################################################################################ - # General setup - - @classmethod - def can_handle_flow(cls, flow: OpenMLFlow) -> bool: - """Check whether a given describes a scikit-learn estimator. - - This is done by parsing the ``external_version`` field. - - Parameters - ---------- - flow : OpenMLFlow - - Returns - ------- - bool - """ - return cls._is_sklearn_flow(flow) - - @classmethod - def can_handle_model(cls, model: Any) -> bool: - """Check whether a model is an instance of ``sklearn.base.BaseEstimator``. - - Parameters - ---------- - model : Any - - Returns - ------- - bool - """ - return isinstance(model, sklearn.base.BaseEstimator) - - @classmethod - def trim_flow_name( # noqa: C901 - cls, - long_name: str, - extra_trim_length: int = 100, - _outer: bool = True, # noqa: FBT001, FBT002 - ) -> str: - """Shorten generated sklearn flow name to at most ``max_length`` characters. - - Flows are assumed to have the following naming structure: - ``(model_selection)? (pipeline)? 
(steps)+`` - and will be shortened to: - ``sklearn.(selection.)?(pipeline.)?(steps)+`` - e.g. (white spaces and newlines added for readability) - - .. code :: - - sklearn.pipeline.Pipeline( - columntransformer=sklearn.compose._column_transformer.ColumnTransformer( - numeric=sklearn.pipeline.Pipeline( - imputer=sklearn.preprocessing.imputation.Imputer, - standardscaler=sklearn.preprocessing.data.StandardScaler), - nominal=sklearn.pipeline.Pipeline( - simpleimputer=sklearn.impute.SimpleImputer, - onehotencoder=sklearn.preprocessing._encoders.OneHotEncoder)), - variancethreshold=sklearn.feature_selection.variance_threshold.VarianceThreshold, - svc=sklearn.svm.classes.SVC) - - -> - ``sklearn.Pipeline(ColumnTransformer,VarianceThreshold,SVC)`` - - Parameters - ---------- - long_name : str - The full flow name generated by the scikit-learn extension. - extra_trim_length: int (default=100) - If the trimmed name would exceed `extra_trim_length` characters, additional trimming - of the short name is performed. This reduces the produced short name length. - There is no guarantee the end result will not exceed `extra_trim_length`. - _outer : bool (default=True) - For internal use only. Specifies if the function is called recursively. - - Returns - ------- - str - - """ - - def remove_all_in_parentheses(string: str) -> str: - string, removals = re.subn(r"\([^()]*\)", "", string) - while removals > 0: - string, removals = re.subn(r"\([^()]*\)", "", string) - return string - - # Generally, we want to trim all hyperparameters, the exception to that is for model - # selection, as the `estimator` hyperparameter is very indicative of what is in the flow. - # So we first trim name of the `estimator` specified in mode selection. For reference, in - # the example below, we want to trim `sklearn.tree.tree.DecisionTreeClassifier`, and - # keep it in the final trimmed flow name: - # sklearn.pipeline.Pipeline(Imputer=sklearn.preprocessing.imputation.Imputer, - # VarianceThreshold=sklearn.feature_selection.variance_threshold.VarianceThreshold, # noqa: ERA001, E501 - # Estimator=sklearn.model_selection._search.RandomizedSearchCV(estimator= - # sklearn.tree.tree.DecisionTreeClassifier)) - if "sklearn.model_selection" in long_name: - start_index = long_name.index("sklearn.model_selection") - estimator_start = ( - start_index + long_name[start_index:].index("estimator=") + len("estimator=") - ) - - model_select_boilerplate = long_name[start_index:estimator_start] - # above is .g. 
"sklearn.model_selection._search.RandomizedSearchCV(estimator=" - model_selection_class = model_select_boilerplate.split("(")[0].split(".")[-1] - - # Now we want to also find and parse the `estimator`, for this we find the closing - # parenthesis to the model selection technique: - closing_parenthesis_expected = 1 - for char in long_name[estimator_start:]: - if char == "(": - closing_parenthesis_expected += 1 - if char == ")": - closing_parenthesis_expected -= 1 - if closing_parenthesis_expected == 0: - break - - _end: int = estimator_start + len(long_name[estimator_start:]) - 1 - model_select_pipeline = long_name[estimator_start:_end] - - trimmed_pipeline = cls.trim_flow_name(model_select_pipeline, _outer=False) - _, trimmed_pipeline = trimmed_pipeline.split(".", maxsplit=1) # trim module prefix - model_select_short = f"sklearn.{model_selection_class}[{trimmed_pipeline}]" - name = long_name[:start_index] + model_select_short + long_name[_end + 1 :] - else: - name = long_name - - module_name = long_name.split(".")[0] - short_name = module_name + ".{}" - - if name.startswith("sklearn.pipeline"): - full_pipeline_class, pipeline = name[:-1].split("(", maxsplit=1) - pipeline_class = full_pipeline_class.split(".")[-1] - # We don't want nested pipelines in the short name, so we trim all complicated - # subcomponents, i.e. those with parentheses: - pipeline = remove_all_in_parentheses(pipeline) - - # then the pipeline steps are formatted e.g.: - # step1name=sklearn.submodule.ClassName,step2name... - components = [component.split(".")[-1] for component in pipeline.split(",")] - pipeline = f"{pipeline_class}({','.join(components)})" - if len(short_name.format(pipeline)) > extra_trim_length: - pipeline = f"{pipeline_class}(...,{components[-1]})" - else: - # Just a simple component: e.g. sklearn.tree.DecisionTreeClassifier - pipeline = remove_all_in_parentheses(name).split(".")[-1] - - if not _outer: - # Anything from parenthesis in inner calls should not be culled, so we use brackets - pipeline = pipeline.replace("(", "[").replace(")", "]") - else: - # Square brackets may be introduced with nested model_selection - pipeline = pipeline.replace("[", "(").replace("]", ")") - - return short_name.format(pipeline) - - @classmethod - def _min_dependency_str(cls, sklearn_version: str) -> str: - """Returns a string containing the minimum dependencies for the sklearn version passed. 
- - Parameters - ---------- - sklearn_version : str - A version string of the xx.xx.xx - - Returns - ------- - str - """ - # This explicit check is necessary to support existing entities on the OpenML servers - # that used the fixed dependency string (in the else block) - if Version(openml.__version__) > Version("0.11"): - # OpenML v0.11 onwards supports sklearn>=0.24 - # assumption: 0.24 onwards sklearn should contain a _min_dependencies.py file with - # variables declared for extracting minimum dependency for that version - if Version(sklearn_version) >= Version("0.24"): - from sklearn import _min_dependencies as _mindep - - dependency_list = { - "numpy": f"{_mindep.NUMPY_MIN_VERSION}", - "scipy": f"{_mindep.SCIPY_MIN_VERSION}", - "joblib": f"{_mindep.JOBLIB_MIN_VERSION}", - "threadpoolctl": f"{_mindep.THREADPOOLCTL_MIN_VERSION}", - } - elif Version(sklearn_version) >= Version("0.23"): - dependency_list = { - "numpy": "1.13.3", - "scipy": "0.19.1", - "joblib": "0.11", - "threadpoolctl": "2.0.0", - } - if Version(sklearn_version).micro == 0: - dependency_list.pop("threadpoolctl") - elif Version(sklearn_version) >= Version("0.21"): - dependency_list = {"numpy": "1.11.0", "scipy": "0.17.0", "joblib": "0.11"} - elif Version(sklearn_version) >= Version("0.19"): - dependency_list = {"numpy": "1.8.2", "scipy": "0.13.3"} - else: - dependency_list = {"numpy": "1.6.1", "scipy": "0.9"} - else: - # this is INCORRECT for sklearn versions >= 0.19 and < 0.24 - # given that OpenML has existing flows uploaded with such dependency information, - # we change no behaviour for older sklearn version, however from 0.24 onwards - # the dependency list will be accurately updated for any flow uploaded to OpenML - dependency_list = {"numpy": "1.6.1", "scipy": "0.9"} - - sklearn_dep = f"sklearn=={sklearn_version}" - dep_str = "\n".join([f"{k}>={v}" for k, v in dependency_list.items()]) - return "\n".join([sklearn_dep, dep_str]) - - ################################################################################################ - # Methods for flow serialization and de-serialization - - def flow_to_model( - self, - flow: OpenMLFlow, - initialize_with_defaults: bool = False, # noqa: FBT001, FBT002 - strict_version: bool = True, # noqa: FBT001, FBT002 - ) -> Any: - """Initializes a sklearn model based on a flow. - - Parameters - ---------- - flow : mixed - the object to deserialize (can be flow object, or any serialized - parameter value that is accepted by) - - initialize_with_defaults : bool, optional (default=False) - If this flag is set, the hyperparameter values of flows will be - ignored and a flow with its defaults is returned. - - strict_version : bool, default=True - Whether to fail if version requirements are not fulfilled. - - Returns - ------- - mixed - """ - return self._deserialize_sklearn( - flow, - initialize_with_defaults=initialize_with_defaults, - strict_version=strict_version, - ) - - def _deserialize_sklearn( # noqa: PLR0915, C901, PLR0912 - self, - o: Any, - components: dict | None = None, - initialize_with_defaults: bool = False, # noqa: FBT001, FBT002 - recursion_depth: int = 0, - strict_version: bool = True, # noqa: FBT002, FBT001 - ) -> Any: - """Recursive function to deserialize a scikit-learn flow. - - This function inspects an object to deserialize and decides how to do so. This function - delegates all work to the respective functions to deserialize special data structures etc. 
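[Editor's note: a sketch of the string _min_dependency_str returns for an older sklearn release, assuming a recent openml-python (>0.11) so the explicit table above applies.]

SklearnExtension._min_dependency_str("0.21.3")
# -> "sklearn==0.21.3\nnumpy>=1.11.0\nscipy>=0.17.0\njoblib>=0.11"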
- This function works on everything that has been serialized to OpenML: OpenMLFlow, - components (which are flows themselves), functions, hyperparameter distributions (for - random search) and the actual hyperparameter values themselves. - - Parameters - ---------- - o : mixed - the object to deserialize (can be flow object, or any serialized - parameter value that is accepted by) - - components : Optional[dict] - Components of the current flow being de-serialized. These will not be used when - de-serializing the actual flow, but when de-serializing a component reference. - - initialize_with_defaults : bool, optional (default=False) - If this flag is set, the hyperparameter values of flows will be - ignored and a flow with its defaults is returned. - - recursion_depth : int - The depth at which this flow is called, mostly for debugging - purposes - - strict_version : bool, default=True - Whether to fail if version requirements are not fulfilled. - - Returns - ------- - mixed - """ - logger.info( - "-{} flow_to_sklearn START o={}, components={}, init_defaults={}".format( - "-" * recursion_depth, o, components, initialize_with_defaults - ), - ) - depth_pp = recursion_depth + 1 # shortcut var, depth plus plus - - # First, we need to check whether the presented object is a json string. - # JSON strings are used to encoder parameter values. By passing around - # json strings for parameters, we make sure that we can flow_to_sklearn - # the parameter values to the correct type. - - if isinstance(o, str): - with contextlib.suppress(JSONDecodeError): - o = json.loads(o) - - if isinstance(o, dict): - # Check if the dict encodes a 'special' object, which could not - # easily converted into a string, but rather the information to - # re-create the object were stored in a dictionary. - if "oml-python:serialized_object" in o: - serialized_type = o["oml-python:serialized_object"] - value = o["value"] - if serialized_type == "type": - rval = self._deserialize_type(value) - elif serialized_type == "rv_frozen": - rval = self._deserialize_rv_frozen(value) - elif serialized_type == "function": - rval = self._deserialize_function(value) - elif serialized_type in (COMPOSITION_STEP_CONSTANT, COMPONENT_REFERENCE): - if serialized_type == COMPOSITION_STEP_CONSTANT: - pass - elif serialized_type == COMPONENT_REFERENCE: - value = self._deserialize_sklearn( - value, - recursion_depth=depth_pp, - strict_version=strict_version, - ) - else: - raise NotImplementedError(serialized_type) - assert components is not None # Necessary for mypy - step_name = value["step_name"] - key = value["key"] - component = self._deserialize_sklearn( - components[key], - initialize_with_defaults=initialize_with_defaults, - recursion_depth=depth_pp, - strict_version=strict_version, - ) - # The component is now added to where it should be used - # later. It should not be passed to the constructor of the - # main flow object. 
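[Editor's note: for orientation while reading the branch above, the approximate shape of a serialized component reference; field names are the ones the code reads, values are illustrative.]

component_reference = {
    "oml-python:serialized_object": "component_reference",
    "value": {
        "key": "estimator",        # which entry of `components` to resolve
        "step_name": "estimator",  # None -> bare component, else a (name, component) tuple
        # an optional "argument_1" carries e.g. ColumnTransformer column selections
    },
}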
- del components[key] - if step_name is None: - rval = component - elif "argument_1" not in value: - rval = (step_name, component) - else: - rval = (step_name, component, value["argument_1"]) - elif serialized_type == "cv_object": - rval = self._deserialize_cross_validator( - value, - recursion_depth=recursion_depth, - strict_version=strict_version, - ) - else: - raise ValueError(f"Cannot flow_to_sklearn {serialized_type}") - - else: - rval = OrderedDict( - ( - self._deserialize_sklearn( - o=key, - components=components, - initialize_with_defaults=initialize_with_defaults, - recursion_depth=depth_pp, - strict_version=strict_version, - ), - self._deserialize_sklearn( - o=value, - components=components, - initialize_with_defaults=initialize_with_defaults, - recursion_depth=depth_pp, - strict_version=strict_version, - ), - ) - for key, value in sorted(o.items()) - ) - elif isinstance(o, (list, tuple)): - rval = [ - self._deserialize_sklearn( - o=element, - components=components, - initialize_with_defaults=initialize_with_defaults, - recursion_depth=depth_pp, - strict_version=strict_version, - ) - for element in o - ] - if isinstance(o, tuple): - rval = tuple(rval) - elif isinstance(o, (bool, int, float, str)) or o is None: - rval = o - elif isinstance(o, OpenMLFlow): - if not self._is_sklearn_flow(o): - raise ValueError("Only sklearn flows can be reinstantiated") - rval = self._deserialize_model( - flow=o, - keep_defaults=initialize_with_defaults, - recursion_depth=recursion_depth, - strict_version=strict_version, - ) - else: - raise TypeError(o) - logger.info(f"-{'-' * recursion_depth} flow_to_sklearn END o={o}, rval={rval}") - return rval - - def model_to_flow(self, model: Any) -> OpenMLFlow: - """Transform a scikit-learn model to a flow for uploading it to OpenML. 
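[Editor's note: hyperparameter values travel as JSON strings; a hedged sketch of one such value and how the dispatcher above treats it.]

import json

stored = json.dumps({"oml-python:serialized_object": "type", "value": "np.float64"})
# _deserialize_sklearn first json.loads() the string, sees the marker,
# and hands the payload to _deserialize_type, yielding numpy.float64.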
- - Parameters - ---------- - model : Any - - Returns - ------- - OpenMLFlow - """ - # Necessary to make pypy not complain about all the different possible return types - return self._serialize_sklearn(model) - - def _serialize_sklearn(self, o: Any, parent_model: Any | None = None) -> Any: # noqa: PLR0912, C901 - rval = None # type: Any - - # TODO: assert that only on first recursion lvl `parent_model` can be None - if self.is_estimator(o): - # is the main model or a submodel - rval = self._serialize_model(o) - elif ( - isinstance(o, (list, tuple)) - and len(o) == 2 - and o[1] in SKLEARN_PIPELINE_STRING_COMPONENTS - and isinstance(parent_model, sklearn.pipeline._BaseComposition) - ): - rval = o - elif isinstance(o, (list, tuple)): - # TODO: explain what type of parameter is here - rval = [self._serialize_sklearn(element, parent_model) for element in o] - if isinstance(o, tuple): - rval = tuple(rval) - elif isinstance(o, SIMPLE_TYPES) or o is None: - if isinstance(o, tuple(SIMPLE_NUMPY_TYPES)): - o = o.item() # type: ignore - # base parameter values - rval = o - elif isinstance(o, dict): - # TODO: explain what type of parameter is here - if not isinstance(o, OrderedDict): - o = OrderedDict(sorted(o.items())) - - rval = OrderedDict() - for key, value in o.items(): - if not isinstance(key, str): - raise TypeError( - "Can only use string as keys, you passed " - f"type {type(key)} for value {key!s}.", - ) - _key = self._serialize_sklearn(key, parent_model) - rval[_key] = self._serialize_sklearn(value, parent_model) - elif isinstance(o, type): - # TODO: explain what type of parameter is here - rval = self._serialize_type(o) - elif isinstance(o, scipy.stats.distributions.rv_frozen): - rval = self._serialize_rv_frozen(o) - # This only works for user-defined functions (and not even partial). - # I think this is exactly what we want here as there shouldn't be any - # built-in or functool.partials in a pipeline - elif inspect.isfunction(o): - # TODO: explain what type of parameter is here - rval = self._serialize_function(o) - elif self._is_cross_validator(o): - # TODO: explain what type of parameter is here - rval = self._serialize_cross_validator(o) - else: - raise TypeError(o, type(o)) - - return rval - - def get_version_information(self) -> list[str]: - """List versions of libraries required by the flow. - - Libraries listed are ``Python``, ``scikit-learn``, ``numpy`` and ``scipy``. - - Returns - ------- - List - """ - # This can possibly be done by a package such as pyxb, but I could not get - # it to work properly. - import numpy - import scipy - import sklearn - - major, minor, micro, _, _ = sys.version_info - python_version = f"Python_{'.'.join([str(major), str(minor), str(micro)])}." - sklearn_version = f"Sklearn_{sklearn.__version__}." - numpy_version = f"NumPy_{numpy.__version__}." # type: ignore - scipy_version = f"SciPy_{scipy.__version__}." - - return [python_version, sklearn_version, numpy_version, scipy_version] - - def create_setup_string(self, model: Any) -> str: # noqa: ARG002 - """Create a string which can be used to reinstantiate the given model. 
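[Editor's note: a round-trip sketch of the two public entry points defined above; pre-deletion import path, arbitrary estimator choice.]

from sklearn.tree import DecisionTreeClassifier

from openml.extensions.sklearn import SklearnExtension

ext = SklearnExtension()
flow = ext.model_to_flow(DecisionTreeClassifier(max_depth=3))
restored = ext.flow_to_model(flow)  # fresh, unfitted clone with max_depth=3
defaults = ext.flow_to_model(flow, initialize_with_defaults=True)  # defaults only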
- - Parameters - ---------- - model : Any - - Returns - ------- - str - """ - return " ".join(self.get_version_information()) - - def _is_cross_validator(self, o: Any) -> bool: - return isinstance(o, sklearn.model_selection.BaseCrossValidator) - - @classmethod - def _is_sklearn_flow(cls, flow: OpenMLFlow) -> bool: - sklearn_dependency = isinstance(flow.dependencies, str) and "sklearn" in flow.dependencies - sklearn_as_external = isinstance(flow.external_version, str) and ( - flow.external_version.startswith("sklearn==") or ",sklearn==" in flow.external_version - ) - return sklearn_dependency or sklearn_as_external - - def _get_sklearn_description(self, model: Any, char_lim: int = 1024) -> str: - r"""Fetches the sklearn function docstring for the flow description - - Retrieves the sklearn docstring available and does the following: - * If length of docstring <= char_lim, then returns the complete docstring - * Else, trims the docstring till it encounters a 'Read more in the :ref:' - * Or till it encounters a 'Parameters\n----------\n' - The final string returned is at most of length char_lim with leading and - trailing whitespaces removed. - - Parameters - ---------- - model : sklearn model - char_lim : int - Specifying the max length of the returned string. - OpenML servers have a constraint of 1024 characters for the 'description' field. - - Returns - ------- - str - """ - - def match_format(s): - return f"{s}\n{len(s) * '-'}\n" - - s = inspect.getdoc(model) - if s is None: - return "" - try: - # trim till 'Read more' - pattern = "Read more in the :ref:" - index = s.index(pattern) - s = s[:index] - # trimming docstring to be within char_lim - if len(s) > char_lim: - s = f"{s[: char_lim - 3]}..." - return s.strip() - except ValueError: - logger.warning( - "'Read more' not found in descriptions. " - "Trying to trim till 'Parameters' if available in docstring.", - ) - try: - # if 'Read more' doesn't exist, trim till 'Parameters' - pattern = "Parameters" - index = s.index(match_format(pattern)) - except ValueError: - # returning full docstring - logger.warning("'Parameters' not found in docstring. Omitting docstring trimming.") - index = len(s) - s = s[:index] - # trimming docstring to be within char_lim - if len(s) > char_lim: - s = f"{s[: char_lim - 3]}..." - return s.strip() - - def _extract_sklearn_parameter_docstring(self, model) -> None | str: - """Extracts the part of sklearn docstring containing parameter information - - Fetches the entire docstring and trims just the Parameter section. - The assumption is that 'Parameters' is the first section in sklearn docstrings, - followed by other sections titled 'Attributes', 'See also', 'Note', 'References', - appearing in that order if defined. - Returns a None if no section with 'Parameters' can be found in the docstring. 
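[Editor's note: the description and parameter parsers above assume a numpydoc layout; a sketch of the shape they expect, with the trim points marked.]

doc = (
    "Toy estimator.\n"
    "\n"
    "Read more in the :ref:`User Guide <toy>`.\n"  # description is trimmed here
    "\n"
    "Parameters\n"
    "----------\n"
    "alpha : float\n"
    "    Regularization strength.\n"
    "\n"
    "Attributes\n"  # the parameter section is assumed to end at such a heading
    "----------\n"
    "coef_ : ndarray\n"
)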
- - Parameters - ---------- - model : sklearn model - - Returns - ------- - str, or None - """ - - def match_format(s): - return f"{s}\n{len(s) * '-'}\n" - - s = inspect.getdoc(model) - if s is None: - return None - try: - index1 = s.index(match_format("Parameters")) - except ValueError as e: - # when sklearn docstring has no 'Parameters' section - logger.warning(f"{match_format('Parameters')} {e}") - return None - - headings = ["Attributes", "Notes", "See also", "Note", "References"] - for h in headings: - try: - # to find end of Parameters section - index2 = s.index(match_format(h)) - break - except ValueError: - logger.warning(f"{h} not available in docstring") - continue - else: - # in the case only 'Parameters' exist, trim till end of docstring - index2 = len(s) - s = s[index1:index2] - return s.strip() - - def _extract_sklearn_param_info(self, model, char_lim=1024) -> None | dict: - """Parses parameter type and description from sklearn dosctring - - Parameters - ---------- - model : sklearn model - char_lim : int - Specifying the max length of the returned string. - OpenML servers have a constraint of 1024 characters string fields. - - Returns - ------- - Dict, or None - """ - docstring = self._extract_sklearn_parameter_docstring(model) - if docstring is None: - # when sklearn docstring has no 'Parameters' section - return None - - n = re.compile("[.]*\n", flags=IGNORECASE) - lines = n.split(docstring) - p = re.compile("[a-z0-9_ ]+ : [a-z0-9_']+[a-z0-9_ ]*", flags=IGNORECASE) - # The above regular expression is designed to detect sklearn parameter names and type - # in the format of [variable_name][space]:[space][type] - # The expectation is that the parameter description for this detected parameter will - # be all the lines in the docstring till the regex finds another parameter match - - # collecting parameters and their descriptions - description = [] # type: List - for s in lines: - param = p.findall(s) - if param != []: - # a parameter definition is found by regex - # creating placeholder when parameter found which will be a list of strings - # string descriptions will be appended in subsequent iterations - # till another parameter is found and a new placeholder is created - placeholder = [""] # type: List[str] - description.append(placeholder) - elif len(description) > 0: # description=[] means no parameters found yet - # appending strings to the placeholder created when parameter found - description[-1].append(s) - for i in range(len(description)): - # concatenating parameter description strings - description[i] = "\n".join(description[i]).strip() - # limiting all parameter descriptions to accepted OpenML string length - if len(description[i]) > char_lim: - description[i] = f"{description[i][: char_lim - 3]}..." - - # collecting parameters and their types - parameter_docs = OrderedDict() - matches = p.findall(docstring) - for i, param in enumerate(matches): - key, value = str(param).split(":") - parameter_docs[key.strip()] = [value.strip(), description[i]] - - # to avoid KeyError for missing parameters - param_list_true = list(model.get_params().keys()) - param_list_found = list(parameter_docs.keys()) - for param in list(set(param_list_true) - set(param_list_found)): - parameter_docs[param] = [None, None] - - return parameter_docs - - def _serialize_model(self, model: Any) -> OpenMLFlow: - """Create an OpenMLFlow. - - Calls `sklearn_to_flow` recursively to properly serialize the - parameters to strings and the components (other models) to OpenMLFlows. 
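[Editor's note: approximate output of _extract_sklearn_param_info for a hypothetical estimator `model` carrying a docstring like the sketch above; exact strings depend on the regex matches.]

info = SklearnExtension()._extract_sklearn_param_info(model)
# info["alpha"] == ["float", "Regularization strength."]   (approximately)
# parameters present in model.get_params() but absent from the docstring
# are filled in as [None, None]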
- - Parameters - ---------- - model : sklearn estimator - - Returns - ------- - OpenMLFlow - - """ - # Get all necessary information about the model objects itself - ( - parameters, - parameters_meta_info, - subcomponents, - subcomponents_explicit, - ) = self._extract_information_from_model(model) - - # Check that a component does not occur multiple times in a flow as this - # is not supported by OpenML - self._check_multiple_occurence_of_component_in_flow(model, subcomponents) - - # Create a flow name, which contains all components in brackets, e.g.: - # RandomizedSearchCV(Pipeline(StandardScaler,AdaBoostClassifier(DecisionTreeClassifier)), - # StandardScaler,AdaBoostClassifier(DecisionTreeClassifier)) - class_name = model.__module__ + "." + model.__class__.__name__ - - # will be part of the name (in brackets) - sub_components_names = "" - for key in subcomponents: - name_thing = subcomponents[key] - if isinstance(name_thing, OpenMLFlow): - name = name_thing.name - elif ( - isinstance(name_thing, str) - and subcomponents[key] in SKLEARN_PIPELINE_STRING_COMPONENTS - ): - name = name_thing - else: - raise TypeError(type(subcomponents[key])) - - if key in subcomponents_explicit: - sub_components_names += "," + key + "=" + name - else: - sub_components_names += "," + name - - # slice operation on string in order to get rid of leading comma - name = f"{class_name}({sub_components_names[1:]})" if sub_components_names else class_name - short_name = SklearnExtension.trim_flow_name(name) - - # Get the external versions of all sub-components - external_version = self._get_external_version_string(model, subcomponents) - dependencies = self._get_dependencies() - tags = self._get_tags() - - sklearn_description = self._get_sklearn_description(model) - return OpenMLFlow( - name=name, - class_name=class_name, - custom_name=short_name, - description=sklearn_description, - model=model, - components=subcomponents, - parameters=parameters, - parameters_meta_info=parameters_meta_info, - external_version=external_version, - tags=tags, - extension=self, - language="English", - dependencies=dependencies, - ) - - def _get_dependencies(self) -> str: - return self._min_dependency_str(sklearn.__version__) # type: ignore - - def _get_tags(self) -> list[str]: - sklearn_version = self._format_external_version("sklearn", sklearn.__version__) # type: ignore - sklearn_version_formatted = sklearn_version.replace("==", "_") - return [ - "openml-python", - "sklearn", - "scikit-learn", - "python", - sklearn_version_formatted, - # TODO: add more tags based on the scikit-learn - # module a flow is in? For example automatically - # annotate a class of sklearn.svm.SVC() with the - # tag svm? - ] - - def _get_external_version_string( - self, - model: Any, - sub_components: dict[str, OpenMLFlow], - ) -> str: - # Create external version string for a flow, given the model and the - # already parsed dictionary of sub_components. Retrieves the external - # version of all subcomponents, which themselves already contain all - # requirements for their subcomponents. The external version string is a - # sorted concatenation of all modules which are present in this run. 
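[Editor's note: a hedged sketch of the flow _serialize_model produces for a boosted tree; module paths follow recent sklearn, version strings are illustrative.]

from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

flow = SklearnExtension().model_to_flow(
    AdaBoostClassifier(estimator=DecisionTreeClassifier())
)
# flow.name        ~ "sklearn.ensemble._weight_boosting.AdaBoostClassifier("
#                    "estimator=sklearn.tree._classes.DecisionTreeClassifier)"
# flow.custom_name ~ "sklearn.AdaBoostClassifier"  (hyperparameters trimmed)
# flow.external_version ~ "openml==0.15.1,sklearn==1.4.2"  (sorted, comma-joined)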
- - external_versions = set() - - # The model is None if the flow is a placeholder flow such as 'passthrough' or 'drop' - if model is not None: - model_package_name = model.__module__.split(".")[0] - module = importlib.import_module(model_package_name) - model_package_version_number = module.__version__ # type: ignore - external_version = self._format_external_version( - model_package_name, - model_package_version_number, - ) - external_versions.add(external_version) - - openml_version = self._format_external_version("openml", openml.__version__) - sklearn_version = self._format_external_version("sklearn", sklearn.__version__) # type: ignore - external_versions.add(openml_version) - external_versions.add(sklearn_version) - for visitee in sub_components.values(): - if isinstance(visitee, str) and visitee in SKLEARN_PIPELINE_STRING_COMPONENTS: - continue - for external_version in visitee.external_version.split(","): - external_versions.add(external_version) - return ",".join(sorted(external_versions)) - - def _check_multiple_occurence_of_component_in_flow( - self, - model: Any, - sub_components: dict[str, OpenMLFlow], - ) -> None: - to_visit_stack: list[OpenMLFlow] = [] - to_visit_stack.extend(sub_components.values()) - known_sub_components: set[str] = set() - - while len(to_visit_stack) > 0: - visitee = to_visit_stack.pop() - if isinstance(visitee, str) and visitee in SKLEARN_PIPELINE_STRING_COMPONENTS: - known_sub_components.add(visitee) - elif visitee.name in known_sub_components: - raise ValueError( - f"Found a second occurence of component {visitee.name} when " - f"trying to serialize {model}.", - ) - else: - known_sub_components.add(visitee.name) - to_visit_stack.extend(visitee.components.values()) - - def _extract_information_from_model( # noqa: PLR0915, C901, PLR0912 - self, - model: Any, - ) -> tuple[ - OrderedDict[str, str | None], - OrderedDict[str, dict | None], - OrderedDict[str, OpenMLFlow], - set, - ]: - # This function contains four "global" states and is quite long and - # complicated. If it gets to complicated to ensure it's correctness, - # it would be best to make it a class with the four "global" states being - # the class attributes and the if/elif/else in the for-loop calls to - # separate class methods - - # stores all entities that should become subcomponents - sub_components = OrderedDict() # type: OrderedDict[str, OpenMLFlow] - # stores the keys of all subcomponents that should become - sub_components_explicit = set() - parameters: OrderedDict[str, str | None] = OrderedDict() - parameters_meta_info: OrderedDict[str, dict | None] = OrderedDict() - parameters_docs = self._extract_sklearn_param_info(model) - - model_parameters = model.get_params(deep=False) - for k, v in sorted(model_parameters.items(), key=lambda t: t[0]): - rval = self._serialize_sklearn(v, model) - - def flatten_all(list_): - """Flattens arbitrary depth lists of lists (e.g. [[1,2],[3,[1]]] -> [1,2,3,1]).""" - for el in list_: - if isinstance(el, (list, tuple)) and len(el) > 0: - yield from flatten_all(el) - else: - yield el - - # In case rval is a list of lists (or tuples), we need to identify two situations: - # - sklearn pipeline steps, feature union or base classifiers in voting classifier. - # They look like e.g. [("imputer", Imputer()), ("classifier", SVC())] - # - a list of lists with simple types (e.g. 
int or str), such as for an OrdinalEncoder - # where all possible values for each feature are described: [[0,1,2], [1,2,5]] - is_non_empty_list_of_lists_with_same_type = ( - isinstance(rval, (list, tuple)) - and len(rval) > 0 - and isinstance(rval[0], (list, tuple)) - and all(isinstance(rval_i, type(rval[0])) for rval_i in rval) - ) - - # Check that all list elements are of simple types. - nested_list_of_simple_types = ( - is_non_empty_list_of_lists_with_same_type - and all(isinstance(el, SIMPLE_TYPES) for el in flatten_all(rval)) - and all( - len(rv) in (2, 3) and rv[1] not in SKLEARN_PIPELINE_STRING_COMPONENTS - for rv in rval - ) - ) - - if is_non_empty_list_of_lists_with_same_type and not nested_list_of_simple_types: - # If a list of lists is identified that include 'non-simple' types (e.g. objects), - # we assume they are steps in a pipeline, feature union, or base classifiers in - # a voting classifier. - parameter_value = [] # type: List - reserved_keywords = set(model.get_params(deep=False).keys()) - - for sub_component_tuple in rval: - identifier = sub_component_tuple[0] - sub_component = sub_component_tuple[1] - sub_component_type = type(sub_component_tuple) - if not 2 <= len(sub_component_tuple) <= 3: - # length 2 is for {VotingClassifier.estimators, - # Pipeline.steps, FeatureUnion.transformer_list} - # length 3 is for ColumnTransformer - raise ValueError( - f"Length of tuple of type {sub_component_type}" - " does not match assumptions" - ) - - if isinstance(sub_component, str): - if sub_component not in SKLEARN_PIPELINE_STRING_COMPONENTS: - msg = ( - "Second item of tuple does not match assumptions. " - "If string, can be only 'drop' or 'passthrough' but" - f"got {sub_component}" - ) - raise ValueError(msg) - elif sub_component is None: - msg = ( - "Cannot serialize objects of None type. Please use a valid " - "placeholder for None. Note that empty sklearn estimators can be " - "replaced with 'drop' or 'passthrough'." - ) - raise ValueError(msg) - elif not isinstance(sub_component, OpenMLFlow): - msg = ( - "Second item of tuple does not match assumptions. 
" - f"Expected OpenMLFlow, got {type(sub_component)}" - ) - raise TypeError(msg) - - if identifier in reserved_keywords: - parent_model = f"{model.__module__}.{model.__class__.__name__}" - msg = ( - "Found element shadowing official " - f"parameter for {parent_model}: {identifier}" - ) - raise PyOpenMLError(msg) - - # when deserializing the parameter - sub_components_explicit.add(identifier) - if isinstance(sub_component, str): - external_version = self._get_external_version_string(None, {}) - dependencies = self._get_dependencies() - tags = self._get_tags() - - sub_components[identifier] = OpenMLFlow( - name=sub_component, - description="Placeholder flow for scikit-learn's string pipeline " - "members", - components=OrderedDict(), - parameters=OrderedDict(), - parameters_meta_info=OrderedDict(), - external_version=external_version, - tags=tags, - language="English", - dependencies=dependencies, - model=None, - ) - component_reference: OrderedDict[str, str | dict] = OrderedDict() - component_reference["oml-python:serialized_object"] = ( - COMPOSITION_STEP_CONSTANT - ) - cr_value: dict[str, Any] = OrderedDict() - cr_value["key"] = identifier - cr_value["step_name"] = identifier - if len(sub_component_tuple) == 3: - cr_value["argument_1"] = sub_component_tuple[2] - component_reference["value"] = cr_value - else: - sub_components[identifier] = sub_component - component_reference = OrderedDict() - component_reference["oml-python:serialized_object"] = COMPONENT_REFERENCE - cr_value = OrderedDict() - cr_value["key"] = identifier - cr_value["step_name"] = identifier - if len(sub_component_tuple) == 3: - cr_value["argument_1"] = sub_component_tuple[2] - component_reference["value"] = cr_value - parameter_value.append(component_reference) - - # Here (and in the elif and else branch below) are the only - # places where we encode a value as json to make sure that all - # parameter values still have the same type after - # deserialization - if isinstance(rval, tuple): - parameter_json = json.dumps(tuple(parameter_value)) - else: - parameter_json = json.dumps(parameter_value) - parameters[k] = parameter_json - - elif isinstance(rval, OpenMLFlow): - # A subcomponent, for example the base model in - # AdaBoostClassifier - sub_components[k] = rval - sub_components_explicit.add(k) - component_reference = OrderedDict() - component_reference["oml-python:serialized_object"] = COMPONENT_REFERENCE - cr_value = OrderedDict() - cr_value["key"] = k - cr_value["step_name"] = None - component_reference["value"] = cr_value - cr = self._serialize_sklearn(component_reference, model) - parameters[k] = json.dumps(cr) - - elif not (hasattr(rval, "__len__") and len(rval) == 0): - rval = json.dumps(rval) - parameters[k] = rval - # a regular hyperparameter - else: - parameters[k] = None - - if parameters_docs is not None: - data_type, description = parameters_docs[k] - parameters_meta_info[k] = OrderedDict( - (("description", description), ("data_type", data_type)), - ) - else: - parameters_meta_info[k] = OrderedDict((("description", None), ("data_type", None))) - - return parameters, parameters_meta_info, sub_components, sub_components_explicit - - def _get_fn_arguments_with_defaults(self, fn_name: Callable) -> tuple[dict, set]: - """ - Returns - ------- - i) a dict with all parameter names that have a default value, and - ii) a set with all parameter names that do not have a default - - Parameters - ---------- - fn_name : callable - The function of which we want to obtain the defaults - - Returns - ------- - 
params_with_defaults: dict - a dict mapping parameter name to the default value - params_without_defaults: set - a set with all parameters that do not have a default value - """ - # parameters with defaults are optional, all others are required. - parameters = inspect.signature(fn_name).parameters - required_params = set() - optional_params = {} - for param in parameters: - parameter = parameters.get(param) - default_val = parameter.default # type: ignore - if default_val is inspect.Signature.empty: - required_params.add(param) - else: - optional_params[param] = default_val - return optional_params, required_params - - def _deserialize_model( # noqa: C901 - self, - flow: OpenMLFlow, - keep_defaults: bool, # noqa: FBT001 - recursion_depth: int, - strict_version: bool = True, # noqa: FBT002, FBT001 - ) -> Any: - logger.info(f"-{'-' * recursion_depth} deserialize {flow.name}") - model_name = flow.class_name - self._check_dependencies(flow.dependencies, strict_version=strict_version) - - parameters = flow.parameters - components = flow.components - parameter_dict: dict[str, Any] = OrderedDict() - - # Do a shallow copy of the components dictionary so we can remove the - # components from this copy once we added them into the pipeline. This - # allows us to not consider them any more when looping over the - # components, but keeping the dictionary of components untouched in the - # original components dictionary. - components_ = copy.copy(components) - - for name in parameters: - value = parameters.get(name) - logger.info(f"--{'-' * recursion_depth} flow_parameter={name}, value={value}") - rval = self._deserialize_sklearn( - value, - components=components_, - initialize_with_defaults=keep_defaults, - recursion_depth=recursion_depth + 1, - strict_version=strict_version, - ) - parameter_dict[name] = rval - - for name in components: - if name in parameter_dict: - continue - if name not in components_: - continue - value = components[name] - logger.info(f"--{'-' * recursion_depth} flow_component={name}, value={value}") - rval = self._deserialize_sklearn( - value, - recursion_depth=recursion_depth + 1, - strict_version=strict_version, - ) - parameter_dict[name] = rval - - if model_name is None and flow.name in SKLEARN_PIPELINE_STRING_COMPONENTS: - return flow.name - - assert model_name is not None - module_name = model_name.rsplit(".", 1) - model_class = getattr(importlib.import_module(module_name[0]), module_name[1]) - - if keep_defaults: - # obtain all params with a default - param_defaults, _ = self._get_fn_arguments_with_defaults(model_class.__init__) - - # delete the params that have a default from the dict, - # so they get initialized with their default value - # except [...] - for param in param_defaults: - # [...] the ones that also have a key in the components dict. - # As OpenML stores different flows for ensembles with different - # (base-)components, in OpenML terms, these are not considered - # hyperparameters but rather constants (i.e., changing them would - # result in a different flow) - if param not in components: - del parameter_dict[param] - - if not strict_version: - # Ignore incompatible parameters - allowed_parameter = list(inspect.signature(model_class.__init__).parameters) - for p in list(parameter_dict.keys()): - if p not in allowed_parameter: - warnings.warn( - f"While deserializing in a non-strict way, parameter {p} is not " - f"allowed for {model_class.__name__} likely due to a version mismatch. 
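[Editor's note: a behaviour sketch for the signature-inspection helper above.]

def toy(a, b, c=0.5):
    return a, b, c

optional, required = SklearnExtension()._get_fn_arguments_with_defaults(toy)
# optional == {"c": 0.5}; required == {"a", "b"}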
" - "We ignore the parameter.", - UserWarning, - stacklevel=2, - ) - del parameter_dict[p] - - return model_class(**parameter_dict) - - def _check_dependencies( - self, - dependencies: str, - strict_version: bool = True, # noqa: FBT001, FBT002 - ) -> None: - if not dependencies: - return - - dependencies_list = dependencies.split("\n") - for dependency_string in dependencies_list: - match = DEPENDENCIES_PATTERN.match(dependency_string) - if not match: - raise ValueError(f"Cannot parse dependency {dependency_string}") - - dependency_name = match.group("name") - operation = match.group("operation") - version = match.group("version") - - module = importlib.import_module(dependency_name) - required_version = Version(version) - installed_version = Version(module.__version__) # type: ignore - - if operation == "==": - check = required_version == installed_version - elif operation == ">": - check = installed_version > required_version - elif operation == ">=": - check = ( - installed_version > required_version or installed_version == required_version - ) - else: - raise NotImplementedError(f"operation '{operation}' is not supported") - message = ( - f"Trying to deserialize a model with dependency {dependency_string} not satisfied." - ) - if not check: - if strict_version: - raise ValueError(message) - - warnings.warn(message, category=UserWarning, stacklevel=2) - - def _serialize_type(self, o: Any) -> OrderedDict[str, str]: - mapping = { - float: "float", - np.float32: "np.float32", - np.float64: "np.float64", - int: "int", - np.int32: "np.int32", - np.int64: "np.int64", - } - if Version(np.__version__) < Version("1.24"): - mapping[float] = "np.float" - mapping[int] = "np.int" - - ret = OrderedDict() # type: 'OrderedDict[str, str]' - ret["oml-python:serialized_object"] = "type" - ret["value"] = mapping[o] - return ret - - def _deserialize_type(self, o: str) -> Any: - mapping = { - "float": float, - "np.float32": np.float32, - "np.float64": np.float64, - "int": int, - "np.int32": np.int32, - "np.int64": np.int64, - } - - # TODO(eddiebergman): Might be able to remove this - if Version(np.__version__) < Version("1.24"): - mapping["np.float"] = np.float # type: ignore # noqa: NPY001 - mapping["np.int"] = np.int # type: ignore # noqa: NPY001 - - return mapping[o] - - def _serialize_rv_frozen(self, o: Any) -> OrderedDict[str, str | dict]: - args = o.args - kwds = o.kwds - a = o.a - b = o.b - dist = o.dist.__class__.__module__ + "." + o.dist.__class__.__name__ - ret: OrderedDict[str, str | dict] = OrderedDict() - ret["oml-python:serialized_object"] = "rv_frozen" - ret["value"] = OrderedDict( - (("dist", dist), ("a", a), ("b", b), ("args", args), ("kwds", kwds)), - ) - return ret - - def _deserialize_rv_frozen(self, o: OrderedDict[str, str]) -> Any: - args = o["args"] - kwds = o["kwds"] - a = o["a"] - b = o["b"] - dist_name = o["dist"] - - module_name = dist_name.rsplit(".", 1) - try: - rv_class = getattr(importlib.import_module(module_name[0]), module_name[1]) - except AttributeError as e: - _tb = traceback.format_exc() - warnings.warn( - f"Cannot create model {dist_name} for flow. Reason is from error {type(e)}:{e}" - f"\nTraceback: {_tb}", - RuntimeWarning, - stacklevel=2, - ) - return None - - dist = scipy.stats.distributions.rv_frozen(rv_class(), *args, **kwds) # type: ignore - dist.a = a - dist.b = b - - return dist - - def _serialize_function(self, o: Callable) -> OrderedDict[str, str]: - name = o.__module__ + "." 
+ o.__name__ - ret = OrderedDict() # type: 'OrderedDict[str, str]' - ret["oml-python:serialized_object"] = "function" - ret["value"] = name - return ret - - def _deserialize_function(self, name: str) -> Callable: - module_name = name.rsplit(".", 1) - return getattr(importlib.import_module(module_name[0]), module_name[1]) - - def _serialize_cross_validator(self, o: Any) -> OrderedDict[str, str | dict]: - ret: OrderedDict[str, str | dict] = OrderedDict() - - parameters = OrderedDict() # type: 'OrderedDict[str, Any]' - - # XXX this is copied from sklearn.model_selection._split - cls = o.__class__ - init = getattr(cls.__init__, "deprecated_original", cls.__init__) - # Ignore varargs, kw and default values and pop self - init_signature = inspect.signature(init) # type: ignore - # Consider the constructor parameters excluding 'self' - if init is object.__init__: - args = [] # type: List - else: - args = sorted( - [ - p.name - for p in init_signature.parameters.values() - if p.name != "self" and p.kind != p.VAR_KEYWORD - ], - ) - - for key in args: - # We need deprecation warnings to always be on in order to - # catch deprecated param values. - # This is set in utils/__init__.py but it gets overwritten - # when running under python3 somehow. - with warnings.catch_warnings(record=True) as w: - warnings.simplefilter("always", DeprecationWarning) - value = getattr(o, key, None) - if w is not None and len(w) and w[0].category is DeprecationWarning: - # if the parameter is deprecated, don't show it - continue - - if not (isinstance(value, Sized) and len(value) == 0): - value = json.dumps(value) - parameters[key] = value - else: - parameters[key] = None - - ret["oml-python:serialized_object"] = "cv_object" - name = o.__module__ + "." + o.__class__.__name__ - value = OrderedDict([("name", name), ("parameters", parameters)]) - ret["value"] = value - - return ret - - def _deserialize_cross_validator( - self, - value: OrderedDict[str, Any], - recursion_depth: int, - strict_version: bool = True, # noqa: FBT002, FBT001 - ) -> Any: - model_name = value["name"] - parameters = value["parameters"] - - module_name = model_name.rsplit(".", 1) - model_class = getattr(importlib.import_module(module_name[0]), module_name[1]) - for parameter in parameters: - parameters[parameter] = self._deserialize_sklearn( - parameters[parameter], - recursion_depth=recursion_depth + 1, - strict_version=strict_version, - ) - return model_class(**parameters) - - def _format_external_version( - self, - model_package_name: str, - model_package_version_number: str, - ) -> str: - return f"{model_package_name}=={model_package_version_number}" - - @staticmethod - def _get_parameter_values_recursive( - param_grid: dict | list[dict], - parameter_name: str, - ) -> list[Any]: - """ - Returns a list of values for a given hyperparameter, encountered - recursively throughout the flow. 
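[Editor's note: a hedged sketch of cross-validator serialization as implemented above; the module path is sklearn's private one for KFold.]

from sklearn.model_selection import KFold

serialized = SklearnExtension()._serialize_cross_validator(KFold(n_splits=3))
# serialized["oml-python:serialized_object"] == "cv_object"
# serialized["value"]["name"] == "sklearn.model_selection._split.KFold"
# serialized["value"]["parameters"] holds JSON-encoded values,
# e.g. {"n_splits": "3", "random_state": "null", "shuffle": "false"}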
(e.g., n_jobs can be defined - for various flows) - - Parameters - ---------- - param_grid: Union[Dict, List[Dict]] - Dict mapping from hyperparameter list to value, to a list of - such dicts - - parameter_name: str - The hyperparameter that needs to be inspected - - Returns - ------- - List - A list of all values of hyperparameters with this name - """ - if isinstance(param_grid, dict): - return [ - value - for param, value in param_grid.items() - if param.split("__")[-1] == parameter_name - ] - - if isinstance(param_grid, list): - result = [] - for sub_grid in param_grid: - result.extend( - SklearnExtension._get_parameter_values_recursive(sub_grid, parameter_name), - ) - return result - - raise ValueError("Param_grid should either be a dict or list of dicts") - - def _prevent_optimize_n_jobs(self, model): - """ - Ensures that HPO classes will not optimize the n_jobs hyperparameter - - Parameters - ---------- - model: - The model that will be fitted - """ - if self._is_hpo_class(model): - if isinstance(model, sklearn.model_selection.GridSearchCV): - param_distributions = model.param_grid - elif isinstance(model, sklearn.model_selection.RandomizedSearchCV): - param_distributions = model.param_distributions - else: - if hasattr(model, "param_distributions"): - param_distributions = model.param_distributions - else: - raise AttributeError( - "Using subclass BaseSearchCV other than " - "{GridSearchCV, RandomizedSearchCV}. " - "Could not find attribute " - "param_distributions.", - ) - logger.warning( - "Warning! Using subclass BaseSearchCV other than " - "{GridSearchCV, RandomizedSearchCV}. " - "Should implement param check. ", - ) - n_jobs_vals = SklearnExtension._get_parameter_values_recursive( - param_distributions, - "n_jobs", - ) - if len(n_jobs_vals) > 0: - raise PyOpenMLError( - "openml-python should not be used to optimize the n_jobs parameter.", - ) - - ################################################################################################ - # Methods for performing runs with extension modules - - def is_estimator(self, model: Any) -> bool: - """Check whether the given model is a scikit-learn estimator. - - This function is only required for backwards compatibility and will be removed in the - near future. - - Parameters - ---------- - model : Any - - Returns - ------- - bool - """ - o = model - return hasattr(o, "fit") and hasattr(o, "get_params") and hasattr(o, "set_params") - - def seed_model(self, model: Any, seed: int | None = None) -> Any: # noqa: C901 - """Set the random state of all the unseeded components of a model and return the seeded - model. - - Required so that all seed information can be uploaded to OpenML for reproducible results. - - Models that are already seeded will maintain the seed. In this case, - only integer seeds are allowed (An exception is raised when a RandomState was used as - seed). - - Parameters - ---------- - model : sklearn model - The model to be seeded - seed : int - The seed to initialize the RandomState with. Unseeded subcomponents - will be seeded with a random number from the RandomState. - - Returns - ------- - Any - """ - - def _seed_current_object(current_value): - if isinstance(current_value, int): # acceptable behaviour - return False - - if isinstance(current_value, np.random.RandomState): - raise ValueError( - "Models initialized with a RandomState object are not " - "supported. Please seed with an integer. 
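[Editor's note: a behaviour sketch for the recursive hyperparameter lookup above.]

grid = {"n_jobs": [2, 4], "estimator__n_jobs": [1]}
SklearnExtension._get_parameter_values_recursive(grid, "n_jobs")
# -> [[2, 4], [1]]; wrapping such a grid in GridSearchCV and running it
# would make _prevent_optimize_n_jobs raise PyOpenMLError.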
", - ) - - if current_value is not None: - raise ValueError( - "Models should be seeded with int or None (this should never happen). ", - ) - - return True - - rs = np.random.RandomState(seed) - model_params = model.get_params() - random_states = {} - for param_name in sorted(model_params): - if "random_state" in param_name: - current_value = model_params[param_name] - # important to draw the value at this point (and not in the if - # statement) this way we guarantee that if a different set of - # subflows is seeded, the same number of the random generator is - # used - new_value = rs.randint(0, 2**16) - if _seed_current_object(current_value): - random_states[param_name] = new_value - - # Also seed CV objects! - elif isinstance(model_params[param_name], sklearn.model_selection.BaseCrossValidator): - if not hasattr(model_params[param_name], "random_state"): - continue - - current_value = model_params[param_name].random_state - new_value = rs.randint(0, 2**16) - if _seed_current_object(current_value): - model_params[param_name].random_state = new_value - - model.set_params(**random_states) - return model - - def check_if_model_fitted(self, model: Any) -> bool: - """Returns True/False denoting if the model has already been fitted/trained - - Parameters - ---------- - model : Any - - Returns - ------- - bool - """ - from sklearn.exceptions import NotFittedError - from sklearn.utils.validation import check_is_fitted - - try: - # check if model is fitted - check_is_fitted(model) - - # Creating random dummy data of arbitrary size - dummy_data = np.random.uniform(size=(10, 3)) # noqa: NPY002 - # Using 'predict' instead of 'sklearn.utils.validation.check_is_fitted' for a more - # robust check that works across sklearn versions and models. Internally, 'predict' - # should call 'check_is_fitted' for every concerned attribute, thus offering a more - # assured check than explicit calls to 'check_is_fitted' - model.predict(dummy_data) - # Will reach here if the model was fit on a dataset with 3 features - return True - except NotFittedError: # needs to be the first exception to be caught - # Model is not fitted, as is required - return False - except ValueError: - # Will reach here if the model was fit on a dataset with more or less than 3 features - return True - - def _run_model_on_fold( # noqa: PLR0915, PLR0913, C901, PLR0912 - self, - model: Any, - task: OpenMLTask, - X_train: np.ndarray | scipy.sparse.spmatrix | pd.DataFrame, - rep_no: int, - fold_no: int, - y_train: np.ndarray | None = None, - X_test: np.ndarray | scipy.sparse.spmatrix | pd.DataFrame | None = None, - ) -> tuple[ - np.ndarray, - pd.DataFrame | None, - OrderedDict[str, float], - OpenMLRunTrace | None, - ]: - """Run a model on a repeat,fold,subsample triplet of the task and return prediction - information. - - Furthermore, it will measure run time measures in case multi-core behaviour allows this. - * exact user cpu time will be measured if the number of cores is set (recursive throughout - the model) exactly to 1 - * wall clock time will be measured if the number of cores is set (recursive throughout the - model) to any given number (but not when it is set to -1) - - Returns the data that is necessary to construct the OpenML Run object. Is used by - run_task_get_arff_content. Do not use this function unless you know what you are doing. - - Parameters - ---------- - model : Any - The UNTRAINED model to run. The model instance will be copied and not altered. - task : OpenMLTask - The task to run the model on. 
-        X_train : array-like
-            Training data for the given repetition and fold.
-        rep_no : int
-            The repeat of the experiment (0-based; in case of 1 time CV, always 0)
-        fold_no : int
-            The fold number of the experiment (0-based; in case of holdout, always 0)
-        y_train : Optional[np.ndarray] (default=None)
-            Target attributes for supervised tasks. In case of classification, these are integer
-            indices to the potential classes specified by the dataset.
-        X_test : array-like, optional (default=None)
-            Test attributes to test for generalization in supervised tasks.
-
-        Returns
-        -------
-        pred_y : np.ndarray
-            Predictions on the training/test set, depending on the task type.
-            For supervised tasks, predictions are on the test set.
-            For unsupervised tasks, predictions are on the training set.
-        proba_y : pd.DataFrame, optional
-            Predicted probabilities for the test set.
-            None, if task is not Classification or Learning Curve prediction.
-        user_defined_measures : OrderedDict[str, float]
-            User defined measures that were generated on this fold
-        trace : OpenMLRunTrace, optional
-            ARFF trace object from a fitted model and the trace content obtained by
-            repeatedly calling ``run_model_on_task``
-        """
-
-        def _prediction_to_probabilities(
-            y: np.ndarray | list,
-            model_classes: list[Any],
-            class_labels: list[str] | None,
-        ) -> pd.DataFrame:
-            """Transforms predicted probabilities to match with OpenML class indices.
-
-            Parameters
-            ----------
-            y : np.ndarray
-                Predicted probabilities (possibly omitting classes if they were not present in the
-                training data).
-            model_classes : list
-                List of classes known/predicted by the model, ordered by their index.
-            class_labels : list
-                List of classes as stored in the task object fetched from the server.
-
-            Returns
-            -------
-            pd.DataFrame
-            """
-            if class_labels is None:
-                raise ValueError("The task has no class labels")
-
-            if isinstance(y_train, np.ndarray) and isinstance(class_labels[0], str):
-                # mapping (decoding) the predictions to the categories
-                # creating a separate copy to not change the expected pred_y type
-                y = [class_labels[pred] for pred in y]  # list or numpy array of predictions
-
-            # model_classes: sklearn classifier mapping from original array id to
-            # prediction index id
-            if not isinstance(model_classes, list):
-                raise ValueError("please convert model classes to list prior to calling this fn")
-
-            # DataFrame allows more accurate mapping of classes as column names
-            result = pd.DataFrame(
-                0,
-                index=np.arange(len(y)),
-                columns=model_classes,
-                dtype=np.float32,
-            )
-            for obs, prediction in enumerate(y):
-                result.loc[obs, prediction] = 1.0
-            return result
-
-        if isinstance(task, OpenMLSupervisedTask):
-            if y_train is None:
-                raise TypeError("argument y_train must not be of type None")
-            if X_test is None:
-                raise TypeError("argument X_test must not be of type None")
-
-        model_copy = sklearn.base.clone(model, safe=True)
-        # sanity check: prohibit users from optimizing n_jobs
-        self._prevent_optimize_n_jobs(model_copy)
-        # measures and stores runtimes
-        user_defined_measures = OrderedDict()  # type: 'OrderedDict[str, float]'
-        try:
-            # for measuring runtime. 
Only available since Python 3.3 - modelfit_start_cputime = time.process_time() - modelfit_start_walltime = time.time() - - if isinstance(task, OpenMLSupervisedTask): - model_copy.fit(X_train, y_train) # type: ignore - elif isinstance(task, OpenMLClusteringTask): - model_copy.fit(X_train) # type: ignore - - modelfit_dur_cputime = (time.process_time() - modelfit_start_cputime) * 1000 - modelfit_dur_walltime = (time.time() - modelfit_start_walltime) * 1000 - - user_defined_measures["usercpu_time_millis_training"] = modelfit_dur_cputime - refit_time = model_copy.refit_time_ * 1000 if hasattr(model_copy, "refit_time_") else 0 # type: ignore - user_defined_measures["wall_clock_time_millis_training"] = modelfit_dur_walltime - - except AttributeError as e: - # typically happens when training a regressor on classification task - raise PyOpenMLError(str(e)) from e - - if isinstance(task, (OpenMLClassificationTask, OpenMLLearningCurveTask)): - # search for model classes_ (might differ depending on modeltype) - # first, pipelines are a special case (these don't have a classes_ - # object, but rather borrows it from the last step. We do this manually, - # because of the BaseSearch check) - if isinstance(model_copy, sklearn.pipeline.Pipeline): - used_estimator = model_copy.steps[-1][-1] - else: - used_estimator = model_copy - - if self._is_hpo_class(used_estimator): - model_classes = used_estimator.best_estimator_.classes_ - else: - model_classes = used_estimator.classes_ - - if not isinstance(model_classes, list): - model_classes = model_classes.tolist() - - # to handle the case when dataset is numpy and categories are encoded - # however the class labels stored in task are still categories - if isinstance(y_train, np.ndarray) and isinstance( - cast("List", task.class_labels)[0], - str, - ): - model_classes = [cast("List[str]", task.class_labels)[i] for i in model_classes] - - modelpredict_start_cputime = time.process_time() - modelpredict_start_walltime = time.time() - - # In supervised learning this returns the predictions for Y, in clustering - # it returns the clusters - if isinstance(task, OpenMLSupervisedTask): - pred_y = model_copy.predict(X_test) - elif isinstance(task, OpenMLClusteringTask): - pred_y = model_copy.predict(X_train) - else: - raise ValueError(task) - - modelpredict_duration_cputime = (time.process_time() - modelpredict_start_cputime) * 1000 - user_defined_measures["usercpu_time_millis_testing"] = modelpredict_duration_cputime - user_defined_measures["usercpu_time_millis"] = ( - modelfit_dur_cputime + modelpredict_duration_cputime - ) - modelpredict_duration_walltime = (time.time() - modelpredict_start_walltime) * 1000 - user_defined_measures["wall_clock_time_millis_testing"] = modelpredict_duration_walltime - user_defined_measures["wall_clock_time_millis"] = ( - modelfit_dur_walltime + modelpredict_duration_walltime + refit_time - ) - - if isinstance(task, (OpenMLClassificationTask, OpenMLLearningCurveTask)): - try: - proba_y = model_copy.predict_proba(X_test) - proba_y = pd.DataFrame(proba_y, columns=model_classes) # handles X_test as numpy - except AttributeError: # predict_proba is not available when probability=False - proba_y = _prediction_to_probabilities(pred_y, model_classes, task.class_labels) - - if task.class_labels is not None: - if proba_y.shape[1] != len(task.class_labels): - # Remap the probabilities in case there was a class missing - # at training time. By default, the classification targets - # are mapped to be zero-based indices to the actual classes. 
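# A standalone sketch (illustrative class names) of the probability handling
# above: the ``predict_proba`` fallback one-hot encodes hard predictions, a
# zero column is added for any task class the estimator never saw, and the
# columns are reordered to match the task's class labels.
import numpy as np
import pandas as pd

task_labels = ["cat", "dog", "fish", "horse"]
model_classes = ["cat", "dog", "fish"]  # "horse" was missing at training time
predictions = ["cat", "dog", "cat"]

proba_y = pd.DataFrame(0.0, index=np.arange(len(predictions)), columns=model_classes)
for obs, prediction in enumerate(predictions):
    proba_y.loc[obs, prediction] = 1.0  # one-hot fallback

for col in task_labels:  # pad classes that were missing during training
    if col not in proba_y.columns:
        proba_y[col] = 0
proba_y = proba_y[task_labels]  # restore the task's label order
assert list(proba_y.columns) == task_labels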
- # Therefore, the model_classes contain the correct indices to - # the correct probability array. Example: - # classes in the dataset: 0, 1, 2, 3, 4, 5 - # classes in the training set: 0, 1, 2, 4, 5 - # then we need to add a column full of zeros into the probabilities - # for class 3 because the rest of the library expects that the - # probabilities are ordered the same way as the classes are ordered). - message = ( - f"Estimator only predicted for {proba_y.shape[1]}/{len(task.class_labels)}" - " classes!" - ) - warnings.warn(message, stacklevel=2) - openml.config.logger.warning(message) - - for _i, col in enumerate(task.class_labels): - # adding missing columns with 0 probability - if col not in model_classes: - proba_y[col] = 0 - # We re-order the columns to move possibly added missing columns into place. - proba_y = proba_y[task.class_labels] - else: - raise ValueError("The task has no class labels") - - if not np.all(set(proba_y.columns) == set(task.class_labels)): - missing_cols = list(set(task.class_labels) - set(proba_y.columns)) - raise ValueError("Predicted probabilities missing for the columns: ", missing_cols) - - elif isinstance(task, (OpenMLRegressionTask, OpenMLClusteringTask)): - proba_y = None - else: - raise TypeError(type(task)) - - if self._is_hpo_class(model_copy): - trace_data = self._extract_trace_data(model_copy, rep_no, fold_no) - trace: OpenMLRunTrace | None = self._obtain_arff_trace( - model_copy, - trace_data, - ) - else: - trace = None - - return pred_y, proba_y, user_defined_measures, trace - - def obtain_parameter_values( # noqa: C901, PLR0915 - self, - flow: OpenMLFlow, - model: Any = None, - ) -> list[dict[str, Any]]: - """Extracts all parameter settings required for the flow from the model. - - If no explicit model is provided, the parameters will be extracted from `flow.model` - instead. - - Parameters - ---------- - flow : OpenMLFlow - OpenMLFlow object (containing flow ids, i.e., it has to be downloaded from the server) - - model: Any, optional (default=None) - The model from which to obtain the parameter values. Must match the flow signature. - If None, use the model specified in ``OpenMLFlow.model``. - - Returns - ------- - list - A list of dicts, where each dict has the following entries: - - ``oml:name`` : str: The OpenML parameter name - - ``oml:value`` : mixed: A representation of the parameter value - - ``oml:component`` : int: flow id to which the parameter belongs - """ - openml.flows.functions._check_flow_for_server_id(flow) - - def get_flow_dict(_flow): - flow_map = {_flow.name: _flow.flow_id} - for subflow in _flow.components: - flow_map.update(get_flow_dict(_flow.components[subflow])) - return flow_map - - def extract_parameters( # noqa: PLR0915, PLR0912, C901 - _flow, - _flow_dict, - component_model, - _main_call=False, # noqa: FBT002 - main_id=None, - ): - def is_subcomponent_specification(values): - # checks whether the current value can be a specification of - # subcomponents, as for example the value for steps parameter - # (in Pipeline) or transformers parameter (in - # ColumnTransformer). - return ( - # Specification requires list/tuple of list/tuple with - # at least length 2. 
- isinstance(values, (tuple, list)) - and all(isinstance(item, (tuple, list)) and len(item) > 1 for item in values) - # And each component needs to be a flow or interpretable string - and all( - isinstance(item[1], openml.flows.OpenMLFlow) - or ( - isinstance(item[1], str) - and item[1] in SKLEARN_PIPELINE_STRING_COMPONENTS - ) - for item in values - ) - ) - - # _flow is openml flow object, _param dict maps from flow name to flow - # id for the main call, the param dict can be overridden (useful for - # unit tests / sentinels) this way, for flows without subflows we do - # not have to rely on _flow_dict - exp_parameters = set(_flow.parameters) - if ( - isinstance(component_model, str) - and component_model in SKLEARN_PIPELINE_STRING_COMPONENTS - ): - model_parameters = set() - else: - model_parameters = set(component_model.get_params(deep=False)) - if len(exp_parameters.symmetric_difference(model_parameters)) != 0: - flow_params = sorted(exp_parameters) - model_params = sorted(model_parameters) - raise ValueError( - "Parameters of the model do not match the " - "parameters expected by the " - "flow:\nexpected flow parameters: " - f"{flow_params}\nmodel parameters: {model_params}", - ) - exp_components = set(_flow.components) - if ( - isinstance(component_model, str) - and component_model in SKLEARN_PIPELINE_STRING_COMPONENTS - ): - model_components = set() - else: - _ = set(component_model.get_params(deep=False)) - model_components = { - mp - for mp in component_model.get_params(deep=True) - if "__" not in mp and mp not in _ - } - if len(exp_components.symmetric_difference(model_components)) != 0: - is_problem = True - if len(exp_components - model_components) > 0: - # If an expected component is not returned as a component by get_params(), - # this means that it is also a parameter -> we need to check that this is - # actually the case - difference = exp_components - model_components - component_in_model_parameters = [] - for component in difference: - if component in model_parameters: - component_in_model_parameters.append(True) - else: - component_in_model_parameters.append(False) - is_problem = not all(component_in_model_parameters) - if is_problem: - flow_components = sorted(exp_components) - model_components = sorted(model_components) - raise ValueError( - "Subcomponents of the model do not match the " - "parameters expected by the " - "flow:\nexpected flow subcomponents: " - f"{flow_components}\nmodel subcomponents: {model_components}", - ) - - _params = [] - for _param_name in _flow.parameters: - _current = OrderedDict() - _current["oml:name"] = _param_name - - current_param_values = self.model_to_flow(component_model.get_params()[_param_name]) - - # Try to filter out components (a.k.a. subflows) which are - # handled further down in the code (by recursively calling - # this function)! - if isinstance(current_param_values, openml.flows.OpenMLFlow): - continue - - if is_subcomponent_specification(current_param_values): - # complex parameter value, with subcomponents - parsed_values = [] - for subcomponent in current_param_values: - # scikit-learn stores usually tuples in the form - # (name (str), subcomponent (mixed), argument - # (mixed)). OpenML replaces the subcomponent by an - # OpenMLFlow object. - if len(subcomponent) < 2 or len(subcomponent) > 3: - raise ValueError("Component reference should be size {2,3}. 
") - - subcomponent_identifier = subcomponent[0] - subcomponent_flow = subcomponent[1] - if not isinstance(subcomponent_identifier, str): - raise TypeError( - "Subcomponent identifier should be of type string, " - f"but is {type(subcomponent_identifier)}", - ) - if not isinstance(subcomponent_flow, (openml.flows.OpenMLFlow, str)): - if ( - isinstance(subcomponent_flow, str) - and subcomponent_flow in SKLEARN_PIPELINE_STRING_COMPONENTS - ): - pass - else: - raise TypeError( - "Subcomponent flow should be of type flow, but is" - f" {type(subcomponent_flow)}", - ) - - current = { - "oml-python:serialized_object": COMPONENT_REFERENCE, - "value": { - "key": subcomponent_identifier, - "step_name": subcomponent_identifier, - }, - } - if len(subcomponent) == 3: - if not isinstance(subcomponent[2], list) and not isinstance( - subcomponent[2], - OrderedDict, - ): - raise TypeError( - "Subcomponent argument should be list or OrderedDict", - ) - current["value"]["argument_1"] = subcomponent[2] - parsed_values.append(current) - parsed_values = json.dumps(parsed_values) - else: - # vanilla parameter value - parsed_values = json.dumps(current_param_values) - - _current["oml:value"] = parsed_values - if _main_call: - _current["oml:component"] = main_id - else: - _current["oml:component"] = _flow_dict[_flow.name] - _params.append(_current) - - for _identifier in _flow.components: - subcomponent_model = component_model.get_params()[_identifier] - _params.extend( - extract_parameters( - _flow.components[_identifier], - _flow_dict, - subcomponent_model, - ), - ) - return _params - - flow_dict = get_flow_dict(flow) - model = model if model is not None else flow.model - return extract_parameters(flow, flow_dict, model, _main_call=True, main_id=flow.flow_id) - - def _openml_param_name_to_sklearn( - self, - openml_parameter: openml.setups.OpenMLParameter, - flow: OpenMLFlow, - ) -> str: - """ - Converts the name of an OpenMLParameter into the sklean name, given a flow. - - Parameters - ---------- - openml_parameter: OpenMLParameter - The parameter under consideration - - flow: OpenMLFlow - The flow that provides context. - - Returns - ------- - sklearn_parameter_name: str - The name the parameter will have once used in scikit-learn - """ - if not isinstance(openml_parameter, openml.setups.OpenMLParameter): - raise ValueError("openml_parameter should be an instance of OpenMLParameter") - if not isinstance(flow, OpenMLFlow): - raise ValueError("flow should be an instance of OpenMLFlow") - - flow_structure = flow.get_structure("name") - if openml_parameter.flow_name not in flow_structure: - raise ValueError("Obtained OpenMLParameter and OpenMLFlow do not correspond. ") - name = openml_parameter.flow_name # for PEP8 - return "__".join(flow_structure[name] + [openml_parameter.parameter_name]) - - ################################################################################################ - # Methods for hyperparameter optimization - - def _is_hpo_class(self, model: Any) -> bool: - """Check whether the model performs hyperparameter optimization. - - Used to check whether an optimization trace can be extracted from the model after - running it. - - Parameters - ---------- - model : Any - - Returns - ------- - bool - """ - return isinstance(model, sklearn.model_selection._search.BaseSearchCV) - - def instantiate_model_from_hpo_class( - self, - model: Any, - trace_iteration: OpenMLTraceIteration, - ) -> Any: - """Instantiate a ``base_estimator`` which can be searched over by the hyperparameter - optimization model. 
- - Parameters - ---------- - model : Any - A hyperparameter optimization model which defines the model to be instantiated. - trace_iteration : OpenMLTraceIteration - Describing the hyperparameter settings to instantiate. - - Returns - ------- - Any - """ - if not self._is_hpo_class(model): - raise AssertionError( - f"Flow model {model} is not an instance of" - " sklearn.model_selection._search.BaseSearchCV", - ) - base_estimator = model.estimator - base_estimator.set_params(**trace_iteration.get_parameters()) - return base_estimator - - def _extract_trace_data(self, model, rep_no, fold_no): - """Extracts data from a machine learning model's cross-validation results - and creates an ARFF (Attribute-Relation File Format) trace. - - Parameters - ---------- - model : Any - A fitted hyperparameter optimization model. - rep_no : int - The repetition number. - fold_no : int - The fold number. - - Returns - ------- - A list of ARFF tracecontent. - """ - arff_tracecontent = [] - for itt_no in range(len(model.cv_results_["mean_test_score"])): - # we use the string values for True and False, as it is defined in - # this way by the OpenML server - selected = "false" - if itt_no == model.best_index_: - selected = "true" - test_score = model.cv_results_["mean_test_score"][itt_no] - arff_line = [rep_no, fold_no, itt_no, test_score, selected] - for key in model.cv_results_: - if key.startswith("param_"): - value = model.cv_results_[key][itt_no] - # Built-in serializer does not convert all numpy types, - # these methods convert them to built-in types instead. - if isinstance(value, np.generic): - # For scalars it actually returns scalars, not a list - value = value.tolist() - serialized_value = json.dumps(value) if value is not np.ma.masked else np.nan - arff_line.append(serialized_value) - arff_tracecontent.append(arff_line) - return arff_tracecontent - - def _obtain_arff_trace( - self, - model: Any, - trace_content: list, - ) -> OpenMLRunTrace: - """Create arff trace object from a fitted model and the trace content obtained by - repeatedly calling ``run_model_on_task``. - - Parameters - ---------- - model : Any - A fitted hyperparameter optimization model. - - trace_content : List[List] - Trace content obtained by ``openml.runs.run_flow_on_task``. 
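# A hedged sketch of the ``cv_results_`` traversal performed by
# ``_extract_trace_data`` above (assumes scikit-learn is installed): every
# evaluated candidate becomes one trace line holding the repeat, fold,
# iteration, evaluation score, and whether it was the selected candidate.
from sklearn.datasets import load_iris
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier

X, y = load_iris(return_X_y=True)
search = GridSearchCV(DecisionTreeClassifier(), {"max_depth": [1, 2, 3]})
search.fit(X, y)
for itt_no, score in enumerate(search.cv_results_["mean_test_score"]):
    selected = "true" if itt_no == search.best_index_ else "false"
    print([0, 0, itt_no, score, selected])  # rep_no = fold_no = 0 here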
- - Returns - ------- - OpenMLRunTrace - """ - if not self._is_hpo_class(model): - raise AssertionError( - f"Flow model {model} is not an instance of " - "sklearn.model_selection._search.BaseSearchCV", - ) - if not hasattr(model, "cv_results_"): - raise ValueError("model should contain `cv_results_`") - - # attributes that will be in trace arff, regardless of the model - trace_attributes = [ - ("repeat", "NUMERIC"), - ("fold", "NUMERIC"), - ("iteration", "NUMERIC"), - ("evaluation", "NUMERIC"), - ("selected", ["true", "false"]), - ] - - # model dependent attributes for trace arff - for key in model.cv_results_: - if key.startswith("param_"): - # supported types should include all types, including bool, - # int float - supported_basic_types = (bool, int, float, str) - for param_value in model.cv_results_[key]: - if isinstance(param_value, np.generic): - param_value = param_value.tolist() # noqa: PLW2901 - if ( - isinstance(param_value, supported_basic_types) - or param_value is None - or param_value is np.ma.masked - ): - # basic string values - type = "STRING" # noqa: A001 - elif isinstance(param_value, (list, tuple)) and all( - isinstance(i, int) for i in param_value - ): - # list of integers (usually for selecting features) - # hyperparameter layer_sizes of MLPClassifier - type = "STRING" # noqa: A001 - else: - raise TypeError(f"Unsupported param type in param grid: {key}") - - # renamed the attribute param to parameter, as this is a required - # OpenML convention - this also guards against name collisions - # with the required trace attributes - attribute = (PREFIX + key[6:], type) # type: ignore - trace_attributes.append(attribute) - - return OpenMLRunTrace.generate( - trace_attributes, - trace_content, - ) From 9d93485b78df7c9342349bfe06ba2bc20ce7f91f Mon Sep 17 00:00:00 2001 From: PGijsbers Date: Thu, 19 Jun 2025 12:27:21 +0200 Subject: [PATCH 2/8] Remove scikit-learn extension submodule It will now be hosted in a separate repository --- .../test_sklearn_extension/__init__.py | 0 .../test_sklearn_extension.py | 2422 ----------------- 2 files changed, 2422 deletions(-) delete mode 100644 tests/test_extensions/test_sklearn_extension/__init__.py delete mode 100644 tests/test_extensions/test_sklearn_extension/test_sklearn_extension.py diff --git a/tests/test_extensions/test_sklearn_extension/__init__.py b/tests/test_extensions/test_sklearn_extension/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/tests/test_extensions/test_sklearn_extension/test_sklearn_extension.py b/tests/test_extensions/test_sklearn_extension/test_sklearn_extension.py deleted file mode 100644 index 9913436e4..000000000 --- a/tests/test_extensions/test_sklearn_extension/test_sklearn_extension.py +++ /dev/null @@ -1,2422 +0,0 @@ -# License: BSD 3-Clause -from __future__ import annotations - -import collections -import json -import os -import re -import sys -import unittest -import warnings -from collections import OrderedDict -from packaging.version import Version -from typing import Any -from unittest import mock - -import numpy as np -import pandas as pd -import pytest -import scipy.optimize -import scipy.stats -import sklearn.base -import sklearn.cluster -import sklearn.datasets -import sklearn.decomposition -import sklearn.dummy -import sklearn.ensemble -import sklearn.feature_selection -import sklearn.gaussian_process -import sklearn.linear_model -import sklearn.model_selection -import sklearn.naive_bayes -import sklearn.neural_network -import sklearn.pipeline -import 
sklearn.preprocessing -import sklearn.tree -from packaging import version -from sklearn.pipeline import make_pipeline -from sklearn.preprocessing import OneHotEncoder, StandardScaler - -import openml -from openml.exceptions import PyOpenMLError -from openml.extensions.sklearn import SklearnExtension, cat, cont -from openml.flows import OpenMLFlow -from openml.flows.functions import assert_flows_equal -from openml.runs.trace import OpenMLRunTrace -from openml.testing import CustomImputer, SimpleImputer, TestBase - -this_directory = os.path.dirname(os.path.abspath(__file__)) -sys.path.append(this_directory) - - -__version__ = 0.1 - - -class Model(sklearn.base.BaseEstimator): - def __init__(self, boolean, integer, floating_point_value): - self.boolean = boolean - self.integer = integer - self.floating_point_value = floating_point_value - - def fit(self, X, y): - pass - - -def _cat_col_selector(X): - return X.select_dtypes(include=["object", "category"]).columns - - -def _get_sklearn_preprocessing(): - from sklearn.compose import ColumnTransformer - - return [ - ( - "cat_handling", - ColumnTransformer( - transformers=[ - ( - "cat", - sklearn.pipeline.Pipeline( - [ - ( - "cat_si", - SimpleImputer( - strategy="constant", - fill_value="missing", - ), - ), - ("cat_ohe", OneHotEncoder(handle_unknown="ignore")), - ], - ), - _cat_col_selector, - ) - ], - remainder="passthrough", - ), - ), - ("imp", SimpleImputer()), - ] - - -class TestSklearnExtensionFlowFunctions(TestBase): - # Splitting not helpful, these test's don't rely on the server and take less - # than 1 seconds - - def setUp(self): - super().setUp(n_levels=2) - iris = sklearn.datasets.load_iris() - self.X = iris.data - self.y = iris.target - - self.extension = SklearnExtension() - - def _get_expected_pipeline_description(self, model: Any) -> str: - if version.parse(sklearn.__version__) >= version.parse("1.0"): - expected_fixture = ( - "Pipeline of transforms with a final estimator.\n\nSequentially" - " apply a list of transforms and a final estimator.\n" - "Intermediate steps of the pipeline must be 'transforms', that " - "is, they\nmust implement `fit` and `transform` methods.\nThe final " - "estimator only needs to implement `fit`.\nThe transformers in " - "the pipeline can be cached using ``memory`` argument.\n\nThe " - "purpose of the pipeline is to assemble several steps that can " - "be\ncross-validated together while setting different parameters" - ". For this, it\nenables setting parameters of the various steps" - " using their names and the\nparameter name separated by a `'__'`," - " as in the example below. A step's\nestimator may be replaced " - "entirely by setting the parameter with its name\nto another " - "estimator, or a transformer removed by setting it to\n" - "`'passthrough'` or `None`." 
- ) - elif version.parse(sklearn.__version__) >= version.parse("0.21.0"): - expected_fixture = ( - "Pipeline of transforms with a final estimator.\n\nSequentially" - " apply a list of transforms and a final estimator.\n" - "Intermediate steps of the pipeline must be 'transforms', that " - "is, they\nmust implement fit and transform methods.\nThe final " - "estimator only needs to implement fit.\nThe transformers in " - "the pipeline can be cached using ``memory`` argument.\n\nThe " - "purpose of the pipeline is to assemble several steps that can " - "be\ncross-validated together while setting different parameters" - ".\nFor this, it enables setting parameters of the various steps" - " using their\nnames and the parameter name separated by a '__'," - " as in the example below.\nA step's estimator may be replaced " - "entirely by setting the parameter\nwith its name to another " - "estimator, or a transformer removed by setting\nit to " - "'passthrough' or ``None``." - ) - else: - expected_fixture = self.extension._get_sklearn_description(model) - return expected_fixture - - def _serialization_test_helper( - self, - model, - X, - y, - subcomponent_parameters, - dependencies_mock_call_count=(1, 2), - ): - # Regex pattern for memory addresses of style 0x7f8e0f31ecf8 - pattern = re.compile("0x[0-9a-f]{12}") - - with mock.patch.object(self.extension, "_check_dependencies") as check_dependencies_mock: - serialization = self.extension.model_to_flow(model) - - if X is not None: - model.fit(X, y) - - new_model = self.extension.flow_to_model(serialization) - # compares string representations of the dict, as it potentially - # contains complex objects that can not be compared with == op - assert re.sub(pattern, str(model.get_params()), "") == re.sub( - pattern, str(new_model.get_params()), "" - ) - - assert type(new_model) == type(model) - assert new_model is not model - - if X is not None: - new_model.fit(self.X, self.y) - - assert check_dependencies_mock.call_count == dependencies_mock_call_count[0] - - xml = serialization._to_dict() - new_model2 = self.extension.flow_to_model(OpenMLFlow._from_dict(xml)) - assert re.sub(pattern, str(model.get_params()), "") == re.sub( - pattern, str(new_model2.get_params()), "" - ) - - assert type(new_model2) == type(model) - assert new_model2 is not model - - if X is not None: - new_model2.fit(self.X, self.y) - - assert check_dependencies_mock.call_count == dependencies_mock_call_count[1] - - if subcomponent_parameters: - for nm in (new_model, new_model2): - new_model_params = nm.get_params() - model_params = model.get_params() - for subcomponent_parameter in subcomponent_parameters: - assert type(new_model_params[subcomponent_parameter]) == type( - model_params[subcomponent_parameter] - ) - assert ( - new_model_params[subcomponent_parameter] - is not model_params[subcomponent_parameter] - ) - del new_model_params[subcomponent_parameter] - del model_params[subcomponent_parameter] - assert new_model_params == model_params - - return serialization, new_model - - @pytest.mark.sklearn() - def test_serialize_model(self): - max_features = "auto" if Version(sklearn.__version__) < Version("1.3") else "sqrt" - model = sklearn.tree.DecisionTreeClassifier( - criterion="entropy", - max_features=max_features, - max_leaf_nodes=2000, - ) - - tree_name = "tree" if Version(sklearn.__version__) < Version("0.22") else "_classes" - fixture_name = f"sklearn.tree.{tree_name}.DecisionTreeClassifier" - fixture_short_name = "sklearn.DecisionTreeClassifier" - # str obtained from 
self.extension._get_sklearn_description(model) - fixture_description = "A decision tree classifier." - version_fixture = self.extension._min_dependency_str(sklearn.__version__) - - presort_val = "false" if Version(sklearn.__version__) < Version("0.22") else '"deprecated"' - # min_impurity_decrease has been introduced in 0.20 - # min_impurity_split has been deprecated in 0.20 - if Version(sklearn.__version__) < Version("0.19"): - fixture_parameters = OrderedDict( - ( - ("class_weight", "null"), - ("criterion", '"entropy"'), - ("max_depth", "null"), - ("max_features", '"auto"'), - ("max_leaf_nodes", "2000"), - ("min_impurity_split", "1e-07"), - ("min_samples_leaf", "1"), - ("min_samples_split", "2"), - ("min_weight_fraction_leaf", "0.0"), - ("presort", "false"), - ("random_state", "null"), - ("splitter", '"best"'), - ), - ) - elif Version(sklearn.__version__) < Version("1.0"): - fixture_parameters = OrderedDict( - ( - ("class_weight", "null"), - ("criterion", '"entropy"'), - ("max_depth", "null"), - ("max_features", '"auto"'), - ("max_leaf_nodes", "2000"), - ("min_impurity_decrease", "0.0"), - ("min_impurity_split", "null"), - ("min_samples_leaf", "1"), - ("min_samples_split", "2"), - ("min_weight_fraction_leaf", "0.0"), - ("presort", presort_val), - ("random_state", "null"), - ("splitter", '"best"'), - ), - ) - elif Version(sklearn.__version__) < Version("1.4"): - fixture_parameters = OrderedDict( - ( - ("class_weight", "null"), - ("criterion", '"entropy"'), - ("max_depth", "null"), - ("max_features", f'"{max_features}"'), - ("max_leaf_nodes", "2000"), - ("min_impurity_decrease", "0.0"), - ("min_samples_leaf", "1"), - ("min_samples_split", "2"), - ("min_weight_fraction_leaf", "0.0"), - ("presort", presort_val), - ("random_state", "null"), - ("splitter", '"best"'), - ), - ) - else: - fixture_parameters = OrderedDict( - ( - ("class_weight", "null"), - ("criterion", '"entropy"'), - ("max_depth", "null"), - ("max_features", f'"{max_features}"'), - ("max_leaf_nodes", "2000"), - ("min_impurity_decrease", "0.0"), - ("min_samples_leaf", "1"), - ("min_samples_split", "2"), - ("min_weight_fraction_leaf", "0.0"), - ("presort", presort_val), - ("monotonic_cst", "null"), - ("random_state", "null"), - ("splitter", '"best"'), - ), - ) - - if Version(sklearn.__version__) >= Version("0.22"): - fixture_parameters.update({"ccp_alpha": "0.0"}) - fixture_parameters.move_to_end("ccp_alpha", last=False) - if Version(sklearn.__version__) >= Version("0.24"): - del fixture_parameters["presort"] - - structure_fixture = {f"sklearn.tree.{tree_name}.DecisionTreeClassifier": []} - - serialization, _ = self._serialization_test_helper( - model, - X=self.X, - y=self.y, - subcomponent_parameters=None, - ) - structure = serialization.get_structure("name") - - assert serialization.name == fixture_name - assert serialization.class_name == fixture_name - assert serialization.custom_name == fixture_short_name - assert serialization.description == fixture_description - assert serialization.parameters == fixture_parameters - assert serialization.dependencies == version_fixture - self.assertDictEqual(structure, structure_fixture) - - @pytest.mark.sklearn() - @pytest.mark.production() - def test_can_handle_flow(self): - openml.config.server = self.production_server - - R_flow = openml.flows.get_flow(6794) - assert not self.extension.can_handle_flow(R_flow) - old_3rd_party_flow = openml.flows.get_flow(7660) - assert self.extension.can_handle_flow(old_3rd_party_flow) - - openml.config.server = self.test_server - - 
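# A minimal round-trip sketch of the serialization these tests exercise
# (assumes a scikit-learn environment with this extension installed): a model
# becomes an OpenMLFlow and back, yielding an equivalent but distinct object.
import sklearn.tree

from openml.extensions.sklearn import SklearnExtension

extension = SklearnExtension()
model = sklearn.tree.DecisionTreeClassifier(criterion="entropy")
flow = extension.model_to_flow(model)
restored = extension.flow_to_model(flow)
assert type(restored) is type(model)
assert restored is not model
assert restored.get_params() == model.get_params()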
@pytest.mark.sklearn() - def test_serialize_model_clustering(self): - model = sklearn.cluster.KMeans() - - sklearn_version = Version(sklearn.__version__) - cluster_name = "k_means_" if sklearn_version < Version("0.22") else "_kmeans" - fixture_name = f"sklearn.cluster.{cluster_name}.KMeans" - fixture_short_name = "sklearn.KMeans" - # str obtained from self.extension._get_sklearn_description(model) - fixture_description = "K-Means clustering{}".format( - "" if sklearn_version < Version("0.22") else ".", - ) - version_fixture = self.extension._min_dependency_str(sklearn.__version__) - - n_jobs_val = "1" - if sklearn_version >= Version("0.20"): - n_jobs_val = "null" - if sklearn_version >= Version("0.23"): - n_jobs_val = '"deprecated"' - - precomp_val = '"auto"' if sklearn_version < Version("0.23") else '"deprecated"' - n_init = "10" - if sklearn_version >= Version("1.2"): - n_init = '"warn"' - if sklearn_version >= Version("1.4"): - n_init = '"auto"' - - algorithm = '"auto"' if sklearn_version < Version("1.1") else '"lloyd"' - fixture_parameters = OrderedDict( - [ - ("algorithm", algorithm), - ("copy_x", "true"), - ("init", '"k-means++"'), - ("max_iter", "300"), - ("n_clusters", "8"), - ("n_init", n_init), - ("n_jobs", n_jobs_val), - ("precompute_distances", precomp_val), - ("random_state", "null"), - ("tol", "0.0001"), - ("verbose", "0"), - ] - ) - - if sklearn_version >= Version("1.0"): - fixture_parameters.pop("n_jobs") - fixture_parameters.pop("precompute_distances") - - fixture_structure = {f"sklearn.cluster.{cluster_name}.KMeans": []} - - serialization, _ = self._serialization_test_helper( - model, - X=None, - y=None, - subcomponent_parameters=None, - ) - structure = serialization.get_structure("name") - - assert serialization.name == fixture_name - assert serialization.class_name == fixture_name - assert serialization.custom_name == fixture_short_name - assert serialization.description == fixture_description - assert serialization.parameters == fixture_parameters - assert serialization.dependencies == version_fixture - assert structure == fixture_structure - - @pytest.mark.sklearn() - def test_serialize_model_with_subcomponent(self): - estimator_name = ( - "base_estimator" if Version(sklearn.__version__) < Version("1.4") else "estimator" - ) - estimator_param = {estimator_name: sklearn.tree.DecisionTreeClassifier()} - model = sklearn.ensemble.AdaBoostClassifier( - n_estimators=100, - **estimator_param, - ) - - weight_name = "{}weight_boosting".format( - "" if Version(sklearn.__version__) < Version("0.22") else "_", - ) - tree_name = "tree" if Version(sklearn.__version__) < Version("0.22") else "_classes" - fixture_name = ( - f"sklearn.ensemble.{weight_name}.AdaBoostClassifier" - f"({estimator_name}=sklearn.tree.{tree_name}.DecisionTreeClassifier)" - ) - fixture_class_name = f"sklearn.ensemble.{weight_name}.AdaBoostClassifier" - fixture_short_name = "sklearn.AdaBoostClassifier" - # str obtained from self.extension._get_sklearn_description(model) - fixture_description = ( - "An AdaBoost classifier.\n\nAn AdaBoost [1] classifier is a " - "meta-estimator that begins by fitting a\nclassifier on the original" - " dataset and then fits additional copies of the\nclassifier on the " - "same dataset but where the weights of incorrectly\nclassified " - "instances are adjusted such that subsequent classifiers focus\nmore" - " on difficult cases.\n\nThis class implements the algorithm known " - "as AdaBoost-SAMME [2]." 
- ) - fixture_subcomponent_name = f"sklearn.tree.{tree_name}.DecisionTreeClassifier" - fixture_subcomponent_class_name = f"sklearn.tree.{tree_name}.DecisionTreeClassifier" - # str obtained from self.extension._get_sklearn_description(model.base_estimator) - fixture_subcomponent_description = "A decision tree classifier." - fixture_structure = { - fixture_name: [], - f"sklearn.tree.{tree_name}.DecisionTreeClassifier": [estimator_name], - } - - serialization, _ = self._serialization_test_helper( - model, - X=self.X, - y=self.y, - subcomponent_parameters=[estimator_name], - dependencies_mock_call_count=(2, 4), - ) - structure = serialization.get_structure("name") - - assert serialization.name == fixture_name - assert serialization.class_name == fixture_class_name - assert serialization.custom_name == fixture_short_name - if Version(sklearn.__version__) < Version("1.4"): - assert serialization.description == fixture_description - assert serialization.parameters["algorithm"] == '"SAMME.R"' - assert isinstance(serialization.parameters[estimator_name], str) - assert serialization.parameters["learning_rate"] == "1.0" - assert serialization.parameters["n_estimators"] == "100" - assert serialization.components[estimator_name].name == fixture_subcomponent_name - assert ( - serialization.components[estimator_name].class_name == fixture_subcomponent_class_name - ) - assert ( - serialization.components[estimator_name].description == fixture_subcomponent_description - ) - self.assertDictEqual(structure, fixture_structure) - - @pytest.mark.sklearn() - def test_serialize_pipeline(self): - scaler = sklearn.preprocessing.StandardScaler(with_mean=False) - dummy = sklearn.dummy.DummyClassifier(strategy="prior") - model = sklearn.pipeline.Pipeline(steps=[("scaler", scaler), ("dummy", dummy)]) - - scaler_name = "data" if Version(sklearn.__version__) < Version("0.22") else "_data" - fixture_name = ( - "sklearn.pipeline.Pipeline(" - f"scaler=sklearn.preprocessing.{scaler_name}.StandardScaler," - "dummy=sklearn.dummy.DummyClassifier)" - ) - fixture_short_name = "sklearn.Pipeline(StandardScaler,DummyClassifier)" - fixture_description = self._get_expected_pipeline_description(model) - fixture_structure = { - fixture_name: [], - f"sklearn.preprocessing.{scaler_name}.StandardScaler": ["scaler"], - "sklearn.dummy.DummyClassifier": ["dummy"], - } - - serialization, new_model = self._serialization_test_helper( - model, - X=self.X, - y=self.y, - subcomponent_parameters=["scaler", "dummy", "steps"], - dependencies_mock_call_count=(3, 6), - ) - structure = serialization.get_structure("name") - - assert serialization.name == fixture_name - assert serialization.custom_name == fixture_short_name - if Version(sklearn.__version__) < Version("1.3"): - # Newer versions of scikit-learn have update docstrings - assert serialization.description == fixture_description - self.assertDictEqual(structure, fixture_structure) - - # Comparing the pipeline - # The parameters only have the name of base objects(not the whole flow) - # as value - # memory parameter has been added in 0.19, verbose in 0.21 - if Version(sklearn.__version__) < Version("0.19"): - assert len(serialization.parameters) == 1 - elif Version(sklearn.__version__) < Version("0.21"): - assert len(serialization.parameters) == 2 - else: - assert len(serialization.parameters) == 3 - - # Hard to compare two representations of a dict due to possibly - # different sorting. 
Making a json makes it easier - assert json.loads(serialization.parameters["steps"]) == [ - { - "oml-python:serialized_object": "component_reference", - "value": {"key": "scaler", "step_name": "scaler"}, - }, - { - "oml-python:serialized_object": "component_reference", - "value": {"key": "dummy", "step_name": "dummy"}, - }, - ] - - # Checking the sub-component - assert len(serialization.components) == 2 - assert isinstance(serialization.components["scaler"], OpenMLFlow) - assert isinstance(serialization.components["dummy"], OpenMLFlow) - - assert [step[0] for step in new_model.steps] == [step[0] for step in model.steps] - assert new_model.steps[0][1] is not model.steps[0][1] - assert new_model.steps[1][1] is not model.steps[1][1] - - @pytest.mark.sklearn() - def test_serialize_pipeline_clustering(self): - scaler = sklearn.preprocessing.StandardScaler(with_mean=False) - km = sklearn.cluster.KMeans() - model = sklearn.pipeline.Pipeline(steps=[("scaler", scaler), ("clusterer", km)]) - - scaler_name = "data" if Version(sklearn.__version__) < Version("0.22") else "_data" - cluster_name = "k_means_" if Version(sklearn.__version__) < Version("0.22") else "_kmeans" - fixture_name = ( - "sklearn.pipeline.Pipeline(" - f"scaler=sklearn.preprocessing.{scaler_name}.StandardScaler," - f"clusterer=sklearn.cluster.{cluster_name}.KMeans)" - ) - fixture_short_name = "sklearn.Pipeline(StandardScaler,KMeans)" - fixture_description = self._get_expected_pipeline_description(model) - fixture_structure = { - fixture_name: [], - f"sklearn.preprocessing.{scaler_name}.StandardScaler": ["scaler"], - f"sklearn.cluster.{cluster_name}.KMeans": ["clusterer"], - } - serialization, new_model = self._serialization_test_helper( - model, - X=None, - y=None, - subcomponent_parameters=["scaler", "steps", "clusterer"], - dependencies_mock_call_count=(3, 6), - ) - structure = serialization.get_structure("name") - - assert serialization.name == fixture_name - assert serialization.custom_name == fixture_short_name - if Version(sklearn.__version__) < Version("1.3"): - # Newer versions of scikit-learn have update docstrings - assert serialization.description == fixture_description - self.assertDictEqual(structure, fixture_structure) - - # Comparing the pipeline - # The parameters only have the name of base objects(not the whole flow) - # as value - # memory parameter has been added in 0.19 - if Version(sklearn.__version__) < Version("0.19"): - assert len(serialization.parameters) == 1 - elif Version(sklearn.__version__) < Version("0.21"): - assert len(serialization.parameters) == 2 - else: - assert len(serialization.parameters) == 3 - # Hard to compare two representations of a dict due to possibly - # different sorting. 
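# A small sketch of the JSON-based comparison used in the assertion below:
# parsing the serialized "steps" parameter yields plain lists and dicts that
# compare structurally, independent of key order or string formatting.
import json

steps_json = (
    '[{"oml-python:serialized_object": "component_reference",'
    ' "value": {"key": "scaler", "step_name": "scaler"}}]'
)
assert json.loads(steps_json)[0]["value"]["key"] == "scaler"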
Making a json makes it easier - assert json.loads(serialization.parameters["steps"]) == [ - { - "oml-python:serialized_object": "component_reference", - "value": {"key": "scaler", "step_name": "scaler"}, - }, - { - "oml-python:serialized_object": "component_reference", - "value": {"key": "clusterer", "step_name": "clusterer"}, - }, - ] - - # Checking the sub-component - assert len(serialization.components) == 2 - assert isinstance(serialization.components["scaler"], OpenMLFlow) - assert isinstance(serialization.components["clusterer"], OpenMLFlow) - - assert [step[0] for step in new_model.steps] == [step[0] for step in model.steps] - assert new_model.steps[0][1] is not model.steps[0][1] - assert new_model.steps[1][1] is not model.steps[1][1] - - @pytest.mark.sklearn() - @unittest.skipIf( - Version(sklearn.__version__) < Version("0.20"), - reason="columntransformer introduction in 0.20.0", - ) - def test_serialize_column_transformer(self): - # temporary local import, dependend on version 0.20 - import sklearn.compose - - model = sklearn.compose.ColumnTransformer( - transformers=[ - ("numeric", sklearn.preprocessing.StandardScaler(), [0, 1, 2]), - ( - "nominal", - sklearn.preprocessing.OneHotEncoder(handle_unknown="ignore"), - [3, 4, 5], - ), - ("drop", "drop", [6, 7, 8]), - ], - remainder="passthrough", - ) - - scaler_name = "data" if Version(sklearn.__version__) < Version("0.22") else "_data" - fixture = ( - "sklearn.compose._column_transformer.ColumnTransformer(" - f"numeric=sklearn.preprocessing.{scaler_name}.StandardScaler," - "nominal=sklearn.preprocessing._encoders.OneHotEncoder,drop=drop)" - ) - fixture_short_name = "sklearn.ColumnTransformer" - - if version.parse(sklearn.__version__) >= version.parse("0.21.0"): - # str obtained from self.extension._get_sklearn_description(model) - fixture_description = ( - "Applies transformers to columns of an array or pandas " - "DataFrame.\n\nThis estimator allows different columns or " - "column subsets of the input\nto be transformed separately and " - "the features generated by each transformer\nwill be " - "concatenated to form a single feature space.\nThis is useful " - "for heterogeneous or columnar data, to combine several\nfeature" - " extraction mechanisms or transformations into a single " - "transformer." 
- ) - else: - fixture_description = self.extension._get_sklearn_description(model) - - fixture_structure = { - fixture: [], - f"sklearn.preprocessing.{scaler_name}.StandardScaler": ["numeric"], - "sklearn.preprocessing._encoders.OneHotEncoder": ["nominal"], - "drop": ["drop"], - } - - serialization = self.extension.model_to_flow(model) - structure = serialization.get_structure("name") - assert serialization.name == fixture - assert serialization.custom_name == fixture_short_name - assert serialization.description == fixture_description - self.assertDictEqual(structure, fixture_structure) - - @pytest.mark.sklearn() - @unittest.skipIf( - Version(sklearn.__version__) < Version("0.20"), - reason="columntransformer introduction in 0.20.0", - ) - def test_serialize_column_transformer_pipeline(self): - # temporary local import, dependend on version 0.20 - import sklearn.compose - - inner = sklearn.compose.ColumnTransformer( - transformers=[ - ("numeric", sklearn.preprocessing.StandardScaler(), [0, 1, 2]), - ( - "nominal", - sklearn.preprocessing.OneHotEncoder(handle_unknown="ignore"), - [3, 4, 5], - ), - ], - remainder="passthrough", - ) - model = sklearn.pipeline.Pipeline( - steps=[("transformer", inner), ("classifier", sklearn.tree.DecisionTreeClassifier())], - ) - scaler_name = "data" if Version(sklearn.__version__) < Version("0.22") else "_data" - tree_name = "tree" if Version(sklearn.__version__) < Version("0.22") else "_classes" - fixture_name = ( - "sklearn.pipeline.Pipeline(" - "transformer=sklearn.compose._column_transformer." - "ColumnTransformer(" - f"numeric=sklearn.preprocessing.{scaler_name}.StandardScaler," - "nominal=sklearn.preprocessing._encoders.OneHotEncoder)," - f"classifier=sklearn.tree.{tree_name}.DecisionTreeClassifier)" - ) - fixture_structure = { - f"sklearn.preprocessing.{scaler_name}.StandardScaler": [ - "transformer", - "numeric", - ], - "sklearn.preprocessing._encoders.OneHotEncoder": ["transformer", "nominal"], - "sklearn.compose._column_transformer.ColumnTransformer(numeric=" - f"sklearn.preprocessing.{scaler_name}.StandardScaler,nominal=sklearn." 
- "preprocessing._encoders.OneHotEncoder)": ["transformer"], - f"sklearn.tree.{tree_name}.DecisionTreeClassifier": ["classifier"], - fixture_name: [], - } - - fixture_description = self._get_expected_pipeline_description(model) - serialization, new_model = self._serialization_test_helper( - model, - X=None, - y=None, - subcomponent_parameters=( - "transformer", - "classifier", - "transformer__transformers", - "steps", - "transformer__nominal", - "transformer__numeric", - ), - dependencies_mock_call_count=(5, 10), - ) - structure = serialization.get_structure("name") - assert serialization.name == fixture_name - if Version(sklearn.__version__) < Version("1.3"): # Not yet up-to-date for later versions - assert serialization.description == fixture_description - self.assertDictEqual(structure, fixture_structure) - - @pytest.mark.sklearn() - @unittest.skipIf( - Version(sklearn.__version__) < Version("0.20"), - reason="Pipeline processing behaviour updated", - ) - def test_serialize_feature_union(self): - sparse_parameter = ( - "sparse" if Version(sklearn.__version__) < Version("1.4") else "sparse_output" - ) - ohe_params = {sparse_parameter: False} - if Version(sklearn.__version__) >= Version("0.20"): - ohe_params["categories"] = "auto" - ohe = sklearn.preprocessing.OneHotEncoder(**ohe_params) - scaler = sklearn.preprocessing.StandardScaler() - - fu = sklearn.pipeline.FeatureUnion(transformer_list=[("ohe", ohe), ("scaler", scaler)]) - serialization, new_model = self._serialization_test_helper( - fu, - X=self.X, - y=self.y, - subcomponent_parameters=("ohe", "scaler", "transformer_list"), - dependencies_mock_call_count=(3, 6), - ) - structure = serialization.get_structure("name") - # OneHotEncoder was moved to _encoders module in 0.20 - module_name_encoder = ( - "_encoders" if Version(sklearn.__version__) >= Version("0.20") else "data" - ) - scaler_name = "data" if Version(sklearn.__version__) < Version("0.22") else "_data" - fixture_name = ( - "sklearn.pipeline.FeatureUnion(" - f"ohe=sklearn.preprocessing.{module_name_encoder}.OneHotEncoder," - f"scaler=sklearn.preprocessing.{scaler_name}.StandardScaler)" - ) - fixture_structure = { - fixture_name: [], - f"sklearn.preprocessing.{module_name_encoder}.OneHotEncoder": ["ohe"], - f"sklearn.preprocessing.{scaler_name}.StandardScaler": ["scaler"], - } - assert serialization.name == fixture_name - self.assertDictEqual(structure, fixture_structure) - assert new_model.transformer_list[0][0] == fu.transformer_list[0][0] - assert ( - new_model.transformer_list[0][1].get_params() == fu.transformer_list[0][1].get_params() - ) - assert new_model.transformer_list[1][0] == fu.transformer_list[1][0] - assert ( - new_model.transformer_list[1][1].get_params() == fu.transformer_list[1][1].get_params() - ) - - assert [step[0] for step in new_model.transformer_list] == [ - step[0] for step in fu.transformer_list - ] - assert new_model.transformer_list[0][1] is not fu.transformer_list[0][1] - assert new_model.transformer_list[1][1] is not fu.transformer_list[1][1] - - fu.set_params(scaler="drop") - serialization, new_model = self._serialization_test_helper( - fu, - X=self.X, - y=self.y, - subcomponent_parameters=("ohe", "transformer_list"), - dependencies_mock_call_count=(3, 6), - ) - assert ( - serialization.name == "sklearn.pipeline.FeatureUnion(" - f"ohe=sklearn.preprocessing.{module_name_encoder}.OneHotEncoder," - "scaler=drop)" - ) - assert new_model.transformer_list[1][1] == "drop" - - @pytest.mark.sklearn() - def 
test_serialize_feature_union_switched_names(self): - ohe_params = ( - {"categories": "auto"} if Version(sklearn.__version__) >= Version("0.20") else {} - ) - ohe = sklearn.preprocessing.OneHotEncoder(**ohe_params) - scaler = sklearn.preprocessing.StandardScaler() - fu1 = sklearn.pipeline.FeatureUnion(transformer_list=[("ohe", ohe), ("scaler", scaler)]) - fu2 = sklearn.pipeline.FeatureUnion(transformer_list=[("scaler", ohe), ("ohe", scaler)]) - - fu1_serialization, _ = self._serialization_test_helper( - fu1, - X=None, - y=None, - subcomponent_parameters=(), - dependencies_mock_call_count=(3, 6), - ) - fu2_serialization, _ = self._serialization_test_helper( - fu2, - X=None, - y=None, - subcomponent_parameters=(), - dependencies_mock_call_count=(3, 6), - ) - - # OneHotEncoder was moved to _encoders module in 0.20 - module_name_encoder = ( - "_encoders" if Version(sklearn.__version__) >= Version("0.20") else "data" - ) - scaler_name = "data" if Version(sklearn.__version__) < Version("0.22") else "_data" - assert ( - fu1_serialization.name == "sklearn.pipeline.FeatureUnion(" - f"ohe=sklearn.preprocessing.{module_name_encoder}.OneHotEncoder," - f"scaler=sklearn.preprocessing.{scaler_name}.StandardScaler)" - ) - assert ( - fu2_serialization.name == "sklearn.pipeline.FeatureUnion(" - f"scaler=sklearn.preprocessing.{module_name_encoder}.OneHotEncoder," - f"ohe=sklearn.preprocessing.{scaler_name}.StandardScaler)" - ) - - @pytest.mark.sklearn() - @unittest.skipIf( - Version(sklearn.__version__) >= Version("1.4"), - "AdaBoost parameter name changed as did the way its forwarded to GridSearchCV", - ) - def test_serialize_complex_flow(self): - ohe = sklearn.preprocessing.OneHotEncoder(handle_unknown="ignore") - scaler = sklearn.preprocessing.StandardScaler(with_mean=False) - boosting = sklearn.ensemble.AdaBoostClassifier( - base_estimator=sklearn.tree.DecisionTreeClassifier(), - ) - model = sklearn.pipeline.Pipeline( - steps=[("ohe", ohe), ("scaler", scaler), ("boosting", boosting)], - ) - parameter_grid = { - "boosting__base_estimator__max_depth": scipy.stats.randint(1, 10), - "boosting__learning_rate": scipy.stats.uniform(0.01, 0.99), - "boosting__n_estimators": [1, 5, 10, 100], - } - # convert to ordered dict, sorted by keys) due to param grid check - parameter_grid = OrderedDict(sorted(parameter_grid.items())) - cv = sklearn.model_selection.StratifiedKFold(n_splits=5, shuffle=True) - rs = sklearn.model_selection.RandomizedSearchCV( - estimator=model, - param_distributions=parameter_grid, - cv=cv, - ) - serialized, new_model = self._serialization_test_helper( - rs, - X=self.X, - y=self.y, - subcomponent_parameters=(), - dependencies_mock_call_count=(6, 12), - ) - structure = serialized.get_structure("name") - # OneHotEncoder was moved to _encoders module in 0.20 - module_name_encoder = ( - "_encoders" if Version(sklearn.__version__) >= Version("0.20") else "data" - ) - ohe_name = f"sklearn.preprocessing.{module_name_encoder}.OneHotEncoder" - scaler_name = "sklearn.preprocessing.{}.StandardScaler".format( - "data" if Version(sklearn.__version__) < Version("0.22") else "_data", - ) - tree_name = "sklearn.tree.{}.DecisionTreeClassifier".format( - "tree" if Version(sklearn.__version__) < Version("0.22") else "_classes", - ) - weight_name = "weight" if Version(sklearn.__version__) < Version("0.22") else "_weight" - boosting_name = "sklearn.ensemble.{}_boosting.AdaBoostClassifier(base_estimator={})".format( - weight_name, - tree_name, - ) - pipeline_name = 
"sklearn.pipeline.Pipeline(ohe={},scaler={},boosting={})".format( - ohe_name, - scaler_name, - boosting_name, - ) - fixture_name = ( - f"sklearn.model_selection._search.RandomizedSearchCV(estimator={pipeline_name})" - ) - fixture_structure = { - ohe_name: ["estimator", "ohe"], - scaler_name: ["estimator", "scaler"], - tree_name: ["estimator", "boosting", "base_estimator"], - boosting_name: ["estimator", "boosting"], - pipeline_name: ["estimator"], - fixture_name: [], - } - assert serialized.name == fixture_name - assert structure == fixture_structure - - @pytest.mark.sklearn() - @unittest.skipIf( - Version(sklearn.__version__) < Version("0.21"), - reason="Pipeline till 0.20 doesn't support 'passthrough'", - ) - def test_serialize_strings_as_pipeline_steps(self): - import sklearn.compose - - # First check: test whether a passthrough in a pipeline is serialized correctly - model = sklearn.pipeline.Pipeline(steps=[("transformer", "passthrough")]) - serialized = self.extension.model_to_flow(model) - assert isinstance(serialized, OpenMLFlow) - assert len(serialized.components) == 1 - assert serialized.components["transformer"].name == "passthrough" - serialized = self.extension._serialize_sklearn( - ("transformer", "passthrough"), - parent_model=model, - ) - assert serialized == ("transformer", "passthrough") - extracted_info = self.extension._extract_information_from_model(model) - assert len(extracted_info[2]) == 1 - assert isinstance(extracted_info[2]["transformer"], OpenMLFlow) - assert extracted_info[2]["transformer"].name == "passthrough" - - # Second check: test whether a lone passthrough in a column transformer is serialized - # correctly - model = sklearn.compose.ColumnTransformer([("passthrough", "passthrough", (0,))]) - serialized = self.extension.model_to_flow(model) - assert isinstance(serialized, OpenMLFlow) - assert len(serialized.components) == 1 - assert serialized.components["passthrough"].name == "passthrough" - serialized = self.extension._serialize_sklearn( - ("passthrough", "passthrough"), - parent_model=model, - ) - assert serialized == ("passthrough", "passthrough") - extracted_info = self.extension._extract_information_from_model(model) - assert len(extracted_info[2]) == 1 - assert isinstance(extracted_info[2]["passthrough"], OpenMLFlow) - assert extracted_info[2]["passthrough"].name == "passthrough" - - # Third check: passthrough and drop in a column transformer - model = sklearn.compose.ColumnTransformer( - [("passthrough", "passthrough", (0,)), ("drop", "drop", (1,))], - ) - serialized = self.extension.model_to_flow(model) - assert isinstance(serialized, OpenMLFlow) - assert len(serialized.components) == 2 - assert serialized.components["passthrough"].name == "passthrough" - assert serialized.components["drop"].name == "drop" - serialized = self.extension._serialize_sklearn( - ("passthrough", "passthrough"), - parent_model=model, - ) - assert serialized == ("passthrough", "passthrough") - extracted_info = self.extension._extract_information_from_model(model) - assert len(extracted_info[2]) == 2 - assert isinstance(extracted_info[2]["passthrough"], OpenMLFlow) - assert isinstance(extracted_info[2]["drop"], OpenMLFlow) - assert extracted_info[2]["passthrough"].name == "passthrough" - assert extracted_info[2]["drop"].name == "drop" - - # Fourth check: having an actual preprocessor in the column transformer, too - model = sklearn.compose.ColumnTransformer( - [ - ("passthrough", "passthrough", (0,)), - ("drop", "drop", (1,)), - ("test", 
sklearn.preprocessing.StandardScaler(), (2,)), - ], - ) - serialized = self.extension.model_to_flow(model) - assert isinstance(serialized, OpenMLFlow) - assert len(serialized.components) == 3 - assert serialized.components["passthrough"].name == "passthrough" - assert serialized.components["drop"].name == "drop" - serialized = self.extension._serialize_sklearn( - ("passthrough", "passthrough"), - parent_model=model, - ) - assert serialized == ("passthrough", "passthrough") - extracted_info = self.extension._extract_information_from_model(model) - assert len(extracted_info[2]) == 3 - assert isinstance(extracted_info[2]["passthrough"], OpenMLFlow) - assert isinstance(extracted_info[2]["drop"], OpenMLFlow) - assert extracted_info[2]["passthrough"].name == "passthrough" - assert extracted_info[2]["drop"].name == "drop" - - # Fifth check: test whether a lone drop in a feature union is serialized correctly - model = sklearn.pipeline.FeatureUnion([("drop", "drop")]) - serialized = self.extension.model_to_flow(model) - assert isinstance(serialized, OpenMLFlow) - assert len(serialized.components) == 1 - assert serialized.components["drop"].name == "drop" - serialized = self.extension._serialize_sklearn(("drop", "drop"), parent_model=model) - assert serialized == ("drop", "drop") - extracted_info = self.extension._extract_information_from_model(model) - assert len(extracted_info[2]) == 1 - assert isinstance(extracted_info[2]["drop"], OpenMLFlow) - assert extracted_info[2]["drop"].name == "drop" - - @pytest.mark.sklearn() - def test_serialize_type(self): - supported_types = [float, np.float32, np.float64, int, np.int32, np.int64] - if Version(np.__version__) < Version("1.24"): - supported_types.append(float) - supported_types.append(int) - - for supported_type in supported_types: - serialized = self.extension.model_to_flow(supported_type) - deserialized = self.extension.flow_to_model(serialized) - assert deserialized == supported_type - - @pytest.mark.sklearn() - def test_serialize_rvs(self): - supported_rvs = [ - scipy.stats.norm(loc=1, scale=5), - scipy.stats.expon(loc=1, scale=5), - scipy.stats.randint(low=-3, high=15), - ] - - for supported_rv in supported_rvs: - serialized = self.extension.model_to_flow(supported_rv) - deserialized = self.extension.flow_to_model(serialized) - assert type(deserialized.dist) == type(supported_rv.dist) - del deserialized.dist - del supported_rv.dist - assert deserialized.__dict__ == supported_rv.__dict__ - - @pytest.mark.sklearn() - def test_serialize_function(self): - serialized = self.extension.model_to_flow(sklearn.feature_selection.chi2) - deserialized = self.extension.flow_to_model(serialized) - assert deserialized == sklearn.feature_selection.chi2 - - @pytest.mark.sklearn() - def test_serialize_cvobject(self): - methods = [sklearn.model_selection.KFold(3), sklearn.model_selection.LeaveOneOut()] - fixtures = [ - OrderedDict( - [ - ("oml-python:serialized_object", "cv_object"), - ( - "value", - OrderedDict( - [ - ("name", "sklearn.model_selection._split.KFold"), - ( - "parameters", - OrderedDict( - [ - ("n_splits", "3"), - ("random_state", "null"), - ("shuffle", "false"), - ], - ), - ), - ], - ), - ), - ], - ), - OrderedDict( - [ - ("oml-python:serialized_object", "cv_object"), - ( - "value", - OrderedDict( - [ - ("name", "sklearn.model_selection._split.LeaveOneOut"), - ("parameters", OrderedDict()), - ], - ), - ), - ], - ), - ] - for method, fixture in zip(methods, fixtures): - m = self.extension.model_to_flow(method) - assert m == fixture - - m_new = 
self.extension.flow_to_model(m) - assert m_new is not m - assert isinstance(m_new, type(method)) - - @pytest.mark.sklearn() - def test_serialize_simple_parameter_grid(self): - # We cannot easily test for scipy random variables in here, but they - # should be covered - - # Examples from the scikit-learn documentation - models = [sklearn.svm.SVC(), sklearn.ensemble.RandomForestClassifier()] - grids = [ - [ - OrderedDict([("C", [1, 10, 100, 1000]), ("kernel", ["linear"])]), - OrderedDict( - [("C", [1, 10, 100, 1000]), ("gamma", [0.001, 0.0001]), ("kernel", ["rbf"])], - ), - ], - OrderedDict( - [ - ("bootstrap", [True, False]), - ("criterion", ["gini", "entropy"]), - ("max_depth", [3, None]), - ("max_features", [1, 3, 10]), - ("min_samples_leaf", [1, 3, 10]), - ("min_samples_split", [1, 3, 10]), - ], - ), - ] - - for grid, model in zip(grids, models): - serialized = self.extension.model_to_flow(grid) - deserialized = self.extension.flow_to_model(serialized) - - assert deserialized == grid - assert deserialized is not grid - # providing error_score because nan != nan - hpo = sklearn.model_selection.GridSearchCV( - param_grid=grid, - estimator=model, - error_score=-1000, - ) - - serialized = self.extension.model_to_flow(hpo) - deserialized = self.extension.flow_to_model(serialized) - assert hpo.param_grid == deserialized.param_grid - assert hpo.estimator.get_params() == deserialized.estimator.get_params() - hpo_params = hpo.get_params(deep=False) - deserialized_params = deserialized.get_params(deep=False) - del hpo_params["estimator"] - del deserialized_params["estimator"] - assert hpo_params == deserialized_params - - @pytest.mark.sklearn() - @unittest.skip( - "This feature needs further reworking. If we allow several " - "components, we need to register them all in the downstream " - "flows. 
This is so far not implemented.", - ) - def test_serialize_advanced_grid(self): - # TODO: a GridSearchCV object should be serialized instead - - # This needs to be in its own function because we cannot simply check - # for the equality of the grid, because scikit-learn objects don't - # really support the equality operator - # This will only work with sklearn==0.18 - N_FEATURES_OPTIONS = [2, 4, 8] - C_OPTIONS = [1, 10, 100, 1000] - grid = [ - { - "reduce_dim": [ - sklearn.decomposition.PCA(iterated_power=7), - sklearn.decomposition.NMF(), - ], - "reduce_dim__n_components": N_FEATURES_OPTIONS, - "classify__C": C_OPTIONS, - }, - { - "reduce_dim": [ - sklearn.feature_selection.SelectKBest(sklearn.feature_selection.chi2), - ], - "reduce_dim__k": N_FEATURES_OPTIONS, - "classify__C": C_OPTIONS, - }, - ] - - serialized = self.extension.model_to_flow(grid) - deserialized = self.extension.flow_to_model(serialized) - - assert ( - grid[0]["reduce_dim"][0].get_params() == deserialized[0]["reduce_dim"][0].get_params() - ) - assert grid[0]["reduce_dim"][0] is not deserialized[0]["reduce_dim"][0] - assert ( - grid[0]["reduce_dim"][1].get_params() == deserialized[0]["reduce_dim"][1].get_params() - ) - assert grid[0]["reduce_dim"][1] is not deserialized[0]["reduce_dim"][1] - assert grid[0]["reduce_dim__n_components"] == deserialized[0]["reduce_dim__n_components"] - assert grid[0]["classify__C"] == deserialized[0]["classify__C"] - assert ( - grid[1]["reduce_dim"][0].get_params() == deserialized[1]["reduce_dim"][0].get_params() - ) - assert grid[1]["reduce_dim"][0] is not deserialized[1]["reduce_dim"][0] - assert grid[1]["reduce_dim__k"] == deserialized[1]["reduce_dim__k"] - assert grid[1]["classify__C"] == deserialized[1]["classify__C"] - - @pytest.mark.sklearn() - def test_serialize_advanced_grid_fails(self): - # This unit test checks that the test we skip above would actually fail - - param_grid = { - "base_estimator": [ - sklearn.tree.DecisionTreeClassifier(), - sklearn.tree.ExtraTreeClassifier(), - ], - } - - clf = sklearn.model_selection.GridSearchCV( - sklearn.ensemble.BaggingClassifier(), - param_grid=param_grid, - ) - with pytest.raises( - TypeError, - match=re.compile(r".*OpenML.*Flow.*is not JSON serializable", flags=re.DOTALL), - ): - self.extension.model_to_flow(clf) - - @pytest.mark.sklearn() - def test_serialize_resampling(self): - kfold = sklearn.model_selection.StratifiedKFold(n_splits=4, shuffle=True) - serialized = self.extension.model_to_flow(kfold) - deserialized = self.extension.flow_to_model(serialized) - # Best approximation to get_params() - assert str(deserialized) == str(kfold) - assert deserialized is not kfold - - @pytest.mark.sklearn() - def test_hypothetical_parameter_values(self): - # The hypothetical parameter values of true, 1, 0.1 formatted as a - # string (and their correct serialization and deserialization) can only - # be checked inside a model - - model = Model("true", "1", "0.1") - - serialized = self.extension.model_to_flow(model) - serialized.external_version = "sklearn==test123" - deserialized = self.extension.flow_to_model(serialized) - assert deserialized.get_params() == model.get_params() - assert deserialized is not model - - @pytest.mark.sklearn() - def test_gaussian_process(self): - opt = scipy.optimize.fmin_l_bfgs_b - kernel = sklearn.gaussian_process.kernels.Matern() - gp = sklearn.gaussian_process.GaussianProcessClassifier(kernel=kernel, optimizer=opt) - with pytest.raises( - TypeError, - match=r"Matern\(length_scale=1, nu=1.5\), ", - ): -
self.extension.model_to_flow(gp) - - @pytest.mark.sklearn() - def test_error_on_adding_component_multiple_times_to_flow(self): - # this function implicitly checks - # - openml.flows._check_multiple_occurence_of_component_in_flow() - pca = sklearn.decomposition.PCA() - pca2 = sklearn.decomposition.PCA() - pipeline = sklearn.pipeline.Pipeline((("pca1", pca), ("pca2", pca2))) - fixture = "Found a second occurence of component .*.PCA when trying to serialize Pipeline" - with pytest.raises(ValueError, match=fixture): - self.extension.model_to_flow(pipeline) - - fu = sklearn.pipeline.FeatureUnion((("pca1", pca), ("pca2", pca2))) - fixture = ( - "Found a second occurence of component .*.PCA when trying to serialize FeatureUnion" - ) - with pytest.raises(ValueError, match=fixture): - self.extension.model_to_flow(fu) - - fs = sklearn.feature_selection.SelectKBest() - fu2 = sklearn.pipeline.FeatureUnion((("pca1", pca), ("fs", fs))) - pipeline2 = sklearn.pipeline.Pipeline((("fu", fu2), ("pca2", pca2))) - fixture = "Found a second occurence of component .*.PCA when trying to serialize Pipeline" - with pytest.raises(ValueError, match=fixture): - self.extension.model_to_flow(pipeline2) - - @pytest.mark.sklearn() - def test_subflow_version_propagated(self): - this_directory = os.path.dirname(os.path.abspath(__file__)) - tests_directory = os.path.abspath(os.path.join(this_directory, "..", "..")) - sys.path.append(tests_directory) - import tests.test_flows.dummy_learn.dummy_forest - - pca = sklearn.decomposition.PCA() - dummy = tests.test_flows.dummy_learn.dummy_forest.DummyRegressor() - pipeline = sklearn.pipeline.Pipeline((("pca", pca), ("dummy", dummy))) - flow = self.extension.model_to_flow(pipeline) - # In python2.7, the unit tests work differently on travis-ci; therefore, - # I put the alternative travis-ci answer here as well. 
While it has a - # different value, it is still correct as it is a propagation of the - # subclasses' module name - assert flow.external_version == "{},{},{}".format( - self.extension._format_external_version("openml", openml.__version__), - self.extension._format_external_version("sklearn", sklearn.__version__), - self.extension._format_external_version("tests", "0.1"), - ) - - @pytest.mark.sklearn() - @mock.patch("warnings.warn") - def test_check_dependencies(self, warnings_mock): - dependencies = ["sklearn==0.1", "sklearn>=99.99.99", "sklearn>99.99.99"] - for dependency in dependencies: - self.assertRaises(ValueError, self.extension._check_dependencies, dependency) - - @pytest.mark.sklearn() - def test_illegal_parameter_names(self): - # illegal name: estimators - clf1 = sklearn.ensemble.VotingClassifier( - estimators=[ - ("estimators", sklearn.ensemble.RandomForestClassifier()), - ("whatevs", sklearn.ensemble.ExtraTreesClassifier()), - ], - ) - clf2 = sklearn.ensemble.VotingClassifier( - estimators=[ - ("whatevs", sklearn.ensemble.RandomForestClassifier()), - ("estimators", sklearn.ensemble.ExtraTreesClassifier()), - ], - ) - cases = [clf1, clf2] - - for case in cases: - self.assertRaises(PyOpenMLError, self.extension.model_to_flow, case) - - @pytest.mark.sklearn() - def test_paralizable_check(self): - # using this model should pass the test (if param distribution is - # legal) - singlecore_bagging = sklearn.ensemble.BaggingClassifier() - # using this model should return false (if param distribution is legal) - multicore_bagging = sklearn.ensemble.BaggingClassifier(n_jobs=5) - # using this param distribution should raise an exception - illegal_param_dist = {"base__n_jobs": [-1, 0, 1]} - # using this param distribution should not raise an exception - legal_param_dist = {"n_estimators": [2, 3, 4]} - - estimator_name = ( - "base_estimator" if Version(sklearn.__version__) < Version("1.4") else "estimator" - ) - legal_models = [ - sklearn.ensemble.RandomForestClassifier(), - sklearn.ensemble.RandomForestClassifier(n_jobs=5), - sklearn.ensemble.RandomForestClassifier(n_jobs=-1), - sklearn.pipeline.Pipeline( - steps=[("bag", sklearn.ensemble.BaggingClassifier(n_jobs=1))], - ), - sklearn.pipeline.Pipeline( - steps=[("bag", sklearn.ensemble.BaggingClassifier(n_jobs=5))], - ), - sklearn.pipeline.Pipeline( - steps=[("bag", sklearn.ensemble.BaggingClassifier(n_jobs=-1))], - ), - sklearn.model_selection.GridSearchCV(singlecore_bagging, legal_param_dist), - sklearn.model_selection.GridSearchCV(multicore_bagging, legal_param_dist), - sklearn.ensemble.BaggingClassifier( - n_jobs=-1, - **{estimator_name: sklearn.ensemble.RandomForestClassifier(n_jobs=5)}, - ), - ] - illegal_models = [ - sklearn.model_selection.GridSearchCV(singlecore_bagging, illegal_param_dist), - sklearn.model_selection.GridSearchCV(multicore_bagging, illegal_param_dist), - ] - - if Version(sklearn.__version__) < Version("0.20"): - has_refit_time = [False, False, False, False, False, False, False, False, False] - else: - has_refit_time = [False, False, False, False, False, False, True, True, False] - - X, y = sklearn.datasets.load_iris(return_X_y=True) - for model, refit_time in zip(legal_models, has_refit_time): - model.fit(X, y) - assert refit_time == hasattr(model, "refit_time_") - - for model in illegal_models: - with pytest.raises(PyOpenMLError): - self.extension._prevent_optimize_n_jobs(model) - - @pytest.mark.sklearn() - def test__get_fn_arguments_with_defaults(self): - sklearn_version = Version(sklearn.__version__) - if 
sklearn_version < Version("0.19"): - fns = [ - (sklearn.ensemble.RandomForestRegressor.__init__, 15), - (sklearn.tree.DecisionTreeClassifier.__init__, 12), - (sklearn.pipeline.Pipeline.__init__, 0), - ] - elif sklearn_version < Version("0.21"): - fns = [ - (sklearn.ensemble.RandomForestRegressor.__init__, 16), - (sklearn.tree.DecisionTreeClassifier.__init__, 13), - (sklearn.pipeline.Pipeline.__init__, 1), - ] - elif sklearn_version < Version("0.22"): - fns = [ - (sklearn.ensemble.RandomForestRegressor.__init__, 16), - (sklearn.tree.DecisionTreeClassifier.__init__, 13), - (sklearn.pipeline.Pipeline.__init__, 2), - ] - elif sklearn_version < Version("0.23"): - fns = [ - (sklearn.ensemble.RandomForestRegressor.__init__, 18), - (sklearn.tree.DecisionTreeClassifier.__init__, 14), - (sklearn.pipeline.Pipeline.__init__, 2), - ] - elif sklearn_version < Version("0.24"): - fns = [ - (sklearn.ensemble.RandomForestRegressor.__init__, 18), - (sklearn.tree.DecisionTreeClassifier.__init__, 14), - (sklearn.pipeline.Pipeline.__init__, 2), - ] - elif sklearn_version < Version("1.0"): - fns = [ - (sklearn.ensemble.RandomForestRegressor.__init__, 18), - (sklearn.tree.DecisionTreeClassifier.__init__, 13), - (sklearn.pipeline.Pipeline.__init__, 2), - ] - elif sklearn_version < Version("1.4"): - fns = [ - (sklearn.ensemble.RandomForestRegressor.__init__, 17), - (sklearn.tree.DecisionTreeClassifier.__init__, 12), - (sklearn.pipeline.Pipeline.__init__, 2), - ] - else: - fns = [ - (sklearn.ensemble.RandomForestRegressor.__init__, 18), - (sklearn.tree.DecisionTreeClassifier.__init__, 13), - (sklearn.pipeline.Pipeline.__init__, 2), - ] - - for fn, num_params_with_defaults in fns: - defaults, defaultless = self.extension._get_fn_arguments_with_defaults(fn) - assert isinstance(defaults, dict) - assert isinstance(defaultless, set) - # check whether we have both defaults and defaultless params - assert len(defaults) == num_params_with_defaults - assert len(defaultless) > 0 - # check no overlap - self.assertSetEqual(set(defaults.keys()), set(defaults.keys()) - defaultless) - self.assertSetEqual(defaultless, defaultless - set(defaults.keys())) - - @pytest.mark.sklearn() - def test_deserialize_with_defaults(self): - # used the 'initialize_with_defaults' flag of the deserialization - # method to return a flow that contains default hyperparameter - # settings. - steps = [ - ("Imputer", SimpleImputer()), - ("OneHotEncoder", sklearn.preprocessing.OneHotEncoder()), - ("Estimator", sklearn.tree.DecisionTreeClassifier()), - ] - pipe_orig = sklearn.pipeline.Pipeline(steps=steps) - - pipe_adjusted = sklearn.clone(pipe_orig) - if Version(sklearn.__version__) < Version("0.23"): - params = { - "Imputer__strategy": "median", - "OneHotEncoder__sparse": False, - "Estimator__min_samples_leaf": 42, - } - elif Version(sklearn.__version__) < Version("1.4"): - params = { - "Imputer__strategy": "mean", - "OneHotEncoder__sparse": True, - "Estimator__min_samples_leaf": 1, - } - else: - params = { - "Imputer__strategy": "mean", - "OneHotEncoder__sparse_output": True, - "Estimator__min_samples_leaf": 1, - } - pipe_adjusted.set_params(**params) - flow = self.extension.model_to_flow(pipe_adjusted) - pipe_deserialized = self.extension.flow_to_model(flow, initialize_with_defaults=True) - - # we want to compare pipe_deserialized and pipe_orig. 
We use the flow - # equals function for this - assert_flows_equal( - self.extension.model_to_flow(pipe_orig), - self.extension.model_to_flow(pipe_deserialized), - ) - - @pytest.mark.sklearn() - def test_deserialize_adaboost_with_defaults(self): - # used the 'initialize_with_defaults' flag of the deserialization - # method to return a flow that contains default hyperparameter - # settings. - steps = [ - ("Imputer", SimpleImputer()), - ("OneHotEncoder", sklearn.preprocessing.OneHotEncoder()), - ( - "Estimator", - sklearn.ensemble.AdaBoostClassifier(sklearn.tree.DecisionTreeClassifier()), - ), - ] - pipe_orig = sklearn.pipeline.Pipeline(steps=steps) - - pipe_adjusted = sklearn.clone(pipe_orig) - if Version(sklearn.__version__) < Version("0.22"): - params = { - "Imputer__strategy": "median", - "OneHotEncoder__sparse": False, - "Estimator__n_estimators": 10, - } - elif Version(sklearn.__version__) < Version("1.4"): - params = { - "Imputer__strategy": "mean", - "OneHotEncoder__sparse": True, - "Estimator__n_estimators": 50, - } - else: - params = { - "Imputer__strategy": "mean", - "OneHotEncoder__sparse_output": True, - "Estimator__n_estimators": 50, - } - pipe_adjusted.set_params(**params) - flow = self.extension.model_to_flow(pipe_adjusted) - pipe_deserialized = self.extension.flow_to_model(flow, initialize_with_defaults=True) - - # we want to compare pipe_deserialized and pipe_orig. We use the flow - # equals function for this - assert_flows_equal( - self.extension.model_to_flow(pipe_orig), - self.extension.model_to_flow(pipe_deserialized), - ) - - @pytest.mark.sklearn() - def test_deserialize_complex_with_defaults(self): - # used the 'initialize_with_defaults' flag of the deserialization - # method to return a flow that contains default hyperparameter - # settings. - steps = [ - ("Imputer", SimpleImputer()), - ("OneHotEncoder", sklearn.preprocessing.OneHotEncoder()), - ( - "Estimator", - sklearn.ensemble.AdaBoostClassifier( - sklearn.ensemble.BaggingClassifier( - sklearn.ensemble.GradientBoostingClassifier(), - ), - ), - ), - ] - pipe_orig = sklearn.pipeline.Pipeline(steps=steps) - - pipe_adjusted = sklearn.clone(pipe_orig) - impute_strategy = "median" if Version(sklearn.__version__) < Version("0.23") else "mean" - sparse = Version(sklearn.__version__) >= Version("0.23") - sparse_parameter = ( - "sparse" if Version(sklearn.__version__) < Version("1.4") else "sparse_output" - ) - estimator_name = ( - "base_estimator" if Version(sklearn.__version__) < Version("1.2") else "estimator" - ) - params = { - "Imputer__strategy": impute_strategy, - f"OneHotEncoder__{sparse_parameter}": sparse, - "Estimator__n_estimators": 10, - f"Estimator__{estimator_name}__n_estimators": 10, - f"Estimator__{estimator_name}__{estimator_name}__learning_rate": 0.1, - } - - pipe_adjusted.set_params(**params) - flow = self.extension.model_to_flow(pipe_adjusted) - pipe_deserialized = self.extension.flow_to_model(flow, initialize_with_defaults=True) - - # we want to compare pipe_deserialized and pipe_orig. 
We use the flow - # equals function for this - assert_flows_equal( - self.extension.model_to_flow(pipe_orig), - self.extension.model_to_flow(pipe_deserialized), - ) - - @pytest.mark.sklearn() - def test_openml_param_name_to_sklearn(self): - scaler = sklearn.preprocessing.StandardScaler(with_mean=False) - estimator_name = ( - "base_estimator" if Version(sklearn.__version__) < Version("1.4") else "estimator" - ) - boosting = sklearn.ensemble.AdaBoostClassifier( - **{estimator_name: sklearn.tree.DecisionTreeClassifier()}, - ) - model = sklearn.pipeline.Pipeline(steps=[("scaler", scaler), ("boosting", boosting)]) - flow = self.extension.model_to_flow(model) - task = openml.tasks.get_task(115) # diabetes; crossvalidation - run = openml.runs.run_flow_on_task(flow, task) - run = run.publish() - TestBase._mark_entity_for_removal("run", run.run_id) - TestBase.logger.info(f"collected from {__file__.split('/')[-1]}: {run.run_id}") - run = openml.runs.get_run(run.run_id) - setup = openml.setups.get_setup(run.setup_id) - - # make sure to test enough parameters - assert len(setup.parameters) > 15 - - for parameter in setup.parameters.values(): - sklearn_name = self.extension._openml_param_name_to_sklearn(parameter, flow) - - # test the inverse. Currently, OpenML stores the hyperparameter - # fullName as flow.name + flow.version + parameter.name on the - # server (but this behaviour is not documented and might or might - # not change in the future. Hence, we won't offer this - # transformation functionality in the main package yet.) - splitted = sklearn_name.split("__") - if len(splitted) > 1: # if len is 1, it is part of root flow - subflow = flow.get_subflow(splitted[0:-1]) - else: - subflow = flow - openml_name = f"{subflow.name}({subflow.version})_{splitted[-1]}" - assert parameter.full_name == openml_name - - @pytest.mark.sklearn() - def test_obtain_parameter_values_flow_not_from_server(self): - model = sklearn.linear_model.LogisticRegression(solver="lbfgs") - flow = self.extension.model_to_flow(model) - logistic_name = ( - "logistic" if Version(sklearn.__version__) < Version("0.22") else "_logistic" - ) - msg = f"Flow sklearn.linear_model.{logistic_name}.LogisticRegression has no flow_id!" 
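# A flow only receives a flow_id once the server knows about it: flow.publish()
# or openml.flows.get_flow() fill it in, while model_to_flow() alone leaves it
# as None; that missing id is exactly the failure mode exercised here. A
# minimal sketch of the happy path, assuming a hypothetical server-assigned
# id of 42:
#
#     flow.flow_id = 42  # normally filled in by flow.publish()
#     parameters = self.extension.obtain_parameter_values(flow)
#     # each returned parameter dict then carries an "oml:component" flow id
#
# For nested models every subcomponent needs its own id as well; the AdaBoost
# case below sets only the outer flow_id and therefore still raises.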
- - with pytest.raises(ValueError, match=msg): - self.extension.obtain_parameter_values(flow) - - estimator_name = ( - "base_estimator" if Version(sklearn.__version__) < Version("1.4") else "estimator" - ) - model = sklearn.ensemble.AdaBoostClassifier( - **{ - estimator_name: sklearn.linear_model.LogisticRegression( - solver="lbfgs", - ), - } - ) - flow = self.extension.model_to_flow(model) - flow.flow_id = 1 - with pytest.raises(ValueError, match=msg): - self.extension.obtain_parameter_values(flow) - - @pytest.mark.sklearn() - def test_obtain_parameter_values(self): - model = sklearn.model_selection.RandomizedSearchCV( - estimator=sklearn.ensemble.RandomForestClassifier(n_estimators=5), - param_distributions={ - "max_depth": [3, None], - "max_features": [1, 2, 3, 4], - "min_samples_split": [2, 3, 4, 5, 6, 7, 8, 9, 10], - "min_samples_leaf": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], - "bootstrap": [True, False], - "criterion": ["gini", "entropy"], - }, - cv=sklearn.model_selection.StratifiedKFold(n_splits=2, random_state=1, shuffle=True), - n_iter=5, - ) - flow = self.extension.model_to_flow(model) - flow.flow_id = 1 - flow.components["estimator"].flow_id = 2 - parameters = self.extension.obtain_parameter_values(flow) - for parameter in parameters: - assert parameter["oml:component"] is not None, parameter - if parameter["oml:name"] == "n_estimators": - assert parameter["oml:value"] == "5" - assert parameter["oml:component"] == 2 - - @pytest.mark.sklearn() - def test_numpy_type_allowed_in_flow(self): - """Simple numpy types should be serializable.""" - dt = sklearn.tree.DecisionTreeClassifier( - max_depth=np.float64(3.0), - min_samples_leaf=np.int32(5), - ) - self.extension.model_to_flow(dt) - - @pytest.mark.sklearn() - def test_numpy_array_not_allowed_in_flow(self): - """Simple numpy arrays should not be serializable.""" - bin = sklearn.preprocessing.MultiLabelBinarizer(classes=np.asarray([1, 2, 3])) - with pytest.raises(TypeError): - self.extension.model_to_flow(bin) - - -class TestSklearnExtensionRunFunctions(TestBase): - _multiprocess_can_split_ = True - - def setUp(self): - super().setUp(n_levels=2) - self.extension = SklearnExtension() - - ################################################################################################ - # Test methods for performing runs with this extension module - - @pytest.mark.sklearn() - def test_run_model_on_task(self): - task = openml.tasks.get_task(1) # anneal; crossvalidation - # using most_frequent imputer since dataset has mixed types and to keep things simple - pipe = sklearn.pipeline.Pipeline( - [ - ("imp", SimpleImputer(strategy="most_frequent")), - ("dummy", sklearn.dummy.DummyClassifier()), - ], - ) - openml.runs.run_model_on_task(pipe, task) - - @pytest.mark.sklearn() - def test_seed_model(self): - # randomized models that are initialized without seeds, can be seeded - randomized_clfs = [ - sklearn.ensemble.BaggingClassifier(), - sklearn.model_selection.RandomizedSearchCV( - sklearn.ensemble.RandomForestClassifier(), - { - "max_depth": [3, None], - "max_features": [1, 2, 3, 4], - "bootstrap": [True, False], - "criterion": ["gini", "entropy"], - "random_state": [-1, 0, 1, 2], - }, - cv=sklearn.model_selection.StratifiedKFold(n_splits=2, shuffle=True), - ), - sklearn.dummy.DummyClassifier(), - ] - - for idx, clf in enumerate(randomized_clfs): - const_probe = 42 - all_params = clf.get_params() - params = [key for key in all_params if key.endswith("random_state")] - assert len(params) > 0 - - # before param value is None - for param in params: - 
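# Note that get_params() is deep by default, so `params` can also contain
# nested keys such as "estimator__random_state" (e.g. for the
# RandomizedSearchCV above); the check below therefore covers
# sub-estimators as well as the top-level model.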
assert all_params[param] is None - - # now seed the params - clf_seeded = self.extension.seed_model(clf, const_probe) - new_params = clf_seeded.get_params() - - randstate_params = [key for key in new_params if key.endswith("random_state")] - - # after seeding, every random_state value is set to an int - for param in randstate_params: - assert isinstance(new_params[param], int) - assert new_params[param] is not None - - if idx == 1: - assert clf.cv.random_state == 56422 - - @pytest.mark.sklearn() - def test_seed_model_raises(self): - # _set_model_seed_where_none should raise an exception if random_state is - # anything other than an int - randomized_clfs = [ - sklearn.ensemble.BaggingClassifier(random_state=np.random.RandomState(42)), - sklearn.dummy.DummyClassifier(random_state="OpenMLIsGreat"), - ] - - for clf in randomized_clfs: - with pytest.raises(ValueError): - self.extension.seed_model(model=clf, seed=42) - - @pytest.mark.sklearn() - def test_run_model_on_fold_classification_1_array(self): - task = openml.tasks.get_task(1) # anneal; crossvalidation - - X, y = task.get_X_and_y() - train_indices, test_indices = task.get_train_test_split_indices(repeat=0, fold=0, sample=0) - X_train = X.iloc[train_indices] - y_train = y.iloc[train_indices] - X_test = X.iloc[test_indices] - y_test = y.iloc[test_indices] - - pipeline = sklearn.pipeline.Pipeline( - steps=[*_get_sklearn_preprocessing(), ("clf", sklearn.tree.DecisionTreeClassifier())], - ) - # TODO add some mocking here to actually test the innards of this function, too! - res = self.extension._run_model_on_fold( - model=pipeline, - task=task, - fold_no=0, - rep_no=0, - X_train=X_train, - y_train=y_train, - X_test=X_test, - ) - - y_hat, y_hat_proba, user_defined_measures, trace = res - - # predictions - assert isinstance(y_hat, np.ndarray) - assert y_hat.shape == y_test.shape - assert isinstance(y_hat_proba, pd.DataFrame) - assert y_hat_proba.shape == (y_test.shape[0], 6) - np.testing.assert_array_almost_equal(np.sum(y_hat_proba, axis=1), np.ones(y_test.shape)) - # The class '4' (at index 3) is not present in the training data. We check that the - # predicted probabilities for that class are zero! - np.testing.assert_array_almost_equal( - y_hat_proba.iloc[:, 3].to_numpy(), - np.zeros(y_test.shape), - ) - for i in (0, 1, 2, 4, 5): - assert np.any(y_hat_proba.iloc[:, i].to_numpy() != np.zeros(y_test.shape)) - - # check user defined measures - fold_evaluations: dict[str, dict[int, dict[int, float]]] = collections.defaultdict( - lambda: collections.defaultdict(dict) - ) - for measure in user_defined_measures: - fold_evaluations[measure][0][0] = user_defined_measures[measure] - - # trace.
SGD does not produce any - assert trace is None - - self._check_fold_timing_evaluations( - fold_evaluations, - num_repeats=1, - num_folds=1, - task_type=task.task_type_id, - check_scores=False, - ) - - @pytest.mark.sklearn() - @unittest.skipIf( - Version(sklearn.__version__) < Version("0.21"), - reason="SimpleImputer, ColumnTransformer available only after 0.19 and " - "Pipeline till 0.20 doesn't support indexing and 'passthrough'", - ) - def test_run_model_on_fold_classification_1_dataframe(self): - from sklearn.compose import ColumnTransformer - - task = openml.tasks.get_task(1) # anneal; crossvalidation - - # diff test_run_model_on_fold_classification_1_array() - X, y = task.get_X_and_y() - train_indices, test_indices = task.get_train_test_split_indices(repeat=0, fold=0, sample=0) - X_train = X.iloc[train_indices] - y_train = y.iloc[train_indices] - X_test = X.iloc[test_indices] - y_test = y.iloc[test_indices] - - # Helper functions to return required columns for ColumnTransformer - sparse = { - "sparse" if Version(sklearn.__version__) < Version("1.4") else "sparse_output": False - } - cat_imp = make_pipeline( - SimpleImputer(strategy="most_frequent"), - OneHotEncoder(handle_unknown="ignore", **sparse), - ) - cont_imp = make_pipeline(CustomImputer(strategy="mean"), StandardScaler()) - ct = ColumnTransformer([("cat", cat_imp, cat), ("cont", cont_imp, cont)]) - pipeline = sklearn.pipeline.Pipeline( - steps=[("transform", ct), ("estimator", sklearn.tree.DecisionTreeClassifier())], - ) - # TODO add some mocking here to actually test the innards of this function, too! - res = self.extension._run_model_on_fold( - model=pipeline, - task=task, - fold_no=0, - rep_no=0, - X_train=X_train, - y_train=y_train, - X_test=X_test, - ) - - y_hat, y_hat_proba, user_defined_measures, trace = res - - # predictions - assert isinstance(y_hat, np.ndarray) - assert y_hat.shape == y_test.shape - assert isinstance(y_hat_proba, pd.DataFrame) - assert y_hat_proba.shape == (y_test.shape[0], 6) - np.testing.assert_array_almost_equal(np.sum(y_hat_proba, axis=1), np.ones(y_test.shape)) - # The class '4' (at index 3) is not present in the training data. We check that the - # predicted probabilities for that class are zero! - np.testing.assert_array_almost_equal( - y_hat_proba.iloc[:, 3].to_numpy(), - np.zeros(y_test.shape), - ) - for i in (0, 1, 2, 4, 5): - assert np.any(y_hat_proba.iloc[:, i].to_numpy() != np.zeros(y_test.shape)) - - # check user defined measures - fold_evaluations: dict[str, dict[int, dict[int, float]]] = collections.defaultdict( - lambda: collections.defaultdict(dict) - ) - for measure in user_defined_measures: - fold_evaluations[measure][0][0] = user_defined_measures[measure] - - # trace. 
SGD does not produce any - assert trace is None - - self._check_fold_timing_evaluations( - fold_evaluations, - num_repeats=1, - num_folds=1, - task_type=task.task_type_id, - check_scores=False, - ) - - @pytest.mark.sklearn() - def test_run_model_on_fold_classification_2(self): - task = openml.tasks.get_task(7) # kr-vs-kp; crossvalidation - - X, y = task.get_X_and_y() - train_indices, test_indices = task.get_train_test_split_indices(repeat=0, fold=0, sample=0) - X_train = X.iloc[train_indices] - y_train = y.iloc[train_indices] - X_test = X.iloc[test_indices] - y_test = y.iloc[test_indices] - - pipeline = sklearn.model_selection.GridSearchCV( - sklearn.pipeline.Pipeline( - steps=[ - *_get_sklearn_preprocessing(), - ("clf", sklearn.tree.DecisionTreeClassifier()), - ], - ), - {"clf__max_depth": [1, 2]}, - ) - # TODO add some mocking here to actually test the innards of this function, too! - res = self.extension._run_model_on_fold( - model=pipeline, - task=task, - fold_no=0, - rep_no=0, - X_train=X_train, - y_train=y_train, - X_test=X_test, - ) - - y_hat, y_hat_proba, user_defined_measures, trace = res - - # predictions - assert isinstance(y_hat, np.ndarray) - assert y_hat.shape == y_test.shape - assert isinstance(y_hat_proba, pd.DataFrame) - assert y_hat_proba.shape == (y_test.shape[0], 2) - np.testing.assert_array_almost_equal(np.sum(y_hat_proba, axis=1), np.ones(y_test.shape)) - for i in (0, 1): - assert np.any(y_hat_proba.to_numpy()[:, i] != np.zeros(y_test.shape)) - - # check user defined measures - fold_evaluations: dict[str, dict[int, dict[int, float]]] = collections.defaultdict( - lambda: collections.defaultdict(dict) - ) - for measure in user_defined_measures: - fold_evaluations[measure][0][0] = user_defined_measures[measure] - - # check that it produced and returned a trace object of the correct length - assert isinstance(trace, OpenMLRunTrace) - assert len(trace.trace_iterations) == 2 - - self._check_fold_timing_evaluations( - fold_evaluations, - num_repeats=1, - num_folds=1, - task_type=task.task_type_id, - check_scores=False, - ) - - @pytest.mark.sklearn() - def test_run_model_on_fold_classification_3(self): - class HardNaiveBayes(sklearn.naive_bayes.GaussianNB): - # class for testing a naive bayes classifier that does not allow soft - # predictions - def predict_proba(*args, **kwargs): - raise AttributeError("predict_proba is not available when probability=False") - - # task 1 (test server) is important: it is a task with an unused class - tasks = [ - 1, # anneal; crossvalidation - 3, # anneal; crossvalidation - 115, # diabetes; crossvalidation - ] - flow = unittest.mock.Mock() - flow.name = "dummy" - - for task_id in tasks: - task = openml.tasks.get_task(task_id) - X, y = task.get_X_and_y() - train_indices, test_indices = task.get_train_test_split_indices( - repeat=0, - fold=0, - sample=0, - ) - X_train = X.iloc[train_indices] - y_train = y.iloc[train_indices] - X_test = X.iloc[test_indices] - clf1 = sklearn.pipeline.Pipeline( - steps=[ - *_get_sklearn_preprocessing(), - ("estimator", sklearn.naive_bayes.GaussianNB()), - ], - ) - clf2 = sklearn.pipeline.Pipeline( - steps=[*_get_sklearn_preprocessing(), ("estimator", HardNaiveBayes())], - ) - - pred_1, proba_1, _, _ = self.extension._run_model_on_fold( - model=clf1, - task=task, - X_train=X_train, - y_train=y_train, - X_test=X_test, - fold_no=0, - rep_no=0, - ) - pred_2, proba_2, _, _ = self.extension._run_model_on_fold( - model=clf2, - task=task, - X_train=X_train, - y_train=y_train, - X_test=X_test, - fold_no=0, - rep_no=0, - ) 
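# HardNaiveBayes.predict_proba raises an AttributeError, so the extension
# presumably falls back to predict() and derives a one-hot probability
# matrix from the hard labels. The assertions below check exactly that:
# identical label predictions for both models, smooth probabilities for
# clf1, and nothing but zeros and ones for clf2.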
- - # verifies that the predictions are identical - np.testing.assert_array_equal(pred_1, pred_2) - np.testing.assert_array_almost_equal(np.sum(proba_1, axis=1), np.ones(X_test.shape[0])) - # Test that there are predictions other than ones and zeros - assert np.sum(proba_1.to_numpy() == 0) + np.sum(proba_1.to_numpy() == 1) < X_test.shape[ - 0 - ] * len(task.class_labels) - - np.testing.assert_array_almost_equal(np.sum(proba_2, axis=1), np.ones(X_test.shape[0])) - # Test that there are only ones and zeros predicted - assert np.sum(proba_2.to_numpy() == 0) + np.sum( - proba_2.to_numpy() == 1 - ) == X_test.shape[0] * len(task.class_labels) - - @pytest.mark.sklearn() - @pytest.mark.production() - def test_run_model_on_fold_regression(self): - # There aren't any regression tasks on the test server - openml.config.server = self.production_server - task = openml.tasks.get_task(2999) - - X, y = task.get_X_and_y() - train_indices, test_indices = task.get_train_test_split_indices(repeat=0, fold=0, sample=0) - X_train = X.iloc[train_indices] - y_train = y.iloc[train_indices] - X_test = X.iloc[test_indices] - y_test = y.iloc[test_indices] - - pipeline = sklearn.pipeline.Pipeline( - steps=[("imp", SimpleImputer()), ("clf", sklearn.tree.DecisionTreeRegressor())], - ) - # TODO add some mocking here to actually test the innards of this function, too! - res = self.extension._run_model_on_fold( - model=pipeline, - task=task, - fold_no=0, - rep_no=0, - X_train=X_train, - y_train=y_train, - X_test=X_test, - ) - - y_hat, y_hat_proba, user_defined_measures, trace = res - - # predictions - assert isinstance(y_hat, np.ndarray) - assert y_hat.shape == y_test.shape - assert y_hat_proba is None - - # check user defined measures - fold_evaluations: dict[str, dict[int, dict[int, float]]] = collections.defaultdict( - lambda: collections.defaultdict(dict) - ) - for measure in user_defined_measures: - fold_evaluations[measure][0][0] = user_defined_measures[measure] - - # trace. SGD does not produce any - assert trace is None - - self._check_fold_timing_evaluations( - fold_evaluations, - num_repeats=1, - num_folds=1, - task_type=task.task_type_id, - check_scores=False, - ) - - @pytest.mark.sklearn() - @pytest.mark.production() - def test_run_model_on_fold_clustering(self): - # There aren't any regression tasks on the test server - openml.config.server = self.production_server - task = openml.tasks.get_task(126033) - - X = task.get_X() - - pipeline = sklearn.pipeline.Pipeline( - steps=[*_get_sklearn_preprocessing(), ("clf", sklearn.cluster.KMeans())], - ) - # TODO add some mocking here to actually test the innards of this function, too! - res = self.extension._run_model_on_fold( - model=pipeline, - task=task, - fold_no=0, - rep_no=0, - X_train=X, - ) - - y_hat, y_hat_proba, user_defined_measures, trace = res - - # predictions - assert isinstance(y_hat, np.ndarray) - assert y_hat.shape == (X.shape[0],) - assert y_hat_proba is None - - # check user defined measures - fold_evaluations: dict[str, dict[int, dict[int, float]]] = collections.defaultdict( - lambda: collections.defaultdict(dict) - ) - for measure in user_defined_measures: - fold_evaluations[measure][0][0] = user_defined_measures[measure] - - # trace. 
SGD does not produce any - assert trace is None - - self._check_fold_timing_evaluations( - fold_evaluations, - num_repeats=1, - num_folds=1, - task_type=task.task_type_id, - check_scores=False, - ) - - @pytest.mark.sklearn() - def test__extract_trace_data(self): - param_grid = { - "hidden_layer_sizes": [[5, 5], [10, 10], [20, 20]], - "activation": ["identity", "logistic", "tanh", "relu"], - "learning_rate_init": [0.1, 0.01, 0.001, 0.0001], - "max_iter": [10, 20, 40, 80], - } - num_iters = 10 - task = openml.tasks.get_task(20) # balance-scale; crossvalidation - clf = sklearn.model_selection.RandomizedSearchCV( - sklearn.neural_network.MLPClassifier(), - param_grid, - n_iter=num_iters, - ) - # just run the task on the model (without invoking any fancy extension & openml code) - train, _ = task.get_train_test_split_indices(0, 0) - X, y = task.get_X_and_y() - with warnings.catch_warnings(): - warnings.simplefilter("ignore") - clf.fit(X.iloc[train], y.iloc[train]) - - # check num layers of MLP - assert clf.best_estimator_.hidden_layer_sizes in param_grid["hidden_layer_sizes"] - - trace_list = self.extension._extract_trace_data(clf, rep_no=0, fold_no=0) - trace = self.extension._obtain_arff_trace(clf, trace_list) - - assert isinstance(trace, OpenMLRunTrace) - assert isinstance(trace_list, list) - assert len(trace_list) == num_iters - - for trace_iteration in iter(trace): - assert trace_iteration.repeat == 0 - assert trace_iteration.fold == 0 - assert trace_iteration.iteration >= 0 - assert trace_iteration.iteration <= num_iters - assert trace_iteration.setup_string is None - assert isinstance(trace_iteration.evaluation, float) - assert np.isfinite(trace_iteration.evaluation) - assert isinstance(trace_iteration.selected, bool) - - assert len(trace_iteration.parameters) == len(param_grid) - for param in param_grid: - # Prepend with the "parameter_" prefix - param_in_trace = f"parameter_{param}" - assert param_in_trace in trace_iteration.parameters - param_value = json.loads(trace_iteration.parameters[param_in_trace]) - assert param_value in param_grid[param] - - @pytest.mark.sklearn() - def test_trim_flow_name(self): - import re - - long = """sklearn.pipeline.Pipeline( - columntransformer=sklearn.compose._column_transformer.ColumnTransformer( - numeric=sklearn.pipeline.Pipeline( - SimpleImputer=sklearn.preprocessing.imputation.Imputer, - standardscaler=sklearn.preprocessing.data.StandardScaler), - nominal=sklearn.pipeline.Pipeline( - simpleimputer=sklearn.impute.SimpleImputer, - onehotencoder=sklearn.preprocessing._encoders.OneHotEncoder)), - variancethreshold=sklearn.feature_selection.variance_threshold.VarianceThreshold, - svc=sklearn.svm.classes.SVC)""" - short = "sklearn.Pipeline(ColumnTransformer,VarianceThreshold,SVC)" - shorter = "sklearn.Pipeline(...,SVC)" - long_stripped, _ = re.subn(r"\s", "", long) - assert short == SklearnExtension.trim_flow_name(long_stripped) - assert shorter == SklearnExtension.trim_flow_name(long_stripped, extra_trim_length=50) - - long = """sklearn.pipeline.Pipeline( - imputation=openmlstudy14.preprocessing.ConditionalImputer, - hotencoding=sklearn.preprocessing.data.OneHotEncoder, - variencethreshold=sklearn.feature_selection.variance_threshold.VarianceThreshold, - classifier=sklearn.ensemble.forest.RandomForestClassifier)""" - short = "sklearn.Pipeline(ConditionalImputer,OneHotEncoder,VarianceThreshold,RandomForestClassifier)" # noqa: E501 - long_stripped, _ = re.subn(r"\s", "", long) - assert short == SklearnExtension.trim_flow_name(long_stripped) - - long = 
"""sklearn.pipeline.Pipeline( - SimpleImputer=sklearn.preprocessing.imputation.Imputer, - VarianceThreshold=sklearn.feature_selection.variance_threshold.VarianceThreshold, # noqa: E501 - Estimator=sklearn.model_selection._search.RandomizedSearchCV( - estimator=sklearn.tree.tree.DecisionTreeClassifier))""" - short = ( - "sklearn.Pipeline(Imputer,VarianceThreshold,RandomizedSearchCV(DecisionTreeClassifier))" - ) - long_stripped, _ = re.subn(r"\s", "", long) - assert short == SklearnExtension.trim_flow_name(long_stripped) - - long = """sklearn.model_selection._search.RandomizedSearchCV( - estimator=sklearn.pipeline.Pipeline( - SimpleImputer=sklearn.preprocessing.imputation.Imputer, - classifier=sklearn.ensemble.forest.RandomForestClassifier))""" - short = "sklearn.RandomizedSearchCV(Pipeline(Imputer,RandomForestClassifier))" - long_stripped, _ = re.subn(r"\s", "", long) - assert short == SklearnExtension.trim_flow_name(long_stripped) - - long = """sklearn.pipeline.FeatureUnion( - pca=sklearn.decomposition.pca.PCA, - svd=sklearn.decomposition.truncated_svd.TruncatedSVD)""" - short = "sklearn.FeatureUnion(PCA,TruncatedSVD)" - long_stripped, _ = re.subn(r"\s", "", long) - assert short == SklearnExtension.trim_flow_name(long_stripped) - - long = "sklearn.ensemble.forest.RandomForestClassifier" - short = "sklearn.RandomForestClassifier" - assert short == SklearnExtension.trim_flow_name(long) - - assert SklearnExtension.trim_flow_name("weka.IsolationForest") == "weka.IsolationForest" - - @pytest.mark.sklearn() - @unittest.skipIf( - Version(sklearn.__version__) < Version("0.21"), - reason="SimpleImputer, ColumnTransformer available only after 0.19 and " - "Pipeline till 0.20 doesn't support indexing and 'passthrough'", - ) - def test_run_on_model_with_empty_steps(self): - from sklearn.compose import ColumnTransformer - - # testing 'drop', 'passthrough', None as non-actionable sklearn estimators - dataset = openml.datasets.get_dataset(128) # iris - task = openml.tasks.get_task(59) # mfeat-pixel; crossvalidation - - X, y, categorical_ind, feature_names = dataset.get_data( - target=dataset.default_target_attribute, - ) - categorical_ind = np.array(categorical_ind) - (cat_idx,) = np.where(categorical_ind) - (cont_idx,) = np.where(~categorical_ind) - - clf = make_pipeline( - ColumnTransformer( - [ - ( - "cat", - make_pipeline(SimpleImputer(strategy="most_frequent"), OneHotEncoder()), - cat_idx.tolist(), - ), - ( - "cont", - make_pipeline(SimpleImputer(strategy="median"), StandardScaler()), - cont_idx.tolist(), - ), - ], - ), - ) - - clf = sklearn.pipeline.Pipeline( - [ - ("dummystep", "passthrough"), # adding 'passthrough' as an estimator - ("prep", clf), - ("classifier", sklearn.svm.SVC(gamma="auto")), - ], - ) - - # adding 'drop' to a ColumnTransformer - if not categorical_ind.any(): - clf[1][0].set_params(cat="drop") - if not (~categorical_ind).any(): - clf[1][0].set_params(cont="drop") - - # serializing model with non-actionable step - run, flow = openml.runs.run_model_on_task(model=clf, task=task, return_flow=True) - - assert len(flow.components) == 3 - assert isinstance(flow.components["dummystep"], OpenMLFlow) - assert flow.components["dummystep"].name == "passthrough" - assert isinstance(flow.components["classifier"], OpenMLFlow) - if Version(sklearn.__version__) < Version("0.22"): - assert flow.components["classifier"].name == "sklearn.svm.classes.SVC" - else: - assert flow.components["classifier"].name == "sklearn.svm._classes.SVC" - assert isinstance(flow.components["prep"], OpenMLFlow) - 
assert flow.components["prep"].class_name == "sklearn.pipeline.Pipeline" - assert isinstance(flow.components["prep"].components["columntransformer"], OpenMLFlow) - assert isinstance( - flow.components["prep"].components["columntransformer"].components["cat"], OpenMLFlow - ) - assert ( - flow.components["prep"].components["columntransformer"].components["cat"].name == "drop" - ) - - # de-serializing flow to a model with non-actionable step - model = self.extension.flow_to_model(flow) - model.fit(X, y) - assert type(model) == type(clf) - assert model != clf - assert len(model.named_steps) == 3 - assert model.named_steps["dummystep"] == "passthrough" - - xml = flow._to_dict() - new_model = self.extension.flow_to_model(OpenMLFlow._from_dict(xml)) - - new_model.fit(X, y) - assert type(new_model) == type(clf) - assert new_model != clf - assert len(new_model.named_steps) == 3 - assert new_model.named_steps["dummystep"] == "passthrough" - - @pytest.mark.sklearn() - def test_sklearn_serialization_with_none_step(self): - msg = ( - "Cannot serialize objects of None type. Please use a valid " - "placeholder for None. Note that empty sklearn estimators can be " - "replaced with 'drop' or 'passthrough'." - ) - clf = sklearn.pipeline.Pipeline( - [("dummystep", None), ("classifier", sklearn.svm.SVC(gamma="auto"))], - ) - with pytest.raises(ValueError, match=msg): - self.extension.model_to_flow(clf) - - @pytest.mark.sklearn() - @unittest.skipIf( - Version(sklearn.__version__) < Version("0.20"), - reason="columntransformer introduction in 0.20.0", - ) - def test_failed_serialization_of_custom_class(self): - """Check that a custom class inheriting from sklearn fails serialization as expected.""" - try: - from sklearn.impute import SimpleImputer - except ImportError: - # for lower versions - from sklearn.preprocessing import Imputer as SimpleImputer - - import sklearn.tree - from sklearn.compose import ColumnTransformer - from sklearn.pipeline import Pipeline, make_pipeline - from sklearn.preprocessing import OneHotEncoder, StandardScaler - - cat_imp = make_pipeline( - SimpleImputer(strategy="most_frequent"), - OneHotEncoder(handle_unknown="ignore"), - ) - cont_imp = make_pipeline(CustomImputer(), StandardScaler()) - ct = ColumnTransformer([("cat", cat_imp, cat), ("cont", cont_imp, cont)]) - clf = Pipeline( - steps=[("preprocess", ct), ("estimator", sklearn.tree.DecisionTreeClassifier())], - ) # build a sklearn classifier - - task = openml.tasks.get_task(253) # profb; crossvalidation - try: - _ = openml.runs.run_model_on_task(clf, task) - except AttributeError as e: - if e.args[0] == "module '__main__' has no attribute '__version__'": - raise AttributeError(e) - else: - raise Exception(e) - - @pytest.mark.sklearn() - @unittest.skipIf( - Version(sklearn.__version__) < Version("0.20"), - reason="columntransformer introduction in 0.20.0", - ) - def test_setupid_with_column_transformer(self): - """Check whether the inclusion of a ColumnTransformer in a pipeline is treated as a new - flow each time.
- """ - import sklearn.compose - from sklearn.svm import SVC - - def column_transformer_pipe(task_id): - task = openml.tasks.get_task(task_id) - # make columntransformer - preprocessor = sklearn.compose.ColumnTransformer( - transformers=[ - ("num", StandardScaler(), cont), - ("cat", OneHotEncoder(handle_unknown="ignore"), cat), - ], - ) - # make pipeline - clf = SVC(gamma="scale", random_state=1) - pipe = make_pipeline(preprocessor, clf) - # run task - run = openml.runs.run_model_on_task(pipe, task, avoid_duplicate_runs=False) - run.publish() - return openml.runs.get_run(run.run_id) - - run1 = column_transformer_pipe(11) # only categorical - TestBase._mark_entity_for_removal("run", run1.run_id) - run2 = column_transformer_pipe(23) # only numeric - TestBase._mark_entity_for_removal("run", run2.run_id) - assert run1.setup_id == run2.setup_id From 0425929c55cbb89c2932c0d0b943f477ba908e2f Mon Sep 17 00:00:00 2001 From: PGijsbers Date: Thu, 19 Jun 2025 12:30:39 +0200 Subject: [PATCH 3/8] Do not load sklearn extension by default --- openml/__init__.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/openml/__init__.py b/openml/__init__.py index 48d301eec..c49505eb9 100644 --- a/openml/__init__.py +++ b/openml/__init__.py @@ -121,7 +121,3 @@ def populate_cache( "_api_calls", "__version__", ] - -# Load the scikit-learn extension by default -# TODO(eddiebergman): Not sure why this is at the bottom of the file -import openml.extensions.sklearn # noqa: E402, F401 From 23ada0ea44bec186200b50933674d191f1aba902 Mon Sep 17 00:00:00 2001 From: PGijsbers Date: Thu, 19 Jun 2025 12:42:00 +0200 Subject: [PATCH 4/8] Disable scikit-learn tests --- tests/test_flows/test_flow.py | 4 +- tests/test_flows/test_flow_functions.py | 4 +- tests/test_runs/test_run.py | 6 +- tests/test_runs/test_run_functions.py | 6 +- tests/test_setups/test_setup_functions.py | 4 +- tests/test_study/test_study_examples.py | 77 ----------------------- 6 files changed, 12 insertions(+), 89 deletions(-) delete mode 100644 tests/test_study/test_study_examples.py diff --git a/tests/test_flows/test_flow.py b/tests/test_flows/test_flow.py index 4a5241b62..f725074b5 100644 --- a/tests/test_flows/test_flow.py +++ b/tests/test_flows/test_flow.py @@ -26,7 +26,7 @@ import openml import openml.exceptions -import openml.extensions.sklearn +#import openml.extensions.sklearn import openml.utils from openml._api_calls import _perform_api_call from openml.testing import SimpleImputer, TestBase @@ -37,7 +37,7 @@ class TestFlow(TestBase): def setUp(self): super().setUp() - self.extension = openml.extensions.sklearn.SklearnExtension() + self.extension = None #openml.extensions.sklearn.SklearnExtension() def tearDown(self): super().tearDown() diff --git a/tests/test_flows/test_flow_functions.py b/tests/test_flows/test_flow_functions.py index 40c78c822..b78781a1c 100644 --- a/tests/test_flows/test_flow_functions.py +++ b/tests/test_flows/test_flow_functions.py @@ -18,7 +18,7 @@ from sklearn import ensemble import openml -import openml.extensions.sklearn +#import openml.extensions.sklearn from openml.exceptions import OpenMLNotAuthorizedError, OpenMLServerException from openml.testing import TestBase, create_request_response @@ -283,7 +283,7 @@ def test_sklearn_to_flow_list_of_lists(self): from sklearn.preprocessing import OrdinalEncoder ordinal_encoder = OrdinalEncoder(categories=[[0, 1], [0, 1]]) - extension = openml.extensions.sklearn.SklearnExtension() + extension = None # openml.extensions.sklearn.SklearnExtension() # Test serialization works flow = 
extension.model_to_flow(ordinal_encoder) diff --git a/tests/test_runs/test_run.py b/tests/test_runs/test_run.py index e58c72e2d..7e2117d79 100644 --- a/tests/test_runs/test_run.py +++ b/tests/test_runs/test_run.py @@ -16,7 +16,7 @@ from sklearn.tree import DecisionTreeClassifier import openml -import openml.extensions.sklearn +#import openml.extensions.sklearn from openml import OpenMLRun from openml.testing import SimpleImputer, TestBase @@ -299,7 +299,7 @@ def test_publish_with_local_loaded_flow(self): Publish a run tied to a local flow after it has first been saved to and loaded from disk. """ - extension = openml.extensions.sklearn.SklearnExtension() + extension = None # openml.extensions.sklearn.SklearnExtension() for model, task in self._get_models_tasks_for_tests(): # Make sure the flow does not exist on the server yet. @@ -339,7 +339,7 @@ def test_publish_with_local_loaded_flow(self): @pytest.mark.sklearn() def test_offline_and_online_run_identical(self): - extension = openml.extensions.sklearn.SklearnExtension() + extension = None #openml.extensions.sklearn.SklearnExtension() for model, task in self._get_models_tasks_for_tests(): # Make sure the flow does not exist on the server yet. diff --git a/tests/test_runs/test_run_functions.py b/tests/test_runs/test_run_functions.py index 58670b354..ed4847219 100644 --- a/tests/test_runs/test_run_functions.py +++ b/tests/test_runs/test_run_functions.py @@ -34,12 +34,12 @@ import openml import openml._api_calls import openml.exceptions -import openml.extensions.sklearn +#import openml.extensions.sklearn from openml.exceptions import ( OpenMLNotAuthorizedError, OpenMLServerException, ) -from openml.extensions.sklearn import cat, cont +#from openml.extensions.sklearn import cat, cont from openml.runs.functions import ( _run_task_get_arffcontent, delete_run, @@ -108,7 +108,7 @@ class TestRun(TestBase): def setUp(self): super().setUp() - self.extension = openml.extensions.sklearn.SklearnExtension() + self.extension = None#openml.extensions.sklearn.SklearnExtension() def _wait_for_processed_run(self, run_id, max_waiting_time_seconds): # it can take a while for a run to be processed on the OpenML (test) diff --git a/tests/test_setups/test_setup_functions.py b/tests/test_setups/test_setup_functions.py index 88ac84805..5cc531e63 100644 --- a/tests/test_setups/test_setup_functions.py +++ b/tests/test_setups/test_setup_functions.py @@ -13,7 +13,7 @@ import openml import openml.exceptions -import openml.extensions.sklearn +#import openml.extensions.sklearn from openml.testing import TestBase @@ -31,7 +31,7 @@ class TestSetupFunctions(TestBase): _multiprocess_can_split_ = True def setUp(self): - self.extension = openml.extensions.sklearn.SklearnExtension() + self.extension = None# openml.extensions.sklearn.SklearnExtension() super().setUp() @pytest.mark.sklearn() diff --git a/tests/test_study/test_study_examples.py b/tests/test_study/test_study_examples.py deleted file mode 100644 index e3b21fc8c..000000000 --- a/tests/test_study/test_study_examples.py +++ /dev/null @@ -1,77 +0,0 @@ -# License: BSD 3-Clause -from __future__ import annotations - -import unittest -from packaging.version import Version - -import pytest -import sklearn - -from openml.extensions.sklearn import cat, cont -from openml.testing import TestBase - - -class TestStudyFunctions(TestBase): - _multiprocess_can_split_ = True - """Test the example code of Bischl et al. 
(2018)""" - - @pytest.mark.sklearn() - @unittest.skipIf( - Version(sklearn.__version__) < Version("0.24"), - reason="columntransformer introduction in 0.24.0", - ) - def test_Figure1a(self): - """Test listing in Figure 1a on a single task and the old OpenML100 study. - - The original listing is pasted into the comment below because it the actual unit test - differs a bit, as for example it does not run for all tasks, but only a single one. - - import openml - import sklearn.tree, sklearn.preprocessing - benchmark_suite = openml.study.get_study('OpenML-CC18','tasks') # obtain the benchmark suite - clf = sklearn.pipeline.Pipeline(steps=[('imputer',sklearn.preprocessing.Imputer()), ('estimator',sklearn.tree.DecisionTreeClassifier())]) # build a sklearn classifier - for task_id in benchmark_suite.tasks: # iterate over all tasks - task = openml.tasks.get_task(task_id) # download the OpenML task - X, y = task.get_X_and_y() # get the data (not used in this example) - openml.config.apikey = 'FILL_IN_OPENML_API_KEY' # set the OpenML Api Key - run = openml.runs.run_model_on_task(task,clf) # run classifier on splits (requires API key) - score = run.get_metric_fn(sklearn.metrics.accuracy_score) # print accuracy score - print('Data set: %s; Accuracy: %0.2f' % (task.get_dataset().name,score.mean())) - run.publish() # publish the experiment on OpenML (optional) - print('URL for run: %s/run/%d' %(openml.config.server,run.run_id)) - """ # noqa: E501 - import sklearn.metrics - import sklearn.tree - from sklearn.compose import ColumnTransformer - from sklearn.impute import SimpleImputer - from sklearn.pipeline import Pipeline, make_pipeline - from sklearn.preprocessing import OneHotEncoder, StandardScaler - - import openml - - benchmark_suite = openml.study.get_study("OpenML100", "tasks") # obtain the benchmark suite - cat_imp = OneHotEncoder(handle_unknown="ignore") - cont_imp = make_pipeline(SimpleImputer(strategy="median"), StandardScaler()) - ct = ColumnTransformer([("cat", cat_imp, cat), ("cont", cont_imp, cont)]) - clf = Pipeline( - steps=[("preprocess", ct), ("estimator", sklearn.tree.DecisionTreeClassifier())], - ) # build a sklearn classifier - for task_id in benchmark_suite.tasks[:1]: # iterate over all tasks - task = openml.tasks.get_task(task_id) # download the OpenML task - X, y = task.get_X_and_y() # get the data (not used in this example) - openml.config.apikey = openml.config.apikey # set the OpenML Api Key - run = openml.runs.run_model_on_task( - clf, - task, - avoid_duplicate_runs=False, - ) # run classifier on splits (requires API key) - score = run.get_metric_fn(sklearn.metrics.accuracy_score) # print accuracy score - TestBase.logger.info( - f"Data set: {task.get_dataset().name}; Accuracy: {score.mean():0.2f}", - ) - run.publish() # publish the experiment on OpenML (optional) - TestBase._mark_entity_for_removal("run", run.run_id) - TestBase.logger.info( - f"collected from {__file__.split('/')[-1]}: {run.run_id}", - ) - TestBase.logger.info("URL for run: %s/run/%d" % (openml.config.server, run.run_id)) From 0f2ac1ad720b5354ee41aace29a3e38ede8c725e Mon Sep 17 00:00:00 2001 From: PGijsbers Date: Thu, 19 Jun 2025 15:20:43 +0200 Subject: [PATCH 5/8] Tests fail successfully --- openml/extensions/functions.py | 22 ++++++++++++++++++++-- openml/flows/flow.py | 17 +++++++---------- tests/test_extensions/test_functions.py | 2 ++ tests/test_flows/test_flow_functions.py | 4 ++-- tests/test_runs/test_run_functions.py | 2 ++ 5 files changed, 33 insertions(+), 14 deletions(-) diff --git 
a/openml/extensions/functions.py b/openml/extensions/functions.py index 302ab246c..7a944c997 100644 --- a/openml/extensions/functions.py +++ b/openml/extensions/functions.py @@ -13,6 +13,13 @@ from . import Extension +SKLEARN_HINT = ( + "But it looks related to scikit-learn. " + "Please install the OpenML scikit-learn extension (openml-sklearn) and try again. " + "For more information, see " + "https://github.com/openml/openml-sklearn?tab=readme-ov-file#installation" +) + def register_extension(extension: type[Extension]) -> None: """Register an extension. @@ -57,7 +64,13 @@ def get_extension_by_flow( candidates.append(extension_class()) if len(candidates) == 0: if raise_if_no_extension: - raise ValueError(f"No extension registered which can handle flow: {flow}") + install_instruction = "" + if flow.name.startswith("sklearn"): + install_instruction = SKLEARN_HINT + raise ValueError( + f"No extension registered which can handle flow: {flow.flow_id} ({flow.name}). " + f"{install_instruction}" + ) return None @@ -96,7 +109,12 @@ def get_extension_by_model( candidates.append(extension_class()) if len(candidates) == 0: if raise_if_no_extension: - raise ValueError(f"No extension registered which can handle model: {model}") + install_instruction = "" + if type(model).__module__.startswith("sklearn"): + install_instruction = SKLEARN_HINT + raise ValueError( + f"No extension registered which can handle model: {model}. {install_instruction}" + ) return None diff --git a/openml/flows/flow.py b/openml/flows/flow.py index a3ff50ca1..02d24e78b 100644 --- a/openml/flows/flow.py +++ b/openml/flows/flow.py @@ -4,7 +4,7 @@ import logging from collections import OrderedDict from pathlib import Path -from typing import Any, Hashable, Sequence +from typing import Any, Hashable, Sequence, cast import xmltodict @@ -157,10 +157,7 @@ def __init__( # noqa: PLR0913 self.language = language self.dependencies = dependencies self.flow_id = flow_id - if extension is None: - self._extension = get_extension_by_flow(self) - else: - self._extension = extension + self._extension = extension @property def id(self) -> int | None: @@ -170,12 +167,12 @@ def id(self) -> int | None: @property def extension(self) -> Extension: """The extension of the flow (e.g., sklearn).""" - if self._extension is not None: - return self._extension + if self._extension is None: + self._extension = cast( + Extension, get_extension_by_flow(self, raise_if_no_extension=True) + ) - raise RuntimeError( - f"No extension could be found for flow {self.flow_id}: {self.name}", - ) + return self._extension def _get_repr_body_fields(self) -> Sequence[tuple[str, str | int | list[str]]]: """Collect all information to display in the __repr__ body.""" diff --git a/tests/test_extensions/test_functions.py b/tests/test_extensions/test_functions.py index bc7937c88..ac4610a15 100644 --- a/tests/test_extensions/test_functions.py +++ b/tests/test_extensions/test_functions.py @@ -11,6 +11,8 @@ class DummyFlow: external_version = "DummyFlow==0.1" + name = "Dummy Flow" + flow_id = 1 dependencies = None diff --git a/tests/test_flows/test_flow_functions.py b/tests/test_flows/test_flow_functions.py index b78781a1c..b44de08bf 100644 --- a/tests/test_flows/test_flow_functions.py +++ b/tests/test_flows/test_flow_functions.py @@ -321,8 +321,8 @@ def test_get_flow_reinstantiate_model(self): def test_get_flow_reinstantiate_model_no_extension(self): # Flow 10 is a WEKA flow self.assertRaisesRegex( - RuntimeError, - "No extension could be found for flow 10: weka.SMO", + ValueError, + 
".* flow: 10 \(weka.SMO\). ", openml.flows.get_flow, flow_id=10, reinstantiate=True, diff --git a/tests/test_runs/test_run_functions.py b/tests/test_runs/test_run_functions.py index ed4847219..ed90bcad5 100644 --- a/tests/test_runs/test_run_functions.py +++ b/tests/test_runs/test_run_functions.py @@ -1900,6 +1900,7 @@ def test_joblib_backends(self, parallel_mock): Version(sklearn.__version__) < Version("0.20"), reason="SimpleImputer doesn't handle mixed type DataFrame as input", ) + @pytest.mark.sklearn() def test_delete_run(self): rs = np.random.randint(1, 2**31 - 1) clf = sklearn.pipeline.Pipeline( @@ -1928,6 +1929,7 @@ def test_delete_run(self): Version(sklearn.__version__) < Version("0.20"), reason="SimpleImputer doesn't handle mixed type DataFrame as input", ) + @pytest.mark.sklearn() def test_initialize_model_from_run_nonstrict(self): # We cannot guarantee that a run with an older version exists on the server. # Thus, we test it simply with a run that we know exists that might not be loose. From 943afcc714a8682f0846c35449e22e90cd7c674b Mon Sep 17 00:00:00 2001 From: PGijsbers Date: Thu, 19 Jun 2025 15:35:49 +0200 Subject: [PATCH 6/8] Add openml-sklearn as dependency of sklearn tests --- pyproject.toml | 1 + tests/conftest.py | 1 + 2 files changed, 2 insertions(+) diff --git a/pyproject.toml b/pyproject.toml index fa9a70dc1..e32f113e5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -80,6 +80,7 @@ test=[ "mypy", "ruff", "requests-mock", + "openml-sklearn", ] examples=[ "matplotlib", diff --git a/tests/conftest.py b/tests/conftest.py index 778b0498b..28ff9df46 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -33,6 +33,7 @@ import shutil from pathlib import Path import pytest +import openml_sklearn import openml from openml.testing import TestBase From 89f8fceb433b841032fb6a74567b5b6740fae275 Mon Sep 17 00:00:00 2001 From: PGijsbers Date: Thu, 19 Jun 2025 16:00:40 +0200 Subject: [PATCH 7/8] Make use of openml_sklearn extension --- tests/test_flows/test_flow.py | 6 ++++-- tests/test_flows/test_flow_functions.py | 4 ++-- tests/test_runs/test_run.py | 6 +++--- tests/test_runs/test_run_functions.py | 9 +++++---- tests/test_setups/test_setup_functions.py | 4 ++-- 5 files changed, 16 insertions(+), 13 deletions(-) diff --git a/tests/test_flows/test_flow.py b/tests/test_flows/test_flow.py index f725074b5..e6407a51c 100644 --- a/tests/test_flows/test_flow.py +++ b/tests/test_flows/test_flow.py @@ -24,20 +24,22 @@ import sklearn.tree import xmltodict +from openml_sklearn import SklearnExtension + import openml import openml.exceptions -#import openml.extensions.sklearn import openml.utils from openml._api_calls import _perform_api_call from openml.testing import SimpleImputer, TestBase + class TestFlow(TestBase): _multiprocess_can_split_ = True def setUp(self): super().setUp() - self.extension = None #openml.extensions.sklearn.SklearnExtension() + self.extension = SklearnExtension() def tearDown(self): super().tearDown() diff --git a/tests/test_flows/test_flow_functions.py b/tests/test_flows/test_flow_functions.py index b44de08bf..4a9b03fd7 100644 --- a/tests/test_flows/test_flow_functions.py +++ b/tests/test_flows/test_flow_functions.py @@ -7,6 +7,7 @@ from collections import OrderedDict from multiprocessing.managers import Value +from openml_sklearn import SklearnExtension from packaging.version import Version from unittest import mock from unittest.mock import patch @@ -18,7 +19,6 @@ from sklearn import ensemble import openml -#import openml.extensions.sklearn from 
openml.exceptions import OpenMLNotAuthorizedError, OpenMLServerException from openml.testing import TestBase, create_request_response @@ -283,7 +283,7 @@ def test_sklearn_to_flow_list_of_lists(self): from sklearn.preprocessing import OrdinalEncoder ordinal_encoder = OrdinalEncoder(categories=[[0, 1], [0, 1]]) - extension = None # openml.extensions.sklearn.SklearnExtension() + extension = SklearnExtension() # Test serialization works flow = extension.model_to_flow(ordinal_encoder) diff --git a/tests/test_runs/test_run.py b/tests/test_runs/test_run.py index 7e2117d79..88fa1672b 100644 --- a/tests/test_runs/test_run.py +++ b/tests/test_runs/test_run.py @@ -8,6 +8,7 @@ import numpy as np import pytest import xmltodict +from openml_sklearn import SklearnExtension from sklearn.base import clone from sklearn.dummy import DummyClassifier from sklearn.linear_model import LinearRegression @@ -16,7 +17,6 @@ from sklearn.tree import DecisionTreeClassifier import openml -#import openml.extensions.sklearn from openml import OpenMLRun from openml.testing import SimpleImputer, TestBase @@ -299,7 +299,7 @@ def test_publish_with_local_loaded_flow(self): Publish a run tied to a local flow after it has first been saved to and loaded from disk. """ - extension = None # openml.extensions.sklearn.SklearnExtension() + extension = SklearnExtension() for model, task in self._get_models_tasks_for_tests(): # Make sure the flow does not exist on the server yet. @@ -339,7 +339,7 @@ def test_publish_with_local_loaded_flow(self): @pytest.mark.sklearn() def test_offline_and_online_run_identical(self): - extension = None #openml.extensions.sklearn.SklearnExtension() + extension = SklearnExtension() for model, task in self._get_models_tasks_for_tests(): # Make sure the flow does not exist on the server yet.
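A minimal sketch of the behaviour the flow.py and functions.py hunks above introduce, assuming no registered extension can handle the flow (e.g., openml-sklearn is not installed). Flow 10 is the WEKA flow used in the updated test; the lazy lookup in the `extension` property now raises the new ValueError on first access instead of at construction time:

    import openml

    flow = openml.flows.get_flow(10)  # weka.SMO, as in the test above
    try:
        _ = flow.extension  # lazily calls get_extension_by_flow(self, raise_if_no_extension=True)
    except ValueError as err:
        # "No extension registered which can handle flow: 10 (weka.SMO). "
        # (no install hint, since the flow name does not start with "sklearn")
        print(err)

Deferring the lookup this way also means flows can be constructed and inspected without any extension installed, which is what the deletion of the extension relies on.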
diff --git a/tests/test_runs/test_run_functions.py b/tests/test_runs/test_run_functions.py index ed90bcad5..725421d4f 100644 --- a/tests/test_runs/test_run_functions.py +++ b/tests/test_runs/test_run_functions.py @@ -7,6 +7,8 @@ import time import unittest import warnings + +from openml_sklearn import SklearnExtension, cat, cont from packaging.version import Version from unittest import mock @@ -34,7 +36,6 @@ import openml import openml._api_calls import openml.exceptions -#import openml.extensions.sklearn from openml.exceptions import ( OpenMLNotAuthorizedError, OpenMLServerException, @@ -108,7 +109,7 @@ class TestRun(TestBase): def setUp(self): super().setUp() - self.extension = None#openml.extensions.sklearn.SklearnExtension() + self.extension = SklearnExtension() def _wait_for_processed_run(self, run_id, max_waiting_time_seconds): # it can take a while for a run to be processed on the OpenML (test) @@ -1750,7 +1751,7 @@ def test_format_prediction_task_regression(self): Version(sklearn.__version__) < Version("0.21"), reason="couldn't perform local tests successfully w/o bloating RAM", ) - @mock.patch("openml.extensions.sklearn.SklearnExtension._prevent_optimize_n_jobs") + @mock.patch("openml_sklearn.SklearnExtension._prevent_optimize_n_jobs") def test__run_task_get_arffcontent_2(self, parallel_mock): """Tests if a run executed in parallel is collated correctly.""" task = openml.tasks.get_task(7) # Supervised Classification on kr-vs-kp @@ -1824,7 +1825,7 @@ def test__run_task_get_arffcontent_2(self, parallel_mock): Version(sklearn.__version__) < Version("0.21"), reason="couldn't perform local tests successfully w/o bloating RAM", ) - @mock.patch("openml.extensions.sklearn.SklearnExtension._prevent_optimize_n_jobs") + @mock.patch("openml_sklearn.SklearnExtension._prevent_optimize_n_jobs") def test_joblib_backends(self, parallel_mock): """Tests evaluation of a run using various joblib backends and n_jobs.""" task = openml.tasks.get_task(7) # Supervised Classification on kr-vs-kp diff --git a/tests/test_setups/test_setup_functions.py b/tests/test_setups/test_setup_functions.py index 5cc531e63..b805ca9d3 100644 --- a/tests/test_setups/test_setup_functions.py +++ b/tests/test_setups/test_setup_functions.py @@ -10,10 +10,10 @@ import sklearn.base import sklearn.naive_bayes import sklearn.tree +from openml_sklearn import SklearnExtension import openml import openml.exceptions -#import openml.extensions.sklearn from openml.testing import TestBase @@ -31,7 +31,7 @@ class TestSetupFunctions(TestBase): _multiprocess_can_split_ = True def setUp(self): - self.extension = None# openml.extensions.sklearn.SklearnExtension() + self.extension = SklearnExtension() super().setUp() @pytest.mark.sklearn() From 8ab1992d5e887354a7092134c198767ad381a77a Mon Sep 17 00:00:00 2001 From: PGijsbers Date: Thu, 19 Jun 2025 16:06:52 +0200 Subject: [PATCH 8/8] packaging is only used in test submodules --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index e32f113e5..91ec2327c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -20,7 +20,6 @@ dependencies = [ "minio", "pyarrow", "tqdm", # For MinIO download progress bars - "packaging", ] requires-python = ">=3.8" maintainers = [ @@ -81,6 +80,7 @@ test=[ "ruff", "requests-mock", "openml-sklearn", + "packaging", ] examples=[ "matplotlib",
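The model-side lookup gains the same hint. A sketch, again assuming openml-sklearn is not installed, so that no registered extension matches and the `type(model).__module__` check appends SKLEARN_HINT:

    from sklearn.tree import DecisionTreeClassifier

    from openml.extensions import get_extension_by_model

    try:
        get_extension_by_model(DecisionTreeClassifier(), raise_if_no_extension=True)
    except ValueError as err:
        # "No extension registered which can handle model: DecisionTreeClassifier().
        #  But it looks related to scikit-learn. Please install the OpenML
        #  scikit-learn extension (openml-sklearn) and try again. ..."
        print(err)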
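With the test dependency in place, the end state of the migration looks as follows: importing openml_sklearn (as conftest.py now does) registers the extension, and the `cat`/`cont` helpers that the deleted Figure 1a test pulled from openml.extensions.sklearn come from the package instead. A sketch of the equivalent pipeline construction, reusing the preprocessing from the deleted listing:

    from openml_sklearn import SklearnExtension, cat, cont
    from sklearn.compose import ColumnTransformer
    from sklearn.impute import SimpleImputer
    from sklearn.pipeline import Pipeline, make_pipeline
    from sklearn.preprocessing import OneHotEncoder, StandardScaler
    from sklearn.tree import DecisionTreeClassifier

    # cat/cont select the categorical/continuous columns of a DataFrame,
    # so each branch of the ColumnTransformer only sees its own dtypes.
    cont_imp = make_pipeline(SimpleImputer(strategy="median"), StandardScaler())
    ct = ColumnTransformer([
        ("cat", OneHotEncoder(handle_unknown="ignore"), cat),
        ("cont", cont_imp, cont),
    ])
    clf = Pipeline(steps=[("preprocess", ct), ("estimator", DecisionTreeClassifier())])
    flow = SklearnExtension().model_to_flow(clf)  # serialize for OpenML as before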