From e7b825ea0925268921b8a5715c29c5beb67c36e2 Mon Sep 17 00:00:00 2001
From: PGijsbers
Date: Thu, 19 Jun 2025 11:49:59 +0200
Subject: [PATCH 1/8] Delete the extension
---
openml/extensions/sklearn/__init__.py | 43 -
openml/extensions/sklearn/extension.py | 2270 ------------------------
2 files changed, 2313 deletions(-)
delete mode 100644 openml/extensions/sklearn/__init__.py
delete mode 100644 openml/extensions/sklearn/extension.py
diff --git a/openml/extensions/sklearn/__init__.py b/openml/extensions/sklearn/__init__.py
deleted file mode 100644
index 9c1c6cba6..000000000
--- a/openml/extensions/sklearn/__init__.py
+++ /dev/null
@@ -1,43 +0,0 @@
-# License: BSD 3-Clause
-from __future__ import annotations
-
-from typing import TYPE_CHECKING
-
-from openml.extensions import register_extension
-
-from .extension import SklearnExtension
-
-if TYPE_CHECKING:
- import pandas as pd
-
-__all__ = ["SklearnExtension"]
-
-register_extension(SklearnExtension)
-
-
-def cont(X: pd.DataFrame) -> pd.Series:
- """Returns True for all non-categorical columns, False for the rest.
-
-    This is a helper function for OpenML datasets encoded as DataFrames, simplifying the
-    handling of mixed data types. To build sklearn models on mixed data types, a
-    ColumnTransformer is required to process each type of column separately.
-    This function allows transformations meant for continuous/numeric columns to access the
-    continuous/numeric columns of the dataset given as a DataFrame.
- """
- if not hasattr(X, "dtypes"):
- raise AttributeError("Not a Pandas DataFrame with 'dtypes' as attribute!")
- return X.dtypes != "category"
-
-
-def cat(X: pd.DataFrame) -> pd.Series:
- """Returns True for all categorical columns, False for the rest.
-
-    This is a helper function for OpenML datasets encoded as DataFrames, simplifying the
-    handling of mixed data types. To build sklearn models on mixed data types, a
-    ColumnTransformer is required to process each type of column separately.
-    This function allows transformations meant for categorical columns to access the
-    categorical columns of the dataset given as a DataFrame.
- """
- if not hasattr(X, "dtypes"):
- raise AttributeError("Not a Pandas DataFrame with 'dtypes' as attribute!")
- return X.dtypes == "category"
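-
-
-# A minimal usage sketch (illustrative, not part of this module): `cont` and
-# `cat` can be passed as column selectors to a ColumnTransformer so that
-# numeric and categorical columns are preprocessed separately. The chosen
-# transformers below are assumptions for demonstration only.
-#
-#     from sklearn.compose import ColumnTransformer
-#     from sklearn.impute import SimpleImputer
-#     from sklearn.pipeline import make_pipeline
-#     from sklearn.preprocessing import OneHotEncoder, StandardScaler
-#
-#     preprocessing = ColumnTransformer(
-#         [
-#             ("num", make_pipeline(SimpleImputer(), StandardScaler()), cont),
-#             ("cat", make_pipeline(SimpleImputer(strategy="most_frequent"),
-#                                   OneHotEncoder(handle_unknown="ignore")), cat),
-#         ]
-#     )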
diff --git a/openml/extensions/sklearn/extension.py b/openml/extensions/sklearn/extension.py
deleted file mode 100644
index 0c7588cdd..000000000
--- a/openml/extensions/sklearn/extension.py
+++ /dev/null
@@ -1,2270 +0,0 @@
-# License: BSD 3-Clause
-from __future__ import annotations
-
-import contextlib
-import copy
-import importlib
-import inspect
-import json
-import logging
-import re
-import sys
-import time
-import traceback
-import warnings
-from collections import OrderedDict
-from json.decoder import JSONDecodeError
-from re import IGNORECASE
-from typing import Any, Callable, List, Sized, cast
-
-import numpy as np
-import pandas as pd
-import scipy.sparse
-import scipy.stats
-import sklearn.base
-import sklearn.model_selection
-import sklearn.pipeline
-from packaging.version import Version
-
-import openml
-from openml.exceptions import PyOpenMLError
-from openml.extensions import Extension
-from openml.flows import OpenMLFlow
-from openml.runs.trace import PREFIX, OpenMLRunTrace, OpenMLTraceIteration
-from openml.tasks import (
- OpenMLClassificationTask,
- OpenMLClusteringTask,
- OpenMLLearningCurveTask,
- OpenMLRegressionTask,
- OpenMLSupervisedTask,
- OpenMLTask,
-)
-
-logger = logging.getLogger(__name__)
-
-
-DEPENDENCIES_PATTERN = re.compile(
- r"^(?P[\w\-]+)((?P==|>=|>)"
- r"(?P(\d+\.)?(\d+\.)?(\d+)?(dev)?[0-9]*))?$",
-)
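-
-# For example (illustrative): "scikit-learn>=0.24.2" parses into
-# name="scikit-learn", operation=">=", version="0.24.2"; a bare "numpy"
-# with no version constraint also matches, with operation/version unset.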
-
-# NOTE(eddiebergman): This was imported before but became deprecated,
-# as a result I just enumerated them manually by copy-ing and pasting,
-# recommended solution in Numpy 2.0 guide was to explicitly list them.
-SIMPLE_NUMPY_TYPES = [
- np.int8,
- np.int16,
- np.int32,
- np.int64,
- np.longlong,
- np.uint8,
- np.uint16,
- np.uint32,
- np.uint64,
- np.ulonglong,
- np.float16,
- np.float32,
- np.float64,
- np.longdouble,
- np.complex64,
- np.complex128,
- np.clongdouble,
-]
-SIMPLE_TYPES = (bool, int, float, str, *SIMPLE_NUMPY_TYPES)
-
-SKLEARN_PIPELINE_STRING_COMPONENTS = ("drop", "passthrough")
-COMPONENT_REFERENCE = "component_reference"
-COMPOSITION_STEP_CONSTANT = "composition_step_constant"
-
-
-class SklearnExtension(Extension):
- """Connect scikit-learn to OpenML-Python.
- The estimators which use this extension must be scikit-learn compatible,
- i.e needs to be a subclass of sklearn.base.BaseEstimator".
- """
-
- ################################################################################################
- # General setup
-
- @classmethod
- def can_handle_flow(cls, flow: OpenMLFlow) -> bool:
- """Check whether a given describes a scikit-learn estimator.
-
- This is done by parsing the ``external_version`` field.
-
- Parameters
- ----------
- flow : OpenMLFlow
-
- Returns
- -------
- bool
- """
- return cls._is_sklearn_flow(flow)
-
- @classmethod
- def can_handle_model(cls, model: Any) -> bool:
- """Check whether a model is an instance of ``sklearn.base.BaseEstimator``.
-
- Parameters
- ----------
- model : Any
-
- Returns
- -------
- bool
- """
- return isinstance(model, sklearn.base.BaseEstimator)
-
- @classmethod
- def trim_flow_name( # noqa: C901
- cls,
- long_name: str,
- extra_trim_length: int = 100,
- _outer: bool = True, # noqa: FBT001, FBT002
- ) -> str:
- """Shorten generated sklearn flow name to at most ``max_length`` characters.
-
- Flows are assumed to have the following naming structure:
- ``(model_selection)? (pipeline)? (steps)+``
- and will be shortened to:
- ``sklearn.(selection.)?(pipeline.)?(steps)+``
- e.g. (white spaces and newlines added for readability)
-
- .. code ::
-
- sklearn.pipeline.Pipeline(
- columntransformer=sklearn.compose._column_transformer.ColumnTransformer(
- numeric=sklearn.pipeline.Pipeline(
- imputer=sklearn.preprocessing.imputation.Imputer,
- standardscaler=sklearn.preprocessing.data.StandardScaler),
- nominal=sklearn.pipeline.Pipeline(
- simpleimputer=sklearn.impute.SimpleImputer,
- onehotencoder=sklearn.preprocessing._encoders.OneHotEncoder)),
- variancethreshold=sklearn.feature_selection.variance_threshold.VarianceThreshold,
- svc=sklearn.svm.classes.SVC)
-
- ->
- ``sklearn.Pipeline(ColumnTransformer,VarianceThreshold,SVC)``
-
- Parameters
- ----------
- long_name : str
- The full flow name generated by the scikit-learn extension.
- extra_trim_length: int (default=100)
- If the trimmed name would exceed `extra_trim_length` characters, additional trimming
- of the short name is performed. This reduces the produced short name length.
- There is no guarantee the end result will not exceed `extra_trim_length`.
- _outer : bool (default=True)
- For internal use only. Specifies if the function is called recursively.
-
- Returns
- -------
- str
-
- """
-
- def remove_all_in_parentheses(string: str) -> str:
- string, removals = re.subn(r"\([^()]*\)", "", string)
- while removals > 0:
- string, removals = re.subn(r"\([^()]*\)", "", string)
- return string
-
- # Generally, we want to trim all hyperparameters, the exception to that is for model
- # selection, as the `estimator` hyperparameter is very indicative of what is in the flow.
-        # So we first trim the name of the `estimator` specified in model selection. For reference, in
- # the example below, we want to trim `sklearn.tree.tree.DecisionTreeClassifier`, and
- # keep it in the final trimmed flow name:
- # sklearn.pipeline.Pipeline(Imputer=sklearn.preprocessing.imputation.Imputer,
- # VarianceThreshold=sklearn.feature_selection.variance_threshold.VarianceThreshold, # noqa: ERA001, E501
- # Estimator=sklearn.model_selection._search.RandomizedSearchCV(estimator=
- # sklearn.tree.tree.DecisionTreeClassifier))
- if "sklearn.model_selection" in long_name:
- start_index = long_name.index("sklearn.model_selection")
- estimator_start = (
- start_index + long_name[start_index:].index("estimator=") + len("estimator=")
- )
-
- model_select_boilerplate = long_name[start_index:estimator_start]
-            # above is e.g. "sklearn.model_selection._search.RandomizedSearchCV(estimator="
- model_selection_class = model_select_boilerplate.split("(")[0].split(".")[-1]
-
- # Now we want to also find and parse the `estimator`, for this we find the closing
- # parenthesis to the model selection technique:
- closing_parenthesis_expected = 1
- for char in long_name[estimator_start:]:
- if char == "(":
- closing_parenthesis_expected += 1
- if char == ")":
- closing_parenthesis_expected -= 1
- if closing_parenthesis_expected == 0:
- break
-
- _end: int = estimator_start + len(long_name[estimator_start:]) - 1
- model_select_pipeline = long_name[estimator_start:_end]
-
- trimmed_pipeline = cls.trim_flow_name(model_select_pipeline, _outer=False)
- _, trimmed_pipeline = trimmed_pipeline.split(".", maxsplit=1) # trim module prefix
- model_select_short = f"sklearn.{model_selection_class}[{trimmed_pipeline}]"
- name = long_name[:start_index] + model_select_short + long_name[_end + 1 :]
- else:
- name = long_name
-
- module_name = long_name.split(".")[0]
- short_name = module_name + ".{}"
-
- if name.startswith("sklearn.pipeline"):
- full_pipeline_class, pipeline = name[:-1].split("(", maxsplit=1)
- pipeline_class = full_pipeline_class.split(".")[-1]
- # We don't want nested pipelines in the short name, so we trim all complicated
- # subcomponents, i.e. those with parentheses:
- pipeline = remove_all_in_parentheses(pipeline)
-
- # then the pipeline steps are formatted e.g.:
- # step1name=sklearn.submodule.ClassName,step2name...
- components = [component.split(".")[-1] for component in pipeline.split(",")]
- pipeline = f"{pipeline_class}({','.join(components)})"
- if len(short_name.format(pipeline)) > extra_trim_length:
- pipeline = f"{pipeline_class}(...,{components[-1]})"
- else:
- # Just a simple component: e.g. sklearn.tree.DecisionTreeClassifier
- pipeline = remove_all_in_parentheses(name).split(".")[-1]
-
- if not _outer:
-            # Anything in parentheses in inner calls should not be culled, so we use brackets
- pipeline = pipeline.replace("(", "[").replace(")", "]")
- else:
- # Square brackets may be introduced with nested model_selection
- pipeline = pipeline.replace("[", "(").replace("]", ")")
-
- return short_name.format(pipeline)
-
- @classmethod
- def _min_dependency_str(cls, sklearn_version: str) -> str:
- """Returns a string containing the minimum dependencies for the sklearn version passed.
-
- Parameters
- ----------
- sklearn_version : str
-            A version string of the form xx.xx.xx
-
- Returns
- -------
- str
- """
- # This explicit check is necessary to support existing entities on the OpenML servers
- # that used the fixed dependency string (in the else block)
- if Version(openml.__version__) > Version("0.11"):
- # OpenML v0.11 onwards supports sklearn>=0.24
- # assumption: 0.24 onwards sklearn should contain a _min_dependencies.py file with
- # variables declared for extracting minimum dependency for that version
- if Version(sklearn_version) >= Version("0.24"):
- from sklearn import _min_dependencies as _mindep
-
- dependency_list = {
- "numpy": f"{_mindep.NUMPY_MIN_VERSION}",
- "scipy": f"{_mindep.SCIPY_MIN_VERSION}",
- "joblib": f"{_mindep.JOBLIB_MIN_VERSION}",
- "threadpoolctl": f"{_mindep.THREADPOOLCTL_MIN_VERSION}",
- }
- elif Version(sklearn_version) >= Version("0.23"):
- dependency_list = {
- "numpy": "1.13.3",
- "scipy": "0.19.1",
- "joblib": "0.11",
- "threadpoolctl": "2.0.0",
- }
- if Version(sklearn_version).micro == 0:
- dependency_list.pop("threadpoolctl")
- elif Version(sklearn_version) >= Version("0.21"):
- dependency_list = {"numpy": "1.11.0", "scipy": "0.17.0", "joblib": "0.11"}
- elif Version(sklearn_version) >= Version("0.19"):
- dependency_list = {"numpy": "1.8.2", "scipy": "0.13.3"}
- else:
- dependency_list = {"numpy": "1.6.1", "scipy": "0.9"}
- else:
- # this is INCORRECT for sklearn versions >= 0.19 and < 0.24
- # given that OpenML has existing flows uploaded with such dependency information,
- # we change no behaviour for older sklearn version, however from 0.24 onwards
- # the dependency list will be accurately updated for any flow uploaded to OpenML
- dependency_list = {"numpy": "1.6.1", "scipy": "0.9"}
-
- sklearn_dep = f"sklearn=={sklearn_version}"
- dep_str = "\n".join([f"{k}>={v}" for k, v in dependency_list.items()])
- return "\n".join([sklearn_dep, dep_str])
-
- ################################################################################################
- # Methods for flow serialization and de-serialization
-
- def flow_to_model(
- self,
- flow: OpenMLFlow,
- initialize_with_defaults: bool = False, # noqa: FBT001, FBT002
- strict_version: bool = True, # noqa: FBT001, FBT002
- ) -> Any:
- """Initializes a sklearn model based on a flow.
-
- Parameters
- ----------
- flow : mixed
-            the object to deserialize (can be a flow object, or any serialized
-            parameter value that is accepted by this function)
-
- initialize_with_defaults : bool, optional (default=False)
- If this flag is set, the hyperparameter values of flows will be
- ignored and a flow with its defaults is returned.
-
- strict_version : bool, default=True
- Whether to fail if version requirements are not fulfilled.
-
- Returns
- -------
- mixed
- """
- return self._deserialize_sklearn(
- flow,
- initialize_with_defaults=initialize_with_defaults,
- strict_version=strict_version,
- )
-
- def _deserialize_sklearn( # noqa: PLR0915, C901, PLR0912
- self,
- o: Any,
- components: dict | None = None,
- initialize_with_defaults: bool = False, # noqa: FBT001, FBT002
- recursion_depth: int = 0,
- strict_version: bool = True, # noqa: FBT002, FBT001
- ) -> Any:
- """Recursive function to deserialize a scikit-learn flow.
-
- This function inspects an object to deserialize and decides how to do so. This function
- delegates all work to the respective functions to deserialize special data structures etc.
- This function works on everything that has been serialized to OpenML: OpenMLFlow,
- components (which are flows themselves), functions, hyperparameter distributions (for
- random search) and the actual hyperparameter values themselves.
-
- Parameters
- ----------
- o : mixed
-            the object to deserialize (can be a flow object, or any serialized
-            parameter value that is accepted by this function)
-
- components : Optional[dict]
- Components of the current flow being de-serialized. These will not be used when
- de-serializing the actual flow, but when de-serializing a component reference.
-
- initialize_with_defaults : bool, optional (default=False)
- If this flag is set, the hyperparameter values of flows will be
- ignored and a flow with its defaults is returned.
-
- recursion_depth : int
- The depth at which this flow is called, mostly for debugging
- purposes
-
- strict_version : bool, default=True
- Whether to fail if version requirements are not fulfilled.
-
- Returns
- -------
- mixed
- """
- logger.info(
- "-{} flow_to_sklearn START o={}, components={}, init_defaults={}".format(
- "-" * recursion_depth, o, components, initialize_with_defaults
- ),
- )
- depth_pp = recursion_depth + 1 # shortcut var, depth plus plus
-
- # First, we need to check whether the presented object is a json string.
-        # JSON strings are used to encode parameter values. By passing around
- # json strings for parameters, we make sure that we can flow_to_sklearn
- # the parameter values to the correct type.
-
- if isinstance(o, str):
- with contextlib.suppress(JSONDecodeError):
- o = json.loads(o)
-
- if isinstance(o, dict):
- # Check if the dict encodes a 'special' object, which could not
-            # easily be converted into a string; instead, the information needed to
-            # re-create the object was stored in a dictionary.
- if "oml-python:serialized_object" in o:
- serialized_type = o["oml-python:serialized_object"]
- value = o["value"]
- if serialized_type == "type":
- rval = self._deserialize_type(value)
- elif serialized_type == "rv_frozen":
- rval = self._deserialize_rv_frozen(value)
- elif serialized_type == "function":
- rval = self._deserialize_function(value)
- elif serialized_type in (COMPOSITION_STEP_CONSTANT, COMPONENT_REFERENCE):
- if serialized_type == COMPOSITION_STEP_CONSTANT:
- pass
- elif serialized_type == COMPONENT_REFERENCE:
- value = self._deserialize_sklearn(
- value,
- recursion_depth=depth_pp,
- strict_version=strict_version,
- )
- else:
- raise NotImplementedError(serialized_type)
- assert components is not None # Necessary for mypy
- step_name = value["step_name"]
- key = value["key"]
- component = self._deserialize_sklearn(
- components[key],
- initialize_with_defaults=initialize_with_defaults,
- recursion_depth=depth_pp,
- strict_version=strict_version,
- )
- # The component is now added to where it should be used
- # later. It should not be passed to the constructor of the
- # main flow object.
- del components[key]
- if step_name is None:
- rval = component
- elif "argument_1" not in value:
- rval = (step_name, component)
- else:
- rval = (step_name, component, value["argument_1"])
- elif serialized_type == "cv_object":
- rval = self._deserialize_cross_validator(
- value,
- recursion_depth=recursion_depth,
- strict_version=strict_version,
- )
- else:
- raise ValueError(f"Cannot flow_to_sklearn {serialized_type}")
-
- else:
- rval = OrderedDict(
- (
- self._deserialize_sklearn(
- o=key,
- components=components,
- initialize_with_defaults=initialize_with_defaults,
- recursion_depth=depth_pp,
- strict_version=strict_version,
- ),
- self._deserialize_sklearn(
- o=value,
- components=components,
- initialize_with_defaults=initialize_with_defaults,
- recursion_depth=depth_pp,
- strict_version=strict_version,
- ),
- )
- for key, value in sorted(o.items())
- )
- elif isinstance(o, (list, tuple)):
- rval = [
- self._deserialize_sklearn(
- o=element,
- components=components,
- initialize_with_defaults=initialize_with_defaults,
- recursion_depth=depth_pp,
- strict_version=strict_version,
- )
- for element in o
- ]
- if isinstance(o, tuple):
- rval = tuple(rval)
- elif isinstance(o, (bool, int, float, str)) or o is None:
- rval = o
- elif isinstance(o, OpenMLFlow):
- if not self._is_sklearn_flow(o):
- raise ValueError("Only sklearn flows can be reinstantiated")
- rval = self._deserialize_model(
- flow=o,
- keep_defaults=initialize_with_defaults,
- recursion_depth=recursion_depth,
- strict_version=strict_version,
- )
- else:
- raise TypeError(o)
- logger.info(f"-{'-' * recursion_depth} flow_to_sklearn END o={o}, rval={rval}")
- return rval
-
- def model_to_flow(self, model: Any) -> OpenMLFlow:
- """Transform a scikit-learn model to a flow for uploading it to OpenML.
-
- Parameters
- ----------
- model : Any
-
- Returns
- -------
- OpenMLFlow
- """
-        # Necessary to make mypy not complain about all the different possible return types
- return self._serialize_sklearn(model)
-
- def _serialize_sklearn(self, o: Any, parent_model: Any | None = None) -> Any: # noqa: PLR0912, C901
- rval = None # type: Any
-
- # TODO: assert that only on first recursion lvl `parent_model` can be None
- if self.is_estimator(o):
- # is the main model or a submodel
- rval = self._serialize_model(o)
- elif (
- isinstance(o, (list, tuple))
- and len(o) == 2
- and o[1] in SKLEARN_PIPELINE_STRING_COMPONENTS
- and isinstance(parent_model, sklearn.pipeline._BaseComposition)
- ):
- rval = o
- elif isinstance(o, (list, tuple)):
- # TODO: explain what type of parameter is here
- rval = [self._serialize_sklearn(element, parent_model) for element in o]
- if isinstance(o, tuple):
- rval = tuple(rval)
- elif isinstance(o, SIMPLE_TYPES) or o is None:
- if isinstance(o, tuple(SIMPLE_NUMPY_TYPES)):
- o = o.item() # type: ignore
- # base parameter values
- rval = o
- elif isinstance(o, dict):
- # TODO: explain what type of parameter is here
- if not isinstance(o, OrderedDict):
- o = OrderedDict(sorted(o.items()))
-
- rval = OrderedDict()
- for key, value in o.items():
- if not isinstance(key, str):
- raise TypeError(
- "Can only use string as keys, you passed "
- f"type {type(key)} for value {key!s}.",
- )
- _key = self._serialize_sklearn(key, parent_model)
- rval[_key] = self._serialize_sklearn(value, parent_model)
- elif isinstance(o, type):
- # TODO: explain what type of parameter is here
- rval = self._serialize_type(o)
- elif isinstance(o, scipy.stats.distributions.rv_frozen):
- rval = self._serialize_rv_frozen(o)
- # This only works for user-defined functions (and not even partial).
- # I think this is exactly what we want here as there shouldn't be any
- # built-in or functool.partials in a pipeline
- elif inspect.isfunction(o):
- # TODO: explain what type of parameter is here
- rval = self._serialize_function(o)
- elif self._is_cross_validator(o):
- # TODO: explain what type of parameter is here
- rval = self._serialize_cross_validator(o)
- else:
- raise TypeError(o, type(o))
-
- return rval
-
- def get_version_information(self) -> list[str]:
- """List versions of libraries required by the flow.
-
- Libraries listed are ``Python``, ``scikit-learn``, ``numpy`` and ``scipy``.
-
- Returns
- -------
- List
- """
- # This can possibly be done by a package such as pyxb, but I could not get
- # it to work properly.
- import numpy
- import scipy
- import sklearn
-
- major, minor, micro, _, _ = sys.version_info
- python_version = f"Python_{'.'.join([str(major), str(minor), str(micro)])}."
- sklearn_version = f"Sklearn_{sklearn.__version__}."
- numpy_version = f"NumPy_{numpy.__version__}." # type: ignore
- scipy_version = f"SciPy_{scipy.__version__}."
-
- return [python_version, sklearn_version, numpy_version, scipy_version]
-
- def create_setup_string(self, model: Any) -> str: # noqa: ARG002
- """Create a string which can be used to reinstantiate the given model.
-
- Parameters
- ----------
- model : Any
-
- Returns
- -------
- str
- """
- return " ".join(self.get_version_information())
-
- def _is_cross_validator(self, o: Any) -> bool:
- return isinstance(o, sklearn.model_selection.BaseCrossValidator)
-
- @classmethod
- def _is_sklearn_flow(cls, flow: OpenMLFlow) -> bool:
- sklearn_dependency = isinstance(flow.dependencies, str) and "sklearn" in flow.dependencies
- sklearn_as_external = isinstance(flow.external_version, str) and (
- flow.external_version.startswith("sklearn==") or ",sklearn==" in flow.external_version
- )
- return sklearn_dependency or sklearn_as_external
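-
-    # For example (illustrative), a flow with
-    # external_version="openml==0.14.1,sklearn==1.3.2" is recognised as a sklearn
-    # flow, as is any flow whose dependencies string mentions "sklearn".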
-
- def _get_sklearn_description(self, model: Any, char_lim: int = 1024) -> str:
- r"""Fetches the sklearn function docstring for the flow description
-
- Retrieves the sklearn docstring available and does the following:
- * If length of docstring <= char_lim, then returns the complete docstring
- * Else, trims the docstring till it encounters a 'Read more in the :ref:'
- * Or till it encounters a 'Parameters\n----------\n'
- The final string returned is at most of length char_lim with leading and
- trailing whitespaces removed.
-
- Parameters
- ----------
- model : sklearn model
- char_lim : int
- Specifying the max length of the returned string.
- OpenML servers have a constraint of 1024 characters for the 'description' field.
-
- Returns
- -------
- str
- """
-
- def match_format(s):
- return f"{s}\n{len(s) * '-'}\n"
-
- s = inspect.getdoc(model)
- if s is None:
- return ""
- try:
- # trim till 'Read more'
- pattern = "Read more in the :ref:"
- index = s.index(pattern)
- s = s[:index]
- # trimming docstring to be within char_lim
- if len(s) > char_lim:
- s = f"{s[: char_lim - 3]}..."
- return s.strip()
- except ValueError:
- logger.warning(
- "'Read more' not found in descriptions. "
- "Trying to trim till 'Parameters' if available in docstring.",
- )
- try:
- # if 'Read more' doesn't exist, trim till 'Parameters'
- pattern = "Parameters"
- index = s.index(match_format(pattern))
- except ValueError:
- # returning full docstring
- logger.warning("'Parameters' not found in docstring. Omitting docstring trimming.")
- index = len(s)
- s = s[:index]
- # trimming docstring to be within char_lim
- if len(s) > char_lim:
- s = f"{s[: char_lim - 3]}..."
- return s.strip()
-
- def _extract_sklearn_parameter_docstring(self, model) -> None | str:
- """Extracts the part of sklearn docstring containing parameter information
-
- Fetches the entire docstring and trims just the Parameter section.
- The assumption is that 'Parameters' is the first section in sklearn docstrings,
- followed by other sections titled 'Attributes', 'See also', 'Note', 'References',
- appearing in that order if defined.
-        Returns None if no section with 'Parameters' can be found in the docstring.
-
- Parameters
- ----------
- model : sklearn model
-
- Returns
- -------
- str, or None
- """
-
- def match_format(s):
- return f"{s}\n{len(s) * '-'}\n"
-
- s = inspect.getdoc(model)
- if s is None:
- return None
- try:
- index1 = s.index(match_format("Parameters"))
- except ValueError as e:
- # when sklearn docstring has no 'Parameters' section
- logger.warning(f"{match_format('Parameters')} {e}")
- return None
-
- headings = ["Attributes", "Notes", "See also", "Note", "References"]
- for h in headings:
- try:
- # to find end of Parameters section
- index2 = s.index(match_format(h))
- break
- except ValueError:
- logger.warning(f"{h} not available in docstring")
- continue
- else:
-            # in case only a 'Parameters' section exists, trim till the end of the docstring
- index2 = len(s)
- s = s[index1:index2]
- return s.strip()
-
- def _extract_sklearn_param_info(self, model, char_lim=1024) -> None | dict:
- """Parses parameter type and description from sklearn dosctring
-
- Parameters
- ----------
- model : sklearn model
- char_lim : int
- Specifying the max length of the returned string.
- OpenML servers have a constraint of 1024 characters string fields.
-
- Returns
- -------
- Dict, or None
- """
- docstring = self._extract_sklearn_parameter_docstring(model)
- if docstring is None:
- # when sklearn docstring has no 'Parameters' section
- return None
-
- n = re.compile("[.]*\n", flags=IGNORECASE)
- lines = n.split(docstring)
- p = re.compile("[a-z0-9_ ]+ : [a-z0-9_']+[a-z0-9_ ]*", flags=IGNORECASE)
- # The above regular expression is designed to detect sklearn parameter names and type
- # in the format of [variable_name][space]:[space][type]
- # The expectation is that the parameter description for this detected parameter will
- # be all the lines in the docstring till the regex finds another parameter match
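-        # For example (illustrative), a docstring line such as
-        #     "n_estimators : int, default=100"
-        # yields the match "n_estimators : int"; the lines that follow, up to the
-        # next match, are collected as that parameter's description.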
-
- # collecting parameters and their descriptions
- description = [] # type: List
- for s in lines:
- param = p.findall(s)
- if param != []:
- # a parameter definition is found by regex
- # creating placeholder when parameter found which will be a list of strings
- # string descriptions will be appended in subsequent iterations
- # till another parameter is found and a new placeholder is created
- placeholder = [""] # type: List[str]
- description.append(placeholder)
- elif len(description) > 0: # description=[] means no parameters found yet
- # appending strings to the placeholder created when parameter found
- description[-1].append(s)
- for i in range(len(description)):
- # concatenating parameter description strings
- description[i] = "\n".join(description[i]).strip()
- # limiting all parameter descriptions to accepted OpenML string length
- if len(description[i]) > char_lim:
- description[i] = f"{description[i][: char_lim - 3]}..."
-
- # collecting parameters and their types
- parameter_docs = OrderedDict()
- matches = p.findall(docstring)
- for i, param in enumerate(matches):
- key, value = str(param).split(":")
- parameter_docs[key.strip()] = [value.strip(), description[i]]
-
- # to avoid KeyError for missing parameters
- param_list_true = list(model.get_params().keys())
- param_list_found = list(parameter_docs.keys())
- for param in list(set(param_list_true) - set(param_list_found)):
- parameter_docs[param] = [None, None]
-
- return parameter_docs
-
- def _serialize_model(self, model: Any) -> OpenMLFlow:
- """Create an OpenMLFlow.
-
- Calls `sklearn_to_flow` recursively to properly serialize the
- parameters to strings and the components (other models) to OpenMLFlows.
-
- Parameters
- ----------
- model : sklearn estimator
-
- Returns
- -------
- OpenMLFlow
-
- """
-        # Get all necessary information about the model object itself
- (
- parameters,
- parameters_meta_info,
- subcomponents,
- subcomponents_explicit,
- ) = self._extract_information_from_model(model)
-
- # Check that a component does not occur multiple times in a flow as this
- # is not supported by OpenML
- self._check_multiple_occurence_of_component_in_flow(model, subcomponents)
-
- # Create a flow name, which contains all components in brackets, e.g.:
- # RandomizedSearchCV(Pipeline(StandardScaler,AdaBoostClassifier(DecisionTreeClassifier)),
- # StandardScaler,AdaBoostClassifier(DecisionTreeClassifier))
- class_name = model.__module__ + "." + model.__class__.__name__
-
- # will be part of the name (in brackets)
- sub_components_names = ""
- for key in subcomponents:
- name_thing = subcomponents[key]
- if isinstance(name_thing, OpenMLFlow):
- name = name_thing.name
- elif (
- isinstance(name_thing, str)
- and subcomponents[key] in SKLEARN_PIPELINE_STRING_COMPONENTS
- ):
- name = name_thing
- else:
- raise TypeError(type(subcomponents[key]))
-
- if key in subcomponents_explicit:
- sub_components_names += "," + key + "=" + name
- else:
- sub_components_names += "," + name
-
- # slice operation on string in order to get rid of leading comma
- name = f"{class_name}({sub_components_names[1:]})" if sub_components_names else class_name
- short_name = SklearnExtension.trim_flow_name(name)
-
- # Get the external versions of all sub-components
- external_version = self._get_external_version_string(model, subcomponents)
- dependencies = self._get_dependencies()
- tags = self._get_tags()
-
- sklearn_description = self._get_sklearn_description(model)
- return OpenMLFlow(
- name=name,
- class_name=class_name,
- custom_name=short_name,
- description=sklearn_description,
- model=model,
- components=subcomponents,
- parameters=parameters,
- parameters_meta_info=parameters_meta_info,
- external_version=external_version,
- tags=tags,
- extension=self,
- language="English",
- dependencies=dependencies,
- )
-
- def _get_dependencies(self) -> str:
- return self._min_dependency_str(sklearn.__version__) # type: ignore
-
- def _get_tags(self) -> list[str]:
- sklearn_version = self._format_external_version("sklearn", sklearn.__version__) # type: ignore
- sklearn_version_formatted = sklearn_version.replace("==", "_")
- return [
- "openml-python",
- "sklearn",
- "scikit-learn",
- "python",
- sklearn_version_formatted,
- # TODO: add more tags based on the scikit-learn
- # module a flow is in? For example automatically
- # annotate a class of sklearn.svm.SVC() with the
- # tag svm?
- ]
-
- def _get_external_version_string(
- self,
- model: Any,
- sub_components: dict[str, OpenMLFlow],
- ) -> str:
- # Create external version string for a flow, given the model and the
- # already parsed dictionary of sub_components. Retrieves the external
- # version of all subcomponents, which themselves already contain all
- # requirements for their subcomponents. The external version string is a
- # sorted concatenation of all modules which are present in this run.
-
- external_versions = set()
-
- # The model is None if the flow is a placeholder flow such as 'passthrough' or 'drop'
- if model is not None:
- model_package_name = model.__module__.split(".")[0]
- module = importlib.import_module(model_package_name)
- model_package_version_number = module.__version__ # type: ignore
- external_version = self._format_external_version(
- model_package_name,
- model_package_version_number,
- )
- external_versions.add(external_version)
-
- openml_version = self._format_external_version("openml", openml.__version__)
- sklearn_version = self._format_external_version("sklearn", sklearn.__version__) # type: ignore
- external_versions.add(openml_version)
- external_versions.add(sklearn_version)
- for visitee in sub_components.values():
- if isinstance(visitee, str) and visitee in SKLEARN_PIPELINE_STRING_COMPONENTS:
- continue
- for external_version in visitee.external_version.split(","):
- external_versions.add(external_version)
- return ",".join(sorted(external_versions))
-
- def _check_multiple_occurence_of_component_in_flow(
- self,
- model: Any,
- sub_components: dict[str, OpenMLFlow],
- ) -> None:
- to_visit_stack: list[OpenMLFlow] = []
- to_visit_stack.extend(sub_components.values())
- known_sub_components: set[str] = set()
-
- while len(to_visit_stack) > 0:
- visitee = to_visit_stack.pop()
- if isinstance(visitee, str) and visitee in SKLEARN_PIPELINE_STRING_COMPONENTS:
- known_sub_components.add(visitee)
- elif visitee.name in known_sub_components:
- raise ValueError(
- f"Found a second occurence of component {visitee.name} when "
- f"trying to serialize {model}.",
- )
- else:
- known_sub_components.add(visitee.name)
- to_visit_stack.extend(visitee.components.values())
-
- def _extract_information_from_model( # noqa: PLR0915, C901, PLR0912
- self,
- model: Any,
- ) -> tuple[
- OrderedDict[str, str | None],
- OrderedDict[str, dict | None],
- OrderedDict[str, OpenMLFlow],
- set,
- ]:
- # This function contains four "global" states and is quite long and
-        # complicated. If it gets too complicated to ensure its correctness,
- # it would be best to make it a class with the four "global" states being
- # the class attributes and the if/elif/else in the for-loop calls to
- # separate class methods
-
- # stores all entities that should become subcomponents
- sub_components = OrderedDict() # type: OrderedDict[str, OpenMLFlow]
-        # stores the keys of all subcomponents that should be named explicitly in the flow name
- sub_components_explicit = set()
- parameters: OrderedDict[str, str | None] = OrderedDict()
- parameters_meta_info: OrderedDict[str, dict | None] = OrderedDict()
- parameters_docs = self._extract_sklearn_param_info(model)
-
- model_parameters = model.get_params(deep=False)
- for k, v in sorted(model_parameters.items(), key=lambda t: t[0]):
- rval = self._serialize_sklearn(v, model)
-
- def flatten_all(list_):
- """Flattens arbitrary depth lists of lists (e.g. [[1,2],[3,[1]]] -> [1,2,3,1])."""
- for el in list_:
- if isinstance(el, (list, tuple)) and len(el) > 0:
- yield from flatten_all(el)
- else:
- yield el
-
- # In case rval is a list of lists (or tuples), we need to identify two situations:
- # - sklearn pipeline steps, feature union or base classifiers in voting classifier.
- # They look like e.g. [("imputer", Imputer()), ("classifier", SVC())]
- # - a list of lists with simple types (e.g. int or str), such as for an OrdinalEncoder
- # where all possible values for each feature are described: [[0,1,2], [1,2,5]]
- is_non_empty_list_of_lists_with_same_type = (
- isinstance(rval, (list, tuple))
- and len(rval) > 0
- and isinstance(rval[0], (list, tuple))
- and all(isinstance(rval_i, type(rval[0])) for rval_i in rval)
- )
-
- # Check that all list elements are of simple types.
- nested_list_of_simple_types = (
- is_non_empty_list_of_lists_with_same_type
- and all(isinstance(el, SIMPLE_TYPES) for el in flatten_all(rval))
- and all(
- len(rv) in (2, 3) and rv[1] not in SKLEARN_PIPELINE_STRING_COMPONENTS
- for rv in rval
- )
- )
-
- if is_non_empty_list_of_lists_with_same_type and not nested_list_of_simple_types:
-                # If a list of lists is identified that includes 'non-simple' types (e.g. objects),
- # we assume they are steps in a pipeline, feature union, or base classifiers in
- # a voting classifier.
- parameter_value = [] # type: List
- reserved_keywords = set(model.get_params(deep=False).keys())
-
- for sub_component_tuple in rval:
- identifier = sub_component_tuple[0]
- sub_component = sub_component_tuple[1]
- sub_component_type = type(sub_component_tuple)
- if not 2 <= len(sub_component_tuple) <= 3:
- # length 2 is for {VotingClassifier.estimators,
- # Pipeline.steps, FeatureUnion.transformer_list}
- # length 3 is for ColumnTransformer
- raise ValueError(
- f"Length of tuple of type {sub_component_type}"
- " does not match assumptions"
- )
-
- if isinstance(sub_component, str):
- if sub_component not in SKLEARN_PIPELINE_STRING_COMPONENTS:
- msg = (
- "Second item of tuple does not match assumptions. "
- "If string, can be only 'drop' or 'passthrough' but"
- f"got {sub_component}"
- )
- raise ValueError(msg)
- elif sub_component is None:
- msg = (
- "Cannot serialize objects of None type. Please use a valid "
- "placeholder for None. Note that empty sklearn estimators can be "
- "replaced with 'drop' or 'passthrough'."
- )
- raise ValueError(msg)
- elif not isinstance(sub_component, OpenMLFlow):
- msg = (
- "Second item of tuple does not match assumptions. "
- f"Expected OpenMLFlow, got {type(sub_component)}"
- )
- raise TypeError(msg)
-
- if identifier in reserved_keywords:
- parent_model = f"{model.__module__}.{model.__class__.__name__}"
- msg = (
- "Found element shadowing official "
- f"parameter for {parent_model}: {identifier}"
- )
- raise PyOpenMLError(msg)
-
- # when deserializing the parameter
- sub_components_explicit.add(identifier)
- if isinstance(sub_component, str):
- external_version = self._get_external_version_string(None, {})
- dependencies = self._get_dependencies()
- tags = self._get_tags()
-
- sub_components[identifier] = OpenMLFlow(
- name=sub_component,
- description="Placeholder flow for scikit-learn's string pipeline "
- "members",
- components=OrderedDict(),
- parameters=OrderedDict(),
- parameters_meta_info=OrderedDict(),
- external_version=external_version,
- tags=tags,
- language="English",
- dependencies=dependencies,
- model=None,
- )
- component_reference: OrderedDict[str, str | dict] = OrderedDict()
- component_reference["oml-python:serialized_object"] = (
- COMPOSITION_STEP_CONSTANT
- )
- cr_value: dict[str, Any] = OrderedDict()
- cr_value["key"] = identifier
- cr_value["step_name"] = identifier
- if len(sub_component_tuple) == 3:
- cr_value["argument_1"] = sub_component_tuple[2]
- component_reference["value"] = cr_value
- else:
- sub_components[identifier] = sub_component
- component_reference = OrderedDict()
- component_reference["oml-python:serialized_object"] = COMPONENT_REFERENCE
- cr_value = OrderedDict()
- cr_value["key"] = identifier
- cr_value["step_name"] = identifier
- if len(sub_component_tuple) == 3:
- cr_value["argument_1"] = sub_component_tuple[2]
- component_reference["value"] = cr_value
- parameter_value.append(component_reference)
-
- # Here (and in the elif and else branch below) are the only
- # places where we encode a value as json to make sure that all
- # parameter values still have the same type after
- # deserialization
- if isinstance(rval, tuple):
- parameter_json = json.dumps(tuple(parameter_value))
- else:
- parameter_json = json.dumps(parameter_value)
- parameters[k] = parameter_json
-
- elif isinstance(rval, OpenMLFlow):
- # A subcomponent, for example the base model in
- # AdaBoostClassifier
- sub_components[k] = rval
- sub_components_explicit.add(k)
- component_reference = OrderedDict()
- component_reference["oml-python:serialized_object"] = COMPONENT_REFERENCE
- cr_value = OrderedDict()
- cr_value["key"] = k
- cr_value["step_name"] = None
- component_reference["value"] = cr_value
- cr = self._serialize_sklearn(component_reference, model)
- parameters[k] = json.dumps(cr)
-
- elif not (hasattr(rval, "__len__") and len(rval) == 0):
- rval = json.dumps(rval)
- parameters[k] = rval
- # a regular hyperparameter
- else:
- parameters[k] = None
-
- if parameters_docs is not None:
- data_type, description = parameters_docs[k]
- parameters_meta_info[k] = OrderedDict(
- (("description", description), ("data_type", data_type)),
- )
- else:
- parameters_meta_info[k] = OrderedDict((("description", None), ("data_type", None)))
-
- return parameters, parameters_meta_info, sub_components, sub_components_explicit
-
- def _get_fn_arguments_with_defaults(self, fn_name: Callable) -> tuple[dict, set]:
- """
- Returns
- -------
- i) a dict with all parameter names that have a default value, and
- ii) a set with all parameter names that do not have a default
-
- Parameters
- ----------
- fn_name : callable
- The function of which we want to obtain the defaults
-
- Returns
- -------
- params_with_defaults: dict
- a dict mapping parameter name to the default value
- params_without_defaults: set
- a set with all parameters that do not have a default value
- """
- # parameters with defaults are optional, all others are required.
- parameters = inspect.signature(fn_name).parameters
- required_params = set()
- optional_params = {}
- for param in parameters:
- parameter = parameters.get(param)
- default_val = parameter.default # type: ignore
- if default_val is inspect.Signature.empty:
- required_params.add(param)
- else:
- optional_params[param] = default_val
- return optional_params, required_params
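-
-    # For example (illustrative): for ``def f(a, b=2): ...`` this returns
-    # ``({"b": 2}, {"a"})``. Note that for an unbound ``__init__`` the ``self``
-    # parameter also ends up in the required set.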
-
- def _deserialize_model( # noqa: C901
- self,
- flow: OpenMLFlow,
- keep_defaults: bool, # noqa: FBT001
- recursion_depth: int,
- strict_version: bool = True, # noqa: FBT002, FBT001
- ) -> Any:
- logger.info(f"-{'-' * recursion_depth} deserialize {flow.name}")
- model_name = flow.class_name
- self._check_dependencies(flow.dependencies, strict_version=strict_version)
-
- parameters = flow.parameters
- components = flow.components
- parameter_dict: dict[str, Any] = OrderedDict()
-
- # Do a shallow copy of the components dictionary so we can remove the
-        # components from this copy once we have added them to the pipeline. This
-        # allows us to no longer consider them when looping over the
-        # components, while keeping the original components dictionary untouched.
- components_ = copy.copy(components)
-
- for name in parameters:
- value = parameters.get(name)
- logger.info(f"--{'-' * recursion_depth} flow_parameter={name}, value={value}")
- rval = self._deserialize_sklearn(
- value,
- components=components_,
- initialize_with_defaults=keep_defaults,
- recursion_depth=recursion_depth + 1,
- strict_version=strict_version,
- )
- parameter_dict[name] = rval
-
- for name in components:
- if name in parameter_dict:
- continue
- if name not in components_:
- continue
- value = components[name]
- logger.info(f"--{'-' * recursion_depth} flow_component={name}, value={value}")
- rval = self._deserialize_sklearn(
- value,
- recursion_depth=recursion_depth + 1,
- strict_version=strict_version,
- )
- parameter_dict[name] = rval
-
- if model_name is None and flow.name in SKLEARN_PIPELINE_STRING_COMPONENTS:
- return flow.name
-
- assert model_name is not None
- module_name = model_name.rsplit(".", 1)
- model_class = getattr(importlib.import_module(module_name[0]), module_name[1])
-
- if keep_defaults:
- # obtain all params with a default
- param_defaults, _ = self._get_fn_arguments_with_defaults(model_class.__init__)
-
- # delete the params that have a default from the dict,
- # so they get initialized with their default value
- # except [...]
- for param in param_defaults:
- # [...] the ones that also have a key in the components dict.
- # As OpenML stores different flows for ensembles with different
- # (base-)components, in OpenML terms, these are not considered
- # hyperparameters but rather constants (i.e., changing them would
- # result in a different flow)
- if param not in components:
- del parameter_dict[param]
-
- if not strict_version:
- # Ignore incompatible parameters
- allowed_parameter = list(inspect.signature(model_class.__init__).parameters)
- for p in list(parameter_dict.keys()):
- if p not in allowed_parameter:
- warnings.warn(
- f"While deserializing in a non-strict way, parameter {p} is not "
- f"allowed for {model_class.__name__} likely due to a version mismatch. "
- "We ignore the parameter.",
- UserWarning,
- stacklevel=2,
- )
- del parameter_dict[p]
-
- return model_class(**parameter_dict)
-
- def _check_dependencies(
- self,
- dependencies: str,
- strict_version: bool = True, # noqa: FBT001, FBT002
- ) -> None:
- if not dependencies:
- return
-
- dependencies_list = dependencies.split("\n")
- for dependency_string in dependencies_list:
- match = DEPENDENCIES_PATTERN.match(dependency_string)
- if not match:
- raise ValueError(f"Cannot parse dependency {dependency_string}")
-
- dependency_name = match.group("name")
- operation = match.group("operation")
- version = match.group("version")
-
- module = importlib.import_module(dependency_name)
- required_version = Version(version)
- installed_version = Version(module.__version__) # type: ignore
-
- if operation == "==":
- check = required_version == installed_version
- elif operation == ">":
- check = installed_version > required_version
- elif operation == ">=":
- check = (
- installed_version > required_version or installed_version == required_version
- )
- else:
- raise NotImplementedError(f"operation '{operation}' is not supported")
- message = (
- f"Trying to deserialize a model with dependency {dependency_string} not satisfied."
- )
- if not check:
- if strict_version:
- raise ValueError(message)
-
- warnings.warn(message, category=UserWarning, stacklevel=2)
-
- def _serialize_type(self, o: Any) -> OrderedDict[str, str]:
- mapping = {
- float: "float",
- np.float32: "np.float32",
- np.float64: "np.float64",
- int: "int",
- np.int32: "np.int32",
- np.int64: "np.int64",
- }
- if Version(np.__version__) < Version("1.24"):
- mapping[float] = "np.float"
- mapping[int] = "np.int"
-
- ret = OrderedDict() # type: 'OrderedDict[str, str]'
- ret["oml-python:serialized_object"] = "type"
- ret["value"] = mapping[o]
- return ret
-
- def _deserialize_type(self, o: str) -> Any:
- mapping = {
- "float": float,
- "np.float32": np.float32,
- "np.float64": np.float64,
- "int": int,
- "np.int32": np.int32,
- "np.int64": np.int64,
- }
-
- # TODO(eddiebergman): Might be able to remove this
- if Version(np.__version__) < Version("1.24"):
- mapping["np.float"] = np.float # type: ignore # noqa: NPY001
- mapping["np.int"] = np.int # type: ignore # noqa: NPY001
-
- return mapping[o]
-
- def _serialize_rv_frozen(self, o: Any) -> OrderedDict[str, str | dict]:
- args = o.args
- kwds = o.kwds
- a = o.a
- b = o.b
- dist = o.dist.__class__.__module__ + "." + o.dist.__class__.__name__
- ret: OrderedDict[str, str | dict] = OrderedDict()
- ret["oml-python:serialized_object"] = "rv_frozen"
- ret["value"] = OrderedDict(
- (("dist", dist), ("a", a), ("b", b), ("args", args), ("kwds", kwds)),
- )
- return ret
-
- def _deserialize_rv_frozen(self, o: OrderedDict[str, str]) -> Any:
- args = o["args"]
- kwds = o["kwds"]
- a = o["a"]
- b = o["b"]
- dist_name = o["dist"]
-
- module_name = dist_name.rsplit(".", 1)
- try:
- rv_class = getattr(importlib.import_module(module_name[0]), module_name[1])
- except AttributeError as e:
- _tb = traceback.format_exc()
- warnings.warn(
- f"Cannot create model {dist_name} for flow. Reason is from error {type(e)}:{e}"
- f"\nTraceback: {_tb}",
- RuntimeWarning,
- stacklevel=2,
- )
- return None
-
- dist = scipy.stats.distributions.rv_frozen(rv_class(), *args, **kwds) # type: ignore
- dist.a = a
- dist.b = b
-
- return dist
-
- def _serialize_function(self, o: Callable) -> OrderedDict[str, str]:
- name = o.__module__ + "." + o.__name__
- ret = OrderedDict() # type: 'OrderedDict[str, str]'
- ret["oml-python:serialized_object"] = "function"
- ret["value"] = name
- return ret
-
- def _deserialize_function(self, name: str) -> Callable:
- module_name = name.rsplit(".", 1)
- return getattr(importlib.import_module(module_name[0]), module_name[1])
-
- def _serialize_cross_validator(self, o: Any) -> OrderedDict[str, str | dict]:
- ret: OrderedDict[str, str | dict] = OrderedDict()
-
- parameters = OrderedDict() # type: 'OrderedDict[str, Any]'
-
- # XXX this is copied from sklearn.model_selection._split
- cls = o.__class__
- init = getattr(cls.__init__, "deprecated_original", cls.__init__)
- # Ignore varargs, kw and default values and pop self
- init_signature = inspect.signature(init) # type: ignore
- # Consider the constructor parameters excluding 'self'
- if init is object.__init__:
- args = [] # type: List
- else:
- args = sorted(
- [
- p.name
- for p in init_signature.parameters.values()
- if p.name != "self" and p.kind != p.VAR_KEYWORD
- ],
- )
-
- for key in args:
- # We need deprecation warnings to always be on in order to
- # catch deprecated param values.
- # This is set in utils/__init__.py but it gets overwritten
- # when running under python3 somehow.
- with warnings.catch_warnings(record=True) as w:
- warnings.simplefilter("always", DeprecationWarning)
- value = getattr(o, key, None)
- if w is not None and len(w) and w[0].category is DeprecationWarning:
- # if the parameter is deprecated, don't show it
- continue
-
- if not (isinstance(value, Sized) and len(value) == 0):
- value = json.dumps(value)
- parameters[key] = value
- else:
- parameters[key] = None
-
- ret["oml-python:serialized_object"] = "cv_object"
- name = o.__module__ + "." + o.__class__.__name__
- value = OrderedDict([("name", name), ("parameters", parameters)])
- ret["value"] = value
-
- return ret
-
- def _deserialize_cross_validator(
- self,
- value: OrderedDict[str, Any],
- recursion_depth: int,
- strict_version: bool = True, # noqa: FBT002, FBT001
- ) -> Any:
- model_name = value["name"]
- parameters = value["parameters"]
-
- module_name = model_name.rsplit(".", 1)
- model_class = getattr(importlib.import_module(module_name[0]), module_name[1])
- for parameter in parameters:
- parameters[parameter] = self._deserialize_sklearn(
- parameters[parameter],
- recursion_depth=recursion_depth + 1,
- strict_version=strict_version,
- )
- return model_class(**parameters)
-
- def _format_external_version(
- self,
- model_package_name: str,
- model_package_version_number: str,
- ) -> str:
- return f"{model_package_name}=={model_package_version_number}"
-
- @staticmethod
- def _get_parameter_values_recursive(
- param_grid: dict | list[dict],
- parameter_name: str,
- ) -> list[Any]:
- """
- Returns a list of values for a given hyperparameter, encountered
- recursively throughout the flow. (e.g., n_jobs can be defined
- for various flows)
-
- Parameters
- ----------
- param_grid: Union[Dict, List[Dict]]
-            A dict mapping hyperparameter names to values, or a list of
-            such dicts
-
- parameter_name: str
- The hyperparameter that needs to be inspected
-
- Returns
- -------
- List
- A list of all values of hyperparameters with this name
- """
- if isinstance(param_grid, dict):
- return [
- value
- for param, value in param_grid.items()
- if param.split("__")[-1] == parameter_name
- ]
-
- if isinstance(param_grid, list):
- result = []
- for sub_grid in param_grid:
- result.extend(
- SklearnExtension._get_parameter_values_recursive(sub_grid, parameter_name),
- )
- return result
-
- raise ValueError("Param_grid should either be a dict or list of dicts")
-
- def _prevent_optimize_n_jobs(self, model):
- """
- Ensures that HPO classes will not optimize the n_jobs hyperparameter
-
- Parameters
- ----------
- model:
- The model that will be fitted
- """
- if self._is_hpo_class(model):
- if isinstance(model, sklearn.model_selection.GridSearchCV):
- param_distributions = model.param_grid
- elif isinstance(model, sklearn.model_selection.RandomizedSearchCV):
- param_distributions = model.param_distributions
- else:
- if hasattr(model, "param_distributions"):
- param_distributions = model.param_distributions
- else:
- raise AttributeError(
- "Using subclass BaseSearchCV other than "
- "{GridSearchCV, RandomizedSearchCV}. "
- "Could not find attribute "
- "param_distributions.",
- )
- logger.warning(
- "Warning! Using subclass BaseSearchCV other than "
- "{GridSearchCV, RandomizedSearchCV}. "
- "Should implement param check. ",
- )
- n_jobs_vals = SklearnExtension._get_parameter_values_recursive(
- param_distributions,
- "n_jobs",
- )
- if len(n_jobs_vals) > 0:
- raise PyOpenMLError(
- "openml-python should not be used to optimize the n_jobs parameter.",
- )
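-
-        # For example (illustrative), GridSearchCV(clf, {"n_jobs": [1, 2]}) would be
-        # rejected here: varying n_jobs changes resource usage, not model behaviour,
-        # so it should not be part of an optimization trace.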
-
- ################################################################################################
- # Methods for performing runs with extension modules
-
- def is_estimator(self, model: Any) -> bool:
- """Check whether the given model is a scikit-learn estimator.
-
- This function is only required for backwards compatibility and will be removed in the
- near future.
-
- Parameters
- ----------
- model : Any
-
- Returns
- -------
- bool
- """
- o = model
- return hasattr(o, "fit") and hasattr(o, "get_params") and hasattr(o, "set_params")
-
- def seed_model(self, model: Any, seed: int | None = None) -> Any: # noqa: C901
- """Set the random state of all the unseeded components of a model and return the seeded
- model.
-
- Required so that all seed information can be uploaded to OpenML for reproducible results.
-
- Models that are already seeded will maintain the seed. In this case,
- only integer seeds are allowed (An exception is raised when a RandomState was used as
- seed).
-
- Parameters
- ----------
- model : sklearn model
- The model to be seeded
- seed : int
- The seed to initialize the RandomState with. Unseeded subcomponents
- will be seeded with a random number from the RandomState.
-
- Returns
- -------
- Any
- """
-
- def _seed_current_object(current_value):
- if isinstance(current_value, int): # acceptable behaviour
- return False
-
- if isinstance(current_value, np.random.RandomState):
- raise ValueError(
- "Models initialized with a RandomState object are not "
- "supported. Please seed with an integer. ",
- )
-
- if current_value is not None:
- raise ValueError(
- "Models should be seeded with int or None (this should never happen). ",
- )
-
- return True
-
- rs = np.random.RandomState(seed)
- model_params = model.get_params()
- random_states = {}
- for param_name in sorted(model_params):
- if "random_state" in param_name:
- current_value = model_params[param_name]
- # important to draw the value at this point (and not in the if
-                # statement); this way we guarantee that even if a different set of
-                # subflows is seeded, the same numbers from the random generator are
-                # used
- new_value = rs.randint(0, 2**16)
- if _seed_current_object(current_value):
- random_states[param_name] = new_value
-
- # Also seed CV objects!
- elif isinstance(model_params[param_name], sklearn.model_selection.BaseCrossValidator):
- if not hasattr(model_params[param_name], "random_state"):
- continue
-
- current_value = model_params[param_name].random_state
- new_value = rs.randint(0, 2**16)
- if _seed_current_object(current_value):
- model_params[param_name].random_state = new_value
-
- model.set_params(**random_states)
- return model
-
- def check_if_model_fitted(self, model: Any) -> bool:
- """Returns True/False denoting if the model has already been fitted/trained
-
- Parameters
- ----------
- model : Any
-
- Returns
- -------
- bool
- """
- from sklearn.exceptions import NotFittedError
- from sklearn.utils.validation import check_is_fitted
-
- try:
- # check if model is fitted
- check_is_fitted(model)
-
- # Creating random dummy data of arbitrary size
- dummy_data = np.random.uniform(size=(10, 3)) # noqa: NPY002
- # Using 'predict' instead of 'sklearn.utils.validation.check_is_fitted' for a more
- # robust check that works across sklearn versions and models. Internally, 'predict'
- # should call 'check_is_fitted' for every concerned attribute, thus offering a more
- # assured check than explicit calls to 'check_is_fitted'
- model.predict(dummy_data)
- # Will reach here if the model was fit on a dataset with 3 features
- return True
- except NotFittedError: # needs to be the first exception to be caught
- # Model is not fitted, as is required
- return False
- except ValueError:
- # Will reach here if the model was fit on a dataset with more or less than 3 features
- return True
-
- def _run_model_on_fold( # noqa: PLR0915, PLR0913, C901, PLR0912
- self,
- model: Any,
- task: OpenMLTask,
- X_train: np.ndarray | scipy.sparse.spmatrix | pd.DataFrame,
- rep_no: int,
- fold_no: int,
- y_train: np.ndarray | None = None,
- X_test: np.ndarray | scipy.sparse.spmatrix | pd.DataFrame | None = None,
- ) -> tuple[
- np.ndarray,
- pd.DataFrame | None,
- OrderedDict[str, float],
- OpenMLRunTrace | None,
- ]:
- """Run a model on a repeat,fold,subsample triplet of the task and return prediction
- information.
-
-        Furthermore, it will measure runtimes where multi-core behaviour allows this:
-        * exact user CPU time will be measured if the number of cores is set (recursively
-        throughout the model) to exactly 1
-        * wall clock time will be measured if the number of cores is set (recursively throughout
-        the model) to any fixed number (but not when it is set to -1)
-
- Returns the data that is necessary to construct the OpenML Run object. Is used by
- run_task_get_arff_content. Do not use this function unless you know what you are doing.
-
- Parameters
- ----------
- model : Any
- The UNTRAINED model to run. The model instance will be copied and not altered.
- task : OpenMLTask
- The task to run the model on.
- X_train : array-like
- Training data for the given repetition and fold.
-        rep_no : int
-            The repeat of the experiment (0-based; in case of one-time CV, always 0)
-        fold_no : int
-            The fold number of the experiment (0-based; in case of holdout, always 0)
- y_train : Optional[np.ndarray] (default=None)
- Target attributes for supervised tasks. In case of classification, these are integer
- indices to the potential classes specified by dataset.
- X_test : Optional, array-like (default=None)
- Test attributes to test for generalization in supervised tasks.
-
- Returns
- -------
- pred_y : np.ndarray
- Predictions on the training/test set, depending on the task type.
- For supervised tasks, predictions are on the test set.
- For unsupervised tasks, predictions are on the training set.
- proba_y : pd.DataFrame, optional
- Predicted probabilities for the test set.
- None, if task is not Classification or Learning Curve prediction.
- user_defined_measures : OrderedDict[str, float]
- User defined measures that were generated on this fold
- trace : OpenMLRunTrace, optional
- arff trace object from a fitted model and the trace content obtained by
- repeatedly calling ``run_model_on_task``
- """
-
- def _prediction_to_probabilities(
- y: np.ndarray | list,
- model_classes: list[Any],
- class_labels: list[str] | None,
- ) -> pd.DataFrame:
- """Transforms predicted probabilities to match with OpenML class indices.
-
- Parameters
- ----------
-            y : np.ndarray
-                Predictions (class indices or labels) to be converted into one-hot
-                probabilities. Classes absent from the training data cannot occur here.
-            model_classes : list
-                List of classes known to / predicted by the model, ordered by their index.
-            class_labels : list
-                List of classes as stored in the task object fetched from the server.
-
- Returns
- -------
- pd.DataFrame
- """
- if class_labels is None:
- raise ValueError("The task has no class labels")
-
- if isinstance(y_train, np.ndarray) and isinstance(class_labels[0], str):
- # mapping (decoding) the predictions to the categories
- # creating a separate copy to not change the expected pred_y type
- y = [class_labels[pred] for pred in y] # list or numpy array of predictions
-
- # model_classes: sklearn classifier mapping from original array id to
- # prediction index id
- if not isinstance(model_classes, list):
-                raise ValueError("Please convert model classes to a list prior to calling this function")
-
- # DataFrame allows more accurate mapping of classes as column names
- result = pd.DataFrame(
- 0,
- index=np.arange(len(y)),
- columns=model_classes,
- dtype=np.float32,
- )
- for obs, prediction in enumerate(y):
- result.loc[obs, prediction] = 1.0
- return result
-
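-        # Worked example for the helper above (illustrative): with
-        # model_classes = ["a", "b", "c"] and y = ["b", "a"], the returned frame is
-        #
-        #        a    b    c
-        #   0  0.0  1.0  0.0
-        #   1  1.0  0.0  0.0
-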
- if isinstance(task, OpenMLSupervisedTask):
- if y_train is None:
- raise TypeError("argument y_train must not be of type None")
- if X_test is None:
- raise TypeError("argument X_test must not be of type None")
-
- model_copy = sklearn.base.clone(model, safe=True)
- # sanity check: prohibit users from optimizing n_jobs
- self._prevent_optimize_n_jobs(model_copy)
- # measures and stores runtimes
- user_defined_measures = OrderedDict() # type: 'OrderedDict[str, float]'
- try:
- # for measuring runtime. Only available since Python 3.3
- modelfit_start_cputime = time.process_time()
- modelfit_start_walltime = time.time()
-
- if isinstance(task, OpenMLSupervisedTask):
- model_copy.fit(X_train, y_train) # type: ignore
- elif isinstance(task, OpenMLClusteringTask):
- model_copy.fit(X_train) # type: ignore
-
- modelfit_dur_cputime = (time.process_time() - modelfit_start_cputime) * 1000
- modelfit_dur_walltime = (time.time() - modelfit_start_walltime) * 1000
-
- user_defined_measures["usercpu_time_millis_training"] = modelfit_dur_cputime
- refit_time = model_copy.refit_time_ * 1000 if hasattr(model_copy, "refit_time_") else 0 # type: ignore
- user_defined_measures["wall_clock_time_millis_training"] = modelfit_dur_walltime
-
- except AttributeError as e:
- # typically happens when training a regressor on classification task
- raise PyOpenMLError(str(e)) from e
-
- if isinstance(task, (OpenMLClassificationTask, OpenMLLearningCurveTask)):
-            # search for the model's classes_ (might differ depending on model type);
-            # pipelines are a special case: they don't have a classes_ attribute
-            # themselves, but borrow it from their last step. We do this manually
-            # because of the BaseSearch check below.
- if isinstance(model_copy, sklearn.pipeline.Pipeline):
- used_estimator = model_copy.steps[-1][-1]
- else:
- used_estimator = model_copy
-
- if self._is_hpo_class(used_estimator):
- model_classes = used_estimator.best_estimator_.classes_
- else:
- model_classes = used_estimator.classes_
-
- if not isinstance(model_classes, list):
- model_classes = model_classes.tolist()
-
-            # handle the case where the dataset is numpy with integer-encoded categories,
-            # while the class labels stored in the task are still category strings
- if isinstance(y_train, np.ndarray) and isinstance(
- cast("List", task.class_labels)[0],
- str,
- ):
- model_classes = [cast("List[str]", task.class_labels)[i] for i in model_classes]
-
- modelpredict_start_cputime = time.process_time()
- modelpredict_start_walltime = time.time()
-
- # In supervised learning this returns the predictions for Y, in clustering
- # it returns the clusters
- if isinstance(task, OpenMLSupervisedTask):
- pred_y = model_copy.predict(X_test)
- elif isinstance(task, OpenMLClusteringTask):
- pred_y = model_copy.predict(X_train)
- else:
- raise ValueError(task)
-
- modelpredict_duration_cputime = (time.process_time() - modelpredict_start_cputime) * 1000
- user_defined_measures["usercpu_time_millis_testing"] = modelpredict_duration_cputime
- user_defined_measures["usercpu_time_millis"] = (
- modelfit_dur_cputime + modelpredict_duration_cputime
- )
- modelpredict_duration_walltime = (time.time() - modelpredict_start_walltime) * 1000
- user_defined_measures["wall_clock_time_millis_testing"] = modelpredict_duration_walltime
- user_defined_measures["wall_clock_time_millis"] = (
- modelfit_dur_walltime + modelpredict_duration_walltime + refit_time
- )
-
- if isinstance(task, (OpenMLClassificationTask, OpenMLLearningCurveTask)):
- try:
- proba_y = model_copy.predict_proba(X_test)
- proba_y = pd.DataFrame(proba_y, columns=model_classes) # handles X_test as numpy
- except AttributeError: # predict_proba is not available when probability=False
- proba_y = _prediction_to_probabilities(pred_y, model_classes, task.class_labels)
-
- if task.class_labels is not None:
- if proba_y.shape[1] != len(task.class_labels):
- # Remap the probabilities in case there was a class missing
- # at training time. By default, the classification targets
- # are mapped to be zero-based indices to the actual classes.
- # Therefore, the model_classes contain the correct indices to
- # the correct probability array. Example:
- # classes in the dataset: 0, 1, 2, 3, 4, 5
- # classes in the training set: 0, 1, 2, 4, 5
-                        # then we need to add a column full of zeros into the probabilities
-                        # for class 3, because the rest of the library expects that the
-                        # probabilities are ordered the same way as the classes are ordered.
- message = (
- f"Estimator only predicted for {proba_y.shape[1]}/{len(task.class_labels)}"
- " classes!"
- )
- warnings.warn(message, stacklevel=2)
- openml.config.logger.warning(message)
-
- for _i, col in enumerate(task.class_labels):
- # adding missing columns with 0 probability
- if col not in model_classes:
- proba_y[col] = 0
- # We re-order the columns to move possibly added missing columns into place.
- proba_y = proba_y[task.class_labels]
- else:
- raise ValueError("The task has no class labels")
-
-            if set(proba_y.columns) != set(task.class_labels):
-                missing_cols = list(set(task.class_labels) - set(proba_y.columns))
-                raise ValueError(
-                    f"Predicted probabilities missing for the columns: {missing_cols}",
-                )
-
- elif isinstance(task, (OpenMLRegressionTask, OpenMLClusteringTask)):
- proba_y = None
- else:
- raise TypeError(type(task))
-
- if self._is_hpo_class(model_copy):
- trace_data = self._extract_trace_data(model_copy, rep_no, fold_no)
- trace: OpenMLRunTrace | None = self._obtain_arff_trace(
- model_copy,
- trace_data,
- )
- else:
- trace = None
-
- return pred_y, proba_y, user_defined_measures, trace
-
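-    # Sketch of how the internal caller consumes the return value (illustrative;
-    # this function is used by run_task_get_arff_content):
-    #
-    #   pred_y, proba_y, measures, trace = ext._run_model_on_fold(
-    #       model=clf, task=task, X_train=X_tr, rep_no=0, fold_no=0,
-    #       y_train=y_tr, X_test=X_te,
-    #   )
-    #   measures["usercpu_time_millis"]  # train + test CPU time for this fold
-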
- def obtain_parameter_values( # noqa: C901, PLR0915
- self,
- flow: OpenMLFlow,
- model: Any = None,
- ) -> list[dict[str, Any]]:
- """Extracts all parameter settings required for the flow from the model.
-
- If no explicit model is provided, the parameters will be extracted from `flow.model`
- instead.
-
- Parameters
- ----------
- flow : OpenMLFlow
- OpenMLFlow object (containing flow ids, i.e., it has to be downloaded from the server)
-
- model: Any, optional (default=None)
- The model from which to obtain the parameter values. Must match the flow signature.
- If None, use the model specified in ``OpenMLFlow.model``.
-
- Returns
- -------
- list
- A list of dicts, where each dict has the following entries:
- - ``oml:name`` : str: The OpenML parameter name
- - ``oml:value`` : mixed: A representation of the parameter value
- - ``oml:component`` : int: flow id to which the parameter belongs
- """
- openml.flows.functions._check_flow_for_server_id(flow)
-
- def get_flow_dict(_flow):
- flow_map = {_flow.name: _flow.flow_id}
- for subflow in _flow.components:
- flow_map.update(get_flow_dict(_flow.components[subflow]))
- return flow_map
-
- def extract_parameters( # noqa: PLR0915, PLR0912, C901
- _flow,
- _flow_dict,
- component_model,
- _main_call=False, # noqa: FBT002
- main_id=None,
- ):
- def is_subcomponent_specification(values):
- # checks whether the current value can be a specification of
- # subcomponents, as for example the value for steps parameter
- # (in Pipeline) or transformers parameter (in
- # ColumnTransformer).
- return (
- # Specification requires list/tuple of list/tuple with
- # at least length 2.
- isinstance(values, (tuple, list))
- and all(isinstance(item, (tuple, list)) and len(item) > 1 for item in values)
- # And each component needs to be a flow or interpretable string
- and all(
- isinstance(item[1], openml.flows.OpenMLFlow)
- or (
- isinstance(item[1], str)
- and item[1] in SKLEARN_PIPELINE_STRING_COMPONENTS
- )
- for item in values
- )
- )
-
-            # _flow is an OpenML flow object; _flow_dict maps from flow name to flow
-            # id. For the main call, the flow id can be overridden (useful for
-            # unit tests / sentinels); this way, for flows without subflows we do
-            # not have to rely on _flow_dict
- exp_parameters = set(_flow.parameters)
- if (
- isinstance(component_model, str)
- and component_model in SKLEARN_PIPELINE_STRING_COMPONENTS
- ):
- model_parameters = set()
- else:
- model_parameters = set(component_model.get_params(deep=False))
- if len(exp_parameters.symmetric_difference(model_parameters)) != 0:
- flow_params = sorted(exp_parameters)
- model_params = sorted(model_parameters)
- raise ValueError(
- "Parameters of the model do not match the "
- "parameters expected by the "
- "flow:\nexpected flow parameters: "
- f"{flow_params}\nmodel parameters: {model_params}",
- )
- exp_components = set(_flow.components)
- if (
- isinstance(component_model, str)
- and component_model in SKLEARN_PIPELINE_STRING_COMPONENTS
- ):
- model_components = set()
- else:
- _ = set(component_model.get_params(deep=False))
- model_components = {
- mp
- for mp in component_model.get_params(deep=True)
- if "__" not in mp and mp not in _
- }
- if len(exp_components.symmetric_difference(model_components)) != 0:
- is_problem = True
-                    if len(exp_components - model_components) > 0:
-                        # If an expected component is not returned as a component by get_params(),
-                        # this means that it is also a parameter -> we need to check that this is
-                        # actually the case
-                        difference = exp_components - model_components
-                        is_problem = not all(
-                            component in model_parameters for component in difference
-                        )
- if is_problem:
- flow_components = sorted(exp_components)
- model_components = sorted(model_components)
- raise ValueError(
- "Subcomponents of the model do not match the "
- "parameters expected by the "
- "flow:\nexpected flow subcomponents: "
- f"{flow_components}\nmodel subcomponents: {model_components}",
- )
-
- _params = []
- for _param_name in _flow.parameters:
- _current = OrderedDict()
- _current["oml:name"] = _param_name
-
- current_param_values = self.model_to_flow(component_model.get_params()[_param_name])
-
- # Try to filter out components (a.k.a. subflows) which are
- # handled further down in the code (by recursively calling
- # this function)!
- if isinstance(current_param_values, openml.flows.OpenMLFlow):
- continue
-
- if is_subcomponent_specification(current_param_values):
- # complex parameter value, with subcomponents
- parsed_values = []
- for subcomponent in current_param_values:
-                        # scikit-learn usually stores tuples in the form
-                        # (name (str), subcomponent (mixed), argument
-                        # (mixed)). OpenML replaces the subcomponent by an
-                        # OpenMLFlow object.
-                        if len(subcomponent) < 2 or len(subcomponent) > 3:
-                            raise ValueError("Component reference should have length 2 or 3.")
-
- subcomponent_identifier = subcomponent[0]
- subcomponent_flow = subcomponent[1]
- if not isinstance(subcomponent_identifier, str):
- raise TypeError(
- "Subcomponent identifier should be of type string, "
- f"but is {type(subcomponent_identifier)}",
- )
-                        if not (
-                            isinstance(subcomponent_flow, openml.flows.OpenMLFlow)
-                            or (
-                                isinstance(subcomponent_flow, str)
-                                and subcomponent_flow in SKLEARN_PIPELINE_STRING_COMPONENTS
-                            )
-                        ):
-                            raise TypeError(
-                                "Subcomponent flow should be of type flow, but is"
-                                f" {type(subcomponent_flow)}",
-                            )
-
- current = {
- "oml-python:serialized_object": COMPONENT_REFERENCE,
- "value": {
- "key": subcomponent_identifier,
- "step_name": subcomponent_identifier,
- },
- }
- if len(subcomponent) == 3:
- if not isinstance(subcomponent[2], list) and not isinstance(
- subcomponent[2],
- OrderedDict,
- ):
- raise TypeError(
- "Subcomponent argument should be list or OrderedDict",
- )
- current["value"]["argument_1"] = subcomponent[2]
- parsed_values.append(current)
- parsed_values = json.dumps(parsed_values)
- else:
- # vanilla parameter value
- parsed_values = json.dumps(current_param_values)
-
- _current["oml:value"] = parsed_values
- if _main_call:
- _current["oml:component"] = main_id
- else:
- _current["oml:component"] = _flow_dict[_flow.name]
- _params.append(_current)
-
- for _identifier in _flow.components:
- subcomponent_model = component_model.get_params()[_identifier]
- _params.extend(
- extract_parameters(
- _flow.components[_identifier],
- _flow_dict,
- subcomponent_model,
- ),
- )
- return _params
-
- flow_dict = get_flow_dict(flow)
- model = model if model is not None else flow.model
- return extract_parameters(flow, flow_dict, model, _main_call=True, main_id=flow.flow_id)
-
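-    # Illustrative usage (flow_id is an assumption for this sketch): each returned
-    # dict pins one parameter value to the id of the (sub)flow it belongs to.
-    #
-    #   flow = openml.flows.get_flow(flow_id, reinstantiate=True)
-    #   params = extension.obtain_parameter_values(flow)
-    #   # e.g. {"oml:name": "n_estimators", "oml:value": "100", "oml:component": 12345}
-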
- def _openml_param_name_to_sklearn(
- self,
- openml_parameter: openml.setups.OpenMLParameter,
- flow: OpenMLFlow,
- ) -> str:
- """
-        Converts the name of an OpenMLParameter into the sklearn name, given a flow.
-
- Parameters
- ----------
- openml_parameter: OpenMLParameter
- The parameter under consideration
-
- flow: OpenMLFlow
- The flow that provides context.
-
- Returns
- -------
- sklearn_parameter_name: str
- The name the parameter will have once used in scikit-learn
- """
- if not isinstance(openml_parameter, openml.setups.OpenMLParameter):
- raise ValueError("openml_parameter should be an instance of OpenMLParameter")
- if not isinstance(flow, OpenMLFlow):
- raise ValueError("flow should be an instance of OpenMLFlow")
-
- flow_structure = flow.get_structure("name")
- if openml_parameter.flow_name not in flow_structure:
- raise ValueError("Obtained OpenMLParameter and OpenMLFlow do not correspond. ")
- name = openml_parameter.flow_name # for PEP8
- return "__".join(flow_structure[name] + [openml_parameter.parameter_name])
-
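-    # Example of the mapping above (illustrative): if the flow structure locates the
-    # parameter's subflow at ["estimator"], a parameter named "max_depth" maps to the
-    # scikit-learn name "estimator__max_depth", as accepted by ``set_params``.
-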
- ################################################################################################
- # Methods for hyperparameter optimization
-
- def _is_hpo_class(self, model: Any) -> bool:
- """Check whether the model performs hyperparameter optimization.
-
- Used to check whether an optimization trace can be extracted from the model after
- running it.
-
- Parameters
- ----------
- model : Any
-
- Returns
- -------
- bool
- """
- return isinstance(model, sklearn.model_selection._search.BaseSearchCV)
-
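-    # For example (illustrative): GridSearchCV and RandomizedSearchCV both subclass
-    # BaseSearchCV, so they are treated as HPO models here, while a plain estimator
-    # such as DecisionTreeClassifier() is not.
-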
- def instantiate_model_from_hpo_class(
- self,
- model: Any,
- trace_iteration: OpenMLTraceIteration,
- ) -> Any:
- """Instantiate a ``base_estimator`` which can be searched over by the hyperparameter
- optimization model.
-
- Parameters
- ----------
- model : Any
- A hyperparameter optimization model which defines the model to be instantiated.
- trace_iteration : OpenMLTraceIteration
- Describing the hyperparameter settings to instantiate.
-
- Returns
- -------
- Any
- """
- if not self._is_hpo_class(model):
- raise AssertionError(
- f"Flow model {model} is not an instance of"
- " sklearn.model_selection._search.BaseSearchCV",
- )
- base_estimator = model.estimator
- base_estimator.set_params(**trace_iteration.get_parameters())
- return base_estimator
-
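-    # Illustrative usage (run_id, iteration and search_cv are assumptions for this
-    # sketch):
-    #
-    #   trace = openml.runs.get_run_trace(run_id)
-    #   # pick the trace iteration marked as selected for repeat 0 / fold 0, then:
-    #   model = extension.instantiate_model_from_hpo_class(search_cv, iteration)
-    #   # 'model' is the base estimator with the selected hyperparameters applied
-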
- def _extract_trace_data(self, model, rep_no, fold_no):
- """Extracts data from a machine learning model's cross-validation results
- and creates an ARFF (Attribute-Relation File Format) trace.
-
- Parameters
- ----------
- model : Any
- A fitted hyperparameter optimization model.
- rep_no : int
- The repetition number.
- fold_no : int
- The fold number.
-
-        Returns
-        -------
-        list[list]
-            A list of ARFF trace content rows.
- """
- arff_tracecontent = []
- for itt_no in range(len(model.cv_results_["mean_test_score"])):
- # we use the string values for True and False, as it is defined in
- # this way by the OpenML server
- selected = "false"
- if itt_no == model.best_index_:
- selected = "true"
- test_score = model.cv_results_["mean_test_score"][itt_no]
- arff_line = [rep_no, fold_no, itt_no, test_score, selected]
- for key in model.cv_results_:
- if key.startswith("param_"):
- value = model.cv_results_[key][itt_no]
- # Built-in serializer does not convert all numpy types,
- # these methods convert them to built-in types instead.
- if isinstance(value, np.generic):
- # For scalars it actually returns scalars, not a list
- value = value.tolist()
- serialized_value = json.dumps(value) if value is not np.ma.masked else np.nan
- arff_line.append(serialized_value)
- arff_tracecontent.append(arff_line)
- return arff_tracecontent
-
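-    # Shape of one extracted row (illustrative), e.g. for a grid over ``max_depth``:
-    #
-    #   [0, 0, 5, 0.93, "true", "3"]
-    #   # repeat, fold, iteration, mean_test_score, selected, json-dumped param value
-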
- def _obtain_arff_trace(
- self,
- model: Any,
- trace_content: list,
- ) -> OpenMLRunTrace:
- """Create arff trace object from a fitted model and the trace content obtained by
- repeatedly calling ``run_model_on_task``.
-
- Parameters
- ----------
- model : Any
- A fitted hyperparameter optimization model.
-
- trace_content : List[List]
- Trace content obtained by ``openml.runs.run_flow_on_task``.
-
- Returns
- -------
- OpenMLRunTrace
- """
- if not self._is_hpo_class(model):
- raise AssertionError(
- f"Flow model {model} is not an instance of "
- "sklearn.model_selection._search.BaseSearchCV",
- )
- if not hasattr(model, "cv_results_"):
- raise ValueError("model should contain `cv_results_`")
-
- # attributes that will be in trace arff, regardless of the model
- trace_attributes = [
- ("repeat", "NUMERIC"),
- ("fold", "NUMERIC"),
- ("iteration", "NUMERIC"),
- ("evaluation", "NUMERIC"),
- ("selected", ["true", "false"]),
- ]
-
- # model dependent attributes for trace arff
- for key in model.cv_results_:
- if key.startswith("param_"):
-                # basic types supported for serialization, including bool,
-                # int, float and str
- supported_basic_types = (bool, int, float, str)
- for param_value in model.cv_results_[key]:
- if isinstance(param_value, np.generic):
- param_value = param_value.tolist() # noqa: PLW2901
- if (
- isinstance(param_value, supported_basic_types)
- or param_value is None
- or param_value is np.ma.masked
- ):
- # basic string values
- type = "STRING" # noqa: A001
- elif isinstance(param_value, (list, tuple)) and all(
- isinstance(i, int) for i in param_value
- ):
- # list of integers (usually for selecting features)
- # hyperparameter layer_sizes of MLPClassifier
- type = "STRING" # noqa: A001
- else:
- raise TypeError(f"Unsupported param type in param grid: {key}")
-
- # renamed the attribute param to parameter, as this is a required
- # OpenML convention - this also guards against name collisions
- # with the required trace attributes
- attribute = (PREFIX + key[6:], type) # type: ignore
- trace_attributes.append(attribute)
-
- return OpenMLRunTrace.generate(
- trace_attributes,
- trace_content,
- )
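-
-    # The resulting trace header (illustrative) for a search over ``max_depth``:
-    # repeat, fold, iteration, evaluation, selected, plus one STRING attribute
-    # named PREFIX + "max_depth" holding the json-serialized candidate values.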
From 9d93485b78df7c9342349bfe06ba2bc20ce7f91f Mon Sep 17 00:00:00 2001
From: PGijsbers
Date: Thu, 19 Jun 2025 12:27:21 +0200
Subject: [PATCH 2/8] Remove scikit-learn extension submodule
It will now be hosted in a separate repository
---
.../test_sklearn_extension/__init__.py | 0
.../test_sklearn_extension.py | 2422 -----------------
2 files changed, 2422 deletions(-)
delete mode 100644 tests/test_extensions/test_sklearn_extension/__init__.py
delete mode 100644 tests/test_extensions/test_sklearn_extension/test_sklearn_extension.py
diff --git a/tests/test_extensions/test_sklearn_extension/__init__.py b/tests/test_extensions/test_sklearn_extension/__init__.py
deleted file mode 100644
index e69de29bb..000000000
diff --git a/tests/test_extensions/test_sklearn_extension/test_sklearn_extension.py b/tests/test_extensions/test_sklearn_extension/test_sklearn_extension.py
deleted file mode 100644
index 9913436e4..000000000
--- a/tests/test_extensions/test_sklearn_extension/test_sklearn_extension.py
+++ /dev/null
@@ -1,2422 +0,0 @@
-# License: BSD 3-Clause
-from __future__ import annotations
-
-import collections
-import json
-import os
-import re
-import sys
-import unittest
-import warnings
-from collections import OrderedDict
-from packaging.version import Version
-from typing import Any
-from unittest import mock
-
-import numpy as np
-import pandas as pd
-import pytest
-import scipy.optimize
-import scipy.stats
-import sklearn.base
-import sklearn.cluster
-import sklearn.datasets
-import sklearn.decomposition
-import sklearn.dummy
-import sklearn.ensemble
-import sklearn.feature_selection
-import sklearn.gaussian_process
-import sklearn.linear_model
-import sklearn.model_selection
-import sklearn.naive_bayes
-import sklearn.neural_network
-import sklearn.pipeline
-import sklearn.preprocessing
-import sklearn.tree
-from packaging import version
-from sklearn.pipeline import make_pipeline
-from sklearn.preprocessing import OneHotEncoder, StandardScaler
-
-import openml
-from openml.exceptions import PyOpenMLError
-from openml.extensions.sklearn import SklearnExtension, cat, cont
-from openml.flows import OpenMLFlow
-from openml.flows.functions import assert_flows_equal
-from openml.runs.trace import OpenMLRunTrace
-from openml.testing import CustomImputer, SimpleImputer, TestBase
-
-this_directory = os.path.dirname(os.path.abspath(__file__))
-sys.path.append(this_directory)
-
-
-__version__ = 0.1
-
-
-class Model(sklearn.base.BaseEstimator):
- def __init__(self, boolean, integer, floating_point_value):
- self.boolean = boolean
- self.integer = integer
- self.floating_point_value = floating_point_value
-
- def fit(self, X, y):
- pass
-
-
-def _cat_col_selector(X):
- return X.select_dtypes(include=["object", "category"]).columns
-
-
-def _get_sklearn_preprocessing():
- from sklearn.compose import ColumnTransformer
-
- return [
- (
- "cat_handling",
- ColumnTransformer(
- transformers=[
- (
- "cat",
- sklearn.pipeline.Pipeline(
- [
- (
- "cat_si",
- SimpleImputer(
- strategy="constant",
- fill_value="missing",
- ),
- ),
- ("cat_ohe", OneHotEncoder(handle_unknown="ignore")),
- ],
- ),
- _cat_col_selector,
- )
- ],
- remainder="passthrough",
- ),
- ),
- ("imp", SimpleImputer()),
- ]
-
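-# Sketch of how the steps above are typically consumed (an assumption for this
-# example): prepend them to an estimator in a Pipeline, so categorical columns are
-# imputed and one-hot encoded before the remaining numeric imputation.
-#
-#   model = sklearn.pipeline.Pipeline(
-#       _get_sklearn_preprocessing() + [("tree", sklearn.tree.DecisionTreeClassifier())]
-#   )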
-
-class TestSklearnExtensionFlowFunctions(TestBase):
-    # Splitting is not helpful; these tests don't rely on the server and take less
-    # than one second
-
- def setUp(self):
- super().setUp(n_levels=2)
- iris = sklearn.datasets.load_iris()
- self.X = iris.data
- self.y = iris.target
-
- self.extension = SklearnExtension()
-
- def _get_expected_pipeline_description(self, model: Any) -> str:
- if version.parse(sklearn.__version__) >= version.parse("1.0"):
- expected_fixture = (
- "Pipeline of transforms with a final estimator.\n\nSequentially"
- " apply a list of transforms and a final estimator.\n"
- "Intermediate steps of the pipeline must be 'transforms', that "
- "is, they\nmust implement `fit` and `transform` methods.\nThe final "
- "estimator only needs to implement `fit`.\nThe transformers in "
- "the pipeline can be cached using ``memory`` argument.\n\nThe "
- "purpose of the pipeline is to assemble several steps that can "
- "be\ncross-validated together while setting different parameters"
- ". For this, it\nenables setting parameters of the various steps"
- " using their names and the\nparameter name separated by a `'__'`,"
- " as in the example below. A step's\nestimator may be replaced "
- "entirely by setting the parameter with its name\nto another "
- "estimator, or a transformer removed by setting it to\n"
- "`'passthrough'` or `None`."
- )
- elif version.parse(sklearn.__version__) >= version.parse("0.21.0"):
- expected_fixture = (
- "Pipeline of transforms with a final estimator.\n\nSequentially"
- " apply a list of transforms and a final estimator.\n"
- "Intermediate steps of the pipeline must be 'transforms', that "
- "is, they\nmust implement fit and transform methods.\nThe final "
- "estimator only needs to implement fit.\nThe transformers in "
- "the pipeline can be cached using ``memory`` argument.\n\nThe "
- "purpose of the pipeline is to assemble several steps that can "
- "be\ncross-validated together while setting different parameters"
- ".\nFor this, it enables setting parameters of the various steps"
- " using their\nnames and the parameter name separated by a '__',"
- " as in the example below.\nA step's estimator may be replaced "
- "entirely by setting the parameter\nwith its name to another "
- "estimator, or a transformer removed by setting\nit to "
- "'passthrough' or ``None``."
- )
- else:
- expected_fixture = self.extension._get_sklearn_description(model)
- return expected_fixture
-
- def _serialization_test_helper(
- self,
- model,
- X,
- y,
- subcomponent_parameters,
- dependencies_mock_call_count=(1, 2),
- ):
- # Regex pattern for memory addresses of style 0x7f8e0f31ecf8
- pattern = re.compile("0x[0-9a-f]{12}")
-
- with mock.patch.object(self.extension, "_check_dependencies") as check_dependencies_mock:
- serialization = self.extension.model_to_flow(model)
-
- if X is not None:
- model.fit(X, y)
-
- new_model = self.extension.flow_to_model(serialization)
-        # compares string representations of the dicts, as they potentially
-        # contain complex objects that cannot be compared with the == operator;
-        # memory addresses are stripped first so the comparison is meaningful
-        assert re.sub(pattern, "", str(model.get_params())) == re.sub(
-            pattern, "", str(new_model.get_params())
-        )
-
- assert type(new_model) == type(model)
- assert new_model is not model
-
- if X is not None:
- new_model.fit(self.X, self.y)
-
- assert check_dependencies_mock.call_count == dependencies_mock_call_count[0]
-
- xml = serialization._to_dict()
- new_model2 = self.extension.flow_to_model(OpenMLFlow._from_dict(xml))
-        assert re.sub(pattern, "", str(model.get_params())) == re.sub(
-            pattern, "", str(new_model2.get_params())
-        )
-
- assert type(new_model2) == type(model)
- assert new_model2 is not model
-
- if X is not None:
- new_model2.fit(self.X, self.y)
-
- assert check_dependencies_mock.call_count == dependencies_mock_call_count[1]
-
- if subcomponent_parameters:
- for nm in (new_model, new_model2):
- new_model_params = nm.get_params()
- model_params = model.get_params()
- for subcomponent_parameter in subcomponent_parameters:
- assert type(new_model_params[subcomponent_parameter]) == type(
- model_params[subcomponent_parameter]
- )
- assert (
- new_model_params[subcomponent_parameter]
- is not model_params[subcomponent_parameter]
- )
- del new_model_params[subcomponent_parameter]
- del model_params[subcomponent_parameter]
- assert new_model_params == model_params
-
- return serialization, new_model
-
- @pytest.mark.sklearn()
- def test_serialize_model(self):
- max_features = "auto" if Version(sklearn.__version__) < Version("1.3") else "sqrt"
- model = sklearn.tree.DecisionTreeClassifier(
- criterion="entropy",
- max_features=max_features,
- max_leaf_nodes=2000,
- )
-
- tree_name = "tree" if Version(sklearn.__version__) < Version("0.22") else "_classes"
- fixture_name = f"sklearn.tree.{tree_name}.DecisionTreeClassifier"
- fixture_short_name = "sklearn.DecisionTreeClassifier"
- # str obtained from self.extension._get_sklearn_description(model)
- fixture_description = "A decision tree classifier."
- version_fixture = self.extension._min_dependency_str(sklearn.__version__)
-
- presort_val = "false" if Version(sklearn.__version__) < Version("0.22") else '"deprecated"'
- # min_impurity_decrease has been introduced in 0.20
- # min_impurity_split has been deprecated in 0.20
- if Version(sklearn.__version__) < Version("0.19"):
- fixture_parameters = OrderedDict(
- (
- ("class_weight", "null"),
- ("criterion", '"entropy"'),
- ("max_depth", "null"),
- ("max_features", '"auto"'),
- ("max_leaf_nodes", "2000"),
- ("min_impurity_split", "1e-07"),
- ("min_samples_leaf", "1"),
- ("min_samples_split", "2"),
- ("min_weight_fraction_leaf", "0.0"),
- ("presort", "false"),
- ("random_state", "null"),
- ("splitter", '"best"'),
- ),
- )
- elif Version(sklearn.__version__) < Version("1.0"):
- fixture_parameters = OrderedDict(
- (
- ("class_weight", "null"),
- ("criterion", '"entropy"'),
- ("max_depth", "null"),
- ("max_features", '"auto"'),
- ("max_leaf_nodes", "2000"),
- ("min_impurity_decrease", "0.0"),
- ("min_impurity_split", "null"),
- ("min_samples_leaf", "1"),
- ("min_samples_split", "2"),
- ("min_weight_fraction_leaf", "0.0"),
- ("presort", presort_val),
- ("random_state", "null"),
- ("splitter", '"best"'),
- ),
- )
- elif Version(sklearn.__version__) < Version("1.4"):
- fixture_parameters = OrderedDict(
- (
- ("class_weight", "null"),
- ("criterion", '"entropy"'),
- ("max_depth", "null"),
- ("max_features", f'"{max_features}"'),
- ("max_leaf_nodes", "2000"),
- ("min_impurity_decrease", "0.0"),
- ("min_samples_leaf", "1"),
- ("min_samples_split", "2"),
- ("min_weight_fraction_leaf", "0.0"),
- ("presort", presort_val),
- ("random_state", "null"),
- ("splitter", '"best"'),
- ),
- )
- else:
- fixture_parameters = OrderedDict(
- (
- ("class_weight", "null"),
- ("criterion", '"entropy"'),
- ("max_depth", "null"),
- ("max_features", f'"{max_features}"'),
- ("max_leaf_nodes", "2000"),
- ("min_impurity_decrease", "0.0"),
- ("min_samples_leaf", "1"),
- ("min_samples_split", "2"),
- ("min_weight_fraction_leaf", "0.0"),
- ("presort", presort_val),
- ("monotonic_cst", "null"),
- ("random_state", "null"),
- ("splitter", '"best"'),
- ),
- )
-
- if Version(sklearn.__version__) >= Version("0.22"):
- fixture_parameters.update({"ccp_alpha": "0.0"})
- fixture_parameters.move_to_end("ccp_alpha", last=False)
- if Version(sklearn.__version__) >= Version("0.24"):
- del fixture_parameters["presort"]
-
- structure_fixture = {f"sklearn.tree.{tree_name}.DecisionTreeClassifier": []}
-
- serialization, _ = self._serialization_test_helper(
- model,
- X=self.X,
- y=self.y,
- subcomponent_parameters=None,
- )
- structure = serialization.get_structure("name")
-
- assert serialization.name == fixture_name
- assert serialization.class_name == fixture_name
- assert serialization.custom_name == fixture_short_name
- assert serialization.description == fixture_description
- assert serialization.parameters == fixture_parameters
- assert serialization.dependencies == version_fixture
- self.assertDictEqual(structure, structure_fixture)
-
- @pytest.mark.sklearn()
- @pytest.mark.production()
- def test_can_handle_flow(self):
- openml.config.server = self.production_server
-
- R_flow = openml.flows.get_flow(6794)
- assert not self.extension.can_handle_flow(R_flow)
- old_3rd_party_flow = openml.flows.get_flow(7660)
- assert self.extension.can_handle_flow(old_3rd_party_flow)
-
- openml.config.server = self.test_server
-
- @pytest.mark.sklearn()
- def test_serialize_model_clustering(self):
- model = sklearn.cluster.KMeans()
-
- sklearn_version = Version(sklearn.__version__)
- cluster_name = "k_means_" if sklearn_version < Version("0.22") else "_kmeans"
- fixture_name = f"sklearn.cluster.{cluster_name}.KMeans"
- fixture_short_name = "sklearn.KMeans"
- # str obtained from self.extension._get_sklearn_description(model)
- fixture_description = "K-Means clustering{}".format(
- "" if sklearn_version < Version("0.22") else ".",
- )
- version_fixture = self.extension._min_dependency_str(sklearn.__version__)
-
- n_jobs_val = "1"
- if sklearn_version >= Version("0.20"):
- n_jobs_val = "null"
- if sklearn_version >= Version("0.23"):
- n_jobs_val = '"deprecated"'
-
- precomp_val = '"auto"' if sklearn_version < Version("0.23") else '"deprecated"'
- n_init = "10"
- if sklearn_version >= Version("1.2"):
- n_init = '"warn"'
- if sklearn_version >= Version("1.4"):
- n_init = '"auto"'
-
- algorithm = '"auto"' if sklearn_version < Version("1.1") else '"lloyd"'
- fixture_parameters = OrderedDict(
- [
- ("algorithm", algorithm),
- ("copy_x", "true"),
- ("init", '"k-means++"'),
- ("max_iter", "300"),
- ("n_clusters", "8"),
- ("n_init", n_init),
- ("n_jobs", n_jobs_val),
- ("precompute_distances", precomp_val),
- ("random_state", "null"),
- ("tol", "0.0001"),
- ("verbose", "0"),
- ]
- )
-
- if sklearn_version >= Version("1.0"):
- fixture_parameters.pop("n_jobs")
- fixture_parameters.pop("precompute_distances")
-
- fixture_structure = {f"sklearn.cluster.{cluster_name}.KMeans": []}
-
- serialization, _ = self._serialization_test_helper(
- model,
- X=None,
- y=None,
- subcomponent_parameters=None,
- )
- structure = serialization.get_structure("name")
-
- assert serialization.name == fixture_name
- assert serialization.class_name == fixture_name
- assert serialization.custom_name == fixture_short_name
- assert serialization.description == fixture_description
- assert serialization.parameters == fixture_parameters
- assert serialization.dependencies == version_fixture
- assert structure == fixture_structure
-
- @pytest.mark.sklearn()
- def test_serialize_model_with_subcomponent(self):
- estimator_name = (
- "base_estimator" if Version(sklearn.__version__) < Version("1.4") else "estimator"
- )
- estimator_param = {estimator_name: sklearn.tree.DecisionTreeClassifier()}
- model = sklearn.ensemble.AdaBoostClassifier(
- n_estimators=100,
- **estimator_param,
- )
-
- weight_name = "{}weight_boosting".format(
- "" if Version(sklearn.__version__) < Version("0.22") else "_",
- )
- tree_name = "tree" if Version(sklearn.__version__) < Version("0.22") else "_classes"
- fixture_name = (
- f"sklearn.ensemble.{weight_name}.AdaBoostClassifier"
- f"({estimator_name}=sklearn.tree.{tree_name}.DecisionTreeClassifier)"
- )
- fixture_class_name = f"sklearn.ensemble.{weight_name}.AdaBoostClassifier"
- fixture_short_name = "sklearn.AdaBoostClassifier"
- # str obtained from self.extension._get_sklearn_description(model)
- fixture_description = (
- "An AdaBoost classifier.\n\nAn AdaBoost [1] classifier is a "
- "meta-estimator that begins by fitting a\nclassifier on the original"
- " dataset and then fits additional copies of the\nclassifier on the "
- "same dataset but where the weights of incorrectly\nclassified "
- "instances are adjusted such that subsequent classifiers focus\nmore"
- " on difficult cases.\n\nThis class implements the algorithm known "
- "as AdaBoost-SAMME [2]."
- )
- fixture_subcomponent_name = f"sklearn.tree.{tree_name}.DecisionTreeClassifier"
- fixture_subcomponent_class_name = f"sklearn.tree.{tree_name}.DecisionTreeClassifier"
- # str obtained from self.extension._get_sklearn_description(model.base_estimator)
- fixture_subcomponent_description = "A decision tree classifier."
- fixture_structure = {
- fixture_name: [],
- f"sklearn.tree.{tree_name}.DecisionTreeClassifier": [estimator_name],
- }
-
- serialization, _ = self._serialization_test_helper(
- model,
- X=self.X,
- y=self.y,
- subcomponent_parameters=[estimator_name],
- dependencies_mock_call_count=(2, 4),
- )
- structure = serialization.get_structure("name")
-
- assert serialization.name == fixture_name
- assert serialization.class_name == fixture_class_name
- assert serialization.custom_name == fixture_short_name
- if Version(sklearn.__version__) < Version("1.4"):
- assert serialization.description == fixture_description
- assert serialization.parameters["algorithm"] == '"SAMME.R"'
- assert isinstance(serialization.parameters[estimator_name], str)
- assert serialization.parameters["learning_rate"] == "1.0"
- assert serialization.parameters["n_estimators"] == "100"
- assert serialization.components[estimator_name].name == fixture_subcomponent_name
- assert (
- serialization.components[estimator_name].class_name == fixture_subcomponent_class_name
- )
- assert (
- serialization.components[estimator_name].description == fixture_subcomponent_description
- )
- self.assertDictEqual(structure, fixture_structure)
-
- @pytest.mark.sklearn()
- def test_serialize_pipeline(self):
- scaler = sklearn.preprocessing.StandardScaler(with_mean=False)
- dummy = sklearn.dummy.DummyClassifier(strategy="prior")
- model = sklearn.pipeline.Pipeline(steps=[("scaler", scaler), ("dummy", dummy)])
-
- scaler_name = "data" if Version(sklearn.__version__) < Version("0.22") else "_data"
- fixture_name = (
- "sklearn.pipeline.Pipeline("
- f"scaler=sklearn.preprocessing.{scaler_name}.StandardScaler,"
- "dummy=sklearn.dummy.DummyClassifier)"
- )
- fixture_short_name = "sklearn.Pipeline(StandardScaler,DummyClassifier)"
- fixture_description = self._get_expected_pipeline_description(model)
- fixture_structure = {
- fixture_name: [],
- f"sklearn.preprocessing.{scaler_name}.StandardScaler": ["scaler"],
- "sklearn.dummy.DummyClassifier": ["dummy"],
- }
-
- serialization, new_model = self._serialization_test_helper(
- model,
- X=self.X,
- y=self.y,
- subcomponent_parameters=["scaler", "dummy", "steps"],
- dependencies_mock_call_count=(3, 6),
- )
- structure = serialization.get_structure("name")
-
- assert serialization.name == fixture_name
- assert serialization.custom_name == fixture_short_name
- if Version(sklearn.__version__) < Version("1.3"):
-            # Newer versions of scikit-learn have updated docstrings
- assert serialization.description == fixture_description
- self.assertDictEqual(structure, fixture_structure)
-
- # Comparing the pipeline
-        # The parameters only have the name of the base objects (not the whole flow)
- # as value
- # memory parameter has been added in 0.19, verbose in 0.21
- if Version(sklearn.__version__) < Version("0.19"):
- assert len(serialization.parameters) == 1
- elif Version(sklearn.__version__) < Version("0.21"):
- assert len(serialization.parameters) == 2
- else:
- assert len(serialization.parameters) == 3
-
- # Hard to compare two representations of a dict due to possibly
- # different sorting. Making a json makes it easier
- assert json.loads(serialization.parameters["steps"]) == [
- {
- "oml-python:serialized_object": "component_reference",
- "value": {"key": "scaler", "step_name": "scaler"},
- },
- {
- "oml-python:serialized_object": "component_reference",
- "value": {"key": "dummy", "step_name": "dummy"},
- },
- ]
-
- # Checking the sub-component
- assert len(serialization.components) == 2
- assert isinstance(serialization.components["scaler"], OpenMLFlow)
- assert isinstance(serialization.components["dummy"], OpenMLFlow)
-
- assert [step[0] for step in new_model.steps] == [step[0] for step in model.steps]
- assert new_model.steps[0][1] is not model.steps[0][1]
- assert new_model.steps[1][1] is not model.steps[1][1]
-
- @pytest.mark.sklearn()
- def test_serialize_pipeline_clustering(self):
- scaler = sklearn.preprocessing.StandardScaler(with_mean=False)
- km = sklearn.cluster.KMeans()
- model = sklearn.pipeline.Pipeline(steps=[("scaler", scaler), ("clusterer", km)])
-
- scaler_name = "data" if Version(sklearn.__version__) < Version("0.22") else "_data"
- cluster_name = "k_means_" if Version(sklearn.__version__) < Version("0.22") else "_kmeans"
- fixture_name = (
- "sklearn.pipeline.Pipeline("
- f"scaler=sklearn.preprocessing.{scaler_name}.StandardScaler,"
- f"clusterer=sklearn.cluster.{cluster_name}.KMeans)"
- )
- fixture_short_name = "sklearn.Pipeline(StandardScaler,KMeans)"
- fixture_description = self._get_expected_pipeline_description(model)
- fixture_structure = {
- fixture_name: [],
- f"sklearn.preprocessing.{scaler_name}.StandardScaler": ["scaler"],
- f"sklearn.cluster.{cluster_name}.KMeans": ["clusterer"],
- }
- serialization, new_model = self._serialization_test_helper(
- model,
- X=None,
- y=None,
- subcomponent_parameters=["scaler", "steps", "clusterer"],
- dependencies_mock_call_count=(3, 6),
- )
- structure = serialization.get_structure("name")
-
- assert serialization.name == fixture_name
- assert serialization.custom_name == fixture_short_name
- if Version(sklearn.__version__) < Version("1.3"):
-            # Newer versions of scikit-learn have updated docstrings
- assert serialization.description == fixture_description
- self.assertDictEqual(structure, fixture_structure)
-
- # Comparing the pipeline
-        # The parameters only have the name of the base objects (not the whole flow)
- # as value
- # memory parameter has been added in 0.19
- if Version(sklearn.__version__) < Version("0.19"):
- assert len(serialization.parameters) == 1
- elif Version(sklearn.__version__) < Version("0.21"):
- assert len(serialization.parameters) == 2
- else:
- assert len(serialization.parameters) == 3
- # Hard to compare two representations of a dict due to possibly
- # different sorting. Making a json makes it easier
- assert json.loads(serialization.parameters["steps"]) == [
- {
- "oml-python:serialized_object": "component_reference",
- "value": {"key": "scaler", "step_name": "scaler"},
- },
- {
- "oml-python:serialized_object": "component_reference",
- "value": {"key": "clusterer", "step_name": "clusterer"},
- },
- ]
-
- # Checking the sub-component
- assert len(serialization.components) == 2
- assert isinstance(serialization.components["scaler"], OpenMLFlow)
- assert isinstance(serialization.components["clusterer"], OpenMLFlow)
-
- assert [step[0] for step in new_model.steps] == [step[0] for step in model.steps]
- assert new_model.steps[0][1] is not model.steps[0][1]
- assert new_model.steps[1][1] is not model.steps[1][1]
-
- @pytest.mark.sklearn()
- @unittest.skipIf(
- Version(sklearn.__version__) < Version("0.20"),
- reason="columntransformer introduction in 0.20.0",
- )
- def test_serialize_column_transformer(self):
-        # temporary local import, dependent on version 0.20
- import sklearn.compose
-
- model = sklearn.compose.ColumnTransformer(
- transformers=[
- ("numeric", sklearn.preprocessing.StandardScaler(), [0, 1, 2]),
- (
- "nominal",
- sklearn.preprocessing.OneHotEncoder(handle_unknown="ignore"),
- [3, 4, 5],
- ),
- ("drop", "drop", [6, 7, 8]),
- ],
- remainder="passthrough",
- )
-
- scaler_name = "data" if Version(sklearn.__version__) < Version("0.22") else "_data"
- fixture = (
- "sklearn.compose._column_transformer.ColumnTransformer("
- f"numeric=sklearn.preprocessing.{scaler_name}.StandardScaler,"
- "nominal=sklearn.preprocessing._encoders.OneHotEncoder,drop=drop)"
- )
- fixture_short_name = "sklearn.ColumnTransformer"
-
- if version.parse(sklearn.__version__) >= version.parse("0.21.0"):
- # str obtained from self.extension._get_sklearn_description(model)
- fixture_description = (
- "Applies transformers to columns of an array or pandas "
- "DataFrame.\n\nThis estimator allows different columns or "
- "column subsets of the input\nto be transformed separately and "
- "the features generated by each transformer\nwill be "
- "concatenated to form a single feature space.\nThis is useful "
- "for heterogeneous or columnar data, to combine several\nfeature"
- " extraction mechanisms or transformations into a single "
- "transformer."
- )
- else:
- fixture_description = self.extension._get_sklearn_description(model)
-
- fixture_structure = {
- fixture: [],
- f"sklearn.preprocessing.{scaler_name}.StandardScaler": ["numeric"],
- "sklearn.preprocessing._encoders.OneHotEncoder": ["nominal"],
- "drop": ["drop"],
- }
-
- serialization = self.extension.model_to_flow(model)
- structure = serialization.get_structure("name")
- assert serialization.name == fixture
- assert serialization.custom_name == fixture_short_name
- assert serialization.description == fixture_description
- self.assertDictEqual(structure, fixture_structure)
-
- @pytest.mark.sklearn()
- @unittest.skipIf(
- Version(sklearn.__version__) < Version("0.20"),
- reason="columntransformer introduction in 0.20.0",
- )
- def test_serialize_column_transformer_pipeline(self):
-        # temporary local import, dependent on version 0.20
- import sklearn.compose
-
- inner = sklearn.compose.ColumnTransformer(
- transformers=[
- ("numeric", sklearn.preprocessing.StandardScaler(), [0, 1, 2]),
- (
- "nominal",
- sklearn.preprocessing.OneHotEncoder(handle_unknown="ignore"),
- [3, 4, 5],
- ),
- ],
- remainder="passthrough",
- )
- model = sklearn.pipeline.Pipeline(
- steps=[("transformer", inner), ("classifier", sklearn.tree.DecisionTreeClassifier())],
- )
- scaler_name = "data" if Version(sklearn.__version__) < Version("0.22") else "_data"
- tree_name = "tree" if Version(sklearn.__version__) < Version("0.22") else "_classes"
- fixture_name = (
- "sklearn.pipeline.Pipeline("
- "transformer=sklearn.compose._column_transformer."
- "ColumnTransformer("
- f"numeric=sklearn.preprocessing.{scaler_name}.StandardScaler,"
- "nominal=sklearn.preprocessing._encoders.OneHotEncoder),"
- f"classifier=sklearn.tree.{tree_name}.DecisionTreeClassifier)"
- )
- fixture_structure = {
- f"sklearn.preprocessing.{scaler_name}.StandardScaler": [
- "transformer",
- "numeric",
- ],
- "sklearn.preprocessing._encoders.OneHotEncoder": ["transformer", "nominal"],
- "sklearn.compose._column_transformer.ColumnTransformer(numeric="
- f"sklearn.preprocessing.{scaler_name}.StandardScaler,nominal=sklearn."
- "preprocessing._encoders.OneHotEncoder)": ["transformer"],
- f"sklearn.tree.{tree_name}.DecisionTreeClassifier": ["classifier"],
- fixture_name: [],
- }
-
- fixture_description = self._get_expected_pipeline_description(model)
- serialization, new_model = self._serialization_test_helper(
- model,
- X=None,
- y=None,
- subcomponent_parameters=(
- "transformer",
- "classifier",
- "transformer__transformers",
- "steps",
- "transformer__nominal",
- "transformer__numeric",
- ),
- dependencies_mock_call_count=(5, 10),
- )
- structure = serialization.get_structure("name")
- assert serialization.name == fixture_name
- if Version(sklearn.__version__) < Version("1.3"): # Not yet up-to-date for later versions
- assert serialization.description == fixture_description
- self.assertDictEqual(structure, fixture_structure)
-
- @pytest.mark.sklearn()
- @unittest.skipIf(
- Version(sklearn.__version__) < Version("0.20"),
- reason="Pipeline processing behaviour updated",
- )
- def test_serialize_feature_union(self):
- sparse_parameter = (
- "sparse" if Version(sklearn.__version__) < Version("1.4") else "sparse_output"
- )
- ohe_params = {sparse_parameter: False}
- if Version(sklearn.__version__) >= Version("0.20"):
- ohe_params["categories"] = "auto"
- ohe = sklearn.preprocessing.OneHotEncoder(**ohe_params)
- scaler = sklearn.preprocessing.StandardScaler()
-
- fu = sklearn.pipeline.FeatureUnion(transformer_list=[("ohe", ohe), ("scaler", scaler)])
- serialization, new_model = self._serialization_test_helper(
- fu,
- X=self.X,
- y=self.y,
- subcomponent_parameters=("ohe", "scaler", "transformer_list"),
- dependencies_mock_call_count=(3, 6),
- )
- structure = serialization.get_structure("name")
- # OneHotEncoder was moved to _encoders module in 0.20
- module_name_encoder = (
- "_encoders" if Version(sklearn.__version__) >= Version("0.20") else "data"
- )
- scaler_name = "data" if Version(sklearn.__version__) < Version("0.22") else "_data"
- fixture_name = (
- "sklearn.pipeline.FeatureUnion("
- f"ohe=sklearn.preprocessing.{module_name_encoder}.OneHotEncoder,"
- f"scaler=sklearn.preprocessing.{scaler_name}.StandardScaler)"
- )
- fixture_structure = {
- fixture_name: [],
- f"sklearn.preprocessing.{module_name_encoder}.OneHotEncoder": ["ohe"],
- f"sklearn.preprocessing.{scaler_name}.StandardScaler": ["scaler"],
- }
- assert serialization.name == fixture_name
- self.assertDictEqual(structure, fixture_structure)
- assert new_model.transformer_list[0][0] == fu.transformer_list[0][0]
- assert (
- new_model.transformer_list[0][1].get_params() == fu.transformer_list[0][1].get_params()
- )
- assert new_model.transformer_list[1][0] == fu.transformer_list[1][0]
- assert (
- new_model.transformer_list[1][1].get_params() == fu.transformer_list[1][1].get_params()
- )
-
- assert [step[0] for step in new_model.transformer_list] == [
- step[0] for step in fu.transformer_list
- ]
- assert new_model.transformer_list[0][1] is not fu.transformer_list[0][1]
- assert new_model.transformer_list[1][1] is not fu.transformer_list[1][1]
-
- fu.set_params(scaler="drop")
- serialization, new_model = self._serialization_test_helper(
- fu,
- X=self.X,
- y=self.y,
- subcomponent_parameters=("ohe", "transformer_list"),
- dependencies_mock_call_count=(3, 6),
- )
- assert (
- serialization.name == "sklearn.pipeline.FeatureUnion("
- f"ohe=sklearn.preprocessing.{module_name_encoder}.OneHotEncoder,"
- "scaler=drop)"
- )
- assert new_model.transformer_list[1][1] == "drop"
-
- @pytest.mark.sklearn()
- def test_serialize_feature_union_switched_names(self):
- ohe_params = (
- {"categories": "auto"} if Version(sklearn.__version__) >= Version("0.20") else {}
- )
- ohe = sklearn.preprocessing.OneHotEncoder(**ohe_params)
- scaler = sklearn.preprocessing.StandardScaler()
- fu1 = sklearn.pipeline.FeatureUnion(transformer_list=[("ohe", ohe), ("scaler", scaler)])
- fu2 = sklearn.pipeline.FeatureUnion(transformer_list=[("scaler", ohe), ("ohe", scaler)])
-
- fu1_serialization, _ = self._serialization_test_helper(
- fu1,
- X=None,
- y=None,
- subcomponent_parameters=(),
- dependencies_mock_call_count=(3, 6),
- )
- fu2_serialization, _ = self._serialization_test_helper(
- fu2,
- X=None,
- y=None,
- subcomponent_parameters=(),
- dependencies_mock_call_count=(3, 6),
- )
-
- # OneHotEncoder was moved to _encoders module in 0.20
- module_name_encoder = (
- "_encoders" if Version(sklearn.__version__) >= Version("0.20") else "data"
- )
- scaler_name = "data" if Version(sklearn.__version__) < Version("0.22") else "_data"
- assert (
- fu1_serialization.name == "sklearn.pipeline.FeatureUnion("
- f"ohe=sklearn.preprocessing.{module_name_encoder}.OneHotEncoder,"
- f"scaler=sklearn.preprocessing.{scaler_name}.StandardScaler)"
- )
- assert (
- fu2_serialization.name == "sklearn.pipeline.FeatureUnion("
- f"scaler=sklearn.preprocessing.{module_name_encoder}.OneHotEncoder,"
- f"ohe=sklearn.preprocessing.{scaler_name}.StandardScaler)"
- )
-
- @pytest.mark.sklearn()
- @unittest.skipIf(
- Version(sklearn.__version__) >= Version("1.4"),
- "AdaBoost parameter name changed as did the way its forwarded to GridSearchCV",
- )
- def test_serialize_complex_flow(self):
- ohe = sklearn.preprocessing.OneHotEncoder(handle_unknown="ignore")
- scaler = sklearn.preprocessing.StandardScaler(with_mean=False)
- boosting = sklearn.ensemble.AdaBoostClassifier(
- base_estimator=sklearn.tree.DecisionTreeClassifier(),
- )
- model = sklearn.pipeline.Pipeline(
- steps=[("ohe", ohe), ("scaler", scaler), ("boosting", boosting)],
- )
- parameter_grid = {
- "boosting__base_estimator__max_depth": scipy.stats.randint(1, 10),
- "boosting__learning_rate": scipy.stats.uniform(0.01, 0.99),
- "boosting__n_estimators": [1, 5, 10, 100],
- }
-        # convert to an ordered dict (sorted by keys) due to the param grid check
- parameter_grid = OrderedDict(sorted(parameter_grid.items()))
- cv = sklearn.model_selection.StratifiedKFold(n_splits=5, shuffle=True)
- rs = sklearn.model_selection.RandomizedSearchCV(
- estimator=model,
- param_distributions=parameter_grid,
- cv=cv,
- )
- serialized, new_model = self._serialization_test_helper(
- rs,
- X=self.X,
- y=self.y,
- subcomponent_parameters=(),
- dependencies_mock_call_count=(6, 12),
- )
- structure = serialized.get_structure("name")
- # OneHotEncoder was moved to _encoders module in 0.20
- module_name_encoder = (
- "_encoders" if Version(sklearn.__version__) >= Version("0.20") else "data"
- )
- ohe_name = f"sklearn.preprocessing.{module_name_encoder}.OneHotEncoder"
- scaler_name = "sklearn.preprocessing.{}.StandardScaler".format(
- "data" if Version(sklearn.__version__) < Version("0.22") else "_data",
- )
- tree_name = "sklearn.tree.{}.DecisionTreeClassifier".format(
- "tree" if Version(sklearn.__version__) < Version("0.22") else "_classes",
- )
- weight_name = "weight" if Version(sklearn.__version__) < Version("0.22") else "_weight"
- boosting_name = "sklearn.ensemble.{}_boosting.AdaBoostClassifier(base_estimator={})".format(
- weight_name,
- tree_name,
- )
- pipeline_name = "sklearn.pipeline.Pipeline(ohe={},scaler={},boosting={})".format(
- ohe_name,
- scaler_name,
- boosting_name,
- )
- fixture_name = (
- f"sklearn.model_selection._search.RandomizedSearchCV(estimator={pipeline_name})"
- )
- fixture_structure = {
- ohe_name: ["estimator", "ohe"],
- scaler_name: ["estimator", "scaler"],
- tree_name: ["estimator", "boosting", "base_estimator"],
- boosting_name: ["estimator", "boosting"],
- pipeline_name: ["estimator"],
- fixture_name: [],
- }
- assert serialized.name == fixture_name
- assert structure == fixture_structure
-
- @pytest.mark.sklearn()
- @unittest.skipIf(
- Version(sklearn.__version__) < Version("0.21"),
-        reason="Pipeline before 0.21 doesn't support 'passthrough'",
- )
- def test_serialize_strings_as_pipeline_steps(self):
- import sklearn.compose
-
- # First check: test whether a passthrough in a pipeline is serialized correctly
- model = sklearn.pipeline.Pipeline(steps=[("transformer", "passthrough")])
- serialized = self.extension.model_to_flow(model)
- assert isinstance(serialized, OpenMLFlow)
- assert len(serialized.components) == 1
- assert serialized.components["transformer"].name == "passthrough"
- serialized = self.extension._serialize_sklearn(
- ("transformer", "passthrough"),
- parent_model=model,
- )
- assert serialized == ("transformer", "passthrough")
- extracted_info = self.extension._extract_information_from_model(model)
- assert len(extracted_info[2]) == 1
- assert isinstance(extracted_info[2]["transformer"], OpenMLFlow)
- assert extracted_info[2]["transformer"].name == "passthrough"
-
- # Second check: test whether a lone passthrough in a column transformer is serialized
- # correctly
- model = sklearn.compose.ColumnTransformer([("passthrough", "passthrough", (0,))])
- serialized = self.extension.model_to_flow(model)
- assert isinstance(serialized, OpenMLFlow)
- assert len(serialized.components) == 1
- assert serialized.components["passthrough"].name == "passthrough"
- serialized = self.extension._serialize_sklearn(
- ("passthrough", "passthrough"),
- parent_model=model,
- )
- assert serialized == ("passthrough", "passthrough")
- extracted_info = self.extension._extract_information_from_model(model)
- assert len(extracted_info[2]) == 1
- assert isinstance(extracted_info[2]["passthrough"], OpenMLFlow)
- assert extracted_info[2]["passthrough"].name == "passthrough"
-
- # Third check: passthrough and drop in a column transformer
- model = sklearn.compose.ColumnTransformer(
- [("passthrough", "passthrough", (0,)), ("drop", "drop", (1,))],
- )
- serialized = self.extension.model_to_flow(model)
- assert isinstance(serialized, OpenMLFlow)
- assert len(serialized.components) == 2
- assert serialized.components["passthrough"].name == "passthrough"
- assert serialized.components["drop"].name == "drop"
- serialized = self.extension._serialize_sklearn(
- ("passthrough", "passthrough"),
- parent_model=model,
- )
- assert serialized == ("passthrough", "passthrough")
- extracted_info = self.extension._extract_information_from_model(model)
- assert len(extracted_info[2]) == 2
- assert isinstance(extracted_info[2]["passthrough"], OpenMLFlow)
- assert isinstance(extracted_info[2]["drop"], OpenMLFlow)
- assert extracted_info[2]["passthrough"].name == "passthrough"
- assert extracted_info[2]["drop"].name == "drop"
-
- # Fourth check: having an actual preprocessor in the column transformer, too
- model = sklearn.compose.ColumnTransformer(
- [
- ("passthrough", "passthrough", (0,)),
- ("drop", "drop", (1,)),
- ("test", sklearn.preprocessing.StandardScaler(), (2,)),
- ],
- )
- serialized = self.extension.model_to_flow(model)
- assert isinstance(serialized, OpenMLFlow)
- assert len(serialized.components) == 3
- assert serialized.components["passthrough"].name == "passthrough"
- assert serialized.components["drop"].name == "drop"
- serialized = self.extension._serialize_sklearn(
- ("passthrough", "passthrough"),
- parent_model=model,
- )
- assert serialized == ("passthrough", "passthrough")
- extracted_info = self.extension._extract_information_from_model(model)
- assert len(extracted_info[2]) == 3
- assert isinstance(extracted_info[2]["passthrough"], OpenMLFlow)
- assert isinstance(extracted_info[2]["drop"], OpenMLFlow)
- assert extracted_info[2]["passthrough"].name == "passthrough"
- assert extracted_info[2]["drop"].name == "drop"
-
- # Fifth check: test whether a lone drop in a feature union is serialized correctly
- model = sklearn.pipeline.FeatureUnion([("drop", "drop")])
- serialized = self.extension.model_to_flow(model)
- assert isinstance(serialized, OpenMLFlow)
- assert len(serialized.components) == 1
- assert serialized.components["drop"].name == "drop"
- serialized = self.extension._serialize_sklearn(("drop", "drop"), parent_model=model)
- assert serialized == ("drop", "drop")
- extracted_info = self.extension._extract_information_from_model(model)
- assert len(extracted_info[2]) == 1
- assert isinstance(extracted_info[2]["drop"], OpenMLFlow)
- assert extracted_info[2]["drop"].name == "drop"
-
- @pytest.mark.sklearn()
- def test_serialize_type(self):
- supported_types = [float, np.float32, np.float64, int, np.int32, np.int64]
- if Version(np.__version__) < Version("1.24"):
-            # the `np.float` and `np.int` aliases were removed in numpy 1.24
-            supported_types.append(np.float)
-            supported_types.append(np.int)
-
- for supported_type in supported_types:
- serialized = self.extension.model_to_flow(supported_type)
- deserialized = self.extension.flow_to_model(serialized)
- assert deserialized == supported_type
-
- @pytest.mark.sklearn()
- def test_serialize_rvs(self):
- supported_rvs = [
- scipy.stats.norm(loc=1, scale=5),
- scipy.stats.expon(loc=1, scale=5),
- scipy.stats.randint(low=-3, high=15),
- ]
-
- for supported_rv in supported_rvs:
- serialized = self.extension.model_to_flow(supported_rv)
- deserialized = self.extension.flow_to_model(serialized)
-            assert type(deserialized.dist) is type(supported_rv.dist)
- del deserialized.dist
- del supported_rv.dist
- assert deserialized.__dict__ == supported_rv.__dict__
-
- @pytest.mark.sklearn()
- def test_serialize_function(self):
- serialized = self.extension.model_to_flow(sklearn.feature_selection.chi2)
- deserialized = self.extension.flow_to_model(serialized)
- assert deserialized == sklearn.feature_selection.chi2
-
- @pytest.mark.sklearn()
- def test_serialize_cvobject(self):
- methods = [sklearn.model_selection.KFold(3), sklearn.model_selection.LeaveOneOut()]
- fixtures = [
- OrderedDict(
- [
- ("oml-python:serialized_object", "cv_object"),
- (
- "value",
- OrderedDict(
- [
- ("name", "sklearn.model_selection._split.KFold"),
- (
- "parameters",
- OrderedDict(
- [
- ("n_splits", "3"),
- ("random_state", "null"),
- ("shuffle", "false"),
- ],
- ),
- ),
- ],
- ),
- ),
- ],
- ),
- OrderedDict(
- [
- ("oml-python:serialized_object", "cv_object"),
- (
- "value",
- OrderedDict(
- [
- ("name", "sklearn.model_selection._split.LeaveOneOut"),
- ("parameters", OrderedDict()),
- ],
- ),
- ),
- ],
- ),
- ]
- for method, fixture in zip(methods, fixtures):
- m = self.extension.model_to_flow(method)
- assert m == fixture
-
- m_new = self.extension.flow_to_model(m)
- assert m_new is not m
- assert isinstance(m_new, type(method))
-
- @pytest.mark.sklearn()
- def test_serialize_simple_parameter_grid(self):
- # We cannot easily test for scipy random variables in here, but they
- # should be covered
-
- # Examples from the scikit-learn documentation
- models = [sklearn.svm.SVC(), sklearn.ensemble.RandomForestClassifier()]
- grids = [
- [
- OrderedDict([("C", [1, 10, 100, 1000]), ("kernel", ["linear"])]),
- OrderedDict(
- [("C", [1, 10, 100, 1000]), ("gamma", [0.001, 0.0001]), ("kernel", ["rbf"])],
- ),
- ],
- OrderedDict(
- [
- ("bootstrap", [True, False]),
- ("criterion", ["gini", "entropy"]),
- ("max_depth", [3, None]),
- ("max_features", [1, 3, 10]),
- ("min_samples_leaf", [1, 3, 10]),
- ("min_samples_split", [1, 3, 10]),
- ],
- ),
- ]
-
- for grid, model in zip(grids, models):
- serialized = self.extension.model_to_flow(grid)
- deserialized = self.extension.flow_to_model(serialized)
-
- assert deserialized == grid
- assert deserialized is not grid
- # providing error_score because nan != nan
- hpo = sklearn.model_selection.GridSearchCV(
- param_grid=grid,
- estimator=model,
- error_score=-1000,
- )
-
- serialized = self.extension.model_to_flow(hpo)
- deserialized = self.extension.flow_to_model(serialized)
- assert hpo.param_grid == deserialized.param_grid
- assert hpo.estimator.get_params() == deserialized.estimator.get_params()
- hpo_params = hpo.get_params(deep=False)
- deserialized_params = deserialized.get_params(deep=False)
- del hpo_params["estimator"]
- del deserialized_params["estimator"]
- assert hpo_params == deserialized_params
-
- @pytest.mark.sklearn()
- @unittest.skip(
- "This feature needs further reworking. If we allow several "
- "components, we need to register them all in the downstream "
- "flows. This is so far not implemented.",
- )
- def test_serialize_advanced_grid(self):
- # TODO instead a GridSearchCV object should be serialized
-
-        # This needs its own function: we cannot simply check the grid for
-        # equality, since scikit-learn objects don't really support the
-        # equality operator.
-        # This will only work with sklearn==0.18
- N_FEATURES_OPTIONS = [2, 4, 8]
- C_OPTIONS = [1, 10, 100, 1000]
- grid = [
- {
- "reduce_dim": [
- sklearn.decomposition.PCA(iterated_power=7),
- sklearn.decomposition.NMF(),
- ],
- "reduce_dim__n_components": N_FEATURES_OPTIONS,
- "classify__C": C_OPTIONS,
- },
- {
- "reduce_dim": [
- sklearn.feature_selection.SelectKBest(sklearn.feature_selection.chi2),
- ],
- "reduce_dim__k": N_FEATURES_OPTIONS,
- "classify__C": C_OPTIONS,
- },
- ]
-
- serialized = self.extension.model_to_flow(grid)
- deserialized = self.extension.flow_to_model(serialized)
-
- assert (
- grid[0]["reduce_dim"][0].get_params() == deserialized[0]["reduce_dim"][0].get_params()
- )
- assert grid[0]["reduce_dim"][0] is not deserialized[0]["reduce_dim"][0]
- assert (
- grid[0]["reduce_dim"][1].get_params() == deserialized[0]["reduce_dim"][1].get_params()
- )
- assert grid[0]["reduce_dim"][1] is not deserialized[0]["reduce_dim"][1]
- assert grid[0]["reduce_dim__n_components"] == deserialized[0]["reduce_dim__n_components"]
- assert grid[0]["classify__C"] == deserialized[0]["classify__C"]
- assert (
- grid[1]["reduce_dim"][0].get_params() == deserialized[1]["reduce_dim"][0].get_params()
- )
- assert grid[1]["reduce_dim"][0] is not deserialized[1]["reduce_dim"][0]
- assert grid[1]["reduce_dim__k"] == deserialized[1]["reduce_dim__k"]
- assert grid[1]["classify__C"] == deserialized[1]["classify__C"]
-
- @pytest.mark.sklearn()
- def test_serialize_advanced_grid_fails(self):
- # This unit test is checking that the test we skip above would actually fail
-
- param_grid = {
- "base_estimator": [
- sklearn.tree.DecisionTreeClassifier(),
- sklearn.tree.ExtraTreeClassifier(),
- ],
- }
-
- clf = sklearn.model_selection.GridSearchCV(
- sklearn.ensemble.BaggingClassifier(),
- param_grid=param_grid,
- )
- with pytest.raises(
- TypeError,
- match=re.compile(r".*OpenML.*Flow.*is not JSON serializable", flags=re.DOTALL),
- ):
- self.extension.model_to_flow(clf)
-
- @pytest.mark.sklearn()
- def test_serialize_resampling(self):
- kfold = sklearn.model_selection.StratifiedKFold(n_splits=4, shuffle=True)
- serialized = self.extension.model_to_flow(kfold)
- deserialized = self.extension.flow_to_model(serialized)
- # Best approximation to get_params()
- assert str(deserialized) == str(kfold)
- assert deserialized is not kfold
-
- @pytest.mark.sklearn()
- def test_hypothetical_parameter_values(self):
-        # The hypothetical parameter values of true, 1, 0.1 formatted as a
-        # string (and their correct serialization and deserialization) can only
-        # be checked inside a model
-
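-        # `Model` is assumed to be a small helper estimator defined earlier in this
-        # test module; it takes three constructor parameters, passed here as strings.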
- model = Model("true", "1", "0.1")
-
- serialized = self.extension.model_to_flow(model)
- serialized.external_version = "sklearn==test123"
- deserialized = self.extension.flow_to_model(serialized)
- assert deserialized.get_params() == model.get_params()
- assert deserialized is not model
-
- @pytest.mark.sklearn()
- def test_gaussian_process(self):
- opt = scipy.optimize.fmin_l_bfgs_b
- kernel = sklearn.gaussian_process.kernels.Matern()
- gp = sklearn.gaussian_process.GaussianProcessClassifier(kernel=kernel, optimizer=opt)
- with pytest.raises(
- TypeError,
- match=r"Matern\(length_scale=1, nu=1.5\), ",
- ):
- self.extension.model_to_flow(gp)
-
- @pytest.mark.sklearn()
- def test_error_on_adding_component_multiple_times_to_flow(self):
- # this function implicitly checks
- # - openml.flows._check_multiple_occurence_of_component_in_flow()
- pca = sklearn.decomposition.PCA()
- pca2 = sklearn.decomposition.PCA()
- pipeline = sklearn.pipeline.Pipeline((("pca1", pca), ("pca2", pca2)))
- fixture = "Found a second occurence of component .*.PCA when trying to serialize Pipeline"
- with pytest.raises(ValueError, match=fixture):
- self.extension.model_to_flow(pipeline)
-
- fu = sklearn.pipeline.FeatureUnion((("pca1", pca), ("pca2", pca2)))
- fixture = (
- "Found a second occurence of component .*.PCA when trying to serialize FeatureUnion"
- )
- with pytest.raises(ValueError, match=fixture):
- self.extension.model_to_flow(fu)
-
- fs = sklearn.feature_selection.SelectKBest()
- fu2 = sklearn.pipeline.FeatureUnion((("pca1", pca), ("fs", fs)))
- pipeline2 = sklearn.pipeline.Pipeline((("fu", fu2), ("pca2", pca2)))
- fixture = "Found a second occurence of component .*.PCA when trying to serialize Pipeline"
- with pytest.raises(ValueError, match=fixture):
- self.extension.model_to_flow(pipeline2)
-
- @pytest.mark.sklearn()
- def test_subflow_version_propagated(self):
- this_directory = os.path.dirname(os.path.abspath(__file__))
- tests_directory = os.path.abspath(os.path.join(this_directory, "..", ".."))
- sys.path.append(tests_directory)
- import tests.test_flows.dummy_learn.dummy_forest
-
- pca = sklearn.decomposition.PCA()
- dummy = tests.test_flows.dummy_learn.dummy_forest.DummyRegressor()
- pipeline = sklearn.pipeline.Pipeline((("pca", pca), ("dummy", dummy)))
- flow = self.extension.model_to_flow(pipeline)
-        # On Python 2.7 the unit tests behave differently on travis-ci; the
-        # alternative travis-ci answer is therefore included here as well. While it
-        # has a different value, it is still correct, as it is a propagation of the
-        # subclasses' module name
- assert flow.external_version == "{},{},{}".format(
- self.extension._format_external_version("openml", openml.__version__),
- self.extension._format_external_version("sklearn", sklearn.__version__),
- self.extension._format_external_version("tests", "0.1"),
- )
-
- @pytest.mark.sklearn()
- @mock.patch("warnings.warn")
- def test_check_dependencies(self, warnings_mock):
- dependencies = ["sklearn==0.1", "sklearn>=99.99.99", "sklearn>99.99.99"]
- for dependency in dependencies:
- self.assertRaises(ValueError, self.extension._check_dependencies, dependency)
-
- @pytest.mark.sklearn()
- def test_illegal_parameter_names(self):
- # illegal name: estimators
- clf1 = sklearn.ensemble.VotingClassifier(
- estimators=[
- ("estimators", sklearn.ensemble.RandomForestClassifier()),
- ("whatevs", sklearn.ensemble.ExtraTreesClassifier()),
- ],
- )
- clf2 = sklearn.ensemble.VotingClassifier(
- estimators=[
- ("whatevs", sklearn.ensemble.RandomForestClassifier()),
- ("estimators", sklearn.ensemble.ExtraTreesClassifier()),
- ],
- )
- cases = [clf1, clf2]
-
- for case in cases:
- self.assertRaises(PyOpenMLError, self.extension.model_to_flow, case)
-
- @pytest.mark.sklearn()
- def test_paralizable_check(self):
- # using this model should pass the test (if param distribution is
- # legal)
- singlecore_bagging = sklearn.ensemble.BaggingClassifier()
- # using this model should return false (if param distribution is legal)
- multicore_bagging = sklearn.ensemble.BaggingClassifier(n_jobs=5)
- # using this param distribution should raise an exception
- illegal_param_dist = {"base__n_jobs": [-1, 0, 1]}
- # using this param distribution should not raise an exception
- legal_param_dist = {"n_estimators": [2, 3, 4]}
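-        # optimizing n_jobs as a hyperparameter only changes resource usage, not
-        # model behaviour, which is why _prevent_optimize_n_jobs (exercised below)
-        # rejects it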
-
- estimator_name = (
- "base_estimator" if Version(sklearn.__version__) < Version("1.4") else "estimator"
- )
- legal_models = [
- sklearn.ensemble.RandomForestClassifier(),
- sklearn.ensemble.RandomForestClassifier(n_jobs=5),
- sklearn.ensemble.RandomForestClassifier(n_jobs=-1),
- sklearn.pipeline.Pipeline(
- steps=[("bag", sklearn.ensemble.BaggingClassifier(n_jobs=1))],
- ),
- sklearn.pipeline.Pipeline(
- steps=[("bag", sklearn.ensemble.BaggingClassifier(n_jobs=5))],
- ),
- sklearn.pipeline.Pipeline(
- steps=[("bag", sklearn.ensemble.BaggingClassifier(n_jobs=-1))],
- ),
- sklearn.model_selection.GridSearchCV(singlecore_bagging, legal_param_dist),
- sklearn.model_selection.GridSearchCV(multicore_bagging, legal_param_dist),
- sklearn.ensemble.BaggingClassifier(
- n_jobs=-1,
- **{estimator_name: sklearn.ensemble.RandomForestClassifier(n_jobs=5)},
- ),
- ]
- illegal_models = [
- sklearn.model_selection.GridSearchCV(singlecore_bagging, illegal_param_dist),
- sklearn.model_selection.GridSearchCV(multicore_bagging, illegal_param_dist),
- ]
-
- if Version(sklearn.__version__) < Version("0.20"):
- has_refit_time = [False, False, False, False, False, False, False, False, False]
- else:
- has_refit_time = [False, False, False, False, False, False, True, True, False]
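-        # only the fitted GridSearchCV objects gain a `refit_time_` attribute,
-        # and only from sklearn 0.20 on, when that attribute was introduced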
-
- X, y = sklearn.datasets.load_iris(return_X_y=True)
- for model, refit_time in zip(legal_models, has_refit_time):
- model.fit(X, y)
- assert refit_time == hasattr(model, "refit_time_")
-
- for model in illegal_models:
- with pytest.raises(PyOpenMLError):
- self.extension._prevent_optimize_n_jobs(model)
-
- @pytest.mark.sklearn()
- def test__get_fn_arguments_with_defaults(self):
- sklearn_version = Version(sklearn.__version__)
- if sklearn_version < Version("0.19"):
- fns = [
- (sklearn.ensemble.RandomForestRegressor.__init__, 15),
- (sklearn.tree.DecisionTreeClassifier.__init__, 12),
- (sklearn.pipeline.Pipeline.__init__, 0),
- ]
- elif sklearn_version < Version("0.21"):
- fns = [
- (sklearn.ensemble.RandomForestRegressor.__init__, 16),
- (sklearn.tree.DecisionTreeClassifier.__init__, 13),
- (sklearn.pipeline.Pipeline.__init__, 1),
- ]
- elif sklearn_version < Version("0.22"):
- fns = [
- (sklearn.ensemble.RandomForestRegressor.__init__, 16),
- (sklearn.tree.DecisionTreeClassifier.__init__, 13),
- (sklearn.pipeline.Pipeline.__init__, 2),
- ]
- elif sklearn_version < Version("0.23"):
- fns = [
- (sklearn.ensemble.RandomForestRegressor.__init__, 18),
- (sklearn.tree.DecisionTreeClassifier.__init__, 14),
- (sklearn.pipeline.Pipeline.__init__, 2),
- ]
- elif sklearn_version < Version("0.24"):
- fns = [
- (sklearn.ensemble.RandomForestRegressor.__init__, 18),
- (sklearn.tree.DecisionTreeClassifier.__init__, 14),
- (sklearn.pipeline.Pipeline.__init__, 2),
- ]
- elif sklearn_version < Version("1.0"):
- fns = [
- (sklearn.ensemble.RandomForestRegressor.__init__, 18),
- (sklearn.tree.DecisionTreeClassifier.__init__, 13),
- (sklearn.pipeline.Pipeline.__init__, 2),
- ]
- elif sklearn_version < Version("1.4"):
- fns = [
- (sklearn.ensemble.RandomForestRegressor.__init__, 17),
- (sklearn.tree.DecisionTreeClassifier.__init__, 12),
- (sklearn.pipeline.Pipeline.__init__, 2),
- ]
- else:
- fns = [
- (sklearn.ensemble.RandomForestRegressor.__init__, 18),
- (sklearn.tree.DecisionTreeClassifier.__init__, 13),
- (sklearn.pipeline.Pipeline.__init__, 2),
- ]
-
- for fn, num_params_with_defaults in fns:
- defaults, defaultless = self.extension._get_fn_arguments_with_defaults(fn)
- assert isinstance(defaults, dict)
- assert isinstance(defaultless, set)
- # check whether we have both defaults and defaultless params
- assert len(defaults) == num_params_with_defaults
- assert len(defaultless) > 0
- # check no overlap
- self.assertSetEqual(set(defaults.keys()), set(defaults.keys()) - defaultless)
- self.assertSetEqual(defaultless, defaultless - set(defaults.keys()))
-
- @pytest.mark.sklearn()
- def test_deserialize_with_defaults(self):
-        # uses the 'initialize_with_defaults' flag of the deserialization
-        # method to return a flow that contains default hyperparameter
-        # settings.
- steps = [
- ("Imputer", SimpleImputer()),
- ("OneHotEncoder", sklearn.preprocessing.OneHotEncoder()),
- ("Estimator", sklearn.tree.DecisionTreeClassifier()),
- ]
- pipe_orig = sklearn.pipeline.Pipeline(steps=steps)
-
- pipe_adjusted = sklearn.clone(pipe_orig)
- if Version(sklearn.__version__) < Version("0.23"):
- params = {
- "Imputer__strategy": "median",
- "OneHotEncoder__sparse": False,
- "Estimator__min_samples_leaf": 42,
- }
- elif Version(sklearn.__version__) < Version("1.4"):
- params = {
- "Imputer__strategy": "mean",
- "OneHotEncoder__sparse": True,
- "Estimator__min_samples_leaf": 1,
- }
- else:
- params = {
- "Imputer__strategy": "mean",
- "OneHotEncoder__sparse_output": True,
- "Estimator__min_samples_leaf": 1,
- }
- pipe_adjusted.set_params(**params)
- flow = self.extension.model_to_flow(pipe_adjusted)
- pipe_deserialized = self.extension.flow_to_model(flow, initialize_with_defaults=True)
-
- # we want to compare pipe_deserialized and pipe_orig. We use the flow
- # equals function for this
- assert_flows_equal(
- self.extension.model_to_flow(pipe_orig),
- self.extension.model_to_flow(pipe_deserialized),
- )
-
- @pytest.mark.sklearn()
- def test_deserialize_adaboost_with_defaults(self):
-        # uses the 'initialize_with_defaults' flag of the deserialization
-        # method to return a flow that contains default hyperparameter
-        # settings.
- steps = [
- ("Imputer", SimpleImputer()),
- ("OneHotEncoder", sklearn.preprocessing.OneHotEncoder()),
- (
- "Estimator",
- sklearn.ensemble.AdaBoostClassifier(sklearn.tree.DecisionTreeClassifier()),
- ),
- ]
- pipe_orig = sklearn.pipeline.Pipeline(steps=steps)
-
- pipe_adjusted = sklearn.clone(pipe_orig)
- if Version(sklearn.__version__) < Version("0.22"):
- params = {
- "Imputer__strategy": "median",
- "OneHotEncoder__sparse": False,
- "Estimator__n_estimators": 10,
- }
- elif Version(sklearn.__version__) < Version("1.4"):
- params = {
- "Imputer__strategy": "mean",
- "OneHotEncoder__sparse": True,
- "Estimator__n_estimators": 50,
- }
- else:
- params = {
- "Imputer__strategy": "mean",
- "OneHotEncoder__sparse_output": True,
- "Estimator__n_estimators": 50,
- }
- pipe_adjusted.set_params(**params)
- flow = self.extension.model_to_flow(pipe_adjusted)
- pipe_deserialized = self.extension.flow_to_model(flow, initialize_with_defaults=True)
-
- # we want to compare pipe_deserialized and pipe_orig. We use the flow
- # equals function for this
- assert_flows_equal(
- self.extension.model_to_flow(pipe_orig),
- self.extension.model_to_flow(pipe_deserialized),
- )
-
- @pytest.mark.sklearn()
- def test_deserialize_complex_with_defaults(self):
-        # uses the 'initialize_with_defaults' flag of the deserialization
-        # method to return a flow that contains default hyperparameter
-        # settings.
- steps = [
- ("Imputer", SimpleImputer()),
- ("OneHotEncoder", sklearn.preprocessing.OneHotEncoder()),
- (
- "Estimator",
- sklearn.ensemble.AdaBoostClassifier(
- sklearn.ensemble.BaggingClassifier(
- sklearn.ensemble.GradientBoostingClassifier(),
- ),
- ),
- ),
- ]
- pipe_orig = sklearn.pipeline.Pipeline(steps=steps)
-
- pipe_adjusted = sklearn.clone(pipe_orig)
- impute_strategy = "median" if Version(sklearn.__version__) < Version("0.23") else "mean"
- sparse = Version(sklearn.__version__) >= Version("0.23")
- sparse_parameter = (
- "sparse" if Version(sklearn.__version__) < Version("1.4") else "sparse_output"
- )
- estimator_name = (
- "base_estimator" if Version(sklearn.__version__) < Version("1.2") else "estimator"
- )
- params = {
- "Imputer__strategy": impute_strategy,
- f"OneHotEncoder__{sparse_parameter}": sparse,
- "Estimator__n_estimators": 10,
- f"Estimator__{estimator_name}__n_estimators": 10,
- f"Estimator__{estimator_name}__{estimator_name}__learning_rate": 0.1,
- }
-
- pipe_adjusted.set_params(**params)
- flow = self.extension.model_to_flow(pipe_adjusted)
- pipe_deserialized = self.extension.flow_to_model(flow, initialize_with_defaults=True)
-
- # we want to compare pipe_deserialized and pipe_orig. We use the flow
- # equals function for this
- assert_flows_equal(
- self.extension.model_to_flow(pipe_orig),
- self.extension.model_to_flow(pipe_deserialized),
- )
-
- @pytest.mark.sklearn()
- def test_openml_param_name_to_sklearn(self):
- scaler = sklearn.preprocessing.StandardScaler(with_mean=False)
- estimator_name = (
- "base_estimator" if Version(sklearn.__version__) < Version("1.4") else "estimator"
- )
- boosting = sklearn.ensemble.AdaBoostClassifier(
- **{estimator_name: sklearn.tree.DecisionTreeClassifier()},
- )
- model = sklearn.pipeline.Pipeline(steps=[("scaler", scaler), ("boosting", boosting)])
- flow = self.extension.model_to_flow(model)
- task = openml.tasks.get_task(115) # diabetes; crossvalidation
- run = openml.runs.run_flow_on_task(flow, task)
- run = run.publish()
- TestBase._mark_entity_for_removal("run", run.run_id)
- TestBase.logger.info(f"collected from {__file__.split('/')[-1]}: {run.run_id}")
- run = openml.runs.get_run(run.run_id)
- setup = openml.setups.get_setup(run.setup_id)
-
- # make sure to test enough parameters
- assert len(setup.parameters) > 15
-
- for parameter in setup.parameters.values():
- sklearn_name = self.extension._openml_param_name_to_sklearn(parameter, flow)
-
-            # test the inverse. Currently, OpenML stores the hyperparameter
-            # fullName as flow.name + flow.version + parameter.name on the
-            # server (but this behaviour is not documented and might or might
-            # not change in the future; hence, we won't offer this
-            # transformation functionality in the main package yet).
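-            # As a hypothetical illustration (names not taken from the server): a
-            # parameter "max_depth" on a subflow named
-            # "sklearn.tree._classes.DecisionTreeClassifier" with version "1" would
-            # get the full name
-            # "sklearn.tree._classes.DecisionTreeClassifier(1)_max_depth".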
- splitted = sklearn_name.split("__")
- if len(splitted) > 1: # if len is 1, it is part of root flow
- subflow = flow.get_subflow(splitted[0:-1])
- else:
- subflow = flow
- openml_name = f"{subflow.name}({subflow.version})_{splitted[-1]}"
- assert parameter.full_name == openml_name
-
- @pytest.mark.sklearn()
- def test_obtain_parameter_values_flow_not_from_server(self):
- model = sklearn.linear_model.LogisticRegression(solver="lbfgs")
- flow = self.extension.model_to_flow(model)
- logistic_name = (
- "logistic" if Version(sklearn.__version__) < Version("0.22") else "_logistic"
- )
- msg = f"Flow sklearn.linear_model.{logistic_name}.LogisticRegression has no flow_id!"
-
- with pytest.raises(ValueError, match=msg):
- self.extension.obtain_parameter_values(flow)
-
- estimator_name = (
- "base_estimator" if Version(sklearn.__version__) < Version("1.4") else "estimator"
- )
- model = sklearn.ensemble.AdaBoostClassifier(
- **{
- estimator_name: sklearn.linear_model.LogisticRegression(
- solver="lbfgs",
- ),
- }
- )
- flow = self.extension.model_to_flow(model)
- flow.flow_id = 1
- with pytest.raises(ValueError, match=msg):
- self.extension.obtain_parameter_values(flow)
-
- @pytest.mark.sklearn()
- def test_obtain_parameter_values(self):
- model = sklearn.model_selection.RandomizedSearchCV(
- estimator=sklearn.ensemble.RandomForestClassifier(n_estimators=5),
- param_distributions={
- "max_depth": [3, None],
- "max_features": [1, 2, 3, 4],
- "min_samples_split": [2, 3, 4, 5, 6, 7, 8, 9, 10],
- "min_samples_leaf": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
- "bootstrap": [True, False],
- "criterion": ["gini", "entropy"],
- },
- cv=sklearn.model_selection.StratifiedKFold(n_splits=2, random_state=1, shuffle=True),
- n_iter=5,
- )
- flow = self.extension.model_to_flow(model)
- flow.flow_id = 1
- flow.components["estimator"].flow_id = 2
- parameters = self.extension.obtain_parameter_values(flow)
- for parameter in parameters:
- assert parameter["oml:component"] is not None, parameter
- if parameter["oml:name"] == "n_estimators":
- assert parameter["oml:value"] == "5"
- assert parameter["oml:component"] == 2
-
- @pytest.mark.sklearn()
- def test_numpy_type_allowed_in_flow(self):
- """Simple numpy types should be serializable."""
- dt = sklearn.tree.DecisionTreeClassifier(
- max_depth=np.float64(3.0),
- min_samples_leaf=np.int32(5),
- )
- self.extension.model_to_flow(dt)
-
- @pytest.mark.sklearn()
- def test_numpy_array_not_allowed_in_flow(self):
- """Simple numpy arrays should not be serializable."""
-        binarizer = sklearn.preprocessing.MultiLabelBinarizer(classes=np.asarray([1, 2, 3]))
-        with pytest.raises(TypeError):
-            self.extension.model_to_flow(binarizer)
-
-
-class TestSklearnExtensionRunFunctions(TestBase):
- _multiprocess_can_split_ = True
-
- def setUp(self):
- super().setUp(n_levels=2)
- self.extension = SklearnExtension()
-
- ################################################################################################
- # Test methods for performing runs with this extension module
-
- @pytest.mark.sklearn()
- def test_run_model_on_task(self):
- task = openml.tasks.get_task(1) # anneal; crossvalidation
- # using most_frequent imputer since dataset has mixed types and to keep things simple
- pipe = sklearn.pipeline.Pipeline(
- [
- ("imp", SimpleImputer(strategy="most_frequent")),
- ("dummy", sklearn.dummy.DummyClassifier()),
- ],
- )
- openml.runs.run_model_on_task(pipe, task)
-
- @pytest.mark.sklearn()
- def test_seed_model(self):
-        # randomized models that are initialized without seeds can be seeded
- randomized_clfs = [
- sklearn.ensemble.BaggingClassifier(),
- sklearn.model_selection.RandomizedSearchCV(
- sklearn.ensemble.RandomForestClassifier(),
- {
- "max_depth": [3, None],
- "max_features": [1, 2, 3, 4],
- "bootstrap": [True, False],
- "criterion": ["gini", "entropy"],
- "random_state": [-1, 0, 1, 2],
- },
- cv=sklearn.model_selection.StratifiedKFold(n_splits=2, shuffle=True),
- ),
- sklearn.dummy.DummyClassifier(),
- ]
-
- for idx, clf in enumerate(randomized_clfs):
- const_probe = 42
- all_params = clf.get_params()
- params = [key for key in all_params if key.endswith("random_state")]
- assert len(params) > 0
-
- # before param value is None
- for param in params:
- assert all_params[param] is None
-
- # now seed the params
- clf_seeded = self.extension.seed_model(clf, const_probe)
- new_params = clf_seeded.get_params()
-
- randstate_params = [key for key in new_params if key.endswith("random_state")]
-
- # afterwards, param value is set
- for param in randstate_params:
- assert isinstance(new_params[param], int)
- assert new_params[param] is not None
-
- if idx == 1:
- assert clf.cv.random_state == 56422
-
- @pytest.mark.sklearn()
- def test_seed_model_raises(self):
-        # _set_model_seed_where_none should raise an exception if random_state is
-        # anything other than an int
- randomized_clfs = [
- sklearn.ensemble.BaggingClassifier(random_state=np.random.RandomState(42)),
- sklearn.dummy.DummyClassifier(random_state="OpenMLIsGreat"),
- ]
-
- for clf in randomized_clfs:
- with pytest.raises(ValueError):
- self.extension.seed_model(model=clf, seed=42)
-
- @pytest.mark.sklearn()
- def test_run_model_on_fold_classification_1_array(self):
- task = openml.tasks.get_task(1) # anneal; crossvalidation
-
- X, y = task.get_X_and_y()
- train_indices, test_indices = task.get_train_test_split_indices(repeat=0, fold=0, sample=0)
- X_train = X.iloc[train_indices]
- y_train = y.iloc[train_indices]
- X_test = X.iloc[test_indices]
- y_test = y.iloc[test_indices]
-
- pipeline = sklearn.pipeline.Pipeline(
- steps=[*_get_sklearn_preprocessing(), ("clf", sklearn.tree.DecisionTreeClassifier())],
- )
- # TODO add some mocking here to actually test the innards of this function, too!
- res = self.extension._run_model_on_fold(
- model=pipeline,
- task=task,
- fold_no=0,
- rep_no=0,
- X_train=X_train,
- y_train=y_train,
- X_test=X_test,
- )
-
- y_hat, y_hat_proba, user_defined_measures, trace = res
-
- # predictions
- assert isinstance(y_hat, np.ndarray)
- assert y_hat.shape == y_test.shape
- assert isinstance(y_hat_proba, pd.DataFrame)
- assert y_hat_proba.shape == (y_test.shape[0], 6)
- np.testing.assert_array_almost_equal(np.sum(y_hat_proba, axis=1), np.ones(y_test.shape))
- # The class '4' (at index 3) is not present in the training data. We check that the
- # predicted probabilities for that class are zero!
- np.testing.assert_array_almost_equal(
- y_hat_proba.iloc[:, 3].to_numpy(),
- np.zeros(y_test.shape),
- )
- for i in (0, 1, 2, 4, 5):
- assert np.any(y_hat_proba.iloc[:, i].to_numpy() != np.zeros(y_test.shape))
-
- # check user defined measures
- fold_evaluations: dict[str, dict[int, dict[int, float]]] = collections.defaultdict(
- lambda: collections.defaultdict(dict)
- )
- for measure in user_defined_measures:
- fold_evaluations[measure][0][0] = user_defined_measures[measure]
-
-        # trace: the model performs no internal parameter search, so no trace is produced
- assert trace is None
-
- self._check_fold_timing_evaluations(
- fold_evaluations,
- num_repeats=1,
- num_folds=1,
- task_type=task.task_type_id,
- check_scores=False,
- )
-
- @pytest.mark.sklearn()
- @unittest.skipIf(
- Version(sklearn.__version__) < Version("0.21"),
- reason="SimpleImputer, ColumnTransformer available only after 0.19 and "
- "Pipeline till 0.20 doesn't support indexing and 'passthrough'",
- )
- def test_run_model_on_fold_classification_1_dataframe(self):
- from sklearn.compose import ColumnTransformer
-
- task = openml.tasks.get_task(1) # anneal; crossvalidation
-
-        # same setup as test_run_model_on_fold_classification_1_array(), but with a
-        # DataFrame-based pipeline
- X, y = task.get_X_and_y()
- train_indices, test_indices = task.get_train_test_split_indices(repeat=0, fold=0, sample=0)
- X_train = X.iloc[train_indices]
- y_train = y.iloc[train_indices]
- X_test = X.iloc[test_indices]
- y_test = y.iloc[test_indices]
-
-        # `cat` and `cont` below select the required columns for the ColumnTransformer;
-        # from sklearn 1.4 on, OneHotEncoder takes `sparse_output` instead of `sparse`
- sparse = {
- "sparse" if Version(sklearn.__version__) < Version("1.4") else "sparse_output": False
- }
- cat_imp = make_pipeline(
- SimpleImputer(strategy="most_frequent"),
- OneHotEncoder(handle_unknown="ignore", **sparse),
- )
- cont_imp = make_pipeline(CustomImputer(strategy="mean"), StandardScaler())
- ct = ColumnTransformer([("cat", cat_imp, cat), ("cont", cont_imp, cont)])
- pipeline = sklearn.pipeline.Pipeline(
- steps=[("transform", ct), ("estimator", sklearn.tree.DecisionTreeClassifier())],
- )
- # TODO add some mocking here to actually test the innards of this function, too!
- res = self.extension._run_model_on_fold(
- model=pipeline,
- task=task,
- fold_no=0,
- rep_no=0,
- X_train=X_train,
- y_train=y_train,
- X_test=X_test,
- )
-
- y_hat, y_hat_proba, user_defined_measures, trace = res
-
- # predictions
- assert isinstance(y_hat, np.ndarray)
- assert y_hat.shape == y_test.shape
- assert isinstance(y_hat_proba, pd.DataFrame)
- assert y_hat_proba.shape == (y_test.shape[0], 6)
- np.testing.assert_array_almost_equal(np.sum(y_hat_proba, axis=1), np.ones(y_test.shape))
- # The class '4' (at index 3) is not present in the training data. We check that the
- # predicted probabilities for that class are zero!
- np.testing.assert_array_almost_equal(
- y_hat_proba.iloc[:, 3].to_numpy(),
- np.zeros(y_test.shape),
- )
- for i in (0, 1, 2, 4, 5):
- assert np.any(y_hat_proba.iloc[:, i].to_numpy() != np.zeros(y_test.shape))
-
- # check user defined measures
- fold_evaluations: dict[str, dict[int, dict[int, float]]] = collections.defaultdict(
- lambda: collections.defaultdict(dict)
- )
- for measure in user_defined_measures:
- fold_evaluations[measure][0][0] = user_defined_measures[measure]
-
-        # trace: the model performs no internal parameter search, so no trace is produced
- assert trace is None
-
- self._check_fold_timing_evaluations(
- fold_evaluations,
- num_repeats=1,
- num_folds=1,
- task_type=task.task_type_id,
- check_scores=False,
- )
-
- @pytest.mark.sklearn()
- def test_run_model_on_fold_classification_2(self):
- task = openml.tasks.get_task(7) # kr-vs-kp; crossvalidation
-
- X, y = task.get_X_and_y()
- train_indices, test_indices = task.get_train_test_split_indices(repeat=0, fold=0, sample=0)
- X_train = X.iloc[train_indices]
- y_train = y.iloc[train_indices]
- X_test = X.iloc[test_indices]
- y_test = y.iloc[test_indices]
-
- pipeline = sklearn.model_selection.GridSearchCV(
- sklearn.pipeline.Pipeline(
- steps=[
- *_get_sklearn_preprocessing(),
- ("clf", sklearn.tree.DecisionTreeClassifier()),
- ],
- ),
- {"clf__max_depth": [1, 2]},
- )
- # TODO add some mocking here to actually test the innards of this function, too!
- res = self.extension._run_model_on_fold(
- model=pipeline,
- task=task,
- fold_no=0,
- rep_no=0,
- X_train=X_train,
- y_train=y_train,
- X_test=X_test,
- )
-
- y_hat, y_hat_proba, user_defined_measures, trace = res
-
- # predictions
- assert isinstance(y_hat, np.ndarray)
- assert y_hat.shape == y_test.shape
- assert isinstance(y_hat_proba, pd.DataFrame)
- assert y_hat_proba.shape == (y_test.shape[0], 2)
- np.testing.assert_array_almost_equal(np.sum(y_hat_proba, axis=1), np.ones(y_test.shape))
- for i in (0, 1):
- assert np.any(y_hat_proba.to_numpy()[:, i] != np.zeros(y_test.shape))
-
- # check user defined measures
- fold_evaluations: dict[str, dict[int, dict[int, float]]] = collections.defaultdict(
- lambda: collections.defaultdict(dict)
- )
- for measure in user_defined_measures:
- fold_evaluations[measure][0][0] = user_defined_measures[measure]
-
- # check that it produced and returned a trace object of the correct length
- assert isinstance(trace, OpenMLRunTrace)
- assert len(trace.trace_iterations) == 2
-
- self._check_fold_timing_evaluations(
- fold_evaluations,
- num_repeats=1,
- num_folds=1,
- task_type=task.task_type_id,
- check_scores=False,
- )
-
- @pytest.mark.sklearn()
- def test_run_model_on_fold_classification_3(self):
- class HardNaiveBayes(sklearn.naive_bayes.GaussianNB):
- # class for testing a naive bayes classifier that does not allow soft
- # predictions
- def predict_proba(*args, **kwargs):
- raise AttributeError("predict_proba is not available when probability=False")
-
- # task 1 (test server) is important: it is a task with an unused class
- tasks = [
- 1, # anneal; crossvalidation
- 3, # anneal; crossvalidation
- 115, # diabetes; crossvalidation
- ]
- flow = unittest.mock.Mock()
- flow.name = "dummy"
-
- for task_id in tasks:
- task = openml.tasks.get_task(task_id)
- X, y = task.get_X_and_y()
- train_indices, test_indices = task.get_train_test_split_indices(
- repeat=0,
- fold=0,
- sample=0,
- )
- X_train = X.iloc[train_indices]
- y_train = y.iloc[train_indices]
- X_test = X.iloc[test_indices]
- clf1 = sklearn.pipeline.Pipeline(
- steps=[
- *_get_sklearn_preprocessing(),
- ("estimator", sklearn.naive_bayes.GaussianNB()),
- ],
- )
- clf2 = sklearn.pipeline.Pipeline(
- steps=[*_get_sklearn_preprocessing(), ("estimator", HardNaiveBayes())],
- )
-
- pred_1, proba_1, _, _ = self.extension._run_model_on_fold(
- model=clf1,
- task=task,
- X_train=X_train,
- y_train=y_train,
- X_test=X_test,
- fold_no=0,
- rep_no=0,
- )
- pred_2, proba_2, _, _ = self.extension._run_model_on_fold(
- model=clf2,
- task=task,
- X_train=X_train,
- y_train=y_train,
- X_test=X_test,
- fold_no=0,
- rep_no=0,
- )
-
- # verifies that the predictions are identical
- np.testing.assert_array_equal(pred_1, pred_2)
- np.testing.assert_array_almost_equal(np.sum(proba_1, axis=1), np.ones(X_test.shape[0]))
- # Test that there are predictions other than ones and zeros
- assert np.sum(proba_1.to_numpy() == 0) + np.sum(proba_1.to_numpy() == 1) < X_test.shape[
- 0
- ] * len(task.class_labels)
-
- np.testing.assert_array_almost_equal(np.sum(proba_2, axis=1), np.ones(X_test.shape[0]))
- # Test that there are only ones and zeros predicted
- assert np.sum(proba_2.to_numpy() == 0) + np.sum(
- proba_2.to_numpy() == 1
- ) == X_test.shape[0] * len(task.class_labels)
-
- @pytest.mark.sklearn()
- @pytest.mark.production()
- def test_run_model_on_fold_regression(self):
- # There aren't any regression tasks on the test server
- openml.config.server = self.production_server
- task = openml.tasks.get_task(2999)
-
- X, y = task.get_X_and_y()
- train_indices, test_indices = task.get_train_test_split_indices(repeat=0, fold=0, sample=0)
- X_train = X.iloc[train_indices]
- y_train = y.iloc[train_indices]
- X_test = X.iloc[test_indices]
- y_test = y.iloc[test_indices]
-
- pipeline = sklearn.pipeline.Pipeline(
- steps=[("imp", SimpleImputer()), ("clf", sklearn.tree.DecisionTreeRegressor())],
- )
- # TODO add some mocking here to actually test the innards of this function, too!
- res = self.extension._run_model_on_fold(
- model=pipeline,
- task=task,
- fold_no=0,
- rep_no=0,
- X_train=X_train,
- y_train=y_train,
- X_test=X_test,
- )
-
- y_hat, y_hat_proba, user_defined_measures, trace = res
-
- # predictions
- assert isinstance(y_hat, np.ndarray)
- assert y_hat.shape == y_test.shape
- assert y_hat_proba is None
-
- # check user defined measures
- fold_evaluations: dict[str, dict[int, dict[int, float]]] = collections.defaultdict(
- lambda: collections.defaultdict(dict)
- )
- for measure in user_defined_measures:
- fold_evaluations[measure][0][0] = user_defined_measures[measure]
-
-        # trace: the model performs no internal parameter search, so no trace is produced
- assert trace is None
-
- self._check_fold_timing_evaluations(
- fold_evaluations,
- num_repeats=1,
- num_folds=1,
- task_type=task.task_type_id,
- check_scores=False,
- )
-
- @pytest.mark.sklearn()
- @pytest.mark.production()
- def test_run_model_on_fold_clustering(self):
-        # There aren't any clustering tasks on the test server
- openml.config.server = self.production_server
- task = openml.tasks.get_task(126033)
-
- X = task.get_X()
-
- pipeline = sklearn.pipeline.Pipeline(
- steps=[*_get_sklearn_preprocessing(), ("clf", sklearn.cluster.KMeans())],
- )
- # TODO add some mocking here to actually test the innards of this function, too!
- res = self.extension._run_model_on_fold(
- model=pipeline,
- task=task,
- fold_no=0,
- rep_no=0,
- X_train=X,
- )
-
- y_hat, y_hat_proba, user_defined_measures, trace = res
-
- # predictions
- assert isinstance(y_hat, np.ndarray)
- assert y_hat.shape == (X.shape[0],)
- assert y_hat_proba is None
-
- # check user defined measures
- fold_evaluations: dict[str, dict[int, dict[int, float]]] = collections.defaultdict(
- lambda: collections.defaultdict(dict)
- )
- for measure in user_defined_measures:
- fold_evaluations[measure][0][0] = user_defined_measures[measure]
-
-        # trace: the model performs no internal parameter search, so no trace is produced
- assert trace is None
-
- self._check_fold_timing_evaluations(
- fold_evaluations,
- num_repeats=1,
- num_folds=1,
- task_type=task.task_type_id,
- check_scores=False,
- )
-
- @pytest.mark.sklearn()
- def test__extract_trace_data(self):
- param_grid = {
- "hidden_layer_sizes": [[5, 5], [10, 10], [20, 20]],
- "activation": ["identity", "logistic", "tanh", "relu"],
- "learning_rate_init": [0.1, 0.01, 0.001, 0.0001],
- "max_iter": [10, 20, 40, 80],
- }
- num_iters = 10
- task = openml.tasks.get_task(20) # balance-scale; crossvalidation
- clf = sklearn.model_selection.RandomizedSearchCV(
- sklearn.neural_network.MLPClassifier(),
- param_grid,
- n_iter=num_iters,
- )
- # just run the task on the model (without invoking any fancy extension & openml code)
- train, _ = task.get_train_test_split_indices(0, 0)
- X, y = task.get_X_and_y()
- with warnings.catch_warnings():
- warnings.simplefilter("ignore")
- clf.fit(X.iloc[train], y.iloc[train])
-
- # check num layers of MLP
- assert clf.best_estimator_.hidden_layer_sizes in param_grid["hidden_layer_sizes"]
-
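-        # each of the `num_iters` sampled configurations should yield one trace
-        # iteration; presumably these are extracted from the fitted search object's
-        # cv_results_ (standard sklearn search API)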
- trace_list = self.extension._extract_trace_data(clf, rep_no=0, fold_no=0)
- trace = self.extension._obtain_arff_trace(clf, trace_list)
-
- assert isinstance(trace, OpenMLRunTrace)
- assert isinstance(trace_list, list)
- assert len(trace_list) == num_iters
-
- for trace_iteration in iter(trace):
- assert trace_iteration.repeat == 0
- assert trace_iteration.fold == 0
- assert trace_iteration.iteration >= 0
- assert trace_iteration.iteration <= num_iters
- assert trace_iteration.setup_string is None
- assert isinstance(trace_iteration.evaluation, float)
- assert np.isfinite(trace_iteration.evaluation)
- assert isinstance(trace_iteration.selected, bool)
-
- assert len(trace_iteration.parameters) == len(param_grid)
- for param in param_grid:
- # Prepend with the "parameter_" prefix
- param_in_trace = f"parameter_{param}"
- assert param_in_trace in trace_iteration.parameters
- param_value = json.loads(trace_iteration.parameters[param_in_trace])
- assert param_value in param_grid[param]
-
- @pytest.mark.sklearn()
- def test_trim_flow_name(self):
- import re
-
- long = """sklearn.pipeline.Pipeline(
- columntransformer=sklearn.compose._column_transformer.ColumnTransformer(
- numeric=sklearn.pipeline.Pipeline(
- SimpleImputer=sklearn.preprocessing.imputation.Imputer,
- standardscaler=sklearn.preprocessing.data.StandardScaler),
- nominal=sklearn.pipeline.Pipeline(
- simpleimputer=sklearn.impute.SimpleImputer,
- onehotencoder=sklearn.preprocessing._encoders.OneHotEncoder)),
- variancethreshold=sklearn.feature_selection.variance_threshold.VarianceThreshold,
- svc=sklearn.svm.classes.SVC)"""
- short = "sklearn.Pipeline(ColumnTransformer,VarianceThreshold,SVC)"
- shorter = "sklearn.Pipeline(...,SVC)"
- long_stripped, _ = re.subn(r"\s", "", long)
- assert short == SklearnExtension.trim_flow_name(long_stripped)
- assert shorter == SklearnExtension.trim_flow_name(long_stripped, extra_trim_length=50)
-
- long = """sklearn.pipeline.Pipeline(
- imputation=openmlstudy14.preprocessing.ConditionalImputer,
- hotencoding=sklearn.preprocessing.data.OneHotEncoder,
- variencethreshold=sklearn.feature_selection.variance_threshold.VarianceThreshold,
- classifier=sklearn.ensemble.forest.RandomForestClassifier)"""
- short = "sklearn.Pipeline(ConditionalImputer,OneHotEncoder,VarianceThreshold,RandomForestClassifier)" # noqa: E501
- long_stripped, _ = re.subn(r"\s", "", long)
- assert short == SklearnExtension.trim_flow_name(long_stripped)
-
- long = """sklearn.pipeline.Pipeline(
- SimpleImputer=sklearn.preprocessing.imputation.Imputer,
- VarianceThreshold=sklearn.feature_selection.variance_threshold.VarianceThreshold, # noqa: E501
- Estimator=sklearn.model_selection._search.RandomizedSearchCV(
- estimator=sklearn.tree.tree.DecisionTreeClassifier))"""
- short = (
- "sklearn.Pipeline(Imputer,VarianceThreshold,RandomizedSearchCV(DecisionTreeClassifier))"
- )
- long_stripped, _ = re.subn(r"\s", "", long)
- assert short == SklearnExtension.trim_flow_name(long_stripped)
-
- long = """sklearn.model_selection._search.RandomizedSearchCV(
- estimator=sklearn.pipeline.Pipeline(
- SimpleImputer=sklearn.preprocessing.imputation.Imputer,
- classifier=sklearn.ensemble.forest.RandomForestClassifier))"""
- short = "sklearn.RandomizedSearchCV(Pipeline(Imputer,RandomForestClassifier))"
- long_stripped, _ = re.subn(r"\s", "", long)
- assert short == SklearnExtension.trim_flow_name(long_stripped)
-
- long = """sklearn.pipeline.FeatureUnion(
- pca=sklearn.decomposition.pca.PCA,
- svd=sklearn.decomposition.truncated_svd.TruncatedSVD)"""
- short = "sklearn.FeatureUnion(PCA,TruncatedSVD)"
- long_stripped, _ = re.subn(r"\s", "", long)
- assert short == SklearnExtension.trim_flow_name(long_stripped)
-
- long = "sklearn.ensemble.forest.RandomForestClassifier"
- short = "sklearn.RandomForestClassifier"
- assert short == SklearnExtension.trim_flow_name(long)
-
- assert SklearnExtension.trim_flow_name("weka.IsolationForest") == "weka.IsolationForest"
-
- @pytest.mark.sklearn()
- @unittest.skipIf(
- Version(sklearn.__version__) < Version("0.21"),
- reason="SimpleImputer, ColumnTransformer available only after 0.19 and "
- "Pipeline till 0.20 doesn't support indexing and 'passthrough'",
- )
- def test_run_on_model_with_empty_steps(self):
- from sklearn.compose import ColumnTransformer
-
- # testing 'drop', 'passthrough', None as non-actionable sklearn estimators
- dataset = openml.datasets.get_dataset(128) # iris
- task = openml.tasks.get_task(59) # mfeat-pixel; crossvalidation
-
- X, y, categorical_ind, feature_names = dataset.get_data(
- target=dataset.default_target_attribute,
- )
- categorical_ind = np.array(categorical_ind)
- (cat_idx,) = np.where(categorical_ind)
- (cont_idx,) = np.where(~categorical_ind)
-
- clf = make_pipeline(
- ColumnTransformer(
- [
- (
- "cat",
- make_pipeline(SimpleImputer(strategy="most_frequent"), OneHotEncoder()),
- cat_idx.tolist(),
- ),
- (
- "cont",
- make_pipeline(SimpleImputer(strategy="median"), StandardScaler()),
- cont_idx.tolist(),
- ),
- ],
- ),
- )
-
- clf = sklearn.pipeline.Pipeline(
- [
- ("dummystep", "passthrough"), # adding 'passthrough' as an estimator
- ("prep", clf),
- ("classifier", sklearn.svm.SVC(gamma="auto")),
- ],
- )
-
- # adding 'drop' to a ColumnTransformer
- if not categorical_ind.any():
- clf[1][0].set_params(cat="drop")
- if not (~categorical_ind).any():
- clf[1][0].set_params(cont="drop")
-
- # serializing model with non-actionable step
- run, flow = openml.runs.run_model_on_task(model=clf, task=task, return_flow=True)
-
- assert len(flow.components) == 3
- assert isinstance(flow.components["dummystep"], OpenMLFlow)
- assert flow.components["dummystep"].name == "passthrough"
- assert isinstance(flow.components["classifier"], OpenMLFlow)
- if Version(sklearn.__version__) < Version("0.22"):
- assert flow.components["classifier"].name == "sklearn.svm.classes.SVC"
- else:
- assert flow.components["classifier"].name == "sklearn.svm._classes.SVC"
- assert isinstance(flow.components["prep"], OpenMLFlow)
- assert flow.components["prep"].class_name == "sklearn.pipeline.Pipeline"
- assert isinstance(flow.components["prep"].components["columntransformer"], OpenMLFlow)
- assert isinstance(
- flow.components["prep"].components["columntransformer"].components["cat"], OpenMLFlow
- )
- assert (
- flow.components["prep"].components["columntransformer"].components["cat"].name == "drop"
- )
-
- # de-serializing flow to a model with non-actionable step
- model = self.extension.flow_to_model(flow)
- model.fit(X, y)
- assert type(model) == type(clf)
- assert model != clf
- assert len(model.named_steps) == 3
- assert model.named_steps["dummystep"] == "passthrough"
-
- xml = flow._to_dict()
- new_model = self.extension.flow_to_model(OpenMLFlow._from_dict(xml))
-
- new_model.fit(X, y)
- assert type(new_model) == type(clf)
- assert new_model != clf
- assert len(new_model.named_steps) == 3
- assert new_model.named_steps["dummystep"] == "passthrough"
-
- @pytest.mark.sklearn()
- def test_sklearn_serialization_with_none_step(self):
- msg = (
- "Cannot serialize objects of None type. Please use a valid "
- "placeholder for None. Note that empty sklearn estimators can be "
- "replaced with 'drop' or 'passthrough'."
- )
- clf = sklearn.pipeline.Pipeline(
- [("dummystep", None), ("classifier", sklearn.svm.SVC(gamma="auto"))],
- )
- with pytest.raises(ValueError, match=msg):
- self.extension.model_to_flow(clf)
-
- @pytest.mark.sklearn()
- @unittest.skipIf(
- Version(sklearn.__version__) < Version("0.20"),
- reason="columntransformer introduction in 0.20.0",
- )
- def test_failed_serialization_of_custom_class(self):
- """Check if any custom class inherited from sklearn expectedly fails serialization"""
- try:
- from sklearn.impute import SimpleImputer
- except ImportError:
- # for lower versions
- from sklearn.preprocessing import Imputer as SimpleImputer
-
- import sklearn.tree
- from sklearn.compose import ColumnTransformer
- from sklearn.pipeline import Pipeline, make_pipeline
- from sklearn.preprocessing import OneHotEncoder, StandardScaler
-
- cat_imp = make_pipeline(
- SimpleImputer(strategy="most_frequent"),
- OneHotEncoder(handle_unknown="ignore"),
- )
- cont_imp = make_pipeline(CustomImputer(), StandardScaler())
- ct = ColumnTransformer([("cat", cat_imp, cat), ("cont", cont_imp, cont)])
- clf = Pipeline(
- steps=[("preprocess", ct), ("estimator", sklearn.tree.DecisionTreeClassifier())],
- ) # build a sklearn classifier
-
- task = openml.tasks.get_task(253) # profb; crossvalidation
- try:
- _ = openml.runs.run_model_on_task(clf, task)
- except AttributeError as e:
- if e.args[0] == "module '__main__' has no attribute '__version__'":
- raise AttributeError(e)
- else:
- raise Exception(e)
-
- @pytest.mark.sklearn()
- @unittest.skipIf(
- Version(sklearn.__version__) < Version("0.20"),
- reason="columntransformer introduction in 0.20.0",
- )
- def test_setupid_with_column_transformer(self):
- """Test to check if inclusion of ColumnTransformer in a pipleline is treated as a new
- flow each time.
- """
- import sklearn.compose
- from sklearn.svm import SVC
-
- def column_transformer_pipe(task_id):
- task = openml.tasks.get_task(task_id)
- # make columntransformer
- preprocessor = sklearn.compose.ColumnTransformer(
- transformers=[
- ("num", StandardScaler(), cont),
- ("cat", OneHotEncoder(handle_unknown="ignore"), cat),
- ],
- )
- # make pipeline
- clf = SVC(gamma="scale", random_state=1)
- pipe = make_pipeline(preprocessor, clf)
- # run task
- run = openml.runs.run_model_on_task(pipe, task, avoid_duplicate_runs=False)
- run.publish()
- return openml.runs.get_run(run.run_id)
-
- run1 = column_transformer_pipe(11) # only categorical
- TestBase._mark_entity_for_removal("run", run1.run_id)
- run2 = column_transformer_pipe(23) # only numeric
- TestBase._mark_entity_for_removal("run", run2.run_id)
- assert run1.setup_id == run2.setup_id
From 0425929c55cbb89c2932c0d0b943f477ba908e2f Mon Sep 17 00:00:00 2001
From: PGijsbers
Date: Thu, 19 Jun 2025 12:30:39 +0200
Subject: [PATCH 3/8] Do not load sklearn extension by default
---
openml/__init__.py | 4 ----
1 file changed, 4 deletions(-)
diff --git a/openml/__init__.py b/openml/__init__.py
index 48d301eec..c49505eb9 100644
--- a/openml/__init__.py
+++ b/openml/__init__.py
@@ -121,7 +121,3 @@ def populate_cache(
"_api_calls",
"__version__",
]
-
-# Load the scikit-learn extension by default
-# TODO(eddiebergman): Not sure why this is at the bottom of the file
-import openml.extensions.sklearn # noqa: E402, F401
From 23ada0ea44bec186200b50933674d191f1aba902 Mon Sep 17 00:00:00 2001
From: PGijsbers
Date: Thu, 19 Jun 2025 12:42:00 +0200
Subject: [PATCH 4/8] Disable scikit-learn tests
---
tests/test_flows/test_flow.py | 4 +-
tests/test_flows/test_flow_functions.py | 4 +-
tests/test_runs/test_run.py | 6 +-
tests/test_runs/test_run_functions.py | 6 +-
tests/test_setups/test_setup_functions.py | 4 +-
tests/test_study/test_study_examples.py | 77 -----------------------
6 files changed, 12 insertions(+), 89 deletions(-)
delete mode 100644 tests/test_study/test_study_examples.py
diff --git a/tests/test_flows/test_flow.py b/tests/test_flows/test_flow.py
index 4a5241b62..f725074b5 100644
--- a/tests/test_flows/test_flow.py
+++ b/tests/test_flows/test_flow.py
@@ -26,7 +26,7 @@
import openml
import openml.exceptions
-import openml.extensions.sklearn
+#import openml.extensions.sklearn
import openml.utils
from openml._api_calls import _perform_api_call
from openml.testing import SimpleImputer, TestBase
@@ -37,7 +37,7 @@ class TestFlow(TestBase):
def setUp(self):
super().setUp()
- self.extension = openml.extensions.sklearn.SklearnExtension()
+ self.extension = None #openml.extensions.sklearn.SklearnExtension()
def tearDown(self):
super().tearDown()
diff --git a/tests/test_flows/test_flow_functions.py b/tests/test_flows/test_flow_functions.py
index 40c78c822..b78781a1c 100644
--- a/tests/test_flows/test_flow_functions.py
+++ b/tests/test_flows/test_flow_functions.py
@@ -18,7 +18,7 @@
from sklearn import ensemble
import openml
-import openml.extensions.sklearn
+#import openml.extensions.sklearn
from openml.exceptions import OpenMLNotAuthorizedError, OpenMLServerException
from openml.testing import TestBase, create_request_response
@@ -283,7 +283,7 @@ def test_sklearn_to_flow_list_of_lists(self):
from sklearn.preprocessing import OrdinalEncoder
ordinal_encoder = OrdinalEncoder(categories=[[0, 1], [0, 1]])
- extension = openml.extensions.sklearn.SklearnExtension()
+ extension = None # openml.extensions.sklearn.SklearnExtension()
# Test serialization works
flow = extension.model_to_flow(ordinal_encoder)
diff --git a/tests/test_runs/test_run.py b/tests/test_runs/test_run.py
index e58c72e2d..7e2117d79 100644
--- a/tests/test_runs/test_run.py
+++ b/tests/test_runs/test_run.py
@@ -16,7 +16,7 @@
from sklearn.tree import DecisionTreeClassifier
import openml
-import openml.extensions.sklearn
+#import openml.extensions.sklearn
from openml import OpenMLRun
from openml.testing import SimpleImputer, TestBase
@@ -299,7 +299,7 @@ def test_publish_with_local_loaded_flow(self):
Publish a run tied to a local flow after it has first been saved to
and loaded from disk.
"""
- extension = openml.extensions.sklearn.SklearnExtension()
+ extension = None # openml.extensions.sklearn.SklearnExtension()
for model, task in self._get_models_tasks_for_tests():
# Make sure the flow does not exist on the server yet.
@@ -339,7 +339,7 @@ def test_publish_with_local_loaded_flow(self):
@pytest.mark.sklearn()
def test_offline_and_online_run_identical(self):
- extension = openml.extensions.sklearn.SklearnExtension()
+ extension = None #openml.extensions.sklearn.SklearnExtension()
for model, task in self._get_models_tasks_for_tests():
# Make sure the flow does not exist on the server yet.
diff --git a/tests/test_runs/test_run_functions.py b/tests/test_runs/test_run_functions.py
index 58670b354..ed4847219 100644
--- a/tests/test_runs/test_run_functions.py
+++ b/tests/test_runs/test_run_functions.py
@@ -34,12 +34,12 @@
import openml
import openml._api_calls
import openml.exceptions
-import openml.extensions.sklearn
+#import openml.extensions.sklearn
from openml.exceptions import (
OpenMLNotAuthorizedError,
OpenMLServerException,
)
-from openml.extensions.sklearn import cat, cont
+#from openml.extensions.sklearn import cat, cont
from openml.runs.functions import (
_run_task_get_arffcontent,
delete_run,
@@ -108,7 +108,7 @@ class TestRun(TestBase):
def setUp(self):
super().setUp()
- self.extension = openml.extensions.sklearn.SklearnExtension()
+ self.extension = None#openml.extensions.sklearn.SklearnExtension()
def _wait_for_processed_run(self, run_id, max_waiting_time_seconds):
# it can take a while for a run to be processed on the OpenML (test)
diff --git a/tests/test_setups/test_setup_functions.py b/tests/test_setups/test_setup_functions.py
index 88ac84805..5cc531e63 100644
--- a/tests/test_setups/test_setup_functions.py
+++ b/tests/test_setups/test_setup_functions.py
@@ -13,7 +13,7 @@
import openml
import openml.exceptions
-import openml.extensions.sklearn
+#import openml.extensions.sklearn
from openml.testing import TestBase
@@ -31,7 +31,7 @@ class TestSetupFunctions(TestBase):
_multiprocess_can_split_ = True
def setUp(self):
- self.extension = openml.extensions.sklearn.SklearnExtension()
+ self.extension = None# openml.extensions.sklearn.SklearnExtension()
super().setUp()
@pytest.mark.sklearn()
diff --git a/tests/test_study/test_study_examples.py b/tests/test_study/test_study_examples.py
deleted file mode 100644
index e3b21fc8c..000000000
--- a/tests/test_study/test_study_examples.py
+++ /dev/null
@@ -1,77 +0,0 @@
-# License: BSD 3-Clause
-from __future__ import annotations
-
-import unittest
-from packaging.version import Version
-
-import pytest
-import sklearn
-
-from openml.extensions.sklearn import cat, cont
-from openml.testing import TestBase
-
-
-class TestStudyFunctions(TestBase):
-    """Test the example code of Bischl et al. (2018)."""
-
-    _multiprocess_can_split_ = True
-
- @pytest.mark.sklearn()
- @unittest.skipIf(
- Version(sklearn.__version__) < Version("0.24"),
- reason="columntransformer introduction in 0.24.0",
- )
- def test_Figure1a(self):
- """Test listing in Figure 1a on a single task and the old OpenML100 study.
-
-        The original listing is pasted into the comment below because the actual unit
-        test differs a bit from it; for example, it does not run on all tasks, but
-        only on a single one.
-
- import openml
- import sklearn.tree, sklearn.preprocessing
- benchmark_suite = openml.study.get_study('OpenML-CC18','tasks') # obtain the benchmark suite
- clf = sklearn.pipeline.Pipeline(steps=[('imputer',sklearn.preprocessing.Imputer()), ('estimator',sklearn.tree.DecisionTreeClassifier())]) # build a sklearn classifier
- for task_id in benchmark_suite.tasks: # iterate over all tasks
- task = openml.tasks.get_task(task_id) # download the OpenML task
- X, y = task.get_X_and_y() # get the data (not used in this example)
- openml.config.apikey = 'FILL_IN_OPENML_API_KEY' # set the OpenML Api Key
- run = openml.runs.run_model_on_task(task,clf) # run classifier on splits (requires API key)
- score = run.get_metric_fn(sklearn.metrics.accuracy_score) # print accuracy score
- print('Data set: %s; Accuracy: %0.2f' % (task.get_dataset().name,score.mean()))
- run.publish() # publish the experiment on OpenML (optional)
- print('URL for run: %s/run/%d' %(openml.config.server,run.run_id))
- """ # noqa: E501
- import sklearn.metrics
- import sklearn.tree
- from sklearn.compose import ColumnTransformer
- from sklearn.impute import SimpleImputer
- from sklearn.pipeline import Pipeline, make_pipeline
- from sklearn.preprocessing import OneHotEncoder, StandardScaler
-
- import openml
-
- benchmark_suite = openml.study.get_study("OpenML100", "tasks") # obtain the benchmark suite
- cat_imp = OneHotEncoder(handle_unknown="ignore")
- cont_imp = make_pipeline(SimpleImputer(strategy="median"), StandardScaler())
- ct = ColumnTransformer([("cat", cat_imp, cat), ("cont", cont_imp, cont)])
- clf = Pipeline(
- steps=[("preprocess", ct), ("estimator", sklearn.tree.DecisionTreeClassifier())],
- ) # build a sklearn classifier
- for task_id in benchmark_suite.tasks[:1]: # iterate over all tasks
- task = openml.tasks.get_task(task_id) # download the OpenML task
- X, y = task.get_X_and_y() # get the data (not used in this example)
- openml.config.apikey = openml.config.apikey # set the OpenML Api Key
- run = openml.runs.run_model_on_task(
- clf,
- task,
- avoid_duplicate_runs=False,
- ) # run classifier on splits (requires API key)
- score = run.get_metric_fn(sklearn.metrics.accuracy_score) # print accuracy score
- TestBase.logger.info(
- f"Data set: {task.get_dataset().name}; Accuracy: {score.mean():0.2f}",
- )
- run.publish() # publish the experiment on OpenML (optional)
- TestBase._mark_entity_for_removal("run", run.run_id)
- TestBase.logger.info(
- f"collected from {__file__.split('/')[-1]}: {run.run_id}",
- )
- TestBase.logger.info("URL for run: %s/run/%d" % (openml.config.server, run.run_id))
From 0f2ac1ad720b5354ee41aace29a3e38ede8c725e Mon Sep 17 00:00:00 2001
From: PGijsbers
Date: Thu, 19 Jun 2025 15:20:43 +0200
Subject: [PATCH 5/8] Tests fail successfully
---
openml/extensions/functions.py | 22 ++++++++++++++++++++--
openml/flows/flow.py | 17 +++++++----------
tests/test_extensions/test_functions.py | 2 ++
tests/test_flows/test_flow_functions.py | 4 ++--
tests/test_runs/test_run_functions.py | 2 ++
5 files changed, 33 insertions(+), 14 deletions(-)
diff --git a/openml/extensions/functions.py b/openml/extensions/functions.py
index 302ab246c..7a944c997 100644
--- a/openml/extensions/functions.py
+++ b/openml/extensions/functions.py
@@ -13,6 +13,13 @@
from . import Extension
+SKLEARN_HINT = (
+ "But it looks related to scikit-learn. "
+ "Please install the OpenML scikit-learn extension (openml-sklearn) and try again. "
+ "For more information, see "
+ "https://github.com/openml/openml-sklearn?tab=readme-ov-file#installation"
+)
+
def register_extension(extension: type[Extension]) -> None:
"""Register an extension.
@@ -57,7 +64,13 @@ def get_extension_by_flow(
candidates.append(extension_class())
if len(candidates) == 0:
if raise_if_no_extension:
- raise ValueError(f"No extension registered which can handle flow: {flow}")
+ install_instruction = ""
+ if flow.name.startswith("sklearn"):
+ install_instruction = SKLEARN_HINT
+ raise ValueError(
+ f"No extension registered which can handle flow: {flow.flow_id} ({flow.name}). "
+ f"{install_instruction}"
+ )
return None
@@ -96,7 +109,12 @@ def get_extension_by_model(
candidates.append(extension_class())
if len(candidates) == 0:
if raise_if_no_extension:
- raise ValueError(f"No extension registered which can handle model: {model}")
+ install_instruction = ""
+ if type(model).__module__.startswith("sklearn"):
+ install_instruction = SKLEARN_HINT
+ raise ValueError(
+ f"No extension registered which can handle model: {model}. {install_instruction}"
+ )
return None
diff --git a/openml/flows/flow.py b/openml/flows/flow.py
index a3ff50ca1..02d24e78b 100644
--- a/openml/flows/flow.py
+++ b/openml/flows/flow.py
@@ -4,7 +4,7 @@
import logging
from collections import OrderedDict
from pathlib import Path
-from typing import Any, Hashable, Sequence
+from typing import Any, Hashable, Sequence, cast
import xmltodict
@@ -157,10 +157,7 @@ def __init__( # noqa: PLR0913
self.language = language
self.dependencies = dependencies
self.flow_id = flow_id
- if extension is None:
- self._extension = get_extension_by_flow(self)
- else:
- self._extension = extension
+ self._extension = extension
@property
def id(self) -> int | None:
@@ -170,12 +167,12 @@ def id(self) -> int | None:
@property
def extension(self) -> Extension:
"""The extension of the flow (e.g., sklearn)."""
- if self._extension is not None:
- return self._extension
+ if self._extension is None:
+ self._extension = cast(
+ Extension, get_extension_by_flow(self, raise_if_no_extension=True)
+ )
- raise RuntimeError(
- f"No extension could be found for flow {self.flow_id}: {self.name}",
- )
+ return self._extension
def _get_repr_body_fields(self) -> Sequence[tuple[str, str | int | list[str]]]:
"""Collect all information to display in the __repr__ body."""
diff --git a/tests/test_extensions/test_functions.py b/tests/test_extensions/test_functions.py
index bc7937c88..ac4610a15 100644
--- a/tests/test_extensions/test_functions.py
+++ b/tests/test_extensions/test_functions.py
@@ -11,6 +11,8 @@
class DummyFlow:
external_version = "DummyFlow==0.1"
+ name = "Dummy Flow"
+ flow_id = 1
dependencies = None
diff --git a/tests/test_flows/test_flow_functions.py b/tests/test_flows/test_flow_functions.py
index b78781a1c..b44de08bf 100644
--- a/tests/test_flows/test_flow_functions.py
+++ b/tests/test_flows/test_flow_functions.py
@@ -321,8 +321,8 @@ def test_get_flow_reinstantiate_model(self):
def test_get_flow_reinstantiate_model_no_extension(self):
# Flow 10 is a WEKA flow
self.assertRaisesRegex(
- RuntimeError,
- "No extension could be found for flow 10: weka.SMO",
+ ValueError,
+ ".* flow: 10 \(weka.SMO\). ",
openml.flows.get_flow,
flow_id=10,
reinstantiate=True,
diff --git a/tests/test_runs/test_run_functions.py b/tests/test_runs/test_run_functions.py
index ed4847219..ed90bcad5 100644
--- a/tests/test_runs/test_run_functions.py
+++ b/tests/test_runs/test_run_functions.py
@@ -1900,6 +1900,7 @@ def test_joblib_backends(self, parallel_mock):
Version(sklearn.__version__) < Version("0.20"),
reason="SimpleImputer doesn't handle mixed type DataFrame as input",
)
+ @pytest.mark.sklearn()
def test_delete_run(self):
rs = np.random.randint(1, 2**31 - 1)
clf = sklearn.pipeline.Pipeline(
@@ -1928,6 +1929,7 @@ def test_delete_run(self):
Version(sklearn.__version__) < Version("0.20"),
reason="SimpleImputer doesn't handle mixed type DataFrame as input",
)
+ @pytest.mark.sklearn()
def test_initialize_model_from_run_nonstrict(self):
# We cannot guarantee that a run with an older version exists on the server.
# Thus, we test it simply with a run that we know exists that might not be loose.
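Note: taken together, the flow.py and functions.py changes in this patch defer extension lookup to the first access of OpenMLFlow.extension and turn the old RuntimeError into a ValueError carrying an install hint for sklearn-looking flows. A minimal sketch of the new failure mode, using the WEKA flow from the test above (no extension handles it, and the sklearn hint stays empty because its name does not start with "sklearn"):

    import openml

    flow = openml.flows.get_flow(10)  # weka.SMO; construction no longer resolves an extension
    try:
        flow.extension  # lazy lookup happens here now
    except ValueError as err:
        print(err)  # "No extension registered which can handle flow: 10 (weka.SMO). "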
From 943afcc714a8682f0846c35449e22e90cd7c674b Mon Sep 17 00:00:00 2001
From: PGijsbers
Date: Thu, 19 Jun 2025 15:35:49 +0200
Subject: [PATCH 6/8] Add openml-sklearn as dependency of sklearn tests
---
pyproject.toml | 1 +
tests/conftest.py | 1 +
2 files changed, 2 insertions(+)
diff --git a/pyproject.toml b/pyproject.toml
index fa9a70dc1..e32f113e5 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -80,6 +80,7 @@ test=[
"mypy",
"ruff",
"requests-mock",
+ "openml-sklearn",
]
examples=[
"matplotlib",
diff --git a/tests/conftest.py b/tests/conftest.py
index 778b0498b..28ff9df46 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -33,6 +33,7 @@
import shutil
from pathlib import Path
import pytest
+import openml_sklearn
import openml
from openml.testing import TestBase
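Note: the bare import of openml_sklearn in conftest.py is presumably there for its side effect, registering the extension with openml so flows and models resolve during the test run. A one-line sketch of the assumed mechanism, equivalent to calling the register_extension hook from openml/extensions/functions.py (patch 5) explicitly:

    import openml_sklearn  # assumed to call register_extension(SklearnExtension) on import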
From 89f8fceb433b841032fb6a74567b5b6740fae275 Mon Sep 17 00:00:00 2001
From: PGijsbers
Date: Thu, 19 Jun 2025 16:00:40 +0200
Subject: [PATCH 7/8] Make use of openml_sklearn extension
---
tests/test_flows/test_flow.py | 6 ++++--
tests/test_flows/test_flow_functions.py | 4 ++--
tests/test_runs/test_run.py | 6 +++---
tests/test_runs/test_run_functions.py | 9 +++++----
tests/test_setups/test_setup_functions.py | 4 ++--
5 files changed, 16 insertions(+), 13 deletions(-)
diff --git a/tests/test_flows/test_flow.py b/tests/test_flows/test_flow.py
index f725074b5..e6407a51c 100644
--- a/tests/test_flows/test_flow.py
+++ b/tests/test_flows/test_flow.py
@@ -24,20 +24,22 @@
import sklearn.tree
import xmltodict
+from openml_sklearn import SklearnExtension
+
import openml
import openml.exceptions
-#import openml.extensions.sklearn
import openml.utils
from openml._api_calls import _perform_api_call
from openml.testing import SimpleImputer, TestBase
+
class TestFlow(TestBase):
_multiprocess_can_split_ = True
def setUp(self):
super().setUp()
- self.extension = None # openml.extensions.sklearn.SklearnExtension()
+ self.extension = SklearnExtension()
def tearDown(self):
super().tearDown()
diff --git a/tests/test_flows/test_flow_functions.py b/tests/test_flows/test_flow_functions.py
index b44de08bf..4a9b03fd7 100644
--- a/tests/test_flows/test_flow_functions.py
+++ b/tests/test_flows/test_flow_functions.py
@@ -7,6 +7,7 @@
from collections import OrderedDict
from multiprocessing.managers import Value
+from openml_sklearn import SklearnExtension
from packaging.version import Version
from unittest import mock
from unittest.mock import patch
@@ -18,7 +19,6 @@
from sklearn import ensemble
import openml
-#import openml.extensions.sklearn
from openml.exceptions import OpenMLNotAuthorizedError, OpenMLServerException
from openml.testing import TestBase, create_request_response
@@ -283,7 +283,7 @@ def test_sklearn_to_flow_list_of_lists(self):
from sklearn.preprocessing import OrdinalEncoder
ordinal_encoder = OrdinalEncoder(categories=[[0, 1], [0, 1]])
- extension = None # openml.extensions.sklearn.SklearnExtension()
+ extension = SklearnExtension()
# Test serialization works
flow = extension.model_to_flow(ordinal_encoder)
diff --git a/tests/test_runs/test_run.py b/tests/test_runs/test_run.py
index 7e2117d79..88fa1672b 100644
--- a/tests/test_runs/test_run.py
+++ b/tests/test_runs/test_run.py
@@ -8,6 +8,7 @@
import numpy as np
import pytest
import xmltodict
+from openml_sklearn import SklearnExtension
from sklearn.base import clone
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LinearRegression
@@ -16,7 +17,6 @@
from sklearn.tree import DecisionTreeClassifier
import openml
-#import openml.extensions.sklearn
from openml import OpenMLRun
from openml.testing import SimpleImputer, TestBase
@@ -299,7 +299,7 @@ def test_publish_with_local_loaded_flow(self):
Publish a run tied to a local flow after it has first been saved to
and loaded from disk.
"""
- extension = None # openml.extensions.sklearn.SklearnExtension()
+ extension = SklearnExtension()
for model, task in self._get_models_tasks_for_tests():
# Make sure the flow does not exist on the server yet.
@@ -339,7 +339,7 @@ def test_publish_with_local_loaded_flow(self):
@pytest.mark.sklearn()
def test_offline_and_online_run_identical(self):
- extension = None # openml.extensions.sklearn.SklearnExtension()
+ extension = SklearnExtension()
for model, task in self._get_models_tasks_for_tests():
# Make sure the flow does not exist on the server yet.
diff --git a/tests/test_runs/test_run_functions.py b/tests/test_runs/test_run_functions.py
index ed90bcad5..725421d4f 100644
--- a/tests/test_runs/test_run_functions.py
+++ b/tests/test_runs/test_run_functions.py
@@ -7,6 +7,8 @@
import time
import unittest
import warnings
+
+from openml_sklearn import SklearnExtension, cat, cont
from packaging.version import Version
from unittest import mock
@@ -34,7 +36,6 @@
import openml
import openml._api_calls
import openml.exceptions
-#import openml.extensions.sklearn
from openml.exceptions import (
OpenMLNotAuthorizedError,
OpenMLServerException,
@@ -108,7 +109,7 @@ class TestRun(TestBase):
def setUp(self):
super().setUp()
- self.extension = None # openml.extensions.sklearn.SklearnExtension()
+ self.extension = SklearnExtension()
def _wait_for_processed_run(self, run_id, max_waiting_time_seconds):
# it can take a while for a run to be processed on the OpenML (test)
@@ -1750,7 +1751,7 @@ def test_format_prediction_task_regression(self):
Version(sklearn.__version__) < Version("0.21"),
reason="couldn't perform local tests successfully w/o bloating RAM",
)
- @mock.patch("openml.extensions.sklearn.SklearnExtension._prevent_optimize_n_jobs")
+ @mock.patch("openml_sklearn.SklearnExtension._prevent_optimize_n_jobs")
def test__run_task_get_arffcontent_2(self, parallel_mock):
"""Tests if a run executed in parallel is collated correctly."""
task = openml.tasks.get_task(7) # Supervised Classification on kr-vs-kp
@@ -1824,7 +1825,7 @@ def test__run_task_get_arffcontent_2(self, parallel_mock):
Version(sklearn.__version__) < Version("0.21"),
reason="couldn't perform local tests successfully w/o bloating RAM",
)
- @mock.patch("openml.extensions.sklearn.SklearnExtension._prevent_optimize_n_jobs")
+ @mock.patch("openml_sklearn.SklearnExtension._prevent_optimize_n_jobs")
def test_joblib_backends(self, parallel_mock):
"""Tests evaluation of a run using various joblib backends and n_jobs."""
task = openml.tasks.get_task(7) # Supervised Classification on kr-vs-kp
diff --git a/tests/test_setups/test_setup_functions.py b/tests/test_setups/test_setup_functions.py
index 5cc531e63..b805ca9d3 100644
--- a/tests/test_setups/test_setup_functions.py
+++ b/tests/test_setups/test_setup_functions.py
@@ -10,10 +10,10 @@
import sklearn.base
import sklearn.naive_bayes
import sklearn.tree
+from openml_sklearn import SklearnExtension
import openml
import openml.exceptions
-#import openml.extensions.sklearn
from openml.testing import TestBase
@@ -31,7 +31,7 @@ class TestSetupFunctions(TestBase):
_multiprocess_can_split_ = True
def setUp(self):
- self.extension = None # openml.extensions.sklearn.SklearnExtension()
+ self.extension = SklearnExtension()
super().setUp()
@pytest.mark.sklearn()
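Note: with the tests now on the external package, downstream code follows the same pattern: instantiate SklearnExtension from openml_sklearn wherever the bundled extension used to be. A minimal sketch of the serialization round-trip the updated tests rely on (model_to_flow as used in test_sklearn_to_flow_list_of_lists above; flow_to_model is assumed to be its inverse on the extension interface):

    from openml_sklearn import SklearnExtension
    from sklearn.tree import DecisionTreeClassifier

    extension = SklearnExtension()
    flow = extension.model_to_flow(DecisionTreeClassifier(max_depth=3))  # serialize to a flow
    model = extension.flow_to_model(flow)  # assumed inverse; rebuilds an equivalent estimator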
From 8ab1992d5e887354a7092134c198767ad381a77a Mon Sep 17 00:00:00 2001
From: PGijsbers
Date: Thu, 19 Jun 2025 16:06:52 +0200
Subject: [PATCH 8/8] packaging is only used in test submodules
---
pyproject.toml | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/pyproject.toml b/pyproject.toml
index e32f113e5..91ec2327c 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -20,7 +20,6 @@ dependencies = [
"minio",
"pyarrow",
"tqdm", # For MinIO download progress bars
- "packaging",
]
requires-python = ">=3.8"
maintainers = [
@@ -81,6 +80,7 @@ test=[
"ruff",
"requests-mock",
"openml-sklearn",
+ "packaging",
]
examples=[
"matplotlib",