add scikit-learn compliance to the APLR classes

paulbkoch · paulbkoch · commit 798494b73ee8 · 2026-03-22T19:27:34.000-07:00
diff --git a/python/interpret-core/interpret/glassbox/_aplr.py b/python/interpret-core/interpret/glassbox/_aplr.py
@@ -17,7 +17,12 @@ class _SeriesType:
         pass
 
 
-from ..utils._scikit import SKClassifierMixin, SKRegressorMixin
+from ..utils._scikit import (
+    SKBaseEstimator,
+    SKClassifierMixin,
+    SKNotFittedError,
+    SKRegressorMixin,
+)
 from ..api.base import LocalExplainer, GlobalExplainer
 from ..api.templates import FeatureValueExplanation
 from ..utils._clean_simple import clean_dimensions
@@ -46,7 +51,11 @@ def __init__(self, *args, **kwargs):
 
 
 class APLRRegressor(
-    SKRegressorMixin, LocalExplainer, GlobalExplainer, APLRRegressorNative
+    SKRegressorMixin,
+    LocalExplainer,
+    GlobalExplainer,
+    SKBaseEstimator,
+    APLRRegressorNative,
 ):
     """APLR Regressor."""
 
@@ -60,13 +69,35 @@ def __init__(self, **kwargs):
         # TODO: add feature_names and feature_types to conform to glassbox API
         super().__init__(**kwargs)
 
+    def get_params(self, deep=True):
+        return APLRRegressorNative.get_params(self)  # `deep` is accepted for the scikit-learn signature but is not forwarded
+
+    def set_params(self, **params):
+        APLRRegressorNative.set_params(self, **params)  # delegate explicitly to the native implementation
+        return self  # the native call's return value is discarded; the estimator itself is returned
+
+    def __sklearn_tags__(self):
+        tags = super().__sklearn_tags__()  # start from the tags supplied by the parent classes
+        tags.non_deterministic = True  # scikit-learn tag declaring that repeated fits may differ
+        tags.target_tags.required = True  # scikit-learn tag declaring that y must be passed to fit
+        return tags
+
+    def predict(self, X):
+        """Predict target values for X; raises SKNotFittedError if fit() has not set n_features_in_."""
+        if not hasattr(self, "n_features_in_"):
+            raise SKNotFittedError(
+                "This model has not been fitted yet. Call 'fit' first."
+            )
+        return super().predict(X)
+
     def fit(self, X, y, **kwargs):
         """Fits model."""
         X_names = kwargs.get("X_names")
 
-        self.bin_counts, self.bin_edges = calculate_densities(X)
+        self.bin_counts_, self.bin_edges_ = calculate_densities(X)
         self.unique_values_in_ = calculate_unique_values(X)
         self.feature_names_in_ = define_feature_names(X, X_names=X_names)
+        self.n_features_in_ = len(self.feature_names_in_)
 
         super().fit(
             X,
@@ -107,8 +138,8 @@ def explain_global(self, name: Optional[str] = None):
             is_two_way_interaction: bool = len(predictor_indexes_used) == 2
             if is_main_effect:
                 density_dict = {
-                    "names": self.bin_edges[predictor_indexes_used[0]],
-                    "scores": self.bin_counts[predictor_indexes_used[0]],
+                    "names": self.bin_edges_[predictor_indexes_used[0]],
+                    "scores": self.bin_counts_[predictor_indexes_used[0]],
                 }
                 feature_dict = {
                     "type": "univariate",
@@ -282,7 +313,23 @@ def calculate_densities(X: FloatMatrix) -> Tuple[List[List[int]], List[List[floa
 
 
 def convert_to_numpy_matrix(X: FloatMatrix) -> np.ndarray:
+    try:
+        from scipy import sparse as _sparse
+
+        if _sparse.issparse(X):
+            raise TypeError(
+                "Sparse input is not supported. Please convert X to a dense array."
+            )
+    except ImportError:
+        pass
+
     if isinstance(X, np.ndarray):
+        if X.dtype == object:
+            try:
+                return X.astype(np.float64)
+            except (ValueError, TypeError):
+                msg = "argument must be a float64 convertible type"
+                raise TypeError(msg)
         if not np.issubdtype(X.dtype, np.number):
             msg = f"If X is a numpy array, it must contain only numeric values, but got dtype '{X.dtype}'."
             raise TypeError(msg)
@@ -341,7 +388,11 @@ def __init__(self, *args, **kwargs):
 
 
 class APLRClassifier(
-    SKClassifierMixin, LocalExplainer, GlobalExplainer, APLRClassifierNative
+    SKClassifierMixin,
+    LocalExplainer,
+    GlobalExplainer,
+    SKBaseEstimator,
+    APLRClassifierNative,
 ):
     """APLR Classifier."""
 
@@ -355,25 +406,63 @@ def __init__(self, **kwargs):
         # TODO: add feature_names and feature_types to conform to glassbox API
         super().__init__(**kwargs)
 
+    def get_params(self, deep=True):
+        return APLRClassifierNative.get_params(self)  # `deep` is accepted for the scikit-learn signature but is not forwarded
+
+    def set_params(self, **params):
+        APLRClassifierNative.set_params(self, **params)  # delegate explicitly to the native implementation
+        return self  # the native call's return value is discarded; the estimator itself is returned
+
+    def __sklearn_tags__(self):
+        tags = super().__sklearn_tags__()  # start from the tags supplied by the parent classes
+        tags.non_deterministic = True  # scikit-learn tag declaring that repeated fits may differ
+        tags.target_tags.required = True  # scikit-learn tag declaring that y must be passed to fit
+        return tags
+
+    def predict(self, X):
+        """Predict class labels for X, mapped back through _str_to_label_; raises SKNotFittedError before fit."""
+        if not hasattr(self, "n_features_in_"):
+            raise SKNotFittedError(
+                "This model has not been fitted yet. Call 'fit' first."
+            )
+        str_preds = super().predict(X)  # labels as the native model knows them (fit passes str(y) to it)
+        return np.array(
+            [self._str_to_label_[s] for s in str_preds], dtype=self.classes_.dtype
+        )
+
+    def predict_proba(self, X):
+        """Return predict_class_probabilities(X); raises SKNotFittedError if fit() has not set n_features_in_."""
+        if not hasattr(self, "n_features_in_"):
+            raise SKNotFittedError(
+                "This model has not been fitted yet. Call 'fit' first."
+            )
+        return self.predict_class_probabilities(X)
+
     def fit(self, X, y, **kwargs):
         """Fits model."""
         X_names = kwargs.get("X_names")
 
-        self.bin_counts, self.bin_edges = calculate_densities(X)
+        self.bin_counts_, self.bin_edges_ = calculate_densities(X)
         self.unique_values_in_ = calculate_unique_values(X)
         self.feature_names_in_ = define_feature_names(X, X_names=X_names)
+        self.n_features_in_ = len(self.feature_names_in_)
 
-        if not all(isinstance(val, str) for val in y):
-            y = [str(val) for val in y]
-        if isinstance(y, _SeriesType):
-            y = y.to_numpy()
+        y_arr = np.asarray(y)
+        y_str = [str(val) for val in y_arr]
 
         super().fit(
             X,
-            y,
+            y_str,
             **kwargs,
         )
-        self.classes_ = self.classes_
+
+        categories = self.get_categories()
+        unique_orig = {}
+        for val, s in zip(y_arr, y_str):
+            if s not in unique_orig:
+                unique_orig[s] = val
+        self.classes_ = np.array([unique_orig[c] for c in categories])
+        self._str_to_label_ = {c: unique_orig[c] for c in categories}
         return self
 
     def explain_global(self, name: Optional[str] = None):
@@ -413,8 +502,8 @@ def explain_global(self, name: Optional[str] = None):
                 is_two_way_interaction: bool = len(predictor_indexes_used) == 2
                 if is_main_effect:
                     density_dict = {
-                        "names": self.bin_edges[predictor_indexes_used[0]],
-                        "scores": self.bin_counts[predictor_indexes_used[0]],
+                        "names": self.bin_edges_[predictor_indexes_used[0]],
+                        "scores": self.bin_counts_[predictor_indexes_used[0]],
                     }
                     feature_dict = {
                         "type": "univariate",
@@ -518,7 +607,7 @@ def explain_local(
             for each instance as horizontal bar charts.
         """
 
-        pred = self.predict(X)
+        pred = APLRClassifierNative.predict(self, X)
         pred_proba = self.predict_class_probabilities(X)
         pred_max_prob = np.max(pred_proba, axis=1)
         term_names = self.get_unique_term_affiliations()
diff --git a/python/interpret-core/tests/glassbox/test_aplr.py b/python/interpret-core/tests/glassbox/test_aplr.py
@@ -2,11 +2,13 @@
 # Distributed under the MIT software license
 
 import numpy as np
+import pytest
+import warnings
 from aplr import APLRClassifier as APLRClassifierNative
 from aplr import APLRRegressor as APLRRegressorNative
 from interpret.glassbox import APLRClassifier, APLRRegressor
 from sklearn.datasets import load_breast_cancer, load_diabetes
-import warnings
+from sklearn.utils import estimator_checks
 
 
 def test_regression():
@@ -85,7 +87,7 @@ def test_classification():
 
     native_pred = native.predict(X)
     our_pred = our_aplr.predict(X)
-    assert native_pred == our_pred
+    assert [str(v) for v in our_pred] == list(native_pred)
 
     # With response
     local_expl = our_aplr.explain_local(X[:5], y[:5])
@@ -106,3 +108,55 @@ def test_classification():
         global_expl = our_aplr.explain_global()
         global_viz = global_expl.visualize()
         assert global_viz is not None
+
+
+@pytest.fixture
+def skip_sklearn() -> set:
+    """Names of scikit-learn estimator checks that test_sklearn_estimator skips."""
+    # TODO: whittle these down to the minimum
+    return {
+        "check_do_not_raise_errors_in_init_or_set_params",  # native APLR validates params eagerly in __init__/set_params
+        "check_no_attributes_set_in_init",  # native APLR sets attributes in __init__
+        "check_fit1d",  # interpret accepts 1d X for single feature
+        "check_fit2d_predict1d",  # interpret accepts 1d for predict
+        "check_supervised_y_2d",  # interpret deliberately supports y.shape = (nsamples, 1)
+        "check_classifiers_regression_target",  # interpret is more permissive with y values
+        "check_n_features_in_after_fitting",  # interpret uses a different error message format
+        "check_complex_data",  # interpret uses a different error message for complex data
+        "check_estimators_nan_inf",  # interpret treats NaN as missing data, not as NaN/inf validation error
+        "check_requires_y_none",  # interpret uses a different error message for y=None
+        # native APLR raises RuntimeError instead of ValueError for invalid inputs
+        "check_regressors_train",  # native APLR raises RuntimeError for mismatched X/y lengths
+        "check_regressor_data_not_an_array",  # native APLR raises RuntimeError for mismatched X/y lengths
+        "check_classifier_data_not_an_array",  # native APLR raises RuntimeError for mismatched X/y lengths
+        "check_classifiers_train",  # native APLR raises RuntimeError for mismatched X/y lengths
+        "check_classifiers_classes",  # native APLR raises RuntimeError for mismatched X/y lengths
+        "check_regressors_no_decision_function",  # native APLR raises RuntimeError for mismatched X/y lengths
+        "check_supervised_y_no_nan",  # native APLR raises RuntimeError instead of ValueError for NaN y
+        "check_estimators_empty_data_messages",  # native APLR raises RuntimeError for empty data
+        "check_fit2d_1sample",  # native APLR requires more than 1 sample for CV folds
+        # native APLR classifier-specific limitations
+        "check_classifiers_one_label",  # native APLR requires at least 2 categories
+        "check_classifiers_one_label_sample_weights",  # native APLR requires at least 2 categories
+        "check_fit_idempotent",  # native APLR classifier fitting twice produces different results
+        "check_sample_weight_equivalence_on_dense_data",  # algorithmic difference
+        "check_sample_weight_equivalence_on_sparse_data",  # algorithmic difference
+    }
+
+
+@estimator_checks.parametrize_with_checks(
+    [
+        APLRRegressor(cv_folds=2),
+        APLRClassifier(cv_folds=2),
+    ]
+)
+def test_sklearn_estimator(estimator, check, skip_sklearn):
+    if check.func.__name__ in skip_sklearn:  # checks listed by the skip_sklearn fixture are not run
+        pytest.skip("Deliberate deviation from scikit-learn.")
+    with warnings.catch_warnings():  # the filter below is undone when the block exits
+        warnings.filterwarnings(
+            "ignore",
+            "Casting complex values to real discards the imaginary part",
+            category=getattr(np, "exceptions", np).ComplexWarning,  # np.exceptions exists only in NumPy >= 1.25; older releases expose np.ComplexWarning
+        )
+        check(estimator)