Commit 55099b3

Author: SamoraHunter
Commit message: h2o crash fixes
Parent: 18831a1

7 files changed: 180 additions & 69 deletions

ml_grid/model_classes/H2OGAMClassifier.py

Lines changed: 12 additions & 0 deletions
@@ -107,6 +107,18 @@ def _prepare_fit(
                 )
                 continue
 
+            # --- FIX: Check for unique quantiles to prevent H2O knot generation failure ---
+            try:
+                # H2O uses quantiles for knots. If quantiles are not unique, it fails.
+                # We check if we can generate 'required_knots' unique bins.
+                pd.qcut(X[col], q=required_knots, duplicates="raise")
+            except ValueError:
+                if not self._suppress_low_cardinality_error:
+                    raise ValueError(
+                        f"Skipping GAM col '{col}': Cannot generate {required_knots} unique quantiles (distribution too skewed)."
+                    )
+                continue
+
             suitable_gam_cols.append(col)
             suitable_knots.append(required_knots)
             if i < len(bs_list):
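
Note on the check above: pd.qcut computes quantile bin edges, and on a heavily skewed column several edges collapse onto the same value; duplicates="raise" surfaces that as the ValueError the new code catches. A standalone sketch of the failure mode (illustration only, not part of the commit):

import numpy as np
import pandas as pd

# 11 unique values, but 90% of the mass sits on 0.
skewed = pd.Series([0] * 90 + list(range(1, 11)))

try:
    # The 0th-80th percentile edges all collapse onto 0, so the bin edges are not unique.
    pd.qcut(skewed, q=5, duplicates="raise")
except ValueError as e:
    print(f"Rejected for GAM knots: {e}")  # "Bin edges must be unique: ..."

# An evenly spread column of the same length passes the same check.
pd.qcut(pd.Series(np.arange(100)), q=5, duplicates="raise")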

ml_grid/model_classes/H2OGLMClassifier.py

Lines changed: 17 additions & 23 deletions
@@ -34,32 +34,26 @@ def __init__(self, **kwargs):
         # Pass the specific estimator class
         super().__init__(estimator_class=H2OGeneralizedLinearEstimator, **kwargs)
 
+    def _prepare_fit(self, X, y):
+        """
+        Intercepts the parameter preparation to ENFORCE stability settings.
+        This runs immediately BEFORE the H2O model is initialized/trained.
+        """
+        # Get the standard parameters from the base class
+        train_h2o, x_vars, outcome_var, model_params = super()._prepare_fit(X, y)
+
+        # --- STRICT OVERRIDE ---
+        # Force L_BFGS: The only solver robust against the index mismatch bug on this data
+        model_params["solver"] = "L_BFGS"
+        model_params["remove_collinear_columns"] = False
+        model_params["lambda_search"] = False
+
+        return train_h2o, x_vars, outcome_var, model_params
+
     def fit(self, X: pd.DataFrame, y: pd.Series, **kwargs) -> "H2OGLMClassifier":
         """Fits the H2O GLM model."""
-
-        # --- DOUBLE-LOCK: Enforce stable parameters at fit time ---
-        # GridSearch calls set_params() which might overwrite our safe defaults.
-        # We explicitly revert them here before training.
-
-        kwargs["solver"] = "L_BFGS"
-        kwargs["remove_collinear_columns"] = False
-        kwargs["lambda_search"] = False
-
-        # Update internal H2O parameter dictionary if it exists
-        if hasattr(self, "_parms"):
-            self._parms["solver"] = "L_BFGS"
-            self._parms["remove_collinear_columns"] = False
-            self._parms["lambda_search"] = False
-
-        # Proceed with standard fit
+        # The override logic is now handled in _prepare_fit, called by super().fit()
        super().fit(X, y, **kwargs)
-
-        # 3. TRIPLE-LOCK: Ensure the internal model object respects this
-        if hasattr(self, "model_") and self.model_ is not None:
-            self.model_._parms["solver"] = "L_BFGS"
-            self.model_._parms["remove_collinear_columns"] = False
-            self.model_._parms["lambda_search"] = False
-
         return self
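The refactor above replaces fit-time patching with a single override point: _prepare_fit runs inside super().fit(), after any set_params() call, so a grid search can no longer reintroduce unsafe values. A minimal sketch of the pattern (the base-class internals here are assumed for illustration, not taken from the repo):

class Base:
    def __init__(self, **params):
        self.params = params  # whatever set_params()/GridSearch last wrote

    def _prepare_fit(self, X, y):
        return dict(self.params)

    def fit(self, X, y):
        model_params = self._prepare_fit(X, y)  # subclass override runs here
        print("training with:", model_params)
        return self


class StableGLM(Base):
    def _prepare_fit(self, X, y):
        model_params = super()._prepare_fit(X, y)
        model_params["solver"] = "L_BFGS"  # enforced no matter what the search set
        return model_params


StableGLM(solver="COORDINATE_DESCENT").fit(None, None)
# -> training with: {'solver': 'L_BFGS'}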

ml_grid/model_classes/h2o_gam_classifier_class.py

Lines changed: 6 additions & 1 deletion
@@ -50,7 +50,12 @@ def __init__(
             if pd.api.types.is_numeric_dtype(X[col]):
                 # Check cardinality (>10 unique values)
                 if X[col].nunique() > 10:
-                    gam_cols.append(col)
+                    # Check distribution for at least 5 knots (default minimum)
+                    try:
+                        pd.qcut(X[col], q=5, duplicates="raise")
+                        gam_cols.append(col)
+                    except ValueError:
+                        pass
 
         if not gam_cols and X is not None:
             logger.warning(
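
Applied to a mixed DataFrame, the constructor gate now keeps only numeric columns that are both high-cardinality and well spread, silently dropping the rest. A standalone sketch mirroring the selection logic above (illustration only):

import numpy as np
import pandas as pd

X = pd.DataFrame({
    "spread": np.linspace(0, 1, 100),         # high cardinality, even spread
    "low_card": [0, 1] * 50,                  # fails the nunique() > 10 gate
    "skewed": [0] * 90 + list(range(1, 11)),  # passes cardinality, fails qcut
})

gam_cols = []
for col in X.columns:
    if pd.api.types.is_numeric_dtype(X[col]) and X[col].nunique() > 10:
        try:
            pd.qcut(X[col], q=5, duplicates="raise")
            gam_cols.append(col)
        except ValueError:
            pass  # skewed columns are dropped silently here

print(gam_cols)  # ['spread']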

ml_grid/model_classes/h2o_glm_classifier_class.py

Lines changed: 60 additions & 44 deletions
@@ -2,8 +2,8 @@
 import pandas as pd
 from h2o.estimators import H2OGeneralizedLinearEstimator
 
-# Removing skopt imports to prevent the ParameterGrid TypeError
-# from skopt.space import Real, Categorical, Integer
+# --- FIX: Re-import skopt for Bayesian search compatibility ---
+from skopt.space import Real, Categorical
 
 from .H2OBaseClassifier import H2OBaseClassifier
 

@@ -43,23 +43,12 @@ def _prepare_fit(self, X, y):
         # Get the standard parameters from the base class
         train_h2o, x_vars, outcome_var, model_params = super()._prepare_fit(X, y)
 
-        # --- STRICT OVERRIDE (The "Triple-Lock") ---
-        # Regardless of what GridSearch/HyperOpt requested, we force these values
-        # to prevent the Java Backend Crash (NullPointerException).
-
-        # 1. Force L_BFGS: The only solver robust against the index mismatch bug on this data
+        # --- STRICT OVERRIDE ---
+        # Force L_BFGS: The only solver robust against the index mismatch bug on this data
         model_params["solver"] = "L_BFGS"
-
-        # 2. Disable Collinear Removal: This prevents the coefficient vector size change
         model_params["remove_collinear_columns"] = False
-
-        # 3. Disable Lambda Search: If True, H2O ignores 'solver' and uses Coordinate Descent
         model_params["lambda_search"] = False
 
-        self.logger.info(
-            f"H2OGLMClassifier: Enforced stability params: solver={model_params['solver']}, lambda_search={model_params['lambda_search']}"
-        )
-
         return train_h2o, x_vars, outcome_var, model_params
 
     def fit(self, X: pd.DataFrame, y: pd.Series, **kwargs) -> "H2OGLMClassifier":

@@ -80,33 +69,60 @@ def __init__(self, X=None, y=None, parameter_space_size="small"):
         # Instantiate the actual estimator wrapper
         self.algorithm_implementation = H2OGLMClassifier()
 
-        # Define the Hyperparameter Space
-        # FIX: Converted skopt distributions (Real, Categorical) to Lists
-        # to ensure compatibility with sklearn.model_selection.ParameterGrid
-
-        if parameter_space_size == "xsmall":
-            self.parameter_space = {
-                "alpha": [0.0, 0.5, 1.0],
-                "lambda_": [1e-3, 1e-2, 1e-1],
-                "family": ["binomial"],
-                "solver": ["L_BFGS"],
-                "standardize": [True],
-            }
-        elif parameter_space_size == "small":
-            self.parameter_space = {
-                "alpha": [0.0, 0.25, 0.5, 0.75, 1.0],
-                "lambda_": np.logspace(-4, -1, 5).tolist(),
-                "family": ["binomial"],
-                "solver": ["L_BFGS"],
-                "standardize": [True],
-            }
+        # --- FIX: Conditionally define parameter space for Bayes vs. Grid search ---
+        from ml_grid.util.global_params import global_parameters
+
+        if global_parameters.bayessearch:
+            # Use skopt spaces for Bayesian search
+            if parameter_space_size == "xsmall":
+                self.parameter_space = {
+                    "alpha": Real(0.0, 1.0),
+                    "lambda_": Real(1e-3, 1e-1, prior="log-uniform"),
+                    "family": Categorical(["binomial"]),
+                    "solver": Categorical(["L_BFGS"]),
+                    "standardize": Categorical([True]),
+                }
+            elif parameter_space_size == "small":
+                self.parameter_space = {
+                    "alpha": Real(0.0, 1.0),
+                    "lambda_": Real(1e-4, 1e-1, prior="log-uniform"),
+                    "family": Categorical(["binomial"]),
+                    "solver": Categorical(["L_BFGS"]),
+                    "standardize": Categorical([True]),
+                }
+            else:  # Medium/Large space
+                self.parameter_space = {
+                    "alpha": Real(0.0, 1.0),
+                    "lambda_": Real(1e-6, 10.0, prior="log-uniform"),
+                    "family": Categorical(["binomial"]),
+                    "solver": Categorical(["L_BFGS"]),
+                    "standardize": Categorical([True, False]),
+                    "balance_classes": Categorical([True, False]),
+                }
         else:
-            # Medium/Large space
-            self.parameter_space = {
-                "alpha": [0.0, 0.1, 0.3, 0.5, 0.7, 0.9, 1.0],
-                "lambda_": np.logspace(-6, 1, 8).tolist(),
-                "family": ["binomial"],
-                "solver": ["L_BFGS"],
-                "standardize": [True, False],
-                "balance_classes": [True, False],
-            }
+            # Use lists for Grid/Random search
+            if parameter_space_size == "xsmall":
+                self.parameter_space = {
+                    "alpha": [0.0, 0.5, 1.0],
+                    "lambda_": [1e-3, 1e-2, 1e-1],
+                    "family": ["binomial"],
+                    "solver": ["L_BFGS"],
+                    "standardize": [True],
+                }
+            elif parameter_space_size == "small":
+                self.parameter_space = {
+                    "alpha": [0.0, 0.25, 0.5, 0.75, 1.0],
+                    "lambda_": np.logspace(-4, -1, 5).tolist(),
+                    "family": ["binomial"],
+                    "solver": ["L_BFGS"],
+                    "standardize": [True],
+                }
+            else:  # Medium/Large space
+                self.parameter_space = {
+                    "alpha": [0.0, 0.1, 0.3, 0.5, 0.7, 0.9, 1.0],
+                    "lambda_": np.logspace(-6, 1, 8).tolist(),
+                    "family": ["binomial"],
+                    "solver": ["L_BFGS"],
+                    "standardize": [True, False],
+                    "balance_classes": [True, False],
+                }
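
The two branches exist because the search backends consume spaces differently: sklearn.model_selection.ParameterGrid must enumerate every candidate, so each value has to be a list, while skopt's BayesSearchCV samples from dimension objects such as Real. A small sketch of the incompatibility the original comment referred to (assuming scikit-learn and scikit-optimize are installed):

from sklearn.model_selection import ParameterGrid
from skopt.space import Real

grid_space = {"alpha": [0.0, 0.5, 1.0]}  # discrete: exactly 3 candidates
bayes_space = {"alpha": Real(0.0, 1.0)}  # continuous: sampled, not enumerated

print(list(ParameterGrid(grid_space)))   # [{'alpha': 0.0}, {'alpha': 0.5}, {'alpha': 1.0}]

try:
    list(ParameterGrid(bayes_space))     # a Real dimension is not a list of values
except TypeError as e:
    print(f"ParameterGrid TypeError: {e}")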

tests/test_h2o_base_classifier.py

Lines changed: 1 addition & 1 deletion
@@ -215,7 +215,7 @@ def test_predict_successful(
 
     # 2. Check that the new frame creation logic was called
     mock_h2o_frame.assert_called_once_with(
-        X, column_names=list(X.columns), column_types=classifier_instance.feature_types_
+        X, column_names=list(X.columns)
     )
     mock_h2o_assign.assert_called_once_with(mock_tmp_frame, ANY)
     mock_h2o_get_frame.assert_called_once()

tests/test_h2o_classifiers.py

Lines changed: 33 additions & 0 deletions
@@ -227,6 +227,39 @@ def test_h2o_gam_knot_cardinality_error(h2o_session_fixture):
         cross_val_score(estimator, X, y, cv=cv, error_score="raise", n_jobs=1)
 
 
+def test_h2o_gam_knot_distribution_error(h2o_session_fixture):
+    """
+    Tests that H2OGAMClassifier raises ValueError when quantiles cannot be generated
+    due to skewed distribution, even if cardinality is technically sufficient.
+    """
+    h2o.remove_all()
+
+    # Ensure enough unique values survive the CV split to pass the cardinality check (>= 10)
+    # We need > 10 unique values in the training fold.
+    # With 50/50 split, we need roughly > 20 unique values in total.
+    # We keep it skewed (mostly 0s) to trigger the quantile error.
+    skewed_vals = np.array([0] * 70 + list(range(1, 31)))
+    np.random.shuffle(skewed_vals)
+
+    X = pd.DataFrame({"feature1": np.random.rand(100), "feature_skewed": skewed_vals})
+    y = pd.Series(np.random.randint(0, 2, 100), name="outcome")
+
+    estimator = H2O_GAM_class(
+        X=X, y=y, parameter_space_size="small"
+    ).algorithm_implementation
+
+    estimator.set_params(
+        gam_columns=["feature_skewed"],
+        num_knots=5,
+        _suppress_low_cardinality_error=False,
+    )
+
+    cv = KFold(n_splits=2, shuffle=True, random_state=42)
+
+    with pytest.raises(ValueError, match=r"Cannot generate .* unique quantiles"):
+        cross_val_score(estimator, X, y, cv=cv, error_score="raise", n_jobs=1)
+
+
 class MockMlGridObject:
     def __init__(self, X, y):
         self.X_train = X
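
The sizing in the test's comments checks out: with 31 unique values across 100 rows, each 50-row training fold of the 2-fold split retains well over the 10 unique values the cardinality gate requires, so only the new quantile check can fire. A quick sanity check (illustrative, not part of the test file):

import numpy as np
from sklearn.model_selection import KFold

vals = np.array([0] * 70 + list(range(1, 31)))  # 31 unique values in 100 rows
np.random.shuffle(vals)

for train_idx, _ in KFold(n_splits=2, shuffle=True, random_state=42).split(vals):
    print(len(np.unique(vals[train_idx])))  # typically ~25, comfortably above 10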

tests/test_h2o_gam_stability.py

Lines changed: 51 additions & 0 deletions
@@ -129,3 +129,54 @@ def test_gam_empty_columns_fallback(h2o_session):
     # It should have forced a GLM because the only candidate column was rejected
     assert model.algo == "glm"
     print("[TEST] SUCCESS: Correctly fell back to GLM when gam_columns were invalid.")
+
+
+def test_gam_skewed_distribution_fallback(h2o_session):
+    """
+    Tests that H2OGAMClassifier detects columns with sufficient cardinality
+    but skewed distribution (causing knot generation failure) by falling back
+    or raising error as configured.
+    """
+    h2o.remove_all()
+
+    # Create data: 100 samples.
+    # 'feature_skewed': 0 is present 90 times. 1..10 are present 1 time each.
+    # Total unique values = 11.
+    # If num_knots=5, required unique >= 10. This passes the simple cardinality check.
+    # However, quantiles will likely overlap on 0, causing knot generation issues.
+
+    skewed_col = np.array([0] * 90 + list(range(1, 11)))
+    np.random.shuffle(skewed_col)
+
+    X = pd.DataFrame({"feature_ok": np.random.rand(100), "feature_skewed": skewed_col})
+    y = pd.Series(np.random.randint(0, 2, 100), name="outcome")
+
+    # 1. Test with suppression (default) -> Should drop column and succeed (fallback to GLM if needed)
+    clf = H2OGAMClassifier(
+        gam_columns=["feature_skewed"],
+        num_knots=[5],
+        _suppress_low_cardinality_error=True,
+    )
+
+    print("\n[TEST] Attempting to fit GAM on skewed column (suppress=True)...")
+    try:
+        clf.fit(X, y)
+        model = h2o.get_model(clf.model_id)
+        print(f"[TEST] Model Algo: {model.algo}")
+        # Since feature_skewed is the only gam column and it fails knot check,
+        # it should be dropped. If no gam cols remain, fallback to GLM.
+        assert model.algo == "glm"
+    except Exception as e:
+        pytest.fail(f"GAM fit failed with suppression enabled: {e}")
+
+    # 2. Test without suppression -> Should raise ValueError from our new check
+    clf_raise = H2OGAMClassifier(
+        gam_columns=["feature_skewed"],
+        num_knots=[5],
+        _suppress_low_cardinality_error=False,
+    )
+
+    print("\n[TEST] Attempting to fit GAM on skewed column (suppress=False)...")
+    with pytest.raises(ValueError, match="Cannot generate .* unique quantiles"):
+        clf_raise.fit(X, y)
+    print("[TEST] SUCCESS: Caught skewed distribution error.")
