Commit 55099b3

Author: SamoraHunter
Commit message: h2o crash fixes
Parent: 18831a1

7 files changed: 180 additions & 69 deletions

ml_grid/model_classes/H2OGAMClassifier.py

Lines changed: 12 additions & 0 deletions
@@ -107,6 +107,18 @@ def _prepare_fit(
                 )
                 continue
 
+            # --- FIX: Check for unique quantiles to prevent H2O knot generation failure ---
+            try:
+                # H2O uses quantiles for knots. If quantiles are not unique, it fails.
+                # We check if we can generate 'required_knots' unique bins.
+                pd.qcut(X[col], q=required_knots, duplicates="raise")
+            except ValueError:
+                if not self._suppress_low_cardinality_error:
+                    raise ValueError(
+                        f"Skipping GAM col '{col}': Cannot generate {required_knots} unique quantiles (distribution too skewed)."
+                    )
+                continue
+
             suitable_gam_cols.append(col)
             suitable_knots.append(required_knots)
             if i < len(bs_list):
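
Note on the check above: pd.qcut computes quantile bin edges, and on a heavily skewed column several edges collapse onto the same value; duplicates="raise" surfaces that as the ValueError the new code catches. A standalone sketch of the failure mode (illustration only, not part of the commit):

import numpy as np
import pandas as pd

# 11 unique values, but 90% of the mass sits on 0.
skewed = pd.Series([0] * 90 + list(range(1, 11)))

try:
    # The 0th-80th percentile edges all collapse onto 0, so the bin edges are not unique.
    pd.qcut(skewed, q=5, duplicates="raise")
except ValueError as e:
    print(f"Rejected for GAM knots: {e}")  # "Bin edges must be unique: ..."

# An evenly spread column of the same length passes the same check.
pd.qcut(pd.Series(np.arange(100)), q=5, duplicates="raise")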

ml_grid/model_classes/H2OGLMClassifier.py

Lines changed: 17 additions & 23 deletions
@@ -34,32 +34,26 @@ def __init__(self, **kwargs):
         # Pass the specific estimator class
         super().__init__(estimator_class=H2OGeneralizedLinearEstimator, **kwargs)
 
+    def _prepare_fit(self, X, y):
+        """
+        Intercepts the parameter preparation to ENFORCE stability settings.
+        This runs immediately BEFORE the H2O model is initialized/trained.
+        """
+        # Get the standard parameters from the base class
+        train_h2o, x_vars, outcome_var, model_params = super()._prepare_fit(X, y)
+
+        # --- STRICT OVERRIDE ---
+        # Force L_BFGS: The only solver robust against the index mismatch bug on this data
+        model_params["solver"] = "L_BFGS"
+        model_params["remove_collinear_columns"] = False
+        model_params["lambda_search"] = False
+
+        return train_h2o, x_vars, outcome_var, model_params
+
     def fit(self, X: pd.DataFrame, y: pd.Series, **kwargs) -> "H2OGLMClassifier":
         """Fits the H2O GLM model."""
-
-        # --- DOUBLE-LOCK: Enforce stable parameters at fit time ---
-        # GridSearch calls set_params() which might overwrite our safe defaults.
-        # We explicitly revert them here before training.
-
-        kwargs["solver"] = "L_BFGS"
-        kwargs["remove_collinear_columns"] = False
-        kwargs["lambda_search"] = False
-
-        # Update internal H2O parameter dictionary if it exists
-        if hasattr(self, "_parms"):
-            self._parms["solver"] = "L_BFGS"
-            self._parms["remove_collinear_columns"] = False
-            self._parms["lambda_search"] = False
-
-        # Proceed with standard fit
+        # The override logic is now handled in _prepare_fit, called by super().fit()
        super().fit(X, y, **kwargs)
-
-        # 3. TRIPLE-LOCK: Ensure the internal model object respects this
-        if hasattr(self, "model_") and self.model_ is not None:
-            self.model_._parms["solver"] = "L_BFGS"
-            self.model_._parms["remove_collinear_columns"] = False
-            self.model_._parms["lambda_search"] = False
-
         return self
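The refactor above replaces fit-time patching with a single override point: _prepare_fit runs inside super().fit(), after any set_params() call, so a grid search can no longer reintroduce unsafe values. A minimal sketch of the pattern (the base-class internals here are assumed for illustration, not taken from the repo):

class Base:
    def __init__(self, **params):
        self.params = params  # whatever set_params()/GridSearch last wrote

    def _prepare_fit(self, X, y):
        return dict(self.params)

    def fit(self, X, y):
        model_params = self._prepare_fit(X, y)  # subclass override runs here
        print("training with:", model_params)
        return self


class StableGLM(Base):
    def _prepare_fit(self, X, y):
        model_params = super()._prepare_fit(X, y)
        model_params["solver"] = "L_BFGS"  # enforced no matter what the search set
        return model_params


StableGLM(solver="COORDINATE_DESCENT").fit(None, None)
# -> training with: {'solver': 'L_BFGS'}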

ml_grid/model_classes/h2o_gam_classifier_class.py

Lines changed: 6 additions & 1 deletion
@@ -50,7 +50,12 @@ def __init__(
             if pd.api.types.is_numeric_dtype(X[col]):
                 # Check cardinality (>10 unique values)
                 if X[col].nunique() > 10:
-                    gam_cols.append(col)
+                    # Check distribution for at least 5 knots (default minimum)
+                    try:
+                        pd.qcut(X[col], q=5, duplicates="raise")
+                        gam_cols.append(col)
+                    except ValueError:
+                        pass
 
         if not gam_cols and X is not None:
             logger.warning(
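
Applied to a mixed DataFrame, the constructor gate now keeps only numeric columns that are both high-cardinality and well spread, silently dropping the rest. A standalone sketch mirroring the selection logic above (illustration only):

import numpy as np
import pandas as pd

X = pd.DataFrame({
    "spread": np.linspace(0, 1, 100),         # high cardinality, even spread
    "low_card": [0, 1] * 50,                  # fails the nunique() > 10 gate
    "skewed": [0] * 90 + list(range(1, 11)),  # passes cardinality, fails qcut
})

gam_cols = []
for col in X.columns:
    if pd.api.types.is_numeric_dtype(X[col]) and X[col].nunique() > 10:
        try:
            pd.qcut(X[col], q=5, duplicates="raise")
            gam_cols.append(col)
        except ValueError:
            pass  # skewed columns are dropped silently here

print(gam_cols)  # ['spread']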

ml_grid/model_classes/h2o_glm_classifier_class.py

Lines changed: 60 additions & 44 deletions
@@ -2,8 +2,8 @@
 import pandas as pd
 from h2o.estimators import H2OGeneralizedLinearEstimator
 
-# Removing skopt imports to prevent the ParameterGrid TypeError
-# from skopt.space import Real, Categorical, Integer
+# --- FIX: Re-import skopt for Bayesian search compatibility ---
+from skopt.space import Real, Categorical
 
 from .H2OBaseClassifier import H2OBaseClassifier
 

@@ -43,23 +43,12 @@ def _prepare_fit(self, X, y):
         # Get the standard parameters from the base class
         train_h2o, x_vars, outcome_var, model_params = super()._prepare_fit(X, y)
 
-        # --- STRICT OVERRIDE (The "Triple-Lock") ---
-        # Regardless of what GridSearch/HyperOpt requested, we force these values
-        # to prevent the Java Backend Crash (NullPointerException).
-
-        # 1. Force L_BFGS: The only solver robust against the index mismatch bug on this data
+        # --- STRICT OVERRIDE ---
+        # Force L_BFGS: The only solver robust against the index mismatch bug on this data
         model_params["solver"] = "L_BFGS"
-
-        # 2. Disable Collinear Removal: This prevents the coefficient vector size change
         model_params["remove_collinear_columns"] = False
-
-        # 3. Disable Lambda Search: If True, H2O ignores 'solver' and uses Coordinate Descent
         model_params["lambda_search"] = False
 
-        self.logger.info(
-            f"H2OGLMClassifier: Enforced stability params: solver={model_params['solver']}, lambda_search={model_params['lambda_search']}"
-        )
-
         return train_h2o, x_vars, outcome_var, model_params
 
     def fit(self, X: pd.DataFrame, y: pd.Series, **kwargs) -> "H2OGLMClassifier":

@@ -80,33 +69,60 @@ def __init__(self, X=None, y=None, parameter_space_size="small"):
         # Instantiate the actual estimator wrapper
         self.algorithm_implementation = H2OGLMClassifier()
 
-        # Define the Hyperparameter Space
-        # FIX: Converted skopt distributions (Real, Categorical) to Lists
-        # to ensure compatibility with sklearn.model_selection.ParameterGrid
-
-        if parameter_space_size == "xsmall":
-            self.parameter_space = {
-                "alpha": [0.0, 0.5, 1.0],
-                "lambda_": [1e-3, 1e-2, 1e-1],
-                "family": ["binomial"],
-                "solver": ["L_BFGS"],
-                "standardize": [True],
-            }
-        elif parameter_space_size == "small":
-            self.parameter_space = {
-                "alpha": [0.0, 0.25, 0.5, 0.75, 1.0],
-                "lambda_": np.logspace(-4, -1, 5).tolist(),
-                "family": ["binomial"],
-                "solver": ["L_BFGS"],
-                "standardize": [True],
-            }
+        # --- FIX: Conditionally define parameter space for Bayes vs. Grid search ---
+        from ml_grid.util.global_params import global_parameters
+
+        if global_parameters.bayessearch:
+            # Use skopt spaces for Bayesian search
+            if parameter_space_size == "xsmall":
+                self.parameter_space = {
+                    "alpha": Real(0.0, 1.0),
+                    "lambda_": Real(1e-3, 1e-1, prior="log-uniform"),
+                    "family": Categorical(["binomial"]),
+                    "solver": Categorical(["L_BFGS"]),
+                    "standardize": Categorical([True]),
+                }
+            elif parameter_space_size == "small":
+                self.parameter_space = {
+                    "alpha": Real(0.0, 1.0),
+                    "lambda_": Real(1e-4, 1e-1, prior="log-uniform"),
+                    "family": Categorical(["binomial"]),
+                    "solver": Categorical(["L_BFGS"]),
+                    "standardize": Categorical([True]),
+                }
+            else:  # Medium/Large space
+                self.parameter_space = {
+                    "alpha": Real(0.0, 1.0),
+                    "lambda_": Real(1e-6, 10.0, prior="log-uniform"),
+                    "family": Categorical(["binomial"]),
+                    "solver": Categorical(["L_BFGS"]),
+                    "standardize": Categorical([True, False]),
+                    "balance_classes": Categorical([True, False]),
+                }
         else:
-            # Medium/Large space
-            self.parameter_space = {
-                "alpha": [0.0, 0.1, 0.3, 0.5, 0.7, 0.9, 1.0],
-                "lambda_": np.logspace(-6, 1, 8).tolist(),
-                "family": ["binomial"],
-                "solver": ["L_BFGS"],
-                "standardize": [True, False],
-                "balance_classes": [True, False],
-            }
+            # Use lists for Grid/Random search
+            if parameter_space_size == "xsmall":
+                self.parameter_space = {
+                    "alpha": [0.0, 0.5, 1.0],
+                    "lambda_": [1e-3, 1e-2, 1e-1],
+                    "family": ["binomial"],
+                    "solver": ["L_BFGS"],
+                    "standardize": [True],
+                }
+            elif parameter_space_size == "small":
+                self.parameter_space = {
+                    "alpha": [0.0, 0.25, 0.5, 0.75, 1.0],
+                    "lambda_": np.logspace(-4, -1, 5).tolist(),
+                    "family": ["binomial"],
+                    "solver": ["L_BFGS"],
+                    "standardize": [True],
+                }
+            else:  # Medium/Large space
+                self.parameter_space = {
+                    "alpha": [0.0, 0.1, 0.3, 0.5, 0.7, 0.9, 1.0],
+                    "lambda_": np.logspace(-6, 1, 8).tolist(),
+                    "family": ["binomial"],
+                    "solver": ["L_BFGS"],
+                    "standardize": [True, False],
+                    "balance_classes": [True, False],
+                }
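
The two branches exist because the search backends consume spaces differently: sklearn.model_selection.ParameterGrid must enumerate every candidate, so each value has to be a list, while skopt's BayesSearchCV samples from dimension objects such as Real. A small sketch of the incompatibility the original comment referred to (assuming scikit-learn and scikit-optimize are installed):

from sklearn.model_selection import ParameterGrid
from skopt.space import Real

grid_space = {"alpha": [0.0, 0.5, 1.0]}  # discrete: exactly 3 candidates
bayes_space = {"alpha": Real(0.0, 1.0)}  # continuous: sampled, not enumerated

print(list(ParameterGrid(grid_space)))   # [{'alpha': 0.0}, {'alpha': 0.5}, {'alpha': 1.0}]

try:
    list(ParameterGrid(bayes_space))     # a Real dimension is not a list of values
except TypeError as e:
    print(f"ParameterGrid TypeError: {e}")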

tests/test_h2o_base_classifier.py

Lines changed: 1 addition & 1 deletion
@@ -215,7 +215,7 @@ def test_predict_successful(
 
     # 2. Check that the new frame creation logic was called
     mock_h2o_frame.assert_called_once_with(
-        X, column_names=list(X.columns), column_types=classifier_instance.feature_types_
+        X, column_names=list(X.columns)
     )
     mock_h2o_assign.assert_called_once_with(mock_tmp_frame, ANY)
     mock_h2o_get_frame.assert_called_once()

tests/test_h2o_classifiers.py

Lines changed: 33 additions & 0 deletions
@@ -227,6 +227,39 @@ def test_h2o_gam_knot_cardinality_error(h2o_session_fixture):
         cross_val_score(estimator, X, y, cv=cv, error_score="raise", n_jobs=1)
 
 
+def test_h2o_gam_knot_distribution_error(h2o_session_fixture):
+    """
+    Tests that H2OGAMClassifier raises ValueError when quantiles cannot be generated
+    due to skewed distribution, even if cardinality is technically sufficient.
+    """
+    h2o.remove_all()
+
+    # Ensure enough unique values survive the CV split to pass the cardinality check (>= 10)
+    # We need > 10 unique values in the training fold.
+    # With 50/50 split, we need roughly > 20 unique values in total.
+    # We keep it skewed (mostly 0s) to trigger the quantile error.
+    skewed_vals = np.array([0] * 70 + list(range(1, 31)))
+    np.random.shuffle(skewed_vals)
+
+    X = pd.DataFrame({"feature1": np.random.rand(100), "feature_skewed": skewed_vals})
+    y = pd.Series(np.random.randint(0, 2, 100), name="outcome")
+
+    estimator = H2O_GAM_class(
+        X=X, y=y, parameter_space_size="small"
+    ).algorithm_implementation
+
+    estimator.set_params(
+        gam_columns=["feature_skewed"],
+        num_knots=5,
+        _suppress_low_cardinality_error=False,
+    )
+
+    cv = KFold(n_splits=2, shuffle=True, random_state=42)
+
+    with pytest.raises(ValueError, match=r"Cannot generate .* unique quantiles"):
+        cross_val_score(estimator, X, y, cv=cv, error_score="raise", n_jobs=1)
+
+
 class MockMlGridObject:
     def __init__(self, X, y):
         self.X_train = X
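
The sizing in the test's comments checks out: with 31 unique values across 100 rows, each 50-row training fold of the 2-fold split retains well over the 10 unique values the cardinality gate requires, so only the new quantile check can fire. A quick sanity check (illustrative, not part of the test file):

import numpy as np
from sklearn.model_selection import KFold

vals = np.array([0] * 70 + list(range(1, 31)))  # 31 unique values in 100 rows
np.random.shuffle(vals)

for train_idx, _ in KFold(n_splits=2, shuffle=True, random_state=42).split(vals):
    print(len(np.unique(vals[train_idx])))  # typically ~25, comfortably above 10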

tests/test_h2o_gam_stability.py

Lines changed: 51 additions & 0 deletions
@@ -129,3 +129,54 @@ def test_gam_empty_columns_fallback(h2o_session):
     # It should have forced a GLM because the only candidate column was rejected
     assert model.algo == "glm"
     print("[TEST] SUCCESS: Correctly fell back to GLM when gam_columns were invalid.")
+
+
+def test_gam_skewed_distribution_fallback(h2o_session):
+    """
+    Tests that H2OGAMClassifier detects columns with sufficient cardinality
+    but skewed distribution (causing knot generation failure) by falling back
+    or raising error as configured.
+    """
+    h2o.remove_all()
+
+    # Create data: 100 samples.
+    # 'feature_skewed': 0 is present 90 times. 1..10 are present 1 time each.
+    # Total unique values = 11.
+    # If num_knots=5, required unique >= 10. This passes the simple cardinality check.
+    # However, quantiles will likely overlap on 0, causing knot generation issues.
+
+    skewed_col = np.array([0] * 90 + list(range(1, 11)))
+    np.random.shuffle(skewed_col)
+
+    X = pd.DataFrame({"feature_ok": np.random.rand(100), "feature_skewed": skewed_col})
+    y = pd.Series(np.random.randint(0, 2, 100), name="outcome")
+
+    # 1. Test with suppression (default) -> Should drop column and succeed (fallback to GLM if needed)
+    clf = H2OGAMClassifier(
+        gam_columns=["feature_skewed"],
+        num_knots=[5],
+        _suppress_low_cardinality_error=True,
+    )
+
+    print("\n[TEST] Attempting to fit GAM on skewed column (suppress=True)...")
+    try:
+        clf.fit(X, y)
+        model = h2o.get_model(clf.model_id)
+        print(f"[TEST] Model Algo: {model.algo}")
+        # Since feature_skewed is the only gam column and it fails knot check,
+        # it should be dropped. If no gam cols remain, fallback to GLM.
+        assert model.algo == "glm"
+    except Exception as e:
+        pytest.fail(f"GAM fit failed with suppression enabled: {e}")
+
+    # 2. Test without suppression -> Should raise ValueError from our new check
+    clf_raise = H2OGAMClassifier(
+        gam_columns=["feature_skewed"],
+        num_knots=[5],
+        _suppress_low_cardinality_error=False,
+    )
+
+    print("\n[TEST] Attempting to fit GAM on skewed column (suppress=False)...")
+    with pytest.raises(ValueError, match="Cannot generate .* unique quantiles"):
+        clf_raise.fit(X, y)
+    print("[TEST] SUCCESS: Caught skewed distribution error.")
