3 changes: 3 additions & 0 deletions pyproject.toml
@@ -120,6 +120,9 @@ lint.ignore = [

lint.pydocstyle.convention = "numpy"

[tool.ruff.lint.per-file-ignores]
"tests/**.py" = ["D103"]

[tool.codespell]
ignore-words-list = "chec,arrang,livetime"

48 changes: 38 additions & 10 deletions src/eventdisplay_ml/data_processing.py
@@ -113,7 +113,7 @@ def flatten_feature_data(group_df, ntel, analysis_type, training):
df_flat = flatten_telescope_data_vectorized(
group_df,
ntel,
-features.telescope_features(analysis_type, training=training),
+features.telescope_features(analysis_type),
analysis_type=analysis_type,
training=training,
)
@@ -171,7 +171,7 @@ def load_training_data(
_logger.info(f"Processing file: {f}")
tree = root_file["data"]
df = tree.arrays(branch_list, cut=event_cut, library="pd")
_logger.info(f"Number of events after filter {event_cut}: {len(df)}")
_logger.info(f"Number of events after event cut {event_cut}: {len(df)}")
if max_events_per_file and len(df) > max_events_per_file:
df = df.sample(n=max_events_per_file, random_state=42)
if not df.empty:
@@ -188,7 +188,7 @@ def load_training_data(
df_flat = flatten_telescope_data_vectorized(
data_tree,
n_tel,
-features.telescope_features(analysis_type, training=True),
+features.telescope_features(analysis_type),
analysis_type,
training=True,
)
@@ -205,6 +205,8 @@ def load_training_data(
df_flat.dropna(axis=1, how="all", inplace=True)
_logger.info(f"Final events for n_tel={n_tel} after cleanup: {len(df_flat)}")

print_variable_statistics(df_flat)

return df_flat


@@ -250,7 +252,7 @@ def calculate_intersection(tel_list):
df["DispNImages"] = df["DispNImages_new"]
df = df.drop(columns=["DispTelList_T_new", "DispNImages_new"])

-pad_vars = features.telescope_features(analysis_type, training=training)
+pad_vars = features.telescope_features(analysis_type)

for var_name in pad_vars:
if var_name in df.columns:
@@ -274,7 +276,7 @@ def event_cuts(analysis_type, n_tel, model_parameters=None):
"""Event cut string for the given analysis type and telescope multiplicity."""
event_cut = f"(DispNImages == {n_tel})"

-if analysis_type in ("signal_classification", "background_classification"):
+if analysis_type == "classification":
cuts = [
"Erec > 0",
"MSCW > -2",
@@ -306,15 +308,18 @@ def flatten_telescope_variables(n_tel, flat_features, index):
new_cols[f"loss_dist_{i}"] = df_flat[f"loss_{i}"] * df_flat[f"dist_{i}"]
new_cols[f"width_length_{i}"] = df_flat[f"width_{i}"] / (df_flat[f"length_{i}"] + 1e-6)

df_flat[f"size_{i}"] = np.log10(np.clip(df_flat[f"size_{i}"], 1e-6, None))
if "E_{i}" in df_flat:
if f"size_{i}" in df_flat:
df_flat[f"size_{i}"] = np.log10(np.clip(df_flat[f"size_{i}"], 1e-6, None))
if f"E_{i}" in df_flat:
df_flat[f"E_{i}"] = np.log10(np.clip(df_flat[f"E_{i}"], 1e-6, None))
if "ES_{i}" in df_flat:
if f"ES_{i}" in df_flat:
df_flat[f"ES_{i}"] = np.log10(np.clip(df_flat[f"ES_{i}"], 1e-6, None))

# pointing corrections
df_flat[f"cen_x_{i}"] = df_flat[f"cen_x_{i}"] + df_flat[f"fpointing_dx_{i}"]
df_flat[f"cen_y_{i}"] = df_flat[f"cen_y_{i}"] + df_flat[f"fpointing_dy_{i}"]
if f"cen_x_{i}" in df_flat and f"fpointing_dx_{i}" in df_flat:
df_flat[f"cen_x_{i}"] = df_flat[f"cen_x_{i}"] + df_flat[f"fpointing_dx_{i}"]
if f"cen_y_{i}" in df_flat and f"fpointing_dy_{i}" in df_flat:
df_flat[f"cen_y_{i}"] = df_flat[f"cen_y_{i}"] + df_flat[f"fpointing_dy_{i}"]
df_flat = df_flat.drop(columns=[f"fpointing_dx_{i}", f"fpointing_dy_{i}"])

return pd.concat([df_flat, pd.DataFrame(new_cols, index=index)], axis=1)
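Editor's note on the guard fix above: without the `f` prefix, `"E_{i}" in df_flat` tests for the literal key `E_{i}` and is always false, so the transform branches were silently skipped. In miniature:

```python
# The guard bug fixed above, in miniature: a literal string never matches
# the formatted column name.
i = 0
columns = {"E_0": 1.0}
print("E_{i}" in columns)   # False: looks up the literal key "E_{i}"
print(f"E_{i}" in columns)  # True: looks up the formatted key "E_0"
```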
@@ -376,3 +381,26 @@ def energy_in_bins(df_chunk, bins):
e_bin[valid] = np.argmin(distances, axis=1)
df_chunk["e_bin"] = e_bin
return df_chunk["e_bin"]
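For context on the unchanged `energy_in_bins` tail above: each valid event is assigned the index of its nearest energy-bin center. A self-contained sketch of that assignment; the bin centers and example energies are illustrative assumptions, not the module's configuration:

```python
# Hedged sketch of nearest-bin-center assignment; bin centers and energies
# are illustrative, not the module's values.
import numpy as np
import pandas as pd

bins = np.linspace(-1.0, 2.0, 10)                    # assumed log10(E/TeV) centers
log_e = np.log10(pd.Series([0.5, 3.0, 40.0])).to_numpy()

distances = np.abs(log_e[:, None] - bins[None, :])   # event-to-center distances
e_bin = np.argmin(distances, axis=1)                 # index of the nearest center
```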


def print_variable_statistics(df):
"""
Log min, max, mean, and RMS for each variable in the DataFrame.

Parameters
----------
df : pandas.DataFrame
DataFrame containing the variables to summarize.
"""
for col in df.columns:
data = df[col].dropna().to_numpy()
if data.size == 0:
print(f"{col}: No data")
continue
min_val = np.min(data)
max_val = np.max(data)
mean_val = np.mean(data)
rms_val = np.sqrt(np.mean(np.square(data)))
_logger.info(
f"{col:25s} min: {min_val:10.4g} max: {max_val:10.4g} mean: {mean_val:10.4g} rms: {rms_val:10.4g}"
)
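A minimal usage sketch for the new `print_variable_statistics` helper; the column names and logging setup are assumptions for illustration:

```python
# Hypothetical usage; column names are made up, and logging must be
# configured for the module logger's INFO records to appear.
import logging
import pandas as pd

logging.basicConfig(level=logging.INFO)
df = pd.DataFrame({"MSCW": [0.1, -0.5, 1.2], "Erec": [0.8, 1.5, 2.1]})
print_variable_statistics(df)  # logs min/max/mean/RMS for each column
```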
114 changes: 74 additions & 40 deletions src/eventdisplay_ml/evaluate.py
@@ -7,6 +7,8 @@
import xgboost as xgb
from sklearn.metrics import mean_absolute_error, mean_squared_error

from eventdisplay_ml.features import target_features

_logger = logging.getLogger(__name__)


@@ -25,6 +27,11 @@ def evaluation_efficiency(name, model, x_test, y_test):
pred = y_pred_proba >= t
eff_signal.append(((pred) & (y_test == 1)).sum() / n_signal if n_signal else 0)
eff_background.append(((pred) & (y_test == 0)).sum() / n_background if n_background else 0)
_logger.info(
f"{name} Threshold: {t:.2f} | "
f"Signal Efficiency: {eff_signal[-1]:.4f} | "
f"Background Efficiency: {eff_background[-1]:.4f}"
)

return pd.DataFrame(
{
@@ -63,20 +70,19 @@ def evaluate_regression_model(model, x_test, y_test, df, x_cols, y_data, name):
score = model.score(x_test, y_test)
_logger.info(f"XGBoost Multi-Target R^2 Score (Testing Set): {score:.4f}")
y_pred = model.predict(x_test)
-mse_x = mean_squared_error(y_test["MCxoff"], y_pred[:, 0])
-mse_y = mean_squared_error(y_test["MCyoff"], y_pred[:, 1])
-_logger.info(f"{name} MSE (X_off): {mse_x:.4f}, MSE (Y_off): {mse_y:.4f}")
-mae_x = mean_absolute_error(y_test["MCxoff"], y_pred[:, 0])
-mae_y = mean_absolute_error(y_test["MCyoff"], y_pred[:, 1])
-_logger.info(f"{name} MAE (X_off): {mae_x:.4f}")
-_logger.info(f"{name} MAE (Y_off): {mae_y:.4f}")
+mse = mean_squared_error(y_test, y_pred)
+_logger.info(f"{name} Mean Squared Error (All targets): {mse:.4f}")
+mae = mean_absolute_error(y_test, y_pred)
+_logger.info(f"{name} Mean Absolute Error (All targets): {mae:.4f}")

target_variance(y_test, y_pred, y_data.columns)
feature_importance(model, x_cols, y_data.columns, name)
if name == "xgboost":
shap_feature_importance(model, x_test, y_data.columns)

df_pred = pd.DataFrame(y_pred, columns=target_features("stereo_analysis"))
calculate_resolution(
-y_pred,
+df_pred,
y_test,
df,
percentiles=[68, 90, 95],
@@ -87,15 +93,33 @@ def evaluate_regression_model(model, x_test, y_test, df, x_cols, y_data, name):
)


def target_variance(y_test, y_pred, targets):
"""Calculate and log variance explained per target."""
y_test_np = y_test.to_numpy() if hasattr(y_test, "to_numpy") else y_test

mse_values = np.mean((y_test_np - y_pred) ** 2, axis=0)
variance_values = np.var(y_test_np, axis=0)

_logger.info("--- Performance Per Target ---")
for i, name in enumerate(targets):
# Fraction of variance unexplained (lower is better, 0.0 is perfect)
unexplained = mse_values[i] / variance_values[i]

_logger.info(
f"Target: {name:12s} | MSE: {mse_values[i]:.6f} | "
f"Unexplained Variance: {unexplained:.2%}"
)
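The ratio logged by `target_variance` is the fraction of variance unexplained, MSE/Var(y), which equals 1 - R^2 per target. A quick self-contained check on synthetic data (all values below are made up for the demonstration):

```python
# Synthetic check: per-target MSE/Var equals 1 - R^2 ("raw_values" per target).
import numpy as np
from sklearn.metrics import r2_score

rng = np.random.default_rng(0)
y_true = rng.normal(size=(1000, 2))
y_pred = y_true + rng.normal(scale=0.3, size=(1000, 2))

unexplained = np.mean((y_true - y_pred) ** 2, axis=0) / np.var(y_true, axis=0)
assert np.allclose(unexplained, 1.0 - r2_score(y_true, y_pred, multioutput="raw_values"))
```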


def calculate_resolution(y_pred, y_test, df, percentiles, log_e_min, log_e_max, n_bins, name):
"""Compute angular and energy resolution based on predictions."""
results_df = pd.DataFrame(
{
"MCxoff_true": y_test["MCxoff"].values,
"MCyoff_true": y_test["MCyoff"].values,
"MCxoff_pred": y_pred[:, 0],
"MCyoff_pred": y_pred[:, 1],
"MCe0_pred": y_pred[:, 2],
"MCxoff_pred": y_pred["MCxoff"].values,
"MCyoff_pred": y_pred["MCyoff"].values,
"MCe0_pred": y_pred["MCe0"].values,
"MCe0": df.loc[y_test.index, "MCe0"].values,
}
)
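The percentile resolution itself is computed in lines collapsed from this view; the sketch below shows a plausible per-event angular error that such percentiles would summarize. It is an assumption, not the PR's code, and it presumes the offsets are camera-plane coordinates in degrees:

```python
# Hypothetical per-event angular error feeding the percentile resolution;
# assumes MCxoff/MCyoff are camera-plane offsets in degrees.
import numpy as np

def angular_error(results_df):
    dx = results_df["MCxoff_pred"] - results_df["MCxoff_true"]
    dy = results_df["MCyoff_pred"] - results_df["MCyoff_true"]
    return np.hypot(dx, dy)  # e.g. np.percentile(angular_error(df), 68)
```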
@@ -134,50 +158,60 @@ def percentile_series(col, p):
_logger.info(f"\n{output_df.to_markdown(floatfmt='.4f')}")


-def _iter_targets(model, target_names):
-"""Iterate over targets in multi-/single-output models."""
-if hasattr(model, "estimators_"):  # MultiOutputRegressor
+def feature_importance(model, x_cols, target_names, name=None):
+"""Feature importance for both MultiOutputRegressor and native multi-target models."""
+_logger.info("--- XGBoost Feature Importance ---")

+# Case 1: Scikit-Learn MultiOutputRegressor
+if hasattr(model, "estimators_"):
for i, est in enumerate(model.estimators_):
-target = target_names[i] if i < len(target_names) else f"target_{i}"
-yield target, est
+target = target_names[i] if (target_names and i < len(target_names)) else f"target_{i}"
+_log_importance_table(target, est.feature_importances_, x_cols, name)

+# Case 2: Native Multi-target OR Single-target Classifier
else:
-target = target_names[0] if target_names else "target"
-yield target, model
+importances = getattr(model, "feature_importances_", None)

+if importances is not None:
+if target_names is not None and len(target_names) > 0:
+# Stringify the names so that join works regardless of input type
+target_str = ", ".join(map(str, target_names))
+else:
+target_str = "Target"

-def feature_importance(model, x_cols, target_names, name=None):
-"""Feature importance using built-in XGBoost method."""
-_logger.info("--- XGBoost Feature Importance ---")
+# Check if it's actually multi-target to set the log message
+if target_names is not None and len(target_names) > 1:
+_logger.info("Note: Native XGBoost multi-target provides JOINT importance.")

-for target, est in _iter_targets(model, target_names):
-importances = getattr(est, "feature_importances_", None)
-if importances is None:
-_logger.info("No feature_importances_ found.")
-continue
+_log_importance_table(target_str, importances, x_cols, name)

-df = pd.DataFrame({"Feature": x_cols, "Importance": importances}).sort_values(
-"Importance", ascending=False
-)
-_logger.info(f"### {name} Importance for Target: **{target}**")
-_logger.info(f"\n{df.head(25).to_markdown(index=False)}")


+def _log_importance_table(target_label, values, x_cols, name):
+"""Format and log the feature-importance table."""
+df = pd.DataFrame({"Feature": x_cols, "Importance": values}).sort_values(
+"Importance", ascending=False
+)
+_logger.info(f"### {name} Importance for: **{target_label}**")
+_logger.info(f"\n{df.head(25).to_markdown(index=False)}")


def shap_feature_importance(model, x_data, target_names, max_points=20000, n_top=25):
"""Feature importance using SHAP values from XGBoost."""
x_sample = x_data.sample(n=min(len(x_data), max_points), random_state=0)
"""Feature importance using SHAP values for native multi-target XGBoost."""
x_sample = x_data.sample(n=min(len(x_data), max_points), random_state=42)
n_features = len(x_data.columns)
n_targets = len(target_names)

for target, est in _iter_targets(model, target_names):
if not hasattr(est, "get_booster"):
_logger.info("Model does not support SHAP feature importance.")
continue
dmatrix = xgb.DMatrix(x_sample)
shap_vals = model.get_booster().predict(dmatrix, pred_contribs=True)
shap_vals = shap_vals.reshape(len(x_sample), n_targets, n_features + 1)

shap_vals = est.get_booster().predict(xgb.DMatrix(x_sample), pred_contribs=True)[:, :-1]
for i, target in enumerate(target_names):
target_shap = shap_vals[:, i, :-1]

imp = np.abs(shap_vals).mean(axis=0)
imp = np.abs(target_shap).mean(axis=0)
idx = np.argsort(imp)[::-1]

_logger.info(f"=== Builtin XGBoost SHAP Importance for {target} ===")
_logger.info(f"=== SHAP Importance for {target} ===")
for j in idx[:n_top]:
if j < n_features:
_logger.info(f"{x_data.columns[j]:25s} {imp[j]:.6e}")
25 changes: 18 additions & 7 deletions src/eventdisplay_ml/features.py
@@ -46,6 +46,9 @@ def excluded_features(analysis_type, ntel):
if "classification" in analysis_type:
return {
"Erec",
*[f"cen_x_{i}" for i in range(ntel)],
*[f"cen_y_{i}" for i in range(ntel)],
*[f"size_{i}" for i in range(ntel)],
*[f"E_{i}" for i in range(ntel)],
*[f"ES_{i}" for i in range(ntel)],
*[f"fpointing_dx_{i}" for i in range(ntel)],
@@ -54,19 +57,16 @@
raise ValueError(f"Unknown analysis type: {analysis_type}")


-def telescope_features(analysis_type, training):
+def telescope_features(analysis_type):
"""
Telescope-type features.

Disp variables with different indexing logic in data preparation.
"""
var = [
"cen_x",
"cen_y",
"cosphi",
"sinphi",
"loss",
"size",
"dist",
"width",
"length",
@@ -79,13 +79,24 @@
if analysis_type == "classification":
return var

-return [*var, "E", "ES", "Disp_T", "DispXoff_T", "DispYoff_T", "DispWoff_T"]
+return [
+*var,
+"size",
+"cen_x",
+"cen_y",
+"E",
+"ES",
+"Disp_T",
+"DispXoff_T",
+"DispYoff_T",
+"DispWoff_T",
+]
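Downstream, these per-telescope names are suffixed with a telescope index when the data are flattened (the `loss_0`, `size_1`, ... columns in data_processing.py). A sketch of that naming scheme; `ntel = 4` is an assumption, and the `Disp*_T` entries follow the different indexing noted in the docstring:

```python
# Sketch of the per-telescope column naming used in flattening; ntel is assumed.
ntel = 4
base = telescope_features("stereo_analysis")
flat_columns = [f"{name}_{i}" for i in range(ntel) for name in base]
# e.g. "cosphi_0", ..., "size_3"; Disp*_T entries are indexed differently
# during data preparation, per the docstring above.
```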


def _regression_features(training):
"""Regression features."""
var = [
*telescope_features("stereo_analysis", training),
*telescope_features("stereo_analysis"),
"DispNImages",
"DispTelList_T",
"Xoff",
Expand All @@ -103,7 +114,7 @@ def _regression_features(training):

def _classification_features(training):
"""Classification features."""
var_tel = telescope_features("classification", training)
var_tel = telescope_features("classification")
var_array = [
"DispNImages",
"DispTelList_T",
5 changes: 3 additions & 2 deletions src/eventdisplay_ml/models.py
@@ -252,6 +252,7 @@ def process_file_chunked(
Number of events to read and process per chunk.
"""
branch_list = features.features(analysis_type, training=False)
_logger.info(f"Using branches: {branch_list}")
selected_indices = parse_image_selection(image_selection)

_logger.info(f"Chunk size: {chunk_size}")
@@ -325,7 +326,7 @@ def _output_tree(analysis_type, root_file):

def _apply_model(analysis_type, df_chunk, models, tree):
"""
-Apply regression models to the data chunk.
+Apply models to the data chunk.

Parameters
----------
@@ -334,7 +335,7 @@ def _apply_model(analysis_type, df_chunk, models, tree):
df_chunk : pandas.DataFrame
Data chunk to process.
models : dict
-Dictionary of loaded XGBoost models for regression.
+Dictionary of loaded XGBoost models.
tree : uproot.writing.WritingTTree
Output tree to write results to.
"""