From 14ba385f128f9ceaff4886ba6d769b47289f01d4 Mon Sep 17 00:00:00 2001 From: Maxime De Bois Date: Fri, 25 Apr 2025 10:08:15 +0200 Subject: [PATCH 1/2] fix not taking games played on last date --- pandaskill/app/leaderboard_page.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandaskill/app/leaderboard_page.py b/pandaskill/app/leaderboard_page.py index 19fd2b1..9c49654 100644 --- a/pandaskill/app/leaderboard_page.py +++ b/pandaskill/app/leaderboard_page.py @@ -6,6 +6,7 @@ import matplotlib.pyplot as plt import seaborn as sns from pandaskill.app.misc import compute_rating_lower_bound +from datetime import timedelta def display_leaderboard_page(data): """ @@ -80,7 +81,7 @@ def _get_leaderboard_parameters(data): since = date - dt.timedelta(days=30*6) parameters = { "since": since.strftime("%Y-%m-%d"), - "date": date.strftime("%Y-%m-%d"), + "date": (date + timedelta(days=1)).strftime("%Y-%m-%d"), "min_nb_games": min_nb_games } data = data.loc[data["date"] <= dt.datetime.combine(date, dt.datetime.min.time())] From 016057a0dc553980d2ecd84f7ca9f8600b006328 Mon Sep 17 00:00:00 2001 From: Maxime De Bois Date: Tue, 27 May 2025 17:07:42 +0200 Subject: [PATCH 2/2] improve shap values viz --- .../experiments/general/visualization.py | 26 +++--- .../performance_score/training_testing_cv.py | 89 +++++++++++-------- .../performance_score/visualization.py | 18 ++-- .../run_performance_score_experiment.py | 2 +- 4 files changed, 72 insertions(+), 63 deletions(-) diff --git a/pandaskill/experiments/general/visualization.py b/pandaskill/experiments/general/visualization.py index 9c04f9e..5a846c0 100644 --- a/pandaskill/experiments/general/visualization.py +++ b/pandaskill/experiments/general/visualization.py @@ -1,5 +1,5 @@ from pandaskill.experiments.general.metrics import compute_ece_from_binned_df, bin_predictions_equal_size -from pandaskill.experiments.general.utils import ALL_REGIONS +from pandaskill.experiments.general.utils import ALL_REGIONS, ROLES import matplotlib.pyplot as plt import numpy as np import os @@ -62,22 +62,23 @@ def plot_violin_distributions( df = df.copy() - if x_column == "region": + if x_column in ["role", "region"]: + reference_list = ALL_REGIONS if x_column == "region" else ROLES region_order_dict = { region: i - for i, region in enumerate(ALL_REGIONS) + for i, region in enumerate(reference_list) } - df["region_order"] = df["region"].map(region_order_dict) + df["order"] = df[x_column].map(region_order_dict) + df = df.sort_values("order") - format_region = lambda region: region.replace(" ", "\n").replace("-", "\n") - df["region"] = df["region"].apply(format_region) + if x_column == "region": # format region names + format_region = lambda region: region.replace(" ", "\n").replace("-", "\n") + df[x_column] = df[x_column].apply(format_region) + reference_list = [ + format_region(region) for region in reference_list + ] - df = df.sort_values("region_order") - - all_regions_formatted = [ - format_region(region) for region in ALL_REGIONS - ] - color_palette = dict(zip(all_regions_formatted, color_palette)) + color_palette = dict(zip(reference_list, color_palette)) else: nb_unique_hue = len(df[x_column].unique()) color_palette = color_palette[:nb_unique_hue] @@ -104,5 +105,4 @@ def plot_violin_distributions( plt.tight_layout() plt.savefig(join(saving_dir, f"{file_name[:-4]}.pdf")) - # plt.savefig(join(saving_dir, file_name)) plt.close() \ No newline at end of file diff --git a/pandaskill/experiments/performance_score/training_testing_cv.py b/pandaskill/experiments/performance_score/training_testing_cv.py index f725fa1..b0718ff 100644 --- a/pandaskill/experiments/performance_score/training_testing_cv.py +++ b/pandaskill/experiments/performance_score/training_testing_cv.py @@ -49,9 +49,12 @@ def compute_performance_scores_cv_loop( features_importance_list.append(feature_importances_fold) calibration_data.append(calibration_data_fold) - if evaluation_config["visualize_shap_values"]: - _visualize_shap_values( - data, features, test_game_ids, evaluation_config["specific_games_analysis"], models_list, roles, experiment_dir + _visualize_shap_values_single_game( + data, features, test_game_ids, evaluation_config["specific_games_analysis"], models_list, roles, experiment_dir + ) + if evaluation_config["visualize_shap_values_distributions"]: + _visualize_shap_values_distributions( + data, features, test_game_ids, models_list, roles, experiment_dir ) plot_all_models_calibration(calibration_data, experiment_dir) @@ -63,7 +66,7 @@ def compute_performance_scores_cv_loop( return performance_scores_df, evaluation_metrics -def _visualize_shap_values( +def _visualize_shap_values_single_game( data: pd.DataFrame, features: list, game_ids_cv: np.ndarray, @@ -71,12 +74,53 @@ def _visualize_shap_values( models_cv: dict, roles: list, experiment_dir: str +) -> None: + if not specific_game_ids: + return + + saving_folder = join(experiment_dir, "shap_values") + explainers_dict = { + role: [shap.Explainer(model_role_dict[role].model) for model_role_dict in models_cv] + for role in roles + } + for game_id in specific_game_ids: + fold_idx = next(i for i, g in enumerate(game_ids_cv) if game_id in g) + game_saving_folder = os.path.join(saving_folder, f"game_{game_id}") + os.makedirs(game_saving_folder, exist_ok=True) + + X_game = data.xs(game_id, level="game_id") + for player_id, row in X_game.iterrows(): + role = row["role"] + player_name = row["player_name"] + + X = row[features].values.reshape(1, -1) + model_obj = models_cv[fold_idx][role] + X_norm = model_obj.scaler.transform(X) + + explainer = explainers_dict[role][fold_idx] + shap_vals = explainer(X_norm) + + plot_shap_game_features_impact( + explainer=explainer, + shap_values=shap_vals.values[0], + feature_values_df= pd.Series(X[0], index=features, name=player_id), + file_name= f"{role}_{game_id}_{player_name}.png", + saving_folder=game_saving_folder, + nb_features_to_display=10, + show_xlabel=True + ) + +def _visualize_shap_values_distributions( + data: pd.DataFrame, + features: list, + game_ids_cv: np.ndarray, + models_cv: dict, + roles: list, + experiment_dir: str ) -> None: saving_folder = join(experiment_dir, "shap_values") shap_values_dict = {} feature_values_dict = {} - explainers_dict = {} - for role in roles: shap_values, feature_values, all_game_ids, explainers = [], [], [], [] for game_ids, model_role_dict in zip(game_ids_cv, models_cv): @@ -93,7 +137,6 @@ def _visualize_shap_values( shap_values_array = np.concatenate(shap_values, axis=0) shap_values_dict[role] = shap_values_array feature_values_dict[role] = feature_values_df - explainers_dict[role] = explainers file_name = "combined_shap_features_impact.png" plot_multiple_shap_features_impact( @@ -104,38 +147,6 @@ def _visualize_shap_values( saving_folder=saving_folder, max_display=len(features) ) - - for role in roles: - shap_values_role = shap_values_dict[role] - shap_values = pd.DataFrame(shap_values_role, index=[game_id for game_id in all_game_ids for _ in range(2)], columns=features) - for game_id in specific_game_ids: - game_id_fold_index = next( - (fold_index for fold_index, game_ids in enumerate(game_ids_cv) if game_id in game_ids), - None - ) - - explainer_game = explainers_dict[role][game_id_fold_index] - - shap_values_game = shap_values.loc[game_id] - feature_values_game = feature_values_dict[role].loc[game_id] - - game_saving_folder = join(saving_folder, f"game_{game_id}") - os.makedirs(game_saving_folder, exist_ok=True) - - for player_index in range(len(shap_values_game)): - player_id = feature_values_game.iloc[player_index].name - player_name = data.loc[(game_id, player_id), "player_name"] - - file_name = f"{role}_shap_features_impact_{game_id}_{player_name}.png" - - plot_shap_game_features_impact( - explainer=explainer_game, - shap_values=shap_values_game.iloc[player_index].values, - feature_values_df=feature_values_game.iloc[player_index], - title=f"SHAP values for player {player_name} in game {game_id} with role {role}", - file_name=file_name, - saving_folder=game_saving_folder - ) def _compute_game_id_cross_validation( data: pd.DataFrame, diff --git a/pandaskill/experiments/performance_score/visualization.py b/pandaskill/experiments/performance_score/visualization.py index 9230390..f43b88d 100644 --- a/pandaskill/experiments/performance_score/visualization.py +++ b/pandaskill/experiments/performance_score/visualization.py @@ -31,8 +31,8 @@ def visualize_performance_scores( "role", "performance_score", "Performance score distribution per role", - "Region", - "Player Performance in Game", + "Role", + "PScore", experiment_dir, "performance_score_per_role.png" ) @@ -89,7 +89,6 @@ def plot_all_models_calibration( plt.tight_layout() - # plt.savefig(join(saving_dir, "full_calibration_plots.png")) plt.savefig(join(saving_dir, "full_calibration_plots.pdf")) plt.close() @@ -117,9 +116,10 @@ def plot_shap_game_features_impact( explainer: shap.Explainer, shap_values: np.ndarray, feature_values_df: pd.DataFrame, - title: str, file_name: str, - saving_folder: str + saving_folder: str, + nb_features_to_display: int = 5, + show_xlabel: bool = True ) -> None: os.makedirs(saving_folder, exist_ok=True) feature_values_df = feature_values_df.rename(index=feature_str_dict) @@ -131,11 +131,12 @@ def plot_shap_game_features_impact( ) shap.waterfall_plot( explanation, - max_display=10, + max_display=nb_features_to_display, show=False, ) fig = plt.gcf() - fig.axes[0].set_xlabel("SHAP Values", labelpad=20, fontsize=14) # Set xlabel for the last axis (waterfall plot) + if show_xlabel: + fig.axes[0].set_xlabel("SHAP Values", labelpad=20, fontsize=14) # Set xlabel for the last axis (waterfall plot) fig.tight_layout() plt.savefig( join(saving_folder, file_name[:-4] + ".pdf"), @@ -151,9 +152,6 @@ def plot_multiple_shap_features_impact( saving_folder: str, max_display: int = 10 ) -> None: - - - os.makedirs(saving_folder, exist_ok=True) fig, axes = plt.subplots(nrows=1, ncols=5, figsize=(12,6)) diff --git a/pandaskill/experiments/run_performance_score_experiment.py b/pandaskill/experiments/run_performance_score_experiment.py index 32e48ff..dcd0ad0 100644 --- a/pandaskill/experiments/run_performance_score_experiment.py +++ b/pandaskill/experiments/run_performance_score_experiment.py @@ -71,7 +71,7 @@ def _get_model_class_from_name(model_name: str) -> callable: "one_model_per_role": True, }, "visualization": { - "visualize_shap_values": False, # activating this will significantly slow down the computation + "visualize_shap_values_distributions": False, # activating this will significantly slow down the computation "specific_games_analysis": [ 36348, # close game - LCK 2024 ]