From 14ba385f128f9ceaff4886ba6d769b47289f01d4 Mon Sep 17 00:00:00 2001
From: Maxime De Bois <maxime.debois@pandascore.co>
Date: Fri, 25 Apr 2025 10:08:15 +0200
Subject: [PATCH 1/2] Fix leaderboard not including games played on the last date

---
 pandaskill/app/leaderboard_page.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/pandaskill/app/leaderboard_page.py b/pandaskill/app/leaderboard_page.py
index 19fd2b1..9c49654 100644
--- a/pandaskill/app/leaderboard_page.py
+++ b/pandaskill/app/leaderboard_page.py
@@ -6,6 +6,7 @@
 import matplotlib.pyplot as plt
 import seaborn as sns
 from pandaskill.app.misc import compute_rating_lower_bound
+from datetime import timedelta
 
 def display_leaderboard_page(data):
     """
@@ -80,7 +81,7 @@ def _get_leaderboard_parameters(data):
     since = date - dt.timedelta(days=30*6)
     parameters = {
         "since": since.strftime("%Y-%m-%d"),
-        "date": date.strftime("%Y-%m-%d"),
+        "date": (date + timedelta(days=1)).strftime("%Y-%m-%d"),
         "min_nb_games": min_nb_games
     }
     data = data.loc[data["date"] <= dt.datetime.combine(date, dt.datetime.min.time())]

From 016057a0dc553980d2ecd84f7ca9f8600b006328 Mon Sep 17 00:00:00 2001
From: Maxime De Bois <maxime.debois@pandascore.co>
Date: Tue, 27 May 2025 17:07:42 +0200
Subject: [PATCH 2/2] Improve SHAP values visualization

---
 .../experiments/general/visualization.py      | 26 +++---
 .../performance_score/training_testing_cv.py  | 89 +++++++++++--------
 .../performance_score/visualization.py        | 18 ++--
 .../run_performance_score_experiment.py       |  2 +-
 4 files changed, 72 insertions(+), 63 deletions(-)

diff --git a/pandaskill/experiments/general/visualization.py b/pandaskill/experiments/general/visualization.py
index 9c04f9e..5a846c0 100644
--- a/pandaskill/experiments/general/visualization.py
+++ b/pandaskill/experiments/general/visualization.py
@@ -1,5 +1,5 @@
 from pandaskill.experiments.general.metrics import compute_ece_from_binned_df, bin_predictions_equal_size
-from pandaskill.experiments.general.utils import ALL_REGIONS
+from pandaskill.experiments.general.utils import ALL_REGIONS, ROLES
 import matplotlib.pyplot as plt
 import numpy as np
 import os
@@ -62,22 +62,23 @@ def plot_violin_distributions(
 
     df = df.copy()
     
-    if x_column == "region":
+    if x_column in ["role", "region"]:
+        reference_list = ALL_REGIONS if x_column == "region" else ROLES
         region_order_dict = {
             region: i
-            for i, region in enumerate(ALL_REGIONS)
+            for i, region in enumerate(reference_list)
         }
-        df["region_order"] = df["region"].map(region_order_dict)
+        df["order"] = df[x_column].map(region_order_dict)
+        df = df.sort_values("order")
 
-        format_region = lambda region: region.replace(" ", "\n").replace("-", "\n")
-        df["region"] = df["region"].apply(format_region)
+        if x_column == "region": # format region names
+            format_region = lambda region: region.replace(" ", "\n").replace("-", "\n")
+            df[x_column] = df[x_column].apply(format_region)
+            reference_list = [
+                format_region(region) for region in reference_list
+            ]
         
-        df = df.sort_values("region_order")
-        
-        all_regions_formatted = [
-            format_region(region) for region in ALL_REGIONS
-        ]
-        color_palette = dict(zip(all_regions_formatted, color_palette))
+        color_palette = dict(zip(reference_list, color_palette))        
     else:
         nb_unique_hue = len(df[x_column].unique())
         color_palette = color_palette[:nb_unique_hue]
@@ -104,5 +105,4 @@ def plot_violin_distributions(
     plt.tight_layout()
 
     plt.savefig(join(saving_dir, f"{file_name[:-4]}.pdf"))
-    # plt.savefig(join(saving_dir, file_name))
     plt.close()
\ No newline at end of file
diff --git a/pandaskill/experiments/performance_score/training_testing_cv.py b/pandaskill/experiments/performance_score/training_testing_cv.py
index f725fa1..b0718ff 100644
--- a/pandaskill/experiments/performance_score/training_testing_cv.py
+++ b/pandaskill/experiments/performance_score/training_testing_cv.py
@@ -49,9 +49,12 @@ def compute_performance_scores_cv_loop(
         features_importance_list.append(feature_importances_fold)
         calibration_data.append(calibration_data_fold)
 
-    if evaluation_config["visualize_shap_values"]:
-        _visualize_shap_values(
-            data, features, test_game_ids, evaluation_config["specific_games_analysis"], models_list, roles, experiment_dir
+    _visualize_shap_values_single_game(
+        data, features, test_game_ids, evaluation_config["specific_games_analysis"], models_list, roles, experiment_dir
+    )
+    if evaluation_config["visualize_shap_values_distributions"]:
+        _visualize_shap_values_distributions(
+            data, features, test_game_ids, models_list, roles, experiment_dir
         )
 
     plot_all_models_calibration(calibration_data, experiment_dir)
@@ -63,7 +66,7 @@ def compute_performance_scores_cv_loop(
 
     return performance_scores_df, evaluation_metrics
 
-def _visualize_shap_values(
+def _visualize_shap_values_single_game(
     data: pd.DataFrame, 
     features: list, 
     game_ids_cv: np.ndarray, 
@@ -71,12 +74,53 @@ def _visualize_shap_values(
     models_cv: dict, 
     roles: list,
     experiment_dir: str
+) -> None:    
+    if not specific_game_ids:
+        return
+    
+    saving_folder = join(experiment_dir, "shap_values")
+    explainers_dict = {
+        role: [shap.Explainer(model_role_dict[role].model) for model_role_dict in models_cv]
+        for role in roles
+    }
+    for game_id in specific_game_ids:
+        fold_idx = next(i for i, g in enumerate(game_ids_cv) if game_id in g)
+        game_saving_folder = os.path.join(saving_folder, f"game_{game_id}")
+        os.makedirs(game_saving_folder, exist_ok=True)
+
+        X_game = data.xs(game_id, level="game_id")
+        for player_id, row in X_game.iterrows():
+            role = row["role"]
+            player_name = row["player_name"]
+
+            X  = row[features].values.reshape(1, -1)
+            model_obj = models_cv[fold_idx][role]
+            X_norm = model_obj.scaler.transform(X)
+
+            explainer = explainers_dict[role][fold_idx]
+            shap_vals = explainer(X_norm)
+
+            plot_shap_game_features_impact(
+                explainer=explainer,
+                shap_values=shap_vals.values[0],
+                feature_values_df= pd.Series(X[0], index=features, name=player_id),
+                file_name= f"{role}_{game_id}_{player_name}.png",
+                saving_folder=game_saving_folder,
+                nb_features_to_display=10,
+                show_xlabel=True
+            )
+
+def _visualize_shap_values_distributions(
+    data: pd.DataFrame, 
+    features: list, 
+    game_ids_cv: np.ndarray,
+    models_cv: dict, 
+    roles: list,
+    experiment_dir: str
 ) -> None:    
     saving_folder = join(experiment_dir, "shap_values")
     shap_values_dict = {}
     feature_values_dict = {}
-    explainers_dict = {}
-    
     for role in roles:
         shap_values, feature_values, all_game_ids, explainers = [], [], [], []
         for game_ids, model_role_dict in zip(game_ids_cv, models_cv):
@@ -93,7 +137,6 @@ def _visualize_shap_values(
         shap_values_array = np.concatenate(shap_values, axis=0)
         shap_values_dict[role] = shap_values_array
         feature_values_dict[role] = feature_values_df
-        explainers_dict[role] = explainers
     
     file_name = "combined_shap_features_impact.png"
     plot_multiple_shap_features_impact(
@@ -104,38 +147,6 @@ def _visualize_shap_values(
         saving_folder=saving_folder,
         max_display=len(features)
     )
-
-    for role in roles:
-        shap_values_role = shap_values_dict[role]
-        shap_values = pd.DataFrame(shap_values_role, index=[game_id for game_id in all_game_ids for _ in range(2)], columns=features)
-        for game_id in specific_game_ids:
-            game_id_fold_index = next(
-                (fold_index for fold_index, game_ids in enumerate(game_ids_cv) if game_id in game_ids),
-                None
-            )
-                        
-            explainer_game = explainers_dict[role][game_id_fold_index]
-            
-            shap_values_game = shap_values.loc[game_id]
-            feature_values_game = feature_values_dict[role].loc[game_id]
-            
-            game_saving_folder = join(saving_folder, f"game_{game_id}")
-            os.makedirs(game_saving_folder, exist_ok=True)
-            
-            for player_index in range(len(shap_values_game)):
-                player_id = feature_values_game.iloc[player_index].name
-                player_name = data.loc[(game_id, player_id), "player_name"]
-                
-                file_name = f"{role}_shap_features_impact_{game_id}_{player_name}.png"
-                
-                plot_shap_game_features_impact(
-                    explainer=explainer_game,
-                    shap_values=shap_values_game.iloc[player_index].values, 
-                    feature_values_df=feature_values_game.iloc[player_index], 
-                    title=f"SHAP values for player {player_name} in game {game_id} with role {role}",
-                    file_name=file_name, 
-                    saving_folder=game_saving_folder
-                )
             
 def _compute_game_id_cross_validation(
     data: pd.DataFrame,
diff --git a/pandaskill/experiments/performance_score/visualization.py b/pandaskill/experiments/performance_score/visualization.py
index 9230390..f43b88d 100644
--- a/pandaskill/experiments/performance_score/visualization.py
+++ b/pandaskill/experiments/performance_score/visualization.py
@@ -31,8 +31,8 @@ def visualize_performance_scores(
         "role",
         "performance_score",
         "Performance score distribution per role",
-        "Region",
-        "Player Performance in Game",
+        "Role",
+        "PScore",
         experiment_dir,
         "performance_score_per_role.png"
     )
@@ -89,7 +89,6 @@ def plot_all_models_calibration(
 
     plt.tight_layout()
 
-    # plt.savefig(join(saving_dir, "full_calibration_plots.png"))
     plt.savefig(join(saving_dir, "full_calibration_plots.pdf"))
     plt.close()
 
@@ -117,9 +116,10 @@ def plot_shap_game_features_impact(
     explainer: shap.Explainer,
     shap_values: np.ndarray,
     feature_values_df: pd.DataFrame,
-    title: str,
     file_name: str,
-    saving_folder: str
+    saving_folder: str,
+    nb_features_to_display: int = 5,
+    show_xlabel: bool = True
 ) -> None:
     os.makedirs(saving_folder, exist_ok=True)
     feature_values_df = feature_values_df.rename(index=feature_str_dict)
@@ -131,11 +131,12 @@ def plot_shap_game_features_impact(
     )
     shap.waterfall_plot(
         explanation,
-        max_display=10,
+        max_display=nb_features_to_display,
         show=False,
     )
     fig = plt.gcf()
-    fig.axes[0].set_xlabel("SHAP Values", labelpad=20, fontsize=14)  # Set xlabel for the last axis (waterfall plot)
+    if show_xlabel:
+        fig.axes[0].set_xlabel("SHAP Values", labelpad=20, fontsize=14)  # Set xlabel for the last axis (waterfall plot)
     fig.tight_layout()  
     plt.savefig(
         join(saving_folder, file_name[:-4] + ".pdf"),
@@ -151,9 +152,6 @@ def plot_multiple_shap_features_impact(
     saving_folder: str,
     max_display: int = 10
 ) -> None:
-    
-
-
     os.makedirs(saving_folder, exist_ok=True)
     
     fig, axes = plt.subplots(nrows=1, ncols=5, figsize=(12,6))
diff --git a/pandaskill/experiments/run_performance_score_experiment.py b/pandaskill/experiments/run_performance_score_experiment.py
index 32e48ff..dcd0ad0 100644
--- a/pandaskill/experiments/run_performance_score_experiment.py
+++ b/pandaskill/experiments/run_performance_score_experiment.py
@@ -71,7 +71,7 @@ def _get_model_class_from_name(model_name: str) -> callable:
             "one_model_per_role": True,
         },
         "visualization": {
-            "visualize_shap_values": False, # activating this will significantly slow down the computation
+            "visualize_shap_values_distributions": False, # activating this will significantly slow down the computation
             "specific_games_analysis": [
                 36348, # close game - LCK 2024
             ]