# Source: RankingExplainers/rankingSHAP.py at main · jacons/RankingExplainers (GitHub)
"""
================================================================================
-- This is the improved and optimized version of the RankingSHAP algorithm --
Source : https://github.com/MariaHeuss/RankingShap
================================================================================
Author: Andrea Iommi
Code Ownership:
    - All Python source code in this file is written solely by the author.
Documentation Notice:
    - All docstrings and inline documentation are written by ChatGPT,
      but thoroughly checked and approved by the author for accuracy.
================================================================================
"""

from typing import Callable

import numpy as np
import shap
from numpy import ndarray, asarray, full, empty_like, arange, where

# A ranker maps a set of document feature vectors to one score per document.
RankerFunc = Callable[[ndarray], ndarray]
# A characterization function compares (original ranks, perturbed ranks) and returns one number.
CharFunc = Callable[[ndarray, ndarray], float]


class SurrogateFunction:
    """
    This class acts as a surrogate model for a black-box ranker, as described in the RankingSHAP paper.
    It adapts the ranking model's output to a single value, making it compatible with the SHAP framework.
    For every perturbed sample it rebuilds the document set, re-ranks it with the ranker, and lets the
    characterization function compare the new ranking with the ranking of the unperturbed documents.
    """

    def __init__(self, ranker: RankerFunc, char_function: CharFunc):
        """
        Initializes the SurrogateFunction with a ranking model and a characterization function.

        Parameters
        ----------
            ranker (RankerFunc): The black-box ranking model that takes a set of document feature vectors
                                  and returns their scores.
            char_function (CharFunc): A function called as ``char_function(original_ranks, new_ranks)``
                                        that reduces the two rankings to a single value.
                                        An example is Kendall's tau correlation.
        """
        self.ranker = ranker
        self.char_function = char_function
        # Backing storage of the `original_x` property. It is written directly here so that
        # construction does not go through the setter (nothing can be ranked yet).
        self._original_x = None
        self.og_rank = None

    @property
    def original_x(self):
        """
        Returns the document feature vectors currently being explained (None until they are set).
        """
        return self._original_x

    @original_x.setter
    def original_x(self, original_x: ndarray):
        """
        Stores the document feature vectors and caches their ranks in ``og_rank``.
        Assigning None clears both values without calling the ranker.
        """
        self._original_x = original_x
        self.og_rank = None if original_x is None else self.ranking(original_x)

    @staticmethod
    def rank_list(vector: ndarray) -> ndarray:
        """
        Generates a ranked list from a vector of scores. The highest score gets rank 1.

        Parameters:
        ----------
            vector (ndarray): A one-dimensional array (or array-like) of scores.

        Returns:
        ----------
            ndarray: An integer array where position i holds the rank of score i.
        """
        # Accept any array-like so rankers returning plain lists work as well.
        scores = asarray(vector)
        # Indices of the scores from highest to lowest.
        order = scores.argsort()[::-1]
        ranks = empty_like(order)
        # The document found at sorted position k receives rank k + 1.
        ranks[order] = arange(1, len(scores) + 1)
        return ranks

    def ranking(self, vector: ndarray) -> ndarray:
        """
        Computes the ranking of the document feature vectors using the ranker and returns their ranks.

        Parameters:
        ----------
            vector (ndarray): A set of document feature vectors.

        Returns:
        ----------
            ndarray: A numpy array representing the ranks of the documents.
        """
        return self.rank_list(self.ranker(vector))

    def __call__(self, z: ndarray) -> ndarray:
        """
        Calculates the impact of perturbed feature vectors on the ranking. This method is the model
        function handed to the SHAP KernelExplainer.

        Parameters:
        ----------
            z (ndarray): Perturbation samples, one row per sample and one column per feature. An entry
                            that is NaN (or None) means "use each document's original value for this
                            feature"; any other entry is a replacement value applied to that feature
                            of every document. A single 1-D row is treated as one sample.

        Returns:
        ----------
            ndarray: One value per sample: the characterization function applied to the original ranks
                     and the ranks of the perturbed document set.

        Raises:
        ----------
            ValueError: If ``original_x`` has not been set.
        """
        if self._original_x is None:
            raise ValueError("original_x must be set before the surrogate function is called.")

        # np.isnan does not support object arrays, so cast first; the cast turns None into NaN.
        samples = np.atleast_2d(asarray(z, dtype=float))
        documents = asarray(self._original_x)

        # Broadcast (samples, 1, features) against (1, documents, features): every sample yields a
        # full document set in which NaN positions keep the document's own feature value.
        use_original = np.isnan(samples)[:, None, :]
        adjusted_features = where(use_original, documents[None, ...], samples[:, None, :])

        # Re-rank each perturbed document set and reduce it to a single value.
        return asarray(
            [self.char_function(self.og_rank, self.ranking(doc_set)) for doc_set in adjusted_features]
        )


class RankinSHAP:
    """
    RankingSHAP is a feature attribution method that extends SHAP to explain listwise ranking decisions.
    It works by wrapping SHAP's KernelExplainer and using a SurrogateFunction to adapt the
    ranking output to a single value that can be explained.
    """

    def __init__(self, background_data: ndarray, ranker: RankerFunc, char_function: CharFunc):
        """
        Initializes the RankingSHAP explainer.

        Parameters:
        ----------
            background_data (ndarray): The background data, which should represent the real data
                                        distribution and is used for masking features. It must
                                        contain at least one row.
            ranker (RankerFunc): The black-box ranking model.
            char_function (CharFunc): A function to characterize the impact of perturbations on the ranking,
                                      such as Kendall's tau.
        """
        self.surrogate = SurrogateFunction(ranker, char_function)
        self.background_data = background_data

        # Shape and length of a single background row; the placeholder instance built in
        # `explain` uses the same shape.
        self.feature_shape = np.shape(background_data[0])
        self.num_features = len(background_data[0])

    def explain(self, x: ndarray, nsamples: "int | str" = "auto") -> ndarray:
        """
        Computes the SHAP values for the given document set.

        Parameters:
        ----------
            x (ndarray): The document feature vectors to be explained.
            nsamples (int | str): Forwarded to KernelExplainer; the default "auto" matches the
                                  previously hard-coded value.

        Returns:
        ----------
            ndarray: The first entry of the value returned by ``KernelExplainer.shap_values`` for
                     the single placeholder instance.
        """
        # Register the documents with the surrogate, which also caches their original ranking.
        self.surrogate.original_x = x

        # The explainer is built from the surrogate function and the background data.
        # NOTE(review): `nsamples` is passed to the constructor as in the original code;
        # confirm whether the installed shap version uses it there.
        shap_explainer = shap.KernelExplainer(
            self.surrogate, self.background_data, nsamples=nsamples
        )

        # The single instance to explain is a placeholder row. The surrogate detects placeholder
        # entries with np.isnan, which does not support object arrays of None, so the row is
        # filled with float NaN instead.
        placeholder = full((1,) + tuple(self.feature_shape), np.nan, dtype=float)
        exp = shap_explainer.shap_values(placeholder, nsamples=nsamples)[0]

        return exp