deberta-tensorrt/deberta_onnx_modify.py at master · symphonylyh/deberta-tensorrt · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
#
# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

# python deberta_onnx_modify.py deberta.onnx # for modified model with plugin nodes
# python deberta_onnx_modify.py deberta.onnx --correctness_check # for correctness check

import onnx
import onnx_graphsurgeon as gs
import argparse, os
import numpy as np

# Selects the fusion pass used when not in correctness-check mode:
# 1 -> insert_disentangled_attention_all_v1, 2 -> insert_disentangled_attention_all_v2
PLUGIN_VERSION = 2

parser = argparse.ArgumentParser(
    description=(
        "Modify DeBERTa ONNX model to prepare for Disentangled Attention Plugin. "
        "This will save the modified model under the same directory with "
        "'_plugin.onnx' appended to the filename."
    )
)
parser.add_argument('input', type=str, help='Path to the input ONNX model')
parser.add_argument(
    '--output',
    type=str,
    help="Path to the output ONNX model. If not set, default to the input file name with a suffix of '_plugin' ",
)
parser.add_argument('--correctness_check', action='store_true')

args = parser.parse_args()

model_input = args.input
correctness_check = args.correctness_check

# Unless --output is given, write "<input stem>_plugin<input extension>" next to the input file
input_stem, input_ext = os.path.splitext(model_input)
model_output = args.output if args.output is not None else input_stem + "_plugin" + input_ext

def isolate_node(graph, node_name):
    '''
    Isolate every node whose op type equals `node_name` behind 'NullPlugin' nodes.

    Each input edge   T -> node   is rewired to   T -> NullPlugin -> T' -> node
    and each output edge   node -> T   to   node -> T' -> NullPlugin -> T,
    where T' is a new variable named after T with a trailing "'".

    graph: graph whose `nodes` list is searched and extended in place
    node_name: op type (compared against `node.op`) of the nodes to isolate
    returns: the same `graph` object

    The inserted NullPlugin nodes are named 'null_0', 'null_1', ... so that their
    names stay distinct from each other (previously all of them were called 'null').
    NOTE(review): a tensor that feeds matching nodes through several edges still
    yields several T' variables with the same name -- confirm whether that matters.
    '''
    def find_idx(tensor, node, tensor_type):
        '''
        Find the index of the node w.r.t. input/output tensor.
        tensor_type: str, 'input' (search tensor.outputs) or 'output' (search tensor.inputs)
        Raises AssertionError when the tensor and the node are not connected.
        '''
        if tensor_type == 'input':
            linked_nodes = tensor.outputs
        elif tensor_type == 'output':
            linked_nodes = tensor.inputs # although usually a tensor has only one input node
        else:
            linked_nodes = []
        for idx, linked in enumerate(linked_nodes):
            # match by identity: node names may be empty or duplicated, which would pick the wrong edge
            if linked is node:
                return idx
        # raised explicitly (not via `assert`) so the check is not stripped under `python -O`
        raise AssertionError('Tensor and Node are not connected!')

    targets = [node for node in graph.nodes if node.op == node_name]
    null_count = 0 # running suffix that keeps the NullPlugin node names unique
    for node in targets:
        ## modify inputs
        new_inputs = []
        # deleting the node from a tensor's outputs also removes the tensor from node.inputs,
        # so a for loop can't work; instead loop until all input edges are disconnected
        while node.inputs:
            tensor = node.inputs[0]
            # disconnect only this node; the tensor may still feed other consumers
            del tensor.outputs[find_idx(tensor, node, 'input')]

            # add null plugin node
            null = gs.Node(op='NullPlugin', name=f'null_{null_count}')
            null_count += 1
            graph.nodes.append(null)

            # create intermediate tensor (new edge, I')
            tensor_prime = gs.Variable(name=tensor.name+"'")

            # reconnect. tensor.outputs.append(null) is equivalent to null.inputs.append(tensor);
            # doing both would add the tensor twice
            tensor.outputs.append(null)
            null.outputs.append(tensor_prime)
            new_inputs.append(tensor_prime)

        # reconnect new input tensors to node (can't be done in the loop above, which keeps reading node.inputs)
        for new_input in new_inputs:
            node.inputs.append(new_input) # this also adds node to the new tensor's output nodes

        ## modify outputs
        new_outputs = []
        while node.outputs:
            tensor = node.outputs[0]
            # disconnect the output tensor from the node; this also removes the tensor from node.outputs
            del tensor.inputs[find_idx(tensor, node, 'output')]

            # add null plugin node
            null = gs.Node(op='NullPlugin', name=f'null_{null_count}')
            null_count += 1
            graph.nodes.append(null)

            # create intermediate tensor (new edge, O')
            tensor_prime = gs.Variable(name=tensor.name+"'")

            # reconnect. tensor.inputs.append(null) is equivalent to null.outputs.append(tensor); don't do it twice
            tensor.inputs.append(null)
            null.inputs.append(tensor_prime)
            new_outputs.append(tensor_prime)

        # reconnect node to new output tensors (can't be done in the loop above, which keeps reading node.outputs)
        for new_output in new_outputs:
            node.outputs.append(new_output)

    return graph

# example: https://www.nvidia.com/en-us/on-demand/session/gtcspring21-s31695/
@gs.Graph.register()
def insert_disentangled_attention_v1(self, inputs, outputs):
    '''
    Fuse disentangled attention module (Gather + Gather + Transpose + Add)
    into a single 'DisentangledAttention_TRT' node.

    inputs: list of tensors fed to the plugin node
    outputs: list of tensors the plugin node will produce
    '''
    # Detach every output tensor from its current producer. The old subgraph stays in
    # the graph but no longer reaches these tensors, so a later cleanup can drop it.
    for tensor in outputs:
        tensor.inputs.clear()

    # add plugin layer
    self.layer(op='DisentangledAttention_TRT', inputs=inputs, outputs=outputs)

def insert_disentangled_attention_all_v1(graph):
    '''
    Insert a v1 disentangled attention plugin for every layer of `graph`.

    Layers are located through their 'GatherElements' nodes: consecutive pairs
    (in graph.nodes order) are treated as the two gathers of one layer.
    Returns the same `graph` object.
    '''
    gathers = [candidate for candidate in graph.nodes if candidate.op == 'GatherElements'] # find by gatherelements op
    assert len(gathers) % 2 == 0, "No. of GatherElements nodes is not an even number!"

    # 2 gatherelements in 1 layer: (0, 1), (2, 3), ...
    for layer_idx, (left, right) in enumerate(zip(gathers[0::2], gathers[1::2])):
        print(f"Fusing layer {layer_idx}")
        # CAVEAT! The inputs/outputs MUST be copied into plain lists. graphsurgeon's X.inputs and
        # X.outputs are `onnx_graphsurgeon.util.misc.SynchronizedList`, i.e. a 2-way node-tensor
        # updating mechanism; without the copy, clearing a tensor's input nodes would also
        # remove the tensor from the list we hold.

        ## for raw MSFT model
        # inputs: (data1, indices1, data2, indices2), input tensors for 2 gathers
        plugin_inputs = [*left.inputs, *right.inputs]
        # outputs: (result), output tensors after adding 2 gather results
        # (for the precompute model the result sits one node further: left.o().o().o().outputs)
        plugin_outputs = list(left.o().o().outputs)

        # insert plugin layer
        graph.insert_disentangled_attention_v1(plugin_inputs, plugin_outputs)

    return graph

@gs.Graph.register()
def insert_disentangled_attention_v2(self, inputs, outputs, factor, span):
    '''
    Fuse disentangled attention module (Add + Gather + Gather + Transpose + Add + Div)
    into a single 'DisentangledAttention_TRT' node.

    inputs: list of plugin inputs
    outputs: list of plugin outputs
    factor: scaling factor of disentangled attention, sqrt(3d); it is a division factor in
            the original graph and is stored on the plugin as its reciprocal (a multiplier)
    span: relative distance span, k
    '''
    # Detach every output tensor from its current producer. The old subgraph stays in
    # the graph but no longer reaches these tensors, so a later cleanup can drop it.
    for tensor in outputs:
        tensor.inputs.clear()

    # add plugin layer
    plugin_attrs = {"factor": 1/factor, "span": span}
    self.layer(op='DisentangledAttention_TRT', inputs=inputs, outputs=outputs, attrs=plugin_attrs)

def insert_disentangled_attention_all_v2(graph, model_type=2):
    '''
    Insert a v2 disentangled attention plugin for every layer of `graph`.

    Layers are located through their 'GatherElements' nodes: consecutive pairs
    (in graph.nodes order) are treated as the two gathers of one layer.

    graph: graph to modify in place; must provide `insert_disentangled_attention_v2`
    model_type: which exported graph layout to walk
        1 -- raw MSFT model
        2 -- latest HF model (default; this value used to be hard-coded inside the loop,
             which made the model_type 1 path unreachable)
    returns: the same `graph` object
    raises: ValueError for an unknown `model_type`
    '''
    if model_type not in (1, 2):
        raise ValueError(f"Unsupported model_type {model_type!r}; expected 1 (raw MSFT model) or 2 (latest HF model)")

    nodes = [node for node in graph.nodes if node.op == 'GatherElements'] # find by gatherelements op
    assert len(nodes) % 2 == 0, "No. of GatherElements nodes is not an even number!"

    # 2 gatherelements in 1 layer: (0, 1), (2, 3), ...
    for layer_idx, (left, right) in enumerate(zip(nodes[0::2], nodes[1::2])):
        print(f"Fusing layer {layer_idx}")
        # CAVEAT! The inputs/outputs MUST be copied into plain lists. graphsurgeon's X.inputs and
        # X.outputs are `onnx_graphsurgeon.util.misc.SynchronizedList`, i.e. a 2-way node-tensor
        # updating mechanism; without the copy, clearing a tensor's input nodes would also
        # remove the tensor from the list we hold.

        if model_type == 1:
            ## for raw MSFT model
            third = left.o().o().o()
            c2c_node = third.i() # its input 0 is the c2c data, its input 1 leads to the scaling constant
            # inputs: (data0, data1, data2), input tensors for c2c add and 2 gathers
            inputs = list(c2c_node.inputs)[0:1] + list(left.inputs)[0:1] + list(right.inputs)[0:1]
            # outputs: (result), output tensors after adding 3 gather results
            outputs = list(third.o(2,0).outputs) # include reshape as well
            # constant: scaling factor
            factor = c2c_node.inputs[1].inputs[0].attrs["value"].values.item()
        else:
            ## for latest HF model
            first = left.o()
            end_node = first.o().o().o()
            # inputs: (data0, data1, data2), input tensors for c2c add and 2 gathers
            inputs = list(end_node.i().inputs)[0:1] + list(left.inputs)[0:1] + list(right.inputs)[0:1]
            # outputs: (result), output tensors after adding 3 gather results
            outputs = list(end_node.outputs)
            # constant: scaling factor
            factor = first.inputs[1].inputs[0].attrs["value"].values.item()

        # constant: relative distance span (located the same way for both layouts)
        span = right.i(1,0).i().i().i().inputs[1].inputs[0].attrs["value"].values.item()

        # insert plugin layer
        graph.insert_disentangled_attention_v2(inputs, outputs, factor, span)

    return graph

def correctness_check_models(graph, output_shape=('batch_size*6', 2048, 2048)):
    '''
    Add output nodes at the plugin location for both the original model and the model with plugin.

    A copy of `graph` is taken first and left unfused; in it, the tensor at the end of each
    disentangled attention block is appended to the graph outputs. The passed-in `graph` gets
    the v2 plugin inserted for every layer (latest HF model layout) and the plugin's output
    tensor is appended to its graph outputs, so the two models can be compared layer by layer.

    graph: graph to process; it is modified in place
    output_shape: shape assigned to every tensor that is exposed as an extra graph output.
        The default reproduces the values that used to be hard-coded here
        (presumably batch*heads x seq_len x seq_len of one particular export -- confirm for your model).
    returns: (graph_raw, graph) -- the unfused copy and the fused original
    '''
    def gather_pairs(g):
        # consecutive GatherElements nodes form one layer: (0, 1), (2, 3), ...
        nodes = [node for node in g.nodes if node.op == 'GatherElements'] # find by gatherelements op
        assert len(nodes) % 2 == 0, "No. of GatherElements nodes is not an even number!"
        return list(zip(nodes[0::2], nodes[1::2]))

    def expose(tensor, owner):
        # a graph output needs dtype and shape set explicitly; dtype is borrowed from the
        # owner's first output, and every tensor gets its own shape list
        tensor.dtype = owner.outputs[0].dtype
        tensor.shape = list(output_shape)
        return tensor

    ## for original graph
    # make a copy of the graph first, before any plugin is inserted
    graph_raw = graph.copy()
    # reuse the existing tensors as graph outputs. Don't create any new tensor!
    original_output_all = [
        expose(left.o().o().o().o().outputs[0], graph_raw)
        for left, _ in gather_pairs(graph_raw)
    ]
    graph_raw.outputs = graph_raw.outputs + original_output_all # add plugin-location outputs to graph output

    ## for modified graph with plugin (latest HF model layout)
    plugin_output_all = []
    for left, right in gather_pairs(graph):
        end_node = left.o().o().o().o()
        # inputs: (data0, data1, data2), input tensors for c2c add and 2 gathers
        inputs = list(end_node.i().inputs)[0:1] + list(left.inputs)[0:1] + list(right.inputs)[0:1]
        # outputs: (result), output tensors after adding 3 gather results
        outputs = list(end_node.outputs)
        # grab the tensor before the plugin is inserted: insertion detaches it from end_node
        plugin_output_all.append(expose(end_node.outputs[0], graph))

        # constants: scaling factor, relative distance span
        factor = left.o().inputs[1].inputs[0].attrs["value"].values.item()
        span = right.i(1,0).i().i().i().inputs[1].inputs[0].attrs["value"].values.item()

        # insert plugin layer
        graph.insert_disentangled_attention_v2(inputs, outputs, factor, span)

    graph.outputs = graph.outputs + plugin_output_all # add plugin outputs to graph output

    return graph_raw, graph

def check_model(model_name):
    '''
    Load the ONNX model stored at path `model_name` and verify that it is well formed.

    Nothing is returned; any error from loading or from the ONNX checker propagates to the caller.
    '''
    onnx.checker.check_model(onnx.load(model_name))

# import the ONNX file into a graphsurgeon graph
graph = gs.import_onnx(onnx.load(model_input))

## for testing purpose, certain nodes can simply be isolated with nullplugin instead:
# graph = isolate_node(graph, 'GatherElements')

if correctness_check:
    # correctness check: save two models (original and with plugin) with intermediate output nodes inserted
    graph_raw, graph = correctness_check_models(graph)

    # remove unused nodes, and topologically sort both graphs
    graph_raw.cleanup().toposort()
    graph.cleanup().toposort()

    # both files are written next to the input model, keeping its extension
    input_root, input_suffix = os.path.splitext(model_input)
    model_output1 = input_root + "_correctness_check_original" + input_suffix
    model_output2 = input_root + "_correctness_check_plugin" + input_suffix

    # export the onnx graphs from graphsurgeon
    onnx.save_model(gs.export_onnx(graph_raw), model_output1)
    onnx.save_model(gs.export_onnx(graph), model_output2)

    print(f"Saving models for correctness check to {model_output1} (original) and {model_output2} (with plugin)")

    # only the original model is checked: the plugin op is not a registered ONNX op,
    # so check_model(model_output2) is intentionally not called
    check_model(model_output1)

else:
    # not correctness check, just save the modified model with plugin
    fusion_passes = {
        # version 1: replace Gather + Gather + Transpose + Add + Div (c2p and p2c) with DisentangledAttentionPlugin node
        1: insert_disentangled_attention_all_v1,
        # version 2: replace Add + Gather + Gather + Transpose + Add + Div (c2c and c2p and p2c) with DisentangledAttentionPlugin node
        2: insert_disentangled_attention_all_v2,
    }
    fuse = fusion_passes.get(PLUGIN_VERSION)
    if fuse is not None: # any other PLUGIN_VERSION leaves the graph unfused
        graph = fuse(graph)

    # remove unused nodes, and topologically sort the graph.
    graph.cleanup().toposort()

    # export the onnx graph from graphsurgeon
    onnx.save_model(gs.export_onnx(graph), model_output)

    print(f"Saving modified model to {model_output}")

    # the saved model is intentionally not passed to check_model(model_output):
    # the plugin op is not a registered ONNX op