Not sure if this is intended behaviour, but the error between the kernelized SiLUActivation and default pytorch implementation is huge. This creates erroneous output for GraniteMoeHybrid.
from transformers.activations import SiLUActivation
from transformers.integrations import hub_kernels
import torch
from kernels import (
LayerRepository,
LocalLayerRepository,
use_kernel_mapping,
Mode,
use_kernel_forward_from_hub,
kernelize
)
if __name__ == "__main__":
    # Reproduction: compare the hub-kernelized SiLU against the stock
    # PyTorch SiLUActivation on a captured problem input tensor.
    device = torch.device("cuda")
    dtype = torch.bfloat16

    activation = SiLUActivation()
    # 'error_input.dmp' is the tensor attached to this report.
    inputs = torch.load('error_input.dmp')
    print(inputs.size())
    reference_out = activation(inputs)

    # Route the "SiLU" layer to the community CUDA activation kernel.
    mapping = {
        "SiLU": {
            "cuda": {
                Mode.INFERENCE | Mode.TORCH_COMPILE: LayerRepository(
                    repo_id="kernels-community/activation",
                    layer_name="Silu",
                    version=">=0.1.0",
                )
            }
        },
    }

    with use_kernel_mapping(mapping):
        kernelized_act = kernelize(activation, mode=Mode.INFERENCE, device=device)
        kernelized_out = kernelized_act(inputs)
    print("Max error:", torch.abs(reference_out - kernelized_out).max())
torch.Size([1, 51, 1024])
Fetching 7 files: 100%|██████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 12677.08it/s]
Download complete: : 0.00B [00:00, ?B/s]
Max error: tensor(17.2500, device='cuda:0', dtype=torch.bfloat16)
error_input.dmp
Not sure if this is intended behaviour, but the error between the kernelized SiLUActivation and the default PyTorch implementation is huge. This creates erroneous output for GraniteMoeHybrid.

Attachment: error_input.dmp