|
| 1 | +/************************************************************************* |
| 2 | + * Copyright (c) 2026, Advanced Micro Devices, Inc. All rights reserved. |
| 3 | + * |
| 4 | + * License for AMD contributions = MIT. See LICENSE for more information |
| 5 | + ************************************************************************/ |
| 6 | + |
| 7 | +#include <benchmark/benchmark.h> |
| 8 | +#include <hip/hip_runtime.h> |
| 9 | +#include <hip/hip_fp16.h> |
| 10 | +#include <hip/hip_bfloat16.h> |
| 11 | +#include "amd_detail/hip_float8.h" |
| 12 | + |
| 13 | +#include "benchmark_utils.h" |
| 14 | + |
| 15 | +#include "amd_detail/hip_float8.h" |
| 16 | + |
| 17 | +#include <transformer_engine/cast_hip.h> |
| 18 | +#include <transformer_engine/transformer_engine_hip.h> |
| 19 | + |
| 20 | +using namespace te_bench; |
| 21 | +using namespace transformer_engine; |
| 22 | +using fp8_e4m3 = test::fp8e4m3; |
| 23 | + |
| 24 | +// Tensor shapes from LLaMA (8B, 70B, 405B) and Qwen (7B, 72B) |
| 25 | +#define COMMON_SHAPES \ |
| 26 | + ->Args({1024, 3584}) \ |
| 27 | + ->Args({1024, 4096}) \ |
| 28 | + ->Args({1024, 8192}) \ |
| 29 | + ->Args({1024, 14336}) \ |
| 30 | + ->Args({2048, 4096}) \ |
| 31 | + ->Args({2048, 8192}) \ |
| 32 | + ->Args({2048, 14336}) \ |
| 33 | + ->Args({2048, 28672}) \ |
| 34 | + ->Args({4096, 4096}) \ |
| 35 | + ->Args({4096, 8192}) \ |
| 36 | + ->Args({4096, 16384}) \ |
| 37 | + ->Args({4096, 28672}) \ |
| 38 | + ->Args({8192, 8192}) \ |
| 39 | + ->Args({8192, 16384}) \ |
| 40 | + ->Args({8192, 28672}) \ |
| 41 | + ->Args({8192, 53248}) \ |
| 42 | + ->Args({16384, 8192}) \ |
| 43 | + ->Args({16384, 16384})\ |
| 44 | + ->Args({32768, 8192}) |
| 45 | + |
| 46 | +template <typename IType, typename OType, int SCALE_DIM_Y, int SCALE_DIM_X> |
| 47 | +static void BM_DequantizeMXFP8(benchmark::State &state) { |
| 48 | + const size_t rows = state.range(0); |
| 49 | + const size_t cols = state.range(1); |
| 50 | + |
| 51 | + constexpr bool USE_ROWWISE = SCALE_DIM_X > 1; |
| 52 | + constexpr bool USE_COLWISE = SCALE_DIM_Y > 1; |
| 53 | + |
| 54 | + const size_t scale_cols_row = USE_ROWWISE ? (cols + 31) / 32 : 0; |
| 55 | + const size_t scale_rows_col = USE_COLWISE ? (rows + 31) / 32 : 0; |
| 56 | + const size_t scale_cols_col = USE_COLWISE ? cols : 0; |
| 57 | + |
| 58 | + std::vector<size_t> shape = {rows, cols}; |
| 59 | + DType itype = std::is_same_v<IType, fp8_e4m3> ? DType::kFloat8E4M3 : DType::kFloat8E5M2; |
| 60 | + DType otype = std::is_same_v<OType, __half> ? DType::kFloat16 : |
| 61 | + (std::is_same_v<OType, hip_bfloat16> ? DType::kBFloat16 : DType::kFloat32); |
| 62 | + |
| 63 | + test::Tensor &input_tensor = TensorCache::get_or_create("input", shape, itype, USE_ROWWISE, USE_COLWISE, |
| 64 | + NVTE_MXFP8_1D_SCALING, false); |
| 65 | + test::Tensor &output_tensor = TensorCache::get_or_create("output", shape, otype, true, false, |
| 66 | + NVTE_DELAYED_TENSOR_SCALING, false); |
| 67 | + |
| 68 | + hipStream_t stream; |
| 69 | + HIP_CHECK(hipStreamCreate(&stream)); |
| 70 | + |
| 71 | + DeviceBuffer<float> temp_fp32(rows * cols); |
| 72 | + fill_random_uniform_gpu(temp_fp32.get(), rows * cols, -2.0f, 1.0f, stream); |
| 73 | + |
| 74 | + void *input_data_ptr = USE_ROWWISE ? input_tensor.rowwise_dptr() : input_tensor.columnwise_dptr(); |
| 75 | + size_t threads = 256; |
| 76 | + size_t blocks = (rows * cols + threads - 1) / threads; |
| 77 | + cast_fp32_kernel<<<blocks, threads, 0, stream>>>(temp_fp32.get(), static_cast<IType*>(input_data_ptr), rows * cols); |
| 78 | + |
| 79 | + HIP_CHECK(hipStreamSynchronize(stream)); |
| 80 | + |
| 81 | + hipEvent_t start, stop; |
| 82 | + HIP_CHECK(hipEventCreate(&start)); |
| 83 | + HIP_CHECK(hipEventCreate(&stop)); |
| 84 | + |
| 85 | + warmup_gpu(); |
| 86 | + |
| 87 | + for (auto _ : state) { |
| 88 | + HIP_CHECK(hipEventRecord(start, stream)); |
| 89 | + |
| 90 | + nvte_dequantize(input_tensor.data(), output_tensor.data(), stream); |
| 91 | + |
| 92 | + HIP_CHECK(hipEventRecord(stop, stream)); |
| 93 | + HIP_CHECK(hipEventSynchronize(stop)); |
| 94 | + |
| 95 | + float ms = 0; |
| 96 | + HIP_CHECK(hipEventElapsedTime(&ms, start, stop)); |
| 97 | + state.SetIterationTime(ms / 1000.0); |
| 98 | + } |
| 99 | + |
| 100 | + HIP_CHECK(hipEventDestroy(start)); |
| 101 | + HIP_CHECK(hipEventDestroy(stop)); |
| 102 | + |
| 103 | + const size_t bytes_read_data = rows * cols * sizeof(IType) * |
| 104 | + ((USE_ROWWISE ?: 0) + (USE_COLWISE ?: 0)); |
| 105 | + const size_t bytes_read_scales = (USE_ROWWISE ? rows * scale_cols_row : 0) + |
| 106 | + (USE_COLWISE ? scale_rows_col * scale_cols_col : 0); |
| 107 | + const size_t bytes_write = rows * cols * sizeof(OType); |
| 108 | + const size_t total_bytes = bytes_read_data + bytes_read_scales + bytes_write; |
| 109 | + |
| 110 | + set_bytes_processed(state, total_bytes); |
| 111 | + |
| 112 | + HIP_CHECK(hipStreamDestroy(stream)); |
| 113 | +} |
| 114 | + |
| 115 | +#define REGISTER_DEQUANTIZE_ALL_CONFIGS(ITYPE, OTYPE, INAME, ONAME) \ |
| 116 | + BENCHMARK_TEMPLATE(BM_DequantizeMXFP8, ITYPE, OTYPE, 1, 32) \ |
| 117 | + ->Name("BM_DequantizeMXFP8/" INAME "_" ONAME "/rowwise") \ |
| 118 | + COMMON_SHAPES \ |
| 119 | + ->Unit(benchmark::kMicrosecond) \ |
| 120 | + ->UseManualTime(); \ |
| 121 | + BENCHMARK_TEMPLATE(BM_DequantizeMXFP8, ITYPE, OTYPE, 32, 1) \ |
| 122 | + ->Name("BM_DequantizeMXFP8/" INAME "_" ONAME "/colwise") \ |
| 123 | + COMMON_SHAPES \ |
| 124 | + ->Unit(benchmark::kMicrosecond) \ |
| 125 | + ->UseManualTime(); |
| 126 | + |
| 127 | +REGISTER_DEQUANTIZE_ALL_CONFIGS(fp8_e4m3, __half, "E4M3", "FP16") |
| 128 | +REGISTER_DEQUANTIZE_ALL_CONFIGS(fp8_e4m3, hip_bfloat16, "E4M3", "BF16") |
| 129 | +REGISTER_DEQUANTIZE_ALL_CONFIGS(fp8_e4m3, float, "E4M3", "FP32") |
| 130 | + |
| 131 | +BENCHMARK_MAIN(); |
0 commit comments