Skip to content
6 changes: 6 additions & 0 deletions backends/webgpu/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -118,4 +118,10 @@ endfunction()
# Native WebGPU test targets, registered only when tests are requested.
# Each call pairs a target name with its single source file; presumably
# add_webgpu_native_test() (defined earlier in this file) creates the
# executable -- confirm against its definition.
if(EXECUTORCH_BUILD_WEBGPU_TEST)
  add_webgpu_native_test(
    webgpu_native_test
    test/test_webgpu_native.cpp
  )
  add_webgpu_native_test(
    webgpu_rms_norm_test
    test/native/test_rms_norm.cpp
  )
  add_webgpu_native_test(
    webgpu_dispatch_order_test
    test/native/test_dispatch_order.cpp
  )
  add_webgpu_native_test(
    webgpu_scratch_buffer_test
    test/native/test_scratch_buffer.cpp
  )
endif()
15 changes: 12 additions & 3 deletions backends/webgpu/scripts/test_webgpu_native_ci.sh
Original file line number Diff line number Diff line change
Expand Up @@ -37,12 +37,14 @@ fi
cd "${EXECUTORCH_ROOT}"

# ── Exports for the model-driven executables (best-effort) ───────────────────
# native_test + rms_norm read .pte/golden inputs via WEBGPU_TEST_* env and
# self-skip if absent; dispatch_order + scratch are standalone (no exports).
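# native_test + rms_norm read .pte/golden inputs via env/dir and self-skip if
# absent; dispatch_order reads them from its case dir but FAILS on missing
# files, so it is only run when its export succeeded (DISPATCH_ORDER_OK).
# scratch is standalone (generates its own inputs).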
# Output paths for the add-model .pte files (presumably written by the add
# export step below; their use is outside this excerpt).
PTE_MODEL="/tmp/webgpu_add_test.pte"
PTE_CHAINED_MODEL="/tmp/webgpu_chained_add_test.pte"
# Per-test case directory plus a status flag. Each flag starts at 1 and is
# cleared to 0 when the matching export step fails; the run step checks it so
# the native test is skipped instead of failing on missing inputs.
RMS_NORM_DIR="/tmp/rmsn"
RMS_NORM_OK=1
DISPATCH_ORDER_DIR="/tmp/dispatch_order"
DISPATCH_ORDER_OK=1

$PYTHON_EXECUTABLE -c "
from executorch.backends.webgpu.test.ops.add.test_add import export_add_model, export_chained_add_model
Expand All @@ -55,6 +57,11 @@ from executorch.backends.webgpu.test.ops.rms_norm.test_rms_norm import export_rm
export_rms_norm_cases('${RMS_NORM_DIR}')
" || { echo "WARN: rms_norm export failed; skipping rms_norm native test"; RMS_NORM_OK=0; }

# Export the dispatch_order cases into DISPATCH_ORDER_DIR. Best-effort: if the
# Python export exits non-zero, warn and clear DISPATCH_ORDER_OK so the native
# dispatch_order test is skipped later rather than aborting the script here.
$PYTHON_EXECUTABLE -c "
from executorch.backends.webgpu.test.ops.dispatch_order.test_dispatch_order import export_dispatch_order_cases
export_dispatch_order_cases('${DISPATCH_ORDER_DIR}')
" || { echo "WARN: dispatch_order export failed; skipping dispatch_order native test"; DISPATCH_ORDER_OK=0; }

# ── Configure (Dawn-only: no -DWEBGPU_IMPL; Dawn is the sole backend) ─────────
echo "=== Configure WebGPU native tests on Dawn ==="
rm -rf "${BUILD_DIR}"
Expand Down Expand Up @@ -115,7 +122,9 @@ fi
# Run the rms_norm test only when its export succeeded and the binary exists
# and is executable; the exported case directory is passed as the argument.
if [[ "${RMS_NORM_OK}" == "1" && -x "${BIN_DIR}/webgpu_rms_norm_test" ]]; then
"${BIN_DIR}/webgpu_rms_norm_test" "${RMS_NORM_DIR}"
fi
[[ -x "${BIN_DIR}/webgpu_dispatch_order_test" ]] && "${BIN_DIR}/webgpu_dispatch_order_test"
# Run the dispatch_order test only when its export succeeded and the binary
# exists and is executable. The case directory is passed as argv[1]; note that
# the binary lets the WEBGPU_DISPATCH_ORDER_DIR environment variable override
# this argument when it is set.
if [[ "${DISPATCH_ORDER_OK}" == "1" && -x "${BIN_DIR}/webgpu_dispatch_order_test" ]]; then
"${BIN_DIR}/webgpu_dispatch_order_test" "${DISPATCH_ORDER_DIR}"
fi
[[ -x "${BIN_DIR}/webgpu_scratch_buffer_test" ]] && "${BIN_DIR}/webgpu_scratch_buffer_test"

echo "=== WebGPU native tests on Dawn: all run targets passed ==="
167 changes: 167 additions & 0 deletions backends/webgpu/test/native/test_dispatch_order.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,167 @@
/*
* Copyright (c) Meta Platforms, Inc. and affiliates.
* All rights reserved.
*
* This source code is licensed under the BSD-style license found in the
* LICENSE file in the root directory of this source tree.
*/

#include <executorch/backends/webgpu/runtime/WebGPUDevice.h>
#include <executorch/extension/module/module.h>
#include <executorch/extension/tensor/tensor.h>

#include <algorithm>
#include <cmath>
#include <cstdio>
#include <cstdlib>
#include <fstream>
#include <string>
#include <vector>

using namespace executorch::backends::webgpu;
using namespace executorch::extension;
using namespace executorch::runtime;

namespace {

// Describes one exported test case.
struct Case {
  // File stem shared by the case's .pte / .input.bin / .golden.bin artifacts.
  const char* name;
  // Input tensor dimensions; run_case() requires their product to equal the
  // number of floats in the input file.
  std::vector<int32_t> sizes;
};

// Case table, iterated in this order by main(). Mirrors _CASES in
// test_dispatch_order.py (add-chain or rms_norm+add chain).
const std::vector<Case> kCases = {
    Case{"single", {16, 16}},
    Case{"chain3", {64, 64}},
    Case{"chain5_tiny", {1, 1}},
    Case{"chain5_wide", {7, 896}},
    Case{"chain8", {256, 256}},
    Case{"deep32", {128, 128}},
    Case{"large_chain", {1024, 1024}},
    Case{"het_small", {1, 1, 7, 896}},
    Case{"het_deep", {1, 1, 5, 256}},
};

// Reads a file of tightly packed float32 values (host byte order) into a
// vector.
//
// Returns an empty vector when the file cannot be opened, its size cannot be
// determined, the size is not a multiple of sizeof(float), or fewer bytes than
// expected could be read. Callers treat an empty result as failure, so an
// empty file is reported as failure too.
std::vector<float> read_f32_bin(const std::string& path) {
  // Open positioned at the end so tellg() yields the file size.
  std::ifstream f(path, std::ios::binary | std::ios::ate);
  if (!f) {
    return {};
  }
  // tellg() reports failure as -1; reject it before converting to an unsigned
  // size instead of relying on the modulo check to catch the wrapped value.
  const std::streamoff end_pos = f.tellg();
  if (end_pos < 0) {
    return {};
  }
  const auto file_size = static_cast<size_t>(end_pos);
  if (file_size % sizeof(float) != 0) {
    return {}; // truncated/corrupt golden; caller treats empty as failure
  }
  f.seekg(0, std::ios::beg);
  std::vector<float> data(file_size / sizeof(float));
  // A short or failed read would otherwise leave zero-filled tail elements
  // that look like valid data to the comparison code.
  if (!f.read(
          reinterpret_cast<char*>(data.data()),
          static_cast<std::streamsize>(file_size))) {
    return {};
  }
  return data;
}

// Runs one dispatch_order case and checks its output against the golden.
//
// Reads `<dir>/<name>.input.bin` and `<dir>/<name>.golden.bin` as raw float32,
// loads `<dir>/<name>.pte`, runs forward() on the input shaped as tc.sizes, and
// compares output 0 with the golden element by element.
//
// Returns true on PASS. Every failure path prints a "FAIL: ..." line and
// returns false.
bool run_case(const std::string& dir, const Case& tc) {
  printf("\n--- dispatch_order[%s] ---\n", tc.name);
  const std::string base = dir + "/" + tc.name;
  const std::vector<float> input = read_f32_bin(base + ".input.bin");
  const std::vector<float> golden = read_f32_bin(base + ".golden.bin");
  if (input.empty() || golden.empty()) {
    printf("FAIL: could not read input/golden for %s\n", tc.name);
    return false;
  }

  Module module(base + ".pte");
  if (module.load_forward() != Error::Ok) {
    printf("FAIL: could not load %s.pte\n", tc.name);
    return false;
  }

  // The input file must hold exactly prod(sizes) floats for the tensor shape.
  size_t expected = 1;
  for (const int32_t d : tc.sizes) {
    expected *= static_cast<size_t>(d);
  }
  if (input.size() != expected) {
    printf(
        "FAIL: input numel %zu != expected %zu for %s\n",
        input.size(),
        expected,
        tc.name);
    return false;
  }
  auto x = make_tensor_ptr(tc.sizes, std::vector<float>(input));
  auto result = module.forward({EValue(x)});
  if (!result.ok()) {
    printf(
        "FAIL: forward failed (error %d)\n", static_cast<int>(result.error()));
    return false;
  }
  const auto& outputs = result.get();
  if (outputs.empty() || !outputs[0].isTensor()) {
    printf("FAIL: no tensor output\n");
    return false;
  }
  const auto& out_tensor = outputs[0].toTensor();
  if (static_cast<size_t>(out_tensor.numel()) != golden.size()) {
    printf(
        "FAIL: output numel %zu != golden %zu\n",
        static_cast<size_t>(out_tensor.numel()),
        golden.size());
    return false;
  }
  // NOTE(review): assumes the output dtype is float32; no dtype check is done
  // before reinterpreting the buffer -- confirm this holds for every case.
  const float* out_data = out_tensor.const_data_ptr<float>();

  float max_abs_err = 0.0f;
  float max_rel_err = 0.0f;
  size_t non_finite_mismatches = 0;
  for (size_t i = 0; i < golden.size(); i++) {
    const float abs_err = std::abs(out_data[i] - golden[i]);
    if (!std::isfinite(abs_err)) {
      // std::max(x, NaN) returns x, so a NaN error would silently drop out of
      // the maxima and let a NaN-producing kernel pass. Count such elements
      // explicitly. Identical infinities also subtract to NaN but are a match.
      if (out_data[i] != golden[i]) {
        ++non_finite_mismatches;
      }
      continue;
    }
    max_abs_err = std::max(max_abs_err, abs_err);
    // Clamp the denominator so near-zero goldens do not blow up the ratio.
    const float denom = std::max(std::abs(golden[i]), 1e-6f);
    max_rel_err = std::max(max_rel_err, abs_err / denom);
  }
  printf(
      "Max abs error: %e Max rel error: %e (%zu elements)\n",
      max_abs_err,
      max_rel_err,
      golden.size());
  if (non_finite_mismatches != 0) {
    printf(
        "FAIL: dispatch_order[%s] has %zu non-finite mismatching elements\n",
        tc.name,
        non_finite_mismatches);
    return false;
  }
  // Lenient gate: pass iff abs<=tol OR rel<=tol (near-zero goldens).
  if (max_abs_err > 1e-3f && max_rel_err > 1e-3f) {
    printf("FAIL: dispatch_order[%s] exceeds tolerance 1e-3\n", tc.name);
    return false;
  }
  printf("PASS: dispatch_order[%s]\n", tc.name);
  return true;
}

} // namespace

int main(int argc, char** argv) {
std::string dir = "/tmp/dispatch_order";
if (argc > 1) {
dir = argv[1];
}
if (const char* env = std::getenv("WEBGPU_DISPATCH_ORDER_DIR")) {
dir = env;
}

WebGPUContext ctx;
try {
ctx = create_webgpu_context();
} catch (const std::exception& e) {
printf("SKIP: %s\n", e.what());
return 0;
}
set_default_webgpu_context(&ctx);
printf("WebGPU device acquired (native); case dir: %s\n", dir.c_str());

bool ok = true;
for (const auto& tc : kCases) {
ok = run_case(dir, tc) && ok;
}

set_default_webgpu_context(nullptr);
destroy_webgpu_context(ctx);

if (!ok) {
return 1;
}
printf("\nAll dispatch_order tests passed\n");
return 0;
}
Loading
Loading