diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv1d_dw.glsl b/backends/vulkan/runtime/graph/ops/glsl/conv1d_dw.glsl
new file mode 100644
index 00000000000..4f005b8fdb0
--- /dev/null
+++ b/backends/vulkan/runtime/graph/ops/glsl/conv1d_dw.glsl
@@ -0,0 +1,98 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#version 450 core
+
+#define PRECISION ${PRECISION}
+#define VEC4_T ${texel_load_type(DTYPE, STORAGE)}
+#define T ${texel_load_component_type(DTYPE, STORAGE)}
+
+$if STORAGE == "buffer":
+  #define BUFFER
+$if HAS_BIAS:
+  #define HAS_BIAS
+
+${define_required_extensions(STORAGE, DTYPE)}
+
+layout(std430) buffer;
+
+#include "common.glslh"
+
+${layout_declare_tensor(B, "w", "t_out", DTYPE, STORAGE, is_scalar_array=False)}
+${layout_declare_tensor(B, "r", "t_in", DTYPE, STORAGE, is_scalar_array=False)}
+${layout_declare_tensor(B, "r", "t_weight", DTYPE, STORAGE, is_scalar_array=False)}
+$if HAS_BIAS:
+  ${layout_declare_tensor(B, "r", "t_bias", DTYPE, STORAGE, is_scalar_array=False)}
+
+// in_sizes: {L_in, C, N, 1} in WHCN order
+${layout_declare_ubo(B, "ivec4", "in_sizes")}
+// out_sizes: {L_out, C, N, 1} in WHCN order
+${layout_declare_ubo(B, "ivec4", "out_sizes")}
+
+layout(push_constant) uniform restrict Block {
+  int kernel_size; // number of filter taps visited per output element
+  int stride; // input step between consecutive output positions
+  int padding; // offset subtracted from the first tap's input position
+  int dilation; // input step between consecutive filter taps
+  float output_min; // lower bound of the final clamp
+  float output_max; // upper bound of the final clamp
+};
+
+layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
+
+// Thread mapping: X = ceil(C / 4), Y = L_out, Z = N
+// Each thread computes 4 output channels (one vec4) at one spatial position.
+// Depthwise: each channel has its own filter, so 4 channels can be computed
+// independently with element-wise vec4 FMA.
+
+void main() {
+  const int c4 = int(gl_GlobalInvocationID.x); // index of the 4-channel group
+  const int l_out = int(gl_GlobalInvocationID.y); // output position
+  const int n = int(gl_GlobalInvocationID.z); // batch index
+  // Sizes are stored in WHCN order: x = length, y = channels, z = batch.
+  const int L_in = in_sizes.x;
+  const int C = in_sizes.y;
+  const int C4 = div_up_4(C);
+  const int L_out = out_sizes.x;
+  // Invocations past the tensor extent on any axis (batch included) must exit.
+  if (c4 >= C4 || l_out >= L_out || n >= in_sizes.z) {
+    return;
+  }
+
+  VEC4_T sum = VEC4_T(0);
+  // Taps landing outside [0, L_in) are skipped, which acts as zero padding.
+  for (int k = 0; k < kernel_size; k++) {
+    const int l_in = l_out * stride - padding + k * dilation;
+    if (l_in >= 0 && l_in < L_in) {
+#ifdef BUFFER
+      const VEC4_T in_val = t_in[(n * L_in + l_in) * C4 + c4];
+      const VEC4_T w_val = t_weight[k * C4 + c4];
+#else
+      const VEC4_T in_val = texelFetch(t_in, ivec3(l_in, c4, n), 0);
+      const VEC4_T w_val = texelFetch(t_weight, ivec3(k, 0, c4), 0);
+#endif
+      sum = fma(w_val, in_val, sum);
+    }
+  }
+
+#ifdef HAS_BIAS
+#ifdef BUFFER
+  sum += t_bias[c4];
+#else
+  sum += texelFetch(t_bias, ivec3(c4, 0, 0), 0);
+#endif
+#endif
+  // With the default bounds (lowest/max float) this clamp is a no-op.
+  sum = clamp(sum, VEC4_T(output_min), VEC4_T(output_max));
+
+#ifdef BUFFER
+  t_out[(n * L_out + l_out) * C4 + c4] = sum;
+#else
+  imageStore(t_out, ivec3(l_out, c4, n), sum);
+#endif
+}
diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv1d_dw.yaml b/backends/vulkan/runtime/graph/ops/glsl/conv1d_dw.yaml
new file mode 100644
index 00000000000..883ad8899ea
--- /dev/null
+++ b/backends/vulkan/runtime/graph/ops/glsl/conv1d_dw.yaml
@@ -0,0 +1,22 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+ +conv1d_dw: + parameter_names_with_default_values: + DTYPE: float + STORAGE: texture3d + HAS_BIAS: false + generate_variant_forall: + STORAGE: + - VALUE: texture3d + - VALUE: buffer + DTYPE: + - VALUE: float + - VALUE: half + shader_variants: + - NAME: conv1d_dw + - NAME: conv1d_dw_bias + HAS_BIAS: true diff --git a/backends/vulkan/runtime/graph/ops/impl/Conv1dDW.cpp b/backends/vulkan/runtime/graph/ops/impl/Conv1dDW.cpp new file mode 100644 index 00000000000..88d421e6994 --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/impl/Conv1dDW.cpp @@ -0,0 +1,188 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include + +#include +#include + +#include +#include + +#include + +#include + +namespace vkcompute { + +void resize_conv1d_dw_node( + ComputeGraph* graph, + const std::vector& args, + const std::vector& extra_args) { + const ValueRef out = args.at(0).refs.at(0); + const ValueRef self = args.at(1).refs.at(0); + + TensorRefPtr weight_ref = graph->get_tref(extra_args.at(0)); + + const int64_t stride = graph->get_int_list(extra_args.at(1))->at(0); + const int64_t padding = graph->get_int_list(extra_args.at(2))->at(0); + const int64_t dilation = graph->get_int_list(extra_args.at(3))->at(0); + + const std::vector in_sizes = graph->sizes_of(self); + const int64_t kernel_size = weight_ref->sizes.at(2); + const int64_t L_in = in_sizes.at(2); + + const int64_t L_out = + calc_out_size(L_in, kernel_size, stride, padding, dilation, false); + + graph->virtual_resize(out, {in_sizes.at(0), in_sizes.at(1), L_out}); +} + +struct Conv1dDWParams final { + int32_t kernel_size; + int32_t stride; + int32_t padding; + int32_t dilation; +}; + +struct Conv1dDWClampParams final { + float output_min; + float output_max; +}; + +utils::uvec3 pick_conv1d_dw_global_wg_size( + ComputeGraph* graph, + const 
vkapi::ShaderInfo& shader, + const std::vector& args, + const std::vector& resize_args) { + (void)shader; + (void)resize_args; + const ValueRef out = args.at(0).refs.at(0); + + // out is [N, C, L_out]; in WHCN: {L_out, C, N, 1} + const uint32_t C = graph->size_at(-2, out); + const uint32_t L_out = graph->size_at(-1, out); + const uint32_t N = + graph->dim_of(out) >= 3 ? graph->size_at(-3, out) : 1; + + return {utils::div_up_4(C), L_out, N}; +} + +void add_conv1d_dw_node( + ComputeGraph& graph, + const ValueRef in, + const ValueRef weight_data, + const ValueRef bias, + const ValueRef stride_ref, + const ValueRef padding_ref, + const ValueRef dilation_ref, + const ValueRef out, + const float output_min = std::numeric_limits::lowest(), + const float output_max = std::numeric_limits::max()) { + VK_CHECK_COND(graph.packed_dim_of(in) == WHCN::kHeightDim); + VK_CHECK_COND(graph.packed_dim_of(out) == WHCN::kHeightDim); + + const utils::StorageType storage_type = graph.storage_type_of(out); + + // Weight [C, 1, K] prepacked as channels-packed so each vec4 load gives + // 4 channels at one kernel position. + ValueRef packed_weight = prepack_standard( + graph, weight_data, storage_type, utils::kChannelsPacked); + + bool has_bias = graph.val_is_not_none(bias); + ValueRef packed_bias = kDummyValueRef; + if (has_bias) { + packed_bias = + prepack_standard(graph, bias, storage_type, utils::kWidthPacked); + } + + const auto stride_val = graph.get_int_list(stride_ref)->at(0); + const auto padding_val = graph.get_int_list(padding_ref)->at(0); + const auto dilation_val = graph.get_int_list(dilation_ref)->at(0); + + Conv1dDWParams params{ + utils::safe_downcast(graph.get_tref(weight_data)->sizes.at(2)), + utils::safe_downcast(stride_val), + utils::safe_downcast(padding_val), + utils::safe_downcast(dilation_val), + }; + + Conv1dDWClampParams clamp_params{ + output_min, + output_max, + }; + + std::string kernel_name = has_bias ? 
"conv1d_dw_bias" : "conv1d_dw"; + kernel_name.reserve(kShaderNameReserve); + add_storage_type_suffix(kernel_name, storage_type); + add_dtype_suffix(kernel_name, graph.dtype_of(out)); + + std::vector read_inputs = {in, packed_weight}; + if (has_bias) { + read_inputs.push_back(packed_bias); + } + + graph.execute_nodes().emplace_back(new DynamicDispatchNode( + graph, + VK_KERNEL_FROM_STR(kernel_name), + pick_conv1d_dw_global_wg_size, + default_pick_local_wg_size, + // Inputs and Outputs + {{out, vkapi::kWrite}, {read_inputs, vkapi::kRead}}, + // Shader params buffers + {graph.sizes_ubo(in), graph.sizes_ubo(out)}, + // Push Constants + {PushConstantDataInfo(¶ms, sizeof(Conv1dDWParams)), + PushConstantDataInfo(&clamp_params, sizeof(Conv1dDWClampParams))}, + // Specialization Constants + {}, + // Resize Args + {weight_data, stride_ref, padding_ref, dilation_ref}, + // Resizing Logic + resize_conv1d_dw_node)); +} + +// Args: in, weight, bias, stride, padding, dilation, groups, +// output_min, output_max, out +// output_min and output_max may be kDummyValueRef (no clamp). 
+void conv1d_dw(ComputeGraph& graph, const std::vector& args) { + ValueRef in = args[0]; + ValueRef weight = args[1]; + ValueRef bias = args[2]; + ValueRef stride = args[3]; + ValueRef padding = args[4]; + ValueRef dilation = args[5]; + ValueRef out = args[9]; + + float output_min = std::numeric_limits::lowest(); + float output_max = std::numeric_limits::max(); + if (is_valid(args[7])) { + output_min = graph.extract_scalar(args[7]); + } + if (is_valid(args[8])) { + output_max = graph.extract_scalar(args[8]); + } + + add_conv1d_dw_node( + graph, + in, + weight, + bias, + stride, + padding, + dilation, + out, + output_min, + output_max); +} + +REGISTER_OPERATORS { + VK_REGISTER_OP(et_vk.conv1d_dw.default, conv1d_dw); +} + +} // namespace vkcompute diff --git a/backends/vulkan/test/custom_ops/impl/TestConv1dDW.cpp b/backends/vulkan/test/custom_ops/impl/TestConv1dDW.cpp new file mode 100644 index 00000000000..be35e8a3109 --- /dev/null +++ b/backends/vulkan/test/custom_ops/impl/TestConv1dDW.cpp @@ -0,0 +1,24 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */
+
+#include
+
+#include
+
+namespace vkcompute {
+// NOTE(review): op docs list 10 args (output_min/max before out); 8 are sent.
+void test_conv1d_dw(ComputeGraph& graph, const std::vector& args) {
+  // args: in, weight, bias, stride, padding, dilation, groups, out
+  VK_GET_OP_FN("et_vk.conv1d_dw.default")(graph, args);
+}
+
+REGISTER_OPERATORS {
+  VK_REGISTER_OP(test_etvk.test_conv1d_dw.default, test_conv1d_dw);
+}
+
+} // namespace vkcompute
diff --git a/backends/vulkan/test/custom_ops/targets.bzl b/backends/vulkan/test/custom_ops/targets.bzl
index 1f0cecc1c0c..5f4b49e2773 100644
--- a/backends/vulkan/test/custom_ops/targets.bzl
+++ b/backends/vulkan/test/custom_ops/targets.bzl
@@ -103,3 +103,4 @@ def define_common_targets(is_fbcode = False):
     define_custom_op_test_binary("test_conv2d_pw")
     define_custom_op_test_binary("test_conv2d_dw")
     define_custom_op_test_binary("test_conv1d_pw")
+    define_custom_op_test_binary("test_conv1d_dw")
diff --git a/backends/vulkan/test/custom_ops/test_conv1d_dw.cpp b/backends/vulkan/test/custom_ops/test_conv1d_dw.cpp
new file mode 100644
index 00000000000..2438847036e
--- /dev/null
+++ b/backends/vulkan/test/custom_ops/test_conv1d_dw.cpp
@@ -0,0 +1,267 @@
+// Copyright (c) Meta Platforms, Inc. and affiliates.
+// All rights reserved.
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include
+#include
+
+#include
+#include
+
+#include "utils.h"
+
+using namespace executorch::vulkan::prototyping;
+using namespace vkcompute;
+// A config whose C or L exceeds this is named "PERF" instead of "ACCU".
+static constexpr int64_t kRefDimSizeLimit = 256;
+// One conv1d shape / hyper-parameter combination to turn into a TestCase.
+struct Conv1dDWConfig {
+  int64_t N;
+  int64_t C;
+  int64_t L;
+  int64_t K;
+  int64_t stride;
+  int64_t padding;
+  int64_t dilation;
+  bool has_bias;
+};
+// Builds the TestCase (name, input/output specs, tolerances) for one config.
+static TestCase create_conv1d_dw_test_case(
+    const Conv1dDWConfig& config,
+    vkapi::ScalarType dtype,
+    utils::StorageType storage_type) {
+  TestCase test_case;
+
+  bool is_perf = config.C > kRefDimSizeLimit || config.L > kRefDimSizeLimit;
+
+  std::string prefix = is_perf ? "PERF" : "ACCU";
+  std::string storage_str = storage_type_abbrev(storage_type);
+  std::string dtype_str = (dtype == vkapi::kHalf) ? "f16" : "f32";
+  std::string bias_str = config.has_bias ? "+bias" : "";
+  // Output length: (L + 2p - d*(K-1) - 1) / s + 1, using integer division.
+  int64_t L_out =
+      (config.L + 2 * config.padding - config.dilation * (config.K - 1) - 1) /
+          config.stride +
+      1;
+
+  std::string name = prefix + " conv1d_dw" + bias_str + " [" +
+      std::to_string(config.N) + "," + std::to_string(config.C) + "," +
+      std::to_string(config.L) + "] K=" + std::to_string(config.K) +
+      " s=" + std::to_string(config.stride) +
+      " p=" + std::to_string(config.padding) +
+      " d=" + std::to_string(config.dilation) + " " + storage_str + "(HP) " +
+      dtype_str;
+
+  test_case.set_name(name);
+  test_case.set_operator_name("test_etvk.test_conv1d_dw.default");
+
+  // Input: [N, C, L] height-packed
+  ValueSpec input(
+      {config.N, config.C, config.L},
+      dtype,
+      storage_type,
+      utils::kHeightPacked,
+      DataGenType::RANDOM);
+  test_case.add_input_spec(input);
+
+  // Weight: [C, 1, K] height-packed, constant
+  ValueSpec weight(
+      {config.C, 1, config.K},
+      dtype,
+      storage_type,
+      utils::kHeightPacked,
+      DataGenType::RANDOM);
+  weight.set_constant(true);
+  test_case.add_input_spec(weight);
+
+  // Bias: [C] constant tensor, or a None placeholder when has_bias is false
+  if (config.has_bias) {
+    ValueSpec bias(
+        {config.C},
+        dtype,
+        storage_type,
+        utils::kWidthPacked,
+        DataGenType::RANDOM);
+    bias.set_constant(true);
+    test_case.add_input_spec(bias);
+  } else {
+    ValueSpec none_bias(static_cast(0));
+    none_bias.set_none(true);
+    test_case.add_input_spec(none_bias);
+  }
+
+  // stride (single-element int list)
+  test_case.add_input_spec(
+      ValueSpec(std::vector{static_cast(config.stride)}));
+  // padding (single-element int list)
+  test_case.add_input_spec(
+      ValueSpec(std::vector{static_cast(config.padding)}));
+  // dilation (single-element int list)
+  test_case.add_input_spec(
+      ValueSpec(std::vector{static_cast(config.dilation)}));
+  // groups = C (depthwise)
+  test_case.add_input_spec(ValueSpec(static_cast(config.C)));
+
+  // Output: [N, C, L_out] height-packed
+  ValueSpec output(
+      {config.N, config.C, L_out},
+      dtype,
+      storage_type,
+      utils::kHeightPacked,
+      DataGenType::ZEROS);
+  test_case.add_output_spec(output);
+  // Half precision gets looser tolerances than float.
+  if (dtype == vkapi::kHalf) {
+    test_case.set_abs_tolerance(1e-1f);
+    test_case.set_rel_tolerance(1e-2f);
+  } else {
+    test_case.set_abs_tolerance(1e-3f);
+    test_case.set_rel_tolerance(1e-3f);
+  }
+
+  test_case.set_shader_filter({"nchw_to", "to_nchw", "view_copy"});
+
+  return test_case;
+}
+// CPU float reference: direct depthwise conv1d over [N, C, L], then bias.
+static void conv1d_dw_reference_impl(TestCase& test_case) {
+  const auto& input_spec = test_case.inputs()[0];
+  const auto& weight_spec = test_case.inputs()[1];
+  const auto& bias_spec = test_case.inputs()[2];
+  const auto& stride_spec = test_case.inputs()[3];
+  const auto& padding_spec = test_case.inputs()[4];
+  const auto& dilation_spec = test_case.inputs()[5];
+  ValueSpec& output = test_case.outputs()[0];
+  // Non-float dtypes (e.g. half) have no reference; signal that by throwing.
+  if (input_spec.dtype != vkapi::kFloat) {
+    throw std::invalid_argument("Reference only supports float");
+  }
+
+  auto in_sizes = input_spec.get_tensor_sizes();
+  auto w_sizes = weight_spec.get_tensor_sizes();
+  auto out_sizes = output.get_tensor_sizes();
+
+  const int64_t N = in_sizes[0];
+  const int64_t C = in_sizes[1];
+  const int64_t L_in = in_sizes[2];
+  const int64_t K = w_sizes[2];
+  const int64_t L_out = out_sizes[2];
+
+  const int64_t stride = stride_spec.get_int_list()[0];
+  const int64_t padding = padding_spec.get_int_list()[0];
+  const int64_t dilation = dilation_spec.get_int_list()[0];
+
+  const auto& in_data = input_spec.get_float_data();
+  const auto& w_data = weight_spec.get_float_data();
+  auto& ref_data = output.get_ref_float_data();
+  ref_data.resize(N * C * L_out, 0.0f);
+  // Data is indexed as contiguous [N, C, L]; out-of-range taps contribute 0.
+  for (int64_t n = 0; n < N; ++n) {
+    for (int64_t c = 0; c < C; ++c) {
+      for (int64_t l = 0; l < L_out; ++l) {
+        float sum = 0.0f;
+        for (int64_t k = 0; k < K; ++k) {
+          const int64_t l_in = l * stride - padding + k * dilation;
+          if (l_in >= 0 && l_in < L_in) {
+            sum += in_data[n * C * L_in + c * L_in + l_in] * w_data[c * K + k];
+          }
+        }
+        ref_data[n * C * L_out + c * L_out + l] = sum;
+      }
+    }
+  }
+  // Bias is per channel and added to every output position of that channel.
+  if (!bias_spec.is_none()) {
+    const auto& bias_data = bias_spec.get_float_data();
+    for (int64_t n = 0; n < N; ++n) {
+      for (int64_t c = 0; c < C; ++c) {
+        for (int64_t l = 0; l < L_out; ++l) {
+          ref_data[n * C * L_out + c * L_out + l] += bias_data[c];
+        }
+      }
+    }
+  }
+}
+// Lists every config x storage type (x both dtypes for perf shapes) to run.
+static std::vector generate_conv1d_dw_test_cases() {
+  std::vector test_cases;
+
+  std::vector storage_types = {
+      utils::kTexture3D, utils::kBuffer};
+
+  // Accuracy shapes (float only)
+  std::vector accu_configs = {
+      // {N, C, L, K, stride, padding, dilation, has_bias}
+      {1, 16, 64, 3, 1, 1, 1, false},
+      {1, 32, 128, 5, 1, 2, 1, true},
+      {1, 64, 32, 3, 2, 1, 1, false},
+      {2, 16, 64, 3, 1, 1, 1, true},
+      {1, 16, 64, 7, 1, 3, 2, false},
+      // Non-aligned channel counts (not a multiple of 4)
+      {1, 5, 64, 3, 1, 1, 1, false},
+      {1, 5, 64, 3, 1, 1, 1, true},
+      {1, 7, 32, 5, 1, 2, 1, false},
+      {1, 13, 48, 3, 2, 1, 1, true},
+      {2, 7, 64, 3, 1, 1, 1, false},
+  };
+
+  for (const auto& cfg : accu_configs) {
+    for (auto st : storage_types) {
+      test_cases.push_back(create_conv1d_dw_test_case(cfg, vkapi::kFloat, st));
+    }
+  }
+
+  // Performance shapes (half + float)
+  std::vector perf_configs = {
+      {1, 256, 1024, 3, 1, 1, 1, false},
+      {1, 512, 2048, 5, 1, 2, 1, true},
+      {1, 128, 4096, 31, 1, 15, 1, false},
+  };
+
+  for (const auto& cfg : perf_configs) {
+    for (auto st : storage_types) {
+      test_cases.push_back(create_conv1d_dw_test_case(cfg, vkapi::kFloat, st));
+      test_cases.push_back(create_conv1d_dw_test_case(cfg, vkapi::kHalf, st));
+    }
+  }
+
+  return test_cases;
+}
+// FLOP estimate: 2 (multiply + add) per kernel tap per output element.
+static int64_t conv1d_dw_flop_calculator(const TestCase& test_case) {
+  auto in_sizes = test_case.inputs()[0].get_tensor_sizes();
+  auto w_sizes = test_case.inputs()[1].get_tensor_sizes();
+  auto out_sizes = test_case.outputs()[0].get_tensor_sizes();
+
+  const int64_t N = in_sizes[0];
+  const int64_t C = in_sizes[1];
+  const int64_t K = w_sizes[2];
+  const int64_t L_out = out_sizes[2];
+
+  return 2 * N * C * L_out * K;
+}
+// Runs all generated cases through execute_test_cases with the CPU reference.
+int main(int argc, char* argv[]) {
+  set_debugging(false);
+  set_print_output(false);
+  set_print_latencies(false);
+  set_use_gpu_timestamps(true);
+
+  print_performance_header();
+  std::cout << "Conv1d Depthwise (Height-Packed) Benchmark" << std::endl;
+  print_separator();
+
+  ReferenceComputeFunc ref_fn = conv1d_dw_reference_impl;
+  // NOTE(review): 3 and 10 look like warmup / benchmark run counts - confirm.
+  auto results = execute_test_cases(
+      generate_conv1d_dw_test_cases,
+      conv1d_dw_flop_calculator,
+      "Conv1dDW",
+      3,
+      10,
+      ref_fn);
+
+  return 0;
+}