Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
217 changes: 217 additions & 0 deletions backends/vulkan/runtime/graph/ops/glsl/conv2d_gemm.glsl
Original file line number Diff line number Diff line change
@@ -0,0 +1,217 @@
/*
* Copyright (c) Meta Platforms, Inc. and affiliates.
* All rights reserved.
*
* This source code is licensed under the BSD-style license found in the
* LICENSE file in the root directory of this source tree.
*/

/*
* conv2d_gemm: GEMM step of im2col-backed conv2d.
*
* Reads the im2col'd input produced by conv2d_im2col.glsl as a 2D matrix
* of shape [M, K_total] (M = H_out * W_out, K_total = Kh*Kw*Cin_padded)
* and writes the conv2d output to a channels-packed texture3D with
* logical shape [1, C_out, H_out, W_out].
*
* The im2col input can be any of:
* - texture2d, width-packed: texel at (k4, m) holds 4 K values for row m.
* IN_STORAGE=texture2d codegen.
* - texture3d, channels-packed: texel at (ow, oh, k4) holds 4 K values
* for output spatial position (oh, ow). Used when M would exceed
* max_texture2d_dim. IN_STORAGE=texture3d codegen.
* - buffer: vec4 at offset m*K4 + k4, same K packing.
* IN_STORAGE=buffer codegen.
*
* The matmul interpretation is:
* out[m, n] = sum_k im2col[m, k] * weight[n, k] + bias[n]
* with M = H_out * W_out, K = K_total, N = C_out.
*/

#version 450 core

#define PRECISION ${PRECISION}

$if IN_STORAGE == "buffer" and DTYPE == "half":
${define_explicit_type_extensions(DTYPE)}

// VEC4_T is the input storage's natural texel type, which is also the tile type
// (the linear_fp_*_tile headers default the tile vec4 type to VEC4_T). For the
// buffer/half path this resolves to f16vec4, so the GEMM inner loop accumulates
// in true FP16 — the fma emits mad.f16 and the accumulators live in half-width
// registers. Texture-sampled half always returns vec4, so FP16 accumulation is
// naturally confined to the buffer (Mali) path; the texture variants (Adreno),
// where FP16 accumulation regresses, stay vec4 / FP32 with no extra gating.
#define VEC4_T ${texel_load_type(DTYPE, IN_STORAGE)}

// OUT_VEC4_T is the output surface type. t_out is always texture3d, whose
// imageStore ABI takes vec4 (fp32) regardless of DTYPE, so the accumulator tile
// is cast from VEC4_T to OUT_VEC4_T at store time.
#define OUT_VEC4_T ${texel_load_type(DTYPE, "texture3d")}

#define TILE_M4 ${TILE_M4}
#define TILE_K4 ${TILE_K4}
#define TILE_N4 ${TILE_N4}

#define TILE_M ${TILE_M}
#define TILE_K ${TILE_K4 * 4}
#define TILE_N ${TILE_N4 * 4}

$if IN_STORAGE == "buffer":
#define INPUT_BUFFER
$elif IN_STORAGE == "texture3d":
#define INPUT_TEXTURE3D

${define_required_extensions("texture3d", DTYPE)}
$if IN_STORAGE == "buffer":
${define_required_extensions("buffer", DTYPE)}

layout(std430) buffer;

#include "common.glslh"

${layout_declare_tensor(B, "w", "t_out", DTYPE, "texture3d")}
$if IN_STORAGE == "buffer":
${layout_declare_tensor(B, "r", "t_in", DTYPE, "buffer", is_scalar_array=False)}
$else:
${layout_declare_tensor(B, "r", "t_in", DTYPE, IN_STORAGE)}
${layout_declare_tensor(B, "r", "t_weight_packed", DTYPE, "texture2d")}
${layout_declare_tensor(B, "r", "t_bias", DTYPE, "texture2d")}

${layout_declare_ubo(B, "ivec4", "out_sizes")}

// Push constants are uploaded in 16-byte chunks (one 4-component vector each).
// K4_total is shape-independent (it depends only on C_in and the conv kernel
// dims), so it is safe to bake at build time even under dynamic shapes.
// M = H_out * W_out IS shape-dependent, so it is derived at runtime from the
// refreshed out_sizes UBO in main() rather than read from here.
//
// Only gemm_dims.x and clamp_vals.xy are read by this shader; the remaining
// components are padding.
layout(push_constant) uniform restrict Block {
ivec4 gemm_dims; // (K4_total, _unused, _unused, _unused)
vec4 clamp_vals; // (out_min, out_max, _unused, _unused)
};

// Number of vec4 columns along K in the im2col matrix. main() uses it as the
// K-loop bound, and the buffer input path uses it as the row stride.
#define K4_TOTAL gemm_dims.x
// Lower / upper bounds of the clamp applied in store_output_tile_with_checks
// when activation_type == 2; not read for any other activation_type.
#define OUT_MIN clamp_vals.x
#define OUT_MAX clamp_vals.y

layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;

${layout_declare_spec_const(C, "int", "activation_type", "0")}

#include "linear_fp_input_tile.glslh"
#include "linear_fp_packed_weight_tile_load.glslh"
#include "linear_fp_output_tile_fp_compute.glslh"

/*
 * Load a TILE_M x TILE_K4 tile of the im2col'd input into `tile`.
 *
 * tile.data[m][k4] receives the vec4 at matrix row (m_start + m) and vec4
 * column (k4_start + k4). Entries whose row is >= M or whose column is >= K4
 * are zero-filled rather than fetched, so every tile entry is written.
 *
 * Addressing of an in-range (row, col) depends on the input storage variant:
 *   - buffer:    t_in[row * K4 + col]
 *   - texture3d: texel (row % W_out, row / W_out, col); the flat row index is
 *                split back into (ow, oh) and K4 runs along the Z axis
 *   - texture2d: texel (col, row)
 *
 * W_out is only consumed by the texture3d variant.
 */
void load_input_tile_with_checks(
    out FPInputTile tile,
    const int k4_start,
    const int m_start,
    const int K4,
    const int M,
    const int W_out) {
  [[unroll]] for (int mi = 0; mi < TILE_M; ++mi) {
    const int row = m_start + mi;
    [[unroll]] for (int ki = 0; ki < TILE_K4; ++ki) {
      const int col = k4_start + ki;
      if (row >= M || col >= K4) {
        // Outside the logical [M, K4] matrix: pad with zeros.
        tile.data[mi][ki] = LINEAR_FP_INPUT_TILE_VEC4_T(0.0);
      } else {
#if defined(INPUT_BUFFER)
        // Convert the SSBO element to the input tile vector type.
        tile.data[mi][ki] = LINEAR_FP_INPUT_TILE_VEC4_T(t_in[row * K4 + col]);
#elif defined(INPUT_TEXTURE3D)
        // Flat row index -> (ow, oh); the K4 column selects the Z slice.
        const ivec3 texel_pos = ivec3(row % W_out, row / W_out, col);
        tile.data[mi][ki] =
            LINEAR_FP_INPUT_TILE_VEC4_T(texelFetch(t_in, texel_pos, 0));
#else
        // texture2d: K4 column along X, matrix row along Y.
        const ivec2 texel_pos = ivec2(col, row);
        tile.data[mi][ki] =
            LINEAR_FP_INPUT_TILE_VEC4_T(texelFetch(t_in, texel_pos, 0));
#endif
      }
    }
  }
}

/*
 * Apply the optional activation to an accumulated output tile and write it to
 * t_out.
 *
 * out_tile.data[m][n4] is stored only when its flat spatial index
 * (m_start + m) is < M and its channel-texel index (n4_start + n4) is < N4.
 * The flat spatial index is split into (x, y) = (index % W_out,
 * index / W_out) and the channel-texel index is used as the Z coordinate.
 *
 * activation_type selects the post-op applied before the store:
 *   1     -> max(texel, 0)
 *   2     -> clamp(texel, OUT_MIN, OUT_MAX)
 *   other -> value stored unchanged
 */
void store_output_tile_with_checks(
    const FPOutTile out_tile,
    const int n4_start,
    const int m_start,
    const int N4,
    const int M,
    const int W_out) {
  [[unroll]] for (int mi = 0; mi < TILE_M; ++mi) {
    const int flat_pos = m_start + mi;
    if (flat_pos < M) {
      [[unroll]] for (int ni = 0; ni < TILE_N4; ++ni) {
        const int n4_idx = n4_start + ni;
        if (n4_idx < N4) {
          // Convert the accumulator to the output surface type before the
          // activation and the imageStore.
          OUT_VEC4_T result = OUT_VEC4_T(out_tile.data[mi][ni]);
          if (activation_type == 1) {
            result = max(result, OUT_VEC4_T(0.0));
          } else if (activation_type == 2) {
            result = clamp(result, OUT_VEC4_T(OUT_MIN), OUT_VEC4_T(OUT_MAX));
          }
          const ivec3 out_pos =
              ivec3(flat_pos % W_out, flat_pos / W_out, n4_idx);
          imageStore(t_out, out_pos, result);
        }
      }
    }
  }
}

/*
 * One invocation computes one TILE_M x TILE_N4 output tile:
 *   gl_GlobalInvocationID.x selects the tile along N4 (output channel texels),
 *   gl_GlobalInvocationID.y selects the tile along M (flat spatial index).
 *
 * The tile is accumulated over K in steps of TILE_K4, the bias is added, and
 * the result is handed to store_output_tile_with_checks.
 */
void main() {
  // out_sizes is read as (W_out, H_out, C_out, _). M is derived from it at
  // runtime so that it follows the current output shape.
  const int W_out = out_sizes.x;
  const int H_out = out_sizes.y;
  const int C_out = out_sizes.z;

  const int M = W_out * H_out;
  const int N4 = div_up_4(C_out);
  const int K4 = K4_TOTAL;

  const int n4_start = int(gl_GlobalInvocationID.x) * TILE_N4;
  const int m_start = int(gl_GlobalInvocationID.y) * TILE_M;

  // Invocations whose tile lies entirely outside the output do nothing.
  if (n4_start >= N4 || m_start >= M) {
    return;
  }

  FPOutTile acc_tile;
  initialize(acc_tile);

  FPInputTile in_tile;
  FPWeightTile w_tile;

  // Walk the K dimension one TILE_K4 step at a time, accumulating into
  // acc_tile.
  for (int k4_start = 0; k4_start < K4; k4_start += TILE_K4) {
    load_input_tile_with_checks(in_tile, k4_start, m_start, K4, M, W_out);
    load_packed_weight_tile_with_checks(w_tile, n4_start, k4_start, 0, N4, K4);
    fp_accumulate_with_fp_weight(acc_tile, in_tile, w_tile);
  }

  // Bias depends only on the channel texel, so it is fetched once per n4 and
  // then added to every row of the tile.
  [[unroll]] for (int ni = 0; ni < TILE_N4; ++ni) {
    const int n4_idx = n4_start + ni;
    if (n4_idx < N4) {
      // Convert the fetched bias texel to the accumulator vector type.
      const LINEAR_FP_OUTPUT_TILE_VEC4_T bias_vec =
          LINEAR_FP_OUTPUT_TILE_VEC4_T(texelFetch(t_bias, ivec2(n4_idx, 0), 0));
      [[unroll]] for (int mi = 0; mi < TILE_M; ++mi) {
        acc_tile.data[mi][ni] += bias_vec;
      }
    }
  }

  store_output_tile_with_checks(acc_tile, n4_start, m_start, N4, M, W_out);
}
26 changes: 26 additions & 0 deletions backends/vulkan/runtime/graph/ops/glsl/conv2d_gemm.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

# Codegen parameters for conv2d_gemm.glsl.
conv2d_gemm:
parameter_names_with_default_values:
DTYPE: float
IN_STORAGE: texture2d
# Tile extents in units of 4 elements. The shader template derives TILE_K and
# TILE_N as TILE_K4 * 4 and TILE_N4 * 4.
TILE_M4: 1
TILE_K4: 1
TILE_N4: 1
# Rows per tile; substituted directly into the shader's TILE_M define.
# NOTE(review): presumably this must stay equal to TILE_M4 * 4 — confirm.
TILE_M: 4
generate_variant_forall:
combination:
parameter_names: [IN_STORAGE, DTYPE]
# One entry per (IN_STORAGE, DTYPE) pair listed below; presumably each entry
# yields one generated shader variant — verify against the codegen script.
combos:
- parameter_values: [texture2d, float]
- parameter_values: [texture2d, half]
- parameter_values: [texture3d, float]
- parameter_values: [texture3d, half]
- parameter_values: [buffer, float]
- parameter_values: [buffer, half]
shader_variants:
- NAME: conv2d_gemm
132 changes: 132 additions & 0 deletions backends/vulkan/runtime/graph/ops/glsl/conv2d_im2col.glsl
Original file line number Diff line number Diff line change
@@ -0,0 +1,132 @@
/*
* Copyright (c) Meta Platforms, Inc. and affiliates.
* All rights reserved.
*
* This source code is licensed under the BSD-style license found in the
* LICENSE file in the root directory of this source tree.
*/

/*
* Im2col transformation for FP32 / FP16 conv2d.
*
* The output is a 2D matrix of shape [M, K_total] where
* M = H_out * W_out (number of output spatial positions)
* K_total = Kh * Kw * align_up_4(C_in) (flattened receptive field)
*
* K layout (so a 4-tile in K — one vec4 — holds the same kernel position):
* K = (ki * Kw + kj) * Cin_padded + ci
*
* Three codegen'd storage variants of the output tensor:
* - texture2d, width-packed: texel at (k4, m) holds 4 K values for spatial
* position m. Extents = (K_total/4, M).
* - texture3d, channels-packed: texel at (ow, oh, k4) holds 4 K values
* for output spatial position (oh, ow). Extents = (W_out, H_out, K4).
* Used as a fallback when M would exceed max_texture2d_dim.
* - buffer: vec4 at offset (m * K4 + k4), same K packing.
*
* The caller picks storage per device (Mali → buffer; others → texture2d
* when its 2D extents fit, texture3d when its 3D extents fit, else buffer).
*/

#version 450 core

#define PRECISION ${PRECISION}

#define VEC4_T ${texel_load_type(DTYPE, "texture3d")}

$if OUT_STORAGE == "buffer":
#define OUTPUT_BUFFER
#define VEC4_BUF_T ${texel_load_type(DTYPE, "buffer")}
$elif OUT_STORAGE == "texture3d":
#define OUTPUT_TEXTURE3D

${define_required_extensions("texture3d", DTYPE)}
$if OUT_STORAGE == "buffer":
${define_required_extensions("buffer", DTYPE)}

layout(std430) buffer;

$if OUT_STORAGE == "buffer":
${layout_declare_tensor(B, "w", "t_out", DTYPE, "buffer", is_scalar_array=False)}
$else:
${layout_declare_tensor(B, "w", "t_out", DTYPE, OUT_STORAGE)}
${layout_declare_tensor(B, "r", "t_in", DTYPE, "texture3d")}

${layout_declare_ubo(B, "ivec4", "in_sizes")}

// Push constants are uploaded in 16-byte chunks (one ivec4 each) to comply
// with the per-entry size limit. All of these fields are shape-independent
// (they depend only on the conv kernel params and C_in), so they are safe to
// bake at build time even under dynamic shapes — W_out / H_out / M are derived
// at runtime from the refreshed in_sizes UBO below.
//
// dims.y and dims.z are not read by this shader.
layout(push_constant) uniform restrict Block {
ivec4 kernel_stride; // (Kh, Kw, Sh, Sw)
ivec4 padding_dil; // (Ph, Pw, Dh, Dw)
ivec4 dims; // (Cin_padded, _unused, _unused, K4_total)
};

// Kernel extents and strides (height first, then width).
#define KERNEL_H kernel_stride.x
#define KERNEL_W kernel_stride.y
#define STRIDE_H kernel_stride.z
#define STRIDE_W kernel_stride.w
// Padding and dilation (height first, then width).
#define PADDING_H padding_dil.x
#define PADDING_W padding_dil.y
#define DILATION_H padding_dil.z
#define DILATION_W padding_dil.w
// Input channel count rounded up to a multiple of 4; main() divides the K
// index by it to recover the kernel tap (ki * Kw + kj).
#define CIN_PADDED dims.x
// Number of vec4 columns along K; used as the X bound check in main() and as
// the row stride of the buffer output.
#define K4_TOTAL dims.w

layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;

/*
 * One invocation produces one vec4 of the im2col matrix:
 *   gl_GlobalInvocationID.x is the vec4 column along K (k4),
 *   gl_GlobalInvocationID.y is the flat output spatial index (m).
 *
 * The vec4 holds 4 consecutive input channels for a single kernel tap
 * (ki, kj) at output position (oh, ow), or zeros when that tap falls outside
 * the input (i.e. in the padding region).
 */
void main() {
  const int k4_idx = int(gl_GlobalInvocationID.x);
  const int m_idx = int(gl_GlobalInvocationID.y);

  // Output extents are recomputed from the in_sizes UBO, read here as
  // (W_in, H_in, ...), so the mapping follows the current input shape. The
  // general dilation-aware formula is used; per the original author's note
  // the caller only routes dilation == 1 to this shader (not verifiable from
  // this file).
  const int kernel_span_w = DILATION_W * (KERNEL_W - 1) + 1;
  const int kernel_span_h = DILATION_H * (KERNEL_H - 1) + 1;
  const int out_w = (in_sizes.x + 2 * PADDING_W - kernel_span_w) / STRIDE_W + 1;
  const int out_h = (in_sizes.y + 2 * PADDING_H - kernel_span_h) / STRIDE_H + 1;
  const int num_rows = out_h * out_w;

  if (k4_idx >= K4_TOTAL || m_idx >= num_rows) {
    return;
  }

  // K = (ki * Kw + kj) * Cin_padded + ci. With Cin_padded a multiple of 4,
  // the 4 K values of this vec4 share one (ki, kj) and cover 4 consecutive
  // channels starting at first_ci.
  const int first_k = k4_idx * 4;
  const int tap_idx = first_k / CIN_PADDED; // ki * Kw + kj
  const int first_ci = first_k % CIN_PADDED;
  const int ki = tap_idx / KERNEL_W;
  const int kj = tap_idx % KERNEL_W;
  const int ci_texel = first_ci >> 2; // channel-texel index, first_ci / 4

  // Flat output position -> (oh, ow).
  const int oh = m_idx / out_w;
  const int ow = m_idx % out_w;

  // Input coordinate sampled by this (oh, ow, ki, kj).
  const int in_y = oh * STRIDE_H - PADDING_H + ki * DILATION_H;
  const int in_x = ow * STRIDE_W - PADDING_W + kj * DILATION_W;

  const bool inside_input =
      in_y >= 0 && in_y < in_sizes.y && in_x >= 0 && in_x < in_sizes.x;

  // Taps that land in the padding region contribute zeros.
  VEC4_T packed = VEC4_T(0);
  if (inside_input) {
    packed = texelFetch(t_in, ivec3(in_x, in_y, ci_texel), 0);
  }

#if defined(OUTPUT_BUFFER)
  t_out[m_idx * K4_TOTAL + k4_idx] = VEC4_BUF_T(packed);
#elif defined(OUTPUT_TEXTURE3D)
  imageStore(t_out, ivec3(ow, oh, k4_idx), packed);
#else
  imageStore(t_out, ivec2(k4_idx, m_idx), packed);
#endif
}
Loading
Loading