4 changes: 4 additions & 0 deletions .gitignore
@@ -92,3 +92,7 @@ test_data/*
# The experimental/builder directory should be tracked despite matching build*
!experimental/builder
!experimental/builder/**
experimental/grouped_convolution_tile_instances/instances/*
!experimental/grouped_convolution_tile_instances/instances/*.in
!experimental/grouped_convolution_tile_instances/instances/*.inc
experimental/grouped_convolution_tile_instances/*.inc
9 changes: 5 additions & 4 deletions CMakeLists.txt
@@ -704,6 +704,11 @@ option(BUILD_MHA_LIB "Build the static library for flash attention" OFF)

add_subdirectory(library)

if (CK_EXPERIMENTAL_BUILDER)
add_subdirectory(experimental/builder)
add_subdirectory(experimental/grouped_convolution_tile_instances)
endif()

if(NOT GPU_ARCHS AND USER_GPU_TARGETS AND NOT MIOPEN_REQ_LIBS_ONLY)
rocm_package_setup_component(tests
LIBRARY_NAME composablekernel
@@ -735,10 +740,6 @@ if (NOT MIOPEN_REQ_LIBS_ONLY)
add_subdirectory(profiler)
endif()

if (CK_EXPERIMENTAL_BUILDER)
add_subdirectory(experimental/builder)
endif()

if(CK_USE_CODEGEN AND (SUPPORTED_GPU_TARGETS MATCHES "gfx9" OR GPU_ARCHS))
add_subdirectory(codegen)
endif()
69 changes: 45 additions & 24 deletions Jenkinsfile
@@ -580,7 +580,7 @@ def cmake_build(Map conf=[:]){
if (params.NINJA_BUILD_TRACE) {
echo "running ninja build trace"
}
if (params.RUN_BUILDER_TESTS && !setup_args.contains("-DCK_CXX_STANDARD=") && !setup_args.contains("gfx10") && !setup_args.contains("gfx11")) {
if ((params.RUN_BUILDER_TESTS || params.RUN_FULL_CONV_TILE_TESTS) && !setup_args.contains("-DCK_CXX_STANDARD=") && !setup_args.contains("gfx10") && !setup_args.contains("gfx11")) {
setup_args = " -D CK_EXPERIMENTAL_BUILDER=ON " + setup_args
}
setup_cmd = conf.get(
@@ -1091,7 +1091,7 @@ CRON_SETTINGS = BRANCH_NAME == "develop" ? '''0 23 * * * % RUN_FULL_QA=true;RUN_
0 19 * * * % BUILD_DOCKER=true;COMPILER_VERSION=amd-staging;BUILD_COMPILER=/llvm-project/build/bin/clang++;USE_SCCACHE=false;NINJA_BUILD_TRACE=true;RUN_ALL_UNIT_TESTS=true;FORCE_CI=true
0 17 * * * % BUILD_DOCKER=true;COMPILER_VERSION=amd-mainline;BUILD_COMPILER=/llvm-project/build/bin/clang++;USE_SCCACHE=false;NINJA_BUILD_TRACE=true;RUN_ALL_UNIT_TESTS=true;FORCE_CI=true
0 15 * * * % BUILD_INSTANCES_ONLY=true;USE_SCCACHE=false;NINJA_BUILD_TRACE=true;FORCE_CI=true
0 13 * * * % RUN_AITER_TESTS=true;BUILD_LEGACY_OS=true;USE_SCCACHE=false;RUN_PERFORMANCE_TESTS=false;FORCE_CI=true
0 13 * * * % RUN_FULL_CONV_TILE_TESTS=true;RUN_AITER_TESTS=true;BUILD_LEGACY_OS=true;USE_SCCACHE=false;RUN_PERFORMANCE_TESTS=false;FORCE_CI=true
0 11 * * * % RUN_PYTORCH_TESTS=true;RUN_CODEGEN_TESTS=false;USE_SCCACHE=false;RUN_PERFORMANCE_TESTS=false;BUILD_GFX101=false;BUILD_GFX103=false;BUILD_GFX11=false;BUILD_GFX12=false;BUILD_GFX90A=false;FORCE_CI=true''' : ""

pipeline {
@@ -1255,6 +1255,10 @@ pipeline {
name: "RUN_AITER_TESTS",
defaultValue: false,
description: "Run AITER tests with latest CK develop branch (default: OFF)")
booleanParam(
name: "RUN_FULL_CONV_TILE_TESTS",
defaultValue: false,
description: "Run CK Tile grouped convolution tests with latest CK develop branch (default: OFF)")
string(
name: 'aiter_branch',
defaultValue: 'main',
@@ -1318,21 +1322,15 @@ pipeline {
agent{ label rocmnode("nogpu") }
environment{
setup_args = "NO_CK_BUILD"
execute_cmd = "(cd .. && git ls-files \'*.h\' \
\'*.hpp\' \
\'*.cpp\' \
\'*.h.in\' \
\'*.hpp.in\' \
\'*.cpp.in\' \
\'*.cl\' \
| grep -v 'build/' \
| grep -v 'include/rapidjson' \
| xargs -n 1 -P 1 -I{} -t sh -c \'clang-format-18 -style=file {} | diff - {}\') && \
execute_cmd = """cd .. && \
find . -type f \\( -name '*.h' -o -name '*.hpp' -o -name '*.cpp' -o -name '*.h.in' -o -name '*.hpp.in' -o -name '*.cpp.in' -o -name '*.cl' \\) \
-not -path '*/build/*' -not -path '*/include/rapidjson/*' | \
xargs -P 8 -I{} sh -c 'clang-format-18 -style=file {} | diff -u - {} || (echo "ERROR: {} needs formatting" && exit 1)' && \
/cppcheck/build/bin/cppcheck ../* -v -j \$(nproc) -I ../include -I ../profiler/include -I ../library/include \
-D CK_ENABLE_FP64 -D CK_ENABLE_FP32 -D CK_ENABLE_FP16 -D CK_ENABLE_FP8 -D CK_ENABLE_BF16 -D CK_ENABLE_BF8 -D CK_ENABLE_INT8 \
-D __gfx908__ -D __gfx90a__ -D __gfx942__ -D __gfx1030__ -D __gfx1100__ -D __gfx1101__ -D __gfx1102__ \
-U __gfx803__ -U __gfx900__ -U __gfx906__ -U CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4 \
--file-filter=*.cpp --force --enable=all --output-file=ck_cppcheck.log"
--file-filter=*.cpp --force --enable=all --output-file=ck_cppcheck.log"""
}
steps{
buildHipClangJobAndReboot(setup_args:setup_args, setup_cmd: "", build_cmd: "", execute_cmd: execute_cmd)
@@ -1348,17 +1346,10 @@ pipeline {
agent{ label rocmnode("nogpu") }
environment{
setup_args = "NO_CK_BUILD"
execute_cmd = "(cd .. && git ls-files \
\'*.h\' \
\'*.hpp\' \
\'*.cpp\' \
\'*.h.in\' \
\'*.hpp.in\' \
\'*.cpp.in\' \
\'*.cl\' \
| grep -v 'build/' \
| grep -v 'include/rapidjson' \
| xargs -n 1 -P 1 -I{} -t sh -c \'clang-format-18 -style=file {} | diff - {}\')"
execute_cmd = """cd .. && \
find . -type f \\( -name '*.h' -o -name '*.hpp' -o -name '*.cpp' -o -name '*.h.in' -o -name '*.hpp.in' -o -name '*.cpp.in' -o -name '*.cl' \\) \
-not -path '*/build/*' -not -path '*/include/rapidjson/*' | \
xargs -P 8 -I{} sh -c 'clang-format-18 -style=file {} | diff -u - {} || (echo "ERROR: {} needs formatting" && exit 1)'"""
}
steps{
buildHipClangJobAndReboot(setup_args:setup_args, setup_cmd: "", build_cmd: "", execute_cmd: execute_cmd)
@@ -1423,6 +1414,36 @@ pipeline {
}
}
}
stage("Run Full Grouped Conv Tile Tests")
{
when {
beforeAgent true
expression { env.SHOULD_RUN_CI.toBoolean() }
}
parallel
{
stage("Run Full Grouped Conv Tile Tests on gfx90a")
{
when {
beforeAgent true
expression { params.RUN_FULL_CONV_TILE_TESTS.toBoolean() }
}
agent{ label rocmnode("gfx90a")}
environment{
setup_args = "NO_CK_BUILD"
execute_args = """ python3 ../experimental/builder/src/generate_instances.py --mode=profiler && \
../script/cmake-ck-dev.sh ../ gfx90a && \
make -j64 test_grouped_convnd_fwd_tile && \
./bin/test_grouped_convnd_fwd_tile"""
}
steps{
// TODO: Reenable after the instance fixes
// buildHipClangJobAndReboot(setup_args:setup_args, build_type: 'Release', execute_cmd: execute_args)
cleanWs()
}
}
}
}
stage("Run Grouped Conv Large Case Tests")
{
when {
2 changes: 1 addition & 1 deletion docs/sphinx/requirements.in
@@ -1,2 +1,2 @@
rocm-docs-core[api_reference]==1.31.2
rocm-docs-core[api_reference]==1.31.3
sphinxcontrib-bibtex==2.6.5
2 changes: 1 addition & 1 deletion docs/sphinx/requirements.txt
@@ -237,7 +237,7 @@ requests==2.32.3
# via
# pygithub
# sphinx
rocm-docs-core[api-reference]==1.31.2
rocm-docs-core[api-reference]==1.31.3
# via -r requirements.in
rpds-py==0.24.0
# via
8 changes: 7 additions & 1 deletion example/ck_tile/01_fmha/codegen/ops/fmha_fwd.py
@@ -315,7 +315,7 @@ def scheck(self) -> str:
assert False

def seqtune(self, max_bm0: int) -> str:
if self.bm0 == max_bm0:
if self.bm0 == max_bm0 or self.bm0 == 64:
return "true/*fall back to largest tile*/"
else:
return f"a.seqlen_q <= {self.bm0}"
@@ -847,6 +847,11 @@ def check_hdim_tile(
(problem_ctx.hdim, problem_ctx.hdim_v) != (128, 128)
and kernel_ctx.tile.F_bm0 != 128
)
or (
(problem_ctx.hdim, problem_ctx.hdim_v) == (128, 128)
and kernel_ctx.pipeline.tag != "qr_async"
and kernel_ctx.tile.F_bk0 == 64
)
):
# non qr_async_trload only support km0=128 tile size when hdim is not 128
# non qr_async only support kn0=128 tile size when hdim is 128
@@ -942,6 +947,7 @@ def get_hdim_tile_size_dict(cls, dtype: str) -> Optional[dict]:
( 96, 128) : [FmhaFwdTileSize(128, 128, 32, 128, 32, 96, 4, 1, 1, 4, 1, 1, 32, 32, 16, 32, 32, 16, -1)],
(128, 128) : [FmhaFwdTileSize( 16, 32, 64, 128, 32, 128, 1, 1, 1, 1, 1, 1, 16, 16, 32, 16, 16, 32, -1),
FmhaFwdTileSize( 32, 32, 128, 128, 32, 128, 1, 1, 1, 1, 1, 1, 32, 32, 16, 32, 32, 16, -1),
FmhaFwdTileSize( 64, 128, 32, 128, 32, 128, 4, 1, 1, 4, 1, 1, 16, 16, 32, 16, 16, 16, -1, CppConstraint('get_num_blocks(64) <= num_cus')),
FmhaFwdTileSize(128, 64, 32, 128, 16, 128, 4, 1, 1, 4, 1, 1, 32, 32, 16, 32, 32, 16, -1),
FmhaFwdTileSize(128, 128, 32, 128, 32, 128, 4, 1, 1, 4, 1, 1, 32, 32, 16, 32, 32, 16, -1)],
# (160, 160) : [FmhaFwdTileSize(128, 128 , 32, 160, 32, 160, 4, 1, 1, 4, 1, 1, 32, 32, 16, 32, 32, 16, 1)],
@@ -4,7 +4,13 @@
#include "run_gemm_quant_example.inc"

template <typename T>
using GemmConfig = GemmConfigQuantPrefill<T>;
using GemmConfig = GemmConfigABQuantPrefill<T>;

template <typename T>
using GemmConfigPreshuffleB = GemmConfigPreshuffleB_ABQuant_Prefill<T>;

// template <typename T>
// using GemmConfigPreshuffleB = GemmConfigPreshuffleB_ABQuant_Decode<T>;

void abquant_quantgrouped_instance_factory(
std::unordered_map<size_t, std::function<int(const ck_tile::ArgParser&)>>& lut)
@@ -78,7 +84,7 @@ void abquant_quantgrouped_instance_factory(
using BQuantGroupSize = ck_tile::QuantGroupShape<ck_tile::sequence<1, 1, 128>>;
using TypeConfig =
decltype(GemmQuantTypeConfig<ck_tile::fp8_t, ck_tile::fp8_t, ck_tile::half_t, float>{});
return run_gemm_example_prec_type<GemmConfigPreshuffleB_BQuant_Prefill<ck_tile::fp8_t>,
return run_gemm_example_prec_type<GemmConfigPreshuffleB<ck_tile::fp8_t>,
TypeConfig,
AQuantGroupSize,
BQuantGroupSize,
@@ -93,7 +99,7 @@ void abquant_quantgrouped_instance_factory(
using BQuantGroupSize = ck_tile::QuantGroupShape<ck_tile::sequence<1, 128, 128>>;
using TypeConfig =
decltype(GemmQuantTypeConfig<ck_tile::fp8_t, ck_tile::fp8_t, ck_tile::half_t, float>{});
return run_gemm_example_prec_type<GemmConfigPreshuffleB_BQuant_Prefill<ck_tile::fp8_t>,
return run_gemm_example_prec_type<GemmConfigPreshuffleB<ck_tile::fp8_t>,
TypeConfig,
AQuantGroupSize,
BQuantGroupSize,
@@ -108,7 +114,7 @@ void abquant_quantgrouped_instance_factory(
using BQuantGroupSize = ck_tile::QuantGroupShape<ck_tile::sequence<1, 1, 128>>;
using TypeConfig =
decltype(GemmQuantTypeConfig<ck_tile::bf8_t, ck_tile::bf8_t, ck_tile::half_t, float>{});
return run_gemm_example_prec_type<GemmConfigPreshuffleB_BQuant_Prefill<ck_tile::bf8_t>,
return run_gemm_example_prec_type<GemmConfigPreshuffleB<ck_tile::bf8_t>,
TypeConfig,
AQuantGroupSize,
BQuantGroupSize,
@@ -123,7 +129,7 @@ void abquant_quantgrouped_instance_factory(
using BQuantGroupSize = ck_tile::QuantGroupShape<ck_tile::sequence<1, 128, 128>>;
using TypeConfig =
decltype(GemmQuantTypeConfig<ck_tile::bf8_t, ck_tile::bf8_t, ck_tile::half_t, float>{});
return run_gemm_example_prec_type<GemmConfigPreshuffleB_BQuant_Prefill<ck_tile::bf8_t>,
return run_gemm_example_prec_type<GemmConfigPreshuffleB<ck_tile::bf8_t>,
TypeConfig,
AQuantGroupSize,
BQuantGroupSize,
29 changes: 29 additions & 0 deletions example/ck_tile/38_block_scale_gemm/gemm_utils.hpp
@@ -192,6 +192,28 @@ struct GemmConfigPreshuffleB_PreshuffleBQuant_Prefill
static constexpr bool PreshuffleQuant = true;
};

template <typename PrecType>
struct GemmConfigPreshuffleB_ABQuant_Prefill : public GemmConfigPreshuffleB_BQuant_Prefill<PrecType>
{
static constexpr ck_tile::index_t M_Warp = 2;
static constexpr ck_tile::index_t N_Warp = 2;
static constexpr ck_tile::index_t K_Warp = 1;

static constexpr bool kPadK = false;
static constexpr bool TransposeC = true;
};

template <typename PrecType>
struct GemmConfigPreshuffleB_ABQuant_Decode : public GemmConfigPreshuffleB_BQuant_Prefill<PrecType>
{
static constexpr ck_tile::index_t M_Tile = 16;
static constexpr ck_tile::index_t N_Tile = 128;
static constexpr ck_tile::index_t K_Tile = 256 / sizeof(PrecType);

static constexpr bool kPadK = false;
static constexpr bool TransposeC = true;
};

template <typename PrecType>
struct GemmConfigQuantPrefill : public GemmConfigBase
{
@@ -209,6 +231,13 @@ struct GemmConfigQuantPrefill : public GemmConfigBase
ck_tile::get_k_warp_tile<PrecType, M_Warp_Tile>();
};

template <typename PrecType>
struct GemmConfigABQuantPrefill : public GemmConfigQuantPrefill<PrecType>
{
static constexpr bool kPadK = false;
static constexpr bool TransposeC = true;
};

template <typename PrecType>
struct GemmConfigPreshuffleBQuantPrefill : public GemmConfigQuantPrefill<PrecType>
{
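For reference, here is a minimal compile-time sketch (not part of this PR) of what the two new AB-quant preshuffle configs pin down. It assumes `gemm_utils.hpp` from this example directory is on the include path and that `ck_tile::fp8_t` is one byte wide, so the Decode variant's `K_Tile = 256 / sizeof(PrecType)` resolves to 256 for fp8, while the Prefill variant keeps the tile shape it inherits from `GemmConfigPreshuffleB_BQuant_Prefill`.

```cpp
// Hypothetical compile-time checks, assuming the example's gemm_utils.hpp is included.
#include "gemm_utils.hpp"

// Decode: small M tile for latency-bound shapes; K tile scales with element size.
static_assert(GemmConfigPreshuffleB_ABQuant_Decode<ck_tile::fp8_t>::M_Tile == 16);
static_assert(GemmConfigPreshuffleB_ABQuant_Decode<ck_tile::fp8_t>::K_Tile == 256);

// Both AB-quant variants emit a transposed C block and skip K padding.
static_assert(GemmConfigPreshuffleB_ABQuant_Prefill<ck_tile::fp8_t>::TransposeC);
static_assert(!GemmConfigPreshuffleB_ABQuant_Prefill<ck_tile::fp8_t>::kPadK);
```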
@@ -33,6 +33,7 @@ template <typename GemmConfig,
float gemm_calc_quant(const ck_tile::QuantGemmHostArgs& args, const ck_tile::stream_config& s)
{
static_assert(std::is_same_v<CLayout, ck_tile::tensor_layout::gemm::RowMajor>);
constexpr bool transpose_c = QuantMode == ck_tile::QuantType::ABQuantGrouped;
using ComputeDataType = std::conditional_t<QuantMode == ck_tile::QuantType::AQuantGrouped ||
QuantMode == ck_tile::QuantType::RowColQuant,
typename TypeConfig::BDataType,
@@ -57,7 +58,7 @@ float gemm_calc_quant(const ck_tile::QuantGemmHostArgs& args, const ck_tile::str
QuantMode,
AQLayout, // for AQLayout
BQLayout, // for BQLayout
false,
transpose_c,
GemmConfig::DoubleSmemBuffer>;

using GemmPipelineProblem = ck_tile::GemmPipelineProblemBase<typename TypeConfig::ADataType,
@@ -88,7 +89,7 @@ float gemm_calc_quant(const ck_tile::QuantGemmHostArgs& args, const ck_tile::str
const auto Run = [&](const auto has_hot_loop_, const auto tail_number_) {
constexpr bool has_hot_loop_v = has_hot_loop_.value;
constexpr auto tail_number_v = tail_number_.value;
constexpr bool transpose_c = false;
// constexpr bool transpose_c = false;

// row-col and tensor quants use the regular pipeline, A/B/AB quants use their own
using PipelineProblem = std::conditional_t<
@@ -104,7 +104,7 @@ concept EpilogueDescriptor = requires(T t) {

// Concept for the thread cluster access order
template <typename T>
concept AccessOrderDescriptor = requires(T t) {
concept ThreadClusterOrderDescriptor = requires(T t) {
{ t.order } -> std::convertible_to<std::array<size_t, 3>>;
} || requires(T t) {
{ t.order } -> std::convertible_to<std::array<size_t, 4>>;
@@ -231,16 +231,16 @@ concept SpecifiesLdsTransfer = requires(T t) {

// Concept to check if a struct specifies thread cluster access order info.
template <typename T>
concept SpecifiesThreadClusterAccessOrder = requires(T t) {
{ T::transfer.a.block_transfer_access_order } -> AccessOrderDescriptor;
{ T::transfer.b.block_transfer_access_order } -> AccessOrderDescriptor;
concept SpecifiesThreadClusterArrangeOrder = requires(T t) {
{ T::transfer.a.thread_cluster_arrange_order } -> ThreadClusterOrderDescriptor;
{ T::transfer.b.thread_cluster_arrange_order } -> ThreadClusterOrderDescriptor;
};

// Concept to check if a struct specifies source access order info.
template <typename T>
concept SpecifiesSourceAccessOrder = requires(T t) {
{ T::transfer.a.src_access_order } -> AccessOrderDescriptor;
{ T::transfer.b.src_access_order } -> AccessOrderDescriptor;
{ T::transfer.a.src_access_order } -> ThreadClusterOrderDescriptor;
{ T::transfer.b.src_access_order } -> ThreadClusterOrderDescriptor;
};

// Concept to check if struct specifies block GEMM.
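To make the rename concrete, a minimal illustrative sketch (not taken from the PR) of a descriptor that satisfies the renamed `ThreadClusterOrderDescriptor` concept: any type whose `order` member converts to a `std::array<size_t, 3>` or `std::array<size_t, 4>` qualifies, which is what both the `thread_cluster_arrange_order` and `src_access_order` fields are now checked against. The struct name below is hypothetical, and the concept is assumed to come from the descriptor header changed above.

```cpp
#include <array>
#include <cstddef>

// Illustrative descriptor type; "order" is the only member the concept inspects.
struct ExampleArrangeOrder
{
    // Exactly std::array<std::size_t, 3>, so the first requires-clause is satisfied.
    std::array<std::size_t, 3> order{1, 0, 2};
};

static_assert(ThreadClusterOrderDescriptor<ExampleArrangeOrder>);
```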