Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
38 commits
Select commit Hold shift + click to select a range
6c56976
Add tma unittest
William-An Jul 13, 2025
0e62c4a
add regular load to TMA benchmark
William-An Oct 7, 2025
881eac2
make the regular load to have same access pattern as TMA load
William-An Oct 7, 2025
b5d152a
avoid compiler optimization
William-An Oct 7, 2025
eb50a00
move cuda memcpy to be before kernel launch
William-An Oct 8, 2025
1b064e4
add iteration count for tma ubench
William-An Oct 15, 2025
be002f7
minor formatting
William-An Oct 17, 2025
f97ea24
move tma to ubench folder
William-An Oct 17, 2025
c175bd1
make setup script work with zsh
William-An Oct 17, 2025
626df80
fix the issue that ubench all return 1 even without issue
William-An Oct 17, 2025
0fc7c54
add a sample test kernel for mbarrier PTX mapping to SASS
William-An Oct 27, 2025
012fc4f
update gitignore
William-An Nov 1, 2025
8b6ff4b
add gmma kernels for latency measurement
William-An Nov 1, 2025
12e7d58
increase iter to 1024
William-An Nov 1, 2025
0029227
add missed kernels
William-An Nov 1, 2025
3a443a0
add maxflops for gmma
William-An Nov 1, 2025
02be081
update block size
William-An Nov 1, 2025
8384fb3
update prints for MaxFlops_gmma
William-An Nov 2, 2025
e44ebf0
fix a bug
William-An Nov 2, 2025
685a52f
fix include after updating it
William-An Nov 3, 2025
c6d05da
fix for cpp and c source
William-An Nov 3, 2025
5d49dab
fix compile
William-An Nov 3, 2025
0b96343
fix for pattern matching
William-An Nov 3, 2025
a94cf54
fix compilation for mbarrier
William-An Nov 3, 2025
291df76
Fix makefile for tma app
William-An Nov 3, 2025
8cdff44
generate SASS and PTX for TMA and GMMA workloads
William-An Nov 3, 2025
0abcec8
update makefile to force PTX to be embedded in final fat bin
William-An Nov 3, 2025
911596c
change naming
William-An Nov 3, 2025
045d65d
comment out parboil as it is using python2
William-An Jan 8, 2026
60a76af
Add GPU ubench to clean target
William-An Jan 8, 2026
16c29ba
Use dynamic linking by default for GPU apps
William-An Jan 8, 2026
cb832b9
Add test binaries for GMMA instruction
William-An Jan 8, 2026
10fa195
Checkout CUTLASS during ci
William-An Jan 8, 2026
04f6478
Use type to specify gmma ubench iteration count and update test code
William-An Jan 8, 2026
f1221ef
Fix typos
William-An Jan 8, 2026
1968d1f
Update Makefiles and setup_environment to use C++17 standard
JRPan Jan 23, 2026
52711f9
missed one rename
JRPan Jan 23, 2026
122346b
Remove unused clean target and tma build steps from Makefile
JRPan Jan 23, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
The table of contents is too big for display.
Diff view
Diff view
  •  
  •  
  •  
1 change: 1 addition & 0 deletions .github/workflows/test-build.yml
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ jobs:
run: |
git config --global --add safe.directory /__w/gpu-app-collection/gpu-app-collection
git submodule update --init -- src/cuda/cuda-samples
git submodule update --init -- src/cuda/cutlass-bench
/bin/bash test-build.sh ci

- name: Print Successful Apps
Expand Down
11 changes: 10 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -15,4 +15,13 @@ src/cuda/rodinia/3.1/cuda/particlefilter/particlefilter_naive
src/cuda/rodinia/3.1/cuda/pathfinder/pathfinder
4.2
.venv/
__pycache__/
__pycache__/
compile_commands.json
.cache/
tmp/

# Ignoring files without extension (but keep Makefile and files with extensions)
src/cuda/GPU_Microbenchmark/ubench/**/*
!src/cuda/GPU_Microbenchmark/ubench/**/*/
!src/cuda/GPU_Microbenchmark/ubench/**/*.*
!src/cuda/GPU_Microbenchmark/ubench/**/Makefile
36 changes: 19 additions & 17 deletions src/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -109,8 +109,6 @@ GPU_Microbenchmark:
mkdir -p $(BINDIR)/$(BINSUBDIR)/
$(SETENV) $(MAKE) $(MAKE_ARGS) -C cuda/GPU_Microbenchmark
mv cuda/GPU_Microbenchmark/bin/* $(BINDIR)/$(BINSUBDIR)/
clean_GPU_Microbenchmark:
find cuda/GPU_Microbenchmark/ubench -type f -executable -delete


Deepbench_nvidia:
Expand Down Expand Up @@ -540,22 +538,23 @@ clean_shoc:
cd cuda/shoc-master/; $(MAKE) clean; $(MAKE) distclean

clean_parboil:
$(SETENV) cd cuda/parboil; ./parboil clean cutcp cuda
$(SETENV) cd cuda/parboil; ./parboil clean bfs cuda
$(SETENV) cd cuda/parboil; ./parboil clean histo cuda
$(SETENV) cd cuda/parboil; ./parboil clean lbm cuda
$(SETENV) cd cuda/parboil; ./parboil clean mri-gridding cuda
$(SETENV) cd cuda/parboil; ./parboil clean mri-q cuda
$(SETENV) cd cuda/parboil; ./parboil clean sad cuda
$(SETENV) cd cuda/parboil; ./parboil clean sgemm cuda
$(SETENV) cd cuda/parboil; ./parboil clean spmv cuda
$(SETENV) cd cuda/parboil; ./parboil clean stencil cuda
$(SETENV) cd cuda/parboil; ./parboil clean tpacf cuda
# Commented out as parboil uses Python2
# $(SETENV) cd cuda/parboil; ./parboil clean cutcp cuda
# $(SETENV) cd cuda/parboil; ./parboil clean bfs cuda
# $(SETENV) cd cuda/parboil; ./parboil clean histo cuda
# $(SETENV) cd cuda/parboil; ./parboil clean lbm cuda
# $(SETENV) cd cuda/parboil; ./parboil clean mri-gridding cuda
# $(SETENV) cd cuda/parboil; ./parboil clean mri-q cuda
# $(SETENV) cd cuda/parboil; ./parboil clean sad cuda
# $(SETENV) cd cuda/parboil; ./parboil clean sgemm cuda
# $(SETENV) cd cuda/parboil; ./parboil clean spmv cuda
# $(SETENV) cd cuda/parboil; ./parboil clean stencil cuda
# $(SETENV) cd cuda/parboil; ./parboil clean tpacf cuda

clean_parboil_hw_power:
$(SETENV) cd cuda/parboil; ./parboil clean mri-q cuda_k1
$(SETENV) cd cuda/parboil; ./parboil clean sad cuda_k1
$(SETENV) cd cuda/parboil; ./parboil clean sgemm cuda_k1
# $(SETENV) cd cuda/parboil; ./parboil clean mri-q cuda_k1
# $(SETENV) cd cuda/parboil; ./parboil clean sad cuda_k1
# $(SETENV) cd cuda/parboil; ./parboil clean sgemm cuda_k1

clean_lonestargpu-2.0:
$(setenv) $(MAKE) $(make_args) noinline=$(noinline) -C cuda/lonestargpu-2.0 clean
Expand Down Expand Up @@ -698,4 +697,7 @@ clean_cuda_samples:
$(MAKE) clean -C ./cuda/cuda-samples/build

clean_huggingface:
rm -rf $(BINDIR)/$(BINSUBDIR)/huggingface
rm -rf $(BINDIR)/$(BINSUBDIR)/huggingface

clean_GPU_Microbenchmark:
$(MAKE) clean -C ./cuda/GPU_Microbenchmark
2 changes: 2 additions & 0 deletions src/cuda/GPU_Microbenchmark/.gitignore
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
bin/
*.o
*.out
*.a
*.ptx
38 changes: 32 additions & 6 deletions src/cuda/GPU_Microbenchmark/common/common.mk
Original file line number Diff line number Diff line change
Expand Up @@ -9,16 +9,42 @@ CC := nvcc

LIB :=

release:
$(CC) $(NVCC_FLAGS) $(CUOPTS) $(SRC) -o $(EXE) -I$(INCLUDE) -L$(LIB) -lcudart
# Generate object file list from SRC (for parallel compilation)
CUDA_SRC_FILES := $(filter %.cu, $(SRC))
CPP_SRC_FILES := $(filter %.cpp, $(SRC))
C_SRC_FILES := $(filter %.c, $(SRC))

# To preserve PTX in multi-step compilation, we have to compile the CUDA source files to .a files
CUDA_LIB_FILES := $(CUDA_SRC_FILES:.cu=.a)

# Host side source files
CPP_OBJECT_FILES := $(CPP_SRC_FILES:.cpp=.o)
C_OBJECT_FILES := $(C_SRC_FILES:.c=.o)
OBJECT_FILES := $(CPP_OBJECT_FILES) $(C_OBJECT_FILES)

# If multiple source files are provided, compile them separately and link
# To preserve PTX in final binary: First create static library, then link to executable
# This avoids nvlink stripping PTX during device linking
release: $(CUDA_LIB_FILES) $(OBJECT_FILES)
$(CC) $(NVCC_FLAGS) $^ -o $(EXE) -L$(LIB) -lcudart --cudart shared
mv $(EXE) $(BIN_DIR)

# Pattern rule: compile each individual .cu file into a static library (.a),
# NOT a plain object file — per the note above, linking archives instead of
# device-linked objects keeps the embedded PTX from being stripped by nvlink.
%.a: %.cu
$(CC) $(NVCC_FLAGS) $(INCLUDE) $(CUOPTS) --lib $< -o $@

%.o: %.cpp
$(CC) $(NVCC_FLAGS) $(INCLUDE) $(CUOPTS) -dc $< -o $@

%.o: %.c
$(CC) $(NVCC_FLAGS) $(INCLUDE) $(CUOPTS) -dc $< -o $@

tuner:
$(CC) $(NVCC_FLAGS) $(CUOPTS) -DTUNER $(SRC) -o $(EXE) -I$(INCLUDE) -L$(LIB) -lcudart
$(CC) $(NVCC_FLAGS) $(CUOPTS) -DTUNER $(SRC) -o $(EXE) $(INCLUDE) -L$(LIB) -lcudart --cudart shared
mv $(EXE) $(BIN_DIR)

# Remove all build artifacts (objects, static libs, PTX/SASS dumps, binary).
# NOTE(review): the recipe previously referenced $(OBJECTS), which is never
# defined in this file — the host-side object list defined above is
# $(OBJECT_FILES) — so .o files built from sources in subdirectories were
# left behind (the *.o glob only covers the current directory).
# $(LIB_FILE) is retained in case it is defined in the unseen top of this
# file — TODO confirm.
clean:
	rm -f *.o *.a *.ptx *.sass $(OBJECT_FILES) $(CUDA_LIB_FILES)
	rm -f $(EXE) $(LIB_FILE)

run:
./$(EXE)
Expand All @@ -36,7 +62,7 @@ nvsight:
nv-nsight-cu-cli --metrics gpc__cycles_elapsed.avg,sm__cycles_elapsed.sum,smsp__inst_executed.sum,sm__warps_active.avg.pct_of_peak_sustained_active,l1tex__t_sectors_pipe_lsu_mem_global_op_ld_lookup_hit.sum,l1tex__t_sectors_pipe_lsu_mem_global_op_ld.sum,l1tex__t_sectors_pipe_lsu_mem_global_op_st_lookup_hit.sum,l1tex__t_sectors_pipe_lsu_mem_global_op_st.sum,lts__t_sectors_srcunit_tex_op_read.sum,lts__t_sectors_srcunit_tex_op_write.sum,lts__t_sectors_srcunit_tex_op_read_lookup_hit.sum,lts__t_sectors_srcunit_tex_op_write_lookup_hit.sum,lts__t_sector_op_read_hit_rate.pct,lts__t_sector_op_write_hit_rate.pct,lts__t_sectors_srcunit_tex_op_read.sum.per_second,dram__sectors_read.sum,dram__sectors_write.sum,dram__bytes_read.sum --csv --page raw ./$(EXE) | tee nsight.csv

# Dump the PTX embedded in the built executable for inspection.
# NOTE(review): the earlier variant (`cuobjdump -ptx ./$(EXE) tee ptx.txt`)
# was missing the pipe, passing "tee" as an argument to cuobjdump; the
# output is now piped to tee and named after the executable.
ptx:
	cuobjdump -ptx ./$(EXE) | tee $(EXE).ptx

# Dump the SASS (device machine code) from the built executable.
sass:
	cuobjdump -sass ./$(EXE) | tee $(EXE).sass
10 changes: 5 additions & 5 deletions src/cuda/GPU_Microbenchmark/hw_def/common/common.h
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ enum dram_model { GDDR5 = 1, GDDR5X = 2, GDDR6 = 3, HBM = 4 };

// source:
// https://stackoverflow.com/questions/466204/rounding-up-to-next-power-of-2
unsigned round_up_2n(unsigned v) {
inline unsigned round_up_2n(unsigned v) {
v--;
v |= v >> 1;
v |= v >> 2;
Expand All @@ -34,9 +34,9 @@ unsigned round_up_2n(unsigned v) {
return v;
}

unsigned round_up_2n(float n) { return round_up_2n((unsigned)ceil(n)); }
inline unsigned round_up_2n(float n) { return round_up_2n((unsigned)ceil(n)); }

bool isPowerOfTwo(int n) {
inline bool isPowerOfTwo(int n) {
if (n == 0)
return false;

Expand All @@ -51,12 +51,12 @@ static const unsigned dram_model_burst_length[] = {0, 8, 8, 16, 2};
static const unsigned dram_model_freq_ratio[] = {0, 4, 4, 4, 2};
// atom size =
// dram_model_channel_width*dram_model_mem_per_ctrlr*dram_model_burst_length
unsigned get_atom_size_inByte(enum dram_model model) {
inline unsigned get_atom_size_inByte(enum dram_model model) {
return (dram_model_bus_width[model] / 8) * dram_model_mem_per_ctrlr[model] *
dram_model_burst_length[model];
}
// CCD = dram_model_burst_length/dram_model_freq_ratio
unsigned get_adjusted_CCD(enum dram_model model) {
inline unsigned get_adjusted_CCD(enum dram_model model) {
assert(dram_model_burst_length[model] % dram_model_freq_ratio[model] == 0);
return dram_model_burst_length[model] / dram_model_freq_ratio[model];
}
Expand Down
6 changes: 3 additions & 3 deletions src/cuda/GPU_Microbenchmark/hw_def/common/gpuConfig.h
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ struct GpuConfig
unsigned FBP_COUNT = 0; // Frame Buffer Partitions
unsigned L2_BANKS = 0; // L2 Cache Banks (LTCs)
};
GpuConfig config;
inline GpuConfig config;
// Parses short flags like --sm 80 into a GpuConfig object
inline void parseGpuConfigArgs(int argc, char *argv[])
{
Expand Down Expand Up @@ -158,7 +158,7 @@ inline void gpuAssert(cudaError_t code, const char *file, int line,
}
}

cudaDeviceProp deviceProp;
inline cudaDeviceProp deviceProp;

// NVIDIA RM API defines
#define NV_IOCTL_MAGIC 'F'
Expand Down Expand Up @@ -237,7 +237,7 @@ inline unsigned queryGrInfo(uint32_t info_index)
return result;
}

unsigned intilizeDeviceProp(unsigned deviceID, int argc, char *argv[])
inline unsigned initializeDeviceProp(unsigned deviceID, int argc, char *argv[])
{
// Check if running in GPGPU-Sim by looking for gpgpusim.config
std::ifstream configFile("gpgpusim.config");
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@ __global__ void atomic_bw(uint64_t *startClk, uint64_t *stopClk, T *data1,
int main(int argc, char *argv[])
{

intilizeDeviceProp(0, argc, argv);
initializeDeviceProp(0, argc, argv);

// Parse command line arguments for --fast flag
uint32_t repeat_times = 2048; // default
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ __global__ void atomic_bw(uint32_t *startClk, uint32_t *stopClk, T *data1,
int main(int argc, char *argv[])
{

intilizeDeviceProp(0, argc, argv);
initializeDeviceProp(0, argc, argv);
config.BLOCKS_NUM = config.SM_NUMBER * 2;
config.TOTAL_THREADS = config.THREADS_PER_BLOCK * config.BLOCKS_NUM;

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ __global__ void atmoic_latency(uint32_t *startClk, uint32_t *stopClk, T *data1,
int main(int argc, char *argv[])
{

intilizeDeviceProp(0, argc, argv);
initializeDeviceProp(0, argc, argv);

config.THREADS_PER_BLOCK = 1;
config.THREADS_PER_SM = 1;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
int main(int argc, char *argv[])
{

intilizeDeviceProp(0, argc, argv);
initializeDeviceProp(0, argc, argv);

dpu_max_flops();

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
int main(int argc, char *argv[])
{

intilizeDeviceProp(0, argc, argv);
initializeDeviceProp(0, argc, argv);

fpu_max_flops();

Expand Down
30 changes: 30 additions & 0 deletions src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
# Source files split for parallel compilation
# Use wildcard to automatically include all size-specific breakdown files
SRC = MaxFlops_gmma.cu $(wildcard kernels/MaxFlops_gmma_*.cu)

EXE = MaxFlops_gmma

# Add include path for CUTLASS
INCLUDE += -I$(GPUAPPS_ROOT)/src/cuda/cutlass-bench/include -I./

# GMMA is only supported in sm_90a
ARCH?=sm_90a
# Unset the CUDA_CPPFLAGS which is set based on CUDA version
CUDA_CPPFLAGS=
# Generate code for both sm_XXX and compute_XXX (SASS and PTX)
HOPPER_CUDA_CPPFLAGS=$(foreach arch,$(ARCH),-gencode=arch=compute_$(subst sm_,,$(arch)),code=$(arch) -gencode=arch=compute_$(subst sm_,,$(arch)),code=compute_$(subst sm_,,$(arch)))

# CUTLASS cute library requires C++17
NVCC_FLAGS := $(HOPPER_CUDA_CPPFLAGS) -std=c++17

include ../../../common/common.mk

# A test executable for checking the library and simulator debugging
TEST_SRC = MaxFlops_gmma_test.cu
TEST_EXE = MaxFlops_gmma_test

# `test` never creates a file named "test"; declare it phony so a stray
# file of that name cannot silently mask the rule.
.PHONY: test
test: $(TEST_SRC)
	$(CC) $(NVCC_FLAGS) $^ -o $(TEST_EXE) $(INCLUDE) -lcudart --cudart shared
	cp $(TEST_EXE) $(BIN_DIR)

# Append the test build to `release` via a hidden extra prerequisite.
# NOTE(review): .EXTRA_PREREQS requires GNU Make >= 4.3; on older make this
# assignment is inert and `test` is silently skipped — confirm the CI
# toolchain's make version.
release: .EXTRA_PREREQS = test
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
#include <cuda.h>
#include "MaxFlops_gmma.h"
#include "../../../hw_def/hw_def.h"

// Standalone test driver for the GMMA max-FLOPS kernels (used for library
// checks and simulator debugging — see the `test` target in the Makefile).
int main(int argc, char *argv[])
{
// Initialize properties/config for device 0; argc/argv are forwarded,
// presumably so GPU-config flags can be parsed by hw_def — TODO confirm.
initializeDeviceProp(0, argc, argv);

// Run comprehensive sweep over all valid MMA operations
run_all_wgmma_maxflops_tests();

return 0;
}
Loading
Loading