Skip to content

Commit f20dbba

Browse files
Profile with ncu instead of nsys
1 parent b656547 commit f20dbba

File tree

2 files changed

+45
-43
lines changed

2 files changed

+45
-43
lines changed

.github/scripts/profiler_nvidia.py

Lines changed: 13 additions & 17 deletions
Original file line number | Diff line number | Diff line change
@@ -8,29 +8,25 @@
88
parser.add_argument('-o', '--output', required=True, help='Output CSV file')
99
args = parser.parse_args()
1010

11-
ntsi_list = []
11+
kernel_dict = {}
1212
with open(args.input) as csv_file:
1313
csv_reader = csv.reader(csv_file)
1414
next(csv_reader)
15-
next(csv_reader)
16-
next(csv_reader)
1715
for row in csv_reader:
18-
if row:
19-
full_name = row[8]
20-
instances = int(row[2])
21-
time = float(row[1])
22-
sigma = float(row[7])
23-
if len(full_name) > 5 and full_name[:5] == "krnl_":
24-
name = full_name[5:]
25-
ntsi_list.append([name, time, sigma, instances])
26-
27-
ntsi_list.sort(key = lambda row: row[0])
16+
full_name = row[4]
17+
time = int(row[14]) / 1000.0
18+
if len(full_name) > 5 and full_name[:5] == "krnl_":
19+
name = full_name[5:]
20+
if name in kernel_dict.keys():
21+
kernel_dict[name].append(time)
22+
else:
23+
kernel_dict[name] = [time]
2824

2925
data = [["name", "time", "stdev"]]
30-
for name, time, sigma, instances in ntsi_list:
31-
count = instances / args.runs
32-
mean = int(time * count)
33-
stdev = sigma * count
26+
for name, time_list in kernel_dict.items():
27+
count = len(time_list) // args.runs
28+
mean = statistics.mean(time_list) * count
29+
stdev = 0 if args.runs == 1 else statistics.stdev(time_list) * count
3430
data.append([name, mean, stdev])
3531

3632
with open(args.output, 'w') as csv_file:

.github/workflows/standalone-benchmark.yml

Lines changed: 32 additions & 26 deletions
Original file line number | Diff line number | Diff line change
@@ -20,33 +20,27 @@ jobs:
2020
vendor: nvidia
2121
runner: cern-nextgen-h100
2222
cmake_args: -DENABLE_CUDA=1 -DENABLE_HIP=0 -DCUDA_COMPUTETARGET=90
23-
profiler: nsys profile -o nvidia-h100
24-
profiler_post: nsys stats --report cuda_gpu_kern_sum --timeunit us --force-export=true --format csv nvidia-h100.nsys-rep >
2523
- name: nvidia-l40s
2624
vendor: nvidia
2725
runner: cern-nextgen-l40s
2826
cmake_args: -DENABLE_CUDA=1 -DENABLE_HIP=0 -DCUDA_COMPUTETARGET=89
29-
profiler: nsys profile -o nvidia-l40s
30-
profiler_post: nsys stats --report cuda_gpu_kern_sum --force-export=true --format csv nvidia-l40s.nsys-rep >
3127
- name: amd-mi300x
3228
vendor: amd
3329
runner: cern-nextgen-mi300x
3430
cmake_args: -DENABLE_CUDA=0 -DENABLE_HIP=1 -DHIP_AMDGPUTARGET=gfx942
35-
profiler: rocprofv2 --basenames --output-directory /root --output-file-name amd-mi300x
36-
profiler_post: touch
3731
- name: amd-w7900
3832
vendor: amd
3933
runner: cern-nextgen-w7900
4034
cmake_args: -DENABLE_CUDA=0 -DENABLE_HIP=1 -DHIP_AMDGPUTARGET=gfx1100
41-
profiler: rocprofv2 --basenames --output-directory /root --output-file-name amd-w7900
42-
profiler_post: touch
35+
4336
env:
4437
WORK_DIR: /cvmfs/alice.cern.ch
4538
ALIBUILD_ARCH_PREFIX: el9-x86_64/Packages
4639
MODULEPATH: /cvmfs/alice.cern.ch/etc/toolchain/modulefiles/el9-x86_64:/cvmfs/alice.cern.ch/el9-x86_64/Modules/modulefiles
4740
STANDALONE_DIR: /root/standalone
4841
BENCHMARK_CSV: ${{ matrix.name }}.csv
4942
PROFILER_CSV: results_${{ matrix.name }}.csv
43+
TIMING_CA: ./ca -e 50kHz -g --seed 0 --memSize 15000000000 --sync --debug 1 # Add --PROCdebugMarkdown 1 --runs 42 --runsInit 2 --PROCresetTimers 1 for benchmark runs
5044
LD_LIBRARY_PATH: /usr/local/cuda-13.0/compat
5145

5246
name: ${{ matrix.name }}
@@ -57,19 +51,9 @@ jobs:
5751
- name: Download Files
5852
run: |
5953
mkdir -p ${STANDALONE_DIR}
60-
61-
if [[ "${{ matrix.vendor }}" == "nvidia" ]]; then
62-
curl -fL --retry 3 -o ${STANDALONE_DIR}/nsys.rpm https://developer.nvidia.com/downloads/assets/tools/secure/nsight-systems/2026_2/NsightSystems-linux-cli-public-2026.2.1.210-3763964.rpm
63-
dnf install -y ${STANDALONE_DIR}/nsys.rpm
64-
rm -f ${STANDALONE_DIR}/nsys.rpm
65-
fi
6654
6755
curl -fL --retry 3 -o ${STANDALONE_DIR}/o2-simple-GPU.out https://cernbox.cern.ch/remote.php/dav/public-files/SfYXgQOHFga2w75/o2-simple-GPU.out
6856
69-
mkdir -p ${STANDALONE_DIR}/baseline
70-
curl -fL --retry 3 -o ${STANDALONE_DIR}/baseline/${PROFILER_CSV} https://cernbox.cern.ch/remote.php/dav/public-files/SfYXgQOHFga2w75/baseline/${PROFILER_CSV}
71-
curl -fL --retry 3 -o ${STANDALONE_DIR}/baseline/${BENCHMARK_CSV} https://cernbox.cern.ch/remote.php/dav/public-files/SfYXgQOHFga2w75/baseline/${BENCHMARK_CSV}
72-
7357
mkdir -p ${STANDALONE_DIR}/events
7458
curl -fL --retry 3 -o ${STANDALONE_DIR}/events/o2-simple.tar.xz https://cernbox.cern.ch/remote.php/dav/public-files/SfYXgQOHFga2w75/events/o2-simple.tar.xz
7559
tar -xf ${STANDALONE_DIR}/events/o2-simple.tar.xz -C ${STANDALONE_DIR}/events
@@ -107,23 +91,45 @@ jobs:
10791
source /etc/profile.d/modules.sh
10892
module load ninja/fortran-v1.11.1.g9-15 Vc/1.4.5-10 boost/v1.83.0-alice2-57 fmt/11.1.2-14 CMake/v3.31.6-10 ms_gsl/4.2.1-3 Clang/v20.1.7-9 TBB/v2022.3.0-3 ROOT/v6-36-04-alice9-15 ONNXRuntime/v1.22.0-71 GLFW/3.3.2-25
10993
cd ${STANDALONE_DIR}
110-
${{ matrix.profiler }} ${STANDALONE_DIR}/ca -e 50kHz -g --memSize 15000000000 --sync --debug 1 --runs 42 --runsInit 2 --PROCdebugMarkdown 1 --PROCresetTimers 1 --PROCdebugCSV /root/${BENCHMARK_CSV}
111-
${{ matrix.profiler_post }} /root/${PROFILER_CSV}
94+
${TIMING_CA} --debug 1 --runs 42 --runsInit 2 --PROCdebugMarkdown 1 --PROCresetTimers 1 --PROCdebugCSV /root/${BENCHMARK_CSV}
95+
python3 ${GITHUB_WORKSPACE}/.github/scripts/merge_runs.py --discard 2 --input /root/${BENCHMARK_CSV} --output /root/${BENCHMARK_CSV}
96+
97+
- name: Profiler - Nsight Compute
98+
if: ${{ matrix.vendor == 'nvidia' }}
99+
run: |
100+
dnf install -y cuda-nsight-compute-13-1
101+
source /etc/profile.d/modules.sh
102+
module load ninja/fortran-v1.11.1.g9-15 Vc/1.4.5-10 boost/v1.83.0-alice2-57 fmt/11.1.2-14 CMake/v3.31.6-10 ms_gsl/4.2.1-3 Clang/v20.1.7-9 TBB/v2022.3.0-3 ROOT/v6-36-04-alice9-15 ONNXRuntime/v1.22.0-71 GLFW/3.3.2-25
103+
cd ${STANDALONE_DIR}
104+
ncu --set none --metrics gpu__time_duration.avg --export ${{ matrix.name }} --clock-control none --force-overwrite ${TIMING_CA} --runs 42 --debug 1 --PROCdebugMarkdown 1 # Generates ${{ matrix.name }}.ncu-rep
105+
ncu --import ${STANDALONE_DIR}/${{ matrix.name }}.ncu-rep --print-units base --csv > /root/${PROFILER_CSV}
112106
rm -rf ${STANDALONE_DIR}/events/50kHz ${STANDALONE_DIR}/build
107+
python3 ${GITHUB_WORKSPACE}/.github/scripts/profiler_${{ matrix.vendor }}.py --runs 42 --input /root/${PROFILER_CSV} --output /root/${PROFILER_CSV}
113108
114-
- name: Display table on GitHub web
109+
- name: Profiler - rocprofv2
110+
if: ${{ matrix.vendor == 'amd' }}
115111
run: |
116112
source /etc/profile.d/modules.sh
117113
module load ninja/fortran-v1.11.1.g9-15 Vc/1.4.5-10 boost/v1.83.0-alice2-57 fmt/11.1.2-14 CMake/v3.31.6-10 ms_gsl/4.2.1-3 Clang/v20.1.7-9 TBB/v2022.3.0-3 ROOT/v6-36-04-alice9-15 ONNXRuntime/v1.22.0-71 GLFW/3.3.2-25
114+
cd ${STANDALONE_DIR}
115+
rocprofv2 --output-directory /root --output-file-name ${{ matrix.name }} ${TIMING_CA} --runs 42 --debug 1 --PROCdebugMarkdown 1 # Generates results_${{ matrix.name }}.csv == ${PROFILER_CSV}
116+
rm -rf ${STANDALONE_DIR}/events/50kHz ${STANDALONE_DIR}/build
118117
python3 ${GITHUB_WORKSPACE}/.github/scripts/profiler_${{ matrix.vendor }}.py --runs 42 --input /root/${PROFILER_CSV} --output /root/${PROFILER_CSV}
119-
python3 ${GITHUB_WORKSPACE}/.github/scripts/merge_runs.py --discard 2 --input /root/${BENCHMARK_CSV} --output /root/${BENCHMARK_CSV}
120-
python3 ${GITHUB_WORKSPACE}/.github/scripts/csv_to_md.py --baseline ${STANDALONE_DIR}/baseline/${PROFILER_CSV} --current /root/${PROFILER_CSV} >> ${GITHUB_STEP_SUMMARY}
121-
echo -e "\n\n" >> ${GITHUB_STEP_SUMMARY}
122-
python3 ${GITHUB_WORKSPACE}/.github/scripts/csv_to_md.py --baseline ${STANDALONE_DIR}/baseline/${BENCHMARK_CSV} --current /root/${BENCHMARK_CSV} >> ${GITHUB_STEP_SUMMARY}
123-
rm -rf ${STANDALONE_DIR}/baseline
124118
125119
- name: Upload Artifact
126120
uses: actions/upload-artifact@v6
127121
with:
128122
name: ${{ matrix.name }}-artifact
129123
path: "/root/*.csv"
124+
125+
- name: Display table on GitHub web
126+
run: |
127+
mkdir -p ${STANDALONE_DIR}/baseline
128+
curl -fL --retry 3 -o ${STANDALONE_DIR}/baseline/${PROFILER_CSV} https://cernbox.cern.ch/remote.php/dav/public-files/SfYXgQOHFga2w75/baseline/${PROFILER_CSV}
129+
curl -fL --retry 3 -o ${STANDALONE_DIR}/baseline/${BENCHMARK_CSV} https://cernbox.cern.ch/remote.php/dav/public-files/SfYXgQOHFga2w75/baseline/${BENCHMARK_CSV}
130+
#source /etc/profile.d/modules.sh
131+
#module load ninja/fortran-v1.11.1.g9-15 Vc/1.4.5-10 boost/v1.83.0-alice2-57 fmt/11.1.2-14 CMake/v3.31.6-10 ms_gsl/4.2.1-3 Clang/v20.1.7-9 TBB/v2022.3.0-3 ROOT/v6-36-04-alice9-15 ONNXRuntime/v1.22.0-71 GLFW/3.3.2-25
132+
python3 ${GITHUB_WORKSPACE}/.github/scripts/csv_to_md.py --baseline ${STANDALONE_DIR}/baseline/${PROFILER_CSV} --current /root/${PROFILER_CSV} >> ${GITHUB_STEP_SUMMARY}
133+
echo -e "\n\n" >> ${GITHUB_STEP_SUMMARY}
134+
python3 ${GITHUB_WORKSPACE}/.github/scripts/csv_to_md.py --baseline ${STANDALONE_DIR}/baseline/${BENCHMARK_CSV} --current /root/${BENCHMARK_CSV} >> ${GITHUB_STEP_SUMMARY}
135+
rm -rf ${STANDALONE_DIR}/baseline

0 commit comments

Comments
 (0)