Skip to content

Commit 9574083

Browse files
Merge branch 'ngt-ci-pipeline' into template
2 parents 123dc33 + cb85ca6 commit 9574083

File tree

6 files changed

+134
-50
lines changed

6 files changed

+134
-50
lines changed

.github/scripts/csv_to_md.py

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,39 @@
import argparse
import csv

import tabulate as tab

arg_parser = argparse.ArgumentParser()
arg_parser.add_argument('-b', '--baseline', required=True, help='Baseline CSV file')
arg_parser.add_argument('-c', '--current', required=True, help='Current CSV file')
cli_args = arg_parser.parse_args()


def get_2d_list(csv_filename):
    """Read a benchmark CSV (header dropped) into [name, mean, stdev] rows."""
    with open(csv_filename) as csv_file:
        rows = csv.reader(csv_file)
        next(rows)  # skip the header line
        return [[str(name), float(mean), float(stdev)] for name, mean, stdev in rows]


table_baseline = get_2d_list(cli_args.baseline)
table_current = get_2d_list(cli_args.current)


def get_emoji(d, stdev):
    """Map a timing difference to a traffic-light emoji at 95% confidence.

    NOTE(review): d = baseline - current, so d significantly negative
    (current run slower) maps to green and d significantly positive
    (current run faster) maps to red -- confirm this color convention
    is intended.
    """
    z = 1.96  # 95% confidence interval
    if d < -z * stdev:
        return ':green_circle:'
    if d > z * stdev:
        return ':red_circle:'
    return ':white_circle:'


# Build one markdown row per benchmark; rows are matched positionally,
# the assert guards against the two CSVs diverging in order/content.
table = []
for (baseline_name, baseline_mean, _), (name, mean, stdev) in zip(table_baseline, table_current):
    assert baseline_name == name
    diff = baseline_mean - mean
    impact = diff / stdev if stdev != 0.0 else 0.0
    table.append([name, int(mean), f'{stdev:.2f}', int(diff), f'{impact:.2f}', get_emoji(diff, stdev)])

header = ['name', 'mean (ms)', 'stdev \u03C3', 'diff \u0394', '\u0394 / \u03C3', '']
print(tab.tabulate(table, header, tablefmt="github"))

.github/scripts/merge_runs.py

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
import argparse
import csv
import statistics
from collections import defaultdict

parser = argparse.ArgumentParser()
parser.add_argument('-d', '--discard', type=int, default=0, help='Number of initial measurements to discard')
parser.add_argument('-i', '--input', required=True, help='Input CSV file')
parser.add_argument('-o', '--output', required=True, help='Output CSV file')
args = parser.parse_args()

# Collect all timing samples per task name; the input CSV holds one row
# per run, so repeated names accumulate into a sample list.
time_dict = defaultdict(list)
with open(args.input) as csv_file:
    csv_reader = csv.reader(csv_file)
    next(csv_reader)  # skip the header row
    for name, time, _, _ in csv_reader:
        time_dict[name].append(float(time))

# Reduce each sample list to mean/stdev, dropping warm-up runs first.
data = [["name", "time", "stdev"]]
for name, time_list in time_dict.items():
    samples = time_list[args.discard:]
    mean = statistics.mean(samples)
    # statistics.stdev() raises StatisticsError on fewer than two points
    # (possible when --discard leaves a single run); report zero spread.
    stdev = statistics.stdev(samples) if len(samples) > 1 else 0.0
    data.append([name, mean, stdev])

# newline='' as recommended by the csv module docs, otherwise the writer
# emits blank lines between rows on Windows.
with open(args.output, 'w', newline='') as csv_file:
    csv.writer(csv_file).writerows(data)

.github/workflows/standalone-benchmark.yml

Lines changed: 41 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -9,14 +9,6 @@ on:
99

1010
jobs:
1111
benchmark:
12-
env:
13-
ALIBUILD_ARCH_PREFIX: el9-x86_64/Packages
14-
ARTIFACT_FILE: /root/artifact.txt
15-
LD_LIBRARY_PATH: /usr/local/cuda-13.0/compat
16-
MODULEPATH: /cvmfs/alice.cern.ch/etc/toolchain/modulefiles/el9-x86_64:/cvmfs/alice.cern.ch/el9-x86_64/Modules/modulefiles
17-
STANDALONE_DIR: /root/standalone
18-
WORK_DIR: /cvmfs/alice.cern.ch
19-
2012
runs-on: ${{ matrix.runner }}
2113
container: registry.cern.ch/alisw/slc9-gpu-builder@sha256:ea3443f9dfbc770e4b4bce0d1a9ecc0b7a7c16e9f76e416b796d170877220820
2214
strategy:
@@ -36,11 +28,34 @@ jobs:
3628
- name: amd-w7900
3729
runner: cern-nextgen-w7900
3830
cmake_args: -DENABLE_CUDA=0 -DENABLE_HIP=1 -DHIP_AMDGPUTARGET=gfx1100
31+
env:
32+
WORK_DIR: /cvmfs/alice.cern.ch
33+
ALIBUILD_ARCH_PREFIX: el9-x86_64/Packages
34+
MODULEPATH: /cvmfs/alice.cern.ch/etc/toolchain/modulefiles/el9-x86_64:/cvmfs/alice.cern.ch/el9-x86_64/Modules/modulefiles
35+
STANDALONE_DIR: /root/standalone
36+
BENCHMARK_CSV: /root/${{ matrix.name }}.csv
37+
LD_LIBRARY_PATH: /usr/local/cuda-13.0/compat
3938

4039
name: ${{ matrix.name }}
4140
steps:
4241
- name: Checkout Repository
43-
uses: actions/checkout@v4
42+
uses: actions/checkout@v6
43+
44+
- name: Download Files
45+
run: |
46+
mkdir -p ${STANDALONE_DIR}
47+
48+
curl -fL --retry 3 -o ${STANDALONE_DIR}/o2-simple-GPU.out https://cernbox.cern.ch/remote.php/dav/public-files/SfYXgQOHFga2w75/o2-simple-GPU.out
49+
50+
mkdir -p ${STANDALONE_DIR}/baseline
51+
curl -fL --retry 3 -o ${STANDALONE_DIR}/baseline/${{ matrix.name }}.csv https://cernbox.cern.ch/remote.php/dav/public-files/SfYXgQOHFga2w75/baseline/${{ matrix.name }}.csv
52+
53+
mkdir -p ${STANDALONE_DIR}/events
54+
curl -fL --retry 3 -o ${STANDALONE_DIR}/events/o2-simple.tar.xz https://cernbox.cern.ch/remote.php/dav/public-files/SfYXgQOHFga2w75/events/o2-simple.tar.xz
55+
tar -xf ${STANDALONE_DIR}/events/o2-simple.tar.xz -C ${STANDALONE_DIR}/events
56+
57+
curl -fL --retry 3 -o ${STANDALONE_DIR}/events/50kHz.tar.xz https://cernbox.cern.ch/remote.php/dav/public-files/SfYXgQOHFga2w75/events/50kHz.tar.xz
58+
tar -xf ${STANDALONE_DIR}/events/50kHz.tar.xz -C ${STANDALONE_DIR}/events
4459
4560
- name: Build Deterministic
4661
run: &build |
@@ -53,50 +68,38 @@ jobs:
5368
env:
5469
DETERMINISTIC_MODE: GPU
5570

56-
- name: Download Small Event File
57-
run: &download |
58-
mkdir -p ${STANDALONE_DIR}/events
59-
curl -o ${STANDALONE_DIR}/events/${EVENT_FILE}.tar.xz https://cernbox.cern.ch/remote.php/dav/public-files/SfYXgQOHFga2w75/events/${EVENT_FILE}.tar.xz
60-
tar -xf ${STANDALONE_DIR}/events/${EVENT_FILE}.tar.xz -C ${STANDALONE_DIR}/events
61-
ls -la ${STANDALONE_DIR}/events/o2-simple
62-
env:
63-
EVENT_FILE: o2-simple
64-
65-
- name: Test
71+
- name: Test GPU Track Reconstruction
6672
run: |
6773
source /etc/profile.d/modules.sh
6874
module load ninja/fortran-v1.11.1.g9-15 Vc/1.4.5-10 boost/v1.83.0-alice2-57 fmt/11.1.2-14 CMake/v3.31.6-10 ms_gsl/4.2.1-3 Clang/v20.1.7-9 TBB/v2022.3.0-3 ROOT/v6-36-04-alice9-15 ONNXRuntime/v1.22.0-71 GLFW/3.3.2-25
69-
70-
ls -la ${STANDALONE_DIR}/events/o2-simple
71-
72-
${STANDALONE_DIR}/ca -e o2-simple -g --seed 0 --memSize 20000000000 --sync --runs 1 --RTCenable --PROCdeterministicGPUReconstruction 1 --RTCoptSpecialCode 1 --RTCoptConstexpr 1 --debug 6
73-
74-
curl -v -o ${STANDALONE_DIR}/o2-simple-GPU.out https://cernbox.cern.ch/remote.php/dav/public-files/SfYXgQOHFga2w75/o2-simple-GPU.out
75+
cd ${STANDALONE_DIR}
76+
${STANDALONE_DIR}/ca -e o2-simple -g --seed 0 --memSize 20000000000 --sync --runs 1 --RTCenable --PROCdeterministicGPUReconstruction 1 --RTCoptConstexpr 1 --RTCoptSpecialCode 1 --debug 6
7577
cmp ${STANDALONE_DIR}/GPU.out ${STANDALONE_DIR}/o2-simple-GPU.out
76-
rm -rf ${STANDALONE_DIR}/GPU.out ${STANDALONE_DIR}/o2-simple-GPU.out ${STANDALONE_DIR}/build
77-
env:
78-
EVENT_FILE: o2-simple
78+
rm -rf ${STANDALONE_DIR}/GPU.out ${STANDALONE_DIR}/o2-simple-GPU.out ${STANDALONE_DIR}/events/o2-simple ${STANDALONE_DIR}/build
7979
8080
- name: Build Non-Deterministic
8181
run: *build
8282
env:
8383
DETERMINISTIC_MODE: OFF
8484

85-
- name: Download Large Event File
86-
run: *download
87-
env:
88-
EVENT_FILE: 50kHz
89-
90-
- name: Benchmark
85+
- name: Benchmark GPU Track Reconstruction
9186
run: |
9287
source /etc/profile.d/modules.sh
9388
module load ninja/fortran-v1.11.1.g9-15 Vc/1.4.5-10 boost/v1.83.0-alice2-57 fmt/11.1.2-14 CMake/v3.31.6-10 ms_gsl/4.2.1-3 Clang/v20.1.7-9 TBB/v2022.3.0-3 ROOT/v6-36-04-alice9-15 ONNXRuntime/v1.22.0-71 GLFW/3.3.2-25
89+
cd ${STANDALONE_DIR}
90+
${STANDALONE_DIR}/ca -e 50kHz -g --memSize 15000000000 --sync --runs 12 --debug 1 --PROCtimingCSV ${BENCHMARK_CSV}
91+
rm -rf ${STANDALONE_DIR}/events/50kHz ${STANDALONE_DIR}/build
9492
95-
${STANDALONE_DIR}/ca -e 50kHz -g --memSize 15000000000 --sync --runs 1 --RTCenable --RTCoptSpecialCode 1 --RTCoptConstexpr 1 --debug 1 > ${ARTIFACT_FILE}
96-
rm -rf ${STANDALONE_DIR}/events ${STANDALONE_DIR}/build
93+
- name: Display table on GitHub web
94+
run: |
95+
source /etc/profile.d/modules.sh
96+
module load ninja/fortran-v1.11.1.g9-15 Vc/1.4.5-10 boost/v1.83.0-alice2-57 fmt/11.1.2-14 CMake/v3.31.6-10 ms_gsl/4.2.1-3 Clang/v20.1.7-9 TBB/v2022.3.0-3 ROOT/v6-36-04-alice9-15 ONNXRuntime/v1.22.0-71 GLFW/3.3.2-25
97+
python3 ${GITHUB_WORKSPACE}/.github/scripts/merge_runs.py --discard 2 --input ${BENCHMARK_CSV} --output ${BENCHMARK_CSV}
98+
python3 ${GITHUB_WORKSPACE}/.github/scripts/csv_to_md.py --baseline ${STANDALONE_DIR}/baseline/${{ matrix.name }}.csv --current ${BENCHMARK_CSV} >> ${GITHUB_STEP_SUMMARY}
99+
rm -rf ${STANDALONE_DIR}/baseline
97100
98101
- name: Upload Artifact
99-
uses: actions/upload-artifact@v4
102+
uses: actions/upload-artifact@v6
100103
with:
101104
name: ${{ matrix.name }}-artifact
102-
path: /root/artifact.txt
105+
path: /root/${{ matrix.name }}.csv

GPU/GPUTracking/Base/GPUReconstructionCPU.cxx

Lines changed: 22 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@
3535

3636
#include <atomic>
3737
#include <ctime>
38+
#include <fstream>
3839

3940
#ifndef _WIN32
4041
#include <unistd.h>
@@ -263,19 +264,21 @@ int32_t GPUReconstructionCPU::RunChains()
263264
}
264265
double kernelTotal = 0;
265266
std::vector<double> kernelStepTimes(gpudatatypes::N_RECO_STEPS, 0.);
267+
std::ofstream timingCSVFile;
268+
if (!GetProcessingSettings().timingCSV.empty()) {
269+
timingCSVFile.open(GetProcessingSettings().timingCSV, std::ios::binary | std::ofstream::app);
270+
if (mNEventsProcessed == 1) timingCSVFile << "name,time,count,type\n";
271+
if (!timingCSVFile.is_open()) GPUError("Could not open timing CSV file '%s' for writing", GetProcessingSettings().timingCSV.c_str());
272+
}
266273

267274
if (GetProcessingSettings().debugLevel >= 1) {
268275
for (uint32_t i = 0; i < mTimers.size(); i++) {
269276
double time = 0;
270-
if (mTimers[i] == nullptr) {
271-
continue;
272-
}
277+
if (mTimers[i] == nullptr) continue;
273278
for (int32_t j = 0; j < mTimers[i]->num; j++) {
274279
HighResTimer& timer = mTimers[i]->timer[j];
275280
time += timer.GetElapsedTime();
276-
if (GetProcessingSettings().resetTimers) {
277-
timer.Reset();
278-
}
281+
if (GetProcessingSettings().resetTimers) timer.Reset();
279282
}
280283

281284
uint32_t type = mTimers[i]->type;
@@ -288,7 +291,9 @@ int32_t GPUReconstructionCPU::RunChains()
288291
if (mTimers[i]->memSize && mStatNEvents && time != 0.) {
289292
snprintf(bandwidth, 256, " (%8.3f GB/s - %'14zu bytes - %'14zu per call)", mTimers[i]->memSize / time * 1e-9, mTimers[i]->memSize / mStatNEvents, mTimers[i]->memSize / mStatNEvents / mTimers[i]->count);
290293
}
291-
printf("Execution Time: Task (%c %8ux): %50s Time: %'10.0f us%s\n", type == 0 ? 'K' : 'C', mTimers[i]->count, mTimers[i]->name.c_str(), time * 1000000 / mStatNEvents, bandwidth);
294+
double elapsedTime_ms = time * 1000000 / mStatNEvents;
295+
printf("Execution Time: Task (%c %8ux): %50s Time: %'10.0f us%s\n", type == 0 ? 'K' : 'C', mTimers[i]->count, mTimers[i]->name.c_str(), elapsedTime_ms, bandwidth);
296+
if (timingCSVFile.is_open()) timingCSVFile << mTimers[i]->name << "," << elapsedTime_ms << "," << mTimers[i]->count << ",Task\n";
292297
if (GetProcessingSettings().resetTimers) {
293298
mTimers[i]->count = 0;
294299
mTimers[i]->memSize = 0;
@@ -298,8 +303,10 @@ int32_t GPUReconstructionCPU::RunChains()
298303
if (GetProcessingSettings().recoTaskTiming) {
299304
for (int32_t i = 0; i < gpudatatypes::N_RECO_STEPS; i++) {
300305
if (kernelStepTimes[i] != 0. || mTimersRecoSteps[i].timerTotal.GetElapsedTime() != 0.) {
306+
double elapsedTime_ms = kernelStepTimes[i] * 1000000 / mStatNEvents;
301307
printf("Execution Time: Step : %11s %38s Time: %'10.0f us %64s ( Total Time : %'14.0f us, CPU Time : %'14.0f us, %'7.2fx )\n", "Tasks",
302-
gpudatatypes::RECO_STEP_NAMES[i], kernelStepTimes[i] * 1000000 / mStatNEvents, "", mTimersRecoSteps[i].timerTotal.GetElapsedTime() * 1000000 / mStatNEvents, mTimersRecoSteps[i].timerCPU * 1000000 / mStatNEvents, mTimersRecoSteps[i].timerCPU / mTimersRecoSteps[i].timerTotal.GetElapsedTime());
308+
gpudatatypes::RECO_STEP_NAMES[i], elapsedTime_ms, "", mTimersRecoSteps[i].timerTotal.GetElapsedTime() * 1000000 / mStatNEvents, mTimersRecoSteps[i].timerCPU * 1000000 / mStatNEvents, mTimersRecoSteps[i].timerCPU / mTimersRecoSteps[i].timerTotal.GetElapsedTime());
309+
if (timingCSVFile.is_open()) timingCSVFile << gpudatatypes::RECO_STEP_NAMES[i] << "," << elapsedTime_ms << ",1,Step\n";
303310
}
304311
if (mTimersRecoSteps[i].bytesToGPU) {
305312
printf("Execution Time: Step (D %8ux): %11s %38s Time: %'10.0f us (%8.3f GB/s - %'14zu bytes - %'14zu per call)\n", mTimersRecoSteps[i].countToGPU, "DMA to GPU", gpudatatypes::RECO_STEP_NAMES[i], mTimersRecoSteps[i].timerToGPU.GetElapsedTime() * 1000000 / mStatNEvents,
@@ -320,17 +327,22 @@ int32_t GPUReconstructionCPU::RunChains()
320327
}
321328
}
322329
for (int32_t i = 0; i < gpudatatypes::N_GENERAL_STEPS; i++) {
323-
if (mTimersGeneralSteps[i].GetElapsedTime() != 0.) {
324-
printf("Execution Time: General Step : %50s Time: %'10.0f us\n", gpudatatypes::GENERAL_STEP_NAMES[i], mTimersGeneralSteps[i].GetElapsedTime() * 1000000 / mStatNEvents);
330+
double elapsedTime_ms = mTimersGeneralSteps[i].GetElapsedTime() * 1000000 / mStatNEvents;
331+
if (elapsedTime_ms != 0.) {
332+
printf("Execution Time: General Step : %50s Time: %'10.0f us\n", gpudatatypes::GENERAL_STEP_NAMES[i], elapsedTime_ms);
333+
if (timingCSVFile.is_open()) timingCSVFile << gpudatatypes::GENERAL_STEP_NAMES[i] << "," << elapsedTime_ms << ",1,Step\n";
325334
}
326335
}
327336
if (GetProcessingSettings().debugLevel >= 1) {
328337
mStatKernelTime = kernelTotal * 1000000 / mStatNEvents;
329338
printf("Execution Time: Total : %50s Time: %'10.0f us%s\n", "Total Kernel", mStatKernelTime, nEventReport.c_str());
339+
if (timingCSVFile.is_open()) timingCSVFile << "Total Kernel" << "," << mStatKernelTime << ",1,Total\n";
330340
}
331341
printf("Execution Time: Total : %50s Time: %'10.0f us ( CPU Time : %'10.0f us, %7.2fx ) %s\n", "Total Wall", mStatWallTime, mStatCPUTime * 1000000 / mStatNEvents, mStatCPUTime / mTimerTotal.GetElapsedTime(), nEventReport.c_str());
342+
if (timingCSVFile.is_open()) timingCSVFile << "Total Wall" << "," << mStatWallTime << ",1,Total\n";
332343
} else if (GetProcessingSettings().debugLevel >= 0) {
333344
GPUInfo("Total Wall Time: %10.0f us%s", mStatWallTime, nEventReport.c_str());
345+
if (timingCSVFile.is_open()) timingCSVFile << "Total Wall" << "," << mStatWallTime << ",1,Total\n";
334346
}
335347
if (GetProcessingSettings().resetTimers) {
336348
mStatNEvents = 0;

GPU/GPUTracking/Definitions/GPUSettingsList.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -307,6 +307,7 @@ AddOption(debugLevel, int32_t, -1, "debug", 'd', "Set debug level (-2 = silent,
307307
AddOption(allocDebugLevel, int32_t, 0, "allocDebug", 0, "Some debug output for memory allocations (without messing with normal debug level)")
308308
AddOption(debugMask, uint32_t, (1 << 18) - 1, "debugMask", 0, "Mask for debug output dumps to file")
309309
AddOption(debugLogSuffix, std::string, "", "debugSuffix", 0, "Suffix for debug log files with --debug 6")
310+
AddOption(timingCSV, std::string, "", "", 0, "CSV filename to append the benchmark results. Verbosity determined by parameter --debug.")
310311
AddOption(serializeGPU, int8_t, 0, "", 0, "Synchronize after each kernel call (bit 1) and DMA transfer (bit 2) and identify failures")
311312
AddOption(recoTaskTiming, bool, 0, "", 0, "Perform summary timing after whole reconstruction tasks")
312313
AddOption(deterministicGPUReconstruction, int32_t, -1, "", 0, "Make CPU and GPU debug output comparable (sort / skip concurrent parts), -1 = automatic if debugLevel >= 6 or deterministic compile flag set", def(1))

GPU/GPUTracking/Standalone/Benchmark/standalone.cxx

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -627,7 +627,7 @@ int32_t RunBenchmark(GPUReconstruction* recUse, GPUChainTracking* chainTrackingU
627627
if (configStandalone.runs > 1) {
628628
printf("Run %d (thread %d)\n", iteration + 1, threadId);
629629
}
630-
recUse->SetResetTimers(iRun < configStandalone.runsInit);
630+
recUse->SetResetTimers(iRun < configStandalone.runsInit || configStandalone.proc.resetTimers);
631631
if (configStandalone.outputcontrolmem) {
632632
recUse->SetOutputControl(threadId ? outputmemoryPipeline.get() : outputmemory.get(), configStandalone.outputcontrolmem);
633633
}
@@ -685,7 +685,7 @@ int32_t RunBenchmark(GPUReconstruction* recUse, GPUChainTracking* chainTrackingU
685685
chainTrackingAsync->mIOPtrs.nRawClusters[i] = 0;
686686
}
687687
chainTrackingAsync->mIOPtrs.clustersNative = nullptr;
688-
recAsync->SetResetTimers(iRun < configStandalone.runsInit);
688+
recAsync->SetResetTimers(iRun < configStandalone.runsInit || configStandalone.proc.resetTimers);
689689
tmpRetVal = recAsync->RunChains();
690690
if (tmpRetVal == 0 || tmpRetVal == 2) {
691691
OutputStat(chainTrackingAsync, nullptr, nullptr);

0 commit comments

Comments (0)