Skip to content

Commit 9574083

Browse files
Merge branch 'ngt-ci-pipeline' into template
2 parents 123dc33 + cb85ca6 commit 9574083

File tree

6 files changed

+134
-50
lines changed

6 files changed

+134
-50
lines changed

.github/scripts/csv_to_md.py

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,39 @@
import argparse
import csv

import tabulate as tab

arg_parser = argparse.ArgumentParser()
arg_parser.add_argument('-b', '--baseline', required=True, help='Baseline CSV file')
arg_parser.add_argument('-c', '--current', required=True, help='Current CSV file')
cli_args = arg_parser.parse_args()


def get_2d_list(csv_filename):
    """Read a benchmark CSV (header dropped) into [name, mean, stdev] rows."""
    with open(csv_filename) as csv_file:
        rows = csv.reader(csv_file)
        next(rows)  # skip the header line
        return [[str(name), float(mean), float(stdev)] for name, mean, stdev in rows]


table_baseline = get_2d_list(cli_args.baseline)
table_current = get_2d_list(cli_args.current)


def get_emoji(d, stdev):
    """Map a timing difference to a traffic-light emoji at 95% confidence.

    NOTE(review): d = baseline - current, so d significantly negative
    (current run slower) maps to green and d significantly positive
    (current run faster) maps to red -- confirm this color convention
    is intended.
    """
    z = 1.96  # 95% confidence interval
    if d < -z * stdev:
        return ':green_circle:'
    if d > z * stdev:
        return ':red_circle:'
    return ':white_circle:'


# Build one markdown row per benchmark; rows are matched positionally,
# the assert guards against the two CSVs diverging in order/content.
table = []
for (baseline_name, baseline_mean, _), (name, mean, stdev) in zip(table_baseline, table_current):
    assert baseline_name == name
    diff = baseline_mean - mean
    impact = diff / stdev if stdev != 0.0 else 0.0
    table.append([name, int(mean), f'{stdev:.2f}', int(diff), f'{impact:.2f}', get_emoji(diff, stdev)])

header = ['name', 'mean (ms)', 'stdev \u03C3', 'diff \u0394', '\u0394 / \u03C3', '']
print(tab.tabulate(table, header, tablefmt="github"))

.github/scripts/merge_runs.py

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
import argparse
import csv
import statistics
from collections import defaultdict

parser = argparse.ArgumentParser()
parser.add_argument('-d', '--discard', type=int, default=0, help='Number of initial measurements to discard')
parser.add_argument('-i', '--input', required=True, help='Input CSV file')
parser.add_argument('-o', '--output', required=True, help='Output CSV file')
args = parser.parse_args()

# Collect all timing samples per task name; the input CSV holds one row
# per run, so repeated names accumulate into a sample list.
time_dict = defaultdict(list)
with open(args.input) as csv_file:
    csv_reader = csv.reader(csv_file)
    next(csv_reader)  # skip the header row
    for name, time, _, _ in csv_reader:
        time_dict[name].append(float(time))

# Reduce each sample list to mean/stdev, dropping warm-up runs first.
data = [["name", "time", "stdev"]]
for name, time_list in time_dict.items():
    samples = time_list[args.discard:]
    mean = statistics.mean(samples)
    # statistics.stdev() raises StatisticsError on fewer than two points
    # (possible when --discard leaves a single run); report zero spread.
    stdev = statistics.stdev(samples) if len(samples) > 1 else 0.0
    data.append([name, mean, stdev])

# newline='' as recommended by the csv module docs, otherwise the writer
# emits blank lines between rows on Windows.
with open(args.output, 'w', newline='') as csv_file:
    csv.writer(csv_file).writerows(data)

.github/workflows/standalone-benchmark.yml

Lines changed: 41 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -9,14 +9,6 @@ on:
99

1010
jobs:
1111
benchmark:
12-
env:
13-
ALIBUILD_ARCH_PREFIX: el9-x86_64/Packages
14-
ARTIFACT_FILE: /root/artifact.txt
15-
LD_LIBRARY_PATH: /usr/local/cuda-13.0/compat
16-
MODULEPATH: /cvmfs/alice.cern.ch/etc/toolchain/modulefiles/el9-x86_64:/cvmfs/alice.cern.ch/el9-x86_64/Modules/modulefiles
17-
STANDALONE_DIR: /root/standalone
18-
WORK_DIR: /cvmfs/alice.cern.ch
19-
2012
runs-on: ${{ matrix.runner }}
2113
container: registry.cern.ch/alisw/slc9-gpu-builder@sha256:ea3443f9dfbc770e4b4bce0d1a9ecc0b7a7c16e9f76e416b796d170877220820
2214
strategy:
@@ -36,11 +28,34 @@ jobs:
3628
- name: amd-w7900
3729
runner: cern-nextgen-w7900
3830
cmake_args: -DENABLE_CUDA=0 -DENABLE_HIP=1 -DHIP_AMDGPUTARGET=gfx1100
31+
env:
32+
WORK_DIR: /cvmfs/alice.cern.ch
33+
ALIBUILD_ARCH_PREFIX: el9-x86_64/Packages
34+
MODULEPATH: /cvmfs/alice.cern.ch/etc/toolchain/modulefiles/el9-x86_64:/cvmfs/alice.cern.ch/el9-x86_64/Modules/modulefiles
35+
STANDALONE_DIR: /root/standalone
36+
BENCHMARK_CSV: /root/${{ matrix.name }}.csv
37+
LD_LIBRARY_PATH: /usr/local/cuda-13.0/compat
3938

4039
name: ${{ matrix.name }}
4140
steps:
4241
- name: Checkout Repository
43-
uses: actions/checkout@v4
42+
uses: actions/checkout@v6
43+
44+
- name: Download Files
45+
run: |
46+
mkdir -p ${STANDALONE_DIR}
47+
48+
curl -fL --retry 3 -o ${STANDALONE_DIR}/o2-simple-GPU.out https://cernbox.cern.ch/remote.php/dav/public-files/SfYXgQOHFga2w75/o2-simple-GPU.out
49+
50+
mkdir -p ${STANDALONE_DIR}/baseline
51+
curl -fL --retry 3 -o ${STANDALONE_DIR}/baseline/${{ matrix.name }}.csv https://cernbox.cern.ch/remote.php/dav/public-files/SfYXgQOHFga2w75/baseline/${{ matrix.name }}.csv
52+
53+
mkdir -p ${STANDALONE_DIR}/events
54+
curl -fL --retry 3 -o ${STANDALONE_DIR}/events/o2-simple.tar.xz https://cernbox.cern.ch/remote.php/dav/public-files/SfYXgQOHFga2w75/events/o2-simple.tar.xz
55+
tar -xf ${STANDALONE_DIR}/events/o2-simple.tar.xz -C ${STANDALONE_DIR}/events
56+
57+
curl -fL --retry 3 -o ${STANDALONE_DIR}/events/50kHz.tar.xz https://cernbox.cern.ch/remote.php/dav/public-files/SfYXgQOHFga2w75/events/50kHz.tar.xz
58+
tar -xf ${STANDALONE_DIR}/events/50kHz.tar.xz -C ${STANDALONE_DIR}/events
4459
4560
- name: Build Deterministic
4661
run: &build |
@@ -53,50 +68,38 @@ jobs:
5368
env:
5469
DETERMINISTIC_MODE: GPU
5570

56-
- name: Download Small Event File
57-
run: &download |
58-
mkdir -p ${STANDALONE_DIR}/events
59-
curl -o ${STANDALONE_DIR}/events/${EVENT_FILE}.tar.xz https://cernbox.cern.ch/remote.php/dav/public-files/SfYXgQOHFga2w75/events/${EVENT_FILE}.tar.xz
60-
tar -xf ${STANDALONE_DIR}/events/${EVENT_FILE}.tar.xz -C ${STANDALONE_DIR}/events
61-
ls -la ${STANDALONE_DIR}/events/o2-simple
62-
env:
63-
EVENT_FILE: o2-simple
64-
65-
- name: Test
71+
- name: Test GPU Track Reconstruction
6672
run: |
6773
source /etc/profile.d/modules.sh
6874
module load ninja/fortran-v1.11.1.g9-15 Vc/1.4.5-10 boost/v1.83.0-alice2-57 fmt/11.1.2-14 CMake/v3.31.6-10 ms_gsl/4.2.1-3 Clang/v20.1.7-9 TBB/v2022.3.0-3 ROOT/v6-36-04-alice9-15 ONNXRuntime/v1.22.0-71 GLFW/3.3.2-25
69-
70-
ls -la ${STANDALONE_DIR}/events/o2-simple
71-
72-
${STANDALONE_DIR}/ca -e o2-simple -g --seed 0 --memSize 20000000000 --sync --runs 1 --RTCenable --PROCdeterministicGPUReconstruction 1 --RTCoptSpecialCode 1 --RTCoptConstexpr 1 --debug 6
73-
74-
curl -v -o ${STANDALONE_DIR}/o2-simple-GPU.out https://cernbox.cern.ch/remote.php/dav/public-files/SfYXgQOHFga2w75/o2-simple-GPU.out
75+
cd ${STANDALONE_DIR}
76+
${STANDALONE_DIR}/ca -e o2-simple -g --seed 0 --memSize 20000000000 --sync --runs 1 --RTCenable --PROCdeterministicGPUReconstruction 1 --RTCoptConstexpr 1 --RTCoptSpecialCode 1 --debug 6
7577
cmp ${STANDALONE_DIR}/GPU.out ${STANDALONE_DIR}/o2-simple-GPU.out
76-
rm -rf ${STANDALONE_DIR}/GPU.out ${STANDALONE_DIR}/o2-simple-GPU.out ${STANDALONE_DIR}/build
77-
env:
78-
EVENT_FILE: o2-simple
78+
rm -rf ${STANDALONE_DIR}/GPU.out ${STANDALONE_DIR}/o2-simple-GPU.out ${STANDALONE_DIR}/events/o2-simple ${STANDALONE_DIR}/build
7979
8080
- name: Build Non-Deterministic
8181
run: *build
8282
env:
8383
DETERMINISTIC_MODE: OFF
8484

85-
- name: Download Large Event File
86-
run: *download
87-
env:
88-
EVENT_FILE: 50kHz
89-
90-
- name: Benchmark
85+
- name: Benchmark GPU Track Reconstruction
9186
run: |
9287
source /etc/profile.d/modules.sh
9388
module load ninja/fortran-v1.11.1.g9-15 Vc/1.4.5-10 boost/v1.83.0-alice2-57 fmt/11.1.2-14 CMake/v3.31.6-10 ms_gsl/4.2.1-3 Clang/v20.1.7-9 TBB/v2022.3.0-3 ROOT/v6-36-04-alice9-15 ONNXRuntime/v1.22.0-71 GLFW/3.3.2-25
89+
cd ${STANDALONE_DIR}
90+
${STANDALONE_DIR}/ca -e 50kHz -g --memSize 15000000000 --sync --runs 12 --debug 1 --PROCtimingCSV ${BENCHMARK_CSV}
91+
rm -rf ${STANDALONE_DIR}/events/50kHz ${STANDALONE_DIR}/build
9492
95-
${STANDALONE_DIR}/ca -e 50kHz -g --memSize 15000000000 --sync --runs 1 --RTCenable --RTCoptSpecialCode 1 --RTCoptConstexpr 1 --debug 1 > ${ARTIFACT_FILE}
96-
rm -rf ${STANDALONE_DIR}/events ${STANDALONE_DIR}/build
93+
- name: Display table on GitHub web
94+
run: |
95+
source /etc/profile.d/modules.sh
96+
module load ninja/fortran-v1.11.1.g9-15 Vc/1.4.5-10 boost/v1.83.0-alice2-57 fmt/11.1.2-14 CMake/v3.31.6-10 ms_gsl/4.2.1-3 Clang/v20.1.7-9 TBB/v2022.3.0-3 ROOT/v6-36-04-alice9-15 ONNXRuntime/v1.22.0-71 GLFW/3.3.2-25
97+
python3 ${GITHUB_WORKSPACE}/.github/scripts/merge_runs.py --discard 2 --input ${BENCHMARK_CSV} --output ${BENCHMARK_CSV}
98+
python3 ${GITHUB_WORKSPACE}/.github/scripts/csv_to_md.py --baseline ${STANDALONE_DIR}/baseline/${{ matrix.name }}.csv --current ${BENCHMARK_CSV} >> ${GITHUB_STEP_SUMMARY}
99+
rm -rf ${STANDALONE_DIR}/baseline
97100
98101
- name: Upload Artifact
99-
uses: actions/upload-artifact@v4
102+
uses: actions/upload-artifact@v6
100103
with:
101104
name: ${{ matrix.name }}-artifact
102-
path: /root/artifact.txt
105+
path: /root/${{ matrix.name }}.csv

GPU/GPUTracking/Base/GPUReconstructionCPU.cxx

Lines changed: 22 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@
3535

3636
#include <atomic>
3737
#include <ctime>
38+
#include <fstream>
3839

3940
#ifndef _WIN32
4041
#include <unistd.h>
@@ -263,19 +264,21 @@ int32_t GPUReconstructionCPU::RunChains()
263264
}
264265
double kernelTotal = 0;
265266
std::vector<double> kernelStepTimes(gpudatatypes::N_RECO_STEPS, 0.);
267+
std::ofstream timingCSVFile;
268+
if (!GetProcessingSettings().timingCSV.empty()) {
269+
timingCSVFile.open(GetProcessingSettings().timingCSV, std::ios::binary | std::ofstream::app);
270+
if (mNEventsProcessed == 1) timingCSVFile << "name,time,count,type\n";
271+
if (!timingCSVFile.is_open()) GPUError("Could not open timing CSV file '%s' for writing", GetProcessingSettings().timingCSV.c_str());
272+
}
266273

267274
if (GetProcessingSettings().debugLevel >= 1) {
268275
for (uint32_t i = 0; i < mTimers.size(); i++) {
269276
double time = 0;
270-
if (mTimers[i] == nullptr) {
271-
continue;
272-
}
277+
if (mTimers[i] == nullptr) continue;
273278
for (int32_t j = 0; j < mTimers[i]->num; j++) {
274279
HighResTimer& timer = mTimers[i]->timer[j];
275280
time += timer.GetElapsedTime();
276-
if (GetProcessingSettings().resetTimers) {
277-
timer.Reset();
278-
}
281+
if (GetProcessingSettings().resetTimers) timer.Reset();
279282
}
280283

281284
uint32_t type = mTimers[i]->type;
@@ -288,7 +291,9 @@ int32_t GPUReconstructionCPU::RunChains()
288291
if (mTimers[i]->memSize && mStatNEvents && time != 0.) {
289292
snprintf(bandwidth, 256, " (%8.3f GB/s - %'14zu bytes - %'14zu per call)", mTimers[i]->memSize / time * 1e-9, mTimers[i]->memSize / mStatNEvents, mTimers[i]->memSize / mStatNEvents / mTimers[i]->count);
290293
}
291-
printf("Execution Time: Task (%c %8ux): %50s Time: %'10.0f us%s\n", type == 0 ? 'K' : 'C', mTimers[i]->count, mTimers[i]->name.c_str(), time * 1000000 / mStatNEvents, bandwidth);
294+
double elapsedTime_ms = time * 1000000 / mStatNEvents;
295+
printf("Execution Time: Task (%c %8ux): %50s Time: %'10.0f us%s\n", type == 0 ? 'K' : 'C', mTimers[i]->count, mTimers[i]->name.c_str(), elapsedTime_ms, bandwidth);
296+
if (timingCSVFile.is_open()) timingCSVFile << mTimers[i]->name << "," << elapsedTime_ms << "," << mTimers[i]->count << ",Task\n";
292297
if (GetProcessingSettings().resetTimers) {
293298
mTimers[i]->count = 0;
294299
mTimers[i]->memSize = 0;
@@ -298,8 +303,10 @@ int32_t GPUReconstructionCPU::RunChains()
298303
if (GetProcessingSettings().recoTaskTiming) {
299304
for (int32_t i = 0; i < gpudatatypes::N_RECO_STEPS; i++) {
300305
if (kernelStepTimes[i] != 0. || mTimersRecoSteps[i].timerTotal.GetElapsedTime() != 0.) {
306+
double elapsedTime_ms = kernelStepTimes[i] * 1000000 / mStatNEvents;
301307
printf("Execution Time: Step : %11s %38s Time: %'10.0f us %64s ( Total Time : %'14.0f us, CPU Time : %'14.0f us, %'7.2fx )\n", "Tasks",
302-
gpudatatypes::RECO_STEP_NAMES[i], kernelStepTimes[i] * 1000000 / mStatNEvents, "", mTimersRecoSteps[i].timerTotal.GetElapsedTime() * 1000000 / mStatNEvents, mTimersRecoSteps[i].timerCPU * 1000000 / mStatNEvents, mTimersRecoSteps[i].timerCPU / mTimersRecoSteps[i].timerTotal.GetElapsedTime());
308+
gpudatatypes::RECO_STEP_NAMES[i], elapsedTime_ms, "", mTimersRecoSteps[i].timerTotal.GetElapsedTime() * 1000000 / mStatNEvents, mTimersRecoSteps[i].timerCPU * 1000000 / mStatNEvents, mTimersRecoSteps[i].timerCPU / mTimersRecoSteps[i].timerTotal.GetElapsedTime());
309+
if (timingCSVFile.is_open()) timingCSVFile << gpudatatypes::RECO_STEP_NAMES[i] << "," << elapsedTime_ms << ",1,Step\n";
303310
}
304311
if (mTimersRecoSteps[i].bytesToGPU) {
305312
printf("Execution Time: Step (D %8ux): %11s %38s Time: %'10.0f us (%8.3f GB/s - %'14zu bytes - %'14zu per call)\n", mTimersRecoSteps[i].countToGPU, "DMA to GPU", gpudatatypes::RECO_STEP_NAMES[i], mTimersRecoSteps[i].timerToGPU.GetElapsedTime() * 1000000 / mStatNEvents,
@@ -320,17 +327,22 @@ int32_t GPUReconstructionCPU::RunChains()
320327
}
321328
}
322329
for (int32_t i = 0; i < gpudatatypes::N_GENERAL_STEPS; i++) {
323-
if (mTimersGeneralSteps[i].GetElapsedTime() != 0.) {
324-
printf("Execution Time: General Step : %50s Time: %'10.0f us\n", gpudatatypes::GENERAL_STEP_NAMES[i], mTimersGeneralSteps[i].GetElapsedTime() * 1000000 / mStatNEvents);
330+
double elapsedTime_ms = mTimersGeneralSteps[i].GetElapsedTime() * 1000000 / mStatNEvents;
331+
if (elapsedTime_ms != 0.) {
332+
printf("Execution Time: General Step : %50s Time: %'10.0f us\n", gpudatatypes::GENERAL_STEP_NAMES[i], elapsedTime_ms);
333+
if (timingCSVFile.is_open()) timingCSVFile << gpudatatypes::GENERAL_STEP_NAMES[i] << "," << elapsedTime_ms << ",1,Step\n";
325334
}
326335
}
327336
if (GetProcessingSettings().debugLevel >= 1) {
328337
mStatKernelTime = kernelTotal * 1000000 / mStatNEvents;
329338
printf("Execution Time: Total : %50s Time: %'10.0f us%s\n", "Total Kernel", mStatKernelTime, nEventReport.c_str());
339+
if (timingCSVFile.is_open()) timingCSVFile << "Total Kernel" << "," << mStatKernelTime << ",1,Total\n";
330340
}
331341
printf("Execution Time: Total : %50s Time: %'10.0f us ( CPU Time : %'10.0f us, %7.2fx ) %s\n", "Total Wall", mStatWallTime, mStatCPUTime * 1000000 / mStatNEvents, mStatCPUTime / mTimerTotal.GetElapsedTime(), nEventReport.c_str());
342+
if (timingCSVFile.is_open()) timingCSVFile << "Total Wall" << "," << mStatWallTime << ",1,Total\n";
332343
} else if (GetProcessingSettings().debugLevel >= 0) {
333344
GPUInfo("Total Wall Time: %10.0f us%s", mStatWallTime, nEventReport.c_str());
345+
if (timingCSVFile.is_open()) timingCSVFile << "Total Wall" << "," << mStatWallTime << ",1,Total\n";
334346
}
335347
if (GetProcessingSettings().resetTimers) {
336348
mStatNEvents = 0;

GPU/GPUTracking/Definitions/GPUSettingsList.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -307,6 +307,7 @@ AddOption(debugLevel, int32_t, -1, "debug", 'd', "Set debug level (-2 = silent,
307307
AddOption(allocDebugLevel, int32_t, 0, "allocDebug", 0, "Some debug output for memory allocations (without messing with normal debug level)")
308308
AddOption(debugMask, uint32_t, (1 << 18) - 1, "debugMask", 0, "Mask for debug output dumps to file")
309309
AddOption(debugLogSuffix, std::string, "", "debugSuffix", 0, "Suffix for debug log files with --debug 6")
310+
AddOption(timingCSV, std::string, "", "", 0, "CSV filename to append the benchmark results. Verbosity determined by parameter --debug.")
310311
AddOption(serializeGPU, int8_t, 0, "", 0, "Synchronize after each kernel call (bit 1) and DMA transfer (bit 2) and identify failures")
311312
AddOption(recoTaskTiming, bool, 0, "", 0, "Perform summary timing after whole reconstruction tasks")
312313
AddOption(deterministicGPUReconstruction, int32_t, -1, "", 0, "Make CPU and GPU debug output comparable (sort / skip concurrent parts), -1 = automatic if debugLevel >= 6 or deterministic compile flag set", def(1))

GPU/GPUTracking/Standalone/Benchmark/standalone.cxx

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -627,7 +627,7 @@ int32_t RunBenchmark(GPUReconstruction* recUse, GPUChainTracking* chainTrackingU
627627
if (configStandalone.runs > 1) {
628628
printf("Run %d (thread %d)\n", iteration + 1, threadId);
629629
}
630-
recUse->SetResetTimers(iRun < configStandalone.runsInit);
630+
recUse->SetResetTimers(iRun < configStandalone.runsInit || configStandalone.proc.resetTimers);
631631
if (configStandalone.outputcontrolmem) {
632632
recUse->SetOutputControl(threadId ? outputmemoryPipeline.get() : outputmemory.get(), configStandalone.outputcontrolmem);
633633
}
@@ -685,7 +685,7 @@ int32_t RunBenchmark(GPUReconstruction* recUse, GPUChainTracking* chainTrackingU
685685
chainTrackingAsync->mIOPtrs.nRawClusters[i] = 0;
686686
}
687687
chainTrackingAsync->mIOPtrs.clustersNative = nullptr;
688-
recAsync->SetResetTimers(iRun < configStandalone.runsInit);
688+
recAsync->SetResetTimers(iRun < configStandalone.runsInit || configStandalone.proc.resetTimers);
689689
tmpRetVal = recAsync->RunChains();
690690
if (tmpRetVal == 0 || tmpRetVal == 2) {
691691
OutputStat(chainTrackingAsync, nullptr, nullptr);

0 commit comments

Comments (0)