# Add comment about how to fix IterSwap and InsertionSort for SoA tracks #244
# This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
# Learn more about bidirectional Unicode characters
# Workflow title as shown in the GitHub Actions UI.
name: Standalone Benchmark

# Triggers: manual dispatch, every pull request, and pushes to any branch.
on:
  workflow_dispatch:
  pull_request:
  push:
    # '**' matches every branch name, including names that contain '/'.
    branches: ['**']
jobs:
  # Builds the standalone binary twice (deterministic and non-deterministic),
  # checks the reconstruction output, then benchmarks and profiles it once per
  # matrix entry.
  benchmark:
    runs-on: ${{ matrix.runner }}
    # Container image pinned by digest.
    container: registry.cern.ch/alisw/slc9-gpu-builder@sha256:ea3443f9dfbc770e4b4bce0d1a9ecc0b7a7c16e9f76e416b796d170877220820
    strategy:
      # Let the remaining matrix entries finish even if one of them fails.
      fail-fast: false
      matrix:
        name: [cpu, nvidia-h100, nvidia-l40s, amd-mi300x, amd-w7900]
        # Per-entry settings:
        #   runner          - runner label the job is scheduled on
        #   cmake_args      - backend selection passed to the CMake configure step
        #   profiler_runs   - `--runs` value used by the profiler steps
        #   standalone_runs - `--runs` value used by the plain benchmark step
        #   cpu_gpu         - device-selection flags passed to `ca`
        include:
          - name: cpu
            runner: cern-nextgen-mi300x
            cmake_args: -DENABLE_CUDA=0 -DENABLE_HIP=0
            profiler_runs: 42
            standalone_runs: 42
            cpu_gpu: "-c"
          - name: nvidia-h100
            runner: cern-nextgen-h100
            cmake_args: -DENABLE_CUDA=1 -DENABLE_HIP=0 -DCUDA_COMPUTETARGET=90
            profiler_runs: 21
            standalone_runs: 42
            cpu_gpu: "-g --memSize 20000000000"
          - name: nvidia-l40s
            runner: cern-nextgen-l40s
            cmake_args: -DENABLE_CUDA=1 -DENABLE_HIP=0 -DCUDA_COMPUTETARGET=89
            profiler_runs: 42
            standalone_runs: 42
            cpu_gpu: "-g --memSize 20000000000"
          - name: amd-mi300x
            runner: cern-nextgen-mi300x
            cmake_args: -DENABLE_CUDA=0 -DENABLE_HIP=1 -DHIP_AMDGPUTARGET=gfx942
            profiler_runs: 42
            standalone_runs: 42
            cpu_gpu: "-g --memSize 20000000000"
          - name: amd-w7900
            runner: cern-nextgen-w7900
            cmake_args: -DENABLE_CUDA=0 -DENABLE_HIP=1 -DHIP_AMDGPUTARGET=gfx1100
            profiler_runs: 42
            standalone_runs: 42
            cpu_gpu: "-g --memSize 20000000000"
    env:
      WORK_DIR: /cvmfs/alice.cern.ch
      ALIBUILD_ARCH_PREFIX: el9-x86_64/Packages
      MODULEPATH: /cvmfs/alice.cern.ch/etc/toolchain/modulefiles/el9-x86_64:/cvmfs/alice.cern.ch/el9-x86_64/Modules/modulefiles
      STANDALONE_DIR: /root/standalone
      BENCHMARK_CSV: standalone_${{ matrix.name }}.csv
      PROFILER_CSV: profiler_${{ matrix.name }}.csv
      # Base timing command shared by the benchmark and profiler steps.
      # Each caller appends its own `--runs N`.
      TIMING_CA: ./ca -e 50kHz ${{ matrix.cpu_gpu }} --seed 0 --sync --runsInit 0 --PROCresetTimers 1 --PROCdebugMarkdown 1 --debug 1
      LD_LIBRARY_PATH: /usr/local/cuda-13.0/compat
      # Single source of truth for the pinned toolchain. Every step runs
      # `module load ${O2_MODULES}` (unquoted, so the shell splits it into one
      # argument per module). The folded scalar joins the lines with spaces.
      O2_MODULES: >-
        ninja/fortran-v1.11.1.g9-15
        Vc/1.4.5-10
        boost/v1.83.0-alice2-57
        fmt/11.1.2-14
        CMake/v3.31.6-10
        ms_gsl/4.2.1-3
        Clang/v20.1.7-9
        TBB/v2022.3.0-3
        ROOT/v6-36-04-alice9-15
        ONNXRuntime/v1.22.0-71
        GLFW/3.3.2-25
    name: ${{ matrix.name }}
    steps:
      - name: Checkout Repository
        uses: actions/checkout@v6

      # Fetch the reference output and the two event sets, then unpack them.
      - name: Download Files
        run: |
          mkdir -p ${STANDALONE_DIR}
          curl -fL --retry 3 -o ${STANDALONE_DIR}/o2-simple-GPU.out https://cernbox.cern.ch/remote.php/dav/public-files/SfYXgQOHFga2w75/o2-simple-GPU.out
          mkdir -p ${STANDALONE_DIR}/events
          curl -fL --retry 3 -o ${STANDALONE_DIR}/events/o2-simple.tar.xz https://cernbox.cern.ch/remote.php/dav/public-files/SfYXgQOHFga2w75/events/o2-simple.tar.xz
          tar -xf ${STANDALONE_DIR}/events/o2-simple.tar.xz -C ${STANDALONE_DIR}/events
          curl -fL --retry 3 -o ${STANDALONE_DIR}/events/50kHz.tar.xz https://cernbox.cern.ch/remote.php/dav/public-files/SfYXgQOHFga2w75/events/50kHz.tar.xz
          tar -xf ${STANDALONE_DIR}/events/50kHz.tar.xz -C ${STANDALONE_DIR}/events

      # The build script is anchored (&build) and reused verbatim by
      # "Build Non-Deterministic"; only DETERMINISTIC_MODE differs.
      - name: Build Deterministic
        env:
          DETERMINISTIC_MODE: "GPU"
        run: &build |
          source /etc/profile.d/modules.sh
          module load ${O2_MODULES}
          mkdir -p ${STANDALONE_DIR}
          cmake -B ${STANDALONE_DIR}/build ${{ matrix.cmake_args }} -DENABLE_OPENCL=0 -DGPUCA_BUILD_EVENT_DISPLAY=0 -DGPUCA_DETERMINISTIC_MODE=${DETERMINISTIC_MODE} -DCMAKE_INSTALL_PREFIX=${STANDALONE_DIR} ${GITHUB_WORKSPACE}/GPU/GPUTracking/Standalone/
          cmake --build ${STANDALONE_DIR}/build --target install -j 8

      - name: Test Track Reconstruction
        run: |
          source /etc/profile.d/modules.sh
          module load ${O2_MODULES}
          cd ${STANDALONE_DIR}
          ${STANDALONE_DIR}/ca -e o2-simple ${{ matrix.cpu_gpu }} --seed 0 --sync --runs 1 --RTCenable --PROCdeterministicGPUReconstruction 1 --RTCoptConstexpr 1 --RTCoptSpecialCode 1 --debug 6
          # NOTE(review): cmp takes two operands, so this relies on the glob
          # matching exactly two files - presumably the downloaded reference and
          # the dump written by the run above. Confirm.
          cmp ${STANDALONE_DIR}/*.out
          # Drop the deterministic build so the next step reconfigures from scratch.
          rm -rf ${STANDALONE_DIR}/*.out ${STANDALONE_DIR}/events/o2-simple ${STANDALONE_DIR}/build

      - name: Build Non-Deterministic
        env:
          # Quoted: a bare OFF is a boolean under YAML 1.1 rules, and the CMake
          # option must receive the literal string.
          DETERMINISTIC_MODE: "OFF"
        run: *build

      - name: Benchmark Track Reconstruction
        run: |
          source /etc/profile.d/modules.sh
          module load ${O2_MODULES}
          cd ${STANDALONE_DIR}
          ${TIMING_CA} --runs ${{ matrix.standalone_runs }} --PROCdebugCSV /root/${BENCHMARK_CSV}
          python3 ${GITHUB_WORKSPACE}/.github/scripts/profiler_standalone.py --discard 0 --input /root/${BENCHMARK_CSV} --output /root/summary_${BENCHMARK_CSV}

      - name: Profiler - Nsight Compute
        if: ${{ matrix.name == 'nvidia-h100' }}
        run: |
          dnf install -y cuda-nsight-compute-13-1
          source /etc/profile.d/modules.sh
          module load ${O2_MODULES}
          cd ${STANDALONE_DIR}
          # Generates ${{ matrix.name }}.ncu-rep
          ncu --set none --metrics gpu__time_duration.avg --export ${{ matrix.name }} --clock-control none --force-overwrite ${TIMING_CA} --runs ${{ matrix.profiler_runs }}
          ncu --import ${STANDALONE_DIR}/${{ matrix.name }}.ncu-rep --print-units base --csv > /root/${PROFILER_CSV}
          rm -rf ${STANDALONE_DIR}/events/50kHz ${STANDALONE_DIR}/build
          python3 ${GITHUB_WORKSPACE}/.github/scripts/profiler_ncu.py --input /root/${PROFILER_CSV} --output /root/summary_${PROFILER_CSV}

      - name: Profiler - Nsight Systems
        if: ${{ matrix.name == 'nvidia-l40s' }}
        run: |
          dnf config-manager --add-repo "https://developer.download.nvidia.com/devtools/repos/rhel$(source /etc/os-release; echo ${VERSION_ID%%.*})/$(rpm --eval '%{_arch}' | sed s/aarch/arm/)/"
          # NOTE(review): --nogpgcheck skips package signature verification.
          dnf install --nogpgcheck -y nsight-systems-cli-2026.2.1
          source /etc/profile.d/modules.sh
          module load ${O2_MODULES}
          cd ${STANDALONE_DIR}
          # Generates ${{ matrix.name }}.nsys-rep
          nsys profile -o ${{ matrix.name }} ${TIMING_CA} --runs ${{ matrix.profiler_runs }}
          nsys stats --report cuda_gpu_kern_sum --timeunit usec --force-export=true --format csv ${{ matrix.name }}.nsys-rep > /root/${PROFILER_CSV}
          rm -rf ${STANDALONE_DIR}/events/50kHz ${STANDALONE_DIR}/build
          python3 ${GITHUB_WORKSPACE}/.github/scripts/profiler_nsys.py --input /root/${PROFILER_CSV} --output /root/summary_${PROFILER_CSV}

      - name: Profiler - rocprofv2
        if: ${{ matrix.name == 'amd-mi300x' || matrix.name == 'amd-w7900' }}
        run: |
          source /etc/profile.d/modules.sh
          module load ${O2_MODULES}
          cd ${STANDALONE_DIR}
          # Uses profiler_runs, like the other profiler steps, because the summary
          # step renders this CSV with `--runs ${{ matrix.profiler_runs }}`.
          # Generates results_${{ matrix.name }}.csv
          rocprofv2 --output-directory /root --output-file-name ${{ matrix.name }} ${TIMING_CA} --runs ${{ matrix.profiler_runs }}
          rm -rf ${STANDALONE_DIR}/events/50kHz ${STANDALONE_DIR}/build
          mv /root/results_${{ matrix.name }}.csv /root/${PROFILER_CSV}
          python3 ${GITHUB_WORKSPACE}/.github/scripts/profiler_rocprofv2.py --input /root/${PROFILER_CSV} --output /root/summary_${PROFILER_CSV}

      - name: Upload Artifact
        uses: actions/upload-artifact@v6
        with:
          name: ${{ matrix.name }}-artifact
          path: "/root/*.csv"

      # Skipped for the cpu entry, which runs none of the profiler steps and so
      # has no profiler summary to compare.
      - name: Display table on GitHub web
        if: ${{ matrix.name != 'cpu' }}
        run: |
          source /etc/profile.d/modules.sh
          module load ${O2_MODULES}
          mkdir -p ${STANDALONE_DIR}/baseline
          curl -fL --retry 3 -o ${STANDALONE_DIR}/baseline/summary_${PROFILER_CSV} https://cernbox.cern.ch/remote.php/dav/public-files/SfYXgQOHFga2w75/baseline/summary_${PROFILER_CSV}
          curl -fL --retry 3 -o ${STANDALONE_DIR}/baseline/summary_${BENCHMARK_CSV} https://cernbox.cern.ch/remote.php/dav/public-files/SfYXgQOHFga2w75/baseline/summary_${BENCHMARK_CSV}
          python3 ${GITHUB_WORKSPACE}/.github/scripts/csv_to_md.py --runs ${{ matrix.profiler_runs }} --baseline ${STANDALONE_DIR}/baseline/summary_${PROFILER_CSV} --current /root/summary_${PROFILER_CSV} >> ${GITHUB_STEP_SUMMARY}
          echo -e "\n\n" >> ${GITHUB_STEP_SUMMARY}
          python3 ${GITHUB_WORKSPACE}/.github/scripts/csv_to_md.py --runs ${{ matrix.standalone_runs }} --baseline ${STANDALONE_DIR}/baseline/summary_${BENCHMARK_CSV} --current /root/summary_${BENCHMARK_CSV} >> ${GITHUB_STEP_SUMMARY}
          rm -rf ${STANDALONE_DIR}/baseline