From 5d07c4393aea45f023f0124f2010a15e714a434f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Timoth=C3=A9e=20David--Cl=C3=A9ris?= Date: Sun, 15 Mar 2026 13:08:01 +0100 Subject: [PATCH 01/35] cleanup --- examples/benchmarks/sph_weak_scale_test.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/examples/benchmarks/sph_weak_scale_test.py b/examples/benchmarks/sph_weak_scale_test.py index be488c3bd..90bfacbc2 100644 --- a/examples/benchmarks/sph_weak_scale_test.py +++ b/examples/benchmarks/sph_weak_scale_test.py @@ -159,13 +159,24 @@ result_text += f"res_cnts = {res_cnts}\n" result_text += f"step time = {step_time}\n" + dic_out = { + "world_size": shamrock.sys.world_size(), + "rate": res_rate, + "cnt": res_cnt, + "step_time": step_time, + } + # print the system metrics result_text += "system metrics:\n" for key, value in max_rate_system_metrics.items(): result_text += f"{key}: {value} J\n" + dic_out[key] = value for key, value in max_rate_system_metrics.items(): result_text += f"avg power {key} / step time : {value / step_time} W\n" + dic_out[f"power_{key}"] = value / step_time + + result_text += f"dic_out = {dic_out}\n" print("current results:") print(result_text) From de797a610e3beab44d777f2e17e11c9335caa9a3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Timoth=C3=A9e=20David--Cl=C3=A9ris?= Date: Sun, 15 Mar 2026 13:11:14 +0100 Subject: [PATCH 02/35] cleanup --- env/machine/argonne/aurora/env_oneapi.sh | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/env/machine/argonne/aurora/env_oneapi.sh b/env/machine/argonne/aurora/env_oneapi.sh index 1fdebde40..c59bc8eef 100644 --- a/env/machine/argonne/aurora/env_oneapi.sh +++ b/env/machine/argonne/aurora/env_oneapi.sh @@ -6,6 +6,8 @@ module load python module load ninja function shamconfigure { + # note that the -g flag is set. In principle there is no impact on the perf, but if you run on + # aurora it is better to enable debug symbols for crash reporting. 
cmake \ -S $SHAMROCK_DIR \ -B $BUILD_DIR \ @@ -14,10 +16,11 @@ function shamconfigure { -DINTEL_LLVM_PATH=$(dirname $(which icpx))/.. \ -DCMAKE_CXX_COMPILER=$(which icpx) \ -DCMAKE_C_COMPILER=$(which icx) \ - -DCMAKE_CXX_FLAGS="-fsycl -fp-model=precise" \ + -DCMAKE_CXX_FLAGS="-g -fsycl -fp-model=precise" \ -DCMAKE_EXE_LINKER_FLAGS="-Wl,--copy-dt-needed-entries" \ -DCMAKE_BUILD_TYPE="${SHAMROCK_BUILD_TYPE}" \ -DBUILD_TEST=Yes \ + -DSHAMROCK_USE_CPPTRACE=Yes \ "${CMAKE_OPT[@]}" } From b161128b7ac5a31e6fc4b7df759db6d11dcae743 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Timoth=C3=A9e=20David--Cl=C3=A9ris?= Date: Sun, 15 Mar 2026 22:35:37 +0100 Subject: [PATCH 03/35] better ? --- src/shamsys/src/system_metrics.cpp | 36 +++++++++++++++++++++--------- 1 file changed, 26 insertions(+), 10 deletions(-) diff --git a/src/shamsys/src/system_metrics.cpp b/src/shamsys/src/system_metrics.cpp index aefcfd60d..ee4437a9a 100644 --- a/src/shamsys/src/system_metrics.cpp +++ b/src/shamsys/src/system_metrics.cpp @@ -16,6 +16,7 @@ #include "shambase/aliases_int.hpp" #include "shambase/popen.hpp" #include "shamcomm/local_rank.hpp" +#include "shamcomm/wrapper.hpp" #include "shamsys/system_metrics.hpp" #include @@ -24,35 +25,47 @@ namespace shamsys { class AuroraSystemMetricReporter : public ISystemMetricReporter { public: std::optional get_rank_energy_consummed() override { + shamcomm::mpi::Barrier(MPI_COMM_WORLD); + std::optional ret = std::nullopt; if (shamcomm::is_main_node_rank()) { std::string output = shambase::popen_fetch_output("geopmread BOARD_ENERGY board 0"); - return std::stoull(output.c_str()); + ret = std::stoull(output.c_str()); } - return std::nullopt; + shamcomm::mpi::Barrier(MPI_COMM_WORLD); + return ret; } std::optional get_gpu_energy_consummed() override { + shamcomm::mpi::Barrier(MPI_COMM_WORLD); + std::optional ret = std::nullopt; if (shamcomm::is_main_node_rank()) { std::string output = shambase::popen_fetch_output("geopmread GPU_ENERGY board 0"); - return 
std::stoull(output.c_str()); + ret = std::stoull(output.c_str()); } - return std::nullopt; + shamcomm::mpi::Barrier(MPI_COMM_WORLD); + return ret; } std::optional get_cpu_energy_consummed() override { + shamcomm::mpi::Barrier(MPI_COMM_WORLD); + std::optional ret = std::nullopt; if (shamcomm::is_main_node_rank()) { std::string output = shambase::popen_fetch_output("geopmread CPU_ENERGY board 0"); - return std::stoull(output.c_str()); + ret = std::stoull(output.c_str()); } - return std::nullopt; + shamcomm::mpi::Barrier(MPI_COMM_WORLD); + return ret; } std::optional get_dram_energy_consummed() override { + shamcomm::mpi::Barrier(MPI_COMM_WORLD); + std::optional ret = std::nullopt; if (shamcomm::is_main_node_rank()) { std::string output = shambase::popen_fetch_output("geopmread DRAM_ENERGY board 0"); - return std::stoull(output.c_str()); + ret = std::stoull(output.c_str()); } - return std::nullopt; + shamcomm::mpi::Barrier(MPI_COMM_WORLD); + return ret; } bool support_rank_energy_consummed() override { return true; } @@ -64,12 +77,15 @@ namespace shamsys { class IntelRAPLSystemMetricReport : public ISystemMetricReporter { public: std::optional get_rank_energy_consummed() override { + shamcomm::mpi::Barrier(MPI_COMM_WORLD); + std::optional ret = std::nullopt; if (shamcomm::is_main_node_rank()) { std::string output = shambase::popen_fetch_output( "cat /sys/class/powercap/intel-rapl:0/energy_uj"); - return f64(std::stoull(output.c_str())) * 1e-6; + ret = f64(std::stoull(output.c_str())) * 1e-6; } - return std::nullopt; + shamcomm::mpi::Barrier(MPI_COMM_WORLD); + return ret; } std::optional get_gpu_energy_consummed() override { return std::nullopt; } From 58b4695751b780529cdeaad20c00f5e49dba87b1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Timoth=C3=A9e=20David--Cl=C3=A9ris?= Date: Sun, 15 Mar 2026 23:08:07 +0100 Subject: [PATCH 04/35] better reporting ? 
--- src/shammodels/sph/src/Solver.cpp | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/shammodels/sph/src/Solver.cpp b/src/shammodels/sph/src/Solver.cpp index d2e042373..b6467de41 100644 --- a/src/shammodels/sph/src/Solver.cpp +++ b/src/shammodels/sph/src/Solver.cpp @@ -1579,10 +1579,12 @@ void shammodels::sph::Solver::update_sync_load_values() { template class Kern> shammodels::sph::TimestepLog shammodels::sph::Solver::evolve_once() { - sham::MemPerfInfos mem_perf_infos_start = sham::details::get_mem_perf_info(); - f64 mpi_timer_start = shamcomm::mpi::get_timer("total"); + // has to be first since there is a barrier that may mess the other timers shamsys::SystemMetrics system_metrics_start = shamsys::get_system_metrics(); + sham::MemPerfInfos mem_perf_infos_start = sham::details::get_mem_perf_info(); + f64 mpi_timer_start = shamcomm::mpi::get_timer("total"); + Tscal t_current = solver_config.get_time(); Tscal dt = solver_config.get_dt_sph(); From ff5a902e22f4f6cafbaf0a56bbedc08b43f5f2fd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Timoth=C3=A9e=20David--Cl=C3=A9ris?= Date: Sun, 15 Mar 2026 23:37:48 +0100 Subject: [PATCH 05/35] better reporting ? 
--- examples/benchmarks/sph_weak_scale_test.py | 11 +++--- src/shammodels/common/src/timestep_report.cpp | 27 +++++++++----- src/shammodels/sph/src/Solver.cpp | 6 ++-- src/shammodels/sph/src/SolverLog.cpp | 5 +++ src/shammodels/sph/src/pySPHModel.cpp | 1 + .../include/shamsys/system_metrics.hpp | 16 +++++++-- src/shamsys/src/system_metrics.cpp | 36 ++++++------------- 7 files changed, 60 insertions(+), 42 deletions(-) diff --git a/examples/benchmarks/sph_weak_scale_test.py b/examples/benchmarks/sph_weak_scale_test.py index 90bfacbc2..9278ae739 100644 --- a/examples/benchmarks/sph_weak_scale_test.py +++ b/examples/benchmarks/sph_weak_scale_test.py @@ -167,14 +167,17 @@ } # print the system metrics + metrics_duration = max_rate_system_metrics["duration"] result_text += "system metrics:\n" for key, value in max_rate_system_metrics.items(): - result_text += f"{key}: {value} J\n" - dic_out[key] = value + if not key == "duration": + result_text += f"{key}: {value} J\n" + dic_out[key] = value for key, value in max_rate_system_metrics.items(): - result_text += f"avg power {key} / step time : {value / step_time} W\n" - dic_out[f"power_{key}"] = value / step_time + if not key == "duration": + result_text += f"avg power {key} / step time : {value / metrics_duration} W\n" + dic_out[f"power_{key}"] = value / metrics_duration result_text += f"dic_out = {dic_out}\n" diff --git a/src/shammodels/common/src/timestep_report.cpp b/src/shammodels/common/src/timestep_report.cpp index 2d9ede830..a617e777f 100644 --- a/src/shammodels/common/src/timestep_report.cpp +++ b/src/shammodels/common/src/timestep_report.cpp @@ -69,6 +69,9 @@ std::string shammodels::report_perf_timestep( = optional_gather_power(system_metrics.cpu_energy_consummed); std::vector dram_energy_consummed_all_ranks = optional_gather_power(system_metrics.dram_energy_consummed); + std::vector metric_time_all_ranks + = (report_power_usage) ? 
shamalgs::collective::gather(system_metrics.wall_time) + : std::vector{}; if (shamcomm::world_rank() != 0) { return ""; @@ -88,6 +91,10 @@ std::string shammodels::report_perf_timestep( = std::accumulate(max_mem_device_all_ranks.begin(), max_mem_device_all_ranks.end(), 0_u64); size_t sum_mem_host_total = std::accumulate(max_mem_host_all_ranks.begin(), max_mem_host_all_ranks.end(), 0_u64); + f64 metric_tmax + = (report_power_usage) + ? *std::max_element(metric_time_all_ranks.begin(), metric_time_all_ranks.end()) + : 0._f64; std::vector rank_power_step_all_ranks; std::vector rank_gpu_power_step_all_ranks; @@ -101,28 +108,32 @@ std::string shammodels::report_perf_timestep( for (u32 i = 0; i < shamcomm::world_size(); i++) { if (rank_energy_consummed_all_ranks[i] > 0._f64) { rank_power_step_all_ranks.push_back( - shambase::format("{:.1f} W", f64(rank_energy_consummed_all_ranks[i]) / max_t)); + shambase::format( + "{:.1f} W", f64(rank_energy_consummed_all_ranks[i]) / metric_tmax)); } else { rank_power_step_all_ranks.push_back("N/A"); } if (gpu_energy_consummed_all_ranks[i] > 0._f64) { rank_gpu_power_step_all_ranks.push_back( - shambase::format("{:.1f} W", f64(gpu_energy_consummed_all_ranks[i]) / max_t)); + shambase::format( + "{:.1f} W", f64(gpu_energy_consummed_all_ranks[i]) / metric_tmax)); } else { rank_gpu_power_step_all_ranks.push_back("N/A"); } if (cpu_energy_consummed_all_ranks[i] > 0._f64) { rank_cpu_power_step_all_ranks.push_back( - shambase::format("{:.1f} W", f64(cpu_energy_consummed_all_ranks[i]) / max_t)); + shambase::format( + "{:.1f} W", f64(cpu_energy_consummed_all_ranks[i]) / metric_tmax)); } else { rank_cpu_power_step_all_ranks.push_back("N/A"); } if (dram_energy_consummed_all_ranks[i] > 0._f64) { rank_dram_power_step_all_ranks.push_back( - shambase::format("{:.1f} W", f64(dram_energy_consummed_all_ranks[i]) / max_t)); + shambase::format( + "{:.1f} W", f64(dram_energy_consummed_all_ranks[i]) / metric_tmax)); } else { 
rank_dram_power_step_all_ranks.push_back("N/A"); } @@ -135,10 +146,10 @@ std::string shammodels::report_perf_timestep( cpu_energy_consummed_all_ranks.begin(), cpu_energy_consummed_all_ranks.end(), 0._f64); f64 sum_dram_energy_consummed = std::accumulate( dram_energy_consummed_all_ranks.begin(), dram_energy_consummed_all_ranks.end(), 0._f64); - sum_power_step = shambase::format("{:.1e} W", sum_rank_energy_consummed / max_t); - sum_gpu_power_step = shambase::format("{:.1e} W", sum_gpu_energy_consummed / max_t); - sum_cpu_power_step = shambase::format("{:.1e} W", sum_cpu_energy_consummed / max_t); - sum_dram_power_step = shambase::format("{:.1e} W", sum_dram_energy_consummed / max_t); + sum_power_step = shambase::format("{:.1e} W", sum_rank_energy_consummed / metric_tmax); + sum_gpu_power_step = shambase::format("{:.1e} W", sum_gpu_energy_consummed / metric_tmax); + sum_cpu_power_step = shambase::format("{:.1e} W", sum_cpu_energy_consummed / metric_tmax); + sum_dram_power_step = shambase::format("{:.1e} W", sum_dram_energy_consummed / metric_tmax); } u32 cols_count = 9_u32; diff --git a/src/shammodels/sph/src/Solver.cpp b/src/shammodels/sph/src/Solver.cpp index b6467de41..d783cef92 100644 --- a/src/shammodels/sph/src/Solver.cpp +++ b/src/shammodels/sph/src/Solver.cpp @@ -2650,12 +2650,14 @@ shammodels::sph::TimestepLog shammodels::sph::Solver::evolve_once() tstep.end(); - sham::MemPerfInfos mem_perf_infos_end = sham::details::get_mem_perf_info(); + f64 delta_mpi_timer = shamcomm::mpi::get_timer("total") - mpi_timer_start; + sham::MemPerfInfos mem_perf_infos_end = sham::details::get_mem_perf_info(); + + /// must be after the mpi timer to not count the barrier of the system metrics std::optional rank_energy_consummed_end = shamsys::get_rank_energy_consummed(); shamsys::SystemMetrics system_metrics_end = shamsys::get_system_metrics(); shamsys::SystemMetrics system_metrics_delta = system_metrics_end - system_metrics_start; - f64 delta_mpi_timer = 
shamcomm::mpi::get_timer("total") - mpi_timer_start; f64 t_dev_alloc = (mem_perf_infos_end.time_alloc_device - mem_perf_infos_start.time_alloc_device) + (mem_perf_infos_end.time_free_device - mem_perf_infos_start.time_free_device); diff --git a/src/shammodels/sph/src/SolverLog.cpp b/src/shammodels/sph/src/SolverLog.cpp index 9390574c4..c377163c4 100644 --- a/src/shammodels/sph/src/SolverLog.cpp +++ b/src/shammodels/sph/src/SolverLog.cpp @@ -74,6 +74,8 @@ shamsys::SystemMetrics shammodels::sph::SolverLog::get_last_system_metrics() { = optional_gather_power(last_log.system_metrics.cpu_energy_consummed); std::vector dram_energy_consummed_all_ranks = optional_gather_power(last_log.system_metrics.dram_energy_consummed); + std::vector metric_time_all_ranks + = shamalgs::collective::gather(last_log.system_metrics.wall_time); f64 sum_rank_energy_consummed = std::accumulate( rank_energy_consummed_all_ranks.begin(), rank_energy_consummed_all_ranks.end(), 0._f64); @@ -83,8 +85,11 @@ shamsys::SystemMetrics shammodels::sph::SolverLog::get_last_system_metrics() { cpu_energy_consummed_all_ranks.begin(), cpu_energy_consummed_all_ranks.end(), 0._f64); f64 sum_dram_energy_consummed = std::accumulate( dram_energy_consummed_all_ranks.begin(), dram_energy_consummed_all_ranks.end(), 0._f64); + f64 metric_time_all + = *std::max_element(metric_time_all_ranks.begin(), metric_time_all_ranks.end()); shamsys::SystemMetrics system_metrics; + system_metrics.wall_time = metric_time_all; system_metrics.rank_energy_consummed = (shamsys::support_rank_energy_consummed()) ? 
sum_rank_energy_consummed : std::optional{}; diff --git a/src/shammodels/sph/src/pySPHModel.cpp b/src/shammodels/sph/src/pySPHModel.cpp index 5b3c07ddf..74da727eb 100644 --- a/src/shammodels/sph/src/pySPHModel.cpp +++ b/src/shammodels/sph/src/pySPHModel.cpp @@ -1132,6 +1132,7 @@ void add_instance(py::module &m, std::string name_config, std::string name_model [&](T &self) { auto system_metrics = self.solver.solve_logs.get_last_system_metrics(); py::dict ret; + ret["duration"] = system_metrics.wall_time; if (system_metrics.rank_energy_consummed.has_value()) { ret["rank_energy_consummed"] = system_metrics.rank_energy_consummed.value(); } diff --git a/src/shamsys/include/shamsys/system_metrics.hpp b/src/shamsys/include/shamsys/system_metrics.hpp index a900fc293..4abd1f836 100644 --- a/src/shamsys/include/shamsys/system_metrics.hpp +++ b/src/shamsys/include/shamsys/system_metrics.hpp @@ -17,7 +17,9 @@ #include "shambase/aliases_float.hpp" #include "shambase/memory.hpp" +#include "shambase/stacktrace.hpp" #include "shamcmdopt/env.hpp" +#include "shamcomm/wrapper.hpp" #include #include @@ -76,18 +78,27 @@ namespace shamsys { } struct SystemMetrics { + f64 wall_time; std::optional rank_energy_consummed; std::optional gpu_energy_consummed; std::optional cpu_energy_consummed; std::optional dram_energy_consummed; }; - inline SystemMetrics get_system_metrics() { - return SystemMetrics{ + inline SystemMetrics get_system_metrics(bool barrier = true) { + if (barrier) { + shamcomm::mpi::Barrier(MPI_COMM_WORLD); + } + auto ret = SystemMetrics{ + shambase::details::get_wtime(), get_rank_energy_consummed(), get_gpu_energy_consummed(), get_cpu_energy_consummed(), get_dram_energy_consummed()}; + if (barrier) { + shamcomm::mpi::Barrier(MPI_COMM_WORLD); + } + return ret; } inline SystemMetrics operator-(const SystemMetrics &lhs, const SystemMetrics &rhs) { @@ -98,6 +109,7 @@ namespace shamsys { : std::nullopt; }; return SystemMetrics{ + lhs.wall_time - rhs.wall_time, 
optional_sub(lhs.rank_energy_consummed, rhs.rank_energy_consummed), optional_sub(lhs.gpu_energy_consummed, rhs.gpu_energy_consummed), optional_sub(lhs.cpu_energy_consummed, rhs.cpu_energy_consummed), diff --git a/src/shamsys/src/system_metrics.cpp b/src/shamsys/src/system_metrics.cpp index ee4437a9a..aefcfd60d 100644 --- a/src/shamsys/src/system_metrics.cpp +++ b/src/shamsys/src/system_metrics.cpp @@ -16,7 +16,6 @@ #include "shambase/aliases_int.hpp" #include "shambase/popen.hpp" #include "shamcomm/local_rank.hpp" -#include "shamcomm/wrapper.hpp" #include "shamsys/system_metrics.hpp" #include @@ -25,47 +24,35 @@ namespace shamsys { class AuroraSystemMetricReporter : public ISystemMetricReporter { public: std::optional get_rank_energy_consummed() override { - shamcomm::mpi::Barrier(MPI_COMM_WORLD); - std::optional ret = std::nullopt; if (shamcomm::is_main_node_rank()) { std::string output = shambase::popen_fetch_output("geopmread BOARD_ENERGY board 0"); - ret = std::stoull(output.c_str()); + return std::stoull(output.c_str()); } - shamcomm::mpi::Barrier(MPI_COMM_WORLD); - return ret; + return std::nullopt; } std::optional get_gpu_energy_consummed() override { - shamcomm::mpi::Barrier(MPI_COMM_WORLD); - std::optional ret = std::nullopt; if (shamcomm::is_main_node_rank()) { std::string output = shambase::popen_fetch_output("geopmread GPU_ENERGY board 0"); - ret = std::stoull(output.c_str()); + return std::stoull(output.c_str()); } - shamcomm::mpi::Barrier(MPI_COMM_WORLD); - return ret; + return std::nullopt; } std::optional get_cpu_energy_consummed() override { - shamcomm::mpi::Barrier(MPI_COMM_WORLD); - std::optional ret = std::nullopt; if (shamcomm::is_main_node_rank()) { std::string output = shambase::popen_fetch_output("geopmread CPU_ENERGY board 0"); - ret = std::stoull(output.c_str()); + return std::stoull(output.c_str()); } - shamcomm::mpi::Barrier(MPI_COMM_WORLD); - return ret; + return std::nullopt; } std::optional get_dram_energy_consummed() override { - 
shamcomm::mpi::Barrier(MPI_COMM_WORLD); - std::optional ret = std::nullopt; if (shamcomm::is_main_node_rank()) { std::string output = shambase::popen_fetch_output("geopmread DRAM_ENERGY board 0"); - ret = std::stoull(output.c_str()); + return std::stoull(output.c_str()); } - shamcomm::mpi::Barrier(MPI_COMM_WORLD); - return ret; + return std::nullopt; } bool support_rank_energy_consummed() override { return true; } @@ -77,15 +64,12 @@ namespace shamsys { class IntelRAPLSystemMetricReport : public ISystemMetricReporter { public: std::optional get_rank_energy_consummed() override { - shamcomm::mpi::Barrier(MPI_COMM_WORLD); - std::optional ret = std::nullopt; if (shamcomm::is_main_node_rank()) { std::string output = shambase::popen_fetch_output( "cat /sys/class/powercap/intel-rapl:0/energy_uj"); - ret = f64(std::stoull(output.c_str())) * 1e-6; + return f64(std::stoull(output.c_str())) * 1e-6; } - shamcomm::mpi::Barrier(MPI_COMM_WORLD); - return ret; + return std::nullopt; } std::optional get_gpu_energy_consummed() override { return std::nullopt; } From e0cd91d1dfd56ccace69e22623658307ca067cd5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Timoth=C3=A9e=20David--Cl=C3=A9ris?= Date: Mon, 16 Mar 2026 00:20:41 +0100 Subject: [PATCH 06/35] streamline that shit --- src/shammodels/common/src/timestep_report.cpp | 97 ++--------- src/shammodels/sph/src/SolverLog.cpp | 47 +----- .../include/shamsys/system_metrics.hpp | 37 +++-- src/shamsys/src/system_metrics.cpp | 155 ++++++++++++++++++ 4 files changed, 194 insertions(+), 142 deletions(-) diff --git a/src/shammodels/common/src/timestep_report.cpp b/src/shammodels/common/src/timestep_report.cpp index a617e777f..23db6d3ec 100644 --- a/src/shammodels/common/src/timestep_report.cpp +++ b/src/shammodels/common/src/timestep_report.cpp @@ -56,22 +56,8 @@ std::string shammodels::report_perf_timestep( std::vector max_mem_device_all_ranks = shamalgs::collective::gather(max_mem_device); std::vector max_mem_host_all_ranks = 
shamalgs::collective::gather(max_mem_host); - auto optional_gather_power = [&](const std::optional &value) -> std::vector { - return (report_power_usage) ? shamalgs::collective::gather(value ? value.value() : 0._f64) - : std::vector{}; - }; - - std::vector rank_energy_consummed_all_ranks - = optional_gather_power(system_metrics.rank_energy_consummed); - std::vector gpu_energy_consummed_all_ranks - = optional_gather_power(system_metrics.gpu_energy_consummed); - std::vector cpu_energy_consummed_all_ranks - = optional_gather_power(system_metrics.cpu_energy_consummed); - std::vector dram_energy_consummed_all_ranks - = optional_gather_power(system_metrics.dram_energy_consummed); - std::vector metric_time_all_ranks - = (report_power_usage) ? shamalgs::collective::gather(system_metrics.wall_time) - : std::vector{}; + auto rank_metrics = (report_power_usage) ? shamsys::gather_rank_metrics(system_metrics) + : std::vector{}; if (shamcomm::world_rank() != 0) { return ""; @@ -91,65 +77,16 @@ std::string shammodels::report_perf_timestep( = std::accumulate(max_mem_device_all_ranks.begin(), max_mem_device_all_ranks.end(), 0_u64); size_t sum_mem_host_total = std::accumulate(max_mem_host_all_ranks.begin(), max_mem_host_all_ranks.end(), 0_u64); - f64 metric_tmax - = (report_power_usage) - ? 
*std::max_element(metric_time_all_ranks.begin(), metric_time_all_ranks.end()) - : 0._f64; - std::vector rank_power_step_all_ranks; - std::vector rank_gpu_power_step_all_ranks; - std::vector rank_cpu_power_step_all_ranks; - std::vector rank_dram_power_step_all_ranks; - std::string sum_power_step; - std::string sum_gpu_power_step; - std::string sum_cpu_power_step; - std::string sum_dram_power_step; - if (report_power_usage) { - for (u32 i = 0; i < shamcomm::world_size(); i++) { - if (rank_energy_consummed_all_ranks[i] > 0._f64) { - rank_power_step_all_ranks.push_back( - shambase::format( - "{:.1f} W", f64(rank_energy_consummed_all_ranks[i]) / metric_tmax)); - } else { - rank_power_step_all_ranks.push_back("N/A"); - } - - if (gpu_energy_consummed_all_ranks[i] > 0._f64) { - rank_gpu_power_step_all_ranks.push_back( - shambase::format( - "{:.1f} W", f64(gpu_energy_consummed_all_ranks[i]) / metric_tmax)); - } else { - rank_gpu_power_step_all_ranks.push_back("N/A"); - } + shamsys::SystemMetrics aggregated_metrics = shamsys::aggregate_rank_metrics(rank_metrics); - if (cpu_energy_consummed_all_ranks[i] > 0._f64) { - rank_cpu_power_step_all_ranks.push_back( - shambase::format( - "{:.1f} W", f64(cpu_energy_consummed_all_ranks[i]) / metric_tmax)); - } else { - rank_cpu_power_step_all_ranks.push_back("N/A"); - } - - if (dram_energy_consummed_all_ranks[i] > 0._f64) { - rank_dram_power_step_all_ranks.push_back( - shambase::format( - "{:.1f} W", f64(dram_energy_consummed_all_ranks[i]) / metric_tmax)); - } else { - rank_dram_power_step_all_ranks.push_back("N/A"); - } + std::vector formatted_rank_metrics{}; + shamsys::FormattedSystemMetrics formatted_aggregated_metrics = {}; + if (report_power_usage) { + for (const auto &metric : rank_metrics) { + formatted_rank_metrics.push_back(shamsys::format_system_metrics(metric)); } - f64 sum_rank_energy_consummed = std::accumulate( - rank_energy_consummed_all_ranks.begin(), rank_energy_consummed_all_ranks.end(), 0._f64); - f64 
sum_gpu_energy_consummed = std::accumulate( - gpu_energy_consummed_all_ranks.begin(), gpu_energy_consummed_all_ranks.end(), 0._f64); - f64 sum_cpu_energy_consummed = std::accumulate( - cpu_energy_consummed_all_ranks.begin(), cpu_energy_consummed_all_ranks.end(), 0._f64); - f64 sum_dram_energy_consummed = std::accumulate( - dram_energy_consummed_all_ranks.begin(), dram_energy_consummed_all_ranks.end(), 0._f64); - sum_power_step = shambase::format("{:.1e} W", sum_rank_energy_consummed / metric_tmax); - sum_gpu_power_step = shambase::format("{:.1e} W", sum_gpu_energy_consummed / metric_tmax); - sum_cpu_power_step = shambase::format("{:.1e} W", sum_cpu_energy_consummed / metric_tmax); - sum_dram_power_step = shambase::format("{:.1e} W", sum_dram_energy_consummed / metric_tmax); + formatted_aggregated_metrics = shamsys::format_system_metrics(aggregated_metrics); } u32 cols_count = 9_u32; @@ -217,16 +154,16 @@ std::string shammodels::report_perf_timestep( }; if (report_power_usage) { if (shamsys::support_rank_energy_consummed()) { - row.push_back(rank_power_step_all_ranks[i]); + row.push_back(formatted_rank_metrics[i].rank_power.value_or("N/A")); } if (shamsys::support_gpu_energy_consummed()) { - row.push_back(rank_gpu_power_step_all_ranks[i]); + row.push_back(formatted_rank_metrics[i].gpu_power.value_or("N/A")); } if (shamsys::support_cpu_energy_consummed()) { - row.push_back(rank_cpu_power_step_all_ranks[i]); + row.push_back(formatted_rank_metrics[i].cpu_power.value_or("N/A")); } if (shamsys::support_dram_energy_consummed()) { - row.push_back(rank_dram_power_step_all_ranks[i]); + row.push_back(formatted_rank_metrics[i].dram_power.value_or("N/A")); } } table.add_data(row, Table::right); @@ -265,16 +202,16 @@ std::string shammodels::report_perf_timestep( }; if (report_power_usage) { if (shamsys::support_rank_energy_consummed()) { - all_row.push_back(sum_power_step); + all_row.push_back(formatted_aggregated_metrics.rank_power.value_or("N/A")); } if 
(shamsys::support_gpu_energy_consummed()) { - all_row.push_back(sum_gpu_power_step); + all_row.push_back(formatted_aggregated_metrics.gpu_power.value_or("N/A")); } if (shamsys::support_cpu_energy_consummed()) { - all_row.push_back(sum_cpu_power_step); + all_row.push_back(formatted_aggregated_metrics.cpu_power.value_or("N/A")); } if (shamsys::support_dram_energy_consummed()) { - all_row.push_back(sum_dram_power_step); + all_row.push_back(formatted_aggregated_metrics.dram_power.value_or("N/A")); } } table.add_data(all_row, Table::right); diff --git a/src/shammodels/sph/src/SolverLog.cpp b/src/shammodels/sph/src/SolverLog.cpp index c377163c4..bc0f6bdc2 100644 --- a/src/shammodels/sph/src/SolverLog.cpp +++ b/src/shammodels/sph/src/SolverLog.cpp @@ -59,49 +59,6 @@ shamsys::SystemMetrics shammodels::sph::SolverLog::get_last_system_metrics() { auto &last_log = step_logs.back(); - bool report_power_usage = shamsys::has_reporter(); - - auto optional_gather_power = [&](const std::optional &value) -> std::vector { - return (report_power_usage) ? shamalgs::collective::gather(value ? 
value.value() : 0._f64) - : std::vector{}; - }; - - std::vector rank_energy_consummed_all_ranks - = optional_gather_power(last_log.system_metrics.rank_energy_consummed); - std::vector gpu_energy_consummed_all_ranks - = optional_gather_power(last_log.system_metrics.gpu_energy_consummed); - std::vector cpu_energy_consummed_all_ranks - = optional_gather_power(last_log.system_metrics.cpu_energy_consummed); - std::vector dram_energy_consummed_all_ranks - = optional_gather_power(last_log.system_metrics.dram_energy_consummed); - std::vector metric_time_all_ranks - = shamalgs::collective::gather(last_log.system_metrics.wall_time); - - f64 sum_rank_energy_consummed = std::accumulate( - rank_energy_consummed_all_ranks.begin(), rank_energy_consummed_all_ranks.end(), 0._f64); - f64 sum_gpu_energy_consummed = std::accumulate( - gpu_energy_consummed_all_ranks.begin(), gpu_energy_consummed_all_ranks.end(), 0._f64); - f64 sum_cpu_energy_consummed = std::accumulate( - cpu_energy_consummed_all_ranks.begin(), cpu_energy_consummed_all_ranks.end(), 0._f64); - f64 sum_dram_energy_consummed = std::accumulate( - dram_energy_consummed_all_ranks.begin(), dram_energy_consummed_all_ranks.end(), 0._f64); - f64 metric_time_all - = *std::max_element(metric_time_all_ranks.begin(), metric_time_all_ranks.end()); - - shamsys::SystemMetrics system_metrics; - system_metrics.wall_time = metric_time_all; - system_metrics.rank_energy_consummed = (shamsys::support_rank_energy_consummed()) - ? sum_rank_energy_consummed - : std::optional{}; - system_metrics.gpu_energy_consummed = (shamsys::support_gpu_energy_consummed()) - ? sum_gpu_energy_consummed - : std::optional{}; - system_metrics.cpu_energy_consummed = (shamsys::support_cpu_energy_consummed()) - ? sum_cpu_energy_consummed - : std::optional{}; - system_metrics.dram_energy_consummed = (shamsys::support_dram_energy_consummed()) - ? 
sum_dram_energy_consummed - : std::optional{}; - - return system_metrics; + auto rank_metrics = shamsys::gather_rank_metrics(last_log.system_metrics); + return shamsys::aggregate_rank_metrics(rank_metrics); } diff --git a/src/shamsys/include/shamsys/system_metrics.hpp b/src/shamsys/include/shamsys/system_metrics.hpp index 4abd1f836..a4481e7ed 100644 --- a/src/shamsys/include/shamsys/system_metrics.hpp +++ b/src/shamsys/include/shamsys/system_metrics.hpp @@ -17,9 +17,7 @@ #include "shambase/aliases_float.hpp" #include "shambase/memory.hpp" -#include "shambase/stacktrace.hpp" #include "shamcmdopt/env.hpp" -#include "shamcomm/wrapper.hpp" #include #include @@ -85,21 +83,26 @@ namespace shamsys { std::optional dram_energy_consummed; }; - inline SystemMetrics get_system_metrics(bool barrier = true) { - if (barrier) { - shamcomm::mpi::Barrier(MPI_COMM_WORLD); - } - auto ret = SystemMetrics{ - shambase::details::get_wtime(), - get_rank_energy_consummed(), - get_gpu_energy_consummed(), - get_cpu_energy_consummed(), - get_dram_energy_consummed()}; - if (barrier) { - shamcomm::mpi::Barrier(MPI_COMM_WORLD); - } - return ret; - } + SystemMetrics get_system_metrics(bool barrier = true); + + std::vector gather_rank_metrics(const SystemMetrics &input); + + SystemMetrics aggregate_rank_metrics(const std::vector &input); + + struct FormattedSystemMetrics { + std::string wall_time; + std::optional rank_energy_consummed; + std::optional gpu_energy_consummed; + std::optional cpu_energy_consummed; + std::optional dram_energy_consummed; + std::optional rank_power; + std::optional gpu_power; + std::optional cpu_power; + std::optional dram_power; + }; + + /// Only to be used on deltas, not the raw one + FormattedSystemMetrics format_system_metrics(const SystemMetrics &input); inline SystemMetrics operator-(const SystemMetrics &lhs, const SystemMetrics &rhs) { auto optional_sub = [](const std::optional &lhs, diff --git a/src/shamsys/src/system_metrics.cpp 
b/src/shamsys/src/system_metrics.cpp index aefcfd60d..4c78b0078 100644 --- a/src/shamsys/src/system_metrics.cpp +++ b/src/shamsys/src/system_metrics.cpp @@ -15,7 +15,10 @@ #include "shambase/aliases_int.hpp" #include "shambase/popen.hpp" +#include "shambase/stacktrace.hpp" +#include "shamalgs/collective/reduction.hpp" #include "shamcomm/local_rank.hpp" +#include "shamcomm/wrapper.hpp" #include "shamsys/system_metrics.hpp" #include @@ -145,4 +148,156 @@ namespace shamsys { } return reporter; } + + SystemMetrics get_system_metrics(bool barrier) { + if (barrier) { + shamcomm::mpi::Barrier(MPI_COMM_WORLD); + } + auto ret = SystemMetrics{ + shambase::details::get_wtime(), + get_rank_energy_consummed(), + get_gpu_energy_consummed(), + get_cpu_energy_consummed(), + get_dram_energy_consummed()}; + if (barrier) { + shamcomm::mpi::Barrier(MPI_COMM_WORLD); + } + return ret; + } + + std::vector gather_rank_metrics(const SystemMetrics &input) { + std::vector ret(shamcomm::world_size()); + + auto optional_gather_power = [&](const std::optional &value) -> std::vector { + return shamalgs::collective::gather(value ? value.value() : 0._f64); + }; + + std::vector rank_energy_consummed_all_ranks + = optional_gather_power(input.rank_energy_consummed); + std::vector gpu_energy_consummed_all_ranks + = optional_gather_power(input.gpu_energy_consummed); + std::vector cpu_energy_consummed_all_ranks + = optional_gather_power(input.cpu_energy_consummed); + std::vector dram_energy_consummed_all_ranks + = optional_gather_power(input.dram_energy_consummed); + std::vector metric_time_all_ranks = shamalgs::collective::gather(input.wall_time); + + for (u32 i = 0; i < shamcomm::world_size(); i++) { + ret[i] = SystemMetrics{ + metric_time_all_ranks[i], + (shamsys::support_rank_energy_consummed()) + ? std::optional{rank_energy_consummed_all_ranks[i]} + : std::nullopt, + (shamsys::support_gpu_energy_consummed()) + ? 
std::optional{gpu_energy_consummed_all_ranks[i]} + : std::nullopt, + (shamsys::support_cpu_energy_consummed()) + ? std::optional{cpu_energy_consummed_all_ranks[i]} + : std::nullopt, + (shamsys::support_dram_energy_consummed()) + ? std::optional{dram_energy_consummed_all_ranks[i]} + : std::nullopt, + }; + } + + return ret; + } + + SystemMetrics aggregate_rank_metrics(const std::vector &input) { + f64 sum_rank_energy_consummed = 0._f64; + f64 sum_gpu_energy_consummed = 0._f64; + f64 sum_cpu_energy_consummed = 0._f64; + f64 sum_dram_energy_consummed = 0._f64; + f64 metric_time_all = 0._f64; + + for (const auto &m : input) { + sum_rank_energy_consummed + += (m.rank_energy_consummed ? m.rank_energy_consummed.value() : 0._f64); + sum_gpu_energy_consummed + += (m.gpu_energy_consummed ? m.gpu_energy_consummed.value() : 0._f64); + sum_cpu_energy_consummed + += (m.cpu_energy_consummed ? m.cpu_energy_consummed.value() : 0._f64); + sum_dram_energy_consummed + += (m.dram_energy_consummed ? m.dram_energy_consummed.value() : 0._f64); + metric_time_all = std::max(metric_time_all, m.wall_time); + } + + SystemMetrics system_metrics; + system_metrics.wall_time = metric_time_all; + system_metrics.rank_energy_consummed = (shamsys::support_rank_energy_consummed()) + ? sum_rank_energy_consummed + : std::optional{}; + system_metrics.gpu_energy_consummed = (shamsys::support_gpu_energy_consummed()) + ? sum_gpu_energy_consummed + : std::optional{}; + system_metrics.cpu_energy_consummed = (shamsys::support_cpu_energy_consummed()) + ? sum_cpu_energy_consummed + : std::optional{}; + system_metrics.dram_energy_consummed = (shamsys::support_dram_energy_consummed()) + ? 
sum_dram_energy_consummed + : std::optional{}; + + return system_metrics; + } + + FormattedSystemMetrics format_system_metrics(const SystemMetrics &input) { + FormattedSystemMetrics ret{ + shambase::format("{:.1f} s", input.wall_time), + std::nullopt, + std::nullopt, + std::nullopt, + std::nullopt, + std::nullopt, + std::nullopt, + std::nullopt, + std::nullopt, + }; + + if (input.rank_energy_consummed.has_value()) { + if (input.wall_time > 0._f64 && input.rank_energy_consummed.value() > 0._f64) { + f64 consumed_energy = input.rank_energy_consummed.value(); + f64 power = consumed_energy / input.wall_time; + ret.rank_power = shambase::format("{:.1f} W", power); + ret.rank_energy_consummed = shambase::format("{:.1f} W", consumed_energy); + } else { + ret.rank_power = "N/A"; + ret.rank_energy_consummed = "N/A"; + } + } + if (input.gpu_energy_consummed.has_value()) { + if (input.wall_time > 0._f64 && input.gpu_energy_consummed.value() > 0._f64) { + f64 consumed_energy = input.gpu_energy_consummed.value(); + f64 power = consumed_energy / input.wall_time; + ret.gpu_power = shambase::format("{:.1f} W", power); + ret.gpu_energy_consummed = shambase::format("{:.1f} W", consumed_energy); + } else { + ret.gpu_power = "N/A"; + ret.gpu_energy_consummed = "N/A"; + } + } + if (input.cpu_energy_consummed.has_value()) { + if (input.wall_time > 0._f64 && input.cpu_energy_consummed.value() > 0._f64) { + f64 consumed_energy = input.cpu_energy_consummed.value(); + f64 power = consumed_energy / input.wall_time; + ret.cpu_power = shambase::format("{:.1f} W", power); + ret.cpu_energy_consummed = shambase::format("{:.1f} W", consumed_energy); + } else { + ret.cpu_power = "N/A"; + ret.cpu_energy_consummed = "N/A"; + } + } + if (input.dram_energy_consummed.has_value()) { + if (input.wall_time > 0._f64 && input.dram_energy_consummed.value() > 0._f64) { + f64 consumed_energy = input.dram_energy_consummed.value(); + f64 power = consumed_energy / input.wall_time; + ret.dram_power = 
shambase::format("{:.1f} W", power); + ret.dram_energy_consummed = shambase::format("{:.1f} W", consumed_energy); + } else { + ret.dram_power = "N/A"; + ret.dram_energy_consummed = "N/A"; + } + } + + return ret; + } } // namespace shamsys From 6bc2845a8039138aa50aa12f74bad7d3cb84f6a1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Timoth=C3=A9e=20David--Cl=C3=A9ris?= Date: Mon, 16 Mar 2026 00:49:54 +0100 Subject: [PATCH 07/35] whoopsi --- src/shamsys/src/system_metrics.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/shamsys/src/system_metrics.cpp b/src/shamsys/src/system_metrics.cpp index 4c78b0078..8db366a97 100644 --- a/src/shamsys/src/system_metrics.cpp +++ b/src/shamsys/src/system_metrics.cpp @@ -258,7 +258,7 @@ namespace shamsys { f64 consumed_energy = input.rank_energy_consummed.value(); f64 power = consumed_energy / input.wall_time; ret.rank_power = shambase::format("{:.1f} W", power); - ret.rank_energy_consummed = shambase::format("{:.1f} W", consumed_energy); + ret.rank_energy_consummed = shambase::format("{:.1f} J", consumed_energy); } else { ret.rank_power = "N/A"; ret.rank_energy_consummed = "N/A"; @@ -269,7 +269,7 @@ namespace shamsys { f64 consumed_energy = input.gpu_energy_consummed.value(); f64 power = consumed_energy / input.wall_time; ret.gpu_power = shambase::format("{:.1f} W", power); - ret.gpu_energy_consummed = shambase::format("{:.1f} W", consumed_energy); + ret.gpu_energy_consummed = shambase::format("{:.1f} J", consumed_energy); } else { ret.gpu_power = "N/A"; ret.gpu_energy_consummed = "N/A"; @@ -280,7 +280,7 @@ namespace shamsys { f64 consumed_energy = input.cpu_energy_consummed.value(); f64 power = consumed_energy / input.wall_time; ret.cpu_power = shambase::format("{:.1f} W", power); - ret.cpu_energy_consummed = shambase::format("{:.1f} W", consumed_energy); + ret.cpu_energy_consummed = shambase::format("{:.1f} J", consumed_energy); } else { ret.cpu_power = "N/A"; ret.cpu_energy_consummed = "N/A"; @@ -291,7 
+291,7 @@ namespace shamsys { f64 consumed_energy = input.dram_energy_consummed.value(); f64 power = consumed_energy / input.wall_time; ret.dram_power = shambase::format("{:.1f} W", power); - ret.dram_energy_consummed = shambase::format("{:.1f} W", consumed_energy); + ret.dram_energy_consummed = shambase::format("{:.1f} J", consumed_energy); } else { ret.dram_power = "N/A"; ret.dram_energy_consummed = "N/A"; From ee3632a5df41f410804bfda080b6ce9e861281a6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Timoth=C3=A9e=20David--Cl=C3=A9ris?= Date: Mon, 16 Mar 2026 01:03:07 +0100 Subject: [PATCH 08/35] correct max mem usage --- examples/benchmarks/sph_weak_scale_test.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/examples/benchmarks/sph_weak_scale_test.py b/examples/benchmarks/sph_weak_scale_test.py index 9278ae739..7cc979606 100644 --- a/examples/benchmarks/sph_weak_scale_test.py +++ b/examples/benchmarks/sph_weak_scale_test.py @@ -14,6 +14,8 @@ result_text = "" for N_target_base in [32e6]: + shamrock.backends.reset_mem_info_max() + gamma = 5.0 / 3.0 rho_g = 1 target_tot_u = 1 @@ -117,6 +119,8 @@ model.set_cfl_multipler(1e-4) model.set_cfl_mult_stiffness(1e6) + shamrock.backends.reset_mem_info_max() + # converge smoothing length and compute initial dt model.timestep() From f815d5f529acbdbf3a08f85df4b503c3c34a69c4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Timoth=C3=A9e=20David--Cl=C3=A9ris?= Date: Mon, 16 Mar 2026 11:30:46 +0100 Subject: [PATCH 09/35] try to permute time mesure and power mesure --- src/shamsys/src/system_metrics.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/shamsys/src/system_metrics.cpp b/src/shamsys/src/system_metrics.cpp index 8db366a97..9bc539647 100644 --- a/src/shamsys/src/system_metrics.cpp +++ b/src/shamsys/src/system_metrics.cpp @@ -154,11 +154,12 @@ namespace shamsys { shamcomm::mpi::Barrier(MPI_COMM_WORLD); } auto ret = SystemMetrics{ - shambase::details::get_wtime(), + 0, get_rank_energy_consummed(), 
get_gpu_energy_consummed(), get_cpu_energy_consummed(), get_dram_energy_consummed()}; + ret.wall_time = shambase::details::get_wtime(); if (barrier) { shamcomm::mpi::Barrier(MPI_COMM_WORLD); } From cdca66abe4e3fa414071628a39fc959534e94eab Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Timoth=C3=A9e=20David--Cl=C3=A9ris?= Date: Mon, 16 Mar 2026 12:11:15 +0100 Subject: [PATCH 10/35] attempt --- src/shammodels/sph/src/Solver.cpp | 9 ++++++--- src/shamsys/src/system_metrics.cpp | 6 +++--- 2 files changed, 9 insertions(+), 6 deletions(-) diff --git a/src/shammodels/sph/src/Solver.cpp b/src/shammodels/sph/src/Solver.cpp index d783cef92..daf214c95 100644 --- a/src/shammodels/sph/src/Solver.cpp +++ b/src/shammodels/sph/src/Solver.cpp @@ -1580,7 +1580,10 @@ template class Kern> shammodels::sph::TimestepLog shammodels::sph::Solver::evolve_once() { // has to be first since there is a barrier that may mess the other timers + shambase::Timer timer_system_metrics; + timer_system_metrics.start(); shamsys::SystemMetrics system_metrics_start = shamsys::get_system_metrics(); + timer_system_metrics.end(); sham::MemPerfInfos mem_perf_infos_start = sham::details::get_mem_perf_info(); f64 mpi_timer_start = shamcomm::mpi::get_timer("total"); @@ -2654,9 +2657,9 @@ shammodels::sph::TimestepLog shammodels::sph::Solver::evolve_once() sham::MemPerfInfos mem_perf_infos_end = sham::details::get_mem_perf_info(); /// must be after the mpi timer to not count the barrier of the system metrics - std::optional rank_energy_consummed_end = shamsys::get_rank_energy_consummed(); - shamsys::SystemMetrics system_metrics_end = shamsys::get_system_metrics(); - shamsys::SystemMetrics system_metrics_delta = system_metrics_end - system_metrics_start; + shamsys::SystemMetrics system_metrics_end = shamsys::get_system_metrics(); + shamsys::SystemMetrics system_metrics_delta = system_metrics_end - system_metrics_start; + system_metrics_delta.wall_time -= timer_system_metrics.elasped_sec(); f64 t_dev_alloc = 
(mem_perf_infos_end.time_alloc_device - mem_perf_infos_start.time_alloc_device) diff --git a/src/shamsys/src/system_metrics.cpp b/src/shamsys/src/system_metrics.cpp index 9bc539647..ddb5275d5 100644 --- a/src/shamsys/src/system_metrics.cpp +++ b/src/shamsys/src/system_metrics.cpp @@ -153,13 +153,13 @@ namespace shamsys { if (barrier) { shamcomm::mpi::Barrier(MPI_COMM_WORLD); } - auto ret = SystemMetrics{ - 0, + f64 wall_time = shambase::details::get_wtime(); + auto ret = SystemMetrics{ + wall_time, get_rank_energy_consummed(), get_gpu_energy_consummed(), get_cpu_energy_consummed(), get_dram_energy_consummed()}; - ret.wall_time = shambase::details::get_wtime(); if (barrier) { shamcomm::mpi::Barrier(MPI_COMM_WORLD); } From d811e5f0bd5a42742f5558106702132713b5d118 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Timoth=C3=A9e=20David--Cl=C3=A9ris?= Date: Mon, 16 Mar 2026 13:22:44 +0100 Subject: [PATCH 11/35] attempt --- src/shammodels/sph/src/Solver.cpp | 4 ---- 1 file changed, 4 deletions(-) diff --git a/src/shammodels/sph/src/Solver.cpp b/src/shammodels/sph/src/Solver.cpp index daf214c95..6e5450863 100644 --- a/src/shammodels/sph/src/Solver.cpp +++ b/src/shammodels/sph/src/Solver.cpp @@ -1580,10 +1580,7 @@ template class Kern> shammodels::sph::TimestepLog shammodels::sph::Solver::evolve_once() { // has to be first since there is a barrier that may mess the other timers - shambase::Timer timer_system_metrics; - timer_system_metrics.start(); shamsys::SystemMetrics system_metrics_start = shamsys::get_system_metrics(); - timer_system_metrics.end(); sham::MemPerfInfos mem_perf_infos_start = sham::details::get_mem_perf_info(); f64 mpi_timer_start = shamcomm::mpi::get_timer("total"); @@ -2659,7 +2656,6 @@ shammodels::sph::TimestepLog shammodels::sph::Solver::evolve_once() /// must be after the mpi timer to not count the barrier of the system metrics shamsys::SystemMetrics system_metrics_end = shamsys::get_system_metrics(); shamsys::SystemMetrics system_metrics_delta = 
system_metrics_end - system_metrics_start; - system_metrics_delta.wall_time -= timer_system_metrics.elasped_sec(); f64 t_dev_alloc = (mem_perf_infos_end.time_alloc_device - mem_perf_infos_start.time_alloc_device) From 775d26ac258ebbf8b571c840f94d1d50634df29a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Timoth=C3=A9e=20David--Cl=C3=A9ris?= Date: Mon, 16 Mar 2026 14:41:50 +0100 Subject: [PATCH 12/35] add duration --- examples/benchmarks/sph_weak_scale_test.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/examples/benchmarks/sph_weak_scale_test.py b/examples/benchmarks/sph_weak_scale_test.py index 7cc979606..5a0bda2a9 100644 --- a/examples/benchmarks/sph_weak_scale_test.py +++ b/examples/benchmarks/sph_weak_scale_test.py @@ -183,6 +183,8 @@ result_text += f"avg power {key} / step time : {value / metrics_duration} W\n" dic_out[f"power_{key}"] = value / metrics_duration + dic_out["system_metric_duration"] = metrics_duration + result_text += f"dic_out = {dic_out}\n" print("current results:") From 1d533025d5c4061a51fb1a404db8b1c99c99cb2a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Timoth=C3=A9e=20David--Cl=C3=A9ris?= Date: Tue, 17 Mar 2026 09:58:48 +0100 Subject: [PATCH 13/35] add support for linked geopm --- src/shamsys/CMakeLists.txt | 16 +++++++++ src/shamsys/src/system_metrics.cpp | 55 ++++++++++++++++++++++++++++-- 2 files changed, 69 insertions(+), 2 deletions(-) diff --git a/src/shamsys/CMakeLists.txt b/src/shamsys/CMakeLists.txt index 76aa8d691..f7a631b8a 100644 --- a/src/shamsys/CMakeLists.txt +++ b/src/shamsys/CMakeLists.txt @@ -44,6 +44,22 @@ if(SHAMROCK_USE_CPPTRACE) target_compile_definitions(shamsys PUBLIC -DSHAMROCK_USE_CPPTRACE) endif() +include(CheckIncludeFileCXX) + +check_include_file_cxx(geopm/PlatformIO.hpp HAVE_GEOPM_HEADER) +find_library(GEOPM_LIB geopmd) + +if(HAVE_GEOPM_HEADER AND GEOPM_LIB) + option(SHAMROCK_USE_GEOPM "use geopm tooling" Off) + + message("-- SHAMROCK_USE_GEOPM is set to ${SHAMROCK_USE_GEOPM}") + + if(SHAMROCK_USE_GEOPM) + 
target_link_libraries(shamsys PUBLIC ${GEOPM_LIB}) + target_compile_definitions(shamsys PUBLIC SHAMROCK_USE_GEOPM) + endif() +endif() + target_include_directories(shamsys PUBLIC "$" "$") diff --git a/src/shamsys/src/system_metrics.cpp b/src/shamsys/src/system_metrics.cpp index d4096af5a..b92581960 100644 --- a/src/shamsys/src/system_metrics.cpp +++ b/src/shamsys/src/system_metrics.cpp @@ -22,8 +22,52 @@ #include "shamsys/system_metrics.hpp" #include +#ifdef SHAMROCK_USE_GEOPM + #include + #include +#endif + namespace shamsys { +#ifdef SHAMROCK_USE_GEOPM + + class AuroraSystemMetricReporterLinked : public ISystemMetricReporter { + public: + std::optional get_rank_energy_consummed() override { + if (shamcomm::is_main_node_rank()) { + return geopm::platform_io().read_signal("BOARD_ENERGY", GEOPM_DOMAIN_BOARD, 0); + } + return std::nullopt; + } + + std::optional get_gpu_energy_consummed() override { + if (shamcomm::is_main_node_rank()) { + return geopm::platform_io().read_signal("GPU_ENERGY", GEOPM_DOMAIN_BOARD, 0); + } + return std::nullopt; + } + + std::optional get_cpu_energy_consummed() override { + if (shamcomm::is_main_node_rank()) { + return geopm::platform_io().read_signal("CPU_ENERGY", GEOPM_DOMAIN_BOARD, 0); + } + return std::nullopt; + } + + std::optional get_dram_energy_consummed() override { + if (shamcomm::is_main_node_rank()) { + return geopm::platform_io().read_signal("DRAM_ENERGY", GEOPM_DOMAIN_BOARD, 0); + } + return std::nullopt; + } + + bool support_rank_energy_consummed() override { return true; } + bool support_gpu_energy_consummed() override { return true; } + bool support_cpu_energy_consummed() override { return true; } + bool support_dram_energy_consummed() override { return true; } + }; +#endif + class AuroraSystemMetricReporter : public ISystemMetricReporter { public: std::optional get_rank_energy_consummed() override { @@ -112,14 +156,18 @@ namespace shamsys { std::unique_ptr make_reporter(std::string_view reporter_name) { if (reporter_name 
== "aurora") { return std::make_unique(); +#ifdef SHAMROCK_USE_GEOPM + } else if (reporter_name == "aurora-linked") { + return std::make_unique(); +#endif } else if (reporter_name == "intel-rapl") { return std::make_unique(); } else if (reporter_name == "noop" || reporter_name == "none" || reporter_name == "") { return std::make_unique(); } else { throw shambase::make_except_with_loc(shambase::format( - "Unknown system metrics reporter: {}, valid reporters are: aurora, intel-rapl, " - "noop", + "Unknown system metrics reporter: {}, valid reporters are: aurora, aurora-linked, " + "intel-rapl, noop", reporter_name)); } return std::make_unique(); @@ -150,6 +198,9 @@ namespace shamsys { } SystemMetrics get_system_metrics(bool barrier) { + // Ensure that barriers aren't used if there is no reporter + barrier = barrier && has_reporter(); + if (barrier) { shamcomm::mpi::Barrier(MPI_COMM_WORLD); } From 9011b2ad163790489d574400f96310973f45f2bb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Timoth=C3=A9e=20David--Cl=C3=A9ris?= Date: Tue, 17 Mar 2026 10:00:41 +0100 Subject: [PATCH 14/35] enable geopm on aurora --- env/machine/argonne/aurora/env_oneapi.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/env/machine/argonne/aurora/env_oneapi.sh b/env/machine/argonne/aurora/env_oneapi.sh index c59bc8eef..65bb36f34 100644 --- a/env/machine/argonne/aurora/env_oneapi.sh +++ b/env/machine/argonne/aurora/env_oneapi.sh @@ -21,6 +21,7 @@ function shamconfigure { -DCMAKE_BUILD_TYPE="${SHAMROCK_BUILD_TYPE}" \ -DBUILD_TEST=Yes \ -DSHAMROCK_USE_CPPTRACE=Yes \ + -DSHAMROCK_USE_GEOPM=Yes \ "${CMAKE_OPT[@]}" } From 9dc904d977a5cc139e20a0d9c33001fd90f29ee0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Timoth=C3=A9e=20David--Cl=C3=A9ris?= Date: Tue, 17 Mar 2026 11:12:42 +0100 Subject: [PATCH 15/35] shut up warnings --- src/pylib/shamrock/utils/plot/__init__.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/pylib/shamrock/utils/plot/__init__.py 
b/src/pylib/shamrock/utils/plot/__init__.py index a0e302908..f5ef6899a 100644 --- a/src/pylib/shamrock/utils/plot/__init__.py +++ b/src/pylib/shamrock/utils/plot/__init__.py @@ -15,7 +15,7 @@ _HAS_MATPLOTLIB = True except ImportError: _HAS_MATPLOTLIB = False - print("Warning: matplotlib is not installed, some Shamrock functions will not be available") + # print("Warning: matplotlib is not installed, some Shamrock functions will not be available") try: from PIL import Image @@ -23,7 +23,7 @@ _HAS_PIL = True except ImportError: _HAS_PIL = False - print("Warning: PIL is not installed, some Shamrock functions will not be available") + # print("Warning: PIL is not installed, some Shamrock functions will not be available") if _HAS_MATPLOTLIB and _HAS_PIL: __all__.append("show_image_sequence") From dcd76b050e596cfbc4bb1ac9af39c42effd64ec3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Timoth=C3=A9e=20David--Cl=C3=A9ris?= Date: Thu, 19 Mar 2026 17:34:31 +0100 Subject: [PATCH 16/35] better ? --- examples/benchmarks/sph_weak_scale_test.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/examples/benchmarks/sph_weak_scale_test.py b/examples/benchmarks/sph_weak_scale_test.py index 5a0bda2a9..69767b5cc 100644 --- a/examples/benchmarks/sph_weak_scale_test.py +++ b/examples/benchmarks/sph_weak_scale_test.py @@ -13,7 +13,7 @@ result_text = "" -for N_target_base in [32e6]: +for N_target_base in [1e6]: shamrock.backends.reset_mem_info_max() gamma = 5.0 / 3.0 @@ -25,7 +25,7 @@ compute_multiplier = shamrock.sys.world_size() # compute_multiplier = 12 - scheduler_split_val = int(2e7) + scheduler_split_val = int(1e5) scheduler_merge_val = int(1) N_target = N_target_base * compute_multiplier @@ -102,7 +102,7 @@ model.set_value_in_a_box("uint", "f64", 0, bmin, bmax) - rinj = 8 * dr + rinj = 16 * dr u_inj = 1 model.add_kernel_value("uint", "f64", u_inj, (0, 0, 0), rinj) @@ -116,7 +116,7 @@ model.set_cfl_cour(0.1) model.set_cfl_force(0.1) - model.set_cfl_multipler(1e-4) + 
model.set_cfl_multipler(1e-6) model.set_cfl_mult_stiffness(1e6) shamrock.backends.reset_mem_info_max() From 2ebb27a63e682f266c56a666159ab7d887d81c73 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Timoth=C3=A9e=20David--Cl=C3=A9ris?= Date: Fri, 20 Mar 2026 00:04:32 +0100 Subject: [PATCH 17/35] more debug infos --- src/shamalgs/src/collective/sparse_exchange.cpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/shamalgs/src/collective/sparse_exchange.cpp b/src/shamalgs/src/collective/sparse_exchange.cpp index 6e6625544..970d35d05 100644 --- a/src/shamalgs/src/collective/sparse_exchange.cpp +++ b/src/shamalgs/src/collective/sparse_exchange.cpp @@ -56,6 +56,7 @@ namespace shamalgs::collective { /// fetch u64_2 from global message data std::vector fetch_global_message_data( const std::vector &messages_send) { + __shamrock_stack_entry(); std::vector local_data = std::vector(messages_send.size()); @@ -84,6 +85,7 @@ namespace shamalgs::collective { /// decode message to get message std::vector decode_all_message(const std::vector &global_data) { + __shamrock_stack_entry(); std::vector message_all(global_data.size()); for (u64 i = 0; i < global_data.size(); i++) { message_all[i] = unpack(global_data[i]); @@ -94,6 +96,7 @@ namespace shamalgs::collective { /// compute message tags void compute_tags(std::vector &message_all) { + __shamrock_stack_entry(); std::vector tag_map(shamcomm::world_size(), 0); From 8f7087bd5f7ac93b28fd9b0532c9c5764c2d3541 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Timoth=C3=A9e=20David--Cl=C3=A9ris?= Date: Fri, 20 Mar 2026 00:20:13 +0100 Subject: [PATCH 18/35] dammit --- examples/benchmarks/sph_weak_scale_test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/benchmarks/sph_weak_scale_test.py b/examples/benchmarks/sph_weak_scale_test.py index 69767b5cc..74944a270 100644 --- a/examples/benchmarks/sph_weak_scale_test.py +++ b/examples/benchmarks/sph_weak_scale_test.py @@ -13,7 +13,7 @@ result_text = "" -for N_target_base in 
[1e6]: +for N_target_base in [32e6]: shamrock.backends.reset_mem_info_max() gamma = 5.0 / 3.0 From 36b2516fc5624e661c024fee9af511b8cbf62e45 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Timoth=C3=A9e=20David--Cl=C3=A9ris?= Date: Fri, 20 Mar 2026 00:20:58 +0100 Subject: [PATCH 19/35] dammit --- examples/benchmarks/sph_weak_scale_test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/benchmarks/sph_weak_scale_test.py b/examples/benchmarks/sph_weak_scale_test.py index 74944a270..c21bea71e 100644 --- a/examples/benchmarks/sph_weak_scale_test.py +++ b/examples/benchmarks/sph_weak_scale_test.py @@ -25,7 +25,7 @@ compute_multiplier = shamrock.sys.world_size() # compute_multiplier = 12 - scheduler_split_val = int(1e5) + scheduler_split_val = int(2e7) scheduler_merge_val = int(1) N_target = N_target_base * compute_multiplier From 5e2d837da403dd12356eaef780d3ce8e11ce0ecb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Timoth=C3=A9e=20David--Cl=C3=A9ris?= Date: Sat, 21 Mar 2026 00:11:14 +0100 Subject: [PATCH 20/35] more steps --- examples/benchmarks/sph_weak_scale_test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/benchmarks/sph_weak_scale_test.py b/examples/benchmarks/sph_weak_scale_test.py index c21bea71e..4e9ba9dfb 100644 --- a/examples/benchmarks/sph_weak_scale_test.py +++ b/examples/benchmarks/sph_weak_scale_test.py @@ -129,7 +129,7 @@ res_cnts = [] res_system_metrics = [] - for i in range(5): + for i in range(10): shamrock.sys.mpi_barrier() model.timestep() From e698051a384dc7aca4f506480d9e3f4b795ad429 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Timoth=C3=A9e=20David--Cl=C3=A9ris?= Date: Sun, 22 Mar 2026 21:56:36 +0100 Subject: [PATCH 21/35] better ? 
--- examples/benchmarks/sph_weak_scale_test.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/examples/benchmarks/sph_weak_scale_test.py b/examples/benchmarks/sph_weak_scale_test.py index 4e9ba9dfb..a71226a6b 100644 --- a/examples/benchmarks/sph_weak_scale_test.py +++ b/examples/benchmarks/sph_weak_scale_test.py @@ -63,7 +63,6 @@ ) cfg.set_boundary_periodic() cfg.set_eos_adiabatic(gamma) - cfg.set_max_neigh_cache_size(int(100e9)) cfg.print_status() model.set_solver_config(cfg) model.init_scheduler(scheduler_split_val, scheduler_merge_val) @@ -116,9 +115,6 @@ model.set_cfl_cour(0.1) model.set_cfl_force(0.1) - model.set_cfl_multipler(1e-6) - model.set_cfl_mult_stiffness(1e6) - shamrock.backends.reset_mem_info_max() # converge smoothing length and compute initial dt @@ -131,6 +127,9 @@ for i in range(10): shamrock.sys.mpi_barrier() + + # To replay the same step + model.set_next_dt(0.0) model.timestep() tmp_res_rate, tmp_res_cnt, tmp_system_metrics = ( From 907933994822f927f124ebebd51389f96cd1f9d0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Timoth=C3=A9e=20David--Cl=C3=A9ris?= Date: Mon, 23 Mar 2026 00:00:17 +0100 Subject: [PATCH 22/35] more omp --- src/shamrock/include/shamrock/scheduler/SerialPatchTree.hpp | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/shamrock/include/shamrock/scheduler/SerialPatchTree.hpp b/src/shamrock/include/shamrock/scheduler/SerialPatchTree.hpp index 8449755de..b2decdfb4 100644 --- a/src/shamrock/include/shamrock/scheduler/SerialPatchTree.hpp +++ b/src/shamrock/include/shamrock/scheduler/SerialPatchTree.hpp @@ -269,10 +269,13 @@ class SerialPatchTree { sycl::queue &queue, shamrock::patch::PatchField pfield, Func &&reducer) { + __shamrock_stack_entry(); + shamrock::patch::PatchtreeField ptfield; ptfield.allocate(get_element_count()); { + __shamrock_stack_entry(); sycl::host_accessor lpid{ shambase::get_check_ref(linked_patch_ids_buf), sycl::read_only}; sycl::host_accessor tree_field{ @@ -280,6 +283,8 @@ class 
SerialPatchTree { // init reduction std::unordered_map &idp_to_gid = sched.patch_list.id_patch_to_global_idx; + +#pragma omp parallel for for (u64 idx = 0; idx < get_element_count(); idx++) { tree_field[idx] = (lpid[idx] != u64_max) ? pfield.get(lpid[idx]) : T(); } From 05bf4f70a1eba58135c51d62362605ec9528a350 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Timoth=C3=A9e=20David--Cl=C3=A9ris?= Date: Tue, 24 Mar 2026 15:26:30 +0100 Subject: [PATCH 23/35] print --- src/shammodels/gsph/src/modules/GSPHGhostHandler.cpp | 4 ++-- src/shammodels/sph/include/shammodels/sph/SPHUtilities.hpp | 5 ++++- src/shammodels/sph/src/BasicSPHGhosts.cpp | 4 ++-- 3 files changed, 8 insertions(+), 5 deletions(-) diff --git a/src/shammodels/gsph/src/modules/GSPHGhostHandler.cpp b/src/shammodels/gsph/src/modules/GSPHGhostHandler.cpp index 070fc6f85..531a6e750 100644 --- a/src/shammodels/gsph/src/modules/GSPHGhostHandler.cpp +++ b/src/shammodels/gsph/src/modules/GSPHGhostHandler.cpp @@ -344,13 +344,13 @@ auto GSPHGhostHandler::gen_id_table_interfaces(GeneratorMap &&gen) for (auto &[k, v] : send_count_stats) { if (v > 0.2) { - warn_log += shambase::format("\n patch {} high interf/patch volume: {}", k, v); + // warn_log += shambase::format("\n patch {} high interf/patch volume: {}", k, v); has_warn = true; } } if (has_warn && shamcomm::world_rank() == 0) { - warn_log = "\n This can lead to high mpi " + warn_log = "\n High interf/patch volume. 
This can lead to high mpi " "overhead, try to increase the patch split crit" + warn_log; } diff --git a/src/shammodels/sph/include/shammodels/sph/SPHUtilities.hpp b/src/shammodels/sph/include/shammodels/sph/SPHUtilities.hpp index af5f17a9b..64f7b0ed4 100644 --- a/src/shammodels/sph/include/shammodels/sph/SPHUtilities.hpp +++ b/src/shammodels/sph/include/shammodels/sph/SPHUtilities.hpp @@ -86,7 +86,10 @@ namespace shammodels::sph { PatchField interactR_patch = sched.map_owned_to_patch_field_simple( [&](const Patch p, PatchDataLayer &pdat) -> flt { if (!pdat.is_empty()) { - return pdat.get_field(ihpart).compute_max() * h_evol_max * Rkern; + auto tmp = pdat.get_field(ihpart).compute_max() * h_evol_max * Rkern; + shamcomm::logs::raw_ln( + shambase::format("patch {}, Rghost = {}", p.id_patch, tmp)); + return tmp; } else { return shambase::VectorProperties::get_min(); } diff --git a/src/shammodels/sph/src/BasicSPHGhosts.cpp b/src/shammodels/sph/src/BasicSPHGhosts.cpp index 6b6357761..8f5e1a164 100644 --- a/src/shammodels/sph/src/BasicSPHGhosts.cpp +++ b/src/shammodels/sph/src/BasicSPHGhosts.cpp @@ -560,13 +560,13 @@ auto BasicSPHGhostHandler::gen_id_table_interfaces(GeneratorMap &&gen) for (auto &[k, v] : send_count_stats) { if (v > 0.2) { - warn_log += shambase::format("\n patch {} high interf/patch volume: {}", k, v); + // warn_log += shambase::format("\n patch {} high interf/patch volume: {}", k, v); has_warn = true; } } if (has_warn && shamcomm::world_rank() == 0) { - warn_log = "\n This can lead to high mpi " + warn_log = "\n High interf/patch volume. 
This can lead to high mpi " "overhead, try to increase the patch split crit" + warn_log; } From b6228ead81a7410022330f97742ef02e4e6c6b80 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Timoth=C3=A9e=20David--Cl=C3=A9ris?= Date: Tue, 24 Mar 2026 15:46:45 +0100 Subject: [PATCH 24/35] print --- .../src/solvergraph/ExchangeGhostLayer.cpp | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/src/shamrock/src/solvergraph/ExchangeGhostLayer.cpp b/src/shamrock/src/solvergraph/ExchangeGhostLayer.cpp index c87577db9..0aa22f670 100644 --- a/src/shamrock/src/solvergraph/ExchangeGhostLayer.cpp +++ b/src/shamrock/src/solvergraph/ExchangeGhostLayer.cpp @@ -30,6 +30,21 @@ void shamrock::solvergraph::ExchangeGhostLayer::_impl_evaluate_internal() { auto &ghost_layer = edges.ghost_layer; const shamrock::solvergraph::RankGetter &rank_owner = edges.rank_owner; + std::unordered_map msg_sizes_send; + + std::stringstream ss; + ss << "Rank " << shamcomm::world_rank() << " is sending " + << ghost_layer.patchdatas.get_native().size() << " patches sizes:"; + for (auto &pdat : ghost_layer.patchdatas.get_native()) { + // ss << pdat.first.first << " " << pdat.first.second << " " << pdat.second.get_obj_cnt() << + // "\n"; + msg_sizes_send[rank_owner.get_rank_owner(pdat.first.first)] += pdat.second.get_obj_cnt(); + } + for (auto &[rank, size] : msg_sizes_send) { + ss << "\n" << "msg size to rank " << rank << " is " << size; + } + shamcomm::logs::raw_ln(ss.str()); + shambase::DistributedDataShared recv_dat; shamalgs::collective::serialize_sparse_comm( From e0a117d03d1b2c1ee7ad42ffde20b680ee62b64e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Timoth=C3=A9e=20David--Cl=C3=A9ris?= Date: Tue, 24 Mar 2026 16:59:56 +0100 Subject: [PATCH 25/35] print --- src/shamrock/src/solvergraph/ExchangeGhostLayer.cpp | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/src/shamrock/src/solvergraph/ExchangeGhostLayer.cpp b/src/shamrock/src/solvergraph/ExchangeGhostLayer.cpp index 
0aa22f670..afd80fc5a 100644 --- a/src/shamrock/src/solvergraph/ExchangeGhostLayer.cpp +++ b/src/shamrock/src/solvergraph/ExchangeGhostLayer.cpp @@ -31,17 +31,22 @@ void shamrock::solvergraph::ExchangeGhostLayer::_impl_evaluate_internal() { const shamrock::solvergraph::RankGetter &rank_owner = edges.rank_owner; std::unordered_map msg_sizes_send; + std::unordered_map msg_sizes_max_send; std::stringstream ss; ss << "Rank " << shamcomm::world_rank() << " is sending " << ghost_layer.patchdatas.get_native().size() << " patches sizes:"; for (auto &pdat : ghost_layer.patchdatas.get_native()) { + u64 key = rank_owner.get_rank_owner(pdat.first.first); // ss << pdat.first.first << " " << pdat.first.second << " " << pdat.second.get_obj_cnt() << // "\n"; - msg_sizes_send[rank_owner.get_rank_owner(pdat.first.first)] += pdat.second.get_obj_cnt(); + msg_sizes_send[key] += pdat.second.get_obj_cnt(); + msg_sizes_max_send[key] = std::max(msg_sizes_max_send[key], u64(pdat.second.get_obj_cnt())); } for (auto &[rank, size] : msg_sizes_send) { - ss << "\n" << "msg size to rank " << rank << " is " << size; + ss << "\n" + << "msg size from rank " << rank << " is " << size << " max is " + << msg_sizes_max_send[rank]; } shamcomm::logs::raw_ln(ss.str()); From 6ec6ec7fda3bb0886be23af8ccd597e4ea8e549c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Timoth=C3=A9e=20David--Cl=C3=A9ris?= Date: Tue, 24 Mar 2026 17:36:50 +0100 Subject: [PATCH 26/35] print --- src/shammodels/sph/include/shammodels/sph/SPHUtilities.hpp | 4 ++++ src/shamrock/src/solvergraph/ExchangeGhostLayer.cpp | 2 ++ 2 files changed, 6 insertions(+) diff --git a/src/shammodels/sph/include/shammodels/sph/SPHUtilities.hpp b/src/shammodels/sph/include/shammodels/sph/SPHUtilities.hpp index 64f7b0ed4..80dacb3af 100644 --- a/src/shammodels/sph/include/shammodels/sph/SPHUtilities.hpp +++ b/src/shammodels/sph/include/shammodels/sph/SPHUtilities.hpp @@ -86,10 +86,14 @@ namespace shammodels::sph { PatchField interactR_patch = 
sched.map_owned_to_patch_field_simple( [&](const Patch p, PatchDataLayer &pdat) -> flt { if (!pdat.is_empty()) { +#if false auto tmp = pdat.get_field(ihpart).compute_max() * h_evol_max * Rkern; shamcomm::logs::raw_ln( shambase::format("patch {}, Rghost = {}", p.id_patch, tmp)); return tmp; +#else + return pdat.get_field(ihpart).compute_max() * h_evol_max * Rkern; +#endif } else { return shambase::VectorProperties::get_min(); } diff --git a/src/shamrock/src/solvergraph/ExchangeGhostLayer.cpp b/src/shamrock/src/solvergraph/ExchangeGhostLayer.cpp index afd80fc5a..5edb41815 100644 --- a/src/shamrock/src/solvergraph/ExchangeGhostLayer.cpp +++ b/src/shamrock/src/solvergraph/ExchangeGhostLayer.cpp @@ -30,6 +30,7 @@ void shamrock::solvergraph::ExchangeGhostLayer::_impl_evaluate_internal() { auto &ghost_layer = edges.ghost_layer; const shamrock::solvergraph::RankGetter &rank_owner = edges.rank_owner; +#if false std::unordered_map msg_sizes_send; std::unordered_map msg_sizes_max_send; @@ -49,6 +50,7 @@ void shamrock::solvergraph::ExchangeGhostLayer::_impl_evaluate_internal() { << msg_sizes_max_send[rank]; } shamcomm::logs::raw_ln(ss.str()); +#endif shambase::DistributedDataShared recv_dat; From 2b28ecd1515256ec2f0bd2f83333890580f5999f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Timoth=C3=A9e=20David--Cl=C3=A9ris?= Date: Tue, 24 Mar 2026 23:12:01 +0100 Subject: [PATCH 27/35] lb weight --- .../loadbalance/LoadBalanceStrategy.hpp | 50 ++++++++++++------- 1 file changed, 32 insertions(+), 18 deletions(-) diff --git a/src/shamrock/include/shamrock/scheduler/loadbalance/LoadBalanceStrategy.hpp b/src/shamrock/include/shamrock/scheduler/loadbalance/LoadBalanceStrategy.hpp index c429d06f4..db6347a4b 100644 --- a/src/shamrock/include/shamrock/scheduler/loadbalance/LoadBalanceStrategy.hpp +++ b/src/shamrock/include/shamrock/scheduler/loadbalance/LoadBalanceStrategy.hpp @@ -222,7 +222,8 @@ namespace shamrock::scheduler::details { inline LBMetric compute_LB_metric( const std::vector> 
&lb_vector, const std::vector &new_owners, - i32 world_size) { + i32 world_size, + f64 strategy_weight) { std::vector load_per_node(world_size, 0); @@ -250,7 +251,11 @@ namespace shamrock::scheduler::details { } var /= world_size; - return {min, max, avg, sycl::sqrt(var)}; + return { + min * strategy_weight, + max * strategy_weight, + avg * strategy_weight, + sycl::sqrt(var) * strategy_weight}; } } // namespace shamrock::scheduler::details @@ -270,30 +275,39 @@ namespace shamrock::scheduler { std::vector> &&lb_vector, i32 world_size = shamcomm::world_size()) { - auto tmpres = details::lb_startegy_parallel_sweep(lb_vector, world_size); - auto metric_psweep = details::compute_LB_metric(lb_vector, tmpres, world_size); + using namespace details; - auto tmpres_2 = details::lb_startegy_roundrobin(lb_vector, world_size); - auto metric_rrobin = details::compute_LB_metric(lb_vector, tmpres_2, world_size); + f64 factor_boost_psweep = 1; + auto tmpres = lb_startegy_parallel_sweep(lb_vector, world_size); + auto metric_psweep = compute_LB_metric(lb_vector, tmpres, world_size, factor_boost_psweep); + // We boost the round robin strategy to favor it if the difference is around 5% since the + // increased uniformity will probably offset the cost anyway + f64 factor_boost_rrobin = 0.95; + auto tmpres_2 = lb_startegy_roundrobin(lb_vector, world_size); + auto metric_rrobin + = compute_LB_metric(lb_vector, tmpres_2, world_size, factor_boost_rrobin); + + std::string strategy_name = "parallel sweep"; if (metric_rrobin.max < metric_psweep.max) { - tmpres = tmpres_2; + tmpres = tmpres_2; + strategy_name = "round robin"; } if (shamcomm::world_rank() == 0) { - logger::info_ln("LoadBalance", "summary :"); - logger::info_ln( - "LoadBalance", - " - strategy \"psweep\" : max =", - metric_psweep.max, - "min =", - metric_psweep.min); logger::info_ln( "LoadBalance", - " - strategy \"round robin\" : max =", - metric_rrobin.max, - "min =", - metric_rrobin.min); + shambase::format( + R"=(Summary 
(strategy = {0:}): + - strategy "psweep" : max = {1:.1f} min = {2:.1f} factor = {3:} + - strategy "round robin" : max = {4:.1f} min = {5:.1f} factor = {6:})=", + strategy_name, + metric_psweep.max, + metric_psweep.min, + factor_boost_psweep, + metric_rrobin.max, + metric_rrobin.min, + factor_boost_rrobin)); } return tmpres; } From 2ab4df4aa0ced40cc08e90506525b586b26d1d3b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Timoth=C3=A9e=20David--Cl=C3=A9ris?= Date: Wed, 25 Mar 2026 12:20:27 +0100 Subject: [PATCH 28/35] fix buildbot if invoked outside of repo --- buildbot/lib/buildbot.py | 40 ++++++++++++++++++++++++++-------------- 1 file changed, 26 insertions(+), 14 deletions(-) diff --git a/buildbot/lib/buildbot.py b/buildbot/lib/buildbot.py index 49163c8a3..a5d00e4ab 100644 --- a/buildbot/lib/buildbot.py +++ b/buildbot/lib/buildbot.py @@ -59,24 +59,36 @@ def print_buildbot_info(utility_name): print() - str_git = os.popen("git log -n 1 --decorate=full").read() + try: + r_log = subprocess.run( + ["git", "log", "-n", "1", "--decorate=full"], + capture_output=True, + text=True, + ) + if r_log.returncode != 0: + raise RuntimeError("git log failed") - git_hash = str_git.split()[1] - git_head = str_git[str_git.find("HEAD -> ") + 8 : str_git.find(")")] + str_git = r_log.stdout + git_hash = str_git.split()[1] + git_head = str_git[str_git.find("HEAD -> ") + 8 : str_git.find(")")] - git_head = git_head.split(",") + git_head = git_head.split(",") - if len(git_head) == 1: - git_head = "\033[1;92m" + git_head[0] + "\033[0;0m" - else: - git_head = "\033[1;92m" + git_head[0] + "\033[0;0m , \033[1;91m" + git_head[0] + "\033[0;0m" + if len(git_head) == 1: + git_head = "\033[1;92m" + git_head[0] + "\033[0;0m" + else: + git_head = ( + "\033[1;92m" + git_head[0] + "\033[0;0m , \033[1;91m" + git_head[0] + "\033[0;0m" + ) - print("\033[1;34mGit status \033[0;0m: ") - print(" \033[1;93mcommit \033[0;0m: ", git_hash) - print(" \033[1;36mHEAD \033[0;0m: ", git_head) - print(" \033[1;31mmodified 
files\033[0;0m (since last commit):") - print(os.popen('git diff-index --name-only HEAD -- | sed "s/^/ /g"').read()) - print("\033[1;90m" + "-" * col_cnt + "\033[0;0m\n") + print("\033[1;34mGit status \033[0;0m: ") + print(" \033[1;93mcommit \033[0;0m: ", git_hash) + print(" \033[1;36mHEAD \033[0;0m: ", git_head) + print(" \033[1;31mmodified files\033[0;0m (since last commit):") + print(os.popen('git diff-index --name-only HEAD -- | sed "s/^/ /g"').read()) + print("\033[1;90m" + "-" * col_cnt + "\033[0;0m\n") + except Exception: # noqa: BLE001 + print("Warn : couldn't get git status") def run_cmd(str): From 1bed2de0eb511fabc5c18c5284e4144d24ee9dc8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Timoth=C3=A9e=20David--Cl=C3=A9ris?= Date: Wed, 25 Mar 2026 12:20:46 +0100 Subject: [PATCH 29/35] add way of fetching MPI timers --- src/shamcomm/include/shamcomm/collectives.hpp | 18 +++- src/shamcomm/include/shamcomm/wrapper.hpp | 4 + src/shamcomm/src/collectives.cpp | 82 +++++++++++++++++++ src/shamcomm/src/wrapper.cpp | 2 + src/shampylib/src/pyShamcomm.cpp | 59 +++++++++++++ src/tests/shamcomm/collectivesTests.cpp | 23 ++++++ 6 files changed, 187 insertions(+), 1 deletion(-) create mode 100644 src/shampylib/src/pyShamcomm.cpp diff --git a/src/shamcomm/include/shamcomm/collectives.hpp b/src/shamcomm/include/shamcomm/collectives.hpp index 865ee044d..b9d0a022e 100644 --- a/src/shamcomm/include/shamcomm/collectives.hpp +++ b/src/shamcomm/include/shamcomm/collectives.hpp @@ -41,6 +41,19 @@ namespace shamcomm { void gather_basic_str( const std::basic_string &send_vec, std::basic_string &recv_vec); + /** + * @brief Allgathers a string from all nodes and concatenates it in a std::string + * + * This function gathers the string `send_vec` from all nodes and concatenates the + * result in `recv_vec` on every rank. The result is ordered by the order of the + * nodes in the communicator, i.e. the string is ordered by rank. 
+ */ + void allgather_str(const std::string &send_vec, std::string &recv_vec); + + /// same as allgather_str but with std::basic_string + void allgather_basic_str( + const std::basic_string &send_vec, std::basic_string &recv_vec); + /** * @brief Constructs a histogram from a vector of strings, counting occurrences * of each unique string. @@ -56,8 +69,11 @@ namespace shamcomm { * @return An unordered map where keys are unique strings from the input and * values are the counts of their occurrences. (valid only on rank 0) */ - std::unordered_map string_histogram( const std::vector &inputs, std::string delimiter = "\n"); + /// same as string_histogram but with result return on every rank + std::unordered_map all_string_histogram( + const std::vector &inputs, std::string delimiter = "\n"); + } // namespace shamcomm diff --git a/src/shamcomm/include/shamcomm/wrapper.hpp b/src/shamcomm/include/shamcomm/wrapper.hpp index 501a218de..3b7df7bc5 100644 --- a/src/shamcomm/include/shamcomm/wrapper.hpp +++ b/src/shamcomm/include/shamcomm/wrapper.hpp @@ -19,6 +19,7 @@ #include "shambase/aliases_float.hpp" #include "shambase/aliases_int.hpp" #include "shamcomm/mpi.hpp" +#include #include namespace shamcomm::mpi { @@ -29,6 +30,9 @@ namespace shamcomm::mpi { /// get a timer value f64 get_timer(std::string timername); + /// return all internal timers + const std::unordered_map &get_timers(); + /// MPI wrapper for MPI_Allreduce void Allreduce( const void *sendbuf, diff --git a/src/shamcomm/src/collectives.cpp b/src/shamcomm/src/collectives.cpp index 934c4555d..158badcda 100644 --- a/src/shamcomm/src/collectives.cpp +++ b/src/shamcomm/src/collectives.cpp @@ -81,6 +81,56 @@ namespace { recv_vec = result; } + /** + * @brief Allgather a vector of characters from all MPI ranks into a single string + * + * The resulting string is concatenated in rank order and is returned on every rank. 
+ */ + template + inline void _internal_allgather_str( + const std::basic_string &send_vec, std::basic_string &recv_vec) { + StackEntry stack_loc{}; + + if (shamcomm::world_size() == 1) { + recv_vec = send_vec; + return; + } + + i32 wsize = shamcomm::world_size(); + size_t wsize_sz = static_cast(wsize); + + // counts/displacements are expressed in number of characters. + std::vector counts(wsize_sz); + std::vector disps(wsize_sz); + + // MPI counts/displacements use `int`. + int local_count = static_cast(send_vec.size()); + + shamcomm::mpi::Allgather( + &local_count, 1, MPI_INT, counts.data(), 1, MPI_INT, MPI_COMM_WORLD); + + for (size_t i = 0; i < wsize_sz; i++) { + disps[i] = (i > 0) ? (disps[i - 1] + counts[i - 1]) : 0; + } + + int global_len = disps[wsize_sz - 1] + counts[wsize_sz - 1]; + + std::basic_string result; + result.resize(static_cast(global_len)); + + shamcomm::mpi::Allgatherv( + send_vec.data(), + local_count, + MPI_CHAR, + result.data(), + counts.data(), + disps.data(), + MPI_CHAR, + MPI_COMM_WORLD); + + recv_vec = result; + } + } // namespace void shamcomm::gather_str(const std::string &send_vec, std::string &recv_vec) { @@ -94,6 +144,17 @@ void shamcomm::gather_basic_str( _internal_gather_str(send_vec, recv_vec); } +void shamcomm::allgather_str(const std::string &send_vec, std::string &recv_vec) { + StackEntry stack_loc{}; + _internal_allgather_str(send_vec, recv_vec); +} + +void shamcomm::allgather_basic_str( + const std::basic_string &send_vec, std::basic_string &recv_vec) { + StackEntry stack_loc{}; + _internal_allgather_str(send_vec, recv_vec); +} + std::unordered_map shamcomm::string_histogram( const std::vector &inputs, std::string delimiter) { std::string accum_loc = ""; @@ -119,3 +180,24 @@ std::unordered_map shamcomm::string_histogram( return {}; } + +std::unordered_map shamcomm::all_string_histogram( + const std::vector &inputs, std::string delimiter) { + std::string accum_loc = ""; + for (auto &s : inputs) { + accum_loc += s + 
delimiter; + } + + std::string recv = ""; + allgather_str(accum_loc, recv); + + std::vector splitted = shambase::split_str(recv, delimiter); + + std::unordered_map histogram; + + for (size_t i = 0; i < splitted.size(); i++) { + histogram[splitted[i]] += 1; + } + + return histogram; +} diff --git a/src/shamcomm/src/wrapper.cpp b/src/shamcomm/src/wrapper.cpp index 3223b0fb9..f56cfe64c 100644 --- a/src/shamcomm/src/wrapper.cpp +++ b/src/shamcomm/src/wrapper.cpp @@ -43,6 +43,8 @@ namespace shamcomm::mpi { f64 get_timer(std::string timername) { return mpi_timers[timername]; } + const std::unordered_map &get_timers() { return mpi_timers; } + } // namespace shamcomm::mpi namespace { diff --git a/src/shampylib/src/pyShamcomm.cpp b/src/shampylib/src/pyShamcomm.cpp new file mode 100644 index 000000000..34d234e4d --- /dev/null +++ b/src/shampylib/src/pyShamcomm.cpp @@ -0,0 +1,59 @@ +// -------------------------------------------------------// +// +// SHAMROCK code for hydrodynamics +// Copyright (c) 2021-2026 Timothée David--Cléris +// SPDX-License-Identifier: CeCILL Free Software License Agreement v2.1 +// Shamrock is licensed under the CeCILL 2.1 License, see LICENSE for more information +// +// -------------------------------------------------------// + +/** + * @file pyShamcomm.cpp + * @author Timothée David--Cléris (tim.shamrock@proton.me) + * @brief + */ + +#include "shamalgs/collective/reduction.hpp" +#include "shambindings/pybind11_stl.hpp" +#include "shambindings/pybindaliases.hpp" +#include "shambindings/pytypealias.hpp" +#include "shamcomm/collectives.hpp" +#include "shamcomm/logs.hpp" +#include "shamcomm/wrapper.hpp" +#include +#include +#include +#include + +Register_pymod(shamcommlibinit) { + + py::module shamcomm_module = m.def_submodule("comm", "comm library"); + + shamcomm_module.def("get_timer", [](std::string name) { + return shamcomm::mpi::get_timer(std::move(name)); + }); + + shamcomm_module.def("get_timers", []() { + return shamcomm::mpi::get_timers(); + 
}); + + shamcomm_module.def( + "mpi_timers_delta", + [](std::unordered_map start, std::unordered_map end) { + std::vector keys{}; + + for (auto &[k, v] : end) { + keys.push_back(k); + } + + auto key_histo = shamcomm::all_string_histogram(keys); + + std::unordered_map deltas{}; + + for (auto &[k, c] : key_histo) { + deltas[k] = shamalgs::collective::allreduce_max(end[k] - start[k]); + } + + return deltas; + }); +} diff --git a/src/tests/shamcomm/collectivesTests.cpp b/src/tests/shamcomm/collectivesTests.cpp index a4d0b732a..4be7c99d4 100644 --- a/src/tests/shamcomm/collectivesTests.cpp +++ b/src/tests/shamcomm/collectivesTests.cpp @@ -37,3 +37,26 @@ TestStart(Unittest, "shamcomm/collectives::gather_str", test_gather_str, 4) { REQUIRE_EQUAL(recv, result); } + +TestStart(Unittest, "shamcomm/collectives::allgather_str", test_allgather_str, 4) { + + std::array ref_base{ + "I'm a very important string", + "But I'm a very important string", + "Listen, I'm a very important string", + "The most importantest string", + }; + + std::string result = ""; + for (u32 i = 0; i < ref_base.size(); i++) { + result += ref_base[i]; + } + + std::string send = ref_base[shamcomm::world_rank()]; + + std::string recv = "random string"; // Just to check that it is overwritten + + shamcomm::allgather_str(send, recv); + + REQUIRE_EQUAL(recv, result); +} From 35941e4d0ae8858036c071d567a7e53c15cc8a12 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Timoth=C3=A9e=20David--Cl=C3=A9ris?= Date: Wed, 25 Mar 2026 12:21:46 +0100 Subject: [PATCH 30/35] add mpi timers --- examples/benchmarks/sph_weak_scale_test.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/examples/benchmarks/sph_weak_scale_test.py b/examples/benchmarks/sph_weak_scale_test.py index a71226a6b..8544346ae 100644 --- a/examples/benchmarks/sph_weak_scale_test.py +++ b/examples/benchmarks/sph_weak_scale_test.py @@ -124,14 +124,20 @@ res_rates = [] res_cnts = [] res_system_metrics = [] + res_mpi_timers = [] for i in 
range(10): shamrock.sys.mpi_barrier() + # per carefull this is still per ranks + before_mpi_timers = shamrock.comm.get_timers() + # To replay the same step model.set_next_dt(0.0) model.timestep() + after_mpi_timers = shamrock.comm.get_timers() + tmp_res_rate, tmp_res_cnt, tmp_system_metrics = ( model.solver_logs_last_rate(), model.solver_logs_last_obj_count(), @@ -140,6 +146,7 @@ res_rates.append(tmp_res_rate) res_cnts.append(tmp_res_cnt) res_system_metrics.append(tmp_system_metrics) + res_mpi_timers.append(shamrock.comm.mpi_timers_delta(before_mpi_timers, after_mpi_timers)) # result is the best rate of the 5 steps res_rate, res_cnt = max(res_rates), res_cnts[0] @@ -147,7 +154,7 @@ # index of the max rate max_rate_index = res_rates.index(max(res_rates)) max_rate_system_metrics = res_system_metrics[max_rate_index] - + max_mpi_timers = res_mpi_timers[max_rate_index] step_time = res_cnt / res_rate if shamrock.sys.world_rank() == 0: @@ -167,6 +174,7 @@ "rate": res_rate, "cnt": res_cnt, "step_time": step_time, + "mpi_timers": max_mpi_timers, } # print the system metrics From 5687d4891a6360fc875b61783f7ff31ddead7fe4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Timoth=C3=A9e=20David--Cl=C3=A9ris?= Date: Wed, 25 Mar 2026 16:57:21 +0100 Subject: [PATCH 31/35] add timestep callback --- .../sph/include/shammodels/sph/Solver.hpp | 9 +++++++++ src/shammodels/sph/src/Solver.cpp | 12 ++++++++++++ src/shammodels/sph/src/pySPHModel.cpp | 15 ++++++++++++++- 3 files changed, 35 insertions(+), 1 deletion(-) diff --git a/src/shammodels/sph/include/shammodels/sph/Solver.hpp b/src/shammodels/sph/include/shammodels/sph/Solver.hpp index 10df216c7..83509cbcd 100644 --- a/src/shammodels/sph/include/shammodels/sph/Solver.hpp +++ b/src/shammodels/sph/include/shammodels/sph/Solver.hpp @@ -30,9 +30,12 @@ #include "shamrock/scheduler/ShamrockCtx.hpp" #include "shamsys/legacy/log.hpp" #include "shamtree/TreeTraversalCache.hpp" +#include #include +#include #include #include +#include namespace 
shammodels::sph { struct TimestepLog { @@ -75,6 +78,12 @@ namespace shammodels::sph { Config solver_config; SolverLog solve_logs; + struct SolverStepCallback { + std::optional> step_begin_callback; + std::optional> step_end_callback; + }; + std::vector timestep_callbacks{}; + inline void init_required_fields() { solver_config.set_layout(context.get_pdl_write()); } // serial patch tree control diff --git a/src/shammodels/sph/src/Solver.cpp b/src/shammodels/sph/src/Solver.cpp index 0ef3a5ad0..66e3001fc 100644 --- a/src/shammodels/sph/src/Solver.cpp +++ b/src/shammodels/sph/src/Solver.cpp @@ -1583,6 +1583,12 @@ shammodels::sph::TimestepLog shammodels::sph::Solver::evolve_once() sham::MemPerfInfos mem_perf_infos_start = sham::details::get_mem_perf_info(); f64 mpi_timer_start = shamcomm::mpi::get_timer("total"); + for (auto &callbacks : timestep_callbacks) { + if (callbacks.step_begin_callback) { + shambase::get_check_ref(callbacks.step_begin_callback)(); + } + } + Tscal t_current = solver_config.get_time(); Tscal dt = solver_config.get_dt_sph(); @@ -2643,6 +2649,12 @@ shammodels::sph::TimestepLog shammodels::sph::Solver::evolve_once() tstep.end(); + for (auto it = timestep_callbacks.rbegin(); it != timestep_callbacks.rend(); ++it) { + if (it->step_end_callback) { + shambase::get_check_ref(it->step_end_callback)(); + } + } + f64 delta_mpi_timer = shamcomm::mpi::get_timer("total") - mpi_timer_start; sham::MemPerfInfos mem_perf_infos_end = sham::details::get_mem_perf_info(); diff --git a/src/shammodels/sph/src/pySPHModel.cpp b/src/shammodels/sph/src/pySPHModel.cpp index 74da727eb..b927346f7 100644 --- a/src/shammodels/sph/src/pySPHModel.cpp +++ b/src/shammodels/sph/src/pySPHModel.cpp @@ -38,7 +38,9 @@ #include #include #include +#include #include +#include template class SPHKernel> void add_instance(py::module &m, std::string name_config, std::string name_model) { @@ -1246,7 +1248,18 @@ void add_instance(py::module &m, std::string name_config, std::string name_model
return sched.get_patch_transform(); }) .def("apply_momentum_offset", &T::apply_momentum_offset) - .def("apply_position_offset", &T::apply_position_offset); + .def("apply_position_offset", &T::apply_position_offset) + .def( + "add_timestep_callback", + [](T &self, + std::optional> step_begin_callback, + std::optional> step_end_callback) { + self.solver.timestep_callbacks.push_back( + {std::move(step_begin_callback), std::move(step_end_callback)}); + }, + py::kw_only(), + py::arg("step_begin") = std::nullopt, + py::arg("step_end") = std::nullopt); } template class SPHKernel> From 06eee3e43213f1dccfffa0807662a7d3b27989d8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Timoth=C3=A9e=20David--Cl=C3=A9ris?= Date: Wed, 25 Mar 2026 16:57:54 +0100 Subject: [PATCH 32/35] add callback to script --- examples/benchmarks/sph_weak_scale_test.py | 21 ++++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) diff --git a/examples/benchmarks/sph_weak_scale_test.py b/examples/benchmarks/sph_weak_scale_test.py index 8544346ae..eac62ace6 100644 --- a/examples/benchmarks/sph_weak_scale_test.py +++ b/examples/benchmarks/sph_weak_scale_test.py @@ -126,18 +126,29 @@ res_system_metrics = [] res_mpi_timers = [] - for i in range(10): - shamrock.sys.mpi_barrier() + before_mpi_timers, after_mpi_timers = None, None - # per carefull this is still per ranks + def callback_before_mpi_timer(): + global before_mpi_timers + print(shamrock.sys.world_rank(), "register before_mpi_timers") before_mpi_timers = shamrock.comm.get_timers() + def callback_after_mpi_timer(): + global after_mpi_timers + print(shamrock.sys.world_rank(), "register after_mpi_timers") + after_mpi_timers = shamrock.comm.get_timers() + + model.add_timestep_callback( + step_begin=callback_before_mpi_timer, step_end=callback_after_mpi_timer + ) + + for i in range(10): + shamrock.sys.mpi_barrier() + # To replay the same step model.set_next_dt(0.0) model.timestep() - after_mpi_timers = shamrock.comm.get_timers() - tmp_res_rate, 
tmp_res_cnt, tmp_system_metrics = ( model.solver_logs_last_rate(), model.solver_logs_last_obj_count(), From 98b1e994f3b920f60d8ca9cc2ba286efa6dc3047 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Timoth=C3=A9e=20David--Cl=C3=A9ris?= Date: Wed, 25 Mar 2026 17:17:55 +0100 Subject: [PATCH 33/35] cleaner --- examples/benchmarks/sph_weak_scale_test.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/examples/benchmarks/sph_weak_scale_test.py b/examples/benchmarks/sph_weak_scale_test.py index eac62ace6..7c8b0f241 100644 --- a/examples/benchmarks/sph_weak_scale_test.py +++ b/examples/benchmarks/sph_weak_scale_test.py @@ -126,16 +126,19 @@ res_system_metrics = [] res_mpi_timers = [] + """ + Here we insert callbacks to measure solver MPI usage by fetching the timers twice at the beginning and end of the step + """ before_mpi_timers, after_mpi_timers = None, None def callback_before_mpi_timer(): global before_mpi_timers - print(shamrock.sys.world_rank(), "register before_mpi_timers") + # print(shamrock.sys.world_rank(), "register before_mpi_timers") before_mpi_timers = shamrock.comm.get_timers() def callback_after_mpi_timer(): global after_mpi_timers - print(shamrock.sys.world_rank(), "register after_mpi_timers") + # print(shamrock.sys.world_rank(), "register after_mpi_timers") after_mpi_timers = shamrock.comm.get_timers() model.add_timestep_callback( From d2ddf425ccd72ea628a59a0067b1778e076f2d5d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Timoth=C3=A9e=20David--Cl=C3=A9ris?= Date: Thu, 26 Mar 2026 21:20:09 +0100 Subject: [PATCH 34/35] faster --- examples/benchmarks/sph_weak_scale_test.py | 17 +++++++++++++++++ src/shamcomm/include/shamcomm/wrapper.hpp | 4 ++++ src/shamcomm/src/wrapper.cpp | 14 ++++++++++++++ src/shampylib/src/pyShamcomm.cpp | 10 +--------- 4 files changed, 36 insertions(+), 9 deletions(-) diff --git a/examples/benchmarks/sph_weak_scale_test.py b/examples/benchmarks/sph_weak_scale_test.py index 7c8b0f241..13b3753f4 100644 ---
a/examples/benchmarks/sph_weak_scale_test.py +++ b/examples/benchmarks/sph_weak_scale_test.py @@ -146,12 +146,19 @@ def callback_after_mpi_timer(): ) for i in range(10): + if shamrock.sys.world_rank() == 0: + print("running step ", i+1, "/", 10, " ...") + shamrock.sys.mpi_barrier() # To replay the same step model.set_next_dt(0.0) model.timestep() + + if shamrock.sys.world_rank() == 0: + print("collecting results ...") + tmp_res_rate, tmp_res_cnt, tmp_system_metrics = ( model.solver_logs_last_rate(), model.solver_logs_last_obj_count(), @@ -162,6 +169,16 @@ def callback_after_mpi_timer(): res_system_metrics.append(tmp_system_metrics) res_mpi_timers.append(shamrock.comm.mpi_timers_delta(before_mpi_timers, after_mpi_timers)) + + if shamrock.sys.world_rank() == 0: + print("sleeping 1 second ...") + + import time + time.sleep(1) + + if shamrock.sys.world_rank() == 0: + print("done sleeping 1 second ...") + # result is the best rate of the 5 steps res_rate, res_cnt = max(res_rates), res_cnts[0] diff --git a/src/shamcomm/include/shamcomm/wrapper.hpp b/src/shamcomm/include/shamcomm/wrapper.hpp index 3b7df7bc5..252ee3216 100644 --- a/src/shamcomm/include/shamcomm/wrapper.hpp +++ b/src/shamcomm/include/shamcomm/wrapper.hpp @@ -21,6 +21,7 @@ #include "shamcomm/mpi.hpp" #include #include +#include namespace shamcomm::mpi { @@ -33,6 +34,9 @@ namespace shamcomm::mpi { /// return all internal timers const std::unordered_map &get_timers(); + /// return all possible keys for the internal timers + const std::vector &get_possible_keys(); + /// MPI wrapper for MPI_Allreduce void Allreduce( const void *sendbuf, diff --git a/src/shamcomm/src/wrapper.cpp b/src/shamcomm/src/wrapper.cpp index f56cfe64c..7335ba654 100644 --- a/src/shamcomm/src/wrapper.cpp +++ b/src/shamcomm/src/wrapper.cpp @@ -45,6 +45,20 @@ namespace shamcomm::mpi { const std::unordered_map &get_timers() { return mpi_timers; } + std::vector possible_keys{ + "total", "MPI_Isend", "MPI_Irecv", + "MPI_Allreduce", 
"MPI_Allgather", "MPI_Allgatherv", + "MPI_Exscan", "MPI_Wait", "MPI_Waitall", + "MPI_Barrier", "MPI_Probe", "MPI_Recv", + "MPI_Get_count", "MPI_Send", "MPI_File_set_view", + "MPI_Type_size", "MPI_File_write_all", "MPI_File_write", + "MPI_File_read", "MPI_File_write_at", "MPI_File_read_at", + "MPI_File_close", "MPI_File_open", "MPI_Test", + "MPI_Gather", "MPI_Gatherv", + }; + + const std::vector &get_possible_keys() { return possible_keys; } + } // namespace shamcomm::mpi namespace { diff --git a/src/shampylib/src/pyShamcomm.cpp b/src/shampylib/src/pyShamcomm.cpp index 34d234e4d..c50985332 100644 --- a/src/shampylib/src/pyShamcomm.cpp +++ b/src/shampylib/src/pyShamcomm.cpp @@ -40,17 +40,9 @@ Register_pymod(shamcommlibinit) { shamcomm_module.def( "mpi_timers_delta", [](std::unordered_map start, std::unordered_map end) { - std::vector keys{}; - - for (auto &[k, v] : end) { - keys.push_back(k); - } - - auto key_histo = shamcomm::all_string_histogram(keys); - std::unordered_map deltas{}; - for (auto &[k, c] : key_histo) { + for (auto &k : shamcomm::mpi::get_possible_keys()) { deltas[k] = shamalgs::collective::allreduce_max(end[k] - start[k]); } From 9774603ac3c390a7c91c8c982fbfd82f3f08dc3b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Timoth=C3=A9e=20David--Cl=C3=A9ris?= Date: Thu, 26 Mar 2026 21:20:42 +0100 Subject: [PATCH 35/35] faster --- examples/benchmarks/sph_weak_scale_test.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/examples/benchmarks/sph_weak_scale_test.py b/examples/benchmarks/sph_weak_scale_test.py index 13b3753f4..8ec3c5504 100644 --- a/examples/benchmarks/sph_weak_scale_test.py +++ b/examples/benchmarks/sph_weak_scale_test.py @@ -147,7 +147,7 @@ def callback_after_mpi_timer(): for i in range(10): if shamrock.sys.world_rank() == 0: - print("running step ", i+1, "/", 10, " ...") + print("running step ", i + 1, "/", 10, " ...") shamrock.sys.mpi_barrier() @@ -155,7 +155,6 @@ def callback_after_mpi_timer(): model.set_next_dt(0.0) 
model.timestep() - if shamrock.sys.world_rank() == 0: print("collecting results ...") @@ -169,11 +168,11 @@ def callback_after_mpi_timer(): res_system_metrics.append(tmp_system_metrics) res_mpi_timers.append(shamrock.comm.mpi_timers_delta(before_mpi_timers, after_mpi_timers)) - if shamrock.sys.world_rank() == 0: print("sleeping 1 second ...") import time + time.sleep(1) if shamrock.sys.world_rank() == 0: