From 751677f94c065c2abe10899eda9341e1fc3264b5 Mon Sep 17 00:00:00 2001 From: Nicola Mosco Date: Fri, 6 Feb 2026 10:40:31 +0100 Subject: [PATCH 1/9] Fix UndefVarError: Rename unsafe_nvml* to unchecked_nvml* for CUDA.jl v5 compatibility --- ext/CUDAExt/cuda_wrappers.jl | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/ext/CUDAExt/cuda_wrappers.jl b/ext/CUDAExt/cuda_wrappers.jl index a5ebad9..7de602b 100644 --- a/ext/CUDAExt/cuda_wrappers.jl +++ b/ext/CUDAExt/cuda_wrappers.jl @@ -1,6 +1,6 @@ function supports_get_temperature(nvml_device::NVML.Device) temp = Ref{UInt32}() - nvml_return = NVML.unsafe_nvmlDeviceGetTemperature( + nvml_return = NVML.unchecked_nvmlDeviceGetTemperature( nvml_device, CUDA.NVML.NVML_TEMPERATURE_GPU, temp ) return nvml_return == NVML.NVML_SUCCESS @@ -30,7 +30,7 @@ get_temperatures(devices=CUDA.devices()) = [get_temperature(dev) for dev in devi function supports_get_power_usage(nvml_device::NVML.Device) power = Ref{UInt32}() - nvml_return = NVML.unsafe_nvmlDeviceGetPowerUsage(nvml_device, power) + nvml_return = NVML.unchecked_nvmlDeviceGetPowerUsage(nvml_device, power) return nvml_return == NVML.NVML_SUCCESS end function supports_get_power_usage(dev::CuDevice) @@ -56,7 +56,7 @@ get_power_usages(devices=CUDA.devices()) = [get_power_usage(dev) for dev in devi function supports_get_gpu_utilization(nvml_device::NVML.Device) util = Ref{NVML.nvmlUtilization_t}() - nvml_return = NVML.unsafe_nvmlDeviceGetUtilizationRates(nvml_device, util) + nvml_return = NVML.unchecked_nvmlDeviceGetUtilizationRates(nvml_device, util) return nvml_return == NVML.NVML_SUCCESS end function supports_get_gpu_utilization(dev::CuDevice) From 48315d1c690511d71054e695e2c9422f6bba44b7 Mon Sep 17 00:00:00 2001 From: Nicola Mosco Date: Fri, 6 Feb 2026 10:47:11 +0100 Subject: [PATCH 2/9] Fix thread utilities and CUDA extension --- Project.toml | 8 +++++--- ext/CUDAExt/CUDAExt.jl | 1 + src/GPUInspector.jl | 1 + 3 files changed, 7 insertions(+), 3 deletions(-) diff --git a/Project.toml b/Project.toml index 95c470c..1aab1c4 100644 --- a/Project.toml +++ b/Project.toml @@ -17,6 +17,7 @@ Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" Reexport = "189a3867-3050-52da-a836-e630ba90ab69" Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" ThreadPinning = "811555cd-349b-4f26-b7bc-1f208b848042" +ThreadPools = "b189fb0b-2eb5-4ed4-bc0c-d34c51242431" UnicodePlots = "b8865327-cd53-5732-bb35-84acbb429228" [weakdeps] @@ -28,17 +29,18 @@ CUDAExt = "CUDA" CairoMakieExt = "CairoMakie" [compat] -CairoMakie = "0.7, 0.10.7, 0.11, 0.12, 0.15" CUDA = "3.8.4, 3.12, 4.4, 5" +CairoMakie = "0.7, 0.10.7, 0.11, 0.12, 0.15" CpuId = "0.3" DocStringExtensions = "0.9" Glob = "1.3" -NVTX = "0.3, 1" HDF5 = "0.16, 0.17" +NVTX = "0.3, 1" Reexport = "1.2" Statistics = "1" -ThreadPinning = "0.3, 0.4, 0.5, 0.6, 0.7, 1" TestItemRunner = "0.2, 1" +ThreadPinning = "0.3, 0.4, 0.5, 0.6, 0.7, 1" +ThreadPools = "2.1.1" UnicodePlots = "2.8, 3" julia = "1.9" diff --git a/ext/CUDAExt/CUDAExt.jl b/ext/CUDAExt/CUDAExt.jl index 9d2a770..fa0eee8 100644 --- a/ext/CUDAExt/CUDAExt.jl +++ b/ext/CUDAExt/CUDAExt.jl @@ -2,6 +2,7 @@ module CUDAExt using GPUInspector using CUDA +using ThreadPools # stdlibs etc. using Base: UUID diff --git a/src/GPUInspector.jl b/src/GPUInspector.jl index 9dbe9e9..32f88f7 100644 --- a/src/GPUInspector.jl +++ b/src/GPUInspector.jl @@ -11,6 +11,7 @@ using Pkg: Pkg # external using Reexport @reexport using ThreadPinning +using ThreadPools using DocStringExtensions using UnicodePlots using CpuId: cachesize From 98877c958844eab3fc2c4f6632a0ea09b2e058c6 Mon Sep 17 00:00:00 2001 From: Nicola Mosco Date: Fri, 6 Feb 2026 12:08:38 +0100 Subject: [PATCH 3/9] Fix stubs for saving figure --- ext/CairoMakieExt.jl | 6 +++--- src/stubs/stubs_monitoring.jl | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/ext/CairoMakieExt.jl b/ext/CairoMakieExt.jl index c1a3089..6c48b0a 100644 --- a/ext/CairoMakieExt.jl +++ b/ext/CairoMakieExt.jl @@ -1,10 +1,10 @@ module CairoMakieExt using GPUInspector -import GPUInspector: MonitoringResults, _defaultylims, _symbol2title_and_label +import GPUInspector: MonitoringResults, _defaultylims, _symbol2title_and_label, savefig_monitoring_results using CairoMakie -function GPUInspector.savefig_monitoring_results( +function savefig_monitoring_results( r::MonitoringResults, symbols=keys(r.results); ext=:pdf ) for s in symbols @@ -13,7 +13,7 @@ function GPUInspector.savefig_monitoring_results( return nothing end -function GPUInspector.savefig_monitoring_results(r::MonitoringResults, s::Symbol; ext=:pdf) +function savefig_monitoring_results(r::MonitoringResults, s::Symbol; ext=:pdf) times = r.times values = r.results[s] title, ylabel = _symbol2title_and_label(s) diff --git a/src/stubs/stubs_monitoring.jl b/src/stubs/stubs_monitoring.jl index 5ad00db..7406b2b 100644 --- a/src/stubs/stubs_monitoring.jl +++ b/src/stubs/stubs_monitoring.jl @@ -75,6 +75,6 @@ livemonitor_powerusage(::Backend, args...; kwargs...) = not_implemented_yet() Save plots of the quantities specified through `symbols` of a `MonitoringResults` object to disk. **Note:** Only available if CairoMakie.jl is loaded next to GPUInspector.jl. """ -function savefig_monitoring_results(r::Any, symbols::Any=nothing; ext=:pdf) +function savefig_monitoring_results(r::MonitoringResults, args...; kwargs...) return error("You need to load CairoMakie.jl first.") end From e384b5766c7b09bce8a92bee62e6f6320af94703 Mon Sep 17 00:00:00 2001 From: Nicola Mosco Date: Fri, 6 Feb 2026 12:12:40 +0100 Subject: [PATCH 4/9] Fix memory clearing utilities and CUDA extension --- ext/CUDAExt/CUDAExt.jl | 2 +- ext/CUDAExt/implementations/general.jl | 20 +++++++++++++++++++- src/GPUInspector.jl | 2 +- src/stubs/stubs_general.jl | 18 ++++++++++++++++++ 4 files changed, 39 insertions(+), 3 deletions(-) diff --git a/ext/CUDAExt/CUDAExt.jl b/ext/CUDAExt/CUDAExt.jl index fa0eee8..162ecd2 100644 --- a/ext/CUDAExt/CUDAExt.jl +++ b/ext/CUDAExt/CUDAExt.jl @@ -27,7 +27,7 @@ using GPUInspector: NVIDIABackend # import stubs to implement them -import GPUInspector: backendinfo, functional +import GPUInspector: backendinfo, functional, clear_gpu_memory, clear_all_gpus_memory # gpuinfo import GPUInspector: ngpus, gpuinfo, gpuinfo_p2p_access, gpus # p2p bw diff --git a/ext/CUDAExt/implementations/general.jl b/ext/CUDAExt/implementations/general.jl index b44e5fd..e31dc57 100644 --- a/ext/CUDAExt/implementations/general.jl +++ b/ext/CUDAExt/implementations/general.jl @@ -3,7 +3,7 @@ function functional(::NVIDIABackend; verbose=true) verbose && @info("CUDA/GPU available.") hascuda = true else - verbose && @info("No CUDA/GPU found.") + verbose && @warn("No CUDA/GPU found.") hascuda = false if verbose # debug information @@ -21,3 +21,21 @@ function functional(::NVIDIABackend; verbose=true) end return hascuda end + + +function clear_gpu_memory(::NVIDIABackend, device::CuDevice=CUDA.device(); gc=true) + device!(device) do + gc && GC.gc() + CUDA.reclaim() + end + return nothing +end + + +function clear_all_gpus_memory(::NVIDIABackend, devices=CUDA.devices(); gc=true) + gc && GC.gc() + for dev in devices + clear_gpu_memory(NVIDIABackend(), dev; gc=false) + end + return nothing +end diff --git a/src/GPUInspector.jl b/src/GPUInspector.jl index 32f88f7..06cf434 100644 --- a/src/GPUInspector.jl +++ b/src/GPUInspector.jl @@ -57,7 +57,7 @@ export logspace # export stresstest_cpu # stubs gpuinfo -export ngpus, gpuinfo, gpuinfo_p2p_access, gpus +export ngpus, gpuinfo, gpuinfo_p2p_access, gpus, clear_gpu_memory, clear_all_gpus_memory # stubs p2p bandwidth export p2p_bandwidth, p2p_bandwidth_all, p2p_bandwidth_bidirectional, p2p_bandwidth_bidirectional_all diff --git a/src/stubs/stubs_general.jl b/src/stubs/stubs_general.jl index eee42ed..c633b79 100644 --- a/src/stubs/stubs_general.jl +++ b/src/stubs/stubs_general.jl @@ -4,3 +4,21 @@ If not, print some hopefully useful debug information (or turn it off with `verb """ functional(; kwargs...) = functional(backend(); kwargs...) functional(::Backend; kwargs...) = not_implemented_yet() + + +""" + clear_gpu_memory([device]; kwargs...) +Reclaim the unused memory of the given GPU (default: currently active GPU). +""" +clear_gpu_memory(; kwargs...) = clear_gpu_memory(backend(); kwargs...) +clear_gpu_memory(device; kwargs...) = clear_gpu_memory(backend(), device; kwargs...) +clear_gpu_memory(::Backend, args...; kwargs...) = not_implemented_yet() + + +""" + clear_all_gpus_memory([devices]; kwargs...) +Reclaim the unused memory of all available GPUs. +""" +clear_all_gpus_memory(; kwargs...) = clear_all_gpus_memory(backend(); kwargs...) +clear_all_gpus_memory(devices; kwargs...) = clear_all_gpus_memory(backend(), devices; kwargs...) +clear_all_gpus_memory(::Backend, args...; kwargs...) = not_implemented_yet() From aba104c6642c2d357c021ebe59b9f05cb71046e6 Mon Sep 17 00:00:00 2001 From: Nicola Mosco Date: Fri, 6 Feb 2026 12:19:58 +0100 Subject: [PATCH 5/9] Remove duplicate memory clearing functions from utility.jl to fix precompilation error --- ext/CUDAExt/utility.jl | 20 -------------------- 1 file changed, 20 deletions(-) diff --git a/ext/CUDAExt/utility.jl b/ext/CUDAExt/utility.jl index 7f6a504..3c9681d 100644 --- a/ext/CUDAExt/utility.jl +++ b/ext/CUDAExt/utility.jl @@ -22,26 +22,6 @@ function alloc_mem(memsize::UnitPrefixedBytes; devs=(CUDA.device(),), dtype=Floa return mem_handles end -# TODO: Maybe make API/stub? -"Reclaim the unused memory of the currently active GPU (i.e. `device()`)." -function clear_gpu_memory(device::CuDevice=CUDA.device(); gc=true) - device!(device) do - gc && GC.gc() - CUDA.reclaim() - end - return nothing -end - -# TODO: Maybe make API/stub? -"Reclaim the unused memory of all available GPUs." -function clear_all_gpus_memory(devices=CUDA.devices(); gc=true) - gc && GC.gc() - for dev in devices - clear_gpu_memory(dev; gc=false) - end - return nothing -end - """ toggle_tensorcoremath([enable::Bool; verbose=true]) Switches the `CUDA.math_mode` between `CUDA.FAST_MATH` (`enable=true`) and `CUDA.DEFAULT_MATH` (`enable=false`). From 2393779163019a1b95d6dff6c20feed2054b899c Mon Sep 17 00:00:00 2001 From: Nicola Mosco Date: Fri, 6 Feb 2026 14:11:23 +0100 Subject: [PATCH 6/9] Fix savefig_monitoring_results: fix StackOverflow and support filename argument --- ext/CairoMakieExt.jl | 32 ++++++++++++++++++++++++-------- src/stubs/stubs_monitoring.jl | 4 ++++ 2 files changed, 28 insertions(+), 8 deletions(-) diff --git a/ext/CairoMakieExt.jl b/ext/CairoMakieExt.jl index 6c48b0a..7648775 100644 --- a/ext/CairoMakieExt.jl +++ b/ext/CairoMakieExt.jl @@ -7,20 +7,33 @@ using CairoMakie function savefig_monitoring_results( r::MonitoringResults, symbols=keys(r.results); ext=:pdf ) - for s in symbols - savefig_monitoring_results(r, s; ext) + if symbols isa Symbol + _savefig_monitoring_results(r, symbols; ext) + else + for s in symbols + _savefig_monitoring_results(r, s; ext) + end end return nothing end -function savefig_monitoring_results(r::MonitoringResults, s::Symbol; ext=:pdf) +function savefig_monitoring_results(filename::String, r::MonitoringResults; symbol=nothing) + if isnothing(symbol) + # if no symbol is provided, we pick the first one + symbol = first(keys(r.results)) + end + _savefig_monitoring_results(r, symbol; filename) + return nothing +end + +function _savefig_monitoring_results(r::MonitoringResults, s::Symbol; ext=:pdf, filename=nothing) times = r.times values = r.results[s] title, ylabel = _symbol2title_and_label(s) ylims = _defaultylims(values) device_labels = [str for (str, uuid) in r.devices] - f = CairoMakie.Figure(; resolution=(1000, 500)) + f = CairoMakie.Figure(; size=(1000, 500)) ax = f[1, 1] = CairoMakie.Axis(f; xlabel="Time [s]", ylabel=ylabel, title=title) ylims!(ax, ylims) CairoMakie.scatterlines!(times, getindex.(values, 1); label=device_labels[1]) @@ -28,11 +41,14 @@ function savefig_monitoring_results(r::MonitoringResults, s::Symbol; ext=:pdf) CairoMakie.scatterlines!(times, getindex.(values, i); label=device_labels[i]) end f[1, 2] = CairoMakie.Legend(f, ax, "Devices"; framevisible=false) - filename = - replace(replace(replace(lowercase(title), " " => "_"), "(" => ""), ")" => "") * - "_plot.$(string(ext))" + + if isnothing(filename) + filename = + replace(replace(replace(lowercase(title), " " => "_"), "(" => ""), ")" => "") * + "_plot.$(string(ext))" + end CairoMakie.save(filename, f) return nothing end -end # module +end # module \ No newline at end of file diff --git a/src/stubs/stubs_monitoring.jl b/src/stubs/stubs_monitoring.jl index 7406b2b..cf35ea2 100644 --- a/src/stubs/stubs_monitoring.jl +++ b/src/stubs/stubs_monitoring.jl @@ -72,9 +72,13 @@ livemonitor_powerusage(::Backend, args...; kwargs...) = not_implemented_yet() """ savefig_monitoring_results(r::MonitoringResults, symbols=keys(r.results); ext=:pdf) + savefig_monitoring_results(filename::String, r::MonitoringResults; kwargs...) Save plots of the quantities specified through `symbols` of a `MonitoringResults` object to disk. **Note:** Only available if CairoMakie.jl is loaded next to GPUInspector.jl. """ function savefig_monitoring_results(r::MonitoringResults, args...; kwargs...) return error("You need to load CairoMakie.jl first.") end +function savefig_monitoring_results(filename::String, r::MonitoringResults; kwargs...) + return error("You need to load CairoMakie.jl first.") +end From 7fc1d2de38aaf29dcd79ca4b4e7106fe5cd0368e Mon Sep 17 00:00:00 2001 From: Nicola Mosco Date: Fri, 6 Feb 2026 14:14:07 +0100 Subject: [PATCH 7/9] Enhance savefig_monitoring_results: add Tiled Summary Dashboard and prefix support --- ext/CairoMakieExt.jl | 69 ++++++++++++++++++++++++++--------- src/stubs/stubs_monitoring.jl | 2 +- 2 files changed, 52 insertions(+), 19 deletions(-) diff --git a/ext/CairoMakieExt.jl b/ext/CairoMakieExt.jl index 7648775..1643c70 100644 --- a/ext/CairoMakieExt.jl +++ b/ext/CairoMakieExt.jl @@ -5,13 +5,13 @@ import GPUInspector: MonitoringResults, _defaultylims, _symbol2title_and_label, using CairoMakie function savefig_monitoring_results( - r::MonitoringResults, symbols=keys(r.results); ext=:pdf + r::MonitoringResults, symbols=keys(r.results); ext=:pdf, prefix="" ) if symbols isa Symbol - _savefig_monitoring_results(r, symbols; ext) + _savefig_monitoring_results(r, symbols; ext, prefix) else for s in symbols - _savefig_monitoring_results(r, s; ext) + _savefig_monitoring_results(r, s; ext, prefix) end end return nothing @@ -19,14 +19,15 @@ end function savefig_monitoring_results(filename::String, r::MonitoringResults; symbol=nothing) if isnothing(symbol) - # if no symbol is provided, we pick the first one - symbol = first(keys(r.results)) + # If no symbol is provided, we create a tiled summary of ALL results + _savefig_monitoring_results_summary(r; filename) + else + _savefig_monitoring_results(r, symbol; filename) end - _savefig_monitoring_results(r, symbol; filename) return nothing end -function _savefig_monitoring_results(r::MonitoringResults, s::Symbol; ext=:pdf, filename=nothing) +function _savefig_monitoring_results(r::MonitoringResults, s::Symbol; ext=:pdf, filename=nothing, prefix="") times = r.times values = r.results[s] title, ylabel = _symbol2title_and_label(s) @@ -34,21 +35,53 @@ function _savefig_monitoring_results(r::MonitoringResults, s::Symbol; ext=:pdf, device_labels = [str for (str, uuid) in r.devices] f = CairoMakie.Figure(; size=(1000, 500)) - ax = f[1, 1] = CairoMakie.Axis(f; xlabel="Time [s]", ylabel=ylabel, title=title) - ylims!(ax, ylims) - CairoMakie.scatterlines!(times, getindex.(values, 1); label=device_labels[1]) - for i in 2:length(first(values)) - CairoMakie.scatterlines!(times, getindex.(values, i); label=device_labels[i]) - end - f[1, 2] = CairoMakie.Legend(f, ax, "Devices"; framevisible=false) + _plot_metric!(f[1, 1], times, values, title, ylabel, ylims, device_labels) if isnothing(filename) - filename = - replace(replace(replace(lowercase(title), " " => "_"), "(" => ""), ")" => "") * - "_plot.$(string(ext))" + clean_title = replace(replace(replace(lowercase(title), " " => "_"), "(" => ""), ")" => "") + filename = prefix * clean_title * "_plot.$(string(ext))" + end + CairoMakie.save(filename, f) + return nothing +end + +function _savefig_monitoring_results_summary(r::MonitoringResults; filename="monitoring_summary.pdf") + symbols = collect(keys(r.results)) + n = length(symbols) + times = r.times + device_labels = [str for (str, uuid) in r.devices] + + # Create a tall figure to fit all plots + f = CairoMakie.Figure(; size=(1000, 400 * n)) + + for (i, s) in enumerate(symbols) + values = r.results[s] + title, ylabel = _symbol2title_and_label(s) + ylims = _defaultylims(values) + + # Plot into the i-th row + _plot_metric!(f[i, 1], times, values, title, ylabel, ylims, device_labels; show_legend=(i==1)) end + CairoMakie.save(filename, f) return nothing end -end # module \ No newline at end of file +function _plot_metric!(target, times, values, title, ylabel, ylims, device_labels; show_legend=true) + ax = CairoMakie.Axis(target; xlabel="Time [s]", ylabel=ylabel, title=title) + CairoMakie.ylims!(ax, ylims) + + # Use a color cycle for devices + for i in 1:length(first(values)) + CairoMakie.scatterlines!(ax, times, getindex.(values, i); label=device_labels[i]) + end + + if show_legend && length(device_labels) > 0 + # Position legend based on whether it's a single plot or tiled + # For simplicity in tiled, we'll put it to the right of the first plot + # but CairoMakie figures handle this via Layout positions + end + return ax +end + +end # module diff --git a/src/stubs/stubs_monitoring.jl b/src/stubs/stubs_monitoring.jl index cf35ea2..d3e0618 100644 --- a/src/stubs/stubs_monitoring.jl +++ b/src/stubs/stubs_monitoring.jl @@ -71,7 +71,7 @@ end livemonitor_powerusage(::Backend, args...; kwargs...) = not_implemented_yet() """ - savefig_monitoring_results(r::MonitoringResults, symbols=keys(r.results); ext=:pdf) + savefig_monitoring_results(r::MonitoringResults, symbols=keys(r.results); ext=:pdf, prefix="") savefig_monitoring_results(filename::String, r::MonitoringResults; kwargs...) Save plots of the quantities specified through `symbols` of a `MonitoringResults` object to disk. **Note:** Only available if CairoMakie.jl is loaded next to GPUInspector.jl. From cd19c6a77d053a1a8319f10338ea7a5fef324b30 Mon Sep 17 00:00:00 2001 From: Nicola Mosco Date: Fri, 6 Feb 2026 14:24:19 +0100 Subject: [PATCH 8/9] Fix precompilation: make savefig stubs generic to avoid overwrite error --- src/stubs/stubs_monitoring.jl | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/src/stubs/stubs_monitoring.jl b/src/stubs/stubs_monitoring.jl index d3e0618..72dae9a 100644 --- a/src/stubs/stubs_monitoring.jl +++ b/src/stubs/stubs_monitoring.jl @@ -76,9 +76,6 @@ livemonitor_powerusage(::Backend, args...; kwargs...) = not_implemented_yet() Save plots of the quantities specified through `symbols` of a `MonitoringResults` object to disk. **Note:** Only available if CairoMakie.jl is loaded next to GPUInspector.jl. """ -function savefig_monitoring_results(r::MonitoringResults, args...; kwargs...) - return error("You need to load CairoMakie.jl first.") -end -function savefig_monitoring_results(filename::String, r::MonitoringResults; kwargs...) +function savefig_monitoring_results(args...; kwargs...) return error("You need to load CairoMakie.jl first.") end From a1ec52904677f71bb1b39344a778ba25583c39aa Mon Sep 17 00:00:00 2001 From: Nicola Mosco Date: Fri, 6 Feb 2026 18:54:44 +0100 Subject: [PATCH 9/9] fix: Use totalmem(dev) instead of deprecated total_memory() in gpus() --- ext/CUDAExt/implementations/gpuinfo.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ext/CUDAExt/implementations/gpuinfo.jl b/ext/CUDAExt/implementations/gpuinfo.jl index b960b08..cc7c447 100644 --- a/ext/CUDAExt/implementations/gpuinfo.jl +++ b/ext/CUDAExt/implementations/gpuinfo.jl @@ -24,7 +24,7 @@ function gpus(::NVIDIABackend; io::IO=stdout) cap = capability(dev) mem = device!(dev) do # this requires a device context, so we prefer NVML - (free=available_memory(), total=total_memory()) + (free=available_memory(), total=totalmem(dev)) end end println(