Skip to content
Open
8 changes: 5 additions & 3 deletions Project.toml
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
Reexport = "189a3867-3050-52da-a836-e630ba90ab69"
Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
ThreadPinning = "811555cd-349b-4f26-b7bc-1f208b848042"
ThreadPools = "b189fb0b-2eb5-4ed4-bc0c-d34c51242431"
UnicodePlots = "b8865327-cd53-5732-bb35-84acbb429228"

[weakdeps]
Expand All @@ -28,17 +29,18 @@ CUDAExt = "CUDA"
CairoMakieExt = "CairoMakie"

[compat]
CairoMakie = "0.7, 0.10.7, 0.11, 0.12, 0.15"
CUDA = "3.8.4, 3.12, 4.4, 5"
CairoMakie = "0.7, 0.10.7, 0.11, 0.12, 0.15"
CpuId = "0.3"
DocStringExtensions = "0.9"
Glob = "1.3"
NVTX = "0.3, 1"
HDF5 = "0.16, 0.17"
NVTX = "0.3, 1"
Reexport = "1.2"
Statistics = "1"
ThreadPinning = "0.3, 0.4, 0.5, 0.6, 0.7, 1"
TestItemRunner = "0.2, 1"
ThreadPinning = "0.3, 0.4, 0.5, 0.6, 0.7, 1"
ThreadPools = "2.1.1"
UnicodePlots = "2.8, 3"
julia = "1.9"

Expand Down
3 changes: 2 additions & 1 deletion ext/CUDAExt/CUDAExt.jl
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ module CUDAExt

using GPUInspector
using CUDA
using ThreadPools

# stdlibs etc.
using Base: UUID
Expand All @@ -26,7 +27,7 @@ using GPUInspector:
NVIDIABackend

# import stubs to implement them
import GPUInspector: backendinfo, functional
import GPUInspector: backendinfo, functional, clear_gpu_memory, clear_all_gpus_memory
# gpuinfo
import GPUInspector: ngpus, gpuinfo, gpuinfo_p2p_access, gpus
# p2p bw
Expand Down
6 changes: 3 additions & 3 deletions ext/CUDAExt/cuda_wrappers.jl
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
function supports_get_temperature(nvml_device::NVML.Device)
temp = Ref{UInt32}()
nvml_return = NVML.unsafe_nvmlDeviceGetTemperature(
nvml_return = NVML.unchecked_nvmlDeviceGetTemperature(
nvml_device, CUDA.NVML.NVML_TEMPERATURE_GPU, temp
)
return nvml_return == NVML.NVML_SUCCESS
Expand Down Expand Up @@ -30,7 +30,7 @@ get_temperatures(devices=CUDA.devices()) = [get_temperature(dev) for dev in devi

# Return `true` if querying the power usage of `nvml_device` through NVML succeeds.
# The query result itself is discarded; only the NVML return code is inspected.
function supports_get_power_usage(nvml_device::NVML.Device)
    power = Ref{UInt32}()
    # Only the current `unchecked_` wrapper is called; the superseded `unsafe_` call,
    # whose result was immediately overwritten, has been dropped.
    nvml_return = NVML.unchecked_nvmlDeviceGetPowerUsage(nvml_device, power)
    return nvml_return == NVML.NVML_SUCCESS
end
function supports_get_power_usage(dev::CuDevice)
Expand All @@ -56,7 +56,7 @@ get_power_usages(devices=CUDA.devices()) = [get_power_usage(dev) for dev in devi

# Return `true` if querying the utilization rates of `nvml_device` through NVML succeeds.
# The query result itself is discarded; only the NVML return code is inspected.
function supports_get_gpu_utilization(nvml_device::NVML.Device)
    util = Ref{NVML.nvmlUtilization_t}()
    # Only the current `unchecked_` wrapper is called; the superseded `unsafe_` call,
    # whose result was immediately overwritten, has been dropped.
    nvml_return = NVML.unchecked_nvmlDeviceGetUtilizationRates(nvml_device, util)
    return nvml_return == NVML.NVML_SUCCESS
end
function supports_get_gpu_utilization(dev::CuDevice)
Expand Down
20 changes: 19 additions & 1 deletion ext/CUDAExt/implementations/general.jl
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ function functional(::NVIDIABackend; verbose=true)
verbose && @info("CUDA/GPU available.")
hascuda = true
else
verbose && @info("No CUDA/GPU found.")
verbose && @warn("No CUDA/GPU found.")
hascuda = false
if verbose
# debug information
Expand All @@ -21,3 +21,21 @@ function functional(::NVIDIABackend; verbose=true)
end
return hascuda
end


# NVIDIA implementation of `clear_gpu_memory`.
# Runs `CUDA.reclaim()` inside `device!(device)`; when `gc` is true, `GC.gc()` is
# triggered first (also inside the `device!` block). Always returns `nothing`.
function clear_gpu_memory(::NVIDIABackend, device::CuDevice=CUDA.device(); gc=true)
    device!(device) do
        if gc
            GC.gc()
        end
        CUDA.reclaim()
    end
    return nothing
end


# NVIDIA implementation of `clear_all_gpus_memory`.
# Optionally runs the garbage collector once up front, then reclaims memory on every
# device in `devices` (per-device GC is disabled to avoid repeating it).
function clear_all_gpus_memory(::NVIDIABackend, devices=CUDA.devices(); gc=true)
    if gc
        GC.gc()
    end
    foreach(devices) do dev
        clear_gpu_memory(NVIDIABackend(), dev; gc=false)
    end
    return nothing
end
2 changes: 1 addition & 1 deletion ext/CUDAExt/implementations/gpuinfo.jl
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ function gpus(::NVIDIABackend; io::IO=stdout)
cap = capability(dev)
mem = device!(dev) do
# this requires a device context, so we prefer NVML
(free=available_memory(), total=total_memory())
(free=available_memory(), total=totalmem(dev))
end
end
println(
Expand Down
20 changes: 0 additions & 20 deletions ext/CUDAExt/utility.jl
Original file line number Diff line number Diff line change
Expand Up @@ -22,26 +22,6 @@ function alloc_mem(memsize::UnitPrefixedBytes; devs=(CUDA.device(),), dtype=Floa
return mem_handles
end

# TODO: Maybe make API/stub?
"""
Reclaim the unused memory of `device` (default: the currently active GPU, i.e.
`CUDA.device()`). If `gc=true`, the Julia garbage collector is run first.
"""
function clear_gpu_memory(device::CuDevice=CUDA.device(); gc=true)
    device!(device) do
        if gc
            GC.gc()
        end
        CUDA.reclaim()
    end
    return nothing
end

# TODO: Maybe make API/stub?
"""
Reclaim the unused memory of every GPU in `devices` (default: `CUDA.devices()`).
If `gc=true`, the Julia garbage collector is run once before the devices are visited.
"""
function clear_all_gpus_memory(devices=CUDA.devices(); gc=true)
    if gc
        GC.gc()
    end
    foreach(devices) do dev
        clear_gpu_memory(dev; gc=false)
    end
    return nothing
end

"""
toggle_tensorcoremath([enable::Bool; verbose=true])
Switches the `CUDA.math_mode` between `CUDA.FAST_MATH` (`enable=true`) and `CUDA.DEFAULT_MATH` (`enable=false`).
Expand Down
81 changes: 65 additions & 16 deletions ext/CairoMakieExt.jl
Original file line number Diff line number Diff line change
@@ -1,38 +1,87 @@
module CairoMakieExt

using GPUInspector
import GPUInspector: MonitoringResults, _defaultylims, _symbol2title_and_label
import GPUInspector: MonitoringResults, _defaultylims, _symbol2title_and_label, savefig_monitoring_results
using CairoMakie

# Save one figure per monitored quantity of `r`.
#
# `symbols` may be a single `Symbol` or any iterable of symbols (default: every key of
# `r.results`). `ext` and `prefix` are forwarded to `_savefig_monitoring_results`, which
# derives each file name from them. Always returns `nothing`.
function savefig_monitoring_results(
    r::MonitoringResults, symbols=keys(r.results); ext=:pdf, prefix=""
)
    # A bare `Symbol` cannot be iterated, so it is handled as a single quantity.
    if symbols isa Symbol
        _savefig_monitoring_results(r, symbols; ext, prefix)
    else
        for s in symbols
            _savefig_monitoring_results(r, s; ext, prefix)
        end
    end
    return nothing
end

# Save monitoring plots of `r` to the explicitly given `filename`.
#
# - `symbol === nothing` (default): a tiled summary figure of all quantities in `r`.
# - otherwise: only the quantity `symbol` is plotted.
# Always returns `nothing`.
function savefig_monitoring_results(filename::String, r::MonitoringResults; symbol=nothing)
    if isnothing(symbol)
        # If no symbol is provided, we create a tiled summary of ALL results
        _savefig_monitoring_results_summary(r; filename)
    else
        _savefig_monitoring_results(r, symbol; filename)
    end
    return nothing
end

# Plot the quantity `s` of `r` into a single 1000x500 figure and save it to disk.
#
# If `filename` is `nothing`, the file name is built as
# `prefix * <sanitized title> * "_plot.<ext>"`, where the title comes from
# `_symbol2title_and_label(s)`, lower-cased, with spaces turned into underscores and
# parentheses removed. If `filename` is given, `ext` and `prefix` are not used.
# Always returns `nothing`.
function _savefig_monitoring_results(
    r::MonitoringResults, s::Symbol; ext=:pdf, filename=nothing, prefix=""
)
    times = r.times
    values = r.results[s]
    title, ylabel = _symbol2title_and_label(s)
    ylims = _defaultylims(values)
    device_labels = [str for (str, uuid) in r.devices]

    f = CairoMakie.Figure(; size=(1000, 500))
    _plot_metric!(f[1, 1], times, values, title, ylabel, ylims, device_labels)

    if isnothing(filename)
        # Single multi-pair `replace`; equivalent to the nested form because none of the
        # replacements can produce another pattern.
        clean_title = replace(lowercase(title), " " => "_", "(" => "", ")" => "")
        filename = prefix * clean_title * "_plot.$(string(ext))"
    end
    CairoMakie.save(filename, f)
    return nothing
end

# Plot every quantity of `r` into one tall, tiled figure (one row per quantity, 400
# units of height each) and save it to `filename`. Always returns `nothing`.
function _savefig_monitoring_results_summary(
    r::MonitoringResults; filename="monitoring_summary.pdf"
)
    symbols = collect(keys(r.results))
    n = length(symbols)
    times = r.times
    device_labels = [str for (str, uuid) in r.devices]

    # Create a tall figure to fit all plots
    f = CairoMakie.Figure(; size=(1000, 400 * n))

    for (i, s) in enumerate(symbols)
        values = r.results[s]
        title, ylabel = _symbol2title_and_label(s)
        ylims = _defaultylims(values)

        # Plot into the i-th row; only the first row requests a legend
        _plot_metric!(
            f[i, 1], times, values, title, ylabel, ylims, device_labels;
            show_legend=(i == 1),
        )
    end

    # The caller-provided `filename` is used as is (it is not re-derived from a title).
    CairoMakie.save(filename, f)
    return nothing
end

# Create an axis at the layout position `target` and draw one scatter-line series per
# entry of `first(values)`, labelled with the matching entry of `device_labels`.
#
# - `ylims` is applied to the new axis.
# - `show_legend`: when true (and at least one series was drawn) a legend is added
#   inside the axis area. Previously this flag had no effect.
# Returns the created axis.
function _plot_metric!(
    target, times, values, title, ylabel, ylims, device_labels; show_legend=true
)
    ax = CairoMakie.Axis(target; xlabel="Time [s]", ylabel=ylabel, title=title)
    CairoMakie.ylims!(ax, ylims)

    nseries = length(first(values))
    for i in 1:nseries
        CairoMakie.scatterlines!(ax, times, getindex.(values, i); label=device_labels[i])
    end

    # `axislegend` needs at least one labelled plot in the axis, hence the guard.
    if show_legend && nseries > 0
        CairoMakie.axislegend(ax; framevisible=false)
    end
    return ax
end

end # module
3 changes: 2 additions & 1 deletion src/GPUInspector.jl
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ using Pkg: Pkg
# external
using Reexport
@reexport using ThreadPinning
using ThreadPools
using DocStringExtensions
using UnicodePlots
using CpuId: cachesize
Expand Down Expand Up @@ -56,7 +57,7 @@ export logspace
# export stresstest_cpu

# stubs gpuinfo
export ngpus, gpuinfo, gpuinfo_p2p_access, gpus
export ngpus, gpuinfo, gpuinfo_p2p_access, gpus, clear_gpu_memory, clear_all_gpus_memory
# stubs p2p bandwidth
export p2p_bandwidth,
p2p_bandwidth_all, p2p_bandwidth_bidirectional, p2p_bandwidth_bidirectional_all
Expand Down
18 changes: 18 additions & 0 deletions src/stubs/stubs_general.jl
Original file line number Diff line number Diff line change
Expand Up @@ -4,3 +4,21 @@ If not, print some hopefully useful debug information (or turn it off with `verb
"""
function functional(; kwargs...)
    # No backend given: forward to the method for the currently selected backend.
    return functional(backend(); kwargs...)
end
function functional(::Backend; kwargs...)
    # Generic fallback for backends without their own method.
    return not_implemented_yet()
end


"""
clear_gpu_memory([device]; kwargs...)
Reclaim the unused memory of the given GPU (default: currently active GPU).
"""
function clear_gpu_memory(; kwargs...)
    # No backend given: forward to the method for the currently selected backend.
    return clear_gpu_memory(backend(); kwargs...)
end
function clear_gpu_memory(device; kwargs...)
    # Device given without a backend: prepend the currently selected backend.
    return clear_gpu_memory(backend(), device; kwargs...)
end
function clear_gpu_memory(::Backend, args...; kwargs...)
    # Generic fallback for backends without their own method.
    return not_implemented_yet()
end


"""
clear_all_gpus_memory([devices]; kwargs...)
Reclaim the unused memory of all available GPUs.
"""
function clear_all_gpus_memory(; kwargs...)
    # No backend given: forward to the method for the currently selected backend.
    return clear_all_gpus_memory(backend(); kwargs...)
end
function clear_all_gpus_memory(devices; kwargs...)
    # Devices given without a backend: prepend the currently selected backend.
    return clear_all_gpus_memory(backend(), devices; kwargs...)
end
function clear_all_gpus_memory(::Backend, args...; kwargs...)
    # Generic fallback for backends without their own method.
    return not_implemented_yet()
end
5 changes: 3 additions & 2 deletions src/stubs/stubs_monitoring.jl
Original file line number Diff line number Diff line change
Expand Up @@ -71,10 +71,11 @@ end
function livemonitor_powerusage(::Backend, args...; kwargs...)
    # Generic fallback for backends without their own method.
    return not_implemented_yet()
end

"""
    savefig_monitoring_results(r::MonitoringResults, symbols=keys(r.results); ext=:pdf, prefix="")
    savefig_monitoring_results(filename::String, r::MonitoringResults; symbol=nothing)
Save plots of the quantities specified through `symbols` of a `MonitoringResults` object to disk.
**Note:** Only available if CairoMakie.jl is loaded next to GPUInspector.jl.
"""
function savefig_monitoring_results(args...; kwargs...)
    # Catch-all fallback: it accepts any positional and keyword arguments so that every
    # call form fails with the same hint until more specific methods are defined
    # (the CairoMakie extension adds them).
    return error("You need to load CairoMakie.jl first.")
end