diff --git a/Project.toml b/Project.toml
index 95c470c..1aab1c4 100644
--- a/Project.toml
+++ b/Project.toml
@@ -17,6 +17,7 @@ Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
 Reexport = "189a3867-3050-52da-a836-e630ba90ab69"
 Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
 ThreadPinning = "811555cd-349b-4f26-b7bc-1f208b848042"
+ThreadPools = "b189fb0b-2eb5-4ed4-bc0c-d34c51242431"
 UnicodePlots = "b8865327-cd53-5732-bb35-84acbb429228"
 
 [weakdeps]
@@ -28,17 +29,18 @@ CUDAExt = "CUDA"
 CairoMakieExt = "CairoMakie"
 
 [compat]
-CairoMakie = "0.7, 0.10.7, 0.11, 0.12, 0.15"
 CUDA = "3.8.4, 3.12, 4.4, 5"
+CairoMakie = "0.7, 0.10.7, 0.11, 0.12, 0.15"
 CpuId = "0.3"
 DocStringExtensions = "0.9"
 Glob = "1.3"
-NVTX = "0.3, 1"
 HDF5 = "0.16, 0.17"
+NVTX = "0.3, 1"
 Reexport = "1.2"
 Statistics = "1"
-ThreadPinning = "0.3, 0.4, 0.5, 0.6, 0.7, 1"
 TestItemRunner = "0.2, 1"
+ThreadPinning = "0.3, 0.4, 0.5, 0.6, 0.7, 1"
+ThreadPools = "2.1.1"
 UnicodePlots = "2.8, 3"
 julia = "1.9"
 
diff --git a/ext/CUDAExt/CUDAExt.jl b/ext/CUDAExt/CUDAExt.jl
index 9d2a770..162ecd2 100644
--- a/ext/CUDAExt/CUDAExt.jl
+++ b/ext/CUDAExt/CUDAExt.jl
@@ -2,6 +2,7 @@ module CUDAExt
 
 using GPUInspector
 using CUDA
+using ThreadPools
 
 # stdlibs etc.
 using Base: UUID
@@ -26,7 +27,7 @@ using GPUInspector: NVIDIABackend
 
 
 # import stubs to implement them
-import GPUInspector: backendinfo, functional
+import GPUInspector: backendinfo, functional, clear_gpu_memory, clear_all_gpus_memory
 # gpuinfo
 import GPUInspector: ngpus, gpuinfo, gpuinfo_p2p_access, gpus
 # p2p bw
diff --git a/ext/CUDAExt/cuda_wrappers.jl b/ext/CUDAExt/cuda_wrappers.jl
index a5ebad9..7de602b 100644
--- a/ext/CUDAExt/cuda_wrappers.jl
+++ b/ext/CUDAExt/cuda_wrappers.jl
@@ -1,6 +1,6 @@
 function supports_get_temperature(nvml_device::NVML.Device)
     temp = Ref{UInt32}()
-    nvml_return = NVML.unsafe_nvmlDeviceGetTemperature(
+    nvml_return = NVML.unchecked_nvmlDeviceGetTemperature(
         nvml_device, CUDA.NVML.NVML_TEMPERATURE_GPU, temp
     )
     return nvml_return == NVML.NVML_SUCCESS
@@ -30,7 +30,7 @@ get_temperatures(devices=CUDA.devices()) = [get_temperature(dev) for dev in devi
 
 function supports_get_power_usage(nvml_device::NVML.Device)
     power = Ref{UInt32}()
-    nvml_return = NVML.unsafe_nvmlDeviceGetPowerUsage(nvml_device, power)
+    nvml_return = NVML.unchecked_nvmlDeviceGetPowerUsage(nvml_device, power)
     return nvml_return == NVML.NVML_SUCCESS
 end
 function supports_get_power_usage(dev::CuDevice)
@@ -56,7 +56,7 @@ get_power_usages(devices=CUDA.devices()) = [get_power_usage(dev) for dev in devi
 
 function supports_get_gpu_utilization(nvml_device::NVML.Device)
     util = Ref{NVML.nvmlUtilization_t}()
-    nvml_return = NVML.unsafe_nvmlDeviceGetUtilizationRates(nvml_device, util)
+    nvml_return = NVML.unchecked_nvmlDeviceGetUtilizationRates(nvml_device, util)
     return nvml_return == NVML.NVML_SUCCESS
 end
 function supports_get_gpu_utilization(dev::CuDevice)
diff --git a/ext/CUDAExt/implementations/general.jl b/ext/CUDAExt/implementations/general.jl
index b44e5fd..e31dc57 100644
--- a/ext/CUDAExt/implementations/general.jl
+++ b/ext/CUDAExt/implementations/general.jl
@@ -3,7 +3,7 @@ function functional(::NVIDIABackend; verbose=true)
         verbose && @info("CUDA/GPU available.")
         hascuda = true
     else
-        verbose && @info("No CUDA/GPU found.")
+        verbose && @warn("No CUDA/GPU found.")
         hascuda = false
         if verbose
             # debug information
@@ -21,3 +21,21 @@ function functional(::NVIDIABackend; verbose=true)
     end
     return hascuda
 end
+
+
+function clear_gpu_memory(::NVIDIABackend, device::CuDevice=CUDA.device(); gc=true)
+    device!(device) do
+        gc && GC.gc()
+        CUDA.reclaim()
+    end
+    return nothing
+end
+
+
+function clear_all_gpus_memory(::NVIDIABackend, devices=CUDA.devices(); gc=true)
+    gc && GC.gc()
+    for dev in devices
+        clear_gpu_memory(NVIDIABackend(), dev; gc=false)
+    end
+    return nothing
+end
diff --git a/ext/CUDAExt/implementations/gpuinfo.jl b/ext/CUDAExt/implementations/gpuinfo.jl
index b960b08..cc7c447 100644
--- a/ext/CUDAExt/implementations/gpuinfo.jl
+++ b/ext/CUDAExt/implementations/gpuinfo.jl
@@ -24,7 +24,7 @@ function gpus(::NVIDIABackend; io::IO=stdout)
             cap = capability(dev)
             mem = device!(dev) do
                 # this requires a device context, so we prefer NVML
-                (free=available_memory(), total=total_memory())
+                (free=available_memory(), total=totalmem(dev))
             end
         end
         println(
diff --git a/ext/CUDAExt/utility.jl b/ext/CUDAExt/utility.jl
index 7f6a504..3c9681d 100644
--- a/ext/CUDAExt/utility.jl
+++ b/ext/CUDAExt/utility.jl
@@ -22,26 +22,6 @@ function alloc_mem(memsize::UnitPrefixedBytes; devs=(CUDA.device(),), dtype=Floa
     return mem_handles
 end
 
-# TODO: Maybe make API/stub?
-"Reclaim the unused memory of the currently active GPU (i.e. `device()`)."
-function clear_gpu_memory(device::CuDevice=CUDA.device(); gc=true)
-    device!(device) do
-        gc && GC.gc()
-        CUDA.reclaim()
-    end
-    return nothing
-end
-
-# TODO: Maybe make API/stub?
-"Reclaim the unused memory of all available GPUs."
-function clear_all_gpus_memory(devices=CUDA.devices(); gc=true)
-    gc && GC.gc()
-    for dev in devices
-        clear_gpu_memory(dev; gc=false)
-    end
-    return nothing
-end
-
 """
     toggle_tensorcoremath([enable::Bool; verbose=true])
 Switches the `CUDA.math_mode` between `CUDA.FAST_MATH` (`enable=true`) and `CUDA.DEFAULT_MATH` (`enable=false`).
diff --git a/ext/CairoMakieExt.jl b/ext/CairoMakieExt.jl
index c1a3089..1643c70 100644
--- a/ext/CairoMakieExt.jl
+++ b/ext/CairoMakieExt.jl
@@ -1,38 +1,93 @@
 module CairoMakieExt
 
 using GPUInspector
-import GPUInspector: MonitoringResults, _defaultylims, _symbol2title_and_label
+import GPUInspector: MonitoringResults, _defaultylims, _symbol2title_and_label, savefig_monitoring_results
 using CairoMakie
 
-function GPUInspector.savefig_monitoring_results(
-    r::MonitoringResults, symbols=keys(r.results); ext=:pdf
+function savefig_monitoring_results(
+    r::MonitoringResults, symbols=keys(r.results); ext=:pdf, prefix=""
 )
-    for s in symbols
-        savefig_monitoring_results(r, s; ext)
+    if symbols isa Symbol
+        _savefig_monitoring_results(r, symbols; ext, prefix)
+    else
+        for s in symbols
+            _savefig_monitoring_results(r, s; ext, prefix)
+        end
     end
     return nothing
 end
 
-function GPUInspector.savefig_monitoring_results(r::MonitoringResults, s::Symbol; ext=:pdf)
+function savefig_monitoring_results(filename::String, r::MonitoringResults; symbol=nothing)
+    if isnothing(symbol)
+        # If no symbol is provided, we create a tiled summary of ALL results
+        _savefig_monitoring_results_summary(r; filename)
+    else
+        _savefig_monitoring_results(r, symbol; filename)
+    end
+    return nothing
+end
+
+function _savefig_monitoring_results(r::MonitoringResults, s::Symbol; ext=:pdf, filename=nothing, prefix="")
     times = r.times
     values = r.results[s]
     title, ylabel = _symbol2title_and_label(s)
     ylims = _defaultylims(values)
     device_labels = [str for (str, uuid) in r.devices]
 
-    f = CairoMakie.Figure(; resolution=(1000, 500))
-    ax = f[1, 1] = CairoMakie.Axis(f; xlabel="Time [s]", ylabel=ylabel, title=title)
-    ylims!(ax, ylims)
-    CairoMakie.scatterlines!(times, getindex.(values, 1); label=device_labels[1])
-    for i in 2:length(first(values))
-        CairoMakie.scatterlines!(times, getindex.(values, i); label=device_labels[i])
+    f = CairoMakie.Figure(; size=(1000, 500))
+    _plot_metric!(f, 1, times, values, title, ylabel, ylims, device_labels)
+
+    if isnothing(filename)
+        clean_title = replace(replace(replace(lowercase(title), " " => "_"), "(" => ""), ")" => "")
+        filename = prefix * clean_title * "_plot.$(string(ext))"
+    end
+    CairoMakie.save(filename, f)
+    return nothing
+end
+
+function _savefig_monitoring_results_summary(r::MonitoringResults; filename="monitoring_summary.pdf")
+    symbols = collect(keys(r.results))
+    n = length(symbols)
+    times = r.times
+    device_labels = [str for (str, uuid) in r.devices]
+
+    # Create a tall figure to fit all plots
+    f = CairoMakie.Figure(; size=(1000, 400 * n))
+
+    for (i, s) in enumerate(symbols)
+        values = r.results[s]
+        title, ylabel = _symbol2title_and_label(s)
+        ylims = _defaultylims(values)
+
+        # Plot into the i-th row
+        _plot_metric!(f, i, times, values, title, ylabel, ylims, device_labels; show_legend=(i==1))
     end
-    f[1, 2] = CairoMakie.Legend(f, ax, "Devices"; framevisible=false)
-    filename =
-        replace(replace(replace(lowercase(title), " " => "_"), "(" => ""), ")" => "") *
-        "_plot.$(string(ext))"
+
     CairoMakie.save(filename, f)
     return nothing
 end
 
+"""
+    _plot_metric!(fig, row, times, values, title, ylabel, ylims, device_labels; show_legend=true)
+
+Plot one monitored quantity (one line per device) into `fig[row, 1]` and return the axis.
+If `show_legend` is `true`, a "Devices" legend is placed next to it in `fig[row, 2]`.
+"""
+function _plot_metric!(fig, row, times, values, title, ylabel, ylims, device_labels; show_legend=true)
+    ax = CairoMakie.Axis(fig[row, 1]; xlabel="Time [s]", ylabel=ylabel, title=title)
+    CairoMakie.ylims!(ax, ylims)
+
+    # One scatter line per device, each labelled for the legend
+    for i in 1:length(first(values))
+        CairoMakie.scatterlines!(ax, times, getindex.(values, i); label=device_labels[i])
+    end
+
+    if show_legend && !isempty(device_labels)
+        # Legend sits in the column right of the axis, as before the refactoring.
+        # In the tiled summary only the first row requests one.
+        fig[row, 2] = CairoMakie.Legend(fig, ax, "Devices"; framevisible=false)
+    end
+    return ax
+end
+
 end # module
diff --git a/src/GPUInspector.jl b/src/GPUInspector.jl
index 9dbe9e9..06cf434 100644
--- a/src/GPUInspector.jl
+++ b/src/GPUInspector.jl
@@ -11,6 +11,7 @@ using Pkg: Pkg
 # external
 using Reexport
 @reexport using ThreadPinning
+using ThreadPools
 using DocStringExtensions
 using UnicodePlots
 using CpuId: cachesize
@@ -56,7 +57,7 @@ export logspace
 # export stresstest_cpu
 
 # stubs gpuinfo
-export ngpus, gpuinfo, gpuinfo_p2p_access, gpus
+export ngpus, gpuinfo, gpuinfo_p2p_access, gpus, clear_gpu_memory, clear_all_gpus_memory
 
 # stubs p2p bandwidth
 export p2p_bandwidth, p2p_bandwidth_all, p2p_bandwidth_bidirectional, p2p_bandwidth_bidirectional_all
diff --git a/src/stubs/stubs_general.jl b/src/stubs/stubs_general.jl
index eee42ed..c633b79 100644
--- a/src/stubs/stubs_general.jl
+++ b/src/stubs/stubs_general.jl
@@ -4,3 +4,21 @@ If not, print some hopefully useful debug information (or turn it off with `verb
 """
 functional(; kwargs...) = functional(backend(); kwargs...)
 functional(::Backend; kwargs...) = not_implemented_yet()
+
+
+"""
+    clear_gpu_memory([device]; kwargs...)
+Reclaim the unused memory of the given GPU (default: currently active GPU).
+"""
+clear_gpu_memory(; kwargs...) = clear_gpu_memory(backend(); kwargs...)
+clear_gpu_memory(device; kwargs...) = clear_gpu_memory(backend(), device; kwargs...)
+clear_gpu_memory(::Backend, args...; kwargs...) = not_implemented_yet()
+
+
+"""
+    clear_all_gpus_memory([devices]; kwargs...)
+Reclaim the unused memory of all available GPUs.
+"""
+clear_all_gpus_memory(; kwargs...) = clear_all_gpus_memory(backend(); kwargs...)
+clear_all_gpus_memory(devices; kwargs...) = clear_all_gpus_memory(backend(), devices; kwargs...)
+clear_all_gpus_memory(::Backend, args...; kwargs...) = not_implemented_yet()
diff --git a/src/stubs/stubs_monitoring.jl b/src/stubs/stubs_monitoring.jl
index 5ad00db..72dae9a 100644
--- a/src/stubs/stubs_monitoring.jl
+++ b/src/stubs/stubs_monitoring.jl
@@ -71,10 +71,11 @@ end
 livemonitor_powerusage(::Backend, args...; kwargs...) = not_implemented_yet()
 
 """
-    savefig_monitoring_results(r::MonitoringResults, symbols=keys(r.results); ext=:pdf)
+    savefig_monitoring_results(r::MonitoringResults, symbols=keys(r.results); ext=:pdf, prefix="")
+    savefig_monitoring_results(filename::String, r::MonitoringResults; kwargs...)
 Save plots of the quantities specified through `symbols` of a `MonitoringResults` object to disk.
 **Note:** Only available if CairoMakie.jl is loaded next to GPUInspector.jl.
 """
-function savefig_monitoring_results(r::Any, symbols::Any=nothing; ext=:pdf)
+function savefig_monitoring_results(args...; kwargs...)
     return error("You need to load CairoMakie.jl first.")
 end