diff --git a/Makefile b/Makefile
index 4953479..acf5e2f 100644
--- a/Makefile
+++ b/Makefile
@@ -1,13 +1,21 @@
 CC = hipcc
-GPU_ARCH=gfx90a
+# Auto-detect the GPU architecture once at parse time (':=' under an origin
+# guard keeps the '?=' override semantics without re-running rocminfo on
+# every expansion of GPU_ARCH).
+ifeq ($(origin GPU_ARCH),undefined)
+GPU_ARCH := $(shell command -v rocminfo >/dev/null 2>&1 && rocminfo 2>/dev/null | awk '/Name:/ && /gfx/ {print $$2; exit}')
+endif
+# If GPU_ARCH is empty after detection, use default
+ifeq ($(strip $(GPU_ARCH)),)
+GPU_ARCH := gfx942
+endif

 SRCS_ALL = $(shell echo *.cpp)

 CFLAGS = -O3 -Wall -fPIC --offload-arch=$(GPU_ARCH)

 SRCS = $(SRCS_ALL)
-INCLUDES = -I/. -I$(ROCM_PATH)/include/roctracer
-LIBS = -L$(ROCM_PATH)/roctracer/lib -lroctracer64 -lroctx64
+INCLUDES = -I/. -I$(ROCM_PATH)/include/roctracer -I$(ROCM_PATH)/include/rocprofiler-sdk-roctx
+LIBS = -L$(ROCM_PATH)/roctracer/lib -L$(ROCM_PATH)/lib -lroctracer64 -lroctx64 -lrocprofiler-sdk-roctx

 OBJS = $(SRCS:.c=.o)

@@ -16,7 +24,11 @@ TARGET = libHIPcode.so
-.PHONY: clean
+.PHONY: all clean info

 all: $(TARGET)
-	@echo Successfully compiled ${TARGET} library.
+	@echo Successfully compiled ${TARGET} library with GPU_ARCH=$(GPU_ARCH).
+
+info:
+	@echo "Detected GPU_ARCH: $(GPU_ARCH)"
+	@echo "To override, use: make GPU_ARCH=<arch>"

 $(TARGET): $(OBJS)
 	$(CC) $(CFLAGS) $(INCLUDES) $(LIBS) -shared -o $(TARGET) $(OBJS)
@@ -24,4 +36,4 @@ $(TARGET): $(OBJS)
 .c.o:
 	$(CC) $(CFLAGS) $(INCLUDES) -cpp $< -o $@
 clean:
-	$(RM) *.o ${TARGET}
\ No newline at end of file
+	$(RM) *.o ${TARGET}
diff --git a/README.md b/README.md
index 6a7b648..83d163e 100644
--- a/README.md
+++ b/README.md
@@ -5,18 +5,34 @@ Example: Using roctx calls within a Python program

 Step 1: Compile the hip_code library
 ```
-module load rocm/6.0.0
+module load rocm
 make
 ```
+This will create `libHIPcode.so` which is used in the Python code downstream.
+

 Step 2: Run the script to make sure it works
+
+**Host example:**
 ```
 python3 roctx_example.py
 ```

+**PyTorch GPU example:**
+```
+python3 roctx_example_gpu.py
+```
+
-Step 3: Get the roctx trace using rocprof
+Step 3: Get the roctx trace using rocprofv3
+
+**Host example:**
+```
+rocprofv3 --marker-trace --output-format pftrace -- python3 roctx_example.py
+```
+
+**PyTorch GPU example:**
 ```
-rocprof --roctx-trace -d rocprof_output -o rocprof_output/results.csv python3 roctx_example.py
+rocprofv3 --kernel-trace --marker-trace --output-format pftrace -d ex1 -o ex1 -- python3 roctx_example_gpu.py
 ```

-Step 4: Copy the **rocprof_output/results.json** file to your system and visualize in [Perfetto](https://ui.perfetto.dev/)
\ No newline at end of file
+Step 4: Copy the **pftrace** file to your system and visualize in [Perfetto](https://ui.perfetto.dev/)
diff --git a/hip_tools.cpp b/hip_tools.cpp
index f0dcc38..ec7513d 100644
--- a/hip_tools.cpp
+++ b/hip_tools.cpp
@@ -1,7 +1,7 @@
 #include
 #include
-#include
-#include
+
+#include
 #include

 #define CHECK(command) { \
@@ -32,15 +32,24 @@ int set_device( int proc_id ) {
 }

 void start_roctracer(){
-    roctracer_start();
+    roctx_thread_id_t tid;
+    roctxGetThreadId(&tid);
+    roctxProfilerResume(tid);
+}
+
+int get_roctx_tid(){
+    roctx_thread_id_t tid;
+    roctxGetThreadId(&tid);
+    return static_cast<int>(tid);
 }

 void stop_roctracer(){
-    roctracer_stop();
+    roctx_thread_id_t tid;
+    roctxGetThreadId(&tid);
+    roctxProfilerPause(tid);
 }

 int roctxr_start( char *c){
-    // std::cout << "Starting roctx marker: " << c << std::endl;
     int id = roctxRangeStart(c);
     return id;
 }
@@ -58,4 +67,4 @@ void roctxr_pop(){
 }


-}
\ No newline at end of file
+}
diff --git a/hip_tools.py b/hip_tools.py
index 5d1071b..afa548e 100644
--- a/hip_tools.py
+++ b/hip_tools.py
@@ -1,8 +1,18 @@
 encode = lambda s : s.encode('utf-8')

+# Needed so that we do not call start when already corresponding id is started
+roctx_ids_start = set()
+roctx_ids_stop = set()
+
 class HIP_tools:

-    def __init__(self, hip_lib_path):
+
+    """
+    @param {string} hip_lib_path: path to compiled libHIPcode.so
+    @param {bool} stop_on_init: if True, stop_roctracer() is called at the end of initialization. Useful for profiling ML workloads
+
+    """
+    def __init__(self, hip_lib_path, stop_on_init = True):

         from ctypes import cdll, c_int, c_char_p

@@ -12,7 +22,7 @@ def __init__(self, hip_lib_path):
         self._set_device = libhip.set_device
         self._set_device.argtypes = [ c_int ]
-        self._set_device.resypes = c_int
-
+        self._set_device.restype = c_int
+        self._get_tid = libhip.get_roctx_tid
         self._start_roctracer = libhip.start_roctracer
         self._stop_roctracer = libhip.stop_roctracer
         self._roctxr_push = libhip.roctxr_push
@@ -23,16 +33,27 @@ def __init__(self, hip_lib_path):
         self._roctxr_start.argtypes = [ c_char_p ]
-        self._roctxr_start.resypes = c_int
+        self._roctxr_start.restype = c_int
         self._roctxr_stop.argtypes = [ c_int ]
+        if stop_on_init:
+            self.stop_roctracer()


     def set_device(self, device_id ):
         self._set_device(device_id)

     def start_roctracer(self):
-        self._start_roctracer()
+        tid = self._get_tid()
+        if tid not in roctx_ids_start:
+            self._start_roctracer()
+            roctx_ids_start.add(tid)
+            roctx_ids_stop.discard(tid)

     def stop_roctracer(self):
-        self._stop_roctracer()
+        tid = self._get_tid()
+        if tid not in roctx_ids_stop:
+            self._stop_roctracer()
+            roctx_ids_stop.add(tid)
+            roctx_ids_start.discard(tid)
+

     def start_marker( self, marker_name ):
         marker_id = self._roctxr_start( encode(marker_name) )
diff --git a/roctx_example.py b/roctx_example.py
index 5ca8341..6b93d64 100644
--- a/roctx_example.py
+++ b/roctx_example.py
@@ -11,7 +11,7 @@ hip_lib_path = f'{work_dir}/libHIPcode.so'


 # Initialize the roctracer tools
-hip_tools = HIP_tools( hip_lib_path )
+hip_tools = HIP_tools( hip_lib_path, True )

 if use_torch:
     if not torch.cuda.is_available():
@@ -25,7 +25,7 @@ print('Setting hip device 0')
 hip_tools.set_device(0)


-
+hip_tools.start_roctracer()
 # Do some fun stuff
 id_init = hip_tools.start_marker('init')

@@ -33,18 +33,22 @@ A = np.random.rand( nx, ny )
 B = np.random.rand( nx, ny )


 hip_tools.stop_marker(id_init)
-
-id_main = hip_tools.start_marker('main')

 n_iterations = 20
 for i in range(n_iterations):

+    # Only profile even-numbered iterations
+    if i % 2 == 0:
+        hip_tools.start_roctracer()
+
     id_iter = hip_tools.start_marker(f'iter_{i}')
     print( f'iteration: {i}')
     C = np.matmul(A, B)
     hip_tools.stop_marker(id_iter)

+    if i % 2 == 0:
+        hip_tools.stop_roctracer()
+
 print('Finished successfully')

-hip_tools.stop_marker(id_main)
\ No newline at end of file
diff --git a/roctx_example_gpu.py b/roctx_example_gpu.py
new file mode 100644
index 0000000..b0f9ae9
--- /dev/null
+++ b/roctx_example_gpu.py
@@ -0,0 +1,54 @@
+import os, sys
+import torch
+from hip_tools import HIP_tools
+
+
+# Get the path to the hip code library
+work_dir = os.getcwd()
+hip_lib_path = f'{work_dir}/libHIPcode.so'
+
+# Initialize the roctracer tools
+hip_tools = HIP_tools( hip_lib_path, True )
+
+# Check if CUDA/ROCm is available
+if not torch.cuda.is_available():
+    print('Warning: GPU not found, using CPU')
+    device = torch.device('cpu')
+else:
+    print('Setting torch device cuda')
+    device = torch.device('cuda')
+    # Set the device: needed since nothing else initializes the device
+    hip_tools.set_device(0)
+
+hip_tools.start_roctracer()
+# Do some fun stuff
+id_init = hip_tools.start_marker('init')
+
+nx, ny = 128, 128
+A = torch.randn(nx, ny, device=device)
+B = torch.randn(nx, ny, device=device)
+
+hip_tools.stop_marker(id_init)
+
+n_iterations = 5
+for i in range(n_iterations):
+
+    # Only profile even-numbered iterations
+    if i % 2 == 0:
+        hip_tools.start_roctracer()
+
+    id_iter = hip_tools.start_marker(f'iter_{i}')
+    print(f'iteration: {i}')
+    C = torch.matmul(A, B)
+
+    # Synchronize to ensure GPU operation completes
+    if device.type == 'cuda':
+        torch.cuda.synchronize(device=device)
+
+    hip_tools.stop_marker(id_iter)
+
+    if i % 2 == 0:
+        hip_tools.stop_roctracer()
+
+print('Finished successfully')
+