Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 14 additions & 5 deletions Makefile
Original file line number Diff line number Diff line change
@@ -1,13 +1,18 @@
CC = hipcc
# Target GPU architecture handed to --offload-arch.
# Precedence:
#   1. a value supplied on the command line or in the environment
#      (for example `make GPU_ARCH=gfx90a`),
#   2. the first gfx* name reported by rocminfo,
#   3. gfx942 when rocminfo is missing or reports no gfx* name.
# No unconditional assignment precedes the detection, so it is actually
# reached, and the simple (:=) assignment keeps rocminfo from being re-run
# every time $(GPU_ARCH) is expanded.
ifeq ($(strip $(GPU_ARCH)),)
GPU_ARCH := $(strip $(shell command -v rocminfo >/dev/null 2>&1 && rocminfo 2>/dev/null | awk '/Name:/ && /gfx/ {print $$2; exit}'))
endif
# Detection produced nothing: fall back to the default architecture.
ifeq ($(strip $(GPU_ARCH)),)
GPU_ARCH := gfx942
endif

# ROCm installation prefix. It is expected to come from the environment;
# /opt/rocm is only a fallback so the search paths below are never rooted at
# "/" when the variable is unset.
ROCM_PATH ?= /opt/rocm

# Every C++ source in this directory goes into the library. $(wildcard)
# expands the glob inside make (no shell is spawned) and yields an empty list,
# not the literal text "*.cpp", when nothing matches.
SRCS_ALL := $(wildcard *.cpp)

CFLAGS = -O3 -Wall -fPIC --offload-arch=$(GPU_ARCH)
SRCS = $(SRCS_ALL)

# Header and library search paths for roctracer and rocprofiler-sdk-roctx.
# NOTE(review): "-I/." names the filesystem root; "-I." was presumably
# intended - confirm before changing.
INCLUDES = -I/. -I$(ROCM_PATH)/include/roctracer -I$(ROCM_PATH)/include/rocprofiler-sdk-roctx
LIBS = -L$(ROCM_PATH)/roctracer/lib -L$(ROCM_PATH)/lib -lroctracer64 -lroctx64 -lrocprofiler-sdk-roctx


# NOTE(review): the .c=.o substitution only rewrites names ending in ".c", so
# the *.cpp entries pass through unchanged and OBJS is effectively the source
# list. Confirm before switching to real object files.
OBJS = $(SRCS:.c=.o)
Expand All @@ -16,12 +21,16 @@ TARGET = libHIPcode.so
# None of these names is a file; declaring them phony keeps a stray file called
# "all", "info" or "clean" from masking the target.
.PHONY: all info clean

# Default goal: build the shared library and report the architecture used.
all: $(TARGET)
	@echo Successfully compiled ${TARGET} library with GPU_ARCH=$(GPU_ARCH).

# Print the GPU architecture the build would use and how to override it.
info:
	@echo "Detected GPU_ARCH: $(GPU_ARCH)"
	@echo "To override, use: make GPU_ARCH=<arch>"

# Build the shared library. The libraries are listed after the inputs so the
# linker resolves the inputs' undefined symbols against them.
$(TARGET): $(OBJS)
	$(CC) $(CFLAGS) $(INCLUDES) -shared -o $@ $^ $(LIBS)

# Legacy suffix rule for .c sources.
# NOTE(review): with the sources matched as *.cpp this rule does not appear to
# be reached - confirm before relying on it.
.c.o:
	$(CC) $(CFLAGS) $(INCLUDES) -cpp $< -o $@

# Remove object files and the built library.
clean:
	$(RM) *.o ${TARGET}
22 changes: 19 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,18 +5,34 @@ Example: Using roctx calls within a Python program

Step 1: Compile the hip_code library
```
module load rocm/6.0.0
module load rocm
make
```
This creates `libHIPcode.so`, which the Python examples below load.


Step 2: Run the script to make sure it works

**Host example:**
```
python3 roctx_example.py
```

**PyTorch GPU example:**
```
python3 roctx_example_gpu.py
```

Step 3: Get the roctx trace using rocprof

**Host example:**
```
rocprofv3 --marker-trace --output-format pftrace -- python3 roctx_example.py
```

**PyTorch GPU example:**
```
rocprof --roctx-trace -d rocprof_output -o rocprof_output/results.csv python3 roctx_example.py
rocprofv3 --kernel-trace --marker-trace --output-format pftrace -d ex1 -o ex1 -- python3 roctx_example_gpu.py
```

Step 4: Copy the **rocprof_output/results.json** file to your system and visualize in [Perfetto](https://ui.perfetto.dev/)
Step 4: Copy the generated **pftrace** file to your local machine and open it in [Perfetto](https://ui.perfetto.dev/) to visualize the trace.
22 changes: 16 additions & 6 deletions hip_tools.cpp
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
#include <stdio.h>
#include <iostream>
#include <roctx.h>
#include <roctracer_ext.h>

#include <rocprofiler-sdk-roctx/roctx.h>
#include <hip/hip_runtime.h>

#define CHECK(command) { \
Expand Down Expand Up @@ -32,15 +32,25 @@ int set_device( int proc_id ) {
}

/**
 * Ask roctx to resume the profiler for the calling thread.
 *
 * Fetches the calling thread's roctx id with roctxGetThreadId and passes it to
 * roctxProfilerResume. Only the roctx pause/resume API is used; the legacy
 * roctracer_start() call is no longer issued.
 */
void start_roctracer(){
    // Value-initialised so an indeterminate id is never forwarded if the
    // lookup leaves it untouched.
    roctx_thread_id_t tid{};
    roctxGetThreadId(&tid);
    roctxProfilerResume(tid);
}

/**
 * Return the calling thread's roctx id as an int.
 *
 * The id is obtained from roctxGetThreadId and converted with static_cast.
 * NOTE(review): if roctx_thread_id_t is wider than int the value is
 * truncated - confirm that callers only need it as a per-thread key.
 */
int get_roctx_tid(){
    // Value-initialised so the cast never reads an indeterminate value if the
    // lookup leaves it untouched.
    roctx_thread_id_t tid{};
    roctxGetThreadId(&tid);
    return static_cast<int>(tid);
}

/**
 * Ask roctx to pause the profiler for the calling thread.
 *
 * Fetches the calling thread's roctx id with roctxGetThreadId and passes it to
 * roctxProfilerPause. Only the roctx pause/resume API is used; the legacy
 * roctracer_stop() call is no longer issued, mirroring start_roctracer().
 */
void stop_roctracer(){
    // Value-initialised so an indeterminate id is never forwarded if the
    // lookup leaves it untouched.
    roctx_thread_id_t tid{};
    roctxGetThreadId(&tid);
    roctxProfilerPause(tid);
}

/**
 * Start a roctx range labelled with the C string `c` and return its id.
 *
 * NOTE(review): the id is handed back as int; if roctxRangeStart yields a
 * wider id type the value is narrowed here - confirm ids fit in an int.
 */
int roctxr_start( char *c){
    return roctxRangeStart(c);
}
Expand All @@ -58,4 +68,4 @@ void roctxr_pop(){
}


}
}
29 changes: 25 additions & 4 deletions hip_tools.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,18 @@

def encode(s):
    """Return the string ``s`` encoded as UTF-8 bytes."""
    return s.encode('utf-8')


# Thread ids for which start / stop has already been issued. The wrapper
# consults these so it does not repeat a start (or stop) for a thread id that
# is already in that state.
roctx_ids_start = set()
roctx_ids_stop = set()

class HIP_tools:
def __init__(self, hip_lib_path):

"""
@param {string} hip_lib_path: path to compiled libHIPcode.so
@param {bool} stop_on_init: when True, stop_roctracer() is called at the end of initialization so profiling does not start until start_roctracer() is called. Useful for profiling ML workloads

"""
def __init__(self, hip_lib_path, stop_on_init = True):


from ctypes import cdll, c_int, c_char_p
Expand All @@ -12,7 +22,7 @@ def __init__(self, hip_lib_path):
self._set_device = libhip.set_device
self._set_device.argtypes = [ c_int ]
self._set_device.resypes = c_int

self._get_tid = libhip.get_roctx_tid
self._start_roctracer = libhip.start_roctracer
self._stop_roctracer = libhip.stop_roctracer
self._roctxr_push = libhip.roctxr_push
Expand All @@ -23,16 +33,27 @@ def __init__(self, hip_lib_path):
self._roctxr_start.argtypes = [ c_char_p ]
self._roctxr_start.resypes = c_int
self._roctxr_stop.argtypes = [ c_int ]
if stop_on_init:
self.stop_roctracer()


def set_device(self, device_id ):
    """Forward ``device_id`` to the library's ``set_device`` function.

    The value returned by the library call is not propagated to the caller.
    """
    select_device = self._set_device
    select_device(device_id)

def start_roctracer(self):
    """Start profiling for the calling thread unless it is already started.

    The thread id reported by the library is looked up in the module-level
    ``roctx_ids_start`` set. The library's ``start_roctracer`` is called only
    when the id is absent, so repeated calls for a thread that is already
    started are no-ops. There is no unconditional call ahead of the check,
    which would defeat that guard.
    """
    tid = self._get_tid()
    if tid in roctx_ids_start:
        return
    self._start_roctracer()
    # Record the new state and clear the opposite one so a later
    # stop_roctracer() for this thread is not skipped.
    roctx_ids_start.add(tid)
    roctx_ids_stop.discard(tid)

def stop_roctracer(self):
    """Stop profiling for the calling thread unless it is already stopped.

    The thread id reported by the library is looked up in the module-level
    ``roctx_ids_stop`` set. The library's ``stop_roctracer`` is called only
    when the id is absent, so repeated calls for a thread that is already
    stopped are no-ops. There is no unconditional call ahead of the check,
    which would defeat that guard.
    """
    tid = self._get_tid()
    if tid in roctx_ids_stop:
        return
    self._stop_roctracer()
    # Record the new state and clear the opposite one so a later
    # start_roctracer() for this thread is not skipped.
    roctx_ids_stop.add(tid)
    roctx_ids_start.discard(tid)


def start_marker( self, marker_name ):
marker_id = self._roctxr_start( encode(marker_name) )
Expand Down
17 changes: 10 additions & 7 deletions roctx_example.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
hip_lib_path = f'{work_dir}/libHIPcode.so'

# Initialize the roctracer tools
hip_tools = HIP_tools( hip_lib_path )
hip_tools = HIP_tools( hip_lib_path,True )

if use_torch:
if not torch.cuda.is_available():
Expand All @@ -25,26 +25,29 @@
print('Setting hip device 0')
hip_tools.set_device(0)


hip_tools.start_roctracer()
# Do some fun stuff
id_init = hip_tools.start_marker('init')

nx, ny = 1024, 1024
A = np.random.rand( nx, ny )
B = np.random.rand( nx, ny )

hip_tools.stop_marker(id_init)

id_main = hip_tools.start_marker('main')

n_iterations = 20
for i in range(n_iterations):


#Only profiling even number
if(i%2 == 0):
hip_tools.start_roctracer()

id_iter = hip_tools.start_marker(f'iter_{i}')
print( f'iteration: {i}')
C = np.matmul(A, B)

hip_tools.stop_marker(id_iter)

if(i%2 == 0):
hip_tools.stop_roctracer()

print('Finished successfully')
hip_tools.stop_marker(id_main)
54 changes: 54 additions & 0 deletions roctx_example_gpu.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
import os, sys
import torch
from hip_tools import HIP_tools


# Build the path to the compiled HIP helper library. It is looked up in the
# current working directory, so run this script from the directory that
# contains libHIPcode.so.
work_dir = os.getcwd()
hip_lib_path = f'{work_dir}/libHIPcode.so'

# Create the roctx helper. The second argument is stop_on_init=True, so the
# constructor calls stop_roctracer() and nothing is profiled until
# start_roctracer() is called below.
hip_tools = HIP_tools( hip_lib_path, True )

# Pick the torch device: 'cuda' when torch reports a GPU, otherwise the CPU
if not torch.cuda.is_available():
print('Warning: GPU not found, using CPU')
device = torch.device('cpu')
else:
print('Setting torch device cuda')
device = torch.device('cuda')
# Set the device explicitly: needed since nothing else initializes the device
hip_tools.set_device(0)

# Start profiling so the 'init' marker range below is recorded
hip_tools.start_roctracer()
# Marker range 'init': allocate the two random input matrices on the device
id_init = hip_tools.start_marker('init')

nx, ny = 128, 128
A = torch.randn(nx, ny, device=device)
B = torch.randn(nx, ny, device=device)

hip_tools.stop_marker(id_init)

n_iterations = 5
for i in range(n_iterations):

# Only even-numbered iterations are profiled: start before the work ...
if(i%2 == 0):
hip_tools.start_roctracer()

# Each iteration gets its own marker range named iter_<i>
id_iter = hip_tools.start_marker(f'iter_{i}')
print(f'iteration: {i}')
C = torch.matmul(A, B)

# Synchronize so the GPU work finishes before the marker range is closed
if device.type == 'cuda':
torch.cuda.synchronize(device=device)

hip_tools.stop_marker(id_iter)

# ... and stop again once the even-numbered iteration is done
if(i%2 == 0):
hip_tools.stop_roctracer()

print('Finished successfully')