diff --git a/Makefile b/Makefile
index 4953479..acf5e2f 100644
--- a/Makefile
+++ b/Makefile
@@ -1,13 +1,18 @@
 CC = hipcc
-GPU_ARCH=gfx90a
+# Auto-detect the GPU architecture once at parse time, unless GPU_ARCH is already set (command line or environment)
+ifeq ($(origin GPU_ARCH),undefined)
+GPU_ARCH := $(shell if command -v rocminfo >/dev/null 2>&1; then rocminfo 2>/dev/null | grep "Name:" | grep "gfx" | awk 'NR==1' | awk '{print $$2}'; else echo "gfx942"; fi)
+endif
+# If GPU_ARCH is still empty (rocminfo listed no gfx agent), fall back to the gfx942 default
+GPU_ARCH := $(or $(strip $(GPU_ARCH)),gfx942)
 
 SRCS_ALL = $(shell echo *.cpp)
 
 CFLAGS = -O3 -Wall -fPIC --offload-arch=$(GPU_ARCH) 
 SRCS = $(SRCS_ALL)
 
-INCLUDES = -I/. -I$(ROCM_PATH)/include/roctracer
-LIBS = -L$(ROCM_PATH)/roctracer/lib -lroctracer64 -lroctx64
+INCLUDES = -I/. -I$(ROCM_PATH)/include/roctracer -I$(ROCM_PATH)/include/rocprofiler-sdk-roctx
+LIBS = -L$(ROCM_PATH)/roctracer/lib -L$(ROCM_PATH)/lib -lroctracer64 -lroctx64 -lrocprofiler-sdk-roctx
 
 
 OBJS = $(SRCS:.c=.o)
@@ -16,7 +21,11 @@ TARGET = libHIPcode.so
 .PHONY: clean
     
 all:    $(TARGET)
-	@echo  Successfully compiled ${TARGET} library.
+	@echo  Successfully compiled ${TARGET} library with GPU_ARCH=$(GPU_ARCH).
+
+info:
+	@echo "Detected GPU_ARCH: $(GPU_ARCH)"
+	@echo "To override, use: make GPU_ARCH=<arch>"
 
 $(TARGET): $(OBJS) 
 	$(CC) $(CFLAGS) $(INCLUDES) $(LIBS) -shared -o $(TARGET) $(OBJS) 
@@ -24,4 +33,4 @@ $(TARGET): $(OBJS)
 .c.o:
 	$(CC) $(CFLAGS) $(INCLUDES) -cpp $<  -o $@
 clean:
-	$(RM) *.o ${TARGET}
\ No newline at end of file
+	$(RM) *.o ${TARGET}
diff --git a/README.md b/README.md
index 6a7b648..83d163e 100644
--- a/README.md
+++ b/README.md
@@ -5,18 +5,34 @@ Example: Using roctx calls within a Python program
 
 Step 1: Compile the hip_code library
 ```
-module load rocm/6.0.0
+module load rocm
 make
 ```
+This creates `libHIPcode.so`, which the Python examples below load.
+
 
 Step 2: Run the script to make sure it works
+
+**Host example:**
 ```
 python3 roctx_example.py
 ```
 
+**PyTorch GPU example:**
+```
+python3 roctx_example_gpu.py
+```
+
 Step 3: Get the roctx trace using rocprof
+
+**Host example:**
+```
+rocprofv3 --marker-trace --output-format pftrace -- python3 roctx_example.py
+```
+
+**PyTorch GPU example:**
 ```
-rocprof --roctx-trace -d rocprof_output -o rocprof_output/results.csv python3 roctx_example.py
+rocprofv3 --kernel-trace --marker-trace --output-format pftrace -d ex1 -o ex1 -- python3 roctx_example_gpu.py
 ```
 
-Step 4: Copy the **rocprof_output/results.json** file to your system and visualize in [Perfetto](https://ui.perfetto.dev/)
\ No newline at end of file
+Step 4: Copy the generated **pftrace** file to your system and visualize it in [Perfetto](https://ui.perfetto.dev/)
diff --git a/hip_tools.cpp b/hip_tools.cpp
index f0dcc38..ec7513d 100644
--- a/hip_tools.cpp
+++ b/hip_tools.cpp
@@ -1,7 +1,7 @@
 #include <stdio.h>
 #include <iostream>
-#include <roctx.h>
-#include <roctracer_ext.h>
+
+#include <rocprofiler-sdk-roctx/roctx.h>
 #include <hip/hip_runtime.h>
 
 #define CHECK(command) {   \
@@ -32,15 +32,25 @@ int set_device( int proc_id ) {
 }
 
 void start_roctracer(){
-  roctracer_start();
+  // Replaces the legacy roctracer_start(): passes the id obtained from roctxGetThreadId to roctxProfilerResume
+  roctx_thread_id_t tid;
+  roctxGetThreadId(&tid);
+  roctxProfilerResume(tid);
+}
+
+int get_roctx_tid(){
+  roctx_thread_id_t tid;
+  roctxGetThreadId(&tid);
+  return static_cast<int>(tid);
 }
 
 void stop_roctracer(){
-  roctracer_stop();
+  roctx_thread_id_t tid;
+  roctxGetThreadId(&tid);
+  roctxProfilerPause(tid);
 }
 
 int roctxr_start( char *c){
-  // std::cout << "Starting roctx marker: " << c << std::endl; 
   int id = roctxRangeStart(c);
   return id;
 }
@@ -58,4 +68,4 @@ void roctxr_pop(){
 }
 
 
-}
\ No newline at end of file
+}
diff --git a/hip_tools.py b/hip_tools.py
index 5d1071b..afa548e 100644
--- a/hip_tools.py
+++ b/hip_tools.py
@@ -1,8 +1,18 @@
 
 encode = lambda s : s.encode('utf-8')
 
+# Thread ids already started / already stopped, so a repeated start or stop for the same id is skipped
+roctx_ids_start = set()
+roctx_ids_stop = set()
+
 class HIP_tools:    
-  def __init__(self,  hip_lib_path):
+  
+  """
+  @param {string} hip_lib_path: path to compiled libHIPcode.so
+  @param {bool} stop_on_init: when True (the default), stop_roctracer() is called at the end of initialization, so profiling stays paused until start_roctracer() is called. Useful for profiling ML workloads
+
+  """
+  def __init__(self,  hip_lib_path, stop_on_init = True):
 
 
     from ctypes import cdll, c_int, c_char_p        
@@ -12,7 +22,7 @@ def __init__(self,  hip_lib_path):
     self._set_device = libhip.set_device
     self._set_device.argtypes = [ c_int ]
     self._set_device.resypes = c_int
-
+    self._get_tid = libhip.get_roctx_tid
     self._start_roctracer = libhip.start_roctracer
     self._stop_roctracer = libhip.stop_roctracer
     self._roctxr_push = libhip.roctxr_push
@@ -23,16 +33,27 @@ def __init__(self,  hip_lib_path):
     self._roctxr_start.argtypes = [ c_char_p ]
     self._roctxr_start.resypes = c_int 
     self._roctxr_stop.argtypes = [ c_int ]
+    if stop_on_init:
+        self.stop_roctracer()
   
 
   def set_device(self, device_id ):
     self._set_device(device_id)
 
   def start_roctracer(self):
-    self._start_roctracer()
+    tid = self._get_tid()
+    if(tid not in roctx_ids_start):
+      self._start_roctracer()
+      roctx_ids_start.add(tid)
+      roctx_ids_stop.discard(tid)
 
   def stop_roctracer(self):
-    self._stop_roctracer()   
+    tid = self._get_tid()
+    if(tid not in roctx_ids_stop):
+      self._stop_roctracer()
+      roctx_ids_stop.add(tid)
+      roctx_ids_start.discard(tid)
+         
 
   def start_marker( self, marker_name ):
     marker_id = self._roctxr_start( encode(marker_name) )
diff --git a/roctx_example.py b/roctx_example.py
index 5ca8341..6b93d64 100644
--- a/roctx_example.py
+++ b/roctx_example.py
@@ -11,7 +11,7 @@
 hip_lib_path = f'{work_dir}/libHIPcode.so'
 
 # Initialize the roctracer tools
-hip_tools = HIP_tools( hip_lib_path )
+hip_tools = HIP_tools( hip_lib_path,True )
 
 if use_torch:
   if not torch.cuda.is_available():
@@ -25,7 +25,7 @@
   print('Setting hip device 0')
   hip_tools.set_device(0)  
 
-
+hip_tools.start_roctracer()
 # Do some fun stuff
 id_init = hip_tools.start_marker('init')
 
@@ -33,18 +33,23 @@
 A = np.random.rand( nx, ny )
 B = np.random.rand( nx, ny )
 
+# Close the 'init' range before the per-iteration ranges begin
 hip_tools.stop_marker(id_init)
-
-id_main = hip_tools.start_marker('main')
 
 n_iterations = 20
 for i in range(n_iterations):
-
+  
+  # Profile only even-numbered iterations: resume here, pause again after the iteration's marker is closed
+  if(i%2 == 0):
+    hip_tools.start_roctracer()
+  
   id_iter = hip_tools.start_marker(f'iter_{i}')
   print( f'iteration: {i}')
   C = np.matmul(A, B)
 
   hip_tools.stop_marker(id_iter)
 
+  if(i%2 == 0):
+    hip_tools.stop_roctracer()
+
 print('Finished successfully')
-hip_tools.stop_marker(id_main)
\ No newline at end of file
diff --git a/roctx_example_gpu.py b/roctx_example_gpu.py
new file mode 100644
index 0000000..b0f9ae9
--- /dev/null
+++ b/roctx_example_gpu.py
@@ -0,0 +1,54 @@
+import os, sys
+import torch
+from hip_tools import HIP_tools
+
+
+# Build the path to libHIPcode.so, expected in the current working directory
+work_dir = os.getcwd()
+hip_lib_path = f'{work_dir}/libHIPcode.so'
+
+# Load the library; the second argument (stop_on_init=True) pauses profiling until start_roctracer() is called
+hip_tools = HIP_tools( hip_lib_path, True )
+
+# Check whether a CUDA/ROCm device is available; fall back to the CPU otherwise
+if not torch.cuda.is_available():
+    print('Warning: GPU not found, using CPU')
+    device = torch.device('cpu')
+else:  
+    print('Setting torch device cuda')
+    device = torch.device('cuda')
+    # Set the HIP device explicitly: needed since nothing else initializes the device
+    hip_tools.set_device(0)
+
+hip_tools.start_roctracer()
+# Wrap the tensor setup in an 'init' marker range
+id_init = hip_tools.start_marker('init')
+
+nx, ny = 128, 128
+A = torch.randn(nx, ny, device=device)
+B = torch.randn(nx, ny, device=device)
+
+hip_tools.stop_marker(id_init)
+
+n_iterations = 5
+for i in range(n_iterations):
+  
+  # Profile only even-numbered iterations: resume here, pause again after the iteration's marker is closed
+  if(i%2 == 0):
+    hip_tools.start_roctracer()
+  
+  id_iter = hip_tools.start_marker(f'iter_{i}')
+  print(f'iteration: {i}')
+  C = torch.matmul(A, B)
+  
+  # Synchronize so the matmul has finished on the GPU before the marker range is closed
+  if device.type == 'cuda':
+    torch.cuda.synchronize(device=device)
+  
+  hip_tools.stop_marker(id_iter)
+
+  if(i%2 == 0):
+    hip_tools.stop_roctracer()
+
+print('Finished successfully')
+