From ada608242a0674cf1eb7348eefcba81b3abf75f1 Mon Sep 17 00:00:00 2001
From: David Ndungu
Date: Fri, 10 Apr 2026 13:42:39 -0700
Subject: [PATCH] fix(kernels): rename kernel_add in fused_encoder_bwd to
 avoid symbol clash

The fused encoder backward file defined kernel_add which clashes with
the same symbol in elementwise.cu when linking into libkernels.so.
Rename to kernel_enc_bwd_add to avoid the duplicate symbol error.
---
 internal/cuda/kernels/fused_encoder_bwd.cu | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/internal/cuda/kernels/fused_encoder_bwd.cu b/internal/cuda/kernels/fused_encoder_bwd.cu
index 3c6e68a..2b97c3e 100644
--- a/internal/cuda/kernels/fused_encoder_bwd.cu
+++ b/internal/cuda/kernels/fused_encoder_bwd.cu
@@ -9,7 +9,7 @@
  * kernel_gelu_bwd                  GELU derivative * upstream gradient
  * kernel_softmax_bwd               Softmax backward (Jacobian-vector product)
  * kernel_bias_grad_reduce          Sum rows to compute bias gradients
- * kernel_add_elementwise           Element-wise addition for residual gradients
+ * kernel_enc_bwd_add_elementwise   Element-wise addition for residual gradients
  * kernel_matmul_grad_accum         Accumulate weight gradient: dW += A^T @ B
  *
  * cuBLAS calls (~14 total per layer):
@@ -313,7 +313,7 @@ __global__ void kernel_bias_grad_reduce(
 /* out[i] = a[i] + b[i] */
 /* ------------------------------------------------------------------ */
 
-__global__ void kernel_add(
+__global__ void kernel_enc_bwd_add(
     const float* __restrict__ a,
     const float* __restrict__ b,
     float* __restrict__ out,
@@ -330,7 +330,7 @@ __global__ void kernel_add(
 /* out[i] = a[i] + b[i] + c[i] */
 /* ------------------------------------------------------------------ */
 
-__global__ void kernel_add3(
+__global__ void kernel_enc_bwd_add3(
     const float* __restrict__ a,
     const float* __restrict__ b,
     const float* __restrict__ c,
@@ -558,7 +558,7 @@ cudaError_t fused_encoder_bwd_f32(
         dXRes1, dg_norm2W, dg_norm2B, dModel);
 
     /* Add residual skip: dXRes1 += dOutput */
-    kernel_add<<>>(
+    kernel_enc_bwd_add<<>>(
         dXRes1, dOutput, dXRes1, trDm);
 
     /* ------------------------------------------------------------ */
@@ -722,7 +722,7 @@ cudaError_t fused_encoder_bwd_f32(
         temp, dg_norm1W, dg_norm1B, dModel);
 
     /* dInput = temp + dXRes1 */
-    kernel_add<<>>(
+    kernel_enc_bwd_add<<>>(
        temp, dXRes1, dInput, trDm);
 
     return cudaGetLastError();