From ada608242a0674cf1eb7348eefcba81b3abf75f1 Mon Sep 17 00:00:00 2001
From: David Ndungu
Date: Fri, 10 Apr 2026 13:42:39 -0700
Subject: [PATCH] fix(kernels): rename kernel_add in fused_encoder_bwd to
 avoid symbol clash

The fused encoder backward file defined kernel_add which clashes with
the same symbol in elementwise.cu when linking into libkernels.so.
Rename to kernel_enc_bwd_add to avoid the duplicate symbol error.
---
 internal/cuda/kernels/fused_encoder_bwd.cu | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/internal/cuda/kernels/fused_encoder_bwd.cu b/internal/cuda/kernels/fused_encoder_bwd.cu
index 3c6e68a..2b97c3e 100644
--- a/internal/cuda/kernels/fused_encoder_bwd.cu
+++ b/internal/cuda/kernels/fused_encoder_bwd.cu
@@ -9,7 +9,7 @@
  * kernel_gelu_bwd                  GELU derivative * upstream gradient
  * kernel_softmax_bwd               Softmax backward (Jacobian-vector product)
  * kernel_bias_grad_reduce          Sum rows to compute bias gradients
- * kernel_add_elementwise           Element-wise addition for residual gradients
+ * kernel_enc_bwd_add_elementwise   Element-wise addition for residual gradients
  * kernel_matmul_grad_accum         Accumulate weight gradient: dW += A^T @ B
  *
  * cuBLAS calls (~14 total per layer):
@@ -313,7 +313,7 @@ __global__ void kernel_bias_grad_reduce(
 /* out[i] = a[i] + b[i] */
 /* ------------------------------------------------------------------ */
 
-__global__ void kernel_add(
+__global__ void kernel_enc_bwd_add(
     const float* __restrict__ a,
     const float* __restrict__ b,
     float* __restrict__ out,
@@ -330,7 +330,7 @@ __global__ void kernel_add(
 /* out[i] = a[i] + b[i] + c[i] */
 /* ------------------------------------------------------------------ */
 
-__global__ void kernel_add3(
+__global__ void kernel_enc_bwd_add3(
     const float* __restrict__ a,
     const float* __restrict__ b,
     const float* __restrict__ c,
@@ -558,7 +558,7 @@ cudaError_t fused_encoder_bwd_f32(
         dXRes1, dg_norm2W, dg_norm2B, dModel);
 
     /* Add residual skip: dXRes1 += dOutput */
-    kernel_add<<>>(
+    kernel_enc_bwd_add<<>>(
         dXRes1, dOutput, dXRes1, trDm);
 
     /* ------------------------------------------------------------ */
@@ -722,7 +722,7 @@ cudaError_t fused_encoder_bwd_f32(
         temp, dg_norm1W, dg_norm1B, dModel);
 
     /* dInput = temp + dXRes1 */
-    kernel_add<<>>(
+    kernel_enc_bwd_add<<>>(
        temp, dXRes1, dInput, trDm);
 
     return cudaGetLastError();