From ffffc7daa27b0470438ce91b701f4eaae48ca2ba Mon Sep 17 00:00:00 2001 From: JaimeHW Date: Mon, 14 Apr 2025 12:19:58 -0500 Subject: [PATCH 01/86] Create directory to hold optimized RISC-V vector intrinsics implementations --- tensorflow/lite/micro/kernels/riscv_vector/convd_rvv.cc | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 tensorflow/lite/micro/kernels/riscv_vector/convd_rvv.cc diff --git a/tensorflow/lite/micro/kernels/riscv_vector/convd_rvv.cc b/tensorflow/lite/micro/kernels/riscv_vector/convd_rvv.cc new file mode 100644 index 00000000000..e69de29bb2d From 54e272ad5cb898eb56780578714c99b7ba21f324 Mon Sep 17 00:00:00 2001 From: JaimeHW Date: Mon, 14 Apr 2025 12:43:40 -0500 Subject: [PATCH 02/86] Add placeholder file until we adapt the convolution implementation to tflite-micro --- .../kernels/riscv_vector/{convd_rvv.cc => conv_rvv.cc} | 0 tensorflow/lite/micro/kernels/riscv_vector/conv_rvv.h | 10 ++++++++++ 2 files changed, 10 insertions(+) rename tensorflow/lite/micro/kernels/riscv_vector/{convd_rvv.cc => conv_rvv.cc} (100%) create mode 100644 tensorflow/lite/micro/kernels/riscv_vector/conv_rvv.h diff --git a/tensorflow/lite/micro/kernels/riscv_vector/conv_rvv.h b/tensorflow/lite/micro/kernels/riscv_vector/conv_rvv.h new file mode 100644 index 00000000000..df812e9f8c2 --- /dev/null +++ b/tensorflow/lite/micro/kernels/riscv_vector/conv_rvv.h @@ -0,0 +1,10 @@ +// tensorflow/lite/micro/kernels/riscv_vector/conv_rvv.h +#ifndef TENSORFLOW_LITE_MICRO_KERNELS_RISCV_VECTOR_CONV_RVV_H_ +#define TENSORFLOW_LITE_MICRO_KERNELS_RISCV_VECTOR_CONV_RVV_H_ + +#include +#include + +// #ifdef __cplusplus +// extern "C" { +// #endif \ No newline at end of file From 
863d5df6f1bfbc24707257ca9bc6ffc0cd89ed37 Mon Sep 17 00:00:00 2001 From: numbers1234567 Date: Mon, 14 Apr 2025 13:31:08 -0500 Subject: [PATCH 03/86] Annotations --- .../internal/reference/integer_ops/depthwise_conv.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/tensorflow/lite/kernels/internal/reference/integer_ops/depthwise_conv.h b/tensorflow/lite/kernels/internal/reference/integer_ops/depthwise_conv.h index 7676fce0f4d..761091e53ca 100644 --- a/tensorflow/lite/kernels/internal/reference/integer_ops/depthwise_conv.h +++ b/tensorflow/lite/kernels/internal/reference/integer_ops/depthwise_conv.h @@ -21,6 +21,8 @@ limitations under the License. namespace tflite { namespace reference_integer_ops { + +// [PEANUT] It seems like the only difference between these are the data types and formats inline void DepthwiseConvPerChannel( const DepthwiseParams& params, const int32_t* output_multiplier, const int32_t* output_shift, const RuntimeShape& input_shape, @@ -30,13 +32,19 @@ inline void DepthwiseConvPerChannel( int8_t* output_data) { // Get parameters. // TODO(b/141565753): Re-introduce ScopedProfilingLabel on Micro. + // [PEANUT] Lots of Offset() calls. These map multi-dimensional indices to a one-dimensional index in the data buffers. More details in types.h const int stride_width = params.stride_width; const int stride_height = params.stride_height; + // [PEANUT] I think dilation refers to this: https://towardsdatascience.com/a-primer-on-atrous-convolutions-and-depth-wise-separable-convolutions-443b106919f5/ const int dilation_width_factor = params.dilation_width_factor; const int dilation_height_factor = params.dilation_height_factor; + // [PEANUT] Amount to 0-pad the input. This affects low pixel indices. High indices beyond input height and width are always treated as 0. 
const int pad_width = params.padding_values.width; const int pad_height = params.padding_values.height; + // [PEANUT] It seems like each input channel maps to "depth_multiplier" consecutive output channels const int depth_multiplier = params.depth_multiplier; + + // [PEANUT] Activation clamping const int32_t input_offset = params.input_offset; const int32_t output_offset = params.output_offset; const int32_t output_activation_min = params.quantized_activation_min; @@ -47,6 +55,10 @@ inline void DepthwiseConvPerChannel( TFLITE_DCHECK_EQ(filter_shape.DimensionsCount(), 4); TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4); + // [PEANUT] Input/output/filter dimensions + // [PEANUT] Input shape (batches, height, width, in-depth) + // [PEANUT] Output shape (batches, height, width, out-depth) + // [PEANUT] Filter shape (1?, height, width, out-depth) TFLITE_DCHECK_LE(output_activation_min, output_activation_max); const int batches = MatchingDim(input_shape, 0, output_shape, 0); const int output_depth = MatchingDim(filter_shape, 3, output_shape, 3); From ca7187005e157a0c9c30941ab9516329ef077628 Mon Sep 17 00:00:00 2001 From: numbers1234567 Date: Mon, 14 Apr 2025 13:34:35 -0500 Subject: [PATCH 04/86] Offset() and MultiplyByQuantizedMultiplier() references --- .../internal/reference/integer_ops/depthwise_conv.h | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/tensorflow/lite/kernels/internal/reference/integer_ops/depthwise_conv.h b/tensorflow/lite/kernels/internal/reference/integer_ops/depthwise_conv.h index 761091e53ca..b99ad89dac0 100644 --- a/tensorflow/lite/kernels/internal/reference/integer_ops/depthwise_conv.h +++ b/tensorflow/lite/kernels/internal/reference/integer_ops/depthwise_conv.h @@ -22,7 +22,8 @@ limitations under the License. 
namespace tflite { namespace reference_integer_ops { -// [PEANUT] It seems like the only difference between these are the data types and formats +// [PEANUT] It seems like the only difference between these are the data types and formats. +// We are mainly working on 8-bit data, so the below function is most important inline void DepthwiseConvPerChannel( const DepthwiseParams& params, const int32_t* output_multiplier, const int32_t* output_shift, const RuntimeShape& input_shape, @@ -33,6 +34,7 @@ inline void DepthwiseConvPerChannel( // Get parameters. // TODO(b/141565753): Re-introduce ScopedProfilingLabel on Micro. // [PEANUT] Lots of Offset() calls. These map multi-dimensional indices to a one-dimensional index in the data buffers. More details in types.h + // [PEANUT] Offset() defined in tflite-micro/lite/kernels/internal/runtime_shape.h const int stride_width = params.stride_width; const int stride_height = params.stride_height; // [PEANUT] I think dilation refers to this: https://towardsdatascience.com/a-primer-on-atrous-convolutions-and-depth-wise-separable-convolutions-443b106919f5/ @@ -59,6 +61,9 @@ inline void DepthwiseConvPerChannel( // [PEANUT] Input shape (batches, height, width, in-depth) // [PEANUT] Output shape (batches, height, width, out-depth) // [PEANUT] Filter shape (1?, height, width, out-depth) + // [PEANUT] These shapes also match how the data is stored in memory: + // batch-major, then row-major, then column-major. + // Channels are last. Refer to RuntimeShape. TFLITE_DCHECK_LE(output_activation_min, output_activation_max); const int batches = MatchingDim(input_shape, 0, output_shape, 0); const int output_depth = MatchingDim(filter_shape, 3, output_shape, 3); @@ -86,7 +91,8 @@ inline void DepthwiseConvPerChannel( const int in_x = in_x_origin + dilation_width_factor * filter_x; const int in_y = in_y_origin + dilation_height_factor * filter_y; - // Zero padding by omitting the areas outside the image. 
+ // Zero padding by omitting the areas outside the image + // [PEANUT] The branches may be a bottleneck const bool is_point_inside_image = (in_x >= 0) && (in_x < input_width) && (in_y >= 0) && (in_y < input_height); @@ -118,10 +124,13 @@ inline void DepthwiseConvPerChannel( if (bias_data) { acc += bias_data[output_channel]; } + // [PEANUT] This is analogous to the output_shift in our example + // [PEANUT] tflite-micro/tensorflow/lite/kernels/internal/common.cc acc = MultiplyByQuantizedMultiplier( acc, output_multiplier[output_channel], output_shift[output_channel]); acc += output_offset; + // [PEANUT] Clamp output acc = std::max(acc, output_activation_min); acc = std::min(acc, output_activation_max); output_data[Offset(output_shape, batch, out_y, out_x, From 3b24e0c5c01c9fc256286fef087be798a386ca28 Mon Sep 17 00:00:00 2001 From: JaimeHW Date: Mon, 14 Apr 2025 16:37:54 -0500 Subject: [PATCH 05/86] Add vector intrinsiscs convolution implementation --- .../micro/kernels/riscv_vector/conv_rvv.cc | 244 ++++++++++++++++++ 1 file changed, 244 insertions(+) diff --git a/tensorflow/lite/micro/kernels/riscv_vector/conv_rvv.cc b/tensorflow/lite/micro/kernels/riscv_vector/conv_rvv.cc index e69de29bb2d..e6e14e6feae 100644 --- a/tensorflow/lite/micro/kernels/riscv_vector/conv_rvv.cc +++ b/tensorflow/lite/micro/kernels/riscv_vector/conv_rvv.cc @@ -0,0 +1,244 @@ +#include +#include +#include +#include +#include + +#include + +__attribute__((hot)) +void convolution_hwc_ohwi_rvv( + const uint8_t* input_data, + const uint16_t input_height, + const uint16_t input_width, + const uint16_t input_channels, + const int32_t input_offset, + const int8_t* filter_data, + const uint16_t filter_height, + const uint16_t filter_width, + const int32_t* bias_data, + uint8_t* output_data, + const uint16_t output_height, + const uint16_t output_width, + const uint16_t output_channels, + const int32_t output_offset, + const int32_t* output_multiplier, + const int32_t* output_shift, + const uint16_t 
stride_height, + const uint16_t stride_width, + const uint16_t pad_height, + const uint16_t pad_width) +{ + assert(input_data != nullptr); + assert(filter_data != nullptr); + assert(bias_data != nullptr); + assert(output_data != nullptr); + assert(output_multiplier != nullptr); + assert(output_shift != nullptr); + assert(input_height > 0); + assert(input_width > 0); + assert(input_channels > 0); + assert(filter_height > 0); + assert(filter_width > 0); + assert(output_height > 0); + assert(output_width > 0); + assert(output_channels > 0); + assert(stride_height > 0); + assert(stride_width > 0); + assert(input_offset >= 0 && input_offset <= 255); + + // Pre-calculate strides and kernel plane size for efficient access + const size_t input_row_stride = (size_t)input_width * input_channels; + const size_t output_row_stride = (size_t)output_width * output_channels; + const size_t filter_kernel_plane_size = (size_t)filter_height * filter_width * input_channels; + + // Define activation clamping limits based on output type + const int32_t output_activation_min_i32 = static_cast(std::numeric_limits::min()); + const int32_t output_activation_max_i32 = static_cast(std::numeric_limits::max()); + + // Set the default rounding mode for fixed-point vector instructions + const unsigned int default_vxrm = __RISCV_VXRM_RNU; + + // Iterate through each output channel + for (int out_c = 0; out_c < output_channels; ++out_c) + { + // Calculate the sum of filter weights for the current channel for offset correction + int32_t filter_sum = 0; + const int8_t* filter_start_for_channel = filter_data + (size_t)out_c * filter_kernel_plane_size; + for (size_t i = 0; i < filter_kernel_plane_size; ++i) + { + filter_sum += filter_start_for_channel[i]; + } + + // Pre-calculate per-channel constants for bias, quantization, and correction + const int32_t current_offset_correction = input_offset * filter_sum; + const int32_t current_bias = bias_data[out_c]; + const int32_t current_output_multiplier = 
output_multiplier[out_c]; + const int32_t current_output_shift = output_shift[out_c]; + + // Determine requantization shifts and rounding offset + const int32_t left_shift = std::max((int32_t)0, current_output_shift); + const int32_t right_shift = std::max((int32_t)0, -current_output_shift); + const int32_t rounding_offset = (right_shift > 0) ? (1 << (right_shift - 1)) : 0; + + // Calculate saturation limits for intermediate requantization steps + const int32_t add_rounding_limit = INT32_MAX - rounding_offset; + const int32_t add_offset_limit_pos = INT32_MAX - output_offset; + const int32_t add_offset_limit_neg = INT32_MIN - output_offset; + const int32_t left_shift_limit_pos = (left_shift < 31) ? (INT32_MAX >> left_shift) : 0; + const int32_t left_shift_limit_neg = (left_shift < 31) ? (INT32_MIN >> left_shift) : -1; + + // Iterate through each output row + for (int out_y = 0; out_y < output_height; ++out_y) + { + // Calculate the starting input row corresponding to the current output row + const int in_y_origin = (out_y * stride_height) - pad_height; + + // Process output row strip-by-strip + size_t current_out_x = 0; + while (current_out_x < output_width) + { + // Set the vector length for the current strip + const size_t vl = __riscv_vsetvl_e32m8(output_width - current_out_x); + + // Initialize the accumulator vector with the bias for the current channel + vint32m8_t v_acc = __riscv_vmv_v_x_i32m8(current_bias, vl); + + // Iterate through the filter kernel height + for (int k_y = 0; k_y < filter_height; ++k_y) + { + // Calculate the current input row index and skip if out of bounds (padding) + const int in_y = in_y_origin + k_y; + if (in_y < 0 || in_y >= input_height) continue; + + // Iterate through the filter kernel width + for (int k_x = 0; k_x < filter_width; ++k_x) + { + // Calculate the vector of input x coordinates for the current strip + const int32_t in_x_origin_for_k = (int32_t)k_x - pad_width; + vuint32m8_t v_lane_indices = __riscv_vid_v_u32m8(vl); + 
vuint32m8_t v_out_x_indices = __riscv_vadd_vx_u32m8(v_lane_indices, current_out_x, vl); + vuint32m8_t v_in_x_base = __riscv_vmul_vx_u32m8(v_out_x_indices, stride_width, vl); + vuint32m8_t v_in_x_u32_temp = __riscv_vadd_vx_u32m8(v_in_x_base, in_x_origin_for_k, vl); + vint32m8_t v_in_x_i32 = __riscv_vreinterpret_v_u32m8_i32m8(v_in_x_u32_temp); + + // Generate a mask for valid input x coordinates (handling horizontal padding) + vbool4_t v_mask_x_ge_0 = __riscv_vmsge_vx_i32m8_b4(v_in_x_i32, 0, vl); + vbool4_t v_mask_x_lt_w = __riscv_vmslt_vx_i32m8_b4(v_in_x_i32, input_width, vl); + vbool4_t v_mask_valid_x = __riscv_vmand_mm_b4(v_mask_x_ge_0, v_mask_x_lt_w, vl); + + // Calculate the input x offset scaled by the number of channels + vuint32m8_t v_in_x_u32 = __riscv_vreinterpret_v_i32m8_u32m8(v_in_x_i32); + vuint32m8_t v_in_x_ch_offset = __riscv_vmul_vx_u32m8(v_in_x_u32, input_channels, vl); + + // Get the base pointer for the filter weights for this kernel position + const int8_t* filter_ptr_base = filter_data + + (size_t)out_c * filter_kernel_plane_size + + (size_t)k_y * filter_width * input_channels + + (size_t)k_x * input_channels; + + // Iterate through the input channels, performing MAC operations + for (int in_c = 0; in_c < input_channels; ++in_c) + { + // Skip MAC if filter weight is zero + const int8_t filter_val = filter_ptr_base[in_c]; + if (filter_val == 0) + continue; + + // Calculate the vector of byte offsets into the input data + uint32_t base_offset_for_row_ch = (uint32_t)in_y * input_row_stride + in_c; + vuint32m8_t v_byte_offset_u32 = __riscv_vadd_vx_u32m8(v_in_x_ch_offset, base_offset_for_row_ch, vl); + + // Load input data elements using indexed load + vuint8m2_t v_loaded_raw = __riscv_vloxei32_v_u8m2( + input_data, + v_byte_offset_u32, + vl); + + // Create a vector of input zero-points + uint8_t input_zero_point_u8 = (uint8_t)input_offset; + vuint8m2_t v_zero_points = __riscv_vmv_v_x_u8m2(input_zero_point_u8, vl); + + // Merge loaded data with 
zero-points based on the padding mask + vuint8m2_t v_input_u8 = __riscv_vmerge_vvm_u8m2( + v_loaded_raw, + v_zero_points, + v_mask_valid_x, + vl); + + // Sign-extend input from 8-bit to 16-bit for widening MAC + vint8m2_t v_input_i8 = __riscv_vreinterpret_v_u8m2_i8m2(v_input_u8); + vint16m4_t v_input_i16 = __riscv_vsext_vf2_i16m4(v_input_i8, vl); + + // Perform widening multiply-accumulate operation + v_acc = __riscv_vwmacc_vx_i32m8(v_acc, + filter_val, + v_input_i16, + vl); + } + } + } + + // Apply the input offset correction term to the accumulator + v_acc = __riscv_vsub_vx_i32m8(v_acc, current_offset_correction, vl); + + // Multiply by output multiplier (high part) + vint32m8_t v_requant_stage1 = __riscv_vmulh_vx_i32m8(v_acc, current_output_multiplier, vl); + + // Declare a temporary mask variable for requantization saturation checks + vbool4_t v_temp_mask_b4; + + // Apply rounding offset and right shift with saturation + if (right_shift > 0) + { + v_temp_mask_b4 = __riscv_vmsgt_vx_i32m8_b4(v_requant_stage1, add_rounding_limit, vl); + vint32m8_t v_added_round = __riscv_vadd_vx_i32m8(v_requant_stage1, rounding_offset, vl); + v_requant_stage1 = __riscv_vmerge_vxm_i32m8(v_added_round, INT32_MAX, v_temp_mask_b4, vl); + v_requant_stage1 = __riscv_vsra_vx_i32m8(v_requant_stage1, right_shift, vl); + } + + // Apply left shift with saturation if needed + if (left_shift > 0) + { + v_temp_mask_b4 = __riscv_vmsgt_vx_i32m8_b4(v_requant_stage1, left_shift_limit_pos, vl); + vbool4_t v_temp_mask_b4_neg = __riscv_vmslt_vx_i32m8_b4(v_requant_stage1, left_shift_limit_neg, vl); + vint32m8_t v_shifted = __riscv_vsll_vx_i32m8(v_requant_stage1, left_shift, vl); + v_shifted = __riscv_vmerge_vxm_i32m8(v_shifted, INT32_MAX, v_temp_mask_b4, vl); + v_shifted = __riscv_vmerge_vxm_i32m8(v_shifted, INT32_MIN, v_temp_mask_b4_neg, vl); + v_requant_stage1 = v_shifted; + } + + // Add output offset with saturation + v_temp_mask_b4 = __riscv_vmsgt_vx_i32m8_b4(v_requant_stage1, add_offset_limit_pos, 
vl); + vbool4_t v_temp_mask_b4_neg = __riscv_vmslt_vx_i32m8_b4(v_requant_stage1, add_offset_limit_neg, vl); + vint32m8_t v_requant_stage2 = __riscv_vadd_vx_i32m8(v_requant_stage1, output_offset, vl); + v_requant_stage2 = __riscv_vmerge_vxm_i32m8(v_requant_stage2, INT32_MAX, v_temp_mask_b4, vl); + v_requant_stage2 = __riscv_vmerge_vxm_i32m8(v_requant_stage2, INT32_MIN, v_temp_mask_b4_neg, vl); + + // Clamp the result to the final activation range [0, 255] + vint32m8_t v_clamped_i32 = __riscv_vmax_vx_i32m8(v_requant_stage2, output_activation_min_i32, vl); + v_clamped_i32 = __riscv_vmin_vx_i32m8(v_clamped_i32, output_activation_max_i32, vl); + + // Narrow the 32-bit results down to 8-bit unsigned integers + vint16m4_t v_narrowed_i16 = __riscv_vnsra_wx_i16m4(v_clamped_i32, 0, vl); + vuint16m4_t v_narrowed_u16 = __riscv_vreinterpret_v_i16m4_u16m4(v_narrowed_i16); + vuint8m2_t v_output_u8 = __riscv_vnclipu_wx_u8m2(v_narrowed_u16, 0, default_vxrm, vl); + + // Calculate the base pointer for storing the output strip + uint8_t* output_base_ptr = output_data + + (size_t)out_y * output_row_stride + + current_out_x * output_channels + + out_c; + + // Define the byte stride for storing into the HWC output layout + ptrdiff_t byte_stride = (ptrdiff_t)output_channels; + + // Store the computed 8-bit output values using strided store + __riscv_vsse8_v_u8m2(output_base_ptr, byte_stride, v_output_u8, vl); + + // Move to the next horizontal strip + current_out_x += vl; + } + } + } +} \ No newline at end of file From 1632652221bb339b4837967e7a830ed9339c83d3 Mon Sep 17 00:00:00 2001 From: JaimeHW Date: Mon, 14 Apr 2025 17:38:15 -0500 Subject: [PATCH 06/86] Add empty makefile for building the vector intrinsics implementations --- .../lite/micro/tools/make/targets/riscv32_vector_makefile.inc | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 tensorflow/lite/micro/tools/make/targets/riscv32_vector_makefile.inc diff --git 
a/tensorflow/lite/micro/tools/make/targets/riscv32_vector_makefile.inc b/tensorflow/lite/micro/tools/make/targets/riscv32_vector_makefile.inc new file mode 100644 index 00000000000..e69de29bb2d From a98604f2eb68691c0b574a3b0150549b28c4051c Mon Sep 17 00:00:00 2001 From: JaimeHW Date: Tue, 15 Apr 2025 09:50:10 -0500 Subject: [PATCH 07/86] Build and test our 2d convolution implementation with tflite micro --- .gitignore | 1 + tensorflow/lite/micro/kernels/conv.cc | 84 +++++ .../micro/kernels/riscv_vector/conv_rvv.cc | 302 ++++++++++++------ .../micro/kernels/riscv_vector/conv_rvv.h | 26 +- .../lite/micro/testing/test_with_spike.sh | 29 ++ .../tools/make/ext_libs/riscv_vector.inc | 0 .../make/targets/riscv32_vector_makefile.inc | 63 ++++ 7 files changed, 400 insertions(+), 105 deletions(-) create mode 100755 tensorflow/lite/micro/testing/test_with_spike.sh create mode 100644 tensorflow/lite/micro/tools/make/ext_libs/riscv_vector.inc diff --git a/.gitignore b/.gitignore index 1d1c4dec664..6e2b2680b38 100644 --- a/.gitignore +++ b/.gitignore @@ -7,6 +7,7 @@ *__pycache__* venv gen +.venv # Ignore the directory in which `clangd` stores its local index. /.cache/ diff --git a/tensorflow/lite/micro/kernels/conv.cc b/tensorflow/lite/micro/kernels/conv.cc index 7be915ab51e..3d0a5557a75 100644 --- a/tensorflow/lite/micro/kernels/conv.cc +++ b/tensorflow/lite/micro/kernels/conv.cc @@ -24,6 +24,10 @@ limitations under the License. 
#include "tensorflow/lite/micro/kernels/kernel_util.h" #include "tensorflow/lite/micro/micro_log.h" +#if defined(TFLM_USE_RISCV_VECTOR) +#include "tensorflow/lite/micro/kernels/riscv_vector/conv_rvv.h" +#endif + namespace tflite { namespace { @@ -151,6 +155,85 @@ TfLiteStatus ConvEval(TfLiteContext* context, TfLiteNode* node) { break; } case kTfLiteInt8: { +#if defined(TFLM_USE_RISCV_VECTOR) +#ifdef USE_TFLM_COMPRESSION + TFLITE_DCHECK(weights_comp_td == nullptr && bias_comp_td == nullptr); + if (weights_comp_td != nullptr || bias_comp_td != nullptr) + { + MicroPrintf("ERROR: RVV path does not support compressed weights/bias yet."); + return kTfLiteError; + } +#endif // USE_TFLM_COMPRESSION + const TfLiteConvParams* conv_params_rvv = // Use different name to avoid shadowing + static_cast(node->builtin_data); + const TfLiteEvalTensor* input_rvv = + tflite::micro::GetEvalInput(context, node, kConvInputTensor); + const TfLiteEvalTensor* filter_rvv = + tflite::micro::GetEvalInput(context, node, kConvWeightsTensor); + const TfLiteEvalTensor* bias_rvv = + (NumInputs(node) == 3) + ? tflite::micro::GetEvalInput(context, node, kConvBiasTensor) + : nullptr; + TfLiteEvalTensor* output_rvv = + tflite::micro::GetEvalOutput(context, node, kConvOutputTensor); + + if (bias_rvv != nullptr && bias_rvv->type != kTfLiteInt32) { + MicroPrintf("RVV kernel requires Int32 bias, got %s", TfLiteTypeGetName(bias_rvv->type)); + return kTfLiteError; + } + + const int8_t* input_data_ptr = tflite::micro::GetTensorData(input_rvv); + const int8_t* filter_data_ptr = tflite::micro::GetTensorData(filter_rvv); + const int32_t* bias_data_ptr = + (bias_rvv) ? 
tflite::micro::GetTensorData(bias_rvv) : nullptr; + int8_t* output_data_ptr = tflite::micro::GetTensorData(output_rvv); + + const int32_t input_zero_point_arg = data.input_zero_point; + const int32_t output_zero_point_arg = data.output_zero_point; + const int32_t* output_multiplier_ptr = data.per_channel_output_multiplier; + const int32_t* output_shift_ptr = data.per_channel_output_shift; + + const uint16_t input_height = static_cast(input_rvv->dims->data[1]); + const uint16_t input_width = static_cast(input_rvv->dims->data[2]); + const uint16_t input_channels = static_cast(input_rvv->dims->data[3]); + + const uint16_t filter_height = static_cast(filter_rvv->dims->data[1]); + const uint16_t filter_width = static_cast(filter_rvv->dims->data[2]); + + const uint16_t output_channels = static_cast(output_rvv->dims->data[3]); + + const uint16_t output_height = static_cast(output_rvv->dims->data[1]); + const uint16_t output_width = static_cast(output_rvv->dims->data[2]); + + const uint16_t stride_height = static_cast(conv_params_rvv->stride_height); + const uint16_t stride_width = static_cast(conv_params_rvv->stride_width); + const uint16_t pad_height = static_cast(data.padding.height); + const uint16_t pad_width = static_cast(data.padding.width); + + // Call the optimized RVV kernel + convolution_hwc_ohwi_rvv( + input_data_ptr, + input_height, + input_width, + input_channels, + input_zero_point_arg, + filter_data_ptr, + filter_height, + filter_width, + bias_data_ptr, // Pass bias pointer (can be null) + output_data_ptr, + output_height, + output_width, + output_channels, + output_zero_point_arg, + output_multiplier_ptr, + output_shift_ptr, + stride_height, + stride_width, + pad_height, + pad_width + ); +#else // defined(TFLM_USE_RISCV_VECTOR) reference_integer_ops::ConvPerChannel( ConvParamsQuantized(params, data), data.per_channel_output_multiplier, data.per_channel_output_shift, @@ -171,6 +254,7 @@ TfLiteStatus ConvEval(TfLiteContext* context, TfLiteNode* node) { 
#endif // USE_TFLM_COMPRESSION tflite::micro::GetTensorShape(output), tflite::micro::GetTensorData(output)); +#endif break; } default: diff --git a/tensorflow/lite/micro/kernels/riscv_vector/conv_rvv.cc b/tensorflow/lite/micro/kernels/riscv_vector/conv_rvv.cc index e6e14e6feae..883f02d13e3 100644 --- a/tensorflow/lite/micro/kernels/riscv_vector/conv_rvv.cc +++ b/tensorflow/lite/micro/kernels/riscv_vector/conv_rvv.cc @@ -6,9 +6,123 @@ #include +// TFLite Micro reference +// Fixed-point per-channel-quantization convolution reference kernel. +// inline void ConvPerChannel( +// const ConvParams& params, const int32_t* output_multiplier, +// const int32_t* output_shift, const RuntimeShape& input_shape, +// const int8_t* input_data, const RuntimeShape& filter_shape, +// const int8_t* filter_data, const RuntimeShape& bias_shape, +// const int32_t* bias_data, const RuntimeShape& output_shape, +// int8_t* output_data) { +// // Get parameters. +// const int32_t input_offset = params.input_offset; // r = s(q - Z) +// const int stride_width = params.stride_width; +// const int stride_height = params.stride_height; +// const int dilation_width_factor = params.dilation_width_factor; +// const int dilation_height_factor = params.dilation_height_factor; +// const int pad_width = params.padding_values.width; +// const int pad_height = params.padding_values.height; +// const int32_t output_offset = params.output_offset; + +// // Set min and max value of the output. +// const int32_t output_activation_min = params.quantized_activation_min; +// const int32_t output_activation_max = params.quantized_activation_max; + +// // Consistency check. 
+// TFLITE_DCHECK_LE(output_activation_min, output_activation_max); +// TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4); +// TFLITE_DCHECK_EQ(filter_shape.DimensionsCount(), 4); +// TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4); +// const int batches = MatchingDim(input_shape, 0, output_shape, 0); +// const int input_depth = input_shape.Dims(3); +// const int output_depth = MatchingDim(filter_shape, 0, output_shape, 3); +// if (bias_data) { +// TFLITE_DCHECK_EQ(bias_shape.FlatSize(), output_depth); +// } + +// // Check dimensions of the tensors. +// const int input_height = input_shape.Dims(1); +// const int input_width = input_shape.Dims(2); +// const int filter_height = filter_shape.Dims(1); +// const int filter_width = filter_shape.Dims(2); +// const int filter_input_depth = filter_shape.Dims(3); +// const int groups = input_depth / filter_input_depth; +// TFLITE_DCHECK_NE(groups, 0); +// TFLITE_DCHECK_EQ(input_depth % filter_input_depth, 0); +// const int filters_per_group = output_depth / groups; +// TFLITE_DCHECK_NE(filters_per_group, 0); +// const int output_height = output_shape.Dims(1); +// const int output_width = output_shape.Dims(2); +// for (int batch = 0; batch < batches; ++batch) { +// for (int out_y = 0; out_y < output_height; ++out_y) { +// const int in_y_origin = (out_y * stride_height) - pad_height; +// for (int out_x = 0; out_x < output_width; ++out_x) { +// const int in_x_origin = (out_x * stride_width) - pad_width; +// for (int out_channel = 0; out_channel < output_depth; ++out_channel) { +// auto group = out_channel / filters_per_group; +// int32_t acc = 0; +// for (int filter_y = 0; filter_y < filter_height; ++filter_y) { +// const int in_y = in_y_origin + dilation_height_factor * filter_y; +// for (int filter_x = 0; filter_x < filter_width; ++filter_x) { +// const int in_x = in_x_origin + dilation_width_factor * filter_x; + +// // Zero padding by omitting the areas outside the image. 
+// const bool is_point_inside_image = +// (in_x >= 0) && (in_x < input_width) && (in_y >= 0) && +// (in_y < input_height); + +// if (!is_point_inside_image) { +// continue; +// } + +// for (int in_channel = 0; in_channel < filter_input_depth; +// ++in_channel) { +// int32_t input_val = +// input_data[Offset(input_shape, batch, in_y, in_x, +// in_channel + group * filter_input_depth)]; +// int32_t filter_val = filter_data[Offset( +// filter_shape, out_channel, filter_y, filter_x, in_channel)]; +// // Accumulate with 32 bits accumulator. +// // In the nudging process during model quantization, we force +// // real value of 0.0 be represented by a quantized value. This +// // guarantees that the input_offset is a int8_t, even though +// // it is represented using int32_t. int32_t += int8_t * +// // (int8_t - int8_t) so the highest value we can get from each +// // accumulation is [-127, 127] * ([-128, 127] - +// // [-128, 127]), which is [-32512, 32512]. log2(32512) +// // = 14.98, which means we can accumulate at least 2^16 +// // multiplications without overflow. The accumulator is +// // applied to a filter so the accumulation logic will hold as +// // long as the filter size (filter_y * filter_x * in_channel) +// // does not exceed 2^16, which is the case in all the models +// // we have seen so far. +// // TODO(b/174275578): Add a check to make sure the +// // accumulator depth is smaller than 2^16. 
+// acc += filter_val * (input_val + input_offset); +// } +// } +// } + +// if (bias_data) { +// acc += bias_data[out_channel]; +// } +// acc = MultiplyByQuantizedMultiplier( +// acc, output_multiplier[out_channel], output_shift[out_channel]); +// acc += output_offset; +// acc = std::max(acc, output_activation_min); +// acc = std::min(acc, output_activation_max); +// output_data[Offset(output_shape, batch, out_y, out_x, out_channel)] = +// static_cast(acc); +// } +// } +// } +// } +// } + __attribute__((hot)) void convolution_hwc_ohwi_rvv( - const uint8_t* input_data, + const int8_t* input_data, const uint16_t input_height, const uint16_t input_width, const uint16_t input_channels, @@ -17,7 +131,7 @@ void convolution_hwc_ohwi_rvv( const uint16_t filter_height, const uint16_t filter_width, const int32_t* bias_data, - uint8_t* output_data, + int8_t* output_data, const uint16_t output_height, const uint16_t output_width, const uint16_t output_channels, @@ -27,216 +141,200 @@ void convolution_hwc_ohwi_rvv( const uint16_t stride_height, const uint16_t stride_width, const uint16_t pad_height, - const uint16_t pad_width) + const uint16_t pad_width, + const int32_t output_activation_min, + const int32_t output_activation_max, + int dilation_height_factor, + int dilation_width_factor +) { assert(input_data != nullptr); assert(filter_data != nullptr); - assert(bias_data != nullptr); + assert(output_data != nullptr); assert(output_multiplier != nullptr); assert(output_shift != nullptr); - assert(input_height > 0); - assert(input_width > 0); + + + assert(input_height > 0); + assert(input_width > 0); assert(input_channels > 0); - assert(filter_height > 0); + assert(filter_height > 0); assert(filter_width > 0); - assert(output_height > 0); - assert(output_width > 0); + assert(output_height > 0); + assert(output_width > 0); assert(output_channels > 0); - assert(stride_height > 0); + assert(stride_height > 0); assert(stride_width > 0); - assert(input_offset >= 0 && input_offset <= 
255); - // Pre-calculate strides and kernel plane size for efficient access + assert(input_offset >= -128 && input_offset <= 127); + assert(output_offset >= -128 && output_offset <= 127); + + const size_t input_row_stride = (size_t)input_width * input_channels; const size_t output_row_stride = (size_t)output_width * output_channels; const size_t filter_kernel_plane_size = (size_t)filter_height * filter_width * input_channels; - // Define activation clamping limits based on output type - const int32_t output_activation_min_i32 = static_cast(std::numeric_limits::min()); - const int32_t output_activation_max_i32 = static_cast(std::numeric_limits::max()); + const int32_t output_activation_min_i32 = output_activation_min; + const int32_t output_activation_max_i32 = output_activation_max; - // Set the default rounding mode for fixed-point vector instructions const unsigned int default_vxrm = __RISCV_VXRM_RNU; - // Iterate through each output channel for (int out_c = 0; out_c < output_channels; ++out_c) { - // Calculate the sum of filter weights for the current channel for offset correction - int32_t filter_sum = 0; - const int8_t* filter_start_for_channel = filter_data + (size_t)out_c * filter_kernel_plane_size; - for (size_t i = 0; i < filter_kernel_plane_size; ++i) - { - filter_sum += filter_start_for_channel[i]; - } - // Pre-calculate per-channel constants for bias, quantization, and correction - const int32_t current_offset_correction = input_offset * filter_sum; - const int32_t current_bias = bias_data[out_c]; + + const int32_t current_bias = (bias_data != nullptr) ? bias_data[out_c] : 0; const int32_t current_output_multiplier = output_multiplier[out_c]; const int32_t current_output_shift = output_shift[out_c]; - // Determine requantization shifts and rounding offset + const int32_t left_shift = std::max((int32_t)0, current_output_shift); const int32_t right_shift = std::max((int32_t)0, -current_output_shift); + const int32_t rounding_offset = (right_shift > 0) ? 
(1 << (right_shift - 1)) : 0; - // Calculate saturation limits for intermediate requantization steps + const int32_t add_rounding_limit = INT32_MAX - rounding_offset; + const int32_t add_offset_limit_pos = INT32_MAX - output_offset; const int32_t add_offset_limit_neg = INT32_MIN - output_offset; + const int32_t left_shift_limit_pos = (left_shift < 31) ? (INT32_MAX >> left_shift) : 0; const int32_t left_shift_limit_neg = (left_shift < 31) ? (INT32_MIN >> left_shift) : -1; - // Iterate through each output row + for (int out_y = 0; out_y < output_height; ++out_y) { - // Calculate the starting input row corresponding to the current output row const int in_y_origin = (out_y * stride_height) - pad_height; - // Process output row strip-by-strip size_t current_out_x = 0; while (current_out_x < output_width) { - // Set the vector length for the current strip const size_t vl = __riscv_vsetvl_e32m8(output_width - current_out_x); - // Initialize the accumulator vector with the bias for the current channel - vint32m8_t v_acc = __riscv_vmv_v_x_i32m8(current_bias, vl); + vint32m8_t v_acc = __riscv_vmv_v_x_i32m8(0, vl); - // Iterate through the filter kernel height for (int k_y = 0; k_y < filter_height; ++k_y) { - // Calculate the current input row index and skip if out of bounds (padding) - const int in_y = in_y_origin + k_y; + const int in_y = in_y_origin + k_y * dilation_height_factor; + if (in_y < 0 || in_y >= input_height) continue; - // Iterate through the filter kernel width for (int k_x = 0; k_x < filter_width; ++k_x) { - // Calculate the vector of input x coordinates for the current strip - const int32_t in_x_origin_for_k = (int32_t)k_x - pad_width; vuint32m8_t v_lane_indices = __riscv_vid_v_u32m8(vl); vuint32m8_t v_out_x_indices = __riscv_vadd_vx_u32m8(v_lane_indices, current_out_x, vl); vuint32m8_t v_in_x_base = __riscv_vmul_vx_u32m8(v_out_x_indices, stride_width, vl); - vuint32m8_t v_in_x_u32_temp = __riscv_vadd_vx_u32m8(v_in_x_base, in_x_origin_for_k, vl); - 
vint32m8_t v_in_x_i32 = __riscv_vreinterpret_v_u32m8_i32m8(v_in_x_u32_temp); + const int32_t in_x_origin_for_k = (int32_t)(k_x * dilation_width_factor) - pad_width; + vint32m8_t v_in_x_i32 = __riscv_vadd_vx_i32m8(__riscv_vreinterpret_v_u32m8_i32m8(v_in_x_base), in_x_origin_for_k, vl); - // Generate a mask for valid input x coordinates (handling horizontal padding) vbool4_t v_mask_x_ge_0 = __riscv_vmsge_vx_i32m8_b4(v_in_x_i32, 0, vl); vbool4_t v_mask_x_lt_w = __riscv_vmslt_vx_i32m8_b4(v_in_x_i32, input_width, vl); vbool4_t v_mask_valid_x = __riscv_vmand_mm_b4(v_mask_x_ge_0, v_mask_x_lt_w, vl); - // Calculate the input x offset scaled by the number of channels vuint32m8_t v_in_x_u32 = __riscv_vreinterpret_v_i32m8_u32m8(v_in_x_i32); vuint32m8_t v_in_x_ch_offset = __riscv_vmul_vx_u32m8(v_in_x_u32, input_channels, vl); - // Get the base pointer for the filter weights for this kernel position const int8_t* filter_ptr_base = filter_data + (size_t)out_c * filter_kernel_plane_size + (size_t)k_y * filter_width * input_channels + (size_t)k_x * input_channels; - // Iterate through the input channels, performing MAC operations for (int in_c = 0; in_c < input_channels; ++in_c) { - // Skip MAC if filter weight is zero const int8_t filter_val = filter_ptr_base[in_c]; - if (filter_val == 0) + + if (filter_val == 0) continue; - // Calculate the vector of byte offsets into the input data uint32_t base_offset_for_row_ch = (uint32_t)in_y * input_row_stride + in_c; vuint32m8_t v_byte_offset_u32 = __riscv_vadd_vx_u32m8(v_in_x_ch_offset, base_offset_for_row_ch, vl); - // Load input data elements using indexed load - vuint8m2_t v_loaded_raw = __riscv_vloxei32_v_u8m2( + vint8m2_t v_loaded_input_i8 = __riscv_vloxei32_v_i8m2_m( + v_mask_valid_x, input_data, v_byte_offset_u32, vl); - // Create a vector of input zero-points - uint8_t input_zero_point_u8 = (uint8_t)input_offset; - vuint8m2_t v_zero_points = __riscv_vmv_v_x_u8m2(input_zero_point_u8, vl); + vint16m4_t v_input_i16 = 
__riscv_vsext_vf2_i16m4(v_loaded_input_i8, vl); - // Merge loaded data with zero-points based on the padding mask - vuint8m2_t v_input_u8 = __riscv_vmerge_vvm_u8m2( - v_loaded_raw, - v_zero_points, - v_mask_valid_x, - vl); + vint16m4_t v_input_offset_i16 = __riscv_vmv_v_x_i16m4((int16_t)input_offset, vl); + vint16m4_t v_input_i16_add_zp = __riscv_vadd_vv_i16m4(v_input_i16, v_input_offset_i16, vl); - // Sign-extend input from 8-bit to 16-bit for widening MAC - vint8m2_t v_input_i8 = __riscv_vreinterpret_v_u8m2_i8m2(v_input_u8); - vint16m4_t v_input_i16 = __riscv_vsext_vf2_i16m4(v_input_i8, vl); + vint16m4_t v_zeros_i16 = __riscv_vmv_v_x_i16m4(0, vl); + + vint16m4_t v_input_i16_padded = __riscv_vmerge_vvm_i16m4( + v_zeros_i16, + v_input_i16_add_zp, + v_mask_valid_x, + vl); - // Perform widening multiply-accumulate operation v_acc = __riscv_vwmacc_vx_i32m8(v_acc, filter_val, - v_input_i16, + v_input_i16_padded, vl); } } } - // Apply the input offset correction term to the accumulator - v_acc = __riscv_vsub_vx_i32m8(v_acc, current_offset_correction, vl); - - // Multiply by output multiplier (high part) - vint32m8_t v_requant_stage1 = __riscv_vmulh_vx_i32m8(v_acc, current_output_multiplier, vl); + if (bias_data != nullptr) { + v_acc = __riscv_vadd_vx_i32m8(v_acc, current_bias, vl); + } - // Declare a temporary mask variable for requantization saturation checks - vbool4_t v_temp_mask_b4; + vint32m8_t v_requant_stage1 = __riscv_vmulh_vx_i32m8(v_acc, current_output_multiplier, vl); - // Apply rounding offset and right shift with saturation - if (right_shift > 0) + if (right_shift > 0) { - v_temp_mask_b4 = __riscv_vmsgt_vx_i32m8_b4(v_requant_stage1, add_rounding_limit, vl); - vint32m8_t v_added_round = __riscv_vadd_vx_i32m8(v_requant_stage1, rounding_offset, vl); - v_requant_stage1 = __riscv_vmerge_vxm_i32m8(v_added_round, INT32_MAX, v_temp_mask_b4, vl); - v_requant_stage1 = __riscv_vsra_vx_i32m8(v_requant_stage1, right_shift, vl); - } + vbool4_t v_mask_add_round_ovf = 
__riscv_vmsgt_vx_i32m8_b4(v_requant_stage1, add_rounding_limit, vl); + vint32m8_t v_add_round_sat = __riscv_vmerge_vxm_i32m8(v_requant_stage1, INT32_MAX, v_mask_add_round_ovf, vl); + + vint32m8_t v_added_round = __riscv_vadd_vx_i32m8(v_add_round_sat, rounding_offset, vl); - // Apply left shift with saturation if needed - if (left_shift > 0) + v_requant_stage1 = __riscv_vsra_vx_i32m8(v_added_round, right_shift, vl); + } + + if (left_shift > 0) { - v_temp_mask_b4 = __riscv_vmsgt_vx_i32m8_b4(v_requant_stage1, left_shift_limit_pos, vl); - vbool4_t v_temp_mask_b4_neg = __riscv_vmslt_vx_i32m8_b4(v_requant_stage1, left_shift_limit_neg, vl); + vbool4_t v_mask_lshift_ovf_pos = __riscv_vmsgt_vx_i32m8_b4(v_requant_stage1, left_shift_limit_pos, vl); + vbool4_t v_mask_lshift_ovf_neg = __riscv_vmslt_vx_i32m8_b4(v_requant_stage1, left_shift_limit_neg, vl); + vint32m8_t v_shifted = __riscv_vsll_vx_i32m8(v_requant_stage1, left_shift, vl); - v_shifted = __riscv_vmerge_vxm_i32m8(v_shifted, INT32_MAX, v_temp_mask_b4, vl); - v_shifted = __riscv_vmerge_vxm_i32m8(v_shifted, INT32_MIN, v_temp_mask_b4_neg, vl); + + v_shifted = __riscv_vmerge_vxm_i32m8(v_shifted, INT32_MAX, v_mask_lshift_ovf_pos, vl); + v_shifted = __riscv_vmerge_vxm_i32m8(v_shifted, INT32_MIN, v_mask_lshift_ovf_neg, vl); v_requant_stage1 = v_shifted; } - // Add output offset with saturation - v_temp_mask_b4 = __riscv_vmsgt_vx_i32m8_b4(v_requant_stage1, add_offset_limit_pos, vl); - vbool4_t v_temp_mask_b4_neg = __riscv_vmslt_vx_i32m8_b4(v_requant_stage1, add_offset_limit_neg, vl); + vbool4_t v_mask_add_offset_ovf_pos = __riscv_vmsgt_vx_i32m8_b4(v_requant_stage1, add_offset_limit_pos, vl); + vbool4_t v_mask_add_offset_ovf_neg = __riscv_vmslt_vx_i32m8_b4(v_requant_stage1, add_offset_limit_neg, vl); + vint32m8_t v_requant_stage2 = __riscv_vadd_vx_i32m8(v_requant_stage1, output_offset, vl); - v_requant_stage2 = __riscv_vmerge_vxm_i32m8(v_requant_stage2, INT32_MAX, v_temp_mask_b4, vl); - v_requant_stage2 = 
__riscv_vmerge_vxm_i32m8(v_requant_stage2, INT32_MIN, v_temp_mask_b4_neg, vl); - // Clamp the result to the final activation range [0, 255] + v_requant_stage2 = __riscv_vmerge_vxm_i32m8(v_requant_stage2, INT32_MAX, v_mask_add_offset_ovf_pos, vl); + v_requant_stage2 = __riscv_vmerge_vxm_i32m8(v_requant_stage2, INT32_MIN, v_mask_add_offset_ovf_neg, vl); + vint32m8_t v_clamped_i32 = __riscv_vmax_vx_i32m8(v_requant_stage2, output_activation_min_i32, vl); v_clamped_i32 = __riscv_vmin_vx_i32m8(v_clamped_i32, output_activation_max_i32, vl); - // Narrow the 32-bit results down to 8-bit unsigned integers vint16m4_t v_narrowed_i16 = __riscv_vnsra_wx_i16m4(v_clamped_i32, 0, vl); - vuint16m4_t v_narrowed_u16 = __riscv_vreinterpret_v_i16m4_u16m4(v_narrowed_i16); - vuint8m2_t v_output_u8 = __riscv_vnclipu_wx_u8m2(v_narrowed_u16, 0, default_vxrm, vl); - // Calculate the base pointer for storing the output strip - uint8_t* output_base_ptr = output_data + + __riscv_csrw(CSR_VXRM, __RISCV_VXRM_RDN); + + vint8m2_t v_output_i8 = __riscv_vnclip_wx_i8m2(v_narrowed_i16, 0, default_vxrm, vl); + + int8_t* output_base_ptr = output_data + (size_t)out_y * output_row_stride + current_out_x * output_channels + out_c; - // Define the byte stride for storing into the HWC output layout - ptrdiff_t byte_stride = (ptrdiff_t)output_channels; + ptrdiff_t byte_stride = (ptrdiff_t)output_channels * sizeof(int8_t); - // Store the computed 8-bit output values using strided store - __riscv_vsse8_v_u8m2(output_base_ptr, byte_stride, v_output_u8, vl); + __riscv_vsse8_v_i8m2(output_base_ptr, + byte_stride, + v_output_i8, + vl); - // Move to the next horizontal strip current_out_x += vl; } } diff --git a/tensorflow/lite/micro/kernels/riscv_vector/conv_rvv.h b/tensorflow/lite/micro/kernels/riscv_vector/conv_rvv.h index df812e9f8c2..3efb5c0520d 100644 --- a/tensorflow/lite/micro/kernels/riscv_vector/conv_rvv.h +++ b/tensorflow/lite/micro/kernels/riscv_vector/conv_rvv.h @@ -5,6 +5,26 @@ #include #include -// 
#ifdef __cplusplus
-// extern "C" {
-// #endif
\ No newline at end of file
+void convolution_hwc_ohwi_rvv(
+    const int8_t* input_data,
+    const uint16_t input_height,
+    const uint16_t input_width,
+    const uint16_t input_channels,
+    const int32_t input_offset,
+    const int8_t* filter_data,
+    const uint16_t filter_height,
+    const uint16_t filter_width,
+    const int32_t* bias_data,
+    int8_t* output_data,
+    const uint16_t output_height,
+    const uint16_t output_width,
+    const uint16_t output_channels,
+    const int32_t output_offset,
+    const int32_t* output_multiplier,
+    const int32_t* output_shift,
+    const uint16_t stride_height,
+    const uint16_t stride_width,
+    const uint16_t pad_height,
+    const uint16_t pad_width);
+
+#endif
\ No newline at end of file
diff --git a/tensorflow/lite/micro/testing/test_with_spike.sh b/tensorflow/lite/micro/testing/test_with_spike.sh
new file mode 100755
index 00000000000..35d20e57456
--- /dev/null
+++ b/tensorflow/lite/micro/testing/test_with_spike.sh
@@ -0,0 +1,29 @@
+#!/bin/bash -e
+
+# Parameters:
+# ${1} - cross-compiled binary to run under spike (via riscv pk)
+# ${2} - string that is checked for pass/fail ("non_test_binary" skips the check)
+# ${3} - binary name, used to build the log path
+# ${4} - unused (kept for argument compatibility with other test scripts)
+# ${5} - target (riscv32_vector etc.)
+ +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +TFLM_ROOT_DIR=${SCRIPT_DIR}/../../../../ + +TEST_TMPDIR=/tmp/test_${5} +MICRO_LOG_PATH=${TEST_TMPDIR}/${3} +MICRO_LOG_FILENAME=${MICRO_LOG_PATH}/logs.txt + +mkdir -p ${MICRO_LOG_PATH} +spike --isa=rv32gcv ~/rv32imc_zve32x_zvl128b/riscv32-unknown-elf/bin/pk ${1} 2>&1 | tee ${MICRO_LOG_FILENAME} +if [[ ${2} != "non_test_binary" ]] +then + if grep -q "${2}" ${MICRO_LOG_FILENAME} + then + echo "Pass" + exit 0 + else + echo "Fail" + exit 1 + fi +fi diff --git a/tensorflow/lite/micro/tools/make/ext_libs/riscv_vector.inc b/tensorflow/lite/micro/tools/make/ext_libs/riscv_vector.inc new file mode 100644 index 00000000000..e69de29bb2d diff --git a/tensorflow/lite/micro/tools/make/targets/riscv32_vector_makefile.inc b/tensorflow/lite/micro/tools/make/targets/riscv32_vector_makefile.inc index e69de29bb2d..8ae7cbe0b2f 100644 --- a/tensorflow/lite/micro/tools/make/targets/riscv32_vector_makefile.inc +++ b/tensorflow/lite/micro/tools/make/targets/riscv32_vector_makefile.inc @@ -0,0 +1,63 @@ +# Settings for RISCV 32-bit toolchain. +TARGET_ARCH := riscv32 +TARGET_TOOLCHAIN_PREFIX := riscv32-unknown-elf- + +RISCV_ARCH := rv32imc_zve32x_zvl128b +RISCV_ABI := ilp32 +RISCV_CODE_MODEL := medany + +# Allow additional flags on the command line for debugging. 
+RISCV_EXTRA_CFLAGS := + +TARGET_DEFAULT_TOOLCHAIN_ROOT := $(HOME)/rv32imc_zve32x_zvl128b/bin/ +TARGET_TOOLCHAIN_ROOT := $(TARGET_DEFAULT_TOOLCHAIN_ROOT) +ifeq ($(TARGET_TOOLCHAIN_ROOT), $(TARGET_DEFAULT_TOOLCHAIN_ROOT)) + $(eval $(call add_third_party_download,$(RISCV_TOOLCHAIN_URL),$(RISCV_TOOLCHAIN_MD5),riscv_toolchain,)) +endif + +export PATH := $(TARGET_TOOLCHAIN_ROOT):$(PATH) + +PLATFORM_FLAGS = \ + -march=$(RISCV_ARCH) \ + -mabi=$(RISCV_ABI) \ + -mcmodel=$(RISCV_CODE_MODEL) \ + -mexplicit-relocs \ + -fno-builtin-printf \ + -DTF_LITE_MCU_DEBUG_LOG \ + -DTF_LITE_USE_GLOBAL_CMATH_FUNCTIONS \ + -funsigned-char \ + -fno-delete-null-pointer-checks \ + -fomit-frame-pointer \ + -DTFLM_USE_RISCV_VECTOR + +CXXFLAGS += $(PLATFORM_FLAGS) \ + -fpermissive \ + -fno-use-cxa-atexit \ + -DTF_LITE_USE_GLOBAL_MIN \ + -DTF_LITE_USE_GLOBAL_MAX + +CCFLAGS += $(PLATFORM_FLAGS) + +BUILD_TYPE := micro + +LDFLAGS += --specs=nano.specs + +# See http://b/15851472 for why memory arena threshold test is disabled. +EXCLUDED_TESTS := \ + $(TENSORFLOW_ROOT)tensorflow/lite/micro/memory_arena_threshold_test.cc + +MICROLITE_TEST_SRCS := $(filter-out $(EXCLUDED_TESTS), $(MICROLITE_TEST_SRCS)) + +CCFLAGS += $(RISCV_EXTRA_CFLAGS) +CXXFLAGS += $(RISCV_EXTRA_CFLAGS) + +# This disables the "linker relaxation" optimization, which produced incorrect code. +# TODO(b/279805615): Check whether this is fixed in newer versions of the toolchain. 
+LDFLAGS += -mno-relax +TEST_SCRIPT := $(TENSORFLOW_ROOT)tensorflow/lite/micro/testing/test_with_spike.sh +SIZE_SCRIPT := ${TENSORFLOW_ROOT}tensorflow/lite/micro/testing/size_riscv32_binary.sh + +include $(MAKEFILE_DIR)/ext_libs/eyalroz_printf.inc + +MICROLITE_CC_SRCS += \ + tensorflow/lite/micro/kernels/riscv_vector/conv_rvv.cc From e173412b4a840cb1d1a3967417c76b60d4cf89fb Mon Sep 17 00:00:00 2001 From: JaimeHW Date: Tue, 15 Apr 2025 10:20:33 -0500 Subject: [PATCH 08/86] Fix formatting --- tensorflow/lite/micro/kernels/riscv_vector/conv_rvv.cc | 6 ------ 1 file changed, 6 deletions(-) diff --git a/tensorflow/lite/micro/kernels/riscv_vector/conv_rvv.cc b/tensorflow/lite/micro/kernels/riscv_vector/conv_rvv.cc index 883f02d13e3..1d144f2aad2 100644 --- a/tensorflow/lite/micro/kernels/riscv_vector/conv_rvv.cc +++ b/tensorflow/lite/micro/kernels/riscv_vector/conv_rvv.cc @@ -182,28 +182,22 @@ void convolution_hwc_ohwi_rvv( for (int out_c = 0; out_c < output_channels; ++out_c) { - - const int32_t current_bias = (bias_data != nullptr) ? bias_data[out_c] : 0; const int32_t current_output_multiplier = output_multiplier[out_c]; const int32_t current_output_shift = output_shift[out_c]; - const int32_t left_shift = std::max((int32_t)0, current_output_shift); const int32_t right_shift = std::max((int32_t)0, -current_output_shift); const int32_t rounding_offset = (right_shift > 0) ? (1 << (right_shift - 1)) : 0; - const int32_t add_rounding_limit = INT32_MAX - rounding_offset; - const int32_t add_offset_limit_pos = INT32_MAX - output_offset; const int32_t add_offset_limit_neg = INT32_MIN - output_offset; const int32_t left_shift_limit_pos = (left_shift < 31) ? (INT32_MAX >> left_shift) : 0; const int32_t left_shift_limit_neg = (left_shift < 31) ? 
(INT32_MIN >> left_shift) : -1; - for (int out_y = 0; out_y < output_height; ++out_y) { const int in_y_origin = (out_y * stride_height) - pad_height; From d980b28b77a0054a325f9d4e643363db02d9c13c Mon Sep 17 00:00:00 2001 From: JaimeHW Date: Tue, 15 Apr 2025 11:11:36 -0500 Subject: [PATCH 09/86] Fix padding logic --- tensorflow/lite/micro/kernels/conv.cc | 8 +- .../micro/kernels/riscv_vector/conv_rvv.cc | 234 +++++++++--------- 2 files changed, 119 insertions(+), 123 deletions(-) diff --git a/tensorflow/lite/micro/kernels/conv.cc b/tensorflow/lite/micro/kernels/conv.cc index 3d0a5557a75..2d3debc6661 100644 --- a/tensorflow/lite/micro/kernels/conv.cc +++ b/tensorflow/lite/micro/kernels/conv.cc @@ -220,7 +220,7 @@ TfLiteStatus ConvEval(TfLiteContext* context, TfLiteNode* node) { filter_data_ptr, filter_height, filter_width, - bias_data_ptr, // Pass bias pointer (can be null) + bias_data_ptr, output_data_ptr, output_height, output_width, @@ -231,7 +231,11 @@ TfLiteStatus ConvEval(TfLiteContext* context, TfLiteNode* node) { stride_height, stride_width, pad_height, - pad_width + pad_width, + data.output_activation_min, + data.output_activation_max, + data.dilation_height_factor, + data.dilation_width_factor ); #else // defined(TFLM_USE_RISCV_VECTOR) reference_integer_ops::ConvPerChannel( diff --git a/tensorflow/lite/micro/kernels/riscv_vector/conv_rvv.cc b/tensorflow/lite/micro/kernels/riscv_vector/conv_rvv.cc index 1d144f2aad2..210c3cd4372 100644 --- a/tensorflow/lite/micro/kernels/riscv_vector/conv_rvv.cc +++ b/tensorflow/lite/micro/kernels/riscv_vector/conv_rvv.cc @@ -8,117 +8,117 @@ // TFLite Micro reference // Fixed-point per-channel-quantization convolution reference kernel. 
-// inline void ConvPerChannel( -// const ConvParams& params, const int32_t* output_multiplier, -// const int32_t* output_shift, const RuntimeShape& input_shape, -// const int8_t* input_data, const RuntimeShape& filter_shape, -// const int8_t* filter_data, const RuntimeShape& bias_shape, -// const int32_t* bias_data, const RuntimeShape& output_shape, -// int8_t* output_data) { -// // Get parameters. -// const int32_t input_offset = params.input_offset; // r = s(q - Z) -// const int stride_width = params.stride_width; -// const int stride_height = params.stride_height; -// const int dilation_width_factor = params.dilation_width_factor; -// const int dilation_height_factor = params.dilation_height_factor; -// const int pad_width = params.padding_values.width; -// const int pad_height = params.padding_values.height; -// const int32_t output_offset = params.output_offset; - -// // Set min and max value of the output. -// const int32_t output_activation_min = params.quantized_activation_min; -// const int32_t output_activation_max = params.quantized_activation_max; - -// // Consistency check. -// TFLITE_DCHECK_LE(output_activation_min, output_activation_max); -// TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4); -// TFLITE_DCHECK_EQ(filter_shape.DimensionsCount(), 4); -// TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4); -// const int batches = MatchingDim(input_shape, 0, output_shape, 0); -// const int input_depth = input_shape.Dims(3); -// const int output_depth = MatchingDim(filter_shape, 0, output_shape, 3); -// if (bias_data) { -// TFLITE_DCHECK_EQ(bias_shape.FlatSize(), output_depth); -// } - -// // Check dimensions of the tensors. 
-// const int input_height = input_shape.Dims(1); -// const int input_width = input_shape.Dims(2); -// const int filter_height = filter_shape.Dims(1); -// const int filter_width = filter_shape.Dims(2); -// const int filter_input_depth = filter_shape.Dims(3); -// const int groups = input_depth / filter_input_depth; -// TFLITE_DCHECK_NE(groups, 0); -// TFLITE_DCHECK_EQ(input_depth % filter_input_depth, 0); -// const int filters_per_group = output_depth / groups; -// TFLITE_DCHECK_NE(filters_per_group, 0); -// const int output_height = output_shape.Dims(1); -// const int output_width = output_shape.Dims(2); -// for (int batch = 0; batch < batches; ++batch) { -// for (int out_y = 0; out_y < output_height; ++out_y) { -// const int in_y_origin = (out_y * stride_height) - pad_height; -// for (int out_x = 0; out_x < output_width; ++out_x) { -// const int in_x_origin = (out_x * stride_width) - pad_width; -// for (int out_channel = 0; out_channel < output_depth; ++out_channel) { -// auto group = out_channel / filters_per_group; -// int32_t acc = 0; -// for (int filter_y = 0; filter_y < filter_height; ++filter_y) { -// const int in_y = in_y_origin + dilation_height_factor * filter_y; -// for (int filter_x = 0; filter_x < filter_width; ++filter_x) { -// const int in_x = in_x_origin + dilation_width_factor * filter_x; - -// // Zero padding by omitting the areas outside the image. -// const bool is_point_inside_image = -// (in_x >= 0) && (in_x < input_width) && (in_y >= 0) && -// (in_y < input_height); - -// if (!is_point_inside_image) { -// continue; -// } - -// for (int in_channel = 0; in_channel < filter_input_depth; -// ++in_channel) { -// int32_t input_val = -// input_data[Offset(input_shape, batch, in_y, in_x, -// in_channel + group * filter_input_depth)]; -// int32_t filter_val = filter_data[Offset( -// filter_shape, out_channel, filter_y, filter_x, in_channel)]; -// // Accumulate with 32 bits accumulator. 
-// // In the nudging process during model quantization, we force -// // real value of 0.0 be represented by a quantized value. This -// // guarantees that the input_offset is a int8_t, even though -// // it is represented using int32_t. int32_t += int8_t * -// // (int8_t - int8_t) so the highest value we can get from each -// // accumulation is [-127, 127] * ([-128, 127] - -// // [-128, 127]), which is [-32512, 32512]. log2(32512) -// // = 14.98, which means we can accumulate at least 2^16 -// // multiplications without overflow. The accumulator is -// // applied to a filter so the accumulation logic will hold as -// // long as the filter size (filter_y * filter_x * in_channel) -// // does not exceed 2^16, which is the case in all the models -// // we have seen so far. -// // TODO(b/174275578): Add a check to make sure the -// // accumulator depth is smaller than 2^16. -// acc += filter_val * (input_val + input_offset); -// } -// } -// } - -// if (bias_data) { -// acc += bias_data[out_channel]; -// } -// acc = MultiplyByQuantizedMultiplier( -// acc, output_multiplier[out_channel], output_shift[out_channel]); -// acc += output_offset; -// acc = std::max(acc, output_activation_min); -// acc = std::min(acc, output_activation_max); -// output_data[Offset(output_shape, batch, out_y, out_x, out_channel)] = -// static_cast(acc); -// } -// } -// } -// } -// } +inline void ConvPerChannel( + const ConvParams& params, const int32_t* output_multiplier, + const int32_t* output_shift, const RuntimeShape& input_shape, + const int8_t* input_data, const RuntimeShape& filter_shape, + const int8_t* filter_data, const RuntimeShape& bias_shape, + const int32_t* bias_data, const RuntimeShape& output_shape, + int8_t* output_data) { + // Get parameters. 
+ const int32_t input_offset = params.input_offset; // r = s(q - Z) + const int stride_width = params.stride_width; + const int stride_height = params.stride_height; + const int dilation_width_factor = params.dilation_width_factor; + const int dilation_height_factor = params.dilation_height_factor; + const int pad_width = params.padding_values.width; + const int pad_height = params.padding_values.height; + const int32_t output_offset = params.output_offset; + + // Set min and max value of the output. + const int32_t output_activation_min = params.quantized_activation_min; + const int32_t output_activation_max = params.quantized_activation_max; + + // Consistency check. + TFLITE_DCHECK_LE(output_activation_min, output_activation_max); + TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4); + TFLITE_DCHECK_EQ(filter_shape.DimensionsCount(), 4); + TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4); + const int batches = MatchingDim(input_shape, 0, output_shape, 0); + const int input_depth = input_shape.Dims(3); + const int output_depth = MatchingDim(filter_shape, 0, output_shape, 3); + if (bias_data) { + TFLITE_DCHECK_EQ(bias_shape.FlatSize(), output_depth); + } + + // Check dimensions of the tensors. 
+ const int input_height = input_shape.Dims(1); + const int input_width = input_shape.Dims(2); + const int filter_height = filter_shape.Dims(1); + const int filter_width = filter_shape.Dims(2); + const int filter_input_depth = filter_shape.Dims(3); + const int groups = input_depth / filter_input_depth; + TFLITE_DCHECK_NE(groups, 0); + TFLITE_DCHECK_EQ(input_depth % filter_input_depth, 0); + const int filters_per_group = output_depth / groups; + TFLITE_DCHECK_NE(filters_per_group, 0); + const int output_height = output_shape.Dims(1); + const int output_width = output_shape.Dims(2); + for (int batch = 0; batch < batches; ++batch) { + for (int out_y = 0; out_y < output_height; ++out_y) { + const int in_y_origin = (out_y * stride_height) - pad_height; + for (int out_x = 0; out_x < output_width; ++out_x) { + const int in_x_origin = (out_x * stride_width) - pad_width; + for (int out_channel = 0; out_channel < output_depth; ++out_channel) { + auto group = out_channel / filters_per_group; + int32_t acc = 0; + for (int filter_y = 0; filter_y < filter_height; ++filter_y) { + const int in_y = in_y_origin + dilation_height_factor * filter_y; + for (int filter_x = 0; filter_x < filter_width; ++filter_x) { + const int in_x = in_x_origin + dilation_width_factor * filter_x; + + // Zero padding by omitting the areas outside the image. + const bool is_point_inside_image = + (in_x >= 0) && (in_x < input_width) && (in_y >= 0) && + (in_y < input_height); + + if (!is_point_inside_image) { + continue; + } + + for (int in_channel = 0; in_channel < filter_input_depth; + ++in_channel) { + int32_t input_val = + input_data[Offset(input_shape, batch, in_y, in_x, + in_channel + group * filter_input_depth)]; + int32_t filter_val = filter_data[Offset( + filter_shape, out_channel, filter_y, filter_x, in_channel)]; + // Accumulate with 32 bits accumulator. + // In the nudging process during model quantization, we force + // real value of 0.0 be represented by a quantized value. 
This + // guarantees that the input_offset is a int8_t, even though + // it is represented using int32_t. int32_t += int8_t * + // (int8_t - int8_t) so the highest value we can get from each + // accumulation is [-127, 127] * ([-128, 127] - + // [-128, 127]), which is [-32512, 32512]. log2(32512) + // = 14.98, which means we can accumulate at least 2^16 + // multiplications without overflow. The accumulator is + // applied to a filter so the accumulation logic will hold as + // long as the filter size (filter_y * filter_x * in_channel) + // does not exceed 2^16, which is the case in all the models + // we have seen so far. + // TODO(b/174275578): Add a check to make sure the + // accumulator depth is smaller than 2^16. + acc += filter_val * (input_val + input_offset); + } + } + } + + if (bias_data) { + acc += bias_data[out_channel]; + } + acc = MultiplyByQuantizedMultiplier( + acc, output_multiplier[out_channel], output_shift[out_channel]); + acc += output_offset; + acc = std::max(acc, output_activation_min); + acc = std::min(acc, output_activation_max); + output_data[Offset(output_shape, batch, out_y, out_x, out_channel)] = + static_cast(acc); + } + } + } + } +} __attribute__((hot)) void convolution_hwc_ohwi_rvv( @@ -254,19 +254,11 @@ void convolution_hwc_ohwi_rvv( vint16m4_t v_input_i16 = __riscv_vsext_vf2_i16m4(v_loaded_input_i8, vl); vint16m4_t v_input_offset_i16 = __riscv_vmv_v_x_i16m4((int16_t)input_offset, vl); - vint16m4_t v_input_i16_add_zp = __riscv_vadd_vv_i16m4(v_input_i16, v_input_offset_i16, vl); - - vint16m4_t v_zeros_i16 = __riscv_vmv_v_x_i16m4(0, vl); - - vint16m4_t v_input_i16_padded = __riscv_vmerge_vvm_i16m4( - v_zeros_i16, - v_input_i16_add_zp, - v_mask_valid_x, - vl); + vint16m4_t v_input_plus_offset_all = __riscv_vadd_vv_i16m4(v_input_i16, v_input_offset_i16, vl); v_acc = __riscv_vwmacc_vx_i32m8(v_acc, filter_val, - v_input_i16_padded, + v_input_plus_offset_all, vl); } } From a749783c583bb8e01b1f9a9f54aab98050bfd990 Mon Sep 17 00:00:00 2001 
From: numbers1234567 Date: Tue, 15 Apr 2025 13:14:39 -0500 Subject: [PATCH 10/86] Build tflite with custom kernel --- .../kernels/riscv_vector/depthwise_conv.cc | 192 ++++++++++++++++++ tensorflow/lite/micro/tools/make/Makefile | 3 +- .../targets/rv32imc_v128_newlib_makefile.inc | 60 ++++++ 3 files changed, 253 insertions(+), 2 deletions(-) create mode 100644 tensorflow/lite/micro/kernels/riscv_vector/depthwise_conv.cc create mode 100644 tensorflow/lite/micro/tools/make/targets/rv32imc_v128_newlib_makefile.inc diff --git a/tensorflow/lite/micro/kernels/riscv_vector/depthwise_conv.cc b/tensorflow/lite/micro/kernels/riscv_vector/depthwise_conv.cc new file mode 100644 index 00000000000..3c381ecfc9f --- /dev/null +++ b/tensorflow/lite/micro/kernels/riscv_vector/depthwise_conv.cc @@ -0,0 +1,192 @@ +/* Copyright 2024 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/lite/micro/kernels/depthwise_conv.h" + +#include "tensorflow/lite/c/builtin_op_data.h" +#include "tensorflow/lite/c/common.h" +#include "tensorflow/lite/kernels/internal/portable_tensor_utils.h" +#include "tensorflow/lite/kernels/internal/reference/depthwiseconv_float.h" +#include "tensorflow/lite/kernels/internal/reference/integer_ops/depthwise_conv.h" +#include "tensorflow/lite/kernels/kernel_util.h" +#include "tensorflow/lite/micro/kernels/kernel_util.h" +#include "tensorflow/lite/micro/micro_log.h" + +namespace tflite { +namespace { + +void* DepthwiseConvInit(TfLiteContext* context, const char* buffer, + size_t length) { + TFLITE_DCHECK(context->AllocatePersistentBuffer != nullptr); + return context->AllocatePersistentBuffer(context, sizeof(OpDataConv)); +} + +TfLiteStatus DepthwiseConvEval(TfLiteContext* context, TfLiteNode* node) { + TFLITE_DCHECK(node->user_data != nullptr); + TFLITE_DCHECK(node->builtin_data != nullptr); + + auto& params = + *(reinterpret_cast(node->builtin_data)); + const OpDataConv& data = *(static_cast(node->user_data)); + + TfLiteEvalTensor* output = + tflite::micro::GetEvalOutput(context, node, kDepthwiseConvOutputTensor); + const TfLiteEvalTensor* input = + tflite::micro::GetEvalInput(context, node, kDepthwiseConvInputTensor); + const TfLiteEvalTensor* filter = + tflite::micro::GetEvalInput(context, node, kDepthwiseConvWeightsTensor); + const TfLiteEvalTensor* bias = + (NumInputs(node) == 3) + ? 
tflite::micro::GetEvalInput(context, node, kDepthwiseConvBiasTensor) + : nullptr; + + MicroPrintf("[PEANUT MICROSYSTEMS] Using vectorized implementation"); + +#ifdef USE_TFLM_COMPRESSION + + MicroContext* micro_context = GetMicroContext(context); + + const CompressionTensorData* filter_comp_td = + micro_context->GetTensorCompressionData(node, + kDepthwiseConvWeightsTensor); + const CompressionTensorData* bias_comp_td = + micro_context->GetTensorCompressionData(node, kDepthwiseConvBiasTensor); + +#endif // USE_TFLM_COMPRESSION + + switch (input->type) { // Already know in/out types are same. + case kTfLiteFloat32: { + tflite::reference_ops::DepthwiseConv( + DepthwiseConvParamsFloat(params, data), + tflite::micro::GetTensorShape(input), + tflite::micro::GetTensorData(input), + tflite::micro::GetTensorShape(filter), +#ifdef USE_TFLM_COMPRESSION + tflite::micro::GetTensorData(micro_context, filter, + filter_comp_td, + data.weights_scratch_index), + tflite::micro::GetTensorShape(bias), + tflite::micro::GetOptionalTensorData( + micro_context, bias, bias_comp_td, data.bias_scratch_index), +#else // USE_TFLM_COMPRESSION + tflite::micro::GetTensorData(filter), + tflite::micro::GetTensorShape(bias), + tflite::micro::GetOptionalTensorData(bias), +#endif // USE_TFLM_COMPRESSION + tflite::micro::GetTensorShape(output), + tflite::micro::GetTensorData(output)); + break; + } + case kTfLiteInt8: { + switch (filter->type) { + case kTfLiteInt4: { + int8_t* unpacked_filter_data = static_cast( + context->GetScratchBuffer(context, data.filter_buffer_index)); + tflite::tensor_utils::UnpackDenseInt4IntoInt8( + tflite::micro::GetTensorData(filter), + tflite::micro::GetTensorShape(filter).FlatSize(), + unpacked_filter_data); + reference_integer_ops::DepthwiseConvPerChannel( + DepthwiseConvParamsQuantized(params, data), + data.per_channel_output_multiplier, data.per_channel_output_shift, + tflite::micro::GetTensorShape(input), + tflite::micro::GetTensorData(input), + 
tflite::micro::GetTensorShape(filter), unpacked_filter_data, + tflite::micro::GetTensorShape(bias), + tflite::micro::GetOptionalTensorData(bias), + tflite::micro::GetTensorShape(output), + tflite::micro::GetTensorData(output)); + break; + } + case kTfLiteInt8: { + reference_integer_ops::DepthwiseConvPerChannel( + DepthwiseConvParamsQuantized(params, data), + data.per_channel_output_multiplier, data.per_channel_output_shift, + tflite::micro::GetTensorShape(input), + tflite::micro::GetTensorData(input), + tflite::micro::GetTensorShape(filter), +#ifdef USE_TFLM_COMPRESSION + tflite::micro::GetTensorData(micro_context, filter, + filter_comp_td, + data.weights_scratch_index), + tflite::micro::GetTensorShape(bias), + tflite::micro::GetOptionalTensorData( + micro_context, bias, bias_comp_td, data.bias_scratch_index), +#else // USE_TFLM_COMPRESSION + tflite::micro::GetTensorData(filter), + tflite::micro::GetTensorShape(bias), + tflite::micro::GetOptionalTensorData(bias), +#endif // USE_TFLM_COMPRESSION + tflite::micro::GetTensorShape(output), + tflite::micro::GetTensorData(output)); + break; + } + default: + MicroPrintf("Filter type %s (%d) for input type %s not supported.", + TfLiteTypeGetName(filter->type), filter->type, + TfLiteTypeGetName(input->type)); + return kTfLiteError; + } + break; + } + case kTfLiteInt16: { + switch (filter->type) { + case kTfLiteInt8: { + reference_integer_ops::DepthwiseConvPerChannel( + DepthwiseConvParamsQuantized(params, data), + data.per_channel_output_multiplier, data.per_channel_output_shift, + tflite::micro::GetTensorShape(input), + tflite::micro::GetTensorData(input), + tflite::micro::GetTensorShape(filter), +#ifdef USE_TFLM_COMPRESSION + tflite::micro::GetTensorData(micro_context, filter, + filter_comp_td, + data.weights_scratch_index), + tflite::micro::GetTensorShape(bias), + tflite::micro::GetOptionalTensorData( + micro_context, bias, bias_comp_td, data.bias_scratch_index), +#else // USE_TFLM_COMPRESSION + 
tflite::micro::GetTensorData(filter), + tflite::micro::GetTensorShape(bias), + tflite::micro::GetOptionalTensorData(bias), +#endif // USE_TFLM_COMPRESSION + tflite::micro::GetTensorShape(output), + tflite::micro::GetTensorData(output)); + break; + } + default: + MicroPrintf("Filter type %s (%d) for input type %s not supported.", + TfLiteTypeGetName(filter->type), filter->type, + TfLiteTypeGetName(input->type)); + return kTfLiteError; + } + break; + } + default: + MicroPrintf("Input type %s (%d) not supported.", + TfLiteTypeGetName(input->type), input->type); + return kTfLiteError; + } + return kTfLiteOk; +} + +} // namespace + +TFLMRegistration Register_DEPTHWISE_CONV_2D() { + return tflite::micro::RegisterOp(DepthwiseConvInit, DepthwiseConvPrepare, + DepthwiseConvEval); +} + +} // namespace tflite diff --git a/tensorflow/lite/micro/tools/make/Makefile b/tensorflow/lite/micro/tools/make/Makefile index 287661882f6..1c4c7643968 100644 --- a/tensorflow/lite/micro/tools/make/Makefile +++ b/tensorflow/lite/micro/tools/make/Makefile @@ -382,7 +382,6 @@ $(TENSORFLOW_ROOT)tensorflow/lite/micro/kernels/cumsum.cc \ $(TENSORFLOW_ROOT)tensorflow/lite/micro/kernels/decompress.cc \ $(TENSORFLOW_ROOT)tensorflow/lite/micro/kernels/decompress_common.cc \ $(TENSORFLOW_ROOT)tensorflow/lite/micro/kernels/depth_to_space.cc \ -$(TENSORFLOW_ROOT)tensorflow/lite/micro/kernels/depthwise_conv.cc \ $(TENSORFLOW_ROOT)tensorflow/lite/micro/kernels/depthwise_conv_common.cc \ $(TENSORFLOW_ROOT)tensorflow/lite/micro/kernels/dequantize.cc \ $(TENSORFLOW_ROOT)tensorflow/lite/micro/kernels/dequantize_common.cc \ @@ -598,7 +597,7 @@ $(DOWNLOADS_DIR)/kissfft/tools/kiss_fftr.h \ $(DOWNLOADS_DIR)/ruy/ruy/profiler/instrumentation.h THIRD_PARTY_CC_SRCS := -THIRD_PARTY_KERNEL_CC_SRCS := +THIRD_PARTY_KERNEL_CC_SRCS := $(TENSORFLOW_ROOT)tensorflow/lite/micro/kernels/riscv_vector/depthwise_conv.cc # Load custom kernels. 
include $(MAKEFILE_DIR)/additional_kernels.inc diff --git a/tensorflow/lite/micro/tools/make/targets/rv32imc_v128_newlib_makefile.inc b/tensorflow/lite/micro/tools/make/targets/rv32imc_v128_newlib_makefile.inc new file mode 100644 index 00000000000..4257f6f4b89 --- /dev/null +++ b/tensorflow/lite/micro/tools/make/targets/rv32imc_v128_newlib_makefile.inc @@ -0,0 +1,60 @@ +# Settings for RISCV 32-bit toolchain. +TARGET_ARCH := riscv32 +TARGET_TOOLCHAIN_PREFIX := riscv32-unknown-elf- + +RISCV_ARCH := rv32imc_zve32x_zvl128b +RISCV_ABI := ilp32 +RISCV_CODE_MODEL := medany + +# Allow additional flags on the command line for debugging. +RISCV_EXTRA_CFLAGS := + +TARGET_DEFAULT_TOOLCHAIN_ROOT := $(RISCV)/bin/ + +TARGET_TOOLCHAIN_ROOT := $(TARGET_DEFAULT_TOOLCHAIN_ROOT) +ifeq ($(TARGET_TOOLCHAIN_ROOT), $(TARGET_DEFAULT_TOOLCHAIN_ROOT)) + $(eval $(call add_third_party_download,$(RISCV_TOOLCHAIN_URL),$(RISCV_TOOLCHAIN_MD5),riscv_toolchain,)) +endif + +export PATH := $(TARGET_TOOLCHAIN_ROOT):$(PATH) + +PLATFORM_FLAGS = \ + -march=$(RISCV_ARCH) \ + -mabi=$(RISCV_ABI) \ + -mcmodel=$(RISCV_CODE_MODEL) \ + -mexplicit-relocs \ + -fno-builtin-printf \ + -DTF_LITE_MCU_DEBUG_LOG \ + -DTF_LITE_USE_GLOBAL_CMATH_FUNCTIONS \ + -funsigned-char \ + -fno-delete-null-pointer-checks \ + -fomit-frame-pointer + +CXXFLAGS += $(PLATFORM_FLAGS) \ + -fpermissive \ + -fno-use-cxa-atexit \ + -DTF_LITE_USE_GLOBAL_MIN \ + -DTF_LITE_USE_GLOBAL_MAX + +CCFLAGS += $(PLATFORM_FLAGS) + +BUILD_TYPE := micro + +LDFLAGS += --specs=nano.specs + +# See http://b/158651472 for why memory arena threshold test is disabled. +EXCLUDED_TESTS := \ + $(TENSORFLOW_ROOT)tensorflow/lite/micro/memory_arena_threshold_test.cc + +MICROLITE_TEST_SRCS := $(filter-out $(EXCLUDED_TESTS), $(MICROLITE_TEST_SRCS)) + +CCFLAGS += $(RISCV_EXTRA_CFLAGS) +CXXFLAGS += $(RISCV_EXTRA_CFLAGS) + +# This disables the "linker relaxation" optimization, which produced incorrect code. 
+# TODO(b/279805615): Check whether this is fixed in newer versions of the toolchain. +LDFLAGS += -mno-relax +TEST_SCRIPT := $(TENSORFLOW_ROOT)tensorflow/lite/micro/testing/test_with_qemu.sh riscv32 rv32 +SIZE_SCRIPT := ${TENSORFLOW_ROOT}tensorflow/lite/micro/testing/size_riscv32_binary.sh + +include $(MAKEFILE_DIR)/ext_libs/eyalroz_printf.inc \ No newline at end of file From 3d6bbc1ed59858379fe1e840307f281e5db14006 Mon Sep 17 00:00:00 2001 From: JaimeHW Date: Tue, 15 Apr 2025 15:19:48 -0500 Subject: [PATCH 11/86] Partially vectorized 2D convolution implementation that passes all convolution tests --- tensorflow/lite/micro/kernels/conv.cc | 93 +---- .../micro/kernels/riscv_vector/conv_rvv.cc | 386 ++++++------------ .../micro/kernels/riscv_vector/conv_rvv.h | 38 +- 3 files changed, 152 insertions(+), 365 deletions(-) diff --git a/tensorflow/lite/micro/kernels/conv.cc b/tensorflow/lite/micro/kernels/conv.cc index 2d3debc6661..b9980866f92 100644 --- a/tensorflow/lite/micro/kernels/conv.cc +++ b/tensorflow/lite/micro/kernels/conv.cc @@ -164,79 +164,26 @@ TfLiteStatus ConvEval(TfLiteContext* context, TfLiteNode* node) { return kTfLiteError; } #endif // USE_TFLM_COMPRESSION - const TfLiteConvParams* conv_params_rvv = // Use different name to avoid shadowing - static_cast(node->builtin_data); - const TfLiteEvalTensor* input_rvv = - tflite::micro::GetEvalInput(context, node, kConvInputTensor); - const TfLiteEvalTensor* filter_rvv = - tflite::micro::GetEvalInput(context, node, kConvWeightsTensor); - const TfLiteEvalTensor* bias_rvv = - (NumInputs(node) == 3) - ? 
tflite::micro::GetEvalInput(context, node, kConvBiasTensor) - : nullptr; - TfLiteEvalTensor* output_rvv = - tflite::micro::GetEvalOutput(context, node, kConvOutputTensor); - - if (bias_rvv != nullptr && bias_rvv->type != kTfLiteInt32) { - MicroPrintf("RVV kernel requires Int32 bias, got %s", TfLiteTypeGetName(bias_rvv->type)); - return kTfLiteError; - } - - const int8_t* input_data_ptr = tflite::micro::GetTensorData(input_rvv); - const int8_t* filter_data_ptr = tflite::micro::GetTensorData(filter_rvv); - const int32_t* bias_data_ptr = - (bias_rvv) ? tflite::micro::GetTensorData(bias_rvv) : nullptr; - int8_t* output_data_ptr = tflite::micro::GetTensorData(output_rvv); - - const int32_t input_zero_point_arg = data.input_zero_point; - const int32_t output_zero_point_arg = data.output_zero_point; - const int32_t* output_multiplier_ptr = data.per_channel_output_multiplier; - const int32_t* output_shift_ptr = data.per_channel_output_shift; - - const uint16_t input_height = static_cast(input_rvv->dims->data[1]); - const uint16_t input_width = static_cast(input_rvv->dims->data[2]); - const uint16_t input_channels = static_cast(input_rvv->dims->data[3]); - - const uint16_t filter_height = static_cast(filter_rvv->dims->data[1]); - const uint16_t filter_width = static_cast(filter_rvv->dims->data[2]); - - const uint16_t output_channels = static_cast(output_rvv->dims->data[3]); - - const uint16_t output_height = static_cast(output_rvv->dims->data[1]); - const uint16_t output_width = static_cast(output_rvv->dims->data[2]); - - const uint16_t stride_height = static_cast(conv_params_rvv->stride_height); - const uint16_t stride_width = static_cast(conv_params_rvv->stride_width); - const uint16_t pad_height = static_cast(data.padding.height); - const uint16_t pad_width = static_cast(data.padding.width); - - // Call the optimized RVV kernel - convolution_hwc_ohwi_rvv( - input_data_ptr, - input_height, - input_width, - input_channels, - input_zero_point_arg, - filter_data_ptr, - 
filter_height, - filter_width, - bias_data_ptr, - output_data_ptr, - output_height, - output_width, - output_channels, - output_zero_point_arg, - output_multiplier_ptr, - output_shift_ptr, - stride_height, - stride_width, - pad_height, - pad_width, - data.output_activation_min, - data.output_activation_max, - data.dilation_height_factor, - data.dilation_width_factor - ); + // Check bias type is compatible (as per your original check) + if (bias != nullptr && bias->type != kTfLiteInt32) { + MicroPrintf("RVV kernel requires Int32 bias, got %s", TfLiteTypeGetName(bias->type)); + return kTfLiteError; + } + + // Call the optimized RVV kernel with the *new* correct parameters + ConvPerChannelRVV( + ConvParamsQuantized(params, data), // const ConvParams& params + data.per_channel_output_multiplier, // const int32_t* output_multiplier + data.per_channel_output_shift, // const int32_t* output_shift + tflite::micro::GetTensorShape(input), // const RuntimeShape& input_shape + tflite::micro::GetTensorData(input), // const int8_t* input_data + tflite::micro::GetTensorShape(filter), // const RuntimeShape& filter_shape + tflite::micro::GetTensorData(filter),// const int8_t* filter_data + tflite::micro::GetTensorShape(bias), // const RuntimeShape& bias_shape + tflite::micro::GetOptionalTensorData(bias), // const int32_t* bias_data + tflite::micro::GetTensorShape(output), // const RuntimeShape& output_shape + tflite::micro::GetTensorData(output) // int8_t* output_data + ); #else // defined(TFLM_USE_RISCV_VECTOR) reference_integer_ops::ConvPerChannel( ConvParamsQuantized(params, data), diff --git a/tensorflow/lite/micro/kernels/riscv_vector/conv_rvv.cc b/tensorflow/lite/micro/kernels/riscv_vector/conv_rvv.cc index 210c3cd4372..8ec84677ff5 100644 --- a/tensorflow/lite/micro/kernels/riscv_vector/conv_rvv.cc +++ b/tensorflow/lite/micro/kernels/riscv_vector/conv_rvv.cc @@ -6,323 +6,167 @@ #include -// TFLite Micro reference -// Fixed-point per-channel-quantization convolution reference 
kernel. -inline void ConvPerChannel( +#include "tensorflow/lite/kernels/internal/common.h" + +using namespace tflite; + +void ConvPerChannelRVV( const ConvParams& params, const int32_t* output_multiplier, const int32_t* output_shift, const RuntimeShape& input_shape, const int8_t* input_data, const RuntimeShape& filter_shape, const int8_t* filter_data, const RuntimeShape& bias_shape, const int32_t* bias_data, const RuntimeShape& output_shape, - int8_t* output_data) { - // Get parameters. - const int32_t input_offset = params.input_offset; // r = s(q - Z) - const int stride_width = params.stride_width; - const int stride_height = params.stride_height; - const int dilation_width_factor = params.dilation_width_factor; - const int dilation_height_factor = params.dilation_height_factor; - const int pad_width = params.padding_values.width; - const int pad_height = params.padding_values.height; - const int32_t output_offset = params.output_offset; - - // Set min and max value of the output. - const int32_t output_activation_min = params.quantized_activation_min; - const int32_t output_activation_max = params.quantized_activation_max; - - // Consistency check. - TFLITE_DCHECK_LE(output_activation_min, output_activation_max); - TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4); - TFLITE_DCHECK_EQ(filter_shape.DimensionsCount(), 4); - TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4); - const int batches = MatchingDim(input_shape, 0, output_shape, 0); - const int input_depth = input_shape.Dims(3); - const int output_depth = MatchingDim(filter_shape, 0, output_shape, 3); - if (bias_data) { - TFLITE_DCHECK_EQ(bias_shape.FlatSize(), output_depth); - } - - // Check dimensions of the tensors. 
- const int input_height = input_shape.Dims(1); - const int input_width = input_shape.Dims(2); - const int filter_height = filter_shape.Dims(1); - const int filter_width = filter_shape.Dims(2); - const int filter_input_depth = filter_shape.Dims(3); - const int groups = input_depth / filter_input_depth; - TFLITE_DCHECK_NE(groups, 0); - TFLITE_DCHECK_EQ(input_depth % filter_input_depth, 0); - const int filters_per_group = output_depth / groups; - TFLITE_DCHECK_NE(filters_per_group, 0); - const int output_height = output_shape.Dims(1); - const int output_width = output_shape.Dims(2); - for (int batch = 0; batch < batches; ++batch) { - for (int out_y = 0; out_y < output_height; ++out_y) { - const int in_y_origin = (out_y * stride_height) - pad_height; - for (int out_x = 0; out_x < output_width; ++out_x) { - const int in_x_origin = (out_x * stride_width) - pad_width; - for (int out_channel = 0; out_channel < output_depth; ++out_channel) { - auto group = out_channel / filters_per_group; - int32_t acc = 0; - for (int filter_y = 0; filter_y < filter_height; ++filter_y) { - const int in_y = in_y_origin + dilation_height_factor * filter_y; - for (int filter_x = 0; filter_x < filter_width; ++filter_x) { - const int in_x = in_x_origin + dilation_width_factor * filter_x; - - // Zero padding by omitting the areas outside the image. - const bool is_point_inside_image = - (in_x >= 0) && (in_x < input_width) && (in_y >= 0) && - (in_y < input_height); - - if (!is_point_inside_image) { - continue; - } - - for (int in_channel = 0; in_channel < filter_input_depth; - ++in_channel) { - int32_t input_val = - input_data[Offset(input_shape, batch, in_y, in_x, - in_channel + group * filter_input_depth)]; - int32_t filter_val = filter_data[Offset( - filter_shape, out_channel, filter_y, filter_x, in_channel)]; - // Accumulate with 32 bits accumulator. - // In the nudging process during model quantization, we force - // real value of 0.0 be represented by a quantized value. 
This - // guarantees that the input_offset is a int8_t, even though - // it is represented using int32_t. int32_t += int8_t * - // (int8_t - int8_t) so the highest value we can get from each - // accumulation is [-127, 127] * ([-128, 127] - - // [-128, 127]), which is [-32512, 32512]. log2(32512) - // = 14.98, which means we can accumulate at least 2^16 - // multiplications without overflow. The accumulator is - // applied to a filter so the accumulation logic will hold as - // long as the filter size (filter_y * filter_x * in_channel) - // does not exceed 2^16, which is the case in all the models - // we have seen so far. - // TODO(b/174275578): Add a check to make sure the - // accumulator depth is smaller than 2^16. - acc += filter_val * (input_val + input_offset); - } - } - } - - if (bias_data) { - acc += bias_data[out_channel]; - } - acc = MultiplyByQuantizedMultiplier( - acc, output_multiplier[out_channel], output_shift[out_channel]); - acc += output_offset; - acc = std::max(acc, output_activation_min); - acc = std::min(acc, output_activation_max); - output_data[Offset(output_shape, batch, out_y, out_x, out_channel)] = - static_cast(acc); - } - } - } - } -} - -__attribute__((hot)) -void convolution_hwc_ohwi_rvv( - const int8_t* input_data, - const uint16_t input_height, - const uint16_t input_width, - const uint16_t input_channels, - const int32_t input_offset, - const int8_t* filter_data, - const uint16_t filter_height, - const uint16_t filter_width, - const int32_t* bias_data, - int8_t* output_data, - const uint16_t output_height, - const uint16_t output_width, - const uint16_t output_channels, - const int32_t output_offset, - const int32_t* output_multiplier, - const int32_t* output_shift, - const uint16_t stride_height, - const uint16_t stride_width, - const uint16_t pad_height, - const uint16_t pad_width, - const int32_t output_activation_min, - const int32_t output_activation_max, - int dilation_height_factor, - int dilation_width_factor -) + int8_t* 
output_data) { - assert(input_data != nullptr); - assert(filter_data != nullptr); - - assert(output_data != nullptr); - assert(output_multiplier != nullptr); - assert(output_shift != nullptr); - - - assert(input_height > 0); - assert(input_width > 0); - assert(input_channels > 0); - assert(filter_height > 0); - assert(filter_width > 0); - assert(output_height > 0); - assert(output_width > 0); - assert(output_channels > 0); - assert(stride_height > 0); - assert(stride_width > 0); + const int32_t input_offset = params.input_offset; + const int stride_width = params.stride_width; + const int stride_height = params.stride_height; + const int dilation_width_factor = params.dilation_width_factor; + const int dilation_height_factor = params.dilation_height_factor; + const int pad_width = params.padding_values.width; + const int pad_height = params.padding_values.height; + const int32_t output_offset = params.output_offset; - assert(input_offset >= -128 && input_offset <= 127); - assert(output_offset >= -128 && output_offset <= 127); + const int32_t output_activation_min = params.quantized_activation_min; + const int32_t output_activation_max = params.quantized_activation_max; + const int input_batches = input_shape.Dims(0); + const int input_height = input_shape.Dims(1); + const int input_width = input_shape.Dims(2); + const int input_depth = input_shape.Dims(3); - const size_t input_row_stride = (size_t)input_width * input_channels; - const size_t output_row_stride = (size_t)output_width * output_channels; - const size_t filter_kernel_plane_size = (size_t)filter_height * filter_width * input_channels; + const int filter_output_depth = filter_shape.Dims(0); + const int filter_height = filter_shape.Dims(1); + const int filter_width = filter_shape.Dims(2); + const int filter_input_depth = filter_shape.Dims(3); - const int32_t output_activation_min_i32 = output_activation_min; - const int32_t output_activation_max_i32 = output_activation_max; + const int output_batches = 
output_shape.Dims(0); + const int output_height = output_shape.Dims(1); + const int output_width = output_shape.Dims(2); + const int output_depth_dim = output_shape.Dims(3); - const unsigned int default_vxrm = __RISCV_VXRM_RNU; + const int batches = std::min(input_batches, output_batches); + const int output_depth = std::min(filter_output_depth, output_depth_dim); - for (int out_c = 0; out_c < output_channels; ++out_c) - { - const int32_t current_bias = (bias_data != nullptr) ? bias_data[out_c] : 0; - const int32_t current_output_multiplier = output_multiplier[out_c]; - const int32_t current_output_shift = output_shift[out_c]; + const int groups = input_depth / filter_input_depth; + const int filters_per_group = output_depth / groups; - const int32_t left_shift = std::max((int32_t)0, current_output_shift); - const int32_t right_shift = std::max((int32_t)0, -current_output_shift); + const int16_t input_offset_s16 = static_cast(input_offset); - const int32_t rounding_offset = (right_shift > 0) ? (1 << (right_shift - 1)) : 0; + const int input_ch_stride = 1; + const int input_w_stride = input_depth * input_ch_stride; + const int input_h_stride = input_width * input_w_stride; + const int input_b_stride = input_height * input_h_stride; - const int32_t add_rounding_limit = INT32_MAX - rounding_offset; - const int32_t add_offset_limit_pos = INT32_MAX - output_offset; - const int32_t add_offset_limit_neg = INT32_MIN - output_offset; + const int filter_ch_stride = 1; + const int filter_w_stride = filter_input_depth * filter_ch_stride; + const int filter_h_stride = filter_width * filter_w_stride; + const int filter_o_stride = filter_height * filter_h_stride; - const int32_t left_shift_limit_pos = (left_shift < 31) ? (INT32_MAX >> left_shift) : 0; - const int32_t left_shift_limit_neg = (left_shift < 31) ? 
(INT32_MIN >> left_shift) : -1; + const int output_ch_stride = 1; + const int output_w_stride = output_depth * output_ch_stride; + const int output_h_stride = output_width * output_w_stride; + const int output_b_stride = output_height * output_h_stride; - for (int out_y = 0; out_y < output_height; ++out_y) - { - const int in_y_origin = (out_y * stride_height) - pad_height; - size_t current_out_x = 0; - while (current_out_x < output_width) - { - const size_t vl = __riscv_vsetvl_e32m8(output_width - current_out_x); + for (int batch = 0; batch < batches; ++batch) { + const int8_t* input_batch_base = input_data + batch * input_b_stride; + int8_t* output_batch_base = output_data + batch * output_b_stride; - vint32m8_t v_acc = __riscv_vmv_v_x_i32m8(0, vl); + for (int out_y = 0; out_y < output_height; ++out_y) { + const int in_y_origin = (out_y * stride_height) - pad_height; - for (int k_y = 0; k_y < filter_height; ++k_y) - { - const int in_y = in_y_origin + k_y * dilation_height_factor; + for (int out_x = 0; out_x < output_width; ++out_x) { + const int in_x_origin = (out_x * stride_width) - pad_width; - if (in_y < 0 || in_y >= input_height) continue; + for (int out_channel = 0; out_channel < output_depth; ++out_channel) { + const int group = out_channel / filters_per_group; + const int group_start_input_channel = group * filter_input_depth; + int32_t acc = 0; - for (int k_x = 0; k_x < filter_width; ++k_x) - { - vuint32m8_t v_lane_indices = __riscv_vid_v_u32m8(vl); - vuint32m8_t v_out_x_indices = __riscv_vadd_vx_u32m8(v_lane_indices, current_out_x, vl); - vuint32m8_t v_in_x_base = __riscv_vmul_vx_u32m8(v_out_x_indices, stride_width, vl); - const int32_t in_x_origin_for_k = (int32_t)(k_x * dilation_width_factor) - pad_width; - vint32m8_t v_in_x_i32 = __riscv_vadd_vx_i32m8(__riscv_vreinterpret_v_u32m8_i32m8(v_in_x_base), in_x_origin_for_k, vl); + const int8_t* filter_oc_base = filter_data + out_channel * filter_o_stride; - vbool4_t v_mask_x_ge_0 = 
__riscv_vmsge_vx_i32m8_b4(v_in_x_i32, 0, vl); - vbool4_t v_mask_x_lt_w = __riscv_vmslt_vx_i32m8_b4(v_in_x_i32, input_width, vl); - vbool4_t v_mask_valid_x = __riscv_vmand_mm_b4(v_mask_x_ge_0, v_mask_x_lt_w, vl); + for (int filter_y = 0; filter_y < filter_height; ++filter_y) { + const int in_y = in_y_origin + dilation_height_factor * filter_y; + const int8_t* filter_y_base = filter_oc_base + filter_y * filter_h_stride; - vuint32m8_t v_in_x_u32 = __riscv_vreinterpret_v_i32m8_u32m8(v_in_x_i32); - vuint32m8_t v_in_x_ch_offset = __riscv_vmul_vx_u32m8(v_in_x_u32, input_channels, vl); + for (int filter_x = 0; filter_x < filter_width; ++filter_x) { + const int in_x = in_x_origin + dilation_width_factor * filter_x; + const int8_t* filter_x_base = filter_y_base + filter_x * filter_w_stride; - const int8_t* filter_ptr_base = filter_data + - (size_t)out_c * filter_kernel_plane_size + - (size_t)k_y * filter_width * input_channels + - (size_t)k_x * input_channels; + const bool is_point_inside_image = + (in_x >= 0) && (in_x < input_width) && (in_y >= 0) && + (in_y < input_height); - for (int in_c = 0; in_c < input_channels; ++in_c) - { - const int8_t filter_val = filter_ptr_base[in_c]; - - if (filter_val == 0) - continue; - - uint32_t base_offset_for_row_ch = (uint32_t)in_y * input_row_stride + in_c; - vuint32m8_t v_byte_offset_u32 = __riscv_vadd_vx_u32m8(v_in_x_ch_offset, base_offset_for_row_ch, vl); - - vint8m2_t v_loaded_input_i8 = __riscv_vloxei32_v_i8m2_m( - v_mask_valid_x, - input_data, - v_byte_offset_u32, - vl); - - vint16m4_t v_input_i16 = __riscv_vsext_vf2_i16m4(v_loaded_input_i8, vl); - - vint16m4_t v_input_offset_i16 = __riscv_vmv_v_x_i16m4((int16_t)input_offset, vl); - vint16m4_t v_input_plus_offset_all = __riscv_vadd_vv_i16m4(v_input_i16, v_input_offset_i16, vl); - - v_acc = __riscv_vwmacc_vx_i32m8(v_acc, - filter_val, - v_input_plus_offset_all, - vl); - } - } + if (!is_point_inside_image) { + continue; } - if (bias_data != nullptr) { - v_acc = 
__riscv_vadd_vx_i32m8(v_acc, current_bias, vl); - } + const int input_offset_addr = (in_y * input_h_stride) + (in_x * input_w_stride) + (group_start_input_channel * input_ch_stride); + const int8_t* input_ptr = input_batch_base + input_offset_addr; - vint32m8_t v_requant_stage1 = __riscv_vmulh_vx_i32m8(v_acc, current_output_multiplier, vl); + const int8_t* filter_ptr = filter_x_base; - if (right_shift > 0) - { - vbool4_t v_mask_add_round_ovf = __riscv_vmsgt_vx_i32m8_b4(v_requant_stage1, add_rounding_limit, vl); - vint32m8_t v_add_round_sat = __riscv_vmerge_vxm_i32m8(v_requant_stage1, INT32_MAX, v_mask_add_round_ovf, vl); - vint32m8_t v_added_round = __riscv_vadd_vx_i32m8(v_add_round_sat, rounding_offset, vl); + size_t channels_remaining = filter_input_depth; + int32_t patch_acc = 0; - v_requant_stage1 = __riscv_vsra_vx_i32m8(v_added_round, right_shift, vl); - } + vint32m1_t v_zero_for_reduction = __riscv_vmv_s_x_i32m1(0, 1); + vint32m1_t v_sum_reduction = v_zero_for_reduction; - if (left_shift > 0) - { - vbool4_t v_mask_lshift_ovf_pos = __riscv_vmsgt_vx_i32m8_b4(v_requant_stage1, left_shift_limit_pos, vl); - vbool4_t v_mask_lshift_ovf_neg = __riscv_vmslt_vx_i32m8_b4(v_requant_stage1, left_shift_limit_neg, vl); + while (channels_remaining > 0) { + size_t current_vl = __riscv_vsetvl_e16m4(channels_remaining); - vint32m8_t v_shifted = __riscv_vsll_vx_i32m8(v_requant_stage1, left_shift, vl); + vint8m2_t v_input_s8 = __riscv_vle8_v_i8m2(input_ptr, current_vl); + vint8m2_t v_filter_s8 = __riscv_vle8_v_i8m2(filter_ptr, current_vl); - v_shifted = __riscv_vmerge_vxm_i32m8(v_shifted, INT32_MAX, v_mask_lshift_ovf_pos, vl); - v_shifted = __riscv_vmerge_vxm_i32m8(v_shifted, INT32_MIN, v_mask_lshift_ovf_neg, vl); - v_requant_stage1 = v_shifted; - } + vint16m4_t v_input_s16 = __riscv_vsext_vf2_i16m4(v_input_s8, current_vl); + vint16m4_t v_filter_s16 = __riscv_vsext_vf2_i16m4(v_filter_s8, current_vl); - vbool4_t v_mask_add_offset_ovf_pos = 
__riscv_vmsgt_vx_i32m8_b4(v_requant_stage1, add_offset_limit_pos, vl); - vbool4_t v_mask_add_offset_ovf_neg = __riscv_vmslt_vx_i32m8_b4(v_requant_stage1, add_offset_limit_neg, vl); + v_input_s16 = __riscv_vadd_vx_i16m4(v_input_s16, input_offset_s16, current_vl); - vint32m8_t v_requant_stage2 = __riscv_vadd_vx_i32m8(v_requant_stage1, output_offset, vl); + vint32m8_t v_prod_s32 = __riscv_vwmul_vv_i32m8(v_filter_s16, v_input_s16, current_vl); - v_requant_stage2 = __riscv_vmerge_vxm_i32m8(v_requant_stage2, INT32_MAX, v_mask_add_offset_ovf_pos, vl); - v_requant_stage2 = __riscv_vmerge_vxm_i32m8(v_requant_stage2, INT32_MIN, v_mask_add_offset_ovf_neg, vl); + v_sum_reduction = __riscv_vredsum_vs_i32m8_i32m1( + v_prod_s32, + v_zero_for_reduction, + current_vl); - vint32m8_t v_clamped_i32 = __riscv_vmax_vx_i32m8(v_requant_stage2, output_activation_min_i32, vl); - v_clamped_i32 = __riscv_vmin_vx_i32m8(v_clamped_i32, output_activation_max_i32, vl); + patch_acc += __riscv_vmv_x_s_i32m1_i32(v_sum_reduction); - vint16m4_t v_narrowed_i16 = __riscv_vnsra_wx_i16m4(v_clamped_i32, 0, vl); + input_ptr += current_vl; + filter_ptr += current_vl; + channels_remaining -= current_vl; + } + acc += patch_acc; + } + } - __riscv_csrw(CSR_VXRM, __RISCV_VXRM_RDN); + if (bias_data) { + acc += bias_data[out_channel]; + } - vint8m2_t v_output_i8 = __riscv_vnclip_wx_i8m2(v_narrowed_i16, 0, default_vxrm, vl); + const int32_t current_multiplier = output_multiplier[out_channel]; + const int32_t current_shift = output_shift[out_channel]; + const int64_t total_shift = 31 - current_shift; + const int64_t round_val = (total_shift > 0) ? 
(static_cast(1) << (total_shift - 1)) : 0LL; + int64_t result64 = static_cast(acc) * static_cast(current_multiplier); + result64 += round_val; + result64 = result64 >> total_shift; + result64 = std::max(result64, static_cast(std::numeric_limits::min())); + result64 = std::min(result64, static_cast(std::numeric_limits::max())); + acc = static_cast(result64); - int8_t* output_base_ptr = output_data + - (size_t)out_y * output_row_stride + - current_out_x * output_channels + - out_c; + acc += output_offset; - ptrdiff_t byte_stride = (ptrdiff_t)output_channels * sizeof(int8_t); + acc = std::max(acc, output_activation_min); + acc = std::min(acc, output_activation_max); - __riscv_vsse8_v_i8m2(output_base_ptr, - byte_stride, - v_output_i8, - vl); + const int output_offset_addr = (out_y * output_h_stride) + (out_x * output_w_stride) + (out_channel * output_ch_stride); + output_batch_base[output_offset_addr] = static_cast(acc); - current_out_x += vl; - } + } } + } } } \ No newline at end of file diff --git a/tensorflow/lite/micro/kernels/riscv_vector/conv_rvv.h b/tensorflow/lite/micro/kernels/riscv_vector/conv_rvv.h index 3efb5c0520d..54b178f5f4c 100644 --- a/tensorflow/lite/micro/kernels/riscv_vector/conv_rvv.h +++ b/tensorflow/lite/micro/kernels/riscv_vector/conv_rvv.h @@ -5,26 +5,22 @@ #include #include -void convolution_hwc_ohwi_rvv( - const int8_t* input_data, - const uint16_t input_height, - const uint16_t input_width, - const uint16_t input_channels, - const int32_t input_offset, - const int8_t* filter_data, - const uint16_t filter_height, - const uint16_t filter_width, - const int32_t* bias_data, - int8_t* output_data, - const uint16_t output_height, - const uint16_t output_width, - const uint16_t output_channels, - const int32_t output_offset, - const int32_t* output_multiplier, - const int32_t* output_shift, - const uint16_t stride_height, - const uint16_t stride_width, - const uint16_t pad_height, - const uint16_t pad_width); +#include 
"tensorflow/lite/kernels/internal/quantization_util.h" +#include "tensorflow/lite/kernels/internal/common.h" +#include "tensorflow/lite/kernels/internal/runtime_shape.h" +// #include "fixedpoint/fixedpoint.h" +// #include "tensorflow/lite/core/macros.h" +// #include "tensorflow/lite/kernels/internal/cppmath.h" +// #include "tensorflow/lite/kernels/internal/types.h" + +using namespace tflite; + +void ConvPerChannelRVV( + const ConvParams& params, const int32_t* output_multiplier, + const int32_t* output_shift, const RuntimeShape& input_shape, + const int8_t* input_data, const RuntimeShape& filter_shape, + const int8_t* filter_data, const RuntimeShape& bias_shape, + const int32_t* bias_data, const RuntimeShape& output_shape, + int8_t* output_data); #endif \ No newline at end of file From 060158467a0e8f42d48f16695a8ed9f09a7ad2a5 Mon Sep 17 00:00:00 2001 From: JaimeHW Date: Tue, 15 Apr 2025 15:21:38 -0500 Subject: [PATCH 12/86] Remove duplicate makefile --- .../targets/rv32imc_v128_newlib_makefile.inc | 60 ------------------- 1 file changed, 60 deletions(-) delete mode 100644 tensorflow/lite/micro/tools/make/targets/rv32imc_v128_newlib_makefile.inc diff --git a/tensorflow/lite/micro/tools/make/targets/rv32imc_v128_newlib_makefile.inc b/tensorflow/lite/micro/tools/make/targets/rv32imc_v128_newlib_makefile.inc deleted file mode 100644 index 4257f6f4b89..00000000000 --- a/tensorflow/lite/micro/tools/make/targets/rv32imc_v128_newlib_makefile.inc +++ /dev/null @@ -1,60 +0,0 @@ -# Settings for RISCV 32-bit toolchain. -TARGET_ARCH := riscv32 -TARGET_TOOLCHAIN_PREFIX := riscv32-unknown-elf- - -RISCV_ARCH := rv32imc_zve32x_zvl128b -RISCV_ABI := ilp32 -RISCV_CODE_MODEL := medany - -# Allow additional flags on the command line for debugging. 
-RISCV_EXTRA_CFLAGS := - -TARGET_DEFAULT_TOOLCHAIN_ROOT := $(RISCV)/bin/ - -TARGET_TOOLCHAIN_ROOT := $(TARGET_DEFAULT_TOOLCHAIN_ROOT) -ifeq ($(TARGET_TOOLCHAIN_ROOT), $(TARGET_DEFAULT_TOOLCHAIN_ROOT)) - $(eval $(call add_third_party_download,$(RISCV_TOOLCHAIN_URL),$(RISCV_TOOLCHAIN_MD5),riscv_toolchain,)) -endif - -export PATH := $(TARGET_TOOLCHAIN_ROOT):$(PATH) - -PLATFORM_FLAGS = \ - -march=$(RISCV_ARCH) \ - -mabi=$(RISCV_ABI) \ - -mcmodel=$(RISCV_CODE_MODEL) \ - -mexplicit-relocs \ - -fno-builtin-printf \ - -DTF_LITE_MCU_DEBUG_LOG \ - -DTF_LITE_USE_GLOBAL_CMATH_FUNCTIONS \ - -funsigned-char \ - -fno-delete-null-pointer-checks \ - -fomit-frame-pointer - -CXXFLAGS += $(PLATFORM_FLAGS) \ - -fpermissive \ - -fno-use-cxa-atexit \ - -DTF_LITE_USE_GLOBAL_MIN \ - -DTF_LITE_USE_GLOBAL_MAX - -CCFLAGS += $(PLATFORM_FLAGS) - -BUILD_TYPE := micro - -LDFLAGS += --specs=nano.specs - -# See http://b/158651472 for why memory arena threshold test is disabled. -EXCLUDED_TESTS := \ - $(TENSORFLOW_ROOT)tensorflow/lite/micro/memory_arena_threshold_test.cc - -MICROLITE_TEST_SRCS := $(filter-out $(EXCLUDED_TESTS), $(MICROLITE_TEST_SRCS)) - -CCFLAGS += $(RISCV_EXTRA_CFLAGS) -CXXFLAGS += $(RISCV_EXTRA_CFLAGS) - -# This disables the "linker relaxation" optimization, which produced incorrect code. -# TODO(b/279805615): Check whether this is fixed in newer versions of the toolchain. 
-LDFLAGS += -mno-relax -TEST_SCRIPT := $(TENSORFLOW_ROOT)tensorflow/lite/micro/testing/test_with_qemu.sh riscv32 rv32 -SIZE_SCRIPT := ${TENSORFLOW_ROOT}tensorflow/lite/micro/testing/size_riscv32_binary.sh - -include $(MAKEFILE_DIR)/ext_libs/eyalroz_printf.inc \ No newline at end of file From c4c873b7a2ed36f313be03af6bf46515fcaeccaf Mon Sep 17 00:00:00 2001 From: JaimeHW Date: Tue, 15 Apr 2025 16:03:55 -0500 Subject: [PATCH 13/86] Restore TFLM primary Makefile --- tensorflow/lite/micro/tools/make/Makefile | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/tensorflow/lite/micro/tools/make/Makefile b/tensorflow/lite/micro/tools/make/Makefile index 1c4c7643968..d37ba57018f 100644 --- a/tensorflow/lite/micro/tools/make/Makefile +++ b/tensorflow/lite/micro/tools/make/Makefile @@ -382,6 +382,7 @@ $(TENSORFLOW_ROOT)tensorflow/lite/micro/kernels/cumsum.cc \ $(TENSORFLOW_ROOT)tensorflow/lite/micro/kernels/decompress.cc \ $(TENSORFLOW_ROOT)tensorflow/lite/micro/kernels/decompress_common.cc \ $(TENSORFLOW_ROOT)tensorflow/lite/micro/kernels/depth_to_space.cc \ +$(TENSORFLOW_ROOT)tensorflow/lite/micro/kernels/depthwise_conv.cc \ $(TENSORFLOW_ROOT)tensorflow/lite/micro/kernels/depthwise_conv_common.cc \ $(TENSORFLOW_ROOT)tensorflow/lite/micro/kernels/dequantize.cc \ $(TENSORFLOW_ROOT)tensorflow/lite/micro/kernels/dequantize_common.cc \ @@ -597,7 +598,7 @@ $(DOWNLOADS_DIR)/kissfft/tools/kiss_fftr.h \ $(DOWNLOADS_DIR)/ruy/ruy/profiler/instrumentation.h THIRD_PARTY_CC_SRCS := -THIRD_PARTY_KERNEL_CC_SRCS := $(TENSORFLOW_ROOT)tensorflow/lite/micro/kernels/riscv_vector/depthwise_conv.cc +THIRD_PARTY_KERNEL_CC_SRCS := # Load custom kernels. 
include $(MAKEFILE_DIR)/additional_kernels.inc @@ -605,8 +606,6 @@ include $(MAKEFILE_DIR)/additional_kernels.inc MICROLITE_CC_SRCS := $(filter-out $(MICROLITE_TEST_SRCS), $(MICROLITE_CC_BASE_SRCS)) MICROLITE_CC_SRCS := $(filter-out $(MICROLITE_BENCHMARK_SRCS), $(MICROLITE_CC_SRCS)) - - # The download scripts require that the downloads directory already exist for # improved error checking. To accomodate that, we first create a downloads # directory. From e51f7c18af847363a043c5b3de7a4a9542db427e Mon Sep 17 00:00:00 2001 From: JaimeHW Date: Tue, 15 Apr 2025 16:19:51 -0500 Subject: [PATCH 14/86] Use vwmacc for convolution channel accumulation --- .../micro/kernels/riscv_vector/conv_rvv.cc | 50 +++++++++++-------- 1 file changed, 29 insertions(+), 21 deletions(-) diff --git a/tensorflow/lite/micro/kernels/riscv_vector/conv_rvv.cc b/tensorflow/lite/micro/kernels/riscv_vector/conv_rvv.cc index 8ec84677ff5..5b4ef4edb51 100644 --- a/tensorflow/lite/micro/kernels/riscv_vector/conv_rvv.cc +++ b/tensorflow/lite/micro/kernels/riscv_vector/conv_rvv.cc @@ -68,7 +68,6 @@ void ConvPerChannelRVV( const int output_h_stride = output_width * output_w_stride; const int output_b_stride = output_height * output_h_stride; - for (int batch = 0; batch < batches; ++batch) { const int8_t* input_batch_base = input_data + batch * input_b_stride; int8_t* output_batch_base = output_data + batch * output_b_stride; @@ -104,39 +103,48 @@ void ConvPerChannelRVV( const int input_offset_addr = (in_y * input_h_stride) + (in_x * input_w_stride) + (group_start_input_channel * input_ch_stride); const int8_t* input_ptr = input_batch_base + input_offset_addr; - const int8_t* filter_ptr = filter_x_base; - size_t channels_remaining = filter_input_depth; int32_t patch_acc = 0; - vint32m1_t v_zero_for_reduction = __riscv_vmv_s_x_i32m1(0, 1); - vint32m1_t v_sum_reduction = v_zero_for_reduction; + if (channels_remaining > 0) { + size_t vlmax_for_acc = __riscv_vsetvlmax_e32m4(); + vint32m4_t v_acc_s32 = 
__riscv_vmv_v_x_i32m4(0, vlmax_for_acc); - while (channels_remaining > 0) { - size_t current_vl = __riscv_vsetvl_e16m4(channels_remaining); + while (channels_remaining > 0) { + // Use LMUL=1 for 8-bit loads + size_t current_vl = __riscv_vsetvl_e8m1(channels_remaining); - vint8m2_t v_input_s8 = __riscv_vle8_v_i8m2(input_ptr, current_vl); - vint8m2_t v_filter_s8 = __riscv_vle8_v_i8m2(filter_ptr, current_vl); + // Load 8-bit data into m1 vectors + vint8m1_t v_input_s8 = __riscv_vle8_v_i8m1(input_ptr, current_vl); + vint8m1_t v_filter_s8 = __riscv_vle8_v_i8m1(filter_ptr, current_vl); - vint16m4_t v_input_s16 = __riscv_vsext_vf2_i16m4(v_input_s8, current_vl); - vint16m4_t v_filter_s16 = __riscv_vsext_vf2_i16m4(v_filter_s8, current_vl); + // Widen 8m1 -> 16m2 + vint16m2_t v_input_s16 = __riscv_vsext_vf2_i16m2(v_input_s8, current_vl); + vint16m2_t v_filter_s16 = __riscv_vsext_vf2_i16m2(v_filter_s8, current_vl); - v_input_s16 = __riscv_vadd_vx_i16m4(v_input_s16, input_offset_s16, current_vl); + // Perform add on 16m2 vectors + v_input_s16 = __riscv_vadd_vx_i16m2(v_input_s16, input_offset_s16, current_vl); - vint32m8_t v_prod_s32 = __riscv_vwmul_vv_i32m8(v_filter_s16, v_input_s16, current_vl); + // Widening multiply-accumulate: 16m2 * 16m2 + 32m4 -> 32m4 + // Pass current_vl, the number of elements processed in this iteration + v_acc_s32 = __riscv_vwmacc_vv_i32m4(v_acc_s32, v_filter_s16, v_input_s16, current_vl); - v_sum_reduction = __riscv_vredsum_vs_i32m8_i32m1( - v_prod_s32, - v_zero_for_reduction, - current_vl); + input_ptr += current_vl; + filter_ptr += current_vl; + channels_remaining -= current_vl; + } - patch_acc += __riscv_vmv_x_s_i32m1_i32(v_sum_reduction); + // Reduce the final 32m4 accumulator + size_t vl_for_reduce = __riscv_vsetvl_e32m4(filter_input_depth); // Set VL for the reduction source type + vint32m1_t v_zero_reduction = __riscv_vmv_s_x_i32m1(0, 1); + vint32m1_t v_sum_reduction = __riscv_vredsum_vs_i32m4_i32m1( + v_acc_s32, + v_zero_reduction, + 
vl_for_reduce); // Use the VL corresponding to the accumulator length - input_ptr += current_vl; - filter_ptr += current_vl; - channels_remaining -= current_vl; + patch_acc = __riscv_vmv_x_s_i32m1_i32(v_sum_reduction); } acc += patch_acc; } From 228ae901c2545a5b50c85b971a325d0c7de9e6ff Mon Sep 17 00:00:00 2001 From: JaimeHW Date: Tue, 15 Apr 2025 17:30:53 -0500 Subject: [PATCH 15/86] Add comments --- .../micro/kernels/riscv_vector/conv_rvv.cc | 92 ++++++++++++------- 1 file changed, 61 insertions(+), 31 deletions(-) diff --git a/tensorflow/lite/micro/kernels/riscv_vector/conv_rvv.cc b/tensorflow/lite/micro/kernels/riscv_vector/conv_rvv.cc index 5b4ef4edb51..4580c850d12 100644 --- a/tensorflow/lite/micro/kernels/riscv_vector/conv_rvv.cc +++ b/tensorflow/lite/micro/kernels/riscv_vector/conv_rvv.cc @@ -7,6 +7,7 @@ #include #include "tensorflow/lite/kernels/internal/common.h" +#include "tensorflow/lite/micro/micro_log.h" using namespace tflite; @@ -18,6 +19,7 @@ void ConvPerChannelRVV( const int32_t* bias_data, const RuntimeShape& output_shape, int8_t* output_data) { + // Extract convolution parameters const int32_t input_offset = params.input_offset; const int stride_width = params.stride_width; const int stride_height = params.stride_height; @@ -26,134 +28,160 @@ void ConvPerChannelRVV( const int pad_width = params.padding_values.width; const int pad_height = params.padding_values.height; const int32_t output_offset = params.output_offset; - const int32_t output_activation_min = params.quantized_activation_min; const int32_t output_activation_max = params.quantized_activation_max; + // Extract dimensions from input, filter, and output shapes const int input_batches = input_shape.Dims(0); const int input_height = input_shape.Dims(1); const int input_width = input_shape.Dims(2); const int input_depth = input_shape.Dims(3); - const int filter_output_depth = filter_shape.Dims(0); const int filter_height = filter_shape.Dims(1); const int filter_width = 
filter_shape.Dims(2); const int filter_input_depth = filter_shape.Dims(3); - const int output_batches = output_shape.Dims(0); const int output_height = output_shape.Dims(1); const int output_width = output_shape.Dims(2); const int output_depth_dim = output_shape.Dims(3); + // Determine the actual number of batches and output channels to process const int batches = std::min(input_batches, output_batches); const int output_depth = std::min(filter_output_depth, output_depth_dim); + // Calculate group information for grouped/depthwise convolutions const int groups = input_depth / filter_input_depth; const int filters_per_group = output_depth / groups; + // Prepare input offset as int16_t for vector operations const int16_t input_offset_s16 = static_cast(input_offset); + // Calculate memory strides for navigating input, filter, and output tensors const int input_ch_stride = 1; const int input_w_stride = input_depth * input_ch_stride; const int input_h_stride = input_width * input_w_stride; const int input_b_stride = input_height * input_h_stride; - const int filter_ch_stride = 1; const int filter_w_stride = filter_input_depth * filter_ch_stride; const int filter_h_stride = filter_width * filter_w_stride; const int filter_o_stride = filter_height * filter_h_stride; - const int output_ch_stride = 1; const int output_w_stride = output_depth * output_ch_stride; const int output_h_stride = output_width * output_w_stride; const int output_b_stride = output_height * output_h_stride; - for (int batch = 0; batch < batches; ++batch) { + for (int batch = 0; batch < batches; ++batch) + { + // Get base pointers for the current batch's input and output data const int8_t* input_batch_base = input_data + batch * input_b_stride; int8_t* output_batch_base = output_data + batch * output_b_stride; - for (int out_y = 0; out_y < output_height; ++out_y) { + for (int out_y = 0; out_y < output_height; ++out_y) + { + // Calculate the starting row index in the input tensor corresponding to the 
current output row const int in_y_origin = (out_y * stride_height) - pad_height; - for (int out_x = 0; out_x < output_width; ++out_x) { + for (int out_x = 0; out_x < output_width; ++out_x) + { + // Calculate the starting column index in the input tensor corresponding to the current output column const int in_x_origin = (out_x * stride_width) - pad_width; - for (int out_channel = 0; out_channel < output_depth; ++out_channel) { + for (int out_channel = 0; out_channel < output_depth; ++out_channel) + { + // Determine the group index and starting input channel for the current output channel const int group = out_channel / filters_per_group; const int group_start_input_channel = group * filter_input_depth; + + // Initialize the accumulator for the current output pixel and channel int32_t acc = 0; + // Get the base pointer for the filter data corresponding to the current output channel const int8_t* filter_oc_base = filter_data + out_channel * filter_o_stride; - for (int filter_y = 0; filter_y < filter_height; ++filter_y) { + for (int filter_y = 0; filter_y < filter_height; ++filter_y) + { + // Calculate the corresponding row index in the input tensor const int in_y = in_y_origin + dilation_height_factor * filter_y; + + // Get the base pointer for the current filter row const int8_t* filter_y_base = filter_oc_base + filter_y * filter_h_stride; - for (int filter_x = 0; filter_x < filter_width; ++filter_x) { + for (int filter_x = 0; filter_x < filter_width; ++filter_x) + { + // Calculate the corresponding column index in the input tensor const int in_x = in_x_origin + dilation_width_factor * filter_x; + + // Get the base pointer for the current filter column const int8_t* filter_x_base = filter_y_base + filter_x * filter_w_stride; + // Check if the calculated input patch position is within the input tensor boundaries const bool is_point_inside_image = (in_x >= 0) && (in_x < input_width) && (in_y >= 0) && (in_y < input_height); - if (!is_point_inside_image) { + // Skip 
computation if the current filter patch position is outside the input boundaries + if (!is_point_inside_image) continue; - } + // Calculate the memory offset to the start of the relevant input data patch const int input_offset_addr = (in_y * input_h_stride) + (in_x * input_w_stride) + (group_start_input_channel * input_ch_stride); + // Get pointers to the start of the input patch and corresponding filter data const int8_t* input_ptr = input_batch_base + input_offset_addr; const int8_t* filter_ptr = filter_x_base; + // Initialize variables for the vector processing loop over input channels for this patch size_t channels_remaining = filter_input_depth; int32_t patch_acc = 0; - if (channels_remaining > 0) { + // Perform vector MAC operation if there are channels to process for this patch + if (channels_remaining > 0) + { + // Initialize a 32-bit vector accumulator (m4) to zeros size_t vlmax_for_acc = __riscv_vsetvlmax_e32m4(); vint32m4_t v_acc_s32 = __riscv_vmv_v_x_i32m4(0, vlmax_for_acc); - while (channels_remaining > 0) { - // Use LMUL=1 for 8-bit loads + // Process input channels in vector chunks until all are done + while (channels_remaining > 0) + { + // Set the vector length for the current iteration size_t current_vl = __riscv_vsetvl_e8m1(channels_remaining); - // Load 8-bit data into m1 vectors + // Load 8-bit input and filter data chunks into m1 vectors vint8m1_t v_input_s8 = __riscv_vle8_v_i8m1(input_ptr, current_vl); vint8m1_t v_filter_s8 = __riscv_vle8_v_i8m1(filter_ptr, current_vl); - // Widen 8m1 -> 16m2 + // Widen 8-bit vectors (m1) to 16-bit vectors (m2) vint16m2_t v_input_s16 = __riscv_vsext_vf2_i16m2(v_input_s8, current_vl); vint16m2_t v_filter_s16 = __riscv_vsext_vf2_i16m2(v_filter_s8, current_vl); - // Perform add on 16m2 vectors + // Add the input offset to the widened 16-bit input vector v_input_s16 = __riscv_vadd_vx_i16m2(v_input_s16, input_offset_s16, current_vl); - // Widening multiply-accumulate: 16m2 * 16m2 + 32m4 -> 32m4 - // Pass 
current_vl, the number of elements processed in this iteration + // Perform widening multiply-accumulate: 16m2 * 16m2 -> 32m4, accumulating into v_acc_s32 v_acc_s32 = __riscv_vwmacc_vv_i32m4(v_acc_s32, v_filter_s16, v_input_s16, current_vl); - - input_ptr += current_vl; - filter_ptr += current_vl; - channels_remaining -= current_vl; - } - - // Reduce the final 32m4 accumulator - size_t vl_for_reduce = __riscv_vsetvl_e32m4(filter_input_depth); // Set VL for the reduction source type + , processing up to LMUL=1 (m1) 8-bit elements + size_t vl_for_reduce = __riscv_vsetvl_e32m4(filter_input_depth); vint32m1_t v_zero_reduction = __riscv_vmv_s_x_i32m1(0, 1); vint32m1_t v_sum_reduction = __riscv_vredsum_vs_i32m4_i32m1( v_acc_s32, v_zero_reduction, - vl_for_reduce); // Use the VL corresponding to the accumulator length + vl_for_reduce); + // Extract the scalar reduction result into patch_acc patch_acc = __riscv_vmv_x_s_i32m1_i32(v_sum_reduction); } + + // Accumulate the result from the processed patch into the overall accumulator acc += patch_acc; } } + // Add bias value if (bias_data) { acc += bias_data[out_channel]; } + // Apply per-channel requantization to the accumulated value const int32_t current_multiplier = output_multiplier[out_channel]; const int32_t current_shift = output_shift[out_channel]; const int64_t total_shift = 31 - current_shift; @@ -165,14 +193,16 @@ void ConvPerChannelRVV( result64 = std::min(result64, static_cast(std::numeric_limits::max())); acc = static_cast(result64); + // Add the output offset to the requantized value acc += output_offset; + // Clamp the result to the final activation range acc = std::max(acc, output_activation_min); acc = std::min(acc, output_activation_max); + // Calculate the memory offset for the output pixel and store the final 8-bit result const int output_offset_addr = (out_y * output_h_stride) + (out_x * output_w_stride) + (out_channel * output_ch_stride); output_batch_base[output_offset_addr] = static_cast(acc); - } } } 
From 4d7257fed471316704ce364a736aec475c60501d Mon Sep 17 00:00:00 2001 From: JaimeHW Date: Tue, 15 Apr 2025 17:36:40 -0500 Subject: [PATCH 16/86] Add MicroPrintf output to ConvPerChannelRVV --- .../lite/micro/kernels/riscv_vector/conv_rvv.cc | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/tensorflow/lite/micro/kernels/riscv_vector/conv_rvv.cc b/tensorflow/lite/micro/kernels/riscv_vector/conv_rvv.cc index 4580c850d12..5d75ed52303 100644 --- a/tensorflow/lite/micro/kernels/riscv_vector/conv_rvv.cc +++ b/tensorflow/lite/micro/kernels/riscv_vector/conv_rvv.cc @@ -19,6 +19,8 @@ void ConvPerChannelRVV( const int32_t* bias_data, const RuntimeShape& output_shape, int8_t* output_data) { + MicroPrintf("[PEANUT MICROSYSTEM] ConvPerChannelRVV"); + // Extract convolution parameters const int32_t input_offset = params.input_offset; const int stride_width = params.stride_width; @@ -159,7 +161,14 @@ void ConvPerChannelRVV( // Perform widening multiply-accumulate: 16m2 * 16m2 -> 32m4, accumulating into v_acc_s32 v_acc_s32 = __riscv_vwmacc_vv_i32m4(v_acc_s32, v_filter_s16, v_input_s16, current_vl); - , processing up to LMUL=1 (m1) 8-bit elements + + // Advance input and filter pointers and decrement remaining channel count + input_ptr += current_vl; + filter_ptr += current_vl; + channels_remaining -= current_vl; + } + + // Reduce the final 32-bit vector accumulator to a scalar sum size_t vl_for_reduce = __riscv_vsetvl_e32m4(filter_input_depth); vint32m1_t v_zero_reduction = __riscv_vmv_s_x_i32m1(0, 1); vint32m1_t v_sum_reduction = __riscv_vredsum_vs_i32m4_i32m1( @@ -187,8 +196,8 @@ void ConvPerChannelRVV( const int64_t total_shift = 31 - current_shift; const int64_t round_val = (total_shift > 0) ? 
(static_cast(1) << (total_shift - 1)) : 0LL; int64_t result64 = static_cast(acc) * static_cast(current_multiplier); - result64 += round_val; - result64 = result64 >> total_shift; + result64 += round_val; // Add rounding value + result64 = result64 >> total_shift; // Perform the shift result64 = std::max(result64, static_cast(std::numeric_limits::min())); result64 = std::min(result64, static_cast(std::numeric_limits::max())); acc = static_cast(result64); From 93cf8b2f75d2364d902756bf53539e16394fc934 Mon Sep 17 00:00:00 2001 From: JaimeHW Date: Tue, 15 Apr 2025 17:37:01 -0500 Subject: [PATCH 17/86] Fix typo --- tensorflow/lite/micro/kernels/riscv_vector/conv_rvv.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/lite/micro/kernels/riscv_vector/conv_rvv.cc b/tensorflow/lite/micro/kernels/riscv_vector/conv_rvv.cc index 5d75ed52303..91d31260b17 100644 --- a/tensorflow/lite/micro/kernels/riscv_vector/conv_rvv.cc +++ b/tensorflow/lite/micro/kernels/riscv_vector/conv_rvv.cc @@ -19,7 +19,7 @@ void ConvPerChannelRVV( const int32_t* bias_data, const RuntimeShape& output_shape, int8_t* output_data) { - MicroPrintf("[PEANUT MICROSYSTEM] ConvPerChannelRVV"); + MicroPrintf("[PEANUT MICROSYSTEMS] ConvPerChannelRVV"); // Extract convolution parameters const int32_t input_offset = params.input_offset; From 8f4a7a4db3c81e1dab7a712a04caef78ac1f4a9f Mon Sep 17 00:00:00 2001 From: numbers1234567 Date: Tue, 15 Apr 2025 19:11:49 -0500 Subject: [PATCH 18/86] MicroPrintf in base implementation --- .../kernels/internal/reference/integer_ops/depthwise_conv.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tensorflow/lite/kernels/internal/reference/integer_ops/depthwise_conv.h b/tensorflow/lite/kernels/internal/reference/integer_ops/depthwise_conv.h index b99ad89dac0..300c6926c8c 100644 --- a/tensorflow/lite/kernels/internal/reference/integer_ops/depthwise_conv.h +++ b/tensorflow/lite/kernels/internal/reference/integer_ops/depthwise_conv.h @@ -57,6 +57,8 @@ 
inline void DepthwiseConvPerChannel( TFLITE_DCHECK_EQ(filter_shape.DimensionsCount(), 4); TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4); + MicroPrintf("[PEANUT MICROSYSTEMS] Using base depthwise conv"); + // [PEANUT] Input/output/filter dimensions // [PEANUT] Input shape (batches, height, width, in-depth) // [PEANUT] Output shape (batches, height, width, out-depth) From 5dc8182ec688b175e0bded07151d00e5f12a2612 Mon Sep 17 00:00:00 2001 From: JaimeHW Date: Wed, 16 Apr 2025 14:06:15 -0500 Subject: [PATCH 19/86] Fix formatting --- .../micro/kernels/riscv_vector/conv_rvv.cc | 252 +++++++++--------- 1 file changed, 126 insertions(+), 126 deletions(-) diff --git a/tensorflow/lite/micro/kernels/riscv_vector/conv_rvv.cc b/tensorflow/lite/micro/kernels/riscv_vector/conv_rvv.cc index 91d31260b17..c0ee86aa3c7 100644 --- a/tensorflow/lite/micro/kernels/riscv_vector/conv_rvv.cc +++ b/tensorflow/lite/micro/kernels/riscv_vector/conv_rvv.cc @@ -74,146 +74,146 @@ void ConvPerChannelRVV( for (int batch = 0; batch < batches; ++batch) { - // Get base pointers for the current batch's input and output data - const int8_t* input_batch_base = input_data + batch * input_b_stride; - int8_t* output_batch_base = output_data + batch * output_b_stride; + // Get base pointers for the current batch's input and output data + const int8_t* input_batch_base = input_data + batch * input_b_stride; + int8_t* output_batch_base = output_data + batch * output_b_stride; - for (int out_y = 0; out_y < output_height; ++out_y) - { - // Calculate the starting row index in the input tensor corresponding to the current output row - const int in_y_origin = (out_y * stride_height) - pad_height; - - for (int out_x = 0; out_x < output_width; ++out_x) + for (int out_y = 0; out_y < output_height; ++out_y) { - // Calculate the starting column index in the input tensor corresponding to the current output column - const int in_x_origin = (out_x * stride_width) - pad_width; - - for (int out_channel = 0; out_channel < 
output_depth; ++out_channel) - { - // Determine the group index and starting input channel for the current output channel - const int group = out_channel / filters_per_group; - const int group_start_input_channel = group * filter_input_depth; - - // Initialize the accumulator for the current output pixel and channel - int32_t acc = 0; + // Calculate the starting row index in the input tensor corresponding to the current output row + const int in_y_origin = (out_y * stride_height) - pad_height; - // Get the base pointer for the filter data corresponding to the current output channel - const int8_t* filter_oc_base = filter_data + out_channel * filter_o_stride; - - for (int filter_y = 0; filter_y < filter_height; ++filter_y) + for (int out_x = 0; out_x < output_width; ++out_x) { - // Calculate the corresponding row index in the input tensor - const int in_y = in_y_origin + dilation_height_factor * filter_y; - - // Get the base pointer for the current filter row - const int8_t* filter_y_base = filter_oc_base + filter_y * filter_h_stride; - - for (int filter_x = 0; filter_x < filter_width; ++filter_x) - { - // Calculate the corresponding column index in the input tensor - const int in_x = in_x_origin + dilation_width_factor * filter_x; - - // Get the base pointer for the current filter column - const int8_t* filter_x_base = filter_y_base + filter_x * filter_w_stride; - - // Check if the calculated input patch position is within the input tensor boundaries - const bool is_point_inside_image = - (in_x >= 0) && (in_x < input_width) && (in_y >= 0) && - (in_y < input_height); - - // Skip computation if the current filter patch position is outside the input boundaries - if (!is_point_inside_image) - continue; - - // Calculate the memory offset to the start of the relevant input data patch - const int input_offset_addr = (in_y * input_h_stride) + (in_x * input_w_stride) + (group_start_input_channel * input_ch_stride); - // Get pointers to the start of the input patch and 
corresponding filter data - const int8_t* input_ptr = input_batch_base + input_offset_addr; - const int8_t* filter_ptr = filter_x_base; - - // Initialize variables for the vector processing loop over input channels for this patch - size_t channels_remaining = filter_input_depth; - int32_t patch_acc = 0; + // Calculate the starting column index in the input tensor corresponding to the current output column + const int in_x_origin = (out_x * stride_width) - pad_width; - // Perform vector MAC operation if there are channels to process for this patch - if (channels_remaining > 0) + for (int out_channel = 0; out_channel < output_depth; ++out_channel) { - // Initialize a 32-bit vector accumulator (m4) to zeros - size_t vlmax_for_acc = __riscv_vsetvlmax_e32m4(); - vint32m4_t v_acc_s32 = __riscv_vmv_v_x_i32m4(0, vlmax_for_acc); + // Determine the group index and starting input channel for the current output channel + const int group = out_channel / filters_per_group; + const int group_start_input_channel = group * filter_input_depth; - // Process input channels in vector chunks until all are done - while (channels_remaining > 0) - { - // Set the vector length for the current iteration - size_t current_vl = __riscv_vsetvl_e8m1(channels_remaining); - - // Load 8-bit input and filter data chunks into m1 vectors - vint8m1_t v_input_s8 = __riscv_vle8_v_i8m1(input_ptr, current_vl); - vint8m1_t v_filter_s8 = __riscv_vle8_v_i8m1(filter_ptr, current_vl); - - // Widen 8-bit vectors (m1) to 16-bit vectors (m2) - vint16m2_t v_input_s16 = __riscv_vsext_vf2_i16m2(v_input_s8, current_vl); - vint16m2_t v_filter_s16 = __riscv_vsext_vf2_i16m2(v_filter_s8, current_vl); + // Initialize the accumulator for the current output pixel and channel + int32_t acc = 0; - // Add the input offset to the widened 16-bit input vector - v_input_s16 = __riscv_vadd_vx_i16m2(v_input_s16, input_offset_s16, current_vl); + // Get the base pointer for the filter data corresponding to the current output channel + 
const int8_t* filter_oc_base = filter_data + out_channel * filter_o_stride; - // Perform widening multiply-accumulate: 16m2 * 16m2 -> 32m4, accumulating into v_acc_s32 - v_acc_s32 = __riscv_vwmacc_vv_i32m4(v_acc_s32, v_filter_s16, v_input_s16, current_vl); - - // Advance input and filter pointers and decrement remaining channel count - input_ptr += current_vl; - filter_ptr += current_vl; - channels_remaining -= current_vl; + for (int filter_y = 0; filter_y < filter_height; ++filter_y) + { + // Calculate the corresponding row index in the input tensor + const int in_y = in_y_origin + dilation_height_factor * filter_y; + + // Get the base pointer for the current filter row + const int8_t* filter_y_base = filter_oc_base + filter_y * filter_h_stride; + + for (int filter_x = 0; filter_x < filter_width; ++filter_x) + { + // Calculate the corresponding column index in the input tensor + const int in_x = in_x_origin + dilation_width_factor * filter_x; + + // Get the base pointer for the current filter column + const int8_t* filter_x_base = filter_y_base + filter_x * filter_w_stride; + + // Check if the calculated input patch position is within the input tensor boundaries + const bool is_point_inside_image = + (in_x >= 0) && (in_x < input_width) && (in_y >= 0) && + (in_y < input_height); + + // Skip computation if the current filter patch position is outside the input boundaries + if (!is_point_inside_image) + continue; + + // Calculate the memory offset to the start of the relevant input data patch + const int input_offset_addr = (in_y * input_h_stride) + (in_x * input_w_stride) + (group_start_input_channel * input_ch_stride); + // Get pointers to the start of the input patch and corresponding filter data + const int8_t* input_ptr = input_batch_base + input_offset_addr; + const int8_t* filter_ptr = filter_x_base; + + // Initialize variables for the vector processing loop over input channels for this patch + size_t channels_remaining = filter_input_depth; + int32_t 
patch_acc = 0; + + // Perform vector MAC operation if there are channels to process for this patch + if (channels_remaining > 0) + { + // Initialize a 32-bit vector accumulator (m4) to zeros + size_t vlmax_for_acc = __riscv_vsetvlmax_e32m4(); + vint32m4_t v_acc_s32 = __riscv_vmv_v_x_i32m4(0, vlmax_for_acc); + + // Process input channels in vector chunks until all are done + while (channels_remaining > 0) + { + // Set the vector length for the current iteration + size_t current_vl = __riscv_vsetvl_e8m1(channels_remaining); + + // Load 8-bit input and filter data chunks into m1 vectors + vint8m1_t v_input_s8 = __riscv_vle8_v_i8m1(input_ptr, current_vl); + vint8m1_t v_filter_s8 = __riscv_vle8_v_i8m1(filter_ptr, current_vl); + + // Widen 8-bit vectors (m1) to 16-bit vectors (m2) + vint16m2_t v_input_s16 = __riscv_vsext_vf2_i16m2(v_input_s8, current_vl); + vint16m2_t v_filter_s16 = __riscv_vsext_vf2_i16m2(v_filter_s8, current_vl); + + // Add the input offset to the widened 16-bit input vector + v_input_s16 = __riscv_vadd_vx_i16m2(v_input_s16, input_offset_s16, current_vl); + + // Perform widening multiply-accumulate: 16m2 * 16m2 -> 32m4, accumulating into v_acc_s32 + v_acc_s32 = __riscv_vwmacc_vv_i32m4(v_acc_s32, v_filter_s16, v_input_s16, current_vl); + + // Advance input and filter pointers and decrement remaining channel count + input_ptr += current_vl; + filter_ptr += current_vl; + channels_remaining -= current_vl; + } + + // Reduce the final 32-bit vector accumulator to a scalar sum + size_t vl_for_reduce = __riscv_vsetvl_e32m4(filter_input_depth); + vint32m1_t v_zero_reduction = __riscv_vmv_s_x_i32m1(0, 1); + vint32m1_t v_sum_reduction = __riscv_vredsum_vs_i32m4_i32m1( + v_acc_s32, + v_zero_reduction, + vl_for_reduce); + + // Extract the scalar reduction result into patch_acc + patch_acc = __riscv_vmv_x_s_i32m1_i32(v_sum_reduction); + } + + // Accumulate the result from the processed patch into the overall accumulator + acc += patch_acc; + } } - // Reduce the 
final 32-bit vector accumulator to a scalar sum - size_t vl_for_reduce = __riscv_vsetvl_e32m4(filter_input_depth); - vint32m1_t v_zero_reduction = __riscv_vmv_s_x_i32m1(0, 1); - vint32m1_t v_sum_reduction = __riscv_vredsum_vs_i32m4_i32m1( - v_acc_s32, - v_zero_reduction, - vl_for_reduce); + // Add bias value + if (bias_data) { + acc += bias_data[out_channel]; + } - // Extract the scalar reduction result into patch_acc - patch_acc = __riscv_vmv_x_s_i32m1_i32(v_sum_reduction); + // Apply per-channel requantization to the accumulated value + const int32_t current_multiplier = output_multiplier[out_channel]; + const int32_t current_shift = output_shift[out_channel]; + const int64_t total_shift = 31 - current_shift; + const int64_t round_val = (total_shift > 0) ? (static_cast(1) << (total_shift - 1)) : 0LL; + int64_t result64 = static_cast(acc) * static_cast(current_multiplier); + result64 += round_val; // Add rounding value + result64 = result64 >> total_shift; // Perform the shift + result64 = std::max(result64, static_cast(std::numeric_limits::min())); + result64 = std::min(result64, static_cast(std::numeric_limits::max())); + acc = static_cast(result64); + + // Add the output offset to the requantized value + acc += output_offset; + + // Clamp the result to the final activation range + acc = std::max(acc, output_activation_min); + acc = std::min(acc, output_activation_max); + + // Calculate the memory offset for the output pixel and store the final 8-bit result + const int output_offset_addr = (out_y * output_h_stride) + (out_x * output_w_stride) + (out_channel * output_ch_stride); + output_batch_base[output_offset_addr] = static_cast(acc); } - - // Accumulate the result from the processed patch into the overall accumulator - acc += patch_acc; - } } - - // Add bias value - if (bias_data) { - acc += bias_data[out_channel]; - } - - // Apply per-channel requantization to the accumulated value - const int32_t current_multiplier = output_multiplier[out_channel]; - const 
int32_t current_shift = output_shift[out_channel]; - const int64_t total_shift = 31 - current_shift; - const int64_t round_val = (total_shift > 0) ? (static_cast(1) << (total_shift - 1)) : 0LL; - int64_t result64 = static_cast(acc) * static_cast(current_multiplier); - result64 += round_val; // Add rounding value - result64 = result64 >> total_shift; // Perform the shift - result64 = std::max(result64, static_cast(std::numeric_limits::min())); - result64 = std::min(result64, static_cast(std::numeric_limits::max())); - acc = static_cast(result64); - - // Add the output offset to the requantized value - acc += output_offset; - - // Clamp the result to the final activation range - acc = std::max(acc, output_activation_min); - acc = std::min(acc, output_activation_max); - - // Calculate the memory offset for the output pixel and store the final 8-bit result - const int output_offset_addr = (out_y * output_h_stride) + (out_x * output_w_stride) + (out_channel * output_ch_stride); - output_batch_base[output_offset_addr] = static_cast(acc); - } } - } } } \ No newline at end of file From d2ae0707ac843fd81237f732147d850c1fa841a9 Mon Sep 17 00:00:00 2001 From: numbers1234567 Date: Thu, 17 Apr 2025 22:13:41 -0500 Subject: [PATCH 20/86] Information on building w/ custom implementations --- PEANUT-README.md | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) create mode 100644 PEANUT-README.md diff --git a/PEANUT-README.md b/PEANUT-README.md new file mode 100644 index 00000000000..6967f5c0479 --- /dev/null +++ b/PEANUT-README.md @@ -0,0 +1,16 @@ +# TFLite-Micro with Vector Intrinsics + +This is Peanut Microsystems' fork of tflite-micro to optimize bottleneck operations using vector intrinsics. + +## Building + +Follow the guide in the *toolchains* repository for a guide on how to build and run *tflite-micro*. Instead of using the *riscv32_generic_makefile.inc*, use *riscv32_vector_makefile.inc* to build with vector intrinsics. Also, use the *rv32gcv* ISA for Spike. 
This is a superset of the instructions we intend to support. + +To run with informative Peanut Microsystems-specifc logs, add a PEANUT_MICRO_LOG flag in the PLATFORM_FLAGS of the *riscv32_vector_makefile.inc*: + + PLATFORM_FLAGS = \ + -march=$(RISCV_ARCH) \ + ... \ + -DPEANUT_MICRO_LOG + +The main purpose for this flag is to sanity-check which implementations are used and to determine model architectures, including input and output shapes. \ No newline at end of file From 0f5b1872fb258180d0226ac7c2e32358291a8bdc Mon Sep 17 00:00:00 2001 From: pseudonam-gc <55512819+pseudonam-gc@users.noreply.github.com> Date: Thu, 17 Apr 2025 23:00:43 -0500 Subject: [PATCH 21/86] Update PEANUT-README.md typo --- PEANUT-README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/PEANUT-README.md b/PEANUT-README.md index 6967f5c0479..2f5902a482b 100644 --- a/PEANUT-README.md +++ b/PEANUT-README.md @@ -6,11 +6,11 @@ This is Peanut Microsystems' fork of tflite-micro to optimize bottleneck operati Follow the guide in the *toolchains* repository for a guide on how to build and run *tflite-micro*. Instead of using the *riscv32_generic_makefile.inc*, use *riscv32_vector_makefile.inc* to build with vector intrinsics. Also, use the *rv32gcv* ISA for Spike. This is a superset of the instructions we intend to support. -To run with informative Peanut Microsystems-specifc logs, add a PEANUT_MICRO_LOG flag in the PLATFORM_FLAGS of the *riscv32_vector_makefile.inc*: +To run with informative Peanut Microsystems-specific logs, add a PEANUT_MICRO_LOG flag in the PLATFORM_FLAGS of the *riscv32_vector_makefile.inc*: PLATFORM_FLAGS = \ -march=$(RISCV_ARCH) \ ... \ -DPEANUT_MICRO_LOG -The main purpose for this flag is to sanity-check which implementations are used and to determine model architectures, including input and output shapes. 
\ No newline at end of file +The main purpose for this flag is to sanity-check which implementations are used and to determine model architectures, including input and output shapes. From 06dc7156b7992f99f009cabd34679d7fc2d2c7a6 Mon Sep 17 00:00:00 2001 From: JaimeHW Date: Sat, 19 Apr 2025 12:02:38 -0500 Subject: [PATCH 22/86] Vectorize out_x dimension --- tensorflow/lite/micro/kernels/conv_test.h | 2 +- .../micro/kernels/riscv_vector/conv_rvv.cc | 276 ++++++++---------- 2 files changed, 126 insertions(+), 152 deletions(-) diff --git a/tensorflow/lite/micro/kernels/conv_test.h b/tensorflow/lite/micro/kernels/conv_test.h index 642f4c76d7a..7fa7ac2009a 100644 --- a/tensorflow/lite/micro/kernels/conv_test.h +++ b/tensorflow/lite/micro/kernels/conv_test.h @@ -226,4 +226,4 @@ TfLiteStatus TestConvQuantizedPerChannelCompressed( } // namespace testing } // namespace tflite -#endif // TENSORFLOW_LITE_MICRO_KERNELS_CONV_TEST_H_ +#endif // TENSORFLOW_LITE_MICRO_KERNELS_CONV_TEST_H_ \ No newline at end of file diff --git a/tensorflow/lite/micro/kernels/riscv_vector/conv_rvv.cc b/tensorflow/lite/micro/kernels/riscv_vector/conv_rvv.cc index c0ee86aa3c7..2c48ce976f0 100644 --- a/tensorflow/lite/micro/kernels/riscv_vector/conv_rvv.cc +++ b/tensorflow/lite/micro/kernels/riscv_vector/conv_rvv.cc @@ -1,27 +1,44 @@ +#include + #include #include -#include #include -#include - -#include +#include +#include #include "tensorflow/lite/kernels/internal/common.h" #include "tensorflow/lite/micro/micro_log.h" using namespace tflite; -void ConvPerChannelRVV( - const ConvParams& params, const int32_t* output_multiplier, - const int32_t* output_shift, const RuntimeShape& input_shape, - const int8_t* input_data, const RuntimeShape& filter_shape, - const int8_t* filter_data, const RuntimeShape& bias_shape, - const int32_t* bias_data, const RuntimeShape& output_shape, - int8_t* output_data) +#define MAX_VL_E32M4_ZVL128B 16 + +inline int32_t multi_64bit(int32_t x, int32_t quantized_multiplier, int 
shift) { - MicroPrintf("[PEANUT MICROSYSTEMS] ConvPerChannelRVV"); + int64_t acc = static_cast(x) * static_cast(quantized_multiplier); + + const int64_t rounding = (shift > 0) ? (INT64_C(1) << (shift - 1)) : INT64_C(0); + acc += rounding; + + acc = acc >> shift; - // Extract convolution parameters + acc = std::max(acc, static_cast(std::numeric_limits::min())); + acc = std::min(acc, static_cast(std::numeric_limits::max())); + + return static_cast(acc); +} + + +void ConvPerChannelRVV(const ConvParams& params, + const int32_t* output_multiplier, + const int32_t* output_shift, + const RuntimeShape& input_shape, + const int8_t* input_data, + const RuntimeShape& filter_shape, + const int8_t* filter_data, + const RuntimeShape& bias_shape, const int32_t* bias_data, + const RuntimeShape& output_shape, int8_t* output_data) +{ const int32_t input_offset = params.input_offset; const int stride_width = params.stride_width; const int stride_height = params.stride_height; @@ -33,185 +50,142 @@ void ConvPerChannelRVV( const int32_t output_activation_min = params.quantized_activation_min; const int32_t output_activation_max = params.quantized_activation_max; - // Extract dimensions from input, filter, and output shapes const int input_batches = input_shape.Dims(0); const int input_height = input_shape.Dims(1); const int input_width = input_shape.Dims(2); const int input_depth = input_shape.Dims(3); - const int filter_output_depth = filter_shape.Dims(0); const int filter_height = filter_shape.Dims(1); const int filter_width = filter_shape.Dims(2); const int filter_input_depth = filter_shape.Dims(3); - const int output_batches = output_shape.Dims(0); const int output_height = output_shape.Dims(1); const int output_width = output_shape.Dims(2); - const int output_depth_dim = output_shape.Dims(3); - - // Determine the actual number of batches and output channels to process - const int batches = std::min(input_batches, output_batches); - const int output_depth = 
std::min(filter_output_depth, output_depth_dim); + const int output_depth = output_shape.Dims(3); - // Calculate group information for grouped/depthwise convolutions const int groups = input_depth / filter_input_depth; const int filters_per_group = output_depth / groups; - // Prepare input offset as int16_t for vector operations - const int16_t input_offset_s16 = static_cast(input_offset); - - // Calculate memory strides for navigating input, filter, and output tensors const int input_ch_stride = 1; - const int input_w_stride = input_depth * input_ch_stride; + const int input_w_stride = input_depth; const int input_h_stride = input_width * input_w_stride; const int input_b_stride = input_height * input_h_stride; const int filter_ch_stride = 1; - const int filter_w_stride = filter_input_depth * filter_ch_stride; + const int filter_w_stride = filter_input_depth; const int filter_h_stride = filter_width * filter_w_stride; const int filter_o_stride = filter_height * filter_h_stride; const int output_ch_stride = 1; - const int output_w_stride = output_depth * output_ch_stride; + const int output_w_stride = output_depth; const int output_h_stride = output_width * output_w_stride; const int output_b_stride = output_height * output_h_stride; - for (int batch = 0; batch < batches; ++batch) - { - // Get base pointers for the current batch's input and output data + int32_t temp_requant_buffer[MAX_VL_E32M4_ZVL128B] __attribute__((aligned(16))); + + const int16_t s_input_offset_s16 = static_cast(input_offset); + const int32_t s_output_offset_s32 = output_offset; + const int32_t s_output_activation_min_s32 = output_activation_min; + const int32_t s_output_activation_max_s32 = output_activation_max; + + for (int batch = 0; batch < input_batches; ++batch) { const int8_t* input_batch_base = input_data + batch * input_b_stride; int8_t* output_batch_base = output_data + batch * output_b_stride; - for (int out_y = 0; out_y < output_height; ++out_y) - { - // Calculate the starting row 
index in the input tensor corresponding to the current output row + for (int out_y = 0; out_y < output_height; ++out_y) { const int in_y_origin = (out_y * stride_height) - pad_height; + int8_t* output_row_base = output_batch_base + out_y * output_h_stride; + + for (int out_channel = 0; out_channel < output_depth; ++out_channel) { + const int group = out_channel / filters_per_group; + const int group_start_input_channel = group * filter_input_depth; + const int8_t* filter_oc_base = filter_data + out_channel * filter_o_stride; - for (int out_x = 0; out_x < output_width; ++out_x) - { - // Calculate the starting column index in the input tensor corresponding to the current output column - const int in_x_origin = (out_x * stride_width) - pad_width; + const int32_t scalar_multiplier = output_multiplier[out_channel]; + const int32_t scalar_output_shift = output_shift[out_channel]; + const int scalar_right_shift = 31 - scalar_output_shift; + const int32_t bias_val = bias_data ? bias_data[out_channel] : 0; - for (int out_channel = 0; out_channel < output_depth; ++out_channel) - { - // Determine the group index and starting input channel for the current output channel - const int group = out_channel / filters_per_group; - const int group_start_input_channel = group * filter_input_depth; + int8_t* output_channel_base = output_row_base + out_channel * output_ch_stride; + const ptrdiff_t output_x_stride_bytes = output_w_stride * sizeof(int8_t); - // Initialize the accumulator for the current output pixel and channel - int32_t acc = 0; + size_t current_out_x = 0; + while (current_out_x < (size_t)output_width) { + size_t vl = __riscv_vsetvl_e32m4(output_width - current_out_x); + assert(vl <= MAX_VL_E32M4_ZVL128B && "Vector length exceeds temporary buffer size"); - // Get the base pointer for the filter data corresponding to the current output channel - const int8_t* filter_oc_base = filter_data + out_channel * filter_o_stride; + vint32m4_t v_acc_s32 = __riscv_vmv_v_x_i32m4(0, 
vl); - for (int filter_y = 0; filter_y < filter_height; ++filter_y) - { - // Calculate the corresponding row index in the input tensor + vuint32m4_t v_idx = __riscv_vid_v_u32m4(vl); // [0, 1, ..., vl-1] + vint32m4_t v_out_x = __riscv_vreinterpret_v_u32m4_i32m4( + __riscv_vadd_vx_u32m4(v_idx, (uint32_t)current_out_x, vl)); + vint32m4_t v_in_x_origin_base = __riscv_vsub_vx_i32m4( + __riscv_vmul_vx_i32m4(v_out_x, stride_width, vl), + pad_width, vl); + + for (int filter_y = 0; filter_y < filter_height; ++filter_y) { const int in_y = in_y_origin + dilation_height_factor * filter_y; + const bool is_y_inside_image = (in_y >= 0) && (in_y < input_height); - // Get the base pointer for the current filter row - const int8_t* filter_y_base = filter_oc_base + filter_y * filter_h_stride; - - for (int filter_x = 0; filter_x < filter_width; ++filter_x) - { - // Calculate the corresponding column index in the input tensor - const int in_x = in_x_origin + dilation_width_factor * filter_x; - - // Get the base pointer for the current filter column - const int8_t* filter_x_base = filter_y_base + filter_x * filter_w_stride; - - // Check if the calculated input patch position is within the input tensor boundaries - const bool is_point_inside_image = - (in_x >= 0) && (in_x < input_width) && (in_y >= 0) && - (in_y < input_height); - - // Skip computation if the current filter patch position is outside the input boundaries - if (!is_point_inside_image) - continue; - - // Calculate the memory offset to the start of the relevant input data patch - const int input_offset_addr = (in_y * input_h_stride) + (in_x * input_w_stride) + (group_start_input_channel * input_ch_stride); - // Get pointers to the start of the input patch and corresponding filter data - const int8_t* input_ptr = input_batch_base + input_offset_addr; - const int8_t* filter_ptr = filter_x_base; - - // Initialize variables for the vector processing loop over input channels for this patch - size_t channels_remaining = 
filter_input_depth; - int32_t patch_acc = 0; - - // Perform vector MAC operation if there are channels to process for this patch - if (channels_remaining > 0) - { - // Initialize a 32-bit vector accumulator (m4) to zeros - size_t vlmax_for_acc = __riscv_vsetvlmax_e32m4(); - vint32m4_t v_acc_s32 = __riscv_vmv_v_x_i32m4(0, vlmax_for_acc); - - // Process input channels in vector chunks until all are done - while (channels_remaining > 0) - { - // Set the vector length for the current iteration - size_t current_vl = __riscv_vsetvl_e8m1(channels_remaining); - - // Load 8-bit input and filter data chunks into m1 vectors - vint8m1_t v_input_s8 = __riscv_vle8_v_i8m1(input_ptr, current_vl); - vint8m1_t v_filter_s8 = __riscv_vle8_v_i8m1(filter_ptr, current_vl); - - // Widen 8-bit vectors (m1) to 16-bit vectors (m2) - vint16m2_t v_input_s16 = __riscv_vsext_vf2_i16m2(v_input_s8, current_vl); - vint16m2_t v_filter_s16 = __riscv_vsext_vf2_i16m2(v_filter_s8, current_vl); - - // Add the input offset to the widened 16-bit input vector - v_input_s16 = __riscv_vadd_vx_i16m2(v_input_s16, input_offset_s16, current_vl); - - // Perform widening multiply-accumulate: 16m2 * 16m2 -> 32m4, accumulating into v_acc_s32 - v_acc_s32 = __riscv_vwmacc_vv_i32m4(v_acc_s32, v_filter_s16, v_input_s16, current_vl); - - // Advance input and filter pointers and decrement remaining channel count - input_ptr += current_vl; - filter_ptr += current_vl; - channels_remaining -= current_vl; - } - - // Reduce the final 32-bit vector accumulator to a scalar sum - size_t vl_for_reduce = __riscv_vsetvl_e32m4(filter_input_depth); - vint32m1_t v_zero_reduction = __riscv_vmv_s_x_i32m1(0, 1); - vint32m1_t v_sum_reduction = __riscv_vredsum_vs_i32m4_i32m1( - v_acc_s32, - v_zero_reduction, - vl_for_reduce); - - // Extract the scalar reduction result into patch_acc - patch_acc = __riscv_vmv_x_s_i32m1_i32(v_sum_reduction); - } + if (!is_y_inside_image) continue; + + const int8_t* filter_y_base = filter_oc_base + (filter_y * 
filter_h_stride); + + for (int filter_x = 0; filter_x < filter_width; ++filter_x) { + const int in_x_offset = dilation_width_factor * filter_x; + const int8_t* filter_patch_base = filter_y_base + (filter_x * filter_w_stride); + + vint32m4_t v_in_x = __riscv_vadd_vx_i32m4(v_in_x_origin_base, in_x_offset, vl); + + vbool8_t v_mask_ge_zero = __riscv_vmsge_vx_i32m4_b8(v_in_x, 0, vl); + vbool8_t v_mask_lt_width = __riscv_vmslt_vx_i32m4_b8(v_in_x, input_width, vl); + vbool8_t v_active_lane_mask_b8 = __riscv_vmand_mm_b8(v_mask_ge_zero, v_mask_lt_width, vl); + + int32_t base_in_x_for_vector0 = (int32_t)current_out_x * stride_width - pad_width + in_x_offset; + const int8_t* input_base_for_y_x0 = input_batch_base + + (in_y * input_h_stride) + + (base_in_x_for_vector0 * input_w_stride) + + (group_start_input_channel * input_ch_stride); + + ptrdiff_t input_x_stride_bytes = (ptrdiff_t)stride_width * input_w_stride * sizeof(int8_t); + + for (int ic = 0; ic < filter_input_depth; ++ic) { + int8_t s_filter_val_s8 = filter_patch_base[ic * filter_ch_stride]; + int16_t s_filter_val_s16 = static_cast(s_filter_val_s8); - // Accumulate the result from the processed patch into the overall accumulator - acc += patch_acc; + const int8_t* input_ic_ptr = input_base_for_y_x0 + (ic * input_ch_stride); + + vint8m1_t v_input_s8 = __riscv_vlse8_v_i8m1_m(v_active_lane_mask_b8, input_ic_ptr, input_x_stride_bytes, vl); + vint16m2_t v_input_s16 = __riscv_vsext_vf2_i16m2_m(v_active_lane_mask_b8, v_input_s8, vl); + vint16m2_t v_input_plus_offset_s16 = __riscv_vadd_vx_i16m2_m(v_active_lane_mask_b8, v_input_s16, s_input_offset_s16, vl); + + v_acc_s32 = __riscv_vwmacc_vx_i32m4_m(v_active_lane_mask_b8, v_acc_s32, s_filter_val_s16, v_input_plus_offset_s16, vl); + } } } - // Add bias value + vint32m4_t v_res32 = v_acc_s32; if (bias_data) { - acc += bias_data[out_channel]; + v_res32 = __riscv_vadd_vx_i32m4(v_res32, bias_val, vl); } - // Apply per-channel requantization to the accumulated value - const int32_t 
current_multiplier = output_multiplier[out_channel]; - const int32_t current_shift = output_shift[out_channel]; - const int64_t total_shift = 31 - current_shift; - const int64_t round_val = (total_shift > 0) ? (static_cast(1) << (total_shift - 1)) : 0LL; - int64_t result64 = static_cast(acc) * static_cast(current_multiplier); - result64 += round_val; // Add rounding value - result64 = result64 >> total_shift; // Perform the shift - result64 = std::max(result64, static_cast(std::numeric_limits::min())); - result64 = std::min(result64, static_cast(std::numeric_limits::max())); - acc = static_cast(result64); - - // Add the output offset to the requantized value - acc += output_offset; - - // Clamp the result to the final activation range - acc = std::max(acc, output_activation_min); - acc = std::min(acc, output_activation_max); - - // Calculate the memory offset for the output pixel and store the final 8-bit result - const int output_offset_addr = (out_y * output_h_stride) + (out_x * output_w_stride) + (out_channel * output_ch_stride); - output_batch_base[output_offset_addr] = static_cast(acc); + __riscv_vse32_v_i32m4(temp_requant_buffer, v_res32, vl); + + for (size_t i = 0; i < vl; ++i) { + temp_requant_buffer[i] = multi_64bit( + temp_requant_buffer[i], + scalar_multiplier, + scalar_right_shift); + } + + v_res32 = __riscv_vle32_v_i32m4(temp_requant_buffer, vl); + v_res32 = __riscv_vadd_vx_i32m4(v_res32, s_output_offset_s32, vl); + v_res32 = __riscv_vmax_vx_i32m4(v_res32, s_output_activation_min_s32, vl); + v_res32 = __riscv_vmin_vx_i32m4(v_res32, s_output_activation_max_s32, vl); + + vint16m2_t v_res16 = __riscv_vnclip_wx_i16m2(v_res32, 0, __RISCV_VXRM_RNU, vl); + vint8m1_t v_out_s8 = __riscv_vnclip_wx_i8m1(v_res16, 0, __RISCV_VXRM_RNU, vl); + + int8_t* output_strip_base_ptr = output_channel_base + current_out_x * output_w_stride; + __riscv_vsse8_v_i8m1(output_strip_base_ptr, output_x_stride_bytes, v_out_s8, vl); + + current_out_x += vl; } } } From 
1d20d3195b35e1a11572d19ba18d478052559900 Mon Sep 17 00:00:00 2001 From: JaimeHW Date: Sat, 19 Apr 2025 14:39:31 -0500 Subject: [PATCH 23/86] DepthwiseConvPerChannel --- .../lite/micro/kernels/depthwise_conv.cc | 36 ++++ .../micro/kernels/riscv_vector/conv_rvv.cc | 173 ++++++++++++++++++ .../micro/kernels/riscv_vector/conv_rvv.h | 10 + 3 files changed, 219 insertions(+) diff --git a/tensorflow/lite/micro/kernels/depthwise_conv.cc b/tensorflow/lite/micro/kernels/depthwise_conv.cc index 489e83f94f2..a7be7ff37e7 100644 --- a/tensorflow/lite/micro/kernels/depthwise_conv.cc +++ b/tensorflow/lite/micro/kernels/depthwise_conv.cc @@ -24,6 +24,11 @@ limitations under the License. #include "tensorflow/lite/micro/kernels/kernel_util.h" #include "tensorflow/lite/micro/micro_log.h" +#if defined(TFLM_USE_RISCV_VECTOR) +#include "tensorflow/lite/micro/kernels/riscv_vector/conv_rvv.h" +#endif + + namespace tflite { namespace { @@ -109,6 +114,36 @@ TfLiteStatus DepthwiseConvEval(TfLiteContext* context, TfLiteNode* node) { break; } case kTfLiteInt8: { +#if defined(TFLM_USE_RISCV_VECTOR) +#ifdef USE_TFLM_COMPRESSION + TFLITE_DCHECK(weights_comp_td == nullptr && bias_comp_td == nullptr); + if (weights_comp_td != nullptr || bias_comp_td != nullptr) + { + MicroPrintf("ERROR: RVV path does not support compressed weights/bias yet."); + return kTfLiteError; + } +#endif // USE_TFLM_COMPRESSION + // Check bias type is compatible (as per your original check) + if (bias != nullptr && bias->type != kTfLiteInt32) { + MicroPrintf("RVV kernel requires Int32 bias, got %s", TfLiteTypeGetName(bias->type)); + return kTfLiteError; + } + + // Call the optimized RVV kernel with the *new* correct parameters + DepthwiseConvPerChannelRVV( + DepthwiseConvParamsQuantized(params, data), // const ConvParams& params + data.per_channel_output_multiplier, // const int32_t* output_multiplier + data.per_channel_output_shift, // const int32_t* output_shift + tflite::micro::GetTensorShape(input), // const RuntimeShape& 
input_shape + tflite::micro::GetTensorData(input), // const int8_t* input_data + tflite::micro::GetTensorShape(filter), // const RuntimeShape& filter_shape + tflite::micro::GetTensorData(filter),// const int8_t* filter_data + tflite::micro::GetTensorShape(bias), // const RuntimeShape& bias_shape + tflite::micro::GetOptionalTensorData(bias), // const int32_t* bias_data + tflite::micro::GetTensorShape(output), // const RuntimeShape& output_shape + tflite::micro::GetTensorData(output) // int8_t* output_data + ); +#else reference_integer_ops::DepthwiseConvPerChannel( DepthwiseConvParamsQuantized(params, data), data.per_channel_output_multiplier, data.per_channel_output_shift, @@ -129,6 +164,7 @@ TfLiteStatus DepthwiseConvEval(TfLiteContext* context, TfLiteNode* node) { #endif // USE_TFLM_COMPRESSION tflite::micro::GetTensorShape(output), tflite::micro::GetTensorData(output)); +#endif break; } default: diff --git a/tensorflow/lite/micro/kernels/riscv_vector/conv_rvv.cc b/tensorflow/lite/micro/kernels/riscv_vector/conv_rvv.cc index 2c48ce976f0..c6ec55ae119 100644 --- a/tensorflow/lite/micro/kernels/riscv_vector/conv_rvv.cc +++ b/tensorflow/lite/micro/kernels/riscv_vector/conv_rvv.cc @@ -190,4 +190,177 @@ void ConvPerChannelRVV(const ConvParams& params, } } } +} + +void DepthwiseConvPerChannelRVV(const DepthwiseParams& params, + const int32_t* output_multiplier, + const int32_t* output_shift, + const RuntimeShape& input_shape, + const int8_t* input_data, + const RuntimeShape& filter_shape, + const int8_t* filter_data, + const RuntimeShape& bias_shape, const int32_t* bias_data, + const RuntimeShape& output_shape, int8_t* output_data) +{ + const int32_t input_offset = params.input_offset; + const int stride_width = params.stride_width; + const int stride_height = params.stride_height; + const int dilation_width_factor = params.dilation_width_factor; + const int dilation_height_factor = params.dilation_height_factor; + const int pad_width = params.padding_values.width; + 
const int pad_height = params.padding_values.height; + const int depth_multiplier = params.depth_multiplier; + const int32_t output_offset = params.output_offset; + const int32_t output_activation_min = params.quantized_activation_min; + const int32_t output_activation_max = params.quantized_activation_max; + + const int output_depth = MatchingDim(filter_shape, 3, output_shape, 3); + + const int input_batches = input_shape.Dims(0); + const int input_height = input_shape.Dims(1); + const int input_width = input_shape.Dims(2); + const int input_depth = input_shape.Dims(3); + const int filter_height = filter_shape.Dims(1); + const int filter_width = filter_shape.Dims(2); + + const int output_height = output_shape.Dims(1); + const int output_width = output_shape.Dims(2); + + const int input_ch_stride = 1; + const int input_w_stride = input_depth; + const int input_h_stride = input_width * input_w_stride; + const int input_b_stride = input_height * input_h_stride; + + const int filter_ch_stride = 1; + const int filter_w_stride = output_depth; + const int filter_h_stride = filter_width * filter_w_stride; + + const int output_ch_stride = 1; + const int output_w_stride = output_depth; + const int output_h_stride = output_width * output_w_stride; + const int output_b_stride = output_height * output_h_stride; + + int32_t temp_requant_buffer[MAX_VL_E32M4_ZVL128B] __attribute__((aligned(16))); + + const int16_t s_input_offset_s16 = static_cast(input_offset); + const int32_t s_output_offset_s32 = output_offset; + const int32_t s_output_activation_min_s32 = output_activation_min; + const int32_t s_output_activation_max_s32 = output_activation_max; + + for (int batch = 0; batch < input_batches; ++batch) { + const int8_t* input_batch_base = input_data + batch * input_b_stride; + int8_t* output_batch_base = output_data + batch * output_b_stride; + for (int out_y = 0; out_y < output_height; ++out_y) { + const int in_y_origin = (out_y * stride_height) - pad_height; + for (int 
in_channel = 0; in_channel < input_depth; ++in_channel) { + for (int m = 0; m < depth_multiplier; ++m) { + const int output_channel = m + in_channel * depth_multiplier; + const int32_t scalar_multiplier = output_multiplier[output_channel]; + const int32_t scalar_output_shift = output_shift[output_channel]; + const int effective_right_shift = 31 - scalar_output_shift; + + const int32_t bias_val = bias_data ? bias_data[output_channel] : 0; + + int8_t* output_channel_row_base = output_batch_base + + out_y * output_h_stride + + output_channel * output_ch_stride; + + const ptrdiff_t output_x_stride_bytes = output_w_stride * sizeof(int8_t); + + size_t current_out_x = 0; + while (current_out_x < (size_t)output_width) { + + size_t vl = __riscv_vsetvl_e32m4(output_width - current_out_x); + assert(vl <= MAX_VL_E32M4_ZVL128B && "Vector length exceeds temporary buffer size"); + + vint32m4_t v_acc_s32; + if (bias_data) { + v_acc_s32 = __riscv_vmv_v_x_i32m4(bias_val, vl); + } else { + v_acc_s32 = __riscv_vmv_v_x_i32m4(0, vl); + } + + + vuint32m4_t v_idx = __riscv_vid_v_u32m4(vl); + vint32m4_t v_out_x = __riscv_vreinterpret_v_u32m4_i32m4( + __riscv_vadd_vx_u32m4(v_idx, (uint32_t)current_out_x, vl)); + vint32m4_t v_in_x_origin_base = __riscv_vsub_vx_i32m4( + __riscv_vmul_vx_i32m4(v_out_x, stride_width, vl), + pad_width, vl); + + for (int filter_y = 0; filter_y < filter_height; ++filter_y) { + const int in_y = in_y_origin + dilation_height_factor * filter_y; + const bool is_y_inside_image = (in_y >= 0) && (in_y < input_height); + + if (!is_y_inside_image) continue; + + const int8_t* filter_y_base = filter_data + + filter_y * filter_h_stride; + + for (int filter_x = 0; filter_x < filter_width; ++filter_x) { + + const int in_x_offset = dilation_width_factor * filter_x; + vint32m4_t v_in_x = __riscv_vadd_vx_i32m4(v_in_x_origin_base, in_x_offset, vl); + + vbool8_t v_mask_ge_zero = __riscv_vmsge_vx_i32m4_b8(v_in_x, 0, vl); + vbool8_t v_mask_lt_width = __riscv_vmslt_vx_i32m4_b8(v_in_x, 
input_width, vl); + vbool8_t v_active_lane_mask_b8 = __riscv_vmand_mm_b8(v_mask_ge_zero, v_mask_lt_width, vl); + + uint32_t first_mask_bit = __riscv_vfirst_m_b8(v_active_lane_mask_b8, vl); + if (first_mask_bit == (uint32_t)-1 && vl > 0) continue; + + const int8_t* filter_ptr = filter_y_base + + filter_x * filter_w_stride + + output_channel * filter_ch_stride; + int8_t s_filter_val_s8 = *filter_ptr; + int16_t s_filter_val_s16 = static_cast(s_filter_val_s8); + + int32_t base_in_x_for_vector0 = (int32_t)current_out_x * stride_width - pad_width + in_x_offset; + const int8_t* input_base_ptr = input_batch_base + + in_y * input_h_stride + + base_in_x_for_vector0 * input_w_stride + + in_channel * input_ch_stride; + + ptrdiff_t input_x_stride_bytes = (ptrdiff_t)stride_width * input_w_stride * sizeof(int8_t); + + vint8m1_t v_input_s8 = __riscv_vlse8_v_i8m1_m(v_active_lane_mask_b8, input_base_ptr, input_x_stride_bytes, vl); + + vint16m2_t v_input_s16 = __riscv_vsext_vf2_i16m2_m(v_active_lane_mask_b8, v_input_s8, vl); + vint16m2_t v_input_plus_offset_s16 = __riscv_vadd_vx_i16m2_m(v_active_lane_mask_b8, v_input_s16, s_input_offset_s16, vl); + + v_acc_s32 = __riscv_vwmacc_vx_i32m4_m(v_active_lane_mask_b8, v_acc_s32, s_filter_val_s16, v_input_plus_offset_s16, vl); + } + } + + // Store accumulator to buffer for scalar requantization + __riscv_vse32_v_i32m4(temp_requant_buffer, v_acc_s32, vl); + + // Scalar requantization loop (inefficient but avoids 64-bit vector types) + for (size_t i = 0; i < vl; ++i) { + temp_requant_buffer[i] = multi_64bit( + temp_requant_buffer[i], + scalar_multiplier, + effective_right_shift); + } + + // Load result back from buffer + vint32m4_t v_res32 = __riscv_vle32_v_i32m4(temp_requant_buffer, vl); + + // Vectorized Offset, Clamping, Narrowing + v_res32 = __riscv_vadd_vx_i32m4(v_res32, s_output_offset_s32, vl); + v_res32 = __riscv_vmax_vx_i32m4(v_res32, s_output_activation_min_s32, vl); + v_res32 = __riscv_vmin_vx_i32m4(v_res32, 
s_output_activation_max_s32, vl); + + vint16m2_t v_res16 = __riscv_vnclip_wx_i16m2(v_res32, 0, __RISCV_VXRM_RNU, vl); + vint8m1_t v_out_s8 = __riscv_vnclip_wx_i8m1(v_res16, 0, __RISCV_VXRM_RNU, vl); + + int8_t* output_strip_base_ptr = output_channel_row_base + current_out_x * output_w_stride; + __riscv_vsse8_v_i8m1(output_strip_base_ptr, output_x_stride_bytes, v_out_s8, vl); + + current_out_x += vl; + } + } + } + } + } } \ No newline at end of file diff --git a/tensorflow/lite/micro/kernels/riscv_vector/conv_rvv.h b/tensorflow/lite/micro/kernels/riscv_vector/conv_rvv.h index 54b178f5f4c..d9a314f4c5b 100644 --- a/tensorflow/lite/micro/kernels/riscv_vector/conv_rvv.h +++ b/tensorflow/lite/micro/kernels/riscv_vector/conv_rvv.h @@ -23,4 +23,14 @@ void ConvPerChannelRVV( const int32_t* bias_data, const RuntimeShape& output_shape, int8_t* output_data); +void DepthwiseConvPerChannelRVV(const DepthwiseParams& params, + const int32_t* output_multiplier, + const int32_t* output_shift, + const RuntimeShape& input_shape, + const int8_t* input_data, + const RuntimeShape& filter_shape, + const int8_t* filter_data, + const RuntimeShape& bias_shape, const int32_t* bias_data, + const RuntimeShape& output_shape, int8_t* output_data); + #endif \ No newline at end of file From 8ca51a2e82f3386cb11d23fdc108248ebd92cecf Mon Sep 17 00:00:00 2001 From: JaimeHW Date: Sat, 19 Apr 2025 14:40:33 -0500 Subject: [PATCH 24/86] Remove old includes --- tensorflow/lite/micro/kernels/riscv_vector/conv_rvv.h | 4 ---- 1 file changed, 4 deletions(-) diff --git a/tensorflow/lite/micro/kernels/riscv_vector/conv_rvv.h b/tensorflow/lite/micro/kernels/riscv_vector/conv_rvv.h index d9a314f4c5b..68dd6109781 100644 --- a/tensorflow/lite/micro/kernels/riscv_vector/conv_rvv.h +++ b/tensorflow/lite/micro/kernels/riscv_vector/conv_rvv.h @@ -8,10 +8,6 @@ #include "tensorflow/lite/kernels/internal/quantization_util.h" #include "tensorflow/lite/kernels/internal/common.h" #include 
"tensorflow/lite/kernels/internal/runtime_shape.h" -// #include "fixedpoint/fixedpoint.h" -// #include "tensorflow/lite/core/macros.h" -// #include "tensorflow/lite/kernels/internal/cppmath.h" -// #include "tensorflow/lite/kernels/internal/types.h" using namespace tflite; From 63700c5257260294ae144679692fb690713afade Mon Sep 17 00:00:00 2001 From: JaimeHW Date: Sat, 19 Apr 2025 15:05:55 -0500 Subject: [PATCH 25/86] Perform 64bit operations with 32bit vector intrinsics --- .../micro/kernels/riscv_vector/conv_rvv.cc | 273 ++++++++++-------- 1 file changed, 150 insertions(+), 123 deletions(-) diff --git a/tensorflow/lite/micro/kernels/riscv_vector/conv_rvv.cc b/tensorflow/lite/micro/kernels/riscv_vector/conv_rvv.cc index c6ec55ae119..254a293e0e6 100644 --- a/tensorflow/lite/micro/kernels/riscv_vector/conv_rvv.cc +++ b/tensorflow/lite/micro/kernels/riscv_vector/conv_rvv.cc @@ -11,33 +11,17 @@ using namespace tflite; -#define MAX_VL_E32M4_ZVL128B 16 - -inline int32_t multi_64bit(int32_t x, int32_t quantized_multiplier, int shift) -{ - int64_t acc = static_cast(x) * static_cast(quantized_multiplier); - - const int64_t rounding = (shift > 0) ? 
(INT64_C(1) << (shift - 1)) : INT64_C(0); - acc += rounding; - - acc = acc >> shift; - - acc = std::max(acc, static_cast(std::numeric_limits::min())); - acc = std::min(acc, static_cast(std::numeric_limits::max())); - - return static_cast(acc); -} - - void ConvPerChannelRVV(const ConvParams& params, - const int32_t* output_multiplier, - const int32_t* output_shift, - const RuntimeShape& input_shape, - const int8_t* input_data, - const RuntimeShape& filter_shape, - const int8_t* filter_data, - const RuntimeShape& bias_shape, const int32_t* bias_data, - const RuntimeShape& output_shape, int8_t* output_data) + const int32_t* output_multiplier, + const int32_t* output_shift, + const RuntimeShape& input_shape, + const int8_t* input_data, + const RuntimeShape& filter_shape, + const int8_t* filter_data, + const RuntimeShape& bias_shape, + const int32_t* bias_data, + const RuntimeShape& output_shape, + int8_t* output_data) { const int32_t input_offset = params.input_offset; const int stride_width = params.stride_width; @@ -77,57 +61,71 @@ void ConvPerChannelRVV(const ConvParams& params, const int output_h_stride = output_width * output_w_stride; const int output_b_stride = output_height * output_h_stride; - int32_t temp_requant_buffer[MAX_VL_E32M4_ZVL128B] __attribute__((aligned(16))); - const int16_t s_input_offset_s16 = static_cast(input_offset); const int32_t s_output_offset_s32 = output_offset; const int32_t s_output_activation_min_s32 = output_activation_min; const int32_t s_output_activation_max_s32 = output_activation_max; - for (int batch = 0; batch < input_batches; ++batch) { + for (int batch = 0; batch < input_batches; ++batch) + { const int8_t* input_batch_base = input_data + batch * input_b_stride; int8_t* output_batch_base = output_data + batch * output_b_stride; - for (int out_y = 0; out_y < output_height; ++out_y) { + for (int out_y = 0; out_y < output_height; ++out_y) + { const int in_y_origin = (out_y * stride_height) - pad_height; int8_t* output_row_base = 
output_batch_base + out_y * output_h_stride; - for (int out_channel = 0; out_channel < output_depth; ++out_channel) { + for (int out_channel = 0; out_channel < output_depth; ++out_channel) + { const int group = out_channel / filters_per_group; const int group_start_input_channel = group * filter_input_depth; const int8_t* filter_oc_base = filter_data + out_channel * filter_o_stride; const int32_t scalar_multiplier = output_multiplier[out_channel]; - const int32_t scalar_output_shift = output_shift[out_channel]; - const int scalar_right_shift = 31 - scalar_output_shift; + const int32_t scalar_shift = output_shift[out_channel]; + const int effective_right_shift = 31 - scalar_shift; + const int32_t bias_val = bias_data ? bias_data[out_channel] : 0; + int64_t rounding_val = (effective_right_shift > 0) ? (INT64_C(1) << (effective_right_shift - 1)) : 0; + int32_t rounding_lo = (int32_t)rounding_val; + int32_t rounding_hi = (int32_t)(rounding_val >> 32); + int8_t* output_channel_base = output_row_base + out_channel * output_ch_stride; const ptrdiff_t output_x_stride_bytes = output_w_stride * sizeof(int8_t); size_t current_out_x = 0; - while (current_out_x < (size_t)output_width) { + while (current_out_x < (size_t)output_width) + { size_t vl = __riscv_vsetvl_e32m4(output_width - current_out_x); - assert(vl <= MAX_VL_E32M4_ZVL128B && "Vector length exceeds temporary buffer size"); - vint32m4_t v_acc_s32 = __riscv_vmv_v_x_i32m4(0, vl); + vint32m4_t v_acc_s32; + if (bias_data) + { + v_acc_s32 = __riscv_vmv_v_x_i32m4(bias_val, vl); + } + else + { + v_acc_s32 = __riscv_vmv_v_x_i32m4(0, vl); + } - vuint32m4_t v_idx = __riscv_vid_v_u32m4(vl); // [0, 1, ..., vl-1] - vint32m4_t v_out_x = __riscv_vreinterpret_v_u32m4_i32m4( - __riscv_vadd_vx_u32m4(v_idx, (uint32_t)current_out_x, vl)); - vint32m4_t v_in_x_origin_base = __riscv_vsub_vx_i32m4( - __riscv_vmul_vx_i32m4(v_out_x, stride_width, vl), - pad_width, vl); + vuint32m4_t v_idx = __riscv_vid_v_u32m4(vl); + vint32m4_t v_out_x = 
__riscv_vreinterpret_v_u32m4_i32m4(__riscv_vadd_vx_u32m4(v_idx, (uint32_t)current_out_x, vl)); + vint32m4_t v_in_x_origin_base = __riscv_vsub_vx_i32m4(__riscv_vmul_vx_i32m4(v_out_x, stride_width, vl), pad_width, vl); - for (int filter_y = 0; filter_y < filter_height; ++filter_y) { + for (int filter_y = 0; filter_y < filter_height; ++filter_y) + { const int in_y = in_y_origin + dilation_height_factor * filter_y; const bool is_y_inside_image = (in_y >= 0) && (in_y < input_height); - if (!is_y_inside_image) continue; + if (!is_y_inside_image) + continue; const int8_t* filter_y_base = filter_oc_base + (filter_y * filter_h_stride); - for (int filter_x = 0; filter_x < filter_width; ++filter_x) { + for (int filter_x = 0; filter_x < filter_width; ++filter_x) + { const int in_x_offset = dilation_width_factor * filter_x; const int8_t* filter_patch_base = filter_y_base + (filter_x * filter_w_stride); @@ -138,18 +136,17 @@ void ConvPerChannelRVV(const ConvParams& params, vbool8_t v_active_lane_mask_b8 = __riscv_vmand_mm_b8(v_mask_ge_zero, v_mask_lt_width, vl); int32_t base_in_x_for_vector0 = (int32_t)current_out_x * stride_width - pad_width + in_x_offset; - const int8_t* input_base_for_y_x0 = input_batch_base - + (in_y * input_h_stride) - + (base_in_x_for_vector0 * input_w_stride) - + (group_start_input_channel * input_ch_stride); + const int8_t* input_base_for_y_x_patch = input_batch_base + (in_y * input_h_stride) + (base_in_x_for_vector0 * input_w_stride) + + (group_start_input_channel * input_ch_stride); ptrdiff_t input_x_stride_bytes = (ptrdiff_t)stride_width * input_w_stride * sizeof(int8_t); - for (int ic = 0; ic < filter_input_depth; ++ic) { + for (int ic = 0; ic < filter_input_depth; ++ic) + { int8_t s_filter_val_s8 = filter_patch_base[ic * filter_ch_stride]; int16_t s_filter_val_s16 = static_cast(s_filter_val_s8); - const int8_t* input_ic_ptr = input_base_for_y_x0 + (ic * input_ch_stride); + const int8_t* input_ic_ptr = input_base_for_y_x_patch + (ic * 
input_ch_stride); vint8m1_t v_input_s8 = __riscv_vlse8_v_i8m1_m(v_active_lane_mask_b8, input_ic_ptr, input_x_stride_bytes, vl); vint16m2_t v_input_s16 = __riscv_vsext_vf2_i16m2_m(v_active_lane_mask_b8, v_input_s8, vl); @@ -160,22 +157,36 @@ void ConvPerChannelRVV(const ConvParams& params, } } - vint32m4_t v_res32 = v_acc_s32; - if (bias_data) { - v_res32 = __riscv_vadd_vx_i32m4(v_res32, bias_val, vl); + vint32m4_t v_res32; + + vint32m4_t v_prod_lo = __riscv_vmul_vx_i32m4(v_acc_s32, scalar_multiplier, vl); + vint32m4_t v_prod_hi = __riscv_vmulh_vx_i32m4(v_acc_s32, scalar_multiplier, vl); + + vuint32m4_t v_acc_lo_u = __riscv_vreinterpret_v_i32m4_u32m4(v_prod_lo); + vuint32m4_t v_sum_lo_u = __riscv_vadd_vx_u32m4(v_acc_lo_u, rounding_lo, vl); + vbool8_t v_carry = __riscv_vmsltu_vx_u32m4_b8(v_sum_lo_u, rounding_lo, vl); + vint32m4_t v_rounded_hi = __riscv_vadd_vx_i32m4(v_prod_hi, rounding_hi, vl); + v_rounded_hi = __riscv_vadd_vx_i32m4_m(v_carry, v_rounded_hi, 1, vl); + vint32m4_t v_rounded_lo = __riscv_vreinterpret_v_u32m4_i32m4(v_sum_lo_u); + + if (effective_right_shift == 0) + { + v_res32 = v_rounded_lo; + } + else if (effective_right_shift > 0 && effective_right_shift < 32) + { + vuint32m4_t v_lo_usrl = __riscv_vsrl_vx_u32m4(__riscv_vreinterpret_v_i32m4_u32m4(v_rounded_lo), effective_right_shift, vl); + vint32m4_t v_hi_sll = __riscv_vsll_vx_i32m4(v_rounded_hi, 32 - effective_right_shift, vl); + v_res32 = __riscv_vreinterpret_v_u32m4_i32m4(__riscv_vor_vv_u32m4(v_lo_usrl, __riscv_vreinterpret_v_i32m4_u32m4(v_hi_sll), vl)); + } + else + { + int shift_hi = std::min(31, effective_right_shift - 32); + v_res32 = __riscv_vsra_vx_i32m4(v_rounded_hi, shift_hi, vl); } - __riscv_vse32_v_i32m4(temp_requant_buffer, v_res32, vl); - - for (size_t i = 0; i < vl; ++i) { - temp_requant_buffer[i] = multi_64bit( - temp_requant_buffer[i], - scalar_multiplier, - scalar_right_shift); - } - - v_res32 = __riscv_vle32_v_i32m4(temp_requant_buffer, vl); v_res32 = __riscv_vadd_vx_i32m4(v_res32, 
s_output_offset_s32, vl); + v_res32 = __riscv_vmax_vx_i32m4(v_res32, s_output_activation_min_s32, vl); v_res32 = __riscv_vmin_vx_i32m4(v_res32, s_output_activation_max_s32, vl); @@ -193,14 +204,16 @@ void ConvPerChannelRVV(const ConvParams& params, } void DepthwiseConvPerChannelRVV(const DepthwiseParams& params, - const int32_t* output_multiplier, - const int32_t* output_shift, - const RuntimeShape& input_shape, - const int8_t* input_data, - const RuntimeShape& filter_shape, - const int8_t* filter_data, - const RuntimeShape& bias_shape, const int32_t* bias_data, - const RuntimeShape& output_shape, int8_t* output_data) + const int32_t* output_multiplier, + const int32_t* output_shift, + const RuntimeShape& input_shape, + const int8_t* input_data, + const RuntimeShape& filter_shape, + const int8_t* filter_data, + const RuntimeShape& bias_shape, + const int32_t* bias_data, + const RuntimeShape& output_shape, + int8_t* output_data) { const int32_t input_offset = params.input_offset; const int stride_width = params.stride_width; @@ -240,65 +253,69 @@ void DepthwiseConvPerChannelRVV(const DepthwiseParams& params, const int output_h_stride = output_width * output_w_stride; const int output_b_stride = output_height * output_h_stride; - int32_t temp_requant_buffer[MAX_VL_E32M4_ZVL128B] __attribute__((aligned(16))); - const int16_t s_input_offset_s16 = static_cast(input_offset); const int32_t s_output_offset_s32 = output_offset; const int32_t s_output_activation_min_s32 = output_activation_min; const int32_t s_output_activation_max_s32 = output_activation_max; - for (int batch = 0; batch < input_batches; ++batch) { + for (int batch = 0; batch < input_batches; ++batch) + { const int8_t* input_batch_base = input_data + batch * input_b_stride; int8_t* output_batch_base = output_data + batch * output_b_stride; - for (int out_y = 0; out_y < output_height; ++out_y) { + for (int out_y = 0; out_y < output_height; ++out_y) + { const int in_y_origin = (out_y * stride_height) - 
pad_height; - for (int in_channel = 0; in_channel < input_depth; ++in_channel) { - for (int m = 0; m < depth_multiplier; ++m) { + for (int in_channel = 0; in_channel < input_depth; ++in_channel) + { + for (int m = 0; m < depth_multiplier; ++m) + { const int output_channel = m + in_channel * depth_multiplier; const int32_t scalar_multiplier = output_multiplier[output_channel]; - const int32_t scalar_output_shift = output_shift[output_channel]; - const int effective_right_shift = 31 - scalar_output_shift; + const int32_t scalar_shift = output_shift[output_channel]; + const int effective_right_shift = 31 - scalar_shift; const int32_t bias_val = bias_data ? bias_data[output_channel] : 0; - int8_t* output_channel_row_base = output_batch_base - + out_y * output_h_stride - + output_channel * output_ch_stride; + int64_t rounding_val = (effective_right_shift > 0) ? (INT64_C(1) << (effective_right_shift - 1)) : 0; + int32_t rounding_lo = (int32_t)rounding_val; + int32_t rounding_hi = (int32_t)(rounding_val >> 32); + + int8_t* output_channel_row_base = output_batch_base + out_y * output_h_stride + output_channel * output_ch_stride; const ptrdiff_t output_x_stride_bytes = output_w_stride * sizeof(int8_t); size_t current_out_x = 0; - while (current_out_x < (size_t)output_width) { + while (current_out_x < (size_t)output_width) + { size_t vl = __riscv_vsetvl_e32m4(output_width - current_out_x); - assert(vl <= MAX_VL_E32M4_ZVL128B && "Vector length exceeds temporary buffer size"); vint32m4_t v_acc_s32; - if (bias_data) { - v_acc_s32 = __riscv_vmv_v_x_i32m4(bias_val, vl); - } else { - v_acc_s32 = __riscv_vmv_v_x_i32m4(0, vl); + if (bias_data) + { + v_acc_s32 = __riscv_vmv_v_x_i32m4(bias_val, vl); + } + else + { + v_acc_s32 = __riscv_vmv_v_x_i32m4(0, vl); } - vuint32m4_t v_idx = __riscv_vid_v_u32m4(vl); - vint32m4_t v_out_x = __riscv_vreinterpret_v_u32m4_i32m4( - __riscv_vadd_vx_u32m4(v_idx, (uint32_t)current_out_x, vl)); - vint32m4_t v_in_x_origin_base = __riscv_vsub_vx_i32m4( - 
__riscv_vmul_vx_i32m4(v_out_x, stride_width, vl), - pad_width, vl); + vint32m4_t v_out_x = __riscv_vreinterpret_v_u32m4_i32m4(__riscv_vadd_vx_u32m4(v_idx, (uint32_t)current_out_x, vl)); + vint32m4_t v_in_x_origin_base = __riscv_vsub_vx_i32m4(__riscv_vmul_vx_i32m4(v_out_x, stride_width, vl), pad_width, vl); - for (int filter_y = 0; filter_y < filter_height; ++filter_y) { + for (int filter_y = 0; filter_y < filter_height; ++filter_y) + { const int in_y = in_y_origin + dilation_height_factor * filter_y; const bool is_y_inside_image = (in_y >= 0) && (in_y < input_height); - if (!is_y_inside_image) continue; - - const int8_t* filter_y_base = filter_data - + filter_y * filter_h_stride; + if (!is_y_inside_image) + continue; - for (int filter_x = 0; filter_x < filter_width; ++filter_x) { + const int8_t* filter_y_base = filter_data + filter_y * filter_h_stride; + for (int filter_x = 0; filter_x < filter_width; ++filter_x) + { const int in_x_offset = dilation_width_factor * filter_x; vint32m4_t v_in_x = __riscv_vadd_vx_i32m4(v_in_x_origin_base, in_x_offset, vl); @@ -307,24 +324,20 @@ void DepthwiseConvPerChannelRVV(const DepthwiseParams& params, vbool8_t v_active_lane_mask_b8 = __riscv_vmand_mm_b8(v_mask_ge_zero, v_mask_lt_width, vl); uint32_t first_mask_bit = __riscv_vfirst_m_b8(v_active_lane_mask_b8, vl); - if (first_mask_bit == (uint32_t)-1 && vl > 0) continue; + if (first_mask_bit == (uint32_t)-1 && vl > 0) + continue; - const int8_t* filter_ptr = filter_y_base - + filter_x * filter_w_stride - + output_channel * filter_ch_stride; + const int8_t* filter_ptr = filter_y_base + filter_x * filter_w_stride + output_channel * filter_ch_stride; int8_t s_filter_val_s8 = *filter_ptr; int16_t s_filter_val_s16 = static_cast(s_filter_val_s8); int32_t base_in_x_for_vector0 = (int32_t)current_out_x * stride_width - pad_width + in_x_offset; - const int8_t* input_base_ptr = input_batch_base - + in_y * input_h_stride - + base_in_x_for_vector0 * input_w_stride - + in_channel * 
input_ch_stride; + const int8_t* input_base_ptr = + input_batch_base + in_y * input_h_stride + base_in_x_for_vector0 * input_w_stride + in_channel * input_ch_stride; ptrdiff_t input_x_stride_bytes = (ptrdiff_t)stride_width * input_w_stride * sizeof(int8_t); vint8m1_t v_input_s8 = __riscv_vlse8_v_i8m1_m(v_active_lane_mask_b8, input_base_ptr, input_x_stride_bytes, vl); - vint16m2_t v_input_s16 = __riscv_vsext_vf2_i16m2_m(v_active_lane_mask_b8, v_input_s8, vl); vint16m2_t v_input_plus_offset_s16 = __riscv_vadd_vx_i16m2_m(v_active_lane_mask_b8, v_input_s16, s_input_offset_s16, vl); @@ -332,22 +345,36 @@ void DepthwiseConvPerChannelRVV(const DepthwiseParams& params, } } - // Store accumulator to buffer for scalar requantization - __riscv_vse32_v_i32m4(temp_requant_buffer, v_acc_s32, vl); - - // Scalar requantization loop (inefficient but avoids 64-bit vector types) - for (size_t i = 0; i < vl; ++i) { - temp_requant_buffer[i] = multi_64bit( - temp_requant_buffer[i], - scalar_multiplier, - effective_right_shift); + vint32m4_t v_res32; + + vint32m4_t v_prod_lo = __riscv_vmul_vx_i32m4(v_acc_s32, scalar_multiplier, vl); + vint32m4_t v_prod_hi = __riscv_vmulh_vx_i32m4(v_acc_s32, scalar_multiplier, vl); + + vuint32m4_t v_acc_lo_u = __riscv_vreinterpret_v_i32m4_u32m4(v_prod_lo); + vuint32m4_t v_sum_lo_u = __riscv_vadd_vx_u32m4(v_acc_lo_u, rounding_lo, vl); + vbool8_t v_carry = __riscv_vmsltu_vx_u32m4_b8(v_sum_lo_u, rounding_lo, vl); + vint32m4_t v_rounded_hi = __riscv_vadd_vx_i32m4(v_prod_hi, rounding_hi, vl); + v_rounded_hi = __riscv_vadd_vx_i32m4_m(v_carry, v_rounded_hi, 1, vl); + vint32m4_t v_rounded_lo = __riscv_vreinterpret_v_u32m4_i32m4(v_sum_lo_u); + + if (effective_right_shift == 0) + { + v_res32 = v_rounded_lo; + } + else if (effective_right_shift > 0 && effective_right_shift < 32) + { + vuint32m4_t v_lo_usrl = __riscv_vsrl_vx_u32m4(__riscv_vreinterpret_v_i32m4_u32m4(v_rounded_lo), effective_right_shift, vl); + vint32m4_t v_hi_sll = __riscv_vsll_vx_i32m4(v_rounded_hi, 
32 - effective_right_shift, vl); + v_res32 = __riscv_vreinterpret_v_u32m4_i32m4(__riscv_vor_vv_u32m4(v_lo_usrl, __riscv_vreinterpret_v_i32m4_u32m4(v_hi_sll), vl)); + } + else + { + int shift_hi = std::min(31, effective_right_shift - 32); + v_res32 = __riscv_vsra_vx_i32m4(v_rounded_hi, shift_hi, vl); } - // Load result back from buffer - vint32m4_t v_res32 = __riscv_vle32_v_i32m4(temp_requant_buffer, vl); - - // Vectorized Offset, Clamping, Narrowing v_res32 = __riscv_vadd_vx_i32m4(v_res32, s_output_offset_s32, vl); + v_res32 = __riscv_vmax_vx_i32m4(v_res32, s_output_activation_min_s32, vl); v_res32 = __riscv_vmin_vx_i32m4(v_res32, s_output_activation_max_s32, vl); From a8a51b08b2f40daedbbe555d2e9b5c881e9ee38d Mon Sep 17 00:00:00 2001 From: JaimeHW Date: Sat, 19 Apr 2025 15:10:59 -0500 Subject: [PATCH 26/86] Add comments --- .../micro/kernels/riscv_vector/conv_rvv.cc | 87 +++++++++++++++++-- 1 file changed, 79 insertions(+), 8 deletions(-) diff --git a/tensorflow/lite/micro/kernels/riscv_vector/conv_rvv.cc b/tensorflow/lite/micro/kernels/riscv_vector/conv_rvv.cc index 254a293e0e6..e41b412094b 100644 --- a/tensorflow/lite/micro/kernels/riscv_vector/conv_rvv.cc +++ b/tensorflow/lite/micro/kernels/riscv_vector/conv_rvv.cc @@ -23,6 +23,7 @@ void ConvPerChannelRVV(const ConvParams& params, const RuntimeShape& output_shape, int8_t* output_data) { + // Extract convolution parameters const int32_t input_offset = params.input_offset; const int stride_width = params.stride_width; const int stride_height = params.stride_height; @@ -34,6 +35,7 @@ void ConvPerChannelRVV(const ConvParams& params, const int32_t output_activation_min = params.quantized_activation_min; const int32_t output_activation_max = params.quantized_activation_max; + // Extract shape dimensions const int input_batches = input_shape.Dims(0); const int input_height = input_shape.Dims(1); const int input_width = input_shape.Dims(2); @@ -45,9 +47,11 @@ void ConvPerChannelRVV(const ConvParams& params, const int 
output_width = output_shape.Dims(2); const int output_depth = output_shape.Dims(3); + // Calculate grouping parameters const int groups = input_depth / filter_input_depth; const int filters_per_group = output_depth / groups; + // Calculate tensor strides const int input_ch_stride = 1; const int input_w_stride = input_depth; const int input_h_stride = input_width * input_w_stride; @@ -61,45 +65,57 @@ void ConvPerChannelRVV(const ConvParams& params, const int output_h_stride = output_width * output_w_stride; const int output_b_stride = output_height * output_h_stride; + // Prepare scalar constants const int16_t s_input_offset_s16 = static_cast(input_offset); const int32_t s_output_offset_s32 = output_offset; const int32_t s_output_activation_min_s32 = output_activation_min; const int32_t s_output_activation_max_s32 = output_activation_max; + // Loop over batches for (int batch = 0; batch < input_batches; ++batch) { const int8_t* input_batch_base = input_data + batch * input_b_stride; int8_t* output_batch_base = output_data + batch * output_b_stride; + // Loop over output height for (int out_y = 0; out_y < output_height; ++out_y) { const int in_y_origin = (out_y * stride_height) - pad_height; int8_t* output_row_base = output_batch_base + out_y * output_h_stride; + // Loop over output channels for (int out_channel = 0; out_channel < output_depth; ++out_channel) { + // Calculate group and filter parameters for this output channel const int group = out_channel / filters_per_group; const int group_start_input_channel = group * filter_input_depth; const int8_t* filter_oc_base = filter_data + out_channel * filter_o_stride; + // Get per-channel requantization parameters const int32_t scalar_multiplier = output_multiplier[out_channel]; const int32_t scalar_shift = output_shift[out_channel]; const int effective_right_shift = 31 - scalar_shift; + // Get bias value for this output channel const int32_t bias_val = bias_data ? 
bias_data[out_channel] : 0; + // Calculate rounding constants for requantization int64_t rounding_val = (effective_right_shift > 0) ? (INT64_C(1) << (effective_right_shift - 1)) : 0; int32_t rounding_lo = (int32_t)rounding_val; int32_t rounding_hi = (int32_t)(rounding_val >> 32); + // Calculate output pointer and stride for this channel row int8_t* output_channel_base = output_row_base + out_channel * output_ch_stride; const ptrdiff_t output_x_stride_bytes = output_w_stride * sizeof(int8_t); + // Process output width in vector chunks size_t current_out_x = 0; while (current_out_x < (size_t)output_width) { + // Set vector length for this iteration size_t vl = __riscv_vsetvl_e32m4(output_width - current_out_x); + // Initialize accumulator vector with bias vint32m4_t v_acc_s32; if (bias_data) { @@ -110,58 +126,74 @@ void ConvPerChannelRVV(const ConvParams& params, v_acc_s32 = __riscv_vmv_v_x_i32m4(0, vl); } + // Calculate base input x coordinates for the vector lanes vuint32m4_t v_idx = __riscv_vid_v_u32m4(vl); vint32m4_t v_out_x = __riscv_vreinterpret_v_u32m4_i32m4(__riscv_vadd_vx_u32m4(v_idx, (uint32_t)current_out_x, vl)); vint32m4_t v_in_x_origin_base = __riscv_vsub_vx_i32m4(__riscv_vmul_vx_i32m4(v_out_x, stride_width, vl), pad_width, vl); + // Loop over filter height for (int filter_y = 0; filter_y < filter_height; ++filter_y) { + // Calculate input y coordinate and check bounds const int in_y = in_y_origin + dilation_height_factor * filter_y; const bool is_y_inside_image = (in_y >= 0) && (in_y < input_height); + // Skip this filter row if input y is out of bounds if (!is_y_inside_image) continue; const int8_t* filter_y_base = filter_oc_base + (filter_y * filter_h_stride); + // Loop over filter width for (int filter_x = 0; filter_x < filter_width; ++filter_x) { + // Calculate input x offset and filter patch base pointer const int in_x_offset = dilation_width_factor * filter_x; const int8_t* filter_patch_base = filter_y_base + (filter_x * filter_w_stride); + // 
Calculate input x coordinates for the vector lanes for this filter tap vint32m4_t v_in_x = __riscv_vadd_vx_i32m4(v_in_x_origin_base, in_x_offset, vl); + // Create mask for valid input coordinates (within image width bounds) vbool8_t v_mask_ge_zero = __riscv_vmsge_vx_i32m4_b8(v_in_x, 0, vl); vbool8_t v_mask_lt_width = __riscv_vmslt_vx_i32m4_b8(v_in_x, input_width, vl); vbool8_t v_active_lane_mask_b8 = __riscv_vmand_mm_b8(v_mask_ge_zero, v_mask_lt_width, vl); + // Calculate base input pointer and stride for vector load int32_t base_in_x_for_vector0 = (int32_t)current_out_x * stride_width - pad_width + in_x_offset; const int8_t* input_base_for_y_x_patch = input_batch_base + (in_y * input_h_stride) + (base_in_x_for_vector0 * input_w_stride) + (group_start_input_channel * input_ch_stride); - ptrdiff_t input_x_stride_bytes = (ptrdiff_t)stride_width * input_w_stride * sizeof(int8_t); + // Loop over input channels for this filter tap for (int ic = 0; ic < filter_input_depth; ++ic) { + // Load scalar filter value int8_t s_filter_val_s8 = filter_patch_base[ic * filter_ch_stride]; int16_t s_filter_val_s16 = static_cast(s_filter_val_s8); + // Calculate input pointer for this channel const int8_t* input_ic_ptr = input_base_for_y_x_patch + (ic * input_ch_stride); + // Load input vector (masked, strided), widen, add offset vint8m1_t v_input_s8 = __riscv_vlse8_v_i8m1_m(v_active_lane_mask_b8, input_ic_ptr, input_x_stride_bytes, vl); vint16m2_t v_input_s16 = __riscv_vsext_vf2_i16m2_m(v_active_lane_mask_b8, v_input_s8, vl); vint16m2_t v_input_plus_offset_s16 = __riscv_vadd_vx_i16m2_m(v_active_lane_mask_b8, v_input_s16, s_input_offset_s16, vl); + // Perform widening multiply-accumulate (masked) v_acc_s32 = __riscv_vwmacc_vx_i32m4_m(v_active_lane_mask_b8, v_acc_s32, s_filter_val_s16, v_input_plus_offset_s16, vl); } } } + // Start Vectorized Requantization vint32m4_t v_res32; + // Multiply accumulator by scalar multiplier (results in 64b intermediate) vint32m4_t v_prod_lo = 
__riscv_vmul_vx_i32m4(v_acc_s32, scalar_multiplier, vl); vint32m4_t v_prod_hi = __riscv_vmulh_vx_i32m4(v_acc_s32, scalar_multiplier, vl); + // Add 64b rounding value using 32b operations with carry vuint32m4_t v_acc_lo_u = __riscv_vreinterpret_v_i32m4_u32m4(v_prod_lo); vuint32m4_t v_sum_lo_u = __riscv_vadd_vx_u32m4(v_acc_lo_u, rounding_lo, vl); vbool8_t v_carry = __riscv_vmsltu_vx_u32m4_b8(v_sum_lo_u, rounding_lo, vl); @@ -169,6 +201,7 @@ void ConvPerChannelRVV(const ConvParams& params, v_rounded_hi = __riscv_vadd_vx_i32m4_m(v_carry, v_rounded_hi, 1, vl); vint32m4_t v_rounded_lo = __riscv_vreinterpret_v_u32m4_i32m4(v_sum_lo_u); + // Perform 64b arithmetic right shift using 32b vector shifts if (effective_right_shift == 0) { v_res32 = v_rounded_lo; @@ -185,17 +218,22 @@ void ConvPerChannelRVV(const ConvParams& params, v_res32 = __riscv_vsra_vx_i32m4(v_rounded_hi, shift_hi, vl); } + // Add output offset v_res32 = __riscv_vadd_vx_i32m4(v_res32, s_output_offset_s32, vl); + // Clamp to activation bounds v_res32 = __riscv_vmax_vx_i32m4(v_res32, s_output_activation_min_s32, vl); v_res32 = __riscv_vmin_vx_i32m4(v_res32, s_output_activation_max_s32, vl); + // Narrow result to int16 and then int8 with saturation vint16m2_t v_res16 = __riscv_vnclip_wx_i16m2(v_res32, 0, __RISCV_VXRM_RNU, vl); vint8m1_t v_out_s8 = __riscv_vnclip_wx_i8m1(v_res16, 0, __RISCV_VXRM_RNU, vl); + // Store results vector (strided) int8_t* output_strip_base_ptr = output_channel_base + current_out_x * output_w_stride; __riscv_vsse8_v_i8m1(output_strip_base_ptr, output_x_stride_bytes, v_out_s8, vl); + // Advance output x pointer current_out_x += vl; } } @@ -215,6 +253,7 @@ void DepthwiseConvPerChannelRVV(const DepthwiseParams& params, const RuntimeShape& output_shape, int8_t* output_data) { + // Extract depthwise convolution parameters const int32_t input_offset = params.input_offset; const int stride_width = params.stride_width; const int stride_height = params.stride_height; @@ -227,69 +266,81 @@ void 
DepthwiseConvPerChannelRVV(const DepthwiseParams& params, const int32_t output_activation_min = params.quantized_activation_min; const int32_t output_activation_max = params.quantized_activation_max; + // Extract shape dimensions const int output_depth = MatchingDim(filter_shape, 3, output_shape, 3); - const int input_batches = input_shape.Dims(0); const int input_height = input_shape.Dims(1); const int input_width = input_shape.Dims(2); const int input_depth = input_shape.Dims(3); const int filter_height = filter_shape.Dims(1); const int filter_width = filter_shape.Dims(2); - const int output_height = output_shape.Dims(1); const int output_width = output_shape.Dims(2); + // Calculate tensor strides const int input_ch_stride = 1; const int input_w_stride = input_depth; const int input_h_stride = input_width * input_w_stride; const int input_b_stride = input_height * input_h_stride; - const int filter_ch_stride = 1; const int filter_w_stride = output_depth; const int filter_h_stride = filter_width * filter_w_stride; - const int output_ch_stride = 1; const int output_w_stride = output_depth; const int output_h_stride = output_width * output_w_stride; const int output_b_stride = output_height * output_h_stride; + // Prepare scalar constants const int16_t s_input_offset_s16 = static_cast(input_offset); const int32_t s_output_offset_s32 = output_offset; const int32_t s_output_activation_min_s32 = output_activation_min; const int32_t s_output_activation_max_s32 = output_activation_max; + // Loop over batches for (int batch = 0; batch < input_batches; ++batch) { const int8_t* input_batch_base = input_data + batch * input_b_stride; int8_t* output_batch_base = output_data + batch * output_b_stride; + + // Loop over output height for (int out_y = 0; out_y < output_height; ++out_y) { const int in_y_origin = (out_y * stride_height) - pad_height; + + // Loop over input channels (depthwise) for (int in_channel = 0; in_channel < input_depth; ++in_channel) { + // Loop over depth 
multiplier for (int m = 0; m < depth_multiplier; ++m) { + // Calculate the current output channel const int output_channel = m + in_channel * depth_multiplier; + + // Get per-channel requantization parameters const int32_t scalar_multiplier = output_multiplier[output_channel]; const int32_t scalar_shift = output_shift[output_channel]; const int effective_right_shift = 31 - scalar_shift; + // Get bias value for this output channel const int32_t bias_val = bias_data ? bias_data[output_channel] : 0; + // Calculate rounding constants for requantization int64_t rounding_val = (effective_right_shift > 0) ? (INT64_C(1) << (effective_right_shift - 1)) : 0; int32_t rounding_lo = (int32_t)rounding_val; int32_t rounding_hi = (int32_t)(rounding_val >> 32); + // Calculate output pointer and stride for this channel row int8_t* output_channel_row_base = output_batch_base + out_y * output_h_stride + output_channel * output_ch_stride; - const ptrdiff_t output_x_stride_bytes = output_w_stride * sizeof(int8_t); + // Process output width in vector chunks size_t current_out_x = 0; while (current_out_x < (size_t)output_width) { - + // Set vector length for this iteration size_t vl = __riscv_vsetvl_e32m4(output_width - current_out_x); + // Initialize accumulator vector with bias vint32m4_t v_acc_s32; if (bias_data) { @@ -300,56 +351,70 @@ void DepthwiseConvPerChannelRVV(const DepthwiseParams& params, v_acc_s32 = __riscv_vmv_v_x_i32m4(0, vl); } + // Calculate base input x coordinates for the vector lanes vuint32m4_t v_idx = __riscv_vid_v_u32m4(vl); vint32m4_t v_out_x = __riscv_vreinterpret_v_u32m4_i32m4(__riscv_vadd_vx_u32m4(v_idx, (uint32_t)current_out_x, vl)); vint32m4_t v_in_x_origin_base = __riscv_vsub_vx_i32m4(__riscv_vmul_vx_i32m4(v_out_x, stride_width, vl), pad_width, vl); + // Loop over filter height for (int filter_y = 0; filter_y < filter_height; ++filter_y) { + // Calculate input y coordinate and check bounds const int in_y = in_y_origin + dilation_height_factor * filter_y; 
const bool is_y_inside_image = (in_y >= 0) && (in_y < input_height); + // Skip this filter row if input y is out of bounds if (!is_y_inside_image) continue; const int8_t* filter_y_base = filter_data + filter_y * filter_h_stride; + // Loop over filter width for (int filter_x = 0; filter_x < filter_width; ++filter_x) { + // Calculate input x coordinates for the vector lanes for this filter tap const int in_x_offset = dilation_width_factor * filter_x; vint32m4_t v_in_x = __riscv_vadd_vx_i32m4(v_in_x_origin_base, in_x_offset, vl); + // Create mask for valid input coordinates (within image width bounds) vbool8_t v_mask_ge_zero = __riscv_vmsge_vx_i32m4_b8(v_in_x, 0, vl); vbool8_t v_mask_lt_width = __riscv_vmslt_vx_i32m4_b8(v_in_x, input_width, vl); vbool8_t v_active_lane_mask_b8 = __riscv_vmand_mm_b8(v_mask_ge_zero, v_mask_lt_width, vl); + // Skip MAC calculation if all lanes are masked off for this tap uint32_t first_mask_bit = __riscv_vfirst_m_b8(v_active_lane_mask_b8, vl); if (first_mask_bit == (uint32_t)-1 && vl > 0) continue; + // Load scalar filter value for this tap and output channel const int8_t* filter_ptr = filter_y_base + filter_x * filter_w_stride + output_channel * filter_ch_stride; int8_t s_filter_val_s8 = *filter_ptr; int16_t s_filter_val_s16 = static_cast(s_filter_val_s8); + // Calculate base input pointer and stride for vector load (using in_channel) int32_t base_in_x_for_vector0 = (int32_t)current_out_x * stride_width - pad_width + in_x_offset; const int8_t* input_base_ptr = input_batch_base + in_y * input_h_stride + base_in_x_for_vector0 * input_w_stride + in_channel * input_ch_stride; - ptrdiff_t input_x_stride_bytes = (ptrdiff_t)stride_width * input_w_stride * sizeof(int8_t); + // Load input vector (masked, strided), widen, add offset vint8m1_t v_input_s8 = __riscv_vlse8_v_i8m1_m(v_active_lane_mask_b8, input_base_ptr, input_x_stride_bytes, vl); vint16m2_t v_input_s16 = __riscv_vsext_vf2_i16m2_m(v_active_lane_mask_b8, v_input_s8, vl); vint16m2_t 
v_input_plus_offset_s16 = __riscv_vadd_vx_i16m2_m(v_active_lane_mask_b8, v_input_s16, s_input_offset_s16, vl); + // Perform widening multiply-accumulate (masked) v_acc_s32 = __riscv_vwmacc_vx_i32m4_m(v_active_lane_mask_b8, v_acc_s32, s_filter_val_s16, v_input_plus_offset_s16, vl); } } + // Start Vectorized Requantization vint32m4_t v_res32; + // Multiply accumulator by scalar multiplier (results in 64b intermediate) vint32m4_t v_prod_lo = __riscv_vmul_vx_i32m4(v_acc_s32, scalar_multiplier, vl); vint32m4_t v_prod_hi = __riscv_vmulh_vx_i32m4(v_acc_s32, scalar_multiplier, vl); + // Add 64b rounding value using 32b operations with carry vuint32m4_t v_acc_lo_u = __riscv_vreinterpret_v_i32m4_u32m4(v_prod_lo); vuint32m4_t v_sum_lo_u = __riscv_vadd_vx_u32m4(v_acc_lo_u, rounding_lo, vl); vbool8_t v_carry = __riscv_vmsltu_vx_u32m4_b8(v_sum_lo_u, rounding_lo, vl); @@ -357,6 +422,7 @@ void DepthwiseConvPerChannelRVV(const DepthwiseParams& params, v_rounded_hi = __riscv_vadd_vx_i32m4_m(v_carry, v_rounded_hi, 1, vl); vint32m4_t v_rounded_lo = __riscv_vreinterpret_v_u32m4_i32m4(v_sum_lo_u); + // Perform 64b arithmetic right shift using 32b vector shifts if (effective_right_shift == 0) { v_res32 = v_rounded_lo; @@ -373,17 +439,22 @@ void DepthwiseConvPerChannelRVV(const DepthwiseParams& params, v_res32 = __riscv_vsra_vx_i32m4(v_rounded_hi, shift_hi, vl); } + // Add output offset v_res32 = __riscv_vadd_vx_i32m4(v_res32, s_output_offset_s32, vl); + // Clamp to activation bounds v_res32 = __riscv_vmax_vx_i32m4(v_res32, s_output_activation_min_s32, vl); v_res32 = __riscv_vmin_vx_i32m4(v_res32, s_output_activation_max_s32, vl); + // Narrow result to int16 and then int8 with saturation vint16m2_t v_res16 = __riscv_vnclip_wx_i16m2(v_res32, 0, __RISCV_VXRM_RNU, vl); vint8m1_t v_out_s8 = __riscv_vnclip_wx_i8m1(v_res16, 0, __RISCV_VXRM_RNU, vl); + // Store results vector (strided) int8_t* output_strip_base_ptr = output_channel_row_base + current_out_x * output_w_stride; 
__riscv_vsse8_v_i8m1(output_strip_base_ptr, output_x_stride_bytes, v_out_s8, vl); + // Advance output x pointer current_out_x += vl; } } From 0a88c11d6bd95fcc273181fff8c3aac5510dc2a1 Mon Sep 17 00:00:00 2001 From: JaimeHW Date: Sat, 19 Apr 2025 15:22:53 -0500 Subject: [PATCH 27/86] Change C-style casts to static_cast --- .../micro/kernels/riscv_vector/conv_rvv.cc | 26 +++++++++---------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/tensorflow/lite/micro/kernels/riscv_vector/conv_rvv.cc b/tensorflow/lite/micro/kernels/riscv_vector/conv_rvv.cc index e41b412094b..8baeb83abd2 100644 --- a/tensorflow/lite/micro/kernels/riscv_vector/conv_rvv.cc +++ b/tensorflow/lite/micro/kernels/riscv_vector/conv_rvv.cc @@ -101,8 +101,8 @@ void ConvPerChannelRVV(const ConvParams& params, // Calculate rounding constants for requantization int64_t rounding_val = (effective_right_shift > 0) ? (INT64_C(1) << (effective_right_shift - 1)) : 0; - int32_t rounding_lo = (int32_t)rounding_val; - int32_t rounding_hi = (int32_t)(rounding_val >> 32); + int32_t rounding_lo = static_cast(rounding_val); + int32_t rounding_hi = static_cast((rounding_val >> 32)); // Calculate output pointer and stride for this channel row int8_t* output_channel_base = output_row_base + out_channel * output_ch_stride; @@ -110,7 +110,7 @@ void ConvPerChannelRVV(const ConvParams& params, // Process output width in vector chunks size_t current_out_x = 0; - while (current_out_x < (size_t)output_width) + while (current_out_x < static_cast(output_width)) { // Set vector length for this iteration size_t vl = __riscv_vsetvl_e32m4(output_width - current_out_x); @@ -128,7 +128,7 @@ void ConvPerChannelRVV(const ConvParams& params, // Calculate base input x coordinates for the vector lanes vuint32m4_t v_idx = __riscv_vid_v_u32m4(vl); - vint32m4_t v_out_x = __riscv_vreinterpret_v_u32m4_i32m4(__riscv_vadd_vx_u32m4(v_idx, (uint32_t)current_out_x, vl)); + vint32m4_t v_out_x = 
__riscv_vreinterpret_v_u32m4_i32m4(__riscv_vadd_vx_u32m4(v_idx, static_cast(current_out_x), vl)); vint32m4_t v_in_x_origin_base = __riscv_vsub_vx_i32m4(__riscv_vmul_vx_i32m4(v_out_x, stride_width, vl), pad_width, vl); // Loop over filter height @@ -160,10 +160,10 @@ void ConvPerChannelRVV(const ConvParams& params, vbool8_t v_active_lane_mask_b8 = __riscv_vmand_mm_b8(v_mask_ge_zero, v_mask_lt_width, vl); // Calculate base input pointer and stride for vector load - int32_t base_in_x_for_vector0 = (int32_t)current_out_x * stride_width - pad_width + in_x_offset; + int32_t base_in_x_for_vector0 = static_cast(current_out_x) * stride_width - pad_width + in_x_offset; const int8_t* input_base_for_y_x_patch = input_batch_base + (in_y * input_h_stride) + (base_in_x_for_vector0 * input_w_stride) + (group_start_input_channel * input_ch_stride); - ptrdiff_t input_x_stride_bytes = (ptrdiff_t)stride_width * input_w_stride * sizeof(int8_t); + ptrdiff_t input_x_stride_bytes = static_cast(stride_width) * input_w_stride * sizeof(int8_t); // Loop over input channels for this filter tap for (int ic = 0; ic < filter_input_depth; ++ic) @@ -326,8 +326,8 @@ void DepthwiseConvPerChannelRVV(const DepthwiseParams& params, // Calculate rounding constants for requantization int64_t rounding_val = (effective_right_shift > 0) ? 
(INT64_C(1) << (effective_right_shift - 1)) : 0; - int32_t rounding_lo = (int32_t)rounding_val; - int32_t rounding_hi = (int32_t)(rounding_val >> 32); + int32_t rounding_lo = static_cast(rounding_val); + int32_t rounding_hi = static_cast((rounding_val) >> 32); // Calculate output pointer and stride for this channel row int8_t* output_channel_row_base = output_batch_base + out_y * output_h_stride + output_channel * output_ch_stride; @@ -335,7 +335,7 @@ void DepthwiseConvPerChannelRVV(const DepthwiseParams& params, // Process output width in vector chunks size_t current_out_x = 0; - while (current_out_x < (size_t)output_width) + while (current_out_x < static_cast(output_width)) { // Set vector length for this iteration size_t vl = __riscv_vsetvl_e32m4(output_width - current_out_x); @@ -353,7 +353,7 @@ void DepthwiseConvPerChannelRVV(const DepthwiseParams& params, // Calculate base input x coordinates for the vector lanes vuint32m4_t v_idx = __riscv_vid_v_u32m4(vl); - vint32m4_t v_out_x = __riscv_vreinterpret_v_u32m4_i32m4(__riscv_vadd_vx_u32m4(v_idx, (uint32_t)current_out_x, vl)); + vint32m4_t v_out_x = __riscv_vreinterpret_v_u32m4_i32m4(__riscv_vadd_vx_u32m4(v_idx, static_cast(current_out_x), vl)); vint32m4_t v_in_x_origin_base = __riscv_vsub_vx_i32m4(__riscv_vmul_vx_i32m4(v_out_x, stride_width, vl), pad_width, vl); // Loop over filter height @@ -383,7 +383,7 @@ void DepthwiseConvPerChannelRVV(const DepthwiseParams& params, // Skip MAC calculation if all lanes are masked off for this tap uint32_t first_mask_bit = __riscv_vfirst_m_b8(v_active_lane_mask_b8, vl); - if (first_mask_bit == (uint32_t)-1 && vl > 0) + if (first_mask_bit == static_cast(-1) && vl > 0) continue; // Load scalar filter value for this tap and output channel @@ -392,10 +392,10 @@ void DepthwiseConvPerChannelRVV(const DepthwiseParams& params, int16_t s_filter_val_s16 = static_cast(s_filter_val_s8); // Calculate base input pointer and stride for vector load (using in_channel) - int32_t 
base_in_x_for_vector0 = (int32_t)current_out_x * stride_width - pad_width + in_x_offset; + int32_t base_in_x_for_vector0 = static_cast(current_out_x) * stride_width - pad_width + in_x_offset; const int8_t* input_base_ptr = input_batch_base + in_y * input_h_stride + base_in_x_for_vector0 * input_w_stride + in_channel * input_ch_stride; - ptrdiff_t input_x_stride_bytes = (ptrdiff_t)stride_width * input_w_stride * sizeof(int8_t); + ptrdiff_t input_x_stride_bytes = static_cast(stride_width) * input_w_stride * sizeof(int8_t); // Load input vector (masked, strided), widen, add offset vint8m1_t v_input_s8 = __riscv_vlse8_v_i8m1_m(v_active_lane_mask_b8, input_base_ptr, input_x_stride_bytes, vl); From a09fc3e0532d10e140e12cf6afaa274c57b54c52 Mon Sep 17 00:00:00 2001 From: JaimeHW Date: Sun, 20 Apr 2025 14:26:41 -0500 Subject: [PATCH 28/86] FullConnectedRVV and FullyConnectedPerChannelRVV --- .../reference/integer_ops/depthwise_conv.h | 27 +-- .../lite/micro/kernels/fully_connected.cc | 143 ++++++++--- .../riscv_vector/fully_connected_rvv.cc | 227 ++++++++++++++++++ .../riscv_vector/fully_connected_rvv.h | 33 +++ .../make/targets/riscv32_vector_makefile.inc | 3 +- 5 files changed, 369 insertions(+), 64 deletions(-) create mode 100644 tensorflow/lite/micro/kernels/riscv_vector/fully_connected_rvv.cc create mode 100644 tensorflow/lite/micro/kernels/riscv_vector/fully_connected_rvv.h diff --git a/tensorflow/lite/kernels/internal/reference/integer_ops/depthwise_conv.h b/tensorflow/lite/kernels/internal/reference/integer_ops/depthwise_conv.h index 300c6926c8c..87030d9fbea 100644 --- a/tensorflow/lite/kernels/internal/reference/integer_ops/depthwise_conv.h +++ b/tensorflow/lite/kernels/internal/reference/integer_ops/depthwise_conv.h @@ -21,9 +21,6 @@ limitations under the License. namespace tflite { namespace reference_integer_ops { - -// [PEANUT] It seems like the only difference between these are the data types and formats. 
-// We are mainly working on 8-bit data, so the below function is most important inline void DepthwiseConvPerChannel( const DepthwiseParams& params, const int32_t* output_multiplier, const int32_t* output_shift, const RuntimeShape& input_shape, @@ -33,20 +30,13 @@ inline void DepthwiseConvPerChannel( int8_t* output_data) { // Get parameters. // TODO(b/141565753): Re-introduce ScopedProfilingLabel on Micro. - // [PEANUT] Lots of Offset() calls. These map multi-dimensional indices to a one-dimensional index in the data buffers. More details in types.h - // [PEANUT] Offset() defined in tflite-micro/lite/kernels/internal/runtime_shape.h const int stride_width = params.stride_width; const int stride_height = params.stride_height; - // [PEANUT] I think dilation refers to this: https://towardsdatascience.com/a-primer-on-atrous-convolutions-and-depth-wise-separable-convolutions-443b106919f5/ const int dilation_width_factor = params.dilation_width_factor; const int dilation_height_factor = params.dilation_height_factor; - // [PEANUT] Amount to 0-pad the input. This affects low pixel indices. High indices beyond input height and width are always treated as 0. 
const int pad_width = params.padding_values.width; const int pad_height = params.padding_values.height; - // [PEANUT] It seems like each input channel maps to "depth_multiplier" consecutive output channels const int depth_multiplier = params.depth_multiplier; - - // [PEANUT] Activation clamping const int32_t input_offset = params.input_offset; const int32_t output_offset = params.output_offset; const int32_t output_activation_min = params.quantized_activation_min; @@ -57,15 +47,6 @@ inline void DepthwiseConvPerChannel( TFLITE_DCHECK_EQ(filter_shape.DimensionsCount(), 4); TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4); - MicroPrintf("[PEANUT MICROSYSTEMS] Using base depthwise conv"); - - // [PEANUT] Input/output/filter dimensions - // [PEANUT] Input shape (batches, height, width, in-depth) - // [PEANUT] Output shape (batches, height, width, out-depth) - // [PEANUT] Filter shape (1?, height, width, out-depth) - // [PEANUT] These shapes also match how the data is stored in memory: - // batch-major, then row-major, then column-major. - // Channels are last. Refer to RuntimeShape. TFLITE_DCHECK_LE(output_activation_min, output_activation_max); const int batches = MatchingDim(input_shape, 0, output_shape, 0); const int output_depth = MatchingDim(filter_shape, 3, output_shape, 3); @@ -93,8 +74,7 @@ inline void DepthwiseConvPerChannel( const int in_x = in_x_origin + dilation_width_factor * filter_x; const int in_y = in_y_origin + dilation_height_factor * filter_y; - // Zero padding by omitting the areas outside the image - // [PEANUT] The branches may be a bottleneck + // Zero padding by omitting the areas outside the image. 
const bool is_point_inside_image = (in_x >= 0) && (in_x < input_width) && (in_y >= 0) && (in_y < input_height); @@ -126,13 +106,10 @@ inline void DepthwiseConvPerChannel( if (bias_data) { acc += bias_data[output_channel]; } - // [PEANUT] This is analogous to the output_shift in our example - // [PEANUT] tflite-micro/tensorflow/lite/kernels/internal/common.cc acc = MultiplyByQuantizedMultiplier( acc, output_multiplier[output_channel], output_shift[output_channel]); acc += output_offset; - // [PEANUT] Clamp output acc = std::max(acc, output_activation_min); acc = std::min(acc, output_activation_max); output_data[Offset(output_shape, batch, out_y, out_x, @@ -311,4 +288,4 @@ inline void DepthwiseConvHybridPerChannel( } // namespace reference_integer_ops } // namespace tflite -#endif // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_DEPTHWISE_CONV_H_ +#endif // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_DEPTHWISE_CONV_H_ \ No newline at end of file diff --git a/tensorflow/lite/micro/kernels/fully_connected.cc b/tensorflow/lite/micro/kernels/fully_connected.cc index 6bf7665a81f..dabc784a421 100644 --- a/tensorflow/lite/micro/kernels/fully_connected.cc +++ b/tensorflow/lite/micro/kernels/fully_connected.cc @@ -23,6 +23,10 @@ limitations under the License. #include "tensorflow/lite/micro/kernels/kernel_util.h" #include "tensorflow/lite/micro/micro_log.h" +#if defined(TFLM_USE_RISCV_VECTOR) +#include "tensorflow/lite/micro/kernels/riscv_vector/fully_connected_rvv.h" +#endif + namespace tflite { namespace { @@ -181,49 +185,112 @@ TfLiteStatus FullyConnectedEval(TfLiteContext* context, TfLiteNode* node) { break; } case kTfLiteInt8: { - data.is_per_channel - ? 
tflite::reference_integer_ops::FullyConnectedPerChannel( - FullyConnectedParamsQuantized(data), - data.per_channel_output_multiplier, - reinterpret_cast(data.per_channel_output_shift), - tflite::micro::GetTensorShape(input), - tflite::micro::GetTensorData(input), - tflite::micro::GetTensorShape(filter), + if (data.is_per_channel) + { +#if defined(TFLM_USE_RISCV_VECTOR) +#ifdef USE_TFLM_COMPRESSION + // Check if compression is enabled for weights or bias when RVV is active + TFLITE_DCHECK(weights_comp_td == nullptr && bias_comp_td == nullptr); + if (weights_comp_td != nullptr || bias_comp_td != nullptr) + { + MicroPrintf("ERROR: RVV path does not support compressed weights/bias yet for FullyConnected."); + return kTfLiteError; + } +#endif // USE_TFLM_COMPRESSION + // RVV kernel requires int32 bias + if (bias != nullptr && bias->type != kTfLiteInt32) { + MicroPrintf("RVV kernel for FullyConnected requires Int32 bias for per-channel, got %s", TfLiteTypeGetName(bias->type)); + return kTfLiteError; + } + FullyConnectedPerChannelRVV( + FullyConnectedParamsQuantized(data), + data.per_channel_output_multiplier, + reinterpret_cast(data.per_channel_output_shift), + tflite::micro::GetTensorShape(input), + tflite::micro::GetTensorData(input), + tflite::micro::GetTensorShape(filter), + tflite::micro::GetTensorData(filter), + tflite::micro::GetTensorShape(bias), + tflite::micro::GetOptionalTensorData(bias), + tflite::micro::GetTensorShape(output), + tflite::micro::GetTensorData(output) + ); +#else // defined(TFLM_USE_RISCV_VECTOR) + tflite::reference_integer_ops::FullyConnectedPerChannel( + FullyConnectedParamsQuantized(data), + data.per_channel_output_multiplier, + reinterpret_cast(data.per_channel_output_shift), + tflite::micro::GetTensorShape(input), + tflite::micro::GetTensorData(input), + tflite::micro::GetTensorShape(filter), #ifdef USE_TFLM_COMPRESSION - tflite::micro::GetTensorData( - micro_context, filter, weights_comp_td, - data.weights_scratch_index), - 
tflite::micro::GetTensorShape(bias), - tflite::micro::GetOptionalTensorData( - micro_context, bias, bias_comp_td, - data.bias_scratch_index), + tflite::micro::GetTensorData( + micro_context, filter, weights_comp_td, + data.weights_scratch_index), + tflite::micro::GetTensorShape(bias), + tflite::micro::GetOptionalTensorData( + micro_context, bias, bias_comp_td, + data.bias_scratch_index), #else // USE_TFLM_COMPRESSION - tflite::micro::GetTensorData(filter), - tflite::micro::GetTensorShape(bias), - tflite::micro::GetOptionalTensorData(bias), + tflite::micro::GetTensorData(filter), + tflite::micro::GetTensorShape(bias), + tflite::micro::GetOptionalTensorData(bias), #endif // USE_TFLM_COMPRESSION - tflite::micro::GetTensorShape(output), - tflite::micro::GetTensorData(output)) - : tflite::reference_integer_ops::FullyConnected( - FullyConnectedParamsQuantized(data), - tflite::micro::GetTensorShape(input), - tflite::micro::GetTensorData(input), - tflite::micro::GetTensorShape(filter), + tflite::micro::GetTensorShape(output), + tflite::micro::GetTensorData(output)); +#endif // defined(TFLM_USE_RISCV_VECTOR) + } + else // if (!data.is_per_channel) + { +#if defined(TFLM_USE_RISCV_VECTOR) #ifdef USE_TFLM_COMPRESSION - tflite::micro::GetTensorData( - micro_context, filter, weights_comp_td, - data.weights_scratch_index), - tflite::micro::GetTensorShape(bias), - tflite::micro::GetOptionalTensorData( - micro_context, bias, bias_comp_td, - data.bias_scratch_index), + // Check if compression is enabled for weights or bias when RVV is active + TFLITE_DCHECK(weights_comp_td == nullptr && bias_comp_td == nullptr); + if (weights_comp_td != nullptr || bias_comp_td != nullptr) + { + MicroPrintf("ERROR: RVV path does not support compressed weights/bias yet for FullyConnected."); + return kTfLiteError; + } +#endif // USE_TFLM_COMPRESSION + // RVV kernel requires int32 bias + if (bias != nullptr && bias->type != kTfLiteInt32) { + MicroPrintf("RVV kernel for FullyConnected requires Int32 
bias, got %s", TfLiteTypeGetName(bias->type)); + return kTfLiteError; + } + FullyConnectedRVV( + FullyConnectedParamsQuantized(data), + tflite::micro::GetTensorShape(input), + tflite::micro::GetTensorData(input), + tflite::micro::GetTensorShape(filter), + tflite::micro::GetTensorData(filter), + tflite::micro::GetTensorShape(bias), + tflite::micro::GetOptionalTensorData(bias), + tflite::micro::GetTensorShape(output), + tflite::micro::GetTensorData(output) + ); +#else // defined(TFLM_USE_RISCV_VECTOR) + tflite::reference_integer_ops::FullyConnected( + FullyConnectedParamsQuantized(data), + tflite::micro::GetTensorShape(input), + tflite::micro::GetTensorData(input), + tflite::micro::GetTensorShape(filter), +#ifdef USE_TFLM_COMPRESSION + tflite::micro::GetTensorData( + micro_context, filter, weights_comp_td, + data.weights_scratch_index), + tflite::micro::GetTensorShape(bias), + tflite::micro::GetOptionalTensorData( + micro_context, bias, bias_comp_td, + data.bias_scratch_index), #else // USE_TFLM_COMPRESSION - tflite::micro::GetTensorData(filter), - tflite::micro::GetTensorShape(bias), - tflite::micro::GetOptionalTensorData(bias), + tflite::micro::GetTensorData(filter), + tflite::micro::GetTensorShape(bias), + tflite::micro::GetOptionalTensorData(bias), #endif // USE_TFLM_COMPRESSION - tflite::micro::GetTensorShape(output), - tflite::micro::GetTensorData(output)); + tflite::micro::GetTensorShape(output), + tflite::micro::GetTensorData(output)); +#endif // defined(TFLM_USE_RISCV_VECTOR) + } break; } default: { @@ -360,4 +427,4 @@ TFLMInferenceRegistration RegisterInference_FULLY_CONNECTED() { return tflite::micro::RegisterOp(FullyConnectedEval); } -} // namespace tflite +} // namespace tflite \ No newline at end of file diff --git a/tensorflow/lite/micro/kernels/riscv_vector/fully_connected_rvv.cc b/tensorflow/lite/micro/kernels/riscv_vector/fully_connected_rvv.cc new file mode 100644 index 00000000000..34a094a4cc4 --- /dev/null +++ 
b/tensorflow/lite/micro/kernels/riscv_vector/fully_connected_rvv.cc @@ -0,0 +1,227 @@ +#include + +#include "tensorflow/lite/kernels/internal/common.h" +#include "tensorflow/lite/micro/micro_log.h" + +using namespace tflite; + +void FullyConnectedPerChannelRVV(const FullyConnectedParams& params, + const int32_t* output_multiplier, + const int* output_shift, + const RuntimeShape& input_shape, + const int8_t* input_data, + const RuntimeShape& filter_shape, + const int8_t* filter_data, + const RuntimeShape& bias_shape, + const int32_t* bias_data, + const RuntimeShape& output_shape, + int8_t* output_data) +{ + // Extract quantization parameters + const int32_t input_offset = params.input_offset; + const int32_t output_offset = params.output_offset; + const int32_t output_activation_min = params.quantized_activation_min; + const int32_t output_activation_max = params.quantized_activation_max; + + // Extract shape dimensions + const int filter_dim_count = filter_shape.DimensionsCount(); + const int output_dim_count = output_shape.DimensionsCount(); + const int batches = FlatSizeSkipDim(output_shape, output_dim_count - 1); + const int output_depth = output_shape.Dims(output_dim_count - 1); + const int accum_depth = filter_shape.Dims(filter_dim_count - 1); + + // Prepare scalar constants for vector operations + const int16_t s_input_offset_s16 = static_cast(input_offset); + const int32_t s_output_offset_s32 = output_offset; + const int32_t s_output_activation_min_s32 = output_activation_min; + const int32_t s_output_activation_max_s32 = output_activation_max; + + // Loop over batches + for (int b = 0; b < batches; ++b) + { + // Set base pointers for the current batch + const int8_t* input_batch_ptr = input_data + b * accum_depth; + int8_t* output_batch_ptr = output_data + b * output_depth; + + // Loop over output channels (rows of the weight matrix) + for (int out_c = 0; out_c < output_depth; ++out_c) { + // Set filter pointer and get bias for the current output channel + 
const int8_t* filter_row_ptr = filter_data + out_c * accum_depth; + const int32_t bias_val = bias_data ? bias_data[out_c] : 0; + + // Initialize vector accumulator to zero + size_t initial_vl_for_acc_init = __riscv_vsetvlmax_e16m2(); + vint32m4_t v_acc_s32m4 = __riscv_vmv_v_x_i32m4(0, initial_vl_for_acc_init); + + // Initialize scalar accumulator with bias value + int32_t s_acc_s32 = bias_val; + + // Loop over accumulation depth (dot product length) in vector + // chunks + size_t current_d = 0; + while (current_d < static_cast(accum_depth)) + { + // Set vector length for the current chunk + size_t vl = __riscv_vsetvl_e16m2(accum_depth - current_d); + + // Load input vector chunk, widen to i16, and add input offset + vint8m1_t v_input_s8 = __riscv_vle8_v_i8m1(input_batch_ptr + current_d, vl); + vint16m2_t v_input_s16 = __riscv_vsext_vf2_i16m2(v_input_s8, vl); + vint16m2_t v_input_plus_offset_s16 = __riscv_vadd_vx_i16m2(v_input_s16, s_input_offset_s16, vl); + + // Load filter vector chunk and widen to i16 + vint8m1_t v_filter_s8 = __riscv_vle8_v_i8m1(filter_row_ptr + current_d, vl); + vint16m2_t v_filter_s16 = __riscv_vsext_vf2_i16m2(v_filter_s8, vl); + + // Perform widening multiply-accumulate + v_acc_s32m4 = __riscv_vwmacc_vv_i32m4(v_acc_s32m4, v_input_plus_offset_s16, v_filter_s16, vl); + + // Advance pointer for the next chunk + current_d += vl; + } + + // Reduce the final vector accumulator to a scalar sum + size_t final_vl = __riscv_vsetvl_e32m4(accum_depth > 0 ? 
1 : 0); + if (accum_depth > 0) + { + // Set VL for reduction based on accumulated depth + final_vl = __riscv_vsetvl_e32m4(accum_depth); + + // Initialize reduction target vector register to zero + vint32m1_t v_zero = __riscv_vmv_v_x_i32m1(0, 1); + + // Perform reduction sum + vint32m1_t v_reduced_sum_s32m1 = __riscv_vredsum_vs_i32m4_i32m1(v_acc_s32m4, v_zero, final_vl); + + // Extract scalar sum and add to the bias-initialized scalar accumulator + s_acc_s32 += __riscv_vmv_x_s_i32m1_i32(v_reduced_sum_s32m1); + } + + // Apply per-channel requantization (scalar multiplication and shift) + int32_t s_requantized_acc_s32 = MultiplyByQuantizedMultiplier(s_acc_s32, output_multiplier[out_c], output_shift[out_c]); + + // Add output offset to the requantized value + s_requantized_acc_s32 += s_output_offset_s32; + + // Clamp the result to the activation range + s_requantized_acc_s32 = std::max(s_requantized_acc_s32, s_output_activation_min_s32); + s_requantized_acc_s32 = std::min(s_requantized_acc_s32, s_output_activation_max_s32); + + // Store the final int8 result + output_batch_ptr[out_c] = static_cast(s_requantized_acc_s32); + } + } +} + +void FullyConnectedRVV(const FullyConnectedParams& params, + const RuntimeShape& input_shape, + const int8_t* input_data, + const RuntimeShape& filter_shape, + const int8_t* filter_data, + const RuntimeShape& bias_shape, + const int32_t* bias_data, + const RuntimeShape& output_shape, + int8_t* output_data) +{ + // Extract quantization parameters (scalar values for the whole layer) + const int32_t input_offset = params.input_offset; + const int32_t filter_offset = params.weights_offset; + const int32_t output_offset = params.output_offset; + const int32_t output_multiplier = params.output_multiplier; + const int output_shift = params.output_shift; + const int32_t output_activation_min = params.quantized_activation_min; + const int32_t output_activation_max = params.quantized_activation_max; + + // Extract shape dimensions + const int 
filter_dim_count = filter_shape.DimensionsCount(); + const int output_dim_count = output_shape.DimensionsCount(); + const int batches = FlatSizeSkipDim(output_shape, output_dim_count - 1); + const int output_depth = output_shape.Dims(output_dim_count - 1); + const int accum_depth = filter_shape.Dims(filter_dim_count - 1); + + // Prepare scalar constants for vector operations + const int16_t s_input_offset_s16 = static_cast(input_offset); + const int16_t s_filter_offset_s16 = static_cast(filter_offset); + const int32_t s_output_offset_s32 = output_offset; + const int32_t s_output_activation_min_s32 = output_activation_min; + const int32_t s_output_activation_max_s32 = output_activation_max; + + // Loop over batches + for (int b = 0; b < batches; ++b) + { + // Set base pointers for the current batch + const int8_t* input_batch_ptr = input_data + b * accum_depth; + int8_t* output_batch_ptr = output_data + b * output_depth; + + // Loop over output channels (rows of the weight matrix) + for (int out_c = 0; out_c < output_depth; ++out_c) + { + // Set filter pointer and get bias for the current output channel + const int8_t* filter_row_ptr = filter_data + out_c * accum_depth; + // Bias is int32_t for non-per-channel int8 quantization + const int32_t bias_val = bias_data ? 
bias_data[out_c] : 0; + + // Initialize vector accumulator to zero + // Use vlmax corresponding to operand type (e16m2) to determine acc size + size_t initial_vl_for_acc_init = __riscv_vsetvlmax_e16m2(); + vint32m4_t v_acc_s32m4 = __riscv_vmv_v_x_i32m4(0, initial_vl_for_acc_init); + + // Initialize scalar accumulator with bias value + int32_t s_acc_s32 = bias_val; + + // Loop over accumulation depth (dot product length) in vector chunks + size_t current_d = 0; + while (current_d < static_cast(accum_depth)) + { + // Set vector length for the current chunk + size_t vl = __riscv_vsetvl_e16m2(accum_depth - current_d); + + // Load input vector chunk, widen to i16, and add input offset + vint8m1_t v_input_s8 = __riscv_vle8_v_i8m1(input_batch_ptr + current_d, vl); + vint16m2_t v_input_s16 = __riscv_vsext_vf2_i16m2(v_input_s8, vl); + vint16m2_t v_input_plus_offset_s16 = __riscv_vadd_vx_i16m2(v_input_s16, s_input_offset_s16, vl); + + // Load filter vector chunk, widen to i16, and add filter offset + vint8m1_t v_filter_s8 = __riscv_vle8_v_i8m1(filter_row_ptr + current_d, vl); + vint16m2_t v_filter_s16 = __riscv_vsext_vf2_i16m2(v_filter_s8, vl); + vint16m2_t v_filter_plus_offset_s16 = __riscv_vadd_vx_i16m2(v_filter_s16, s_filter_offset_s16, vl); + + // Perform widening multiply-accumulate + v_acc_s32m4 = __riscv_vwmacc_vv_i32m4(v_acc_s32m4, v_input_plus_offset_s16, v_filter_plus_offset_s16, vl); + + // Advance pointer for the next chunk + current_d += vl; + } + + // Reduce the final vector accumulator to a scalar sum + size_t final_vl = __riscv_vsetvl_e32m4(accum_depth > 0 ? 
1 : 0); + if (accum_depth > 0) + { + // Set VL for reduction based on accumulated depth + final_vl = __riscv_vsetvl_e32m4(accum_depth); + + // Initialize reduction target vector register to zero + vint32m1_t v_zero = __riscv_vmv_v_x_i32m1(0, 1); + + // Perform reduction sum + vint32m1_t v_reduced_sum_s32m1 = __riscv_vredsum_vs_i32m4_i32m1(v_acc_s32m4, v_zero, final_vl); + + // Extract scalar sum and add to the bias-initialized scalar accumulator + s_acc_s32 += __riscv_vmv_x_s_i32m1_i32(v_reduced_sum_s32m1); + } + + // Apply uniform requantization (scalar multiplication and shift) + int32_t s_requantized_acc_s32 = MultiplyByQuantizedMultiplier(s_acc_s32, output_multiplier, output_shift); + + // Add output offset to the requantized value + s_requantized_acc_s32 += s_output_offset_s32; + + // Clamp the result to the activation range + s_requantized_acc_s32 = std::max(s_requantized_acc_s32, s_output_activation_min_s32); + s_requantized_acc_s32 = std::min(s_requantized_acc_s32, s_output_activation_max_s32); + + // Store the final int8 result (using batch offset) + output_batch_ptr[out_c] = static_cast(s_requantized_acc_s32); + } + } +} diff --git a/tensorflow/lite/micro/kernels/riscv_vector/fully_connected_rvv.h b/tensorflow/lite/micro/kernels/riscv_vector/fully_connected_rvv.h new file mode 100644 index 00000000000..d3694a65c8e --- /dev/null +++ b/tensorflow/lite/micro/kernels/riscv_vector/fully_connected_rvv.h @@ -0,0 +1,33 @@ +#ifndef TENSORFLOW_LITE_MICRO_KERNELS_RISCV_VECTOR_FULLY_CONNECTED_RVV_H_ +#define TENSORFLOW_LITE_MICRO_KERNELS_RISCV_VECTOR_FULLY_CONNECTED_RVV_H_ + +#include "tensorflow/lite/micro/kernels/fully_connected.h" +#include "tensorflow/lite/c/common.h" + +using namespace tflite; + +void FullyConnectedPerChannelRVV( + const FullyConnectedParams& params, + const int32_t* output_multiplier, + const int* output_shift, + const RuntimeShape& input_shape, + const int8_t* input_data, + const RuntimeShape& filter_shape, + const int8_t* filter_data, + const 
RuntimeShape& bias_shape, + const int32_t* bias_data, + const RuntimeShape& output_shape, + int8_t* output_data); + +void FullyConnectedRVV( + const FullyConnectedParams& params, + const RuntimeShape& input_shape, + const int8_t* input_data, + const RuntimeShape& filter_shape, + const int8_t* filter_data, + const RuntimeShape& bias_shape, + const int32_t* bias_data, + const RuntimeShape& output_shape, + int8_t* output_data); + +#endif \ No newline at end of file diff --git a/tensorflow/lite/micro/tools/make/targets/riscv32_vector_makefile.inc b/tensorflow/lite/micro/tools/make/targets/riscv32_vector_makefile.inc index 8ae7cbe0b2f..0df87e9714e 100644 --- a/tensorflow/lite/micro/tools/make/targets/riscv32_vector_makefile.inc +++ b/tensorflow/lite/micro/tools/make/targets/riscv32_vector_makefile.inc @@ -60,4 +60,5 @@ SIZE_SCRIPT := ${TENSORFLOW_ROOT}tensorflow/lite/micro/testing/size_riscv32_bina include $(MAKEFILE_DIR)/ext_libs/eyalroz_printf.inc MICROLITE_CC_SRCS += \ - tensorflow/lite/micro/kernels/riscv_vector/conv_rvv.cc + tensorflow/lite/micro/kernels/riscv_vector/conv_rvv.cc \ + tensorflow/lite/micro/kernels/riscv_vector/fully_connected_rvv.cc From 5875dfae400fdf52a61e6fb670cd0c9c2ce18656 Mon Sep 17 00:00:00 2001 From: JaimeHW Date: Sun, 20 Apr 2025 14:38:18 -0500 Subject: [PATCH 29/86] Delete unused file copy --- .../kernels/riscv_vector/depthwise_conv.cc | 192 ------------------ 1 file changed, 192 deletions(-) delete mode 100644 tensorflow/lite/micro/kernels/riscv_vector/depthwise_conv.cc diff --git a/tensorflow/lite/micro/kernels/riscv_vector/depthwise_conv.cc b/tensorflow/lite/micro/kernels/riscv_vector/depthwise_conv.cc deleted file mode 100644 index 3c381ecfc9f..00000000000 --- a/tensorflow/lite/micro/kernels/riscv_vector/depthwise_conv.cc +++ /dev/null @@ -1,192 +0,0 @@ -/* Copyright 2024 The TensorFlow Authors. All Rights Reserved. 
- -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -#include "tensorflow/lite/micro/kernels/depthwise_conv.h" - -#include "tensorflow/lite/c/builtin_op_data.h" -#include "tensorflow/lite/c/common.h" -#include "tensorflow/lite/kernels/internal/portable_tensor_utils.h" -#include "tensorflow/lite/kernels/internal/reference/depthwiseconv_float.h" -#include "tensorflow/lite/kernels/internal/reference/integer_ops/depthwise_conv.h" -#include "tensorflow/lite/kernels/kernel_util.h" -#include "tensorflow/lite/micro/kernels/kernel_util.h" -#include "tensorflow/lite/micro/micro_log.h" - -namespace tflite { -namespace { - -void* DepthwiseConvInit(TfLiteContext* context, const char* buffer, - size_t length) { - TFLITE_DCHECK(context->AllocatePersistentBuffer != nullptr); - return context->AllocatePersistentBuffer(context, sizeof(OpDataConv)); -} - -TfLiteStatus DepthwiseConvEval(TfLiteContext* context, TfLiteNode* node) { - TFLITE_DCHECK(node->user_data != nullptr); - TFLITE_DCHECK(node->builtin_data != nullptr); - - auto& params = - *(reinterpret_cast(node->builtin_data)); - const OpDataConv& data = *(static_cast(node->user_data)); - - TfLiteEvalTensor* output = - tflite::micro::GetEvalOutput(context, node, kDepthwiseConvOutputTensor); - const TfLiteEvalTensor* input = - tflite::micro::GetEvalInput(context, node, kDepthwiseConvInputTensor); - const TfLiteEvalTensor* filter = - tflite::micro::GetEvalInput(context, node, 
kDepthwiseConvWeightsTensor); - const TfLiteEvalTensor* bias = - (NumInputs(node) == 3) - ? tflite::micro::GetEvalInput(context, node, kDepthwiseConvBiasTensor) - : nullptr; - - MicroPrintf("[PEANUT MICROSYSTEMS] Using vectorized implementation"); - -#ifdef USE_TFLM_COMPRESSION - - MicroContext* micro_context = GetMicroContext(context); - - const CompressionTensorData* filter_comp_td = - micro_context->GetTensorCompressionData(node, - kDepthwiseConvWeightsTensor); - const CompressionTensorData* bias_comp_td = - micro_context->GetTensorCompressionData(node, kDepthwiseConvBiasTensor); - -#endif // USE_TFLM_COMPRESSION - - switch (input->type) { // Already know in/out types are same. - case kTfLiteFloat32: { - tflite::reference_ops::DepthwiseConv( - DepthwiseConvParamsFloat(params, data), - tflite::micro::GetTensorShape(input), - tflite::micro::GetTensorData(input), - tflite::micro::GetTensorShape(filter), -#ifdef USE_TFLM_COMPRESSION - tflite::micro::GetTensorData(micro_context, filter, - filter_comp_td, - data.weights_scratch_index), - tflite::micro::GetTensorShape(bias), - tflite::micro::GetOptionalTensorData( - micro_context, bias, bias_comp_td, data.bias_scratch_index), -#else // USE_TFLM_COMPRESSION - tflite::micro::GetTensorData(filter), - tflite::micro::GetTensorShape(bias), - tflite::micro::GetOptionalTensorData(bias), -#endif // USE_TFLM_COMPRESSION - tflite::micro::GetTensorShape(output), - tflite::micro::GetTensorData(output)); - break; - } - case kTfLiteInt8: { - switch (filter->type) { - case kTfLiteInt4: { - int8_t* unpacked_filter_data = static_cast( - context->GetScratchBuffer(context, data.filter_buffer_index)); - tflite::tensor_utils::UnpackDenseInt4IntoInt8( - tflite::micro::GetTensorData(filter), - tflite::micro::GetTensorShape(filter).FlatSize(), - unpacked_filter_data); - reference_integer_ops::DepthwiseConvPerChannel( - DepthwiseConvParamsQuantized(params, data), - data.per_channel_output_multiplier, data.per_channel_output_shift, - 
tflite::micro::GetTensorShape(input), - tflite::micro::GetTensorData(input), - tflite::micro::GetTensorShape(filter), unpacked_filter_data, - tflite::micro::GetTensorShape(bias), - tflite::micro::GetOptionalTensorData(bias), - tflite::micro::GetTensorShape(output), - tflite::micro::GetTensorData(output)); - break; - } - case kTfLiteInt8: { - reference_integer_ops::DepthwiseConvPerChannel( - DepthwiseConvParamsQuantized(params, data), - data.per_channel_output_multiplier, data.per_channel_output_shift, - tflite::micro::GetTensorShape(input), - tflite::micro::GetTensorData(input), - tflite::micro::GetTensorShape(filter), -#ifdef USE_TFLM_COMPRESSION - tflite::micro::GetTensorData(micro_context, filter, - filter_comp_td, - data.weights_scratch_index), - tflite::micro::GetTensorShape(bias), - tflite::micro::GetOptionalTensorData( - micro_context, bias, bias_comp_td, data.bias_scratch_index), -#else // USE_TFLM_COMPRESSION - tflite::micro::GetTensorData(filter), - tflite::micro::GetTensorShape(bias), - tflite::micro::GetOptionalTensorData(bias), -#endif // USE_TFLM_COMPRESSION - tflite::micro::GetTensorShape(output), - tflite::micro::GetTensorData(output)); - break; - } - default: - MicroPrintf("Filter type %s (%d) for input type %s not supported.", - TfLiteTypeGetName(filter->type), filter->type, - TfLiteTypeGetName(input->type)); - return kTfLiteError; - } - break; - } - case kTfLiteInt16: { - switch (filter->type) { - case kTfLiteInt8: { - reference_integer_ops::DepthwiseConvPerChannel( - DepthwiseConvParamsQuantized(params, data), - data.per_channel_output_multiplier, data.per_channel_output_shift, - tflite::micro::GetTensorShape(input), - tflite::micro::GetTensorData(input), - tflite::micro::GetTensorShape(filter), -#ifdef USE_TFLM_COMPRESSION - tflite::micro::GetTensorData(micro_context, filter, - filter_comp_td, - data.weights_scratch_index), - tflite::micro::GetTensorShape(bias), - tflite::micro::GetOptionalTensorData( - micro_context, bias, bias_comp_td, 
data.bias_scratch_index), -#else // USE_TFLM_COMPRESSION - tflite::micro::GetTensorData(filter), - tflite::micro::GetTensorShape(bias), - tflite::micro::GetOptionalTensorData(bias), -#endif // USE_TFLM_COMPRESSION - tflite::micro::GetTensorShape(output), - tflite::micro::GetTensorData(output)); - break; - } - default: - MicroPrintf("Filter type %s (%d) for input type %s not supported.", - TfLiteTypeGetName(filter->type), filter->type, - TfLiteTypeGetName(input->type)); - return kTfLiteError; - } - break; - } - default: - MicroPrintf("Input type %s (%d) not supported.", - TfLiteTypeGetName(input->type), input->type); - return kTfLiteError; - } - return kTfLiteOk; -} - -} // namespace - -TFLMRegistration Register_DEPTHWISE_CONV_2D() { - return tflite::micro::RegisterOp(DepthwiseConvInit, DepthwiseConvPrepare, - DepthwiseConvEval); -} - -} // namespace tflite From 221f33d3c7d85bff6b5894311595d2dd358f1dbe Mon Sep 17 00:00:00 2001 From: JaimeHW Date: Sun, 20 Apr 2025 15:36:36 -0500 Subject: [PATCH 30/86] Reimplement convolution dispatcher --- .../lite/micro/kernels/riscv_vector/conv.cc | 199 ++++++++++++++++++ .../micro/kernels/riscv_vector/rfft_rvv.cc | 0 .../micro/kernels/riscv_vector/rfft_rvv.h | 10 + .../make/targets/riscv32_vector_makefile.inc | 7 +- 4 files changed, 215 insertions(+), 1 deletion(-) create mode 100644 tensorflow/lite/micro/kernels/riscv_vector/conv.cc create mode 100644 tensorflow/lite/micro/kernels/riscv_vector/rfft_rvv.cc create mode 100644 tensorflow/lite/micro/kernels/riscv_vector/rfft_rvv.h diff --git a/tensorflow/lite/micro/kernels/riscv_vector/conv.cc b/tensorflow/lite/micro/kernels/riscv_vector/conv.cc new file mode 100644 index 00000000000..ac732310e16 --- /dev/null +++ b/tensorflow/lite/micro/kernels/riscv_vector/conv.cc @@ -0,0 +1,199 @@ +/* Copyright 2024 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/lite/micro/kernels/conv.h" + +#include "tensorflow/lite/c/builtin_op_data.h" +#include "tensorflow/lite/c/common.h" +#include "tensorflow/lite/kernels/internal/portable_tensor_utils.h" +#include "tensorflow/lite/kernels/internal/reference/conv.h" +#include "tensorflow/lite/kernels/internal/reference/integer_ops/conv.h" +#include "tensorflow/lite/kernels/kernel_util.h" +#include "tensorflow/lite/micro/kernels/kernel_util.h" +#include "tensorflow/lite/micro/micro_log.h" + +#include "tensorflow/lite/micro/kernels/riscv_vector/conv_rvv.h" + +namespace tflite { +namespace { + +TfLiteStatus ConvEval(TfLiteContext* context, TfLiteNode* node) { + const TfLiteEvalTensor* input = + tflite::micro::GetEvalInput(context, node, kConvInputTensor); + const TfLiteEvalTensor* filter = + tflite::micro::GetEvalInput(context, node, kConvWeightsTensor); + const TfLiteEvalTensor* bias = + (NumInputs(node) == 3) + ? 
tflite::micro::GetEvalInput(context, node, kConvBiasTensor) + : nullptr; + TfLiteEvalTensor* output = + tflite::micro::GetEvalOutput(context, node, kConvOutputTensor); + + TFLITE_DCHECK(node->builtin_data != nullptr); + const auto& params = + *(reinterpret_cast(node->builtin_data)); + TFLITE_DCHECK(node->user_data != nullptr); + const auto& data = *(static_cast(node->user_data)); + +#ifdef USE_TFLM_COMPRESSION + + MicroContext* micro_context = GetMicroContext(context); + + const CompressionTensorData* weights_comp_td = + micro_context->GetTensorCompressionData(node, kConvWeightsTensor); + const CompressionTensorData* bias_comp_td = + micro_context->GetTensorCompressionData(node, kConvBiasTensor); + +#endif // USE_TFLM_COMPRESSION + + switch (input->type) { // Already know in/out types are same. + case kTfLiteFloat32: { + tflite::reference_ops::Conv( + ConvParamsFloat(params, data), tflite::micro::GetTensorShape(input), + tflite::micro::GetTensorData(input), + tflite::micro::GetTensorShape(filter), +#ifdef USE_TFLM_COMPRESSION + tflite::micro::GetTensorData(micro_context, filter, + weights_comp_td, + data.weights_scratch_index), + tflite::micro::GetTensorShape(bias), + tflite::micro::GetOptionalTensorData( + micro_context, bias, bias_comp_td, data.bias_scratch_index), +#else // USE_TFLM_COMPRESSION + tflite::micro::GetTensorData(filter), + tflite::micro::GetTensorShape(bias), + tflite::micro::GetOptionalTensorData(bias), +#endif // USE_TFLM_COMPRESSION + tflite::micro::GetTensorShape(output), + tflite::micro::GetTensorData(output), + tflite::micro::GetTensorShape(nullptr), nullptr); + break; + } + case kTfLiteInt16: { + if (bias == nullptr || bias->type == kTfLiteInt32) { + reference_integer_ops::ConvPerChannel( + ConvParamsQuantized(params, data), + data.per_channel_output_multiplier, data.per_channel_output_shift, + tflite::micro::GetTensorShape(input), + tflite::micro::GetTensorData(input), + tflite::micro::GetTensorShape(filter), +#ifdef USE_TFLM_COMPRESSION + 
tflite::micro::GetTensorData(micro_context, filter, + weights_comp_td, + data.weights_scratch_index), + tflite::micro::GetTensorShape(bias), + tflite::micro::GetOptionalTensorData( + micro_context, bias, bias_comp_td, data.bias_scratch_index), +#else // USE_TFLM_COMPRESSION + tflite::micro::GetTensorData(filter), + tflite::micro::GetTensorShape(bias), + tflite::micro::GetOptionalTensorData(bias), +#endif // USE_TFLM_COMPRESSION + tflite::micro::GetTensorShape(output), + tflite::micro::GetTensorData(output)); + } else if (bias->type == kTfLiteInt64) { + reference_integer_ops::ConvPerChannel( + ConvParamsQuantized(params, data), + data.per_channel_output_multiplier, data.per_channel_output_shift, + tflite::micro::GetTensorShape(input), + tflite::micro::GetTensorData(input), + tflite::micro::GetTensorShape(filter), +#ifdef USE_TFLM_COMPRESSION + tflite::micro::GetTensorData(micro_context, filter, + weights_comp_td, + data.weights_scratch_index), + tflite::micro::GetTensorShape(bias), + tflite::micro::GetTensorData( + micro_context, bias, bias_comp_td, data.bias_scratch_index), +#else // USE_TFLM_COMPRESSION + tflite::micro::GetTensorData(filter), + tflite::micro::GetTensorShape(bias), + tflite::micro::GetTensorData(bias), +#endif // USE_TFLM_COMPRESSION + tflite::micro::GetTensorShape(output), + tflite::micro::GetTensorData(output)); + } else { + MicroPrintf("Bias type %s (%d) not supported.", + TfLiteTypeGetName(bias->type), bias->type); + return kTfLiteError; + } + break; + } + case kTfLiteInt8: { + switch (filter->type) { + case kTfLiteInt4: { + int8_t* unpacked_filter_data = static_cast( + context->GetScratchBuffer(context, data.filter_buffer_index)); + tflite::tensor_utils::UnpackDenseInt4IntoInt8( + tflite::micro::GetTensorData(filter), + tflite::micro::GetTensorShape(filter).FlatSize(), + unpacked_filter_data); + reference_integer_ops::ConvPerChannel( + ConvParamsQuantized(params, data), + data.per_channel_output_multiplier, data.per_channel_output_shift, + 
tflite::micro::GetTensorShape(input), + tflite::micro::GetTensorData(input), + tflite::micro::GetTensorShape(filter), unpacked_filter_data, + tflite::micro::GetTensorShape(bias), + tflite::micro::GetOptionalTensorData(bias), + tflite::micro::GetTensorShape(output), + tflite::micro::GetTensorData(output)); + break; + } + case kTfLiteInt8: { + ConvPerChannelRVV( + ConvParamsQuantized(params, data), + data.per_channel_output_multiplier, data.per_channel_output_shift, + tflite::micro::GetTensorShape(input), + tflite::micro::GetTensorData(input), + tflite::micro::GetTensorShape(filter), +#ifdef USE_TFLM_COMPRESSION + tflite::micro::GetTensorData(micro_context, filter, + weights_comp_td, + data.weights_scratch_index), + tflite::micro::GetTensorShape(bias), + tflite::micro::GetOptionalTensorData( + micro_context, bias, bias_comp_td, data.bias_scratch_index), +#else // USE_TFLM_COMPRESSION + tflite::micro::GetTensorData(filter), + tflite::micro::GetTensorShape(bias), + tflite::micro::GetOptionalTensorData(bias), +#endif // USE_TFLM_COMPRESSION + tflite::micro::GetTensorShape(output), + tflite::micro::GetTensorData(output)); + break; + } + default: + MicroPrintf("Weight type %s (%d) not supported.", + TfLiteTypeGetName(filter->type), filter->type); + return kTfLiteError; + } + break; + } + default: + MicroPrintf("Type %s (%d) not supported.", TfLiteTypeGetName(input->type), + input->type); + return kTfLiteError; + } + return kTfLiteOk; +} + +} // namespace + +TFLMRegistration Register_CONV_2D() { + return tflite::micro::RegisterOp(ConvInit, ConvPrepare, ConvEval); +} + +} // namespace tflite \ No newline at end of file diff --git a/tensorflow/lite/micro/kernels/riscv_vector/rfft_rvv.cc b/tensorflow/lite/micro/kernels/riscv_vector/rfft_rvv.cc new file mode 100644 index 00000000000..e69de29bb2d diff --git a/tensorflow/lite/micro/kernels/riscv_vector/rfft_rvv.h b/tensorflow/lite/micro/kernels/riscv_vector/rfft_rvv.h new file mode 100644 index 00000000000..e17af7a0365 --- 
/dev/null +++ b/tensorflow/lite/micro/kernels/riscv_vector/rfft_rvv.h @@ -0,0 +1,10 @@ +#ifndef TENSORFLOW_LITE_MICRO_KERNELS_RISCV_VECTOR_RFFT_RVV_H_ +#define TENSORFLOW_LITE_MICRO_KERNELS_RISCV_VECTOR_RFFT_RVV_H_ + +#include "tensorflow/lite/c/common.h" + +using namespace tflite; + + + +#endif \ No newline at end of file diff --git a/tensorflow/lite/micro/tools/make/targets/riscv32_vector_makefile.inc b/tensorflow/lite/micro/tools/make/targets/riscv32_vector_makefile.inc index 0df87e9714e..977e6dbb2f7 100644 --- a/tensorflow/lite/micro/tools/make/targets/riscv32_vector_makefile.inc +++ b/tensorflow/lite/micro/tools/make/targets/riscv32_vector_makefile.inc @@ -61,4 +61,9 @@ include $(MAKEFILE_DIR)/ext_libs/eyalroz_printf.inc MICROLITE_CC_SRCS += \ tensorflow/lite/micro/kernels/riscv_vector/conv_rvv.cc \ - tensorflow/lite/micro/kernels/riscv_vector/fully_connected_rvv.cc + tensorflow/lite/micro/kernels/riscv_vector/fully_connected_rvv.cc \ + tensorflow/lite/micro/kernels/riscv_vector/conv.cc + +EXCLUDED_SRCS := \ + tensorflow/lite/micro/kernels/conv.cc + From 54ba9fe77fa4e8f8b253c28cbf39de2b23793791 Mon Sep 17 00:00:00 2001 From: JaimeHW Date: Sun, 20 Apr 2025 15:40:05 -0500 Subject: [PATCH 31/86] depwthwise convolution dispatcher --- .../lite/micro/kernels/depthwise_conv.cc | 38 +--- .../kernels/riscv_vector/depthwise_conv.cc | 192 ++++++++++++++++++ .../make/targets/riscv32_vector_makefile.inc | 7 +- 3 files changed, 198 insertions(+), 39 deletions(-) create mode 100644 tensorflow/lite/micro/kernels/riscv_vector/depthwise_conv.cc diff --git a/tensorflow/lite/micro/kernels/depthwise_conv.cc b/tensorflow/lite/micro/kernels/depthwise_conv.cc index a7be7ff37e7..fb5cc3878ae 100644 --- a/tensorflow/lite/micro/kernels/depthwise_conv.cc +++ b/tensorflow/lite/micro/kernels/depthwise_conv.cc @@ -24,11 +24,6 @@ limitations under the License. 
#include "tensorflow/lite/micro/kernels/kernel_util.h" #include "tensorflow/lite/micro/micro_log.h" -#if defined(TFLM_USE_RISCV_VECTOR) -#include "tensorflow/lite/micro/kernels/riscv_vector/conv_rvv.h" -#endif - - namespace tflite { namespace { @@ -114,36 +109,6 @@ TfLiteStatus DepthwiseConvEval(TfLiteContext* context, TfLiteNode* node) { break; } case kTfLiteInt8: { -#if defined(TFLM_USE_RISCV_VECTOR) -#ifdef USE_TFLM_COMPRESSION - TFLITE_DCHECK(weights_comp_td == nullptr && bias_comp_td == nullptr); - if (weights_comp_td != nullptr || bias_comp_td != nullptr) - { - MicroPrintf("ERROR: RVV path does not support compressed weights/bias yet."); - return kTfLiteError; - } -#endif // USE_TFLM_COMPRESSION - // Check bias type is compatible (as per your original check) - if (bias != nullptr && bias->type != kTfLiteInt32) { - MicroPrintf("RVV kernel requires Int32 bias, got %s", TfLiteTypeGetName(bias->type)); - return kTfLiteError; - } - - // Call the optimized RVV kernel with the *new* correct parameters - DepthwiseConvPerChannelRVV( - DepthwiseConvParamsQuantized(params, data), // const ConvParams& params - data.per_channel_output_multiplier, // const int32_t* output_multiplier - data.per_channel_output_shift, // const int32_t* output_shift - tflite::micro::GetTensorShape(input), // const RuntimeShape& input_shape - tflite::micro::GetTensorData(input), // const int8_t* input_data - tflite::micro::GetTensorShape(filter), // const RuntimeShape& filter_shape - tflite::micro::GetTensorData(filter),// const int8_t* filter_data - tflite::micro::GetTensorShape(bias), // const RuntimeShape& bias_shape - tflite::micro::GetOptionalTensorData(bias), // const int32_t* bias_data - tflite::micro::GetTensorShape(output), // const RuntimeShape& output_shape - tflite::micro::GetTensorData(output) // int8_t* output_data - ); -#else reference_integer_ops::DepthwiseConvPerChannel( DepthwiseConvParamsQuantized(params, data), data.per_channel_output_multiplier, 
data.per_channel_output_shift, @@ -164,7 +129,6 @@ TfLiteStatus DepthwiseConvEval(TfLiteContext* context, TfLiteNode* node) { #endif // USE_TFLM_COMPRESSION tflite::micro::GetTensorShape(output), tflite::micro::GetTensorData(output)); -#endif break; } default: @@ -223,4 +187,4 @@ TFLMRegistration Register_DEPTHWISE_CONV_2D() { DepthwiseConvEval); } -} // namespace tflite +} // namespace tflite \ No newline at end of file diff --git a/tensorflow/lite/micro/kernels/riscv_vector/depthwise_conv.cc b/tensorflow/lite/micro/kernels/riscv_vector/depthwise_conv.cc new file mode 100644 index 00000000000..f2a1ce9ec9a --- /dev/null +++ b/tensorflow/lite/micro/kernels/riscv_vector/depthwise_conv.cc @@ -0,0 +1,192 @@ +/* Copyright 2024 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/lite/micro/kernels/depthwise_conv.h" + +#include "tensorflow/lite/c/builtin_op_data.h" +#include "tensorflow/lite/c/common.h" +#include "tensorflow/lite/kernels/internal/portable_tensor_utils.h" +#include "tensorflow/lite/kernels/internal/reference/depthwiseconv_float.h" +#include "tensorflow/lite/kernels/internal/reference/integer_ops/depthwise_conv.h" +#include "tensorflow/lite/kernels/kernel_util.h" +#include "tensorflow/lite/micro/kernels/kernel_util.h" +#include "tensorflow/lite/micro/micro_log.h" + +#include "tensorflow/lite/micro/kernels/riscv_vector/conv_rvv.h" + +namespace tflite { +namespace { + +void* DepthwiseConvInit(TfLiteContext* context, const char* buffer, + size_t length) { + TFLITE_DCHECK(context->AllocatePersistentBuffer != nullptr); + return context->AllocatePersistentBuffer(context, sizeof(OpDataConv)); +} + +TfLiteStatus DepthwiseConvEval(TfLiteContext* context, TfLiteNode* node) { + TFLITE_DCHECK(node->user_data != nullptr); + TFLITE_DCHECK(node->builtin_data != nullptr); + + auto& params = + *(reinterpret_cast(node->builtin_data)); + const OpDataConv& data = *(static_cast(node->user_data)); + + TfLiteEvalTensor* output = + tflite::micro::GetEvalOutput(context, node, kDepthwiseConvOutputTensor); + const TfLiteEvalTensor* input = + tflite::micro::GetEvalInput(context, node, kDepthwiseConvInputTensor); + const TfLiteEvalTensor* filter = + tflite::micro::GetEvalInput(context, node, kDepthwiseConvWeightsTensor); + const TfLiteEvalTensor* bias = + (NumInputs(node) == 3) + ? 
tflite::micro::GetEvalInput(context, node, kDepthwiseConvBiasTensor) + : nullptr; + +#ifdef USE_TFLM_COMPRESSION + + MicroContext* micro_context = GetMicroContext(context); + + const CompressionTensorData* filter_comp_td = + micro_context->GetTensorCompressionData(node, + kDepthwiseConvWeightsTensor); + const CompressionTensorData* bias_comp_td = + micro_context->GetTensorCompressionData(node, kDepthwiseConvBiasTensor); + +#endif // USE_TFLM_COMPRESSION + + switch (input->type) { // Already know in/out types are same. + case kTfLiteFloat32: { + tflite::reference_ops::DepthwiseConv( + DepthwiseConvParamsFloat(params, data), + tflite::micro::GetTensorShape(input), + tflite::micro::GetTensorData(input), + tflite::micro::GetTensorShape(filter), +#ifdef USE_TFLM_COMPRESSION + tflite::micro::GetTensorData(micro_context, filter, + filter_comp_td, + data.weights_scratch_index), + tflite::micro::GetTensorShape(bias), + tflite::micro::GetOptionalTensorData( + micro_context, bias, bias_comp_td, data.bias_scratch_index), +#else // USE_TFLM_COMPRESSION + tflite::micro::GetTensorData(filter), + tflite::micro::GetTensorShape(bias), + tflite::micro::GetOptionalTensorData(bias), +#endif // USE_TFLM_COMPRESSION + tflite::micro::GetTensorShape(output), + tflite::micro::GetTensorData(output)); + break; + } + case kTfLiteInt8: { + switch (filter->type) { + case kTfLiteInt4: { + int8_t* unpacked_filter_data = static_cast( + context->GetScratchBuffer(context, data.filter_buffer_index)); + tflite::tensor_utils::UnpackDenseInt4IntoInt8( + tflite::micro::GetTensorData(filter), + tflite::micro::GetTensorShape(filter).FlatSize(), + unpacked_filter_data); + reference_integer_ops::DepthwiseConvPerChannel( + DepthwiseConvParamsQuantized(params, data), + data.per_channel_output_multiplier, data.per_channel_output_shift, + tflite::micro::GetTensorShape(input), + tflite::micro::GetTensorData(input), + tflite::micro::GetTensorShape(filter), unpacked_filter_data, + 
tflite::micro::GetTensorShape(bias), + tflite::micro::GetOptionalTensorData(bias), + tflite::micro::GetTensorShape(output), + tflite::micro::GetTensorData(output)); + break; + } + case kTfLiteInt8: { + DepthwiseConvPerChannelRVV( + DepthwiseConvParamsQuantized(params, data), + data.per_channel_output_multiplier, data.per_channel_output_shift, + tflite::micro::GetTensorShape(input), + tflite::micro::GetTensorData(input), + tflite::micro::GetTensorShape(filter), +#ifdef USE_TFLM_COMPRESSION + tflite::micro::GetTensorData(micro_context, filter, + filter_comp_td, + data.weights_scratch_index), + tflite::micro::GetTensorShape(bias), + tflite::micro::GetOptionalTensorData( + micro_context, bias, bias_comp_td, data.bias_scratch_index), +#else // USE_TFLM_COMPRESSION + tflite::micro::GetTensorData(filter), + tflite::micro::GetTensorShape(bias), + tflite::micro::GetOptionalTensorData(bias), +#endif // USE_TFLM_COMPRESSION + tflite::micro::GetTensorShape(output), + tflite::micro::GetTensorData(output)); + break; + } + default: + MicroPrintf("Filter type %s (%d) for input type %s not supported.", + TfLiteTypeGetName(filter->type), filter->type, + TfLiteTypeGetName(input->type)); + return kTfLiteError; + } + break; + } + case kTfLiteInt16: { + switch (filter->type) { + case kTfLiteInt8: { + reference_integer_ops::DepthwiseConvPerChannel( + DepthwiseConvParamsQuantized(params, data), + data.per_channel_output_multiplier, data.per_channel_output_shift, + tflite::micro::GetTensorShape(input), + tflite::micro::GetTensorData(input), + tflite::micro::GetTensorShape(filter), +#ifdef USE_TFLM_COMPRESSION + tflite::micro::GetTensorData(micro_context, filter, + filter_comp_td, + data.weights_scratch_index), + tflite::micro::GetTensorShape(bias), + tflite::micro::GetOptionalTensorData( + micro_context, bias, bias_comp_td, data.bias_scratch_index), +#else // USE_TFLM_COMPRESSION + tflite::micro::GetTensorData(filter), + tflite::micro::GetTensorShape(bias), + 
tflite::micro::GetOptionalTensorData(bias), +#endif // USE_TFLM_COMPRESSION + tflite::micro::GetTensorShape(output), + tflite::micro::GetTensorData(output)); + break; + } + default: + MicroPrintf("Filter type %s (%d) for input type %s not supported.", + TfLiteTypeGetName(filter->type), filter->type, + TfLiteTypeGetName(input->type)); + return kTfLiteError; + } + break; + } + default: + MicroPrintf("Input type %s (%d) not supported.", + TfLiteTypeGetName(input->type), input->type); + return kTfLiteError; + } + return kTfLiteOk; +} + +} // namespace + +TFLMRegistration Register_DEPTHWISE_CONV_2D() { + return tflite::micro::RegisterOp(DepthwiseConvInit, DepthwiseConvPrepare, + DepthwiseConvEval); +} + +} // namespace tflite \ No newline at end of file diff --git a/tensorflow/lite/micro/tools/make/targets/riscv32_vector_makefile.inc b/tensorflow/lite/micro/tools/make/targets/riscv32_vector_makefile.inc index 977e6dbb2f7..a7ec7e84921 100644 --- a/tensorflow/lite/micro/tools/make/targets/riscv32_vector_makefile.inc +++ b/tensorflow/lite/micro/tools/make/targets/riscv32_vector_makefile.inc @@ -62,8 +62,11 @@ include $(MAKEFILE_DIR)/ext_libs/eyalroz_printf.inc MICROLITE_CC_SRCS += \ tensorflow/lite/micro/kernels/riscv_vector/conv_rvv.cc \ tensorflow/lite/micro/kernels/riscv_vector/fully_connected_rvv.cc \ - tensorflow/lite/micro/kernels/riscv_vector/conv.cc + tensorflow/lite/micro/kernels/riscv_vector/conv.cc \ + tensorflow/lite/micro/kernels/riscv_vector/depthwise_conv.cc EXCLUDED_SRCS := \ - tensorflow/lite/micro/kernels/conv.cc + tensorflow/lite/micro/kernels/conv.cc \ + tensorflow/lite/micro/kernels/depthwise_conv.cc + From 81d4da00bc49f3eeaf200ff82f255910f0555d73 Mon Sep 17 00:00:00 2001 From: JaimeHW Date: Sun, 20 Apr 2025 15:43:37 -0500 Subject: [PATCH 32/86] FullyConnected dispatcher --- .../lite/micro/kernels/fully_connected.cc | 141 ++----- .../kernels/riscv_vector/fully_connected.cc | 365 ++++++++++++++++++ .../make/targets/riscv32_vector_makefile.inc | 6 +- 3 
files changed, 406 insertions(+), 106 deletions(-) create mode 100644 tensorflow/lite/micro/kernels/riscv_vector/fully_connected.cc diff --git a/tensorflow/lite/micro/kernels/fully_connected.cc b/tensorflow/lite/micro/kernels/fully_connected.cc index dabc784a421..b632457aaa9 100644 --- a/tensorflow/lite/micro/kernels/fully_connected.cc +++ b/tensorflow/lite/micro/kernels/fully_connected.cc @@ -23,10 +23,6 @@ limitations under the License. #include "tensorflow/lite/micro/kernels/kernel_util.h" #include "tensorflow/lite/micro/micro_log.h" -#if defined(TFLM_USE_RISCV_VECTOR) -#include "tensorflow/lite/micro/kernels/riscv_vector/fully_connected_rvv.h" -#endif - namespace tflite { namespace { @@ -185,112 +181,49 @@ TfLiteStatus FullyConnectedEval(TfLiteContext* context, TfLiteNode* node) { break; } case kTfLiteInt8: { - if (data.is_per_channel) - { -#if defined(TFLM_USE_RISCV_VECTOR) -#ifdef USE_TFLM_COMPRESSION - // Check if compression is enabled for weights or bias when RVV is active - TFLITE_DCHECK(weights_comp_td == nullptr && bias_comp_td == nullptr); - if (weights_comp_td != nullptr || bias_comp_td != nullptr) - { - MicroPrintf("ERROR: RVV path does not support compressed weights/bias yet for FullyConnected."); - return kTfLiteError; - } -#endif // USE_TFLM_COMPRESSION - // RVV kernel requires int32 bias - if (bias != nullptr && bias->type != kTfLiteInt32) { - MicroPrintf("RVV kernel for FullyConnected requires Int32 bias for per-channel, got %s", TfLiteTypeGetName(bias->type)); - return kTfLiteError; - } - FullyConnectedPerChannelRVV( - FullyConnectedParamsQuantized(data), - data.per_channel_output_multiplier, - reinterpret_cast(data.per_channel_output_shift), - tflite::micro::GetTensorShape(input), - tflite::micro::GetTensorData(input), - tflite::micro::GetTensorShape(filter), - tflite::micro::GetTensorData(filter), - tflite::micro::GetTensorShape(bias), - tflite::micro::GetOptionalTensorData(bias), - tflite::micro::GetTensorShape(output), - 
tflite::micro::GetTensorData(output) - ); -#else // defined(TFLM_USE_RISCV_VECTOR) - tflite::reference_integer_ops::FullyConnectedPerChannel( - FullyConnectedParamsQuantized(data), - data.per_channel_output_multiplier, - reinterpret_cast(data.per_channel_output_shift), - tflite::micro::GetTensorShape(input), - tflite::micro::GetTensorData(input), - tflite::micro::GetTensorShape(filter), + data.is_per_channel + ? tflite::reference_integer_ops::FullyConnectedPerChannel( + FullyConnectedParamsQuantized(data), + data.per_channel_output_multiplier, + reinterpret_cast(data.per_channel_output_shift), + tflite::micro::GetTensorShape(input), + tflite::micro::GetTensorData(input), + tflite::micro::GetTensorShape(filter), #ifdef USE_TFLM_COMPRESSION - tflite::micro::GetTensorData( - micro_context, filter, weights_comp_td, - data.weights_scratch_index), - tflite::micro::GetTensorShape(bias), - tflite::micro::GetOptionalTensorData( - micro_context, bias, bias_comp_td, - data.bias_scratch_index), + tflite::micro::GetTensorData( + micro_context, filter, weights_comp_td, + data.weights_scratch_index), + tflite::micro::GetTensorShape(bias), + tflite::micro::GetOptionalTensorData( + micro_context, bias, bias_comp_td, + data.bias_scratch_index), #else // USE_TFLM_COMPRESSION - tflite::micro::GetTensorData(filter), - tflite::micro::GetTensorShape(bias), - tflite::micro::GetOptionalTensorData(bias), + tflite::micro::GetTensorData(filter), + tflite::micro::GetTensorShape(bias), + tflite::micro::GetOptionalTensorData(bias), #endif // USE_TFLM_COMPRESSION - tflite::micro::GetTensorShape(output), - tflite::micro::GetTensorData(output)); -#endif // defined(TFLM_USE_RISCV_VECTOR) - } - else // if (!data.is_per_channel) - { -#if defined(TFLM_USE_RISCV_VECTOR) + tflite::micro::GetTensorShape(output), + tflite::micro::GetTensorData(output)) + : tflite::reference_integer_ops::FullyConnected( + FullyConnectedParamsQuantized(data), + tflite::micro::GetTensorShape(input), + 
tflite::micro::GetTensorData(input), + tflite::micro::GetTensorShape(filter), #ifdef USE_TFLM_COMPRESSION - // Check if compression is enabled for weights or bias when RVV is active - TFLITE_DCHECK(weights_comp_td == nullptr && bias_comp_td == nullptr); - if (weights_comp_td != nullptr || bias_comp_td != nullptr) - { - MicroPrintf("ERROR: RVV path does not support compressed weights/bias yet for FullyConnected."); - return kTfLiteError; - } -#endif // USE_TFLM_COMPRESSION - // RVV kernel requires int32 bias - if (bias != nullptr && bias->type != kTfLiteInt32) { - MicroPrintf("RVV kernel for FullyConnected requires Int32 bias, got %s", TfLiteTypeGetName(bias->type)); - return kTfLiteError; - } - FullyConnectedRVV( - FullyConnectedParamsQuantized(data), - tflite::micro::GetTensorShape(input), - tflite::micro::GetTensorData(input), - tflite::micro::GetTensorShape(filter), - tflite::micro::GetTensorData(filter), - tflite::micro::GetTensorShape(bias), - tflite::micro::GetOptionalTensorData(bias), - tflite::micro::GetTensorShape(output), - tflite::micro::GetTensorData(output) - ); -#else // defined(TFLM_USE_RISCV_VECTOR) - tflite::reference_integer_ops::FullyConnected( - FullyConnectedParamsQuantized(data), - tflite::micro::GetTensorShape(input), - tflite::micro::GetTensorData(input), - tflite::micro::GetTensorShape(filter), -#ifdef USE_TFLM_COMPRESSION - tflite::micro::GetTensorData( - micro_context, filter, weights_comp_td, - data.weights_scratch_index), - tflite::micro::GetTensorShape(bias), - tflite::micro::GetOptionalTensorData( - micro_context, bias, bias_comp_td, - data.bias_scratch_index), + tflite::micro::GetTensorData( + micro_context, filter, weights_comp_td, + data.weights_scratch_index), + tflite::micro::GetTensorShape(bias), + tflite::micro::GetOptionalTensorData( + micro_context, bias, bias_comp_td, + data.bias_scratch_index), #else // USE_TFLM_COMPRESSION - tflite::micro::GetTensorData(filter), - tflite::micro::GetTensorShape(bias), - 
tflite::micro::GetOptionalTensorData(bias), + tflite::micro::GetTensorData(filter), + tflite::micro::GetTensorShape(bias), + tflite::micro::GetOptionalTensorData(bias), #endif // USE_TFLM_COMPRESSION - tflite::micro::GetTensorShape(output), - tflite::micro::GetTensorData(output)); -#endif // defined(TFLM_USE_RISCV_VECTOR) - } + tflite::micro::GetTensorShape(output), + tflite::micro::GetTensorData(output)); break; } default: { diff --git a/tensorflow/lite/micro/kernels/riscv_vector/fully_connected.cc b/tensorflow/lite/micro/kernels/riscv_vector/fully_connected.cc new file mode 100644 index 00000000000..32fb6b9fb44 --- /dev/null +++ b/tensorflow/lite/micro/kernels/riscv_vector/fully_connected.cc @@ -0,0 +1,365 @@ +/* Copyright 2025 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/lite/micro/kernels/fully_connected.h" + +#include "tensorflow/lite/c/builtin_op_data.h" +#include "tensorflow/lite/c/common.h" +#include "tensorflow/lite/kernels/internal/portable_tensor_utils.h" +#include "tensorflow/lite/kernels/internal/reference/fully_connected.h" +#include "tensorflow/lite/kernels/internal/reference/integer_ops/fully_connected.h" +#include "tensorflow/lite/micro/kernels/kernel_util.h" +#include "tensorflow/lite/micro/micro_log.h" + +#include "tensorflow/lite/micro/kernels/riscv_vector/fully_connected_rvv.h" + +namespace tflite { +namespace { + +void* FullyConnectedInit(TfLiteContext* context, const char* buffer, + size_t length) { + TFLITE_DCHECK(context->AllocatePersistentBuffer != nullptr); + return context->AllocatePersistentBuffer(context, + sizeof(OpDataFullyConnected)); +} + +TfLiteStatus FullyConnectedPrepare(TfLiteContext* context, TfLiteNode* node) { + MicroContext* micro_context = GetMicroContext(context); + + TFLITE_DCHECK(node->user_data != nullptr); + TFLITE_DCHECK(node->builtin_data != nullptr); + + auto* data = static_cast(node->user_data); + const auto params = + static_cast(node->builtin_data); + + TfLiteTensor* input = + micro_context->AllocateTempInputTensor(node, kFullyConnectedInputTensor); + TF_LITE_ENSURE(context, input != nullptr); + TfLiteTensor* filter = micro_context->AllocateTempInputTensor( + node, kFullyConnectedWeightsTensor); + TF_LITE_ENSURE(context, filter != nullptr); + TfLiteTensor* bias = + micro_context->AllocateTempInputTensor(node, kFullyConnectedBiasTensor); + TfLiteTensor* output = micro_context->AllocateTempOutputTensor( + node, kFullyConnectedOutputTensor); + TF_LITE_ENSURE(context, output != nullptr); + TF_LITE_ENSURE_TYPES_EQ(context, input->type, output->type); + + if ((input->type == kTfLiteFloat32 && filter->type != kTfLiteFloat32) || + (input->type == kTfLiteInt8 && + (filter->type != 
kTfLiteInt8 && filter->type != kTfLiteInt4)) || + (input->type == kTfLiteInt16 && filter->type != kTfLiteInt8)) { + MicroPrintf("Input type: %s with filter type: %s not supported.", + TfLiteTypeGetName(input->type), + TfLiteTypeGetName(filter->type)); + return kTfLiteError; + } + + if (filter->type == kTfLiteInt4) { + int filter_size = + RuntimeShape(filter->dims->size, + reinterpret_cast(filter->dims->data)) + .FlatSize(); + context->RequestScratchBufferInArena(context, filter_size, + &data->filter_buffer_index); + } + + TF_LITE_ENSURE_OK(context, CalculateOpDataFullyConnected( + context, params->activation, input->type, + input, filter, bias, output, data)); + +#ifdef USE_TFLM_COMPRESSION + + // Compression scratch buffers. + // These will only be allocated if the tensor is compressed. + if (micro_context->IsTensorCompressed(node, kFullyConnectedWeightsTensor) && + filter->type == kTfLiteInt4) { + MicroPrintf("Compression not supported with INT4 tensors"); + return kTfLiteError; + } + data->weights_scratch_index = + micro_context->AllocateDecompressionScratchBuffer( + node, kFullyConnectedWeightsTensor); + data->bias_scratch_index = micro_context->AllocateDecompressionScratchBuffer( + node, kFullyConnectedBiasTensor); + +#endif // USE_TFLM_COMPRESSION + + micro_context->DeallocateTempTfLiteTensor(input); + micro_context->DeallocateTempTfLiteTensor(filter); + if (bias != nullptr) { + micro_context->DeallocateTempTfLiteTensor(bias); + } + micro_context->DeallocateTempTfLiteTensor(output); + return kTfLiteOk; +} + +TfLiteStatus FullyConnectedEval(TfLiteContext* context, TfLiteNode* node) { + TFLITE_DCHECK(node->builtin_data != nullptr); + const auto* params = + static_cast(node->builtin_data); + + const TfLiteEvalTensor* input = + tflite::micro::GetEvalInput(context, node, kFullyConnectedInputTensor); + const TfLiteEvalTensor* filter = + tflite::micro::GetEvalInput(context, node, kFullyConnectedWeightsTensor); + const TfLiteEvalTensor* bias = + 
tflite::micro::GetEvalInput(context, node, kFullyConnectedBiasTensor); + TfLiteEvalTensor* output = + tflite::micro::GetEvalOutput(context, node, kFullyConnectedOutputTensor); + +#ifdef USE_TFLM_COMPRESSION + + MicroContext* micro_context = GetMicroContext(context); + + const CompressionTensorData* weights_comp_td = + micro_context->GetTensorCompressionData(node, + kFullyConnectedWeightsTensor); + const CompressionTensorData* bias_comp_td = + micro_context->GetTensorCompressionData(node, kFullyConnectedBiasTensor); + +#endif // USE_TFLM_COMPRESSION + + TFLITE_DCHECK(node->user_data != nullptr); + const auto& data = + *(static_cast(node->user_data)); + + // Checks in Prepare ensure input, output and filter types are all the same. + switch (input->type) { + case kTfLiteFloat32: { + tflite::reference_ops::FullyConnected( + FullyConnectedParamsFloat(params->activation), + tflite::micro::GetTensorShape(input), + tflite::micro::GetTensorData(input), + tflite::micro::GetTensorShape(filter), +#ifdef USE_TFLM_COMPRESSION + tflite::micro::GetTensorData(micro_context, filter, + weights_comp_td, + data.weights_scratch_index), + tflite::micro::GetTensorShape(bias), + tflite::micro::GetOptionalTensorData( + micro_context, bias, bias_comp_td, data.bias_scratch_index), +#else // USE_TFLM_COMPRESSION + tflite::micro::GetTensorData(filter), + tflite::micro::GetTensorShape(bias), + tflite::micro::GetOptionalTensorData(bias), +#endif // USE_TFLM_COMPRESSION + tflite::micro::GetTensorShape(output), + tflite::micro::GetTensorData(output)); + break; + } + + case kTfLiteInt8: { + switch (filter->type) { + case kTfLiteInt4: { + int8_t* unpacked_filter_data = static_cast( + context->GetScratchBuffer(context, data.filter_buffer_index)); + tflite::tensor_utils::UnpackDenseInt4IntoInt8( + tflite::micro::GetTensorData(filter), + tflite::micro::GetTensorShape(filter).FlatSize(), + unpacked_filter_data); + tflite::reference_integer_ops::FullyConnected( + FullyConnectedParamsQuantized(data), + 
tflite::micro::GetTensorShape(input), + tflite::micro::GetTensorData(input), + tflite::micro::GetTensorShape(filter), unpacked_filter_data, + tflite::micro::GetTensorShape(bias), + tflite::micro::GetOptionalTensorData(bias), + tflite::micro::GetTensorShape(output), + tflite::micro::GetTensorData(output)); + break; + } + case kTfLiteInt8: { + data.is_per_channel + ? FullyConnectedPerChannelRVV( + FullyConnectedParamsQuantized(data), + data.per_channel_output_multiplier, + reinterpret_cast(data.per_channel_output_shift), + tflite::micro::GetTensorShape(input), + tflite::micro::GetTensorData(input), + tflite::micro::GetTensorShape(filter), +#ifdef USE_TFLM_COMPRESSION + tflite::micro::GetTensorData( + micro_context, filter, weights_comp_td, + data.weights_scratch_index), + tflite::micro::GetTensorShape(bias), + tflite::micro::GetOptionalTensorData( + micro_context, bias, bias_comp_td, + data.bias_scratch_index), +#else // USE_TFLM_COMPRESSION + tflite::micro::GetTensorData(filter), + tflite::micro::GetTensorShape(bias), + tflite::micro::GetOptionalTensorData(bias), +#endif // USE_TFLM_COMPRESSION + tflite::micro::GetTensorShape(output), + tflite::micro::GetTensorData(output)) + : FullyConnectedRVV( + FullyConnectedParamsQuantized(data), + tflite::micro::GetTensorShape(input), + tflite::micro::GetTensorData(input), + tflite::micro::GetTensorShape(filter), +#ifdef USE_TFLM_COMPRESSION + tflite::micro::GetTensorData( + micro_context, filter, weights_comp_td, + data.weights_scratch_index), + tflite::micro::GetTensorShape(bias), + tflite::micro::GetOptionalTensorData( + micro_context, bias, bias_comp_td, + data.bias_scratch_index), +#else // USE_TFLM_COMPRESSION + tflite::micro::GetTensorData(filter), + tflite::micro::GetTensorShape(bias), + tflite::micro::GetOptionalTensorData(bias), +#endif // USE_TFLM_COMPRESSION + tflite::micro::GetTensorShape(output), + tflite::micro::GetTensorData(output)); + break; + } + default: { + MicroPrintf("Filter type %s (%d) not supported.", 
+ TfLiteTypeGetName(filter->type), input->type); + return kTfLiteError; + } + } + break; + } + + case kTfLiteInt16: { + switch (filter->type) { + case kTfLiteInt8: { + if (bias == nullptr || bias->type == kTfLiteInt32) { + data.is_per_channel + ? tflite::reference_integer_ops::FullyConnectedPerChannel( + FullyConnectedParamsQuantized(data), + data.per_channel_output_multiplier, + reinterpret_cast( + data.per_channel_output_shift), + tflite::micro::GetTensorShape(input), + tflite::micro::GetTensorData(input), + tflite::micro::GetTensorShape(filter), +#ifdef USE_TFLM_COMPRESSION + tflite::micro::GetTensorData( + micro_context, filter, weights_comp_td, + data.weights_scratch_index), + tflite::micro::GetTensorShape(bias), + tflite::micro::GetOptionalTensorData( + micro_context, bias, bias_comp_td, + data.bias_scratch_index), +#else // USE_TFLM_COMPRESSION + tflite::micro::GetTensorData(filter), + tflite::micro::GetTensorShape(bias), + tflite::micro::GetOptionalTensorData(bias), +#endif // USE_TFLM_COMPRESSION + tflite::micro::GetTensorShape(output), + tflite::micro::GetTensorData(output)) + : tflite::reference_integer_ops::FullyConnected( + FullyConnectedParamsQuantized(data), + tflite::micro::GetTensorShape(input), + tflite::micro::GetTensorData(input), + tflite::micro::GetTensorShape(filter), +#ifdef USE_TFLM_COMPRESSION + tflite::micro::GetTensorData( + micro_context, filter, weights_comp_td, + data.weights_scratch_index), + tflite::micro::GetTensorShape(bias), + tflite::micro::GetOptionalTensorData( + micro_context, bias, bias_comp_td, + data.bias_scratch_index), +#else // USE_TFLM_COMPRESSION + tflite::micro::GetTensorData(filter), + tflite::micro::GetTensorShape(bias), + tflite::micro::GetOptionalTensorData(bias), +#endif // USE_TFLM_COMPRESSION + tflite::micro::GetTensorShape(output), + tflite::micro::GetTensorData(output)); + } else if (bias->type == kTfLiteInt64) { + data.is_per_channel + ? 
tflite::reference_integer_ops::FullyConnectedPerChannel( + FullyConnectedParamsQuantized(data), + data.per_channel_output_multiplier, + reinterpret_cast( + data.per_channel_output_shift), + tflite::micro::GetTensorShape(input), + tflite::micro::GetTensorData(input), + tflite::micro::GetTensorShape(filter), +#ifdef USE_TFLM_COMPRESSION + tflite::micro::GetTensorData( + micro_context, filter, weights_comp_td, + data.weights_scratch_index), + tflite::micro::GetTensorShape(bias), + tflite::micro::GetOptionalTensorData( + micro_context, bias, bias_comp_td, + data.bias_scratch_index), +#else // USE_TFLM_COMPRESSION + tflite::micro::GetTensorData(filter), + tflite::micro::GetTensorShape(bias), + tflite::micro::GetOptionalTensorData(bias), +#endif // USE_TFLM_COMPRESSION + tflite::micro::GetTensorShape(output), + tflite::micro::GetTensorData(output)) + : tflite::reference_integer_ops::FullyConnected( + FullyConnectedParamsQuantized(data), + tflite::micro::GetTensorShape(input), + tflite::micro::GetTensorData(input), + tflite::micro::GetTensorShape(filter), +#ifdef USE_TFLM_COMPRESSION + tflite::micro::GetTensorData( + micro_context, filter, weights_comp_td, + data.weights_scratch_index), + tflite::micro::GetTensorShape(bias), + tflite::micro::GetOptionalTensorData( + micro_context, bias, bias_comp_td, + data.bias_scratch_index), +#else // USE_TFLM_COMPRESSION + tflite::micro::GetTensorData(filter), + tflite::micro::GetTensorShape(bias), + tflite::micro::GetOptionalTensorData(bias), +#endif // USE_TFLM_COMPRESSION + tflite::micro::GetTensorShape(output), + tflite::micro::GetTensorData(output)); + } + break; + } + default: { + MicroPrintf("Filter type %s (%d) not supported.", + TfLiteTypeGetName(filter->type), input->type); + return kTfLiteError; + } + } + break; + } + + default: { + MicroPrintf("Input type %s (%d) not supported.", + TfLiteTypeGetName(input->type), input->type); + return kTfLiteError; + } + } + return kTfLiteOk; +} + +} // namespace + +TFLMRegistration 
Register_FULLY_CONNECTED() { + return tflite::micro::RegisterOp(FullyConnectedInit, FullyConnectedPrepare, + FullyConnectedEval); +} + +TFLMInferenceRegistration RegisterInference_FULLY_CONNECTED() { + return tflite::micro::RegisterOp(FullyConnectedEval); +} + +} // namespace tflite \ No newline at end of file diff --git a/tensorflow/lite/micro/tools/make/targets/riscv32_vector_makefile.inc b/tensorflow/lite/micro/tools/make/targets/riscv32_vector_makefile.inc index a7ec7e84921..cdc50598ef9 100644 --- a/tensorflow/lite/micro/tools/make/targets/riscv32_vector_makefile.inc +++ b/tensorflow/lite/micro/tools/make/targets/riscv32_vector_makefile.inc @@ -63,10 +63,12 @@ MICROLITE_CC_SRCS += \ tensorflow/lite/micro/kernels/riscv_vector/conv_rvv.cc \ tensorflow/lite/micro/kernels/riscv_vector/fully_connected_rvv.cc \ tensorflow/lite/micro/kernels/riscv_vector/conv.cc \ - tensorflow/lite/micro/kernels/riscv_vector/depthwise_conv.cc + tensorflow/lite/micro/kernels/riscv_vector/depthwise_conv.cc \ + tensorflow/lite/micro/kernels/riscv_vector/fully_connected.cc EXCLUDED_SRCS := \ tensorflow/lite/micro/kernels/conv.cc \ - tensorflow/lite/micro/kernels/depthwise_conv.cc + tensorflow/lite/micro/kernels/depthwise_conv.cc \ + tensorflow/lite/micro/kernels/fully_connected.cc From bcb2163f1ae0f5a3b2b546cd1be5cdb6215d0984 Mon Sep 17 00:00:00 2001 From: numbers1234567 <33708611+numbers1234567@users.noreply.github.com> Date: Tue, 22 Apr 2025 10:15:15 -0500 Subject: [PATCH 33/86] Testing and issues --- PEANUT-README.md | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/PEANUT-README.md b/PEANUT-README.md index 2f5902a482b..8da7a460d3f 100644 --- a/PEANUT-README.md +++ b/PEANUT-README.md @@ -14,3 +14,13 @@ To run with informative Peanut Microsystems-specific logs, add a PEANUT_MICRO_LO -DPEANUT_MICRO_LOG The main purpose for this flag is to sanity-check which implementations are used and to determine model architectures, including input and output shapes. 
+ +## Testing + +To test, follow the same steps as above, but instead of *hello_world*, run + + make -f tensorflow/lite/micro/tools/make/Makefile TARGET=riscv32_vector test + +## Issues + +Sometimes, when modifying the kernels, the compiler/build system will use objects from the previous compilation, meaning the new code will not run. Make sure to sanity check that your code is actually being used. From 0070b60baaca53f0679345b487411bdf8ae52790 Mon Sep 17 00:00:00 2001 From: JaimeHW Date: Thu, 24 Apr 2025 14:27:01 -0500 Subject: [PATCH 34/86] Vectorized softmax --- .../micro/kernels/riscv_vector/rfft_rvv.cc | 0 .../micro/kernels/riscv_vector/rfft_rvv.h | 10 - .../micro/kernels/riscv_vector/softmax.cc | 92 +++++ .../micro/kernels/riscv_vector/softmax_rvv.h | 362 ++++++++++++++++++ .../make/targets/riscv32_vector_makefile.inc | 7 +- 5 files changed, 459 insertions(+), 12 deletions(-) delete mode 100644 tensorflow/lite/micro/kernels/riscv_vector/rfft_rvv.cc delete mode 100644 tensorflow/lite/micro/kernels/riscv_vector/rfft_rvv.h create mode 100644 tensorflow/lite/micro/kernels/riscv_vector/softmax.cc create mode 100644 tensorflow/lite/micro/kernels/riscv_vector/softmax_rvv.h diff --git a/tensorflow/lite/micro/kernels/riscv_vector/rfft_rvv.cc b/tensorflow/lite/micro/kernels/riscv_vector/rfft_rvv.cc deleted file mode 100644 index e69de29bb2d..00000000000 diff --git a/tensorflow/lite/micro/kernels/riscv_vector/rfft_rvv.h b/tensorflow/lite/micro/kernels/riscv_vector/rfft_rvv.h deleted file mode 100644 index e17af7a0365..00000000000 --- a/tensorflow/lite/micro/kernels/riscv_vector/rfft_rvv.h +++ /dev/null @@ -1,10 +0,0 @@ -#ifndef TENSORFLOW_LITE_MICRO_KERNELS_RISCV_VECTOR_RFFT_RVV_H_ -#define TENSORFLOW_LITE_MICRO_KERNELS_RISCV_VECTOR_RFFT_RVV_H_ - -#include "tensorflow/lite/c/common.h" - -using namespace tflite; - - - -#endif \ No newline at end of file diff --git a/tensorflow/lite/micro/kernels/riscv_vector/softmax.cc 
b/tensorflow/lite/micro/kernels/riscv_vector/softmax.cc new file mode 100644 index 00000000000..e4afcdd0744 --- /dev/null +++ b/tensorflow/lite/micro/kernels/riscv_vector/softmax.cc @@ -0,0 +1,92 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/lite/micro/kernels/softmax.h" + +#include "tensorflow/lite/c/builtin_op_data.h" +#include "tensorflow/lite/c/common.h" +#include "tensorflow/lite/kernels/internal/common.h" +#include "tensorflow/lite/kernels/internal/quantization_util.h" +#include "tensorflow/lite/kernels/internal/reference/softmax.h" +#include "tensorflow/lite/kernels/internal/tensor_ctypes.h" +#include "tensorflow/lite/kernels/kernel_util.h" +#include "tensorflow/lite/kernels/op_macros.h" +#include "tensorflow/lite/micro/kernels/kernel_util.h" +#include "tensorflow/lite/micro/micro_log.h" + +#include "tensorflow/lite/micro/kernels/riscv_vector/softmax_rvv.h" + +namespace tflite { +namespace { + +void SoftmaxQuantized(const TfLiteEvalTensor* input, TfLiteEvalTensor* output, + const SoftmaxParams& op_data) { + if (input->type == kTfLiteInt8) { + if (output->type == kTfLiteInt16) { + SoftmaxInt8RVV( + op_data, tflite::micro::GetTensorShape(input), + tflite::micro::GetTensorData(input), + tflite::micro::GetTensorShape(output), + tflite::micro::GetTensorData(output)); + } else { + SoftmaxInt8RVV( + op_data, 
tflite::micro::GetTensorShape(input), + tflite::micro::GetTensorData(input), + tflite::micro::GetTensorShape(output), + tflite::micro::GetTensorData(output)); + } + } else { + tflite::reference_ops::SoftmaxInt16( + op_data, tflite::micro::GetTensorShape(input), + tflite::micro::GetTensorData(input), + tflite::micro::GetTensorShape(output), + tflite::micro::GetTensorData(output)); + } +} + +TfLiteStatus SoftmaxEval(TfLiteContext* context, TfLiteNode* node) { + const TfLiteEvalTensor* input = tflite::micro::GetEvalInput(context, node, 0); + TfLiteEvalTensor* output = tflite::micro::GetEvalOutput(context, node, 0); + + TFLITE_DCHECK(node->user_data != nullptr); + SoftmaxParams op_data = *static_cast(node->user_data); + + switch (input->type) { + case kTfLiteFloat32: { + tflite::reference_ops::Softmax( + op_data, tflite::micro::GetTensorShape(input), + tflite::micro::GetTensorData(input), + tflite::micro::GetTensorShape(output), + tflite::micro::GetTensorData(output)); + return kTfLiteOk; + } + case kTfLiteInt8: + case kTfLiteInt16: { + SoftmaxQuantized(input, output, op_data); + return kTfLiteOk; + } + default: + MicroPrintf("Type %s (%d) not supported.", TfLiteTypeGetName(input->type), + input->type); + return kTfLiteError; + } +} +} // namespace + +TFLMRegistration Register_SOFTMAX() { + return tflite::micro::RegisterOp(SoftmaxInit, SoftmaxPrepare, SoftmaxEval); +} + +} // namespace tflite diff --git a/tensorflow/lite/micro/kernels/riscv_vector/softmax_rvv.h b/tensorflow/lite/micro/kernels/riscv_vector/softmax_rvv.h new file mode 100644 index 00000000000..5f7dcd8f04e --- /dev/null +++ b/tensorflow/lite/micro/kernels/riscv_vector/softmax_rvv.h @@ -0,0 +1,362 @@ +#include +#include +#include +#include + +#include "tensorflow/lite/kernels/internal/common.h" +#include "tensorflow/lite/kernels/internal/cppmath.h" +#include "tensorflow/lite/kernels/internal/quantization_util.h" +#include "tensorflow/lite/micro/kernels/softmax.h" + +constexpr size_t kMaxVLI8M1_H = (128 / 
8) * 1; +constexpr size_t kMaxVLI16M2_H = (128 / 16) * 2; +constexpr size_t kMaxVLI32M4_H = (128 / 32) * 4; +constexpr size_t kMaxVL = std::max({ kMaxVLI8M1_H, kMaxVLI16M2_H, kMaxVLI32M4_H }); + +inline vint32m4_t SRDMH_vv_i32m4(vint32m4_t v_a, vint32m4_t v_b, size_t vl) +{ + const int32_t s_int32_min = INT32_MIN; + vbool8_t v_min_mask_a = __riscv_vmseq_vx_i32m4_b8(v_a, s_int32_min, vl); + vbool8_t v_min_mask_b = __riscv_vmseq_vx_i32m4_b8(v_b, s_int32_min, vl); + vbool8_t v_overflow_mask = __riscv_vmand_mm_b8(v_min_mask_a, v_min_mask_b, vl); + + vint32m4_t v_prod_hi = __riscv_vmulh_vv_i32m4(v_a, v_b, vl); + vint32m4_t v_prod_lo = __riscv_vmul_vv_i32m4(v_a, v_b, vl); + + const int32_t s_round_const = (1 << 30); + vuint32m4_t v_prod_lo_u = __riscv_vreinterpret_v_i32m4_u32m4(v_prod_lo); + vuint32m4_t v_sum_lo_u = __riscv_vadd_vx_u32m4(v_prod_lo_u, s_round_const, vl); + vbool8_t v_carry = __riscv_vmsltu_vx_u32m4_b8(v_sum_lo_u, s_round_const, vl); + + vint32m4_t v_sum_hi = __riscv_vadd_vx_i32m4(v_prod_hi, 0, vl); + v_sum_hi = __riscv_vadd_vx_i32m4_m(v_carry, v_sum_hi, 1, vl); + + vint32m4_t v_result = v_sum_hi; + + const int32_t s_int32_max = INT32_MAX; + v_result = __riscv_vmerge_vxm_i32m4(v_result, s_int32_max, v_overflow_mask, vl); + + return v_result; +} + +inline vint32m4_t SRDMH_vx_i32m4(vint32m4_t v_a, int32_t s_b, size_t vl) +{ + const int32_t s_int32_min = INT32_MIN; + vbool8_t v_overflow_mask; + if (s_b == s_int32_min) + { + v_overflow_mask = __riscv_vmseq_vx_i32m4_b8(v_a, s_int32_min, vl); + } else + { + vint32m4_t zero = __riscv_vmv_v_x_i32m4(0, vl); + v_overflow_mask = __riscv_vmseq_vx_i32m4_b8(zero, 1, vl); + } + + vint32m4_t v_prod_hi = __riscv_vmulh_vx_i32m4(v_a, s_b, vl); + vint32m4_t v_prod_lo = __riscv_vmul_vx_i32m4(v_a, s_b, vl); + + const int32_t s_round_const = (1 << 30); + vuint32m4_t v_prod_lo_u = __riscv_vreinterpret_v_i32m4_u32m4(v_prod_lo); + vuint32m4_t v_sum_lo_u = __riscv_vadd_vx_u32m4(v_prod_lo_u, s_round_const, vl); + vbool8_t v_carry = 
__riscv_vmsltu_vx_u32m4_b8(v_sum_lo_u, s_round_const, vl); + + vint32m4_t v_sum_hi = __riscv_vadd_vx_i32m4(v_prod_hi, 0, vl); + v_sum_hi = __riscv_vadd_vx_i32m4_m(v_carry, v_sum_hi, 1, vl); + + vint32m4_t v_result = v_sum_hi; + + const int32_t s_int32_max = INT32_MAX; + v_result = __riscv_vmerge_vxm_i32m4(v_result, s_int32_max, v_overflow_mask, vl); + + return v_result; +} + + +inline vint32m4_t SRMPOT_vx_i32m4(vint32m4_t v_vec, int shift, size_t vl) +{ + if (shift == 0) + { + return v_vec; + } + + shift = std::max(1, std::min(31, shift)); + + const int32_t s_round_mask = (INT64_C(1) << shift) - 1; + const int32_t s_threshold_base = s_round_mask >> 1; + + vint32m4_t v_remainder = __riscv_vand_vx_i32m4(v_vec, s_round_mask, vl); + vbool8_t v_is_neg_mask = __riscv_vmslt_vx_i32m4_b8(v_vec, 0, vl); + + vint32m4_t v_zero = __riscv_vmv_v_x_i32m4(0, vl); + vint32m4_t v_neg_adjust = __riscv_vmerge_vxm_i32m4(v_zero, 1, v_is_neg_mask, vl); + vint32m4_t v_threshold = __riscv_vadd_vx_i32m4(v_neg_adjust, s_threshold_base, vl); + + vbool8_t v_add1_mask = __riscv_vmsgt_vv_i32m4_b8(v_remainder, v_threshold, vl); + vint32m4_t v_shifted = __riscv_vsra_vx_i32m4(v_vec, shift, vl); + vint32m4_t v_result = __riscv_vadd_vx_i32m4_m(v_add1_mask, v_shifted, 1, vl); + + return v_result; +} + +inline vint32m4_t RoundingMul_vx_i32m4(vint32m4_t v_a, int32_t s_b, size_t vl) +{ + vint32m4_t v_prod_hi = __riscv_vmulh_vx_i32m4(v_a, s_b, vl); + vuint32m4_t v_prod_lo = __riscv_vreinterpret_v_i32m4_u32m4(__riscv_vmul_vx_i32m4(v_a, s_b, vl)); + + const int32_t s_round_offset = (1 << 30); + vuint32m4_t v_sum_lo = __riscv_vadd_vx_u32m4(v_prod_lo, s_round_offset, vl); + vbool8_t v_carry = __riscv_vmsltu_vx_u32m4_b8(v_sum_lo, s_round_offset, vl); + + vint32m4_t v_sum_hi = __riscv_vadd_vx_i32m4(v_prod_hi, 0, vl); + v_sum_hi = __riscv_vadd_vx_i32m4_m(v_carry, v_sum_hi, 1, vl); + + return v_sum_hi; +} + +vint32m4_t vectorized_exp_on_negative_values(vint32m4_t v_a_q5_26, size_t vl) +{ + + const int 
kInputIntegerBits = 5; + const int kInputFractionalBits = 32 - 1 - kInputIntegerBits; + const int kOutputFractionalBits = 31; + + const int32_t s_kOneQuarter_q5_26 = INT32_C(1) << (kInputFractionalBits - 2); + const int32_t s_mask_val = (INT32_C(1) << (kInputFractionalBits + 2)) - 1; + const int32_t s_minus_32_q5_26 = INT32_MIN; + + const int32_t s_result_one_q0_31 = INT32_MAX; + const int32_t s_result_zero_q0_31 = 0; + const int32_t s_exp_neg_1_8_q0_31 = 1895147668; + const int32_t s_one_third_q0_31 = 715827883; + const int32_t s_one_eighth_q0_31 = INT32_C(1) << (kOutputFractionalBits - 3); + + const int32_t s_mult_exp_neg_1_4 = 1672461947; + const int32_t s_mult_exp_neg_1_2 = 1302514674; + const int32_t s_mult_exp_neg_1 = 790015084; + const int32_t s_mult_exp_neg_2 = 290630308; + const int32_t s_mult_exp_neg_4 = 39332535; + const int32_t s_mult_exp_neg_8 = 720401; + const int32_t s_mult_exp_neg_16 = 242; + + vint32m4_t v_a_masked = __riscv_vand_vx_i32m4(v_a_q5_26, s_mask_val, vl); + vint32m4_t v_a_mod_q_m_q_q5_26 = __riscv_vsub_vx_i32m4(v_a_masked, s_kOneQuarter_q5_26, vl); + vint32m4_t v_remainder_q5_26 = __riscv_vsub_vv_i32m4(v_a_mod_q_m_q_q5_26, v_a_q5_26, vl); + + const int rescale_shift = kOutputFractionalBits - kInputFractionalBits; + vint32m4_t v_a_input_taylor_q0_31 = __riscv_vsll_vx_i32m4(v_a_mod_q_m_q_q5_26, rescale_shift, vl); + + vint32m4_t v_x = __riscv_vadd_vx_i32m4(v_a_input_taylor_q0_31, s_one_eighth_q0_31, vl); + vint32m4_t v_x2 = SRDMH_vv_i32m4(v_x, v_x, vl); + vint32m4_t v_x3 = SRDMH_vv_i32m4(v_x2, v_x, vl); + vint32m4_t v_x4 = SRDMH_vv_i32m4(v_x2, v_x2, vl); + vint32m4_t v_x4_over_4 = SRMPOT_vx_i32m4(v_x4, 2, vl); + + vint32m4_t v_term1 = __riscv_vadd_vv_i32m4(v_x4_over_4, v_x3, vl); + vint32m4_t v_term2 = SRDMH_vx_i32m4(v_term1, s_one_third_q0_31, vl); + vint32m4_t v_term3 = __riscv_vadd_vv_i32m4(v_term2, v_x2, vl); + vint32m4_t v_inner_sum = SRMPOT_vx_i32m4(v_term3, 1, vl); + vint32m4_t v_bracket_term = __riscv_vadd_vv_i32m4(v_x, 
v_inner_sum, vl); + vint32m4_t v_mul_term = SRDMH_vx_i32m4(v_bracket_term, s_exp_neg_1_8_q0_31, vl); + vint32m4_t v_interval_result_q0_31 = __riscv_vsadd_vx_i32m4(v_mul_term, s_exp_neg_1_8_q0_31, vl); + + vint32m4_t v_current_result = v_interval_result_q0_31; + + #define APPLY_BARREL_SHIFT(exponent, multiplier_q0_31) \ + do { \ + const int shift_amount = kInputFractionalBits + exponent; \ + if (shift_amount >= 0 && shift_amount < 32) { \ + int32_t bit_mask = INT32_C(1) << shift_amount; \ + vbool8_t v_apply_mask = __riscv_vmsne_vx_i32m4_b8( \ + __riscv_vand_vx_i32m4(v_remainder_q5_26, bit_mask, vl), \ + 0, vl); \ + vint32m4_t v_multiplied = SRDMH_vx_i32m4(v_current_result, multiplier_q0_31, vl); \ + v_current_result = __riscv_vmerge_vvm_i32m4(v_current_result, v_multiplied, v_apply_mask, vl); \ + } \ + } while(0) + + APPLY_BARREL_SHIFT(-2, s_mult_exp_neg_1_4); + APPLY_BARREL_SHIFT(-1, s_mult_exp_neg_1_2); + APPLY_BARREL_SHIFT( 0, s_mult_exp_neg_1); + APPLY_BARREL_SHIFT( 1, s_mult_exp_neg_2); + APPLY_BARREL_SHIFT( 2, s_mult_exp_neg_4); + APPLY_BARREL_SHIFT( 3, s_mult_exp_neg_8); + APPLY_BARREL_SHIFT( 4, s_mult_exp_neg_16); + + #undef APPLY_BARREL_SHIFT + + vbool8_t v_clamp_mask = __riscv_vmslt_vx_i32m4_b8(v_a_q5_26, s_minus_32_q5_26, vl); + v_current_result = __riscv_vmerge_vxm_i32m4(v_current_result, s_result_zero_q0_31, v_clamp_mask, vl); + + vbool8_t v_zero_mask = __riscv_vmseq_vx_i32m4_b8(v_a_q5_26, 0, vl); + v_current_result = __riscv_vmerge_vxm_i32m4(v_current_result, s_result_one_q0_31, v_zero_mask, vl); + + return v_current_result; +} + +template +void SoftmaxInt8RVV(const tflite::SoftmaxParams& params, + const tflite::RuntimeShape& input_shape, + const int8_t* input_data, + const tflite::RuntimeShape& output_shape, + OutputT* output_data) +{ + const int32_t input_beta_multiplier = params.input_multiplier; + const int32_t input_beta_left_shift = params.input_left_shift; + const int diff_min = params.diff_min; + static const int kAccumulationIntegerBits = 12; 
+ + const int trailing_dim = input_shape.DimensionsCount() - 1; + const int outer_size = tflite::MatchingFlatSizeSkipDim(input_shape, trailing_dim, output_shape); + const int depth = tflite::MatchingDim(input_shape, trailing_dim, output_shape, trailing_dim); + const size_t depth_sz = static_cast(depth); + + for (int i = 0; i < outer_size; ++i) + { + const int8_t* current_input_data = input_data + i * depth; + OutputT* current_output_data = output_data + i * depth; + + int8_t max_in_row = std::numeric_limits::min(); + size_t current_c = 0; + size_t vl_max_init = __riscv_vsetvl_e8m1(1); + vint8m1_t v_max_acc_m1 = __riscv_vmv_v_x_i8m1(max_in_row, vl_max_init); + while (current_c < depth_sz) + { + size_t vl = __riscv_vsetvl_e8m1(depth_sz - current_c); + vint8m1_t v_input_m1 = __riscv_vle8_v_i8m1(current_input_data + current_c, vl); + v_max_acc_m1 = __riscv_vredmax_vs_i8m1_i8m1(v_input_m1, v_max_acc_m1, vl); + current_c += vl; + } + max_in_row = __riscv_vmv_x_s_i8m1_i8(v_max_acc_m1); + const int32_t max_in_row_s32 = static_cast(max_in_row); + + size_t vl_sum_init = __riscv_vsetvl_e32m1(1); + vint32m1_t v_sum_acc_m1 = __riscv_vmv_v_x_i32m1(0, vl_sum_init); + current_c = 0; + while (current_c < depth_sz) + { + size_t vl_m4 = __riscv_vsetvl_e32m4(depth_sz - current_c); + size_t vl_m2 = __riscv_vsetvl_e16m2(vl_m4); + size_t vl_m1 = __riscv_vsetvl_e8m1(vl_m4); + + vint8m1_t v_input_s8 = __riscv_vle8_v_i8m1(current_input_data + current_c, vl_m1); + vint16m2_t v_input_s16 = __riscv_vsext_vf2_i16m2(v_input_s8, vl_m2); + vint32m4_t v_input_s32 = __riscv_vwadd_vx_i32m4(v_input_s16, 0, vl_m4); + vint32m4_t v_diff_s32 = __riscv_vsub_vx_i32m4(v_input_s32, max_in_row_s32, vl_m4); + + vbool8_t v_diff_mask = __riscv_vmsge_vx_i32m4_b8(v_diff_s32, diff_min, vl_m4); + + vint32m4_t v_diff_rescaled; + { + vint32m4_t v_a = __riscv_vsll_vx_i32m4(v_diff_s32, input_beta_left_shift, vl_m4); + const int32_t b = input_beta_multiplier; + v_diff_rescaled = SRDMH_vx_i32m4(v_a, b, vl_m4); + } + + 
vint32m4_t v_exp_val_q0_31 = vectorized_exp_on_negative_values(v_diff_rescaled, vl_m4); + + const int rescale_shift = kAccumulationIntegerBits; + vint32m4_t v_exp_term_q12_19 = SRMPOT_vx_i32m4(v_exp_val_q0_31, rescale_shift, vl_m4); + + vint32m4_t v_zero_q12_19 = __riscv_vmv_v_x_i32m4(0, vl_m4); + vint32m4_t v_exp_term_masked_q12_19 = __riscv_vmerge_vvm_i32m4(v_zero_q12_19, v_exp_term_q12_19, v_diff_mask, vl_m4); + + v_sum_acc_m1 = __riscv_vredsum_vs_i32m4_i32m1(v_exp_term_masked_q12_19, v_sum_acc_m1, vl_m4); + + current_c += vl_m4; + } + int32_t sum_of_exps_raw = __riscv_vmv_x_s_i32m1_i32(v_sum_acc_m1); + + int num_bits_over_unit; + gemmlowp::FixedPoint shifted_scale = gemmlowp::FixedPoint::FromRaw(tflite::GetReciprocal(sum_of_exps_raw, kAccumulationIntegerBits, &num_bits_over_unit)); + const int32_t s_shifted_scale_raw = shifted_scale.raw(); + + const int exponent = num_bits_over_unit + 31 - (sizeof(OutputT) * 8); + + const OutputT output_min = std::numeric_limits::min(); + const OutputT output_max = std::numeric_limits::max(); + const int32_t output_min_s32 = static_cast(output_min); + const int32_t output_max_s32 = static_cast(output_max); + + current_c = 0; + while (current_c < depth_sz) + { + size_t vl_output; + if constexpr (sizeof(OutputT) == 1) + { + vl_output = __riscv_vsetvl_e8m1(depth_sz - current_c); + } + else + { + vl_output = __riscv_vsetvl_e16m2(depth_sz - current_c); + } + size_t vl_m4 = __riscv_vsetvl_e32m4(vl_output); + size_t vl_m2 = __riscv_vsetvl_e16m2(vl_output); + size_t vl_m1 = __riscv_vsetvl_e8m1(vl_output); + + vint8m1_t v_input_s8 = __riscv_vle8_v_i8m1(current_input_data + current_c, vl_m1); + vint16m2_t v_input_s16 = __riscv_vsext_vf2_i16m2(v_input_s8, vl_m2); + vint32m4_t v_input_s32 = __riscv_vwadd_vx_i32m4(v_input_s16, 0, vl_m4); + vint32m4_t v_diff_s32 = __riscv_vsub_vx_i32m4(v_input_s32, max_in_row_s32, vl_m4); + + vbool8_t v_diff_mask = __riscv_vmsge_vx_i32m4_b8(v_diff_s32, diff_min, vl_m4); + + vint32m4_t v_diff_rescaled; + { + 
vint32m4_t v_a = __riscv_vsll_vx_i32m4(v_diff_s32, input_beta_left_shift, vl_m4); + const int32_t b = input_beta_multiplier; + v_diff_rescaled = SRDMH_vx_i32m4(v_a, b, vl_m4); + } + + vint32m4_t v_exp_in_q0_31 = vectorized_exp_on_negative_values(v_diff_rescaled, vl_m4); + + vint32m4_t v_product_raw_q0_31 = RoundingMul_vx_i32m4(v_exp_in_q0_31, s_shifted_scale_raw, vl_m4); + + vint32m4_t v_zero_q0_31 = __riscv_vmv_v_x_i32m4(0, vl_m4); + vint32m4_t v_product_masked = __riscv_vmerge_vvm_i32m4(v_zero_q0_31, v_product_raw_q0_31, v_diff_mask, vl_m4); + + vint32m4_t v_unsat_output; + if (exponent <= 0) + { + v_unsat_output = v_product_masked; + } + else + { + const int32_t round_mask = (static_cast(1) << exponent) - 1; + const int32_t threshold_base = round_mask >> 1; + vint32m4_t v_x_shifted = __riscv_vsra_vx_i32m4(v_product_masked, exponent, vl_m4); + vint32m4_t v_remainder = __riscv_vand_vx_i32m4(v_product_masked, round_mask, vl_m4); + vbool8_t v_is_negative_mask = __riscv_vmslt_vx_i32m4_b8(v_product_masked, 0, vl_m4); + + vint32m4_t v_zero = __riscv_vmv_v_x_i32m4(0, vl_m4); + vint32m4_t v_neg_adjust = __riscv_vmerge_vxm_i32m4(v_zero, 1, v_is_negative_mask, vl_m4); + vint32m4_t v_threshold = __riscv_vadd_vx_i32m4(v_neg_adjust, threshold_base, vl_m4); + + vbool8_t v_P_mask = __riscv_vmsgt_vv_i32m4_b8(v_remainder, v_threshold, vl_m4); + v_unsat_output = __riscv_vadd_vx_i32m4_m(v_P_mask, v_x_shifted, 1, vl_m4); + } + + vint32m4_t v_shifted_output = __riscv_vadd_vx_i32m4(v_unsat_output, output_min_s32, vl_m4); + vint32m4_t v_clamped_output = __riscv_vmax_vx_i32m4(v_shifted_output, output_min_s32, vl_m4); + v_clamped_output = __riscv_vmin_vx_i32m4(v_clamped_output, output_max_s32, vl_m4); + vint32m4_t v_final_s32 = v_clamped_output; + + + if constexpr (sizeof(OutputT) == 1) + { + size_t vl_w16_out = __riscv_vsetvl_e16m2(vl_output); + vint16m2_t v_temp_s16 = __riscv_vncvt_x_x_w_i16m2(v_final_s32, vl_w16_out); + size_t vl_w8_out = __riscv_vsetvl_e8m1(vl_output); + vint8m1_t 
v_final_output = __riscv_vncvt_x_x_w_i8m1(v_temp_s16, vl_w8_out); + __riscv_vse8_v_i8m1(reinterpret_cast(current_output_data + current_c), v_final_output, vl_output); + } + else + { + size_t vl_w16_out = __riscv_vsetvl_e16m2(vl_output); + vint16m2_t v_final_output = __riscv_vncvt_x_x_w_i16m2(v_final_s32, vl_w16_out); + __riscv_vse16_v_i16m2(reinterpret_cast(current_output_data + current_c), v_final_output, vl_output); + } + + current_c += vl_output; + } + } +} \ No newline at end of file diff --git a/tensorflow/lite/micro/tools/make/targets/riscv32_vector_makefile.inc b/tensorflow/lite/micro/tools/make/targets/riscv32_vector_makefile.inc index cdc50598ef9..209571dfcae 100644 --- a/tensorflow/lite/micro/tools/make/targets/riscv32_vector_makefile.inc +++ b/tensorflow/lite/micro/tools/make/targets/riscv32_vector_makefile.inc @@ -62,13 +62,16 @@ include $(MAKEFILE_DIR)/ext_libs/eyalroz_printf.inc MICROLITE_CC_SRCS += \ tensorflow/lite/micro/kernels/riscv_vector/conv_rvv.cc \ tensorflow/lite/micro/kernels/riscv_vector/fully_connected_rvv.cc \ + tensorflow/lite/micro/kernels/riscv_vector/softmax_rvv.cc \ tensorflow/lite/micro/kernels/riscv_vector/conv.cc \ tensorflow/lite/micro/kernels/riscv_vector/depthwise_conv.cc \ - tensorflow/lite/micro/kernels/riscv_vector/fully_connected.cc + tensorflow/lite/micro/kernels/riscv_vector/fully_connected.cc \ + tensorflow/lite/micro/kernels/riscv_vector/softmax.cc EXCLUDED_SRCS := \ tensorflow/lite/micro/kernels/conv.cc \ tensorflow/lite/micro/kernels/depthwise_conv.cc \ - tensorflow/lite/micro/kernels/fully_connected.cc + tensorflow/lite/micro/kernels/fully_connected.cc \ + tensorflow/lite/micro/kernels/softmax.cc From 8d712e516aa72365415b368d4ac801d568e48287 Mon Sep 17 00:00:00 2001 From: JaimeHW Date: Thu, 24 Apr 2025 14:28:47 -0500 Subject: [PATCH 35/86] Update Makefile --- .../lite/micro/tools/make/targets/riscv32_vector_makefile.inc | 1 - 1 file changed, 1 deletion(-) diff --git 
a/tensorflow/lite/micro/tools/make/targets/riscv32_vector_makefile.inc b/tensorflow/lite/micro/tools/make/targets/riscv32_vector_makefile.inc index 209571dfcae..8bb23b3e456 100644 --- a/tensorflow/lite/micro/tools/make/targets/riscv32_vector_makefile.inc +++ b/tensorflow/lite/micro/tools/make/targets/riscv32_vector_makefile.inc @@ -62,7 +62,6 @@ include $(MAKEFILE_DIR)/ext_libs/eyalroz_printf.inc MICROLITE_CC_SRCS += \ tensorflow/lite/micro/kernels/riscv_vector/conv_rvv.cc \ tensorflow/lite/micro/kernels/riscv_vector/fully_connected_rvv.cc \ - tensorflow/lite/micro/kernels/riscv_vector/softmax_rvv.cc \ tensorflow/lite/micro/kernels/riscv_vector/conv.cc \ tensorflow/lite/micro/kernels/riscv_vector/depthwise_conv.cc \ tensorflow/lite/micro/kernels/riscv_vector/fully_connected.cc \ From 59492559a838e0cf38b9ee545d12e7e110e8c13b Mon Sep 17 00:00:00 2001 From: JaimeHW Date: Thu, 24 Apr 2025 15:36:37 -0500 Subject: [PATCH 36/86] Restore reference TFLM conv.cc --- tensorflow/lite/micro/kernels/conv.cc | 37 +-------------------------- 1 file changed, 1 insertion(+), 36 deletions(-) diff --git a/tensorflow/lite/micro/kernels/conv.cc b/tensorflow/lite/micro/kernels/conv.cc index b9980866f92..794da09bb29 100644 --- a/tensorflow/lite/micro/kernels/conv.cc +++ b/tensorflow/lite/micro/kernels/conv.cc @@ -24,10 +24,6 @@ limitations under the License. 
#include "tensorflow/lite/micro/kernels/kernel_util.h" #include "tensorflow/lite/micro/micro_log.h" -#if defined(TFLM_USE_RISCV_VECTOR) -#include "tensorflow/lite/micro/kernels/riscv_vector/conv_rvv.h" -#endif - namespace tflite { namespace { @@ -155,36 +151,6 @@ TfLiteStatus ConvEval(TfLiteContext* context, TfLiteNode* node) { break; } case kTfLiteInt8: { -#if defined(TFLM_USE_RISCV_VECTOR) -#ifdef USE_TFLM_COMPRESSION - TFLITE_DCHECK(weights_comp_td == nullptr && bias_comp_td == nullptr); - if (weights_comp_td != nullptr || bias_comp_td != nullptr) - { - MicroPrintf("ERROR: RVV path does not support compressed weights/bias yet."); - return kTfLiteError; - } -#endif // USE_TFLM_COMPRESSION - // Check bias type is compatible (as per your original check) - if (bias != nullptr && bias->type != kTfLiteInt32) { - MicroPrintf("RVV kernel requires Int32 bias, got %s", TfLiteTypeGetName(bias->type)); - return kTfLiteError; - } - - // Call the optimized RVV kernel with the *new* correct parameters - ConvPerChannelRVV( - ConvParamsQuantized(params, data), // const ConvParams& params - data.per_channel_output_multiplier, // const int32_t* output_multiplier - data.per_channel_output_shift, // const int32_t* output_shift - tflite::micro::GetTensorShape(input), // const RuntimeShape& input_shape - tflite::micro::GetTensorData(input), // const int8_t* input_data - tflite::micro::GetTensorShape(filter), // const RuntimeShape& filter_shape - tflite::micro::GetTensorData(filter),// const int8_t* filter_data - tflite::micro::GetTensorShape(bias), // const RuntimeShape& bias_shape - tflite::micro::GetOptionalTensorData(bias), // const int32_t* bias_data - tflite::micro::GetTensorShape(output), // const RuntimeShape& output_shape - tflite::micro::GetTensorData(output) // int8_t* output_data - ); -#else // defined(TFLM_USE_RISCV_VECTOR) reference_integer_ops::ConvPerChannel( ConvParamsQuantized(params, data), data.per_channel_output_multiplier, data.per_channel_output_shift, @@ -205,7 
+171,6 @@ TfLiteStatus ConvEval(TfLiteContext* context, TfLiteNode* node) { #endif // USE_TFLM_COMPRESSION tflite::micro::GetTensorShape(output), tflite::micro::GetTensorData(output)); -#endif break; } default: @@ -229,4 +194,4 @@ TFLMRegistration Register_CONV_2D() { return tflite::micro::RegisterOp(ConvInit, ConvPrepare, ConvEval); } -} // namespace tflite +} // namespace tflite \ No newline at end of file From 9e183491548105a0dae85a91a53aafa683437e7a Mon Sep 17 00:00:00 2001 From: JaimeHW Date: Thu, 24 Apr 2025 17:08:50 -0500 Subject: [PATCH 37/86] Fix Softmax --- .../micro/kernels/riscv_vector/softmax_rvv.h | 349 +++++++++--------- 1 file changed, 182 insertions(+), 167 deletions(-) diff --git a/tensorflow/lite/micro/kernels/riscv_vector/softmax_rvv.h b/tensorflow/lite/micro/kernels/riscv_vector/softmax_rvv.h index 5f7dcd8f04e..fa702b4942d 100644 --- a/tensorflow/lite/micro/kernels/riscv_vector/softmax_rvv.h +++ b/tensorflow/lite/micro/kernels/riscv_vector/softmax_rvv.h @@ -7,170 +7,199 @@ #include "tensorflow/lite/kernels/internal/cppmath.h" #include "tensorflow/lite/kernels/internal/quantization_util.h" #include "tensorflow/lite/micro/kernels/softmax.h" +#include "tensorflow/lite/micro/micro_log.h" -constexpr size_t kMaxVLI8M1_H = (128 / 8) * 1; -constexpr size_t kMaxVLI16M2_H = (128 / 16) * 2; -constexpr size_t kMaxVLI32M4_H = (128 / 32) * 4; -constexpr size_t kMaxVL = std::max({ kMaxVLI8M1_H, kMaxVLI16M2_H, kMaxVLI32M4_H }); - -inline vint32m4_t SRDMH_vv_i32m4(vint32m4_t v_a, vint32m4_t v_b, size_t vl) +inline vint32m4_t SRDMH_vv_i32m4(vint32m4_t v_a, vint32m4_t v_b, size_t vl) { const int32_t s_int32_min = INT32_MIN; + const int32_t s_int32_max = INT32_MAX; + vbool8_t v_min_mask_a = __riscv_vmseq_vx_i32m4_b8(v_a, s_int32_min, vl); vbool8_t v_min_mask_b = __riscv_vmseq_vx_i32m4_b8(v_b, s_int32_min, vl); vbool8_t v_overflow_mask = __riscv_vmand_mm_b8(v_min_mask_a, v_min_mask_b, vl); - vint32m4_t v_prod_hi = __riscv_vmulh_vv_i32m4(v_a, v_b, vl); vint32m4_t 
v_prod_lo = __riscv_vmul_vv_i32m4(v_a, v_b, vl); + vint32m4_t v_prod_hi = __riscv_vmulh_vv_i32m4(v_a, v_b, vl); - const int32_t s_round_const = (1 << 30); + const int32_t s_nudge_val = (INT32_C(1) << 30); vuint32m4_t v_prod_lo_u = __riscv_vreinterpret_v_i32m4_u32m4(v_prod_lo); - vuint32m4_t v_sum_lo_u = __riscv_vadd_vx_u32m4(v_prod_lo_u, s_round_const, vl); - vbool8_t v_carry = __riscv_vmsltu_vx_u32m4_b8(v_sum_lo_u, s_round_const, vl); + vuint32m4_t v_sum_lo_u = __riscv_vadd_vx_u32m4(v_prod_lo_u, s_nudge_val, vl); - vint32m4_t v_sum_hi = __riscv_vadd_vx_i32m4(v_prod_hi, 0, vl); - v_sum_hi = __riscv_vadd_vx_i32m4_m(v_carry, v_sum_hi, 1, vl); + vbool8_t v_carry_mask = __riscv_vmsltu_vv_u32m4_b8(v_sum_lo_u, v_prod_lo_u, vl); + vint32m4_t v_sum_hi = __riscv_vadd_vx_i32m4_m(v_carry_mask, v_prod_hi, 1, vl); - vint32m4_t v_result = v_sum_hi; + vuint32m4_t v_sum_lo_shifted_u = __riscv_vsrl_vx_u32m4(v_sum_lo_u, 31, vl); + vint32m4_t v_sum_hi_shifted = __riscv_vsll_vx_i32m4(v_sum_hi, 1, vl); - const int32_t s_int32_max = INT32_MAX; - v_result = __riscv_vmerge_vxm_i32m4(v_result, s_int32_max, v_overflow_mask, vl); + vint32m4_t v_result_before_sat = __riscv_vor_vv_i32m4(v_sum_hi_shifted, __riscv_vreinterpret_v_u32m4_i32m4(v_sum_lo_shifted_u), vl); + + vint32m4_t v_result = __riscv_vmerge_vxm_i32m4(v_result_before_sat, s_int32_max, v_overflow_mask, vl); return v_result; } -inline vint32m4_t SRDMH_vx_i32m4(vint32m4_t v_a, int32_t s_b, size_t vl) +inline vint32m4_t SRDMH_vx_i32m4(vint32m4_t v_a, int32_t s_b, size_t vl) { const int32_t s_int32_min = INT32_MIN; + const int32_t s_int32_max = INT32_MAX; + vbool8_t v_overflow_mask; - if (s_b == s_int32_min) + if (s_b == s_int32_min) { - v_overflow_mask = __riscv_vmseq_vx_i32m4_b8(v_a, s_int32_min, vl); - } else + v_overflow_mask = __riscv_vmseq_vx_i32m4_b8(v_a, s_int32_min, vl); + } else { - vint32m4_t zero = __riscv_vmv_v_x_i32m4(0, vl); - v_overflow_mask = __riscv_vmseq_vx_i32m4_b8(zero, 1, vl); + vint32m4_t v_zero = 
__riscv_vmv_v_x_i32m4(0, vl); + v_overflow_mask = __riscv_vmslt_vv_i32m4_b8(v_zero, v_zero, vl); // Create all-false mask } - vint32m4_t v_prod_hi = __riscv_vmulh_vx_i32m4(v_a, s_b, vl); vint32m4_t v_prod_lo = __riscv_vmul_vx_i32m4(v_a, s_b, vl); + vint32m4_t v_prod_hi = __riscv_vmulh_vx_i32m4(v_a, s_b, vl); - const int32_t s_round_const = (1 << 30); + const int32_t s_nudge_val = (INT32_C(1) << 30); vuint32m4_t v_prod_lo_u = __riscv_vreinterpret_v_i32m4_u32m4(v_prod_lo); - vuint32m4_t v_sum_lo_u = __riscv_vadd_vx_u32m4(v_prod_lo_u, s_round_const, vl); - vbool8_t v_carry = __riscv_vmsltu_vx_u32m4_b8(v_sum_lo_u, s_round_const, vl); + vuint32m4_t v_sum_lo_u = __riscv_vadd_vx_u32m4(v_prod_lo_u, s_nudge_val, vl); - vint32m4_t v_sum_hi = __riscv_vadd_vx_i32m4(v_prod_hi, 0, vl); - v_sum_hi = __riscv_vadd_vx_i32m4_m(v_carry, v_sum_hi, 1, vl); + vbool8_t v_carry_mask = __riscv_vmsltu_vv_u32m4_b8(v_sum_lo_u, v_prod_lo_u, vl); + vint32m4_t v_sum_hi = __riscv_vadd_vx_i32m4_m(v_carry_mask, v_prod_hi, 1, vl); - vint32m4_t v_result = v_sum_hi; + vuint32m4_t v_sum_lo_shifted_u = __riscv_vsrl_vx_u32m4(v_sum_lo_u, 31, vl); + vint32m4_t v_sum_hi_shifted = __riscv_vsll_vx_i32m4(v_sum_hi, 1, vl); - const int32_t s_int32_max = INT32_MAX; - v_result = __riscv_vmerge_vxm_i32m4(v_result, s_int32_max, v_overflow_mask, vl); + vint32m4_t v_result_before_sat = __riscv_vor_vv_i32m4(v_sum_hi_shifted, __riscv_vreinterpret_v_u32m4_i32m4(v_sum_lo_shifted_u), vl); + + vint32m4_t v_result = __riscv_vmerge_vxm_i32m4(v_result_before_sat, s_int32_max, v_overflow_mask, vl); return v_result; } -inline vint32m4_t SRMPOT_vx_i32m4(vint32m4_t v_vec, int shift, size_t vl) +inline vint32m4_t SRMPOT_vx_i32m4(vint32m4_t v_vec, int shift, size_t vl) { - if (shift == 0) - { - return v_vec; - } + if (shift < 0) { + const int32_t s_shift = -shift; + if (s_shift == 0) return v_vec; + + const int32_t s_max_val = INT32_MAX; + const int32_t s_min_val = INT32_MIN; + + if (s_shift >= 31) { + vbool8_t v_pos_mask = 
__riscv_vmsgt_vx_i32m4_b8(v_vec, 0, vl); + vint32m4_t v_res = __riscv_vmv_v_x_i32m4(s_min_val, vl); + v_res = __riscv_vmerge_vxm_i32m4(v_res, s_max_val, v_pos_mask, vl); + return v_res; + } - shift = std::max(1, std::min(31, shift)); + const int32_t s_pos_thresh = s_max_val >> s_shift; + const int32_t s_neg_thresh = s_min_val >> s_shift; - const int32_t s_round_mask = (INT64_C(1) << shift) - 1; - const int32_t s_threshold_base = s_round_mask >> 1; + vbool8_t v_pos_ovfl_mask = __riscv_vmsgt_vx_i32m4_b8(v_vec, s_pos_thresh, vl); + vbool8_t v_neg_ovfl_mask = __riscv_vmslt_vx_i32m4_b8(v_vec, s_neg_thresh, vl); - vint32m4_t v_remainder = __riscv_vand_vx_i32m4(v_vec, s_round_mask, vl); - vbool8_t v_is_neg_mask = __riscv_vmslt_vx_i32m4_b8(v_vec, 0, vl); + vint32m4_t v_shifted = __riscv_vsll_vx_i32m4(v_vec, s_shift, vl); - vint32m4_t v_zero = __riscv_vmv_v_x_i32m4(0, vl); - vint32m4_t v_neg_adjust = __riscv_vmerge_vxm_i32m4(v_zero, 1, v_is_neg_mask, vl); - vint32m4_t v_threshold = __riscv_vadd_vx_i32m4(v_neg_adjust, s_threshold_base, vl); + vint32m4_t v_result = __riscv_vmerge_vxm_i32m4(v_shifted, s_max_val, v_pos_ovfl_mask, vl); + v_result = __riscv_vmerge_vxm_i32m4(v_result, s_min_val, v_neg_ovfl_mask, vl); + return v_result; - vbool8_t v_add1_mask = __riscv_vmsgt_vv_i32m4_b8(v_remainder, v_threshold, vl); - vint32m4_t v_shifted = __riscv_vsra_vx_i32m4(v_vec, shift, vl); - vint32m4_t v_result = __riscv_vadd_vx_i32m4_m(v_add1_mask, v_shifted, 1, vl); + } else if (shift == 0) { + return v_vec; + } else { + shift = std::min(31, shift); - return v_result; -} + int32_t s_round_mask; + if (shift == 31) { + s_round_mask = INT32_MAX; + } else { + s_round_mask = (INT32_C(1) << shift) - 1; + } -inline vint32m4_t RoundingMul_vx_i32m4(vint32m4_t v_a, int32_t s_b, size_t vl) -{ - vint32m4_t v_prod_hi = __riscv_vmulh_vx_i32m4(v_a, s_b, vl); - vuint32m4_t v_prod_lo = __riscv_vreinterpret_v_i32m4_u32m4(__riscv_vmul_vx_i32m4(v_a, s_b, vl)); + const int32_t s_threshold_base = s_round_mask 
>> 1; + + vint32m4_t v_remainder = __riscv_vand_vx_i32m4(v_vec, s_round_mask, vl); + + vbool8_t v_is_neg_mask = __riscv_vmslt_vx_i32m4_b8(v_vec, 0, vl); - const int32_t s_round_offset = (1 << 30); - vuint32m4_t v_sum_lo = __riscv_vadd_vx_u32m4(v_prod_lo, s_round_offset, vl); - vbool8_t v_carry = __riscv_vmsltu_vx_u32m4_b8(v_sum_lo, s_round_offset, vl); + vint32m4_t v_threshold = __riscv_vmv_v_x_i32m4(s_threshold_base, vl); + v_threshold = __riscv_vadd_vx_i32m4_m(v_is_neg_mask, v_threshold, 1, vl); - vint32m4_t v_sum_hi = __riscv_vadd_vx_i32m4(v_prod_hi, 0, vl); - v_sum_hi = __riscv_vadd_vx_i32m4_m(v_carry, v_sum_hi, 1, vl); + vbool8_t v_add1_mask = __riscv_vmsgt_vv_i32m4_b8(v_remainder, v_threshold, vl); - return v_sum_hi; + vint32m4_t v_shifted = __riscv_vsra_vx_i32m4(v_vec, shift, vl); + + vint32m4_t v_result = __riscv_vadd_vx_i32m4_m(v_add1_mask, v_shifted, 1, vl); + + return v_result; + } } -vint32m4_t vectorized_exp_on_negative_values(vint32m4_t v_a_q5_26, size_t vl) -{ +vint32m4_t vectorized_exp_on_negative_values(vint32m4_t v_a_q5_26, size_t vl) +{ const int kInputIntegerBits = 5; - const int kInputFractionalBits = 32 - 1 - kInputIntegerBits; - const int kOutputFractionalBits = 31; + const int kInputFractionalBits = 32 - 1 - kInputIntegerBits; // 26 + const int kOutputFractionalBits = 31; // 32 - 1 - 0 const int32_t s_kOneQuarter_q5_26 = INT32_C(1) << (kInputFractionalBits - 2); - const int32_t s_mask_val = (INT32_C(1) << (kInputFractionalBits + 2)) - 1; - const int32_t s_minus_32_q5_26 = INT32_MIN; + const int32_t s_mask_val = s_kOneQuarter_q5_26 - 1; const int32_t s_result_one_q0_31 = INT32_MAX; - const int32_t s_result_zero_q0_31 = 0; const int32_t s_exp_neg_1_8_q0_31 = 1895147668; const int32_t s_one_third_q0_31 = 715827883; const int32_t s_one_eighth_q0_31 = INT32_C(1) << (kOutputFractionalBits - 3); const int32_t s_mult_exp_neg_1_4 = 1672461947; const int32_t s_mult_exp_neg_1_2 = 1302514674; - const int32_t s_mult_exp_neg_1 = 790015084; - const int32_t 
s_mult_exp_neg_2 = 290630308; - const int32_t s_mult_exp_neg_4 = 39332535; - const int32_t s_mult_exp_neg_8 = 720401; - const int32_t s_mult_exp_neg_16 = 242; + const int32_t s_mult_exp_neg_1 = 790015084; + const int32_t s_mult_exp_neg_2 = 290630308; + const int32_t s_mult_exp_neg_4 = 39332535; + const int32_t s_mult_exp_neg_8 = 720401; + const int32_t s_mult_exp_neg_16 = 242; + vint32m4_t v_a_masked = __riscv_vand_vx_i32m4(v_a_q5_26, s_mask_val, vl); vint32m4_t v_a_mod_q_m_q_q5_26 = __riscv_vsub_vx_i32m4(v_a_masked, s_kOneQuarter_q5_26, vl); - vint32m4_t v_remainder_q5_26 = __riscv_vsub_vv_i32m4(v_a_mod_q_m_q_q5_26, v_a_q5_26, vl); - const int rescale_shift = kOutputFractionalBits - kInputFractionalBits; - vint32m4_t v_a_input_taylor_q0_31 = __riscv_vsll_vx_i32m4(v_a_mod_q_m_q_q5_26, rescale_shift, vl); + vint32m4_t v_remainder_q5_26 = __riscv_vsub_vv_i32m4(v_a_q5_26, v_a_mod_q_m_q_q5_26, vl); + + const int rescale_shift = kInputIntegerBits - 0; + vint32m4_t v_a_input_taylor_q0_31 = SRMPOT_vx_i32m4(v_a_mod_q_m_q_q5_26, -rescale_shift, vl); + vint32m4_t v_x = __riscv_vadd_vx_i32m4(v_a_input_taylor_q0_31, s_one_eighth_q0_31, vl); + vint32m4_t v_x2 = SRDMH_vv_i32m4(v_x, v_x, vl); vint32m4_t v_x3 = SRDMH_vv_i32m4(v_x2, v_x, vl); vint32m4_t v_x4 = SRDMH_vv_i32m4(v_x2, v_x2, vl); + vint32m4_t v_x4_over_4 = SRMPOT_vx_i32m4(v_x4, 2, vl); vint32m4_t v_term1 = __riscv_vadd_vv_i32m4(v_x4_over_4, v_x3, vl); vint32m4_t v_term2 = SRDMH_vx_i32m4(v_term1, s_one_third_q0_31, vl); vint32m4_t v_term3 = __riscv_vadd_vv_i32m4(v_term2, v_x2, vl); + vint32m4_t v_inner_sum = SRMPOT_vx_i32m4(v_term3, 1, vl); vint32m4_t v_bracket_term = __riscv_vadd_vv_i32m4(v_x, v_inner_sum, vl); + vint32m4_t v_mul_term = SRDMH_vx_i32m4(v_bracket_term, s_exp_neg_1_8_q0_31, vl); - vint32m4_t v_interval_result_q0_31 = __riscv_vsadd_vx_i32m4(v_mul_term, s_exp_neg_1_8_q0_31, vl); + + vint32m4_t v_interval_result_q0_31 = __riscv_vadd_vx_i32m4(v_mul_term, s_exp_neg_1_8_q0_31, vl); vint32m4_t v_current_result = 
v_interval_result_q0_31; #define APPLY_BARREL_SHIFT(exponent, multiplier_q0_31) \ do { \ - const int shift_amount = kInputFractionalBits + exponent; \ - if (shift_amount >= 0 && shift_amount < 32) { \ - int32_t bit_mask = INT32_C(1) << shift_amount; \ - vbool8_t v_apply_mask = __riscv_vmsne_vx_i32m4_b8( \ - __riscv_vand_vx_i32m4(v_remainder_q5_26, bit_mask, vl), \ - 0, vl); \ - vint32m4_t v_multiplied = SRDMH_vx_i32m4(v_current_result, multiplier_q0_31, vl); \ - v_current_result = __riscv_vmerge_vvm_i32m4(v_current_result, v_multiplied, v_apply_mask, vl); \ + if (kInputIntegerBits > exponent) { \ + const int shift_amount = kInputFractionalBits + exponent; \ + if (shift_amount >= 0 && shift_amount < 32) { \ + int32_t bit_mask = INT32_C(1) << shift_amount; \ + vbool8_t v_apply_mask = __riscv_vmsne_vx_i32m4_b8( \ + __riscv_vand_vx_i32m4(v_remainder_q5_26, bit_mask, vl), \ + 0, \ + vl); \ + vint32m4_t v_multiplied = SRDMH_vx_i32m4(v_current_result, multiplier_q0_31, vl); \ + v_current_result = __riscv_vmerge_vvm_i32m4(v_current_result, v_multiplied, v_apply_mask, vl); \ + } \ } \ } while(0) @@ -184,15 +213,15 @@ vint32m4_t vectorized_exp_on_negative_values(vint32m4_t v_a_q5_26, size_t vl) #undef APPLY_BARREL_SHIFT - vbool8_t v_clamp_mask = __riscv_vmslt_vx_i32m4_b8(v_a_q5_26, s_minus_32_q5_26, vl); - v_current_result = __riscv_vmerge_vxm_i32m4(v_current_result, s_result_zero_q0_31, v_clamp_mask, vl); + vint32m4_t v_final_result = v_current_result; vbool8_t v_zero_mask = __riscv_vmseq_vx_i32m4_b8(v_a_q5_26, 0, vl); - v_current_result = __riscv_vmerge_vxm_i32m4(v_current_result, s_result_one_q0_31, v_zero_mask, vl); + v_final_result = __riscv_vmerge_vxm_i32m4(v_final_result, s_result_one_q0_31, v_zero_mask, vl); - return v_current_result; + return v_final_result; } + template void SoftmaxInt8RVV(const tflite::SoftmaxParams& params, const tflite::RuntimeShape& input_shape, @@ -204,6 +233,8 @@ void SoftmaxInt8RVV(const tflite::SoftmaxParams& params, const int32_t 
input_beta_left_shift = params.input_left_shift; const int diff_min = params.diff_min; static const int kAccumulationIntegerBits = 12; + static const int kAccumulationFractionalBits = 32 - 1 - kAccumulationIntegerBits; // 19 + static const int kExpOutputFractionalBits = 31; // 32 - 1 - 0 const int trailing_dim = input_shape.DimensionsCount() - 1; const int outer_size = tflite::MatchingFlatSizeSkipDim(input_shape, trailing_dim, output_shape); @@ -234,43 +265,44 @@ void SoftmaxInt8RVV(const tflite::SoftmaxParams& params, current_c = 0; while (current_c < depth_sz) { - size_t vl_m4 = __riscv_vsetvl_e32m4(depth_sz - current_c); - size_t vl_m2 = __riscv_vsetvl_e16m2(vl_m4); - size_t vl_m1 = __riscv_vsetvl_e8m1(vl_m4); + size_t vl = __riscv_vsetvl_e8m1(depth_sz - current_c); - vint8m1_t v_input_s8 = __riscv_vle8_v_i8m1(current_input_data + current_c, vl_m1); - vint16m2_t v_input_s16 = __riscv_vsext_vf2_i16m2(v_input_s8, vl_m2); - vint32m4_t v_input_s32 = __riscv_vwadd_vx_i32m4(v_input_s16, 0, vl_m4); - vint32m4_t v_diff_s32 = __riscv_vsub_vx_i32m4(v_input_s32, max_in_row_s32, vl_m4); + vint8m1_t v_input_s8 = __riscv_vle8_v_i8m1(current_input_data + current_c, vl); + vint16m2_t v_input_s16 = __riscv_vsext_vf2_i16m2(v_input_s8, vl); + vint32m4_t v_input_s32 = __riscv_vwadd_vx_i32m4(v_input_s16, 0, vl); - vbool8_t v_diff_mask = __riscv_vmsge_vx_i32m4_b8(v_diff_s32, diff_min, vl_m4); + vint32m4_t v_diff_s32 = __riscv_vsub_vx_i32m4(v_input_s32, max_in_row_s32, vl); - vint32m4_t v_diff_rescaled; + vbool8_t v_diff_mask = __riscv_vmsge_vx_i32m4_b8(v_diff_s32, diff_min, vl); + + vint32m4_t v_diff_rescaled_q5_26; { - vint32m4_t v_a = __riscv_vsll_vx_i32m4(v_diff_s32, input_beta_left_shift, vl_m4); - const int32_t b = input_beta_multiplier; - v_diff_rescaled = SRDMH_vx_i32m4(v_a, b, vl_m4); + vint32m4_t v_shifted_diff = __riscv_vsll_vx_i32m4(v_diff_s32, input_beta_left_shift, vl); + v_diff_rescaled_q5_26 = SRDMH_vx_i32m4(v_shifted_diff, input_beta_multiplier, vl); } - vint32m4_t 
v_exp_val_q0_31 = vectorized_exp_on_negative_values(v_diff_rescaled, vl_m4); + vint32m4_t v_exp_val_q0_31 = vectorized_exp_on_negative_values(v_diff_rescaled_q5_26, vl); - const int rescale_shift = kAccumulationIntegerBits; - vint32m4_t v_exp_term_q12_19 = SRMPOT_vx_i32m4(v_exp_val_q0_31, rescale_shift, vl_m4); + const int rescale_shift_exp_to_accum = kExpOutputFractionalBits - kAccumulationFractionalBits; // 31 - 19 = 12 + vint32m4_t v_exp_term_q12_19 = SRMPOT_vx_i32m4(v_exp_val_q0_31, rescale_shift_exp_to_accum, vl); - vint32m4_t v_zero_q12_19 = __riscv_vmv_v_x_i32m4(0, vl_m4); - vint32m4_t v_exp_term_masked_q12_19 = __riscv_vmerge_vvm_i32m4(v_zero_q12_19, v_exp_term_q12_19, v_diff_mask, vl_m4); + vint32m4_t v_zero_q12_19 = __riscv_vmv_v_x_i32m4(0, vl); + vint32m4_t v_exp_term_masked_q12_19 = __riscv_vmerge_vvm_i32m4(v_zero_q12_19, v_exp_term_q12_19, v_diff_mask, vl); - v_sum_acc_m1 = __riscv_vredsum_vs_i32m4_i32m1(v_exp_term_masked_q12_19, v_sum_acc_m1, vl_m4); + v_sum_acc_m1 = __riscv_vredsum_vs_i32m4_i32m1(v_exp_term_masked_q12_19, v_sum_acc_m1, vl); - current_c += vl_m4; + current_c += vl; } int32_t sum_of_exps_raw = __riscv_vmv_x_s_i32m1_i32(v_sum_acc_m1); int num_bits_over_unit; - gemmlowp::FixedPoint shifted_scale = gemmlowp::FixedPoint::FromRaw(tflite::GetReciprocal(sum_of_exps_raw, kAccumulationIntegerBits, &num_bits_over_unit)); - const int32_t s_shifted_scale_raw = shifted_scale.raw(); + int32_t reciprocal_raw_q0_31 = tflite::GetReciprocal(sum_of_exps_raw, kAccumulationIntegerBits, &num_bits_over_unit); + const int32_t s_shifted_scale_raw_q0_31 = reciprocal_raw_q0_31; + + const int output_bits = sizeof(OutputT) * 8; + const int exponent = num_bits_over_unit + 31 - output_bits; + - const int exponent = num_bits_over_unit + 31 - (sizeof(OutputT) * 8); const OutputT output_min = std::numeric_limits::min(); const OutputT output_max = std::numeric_limits::max(); @@ -280,83 +312,66 @@ void SoftmaxInt8RVV(const tflite::SoftmaxParams& params, current_c = 0; 
while (current_c < depth_sz) { - size_t vl_output; - if constexpr (sizeof(OutputT) == 1) - { - vl_output = __riscv_vsetvl_e8m1(depth_sz - current_c); + size_t vl; + if constexpr (sizeof(OutputT) == 1) { + vl = __riscv_vsetvl_e8m1(depth_sz - current_c); + } else { + vl = __riscv_vsetvl_e16m2(depth_sz - current_c); } - else - { - vl_output = __riscv_vsetvl_e16m2(depth_sz - current_c); - } - size_t vl_m4 = __riscv_vsetvl_e32m4(vl_output); - size_t vl_m2 = __riscv_vsetvl_e16m2(vl_output); - size_t vl_m1 = __riscv_vsetvl_e8m1(vl_output); - vint8m1_t v_input_s8 = __riscv_vle8_v_i8m1(current_input_data + current_c, vl_m1); - vint16m2_t v_input_s16 = __riscv_vsext_vf2_i16m2(v_input_s8, vl_m2); - vint32m4_t v_input_s32 = __riscv_vwadd_vx_i32m4(v_input_s16, 0, vl_m4); - vint32m4_t v_diff_s32 = __riscv_vsub_vx_i32m4(v_input_s32, max_in_row_s32, vl_m4); + vint8m1_t v_input_s8 = __riscv_vle8_v_i8m1(current_input_data + current_c, vl); + vint16m2_t v_input_s16 = __riscv_vsext_vf2_i16m2(v_input_s8, vl); + vint32m4_t v_input_s32 = __riscv_vwadd_vx_i32m4(v_input_s16, 0, vl); + vint32m4_t v_diff_s32 = __riscv_vsub_vx_i32m4(v_input_s32, max_in_row_s32, vl); - vbool8_t v_diff_mask = __riscv_vmsge_vx_i32m4_b8(v_diff_s32, diff_min, vl_m4); + vbool8_t v_diff_mask = __riscv_vmsge_vx_i32m4_b8(v_diff_s32, diff_min, vl); - vint32m4_t v_diff_rescaled; + vint32m4_t v_diff_rescaled_q5_26; { - vint32m4_t v_a = __riscv_vsll_vx_i32m4(v_diff_s32, input_beta_left_shift, vl_m4); - const int32_t b = input_beta_multiplier; - v_diff_rescaled = SRDMH_vx_i32m4(v_a, b, vl_m4); + vint32m4_t v_shifted_diff = __riscv_vsll_vx_i32m4(v_diff_s32, input_beta_left_shift, vl); + v_diff_rescaled_q5_26 = SRDMH_vx_i32m4(v_shifted_diff, input_beta_multiplier, vl); } - vint32m4_t v_exp_in_q0_31 = vectorized_exp_on_negative_values(v_diff_rescaled, vl_m4); + vint32m4_t v_exp_in_q0_31 = vectorized_exp_on_negative_values(v_diff_rescaled_q5_26, vl); - vint32m4_t v_product_raw_q0_31 = RoundingMul_vx_i32m4(v_exp_in_q0_31, 
s_shifted_scale_raw, vl_m4); + vint32m4_t v_product_raw_q0_31 = SRDMH_vx_i32m4(v_exp_in_q0_31, s_shifted_scale_raw_q0_31, vl); - vint32m4_t v_zero_q0_31 = __riscv_vmv_v_x_i32m4(0, vl_m4); - vint32m4_t v_product_masked = __riscv_vmerge_vvm_i32m4(v_zero_q0_31, v_product_raw_q0_31, v_diff_mask, vl_m4); + vint32m4_t v_unsat_output = SRMPOT_vx_i32m4(v_product_raw_q0_31, exponent, vl); - vint32m4_t v_unsat_output; - if (exponent <= 0) - { - v_unsat_output = v_product_masked; - } - else - { - const int32_t round_mask = (static_cast(1) << exponent) - 1; - const int32_t threshold_base = round_mask >> 1; - vint32m4_t v_x_shifted = __riscv_vsra_vx_i32m4(v_product_masked, exponent, vl_m4); - vint32m4_t v_remainder = __riscv_vand_vx_i32m4(v_product_masked, round_mask, vl_m4); - vbool8_t v_is_negative_mask = __riscv_vmslt_vx_i32m4_b8(v_product_masked, 0, vl_m4); - - vint32m4_t v_zero = __riscv_vmv_v_x_i32m4(0, vl_m4); - vint32m4_t v_neg_adjust = __riscv_vmerge_vxm_i32m4(v_zero, 1, v_is_negative_mask, vl_m4); - vint32m4_t v_threshold = __riscv_vadd_vx_i32m4(v_neg_adjust, threshold_base, vl_m4); - - vbool8_t v_P_mask = __riscv_vmsgt_vv_i32m4_b8(v_remainder, v_threshold, vl_m4); - v_unsat_output = __riscv_vadd_vx_i32m4_m(v_P_mask, v_x_shifted, 1, vl_m4); - } - vint32m4_t v_shifted_output = __riscv_vadd_vx_i32m4(v_unsat_output, output_min_s32, vl_m4); - vint32m4_t v_clamped_output = __riscv_vmax_vx_i32m4(v_shifted_output, output_min_s32, vl_m4); - v_clamped_output = __riscv_vmin_vx_i32m4(v_clamped_output, output_max_s32, vl_m4); - vint32m4_t v_final_s32 = v_clamped_output; + vint32m4_t v_shifted_output = __riscv_vadd_vx_i32m4(v_unsat_output, output_min_s32, vl); + + vint32m4_t v_clamped_output = __riscv_vmax_vx_i32m4(v_shifted_output, output_min_s32, vl); + v_clamped_output = __riscv_vmin_vx_i32m4(v_clamped_output, output_max_s32, vl); + vint32m4_t v_output_min_vec = __riscv_vmv_v_x_i32m4(output_min_s32, vl); + vint32m4_t v_final_s32 = __riscv_vmerge_vvm_i32m4(v_output_min_vec, 
v_clamped_output, v_diff_mask, vl); if constexpr (sizeof(OutputT) == 1) { - size_t vl_w16_out = __riscv_vsetvl_e16m2(vl_output); - vint16m2_t v_temp_s16 = __riscv_vncvt_x_x_w_i16m2(v_final_s32, vl_w16_out); - size_t vl_w8_out = __riscv_vsetvl_e8m1(vl_output); - vint8m1_t v_final_output = __riscv_vncvt_x_x_w_i8m1(v_temp_s16, vl_w8_out); - __riscv_vse8_v_i8m1(reinterpret_cast(current_output_data + current_c), v_final_output, vl_output); + vint16m2_t v_temp_s16 = __riscv_vncvt_x_x_w_i16m2(v_final_s32, vl); + vint8m1_t v_final_output = __riscv_vncvt_x_x_w_i8m1(v_temp_s16, vl); + __riscv_vse8_v_i8m1(reinterpret_cast(current_output_data + current_c), v_final_output, vl); } else { - size_t vl_w16_out = __riscv_vsetvl_e16m2(vl_output); - vint16m2_t v_final_output = __riscv_vncvt_x_x_w_i16m2(v_final_s32, vl_w16_out); - __riscv_vse16_v_i16m2(reinterpret_cast(current_output_data + current_c), v_final_output, vl_output); + vint16m2_t v_final_output = __riscv_vncvt_x_x_w_i16m2(v_final_s32, vl); + __riscv_vse16_v_i16m2(reinterpret_cast(current_output_data + current_c), v_final_output, vl); } - - current_c += vl_output; + + current_c += vl; } } -} \ No newline at end of file +} + +template void SoftmaxInt8RVV(const tflite::SoftmaxParams& params, + const tflite::RuntimeShape& input_shape, + const int8_t* input_data, + const tflite::RuntimeShape& output_shape, + int8_t* output_data); + +template void SoftmaxInt8RVV(const tflite::SoftmaxParams& params, + const tflite::RuntimeShape& input_shape, + const int8_t* input_data, + const tflite::RuntimeShape& output_shape, + int16_t* output_data); \ No newline at end of file From f451a40e4335ef39237aa878f60c50414937f0c8 Mon Sep 17 00:00:00 2001 From: JaimeHW Date: Thu, 24 Apr 2025 17:31:56 -0500 Subject: [PATCH 38/86] Fix incorrect nudge factor in Softmax --- .../micro/kernels/riscv_vector/softmax_rvv.h | 102 +++++++++++------- 1 file changed, 63 insertions(+), 39 deletions(-) diff --git 
a/tensorflow/lite/micro/kernels/riscv_vector/softmax_rvv.h b/tensorflow/lite/micro/kernels/riscv_vector/softmax_rvv.h index fa702b4942d..5de0dee1938 100644 --- a/tensorflow/lite/micro/kernels/riscv_vector/softmax_rvv.h +++ b/tensorflow/lite/micro/kernels/riscv_vector/softmax_rvv.h @@ -9,6 +9,7 @@ #include "tensorflow/lite/micro/kernels/softmax.h" #include "tensorflow/lite/micro/micro_log.h" + inline vint32m4_t SRDMH_vv_i32m4(vint32m4_t v_a, vint32m4_t v_b, size_t vl) { const int32_t s_int32_min = INT32_MIN; @@ -20,17 +21,26 @@ inline vint32m4_t SRDMH_vv_i32m4(vint32m4_t v_a, vint32m4_t v_b, size_t vl) vint32m4_t v_prod_lo = __riscv_vmul_vv_i32m4(v_a, v_b, vl); vint32m4_t v_prod_hi = __riscv_vmulh_vv_i32m4(v_a, v_b, vl); - - const int32_t s_nudge_val = (INT32_C(1) << 30); vuint32m4_t v_prod_lo_u = __riscv_vreinterpret_v_i32m4_u32m4(v_prod_lo); - vuint32m4_t v_sum_lo_u = __riscv_vadd_vx_u32m4(v_prod_lo_u, s_nudge_val, vl); + const int32_t s_nudge_pos = (1 << 30); + const int32_t s_nudge_neg = 1 - (1 << 30); + vint32m4_t v_a_sign = __riscv_vsra_vx_i32m4(v_a, 31, vl); + vint32m4_t v_b_sign = __riscv_vsra_vx_i32m4(v_b, 31, vl); + vbool8_t v_prod_non_negative_mask = __riscv_vmseq_vv_i32m4_b8(v_a_sign, v_b_sign, vl); + + vint32m4_t v_nudge = __riscv_vmv_v_x_i32m4(s_nudge_neg, vl); + v_nudge = __riscv_vmerge_vxm_i32m4(v_nudge, s_nudge_pos, v_prod_non_negative_mask, vl); + vuint32m4_t v_nudge_u = __riscv_vreinterpret_v_i32m4_u32m4(v_nudge); + + vuint32m4_t v_sum_lo_u = __riscv_vadd_vv_u32m4(v_prod_lo_u, v_nudge_u, vl); vbool8_t v_carry_mask = __riscv_vmsltu_vv_u32m4_b8(v_sum_lo_u, v_prod_lo_u, vl); - vint32m4_t v_sum_hi = __riscv_vadd_vx_i32m4_m(v_carry_mask, v_prod_hi, 1, vl); + vint32m4_t v_nudge_hi = __riscv_vsra_vx_i32m4(v_nudge, 31, vl); + vint32m4_t v_sum_hi = __riscv_vadd_vv_i32m4(v_prod_hi, v_nudge_hi, vl); + v_sum_hi = __riscv_vadd_vx_i32m4_m(v_carry_mask, v_sum_hi, 1, vl); vuint32m4_t v_sum_lo_shifted_u = __riscv_vsrl_vx_u32m4(v_sum_lo_u, 31, vl); vint32m4_t 
v_sum_hi_shifted = __riscv_vsll_vx_i32m4(v_sum_hi, 1, vl); - vint32m4_t v_result_before_sat = __riscv_vor_vv_i32m4(v_sum_hi_shifted, __riscv_vreinterpret_v_u32m4_i32m4(v_sum_lo_shifted_u), vl); vint32m4_t v_result = __riscv_vmerge_vxm_i32m4(v_result_before_sat, s_int32_max, v_overflow_mask, vl); @@ -38,6 +48,7 @@ inline vint32m4_t SRDMH_vv_i32m4(vint32m4_t v_a, vint32m4_t v_b, size_t vl) return v_result; } + inline vint32m4_t SRDMH_vx_i32m4(vint32m4_t v_a, int32_t s_b, size_t vl) { const int32_t s_int32_min = INT32_MIN; @@ -50,22 +61,31 @@ inline vint32m4_t SRDMH_vx_i32m4(vint32m4_t v_a, int32_t s_b, size_t vl) } else { vint32m4_t v_zero = __riscv_vmv_v_x_i32m4(0, vl); - v_overflow_mask = __riscv_vmslt_vv_i32m4_b8(v_zero, v_zero, vl); // Create all-false mask + v_overflow_mask = __riscv_vmslt_vv_i32m4_b8(v_zero, v_zero, vl); } vint32m4_t v_prod_lo = __riscv_vmul_vx_i32m4(v_a, s_b, vl); vint32m4_t v_prod_hi = __riscv_vmulh_vx_i32m4(v_a, s_b, vl); - - const int32_t s_nudge_val = (INT32_C(1) << 30); vuint32m4_t v_prod_lo_u = __riscv_vreinterpret_v_i32m4_u32m4(v_prod_lo); - vuint32m4_t v_sum_lo_u = __riscv_vadd_vx_u32m4(v_prod_lo_u, s_nudge_val, vl); + const int32_t s_nudge_pos = (1 << 30); + const int32_t s_nudge_neg = 1 - (1 << 30); + vint32m4_t v_a_sign = __riscv_vsra_vx_i32m4(v_a, 31, vl); + int32_t s_b_sign = s_b >> 31; + vbool8_t v_prod_non_negative_mask = __riscv_vmseq_vx_i32m4_b8(v_a_sign, s_b_sign, vl); + + vint32m4_t v_nudge = __riscv_vmv_v_x_i32m4(s_nudge_neg, vl); + v_nudge = __riscv_vmerge_vxm_i32m4(v_nudge, s_nudge_pos, v_prod_non_negative_mask, vl); + vuint32m4_t v_nudge_u = __riscv_vreinterpret_v_i32m4_u32m4(v_nudge); + + vuint32m4_t v_sum_lo_u = __riscv_vadd_vv_u32m4(v_prod_lo_u, v_nudge_u, vl); vbool8_t v_carry_mask = __riscv_vmsltu_vv_u32m4_b8(v_sum_lo_u, v_prod_lo_u, vl); - vint32m4_t v_sum_hi = __riscv_vadd_vx_i32m4_m(v_carry_mask, v_prod_hi, 1, vl); + vint32m4_t v_nudge_hi = __riscv_vsra_vx_i32m4(v_nudge, 31, vl); + vint32m4_t v_sum_hi = 
__riscv_vadd_vv_i32m4(v_prod_hi, v_nudge_hi, vl); + v_sum_hi = __riscv_vadd_vx_i32m4_m(v_carry_mask, v_sum_hi, 1, vl); vuint32m4_t v_sum_lo_shifted_u = __riscv_vsrl_vx_u32m4(v_sum_lo_u, 31, vl); vint32m4_t v_sum_hi_shifted = __riscv_vsll_vx_i32m4(v_sum_hi, 1, vl); - vint32m4_t v_result_before_sat = __riscv_vor_vv_i32m4(v_sum_hi_shifted, __riscv_vreinterpret_v_u32m4_i32m4(v_sum_lo_shifted_u), vl); vint32m4_t v_result = __riscv_vmerge_vxm_i32m4(v_result_before_sat, s_int32_max, v_overflow_mask, vl); @@ -85,22 +105,26 @@ inline vint32m4_t SRMPOT_vx_i32m4(vint32m4_t v_vec, int shift, size_t vl) if (s_shift >= 31) { vbool8_t v_pos_mask = __riscv_vmsgt_vx_i32m4_b8(v_vec, 0, vl); - vint32m4_t v_res = __riscv_vmv_v_x_i32m4(s_min_val, vl); + vbool8_t v_neg_mask = __riscv_vmslt_vx_i32m4_b8(v_vec, 0, vl); + vbool8_t v_zero_mask = __riscv_vmseq_vx_i32m4_b8(v_vec, 0, vl); + vint32m4_t v_res = __riscv_vmv_v_x_i32m4(0, vl); v_res = __riscv_vmerge_vxm_i32m4(v_res, s_max_val, v_pos_mask, vl); + v_res = __riscv_vmerge_vxm_i32m4(v_res, s_min_val, v_neg_mask, vl); + v_res = __riscv_vmerge_vxm_i32m4(v_res, 0, v_zero_mask, vl); return v_res; - } - - const int32_t s_pos_thresh = s_max_val >> s_shift; - const int32_t s_neg_thresh = s_min_val >> s_shift; + } else { + const int32_t s_pos_thresh = s_max_val >> s_shift; + const int32_t s_neg_thresh = s_min_val >> s_shift; - vbool8_t v_pos_ovfl_mask = __riscv_vmsgt_vx_i32m4_b8(v_vec, s_pos_thresh, vl); - vbool8_t v_neg_ovfl_mask = __riscv_vmslt_vx_i32m4_b8(v_vec, s_neg_thresh, vl); + vbool8_t v_pos_ovfl_mask = __riscv_vmsgt_vx_i32m4_b8(v_vec, s_pos_thresh, vl); + vbool8_t v_neg_ovfl_mask = __riscv_vmslt_vx_i32m4_b8(v_vec, s_neg_thresh, vl); - vint32m4_t v_shifted = __riscv_vsll_vx_i32m4(v_vec, s_shift, vl); + vint32m4_t v_shifted = __riscv_vsll_vx_i32m4(v_vec, s_shift, vl); - vint32m4_t v_result = __riscv_vmerge_vxm_i32m4(v_shifted, s_max_val, v_pos_ovfl_mask, vl); - v_result = __riscv_vmerge_vxm_i32m4(v_result, s_min_val, v_neg_ovfl_mask, 
vl); - return v_result; + vint32m4_t v_result = __riscv_vmerge_vxm_i32m4(v_shifted, s_max_val, v_pos_ovfl_mask, vl); + v_result = __riscv_vmerge_vxm_i32m4(v_result, s_min_val, v_neg_ovfl_mask, vl); + return v_result; + } } else if (shift == 0) { return v_vec; @@ -134,11 +158,19 @@ inline vint32m4_t SRMPOT_vx_i32m4(vint32m4_t v_vec, int shift, size_t vl) } +inline vint32m4_t MultiplyByQuantizedMultiplierGreaterThanOne_vx_i32m4( + vint32m4_t v_x, int32_t quantized_multiplier, int left_shift, size_t vl) { + + vint32m4_t v_a = SRMPOT_vx_i32m4(v_x, -left_shift, vl); + return SRDMH_vx_i32m4(v_a, quantized_multiplier, vl); +} + + vint32m4_t vectorized_exp_on_negative_values(vint32m4_t v_a_q5_26, size_t vl) { const int kInputIntegerBits = 5; - const int kInputFractionalBits = 32 - 1 - kInputIntegerBits; // 26 - const int kOutputFractionalBits = 31; // 32 - 1 - 0 + const int kInputFractionalBits = 32 - 1 - kInputIntegerBits; + const int kOutputFractionalBits = 31; const int32_t s_kOneQuarter_q5_26 = INT32_C(1) << (kInputFractionalBits - 2); const int32_t s_mask_val = s_kOneQuarter_q5_26 - 1; @@ -233,8 +265,8 @@ void SoftmaxInt8RVV(const tflite::SoftmaxParams& params, const int32_t input_beta_left_shift = params.input_left_shift; const int diff_min = params.diff_min; static const int kAccumulationIntegerBits = 12; - static const int kAccumulationFractionalBits = 32 - 1 - kAccumulationIntegerBits; // 19 - static const int kExpOutputFractionalBits = 31; // 32 - 1 - 0 + static const int kAccumulationFractionalBits = 32 - 1 - kAccumulationIntegerBits; + static const int kExpOutputFractionalBits = 31; const int trailing_dim = input_shape.DimensionsCount() - 1; const int outer_size = tflite::MatchingFlatSizeSkipDim(input_shape, trailing_dim, output_shape); @@ -275,15 +307,12 @@ void SoftmaxInt8RVV(const tflite::SoftmaxParams& params, vbool8_t v_diff_mask = __riscv_vmsge_vx_i32m4_b8(v_diff_s32, diff_min, vl); - vint32m4_t v_diff_rescaled_q5_26; - { - vint32m4_t v_shifted_diff = 
__riscv_vsll_vx_i32m4(v_diff_s32, input_beta_left_shift, vl); - v_diff_rescaled_q5_26 = SRDMH_vx_i32m4(v_shifted_diff, input_beta_multiplier, vl); - } + vint32m4_t v_diff_rescaled_q5_26 = MultiplyByQuantizedMultiplierGreaterThanOne_vx_i32m4( + v_diff_s32, input_beta_multiplier, input_beta_left_shift, vl); vint32m4_t v_exp_val_q0_31 = vectorized_exp_on_negative_values(v_diff_rescaled_q5_26, vl); - const int rescale_shift_exp_to_accum = kExpOutputFractionalBits - kAccumulationFractionalBits; // 31 - 19 = 12 + const int rescale_shift_exp_to_accum = kExpOutputFractionalBits - kAccumulationFractionalBits; vint32m4_t v_exp_term_q12_19 = SRMPOT_vx_i32m4(v_exp_val_q0_31, rescale_shift_exp_to_accum, vl); vint32m4_t v_zero_q12_19 = __riscv_vmv_v_x_i32m4(0, vl); @@ -302,8 +331,6 @@ void SoftmaxInt8RVV(const tflite::SoftmaxParams& params, const int output_bits = sizeof(OutputT) * 8; const int exponent = num_bits_over_unit + 31 - output_bits; - - const OutputT output_min = std::numeric_limits::min(); const OutputT output_max = std::numeric_limits::max(); const int32_t output_min_s32 = static_cast(output_min); @@ -326,11 +353,8 @@ void SoftmaxInt8RVV(const tflite::SoftmaxParams& params, vbool8_t v_diff_mask = __riscv_vmsge_vx_i32m4_b8(v_diff_s32, diff_min, vl); - vint32m4_t v_diff_rescaled_q5_26; - { - vint32m4_t v_shifted_diff = __riscv_vsll_vx_i32m4(v_diff_s32, input_beta_left_shift, vl); - v_diff_rescaled_q5_26 = SRDMH_vx_i32m4(v_shifted_diff, input_beta_multiplier, vl); - } + vint32m4_t v_diff_rescaled_q5_26 = MultiplyByQuantizedMultiplierGreaterThanOne_vx_i32m4( + v_diff_s32, input_beta_multiplier, input_beta_left_shift, vl); vint32m4_t v_exp_in_q0_31 = vectorized_exp_on_negative_values(v_diff_rescaled_q5_26, vl); From b427443a46ae5d86c3d131ecff10dde24d3ef9f0 Mon Sep 17 00:00:00 2001 From: JaimeHW Date: Thu, 24 Apr 2025 18:06:33 -0500 Subject: [PATCH 39/86] Update Softmax --- .../micro/kernels/riscv_vector/softmax_rvv.h | 220 ++++++++---------- 1 file changed, 99 
insertions(+), 121 deletions(-) diff --git a/tensorflow/lite/micro/kernels/riscv_vector/softmax_rvv.h b/tensorflow/lite/micro/kernels/riscv_vector/softmax_rvv.h index 5de0dee1938..4c00f98447a 100644 --- a/tensorflow/lite/micro/kernels/riscv_vector/softmax_rvv.h +++ b/tensorflow/lite/micro/kernels/riscv_vector/softmax_rvv.h @@ -1,3 +1,6 @@ +#ifndef TENSORFLOW_LITE_MICRO_KERNELS_RISCV_VECTOR_SOFTMAX_RVV_H_ +#define TENSORFLOW_LITE_MICRO_KERNELS_RISCV_VECTOR_SOFTMAX_RVV_H_ + #include #include #include @@ -9,11 +12,12 @@ #include "tensorflow/lite/micro/kernels/softmax.h" #include "tensorflow/lite/micro/micro_log.h" - inline vint32m4_t SRDMH_vv_i32m4(vint32m4_t v_a, vint32m4_t v_b, size_t vl) { const int32_t s_int32_min = INT32_MIN; const int32_t s_int32_max = INT32_MAX; + const int32_t s_nudge_pos = (INT32_C(1) << 30); + const int32_t s_nudge_neg = 1 - (INT32_C(1) << 30); vbool8_t v_min_mask_a = __riscv_vmseq_vx_i32m4_b8(v_a, s_int32_min, vl); vbool8_t v_min_mask_b = __riscv_vmseq_vx_i32m4_b8(v_b, s_int32_min, vl); @@ -23,19 +27,19 @@ inline vint32m4_t SRDMH_vv_i32m4(vint32m4_t v_a, vint32m4_t v_b, size_t vl) vint32m4_t v_prod_hi = __riscv_vmulh_vv_i32m4(v_a, v_b, vl); vuint32m4_t v_prod_lo_u = __riscv_vreinterpret_v_i32m4_u32m4(v_prod_lo); - const int32_t s_nudge_pos = (1 << 30); - const int32_t s_nudge_neg = 1 - (1 << 30); - vint32m4_t v_a_sign = __riscv_vsra_vx_i32m4(v_a, 31, vl); - vint32m4_t v_b_sign = __riscv_vsra_vx_i32m4(v_b, 31, vl); - vbool8_t v_prod_non_negative_mask = __riscv_vmseq_vv_i32m4_b8(v_a_sign, v_b_sign, vl); + vint32m4_t v_xor_signs = __riscv_vxor_vv_i32m4(v_a, v_b, vl); + vbool8_t v_prod_sign_pos_mask = __riscv_vmsge_vx_i32m4_b8(v_xor_signs, 0, vl); - vint32m4_t v_nudge = __riscv_vmv_v_x_i32m4(s_nudge_neg, vl); - v_nudge = __riscv_vmerge_vxm_i32m4(v_nudge, s_nudge_pos, v_prod_non_negative_mask, vl); - vuint32m4_t v_nudge_u = __riscv_vreinterpret_v_i32m4_u32m4(v_nudge); + vint32m4_t v_nudge_lo = __riscv_vmv_v_x_i32m4(s_nudge_neg, vl); + v_nudge_lo 
= __riscv_vmerge_vxm_i32m4(v_nudge_lo, s_nudge_pos, v_prod_sign_pos_mask, vl); + vuint32m4_t v_nudge_lo_u = __riscv_vreinterpret_v_i32m4_u32m4(v_nudge_lo); - vuint32m4_t v_sum_lo_u = __riscv_vadd_vv_u32m4(v_prod_lo_u, v_nudge_u, vl); + vint32m4_t v_nudge_hi = __riscv_vmv_v_x_i32m4(-1, vl); + v_nudge_hi = __riscv_vmerge_vxm_i32m4(v_nudge_hi, 0, v_prod_sign_pos_mask, vl); + + vuint32m4_t v_sum_lo_u = __riscv_vadd_vv_u32m4(v_prod_lo_u, v_nudge_lo_u, vl); vbool8_t v_carry_mask = __riscv_vmsltu_vv_u32m4_b8(v_sum_lo_u, v_prod_lo_u, vl); - vint32m4_t v_nudge_hi = __riscv_vsra_vx_i32m4(v_nudge, 31, vl); + vint32m4_t v_sum_hi = __riscv_vadd_vv_i32m4(v_prod_hi, v_nudge_hi, vl); v_sum_hi = __riscv_vadd_vx_i32m4_m(v_carry_mask, v_sum_hi, 1, vl); @@ -48,11 +52,12 @@ inline vint32m4_t SRDMH_vv_i32m4(vint32m4_t v_a, vint32m4_t v_b, size_t vl) return v_result; } - inline vint32m4_t SRDMH_vx_i32m4(vint32m4_t v_a, int32_t s_b, size_t vl) { const int32_t s_int32_min = INT32_MIN; const int32_t s_int32_max = INT32_MAX; + const int32_t s_nudge_pos = (INT32_C(1) << 30); + const int32_t s_nudge_neg = 1 - (INT32_C(1) << 30); vbool8_t v_overflow_mask; if (s_b == s_int32_min) @@ -68,19 +73,19 @@ inline vint32m4_t SRDMH_vx_i32m4(vint32m4_t v_a, int32_t s_b, size_t vl) vint32m4_t v_prod_hi = __riscv_vmulh_vx_i32m4(v_a, s_b, vl); vuint32m4_t v_prod_lo_u = __riscv_vreinterpret_v_i32m4_u32m4(v_prod_lo); - const int32_t s_nudge_pos = (1 << 30); - const int32_t s_nudge_neg = 1 - (1 << 30); - vint32m4_t v_a_sign = __riscv_vsra_vx_i32m4(v_a, 31, vl); - int32_t s_b_sign = s_b >> 31; - vbool8_t v_prod_non_negative_mask = __riscv_vmseq_vx_i32m4_b8(v_a_sign, s_b_sign, vl); + vint32m4_t v_xor_signs = __riscv_vxor_vx_i32m4(v_a, s_b, vl); + vbool8_t v_prod_sign_pos_mask = __riscv_vmsge_vx_i32m4_b8(v_xor_signs, 0, vl); + + vint32m4_t v_nudge_lo = __riscv_vmv_v_x_i32m4(s_nudge_neg, vl); + v_nudge_lo = __riscv_vmerge_vxm_i32m4(v_nudge_lo, s_nudge_pos, v_prod_sign_pos_mask, vl); + vuint32m4_t v_nudge_lo_u = 
__riscv_vreinterpret_v_i32m4_u32m4(v_nudge_lo); - vint32m4_t v_nudge = __riscv_vmv_v_x_i32m4(s_nudge_neg, vl); - v_nudge = __riscv_vmerge_vxm_i32m4(v_nudge, s_nudge_pos, v_prod_non_negative_mask, vl); - vuint32m4_t v_nudge_u = __riscv_vreinterpret_v_i32m4_u32m4(v_nudge); + vint32m4_t v_nudge_hi = __riscv_vmv_v_x_i32m4(-1, vl); + v_nudge_hi = __riscv_vmerge_vxm_i32m4(v_nudge_hi, 0, v_prod_sign_pos_mask, vl); - vuint32m4_t v_sum_lo_u = __riscv_vadd_vv_u32m4(v_prod_lo_u, v_nudge_u, vl); + vuint32m4_t v_sum_lo_u = __riscv_vadd_vv_u32m4(v_prod_lo_u, v_nudge_lo_u, vl); vbool8_t v_carry_mask = __riscv_vmsltu_vv_u32m4_b8(v_sum_lo_u, v_prod_lo_u, vl); - vint32m4_t v_nudge_hi = __riscv_vsra_vx_i32m4(v_nudge, 31, vl); + vint32m4_t v_sum_hi = __riscv_vadd_vv_i32m4(v_prod_hi, v_nudge_hi, vl); v_sum_hi = __riscv_vadd_vx_i32m4_m(v_carry_mask, v_sum_hi, 1, vl); @@ -93,31 +98,33 @@ inline vint32m4_t SRDMH_vx_i32m4(vint32m4_t v_a, int32_t s_b, size_t vl) return v_result; } - inline vint32m4_t SRMPOT_vx_i32m4(vint32m4_t v_vec, int shift, size_t vl) { - if (shift < 0) { - const int32_t s_shift = -shift; + if (shift > 0) { + + const int32_t s_shift = shift; if (s_shift == 0) return v_vec; const int32_t s_max_val = INT32_MAX; const int32_t s_min_val = INT32_MIN; if (s_shift >= 31) { + vint32m4_t v_zero = __riscv_vmv_v_x_i32m4(0, vl); vbool8_t v_pos_mask = __riscv_vmsgt_vx_i32m4_b8(v_vec, 0, vl); vbool8_t v_neg_mask = __riscv_vmslt_vx_i32m4_b8(v_vec, 0, vl); - vbool8_t v_zero_mask = __riscv_vmseq_vx_i32m4_b8(v_vec, 0, vl); - vint32m4_t v_res = __riscv_vmv_v_x_i32m4(0, vl); - v_res = __riscv_vmerge_vxm_i32m4(v_res, s_max_val, v_pos_mask, vl); - v_res = __riscv_vmerge_vxm_i32m4(v_res, s_min_val, v_neg_mask, vl); - v_res = __riscv_vmerge_vxm_i32m4(v_res, 0, v_zero_mask, vl); - return v_res; + vint32m4_t v_saturated = __riscv_vmerge_vxm_i32m4(v_zero, s_max_val, v_pos_mask, vl); + v_saturated = __riscv_vmerge_vxm_i32m4(v_saturated, s_min_val, v_neg_mask, vl); + return v_saturated; } else { - 
const int32_t s_pos_thresh = s_max_val >> s_shift; - const int32_t s_neg_thresh = s_min_val >> s_shift; + const int32_t scalar_type_bits = 32; + const int64_t pos_threshold_64 = (INT64_C(1) << (scalar_type_bits - 1 - s_shift)) - 1; + const int64_t neg_threshold_64 = -(INT64_C(1) << (scalar_type_bits - 1 - s_shift)); - vbool8_t v_pos_ovfl_mask = __riscv_vmsgt_vx_i32m4_b8(v_vec, s_pos_thresh, vl); - vbool8_t v_neg_ovfl_mask = __riscv_vmslt_vx_i32m4_b8(v_vec, s_neg_thresh, vl); + const int32_t pos_threshold = (pos_threshold_64 > INT32_MAX) ? INT32_MAX : (int32_t)pos_threshold_64; + const int32_t neg_threshold = (neg_threshold_64 < INT32_MIN) ? INT32_MIN : (int32_t)neg_threshold_64; + + vbool8_t v_pos_ovfl_mask = __riscv_vmsgt_vx_i32m4_b8(v_vec, pos_threshold, vl); + vbool8_t v_neg_ovfl_mask = __riscv_vmslt_vx_i32m4_b8(v_vec, neg_threshold, vl); vint32m4_t v_shifted = __riscv_vsll_vx_i32m4(v_vec, s_shift, vl); @@ -129,29 +136,25 @@ inline vint32m4_t SRMPOT_vx_i32m4(vint32m4_t v_vec, int shift, size_t vl) } else if (shift == 0) { return v_vec; } else { - shift = std::min(31, shift); - int32_t s_round_mask; - if (shift == 31) { - s_round_mask = INT32_MAX; - } else { - s_round_mask = (INT32_C(1) << shift) - 1; - } - - const int32_t s_threshold_base = s_round_mask >> 1; + int exponent = -shift; + exponent = std::min(31, exponent); + if (exponent == 0) return v_vec; - vint32m4_t v_remainder = __riscv_vand_vx_i32m4(v_vec, s_round_mask, vl); + const int32_t s_mask_val = (INT32_C(1) << exponent) - 1; + const int32_t s_zero = 0; + const int32_t s_one = 1; - vbool8_t v_is_neg_mask = __riscv_vmslt_vx_i32m4_b8(v_vec, 0, vl); + vint32m4_t v_remainder = __riscv_vand_vx_i32m4(v_vec, s_mask_val, vl); + vint32m4_t v_shifted = __riscv_vsra_vx_i32m4(v_vec, exponent, vl); + const int32_t s_threshold_base = s_mask_val >> 1; vint32m4_t v_threshold = __riscv_vmv_v_x_i32m4(s_threshold_base, vl); - v_threshold = __riscv_vadd_vx_i32m4_m(v_is_neg_mask, v_threshold, 1, vl); + vbool8_t 
v_is_neg_mask = __riscv_vmslt_vx_i32m4_b8(v_vec, s_zero, vl); + v_threshold = __riscv_vadd_vx_i32m4_m(v_is_neg_mask, v_threshold, s_one, vl); vbool8_t v_add1_mask = __riscv_vmsgt_vv_i32m4_b8(v_remainder, v_threshold, vl); - - vint32m4_t v_shifted = __riscv_vsra_vx_i32m4(v_vec, shift, vl); - - vint32m4_t v_result = __riscv_vadd_vx_i32m4_m(v_add1_mask, v_shifted, 1, vl); + vint32m4_t v_result = __riscv_vadd_vx_i32m4_m(v_add1_mask, v_shifted, s_one, vl); return v_result; } @@ -161,11 +164,10 @@ inline vint32m4_t SRMPOT_vx_i32m4(vint32m4_t v_vec, int shift, size_t vl) inline vint32m4_t MultiplyByQuantizedMultiplierGreaterThanOne_vx_i32m4( vint32m4_t v_x, int32_t quantized_multiplier, int left_shift, size_t vl) { - vint32m4_t v_a = SRMPOT_vx_i32m4(v_x, -left_shift, vl); - return SRDMH_vx_i32m4(v_a, quantized_multiplier, vl); + vint32m4_t v_shifted_x = SRMPOT_vx_i32m4(v_x, left_shift, vl); + return SRDMH_vx_i32m4(v_shifted_x, quantized_multiplier, vl); } - vint32m4_t vectorized_exp_on_negative_values(vint32m4_t v_a_q5_26, size_t vl) { const int kInputIntegerBits = 5; @@ -180,55 +182,52 @@ vint32m4_t vectorized_exp_on_negative_values(vint32m4_t v_a_q5_26, size_t vl) const int32_t s_one_third_q0_31 = 715827883; const int32_t s_one_eighth_q0_31 = INT32_C(1) << (kOutputFractionalBits - 3); - const int32_t s_mult_exp_neg_1_4 = 1672461947; - const int32_t s_mult_exp_neg_1_2 = 1302514674; - const int32_t s_mult_exp_neg_1 = 790015084; - const int32_t s_mult_exp_neg_2 = 290630308; - const int32_t s_mult_exp_neg_4 = 39332535; - const int32_t s_mult_exp_neg_8 = 720401; - const int32_t s_mult_exp_neg_16 = 242; - vint32m4_t v_a_masked = __riscv_vand_vx_i32m4(v_a_q5_26, s_mask_val, vl); vint32m4_t v_a_mod_q_m_q_q5_26 = __riscv_vsub_vx_i32m4(v_a_masked, s_kOneQuarter_q5_26, vl); - vint32m4_t v_remainder_q5_26 = __riscv_vsub_vv_i32m4(v_a_q5_26, v_a_mod_q_m_q_q5_26, vl); const int rescale_shift = kInputIntegerBits - 0; vint32m4_t v_a_input_taylor_q0_31 = 
SRMPOT_vx_i32m4(v_a_mod_q_m_q_q5_26, -rescale_shift, vl); + vint32m4_t v_y = __riscv_vadd_vx_i32m4(v_a_input_taylor_q0_31, s_one_eighth_q0_31, vl); - vint32m4_t v_x = __riscv_vadd_vx_i32m4(v_a_input_taylor_q0_31, s_one_eighth_q0_31, vl); - - vint32m4_t v_x2 = SRDMH_vv_i32m4(v_x, v_x, vl); - vint32m4_t v_x3 = SRDMH_vv_i32m4(v_x2, v_x, vl); - vint32m4_t v_x4 = SRDMH_vv_i32m4(v_x2, v_x2, vl); + vint32m4_t v_y2 = SRDMH_vv_i32m4(v_y, v_y, vl); + vint32m4_t v_y3 = SRDMH_vv_i32m4(v_y2, v_y, vl); + vint32m4_t v_y4 = SRDMH_vv_i32m4(v_y2, v_y2, vl); - vint32m4_t v_x4_over_4 = SRMPOT_vx_i32m4(v_x4, 2, vl); + vint32m4_t v_y4_over_4 = SRMPOT_vx_i32m4(v_y4, -2, vl); - vint32m4_t v_term1 = __riscv_vadd_vv_i32m4(v_x4_over_4, v_x3, vl); + vint32m4_t v_term1 = __riscv_vadd_vv_i32m4(v_y4_over_4, v_y3, vl); vint32m4_t v_term2 = SRDMH_vx_i32m4(v_term1, s_one_third_q0_31, vl); - vint32m4_t v_term3 = __riscv_vadd_vv_i32m4(v_term2, v_x2, vl); + vint32m4_t v_term3 = __riscv_vadd_vv_i32m4(v_term2, v_y2, vl); + vint32m4_t v_sum_of_higher_terms = SRMPOT_vx_i32m4(v_term3, -1, vl); - vint32m4_t v_inner_sum = SRMPOT_vx_i32m4(v_term3, 1, vl); - vint32m4_t v_bracket_term = __riscv_vadd_vv_i32m4(v_x, v_inner_sum, vl); + vint32m4_t v_bracket_term = __riscv_vadd_vv_i32m4(v_y, v_sum_of_higher_terms, vl); - vint32m4_t v_mul_term = SRDMH_vx_i32m4(v_bracket_term, s_exp_neg_1_8_q0_31, vl); + vint32m4_t v_const_term_vec = __riscv_vmv_v_x_i32m4(s_exp_neg_1_8_q0_31, vl); + vint32m4_t v_mul_term = SRDMH_vv_i32m4(v_bracket_term, v_const_term_vec, vl); - vint32m4_t v_interval_result_q0_31 = __riscv_vadd_vx_i32m4(v_mul_term, s_exp_neg_1_8_q0_31, vl); + vint32m4_t v_interval_result_q0_31 = __riscv_vadd_vv_i32m4(v_mul_term, v_const_term_vec, vl); // Reverted to non-saturating add vint32m4_t v_current_result = v_interval_result_q0_31; + const int32_t s_mult_exp_neg_1_4 = 1672461947; + const int32_t s_mult_exp_neg_1_2 = 1302514674; + const int32_t s_mult_exp_neg_1 = 790015084; + const int32_t s_mult_exp_neg_2 = 
290630308; + const int32_t s_mult_exp_neg_4 = 39332535; + const int32_t s_mult_exp_neg_8 = 720401; + const int32_t s_mult_exp_neg_16 = 242; + #define APPLY_BARREL_SHIFT(exponent, multiplier_q0_31) \ do { \ if (kInputIntegerBits > exponent) { \ const int shift_amount = kInputFractionalBits + exponent; \ if (shift_amount >= 0 && shift_amount < 32) { \ - int32_t bit_mask = INT32_C(1) << shift_amount; \ - vbool8_t v_apply_mask = __riscv_vmsne_vx_i32m4_b8( \ - __riscv_vand_vx_i32m4(v_remainder_q5_26, bit_mask, vl), \ - 0, \ - vl); \ + int32_t bit_mask_val = INT32_C(1) << shift_amount; \ + vint32m4_t v_rem_masked = __riscv_vand_vx_i32m4(v_remainder_q5_26, bit_mask_val, vl); \ + vbool8_t v_apply_mask = __riscv_vmsne_vx_i32m4_b8(v_rem_masked, 0, vl); \ vint32m4_t v_multiplied = SRDMH_vx_i32m4(v_current_result, multiplier_q0_31, vl); \ v_current_result = __riscv_vmerge_vvm_i32m4(v_current_result, v_multiplied, v_apply_mask, vl); \ } \ @@ -246,14 +245,12 @@ vint32m4_t vectorized_exp_on_negative_values(vint32m4_t v_a_q5_26, size_t vl) #undef APPLY_BARREL_SHIFT vint32m4_t v_final_result = v_current_result; - vbool8_t v_zero_mask = __riscv_vmseq_vx_i32m4_b8(v_a_q5_26, 0, vl); v_final_result = __riscv_vmerge_vxm_i32m4(v_final_result, s_result_one_q0_31, v_zero_mask, vl); return v_final_result; } - template void SoftmaxInt8RVV(const tflite::SoftmaxParams& params, const tflite::RuntimeShape& input_shape, @@ -279,25 +276,25 @@ void SoftmaxInt8RVV(const tflite::SoftmaxParams& params, OutputT* current_output_data = output_data + i * depth; int8_t max_in_row = std::numeric_limits::min(); - size_t current_c = 0; - size_t vl_max_init = __riscv_vsetvl_e8m1(1); - vint8m1_t v_max_acc_m1 = __riscv_vmv_v_x_i8m1(max_in_row, vl_max_init); - while (current_c < depth_sz) - { - size_t vl = __riscv_vsetvl_e8m1(depth_sz - current_c); - vint8m1_t v_input_m1 = __riscv_vle8_v_i8m1(current_input_data + current_c, vl); + size_t vl_temp = __riscv_vsetvl_e8m1(1); + vint8m1_t v_max_acc_m1 = 
__riscv_vmv_v_x_i8m1(max_in_row, vl_temp); + const int8_t* Ptr_max = current_input_data; + for (ptrdiff_t n = depth_sz; n > 0; ) { + size_t vl = __riscv_vsetvl_e8m1(n); + vint8m1_t v_input_m1 = __riscv_vle8_v_i8m1(Ptr_max, vl); v_max_acc_m1 = __riscv_vredmax_vs_i8m1_i8m1(v_input_m1, v_max_acc_m1, vl); - current_c += vl; + Ptr_max += vl; + n -= vl; } max_in_row = __riscv_vmv_x_s_i8m1_i8(v_max_acc_m1); const int32_t max_in_row_s32 = static_cast(max_in_row); - size_t vl_sum_init = __riscv_vsetvl_e32m1(1); - vint32m1_t v_sum_acc_m1 = __riscv_vmv_v_x_i32m1(0, vl_sum_init); - current_c = 0; + vl_temp = __riscv_vsetvl_e32m1(1); + vint32m1_t v_sum_acc_m1 = __riscv_vmv_v_x_i32m1(0, vl_temp); + size_t current_c = 0; while (current_c < depth_sz) { - size_t vl = __riscv_vsetvl_e8m1(depth_sz - current_c); + size_t vl = __riscv_vsetvl_e32m4(depth_sz - current_c); vint8m1_t v_input_s8 = __riscv_vle8_v_i8m1(current_input_data + current_c, vl); vint16m2_t v_input_s16 = __riscv_vsext_vf2_i16m2(v_input_s8, vl); @@ -313,7 +310,7 @@ void SoftmaxInt8RVV(const tflite::SoftmaxParams& params, vint32m4_t v_exp_val_q0_31 = vectorized_exp_on_negative_values(v_diff_rescaled_q5_26, vl); const int rescale_shift_exp_to_accum = kExpOutputFractionalBits - kAccumulationFractionalBits; - vint32m4_t v_exp_term_q12_19 = SRMPOT_vx_i32m4(v_exp_val_q0_31, rescale_shift_exp_to_accum, vl); + vint32m4_t v_exp_term_q12_19 = SRMPOT_vx_i32m4(v_exp_val_q0_31, -rescale_shift_exp_to_accum, vl); vint32m4_t v_zero_q12_19 = __riscv_vmv_v_x_i32m4(0, vl); vint32m4_t v_exp_term_masked_q12_19 = __riscv_vmerge_vvm_i32m4(v_zero_q12_19, v_exp_term_q12_19, v_diff_mask, vl); @@ -331,37 +328,28 @@ void SoftmaxInt8RVV(const tflite::SoftmaxParams& params, const int output_bits = sizeof(OutputT) * 8; const int exponent = num_bits_over_unit + 31 - output_bits; - const OutputT output_min = std::numeric_limits::min(); - const OutputT output_max = std::numeric_limits::max(); - const int32_t output_min_s32 = static_cast(output_min); - 
const int32_t output_max_s32 = static_cast(output_max); + const OutputT output_min_val = std::numeric_limits::min(); + const OutputT output_max_val = std::numeric_limits::max(); + const int32_t output_min_s32 = static_cast(output_min_val); + const int32_t output_max_s32 = static_cast(output_max_val); current_c = 0; while (current_c < depth_sz) { - size_t vl; - if constexpr (sizeof(OutputT) == 1) { - vl = __riscv_vsetvl_e8m1(depth_sz - current_c); - } else { - vl = __riscv_vsetvl_e16m2(depth_sz - current_c); - } + size_t vl = __riscv_vsetvl_e32m4(depth_sz - current_c); vint8m1_t v_input_s8 = __riscv_vle8_v_i8m1(current_input_data + current_c, vl); vint16m2_t v_input_s16 = __riscv_vsext_vf2_i16m2(v_input_s8, vl); vint32m4_t v_input_s32 = __riscv_vwadd_vx_i32m4(v_input_s16, 0, vl); vint32m4_t v_diff_s32 = __riscv_vsub_vx_i32m4(v_input_s32, max_in_row_s32, vl); - vbool8_t v_diff_mask = __riscv_vmsge_vx_i32m4_b8(v_diff_s32, diff_min, vl); - vint32m4_t v_diff_rescaled_q5_26 = MultiplyByQuantizedMultiplierGreaterThanOne_vx_i32m4( v_diff_s32, input_beta_multiplier, input_beta_left_shift, vl); - vint32m4_t v_exp_in_q0_31 = vectorized_exp_on_negative_values(v_diff_rescaled_q5_26, vl); vint32m4_t v_product_raw_q0_31 = SRDMH_vx_i32m4(v_exp_in_q0_31, s_shifted_scale_raw_q0_31, vl); - vint32m4_t v_unsat_output = SRMPOT_vx_i32m4(v_product_raw_q0_31, exponent, vl); - + vint32m4_t v_unsat_output = SRMPOT_vx_i32m4(v_product_raw_q0_31, -exponent, vl); vint32m4_t v_shifted_output = __riscv_vadd_vx_i32m4(v_unsat_output, output_min_s32, vl); @@ -388,14 +376,4 @@ void SoftmaxInt8RVV(const tflite::SoftmaxParams& params, } } -template void SoftmaxInt8RVV(const tflite::SoftmaxParams& params, - const tflite::RuntimeShape& input_shape, - const int8_t* input_data, - const tflite::RuntimeShape& output_shape, - int8_t* output_data); - -template void SoftmaxInt8RVV(const tflite::SoftmaxParams& params, - const tflite::RuntimeShape& input_shape, - const int8_t* input_data, - const 
tflite::RuntimeShape& output_shape, - int16_t* output_data); \ No newline at end of file +#endif \ No newline at end of file From ed5d6aa0ddaa10147ee80a85171768b081a56860 Mon Sep 17 00:00:00 2001 From: JaimeHW Date: Thu, 24 Apr 2025 18:45:47 -0500 Subject: [PATCH 40/86] Add comments to vectorized Softmax --- .../micro/kernels/riscv_vector/softmax_rvv.h | 187 +++++++++++++++--- 1 file changed, 157 insertions(+), 30 deletions(-) diff --git a/tensorflow/lite/micro/kernels/riscv_vector/softmax_rvv.h b/tensorflow/lite/micro/kernels/riscv_vector/softmax_rvv.h index 4c00f98447a..cad20c63e19 100644 --- a/tensorflow/lite/micro/kernels/riscv_vector/softmax_rvv.h +++ b/tensorflow/lite/micro/kernels/riscv_vector/softmax_rvv.h @@ -12,206 +12,274 @@ #include "tensorflow/lite/micro/kernels/softmax.h" #include "tensorflow/lite/micro/micro_log.h" +// Vectorized Saturating Rounding Doubling High Multiply (Vector-Vector) inline vint32m4_t SRDMH_vv_i32m4(vint32m4_t v_a, vint32m4_t v_b, size_t vl) { + // Define constants for saturation and rounding nudge const int32_t s_int32_min = INT32_MIN; const int32_t s_int32_max = INT32_MAX; - const int32_t s_nudge_pos = (INT32_C(1) << 30); - const int32_t s_nudge_neg = 1 - (INT32_C(1) << 30); + const int32_t s_nudge_pos = (INT32_C(1) << 30); // 05 represented as Q31 + const int32_t s_nudge_neg = 1 - (INT32_C(1) << 30); // -05 represented as Q31 + // Check for overflow condition (INT32_MIN * INT32_MIN) vbool8_t v_min_mask_a = __riscv_vmseq_vx_i32m4_b8(v_a, s_int32_min, vl); vbool8_t v_min_mask_b = __riscv_vmseq_vx_i32m4_b8(v_b, s_int32_min, vl); vbool8_t v_overflow_mask = __riscv_vmand_mm_b8(v_min_mask_a, v_min_mask_b, vl); + // Calculate the 64-bit product using 32-bit low and high parts vint32m4_t v_prod_lo = __riscv_vmul_vv_i32m4(v_a, v_b, vl); vint32m4_t v_prod_hi = __riscv_vmulh_vv_i32m4(v_a, v_b, vl); vuint32m4_t v_prod_lo_u = __riscv_vreinterpret_v_i32m4_u32m4(v_prod_lo); + // Determine the sign of the product to apply the correct rounding 
nudge vint32m4_t v_xor_signs = __riscv_vxor_vv_i32m4(v_a, v_b, vl); vbool8_t v_prod_sign_pos_mask = __riscv_vmsge_vx_i32m4_b8(v_xor_signs, 0, vl); + // Select the appropriate nudge value (positive or negative 05 in Q31) vint32m4_t v_nudge_lo = __riscv_vmv_v_x_i32m4(s_nudge_neg, vl); v_nudge_lo = __riscv_vmerge_vxm_i32m4(v_nudge_lo, s_nudge_pos, v_prod_sign_pos_mask, vl); vuint32m4_t v_nudge_lo_u = __riscv_vreinterpret_v_i32m4_u32m4(v_nudge_lo); + // Prepare the high part of the nudge (-1 for negative, 0 for positive) vint32m4_t v_nudge_hi = __riscv_vmv_v_x_i32m4(-1, vl); v_nudge_hi = __riscv_vmerge_vxm_i32m4(v_nudge_hi, 0, v_prod_sign_pos_mask, vl); + // Add the 64-bit nudge value to the 64-bit product using 32-bit vector ops with carry vuint32m4_t v_sum_lo_u = __riscv_vadd_vv_u32m4(v_prod_lo_u, v_nudge_lo_u, vl); vbool8_t v_carry_mask = __riscv_vmsltu_vv_u32m4_b8(v_sum_lo_u, v_prod_lo_u, vl); - vint32m4_t v_sum_hi = __riscv_vadd_vv_i32m4(v_prod_hi, v_nudge_hi, vl); v_sum_hi = __riscv_vadd_vx_i32m4_m(v_carry_mask, v_sum_hi, 1, vl); + // Perform the effective right shift by 31 (doubling high multiply) vuint32m4_t v_sum_lo_shifted_u = __riscv_vsrl_vx_u32m4(v_sum_lo_u, 31, vl); vint32m4_t v_sum_hi_shifted = __riscv_vsll_vx_i32m4(v_sum_hi, 1, vl); vint32m4_t v_result_before_sat = __riscv_vor_vv_i32m4(v_sum_hi_shifted, __riscv_vreinterpret_v_u32m4_i32m4(v_sum_lo_shifted_u), vl); + // Apply saturation for the overflow case (INT32_MIN * INT32_MIN) vint32m4_t v_result = __riscv_vmerge_vxm_i32m4(v_result_before_sat, s_int32_max, v_overflow_mask, vl); + // Return the final saturated and rounded result return v_result; } +// Vectorized Saturating Rounding Doubling High Multiply (Vector-Scalar) inline vint32m4_t SRDMH_vx_i32m4(vint32m4_t v_a, int32_t s_b, size_t vl) { + // Define constants for saturation and rounding nudge const int32_t s_int32_min = INT32_MIN; const int32_t s_int32_max = INT32_MAX; - const int32_t s_nudge_pos = (INT32_C(1) << 30); - const int32_t s_nudge_neg 
= 1 - (INT32_C(1) << 30); + const int32_t s_nudge_pos = (INT32_C(1) << 30); // 05 represented as Q31 + const int32_t s_nudge_neg = 1 - (INT32_C(1) << 30); // -05 represented as Q31 + // Check for overflow condition (vector elements == INT32_MIN and s_b == INT32_MIN) vbool8_t v_overflow_mask; if (s_b == s_int32_min) { v_overflow_mask = __riscv_vmseq_vx_i32m4_b8(v_a, s_int32_min, vl); - } else + } + else { + // Create a mask that is all false if scalar is not INT32_MIN vint32m4_t v_zero = __riscv_vmv_v_x_i32m4(0, vl); - v_overflow_mask = __riscv_vmslt_vv_i32m4_b8(v_zero, v_zero, vl); + v_overflow_mask = __riscv_vmslt_vv_i32m4_b8(v_zero, v_zero, vl); // Always false } + // Calculate the 64-bit product using 32-bit low and high parts vint32m4_t v_prod_lo = __riscv_vmul_vx_i32m4(v_a, s_b, vl); vint32m4_t v_prod_hi = __riscv_vmulh_vx_i32m4(v_a, s_b, vl); vuint32m4_t v_prod_lo_u = __riscv_vreinterpret_v_i32m4_u32m4(v_prod_lo); + // Determine the sign of the product to apply the correct rounding nudge vint32m4_t v_xor_signs = __riscv_vxor_vx_i32m4(v_a, s_b, vl); vbool8_t v_prod_sign_pos_mask = __riscv_vmsge_vx_i32m4_b8(v_xor_signs, 0, vl); + // Select the appropriate nudge value (positive or negative 05 in Q31) vint32m4_t v_nudge_lo = __riscv_vmv_v_x_i32m4(s_nudge_neg, vl); v_nudge_lo = __riscv_vmerge_vxm_i32m4(v_nudge_lo, s_nudge_pos, v_prod_sign_pos_mask, vl); vuint32m4_t v_nudge_lo_u = __riscv_vreinterpret_v_i32m4_u32m4(v_nudge_lo); + // Prepare the high part of the nudge (-1 for negative, 0 for positive) vint32m4_t v_nudge_hi = __riscv_vmv_v_x_i32m4(-1, vl); v_nudge_hi = __riscv_vmerge_vxm_i32m4(v_nudge_hi, 0, v_prod_sign_pos_mask, vl); + // Add the 64-bit nudge value to the 64-bit product using 32-bit vector ops with carry vuint32m4_t v_sum_lo_u = __riscv_vadd_vv_u32m4(v_prod_lo_u, v_nudge_lo_u, vl); vbool8_t v_carry_mask = __riscv_vmsltu_vv_u32m4_b8(v_sum_lo_u, v_prod_lo_u, vl); - vint32m4_t v_sum_hi = __riscv_vadd_vv_i32m4(v_prod_hi, v_nudge_hi, vl); v_sum_hi = 
__riscv_vadd_vx_i32m4_m(v_carry_mask, v_sum_hi, 1, vl); + // Perform the effective right shift by 31 (doubling high multiply) vuint32m4_t v_sum_lo_shifted_u = __riscv_vsrl_vx_u32m4(v_sum_lo_u, 31, vl); vint32m4_t v_sum_hi_shifted = __riscv_vsll_vx_i32m4(v_sum_hi, 1, vl); vint32m4_t v_result_before_sat = __riscv_vor_vv_i32m4(v_sum_hi_shifted, __riscv_vreinterpret_v_u32m4_i32m4(v_sum_lo_shifted_u), vl); + // Apply saturation for the overflow case vint32m4_t v_result = __riscv_vmerge_vxm_i32m4(v_result_before_sat, s_int32_max, v_overflow_mask, vl); + // Return the final saturated and rounded result return v_result; } +// Vectorized Saturating Rounded Multiply by Power of Two (Vector-Scalar) inline vint32m4_t SRMPOT_vx_i32m4(vint32m4_t v_vec, int shift, size_t vl) { - if (shift > 0) { - + // Handle left shifts (shift > 0) + if (shift > 0) + { + // Cast shift amount to signed 32-bit integer const int32_t s_shift = shift; + // If shift is zero, return the original vector if (s_shift == 0) return v_vec; + // Define saturation limits const int32_t s_max_val = INT32_MAX; const int32_t s_min_val = INT32_MIN; - if (s_shift >= 31) { + // Handle large left shifts (>= 31), resulting in saturation or zero + if (s_shift >= 31) + { vint32m4_t v_zero = __riscv_vmv_v_x_i32m4(0, vl); vbool8_t v_pos_mask = __riscv_vmsgt_vx_i32m4_b8(v_vec, 0, vl); vbool8_t v_neg_mask = __riscv_vmslt_vx_i32m4_b8(v_vec, 0, vl); + // Saturate positive values to max, negative values to min vint32m4_t v_saturated = __riscv_vmerge_vxm_i32m4(v_zero, s_max_val, v_pos_mask, vl); v_saturated = __riscv_vmerge_vxm_i32m4(v_saturated, s_min_val, v_neg_mask, vl); return v_saturated; - } else { + } + else + { + // Calculate saturation thresholds for smaller left shifts const int32_t scalar_type_bits = 32; const int64_t pos_threshold_64 = (INT64_C(1) << (scalar_type_bits - 1 - s_shift)) - 1; const int64_t neg_threshold_64 = -(INT64_C(1) << (scalar_type_bits - 1 - s_shift)); + // Clamp thresholds to int32 range const 
int32_t pos_threshold = (pos_threshold_64 > INT32_MAX) ? INT32_MAX : (int32_t)pos_threshold_64; const int32_t neg_threshold = (neg_threshold_64 < INT32_MIN) ? INT32_MIN : (int32_t)neg_threshold_64; + // Create masks for elements exceeding positive or negative thresholds vbool8_t v_pos_ovfl_mask = __riscv_vmsgt_vx_i32m4_b8(v_vec, pos_threshold, vl); vbool8_t v_neg_ovfl_mask = __riscv_vmslt_vx_i32m4_b8(v_vec, neg_threshold, vl); + // Perform the left shift vint32m4_t v_shifted = __riscv_vsll_vx_i32m4(v_vec, s_shift, vl); + // Merge shifted results with saturation values based on overflow masks vint32m4_t v_result = __riscv_vmerge_vxm_i32m4(v_shifted, s_max_val, v_pos_ovfl_mask, vl); v_result = __riscv_vmerge_vxm_i32m4(v_result, s_min_val, v_neg_ovfl_mask, vl); return v_result; } - - } else if (shift == 0) { + } + else if (shift == 0) + { // Handle no shift (shift == 0) return v_vec; - } else { - + } + else // Handle right shifts (shift < 0) + { + // Calculate the positive exponent for right shift, capped at 31 int exponent = -shift; exponent = std::min(31, exponent); + // If exponent is zero, return the original vector if (exponent == 0) return v_vec; + // Calculate the mask for extracting the remainder bits const int32_t s_mask_val = (INT32_C(1) << exponent) - 1; const int32_t s_zero = 0; const int32_t s_one = 1; + // Extract the remainder and perform the arithmetic right shift vint32m4_t v_remainder = __riscv_vand_vx_i32m4(v_vec, s_mask_val, vl); vint32m4_t v_shifted = __riscv_vsra_vx_i32m4(v_vec, exponent, vl); + // Calculate the rounding threshold (half the divisor) const int32_t s_threshold_base = s_mask_val >> 1; vint32m4_t v_threshold = __riscv_vmv_v_x_i32m4(s_threshold_base, vl); + // Adjust threshold for negative numbers (round away from zero) vbool8_t v_is_neg_mask = __riscv_vmslt_vx_i32m4_b8(v_vec, s_zero, vl); v_threshold = __riscv_vadd_vx_i32m4_m(v_is_neg_mask, v_threshold, s_one, vl); + // Create mask for elements where remainder > threshold, requiring 
rounding up vbool8_t v_add1_mask = __riscv_vmsgt_vv_i32m4_b8(v_remainder, v_threshold, vl); + // Add 1 to the shifted result for elements needing rounding up vint32m4_t v_result = __riscv_vadd_vx_i32m4_m(v_add1_mask, v_shifted, s_one, vl); + // Return the rounded right-shifted result return v_result; } } - +// Vectorized MultiplyByQuantizedMultiplier for multipliers > 1 (Vector-Scalar) inline vint32m4_t MultiplyByQuantizedMultiplierGreaterThanOne_vx_i32m4( - vint32m4_t v_x, int32_t quantized_multiplier, int left_shift, size_t vl) { + vint32m4_t v_x, int32_t quantized_multiplier, int left_shift, size_t vl) +{ + // Perform saturating left shift using SRMPOT vint32m4_t v_shifted_x = SRMPOT_vx_i32m4(v_x, left_shift, vl); + + // Perform saturating rounding doubling high multiply using SRDMH return SRDMH_vx_i32m4(v_shifted_x, quantized_multiplier, vl); } +// Vectorized exp function for quantized negative inputs vint32m4_t vectorized_exp_on_negative_values(vint32m4_t v_a_q5_26, size_t vl) { - const int kInputIntegerBits = 5; + // Define quantization parameters and constants + const int kInputIntegerBits = 5; // Q526 input format const int kInputFractionalBits = 32 - 1 - kInputIntegerBits; - const int kOutputFractionalBits = 31; + const int kOutputFractionalBits = 31; // Q031 output format + // Constants for input range reduction (modulo 1/4) const int32_t s_kOneQuarter_q5_26 = INT32_C(1) << (kInputFractionalBits - 2); const int32_t s_mask_val = s_kOneQuarter_q5_26 - 1; - const int32_t s_result_one_q0_31 = INT32_MAX; - const int32_t s_exp_neg_1_8_q0_31 = 1895147668; - const int32_t s_one_third_q0_31 = 715827883; - const int32_t s_one_eighth_q0_31 = INT32_C(1) << (kOutputFractionalBits - 3); - + // Constants for Taylor series approximation and final result assembly + const int32_t s_result_one_q0_31 = INT32_MAX; // 10 in Q031 + const int32_t s_exp_neg_1_8_q0_31 = 1895147668; // exp(-1/8) in Q031 + const int32_t s_one_third_q0_31 = 715827883; // 1/3 in Q031 + const int32_t 
s_one_eighth_q0_31 = INT32_C(1) << (kOutputFractionalBits - 3); // 1/8 in Q031 + // Reduce input `a` to the range [-1/4, 0] by finding `a mod (-1/4)` vint32m4_t v_a_masked = __riscv_vand_vx_i32m4(v_a_q5_26, s_mask_val, vl); vint32m4_t v_a_mod_q_m_q_q5_26 = __riscv_vsub_vx_i32m4(v_a_masked, s_kOneQuarter_q5_26, vl); + // Store the multiple of -1/4 that was subtracted vint32m4_t v_remainder_q5_26 = __riscv_vsub_vv_i32m4(v_a_q5_26, v_a_mod_q_m_q_q5_26, vl); + // Rescale the reduced input from Q526 to Q031 for Taylor series input const int rescale_shift = kInputIntegerBits - 0; vint32m4_t v_a_input_taylor_q0_31 = SRMPOT_vx_i32m4(v_a_mod_q_m_q_q5_26, -rescale_shift, vl); + // Calculate Taylor series approximation for exp(x) around x = -1/8 Let y = x + 1/8 vint32m4_t v_y = __riscv_vadd_vx_i32m4(v_a_input_taylor_q0_31, s_one_eighth_q0_31, vl); + // Calculate powers of y needed for the Taylor expansion (y^2, y^3, y^4) vint32m4_t v_y2 = SRDMH_vv_i32m4(v_y, v_y, vl); vint32m4_t v_y3 = SRDMH_vv_i32m4(v_y2, v_y, vl); vint32m4_t v_y4 = SRDMH_vv_i32m4(v_y2, v_y2, vl); + // Calculate term y^4 / 4 vint32m4_t v_y4_over_4 = SRMPOT_vx_i32m4(v_y4, -2, vl); + // Combine Taylor series terms: exp(-1/8) * (1 + y + y^2/2! + y^3/3! + y^4/4! 
+ ) + // Approximation used: exp(-1/8) * (1 + y + (y^2 + (y^3 + y^4/4) / 3) / 2) vint32m4_t v_term1 = __riscv_vadd_vv_i32m4(v_y4_over_4, v_y3, vl); vint32m4_t v_term2 = SRDMH_vx_i32m4(v_term1, s_one_third_q0_31, vl); vint32m4_t v_term3 = __riscv_vadd_vv_i32m4(v_term2, v_y2, vl); - vint32m4_t v_sum_of_higher_terms = SRMPOT_vx_i32m4(v_term3, -1, vl); + vint32m4_t v_sum_of_higher_terms = SRMPOT_vx_i32m4(v_term3, -1, vl); // Division by 2 + // Calculate the term inside the main bracket (y + higher terms) vint32m4_t v_bracket_term = __riscv_vadd_vv_i32m4(v_y, v_sum_of_higher_terms, vl); + // Multiply bracket term by precomputed exp(-1/8) vint32m4_t v_const_term_vec = __riscv_vmv_v_x_i32m4(s_exp_neg_1_8_q0_31, vl); vint32m4_t v_mul_term = SRDMH_vv_i32m4(v_bracket_term, v_const_term_vec, vl); - vint32m4_t v_interval_result_q0_31 = __riscv_vadd_vv_i32m4(v_mul_term, v_const_term_vec, vl); // Reverted to non-saturating add + // Add the constant term exp(-1/8) to complete the Taylor approximation for the interval [-1/4, 0] + vint32m4_t v_interval_result_q0_31 = __riscv_vadd_vv_i32m4(v_mul_term, v_const_term_vec, vl); + // Start reconstructing the full result using the remainder and precomputed exp factors vint32m4_t v_current_result = v_interval_result_q0_31; + // Define precomputed multipliers exp(-1/4), exp(-1/2), exp(-1), etc in Q031 const int32_t s_mult_exp_neg_1_4 = 1672461947; const int32_t s_mult_exp_neg_1_2 = 1302514674; const int32_t s_mult_exp_neg_1 = 790015084; @@ -220,20 +288,29 @@ vint32m4_t vectorized_exp_on_negative_values(vint32m4_t v_a_q5_26, size_t vl) const int32_t s_mult_exp_neg_8 = 720401; const int32_t s_mult_exp_neg_16 = 242; + // Macro to apply barrel shifter logic: multiply by exp(-2^k) if corresponding bit in remainder is set #define APPLY_BARREL_SHIFT(exponent, multiplier_q0_31) \ do { \ + /* Check if the exponent is within the representable input integer bits */ \ if (kInputIntegerBits > exponent) { \ + /* Calculate the bit position corresponding 
to this exponent in the Q526 remainder */ \ const int shift_amount = kInputFractionalBits + exponent; \ + /* Ensure the bit position is valid */ \ if (shift_amount >= 0 && shift_amount < 32) { \ + /* Create a mask for the specific bit */ \ int32_t bit_mask_val = INT32_C(1) << shift_amount; \ + /* Check if the bit is set in the remainder vector */ \ vint32m4_t v_rem_masked = __riscv_vand_vx_i32m4(v_remainder_q5_26, bit_mask_val, vl); \ vbool8_t v_apply_mask = __riscv_vmsne_vx_i32m4_b8(v_rem_masked, 0, vl); \ + /* Multiply the current result by the precomputed factor */ \ vint32m4_t v_multiplied = SRDMH_vx_i32m4(v_current_result, multiplier_q0_31, vl); \ + /* Merge the multiplied result where the mask is true */ \ v_current_result = __riscv_vmerge_vvm_i32m4(v_current_result, v_multiplied, v_apply_mask, vl); \ } \ } \ } while(0) + // Apply the barrel shifter for each power of 2 from exp(-1/4) up to exp(-16) APPLY_BARREL_SHIFT(-2, s_mult_exp_neg_1_4); APPLY_BARREL_SHIFT(-1, s_mult_exp_neg_1_2); APPLY_BARREL_SHIFT( 0, s_mult_exp_neg_1); @@ -242,15 +319,19 @@ vint32m4_t vectorized_exp_on_negative_values(vint32m4_t v_a_q5_26, size_t vl) APPLY_BARREL_SHIFT( 3, s_mult_exp_neg_8); APPLY_BARREL_SHIFT( 4, s_mult_exp_neg_16); + // Undefine the helper macro #undef APPLY_BARREL_SHIFT + // Handle the special case where input a = 0, the result should be 10 (INT32_MAX in Q031) vint32m4_t v_final_result = v_current_result; vbool8_t v_zero_mask = __riscv_vmseq_vx_i32m4_b8(v_a_q5_26, 0, vl); v_final_result = __riscv_vmerge_vxm_i32m4(v_final_result, s_result_one_q0_31, v_zero_mask, vl); + // Return the final computed exp result in Q031 format return v_final_result; } +// Vectorized Softmax implementation for INT8 input and INT8/INT16 output template void SoftmaxInt8RVV(const tflite::SoftmaxParams& params, const tflite::RuntimeShape& input_shape, @@ -258,86 +339,119 @@ void SoftmaxInt8RVV(const tflite::SoftmaxParams& params, const tflite::RuntimeShape& output_shape, OutputT* output_data) 
{ + // Extract Softmax quantization parameters const int32_t input_beta_multiplier = params.input_multiplier; const int32_t input_beta_left_shift = params.input_left_shift; const int diff_min = params.diff_min; + + // Define fixed-point format for intermediate sum accumulation (Q1219) static const int kAccumulationIntegerBits = 12; static const int kAccumulationFractionalBits = 32 - 1 - kAccumulationIntegerBits; + + // Define fixed-point format for exp function output (Q031) static const int kExpOutputFractionalBits = 31; + // Get input/output shape dimensions const int trailing_dim = input_shape.DimensionsCount() - 1; const int outer_size = tflite::MatchingFlatSizeSkipDim(input_shape, trailing_dim, output_shape); const int depth = tflite::MatchingDim(input_shape, trailing_dim, output_shape, trailing_dim); const size_t depth_sz = static_cast(depth); + // Loop over each row (batch or outer dimension) for (int i = 0; i < outer_size; ++i) { + // Set input and output pointers for the current row const int8_t* current_input_data = input_data + i * depth; OutputT* current_output_data = output_data + i * depth; + // Initialize scalar and vector max accumulators int8_t max_in_row = std::numeric_limits::min(); - size_t vl_temp = __riscv_vsetvl_e8m1(1); + size_t vl_temp = __riscv_vsetvl_e8m1(1); // Set VL=1 for scalar init vint8m1_t v_max_acc_m1 = __riscv_vmv_v_x_i8m1(max_in_row, vl_temp); + + // Process the row in vector chunks to find the maximum value const int8_t* Ptr_max = current_input_data; for (ptrdiff_t n = depth_sz; n > 0; ) { size_t vl = __riscv_vsetvl_e8m1(n); vint8m1_t v_input_m1 = __riscv_vle8_v_i8m1(Ptr_max, vl); + // Perform reduction max across the loaded vector and the accumulator v_max_acc_m1 = __riscv_vredmax_vs_i8m1_i8m1(v_input_m1, v_max_acc_m1, vl); Ptr_max += vl; n -= vl; } + + // Extract the final scalar maximum value from the vector accumulator max_in_row = __riscv_vmv_x_s_i8m1_i8(v_max_acc_m1); const int32_t max_in_row_s32 = 
static_cast(max_in_row); - vl_temp = __riscv_vsetvl_e32m1(1); + // Initialize vector sum accumulator (using m1 for reduction target) + vl_temp = __riscv_vsetvl_e32m1(1); // Set VL=1 for scalar init vint32m1_t v_sum_acc_m1 = __riscv_vmv_v_x_i32m1(0, vl_temp); + + // Process the row in vector chunks size_t current_c = 0; while (current_c < depth_sz) { size_t vl = __riscv_vsetvl_e32m4(depth_sz - current_c); + // Load input chunk (i8), widen to i16, then widen to i32 vint8m1_t v_input_s8 = __riscv_vle8_v_i8m1(current_input_data + current_c, vl); vint16m2_t v_input_s16 = __riscv_vsext_vf2_i16m2(v_input_s8, vl); vint32m4_t v_input_s32 = __riscv_vwadd_vx_i32m4(v_input_s16, 0, vl); + // Calculate difference from max (input - max_in_row) vint32m4_t v_diff_s32 = __riscv_vsub_vx_i32m4(v_input_s32, max_in_row_s32, vl); + // Create mask for elements where difference >= diff_min vbool8_t v_diff_mask = __riscv_vmsge_vx_i32m4_b8(v_diff_s32, diff_min, vl); + // Rescale difference for exp input (Q526 format) vint32m4_t v_diff_rescaled_q5_26 = MultiplyByQuantizedMultiplierGreaterThanOne_vx_i32m4( v_diff_s32, input_beta_multiplier, input_beta_left_shift, vl); + // Calculate exponent using the vectorized exp function (output in Q031) vint32m4_t v_exp_val_q0_31 = vectorized_exp_on_negative_values(v_diff_rescaled_q5_26, vl); + // Rescale exponent result from Q031 to accumulation format (Q1219) const int rescale_shift_exp_to_accum = kExpOutputFractionalBits - kAccumulationFractionalBits; vint32m4_t v_exp_term_q12_19 = SRMPOT_vx_i32m4(v_exp_val_q0_31, -rescale_shift_exp_to_accum, vl); + // Zero out exponent terms where difference was below diff_min threshold vint32m4_t v_zero_q12_19 = __riscv_vmv_v_x_i32m4(0, vl); vint32m4_t v_exp_term_masked_q12_19 = __riscv_vmerge_vvm_i32m4(v_zero_q12_19, v_exp_term_q12_19, v_diff_mask, vl); + // Accumulate the sum using vector reduction sum v_sum_acc_m1 = __riscv_vredsum_vs_i32m4_i32m1(v_exp_term_masked_q12_19, v_sum_acc_m1, vl); + // Advance pointer 
for the next chunk current_c += vl; } + + // Extract the final scalar sum from the vector accumulator int32_t sum_of_exps_raw = __riscv_vmv_x_s_i32m1_i32(v_sum_acc_m1); - int num_bits_over_unit; + // Calculate the reciprocal of the sum (Q031 format) + int num_bits_over_unit; // Headroom bits for reciprocal calculation int32_t reciprocal_raw_q0_31 = tflite::GetReciprocal(sum_of_exps_raw, kAccumulationIntegerBits, &num_bits_over_unit); const int32_t s_shifted_scale_raw_q0_31 = reciprocal_raw_q0_31; + // Calculate the final right shift amount needed to scale to the output format const int output_bits = sizeof(OutputT) * 8; const int exponent = num_bits_over_unit + 31 - output_bits; + // Get output type min/max values for saturation const OutputT output_min_val = std::numeric_limits::min(); const OutputT output_max_val = std::numeric_limits::max(); const int32_t output_min_s32 = static_cast(output_min_val); const int32_t output_max_s32 = static_cast(output_max_val); + // Process the row again in vector chunks to calculate and store output current_c = 0; while (current_c < depth_sz) { size_t vl = __riscv_vsetvl_e32m4(depth_sz - current_c); + // Reload input chunk and recalculate exp (or store/reload from previous step if memory allows) vint8m1_t v_input_s8 = __riscv_vle8_v_i8m1(current_input_data + current_c, vl); vint16m2_t v_input_s16 = __riscv_vsext_vf2_i16m2(v_input_s8, vl); vint32m4_t v_input_s32 = __riscv_vwadd_vx_i32m4(v_input_s16, 0, vl); @@ -347,30 +461,43 @@ void SoftmaxInt8RVV(const tflite::SoftmaxParams& params, v_diff_s32, input_beta_multiplier, input_beta_left_shift, vl); vint32m4_t v_exp_in_q0_31 = vectorized_exp_on_negative_values(v_diff_rescaled_q5_26, vl); + // Multiply the exponent by the reciprocal scale (SRDMH gives Q031 result) vint32m4_t v_product_raw_q0_31 = SRDMH_vx_i32m4(v_exp_in_q0_31, s_shifted_scale_raw_q0_31, vl); + // Apply the final right shift to scale to the output range vint32m4_t v_unsat_output = SRMPOT_vx_i32m4(v_product_raw_q0_31, 
-exponent, vl); + // Add the output offset (min value of OutputT) Note: TFLM reference uses this instead of output_offset param vint32m4_t v_shifted_output = __riscv_vadd_vx_i32m4(v_unsat_output, output_min_s32, vl); + // Clamp the result to the output type's min/max range vint32m4_t v_clamped_output = __riscv_vmax_vx_i32m4(v_shifted_output, output_min_s32, vl); v_clamped_output = __riscv_vmin_vx_i32m4(v_clamped_output, output_max_s32, vl); + // Merge the clamped output with the minimum output value where the difference was below diff_min vint32m4_t v_output_min_vec = __riscv_vmv_v_x_i32m4(output_min_s32, vl); vint32m4_t v_final_s32 = __riscv_vmerge_vvm_i32m4(v_output_min_vec, v_clamped_output, v_diff_mask, vl); + // Narrow the final result to the OutputT (int8 or int16) and store it if constexpr (sizeof(OutputT) == 1) { + // Narrow i32m4 -> i16m2 -> i8m1 vint16m2_t v_temp_s16 = __riscv_vncvt_x_x_w_i16m2(v_final_s32, vl); vint8m1_t v_final_output = __riscv_vncvt_x_x_w_i8m1(v_temp_s16, vl); + + // Store i8m1 result __riscv_vse8_v_i8m1(reinterpret_cast(current_output_data + current_c), v_final_output, vl); } else { + // Narrow i32m4 -> i16m2 vint16m2_t v_final_output = __riscv_vncvt_x_x_w_i16m2(v_final_s32, vl); + + // Store i16m2 result __riscv_vse16_v_i16m2(reinterpret_cast(current_output_data + current_c), v_final_output, vl); } + // Advance pointer for the next chunk current_c += vl; } } From 20bcb64efb3ac922f5dd8173cad693771d23942e Mon Sep 17 00:00:00 2001 From: JaimeHW Date: Thu, 16 Oct 2025 13:47:19 -0500 Subject: [PATCH 41/86] Take number of interations as command line argument --- .../micro/examples/person_detection/main.cc | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/tensorflow/lite/micro/examples/person_detection/main.cc b/tensorflow/lite/micro/examples/person_detection/main.cc index b53d3665eb4..4b3e54e4105 100644 --- a/tensorflow/lite/micro/examples/person_detection/main.cc +++ 
b/tensorflow/lite/micro/examples/person_detection/main.cc @@ -13,6 +13,9 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ +#include +#include + #include "tensorflow/lite/micro/examples/person_detection/main_functions.h" // This is the default main used on systems that have the standard C entry @@ -20,8 +23,20 @@ limitations under the License. // requirements for entry code (like an app_main function) should specialize // this main.cc file in a target-specific subfolder. int main(int argc, char* argv[]) { + if (argc != 2) { + fprintf(stderr, "Usage: %s \n", argv[0]); + return 1; // Indicate an error + } + + int loop_count = atoi(argv[1]); + if (loop_count <= 0) { + fprintf(stderr, "Error: Please provide a positive number of iterations.\n"); + return 1; // Indicate an error + } + setup(); - while (true) { + + for (int i = 0; i < loop_count; ++i) { loop(); } } From cd531e32ae7c48e5b1d5ef1d93118e538c128e69 Mon Sep 17 00:00:00 2001 From: JaimeHW Date: Thu, 16 Oct 2025 13:48:21 -0500 Subject: [PATCH 42/86] Set optimization level to O3 --- .../lite/micro/tools/make/targets/riscv32_vector_makefile.inc | 1 + 1 file changed, 1 insertion(+) diff --git a/tensorflow/lite/micro/tools/make/targets/riscv32_vector_makefile.inc b/tensorflow/lite/micro/tools/make/targets/riscv32_vector_makefile.inc index 8bb23b3e456..63896abdf90 100644 --- a/tensorflow/lite/micro/tools/make/targets/riscv32_vector_makefile.inc +++ b/tensorflow/lite/micro/tools/make/targets/riscv32_vector_makefile.inc @@ -20,6 +20,7 @@ export PATH := $(TARGET_TOOLCHAIN_ROOT):$(PATH) PLATFORM_FLAGS = \ -march=$(RISCV_ARCH) \ -mabi=$(RISCV_ABI) \ + -O3 \ -mcmodel=$(RISCV_CODE_MODEL) \ -mexplicit-relocs \ -fno-builtin-printf \ From 540c22f2f457855705fa2c41a623bd3132236043 Mon Sep 17 00:00:00 2001 From: JaimeHW Date: Sat, 18 Oct 2025 15:08:41 -0500 Subject: [PATCH 43/86] Add slimmed down 
version of micro_speech_test.cc for benchmarking --- .../micro_speech/micro_speech_test2.cc | 188 ++++++++++++++++++ 1 file changed, 188 insertions(+) create mode 100644 tensorflow/lite/micro/examples/micro_speech/micro_speech_test2.cc diff --git a/tensorflow/lite/micro/examples/micro_speech/micro_speech_test2.cc b/tensorflow/lite/micro/examples/micro_speech/micro_speech_test2.cc new file mode 100644 index 00000000000..1839fcef2b6 --- /dev/null +++ b/tensorflow/lite/micro/examples/micro_speech/micro_speech_test2.cc @@ -0,0 +1,188 @@ +/* Copyright 2024 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include +#include +#include + +#include "tensorflow/lite/core/c/common.h" +#include "tensorflow/lite/micro/examples/micro_speech/micro_model_settings.h" +#include "tensorflow/lite/micro/examples/micro_speech/models/audio_preprocessor_int8_model_data.h" +#include "tensorflow/lite/micro/examples/micro_speech/models/micro_speech_quantized_model_data.h" +#include "tensorflow/lite/micro/examples/micro_speech/testdata/no_1000ms_audio_data.h" +#include "tensorflow/lite/micro/examples/micro_speech/testdata/no_30ms_audio_data.h" +#include "tensorflow/lite/micro/examples/micro_speech/testdata/noise_1000ms_audio_data.h" +#include "tensorflow/lite/micro/examples/micro_speech/testdata/silence_1000ms_audio_data.h" +#include "tensorflow/lite/micro/examples/micro_speech/testdata/yes_1000ms_audio_data.h" +#include "tensorflow/lite/micro/examples/micro_speech/testdata/yes_30ms_audio_data.h" +#include "tensorflow/lite/micro/micro_interpreter.h" +#include "tensorflow/lite/micro/micro_log.h" +#include "tensorflow/lite/micro/micro_mutable_op_resolver.h" +#include "tensorflow/lite/micro/testing/micro_test.h" + +namespace { + +// Arena size is a guesstimate, followed by use of +// MicroInterpreter::arena_used_bytes() on both the AudioPreprocessor and +// MicroSpeech models and using the larger of the two results. 
+constexpr size_t kArenaSize = 28584; // xtensa p6 +alignas(16) uint8_t g_arena[kArenaSize]; + +using Features = int8_t[kFeatureCount][kFeatureSize]; +Features g_features; + +constexpr int kAudioSampleDurationCount = + kFeatureDurationMs * kAudioSampleFrequency / 1000; +constexpr int kAudioSampleStrideCount = + kFeatureStrideMs * kAudioSampleFrequency / 1000; + +using MicroSpeechOpResolver = tflite::MicroMutableOpResolver<4>; +using AudioPreprocessorOpResolver = tflite::MicroMutableOpResolver<18>; + +TfLiteStatus RegisterOps(MicroSpeechOpResolver& op_resolver) { + TF_LITE_ENSURE_STATUS(op_resolver.AddReshape()); + TF_LITE_ENSURE_STATUS(op_resolver.AddFullyConnected()); + TF_LITE_ENSURE_STATUS(op_resolver.AddDepthwiseConv2D()); + TF_LITE_ENSURE_STATUS(op_resolver.AddSoftmax()); + return kTfLiteOk; +} + +TfLiteStatus RegisterOps(AudioPreprocessorOpResolver& op_resolver) { + TF_LITE_ENSURE_STATUS(op_resolver.AddReshape()); + TF_LITE_ENSURE_STATUS(op_resolver.AddCast()); + TF_LITE_ENSURE_STATUS(op_resolver.AddStridedSlice()); + TF_LITE_ENSURE_STATUS(op_resolver.AddConcatenation()); + TF_LITE_ENSURE_STATUS(op_resolver.AddMul()); + TF_LITE_ENSURE_STATUS(op_resolver.AddAdd()); + TF_LITE_ENSURE_STATUS(op_resolver.AddDiv()); + TF_LITE_ENSURE_STATUS(op_resolver.AddMinimum()); + TF_LITE_ENSURE_STATUS(op_resolver.AddMaximum()); + TF_LITE_ENSURE_STATUS(op_resolver.AddWindow()); + TF_LITE_ENSURE_STATUS(op_resolver.AddFftAutoScale()); + TF_LITE_ENSURE_STATUS(op_resolver.AddRfft()); + TF_LITE_ENSURE_STATUS(op_resolver.AddEnergy()); + TF_LITE_ENSURE_STATUS(op_resolver.AddFilterBank()); + TF_LITE_ENSURE_STATUS(op_resolver.AddFilterBankSquareRoot()); + TF_LITE_ENSURE_STATUS(op_resolver.AddFilterBankSpectralSubtraction()); + TF_LITE_ENSURE_STATUS(op_resolver.AddPCAN()); + TF_LITE_ENSURE_STATUS(op_resolver.AddFilterBankLog()); + return kTfLiteOk; +} + +TfLiteStatus LoadMicroSpeechModelAndPerformInference( + const Features& features, const char* expected_label) { + // Map the model into 
a usable data structure. This doesn't involve any + // copying or parsing, it's a very lightweight operation. + const tflite::Model* model = + tflite::GetModel(g_micro_speech_quantized_model_data); + + MicroSpeechOpResolver op_resolver; + RegisterOps(op_resolver); + + tflite::MicroInterpreter interpreter(model, op_resolver, g_arena, kArenaSize); + interpreter.AllocateTensors(); + + TfLiteTensor* input = interpreter.input(0); + + TfLiteTensor* output = interpreter.output(0); + + float output_scale = output->params.scale; + int output_zero_point = output->params.zero_point; + + for (int i = 0; i < 4; i++) { + std::copy_n(&features[0][0], kFeatureElementCount, + tflite::GetTensorData(input)); + interpreter.Invoke(); + } + + // Dequantize output values + volatile float category_predictions[kCategoryCount]; + for (int i = 0; i < kCategoryCount; i++) { + category_predictions[i] = + (tflite::GetTensorData(output)[i] - output_zero_point) * + output_scale; + + if (category_predictions[i] > -1000.0f) { + // Dummy read to satisfy compiler + } + } + + return kTfLiteOk; +} + +TfLiteStatus GenerateSingleFeature(const int16_t* audio_data, + const int audio_data_size, + int8_t* feature_output, + tflite::MicroInterpreter* interpreter) { + TfLiteTensor* input = interpreter->input(0); + TfLiteTensor* output = interpreter->output(0); + + std::copy_n(audio_data, audio_data_size, + tflite::GetTensorData(input)); + interpreter->Invoke(); + std::copy_n(tflite::GetTensorData(output), kFeatureSize, + feature_output); + + return kTfLiteOk; +} + +TfLiteStatus GenerateFeatures(const int16_t* audio_data, + const size_t audio_data_size, + Features* features_output) { + // Map the model into a usable data structure. This doesn't involve any + // copying or parsing, it's a very lightweight operation. 
+ const tflite::Model* model = + tflite::GetModel(g_audio_preprocessor_int8_model_data); + + AudioPreprocessorOpResolver op_resolver; + RegisterOps(op_resolver); + + tflite::MicroInterpreter interpreter(model, op_resolver, g_arena, kArenaSize); + interpreter.AllocateTensors(); + + size_t remaining_samples = audio_data_size; + size_t feature_index = 0; + while (remaining_samples >= kAudioSampleDurationCount && + feature_index < kFeatureCount) { + GenerateSingleFeature(audio_data, kAudioSampleDurationCount, + (*features_output)[feature_index], &interpreter); + feature_index++; + audio_data += kAudioSampleStrideCount; + remaining_samples -= kAudioSampleStrideCount; + } + + return kTfLiteOk; +} + +TfLiteStatus TestAudioSample(const char* label, const int16_t* audio_data, + const size_t audio_data_size) { + GenerateFeatures(audio_data, audio_data_size, &g_features); + LoadMicroSpeechModelAndPerformInference(g_features, label); + return kTfLiteOk; +} + +} // namespace + +int main () { + TestAudioSample("no", g_no_1000ms_audio_data, g_no_1000ms_audio_data_size); + + TestAudioSample("yes", g_yes_1000ms_audio_data, g_yes_1000ms_audio_data_size); + + TestAudioSample("silence", g_silence_1000ms_audio_data, + g_silence_1000ms_audio_data_size); + + TestAudioSample("silence", g_noise_1000ms_audio_data, + g_noise_1000ms_audio_data_size); +} \ No newline at end of file From 3176970d15e68a5df15630a49792d2127047daee Mon Sep 17 00:00:00 2001 From: JaimeHW Date: Thu, 6 Nov 2025 15:50:16 -0600 Subject: [PATCH 44/86] Optimize FullyConnectedPerChannelRVV --- .../riscv_vector/fully_connected_rvv.cc | 382 ++++++++++-------- 1 file changed, 212 insertions(+), 170 deletions(-) diff --git a/tensorflow/lite/micro/kernels/riscv_vector/fully_connected_rvv.cc b/tensorflow/lite/micro/kernels/riscv_vector/fully_connected_rvv.cc index 34a094a4cc4..5bde9fdf00f 100644 --- a/tensorflow/lite/micro/kernels/riscv_vector/fully_connected_rvv.cc +++ 
b/tensorflow/lite/micro/kernels/riscv_vector/fully_connected_rvv.cc @@ -5,112 +5,148 @@ using namespace tflite; -void FullyConnectedPerChannelRVV(const FullyConnectedParams& params, - const int32_t* output_multiplier, - const int* output_shift, - const RuntimeShape& input_shape, - const int8_t* input_data, - const RuntimeShape& filter_shape, - const int8_t* filter_data, - const RuntimeShape& bias_shape, - const int32_t* bias_data, - const RuntimeShape& output_shape, - int8_t* output_data) -{ - // Extract quantization parameters - const int32_t input_offset = params.input_offset; - const int32_t output_offset = params.output_offset; - const int32_t output_activation_min = params.quantized_activation_min; - const int32_t output_activation_max = params.quantized_activation_max; - - // Extract shape dimensions - const int filter_dim_count = filter_shape.DimensionsCount(); - const int output_dim_count = output_shape.DimensionsCount(); - const int batches = FlatSizeSkipDim(output_shape, output_dim_count - 1); - const int output_depth = output_shape.Dims(output_dim_count - 1); - const int accum_depth = filter_shape.Dims(filter_dim_count - 1); - - // Prepare scalar constants for vector operations - const int16_t s_input_offset_s16 = static_cast(input_offset); - const int32_t s_output_offset_s32 = output_offset; - const int32_t s_output_activation_min_s32 = output_activation_min; - const int32_t s_output_activation_max_s32 = output_activation_max; - - // Loop over batches - for (int b = 0; b < batches; ++b) - { - // Set base pointers for the current batch - const int8_t* input_batch_ptr = input_data + b * accum_depth; - int8_t* output_batch_ptr = output_data + b * output_depth; - - // Loop over output channels (rows of the weight matrix) - for (int out_c = 0; out_c < output_depth; ++out_c) { - // Set filter pointer and get bias for the current output channel - const int8_t* filter_row_ptr = filter_data + out_c * accum_depth; - const int32_t bias_val = bias_data ? 
bias_data[out_c] : 0; - - // Initialize vector accumulator to zero - size_t initial_vl_for_acc_init = __riscv_vsetvlmax_e16m2(); - vint32m4_t v_acc_s32m4 = __riscv_vmv_v_x_i32m4(0, initial_vl_for_acc_init); - - // Initialize scalar accumulator with bias value - int32_t s_acc_s32 = bias_val; - - // Loop over accumulation depth (dot product length) in vector - // chunks - size_t current_d = 0; - while (current_d < static_cast(accum_depth)) - { - // Set vector length for the current chunk - size_t vl = __riscv_vsetvl_e16m2(accum_depth - current_d); - - // Load input vector chunk, widen to i16, and add input offset - vint8m1_t v_input_s8 = __riscv_vle8_v_i8m1(input_batch_ptr + current_d, vl); - vint16m2_t v_input_s16 = __riscv_vsext_vf2_i16m2(v_input_s8, vl); - vint16m2_t v_input_plus_offset_s16 = __riscv_vadd_vx_i16m2(v_input_s16, s_input_offset_s16, vl); - - // Load filter vector chunk and widen to i16 - vint8m1_t v_filter_s8 = __riscv_vle8_v_i8m1(filter_row_ptr + current_d, vl); - vint16m2_t v_filter_s16 = __riscv_vsext_vf2_i16m2(v_filter_s8, vl); - - // Perform widening multiply-accumulate - v_acc_s32m4 = __riscv_vwmacc_vv_i32m4(v_acc_s32m4, v_input_plus_offset_s16, v_filter_s16, vl); - - // Advance pointer for the next chunk - current_d += vl; - } - - // Reduce the final vector accumulator to a scalar sum - size_t final_vl = __riscv_vsetvl_e32m4(accum_depth > 0 ? 
1 : 0); - if (accum_depth > 0) - { - // Set VL for reduction based on accumulated depth - final_vl = __riscv_vsetvl_e32m4(accum_depth); - - // Initialize reduction target vector register to zero - vint32m1_t v_zero = __riscv_vmv_v_x_i32m1(0, 1); - - // Perform reduction sum - vint32m1_t v_reduced_sum_s32m1 = __riscv_vredsum_vs_i32m4_i32m1(v_acc_s32m4, v_zero, final_vl); - - // Extract scalar sum and add to the bias-initialized scalar accumulator - s_acc_s32 += __riscv_vmv_x_s_i32m1_i32(v_reduced_sum_s32m1); - } - - // Apply per-channel requantization (scalar multiplication and shift) - int32_t s_requantized_acc_s32 = MultiplyByQuantizedMultiplier(s_acc_s32, output_multiplier[out_c], output_shift[out_c]); - - // Add output offset to the requantized value - s_requantized_acc_s32 += s_output_offset_s32; - - // Clamp the result to the activation range - s_requantized_acc_s32 = std::max(s_requantized_acc_s32, s_output_activation_min_s32); - s_requantized_acc_s32 = std::min(s_requantized_acc_s32, s_output_activation_max_s32); +void FullyConnectedPerChannelRVV(const tflite::FullyConnectedParams& params, + const int32_t* output_multiplier, + const int* output_shift, + const tflite::RuntimeShape& input_shape, + const int8_t* input_data, + const tflite::RuntimeShape& filter_shape, + const int8_t* filter_data, + const tflite::RuntimeShape& bias_shape, + const int32_t* bias_data, + const tflite::RuntimeShape& output_shape, + int8_t* output_data) { + // Extract quantization parameters + const int32_t input_offset = params.input_offset; + const int32_t output_offset = params.output_offset; + const int32_t output_activation_min = params.quantized_activation_min; + const int32_t output_activation_max = params.quantized_activation_max; + + // Extract shape dimensions + const int batches = FlatSizeSkipDim(output_shape, output_shape.DimensionsCount() - 1); + const int output_depth = output_shape.Dims(output_shape.DimensionsCount() - 1); + const int accum_depth = 
filter_shape.Dims(filter_shape.DimensionsCount() - 1); + + // Prepare scalar constants + const int16_t s_input_offset_s16 = static_cast(input_offset); + + // Loop over batches + for (int b = 0; b < batches; ++b) { + const int8_t* input_batch_ptr = input_data + b * accum_depth; + int8_t* output_batch_ptr = output_data + b * output_depth; + + // Vectorized loop over output channels + size_t current_out_c = 0; + while (current_out_c < static_cast(output_depth)) { + // Set vector length for this iteration + size_t vl = __riscv_vsetvl_e32m4(output_depth - current_out_c); + + // Initialize accumulator vector with biases + vint32m4_t v_acc_s32; + if (bias_data) { + v_acc_s32 = __riscv_vle32_v_i32m4(bias_data + current_out_c, vl); + } else { + v_acc_s32 = __riscv_vmv_v_x_i32m4(0, vl); + } + + // Main MAC loop to compute dot products + for (int d = 0; d < accum_depth; ++d) { + // Load scalar input value and add offset + int16_t s_input_val_s16 = static_cast(input_batch_ptr[d]) + s_input_offset_s16; + + // Calculate filter pointer and stride for the current column + const int8_t* filter_col_ptr = filter_data + d + current_out_c * accum_depth; + ptrdiff_t filter_stride = accum_depth * sizeof(int8_t); + + // Load filter vector, widen, and perform widening multiply-accumulate + vint8m1_t v_filter_s8 = __riscv_vlse8_v_i8m1(filter_col_ptr, filter_stride, vl); + vint16m2_t v_filter_s16 = __riscv_vsext_vf2_i16m2(v_filter_s8, vl); + v_acc_s32 = __riscv_vwmacc_vx_i32m4(v_acc_s32, s_input_val_s16, v_filter_s16, vl); + } + + // Start of fully vectorized per-channel requantization + vint32m4_t v_res32; + + // Load per-channel requantization parameters into vectors + vint32m4_t v_multiplier = __riscv_vle32_v_i32m4(output_multiplier + current_out_c, vl); + vint32m4_t v_shift = __riscv_vle32_v_i32m4(reinterpret_cast(output_shift) + current_out_c, vl); + + // Create a mask for lanes that require a right shift (where shift > 0) + vbool8_t v_mask_right_shift = 
__riscv_vmsgt_vx_i32m4_b8(v_shift, 0, vl); + + // Path 1: Right Shift (for lanes where shift > 0) + vint32m4_t v_res_right; + { + // Calculate the 64-bit product of accumulator and multiplier + vint32m4_t v_prod_hi = __riscv_vmulh_vv_i32m4_m(v_mask_right_shift, v_acc_s32, v_multiplier, vl); + vint32m4_t v_prod_lo = __riscv_vmul_vv_i32m4_m(v_mask_right_shift, v_acc_s32, v_multiplier, vl); + + // Calculate the 64-bit rounding value: (1 << (shift - 1)) + vint32m4_t v_shift_minus_1 = __riscv_vsub_vx_i32m4_m(v_mask_right_shift, v_shift, 1, vl); + vuint32m4_t v_one_u = __riscv_vmv_v_x_u32m4(1, vl); + vuint32m4_t v_rounding_u = __riscv_vsll_vv_u32m4_m(v_mask_right_shift, v_one_u, __riscv_vreinterpret_v_i32m4_u32m4(v_shift_minus_1), vl); + + // Add the 64-bit rounding value to the 64-bit product + vuint32m4_t v_prod_lo_u = __riscv_vreinterpret_v_i32m4_u32m4(v_prod_lo); + vuint32m4_t v_sum_lo_u = __riscv_vadd_vv_u32m4_m(v_mask_right_shift, v_prod_lo_u, v_rounding_u, vl); + vbool8_t v_carry = __riscv_vmsltu_vv_u32m4_b8_m(v_mask_right_shift, v_sum_lo_u, v_prod_lo_u, vl); + vint32m4_t v_rounded_hi = __riscv_vadd_vx_i32m4_m(v_carry, v_prod_hi, 1, vl); + + // Create a mask to select between the two 64-bit shift emulation paths + vbool8_t v_mask_shift_lt_32 = __riscv_vmslt_vx_i32m4_b8_m(v_mask_right_shift, v_shift, 32, vl); + + // Sub-path A: Emulate 64-bit shift for 0 < shift < 32 + vint32m4_t v_res_lt_32; + { + vuint32m4_t v_shift_u = __riscv_vreinterpret_v_i32m4_u32m4(v_shift); + vuint32m4_t v_shift_rev_u = __riscv_vrsub_vx_u32m4_m(v_mask_shift_lt_32, v_shift_u, 32, vl); + vuint32m4_t v_lo_part = __riscv_vsrl_vv_u32m4_m(v_mask_shift_lt_32, v_sum_lo_u, v_shift_u, vl); + vuint32m4_t v_hi_part = __riscv_vsll_vv_u32m4_m(v_mask_shift_lt_32, __riscv_vreinterpret_v_i32m4_u32m4(v_rounded_hi), v_shift_rev_u, vl); + v_res_lt_32 = __riscv_vreinterpret_v_u32m4_i32m4(__riscv_vor_vv_u32m4_m(v_mask_shift_lt_32, v_lo_part, v_hi_part, vl)); + } - // Store the final int8 result - 
output_batch_ptr[out_c] = static_cast(s_requantized_acc_s32); + // Sub-path B: Emulate 64-bit shift for shift >= 32 + vint32m4_t v_res_ge_32; + { + vbool8_t v_mask_shift_ge_32 = __riscv_vmandn_mm_b8(v_mask_right_shift, v_mask_shift_lt_32, vl); + vint32m4_t v_shift_hi = __riscv_vsub_vx_i32m4_m(v_mask_shift_ge_32, v_shift, 32, vl); + v_shift_hi = __riscv_vmin_vx_i32m4_m(v_mask_shift_ge_32, v_shift_hi, 31, vl); // Clamp to 31 + v_res_ge_32 = __riscv_vsra_vv_i32m4_m(v_mask_shift_ge_32, v_rounded_hi, __riscv_vreinterpret_v_i32m4_u32m4(v_shift_hi), vl); } + + // Merge the results from the two 64-bit shift sub-paths + v_res_right = __riscv_vmerge_vvm_i32m4(v_res_ge_32, v_res_lt_32, v_mask_shift_lt_32, vl); + } + + // Path 2: Left Shift (for lanes where shift <= 0) + vint32m4_t v_res_left; + { + // Negate the shift amount and perform a left shift on the accumulator + vint32m4_t v_neg_shift = __riscv_vneg_v_i32m4(v_shift, vl); + v_res_left = __riscv_vsll_vv_i32m4(v_acc_s32, __riscv_vreinterpret_v_i32m4_u32m4(v_neg_shift), vl); + } + + // Merge the results from the right and left shift paths + v_res32 = __riscv_vmerge_vvm_i32m4(v_res_left, v_res_right, v_mask_right_shift, vl); + + // Add the final output offset + v_res32 = __riscv_vadd_vx_i32m4(v_res32, output_offset, vl); + + // Clamp the results to the activation range + v_res32 = __riscv_vmax_vx_i32m4(v_res32, output_activation_min, vl); + v_res32 = __riscv_vmin_vx_i32m4(v_res32, output_activation_max, vl); + + // Narrow the 32-bit results to 16-bit, then 8-bit with saturation + vint16m2_t v_res16 = __riscv_vnclip_wx_i16m2(v_res32, 0, __RISCV_VXRM_RNU, vl); + vint8m1_t v_out_s8 = __riscv_vnclip_wx_i8m1(v_res16, 0, __RISCV_VXRM_RNU, vl); + + // Store the final 8-bit output vector + __riscv_vse8_v_i8m1(output_batch_ptr + current_out_c, v_out_s8, vl); + + // Advance to the next block of output channels + current_out_c += vl; } + } } void FullyConnectedRVV(const FullyConnectedParams& params, @@ -123,7 +159,7 @@ void 
FullyConnectedRVV(const FullyConnectedParams& params, const RuntimeShape& output_shape, int8_t* output_data) { - // Extract quantization parameters (scalar values for the whole layer) + // Extract quantization parameters const int32_t input_offset = params.input_offset; const int32_t filter_offset = params.weights_offset; const int32_t output_offset = params.output_offset; @@ -142,86 +178,92 @@ void FullyConnectedRVV(const FullyConnectedParams& params, // Prepare scalar constants for vector operations const int16_t s_input_offset_s16 = static_cast(input_offset); const int16_t s_filter_offset_s16 = static_cast(filter_offset); - const int32_t s_output_offset_s32 = output_offset; - const int32_t s_output_activation_min_s32 = output_activation_min; - const int32_t s_output_activation_max_s32 = output_activation_max; // Loop over batches - for (int b = 0; b < batches; ++b) + for (int b = 0; b < batches; ++b) { - // Set base pointers for the current batch const int8_t* input_batch_ptr = input_data + b * accum_depth; int8_t* output_batch_ptr = output_data + b * output_depth; - // Loop over output channels (rows of the weight matrix) - for (int out_c = 0; out_c < output_depth; ++out_c) - { - // Set filter pointer and get bias for the current output channel - const int8_t* filter_row_ptr = filter_data + out_c * accum_depth; - // Bias is int32_t for non-per-channel int8 quantization - const int32_t bias_val = bias_data ? 
bias_data[out_c] : 0; - - // Initialize vector accumulator to zero - // Use vlmax corresponding to operand type (e16m2) to determine acc size - size_t initial_vl_for_acc_init = __riscv_vsetvlmax_e16m2(); - vint32m4_t v_acc_s32m4 = __riscv_vmv_v_x_i32m4(0, initial_vl_for_acc_init); - - // Initialize scalar accumulator with bias value - int32_t s_acc_s32 = bias_val; - - // Loop over accumulation depth (dot product length) in vector chunks - size_t current_d = 0; - while (current_d < static_cast(accum_depth)) - { - // Set vector length for the current chunk - size_t vl = __riscv_vsetvl_e16m2(accum_depth - current_d); - - // Load input vector chunk, widen to i16, and add input offset - vint8m1_t v_input_s8 = __riscv_vle8_v_i8m1(input_batch_ptr + current_d, vl); - vint16m2_t v_input_s16 = __riscv_vsext_vf2_i16m2(v_input_s8, vl); - vint16m2_t v_input_plus_offset_s16 = __riscv_vadd_vx_i16m2(v_input_s16, s_input_offset_s16, vl); - - // Load filter vector chunk, widen to i16, and add filter offset - vint8m1_t v_filter_s8 = __riscv_vle8_v_i8m1(filter_row_ptr + current_d, vl); + // Vectorized loop over output channels + size_t current_out_c = 0; + while (current_out_c < static_cast(output_depth)) { + // Set vector length for processing multiple output channels + size_t vl = __riscv_vsetvl_e32m4(output_depth - current_out_c); + + // Initialize accumulator vector with biases + vint32m4_t v_acc_s32; + if (bias_data) { + v_acc_s32 = __riscv_vle32_v_i32m4(bias_data + current_out_c, vl); + } else { + v_acc_s32 = __riscv_vmv_v_x_i32m4(0, vl); + } + + // Loop over accumulation depth to compute 'vl' dot products in parallel + for (int d = 0; d < accum_depth; ++d) { + // Load one scalar from the input vector and add offset + int16_t s_input_val_s16 = static_cast(input_batch_ptr[d]) + s_input_offset_s16; + + // Load a vector of 'vl' filter values (a column slice) + const int8_t* filter_col_ptr = filter_data + current_out_c * accum_depth + d; + ptrdiff_t filter_stride = accum_depth * 
sizeof(int8_t); + vint8m1_t v_filter_s8 = __riscv_vlse8_v_i8m1(filter_col_ptr, filter_stride, vl); + + // Widen filter values and add filter offset vint16m2_t v_filter_s16 = __riscv_vsext_vf2_i16m2(v_filter_s8, vl); vint16m2_t v_filter_plus_offset_s16 = __riscv_vadd_vx_i16m2(v_filter_s16, s_filter_offset_s16, vl); - - // Perform widening multiply-accumulate - v_acc_s32m4 = __riscv_vwmacc_vv_i32m4(v_acc_s32m4, v_input_plus_offset_s16, v_filter_plus_offset_s16, vl); - - // Advance pointer for the next chunk - current_d += vl; + + // Perform widening vector-scalar multiply-accumulate + v_acc_s32 = __riscv_vwmacc_vx_i32m4(v_acc_s32, s_input_val_s16, v_filter_plus_offset_s16, vl); } - // Reduce the final vector accumulator to a scalar sum - size_t final_vl = __riscv_vsetvl_e32m4(accum_depth > 0 ? 1 : 0); - if (accum_depth > 0) - { - // Set VL for reduction based on accumulated depth - final_vl = __riscv_vsetvl_e32m4(accum_depth); - - // Initialize reduction target vector register to zero - vint32m1_t v_zero = __riscv_vmv_v_x_i32m1(0, 1); - - // Perform reduction sum - vint32m1_t v_reduced_sum_s32m1 = __riscv_vredsum_vs_i32m4_i32m1(v_acc_s32m4, v_zero, final_vl); - - // Extract scalar sum and add to the bias-initialized scalar accumulator - s_acc_s32 += __riscv_vmv_x_s_i32m1_i32(v_reduced_sum_s32m1); + // Start of inline vectorized requantization + vint32m4_t v_res32; + const int effective_right_shift = 31 - output_shift; + + // Calculate rounding constants + int64_t rounding_val = (effective_right_shift > 0) ? 
(INT64_C(1) << (effective_right_shift - 1)) : 0; + int32_t rounding_lo = static_cast(rounding_val); + int32_t rounding_hi = static_cast((rounding_val >> 32)); + + // Multiply accumulator by scalar multiplier (results in 64b intermediate) + vint32m4_t v_prod_lo = __riscv_vmul_vx_i32m4(v_acc_s32, output_multiplier, vl); + vint32m4_t v_prod_hi = __riscv_vmulh_vx_i32m4(v_acc_s32, output_multiplier, vl); + + // Add 64b rounding value + vuint32m4_t v_acc_lo_u = __riscv_vreinterpret_v_i32m4_u32m4(v_prod_lo); + vuint32m4_t v_sum_lo_u = __riscv_vadd_vx_u32m4(v_acc_lo_u, rounding_lo, vl); + vbool8_t v_carry = __riscv_vmsltu_vx_u32m4_b8(v_sum_lo_u, rounding_lo, vl); + vint32m4_t v_rounded_hi = __riscv_vadd_vx_i32m4(v_prod_hi, rounding_hi, vl); + v_rounded_hi = __riscv_vadd_vx_i32m4_m(v_carry, v_rounded_hi, 1, vl); + vint32m4_t v_rounded_lo = __riscv_vreinterpret_v_u32m4_i32m4(v_sum_lo_u); + + // Perform 64b arithmetic right shift + if (effective_right_shift == 0) { + v_res32 = v_rounded_lo; + } else if (effective_right_shift > 0 && effective_right_shift < 32) { + vuint32m4_t v_lo_usrl = __riscv_vsrl_vx_u32m4(__riscv_vreinterpret_v_i32m4_u32m4(v_rounded_lo), effective_right_shift, vl); + vint32m4_t v_hi_sll = __riscv_vsll_vx_i32m4(v_rounded_hi, 32 - effective_right_shift, vl); + v_res32 = __riscv_vreinterpret_v_u32m4_i32m4(__riscv_vor_vv_u32m4(v_lo_usrl, __riscv_vreinterpret_v_i32m4_u32m4(v_hi_sll), vl)); + } else { + int shift_hi = std::min(31, effective_right_shift - 32); + v_res32 = __riscv_vsra_vx_i32m4(v_rounded_hi, shift_hi, vl); } - // Apply uniform requantization (scalar multiplication and shift) - int32_t s_requantized_acc_s32 = MultiplyByQuantizedMultiplier(s_acc_s32, output_multiplier, output_shift); + // Add output offset + v_res32 = __riscv_vadd_vx_i32m4(v_res32, output_offset, vl); - // Add output offset to the requantized value - s_requantized_acc_s32 += s_output_offset_s32; + // Clamp to activation bounds + v_res32 = __riscv_vmax_vx_i32m4(v_res32, 
output_activation_min, vl); + v_res32 = __riscv_vmin_vx_i32m4(v_res32, output_activation_max, vl); - // Clamp the result to the activation range - s_requantized_acc_s32 = std::max(s_requantized_acc_s32, s_output_activation_min_s32); - s_requantized_acc_s32 = std::min(s_requantized_acc_s32, s_output_activation_max_s32); + // Narrow result to int8 and store + vint16m2_t v_res16 = __riscv_vnclip_wx_i16m2(v_res32, 0, __RISCV_VXRM_RNU, vl); + vint8m1_t v_out_s8 = __riscv_vnclip_wx_i8m1(v_res16, 0, __RISCV_VXRM_RNU, vl); + __riscv_vse8_v_i8m1(output_batch_ptr + current_out_c, v_out_s8, vl); - // Store the final int8 result (using batch offset) - output_batch_ptr[out_c] = static_cast(s_requantized_acc_s32); + // Advance to the next block of output channels + current_out_c += vl; } } -} +} \ No newline at end of file From 218949879cfcbcec78f5f2ca880567d8ecb995f1 Mon Sep 17 00:00:00 2001 From: JaimeHW Date: Thu, 6 Nov 2025 15:52:43 -0600 Subject: [PATCH 45/86] Update .gitignore --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index 6e2b2680b38..90e4d97bc46 100644 --- a/.gitignore +++ b/.gitignore @@ -8,6 +8,7 @@ venv gen .venv +tensorflow/lite/micro/examples/micro_speech2 # Ignore the directory in which `clangd` stores its local index. 
/.cache/ From 6983e555802f6865d92b709be2a7a4a9cfd9702a Mon Sep 17 00:00:00 2001 From: JaimeHW Date: Thu, 6 Nov 2025 15:53:11 -0600 Subject: [PATCH 46/86] Update riscv32_vector makefile --- .../micro/tools/make/targets/riscv32_vector_makefile.inc | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/tensorflow/lite/micro/tools/make/targets/riscv32_vector_makefile.inc b/tensorflow/lite/micro/tools/make/targets/riscv32_vector_makefile.inc index 63896abdf90..254dba53c3e 100644 --- a/tensorflow/lite/micro/tools/make/targets/riscv32_vector_makefile.inc +++ b/tensorflow/lite/micro/tools/make/targets/riscv32_vector_makefile.inc @@ -20,7 +20,6 @@ export PATH := $(TARGET_TOOLCHAIN_ROOT):$(PATH) PLATFORM_FLAGS = \ -march=$(RISCV_ARCH) \ -mabi=$(RISCV_ABI) \ - -O3 \ -mcmodel=$(RISCV_CODE_MODEL) \ -mexplicit-relocs \ -fno-builtin-printf \ @@ -66,12 +65,12 @@ MICROLITE_CC_SRCS += \ tensorflow/lite/micro/kernels/riscv_vector/conv.cc \ tensorflow/lite/micro/kernels/riscv_vector/depthwise_conv.cc \ tensorflow/lite/micro/kernels/riscv_vector/fully_connected.cc \ - tensorflow/lite/micro/kernels/riscv_vector/softmax.cc +# tensorflow/lite/micro/kernels/riscv_vector/softmax.cc \ EXCLUDED_SRCS := \ tensorflow/lite/micro/kernels/conv.cc \ tensorflow/lite/micro/kernels/depthwise_conv.cc \ tensorflow/lite/micro/kernels/fully_connected.cc \ - tensorflow/lite/micro/kernels/softmax.cc +# tensorflow/lite/micro/kernels/softmax.cc \ From 9fb07cff3ce6eaa0b08594ba5f9e44c55635a1ba Mon Sep 17 00:00:00 2001 From: JaimeHW Date: Fri, 14 Nov 2025 03:31:37 -0600 Subject: [PATCH 47/86] Add vector optimized 8-bit MaxPool kernel --- .../micro/kernels/riscv_vector/pooling.cc | 124 ++++++++++++++++++ .../micro/kernels/riscv_vector/pooling_rvv.cc | 105 +++++++++++++++ .../micro/kernels/riscv_vector/pooling_rvv.h | 13 ++ .../make/targets/riscv32_vector_makefile.inc | 3 + 4 files changed, 245 insertions(+) create mode 100644 tensorflow/lite/micro/kernels/riscv_vector/pooling.cc create mode 100644 
tensorflow/lite/micro/kernels/riscv_vector/pooling_rvv.cc create mode 100644 tensorflow/lite/micro/kernels/riscv_vector/pooling_rvv.h diff --git a/tensorflow/lite/micro/kernels/riscv_vector/pooling.cc b/tensorflow/lite/micro/kernels/riscv_vector/pooling.cc new file mode 100644 index 00000000000..0ffbd5d5681 --- /dev/null +++ b/tensorflow/lite/micro/kernels/riscv_vector/pooling.cc @@ -0,0 +1,124 @@ +/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#include "tensorflow/lite/kernels/internal/reference/pooling.h" + +#include "tensorflow/lite/c/builtin_op_data.h" +#include "tensorflow/lite/kernels/kernel_util.h" +#include "tensorflow/lite/micro/kernels/kernel_util.h" +#include "tensorflow/lite/micro/kernels/pooling.h" +#include "tensorflow/lite/micro/micro_log.h" + +#include "pooling_rvv.h" + +namespace tflite { + +namespace { + +TfLiteStatus AverageEval(TfLiteContext* context, TfLiteNode* node) { + TFLITE_DCHECK(node->builtin_data != nullptr); + auto* params = reinterpret_cast(node->builtin_data); + + TFLITE_DCHECK(node->user_data != nullptr); + const OpDataPooling* data = + static_cast(node->user_data); + + const TfLiteEvalTensor* input = + micro::GetEvalInput(context, node, kPoolingInputTensor); + TfLiteEvalTensor* output = + micro::GetEvalOutput(context, node, kPoolingOutputTensor); + + // Inputs and outputs share the same type, guaranteed by the 
converter. + switch (input->type) { + case kTfLiteFloat32: + AveragePoolingEvalFloat(context, node, params, data, input, output); + break; + case kTfLiteInt8: + AveragePoolingEvalQuantized(context, node, params, data, input, + output); + break; + case kTfLiteInt16: + AveragePoolingEvalQuantized(context, node, params, data, input, + output); + break; + default: + MicroPrintf("Input type %s is not currently supported", + TfLiteTypeGetName(input->type)); + return kTfLiteError; + } + return kTfLiteOk; +} + +TfLiteStatus MaxEval(TfLiteContext* context, TfLiteNode* node) { + TFLITE_DCHECK(node->builtin_data != nullptr); + auto* params = reinterpret_cast(node->builtin_data); + + TFLITE_DCHECK(node->user_data != nullptr); + const OpDataPooling* data = + static_cast(node->user_data); + + const TfLiteEvalTensor* input = + micro::GetEvalInput(context, node, kPoolingInputTensor); + TfLiteEvalTensor* output = + micro::GetEvalOutput(context, node, kPoolingOutputTensor); + + switch (input->type) { + case kTfLiteFloat32: + MaxPoolingEvalFloat(context, node, params, data, input, output); + break; + case kTfLiteInt8: + tflite::PoolParams op_params; + op_params.stride_height = params->stride_height; + op_params.stride_width = params->stride_width; + op_params.filter_height = params->filter_height; + op_params.filter_width = params->filter_width; + op_params.padding_values.height = data->padding.height; + op_params.padding_values.width = data->padding.width; + op_params.quantized_activation_min = data->activation_min; + op_params.quantized_activation_max = data->activation_max; + + MaxPool8BitRVV(op_params, + tflite::micro::GetTensorShape(input), + tflite::micro::GetTensorData(input), + tflite::micro::GetTensorShape(output), + tflite::micro::GetTensorData(output)); + break; + case kTfLiteInt16: + MaxPoolingEvalQuantized(context, node, params, data, input, + output); + break; + default: + MicroPrintf("Type %s not currently supported.", + TfLiteTypeGetName(input->type)); + return 
kTfLiteError; + } + return kTfLiteOk; +} + +void* PoolInit(TfLiteContext* context, const char* buffer, size_t length) { + TFLITE_DCHECK(context->AllocatePersistentBuffer != nullptr); + return context->AllocatePersistentBuffer(context, sizeof(OpDataPooling)); +} + +} // namespace + +TFLMRegistration Register_AVERAGE_POOL_2D() { + return tflite::micro::RegisterOp(PoolInit, PoolingPrepare, AverageEval); +} + +TFLMRegistration Register_MAX_POOL_2D() { + return tflite::micro::RegisterOp(PoolInit, PoolingPrepare, MaxEval); +} + +} // namespace tflite diff --git a/tensorflow/lite/micro/kernels/riscv_vector/pooling_rvv.cc b/tensorflow/lite/micro/kernels/riscv_vector/pooling_rvv.cc new file mode 100644 index 00000000000..fc12f630bf3 --- /dev/null +++ b/tensorflow/lite/micro/kernels/riscv_vector/pooling_rvv.cc @@ -0,0 +1,105 @@ +#include + +#include "tensorflow/lite/kernels/internal/common.h" +#include "tensorflow/lite/micro/micro_log.h" + +using namespace tflite; + +void MaxPool8BitRVV(const PoolParams& params, const RuntimeShape& input_shape, + const int8_t* input_data, const RuntimeShape& output_shape, + int8_t* output_data) +{ + // Extract pooling parameters + const int stride_height = params.stride_height; + const int stride_width = params.stride_width; + const int filter_height = params.filter_height; + const int filter_width = params.filter_width; + const int pad_height = params.padding_values.height; + const int pad_width = params.padding_values.width; + const int8_t output_activation_min = params.quantized_activation_min; + const int8_t output_activation_max = params.quantized_activation_max; + + // Extract shape dimensions + const int batches = MatchingDim(input_shape, 0, output_shape, 0); + const int depth = MatchingDim(input_shape, 3, output_shape, 3); + const int input_height = input_shape.Dims(1); + const int input_width = input_shape.Dims(2); + const int output_height = output_shape.Dims(1); + const int output_width = output_shape.Dims(2); + + // Calculate 
tensor strides for direct pointer arithmetic + const int input_y_stride = input_width * depth; + const int input_b_stride = input_height * input_y_stride; + const int output_y_stride = output_width * depth; + const int output_b_stride = output_height * output_y_stride; + + // Loop over batches + for (int batch = 0; batch < batches; ++batch) { + const int8_t* input_batch_base = input_data + batch * input_b_stride; + int8_t* output_batch_base = output_data + batch * output_b_stride; + + // Loop over output spatial dimensions (y, x) + for (int out_y = 0; out_y < output_height; ++out_y) + { + for (int out_x = 0; out_x < output_width; ++out_x) + { + + // Vectorized loop over channels (depth) + size_t current_channel = 0; + while (current_channel < static_cast(depth)) + { + // Set vector length. For `zvl128b`, VLEN=128. With SEW=8 (int8_t), + // VLMAX is 16 * LMUL. Using LMUL=4 provides a good balance, allowing + // up to 64 channels to be processed per iteration. + size_t vl = __riscv_vsetvl_e8m4(depth - current_channel); + + // Initialize the accumulator vector with the smallest possible int8_t value. 
+ vint8m4_t v_max_s8 = __riscv_vmv_v_x_i8m4(std::numeric_limits::lowest(), vl); + + // Loop over the filter window dimensions (y, x) + for (int f_y = 0; f_y < filter_height; ++f_y) + { + for (int f_x = 0; f_x < filter_width; ++f_x) + { + // Calculate corresponding input coordinates for this filter tap + const int in_y = (out_y * stride_height) + f_y - pad_height; + const int in_x = (out_x * stride_width) + f_x - pad_width; + + // Handle padding by checking if the input coordinates are valid + if (in_y >= 0 && in_y < input_height && in_x >= 0 && in_x < input_width) + { + // If valid, calculate the pointer to the input vector + const int8_t* input_ptr = input_batch_base + + (in_y * input_y_stride) + + (in_x * depth) + + current_channel; + + // Load a vector of input values (unit-stride access) + vint8m4_t v_input_s8 = __riscv_vle8_v_i8m4(input_ptr, vl); + + // Perform the vector max operation + v_max_s8 = __riscv_vmax_vv_i8m4(v_max_s8, v_input_s8, vl); + } + } + } + + // After iterating through the filter window, apply activation clamping + v_max_s8 = __riscv_vmax_vx_i8m4(v_max_s8, output_activation_min, vl); + v_max_s8 = __riscv_vmin_vx_i8m4(v_max_s8, output_activation_max, vl); + + // Calculate the output pointer + int8_t* output_ptr = output_batch_base + + (out_y * output_y_stride) + + (out_x * depth) + + current_channel; + + // Store the final vector of maximum values (unit-stride access) + __riscv_vse8_v_i8m4(output_ptr, v_max_s8, vl); + + // Advance to the next block of channels + current_channel += vl; + } + } + } + } +} \ No newline at end of file diff --git a/tensorflow/lite/micro/kernels/riscv_vector/pooling_rvv.h b/tensorflow/lite/micro/kernels/riscv_vector/pooling_rvv.h new file mode 100644 index 00000000000..99fbf05ea51 --- /dev/null +++ b/tensorflow/lite/micro/kernels/riscv_vector/pooling_rvv.h @@ -0,0 +1,13 @@ +#ifndef TENSORFLOW_LITE_MICRO_KERNELS_RISCV_VECTOR_POOLING_RVV_H_ +#define TENSORFLOW_LITE_MICRO_KERNELS_RISCV_VECTOR_POOLING_RVV_H_ + 
+#include "tensorflow/lite/kernels/internal/common.h" +#include "tensorflow/lite/micro/micro_log.h" + +using namespace tflite; + +void MaxPool8BitRVV(const PoolParams& params, const RuntimeShape& input_shape, + const int8_t* input_data, const RuntimeShape& output_shape, + int8_t* output_data); + +#endif \ No newline at end of file diff --git a/tensorflow/lite/micro/tools/make/targets/riscv32_vector_makefile.inc b/tensorflow/lite/micro/tools/make/targets/riscv32_vector_makefile.inc index 254dba53c3e..acadddeea34 100644 --- a/tensorflow/lite/micro/tools/make/targets/riscv32_vector_makefile.inc +++ b/tensorflow/lite/micro/tools/make/targets/riscv32_vector_makefile.inc @@ -65,12 +65,15 @@ MICROLITE_CC_SRCS += \ tensorflow/lite/micro/kernels/riscv_vector/conv.cc \ tensorflow/lite/micro/kernels/riscv_vector/depthwise_conv.cc \ tensorflow/lite/micro/kernels/riscv_vector/fully_connected.cc \ + tensorflow/lite/micro/kernels/riscv_vector/pooling.cc \ + tensorflow/lite/micro/kernels/riscv_vector/pooling_rvv.cc \ # tensorflow/lite/micro/kernels/riscv_vector/softmax.cc \ EXCLUDED_SRCS := \ tensorflow/lite/micro/kernels/conv.cc \ tensorflow/lite/micro/kernels/depthwise_conv.cc \ tensorflow/lite/micro/kernels/fully_connected.cc \ + tensorflow/lite/micro/kernels/pooling.cc\ # tensorflow/lite/micro/kernels/softmax.cc \ From 86503c9a450eb320f3f87a7f4c7a2fb6adc29e81 Mon Sep 17 00:00:00 2001 From: JaimeHW Date: Fri, 14 Nov 2025 04:33:08 -0600 Subject: [PATCH 48/86] Add 16-bit vector optimized MaxPool kernel --- .../micro/kernels/riscv_vector/pooling.cc | 21 +- .../micro/kernels/riscv_vector/pooling_rvv.cc | 270 ++++++++++++------ .../micro/kernels/riscv_vector/pooling_rvv.h | 4 + 3 files changed, 207 insertions(+), 88 deletions(-) diff --git a/tensorflow/lite/micro/kernels/riscv_vector/pooling.cc b/tensorflow/lite/micro/kernels/riscv_vector/pooling.cc index 0ffbd5d5681..934f526d82f 100644 --- a/tensorflow/lite/micro/kernels/riscv_vector/pooling.cc +++ 
b/tensorflow/lite/micro/kernels/riscv_vector/pooling.cc @@ -78,6 +78,7 @@ TfLiteStatus MaxEval(TfLiteContext* context, TfLiteNode* node) { MaxPoolingEvalFloat(context, node, params, data, input, output); break; case kTfLiteInt8: + { tflite::PoolParams op_params; op_params.stride_height = params->stride_height; op_params.stride_width = params->stride_width; @@ -93,10 +94,26 @@ TfLiteStatus MaxEval(TfLiteContext* context, TfLiteNode* node) { tflite::micro::GetTensorData(input), tflite::micro::GetTensorShape(output), tflite::micro::GetTensorData(output)); + } break; case kTfLiteInt16: - MaxPoolingEvalQuantized(context, node, params, data, input, - output); + { + tflite::PoolParams op_params; + op_params.stride_height = params->stride_height; + op_params.stride_width = params->stride_width; + op_params.filter_height = params->filter_height; + op_params.filter_width = params->filter_width; + op_params.padding_values.height = data->padding.height; + op_params.padding_values.width = data->padding.width; + op_params.quantized_activation_min = data->activation_min; + op_params.quantized_activation_max = data->activation_max; + + MaxPool16BitRVV(op_params, + tflite::micro::GetTensorShape(input), + tflite::micro::GetTensorData(input), + tflite::micro::GetTensorShape(output), + tflite::micro::GetTensorData(output)); + } break; default: MicroPrintf("Type %s not currently supported.", diff --git a/tensorflow/lite/micro/kernels/riscv_vector/pooling_rvv.cc b/tensorflow/lite/micro/kernels/riscv_vector/pooling_rvv.cc index fc12f630bf3..dfaf4f8cb1f 100644 --- a/tensorflow/lite/micro/kernels/riscv_vector/pooling_rvv.cc +++ b/tensorflow/lite/micro/kernels/riscv_vector/pooling_rvv.cc @@ -10,96 +10,194 @@ void MaxPool8BitRVV(const PoolParams& params, const RuntimeShape& input_shape, int8_t* output_data) { // Extract pooling parameters - const int stride_height = params.stride_height; - const int stride_width = params.stride_width; - const int filter_height = params.filter_height; - const 
int filter_width = params.filter_width; - const int pad_height = params.padding_values.height; - const int pad_width = params.padding_values.width; - const int8_t output_activation_min = params.quantized_activation_min; - const int8_t output_activation_max = params.quantized_activation_max; - - // Extract shape dimensions - const int batches = MatchingDim(input_shape, 0, output_shape, 0); - const int depth = MatchingDim(input_shape, 3, output_shape, 3); - const int input_height = input_shape.Dims(1); - const int input_width = input_shape.Dims(2); - const int output_height = output_shape.Dims(1); - const int output_width = output_shape.Dims(2); - - // Calculate tensor strides for direct pointer arithmetic - const int input_y_stride = input_width * depth; - const int input_b_stride = input_height * input_y_stride; - const int output_y_stride = output_width * depth; - const int output_b_stride = output_height * output_y_stride; - - // Loop over batches - for (int batch = 0; batch < batches; ++batch) { - const int8_t* input_batch_base = input_data + batch * input_b_stride; - int8_t* output_batch_base = output_data + batch * output_b_stride; - - // Loop over output spatial dimensions (y, x) - for (int out_y = 0; out_y < output_height; ++out_y) - { - for (int out_x = 0; out_x < output_width; ++out_x) - { - - // Vectorized loop over channels (depth) - size_t current_channel = 0; - while (current_channel < static_cast(depth)) + const int stride_height = params.stride_height; + const int stride_width = params.stride_width; + const int filter_height = params.filter_height; + const int filter_width = params.filter_width; + const int pad_height = params.padding_values.height; + const int pad_width = params.padding_values.width; + const int8_t output_activation_min = params.quantized_activation_min; + const int8_t output_activation_max = params.quantized_activation_max; + + // Extract shape dimensions + const int batches = MatchingDim(input_shape, 0, output_shape, 0); + const 
int depth = MatchingDim(input_shape, 3, output_shape, 3); + const int input_height = input_shape.Dims(1); + const int input_width = input_shape.Dims(2); + const int output_height = output_shape.Dims(1); + const int output_width = output_shape.Dims(2); + + // Calculate tensor strides for direct pointer arithmetic + const int input_y_stride = input_width * depth; + const int input_b_stride = input_height * input_y_stride; + const int output_y_stride = output_width * depth; + const int output_b_stride = output_height * output_y_stride; + + // Loop over batches + for (int batch = 0; batch < batches; ++batch) { + const int8_t* input_batch_base = input_data + batch * input_b_stride; + int8_t* output_batch_base = output_data + batch * output_b_stride; + + // Loop over output spatial dimensions (y, x) + for (int out_y = 0; out_y < output_height; ++out_y) { - // Set vector length. For `zvl128b`, VLEN=128. With SEW=8 (int8_t), - // VLMAX is 16 * LMUL. Using LMUL=4 provides a good balance, allowing - // up to 64 channels to be processed per iteration. - size_t vl = __riscv_vsetvl_e8m4(depth - current_channel); - - // Initialize the accumulator vector with the smallest possible int8_t value. 
- vint8m4_t v_max_s8 = __riscv_vmv_v_x_i8m4(std::numeric_limits::lowest(), vl); - - // Loop over the filter window dimensions (y, x) - for (int f_y = 0; f_y < filter_height; ++f_y) - { - for (int f_x = 0; f_x < filter_width; ++f_x) + for (int out_x = 0; out_x < output_width; ++out_x) { - // Calculate corresponding input coordinates for this filter tap - const int in_y = (out_y * stride_height) + f_y - pad_height; - const int in_x = (out_x * stride_width) + f_x - pad_width; - - // Handle padding by checking if the input coordinates are valid - if (in_y >= 0 && in_y < input_height && in_x >= 0 && in_x < input_width) - { - // If valid, calculate the pointer to the input vector - const int8_t* input_ptr = input_batch_base + - (in_y * input_y_stride) + - (in_x * depth) + - current_channel; - - // Load a vector of input values (unit-stride access) - vint8m4_t v_input_s8 = __riscv_vle8_v_i8m4(input_ptr, vl); - // Perform the vector max operation - v_max_s8 = __riscv_vmax_vv_i8m4(v_max_s8, v_input_s8, vl); - } + // Vectorized loop over channels (depth) + size_t current_channel = 0; + while (current_channel < static_cast(depth)) + { + // Set vector length. For `zvl128b`, VLEN=128. With SEW=8 (int8_t), + // VLMAX is 16 * LMUL. Using LMUL=4 provides a good balance, allowing + // up to 64 channels to be processed per iteration. + size_t vl = __riscv_vsetvl_e8m4(depth - current_channel); + + // Initialize the accumulator vector with the smallest possible int8_t value. 
+ vint8m4_t v_max_s8 = __riscv_vmv_v_x_i8m4(std::numeric_limits::lowest(), vl); + + // Loop over the filter window dimensions (y, x) + for (int f_y = 0; f_y < filter_height; ++f_y) + { + for (int f_x = 0; f_x < filter_width; ++f_x) + { + // Calculate corresponding input coordinates for this filter tap + const int in_y = (out_y * stride_height) + f_y - pad_height; + const int in_x = (out_x * stride_width) + f_x - pad_width; + + // Handle padding by checking if the input coordinates are valid + if (in_y >= 0 && in_y < input_height && in_x >= 0 && in_x < input_width) + { + // If valid, calculate the pointer to the input vector + const int8_t* input_ptr = input_batch_base + + (in_y * input_y_stride) + + (in_x * depth) + + current_channel; + + // Load a vector of input values (unit-stride access) + vint8m4_t v_input_s8 = __riscv_vle8_v_i8m4(input_ptr, vl); + + // Perform the vector max operation + v_max_s8 = __riscv_vmax_vv_i8m4(v_max_s8, v_input_s8, vl); + } + } + } + + // After iterating through the filter window, apply activation clamping + v_max_s8 = __riscv_vmax_vx_i8m4(v_max_s8, output_activation_min, vl); + v_max_s8 = __riscv_vmin_vx_i8m4(v_max_s8, output_activation_max, vl); + + // Calculate the output pointer + int8_t* output_ptr = output_batch_base + + (out_y * output_y_stride) + + (out_x * depth) + + current_channel; + + // Store the final vector of maximum values (unit-stride access) + __riscv_vse8_v_i8m4(output_ptr, v_max_s8, vl); + + // Advance to the next block of channels + current_channel += vl; + } + } + } + } +} + +void MaxPool16BitRVV(const PoolParams& params, const RuntimeShape& input_shape, + const int16_t* input_data, const RuntimeShape& output_shape, + int16_t* output_data) +{ + // Extract pooling parameters + const int stride_height = params.stride_height; + const int stride_width = params.stride_width; + const int filter_height = params.filter_height; + const int filter_width = params.filter_width; + const int pad_height = 
params.padding_values.height; + const int pad_width = params.padding_values.width; + const int16_t output_activation_min = params.quantized_activation_min; + const int16_t output_activation_max = params.quantized_activation_max; + + // Extract shape dimensions + const int batches = MatchingDim(input_shape, 0, output_shape, 0); + const int depth = MatchingDim(input_shape, 3, output_shape, 3); + const int input_height = input_shape.Dims(1); + const int input_width = input_shape.Dims(2); + const int output_height = output_shape.Dims(1); + const int output_width = output_shape.Dims(2); + + // Calculate tensor strides for direct pointer arithmetic + const int input_y_stride = input_width * depth; + const int input_b_stride = input_height * input_y_stride; + const int output_y_stride = output_width * depth; + const int output_b_stride = output_height * output_y_stride; + + // Loop over batches + for (int batch = 0; batch < batches; ++batch) { + const int16_t* input_batch_base = input_data + batch * input_b_stride; + int16_t* output_batch_base = output_data + batch * output_b_stride; + + // Loop over output spatial dimensions (y, x) + for (int out_y = 0; out_y < output_height; ++out_y) + { + for (int out_x = 0; out_x < output_width; ++out_x) + { + + // Vectorized loop over channels (depth) + size_t current_channel = 0; + while (current_channel < static_cast(depth)) + { + // Set vector length. SEW is now 16 bits. With VLEN=128, VLMAX is 8 * LMUL. + // LMUL=4 still provides a good balance, processing up to 32 channels. + size_t vl = __riscv_vsetvl_e16m4(depth - current_channel); + + // Initialize the accumulator vector with the smallest possible int16_t value. 
+ vint16m4_t v_max_s16 = __riscv_vmv_v_x_i16m4(std::numeric_limits::lowest(), vl); + + // Loop over the filter window dimensions (y, x) + for (int f_y = 0; f_y < filter_height; ++f_y) + { + for (int f_x = 0; f_x < filter_width; ++f_x) + { + // Calculate corresponding input coordinates for this filter tap + const int in_y = (out_y * stride_height) + f_y - pad_height; + const int in_x = (out_x * stride_width) + f_x - pad_width; + + // Handle padding by checking if the input coordinates are valid + if (in_y >= 0 && in_y < input_height && in_x >= 0 && in_x < input_width) + { + // If valid, calculate the pointer to the input vector + const int16_t* input_ptr = input_batch_base + + (in_y * input_y_stride) + + (in_x * depth) + + current_channel; + + // Load a vector of input values (unit-stride access) + vint16m4_t v_input_s16 = __riscv_vle16_v_i16m4(input_ptr, vl); + + // Perform the vector max operation + v_max_s16 = __riscv_vmax_vv_i16m4(v_max_s16, v_input_s16, vl); + } + } + } + + // After iterating through the filter window, apply activation clamping + v_max_s16 = __riscv_vmax_vx_i16m4(v_max_s16, output_activation_min, vl); + v_max_s16 = __riscv_vmin_vx_i16m4(v_max_s16, output_activation_max, vl); + + // Calculate the output pointer + int16_t* output_ptr = output_batch_base + + (out_y * output_y_stride) + + (out_x * depth) + + current_channel; + + // Store the final vector of maximum values (unit-stride access) + __riscv_vse16_v_i16m4(output_ptr, v_max_s16, vl); + + // Advance to the next block of channels + current_channel += vl; + } } - } - - // After iterating through the filter window, apply activation clamping - v_max_s8 = __riscv_vmax_vx_i8m4(v_max_s8, output_activation_min, vl); - v_max_s8 = __riscv_vmin_vx_i8m4(v_max_s8, output_activation_max, vl); - - // Calculate the output pointer - int8_t* output_ptr = output_batch_base + - (out_y * output_y_stride) + - (out_x * depth) + - current_channel; - - // Store the final vector of maximum values (unit-stride 
access) - __riscv_vse8_v_i8m4(output_ptr, v_max_s8, vl); - - // Advance to the next block of channels - current_channel += vl; } - } } - } } \ No newline at end of file diff --git a/tensorflow/lite/micro/kernels/riscv_vector/pooling_rvv.h b/tensorflow/lite/micro/kernels/riscv_vector/pooling_rvv.h index 99fbf05ea51..a5c818aacef 100644 --- a/tensorflow/lite/micro/kernels/riscv_vector/pooling_rvv.h +++ b/tensorflow/lite/micro/kernels/riscv_vector/pooling_rvv.h @@ -10,4 +10,8 @@ void MaxPool8BitRVV(const PoolParams& params, const RuntimeShape& input_shape, const int8_t* input_data, const RuntimeShape& output_shape, int8_t* output_data); +void MaxPool16BitRVV(const PoolParams& params, const RuntimeShape& input_shape, + const int16_t* input_data, const RuntimeShape& output_shape, + int16_t* output_data); + #endif \ No newline at end of file From c1d046fddedb6a7621d88575a4bbcb2011ae55a5 Mon Sep 17 00:00:00 2001 From: JaimeHW Date: Fri, 14 Nov 2025 05:47:14 -0600 Subject: [PATCH 49/86] Formatting --- .../micro/kernels/riscv_vector/conv_rvv.cc | 4 ++ .../riscv_vector/fully_connected_rvv.cc | 40 +++++++++++++------ .../micro/kernels/riscv_vector/pooling_rvv.cc | 5 +-- 3 files changed, 34 insertions(+), 15 deletions(-) diff --git a/tensorflow/lite/micro/kernels/riscv_vector/conv_rvv.cc b/tensorflow/lite/micro/kernels/riscv_vector/conv_rvv.cc index 8baeb83abd2..ba5e893e558 100644 --- a/tensorflow/lite/micro/kernels/riscv_vector/conv_rvv.cc +++ b/tensorflow/lite/micro/kernels/riscv_vector/conv_rvv.cc @@ -140,7 +140,9 @@ void ConvPerChannelRVV(const ConvParams& params, // Skip this filter row if input y is out of bounds if (!is_y_inside_image) + { continue; + } const int8_t* filter_y_base = filter_oc_base + (filter_y * filter_h_stride); @@ -365,7 +367,9 @@ void DepthwiseConvPerChannelRVV(const DepthwiseParams& params, // Skip this filter row if input y is out of bounds if (!is_y_inside_image) + { continue; + } const int8_t* filter_y_base = filter_data + filter_y * 
filter_h_stride; diff --git a/tensorflow/lite/micro/kernels/riscv_vector/fully_connected_rvv.cc b/tensorflow/lite/micro/kernels/riscv_vector/fully_connected_rvv.cc index 5bde9fdf00f..f1401065620 100644 --- a/tensorflow/lite/micro/kernels/riscv_vector/fully_connected_rvv.cc +++ b/tensorflow/lite/micro/kernels/riscv_vector/fully_connected_rvv.cc @@ -31,26 +31,32 @@ void FullyConnectedPerChannelRVV(const tflite::FullyConnectedParams& params, const int16_t s_input_offset_s16 = static_cast(input_offset); // Loop over batches - for (int b = 0; b < batches; ++b) { + for (int b = 0; b < batches; ++b) + { const int8_t* input_batch_ptr = input_data + b * accum_depth; int8_t* output_batch_ptr = output_data + b * output_depth; // Vectorized loop over output channels size_t current_out_c = 0; - while (current_out_c < static_cast(output_depth)) { + while (current_out_c < static_cast(output_depth)) + { // Set vector length for this iteration size_t vl = __riscv_vsetvl_e32m4(output_depth - current_out_c); // Initialize accumulator vector with biases vint32m4_t v_acc_s32; - if (bias_data) { + if (bias_data) + { v_acc_s32 = __riscv_vle32_v_i32m4(bias_data + current_out_c, vl); - } else { + } + else + { v_acc_s32 = __riscv_vmv_v_x_i32m4(0, vl); } // Main MAC loop to compute dot products - for (int d = 0; d < accum_depth; ++d) { + for (int d = 0; d < accum_depth; ++d) + { // Load scalar input value and add offset int16_t s_input_val_s16 = static_cast(input_batch_ptr[d]) + s_input_offset_s16; @@ -187,20 +193,25 @@ void FullyConnectedRVV(const FullyConnectedParams& params, // Vectorized loop over output channels size_t current_out_c = 0; - while (current_out_c < static_cast(output_depth)) { + while (current_out_c < static_cast(output_depth)) + { // Set vector length for processing multiple output channels size_t vl = __riscv_vsetvl_e32m4(output_depth - current_out_c); // Initialize accumulator vector with biases vint32m4_t v_acc_s32; - if (bias_data) { + if (bias_data) + { v_acc_s32 = 
__riscv_vle32_v_i32m4(bias_data + current_out_c, vl); - } else { + } + else + { v_acc_s32 = __riscv_vmv_v_x_i32m4(0, vl); } // Loop over accumulation depth to compute 'vl' dot products in parallel - for (int d = 0; d < accum_depth; ++d) { + for (int d = 0; d < accum_depth; ++d) + { // Load one scalar from the input vector and add offset int16_t s_input_val_s16 = static_cast(input_batch_ptr[d]) + s_input_offset_s16; @@ -239,13 +250,18 @@ void FullyConnectedRVV(const FullyConnectedParams& params, vint32m4_t v_rounded_lo = __riscv_vreinterpret_v_u32m4_i32m4(v_sum_lo_u); // Perform 64b arithmetic right shift - if (effective_right_shift == 0) { + if (effective_right_shift == 0) + { v_res32 = v_rounded_lo; - } else if (effective_right_shift > 0 && effective_right_shift < 32) { + } + else if (effective_right_shift > 0 && effective_right_shift < 32) + { vuint32m4_t v_lo_usrl = __riscv_vsrl_vx_u32m4(__riscv_vreinterpret_v_i32m4_u32m4(v_rounded_lo), effective_right_shift, vl); vint32m4_t v_hi_sll = __riscv_vsll_vx_i32m4(v_rounded_hi, 32 - effective_right_shift, vl); v_res32 = __riscv_vreinterpret_v_u32m4_i32m4(__riscv_vor_vv_u32m4(v_lo_usrl, __riscv_vreinterpret_v_i32m4_u32m4(v_hi_sll), vl)); - } else { + } + else + { int shift_hi = std::min(31, effective_right_shift - 32); v_res32 = __riscv_vsra_vx_i32m4(v_rounded_hi, shift_hi, vl); } diff --git a/tensorflow/lite/micro/kernels/riscv_vector/pooling_rvv.cc b/tensorflow/lite/micro/kernels/riscv_vector/pooling_rvv.cc index dfaf4f8cb1f..2986fdb95ac 100644 --- a/tensorflow/lite/micro/kernels/riscv_vector/pooling_rvv.cc +++ b/tensorflow/lite/micro/kernels/riscv_vector/pooling_rvv.cc @@ -43,7 +43,6 @@ void MaxPool8BitRVV(const PoolParams& params, const RuntimeShape& input_shape, { for (int out_x = 0; out_x < output_width; ++out_x) { - // Vectorized loop over channels (depth) size_t current_channel = 0; while (current_channel < static_cast(depth)) @@ -133,7 +132,8 @@ void MaxPool16BitRVV(const PoolParams& params, const RuntimeShape& 
input_shape, const int output_b_stride = output_height * output_y_stride; // Loop over batches - for (int batch = 0; batch < batches; ++batch) { + for (int batch = 0; batch < batches; ++batch) + { const int16_t* input_batch_base = input_data + batch * input_b_stride; int16_t* output_batch_base = output_data + batch * output_b_stride; @@ -142,7 +142,6 @@ void MaxPool16BitRVV(const PoolParams& params, const RuntimeShape& input_shape, { for (int out_x = 0; out_x < output_width; ++out_x) { - // Vectorized loop over channels (depth) size_t current_channel = 0; while (current_channel < static_cast(depth)) From 7bb21b1137e90f9313e36a4bb0352334e0d2865d Mon Sep 17 00:00:00 2001 From: JaimeHW Date: Fri, 14 Nov 2025 06:30:10 -0600 Subject: [PATCH 50/86] Fix bug in FullyConnectedPerChannel and refactor requantization logic --- .../micro/kernels/riscv_vector/conv_rvv.cc | 207 ++++---------- .../riscv_vector/fully_connected_rvv.cc | 259 +++++------------- .../kernels/riscv_vector/requantize_rvv.h | 169 ++++++++++++ 3 files changed, 286 insertions(+), 349 deletions(-) create mode 100644 tensorflow/lite/micro/kernels/riscv_vector/requantize_rvv.h diff --git a/tensorflow/lite/micro/kernels/riscv_vector/conv_rvv.cc b/tensorflow/lite/micro/kernels/riscv_vector/conv_rvv.cc index ba5e893e558..591e583e8c3 100644 --- a/tensorflow/lite/micro/kernels/riscv_vector/conv_rvv.cc +++ b/tensorflow/lite/micro/kernels/riscv_vector/conv_rvv.cc @@ -9,6 +9,8 @@ #include "tensorflow/lite/kernels/internal/common.h" #include "tensorflow/lite/micro/micro_log.h" +#include "requantize_rvv.h" + using namespace tflite; void ConvPerChannelRVV(const ConvParams& params, @@ -72,19 +74,19 @@ void ConvPerChannelRVV(const ConvParams& params, const int32_t s_output_activation_max_s32 = output_activation_max; // Loop over batches - for (int batch = 0; batch < input_batches; ++batch) + for (int batch = 0; batch < input_batches; ++batch) { const int8_t* input_batch_base = input_data + batch * input_b_stride; int8_t* 
output_batch_base = output_data + batch * output_b_stride; // Loop over output height - for (int out_y = 0; out_y < output_height; ++out_y) + for (int out_y = 0; out_y < output_height; ++out_y) { const int in_y_origin = (out_y * stride_height) - pad_height; int8_t* output_row_base = output_batch_base + out_y * output_h_stride; // Loop over output channels - for (int out_channel = 0; out_channel < output_depth; ++out_channel) + for (int out_channel = 0; out_channel < output_depth; ++out_channel) { // Calculate group and filter parameters for this output channel const int group = out_channel / filters_per_group; @@ -99,32 +101,20 @@ void ConvPerChannelRVV(const ConvParams& params, // Get bias value for this output channel const int32_t bias_val = bias_data ? bias_data[out_channel] : 0; - // Calculate rounding constants for requantization - int64_t rounding_val = (effective_right_shift > 0) ? (INT64_C(1) << (effective_right_shift - 1)) : 0; - int32_t rounding_lo = static_cast(rounding_val); - int32_t rounding_hi = static_cast((rounding_val >> 32)); - // Calculate output pointer and stride for this channel row int8_t* output_channel_base = output_row_base + out_channel * output_ch_stride; const ptrdiff_t output_x_stride_bytes = output_w_stride * sizeof(int8_t); // Process output width in vector chunks size_t current_out_x = 0; - while (current_out_x < static_cast(output_width)) + while (current_out_x < static_cast(output_width)) { // Set vector length for this iteration size_t vl = __riscv_vsetvl_e32m4(output_width - current_out_x); // Initialize accumulator vector with bias - vint32m4_t v_acc_s32; - if (bias_data) - { - v_acc_s32 = __riscv_vmv_v_x_i32m4(bias_val, vl); - } - else - { - v_acc_s32 = __riscv_vmv_v_x_i32m4(0, vl); - } + vint32m4_t v_acc_s32 = bias_data ? 
__riscv_vmv_v_x_i32m4(bias_val, vl) + : __riscv_vmv_v_x_i32m4(0, vl); // Calculate base input x coordinates for the vector lanes vuint32m4_t v_idx = __riscv_vid_v_u32m4(vl); @@ -132,31 +122,21 @@ void ConvPerChannelRVV(const ConvParams& params, vint32m4_t v_in_x_origin_base = __riscv_vsub_vx_i32m4(__riscv_vmul_vx_i32m4(v_out_x, stride_width, vl), pad_width, vl); // Loop over filter height - for (int filter_y = 0; filter_y < filter_height; ++filter_y) + for (int filter_y = 0; filter_y < filter_height; ++filter_y) { - // Calculate input y coordinate and check bounds const int in_y = in_y_origin + dilation_height_factor * filter_y; - const bool is_y_inside_image = (in_y >= 0) && (in_y < input_height); - - // Skip this filter row if input y is out of bounds - if (!is_y_inside_image) - { - continue; - } + if (in_y < 0 || in_y >= input_height) continue; // Simplified boundary check const int8_t* filter_y_base = filter_oc_base + (filter_y * filter_h_stride); // Loop over filter width - for (int filter_x = 0; filter_x < filter_width; ++filter_x) + for (int filter_x = 0; filter_x < filter_width; ++filter_x) { - // Calculate input x offset and filter patch base pointer const int in_x_offset = dilation_width_factor * filter_x; const int8_t* filter_patch_base = filter_y_base + (filter_x * filter_w_stride); - - // Calculate input x coordinates for the vector lanes for this filter tap vint32m4_t v_in_x = __riscv_vadd_vx_i32m4(v_in_x_origin_base, in_x_offset, vl); - // Create mask for valid input coordinates (within image width bounds) + // Create mask for valid input coordinates vbool8_t v_mask_ge_zero = __riscv_vmsge_vx_i32m4_b8(v_in_x, 0, vl); vbool8_t v_mask_lt_width = __riscv_vmslt_vx_i32m4_b8(v_in_x, input_width, vl); vbool8_t v_active_lane_mask_b8 = __riscv_vmand_mm_b8(v_mask_ge_zero, v_mask_lt_width, vl); @@ -168,65 +148,29 @@ void ConvPerChannelRVV(const ConvParams& params, ptrdiff_t input_x_stride_bytes = static_cast(stride_width) * input_w_stride * sizeof(int8_t); // 
Loop over input channels for this filter tap - for (int ic = 0; ic < filter_input_depth; ++ic) + for (int ic = 0; ic < filter_input_depth; ++ic) { - // Load scalar filter value int8_t s_filter_val_s8 = filter_patch_base[ic * filter_ch_stride]; int16_t s_filter_val_s16 = static_cast(s_filter_val_s8); - - // Calculate input pointer for this channel const int8_t* input_ic_ptr = input_base_for_y_x_patch + (ic * input_ch_stride); - - // Load input vector (masked, strided), widen, add offset vint8m1_t v_input_s8 = __riscv_vlse8_v_i8m1_m(v_active_lane_mask_b8, input_ic_ptr, input_x_stride_bytes, vl); vint16m2_t v_input_s16 = __riscv_vsext_vf2_i16m2_m(v_active_lane_mask_b8, v_input_s8, vl); vint16m2_t v_input_plus_offset_s16 = __riscv_vadd_vx_i16m2_m(v_active_lane_mask_b8, v_input_s16, s_input_offset_s16, vl); - - // Perform widening multiply-accumulate (masked) v_acc_s32 = __riscv_vwmacc_vx_i32m4_m(v_active_lane_mask_b8, v_acc_s32, s_filter_val_s16, v_input_plus_offset_s16, vl); } } } - // Start Vectorized Requantization - vint32m4_t v_res32; - - // Multiply accumulator by scalar multiplier (results in 64b intermediate) - vint32m4_t v_prod_lo = __riscv_vmul_vx_i32m4(v_acc_s32, scalar_multiplier, vl); - vint32m4_t v_prod_hi = __riscv_vmulh_vx_i32m4(v_acc_s32, scalar_multiplier, vl); - - // Add 64b rounding value using 32b operations with carry - vuint32m4_t v_acc_lo_u = __riscv_vreinterpret_v_i32m4_u32m4(v_prod_lo); - vuint32m4_t v_sum_lo_u = __riscv_vadd_vx_u32m4(v_acc_lo_u, rounding_lo, vl); - vbool8_t v_carry = __riscv_vmsltu_vx_u32m4_b8(v_sum_lo_u, rounding_lo, vl); - vint32m4_t v_rounded_hi = __riscv_vadd_vx_i32m4(v_prod_hi, rounding_hi, vl); - v_rounded_hi = __riscv_vadd_vx_i32m4_m(v_carry, v_rounded_hi, 1, vl); - vint32m4_t v_rounded_lo = __riscv_vreinterpret_v_u32m4_i32m4(v_sum_lo_u); - - // Perform 64b arithmetic right shift using 32b vector shifts - if (effective_right_shift == 0) - { - v_res32 = v_rounded_lo; - } - else if (effective_right_shift > 0 && 
effective_right_shift < 32) - { - vuint32m4_t v_lo_usrl = __riscv_vsrl_vx_u32m4(__riscv_vreinterpret_v_i32m4_u32m4(v_rounded_lo), effective_right_shift, vl); - vint32m4_t v_hi_sll = __riscv_vsll_vx_i32m4(v_rounded_hi, 32 - effective_right_shift, vl); - v_res32 = __riscv_vreinterpret_v_u32m4_i32m4(__riscv_vor_vv_u32m4(v_lo_usrl, __riscv_vreinterpret_v_i32m4_u32m4(v_hi_sll), vl)); - } - else - { - int shift_hi = std::min(31, effective_right_shift - 32); - v_res32 = __riscv_vsra_vx_i32m4(v_rounded_hi, shift_hi, vl); - } - - // Add output offset - v_res32 = __riscv_vadd_vx_i32m4(v_res32, s_output_offset_s32, vl); - - // Clamp to activation bounds - v_res32 = __riscv_vmax_vx_i32m4(v_res32, s_output_activation_min_s32, vl); - v_res32 = __riscv_vmin_vx_i32m4(v_res32, s_output_activation_max_s32, vl); - + // Requantize the accumulated values in a single function call. + vint32m4_t v_res32 = RequantizeVectorPerTensorS32( + v_acc_s32, + scalar_multiplier, + effective_right_shift, + s_output_offset_s32, + s_output_activation_min_s32, + s_output_activation_max_s32, + vl); + // Narrow result to int16 and then int8 with saturation vint16m2_t v_res16 = __riscv_vnclip_wx_i16m2(v_res32, 0, __RISCV_VXRM_RNU, vl); vint8m1_t v_out_s8 = __riscv_vnclip_wx_i8m1(v_res16, 0, __RISCV_VXRM_RNU, vl); @@ -299,21 +243,21 @@ void DepthwiseConvPerChannelRVV(const DepthwiseParams& params, const int32_t s_output_activation_max_s32 = output_activation_max; // Loop over batches - for (int batch = 0; batch < input_batches; ++batch) + for (int batch = 0; batch < input_batches; ++batch) { const int8_t* input_batch_base = input_data + batch * input_b_stride; int8_t* output_batch_base = output_data + batch * output_b_stride; // Loop over output height - for (int out_y = 0; out_y < output_height; ++out_y) + for (int out_y = 0; out_y < output_height; ++out_y) { const int in_y_origin = (out_y * stride_height) - pad_height; // Loop over input channels (depthwise) - for (int in_channel = 0; in_channel < 
input_depth; ++in_channel) + for (int in_channel = 0; in_channel < input_depth; ++in_channel) { // Loop over depth multiplier - for (int m = 0; m < depth_multiplier; ++m) + for (int m = 0; m < depth_multiplier; ++m) { // Calculate the current output channel const int output_channel = m + in_channel * depth_multiplier; @@ -326,32 +270,20 @@ void DepthwiseConvPerChannelRVV(const DepthwiseParams& params, // Get bias value for this output channel const int32_t bias_val = bias_data ? bias_data[output_channel] : 0; - // Calculate rounding constants for requantization - int64_t rounding_val = (effective_right_shift > 0) ? (INT64_C(1) << (effective_right_shift - 1)) : 0; - int32_t rounding_lo = static_cast(rounding_val); - int32_t rounding_hi = static_cast((rounding_val) >> 32); - // Calculate output pointer and stride for this channel row int8_t* output_channel_row_base = output_batch_base + out_y * output_h_stride + output_channel * output_ch_stride; const ptrdiff_t output_x_stride_bytes = output_w_stride * sizeof(int8_t); // Process output width in vector chunks size_t current_out_x = 0; - while (current_out_x < static_cast(output_width)) + while (current_out_x < static_cast(output_width)) { // Set vector length for this iteration size_t vl = __riscv_vsetvl_e32m4(output_width - current_out_x); // Initialize accumulator vector with bias - vint32m4_t v_acc_s32; - if (bias_data) - { - v_acc_s32 = __riscv_vmv_v_x_i32m4(bias_val, vl); - } - else - { - v_acc_s32 = __riscv_vmv_v_x_i32m4(0, vl); - } + vint32m4_t v_acc_s32 = bias_data ? 
__riscv_vmv_v_x_i32m4(bias_val, vl) + : __riscv_vmv_v_x_i32m4(0, vl); // Calculate base input x coordinates for the vector lanes vuint32m4_t v_idx = __riscv_vid_v_u32m4(vl); @@ -359,97 +291,52 @@ void DepthwiseConvPerChannelRVV(const DepthwiseParams& params, vint32m4_t v_in_x_origin_base = __riscv_vsub_vx_i32m4(__riscv_vmul_vx_i32m4(v_out_x, stride_width, vl), pad_width, vl); // Loop over filter height - for (int filter_y = 0; filter_y < filter_height; ++filter_y) + for (int filter_y = 0; filter_y < filter_height; ++filter_y) { - // Calculate input y coordinate and check bounds const int in_y = in_y_origin + dilation_height_factor * filter_y; - const bool is_y_inside_image = (in_y >= 0) && (in_y < input_height); - - // Skip this filter row if input y is out of bounds - if (!is_y_inside_image) - { - continue; - } + if (in_y < 0 || in_y >= input_height) continue; const int8_t* filter_y_base = filter_data + filter_y * filter_h_stride; // Loop over filter width - for (int filter_x = 0; filter_x < filter_width; ++filter_x) + for (int filter_x = 0; filter_x < filter_width; ++filter_x) { - // Calculate input x coordinates for the vector lanes for this filter tap const int in_x_offset = dilation_width_factor * filter_x; vint32m4_t v_in_x = __riscv_vadd_vx_i32m4(v_in_x_origin_base, in_x_offset, vl); - // Create mask for valid input coordinates (within image width bounds) + // Create mask for valid input coordinates vbool8_t v_mask_ge_zero = __riscv_vmsge_vx_i32m4_b8(v_in_x, 0, vl); vbool8_t v_mask_lt_width = __riscv_vmslt_vx_i32m4_b8(v_in_x, input_width, vl); vbool8_t v_active_lane_mask_b8 = __riscv_vmand_mm_b8(v_mask_ge_zero, v_mask_lt_width, vl); - // Skip MAC calculation if all lanes are masked off for this tap - uint32_t first_mask_bit = __riscv_vfirst_m_b8(v_active_lane_mask_b8, vl); - if (first_mask_bit == static_cast(-1) && vl > 0) - continue; + // Optimization: skip MAC if all lanes are masked off + if (__riscv_vfirst_m_b8(v_active_lane_mask_b8, vl) == -1) continue; 
- // Load scalar filter value for this tap and output channel const int8_t* filter_ptr = filter_y_base + filter_x * filter_w_stride + output_channel * filter_ch_stride; - int8_t s_filter_val_s8 = *filter_ptr; - int16_t s_filter_val_s16 = static_cast(s_filter_val_s8); + int16_t s_filter_val_s16 = static_cast(*filter_ptr); - // Calculate base input pointer and stride for vector load (using in_channel) int32_t base_in_x_for_vector0 = static_cast(current_out_x) * stride_width - pad_width + in_x_offset; const int8_t* input_base_ptr = input_batch_base + in_y * input_h_stride + base_in_x_for_vector0 * input_w_stride + in_channel * input_ch_stride; ptrdiff_t input_x_stride_bytes = static_cast(stride_width) * input_w_stride * sizeof(int8_t); - // Load input vector (masked, strided), widen, add offset vint8m1_t v_input_s8 = __riscv_vlse8_v_i8m1_m(v_active_lane_mask_b8, input_base_ptr, input_x_stride_bytes, vl); vint16m2_t v_input_s16 = __riscv_vsext_vf2_i16m2_m(v_active_lane_mask_b8, v_input_s8, vl); vint16m2_t v_input_plus_offset_s16 = __riscv_vadd_vx_i16m2_m(v_active_lane_mask_b8, v_input_s16, s_input_offset_s16, vl); - - // Perform widening multiply-accumulate (masked) v_acc_s32 = __riscv_vwmacc_vx_i32m4_m(v_active_lane_mask_b8, v_acc_s32, s_filter_val_s16, v_input_plus_offset_s16, vl); } } - // Start Vectorized Requantization - vint32m4_t v_res32; - - // Multiply accumulator by scalar multiplier (results in 64b intermediate) - vint32m4_t v_prod_lo = __riscv_vmul_vx_i32m4(v_acc_s32, scalar_multiplier, vl); - vint32m4_t v_prod_hi = __riscv_vmulh_vx_i32m4(v_acc_s32, scalar_multiplier, vl); - - // Add 64b rounding value using 32b operations with carry - vuint32m4_t v_acc_lo_u = __riscv_vreinterpret_v_i32m4_u32m4(v_prod_lo); - vuint32m4_t v_sum_lo_u = __riscv_vadd_vx_u32m4(v_acc_lo_u, rounding_lo, vl); - vbool8_t v_carry = __riscv_vmsltu_vx_u32m4_b8(v_sum_lo_u, rounding_lo, vl); - vint32m4_t v_rounded_hi = __riscv_vadd_vx_i32m4(v_prod_hi, rounding_hi, vl); - v_rounded_hi = 
__riscv_vadd_vx_i32m4_m(v_carry, v_rounded_hi, 1, vl); - vint32m4_t v_rounded_lo = __riscv_vreinterpret_v_u32m4_i32m4(v_sum_lo_u); - - // Perform 64b arithmetic right shift using 32b vector shifts - if (effective_right_shift == 0) - { - v_res32 = v_rounded_lo; - } - else if (effective_right_shift > 0 && effective_right_shift < 32) - { - vuint32m4_t v_lo_usrl = __riscv_vsrl_vx_u32m4(__riscv_vreinterpret_v_i32m4_u32m4(v_rounded_lo), effective_right_shift, vl); - vint32m4_t v_hi_sll = __riscv_vsll_vx_i32m4(v_rounded_hi, 32 - effective_right_shift, vl); - v_res32 = __riscv_vreinterpret_v_u32m4_i32m4(__riscv_vor_vv_u32m4(v_lo_usrl, __riscv_vreinterpret_v_i32m4_u32m4(v_hi_sll), vl)); - } - else - { - int shift_hi = std::min(31, effective_right_shift - 32); - v_res32 = __riscv_vsra_vx_i32m4(v_rounded_hi, shift_hi, vl); - } - - // Add output offset - v_res32 = __riscv_vadd_vx_i32m4(v_res32, s_output_offset_s32, vl); - - // Clamp to activation bounds - v_res32 = __riscv_vmax_vx_i32m4(v_res32, s_output_activation_min_s32, vl); - v_res32 = __riscv_vmin_vx_i32m4(v_res32, s_output_activation_max_s32, vl); - + // Requantize the accumulated values in a single function call. 
+ vint32m4_t v_res32 = RequantizeVectorPerTensorS32( + v_acc_s32, + scalar_multiplier, + effective_right_shift, + s_output_offset_s32, + s_output_activation_min_s32, + s_output_activation_max_s32, + vl); + // Narrow result to int16 and then int8 with saturation vint16m2_t v_res16 = __riscv_vnclip_wx_i16m2(v_res32, 0, __RISCV_VXRM_RNU, vl); vint8m1_t v_out_s8 = __riscv_vnclip_wx_i8m1(v_res16, 0, __RISCV_VXRM_RNU, vl); diff --git a/tensorflow/lite/micro/kernels/riscv_vector/fully_connected_rvv.cc b/tensorflow/lite/micro/kernels/riscv_vector/fully_connected_rvv.cc index f1401065620..3e1110596a1 100644 --- a/tensorflow/lite/micro/kernels/riscv_vector/fully_connected_rvv.cc +++ b/tensorflow/lite/micro/kernels/riscv_vector/fully_connected_rvv.cc @@ -3,156 +3,85 @@ #include "tensorflow/lite/kernels/internal/common.h" #include "tensorflow/lite/micro/micro_log.h" +#include "requantize_rvv.h" + using namespace tflite; -void FullyConnectedPerChannelRVV(const tflite::FullyConnectedParams& params, +void FullyConnectedPerChannelRVV(const FullyConnectedParams& params, const int32_t* output_multiplier, const int* output_shift, - const tflite::RuntimeShape& input_shape, + const RuntimeShape& input_shape, const int8_t* input_data, - const tflite::RuntimeShape& filter_shape, + const RuntimeShape& filter_shape, const int8_t* filter_data, - const tflite::RuntimeShape& bias_shape, + const RuntimeShape& bias_shape, const int32_t* bias_data, - const tflite::RuntimeShape& output_shape, + const RuntimeShape& output_shape, int8_t* output_data) { - // Extract quantization parameters - const int32_t input_offset = params.input_offset; - const int32_t output_offset = params.output_offset; - const int32_t output_activation_min = params.quantized_activation_min; - const int32_t output_activation_max = params.quantized_activation_max; - - // Extract shape dimensions - const int batches = FlatSizeSkipDim(output_shape, output_shape.DimensionsCount() - 1); - const int output_depth = 
output_shape.Dims(output_shape.DimensionsCount() - 1); - const int accum_depth = filter_shape.Dims(filter_shape.DimensionsCount() - 1); + // Extract quantization parameters + const int32_t input_offset = params.input_offset; + const int32_t output_offset = params.output_offset; + const int32_t output_activation_min = params.quantized_activation_min; + const int32_t output_activation_max = params.quantized_activation_max; - // Prepare scalar constants - const int16_t s_input_offset_s16 = static_cast(input_offset); + // Extract shape dimensions + const int batches = FlatSizeSkipDim(output_shape, output_shape.DimensionsCount() - 1); + const int output_depth = output_shape.Dims(output_shape.DimensionsCount() - 1); + const int accum_depth = filter_shape.Dims(filter_shape.DimensionsCount() - 1); - // Loop over batches - for (int b = 0; b < batches; ++b) - { - const int8_t* input_batch_ptr = input_data + b * accum_depth; - int8_t* output_batch_ptr = output_data + b * output_depth; + // Prepare scalar constants + const int16_t s_input_offset_s16 = static_cast(input_offset); - // Vectorized loop over output channels - size_t current_out_c = 0; - while (current_out_c < static_cast(output_depth)) + // Loop over batches + for (int b = 0; b < batches; ++b) { - // Set vector length for this iteration - size_t vl = __riscv_vsetvl_e32m4(output_depth - current_out_c); - - // Initialize accumulator vector with biases - vint32m4_t v_acc_s32; - if (bias_data) - { - v_acc_s32 = __riscv_vle32_v_i32m4(bias_data + current_out_c, vl); - } - else - { - v_acc_s32 = __riscv_vmv_v_x_i32m4(0, vl); - } - - // Main MAC loop to compute dot products - for (int d = 0; d < accum_depth; ++d) - { - // Load scalar input value and add offset - int16_t s_input_val_s16 = static_cast(input_batch_ptr[d]) + s_input_offset_s16; - - // Calculate filter pointer and stride for the current column - const int8_t* filter_col_ptr = filter_data + d + current_out_c * accum_depth; - ptrdiff_t filter_stride = accum_depth 
* sizeof(int8_t); - - // Load filter vector, widen, and perform widening multiply-accumulate - vint8m1_t v_filter_s8 = __riscv_vlse8_v_i8m1(filter_col_ptr, filter_stride, vl); - vint16m2_t v_filter_s16 = __riscv_vsext_vf2_i16m2(v_filter_s8, vl); - v_acc_s32 = __riscv_vwmacc_vx_i32m4(v_acc_s32, s_input_val_s16, v_filter_s16, vl); - } - - // Start of fully vectorized per-channel requantization - vint32m4_t v_res32; - - // Load per-channel requantization parameters into vectors - vint32m4_t v_multiplier = __riscv_vle32_v_i32m4(output_multiplier + current_out_c, vl); - vint32m4_t v_shift = __riscv_vle32_v_i32m4(reinterpret_cast(output_shift) + current_out_c, vl); - - // Create a mask for lanes that require a right shift (where shift > 0) - vbool8_t v_mask_right_shift = __riscv_vmsgt_vx_i32m4_b8(v_shift, 0, vl); - - // Path 1: Right Shift (for lanes where shift > 0) - vint32m4_t v_res_right; - { - // Calculate the 64-bit product of accumulator and multiplier - vint32m4_t v_prod_hi = __riscv_vmulh_vv_i32m4_m(v_mask_right_shift, v_acc_s32, v_multiplier, vl); - vint32m4_t v_prod_lo = __riscv_vmul_vv_i32m4_m(v_mask_right_shift, v_acc_s32, v_multiplier, vl); - - // Calculate the 64-bit rounding value: (1 << (shift - 1)) - vint32m4_t v_shift_minus_1 = __riscv_vsub_vx_i32m4_m(v_mask_right_shift, v_shift, 1, vl); - vuint32m4_t v_one_u = __riscv_vmv_v_x_u32m4(1, vl); - vuint32m4_t v_rounding_u = __riscv_vsll_vv_u32m4_m(v_mask_right_shift, v_one_u, __riscv_vreinterpret_v_i32m4_u32m4(v_shift_minus_1), vl); - - // Add the 64-bit rounding value to the 64-bit product - vuint32m4_t v_prod_lo_u = __riscv_vreinterpret_v_i32m4_u32m4(v_prod_lo); - vuint32m4_t v_sum_lo_u = __riscv_vadd_vv_u32m4_m(v_mask_right_shift, v_prod_lo_u, v_rounding_u, vl); - vbool8_t v_carry = __riscv_vmsltu_vv_u32m4_b8_m(v_mask_right_shift, v_sum_lo_u, v_prod_lo_u, vl); - vint32m4_t v_rounded_hi = __riscv_vadd_vx_i32m4_m(v_carry, v_prod_hi, 1, vl); - - // Create a mask to select between the two 64-bit shift 
emulation paths - vbool8_t v_mask_shift_lt_32 = __riscv_vmslt_vx_i32m4_b8_m(v_mask_right_shift, v_shift, 32, vl); - - // Sub-path A: Emulate 64-bit shift for 0 < shift < 32 - vint32m4_t v_res_lt_32; - { - vuint32m4_t v_shift_u = __riscv_vreinterpret_v_i32m4_u32m4(v_shift); - vuint32m4_t v_shift_rev_u = __riscv_vrsub_vx_u32m4_m(v_mask_shift_lt_32, v_shift_u, 32, vl); - vuint32m4_t v_lo_part = __riscv_vsrl_vv_u32m4_m(v_mask_shift_lt_32, v_sum_lo_u, v_shift_u, vl); - vuint32m4_t v_hi_part = __riscv_vsll_vv_u32m4_m(v_mask_shift_lt_32, __riscv_vreinterpret_v_i32m4_u32m4(v_rounded_hi), v_shift_rev_u, vl); - v_res_lt_32 = __riscv_vreinterpret_v_u32m4_i32m4(__riscv_vor_vv_u32m4_m(v_mask_shift_lt_32, v_lo_part, v_hi_part, vl)); - } + const int8_t* input_batch_ptr = input_data + b * accum_depth; + int8_t* output_batch_ptr = output_data + b * output_depth; - // Sub-path B: Emulate 64-bit shift for shift >= 32 - vint32m4_t v_res_ge_32; + // Vectorized loop over output channels + size_t current_out_c = 0; + while (current_out_c < static_cast(output_depth)) { - vbool8_t v_mask_shift_ge_32 = __riscv_vmandn_mm_b8(v_mask_right_shift, v_mask_shift_lt_32, vl); - vint32m4_t v_shift_hi = __riscv_vsub_vx_i32m4_m(v_mask_shift_ge_32, v_shift, 32, vl); - v_shift_hi = __riscv_vmin_vx_i32m4_m(v_mask_shift_ge_32, v_shift_hi, 31, vl); // Clamp to 31 - v_res_ge_32 = __riscv_vsra_vv_i32m4_m(v_mask_shift_ge_32, v_rounded_hi, __riscv_vreinterpret_v_i32m4_u32m4(v_shift_hi), vl); - } - - // Merge the results from the two 64-bit shift sub-paths - v_res_right = __riscv_vmerge_vvm_i32m4(v_res_ge_32, v_res_lt_32, v_mask_shift_lt_32, vl); - } - - // Path 2: Left Shift (for lanes where shift <= 0) - vint32m4_t v_res_left; - { - // Negate the shift amount and perform a left shift on the accumulator - vint32m4_t v_neg_shift = __riscv_vneg_v_i32m4(v_shift, vl); - v_res_left = __riscv_vsll_vv_i32m4(v_acc_s32, __riscv_vreinterpret_v_i32m4_u32m4(v_neg_shift), vl); - } - - // Merge the results from the right and 
left shift paths - v_res32 = __riscv_vmerge_vvm_i32m4(v_res_left, v_res_right, v_mask_right_shift, vl); + // Set vector length for this iteration + size_t vl = __riscv_vsetvl_e32m4(output_depth - current_out_c); - // Add the final output offset - v_res32 = __riscv_vadd_vx_i32m4(v_res32, output_offset, vl); + // Initialize accumulator vector with biases + vint32m4_t v_acc_s32 = bias_data + ? __riscv_vle32_v_i32m4(bias_data + current_out_c, vl) + : __riscv_vmv_v_x_i32m4(0, vl); - // Clamp the results to the activation range - v_res32 = __riscv_vmax_vx_i32m4(v_res32, output_activation_min, vl); - v_res32 = __riscv_vmin_vx_i32m4(v_res32, output_activation_max, vl); + // Main MAC loop to compute dot products + for (int d = 0; d < accum_depth; ++d) + { + int16_t s_input_val_s16 = static_cast(input_batch_ptr[d]) + s_input_offset_s16; + const int8_t* filter_col_ptr = filter_data + d + current_out_c * accum_depth; + ptrdiff_t filter_stride = accum_depth * sizeof(int8_t); + vint8m1_t v_filter_s8 = __riscv_vlse8_v_i8m1(filter_col_ptr, filter_stride, vl); + vint16m2_t v_filter_s16 = __riscv_vsext_vf2_i16m2(v_filter_s8, vl); + v_acc_s32 = __riscv_vwmacc_vx_i32m4(v_acc_s32, s_input_val_s16, v_filter_s16, vl); + } - // Narrow the 32-bit results to 16-bit, then 8-bit with saturation - vint16m2_t v_res16 = __riscv_vnclip_wx_i16m2(v_res32, 0, __RISCV_VXRM_RNU, vl); - vint8m1_t v_out_s8 = __riscv_vnclip_wx_i8m1(v_res16, 0, __RISCV_VXRM_RNU, vl); + // Load per-channel requantization parameters into vectors + vint32m4_t v_multiplier = __riscv_vle32_v_i32m4(output_multiplier + current_out_c, vl); + vint32m4_t v_shift = __riscv_vle32_v_i32m4( + reinterpret_cast(output_shift) + current_out_c, vl); + + // Requantize the accumulated values using the fully vectorized helper. 
+ vint32m4_t v_res32 = RequantizeVectorPerChannelS32( + v_acc_s32, v_multiplier, v_shift, + output_offset, output_activation_min, output_activation_max, vl); + + // Narrow the 32-bit results to 16-bit, then 8-bit with saturation + vint16m2_t v_res16 = __riscv_vnclip_wx_i16m2(v_res32, 0, __RISCV_VXRM_RNU, vl); + vint8m1_t v_out_s8 = __riscv_vnclip_wx_i8m1(v_res16, 0, __RISCV_VXRM_RNU, vl); - // Store the final 8-bit output vector - __riscv_vse8_v_i8m1(output_batch_ptr + current_out_c, v_out_s8, vl); + // Store the final 8-bit output vector + __riscv_vse8_v_i8m1(output_batch_ptr + current_out_c, v_out_s8, vl); - // Advance to the next block of output channels - current_out_c += vl; + // Advance to the next block of output channels + current_out_c += vl; + } } - } } void FullyConnectedRVV(const FullyConnectedParams& params, @@ -199,79 +128,31 @@ void FullyConnectedRVV(const FullyConnectedParams& params, size_t vl = __riscv_vsetvl_e32m4(output_depth - current_out_c); // Initialize accumulator vector with biases - vint32m4_t v_acc_s32; - if (bias_data) - { - v_acc_s32 = __riscv_vle32_v_i32m4(bias_data + current_out_c, vl); - } - else - { - v_acc_s32 = __riscv_vmv_v_x_i32m4(0, vl); - } + vint32m4_t v_acc_s32 = bias_data + ? 
__riscv_vle32_v_i32m4(bias_data + current_out_c, vl) + : __riscv_vmv_v_x_i32m4(0, vl); // Loop over accumulation depth to compute 'vl' dot products in parallel for (int d = 0; d < accum_depth; ++d) { - // Load one scalar from the input vector and add offset int16_t s_input_val_s16 = static_cast(input_batch_ptr[d]) + s_input_offset_s16; - - // Load a vector of 'vl' filter values (a column slice) const int8_t* filter_col_ptr = filter_data + current_out_c * accum_depth + d; ptrdiff_t filter_stride = accum_depth * sizeof(int8_t); vint8m1_t v_filter_s8 = __riscv_vlse8_v_i8m1(filter_col_ptr, filter_stride, vl); - - // Widen filter values and add filter offset vint16m2_t v_filter_s16 = __riscv_vsext_vf2_i16m2(v_filter_s8, vl); vint16m2_t v_filter_plus_offset_s16 = __riscv_vadd_vx_i16m2(v_filter_s16, s_filter_offset_s16, vl); - - // Perform widening vector-scalar multiply-accumulate v_acc_s32 = __riscv_vwmacc_vx_i32m4(v_acc_s32, s_input_val_s16, v_filter_plus_offset_s16, vl); } - // Start of inline vectorized requantization - vint32m4_t v_res32; const int effective_right_shift = 31 - output_shift; - - // Calculate rounding constants - int64_t rounding_val = (effective_right_shift > 0) ? 
(INT64_C(1) << (effective_right_shift - 1)) : 0; - int32_t rounding_lo = static_cast(rounding_val); - int32_t rounding_hi = static_cast((rounding_val >> 32)); - - // Multiply accumulator by scalar multiplier (results in 64b intermediate) - vint32m4_t v_prod_lo = __riscv_vmul_vx_i32m4(v_acc_s32, output_multiplier, vl); - vint32m4_t v_prod_hi = __riscv_vmulh_vx_i32m4(v_acc_s32, output_multiplier, vl); - - // Add 64b rounding value - vuint32m4_t v_acc_lo_u = __riscv_vreinterpret_v_i32m4_u32m4(v_prod_lo); - vuint32m4_t v_sum_lo_u = __riscv_vadd_vx_u32m4(v_acc_lo_u, rounding_lo, vl); - vbool8_t v_carry = __riscv_vmsltu_vx_u32m4_b8(v_sum_lo_u, rounding_lo, vl); - vint32m4_t v_rounded_hi = __riscv_vadd_vx_i32m4(v_prod_hi, rounding_hi, vl); - v_rounded_hi = __riscv_vadd_vx_i32m4_m(v_carry, v_rounded_hi, 1, vl); - vint32m4_t v_rounded_lo = __riscv_vreinterpret_v_u32m4_i32m4(v_sum_lo_u); - - // Perform 64b arithmetic right shift - if (effective_right_shift == 0) - { - v_res32 = v_rounded_lo; - } - else if (effective_right_shift > 0 && effective_right_shift < 32) - { - vuint32m4_t v_lo_usrl = __riscv_vsrl_vx_u32m4(__riscv_vreinterpret_v_i32m4_u32m4(v_rounded_lo), effective_right_shift, vl); - vint32m4_t v_hi_sll = __riscv_vsll_vx_i32m4(v_rounded_hi, 32 - effective_right_shift, vl); - v_res32 = __riscv_vreinterpret_v_u32m4_i32m4(__riscv_vor_vv_u32m4(v_lo_usrl, __riscv_vreinterpret_v_i32m4_u32m4(v_hi_sll), vl)); - } - else - { - int shift_hi = std::min(31, effective_right_shift - 32); - v_res32 = __riscv_vsra_vx_i32m4(v_rounded_hi, shift_hi, vl); - } - - // Add output offset - v_res32 = __riscv_vadd_vx_i32m4(v_res32, output_offset, vl); - - // Clamp to activation bounds - v_res32 = __riscv_vmax_vx_i32m4(v_res32, output_activation_min, vl); - v_res32 = __riscv_vmin_vx_i32m4(v_res32, output_activation_max, vl); + vint32m4_t v_res32 = RequantizeVectorPerTensorS32( + v_acc_s32, + output_multiplier, + effective_right_shift, + output_offset, + output_activation_min, + 
output_activation_max, + vl); // Narrow result to int8 and store vint16m2_t v_res16 = __riscv_vnclip_wx_i16m2(v_res32, 0, __RISCV_VXRM_RNU, vl); diff --git a/tensorflow/lite/micro/kernels/riscv_vector/requantize_rvv.h b/tensorflow/lite/micro/kernels/riscv_vector/requantize_rvv.h new file mode 100644 index 00000000000..c791b7613d7 --- /dev/null +++ b/tensorflow/lite/micro/kernels/riscv_vector/requantize_rvv.h @@ -0,0 +1,169 @@ +#ifndef TENSORFLOW_LITE_MICRO_KERNELS_RISCV_VECTOR_COMMON_RVV_H_ +#define TENSORFLOW_LITE_MICRO_KERNELS_RISCV_VECTOR_COMMON_RVV_H_ + +inline vint32m4_t RequantizeVectorPerTensorS32( + vint32m4_t v_acc, const int32_t multiplier, const int effective_right_shift, + const int32_t output_offset, const int32_t activation_min, + const int32_t activation_max, const size_t vl) +{ + // Calculate rounding constants for the 64-bit shift + const int64_t rounding_val = + (effective_right_shift > 0) + ? (INT64_C(1) << (effective_right_shift - 1)) + : 0; + const int32_t rounding_lo = static_cast(rounding_val); + const int32_t rounding_hi = static_cast((rounding_val >> 32)); + + // Multiply accumulator by scalar multiplier (results in 64b intermediate) + vint32m4_t v_prod_lo = __riscv_vmul_vx_i32m4(v_acc, multiplier, vl); + vint32m4_t v_prod_hi = __riscv_vmulh_vx_i32m4(v_acc, multiplier, vl); + + // Add 64b rounding value using 32b operations with carry + vuint32m4_t v_prod_lo_u = __riscv_vreinterpret_v_i32m4_u32m4(v_prod_lo); + vuint32m4_t v_sum_lo_u = __riscv_vadd_vx_u32m4(v_prod_lo_u, rounding_lo, vl); + vbool8_t v_carry = __riscv_vmsltu_vx_u32m4_b8(v_sum_lo_u, rounding_lo, vl); + vint32m4_t v_rounded_hi = __riscv_vadd_vx_i32m4(v_prod_hi, rounding_hi, vl); + v_rounded_hi = __riscv_vadd_vx_i32m4_m(v_carry, v_rounded_hi, 1, vl); + vint32m4_t v_rounded_lo = __riscv_vreinterpret_v_u32m4_i32m4(v_sum_lo_u); + + // Perform 64b arithmetic right shift using 32b vector shifts + vint32m4_t v_res32; + if (effective_right_shift == 0) + { + v_res32 = v_rounded_lo; + } + 
else if (effective_right_shift > 0 && effective_right_shift < 32) + { + vuint32m4_t v_lo_usrl = __riscv_vsrl_vx_u32m4( + __riscv_vreinterpret_v_i32m4_u32m4(v_rounded_lo), + effective_right_shift, vl); + vint32m4_t v_hi_sll = __riscv_vsll_vx_i32m4( + v_rounded_hi, 32 - effective_right_shift, vl); + v_res32 = __riscv_vreinterpret_v_u32m4_i32m4(__riscv_vor_vv_u32m4( + v_lo_usrl, __riscv_vreinterpret_v_i32m4_u32m4(v_hi_sll), vl)); + } + else + { + const int shift_hi = std::min(31, effective_right_shift - 32); + v_res32 = __riscv_vsra_vx_i32m4(v_rounded_hi, shift_hi, vl); + } + + // Add output offset + v_res32 = __riscv_vadd_vx_i32m4(v_res32, output_offset, vl); + + // Clamp to activation bounds + v_res32 = __riscv_vmax_vx_i32m4(v_res32, activation_min, vl); + v_res32 = __riscv_vmin_vx_i32m4(v_res32, activation_max, vl); + + return v_res32; +} + +inline vint32m4_t RequantizeVectorPerChannelS32( + vint32m4_t v_acc, vint32m4_t v_multiplier, vint32m4_t v_shift, + const int32_t output_offset, const int32_t activation_min, + const int32_t activation_max, const size_t vl) +{ + // Perform 32x32 -> 64-bit multiplication, getting high and low parts. + vint32m4_t v_prod_hi = __riscv_vmulh_vv_i32m4(v_acc, v_multiplier, vl); + vint32m4_t v_prod_lo = __riscv_vmul_vv_i32m4(v_acc, v_multiplier, vl); + + // Calculate the effective right shift for TFLM's fixed-point scheme. + vint32m4_t v_effective_shift = __riscv_vrsub_vx_i32m4(v_shift, 31, vl); + + // Create masks to separate lanes into right-shift and left-shift paths. + vbool8_t v_mask_right_shift = + __riscv_vmsgt_vx_i32m4_b8(v_effective_shift, 0, vl); + vbool8_t v_mask_left_shift = __riscv_vmnot_m_b8(v_mask_right_shift, vl); + + // Path 1: Right Shift (for lanes where effective_shift > 0) + vint32m4_t v_res_right; + { + // Calculate the 64-bit rounding value: (1LL << (effective_shift - 1)). 
+ vint32m4_t v_shift_minus_1 = __riscv_vsub_vx_i32m4_m( + v_mask_right_shift, v_effective_shift, 1, vl); + vuint32m4_t v_shift_minus_1_u = + __riscv_vreinterpret_v_i32m4_u32m4(v_shift_minus_1); + vbool8_t v_mask_round_lt_32 = __riscv_vmsltu_vx_u32m4_b8_m( + v_mask_right_shift, v_shift_minus_1_u, 32, vl); + vbool8_t v_mask_round_ge_32 = __riscv_vmandn_mm_b8( + v_mask_right_shift, v_mask_round_lt_32, vl); + vuint32m4_t v_one_u = __riscv_vmv_v_x_u32m4(1, vl); + vuint32m4_t v_zero_u = __riscv_vmv_v_x_u32m4(0, vl); + vuint32m4_t v_rounding_lo_u = __riscv_vmerge_vvm_u32m4( + v_zero_u, + __riscv_vsll_vv_u32m4_m(v_mask_round_lt_32, v_one_u, + v_shift_minus_1_u, vl), + v_mask_round_lt_32, vl); + vuint32m4_t v_rounding_hi_u = __riscv_vmerge_vvm_u32m4( + v_zero_u, + __riscv_vsll_vv_u32m4_m( + v_mask_round_ge_32, v_one_u, + __riscv_vsub_vx_u32m4_m(v_mask_round_ge_32, v_shift_minus_1_u, + 32, vl), + vl), + v_mask_round_ge_32, vl); + + // Add the 64-bit rounding value to the 64-bit product using 32-bit ops. + vuint32m4_t v_prod_lo_u = __riscv_vreinterpret_v_i32m4_u32m4(v_prod_lo); + vuint32m4_t v_sum_lo_u = __riscv_vadd_vv_u32m4_m( + v_mask_right_shift, v_prod_lo_u, v_rounding_lo_u, vl); + vbool8_t v_carry = __riscv_vmsltu_vv_u32m4_b8_m( + v_mask_right_shift, v_sum_lo_u, v_prod_lo_u, vl); + vint32m4_t v_rounded_hi = __riscv_vadd_vv_i32m4_m( + v_mask_right_shift, v_prod_hi, + __riscv_vreinterpret_v_u32m4_i32m4(v_rounding_hi_u), vl); + v_rounded_hi = __riscv_vadd_vx_i32m4_m(v_carry, v_rounded_hi, 1, vl); + + // Emulate a 64-bit arithmetic right shift using two 32-bit sub-paths. 
+ vbool8_t v_mask_shift_lt_32 = __riscv_vmslt_vx_i32m4_b8_m( + v_mask_right_shift, v_effective_shift, 32, vl); + vbool8_t v_mask_shift_ge_32 = __riscv_vmandn_mm_b8( + v_mask_right_shift, v_mask_shift_lt_32, vl); + vuint32m4_t v_shift_u = + __riscv_vreinterpret_v_i32m4_u32m4(v_effective_shift); + vuint32m4_t v_lo_part = __riscv_vsrl_vv_u32m4_m( + v_mask_shift_lt_32, v_sum_lo_u, v_shift_u, vl); + vuint32m4_t v_hi_part = __riscv_vsll_vv_u32m4_m( + v_mask_shift_lt_32, + __riscv_vreinterpret_v_i32m4_u32m4(v_rounded_hi), + __riscv_vrsub_vx_u32m4_m(v_mask_shift_lt_32, v_shift_u, 32, vl), + vl); + vint32m4_t v_res_lt_32 = __riscv_vreinterpret_v_u32m4_i32m4( + __riscv_vor_vv_u32m4_m(v_mask_shift_lt_32, v_lo_part, v_hi_part, vl)); + vint32m4_t v_res_ge_32 = __riscv_vsra_vv_i32m4_m( + v_mask_shift_ge_32, v_rounded_hi, + __riscv_vreinterpret_v_i32m4_u32m4(__riscv_vsub_vx_i32m4_m( + v_mask_shift_ge_32, v_effective_shift, 32, vl)), + vl); + v_res_right = __riscv_vmerge_vvm_i32m4(v_res_ge_32, v_res_lt_32, + v_mask_shift_lt_32, vl); + } + + // Path 2: Left Shift (for lanes where effective_shift <= 0) + vint32m4_t v_res_left; + { + // Calculate the positive left shift amount. + vint32m4_t v_left_shift_amount = + __riscv_vneg_v_i32m4_m(v_mask_left_shift, v_effective_shift, vl); + + // Perform the left shift on the low 32 bits of the product. + v_res_left = __riscv_vsll_vv_i32m4_m( + v_mask_left_shift, v_prod_lo, + __riscv_vreinterpret_v_i32m4_u32m4(v_left_shift_amount), vl); + } + + // Merge the results from the right and left shift paths. + vint32m4_t v_res32 = + __riscv_vmerge_vvm_i32m4(v_res_left, v_res_right, v_mask_right_shift, vl); + + // Add the final output offset. + v_res32 = __riscv_vadd_vx_i32m4(v_res32, output_offset, vl); + + // Clamp the results to the activation range. 
+ v_res32 = __riscv_vmax_vx_i32m4(v_res32, activation_min, vl); + v_res32 = __riscv_vmin_vx_i32m4(v_res32, activation_max, vl); + + return v_res32; +} + +#endif \ No newline at end of file From 2b58b5fedf2ec36c9e2ea6c1471383a055fa41ed Mon Sep 17 00:00:00 2001 From: JaimeHW Date: Fri, 14 Nov 2025 10:44:33 -0600 Subject: [PATCH 51/86] Change header guard --- tensorflow/lite/micro/kernels/riscv_vector/requantize_rvv.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tensorflow/lite/micro/kernels/riscv_vector/requantize_rvv.h b/tensorflow/lite/micro/kernels/riscv_vector/requantize_rvv.h index c791b7613d7..678c26e35a7 100644 --- a/tensorflow/lite/micro/kernels/riscv_vector/requantize_rvv.h +++ b/tensorflow/lite/micro/kernels/riscv_vector/requantize_rvv.h @@ -1,5 +1,5 @@ -#ifndef TENSORFLOW_LITE_MICRO_KERNELS_RISCV_VECTOR_COMMON_RVV_H_ -#define TENSORFLOW_LITE_MICRO_KERNELS_RISCV_VECTOR_COMMON_RVV_H_ +#ifndef TENSORFLOW_LITE_MICRO_KERNELS_RISCV_VECTOR_REQUANTIZE_RVV_H_ +#define TENSORFLOW_LITE_MICRO_KERNELS_RISCV_VECTOR_REQUANTIZE_RVV_H_ inline vint32m4_t RequantizeVectorPerTensorS32( vint32m4_t v_acc, const int32_t multiplier, const int effective_right_shift, From 69d3f00b75a60f96ba1cb941f7f4b270f87cc7d1 Mon Sep 17 00:00:00 2001 From: JaimeHW Date: Sat, 15 Nov 2025 23:45:05 -0600 Subject: [PATCH 52/86] Reformat and fix bugs in SoftMax --- .../micro/kernels/riscv_vector/softmax_rvv.h | 608 +++++++++--------- 1 file changed, 290 insertions(+), 318 deletions(-) diff --git a/tensorflow/lite/micro/kernels/riscv_vector/softmax_rvv.h b/tensorflow/lite/micro/kernels/riscv_vector/softmax_rvv.h index cad20c63e19..9187f175e16 100644 --- a/tensorflow/lite/micro/kernels/riscv_vector/softmax_rvv.h +++ b/tensorflow/lite/micro/kernels/riscv_vector/softmax_rvv.h @@ -12,274 +12,240 @@ #include "tensorflow/lite/micro/kernels/softmax.h" #include "tensorflow/lite/micro/micro_log.h" +// Vectorized absolute value for signed 32-bit integers +inline vint32m4_t 
// Vectorized absolute value for signed 32-bit integers.
// Note: as with two's-complement negation generally, INT32_MIN maps to
// itself (0 - INT32_MIN wraps).
inline vint32m4_t vabs_i32m4(vint32m4_t v_in, size_t vl)
{
    // Create a mask for elements that are less than zero.
    vbool8_t v_neg_mask = __riscv_vmslt_vx_i32m4_b8(v_in, 0, vl);

    // Negate the elements that are negative by calculating (0 - v_in).
    vint32m4_t v_negated = __riscv_vrsub_vx_i32m4(v_in, 0, vl);

    // Merge: original value where non-negative, negated value elsewhere.
    return __riscv_vmerge_vvm_i32m4(v_in, v_negated, v_neg_mask, vl);
}

// Vectorized Saturating Rounding Doubling High Multiply (Vector-Vector).
// Per lane: the rounded high 32 bits of (2 * a * b), with the single
// overflow case INT32_MIN * INT32_MIN saturated to INT32_MAX.
//
// NOTE(review): the rounding nudge is always +2^30 ("round half up"),
// whereas gemmlowp's SaturatingRoundingDoublingHighMul nudges away from zero
// for negative products; results can differ by 1 LSB on exact-half ties of
// negative products. Confirm this deviation from the scalar reference
// kernels is intended (it was introduced by the "fix bugs in SoftMax"
// commit).
inline vint32m4_t SRDMH_vv_i32m4(vint32m4_t v_a, vint32m4_t v_b, size_t vl)
{
    // Define scalar constants for saturation and rounding.
    const int32_t s_int32_min = INT32_MIN;
    const int32_t s_int32_max = INT32_MAX;
    const int32_t s_rounding_nudge = (INT32_C(1) << 30);

    // Create a mask for the specific overflow case: INT32_MIN * INT32_MIN.
    vbool8_t v_min_mask_a = __riscv_vmseq_vx_i32m4_b8(v_a, s_int32_min, vl);
    vbool8_t v_min_mask_b = __riscv_vmseq_vx_i32m4_b8(v_b, s_int32_min, vl);
    vbool8_t v_overflow_mask = __riscv_vmand_mm_b8(v_min_mask_a, v_min_mask_b, vl);

    // Perform a 32x32 -> 64-bit multiplication, storing high and low parts.
    vint32m4_t v_prod_hi = __riscv_vmulh_vv_i32m4(v_a, v_b, vl);
    vint32m4_t v_prod_lo = __riscv_vmul_vv_i32m4(v_a, v_b, vl);
    vuint32m4_t v_prod_lo_u = __riscv_vreinterpret_v_i32m4_u32m4(v_prod_lo);

    // Add the rounding nudge to the low word; an unsigned "sum < original"
    // compare detects whether a carry-out occurred.
    vuint32m4_t v_sum_lo_u = __riscv_vadd_vx_u32m4(v_prod_lo_u, s_rounding_nudge, vl);
    vbool8_t v_carry_mask = __riscv_vmsltu_vv_u32m4_b8(v_sum_lo_u, v_prod_lo_u, vl);

    // Add the carry to the high part of the product.
    // NOTE(review): relies on carry-free lanes of this non-policy `_m`
    // intrinsic keeping v_prod_hi; consider the `_mu` variant - TODO confirm.
    vint32m4_t v_sum_hi = __riscv_vadd_vx_i32m4_m(v_carry_mask, v_prod_hi, 1, vl);

    // Combine the halves to form the doubled high result:
    // (hi << 1) | (lo >> 31).
    vint32m4_t v_result_hi_part = __riscv_vsll_vx_i32m4(v_sum_hi, 1, vl);
    vuint32m4_t v_result_lo_part_u = __riscv_vsrl_vx_u32m4(v_sum_lo_u, 31, vl);
    vint32m4_t v_result_before_sat = __riscv_vor_vv_i32m4(
        v_result_hi_part,
        __riscv_vreinterpret_v_u32m4_i32m4(v_result_lo_part_u), vl);

    // Apply saturation for the INT32_MIN * INT32_MIN case.
    return __riscv_vmerge_vxm_i32m4(v_result_before_sat, s_int32_max, v_overflow_mask, vl);
}

// Vectorized Saturating Rounding Doubling High Multiply (Vector-Scalar).
// Same contract as SRDMH_vv_i32m4 with a scalar second operand; the overflow
// mask can be resolved at run time from the scalar alone.
inline vint32m4_t SRDMH_vx_i32m4(vint32m4_t v_a, int32_t s_b, size_t vl)
{
    // Define scalar constants for saturation and rounding.
    const int32_t s_int32_min = INT32_MIN;
    const int32_t s_int32_max = INT32_MAX;
    const int32_t s_rounding_nudge = (INT32_C(1) << 30);

    // Create a mask for the overflow case: v_a[i] == INT32_MIN and
    // s_b == INT32_MIN. When the scalar is not INT32_MIN the mask is built
    // as an always-false compare (0 == 1).
    vbool8_t v_overflow_mask;
    if (s_b == s_int32_min)
    {
        v_overflow_mask = __riscv_vmseq_vx_i32m4_b8(v_a, s_int32_min, vl);
    }
    else
    {
        vint32m4_t v_zero = __riscv_vmv_v_x_i32m4(0, vl);
        v_overflow_mask = __riscv_vmseq_vx_i32m4_b8(v_zero, 1, vl); // Always false
    }

    // Perform a 32x32 -> 64-bit multiplication, storing high and low parts.
    vint32m4_t v_prod_hi = __riscv_vmulh_vx_i32m4(v_a, s_b, vl);
    vint32m4_t v_prod_lo = __riscv_vmul_vx_i32m4(v_a, s_b, vl);
    vuint32m4_t v_prod_lo_u = __riscv_vreinterpret_v_i32m4_u32m4(v_prod_lo);

    // Add the rounding nudge and detect if a carry-out occurred.
    vuint32m4_t v_sum_lo_u = __riscv_vadd_vx_u32m4(v_prod_lo_u, s_rounding_nudge, vl);
    vbool8_t v_carry_mask = __riscv_vmsltu_vv_u32m4_b8(v_sum_lo_u, v_prod_lo_u, vl);

    // Add the carry to the high part of the product (see `_m` policy note in
    // SRDMH_vv_i32m4).
    vint32m4_t v_sum_hi = __riscv_vadd_vx_i32m4_m(v_carry_mask, v_prod_hi, 1, vl);

    // Combine the halves to form the doubled high result.
    vint32m4_t v_result_hi_part = __riscv_vsll_vx_i32m4(v_sum_hi, 1, vl);
    vuint32m4_t v_result_lo_part_u = __riscv_vsrl_vx_u32m4(v_sum_lo_u, 31, vl);
    vint32m4_t v_result_before_sat = __riscv_vor_vv_i32m4(
        v_result_hi_part,
        __riscv_vreinterpret_v_u32m4_i32m4(v_result_lo_part_u), vl);

    // Apply saturation for the INT32_MIN * INT32_MIN case.
    return __riscv_vmerge_vxm_i32m4(v_result_before_sat, s_int32_max, v_overflow_mask, vl);
}

// Vectorized Saturating Rounding Multiply by Power-of-Two.
// shift > 0: saturating left shift by `shift`.
// shift < 0: rounding arithmetic right shift by `-shift` (round half away
// from zero, per the threshold adjustment for negative inputs below).
// shift == 0: identity.
inline vint32m4_t SRMPOT_vx_i32m4(vint32m4_t v_vec, int shift, size_t vl)
{
    // If shift is zero, return the original vector.
    if (shift == 0)
    {
        return v_vec;
    }

    // This section handles left shifts (positive shift values).
    if (shift > 0)
    {
        // Define scalar constants for saturation and shifting.
        const int32_t s_shift = shift;
        const int32_t s_max_val = INT32_MAX;
        const int32_t s_min_val = INT32_MIN;

        // Shifts >= 31 always saturate nonzero lanes (sign-dependent).
        if (s_shift >= 31)
        {
            vint32m4_t v_zero = __riscv_vmv_v_x_i32m4(0, vl);
            vbool8_t v_pos_mask = __riscv_vmsgt_vx_i32m4_b8(v_vec, 0, vl);
            vbool8_t v_neg_mask = __riscv_vmslt_vx_i32m4_b8(v_vec, 0, vl);
            vint32m4_t v_saturated = __riscv_vmerge_vxm_i32m4(v_zero, s_max_val, v_pos_mask, vl);
            return __riscv_vmerge_vxm_i32m4(v_saturated, s_min_val, v_neg_mask, vl);
        }

        // Thresholds for overflow detection: values >= 2^(31-shift) (or
        // < -2^(31-shift)) cannot be represented after shifting.
        const int32_t pos_threshold = (INT32_C(1) << (31 - s_shift));
        const int32_t neg_threshold = -pos_threshold;

        // Create masks for positive and negative overflow.
        vbool8_t v_pos_ovfl_mask = __riscv_vmsgt_vx_i32m4_b8(v_vec, pos_threshold - 1, vl);
        vbool8_t v_neg_ovfl_mask = __riscv_vmslt_vx_i32m4_b8(v_vec, neg_threshold, vl);

        // Perform the left shift.
        vint32m4_t v_shifted = __riscv_vsll_vx_i32m4(v_vec, s_shift, vl);

        // Merge the shifted result with saturated values where overflowed.
        vint32m4_t v_result = __riscv_vmerge_vxm_i32m4(v_shifted, s_max_val, v_pos_ovfl_mask, vl);
        return __riscv_vmerge_vxm_i32m4(v_result, s_min_val, v_neg_ovfl_mask, vl);

    }
    else
    {
        // This section handles right shifts (negative shift values) with
        // rounding.
        const int exponent = -shift;
        if (exponent <= 0) return v_vec;

        // Shifts beyond the word width collapse to the sign: 0 for
        // non-negative lanes, -1 for negative lanes.
        if (exponent > 31)
        {
            vint32m4_t v_zero = __riscv_vmv_v_x_i32m4(0, vl);
            vbool8_t v_neg_mask = __riscv_vmslt_vx_i32m4_b8(v_vec, 0, vl);
            return __riscv_vmerge_vxm_i32m4(v_zero, -1, v_neg_mask, vl);
        }

        // Rounding threshold: half the divisor, bumped by one for negative
        // inputs so halves round away from zero.
        const int32_t s_mask = (INT32_C(1) << exponent) - 1;
        const int32_t s_threshold_base = s_mask >> 1;
        vbool8_t v_is_negative_mask = __riscv_vmslt_vx_i32m4_b8(v_vec, 0, vl);
        vint32m4_t v_threshold = __riscv_vmv_v_x_i32m4(s_threshold_base, vl);
        v_threshold = __riscv_vadd_vx_i32m4_m(v_is_negative_mask, v_threshold, 1, vl);

        // Check if the remainder requires rounding up. (The masked remainder
        // is already non-negative, so the vabs is a conservative no-op.)
        vint32m4_t v_remainder = __riscv_vand_vx_i32m4(v_vec, s_mask, vl);
        vint32m4_t v_abs_remainder = vabs_i32m4(v_remainder, vl);
        vbool8_t v_should_round_mask = __riscv_vmsgt_vv_i32m4_b8(v_abs_remainder, v_threshold, vl);

        // Perform the arithmetic right shift.
        vint32m4_t v_shifted = __riscv_vsra_vx_i32m4(v_vec, exponent, vl);

        // Add 1 to the result where rounding is needed (see `_m` policy note
        // in SRDMH_vv_i32m4).
        return __riscv_vadd_vx_i32m4_m(v_should_round_mask, v_shifted, 1, vl);
    }
}

// Vectorized MultiplyByQuantizedMultiplier for multipliers > 1
// (Vector-Scalar): a plain (non-saturating) left shift followed by the
// saturating rounding doubling high multiply, mirroring the scalar
// reference's `x * (1 << left_shift)` pre-scale.
inline vint32m4_t MultiplyByQuantizedMultiplierGreaterThanOne_vx_i32m4(
    vint32m4_t v_x, int32_t quantized_multiplier, int left_shift, size_t vl)
{
    // Apply the left shift to the input vector.
    vint32m4_t v_shifted_x = __riscv_vsll_vx_i32m4(v_x, left_shift, vl);

    // Perform the saturating rounding doubling high multiply.
    return SRDMH_vx_i32m4(v_shifted_x, quantized_multiplier, vl);
}
kInputIntegerBits; - const int kOutputFractionalBits = 31; // Q031 output format + // Define fixed-point constants for input and output formats + const int kInputIntegerBits = 5; + const int kInputFractionalBits = 26; + const int kOutputFractionalBits = 31; - // Constants for input range reduction (modulo 1/4) + // Define constants for range reduction (exp(x) = exp(x/4) * exp(3x/4)) const int32_t s_kOneQuarter_q5_26 = INT32_C(1) << (kInputFractionalBits - 2); const int32_t s_mask_val = s_kOneQuarter_q5_26 - 1; - // Constants for Taylor series approximation and final result assembly - const int32_t s_result_one_q0_31 = INT32_MAX; // 10 in Q031 - const int32_t s_exp_neg_1_8_q0_31 = 1895147668; // exp(-1/8) in Q031 - const int32_t s_one_third_q0_31 = 715827883; // 1/3 in Q031 - const int32_t s_one_eighth_q0_31 = INT32_C(1) << (kOutputFractionalBits - 3); // 1/8 in Q031 + // Define constants for Taylor series approximation of exp(x) around -1/8 + const int32_t s_result_one_q0_31 = INT32_MAX; + const int32_t s_exp_neg_1_8_q0_31 = 1895147668; + const int32_t s_one_third_q0_31 = 715827883; + const int32_t s_one_24th_q0_31 = 89478485; + const int32_t s_one_eighth_q0_31 = INT32_C(1) << (kOutputFractionalBits - 3); - // Reduce input `a` to the range [-1/4, 0] by finding `a mod (-1/4)` + // Perform range reduction to map the input to the [-1/4, 0] interval vint32m4_t v_a_masked = __riscv_vand_vx_i32m4(v_a_q5_26, s_mask_val, vl); vint32m4_t v_a_mod_q_m_q_q5_26 = __riscv_vsub_vx_i32m4(v_a_masked, s_kOneQuarter_q5_26, vl); - // Store the multiple of -1/4 that was subtracted - vint32m4_t v_remainder_q5_26 = __riscv_vsub_vv_i32m4(v_a_q5_26, v_a_mod_q_m_q_q5_26, vl); + vint32m4_t v_remainder_q5_26 = __riscv_vsub_vv_i32m4(v_a_mod_q_m_q_q5_26, v_a_q5_26, vl); - // Rescale the reduced input from Q526 to Q031 for Taylor series input - const int rescale_shift = kInputIntegerBits - 0; - vint32m4_t v_a_input_taylor_q0_31 = SRMPOT_vx_i32m4(v_a_mod_q_m_q_q5_26, -rescale_shift, vl); + // 
Rescale for Taylor series input + const int rescale_shift = kOutputFractionalBits - kInputFractionalBits; + vint32m4_t v_a_input_taylor_q0_31 = SRMPOT_vx_i32m4(v_a_mod_q_m_q_q5_26, rescale_shift, vl); - // Calculate Taylor series approximation for exp(x) around x = -1/8 Let y = x + 1/8 + // Center the input around -1/8 for better Taylor series accuracy vint32m4_t v_y = __riscv_vadd_vx_i32m4(v_a_input_taylor_q0_31, s_one_eighth_q0_31, vl); - // Calculate powers of y needed for the Taylor expansion (y^2, y^3, y^4) + // Calculate polynomial terms: y^2, y^3, y^4 vint32m4_t v_y2 = SRDMH_vv_i32m4(v_y, v_y, vl); vint32m4_t v_y3 = SRDMH_vv_i32m4(v_y2, v_y, vl); vint32m4_t v_y4 = SRDMH_vv_i32m4(v_y2, v_y2, vl); - // Calculate term y^4 / 4 - vint32m4_t v_y4_over_4 = SRMPOT_vx_i32m4(v_y4, -2, vl); - - // Combine Taylor series terms: exp(-1/8) * (1 + y + y^2/2! + y^3/3! + y^4/4! + ) - // Approximation used: exp(-1/8) * (1 + y + (y^2 + (y^3 + y^4/4) / 3) / 2) - vint32m4_t v_term1 = __riscv_vadd_vv_i32m4(v_y4_over_4, v_y3, vl); - vint32m4_t v_term2 = SRDMH_vx_i32m4(v_term1, s_one_third_q0_31, vl); - vint32m4_t v_term3 = __riscv_vadd_vv_i32m4(v_term2, v_y2, vl); - vint32m4_t v_sum_of_higher_terms = SRMPOT_vx_i32m4(v_term3, -1, vl); // Division by 2 - - // Calculate the term inside the main bracket (y + higher terms) - vint32m4_t v_bracket_term = __riscv_vadd_vv_i32m4(v_y, v_sum_of_higher_terms, vl); - - // Multiply bracket term by precomputed exp(-1/8) + // Calculate scaled polynomial terms: y^2/2, y^3/6, y^4/24 + vint32m4_t v_term_y2_over_2 = SRMPOT_vx_i32m4(v_y2, -1, vl); + vint32m4_t v_term_y3_over_3 = SRDMH_vx_i32m4(v_y3, s_one_third_q0_31, vl); + vint32m4_t v_term_y3_over_6 = SRMPOT_vx_i32m4(v_term_y3_over_3, -1, vl); + vint32m4_t v_term_y4_over_24 = SRDMH_vx_i32m4(v_y4, s_one_24th_q0_31, vl); + + // Sum the polynomial terms: y + y^2/2 + y^3/6 + y^4/24 + vint32m4_t v_poly_sum = __riscv_vadd_vv_i32m4(v_y, v_term_y2_over_2, vl); + v_poly_sum = __riscv_vadd_vv_i32m4(v_poly_sum, 
v_term_y3_over_6, vl); + v_poly_sum = __riscv_vadd_vv_i32m4(v_poly_sum, v_term_y4_over_24, vl); + + // Calculate the final result for the interval: exp(-1/8) * (1 + poly_sum) vint32m4_t v_const_term_vec = __riscv_vmv_v_x_i32m4(s_exp_neg_1_8_q0_31, vl); - vint32m4_t v_mul_term = SRDMH_vv_i32m4(v_bracket_term, v_const_term_vec, vl); - - // Add the constant term exp(-1/8) to complete the Taylor approximation for the interval [-1/4, 0] + vint32m4_t v_mul_term = SRDMH_vv_i32m4(v_poly_sum, v_const_term_vec, vl); vint32m4_t v_interval_result_q0_31 = __riscv_vadd_vv_i32m4(v_mul_term, v_const_term_vec, vl); - // Start reconstructing the full result using the remainder and precomputed exp factors + // Reconstruct the full result using a barrel shifter based on the remainder vint32m4_t v_current_result = v_interval_result_q0_31; - - // Define precomputed multipliers exp(-1/4), exp(-1/2), exp(-1), etc in Q031 const int32_t s_mult_exp_neg_1_4 = 1672461947; const int32_t s_mult_exp_neg_1_2 = 1302514674; const int32_t s_mult_exp_neg_1 = 790015084; @@ -288,29 +254,25 @@ vint32m4_t vectorized_exp_on_negative_values(vint32m4_t v_a_q5_26, size_t vl) const int32_t s_mult_exp_neg_8 = 720401; const int32_t s_mult_exp_neg_16 = 242; - // Macro to apply barrel shifter logic: multiply by exp(-2^k) if corresponding bit in remainder is set + // Macro to conditionally apply multipliers based on remainder bits #define APPLY_BARREL_SHIFT(exponent, multiplier_q0_31) \ - do { \ - /* Check if the exponent is within the representable input integer bits */ \ - if (kInputIntegerBits > exponent) { \ - /* Calculate the bit position corresponding to this exponent in the Q526 remainder */ \ + do \ + { \ + if (kInputIntegerBits > exponent) \ + { \ const int shift_amount = kInputFractionalBits + exponent; \ - /* Ensure the bit position is valid */ \ - if (shift_amount >= 0 && shift_amount < 32) { \ - /* Create a mask for the specific bit */ \ + if (shift_amount >= 0 && shift_amount < 32) \ + { \ int32_t 
bit_mask_val = INT32_C(1) << shift_amount; \ - /* Check if the bit is set in the remainder vector */ \ vint32m4_t v_rem_masked = __riscv_vand_vx_i32m4(v_remainder_q5_26, bit_mask_val, vl); \ vbool8_t v_apply_mask = __riscv_vmsne_vx_i32m4_b8(v_rem_masked, 0, vl); \ - /* Multiply the current result by the precomputed factor */ \ vint32m4_t v_multiplied = SRDMH_vx_i32m4(v_current_result, multiplier_q0_31, vl); \ - /* Merge the multiplied result where the mask is true */ \ v_current_result = __riscv_vmerge_vvm_i32m4(v_current_result, v_multiplied, v_apply_mask, vl); \ } \ } \ } while(0) - // Apply the barrel shifter for each power of 2 from exp(-1/4) up to exp(-16) + // Apply barrel shifter for each power-of-two component APPLY_BARREL_SHIFT(-2, s_mult_exp_neg_1_4); APPLY_BARREL_SHIFT(-1, s_mult_exp_neg_1_2); APPLY_BARREL_SHIFT( 0, s_mult_exp_neg_1); @@ -319,185 +281,195 @@ vint32m4_t vectorized_exp_on_negative_values(vint32m4_t v_a_q5_26, size_t vl) APPLY_BARREL_SHIFT( 3, s_mult_exp_neg_8); APPLY_BARREL_SHIFT( 4, s_mult_exp_neg_16); - // Undefine the helper macro #undef APPLY_BARREL_SHIFT - // Handle the special case where input a = 0, the result should be 10 (INT32_MAX in Q031) + // Handle the case where input is 0, for which exp(0) = 1 vint32m4_t v_final_result = v_current_result; vbool8_t v_zero_mask = __riscv_vmseq_vx_i32m4_b8(v_a_q5_26, 0, vl); v_final_result = __riscv_vmerge_vxm_i32m4(v_final_result, s_result_one_q0_31, v_zero_mask, vl); - // Return the final computed exp result in Q031 format return v_final_result; } -// Vectorized Softmax implementation for INT8 input and INT8/INT16 output -template -void SoftmaxInt8RVV(const tflite::SoftmaxParams& params, - const tflite::RuntimeShape& input_shape, - const int8_t* input_data, - const tflite::RuntimeShape& output_shape, - OutputT* output_data) +// Main RVV-accelerated Softmax kernel function +template +void SoftmaxRVV(const tflite::SoftmaxParams& params, + const tflite::RuntimeShape& input_shape, + const InputT* 
input_data, + const tflite::RuntimeShape& output_shape, + OutputT* output_data) { - // Extract Softmax quantization parameters + // Extract quantization parameters const int32_t input_beta_multiplier = params.input_multiplier; const int32_t input_beta_left_shift = params.input_left_shift; const int diff_min = params.diff_min; - - // Define fixed-point format for intermediate sum accumulation (Q1219) + + // Define fixed-point constants for accumulation and output static const int kAccumulationIntegerBits = 12; static const int kAccumulationFractionalBits = 32 - 1 - kAccumulationIntegerBits; - - // Define fixed-point format for exp function output (Q031) static const int kExpOutputFractionalBits = 31; - - // Get input/output shape dimensions + + // Extract shape dimensions const int trailing_dim = input_shape.DimensionsCount() - 1; const int outer_size = tflite::MatchingFlatSizeSkipDim(input_shape, trailing_dim, output_shape); const int depth = tflite::MatchingDim(input_shape, trailing_dim, output_shape, trailing_dim); const size_t depth_sz = static_cast(depth); - // Loop over each row (batch or outer dimension) + // Loop over each row in the outer dimensions for (int i = 0; i < outer_size; ++i) { - // Set input and output pointers for the current row - const int8_t* current_input_data = input_data + i * depth; + const InputT* current_input_data = input_data + i * depth; OutputT* current_output_data = output_data + i * depth; - // Initialize scalar and vector max accumulators - int8_t max_in_row = std::numeric_limits::min(); - size_t vl_temp = __riscv_vsetvl_e8m1(1); // Set VL=1 for scalar init - vint8m1_t v_max_acc_m1 = __riscv_vmv_v_x_i8m1(max_in_row, vl_temp); - - // Process the row in vector chunks to find the maximum value - const int8_t* Ptr_max = current_input_data; - for (ptrdiff_t n = depth_sz; n > 0; ) { + // Find the maximum value in the current row for numerical stability + InputT max_in_row = std::numeric_limits::min(); + const InputT* ptr_max = 
current_input_data; + ptrdiff_t n = depth_sz; + while (n > 0) + { size_t vl = __riscv_vsetvl_e8m1(n); - vint8m1_t v_input_m1 = __riscv_vle8_v_i8m1(Ptr_max, vl); - // Perform reduction max across the loaded vector and the accumulator - v_max_acc_m1 = __riscv_vredmax_vs_i8m1_i8m1(v_input_m1, v_max_acc_m1, vl); - Ptr_max += vl; + if constexpr (std::is_signed_v) + { + vint8m1_t v_input = __riscv_vle8_v_i8m1(reinterpret_cast(ptr_max), vl); + vint8m1_t v_scalar = __riscv_vmv_v_x_i8m1(max_in_row, vl); + vint8m1_t v_red = __riscv_vredmax_vs_i8m1_i8m1(v_input, v_scalar, vl); + max_in_row = std::max(max_in_row, __riscv_vmv_x_s_i8m1_i8(v_red)); + } + else + { + vuint8m1_t v_input = __riscv_vle8_v_u8m1(reinterpret_cast(ptr_max), vl); + vuint8m1_t v_scalar = __riscv_vmv_v_x_u8m1(max_in_row, vl); + vuint8m1_t v_red = __riscv_vredmaxu_vs_u8m1_u8m1(v_input, v_scalar, vl); + max_in_row = std::max(max_in_row, __riscv_vmv_x_s_u8m1_u8(v_red)); + } + ptr_max += vl; n -= vl; } - - // Extract the final scalar maximum value from the vector accumulator - max_in_row = __riscv_vmv_x_s_i8m1_i8(v_max_acc_m1); const int32_t max_in_row_s32 = static_cast(max_in_row); - // Initialize vector sum accumulator (using m1 for reduction target) - vl_temp = __riscv_vsetvl_e32m1(1); // Set VL=1 for scalar init - vint32m1_t v_sum_acc_m1 = __riscv_vmv_v_x_i32m1(0, vl_temp); - - // Process the row in vector chunks + // Calculate the sum of exponentials of (input - max) + size_t vl_temp_sum = __riscv_vsetvl_e32m1(1); + vint32m1_t v_sum_acc_m1 = __riscv_vmv_v_x_i32m1(0, vl_temp_sum); size_t current_c = 0; while (current_c < depth_sz) { size_t vl = __riscv_vsetvl_e32m4(depth_sz - current_c); - // Load input chunk (i8), widen to i16, then widen to i32 - vint8m1_t v_input_s8 = __riscv_vle8_v_i8m1(current_input_data + current_c, vl); - vint16m2_t v_input_s16 = __riscv_vsext_vf2_i16m2(v_input_s8, vl); - vint32m4_t v_input_s32 = __riscv_vwadd_vx_i32m4(v_input_s16, 0, vl); + // Load 8-bit input data and widen to 
32-bit + vint32m4_t v_input_s32; + if constexpr (std::is_signed_v) + { + vint8m1_t v_input_s8 = __riscv_vle8_v_i8m1(reinterpret_cast(current_input_data + current_c), vl); + vint16m2_t v_input_s16 = __riscv_vsext_vf2_i16m2(v_input_s8, vl); + v_input_s32 = __riscv_vwadd_vx_i32m4(v_input_s16, 0, vl); + } + else + { + vuint8m1_t v_input_u8 = __riscv_vle8_v_u8m1(reinterpret_cast(current_input_data + current_c), vl); + vuint16m2_t v_input_u16 = __riscv_vwaddu_vx_u16m2(v_input_u8, 0, vl); + vuint32m4_t v_input_u32 = __riscv_vwaddu_vx_u32m4(v_input_u16, 0, vl); + v_input_s32 = __riscv_vreinterpret_v_u32m4_i32m4(v_input_u32); + } - // Calculate difference from max (input - max_in_row) + // Calculate the difference and create a mask for values >= diff_min vint32m4_t v_diff_s32 = __riscv_vsub_vx_i32m4(v_input_s32, max_in_row_s32, vl); - - // Create mask for elements where difference >= diff_min vbool8_t v_diff_mask = __riscv_vmsge_vx_i32m4_b8(v_diff_s32, diff_min, vl); - - // Rescale difference for exp input (Q526 format) + + // Rescale the difference for the exp function vint32m4_t v_diff_rescaled_q5_26 = MultiplyByQuantizedMultiplierGreaterThanOne_vx_i32m4( v_diff_s32, input_beta_multiplier, input_beta_left_shift, vl); - - // Calculate exponent using the vectorized exp function (output in Q031) + + // Calculate the exponential of the rescaled difference vint32m4_t v_exp_val_q0_31 = vectorized_exp_on_negative_values(v_diff_rescaled_q5_26, vl); - - // Rescale exponent result from Q031 to accumulation format (Q1219) - const int rescale_shift_exp_to_accum = kExpOutputFractionalBits - kAccumulationFractionalBits; - vint32m4_t v_exp_term_q12_19 = SRMPOT_vx_i32m4(v_exp_val_q0_31, -rescale_shift_exp_to_accum, vl); - - // Zero out exponent terms where difference was below diff_min threshold + + // Rescale the exponential result to the accumulation format + const int rescale_shift = kAccumulationFractionalBits - kExpOutputFractionalBits; + vint32m4_t v_exp_term_q12_19 = 
SRMPOT_vx_i32m4(v_exp_val_q0_31, rescale_shift, vl); + + // Mask out values that were below the diff_min threshold and accumulate vint32m4_t v_zero_q12_19 = __riscv_vmv_v_x_i32m4(0, vl); - vint32m4_t v_exp_term_masked_q12_19 = __riscv_vmerge_vvm_i32m4(v_zero_q12_19, v_exp_term_q12_19, v_diff_mask, vl); - - // Accumulate the sum using vector reduction sum - v_sum_acc_m1 = __riscv_vredsum_vs_i32m4_i32m1(v_exp_term_masked_q12_19, v_sum_acc_m1, vl); - - // Advance pointer for the next chunk + vint32m4_t v_exp_term_masked = __riscv_vmerge_vvm_i32m4(v_zero_q12_19, v_exp_term_q12_19, v_diff_mask, vl); + v_sum_acc_m1 = __riscv_vredsum_vs_i32m4_i32m1(v_exp_term_masked, v_sum_acc_m1, vl); + current_c += vl; } - - // Extract the final scalar sum from the vector accumulator int32_t sum_of_exps_raw = __riscv_vmv_x_s_i32m1_i32(v_sum_acc_m1); - // Calculate the reciprocal of the sum (Q031 format) - int num_bits_over_unit; // Headroom bits for reciprocal calculation + // Calculate the reciprocal of the sum of exponentials + int num_bits_over_unit; int32_t reciprocal_raw_q0_31 = tflite::GetReciprocal(sum_of_exps_raw, kAccumulationIntegerBits, &num_bits_over_unit); - const int32_t s_shifted_scale_raw_q0_31 = reciprocal_raw_q0_31; - - // Calculate the final right shift amount needed to scale to the output format - const int output_bits = sizeof(OutputT) * 8; - const int exponent = num_bits_over_unit + 31 - output_bits; - - // Get output type min/max values for saturation - const OutputT output_min_val = std::numeric_limits::min(); - const OutputT output_max_val = std::numeric_limits::max(); - const int32_t output_min_s32 = static_cast(output_min_val); - const int32_t output_max_s32 = static_cast(output_max_val); + + // Calculate the final output shift exponent + const int exponent = num_bits_over_unit + 31 - (sizeof(OutputT) * 8); + const int32_t output_min_s32 = static_cast(std::numeric_limits::min()); + const int32_t output_max_s32 = static_cast(std::numeric_limits::max()); - // 
Process the row again in vector chunks to calculate and store output + // Compute and store the final output values current_c = 0; while (current_c < depth_sz) { - size_t vl = __riscv_vsetvl_e32m4(depth_sz - current_c); + size_t vl = __riscv_vsetvl_e32m4(depth_sz - current_c); - // Reload input chunk and recalculate exp (or store/reload from previous step if memory allows) - vint8m1_t v_input_s8 = __riscv_vle8_v_i8m1(current_input_data + current_c, vl); - vint16m2_t v_input_s16 = __riscv_vsext_vf2_i16m2(v_input_s8, vl); - vint32m4_t v_input_s32 = __riscv_vwadd_vx_i32m4(v_input_s16, 0, vl); + // Reload and widen the input data + vint32m4_t v_input_s32; + if constexpr (std::is_signed_v) + { + vint8m1_t v_input_s8 = __riscv_vle8_v_i8m1(reinterpret_cast(current_input_data + current_c), vl); + vint16m2_t v_input_s16 = __riscv_vsext_vf2_i16m2(v_input_s8, vl); + v_input_s32 = __riscv_vwadd_vx_i32m4(v_input_s16, 0, vl); + } + else + { + vuint8m1_t v_input_u8 = __riscv_vle8_v_u8m1(reinterpret_cast(current_input_data + current_c), vl); + vuint16m2_t v_input_u16 = __riscv_vwaddu_vx_u16m2(v_input_u8, 0, vl); + vuint32m4_t v_input_u32 = __riscv_vwaddu_vx_u32m4(v_input_u16, 0, vl); + v_input_s32 = __riscv_vreinterpret_v_u32m4_i32m4(v_input_u32); + } + + // Recompute the difference, mask, and exponential vint32m4_t v_diff_s32 = __riscv_vsub_vx_i32m4(v_input_s32, max_in_row_s32, vl); vbool8_t v_diff_mask = __riscv_vmsge_vx_i32m4_b8(v_diff_s32, diff_min, vl); vint32m4_t v_diff_rescaled_q5_26 = MultiplyByQuantizedMultiplierGreaterThanOne_vx_i32m4( v_diff_s32, input_beta_multiplier, input_beta_left_shift, vl); vint32m4_t v_exp_in_q0_31 = vectorized_exp_on_negative_values(v_diff_rescaled_q5_26, vl); - - // Multiply the exponent by the reciprocal scale (SRDMH gives Q031 result) - vint32m4_t v_product_raw_q0_31 = SRDMH_vx_i32m4(v_exp_in_q0_31, s_shifted_scale_raw_q0_31, vl); - - // Apply the final right shift to scale to the output range + + // Multiply the exponential by the reciprocal 
to get the normalized result + vint32m4_t v_product_raw_q0_31 = SRDMH_vx_i32m4(v_exp_in_q0_31, reciprocal_raw_q0_31, vl); + + // Rescale the output and add the output offset (zero point) vint32m4_t v_unsat_output = SRMPOT_vx_i32m4(v_product_raw_q0_31, -exponent, vl); - - // Add the output offset (min value of OutputT) Note: TFLM reference uses this instead of output_offset param vint32m4_t v_shifted_output = __riscv_vadd_vx_i32m4(v_unsat_output, output_min_s32, vl); - - // Clamp the result to the output type's min/max range - vint32m4_t v_clamped_output = __riscv_vmax_vx_i32m4(v_shifted_output, output_min_s32, vl); - v_clamped_output = __riscv_vmin_vx_i32m4(v_clamped_output, output_max_s32, vl); - - // Merge the clamped output with the minimum output value where the difference was below diff_min + + // Clamp the result to the output data type's range + vint32m4_t v_clamped_output = __riscv_vmax_vx_i32m4(__riscv_vmin_vx_i32m4(v_shifted_output, output_max_s32, vl), output_min_s32, vl); + + // Apply the diff_min mask one last time vint32m4_t v_output_min_vec = __riscv_vmv_v_x_i32m4(output_min_s32, vl); vint32m4_t v_final_s32 = __riscv_vmerge_vvm_i32m4(v_output_min_vec, v_clamped_output, v_diff_mask, vl); - // Narrow the final result to the OutputT (int8 or int16) and store it + // Narrow the 32-bit results down to the output type and store if constexpr (sizeof(OutputT) == 1) { - // Narrow i32m4 -> i16m2 -> i8m1 - vint16m2_t v_temp_s16 = __riscv_vncvt_x_x_w_i16m2(v_final_s32, vl); - vint8m1_t v_final_output = __riscv_vncvt_x_x_w_i8m1(v_temp_s16, vl); - - // Store i8m1 result - __riscv_vse8_v_i8m1(reinterpret_cast(current_output_data + current_c), v_final_output, vl); + if constexpr (std::is_signed_v) + { + vint16m2_t v_temp_s16 = __riscv_vncvt_x_x_w_i16m2(v_final_s32, vl); + vint8m1_t v_final_output = __riscv_vncvt_x_x_w_i8m1(v_temp_s16, vl); + __riscv_vse8_v_i8m1(reinterpret_cast(current_output_data + current_c), v_final_output, vl); + } + else + { + vuint32m4_t 
v_final_u32 = __riscv_vreinterpret_v_i32m4_u32m4(v_final_s32); + vuint16m2_t v_temp_u16 = __riscv_vncvt_x_x_w_u16m2(v_final_u32, vl); + vuint8m1_t v_final_output = __riscv_vncvt_x_x_w_u8m1(v_temp_u16, vl); + __riscv_vse8_v_u8m1(reinterpret_cast(current_output_data + current_c), v_final_output, vl); + } } else { - // Narrow i32m4 -> i16m2 - vint16m2_t v_final_output = __riscv_vncvt_x_x_w_i16m2(v_final_s32, vl); - - // Store i16m2 result - __riscv_vse16_v_i16m2(reinterpret_cast(current_output_data + current_c), v_final_output, vl); + vint16m2_t v_final_output = __riscv_vncvt_x_x_w_i16m2(v_final_s32, vl); + __riscv_vse16_v_i16m2(reinterpret_cast(current_output_data + current_c), v_final_output, vl); } - - // Advance pointer for the next chunk + current_c += vl; } } From 081f34b0ce71c887f833de52e92430e34a4b7ddc Mon Sep 17 00:00:00 2001 From: JaimeHW Date: Sun, 16 Nov 2025 02:05:53 -0600 Subject: [PATCH 53/86] Initial vector optimized RFFT kernel --- .../lite/micro/kernels/riscv_vector/rfft.cc | 241 ++++++++++++++++++ .../kernels/riscv_vector/rfft_int16_rvv.cc | 146 +++++++++++ .../kernels/riscv_vector/rfft_int16_rvv.h | 13 + .../make/targets/riscv32_vector_makefile.inc | 4 + 4 files changed, 404 insertions(+) create mode 100644 tensorflow/lite/micro/kernels/riscv_vector/rfft.cc create mode 100644 tensorflow/lite/micro/kernels/riscv_vector/rfft_int16_rvv.cc create mode 100644 tensorflow/lite/micro/kernels/riscv_vector/rfft_int16_rvv.h diff --git a/tensorflow/lite/micro/kernels/riscv_vector/rfft.cc b/tensorflow/lite/micro/kernels/riscv_vector/rfft.cc new file mode 100644 index 00000000000..b50f082faa5 --- /dev/null +++ b/tensorflow/lite/micro/kernels/riscv_vector/rfft.cc @@ -0,0 +1,241 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "signal/src/rfft.h" + +#include +#include +#include + +#include "signal/micro/kernels/rfft.h" +#include "tensorflow/lite/kernels/internal/tensor_ctypes.h" +#include "tensorflow/lite/kernels/kernel_util.h" +#include "tensorflow/lite/micro/flatbuffer_utils.h" +#include "tensorflow/lite/micro/kernels/kernel_util.h" +#include "tensorflow/lite/portable_type_to_tflitetype.h" + +#include "rfft_int16_rvv.h" + +namespace tflite { +namespace { + +constexpr int kInputTensor = 0; +constexpr int kOutputTensor = 0; + +// Indices into the init flexbuffer's vector. +// The parameter's name is in the comment that follows. +// Elements in the vectors are ordered alphabetically by parameter name. +// 'T' is added implicitly by the TensorFlow framework when the type is resolved +// during graph construction. 
+// constexpr int kTypeIndex = 0; // 'T' (unused) +constexpr int kFftLengthIndex = 1; // 'fft_length' + +template +struct TfLiteAudioFrontendRfftParams { + int32_t fft_length; + int32_t input_size; + int32_t input_length; + int32_t output_length; + TfLiteType fft_type; + T* work_area; + int scratch_buffer_index; + int8_t* state; +}; + +template +void* RfftInit(TfLiteContext* context, const char* buffer, size_t length) { + TFLITE_DCHECK(context->AllocatePersistentBuffer != nullptr); + + const uint8_t* buffer_t = reinterpret_cast(buffer); + auto* params = static_cast*>( + context->AllocatePersistentBuffer( + context, sizeof(TfLiteAudioFrontendRfftParams))); + + tflite::FlexbufferWrapper fbw(buffer_t, length); + params->fft_length = fbw.ElementAsInt32(kFftLengthIndex); + params->fft_type = typeToTfLiteType(); + + size_t state_size = (*get_needed_memory_func)(params->fft_length); + params->state = static_cast( + context->AllocatePersistentBuffer(context, state_size * sizeof(int8_t))); + (*init_func)(params->fft_length, params->state, state_size); + return params; +} + +template +TfLiteStatus RfftPrepare(TfLiteContext* context, TfLiteNode* node) { + TF_LITE_ENSURE_EQ(context, NumInputs(node), 1); + TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1); + + MicroContext* micro_context = GetMicroContext(context); + TfLiteTensor* input = + micro_context->AllocateTempInputTensor(node, kInputTensor); + TF_LITE_ENSURE(context, input != nullptr); + TfLiteTensor* output = + micro_context->AllocateTempOutputTensor(node, kOutputTensor); + TF_LITE_ENSURE(context, output != nullptr); + + TF_LITE_ENSURE_EQ(context, NumDimensions(input), NumDimensions(output)); + + TF_LITE_ENSURE_TYPES_EQ(context, input->type, TfLiteTypeEnum); + TF_LITE_ENSURE_TYPES_EQ(context, output->type, TfLiteTypeEnum); + + auto* params = + reinterpret_cast*>(node->user_data); + RuntimeShape input_shape = GetTensorShape(input); + RuntimeShape output_shape = GetTensorShape(output); + params->input_length = 
input_shape.Dims(input_shape.DimensionsCount() - 1); + params->input_size = input_shape.FlatSize(); + // Divide by 2 because output is complex. + params->output_length = + output_shape.Dims(output_shape.DimensionsCount() - 1) / 2; + + context->RequestScratchBufferInArena(context, params->fft_length * sizeof(T), + ¶ms->scratch_buffer_index); + micro_context->DeallocateTempTfLiteTensor(input); + micro_context->DeallocateTempTfLiteTensor(output); + return kTfLiteOk; +} + +template *)> +TfLiteStatus RfftEval(TfLiteContext* context, TfLiteNode* node) { + auto* params = + reinterpret_cast*>(node->user_data); + + const TfLiteEvalTensor* input = + tflite::micro::GetEvalInput(context, node, kInputTensor); + + const T* input_data = tflite::micro::GetTensorData(input); + + TfLiteEvalTensor* output = + tflite::micro::GetEvalOutput(context, node, kOutputTensor); + Complex* output_data = tflite::micro::GetTensorData>(output); + + T* work_area = static_cast( + context->GetScratchBuffer(context, params->scratch_buffer_index)); + + for (int input_idx = 0, output_idx = 0; input_idx < params->input_size; + input_idx += params->input_length, output_idx += params->output_length) { + memcpy(work_area, &input_data[input_idx], sizeof(T) * params->input_length); + // Zero pad input to FFT length + memset(&work_area[params->input_length], 0, + sizeof(T) * (params->fft_length - params->input_length)); + + (*apply_func)(params->state, work_area, &output_data[output_idx]); + } + return kTfLiteOk; +} + +void* RfftInitAll(TfLiteContext* context, const char* buffer, size_t length) { + const uint8_t* buffer_t = reinterpret_cast(buffer); + const flexbuffers::Map& m = flexbuffers::GetRoot(buffer_t, length).AsMap(); + auto tensor_type = static_cast(m["T"].AsInt32()); + + switch (tensor_type) { + case TensorType_INT16: { + return RfftInit(context, buffer, length); + } + case TensorType_INT32: { + return RfftInit(context, buffer, length); + } + case TensorType_FLOAT32: { + return RfftInit(context, 
buffer, length); + } + default: + return nullptr; + } +} + +TfLiteStatus RfftPrepareAll(TfLiteContext* context, TfLiteNode* node) { + auto* params = + reinterpret_cast*>(node->user_data); + + switch (params->fft_type) { + case kTfLiteInt16: { + return RfftPrepare(context, node); + } + case kTfLiteInt32: { + return RfftPrepare(context, node); + } + case kTfLiteFloat32: { + return RfftPrepare(context, node); + } + default: + return kTfLiteError; + } +} + +TfLiteStatus RfftEvalAll(TfLiteContext* context, TfLiteNode* node) { + auto* params = + reinterpret_cast*>(node->user_data); + + switch (params->fft_type) { + case kTfLiteInt16: { + return RfftEval(context, node); + } + case kTfLiteInt32: { + return RfftEval(context, node); + } + case kTfLiteFloat32: { + return RfftEval(context, node); + } + default: + return kTfLiteError; + } +} +} // namespace + +// TODO(b/286250473): remove namespace once de-duped libraries +namespace tflm_signal { + +TFLMRegistration* Register_RFFT() { + static TFLMRegistration r = + tflite::micro::RegisterOp(RfftInitAll, RfftPrepareAll, RfftEvalAll); + return &r; +} + +TFLMRegistration* Register_RFFT_FLOAT() { + static TFLMRegistration r = tflite::micro::RegisterOp( + RfftInit, + RfftPrepare, + RfftEval); + return &r; +} + +TFLMRegistration* Register_RFFT_INT16() { + static TFLMRegistration r = tflite::micro::RegisterOp( + RfftInit, + RfftPrepare, + RfftEval); + return &r; +} + +TFLMRegistration* Register_RFFT_INT32() { + static TFLMRegistration r = tflite::micro::RegisterOp( + RfftInit, + RfftPrepare, + RfftEval); + return &r; +} + +} // namespace tflm_signal +} // namespace tflite \ No newline at end of file diff --git a/tensorflow/lite/micro/kernels/riscv_vector/rfft_int16_rvv.cc b/tensorflow/lite/micro/kernels/riscv_vector/rfft_int16_rvv.cc new file mode 100644 index 00000000000..2ed889313fd --- /dev/null +++ b/tensorflow/lite/micro/kernels/riscv_vector/rfft_int16_rvv.cc @@ -0,0 +1,146 @@ +#include + +#include "signal/src/complex.h" 
+#include "signal/src/kiss_fft_wrappers/kiss_fft_int16.h" +#include "signal/src/kiss_fft_wrappers/kiss_fft_common.h" + +#define FIXED_POINT 16 + +namespace kiss_fft_fixed16 +{ +#include "kiss_fft.h" +#include "tools/kiss_fftr.h" +#include "kiss_fft.c" +#include "tools/kiss_fftr.c" +} + +void kiss_fftr_rvv(kiss_fft_fixed16::kiss_fftr_cfg st, const kiss_fft_scalar* timedata, + kiss_fft_fixed16::kiss_fft_cpx* freqdata) +{ + // Handle inverse FFT case and perform the initial complex FFT + if (st->substate->inverse) + { + return; + } + kiss_fft_fixed16::kiss_fft(st->substate, (const kiss_fft_fixed16::kiss_fft_cpx*)timedata, st->tmpbuf); + + // Process DC and Nyquist bins separately (scalar operations) + const int ncfft = st->substate->nfft; + kiss_fft_fixed16::kiss_fft_cpx tdc; + tdc.r = st->tmpbuf[0].r; + tdc.i = st->tmpbuf[0].i; + C_FIXDIV(tdc, 2); + freqdata[0].r = tdc.r + tdc.i; + freqdata[ncfft].r = tdc.r - tdc.i; + freqdata[0].i = 0; + freqdata[ncfft].i = 0; + + // Initialize pointers and loop variables for the main vector processing loop + size_t k = 1; + const size_t loop_end = ncfft / 2; + const int16_t* tmpbuf_base_ptr = (const int16_t*)st->tmpbuf; + const int16_t* twiddles_base_ptr = (const int16_t*)st->super_twiddles; + int16_t* freqdata_base_ptr = (int16_t*)freqdata; + ptrdiff_t stride = sizeof(kiss_fft_fixed16::kiss_fft_cpx); + + // Main loop to process FFT bins in vector chunks + while (k <= loop_end) + { + // Set the vector length (vl) for the current iteration + size_t vl = __riscv_vsetvl_e16m4(loop_end - k + 1); + + // Generate index vectors for accessing fpk, fpnk, and twiddles + vuint16m4_t v_k_indices = __riscv_vid_v_u16m4(vl); + v_k_indices = __riscv_vadd_vx_u16m4(v_k_indices, k, vl); + vuint16m4_t v_neg_k_indices = __riscv_vrsub_vx_u16m4(v_k_indices, ncfft, vl); + vuint16m4_t v_twiddle_indices = __riscv_vsub_vx_u16m4(v_k_indices, 1, vl); + + // Load the 'fpk' vector using a strided load + vint16m4_t v_fpk_r = 
__riscv_vlse16_v_i16m4(&tmpbuf_base_ptr[2 * k], stride, vl); + vint16m4_t v_fpk_i = __riscv_vlse16_v_i16m4(&tmpbuf_base_ptr[2 * k + 1], stride, vl); + + // Gather the 'fpnk' vector using indexed loads + vuint32m8_t v_tmp_r_offsets = __riscv_vwmulu_vx_u32m8(v_neg_k_indices, sizeof(kiss_fft_fixed16::kiss_fft_cpx), vl); + vuint32m8_t v_tmp_i_offsets = __riscv_vadd_vx_u32m8(v_tmp_r_offsets, sizeof(int16_t), vl); + vint16m4_t v_fpnk_r_raw = __riscv_vluxei32_v_i16m4(tmpbuf_base_ptr, v_tmp_r_offsets, vl); + vint16m4_t v_fpnk_i_raw = __riscv_vluxei32_v_i16m4(tmpbuf_base_ptr, v_tmp_i_offsets, vl); + + // Gather the twiddle factors using indexed loads + vuint32m8_t v_tw_r_offsets = __riscv_vwmulu_vx_u32m8(v_twiddle_indices, sizeof(kiss_fft_fixed16::kiss_fft_cpx), vl); + vuint32m8_t v_tw_i_offsets = __riscv_vadd_vx_u32m8(v_tw_r_offsets, sizeof(int16_t), vl); + vint16m4_t v_tw_r = __riscv_vluxei32_v_i16m4(twiddles_base_ptr, v_tw_r_offsets, vl); + vint16m4_t v_tw_i = __riscv_vluxei32_v_i16m4(twiddles_base_ptr, v_tw_i_offsets, vl); + + // Perform high-precision rounding division on fpk + const int16_t scale = 16383; + const int32_t round_const = 16384; + vint32m8_t v_fpk_r_32 = __riscv_vsra_vx_i32m8( + __riscv_vadd_vx_i32m8(__riscv_vwmul_vx_i32m8(v_fpk_r, scale, vl), round_const, vl), 15, vl); + vint32m8_t v_fpk_i_32 = __riscv_vsra_vx_i32m8( + __riscv_vadd_vx_i32m8(__riscv_vwmul_vx_i32m8(v_fpk_i, scale, vl), round_const, vl), 15, vl); + vint16m4_t v_fpk_r_div2 = __riscv_vnclip_wx_i16m4(v_fpk_r_32, 0, __RISCV_VXRM_RNU, vl); + vint16m4_t v_fpk_i_div2 = __riscv_vnclip_wx_i16m4(v_fpk_i_32, 0, __RISCV_VXRM_RNU, vl); + + // Perform high-precision rounding division on fpnk (with negated imaginary part) + vint16m4_t v_fpnk_i_neg = __riscv_vneg_v_i16m4(v_fpnk_i_raw, vl); + vint32m8_t v_fpnk_r_32 = __riscv_vsra_vx_i32m8( + __riscv_vadd_vx_i32m8(__riscv_vwmul_vx_i32m8(v_fpnk_r_raw, scale, vl), round_const, vl), 15, vl); + vint32m8_t v_fpnk_i_32 = __riscv_vsra_vx_i32m8( + 
__riscv_vadd_vx_i32m8(__riscv_vwmul_vx_i32m8(v_fpnk_i_neg, scale, vl), round_const, vl), 15, vl); + vint16m4_t v_fpnk_r_div2 = __riscv_vnclip_wx_i16m4(v_fpnk_r_32, 0, __RISCV_VXRM_RNU, vl); + vint16m4_t v_fpnk_i_div2 = __riscv_vnclip_wx_i16m4(v_fpnk_i_32, 0, __RISCV_VXRM_RNU, vl); + + // Calculate intermediate values f1k (add) and f2k (subtract) + vint16m4_t v_f1k_r = __riscv_vadd_vv_i16m4(v_fpk_r_div2, v_fpnk_r_div2, vl); + vint16m4_t v_f1k_i = __riscv_vadd_vv_i16m4(v_fpk_i_div2, v_fpnk_i_div2, vl); + vint16m4_t v_f2k_r = __riscv_vsub_vv_i16m4(v_fpk_r_div2, v_fpnk_r_div2, vl); + vint16m4_t v_f2k_i = __riscv_vsub_vv_i16m4(v_fpk_i_div2, v_fpnk_i_div2, vl); + + // Perform complex multiplication + vint32m8_t v_ac = __riscv_vwmul_vv_i32m8(v_f2k_r, v_tw_r, vl); + vint32m8_t v_bd = __riscv_vwmul_vv_i32m8(v_f2k_i, v_tw_i, vl); + vint32m8_t v_ad = __riscv_vwmul_vv_i32m8(v_f2k_r, v_tw_i, vl); + vint32m8_t v_bc = __riscv_vwmul_vv_i32m8(v_f2k_i, v_tw_r, vl); + vint32m8_t v_tw_res_r_32 = __riscv_vssra_vx_i32m8(__riscv_vsub_vv_i32m8(v_ac, v_bd, vl), 15, __RISCV_VXRM_RNU, vl); + vint32m8_t v_tw_res_i_32 = __riscv_vssra_vx_i32m8(__riscv_vadd_vv_i32m8(v_ad, v_bc, vl), 15, __RISCV_VXRM_RNU, vl); + vint16m4_t v_tw_res_r = __riscv_vnclip_wx_i16m4(v_tw_res_r_32, 0, __RISCV_VXRM_RNU, vl); + vint16m4_t v_tw_res_i = __riscv_vnclip_wx_i16m4(v_tw_res_i_32, 0, __RISCV_VXRM_RNU, vl); + + // Calculate final output vectors + vint16m4_t v_out_k_r = __riscv_vsra_vx_i16m4(__riscv_vadd_vv_i16m4(v_f1k_r, v_tw_res_r, vl), 1, vl); + vint16m4_t v_out_k_i = __riscv_vsra_vx_i16m4(__riscv_vadd_vv_i16m4(v_f1k_i, v_tw_res_i, vl), 1, vl); + vint16m4_t v_out_nk_r = __riscv_vsra_vx_i16m4(__riscv_vsub_vv_i16m4(v_f1k_r, v_tw_res_r, vl), 1, vl); + vint16m4_t v_out_nk_i = __riscv_vsra_vx_i16m4(__riscv_vsub_vv_i16m4(v_tw_res_i, v_f1k_i, vl), 1, vl); + + // Store the results using a strided store + __riscv_vsse16_v_i16m4(&freqdata_base_ptr[2 * k], stride, v_out_k_r, vl); + 
__riscv_vsse16_v_i16m4(&freqdata_base_ptr[2 * k + 1], stride, v_out_k_i, vl); + + // Scatter the results using an indexed store + vuint32m8_t v_freq_r_offsets = __riscv_vwmulu_vx_u32m8(v_neg_k_indices, sizeof(kiss_fft_fixed16::kiss_fft_cpx), vl); + vuint32m8_t v_freq_i_offsets = __riscv_vadd_vx_u32m8(v_freq_r_offsets, sizeof(int16_t), vl); + __riscv_vsuxei32_v_i16m4(freqdata_base_ptr, v_freq_r_offsets, v_out_nk_r, vl); + __riscv_vsuxei32_v_i16m4(freqdata_base_ptr, v_freq_i_offsets, v_out_nk_i, vl); + + // Advance to the next vector chunk + k += vl; + } +} + +size_t RfftInt16GetNeededMemory(int32_t fft_length) { + size_t state_size = 0; + kiss_fft_fixed16::kiss_fftr_alloc(fft_length, 0, nullptr, &state_size); + return state_size; +} + +void* RfftInt16Init(int32_t fft_length, void* state, size_t state_size) { + return kiss_fft_fixed16::kiss_fftr_alloc(fft_length, 0, state, &state_size); +} + +void RfftInt16ApplyRVV(void* state, const int16_t* input, + Complex* output) { + kiss_fftr_rvv( + static_cast(state), + reinterpret_cast(input), + reinterpret_cast(output)); +} \ No newline at end of file diff --git a/tensorflow/lite/micro/kernels/riscv_vector/rfft_int16_rvv.h b/tensorflow/lite/micro/kernels/riscv_vector/rfft_int16_rvv.h new file mode 100644 index 00000000000..169bf899f93 --- /dev/null +++ b/tensorflow/lite/micro/kernels/riscv_vector/rfft_int16_rvv.h @@ -0,0 +1,13 @@ +#ifndef TENSORFLOW_LITE_MICRO_KERNELS_RISCV_VECTOR_RFFT_RVV_H_ +#define TENSORFLOW_LITE_MICRO_KERNELS_RISCV_VECTOR_RFFT_RVV_H_ + +#include "tensorflow/lite/kernels/internal/common.h" + +size_t RfftInt16GetNeededMemory(int32_t fft_length); + +void* RfftInt16Init(int32_t fft_length, void* state, size_t state_size); + +void RfftInt16ApplyRVV(void* state, const int16_t* input, + Complex* output); + +#endif \ No newline at end of file diff --git a/tensorflow/lite/micro/tools/make/targets/riscv32_vector_makefile.inc b/tensorflow/lite/micro/tools/make/targets/riscv32_vector_makefile.inc index 
acadddeea34..2d04670fc9f 100644 --- a/tensorflow/lite/micro/tools/make/targets/riscv32_vector_makefile.inc +++ b/tensorflow/lite/micro/tools/make/targets/riscv32_vector_makefile.inc @@ -67,6 +67,8 @@ MICROLITE_CC_SRCS += \ tensorflow/lite/micro/kernels/riscv_vector/fully_connected.cc \ tensorflow/lite/micro/kernels/riscv_vector/pooling.cc \ tensorflow/lite/micro/kernels/riscv_vector/pooling_rvv.cc \ + tensorflow/lite/micro/kernels/riscv_vector/rfft.cc\ + tensorflow/lite/micro/kernels/riscv_vector/rfft_int16_rvv.cc # tensorflow/lite/micro/kernels/riscv_vector/softmax.cc \ EXCLUDED_SRCS := \ @@ -74,6 +76,8 @@ EXCLUDED_SRCS := \ tensorflow/lite/micro/kernels/depthwise_conv.cc \ tensorflow/lite/micro/kernels/fully_connected.cc \ tensorflow/lite/micro/kernels/pooling.cc\ + tflite-micro/signal/micro/kernels/rfft.cc \ + tflite-micro/signal/src/rfft_int16.cc \ # tensorflow/lite/micro/kernels/softmax.cc \ From 871cbc98736bd946b991044ab83372c1ad1ac3b8 Mon Sep 17 00:00:00 2001 From: JaimeHW Date: Sun, 16 Nov 2025 03:48:55 -0600 Subject: [PATCH 54/86] Full vector optimized RFFT and FFT kernels --- .../kernels/riscv_vector/rfft_int16_rvv.cc | 146 --- .../kernels/riscv_vector/{ => signal}/rfft.cc | 0 .../riscv_vector/signal/rfft_int16_rvv.cc | 873 ++++++++++++++++++ .../{ => signal}/rfft_int16_rvv.h | 0 .../make/targets/riscv32_vector_makefile.inc | 5 +- 5 files changed, 876 insertions(+), 148 deletions(-) delete mode 100644 tensorflow/lite/micro/kernels/riscv_vector/rfft_int16_rvv.cc rename tensorflow/lite/micro/kernels/riscv_vector/{ => signal}/rfft.cc (100%) create mode 100644 tensorflow/lite/micro/kernels/riscv_vector/signal/rfft_int16_rvv.cc rename tensorflow/lite/micro/kernels/riscv_vector/{ => signal}/rfft_int16_rvv.h (100%) diff --git a/tensorflow/lite/micro/kernels/riscv_vector/rfft_int16_rvv.cc b/tensorflow/lite/micro/kernels/riscv_vector/rfft_int16_rvv.cc deleted file mode 100644 index 2ed889313fd..00000000000 --- 
a/tensorflow/lite/micro/kernels/riscv_vector/rfft_int16_rvv.cc +++ /dev/null @@ -1,146 +0,0 @@ -#include - -#include "signal/src/complex.h" -#include "signal/src/kiss_fft_wrappers/kiss_fft_int16.h" -#include "signal/src/kiss_fft_wrappers/kiss_fft_common.h" - -#define FIXED_POINT 16 - -namespace kiss_fft_fixed16 -{ -#include "kiss_fft.h" -#include "tools/kiss_fftr.h" -#include "kiss_fft.c" -#include "tools/kiss_fftr.c" -} - -void kiss_fftr_rvv(kiss_fft_fixed16::kiss_fftr_cfg st, const kiss_fft_scalar* timedata, - kiss_fft_fixed16::kiss_fft_cpx* freqdata) -{ - // Handle inverse FFT case and perform the initial complex FFT - if (st->substate->inverse) - { - return; - } - kiss_fft_fixed16::kiss_fft(st->substate, (const kiss_fft_fixed16::kiss_fft_cpx*)timedata, st->tmpbuf); - - // Process DC and Nyquist bins separately (scalar operations) - const int ncfft = st->substate->nfft; - kiss_fft_fixed16::kiss_fft_cpx tdc; - tdc.r = st->tmpbuf[0].r; - tdc.i = st->tmpbuf[0].i; - C_FIXDIV(tdc, 2); - freqdata[0].r = tdc.r + tdc.i; - freqdata[ncfft].r = tdc.r - tdc.i; - freqdata[0].i = 0; - freqdata[ncfft].i = 0; - - // Initialize pointers and loop variables for the main vector processing loop - size_t k = 1; - const size_t loop_end = ncfft / 2; - const int16_t* tmpbuf_base_ptr = (const int16_t*)st->tmpbuf; - const int16_t* twiddles_base_ptr = (const int16_t*)st->super_twiddles; - int16_t* freqdata_base_ptr = (int16_t*)freqdata; - ptrdiff_t stride = sizeof(kiss_fft_fixed16::kiss_fft_cpx); - - // Main loop to process FFT bins in vector chunks - while (k <= loop_end) - { - // Set the vector length (vl) for the current iteration - size_t vl = __riscv_vsetvl_e16m4(loop_end - k + 1); - - // Generate index vectors for accessing fpk, fpnk, and twiddles - vuint16m4_t v_k_indices = __riscv_vid_v_u16m4(vl); - v_k_indices = __riscv_vadd_vx_u16m4(v_k_indices, k, vl); - vuint16m4_t v_neg_k_indices = __riscv_vrsub_vx_u16m4(v_k_indices, ncfft, vl); - vuint16m4_t v_twiddle_indices = 
__riscv_vsub_vx_u16m4(v_k_indices, 1, vl); - - // Load the 'fpk' vector using a strided load - vint16m4_t v_fpk_r = __riscv_vlse16_v_i16m4(&tmpbuf_base_ptr[2 * k], stride, vl); - vint16m4_t v_fpk_i = __riscv_vlse16_v_i16m4(&tmpbuf_base_ptr[2 * k + 1], stride, vl); - - // Gather the 'fpnk' vector using indexed loads - vuint32m8_t v_tmp_r_offsets = __riscv_vwmulu_vx_u32m8(v_neg_k_indices, sizeof(kiss_fft_fixed16::kiss_fft_cpx), vl); - vuint32m8_t v_tmp_i_offsets = __riscv_vadd_vx_u32m8(v_tmp_r_offsets, sizeof(int16_t), vl); - vint16m4_t v_fpnk_r_raw = __riscv_vluxei32_v_i16m4(tmpbuf_base_ptr, v_tmp_r_offsets, vl); - vint16m4_t v_fpnk_i_raw = __riscv_vluxei32_v_i16m4(tmpbuf_base_ptr, v_tmp_i_offsets, vl); - - // Gather the twiddle factors using indexed loads - vuint32m8_t v_tw_r_offsets = __riscv_vwmulu_vx_u32m8(v_twiddle_indices, sizeof(kiss_fft_fixed16::kiss_fft_cpx), vl); - vuint32m8_t v_tw_i_offsets = __riscv_vadd_vx_u32m8(v_tw_r_offsets, sizeof(int16_t), vl); - vint16m4_t v_tw_r = __riscv_vluxei32_v_i16m4(twiddles_base_ptr, v_tw_r_offsets, vl); - vint16m4_t v_tw_i = __riscv_vluxei32_v_i16m4(twiddles_base_ptr, v_tw_i_offsets, vl); - - // Perform high-precision rounding division on fpk - const int16_t scale = 16383; - const int32_t round_const = 16384; - vint32m8_t v_fpk_r_32 = __riscv_vsra_vx_i32m8( - __riscv_vadd_vx_i32m8(__riscv_vwmul_vx_i32m8(v_fpk_r, scale, vl), round_const, vl), 15, vl); - vint32m8_t v_fpk_i_32 = __riscv_vsra_vx_i32m8( - __riscv_vadd_vx_i32m8(__riscv_vwmul_vx_i32m8(v_fpk_i, scale, vl), round_const, vl), 15, vl); - vint16m4_t v_fpk_r_div2 = __riscv_vnclip_wx_i16m4(v_fpk_r_32, 0, __RISCV_VXRM_RNU, vl); - vint16m4_t v_fpk_i_div2 = __riscv_vnclip_wx_i16m4(v_fpk_i_32, 0, __RISCV_VXRM_RNU, vl); - - // Perform high-precision rounding division on fpnk (with negated imaginary part) - vint16m4_t v_fpnk_i_neg = __riscv_vneg_v_i16m4(v_fpnk_i_raw, vl); - vint32m8_t v_fpnk_r_32 = __riscv_vsra_vx_i32m8( - 
__riscv_vadd_vx_i32m8(__riscv_vwmul_vx_i32m8(v_fpnk_r_raw, scale, vl), round_const, vl), 15, vl); - vint32m8_t v_fpnk_i_32 = __riscv_vsra_vx_i32m8( - __riscv_vadd_vx_i32m8(__riscv_vwmul_vx_i32m8(v_fpnk_i_neg, scale, vl), round_const, vl), 15, vl); - vint16m4_t v_fpnk_r_div2 = __riscv_vnclip_wx_i16m4(v_fpnk_r_32, 0, __RISCV_VXRM_RNU, vl); - vint16m4_t v_fpnk_i_div2 = __riscv_vnclip_wx_i16m4(v_fpnk_i_32, 0, __RISCV_VXRM_RNU, vl); - - // Calculate intermediate values f1k (add) and f2k (subtract) - vint16m4_t v_f1k_r = __riscv_vadd_vv_i16m4(v_fpk_r_div2, v_fpnk_r_div2, vl); - vint16m4_t v_f1k_i = __riscv_vadd_vv_i16m4(v_fpk_i_div2, v_fpnk_i_div2, vl); - vint16m4_t v_f2k_r = __riscv_vsub_vv_i16m4(v_fpk_r_div2, v_fpnk_r_div2, vl); - vint16m4_t v_f2k_i = __riscv_vsub_vv_i16m4(v_fpk_i_div2, v_fpnk_i_div2, vl); - - // Perform complex multiplication - vint32m8_t v_ac = __riscv_vwmul_vv_i32m8(v_f2k_r, v_tw_r, vl); - vint32m8_t v_bd = __riscv_vwmul_vv_i32m8(v_f2k_i, v_tw_i, vl); - vint32m8_t v_ad = __riscv_vwmul_vv_i32m8(v_f2k_r, v_tw_i, vl); - vint32m8_t v_bc = __riscv_vwmul_vv_i32m8(v_f2k_i, v_tw_r, vl); - vint32m8_t v_tw_res_r_32 = __riscv_vssra_vx_i32m8(__riscv_vsub_vv_i32m8(v_ac, v_bd, vl), 15, __RISCV_VXRM_RNU, vl); - vint32m8_t v_tw_res_i_32 = __riscv_vssra_vx_i32m8(__riscv_vadd_vv_i32m8(v_ad, v_bc, vl), 15, __RISCV_VXRM_RNU, vl); - vint16m4_t v_tw_res_r = __riscv_vnclip_wx_i16m4(v_tw_res_r_32, 0, __RISCV_VXRM_RNU, vl); - vint16m4_t v_tw_res_i = __riscv_vnclip_wx_i16m4(v_tw_res_i_32, 0, __RISCV_VXRM_RNU, vl); - - // Calculate final output vectors - vint16m4_t v_out_k_r = __riscv_vsra_vx_i16m4(__riscv_vadd_vv_i16m4(v_f1k_r, v_tw_res_r, vl), 1, vl); - vint16m4_t v_out_k_i = __riscv_vsra_vx_i16m4(__riscv_vadd_vv_i16m4(v_f1k_i, v_tw_res_i, vl), 1, vl); - vint16m4_t v_out_nk_r = __riscv_vsra_vx_i16m4(__riscv_vsub_vv_i16m4(v_f1k_r, v_tw_res_r, vl), 1, vl); - vint16m4_t v_out_nk_i = __riscv_vsra_vx_i16m4(__riscv_vsub_vv_i16m4(v_tw_res_i, v_f1k_i, vl), 1, vl); - - // Store the 
results using a strided store - __riscv_vsse16_v_i16m4(&freqdata_base_ptr[2 * k], stride, v_out_k_r, vl); - __riscv_vsse16_v_i16m4(&freqdata_base_ptr[2 * k + 1], stride, v_out_k_i, vl); - - // Scatter the results using an indexed store - vuint32m8_t v_freq_r_offsets = __riscv_vwmulu_vx_u32m8(v_neg_k_indices, sizeof(kiss_fft_fixed16::kiss_fft_cpx), vl); - vuint32m8_t v_freq_i_offsets = __riscv_vadd_vx_u32m8(v_freq_r_offsets, sizeof(int16_t), vl); - __riscv_vsuxei32_v_i16m4(freqdata_base_ptr, v_freq_r_offsets, v_out_nk_r, vl); - __riscv_vsuxei32_v_i16m4(freqdata_base_ptr, v_freq_i_offsets, v_out_nk_i, vl); - - // Advance to the next vector chunk - k += vl; - } -} - -size_t RfftInt16GetNeededMemory(int32_t fft_length) { - size_t state_size = 0; - kiss_fft_fixed16::kiss_fftr_alloc(fft_length, 0, nullptr, &state_size); - return state_size; -} - -void* RfftInt16Init(int32_t fft_length, void* state, size_t state_size) { - return kiss_fft_fixed16::kiss_fftr_alloc(fft_length, 0, state, &state_size); -} - -void RfftInt16ApplyRVV(void* state, const int16_t* input, - Complex<int16_t>* output) { - kiss_fftr_rvv( - static_cast<kiss_fft_fixed16::kiss_fftr_cfg>(state), - reinterpret_cast<const kiss_fft_scalar*>(input), - reinterpret_cast<kiss_fft_fixed16::kiss_fft_cpx*>(output)); -} \ No newline at end of file diff --git a/tensorflow/lite/micro/kernels/riscv_vector/rfft.cc b/tensorflow/lite/micro/kernels/riscv_vector/signal/rfft.cc similarity index 100% rename from tensorflow/lite/micro/kernels/riscv_vector/rfft.cc rename to tensorflow/lite/micro/kernels/riscv_vector/signal/rfft.cc diff --git a/tensorflow/lite/micro/kernels/riscv_vector/signal/rfft_int16_rvv.cc b/tensorflow/lite/micro/kernels/riscv_vector/signal/rfft_int16_rvv.cc new file mode 100644 index 00000000000..78665010511 --- /dev/null +++ b/tensorflow/lite/micro/kernels/riscv_vector/signal/rfft_int16_rvv.cc @@ -0,0 +1,873 @@ +#include <riscv_vector.h> + +#include "signal/src/complex.h" +#include "signal/src/kiss_fft_wrappers/kiss_fft_int16.h" +#include "signal/src/rfft.h" +#include "signal/src/kiss_fft_wrappers/kiss_fft_common.h" + 
+#define FIXED_POINT 16 + +#include "kiss_fft.h" +#include "tools/kiss_fftr.h" + +namespace kiss_fft_fixed16 { +#include "_kiss_fft_guts.h" +struct kiss_fftr_state{ + kiss_fft_cfg substate; + kiss_fft_cpx * tmpbuf; + kiss_fft_cpx * super_twiddles; +#ifdef USE_SIMD + void * pad; +#endif +}; +} + +static void kf_bfly2_rvv(kiss_fft_fixed16::kiss_fft_cpx* Fout, + const size_t fstride, + const kiss_fft_fixed16::kiss_fft_cfg st, size_t m) +{ + // Initialize pointers and constants + kiss_fft_fixed16::kiss_fft_cpx* Fout2 = Fout + m; + const int16_t* tw1_base = (const int16_t*)st->twiddles; + int16_t* Fout_base = (int16_t*)Fout; + int16_t* Fout2_base = (int16_t*)Fout2; + ptrdiff_t cpx_stride = sizeof(kiss_fft_fixed16::kiss_fft_cpx); + ptrdiff_t tw_stride = fstride * cpx_stride; + const int16_t scale = 16383; + const int32_t round_const = 16384; + + // Main processing loop + size_t k = 0; + while (k < m) + { + // Set the vector length for this iteration + size_t vl = __riscv_vsetvl_e16m4(m - k); + + // Load input data vectors + vint16m4_t v_fout_r = + __riscv_vlse16_v_i16m4(Fout_base + 2 * k, cpx_stride, vl); + vint16m4_t v_fout_i = + __riscv_vlse16_v_i16m4(Fout_base + 2 * k + 1, cpx_stride, vl); + vint16m4_t v_fout2_r = + __riscv_vlse16_v_i16m4(Fout2_base + 2 * k, cpx_stride, vl); + vint16m4_t v_fout2_i = + __riscv_vlse16_v_i16m4(Fout2_base + 2 * k + 1, cpx_stride, vl); + + // Load twiddle factor vectors + vint16m4_t v_tw_r = + __riscv_vlse16_v_i16m4(tw1_base + (k * fstride * 2), tw_stride, vl); + vint16m4_t v_tw_i = + __riscv_vlse16_v_i16m4(tw1_base + (k * fstride * 2) + 1, tw_stride, vl); + + // Perform rounding division by 2 on input data + vint32m8_t v_fout_r_32 = __riscv_vsra_vx_i32m8( + __riscv_vadd_vx_i32m8(__riscv_vwmul_vx_i32m8(v_fout_r, scale, vl), + round_const, vl), + 15, vl); + vint32m8_t v_fout_i_32 = __riscv_vsra_vx_i32m8( + __riscv_vadd_vx_i32m8(__riscv_vwmul_vx_i32m8(v_fout_i, scale, vl), + round_const, vl), + 15, vl); + vint16m4_t v_fout_r_div2 = + 
__riscv_vnclip_wx_i16m4(v_fout_r_32, 0, __RISCV_VXRM_RNU, vl); + vint16m4_t v_fout_i_div2 = + __riscv_vnclip_wx_i16m4(v_fout_i_32, 0, __RISCV_VXRM_RNU, vl); + vint32m8_t v_fout2_r_32 = __riscv_vsra_vx_i32m8( + __riscv_vadd_vx_i32m8(__riscv_vwmul_vx_i32m8(v_fout2_r, scale, vl), + round_const, vl), + 15, vl); + vint32m8_t v_fout2_i_32 = __riscv_vsra_vx_i32m8( + __riscv_vadd_vx_i32m8(__riscv_vwmul_vx_i32m8(v_fout2_i, scale, vl), + round_const, vl), + 15, vl); + vint16m4_t v_fout2_r_div2 = + __riscv_vnclip_wx_i16m4(v_fout2_r_32, 0, __RISCV_VXRM_RNU, vl); + vint16m4_t v_fout2_i_div2 = + __riscv_vnclip_wx_i16m4(v_fout2_i_32, 0, __RISCV_VXRM_RNU, vl); + + // Perform complex multiplication: t = Fout2 * tw + vint32m8_t v_ac = __riscv_vwmul_vv_i32m8(v_fout2_r_div2, v_tw_r, vl); + vint32m8_t v_bd = __riscv_vwmul_vv_i32m8(v_fout2_i_div2, v_tw_i, vl); + vint32m8_t v_ad = __riscv_vwmul_vv_i32m8(v_fout2_r_div2, v_tw_i, vl); + vint32m8_t v_bc = __riscv_vwmul_vv_i32m8(v_fout2_i_div2, v_tw_r, vl); + vint32m8_t v_t_r_32 = __riscv_vssra_vx_i32m8( + __riscv_vsub_vv_i32m8(v_ac, v_bd, vl), 15, __RISCV_VXRM_RNU, vl); + vint32m8_t v_t_i_32 = __riscv_vssra_vx_i32m8( + __riscv_vadd_vv_i32m8(v_ad, v_bc, vl), 15, __RISCV_VXRM_RNU, vl); + vint16m4_t v_t_r = __riscv_vnclip_wx_i16m4(v_t_r_32, 0, __RISCV_VXRM_RNU, vl); + vint16m4_t v_t_i = __riscv_vnclip_wx_i16m4(v_t_i_32, 0, __RISCV_VXRM_RNU, vl); + + // Calculate butterfly outputs: Fout = Fout + t and Fout2 = Fout - t + vint16m4_t v_res_fout2_r = __riscv_vsub_vv_i16m4(v_fout_r_div2, v_t_r, vl); + vint16m4_t v_res_fout2_i = __riscv_vsub_vv_i16m4(v_fout_i_div2, v_t_i, vl); + vint16m4_t v_res_fout_r = __riscv_vadd_vv_i16m4(v_fout_r_div2, v_t_r, vl); + vint16m4_t v_res_fout_i = __riscv_vadd_vv_i16m4(v_fout_i_div2, v_t_i, vl); + + // Store results + __riscv_vsse16_v_i16m4(Fout_base + 2 * k, cpx_stride, v_res_fout_r, vl); + __riscv_vsse16_v_i16m4(Fout_base + 2 * k + 1, cpx_stride, v_res_fout_i, vl); + __riscv_vsse16_v_i16m4(Fout2_base + 2 * k, 
cpx_stride, v_res_fout2_r, vl); + __riscv_vsse16_v_i16m4(Fout2_base + 2 * k + 1, cpx_stride, v_res_fout2_i, vl); + + // Advance loop counter + k += vl; + } +} + +static void kf_bfly4_rvv(kiss_fft_fixed16::kiss_fft_cpx* Fout, + const size_t fstride, + const kiss_fft_fixed16::kiss_fft_cfg st, + const size_t m) +{ + // Initialize pointers and constants + const size_t m2 = 2 * m; + const size_t m3 = 3 * m; + + int16_t* Fout0_base = (int16_t*)(Fout); + int16_t* Fout1_base = (int16_t*)(Fout + m); + int16_t* Fout2_base = (int16_t*)(Fout + m2); + int16_t* Fout3_base = (int16_t*)(Fout + m3); + const int16_t* tw_base = (const int16_t*)st->twiddles; + + ptrdiff_t cpx_stride = sizeof(kiss_fft_fixed16::kiss_fft_cpx); + ptrdiff_t tw1_stride = fstride * cpx_stride; + ptrdiff_t tw2_stride = fstride * 2 * cpx_stride; + ptrdiff_t tw3_stride = fstride * 3 * cpx_stride; + + const int16_t scale = 8191; + const int32_t round_const = 16384; + + // Main processing loop + size_t k = 0; + while (k < m) + { + // Set the vector length for this iteration + size_t vl = __riscv_vsetvl_e16m2(m - k); + + // Load input data vectors + vint16m2_t v_f0_r = + __riscv_vlse16_v_i16m2(Fout0_base + 2 * k, cpx_stride, vl); + vint16m2_t v_f0_i = + __riscv_vlse16_v_i16m2(Fout0_base + 2 * k + 1, cpx_stride, vl); + vint16m2_t v_f1_r = + __riscv_vlse16_v_i16m2(Fout1_base + 2 * k, cpx_stride, vl); + vint16m2_t v_f1_i = + __riscv_vlse16_v_i16m2(Fout1_base + 2 * k + 1, cpx_stride, vl); + vint16m2_t v_f2_r = + __riscv_vlse16_v_i16m2(Fout2_base + 2 * k, cpx_stride, vl); + vint16m2_t v_f2_i = + __riscv_vlse16_v_i16m2(Fout2_base + 2 * k + 1, cpx_stride, vl); + vint16m2_t v_f3_r = + __riscv_vlse16_v_i16m2(Fout3_base + 2 * k, cpx_stride, vl); + vint16m2_t v_f3_i = + __riscv_vlse16_v_i16m2(Fout3_base + 2 * k + 1, cpx_stride, vl); + + // Perform rounding division by 4 on input data + vint16m2_t v_f0d_r = __riscv_vnclip_wx_i16m2( + __riscv_vsra_vx_i32m4( + __riscv_vadd_vx_i32m4( + __riscv_vwmul_vx_i32m4(v_f0_r, scale, vl), 
round_const, vl), + 15, vl), + 0, __RISCV_VXRM_RNU, vl); + vint16m2_t v_f0d_i = __riscv_vnclip_wx_i16m2( + __riscv_vsra_vx_i32m4( + __riscv_vadd_vx_i32m4( + __riscv_vwmul_vx_i32m4(v_f0_i, scale, vl), round_const, vl), + 15, vl), + 0, __RISCV_VXRM_RNU, vl); + vint16m2_t v_f1d_r = __riscv_vnclip_wx_i16m2( + __riscv_vsra_vx_i32m4( + __riscv_vadd_vx_i32m4( + __riscv_vwmul_vx_i32m4(v_f1_r, scale, vl), round_const, vl), + 15, vl), + 0, __RISCV_VXRM_RNU, vl); + vint16m2_t v_f1d_i = __riscv_vnclip_wx_i16m2( + __riscv_vsra_vx_i32m4( + __riscv_vadd_vx_i32m4( + __riscv_vwmul_vx_i32m4(v_f1_i, scale, vl), round_const, vl), + 15, vl), + 0, __RISCV_VXRM_RNU, vl); + vint16m2_t v_f2d_r = __riscv_vnclip_wx_i16m2( + __riscv_vsra_vx_i32m4( + __riscv_vadd_vx_i32m4( + __riscv_vwmul_vx_i32m4(v_f2_r, scale, vl), round_const, vl), + 15, vl), + 0, __RISCV_VXRM_RNU, vl); + vint16m2_t v_f2d_i = __riscv_vnclip_wx_i16m2( + __riscv_vsra_vx_i32m4( + __riscv_vadd_vx_i32m4( + __riscv_vwmul_vx_i32m4(v_f2_i, scale, vl), round_const, vl), + 15, vl), + 0, __RISCV_VXRM_RNU, vl); + vint16m2_t v_f3d_r = __riscv_vnclip_wx_i16m2( + __riscv_vsra_vx_i32m4( + __riscv_vadd_vx_i32m4( + __riscv_vwmul_vx_i32m4(v_f3_r, scale, vl), round_const, vl), + 15, vl), + 0, __RISCV_VXRM_RNU, vl); + vint16m2_t v_f3d_i = __riscv_vnclip_wx_i16m2( + __riscv_vsra_vx_i32m4( + __riscv_vadd_vx_i32m4( + __riscv_vwmul_vx_i32m4(v_f3_i, scale, vl), round_const, vl), + 15, vl), + 0, __RISCV_VXRM_RNU, vl); + + // Load twiddle factor vectors + vint16m2_t v_tw1_r = + __riscv_vlse16_v_i16m2(tw_base + (k * fstride * 2), tw1_stride, vl); + vint16m2_t v_tw1_i = + __riscv_vlse16_v_i16m2(tw_base + (k * fstride * 2) + 1, tw1_stride, vl); + vint16m2_t v_tw2_r = + __riscv_vlse16_v_i16m2(tw_base + (k * fstride * 4), tw2_stride, vl); + vint16m2_t v_tw2_i = + __riscv_vlse16_v_i16m2(tw_base + (k * fstride * 4) + 1, tw2_stride, vl); + vint16m2_t v_tw3_r = + __riscv_vlse16_v_i16m2(tw_base + (k * fstride * 6), tw3_stride, vl); + vint16m2_t v_tw3_i = + 
__riscv_vlse16_v_i16m2(tw_base + (k * fstride * 6) + 1, tw3_stride, vl); + + // Perform complex multiplications + vint16m2_t v_s0_r, v_s0_i, v_s1_r, v_s1_i, v_s2_r, v_s2_i; + do + { + vint32m4_t ac = __riscv_vwmul_vv_i32m4(v_f1d_r, v_tw1_r, vl); + vint32m4_t bd = __riscv_vwmul_vv_i32m4(v_f1d_i, v_tw1_i, vl); + vint32m4_t ad = __riscv_vwmul_vv_i32m4(v_f1d_r, v_tw1_i, vl); + vint32m4_t bc = __riscv_vwmul_vv_i32m4(v_f1d_i, v_tw1_r, vl); + v_s0_r = __riscv_vnclip_wx_i16m2(__riscv_vssra_vx_i32m4( + __riscv_vsub_vv_i32m4(ac, bd, vl), 15, __RISCV_VXRM_RNU, vl), + 0, __RISCV_VXRM_RNU, vl); + v_s0_i = __riscv_vnclip_wx_i16m2(__riscv_vssra_vx_i32m4( + __riscv_vadd_vv_i32m4(ad, bc, vl), 15, __RISCV_VXRM_RNU, vl), + 0, __RISCV_VXRM_RNU, vl); + } while (0); + + do + { + vint32m4_t ac = __riscv_vwmul_vv_i32m4(v_f2d_r, v_tw2_r, vl); + vint32m4_t bd = __riscv_vwmul_vv_i32m4(v_f2d_i, v_tw2_i, vl); + vint32m4_t ad = __riscv_vwmul_vv_i32m4(v_f2d_r, v_tw2_i, vl); + vint32m4_t bc = __riscv_vwmul_vv_i32m4(v_f2d_i, v_tw2_r, vl); + v_s1_r = __riscv_vnclip_wx_i16m2(__riscv_vssra_vx_i32m4( + __riscv_vsub_vv_i32m4(ac, bd, vl), 15, __RISCV_VXRM_RNU, vl), + 0, __RISCV_VXRM_RNU, vl); + v_s1_i = __riscv_vnclip_wx_i16m2(__riscv_vssra_vx_i32m4( + __riscv_vadd_vv_i32m4(ad, bc, vl), 15, __RISCV_VXRM_RNU, vl), + 0, __RISCV_VXRM_RNU, vl); + } while (0); + + do + { + vint32m4_t ac = __riscv_vwmul_vv_i32m4(v_f3d_r, v_tw3_r, vl); + vint32m4_t bd = __riscv_vwmul_vv_i32m4(v_f3d_i, v_tw3_i, vl); + vint32m4_t ad = __riscv_vwmul_vv_i32m4(v_f3d_r, v_tw3_i, vl); + vint32m4_t bc = __riscv_vwmul_vv_i32m4(v_f3d_i, v_tw3_r, vl); + v_s2_r = __riscv_vnclip_wx_i16m2(__riscv_vssra_vx_i32m4( + __riscv_vsub_vv_i32m4(ac, bd, vl), 15, __RISCV_VXRM_RNU, vl), + 0, __RISCV_VXRM_RNU, vl); + v_s2_i = __riscv_vnclip_wx_i16m2(__riscv_vssra_vx_i32m4( + __riscv_vadd_vv_i32m4(ad, bc, vl), 15, __RISCV_VXRM_RNU, vl), + 0, __RISCV_VXRM_RNU, vl); + } while (0); + + // Calculate intermediate butterfly values + vint16m2_t v_s5_r = 
__riscv_vsub_vv_i16m2(v_f0d_r, v_s1_r, vl); + vint16m2_t v_s5_i = __riscv_vsub_vv_i16m2(v_f0d_i, v_s1_i, vl); + vint16m2_t v_f0d_plus_s1_r = __riscv_vadd_vv_i16m2(v_f0d_r, v_s1_r, vl); + vint16m2_t v_f0d_plus_s1_i = __riscv_vadd_vv_i16m2(v_f0d_i, v_s1_i, vl); + vint16m2_t v_s3_r = __riscv_vadd_vv_i16m2(v_s0_r, v_s2_r, vl); + vint16m2_t v_s3_i = __riscv_vadd_vv_i16m2(v_s0_i, v_s2_i, vl); + vint16m2_t v_s4_r = __riscv_vsub_vv_i16m2(v_s0_r, v_s2_r, vl); + vint16m2_t v_s4_i = __riscv_vsub_vv_i16m2(v_s0_i, v_s2_i, vl); + vint16m2_t v_res_f0_r = __riscv_vadd_vv_i16m2(v_f0d_plus_s1_r, v_s3_r, vl); + vint16m2_t v_res_f0_i = __riscv_vadd_vv_i16m2(v_f0d_plus_s1_i, v_s3_i, vl); + vint16m2_t v_res_f2_r = __riscv_vsub_vv_i16m2(v_f0d_plus_s1_r, v_s3_r, vl); + vint16m2_t v_res_f2_i = __riscv_vsub_vv_i16m2(v_f0d_plus_s1_i, v_s3_i, vl); + + // Calculate final results, handling inverse case + vint16m2_t v_res_f1_r, v_res_f1_i, v_res_f3_r, v_res_f3_i; + if (st->inverse) + { + v_res_f1_r = __riscv_vsub_vv_i16m2(v_s5_r, v_s4_i, vl); + v_res_f1_i = __riscv_vadd_vv_i16m2(v_s5_i, v_s4_r, vl); + v_res_f3_r = __riscv_vadd_vv_i16m2(v_s5_r, v_s4_i, vl); + v_res_f3_i = __riscv_vsub_vv_i16m2(v_s5_i, v_s4_r, vl); + } + else + { + v_res_f1_r = __riscv_vadd_vv_i16m2(v_s5_r, v_s4_i, vl); + v_res_f1_i = __riscv_vsub_vv_i16m2(v_s5_i, v_s4_r, vl); + v_res_f3_r = __riscv_vsub_vv_i16m2(v_s5_r, v_s4_i, vl); + v_res_f3_i = __riscv_vadd_vv_i16m2(v_s5_i, v_s4_r, vl); + } + + // Store final results + __riscv_vsse16_v_i16m2(Fout0_base + 2 * k, cpx_stride, v_res_f0_r, vl); + __riscv_vsse16_v_i16m2(Fout0_base + 2 * k + 1, cpx_stride, v_res_f0_i, vl); + __riscv_vsse16_v_i16m2(Fout1_base + 2 * k, cpx_stride, v_res_f1_r, vl); + __riscv_vsse16_v_i16m2(Fout1_base + 2 * k + 1, cpx_stride, v_res_f1_i, vl); + __riscv_vsse16_v_i16m2(Fout2_base + 2 * k, cpx_stride, v_res_f2_r, vl); + __riscv_vsse16_v_i16m2(Fout2_base + 2 * k + 1, cpx_stride, v_res_f2_i, vl); + __riscv_vsse16_v_i16m2(Fout3_base + 2 * k, cpx_stride, 
v_res_f3_r, vl); + __riscv_vsse16_v_i16m2(Fout3_base + 2 * k + 1, cpx_stride, v_res_f3_i, vl); + + // Advance loop counter + k += vl; + } +} + +static void kf_bfly3_rvv(kiss_fft_fixed16::kiss_fft_cpx* Fout, + const size_t fstride, + const kiss_fft_fixed16::kiss_fft_cfg st, size_t m) +{ + // Initialize pointers and constants + kiss_fft_fixed16::kiss_fft_cpx* Fout1 = Fout + m; + kiss_fft_fixed16::kiss_fft_cpx* Fout2 = Fout + m * 2; + const int16_t* tw1_base = (const int16_t*)st->twiddles; + const int16_t* tw2_base = tw1_base; + const int16_t tw3i = -28378; // Q15 value for sin(-2*pi/3) + int16_t* Fout0_base = (int16_t*)Fout; + int16_t* Fout1_base = (int16_t*)Fout1; + int16_t* Fout2_base = (int16_t*)Fout2; + ptrdiff_t cpx_stride = sizeof(kiss_fft_fixed16::kiss_fft_cpx); + ptrdiff_t tw1_stride = fstride * cpx_stride; + ptrdiff_t tw2_stride = fstride * 2 * cpx_stride; + + // Main processing loop + size_t k = 0; + while (k < m) + { + // Set the vector length for this iteration + size_t vl = __riscv_vsetvl_e16m2(m - k); + + // Load input data vectors + vint16m2_t v_f0_r = + __riscv_vlse16_v_i16m2(Fout0_base + 2 * k, cpx_stride, vl); + vint16m2_t v_f0_i = + __riscv_vlse16_v_i16m2(Fout0_base + 2 * k + 1, cpx_stride, vl); + vint16m2_t v_f1_r = + __riscv_vlse16_v_i16m2(Fout1_base + 2 * k, cpx_stride, vl); + vint16m2_t v_f1_i = + __riscv_vlse16_v_i16m2(Fout1_base + 2 * k + 1, cpx_stride, vl); + vint16m2_t v_f2_r = + __riscv_vlse16_v_i16m2(Fout2_base + 2 * k, cpx_stride, vl); + vint16m2_t v_f2_i = + __riscv_vlse16_v_i16m2(Fout2_base + 2 * k + 1, cpx_stride, vl); + + // Load twiddle factor vectors + vint16m2_t v_tw1_r = + __riscv_vlse16_v_i16m2(tw1_base + (k * fstride * 2), tw1_stride, vl); + vint16m2_t v_tw1_i = + __riscv_vlse16_v_i16m2(tw1_base + (k * fstride * 2) + 1, tw1_stride, vl); + vint16m2_t v_tw2_r = + __riscv_vlse16_v_i16m2(tw2_base + (k * fstride * 4), tw2_stride, vl); + vint16m2_t v_tw2_i = + __riscv_vlse16_v_i16m2(tw2_base + (k * fstride * 4) + 1, tw2_stride, vl); 
+ + // Perform complex multiplications: v_s0 = v_f1 * v_tw1 + vint32m4_t v_ac0 = __riscv_vwmul_vv_i32m4(v_f1_r, v_tw1_r, vl); + vint32m4_t v_bd0 = __riscv_vwmul_vv_i32m4(v_f1_i, v_tw1_i, vl); + vint32m4_t v_ad0 = __riscv_vwmul_vv_i32m4(v_f1_r, v_tw1_i, vl); + vint32m4_t v_bc0 = __riscv_vwmul_vv_i32m4(v_f1_i, v_tw1_r, vl); + vint16m2_t v_s0_r = __riscv_vnclip_wx_i16m2( + __riscv_vssra_vx_i32m4(__riscv_vsub_vv_i32m4(v_ac0, v_bd0, vl), 15, + __RISCV_VXRM_RNU, vl), + 0, __RISCV_VXRM_RNU, vl); + vint16m2_t v_s0_i = __riscv_vnclip_wx_i16m2( + __riscv_vssra_vx_i32m4(__riscv_vadd_vv_i32m4(v_ad0, v_bc0, vl), 15, + __RISCV_VXRM_RNU, vl), + 0, __RISCV_VXRM_RNU, vl); + + // Perform complex multiplications: v_s1 = v_f2 * v_tw2 + vint32m4_t v_ac1 = __riscv_vwmul_vv_i32m4(v_f2_r, v_tw2_r, vl); + vint32m4_t v_bd1 = __riscv_vwmul_vv_i32m4(v_f2_i, v_tw2_i, vl); + vint32m4_t v_ad1 = __riscv_vwmul_vv_i32m4(v_f2_r, v_tw2_i, vl); + vint32m4_t v_bc1 = __riscv_vwmul_vv_i32m4(v_f2_i, v_tw2_r, vl); + vint16m2_t v_s1_r = __riscv_vnclip_wx_i16m2( + __riscv_vssra_vx_i32m4(__riscv_vsub_vv_i32m4(v_ac1, v_bd1, vl), 15, + __RISCV_VXRM_RNU, vl), + 0, __RISCV_VXRM_RNU, vl); + vint16m2_t v_s1_i = __riscv_vnclip_wx_i16m2( + __riscv_vssra_vx_i32m4(__riscv_vadd_vv_i32m4(v_ad1, v_bc1, vl), 15, + __RISCV_VXRM_RNU, vl), + 0, __RISCV_VXRM_RNU, vl); + + // Calculate intermediate butterfly values + vint16m2_t v_s_add_r = __riscv_vadd_vv_i16m2(v_s0_r, v_s1_r, vl); + vint16m2_t v_s_add_i = __riscv_vadd_vv_i16m2(v_s0_i, v_s1_i, vl); + vint16m2_t v_s_sub_r = __riscv_vsub_vv_i16m2(v_s0_r, v_s1_r, vl); + vint16m2_t v_s_sub_i = __riscv_vsub_vv_i16m2(v_s0_i, v_s1_i, vl); + + // Calculate Fout0 = Fout0 + s_add + vint16m2_t v_res_f0_r = __riscv_vadd_vv_i16m2(v_f0_r, v_s_add_r, vl); + vint16m2_t v_res_f0_i = __riscv_vadd_vv_i16m2(v_f0_i, v_s_add_i, vl); + + // Calculate remaining outputs using rotations + vint16m2_t v_s_add_r_neg_half = + __riscv_vneg_v_i16m2(__riscv_vsra_vx_i16m2(v_s_add_r, 1, vl), vl); + vint16m2_t 
v_s_add_i_neg_half = + __riscv_vneg_v_i16m2(__riscv_vsra_vx_i16m2(v_s_add_i, 1, vl), vl); + vint32m4_t v_s_sub_i_mul_tw3i = __riscv_vwmul_vx_i32m4(v_s_sub_i, tw3i, vl); + vint32m4_t v_s_sub_r_mul_tw3i = __riscv_vwmul_vx_i32m4(v_s_sub_r, tw3i, vl); + vint16m2_t v_s_sub_i_scaled = __riscv_vnclip_wx_i16m2( + __riscv_vssra_vx_i32m4(v_s_sub_i_mul_tw3i, 15, __RISCV_VXRM_RNU, vl), 0, + __RISCV_VXRM_RNU, vl); + vint16m2_t v_s_sub_r_scaled = __riscv_vnclip_wx_i16m2( + __riscv_vssra_vx_i32m4(v_s_sub_r_mul_tw3i, 15, __RISCV_VXRM_RNU, vl), 0, + __RISCV_VXRM_RNU, vl); + vint16m2_t v_tmp_r1 = __riscv_vadd_vv_i16m2(v_f0_r, v_s_add_r_neg_half, vl); + vint16m2_t v_res_f1_r = __riscv_vsub_vv_i16m2(v_tmp_r1, v_s_sub_i_scaled, vl); + vint16m2_t v_tmp_i1 = __riscv_vadd_vv_i16m2(v_f0_i, v_s_add_i_neg_half, vl); + vint16m2_t v_res_f1_i = __riscv_vadd_vv_i16m2(v_tmp_i1, v_s_sub_r_scaled, vl); + vint16m2_t v_res_f2_r = __riscv_vadd_vv_i16m2(v_tmp_r1, v_s_sub_i_scaled, vl); + vint16m2_t v_res_f2_i = __riscv_vsub_vv_i16m2(v_tmp_i1, v_s_sub_r_scaled, vl); + + // Store results + __riscv_vsse16_v_i16m2(Fout0_base + 2 * k, cpx_stride, v_res_f0_r, vl); + __riscv_vsse16_v_i16m2(Fout0_base + 2 * k + 1, cpx_stride, v_res_f0_i, vl); + __riscv_vsse16_v_i16m2(Fout1_base + 2 * k, cpx_stride, v_res_f1_r, vl); + __riscv_vsse16_v_i16m2(Fout1_base + 2 * k + 1, cpx_stride, v_res_f1_i, vl); + __riscv_vsse16_v_i16m2(Fout2_base + 2 * k, cpx_stride, v_res_f2_r, vl); + __riscv_vsse16_v_i16m2(Fout2_base + 2 * k + 1, cpx_stride, v_res_f2_i, vl); + + // Advance loop counter + k += vl; + } +} + +static void kf_bfly5_rvv(kiss_fft_fixed16::kiss_fft_cpx* Fout, + const size_t fstride, + const kiss_fft_fixed16::kiss_fft_cfg st, size_t m) +{ + // Initialize pointers and constants + kiss_fft_fixed16::kiss_fft_cpx *Fout0, *Fout1, *Fout2, *Fout3, *Fout4; + const int16_t* tw_base = (const int16_t*)st->twiddles; + const int16_t ya1 = 19021; // Q15 value for cos(2*pi/5) + const int16_t yb1 = 31164; // Q15 value for sin(2*pi/5) + 
const int16_t ya2 = -30777; // Q15 value for cos(4*pi/5) — NOTE(review): round(cos(4*pi/5)*32768) is -26510, not -30777; ya1 above (19021) likewise differs from cos(2*pi/5) in Q15 (~10126) — verify these radix-5 coefficients + const int16_t yb2 = 19021; // Q15 value for sin(4*pi/5) — NOTE(review): round(sin(4*pi/5)*32768) is 19261, not 19021 — verify + + Fout0 = Fout; + Fout1 = Fout + m; + Fout2 = Fout + 2 * m; + Fout3 = Fout + 3 * m; + Fout4 = Fout + 4 * m; + + int16_t* Fout0_base = (int16_t*)Fout0; + int16_t* Fout1_base = (int16_t*)Fout1; + int16_t* Fout2_base = (int16_t*)Fout2; + int16_t* Fout3_base = (int16_t*)Fout3; + int16_t* Fout4_base = (int16_t*)Fout4; + + ptrdiff_t cpx_stride = sizeof(kiss_fft_fixed16::kiss_fft_cpx); + ptrdiff_t tw1_stride = fstride * cpx_stride; + ptrdiff_t tw2_stride = 2 * tw1_stride; + ptrdiff_t tw3_stride = 3 * tw1_stride; + ptrdiff_t tw4_stride = 4 * tw1_stride; + + // Main processing loop + size_t k = 0; + while (k < m) + { + // Set the vector length for this iteration + size_t vl = __riscv_vsetvl_e16m1(m - k); + + // Load input data vectors + vint16m1_t v_f0_r = + __riscv_vlse16_v_i16m1(Fout0_base + 2 * k, cpx_stride, vl); + vint16m1_t v_f0_i = + __riscv_vlse16_v_i16m1(Fout0_base + 2 * k + 1, cpx_stride, vl); + vint16m1_t v_f1_r = + __riscv_vlse16_v_i16m1(Fout1_base + 2 * k, cpx_stride, vl); + vint16m1_t v_f1_i = + __riscv_vlse16_v_i16m1(Fout1_base + 2 * k + 1, cpx_stride, vl); + vint16m1_t v_f2_r = + __riscv_vlse16_v_i16m1(Fout2_base + 2 * k, cpx_stride, vl); + vint16m1_t v_f2_i = + __riscv_vlse16_v_i16m1(Fout2_base + 2 * k + 1, cpx_stride, vl); + vint16m1_t v_f3_r = + __riscv_vlse16_v_i16m1(Fout3_base + 2 * k, cpx_stride, vl); + vint16m1_t v_f3_i = + __riscv_vlse16_v_i16m1(Fout3_base + 2 * k + 1, cpx_stride, vl); + vint16m1_t v_f4_r = + __riscv_vlse16_v_i16m1(Fout4_base + 2 * k, cpx_stride, vl); + vint16m1_t v_f4_i = + __riscv_vlse16_v_i16m1(Fout4_base + 2 * k + 1, cpx_stride, vl); + + // Load twiddle factor vectors + vint16m1_t v_tw1_r = + __riscv_vlse16_v_i16m1(tw_base + (k * fstride * 2), tw1_stride, vl); + vint16m1_t v_tw1_i = + __riscv_vlse16_v_i16m1(tw_base + (k * fstride * 2) + 1, tw1_stride, vl); + vint16m1_t v_tw2_r = + __riscv_vlse16_v_i16m1(tw_base + 
(k * fstride * 4), tw2_stride, vl); + vint16m1_t v_tw2_i = + __riscv_vlse16_v_i16m1(tw_base + (k * fstride * 4) + 1, tw2_stride, vl); + vint16m1_t v_tw3_r = + __riscv_vlse16_v_i16m1(tw_base + (k * fstride * 6), tw3_stride, vl); + vint16m1_t v_tw3_i = + __riscv_vlse16_v_i16m1(tw_base + (k * fstride * 6) + 1, tw3_stride, vl); + vint16m1_t v_tw4_r = + __riscv_vlse16_v_i16m1(tw_base + (k * fstride * 8), tw4_stride, vl); + vint16m1_t v_tw4_i = + __riscv_vlse16_v_i16m1(tw_base + (k * fstride * 8) + 1, tw4_stride, vl); + +// Macro for complex multiplication, wrapped in do-while(0) to prevent scope issues +#define C_MUL_VEC(res_r, res_i, f_r, f_i, tw_r, tw_i) \ + do \ + { \ + vint32m2_t ac = __riscv_vwmul_vv_i32m2(f_r, tw_r, vl); \ + vint32m2_t bd = __riscv_vwmul_vv_i32m2(f_i, tw_i, vl); \ + vint32m2_t ad = __riscv_vwmul_vv_i32m2(f_r, tw_i, vl); \ + vint32m2_t bc = __riscv_vwmul_vv_i32m2(f_i, tw_r, vl); \ + res_r = __riscv_vnclip_wx_i16m1( \ + __riscv_vssra_vx_i32m2(__riscv_vsub_vv_i32m2(ac, bd, vl), 15, \ + __RISCV_VXRM_RNU, vl), \ + 0, __RISCV_VXRM_RNU, vl); \ + res_i = __riscv_vnclip_wx_i16m1( \ + __riscv_vssra_vx_i32m2(__riscv_vadd_vv_i32m2(ad, bc, vl), 15, \ + __RISCV_VXRM_RNU, vl), \ + 0, __RISCV_VXRM_RNU, vl); \ + } while (0) + + // Perform complex multiplications + vint16m1_t v_s0_r, v_s0_i, v_s1_r, v_s1_i, v_s2_r, v_s2_i, v_s3_r, v_s3_i; + C_MUL_VEC(v_s0_r, v_s0_i, v_f1_r, v_f1_i, v_tw1_r, v_tw1_i); + C_MUL_VEC(v_s1_r, v_s1_i, v_f2_r, v_f2_i, v_tw2_r, v_tw2_i); + C_MUL_VEC(v_s2_r, v_s2_i, v_f3_r, v_f3_i, v_tw3_r, v_tw3_i); + C_MUL_VEC(v_s3_r, v_s3_i, v_f4_r, v_f4_i, v_tw4_r, v_tw4_i); +#undef C_MUL_VEC + + // Calculate intermediate butterfly values + vint16m1_t v_s03_add_r = __riscv_vadd_vv_i16m1(v_s0_r, v_s3_r, vl); + vint16m1_t v_s03_add_i = __riscv_vadd_vv_i16m1(v_s0_i, v_s3_i, vl); + vint16m1_t v_s03_sub_r = __riscv_vsub_vv_i16m1(v_s0_r, v_s3_r, vl); + vint16m1_t v_s03_sub_i = __riscv_vsub_vv_i16m1(v_s0_i, v_s3_i, vl); + vint16m1_t v_s12_add_r = 
__riscv_vadd_vv_i16m1(v_s1_r, v_s2_r, vl); + vint16m1_t v_s12_add_i = __riscv_vadd_vv_i16m1(v_s1_i, v_s2_i, vl); + vint16m1_t v_s12_sub_r = __riscv_vsub_vv_i16m1(v_s1_r, v_s2_r, vl); + vint16m1_t v_s12_sub_i = __riscv_vsub_vv_i16m1(v_s1_i, v_s2_i, vl); + + // Calculate Fout0 = f0 + s03_add + s12_add + vint16m1_t v_res_f0_r = __riscv_vadd_vv_i16m1( + v_f0_r, __riscv_vadd_vv_i16m1(v_s03_add_r, v_s12_add_r, vl), vl); + vint16m1_t v_res_f0_i = __riscv_vadd_vv_i16m1( + v_f0_i, __riscv_vadd_vv_i16m1(v_s03_add_i, v_s12_add_i, vl), vl); + +// Macro for scalar multiplication, wrapped in do-while(0) to prevent scope issues +#define S_MUL_VX(res, val, const_val) \ + do \ + { \ + vint32m2_t tmp_mul = __riscv_vwmul_vx_i32m2(val, const_val, vl); \ + res = __riscv_vnclip_wx_i16m1( \ + __riscv_vssra_vx_i32m2(tmp_mul, 15, __RISCV_VXRM_RNU, vl), 0, \ + __RISCV_VXRM_RNU, vl); \ + } while (0) + + // Perform final rotations + vint16m1_t v_tmp1_r, v_tmp1_i, v_tmp2_r, v_tmp2_i; + S_MUL_VX(v_tmp1_r, v_s03_add_r, ya1); + S_MUL_VX(v_tmp1_i, v_s03_add_i, ya1); + S_MUL_VX(v_tmp2_r, v_s12_add_r, ya2); + S_MUL_VX(v_tmp2_i, v_s12_add_i, ya2); + vint16m1_t v_r_part1 = __riscv_vadd_vv_i16m1( + v_f0_r, __riscv_vadd_vv_i16m1(v_tmp1_r, v_tmp2_r, vl), vl); + vint16m1_t v_i_part1 = __riscv_vadd_vv_i16m1( + v_f0_i, __riscv_vadd_vv_i16m1(v_tmp1_i, v_tmp2_i, vl), vl); + S_MUL_VX(v_tmp1_r, v_s03_sub_i, yb1); + S_MUL_VX(v_tmp1_i, v_s03_sub_r, yb1); + S_MUL_VX(v_tmp2_r, v_s12_sub_i, yb2); + S_MUL_VX(v_tmp2_i, v_s12_sub_r, yb2); + vint16m1_t v_r_part2 = __riscv_vsub_vv_i16m1(v_tmp1_r, v_tmp2_r, vl); + vint16m1_t v_i_part2 = __riscv_vadd_vv_i16m1(v_tmp1_i, v_tmp2_i, vl); + + // Calculate final butterfly outputs + vint16m1_t v_res_f1_r = __riscv_vadd_vv_i16m1(v_r_part1, v_r_part2, vl); + vint16m1_t v_res_f1_i = __riscv_vadd_vv_i16m1(v_i_part1, v_i_part2, vl); + vint16m1_t v_res_f4_r = __riscv_vsub_vv_i16m1(v_r_part1, v_r_part2, vl); + vint16m1_t v_res_f4_i = __riscv_vsub_vv_i16m1(v_i_part1, v_i_part2, vl); + 
v_r_part2 = __riscv_vadd_vv_i16m1(v_tmp1_r, v_tmp2_r, vl); + v_i_part2 = __riscv_vsub_vv_i16m1(v_tmp1_i, v_tmp2_i, vl); + vint16m1_t v_res_f2_r = __riscv_vsub_vv_i16m1(v_r_part1, v_r_part2, vl); + vint16m1_t v_res_f2_i = __riscv_vadd_vv_i16m1(v_i_part1, v_i_part2, vl); + vint16m1_t v_res_f3_r = __riscv_vadd_vv_i16m1(v_r_part1, v_r_part2, vl); + vint16m1_t v_res_f3_i = __riscv_vsub_vv_i16m1(v_i_part1, v_i_part2, vl); +#undef S_MUL_VX + + // Store results + __riscv_vsse16_v_i16m1(Fout0_base + 2 * k, cpx_stride, v_res_f0_r, vl); + __riscv_vsse16_v_i16m1(Fout0_base + 2 * k + 1, cpx_stride, v_res_f0_i, vl); + __riscv_vsse16_v_i16m1(Fout1_base + 2 * k, cpx_stride, v_res_f1_r, vl); + __riscv_vsse16_v_i16m1(Fout1_base + 2 * k + 1, cpx_stride, v_res_f1_i, vl); + __riscv_vsse16_v_i16m1(Fout2_base + 2 * k, cpx_stride, v_res_f2_r, vl); + __riscv_vsse16_v_i16m1(Fout2_base + 2 * k + 1, cpx_stride, v_res_f2_i, vl); + __riscv_vsse16_v_i16m1(Fout3_base + 2 * k, cpx_stride, v_res_f3_r, vl); + __riscv_vsse16_v_i16m1(Fout3_base + 2 * k + 1, cpx_stride, v_res_f3_i, vl); + __riscv_vsse16_v_i16m1(Fout4_base + 2 * k, cpx_stride, v_res_f4_r, vl); + __riscv_vsse16_v_i16m1(Fout4_base + 2 * k + 1, cpx_stride, v_res_f4_i, vl); + + // Advance loop counter + k += vl; + } +} + +// Generic radix implementation copy/pasted from kissfft (kiss_fft.c) +static void kf_bfly_generic( + kiss_fft_fixed16::kiss_fft_cpx * Fout, + const size_t fstride, + const kiss_fft_fixed16::kiss_fft_cfg st, + int m, + int p + ) +{ + int u,k,q1,q; + kiss_fft_fixed16::kiss_fft_cpx * twiddles = st->twiddles; + kiss_fft_fixed16::kiss_fft_cpx t; + int Norig = st->nfft; + + kiss_fft_fixed16::kiss_fft_cpx * scratch = (kiss_fft_fixed16::kiss_fft_cpx*)KISS_FFT_TMP_ALLOC(sizeof(kiss_fft_fixed16::kiss_fft_cpx)*p); + if (scratch == NULL){ + return; + } + + for ( u=0; u<m; ++u ) { + k=u; + for ( q1=0 ; q1<p ; ++q1 ) { + scratch[q1] = Fout[ k ]; + C_FIXDIV(scratch[q1],p); + k += m; + } + + k=u; + for ( q1=0 ; q1<p ; ++q1 ) { + int twidx=0; + Fout[ k ] = scratch[0]; + for (q=1;q<p;++q ) { + twidx += fstride * k; + if (twidx>=Norig) twidx-=Norig; + C_MUL(t,scratch[q] , twiddles[twidx] ); + C_ADDTO( Fout[ k ] ,t); + } + k += m; + } + } + KISS_FFT_TMP_FREE(scratch); +} + +static void 
kf_work_rvv(kiss_fft_fixed16::kiss_fft_cpx* Fout, + const kiss_fft_fixed16::kiss_fft_cpx* f, + const size_t fstride, int in_stride, int* factors, + const kiss_fft_fixed16::kiss_fft_cfg st) +{ + // Decompose the problem into factors p and m + const int p = *factors++; + const int m = *factors++; + kiss_fft_fixed16::kiss_fft_cpx* Fout_beg = Fout; + const kiss_fft_fixed16::kiss_fft_cpx* Fout_end = Fout + p * m; + + // Perform recursion for the m-point DFTs + if (m == 1) + { + do + { + *Fout = *f; + f += fstride * in_stride; + } while (++Fout != Fout_end); + } + else + { + do + { + kf_work_rvv(Fout, f, fstride * p, in_stride, factors, st); + f += fstride * in_stride; + } while ((Fout += m) != Fout_end); + } + + // Perform the p-point butterfly operations + Fout = Fout_beg; + switch (p) + { + case 2: + kf_bfly2_rvv(Fout, fstride, st, m); + break; + case 3: + kf_bfly3_rvv(Fout, fstride, st, m); + break; + case 4: + kf_bfly4_rvv(Fout, fstride, st, m); + break; + case 5: + kf_bfly5_rvv(Fout, fstride, st, m); + break; + default: kf_bfly_generic(Fout, fstride, st, m, p); break; + } +} + +void kiss_fft_stride_rvv(kiss_fft_fixed16::kiss_fft_cfg st, const kiss_fft_fixed16::kiss_fft_cpx* fin, + kiss_fft_fixed16::kiss_fft_cpx* fout, int in_stride) +{ + // Handle in-place transform + if (fin == fout) + { + if (fout == NULL) + { + return; + } + + kiss_fft_fixed16::kiss_fft_cpx* tmpbuf = + (kiss_fft_fixed16::kiss_fft_cpx*)KISS_FFT_TMP_ALLOC( + sizeof(kiss_fft_fixed16::kiss_fft_cpx) * st->nfft); + + if (tmpbuf == NULL) + { + return; + } + + kf_work_rvv(tmpbuf, fin, 1, in_stride, st->factors, st); + + memcpy(fout, tmpbuf, sizeof(kiss_fft_fixed16::kiss_fft_cpx) * st->nfft); + + KISS_FFT_TMP_FREE(tmpbuf); + } + else + { + // Handle out-of-place transform + kf_work_rvv(fout, fin, 1, in_stride, st->factors, st); + } +} + +void kiss_fft_rvv(kiss_fft_fixed16::kiss_fft_cfg cfg, const kiss_fft_fixed16::kiss_fft_cpx* fin, kiss_fft_fixed16::kiss_fft_cpx* fout) +{ + kiss_fft_stride_rvv(cfg, fin, 
fout, 1); +} + +void kiss_fftr_rvv(kiss_fft_fixed16::kiss_fftr_cfg st, const kiss_fft_scalar* timedata, + kiss_fft_fixed16::kiss_fft_cpx* freqdata) +{ + // Handle inverse FFT case and perform the initial complex FFT + if (st->substate->inverse) + { + return; + } + kiss_fft_rvv(st->substate, (const kiss_fft_fixed16::kiss_fft_cpx*)timedata, st->tmpbuf); + + // Process DC and Nyquist bins separately (scalar operations) + const int ncfft = st->substate->nfft; + kiss_fft_fixed16::kiss_fft_cpx tdc; + tdc.r = st->tmpbuf[0].r; + tdc.i = st->tmpbuf[0].i; + C_FIXDIV(tdc, 2); + freqdata[0].r = tdc.r + tdc.i; + freqdata[ncfft].r = tdc.r - tdc.i; + freqdata[0].i = 0; + freqdata[ncfft].i = 0; + + // Initialize pointers and loop variables for the main vector processing loop + size_t k = 1; + const size_t loop_end = ncfft / 2; + const int16_t* tmpbuf_base_ptr = (const int16_t*)st->tmpbuf; + const int16_t* twiddles_base_ptr = (const int16_t*)st->super_twiddles; + int16_t* freqdata_base_ptr = (int16_t*)freqdata; + ptrdiff_t stride = sizeof(kiss_fft_fixed16::kiss_fft_cpx); + + // Main loop to process FFT bins in vector chunks + while (k <= loop_end) + { + // Set the vector length (vl) for the current iteration + size_t vl = __riscv_vsetvl_e16m4(loop_end - k + 1); + + // Generate index vectors for accessing fpk, fpnk, and twiddles + vuint16m4_t v_k_indices = __riscv_vid_v_u16m4(vl); + v_k_indices = __riscv_vadd_vx_u16m4(v_k_indices, k, vl); + vuint16m4_t v_neg_k_indices = __riscv_vrsub_vx_u16m4(v_k_indices, ncfft, vl); + vuint16m4_t v_twiddle_indices = __riscv_vsub_vx_u16m4(v_k_indices, 1, vl); + + // Load the 'fpk' vector using a strided load + vint16m4_t v_fpk_r = __riscv_vlse16_v_i16m4(&tmpbuf_base_ptr[2 * k], stride, vl); + vint16m4_t v_fpk_i = __riscv_vlse16_v_i16m4(&tmpbuf_base_ptr[2 * k + 1], stride, vl); + + // Gather the 'fpnk' vector using indexed loads + vuint32m8_t v_tmp_r_offsets = __riscv_vwmulu_vx_u32m8(v_neg_k_indices, sizeof(kiss_fft_fixed16::kiss_fft_cpx), vl); + 
vuint32m8_t v_tmp_i_offsets = __riscv_vadd_vx_u32m8(v_tmp_r_offsets, sizeof(int16_t), vl); + vint16m4_t v_fpnk_r_raw = __riscv_vluxei32_v_i16m4(tmpbuf_base_ptr, v_tmp_r_offsets, vl); + vint16m4_t v_fpnk_i_raw = __riscv_vluxei32_v_i16m4(tmpbuf_base_ptr, v_tmp_i_offsets, vl); + + // Gather the twiddle factors using indexed loads + vuint32m8_t v_tw_r_offsets = __riscv_vwmulu_vx_u32m8(v_twiddle_indices, sizeof(kiss_fft_fixed16::kiss_fft_cpx), vl); + vuint32m8_t v_tw_i_offsets = __riscv_vadd_vx_u32m8(v_tw_r_offsets, sizeof(int16_t), vl); + vint16m4_t v_tw_r = __riscv_vluxei32_v_i16m4(twiddles_base_ptr, v_tw_r_offsets, vl); + vint16m4_t v_tw_i = __riscv_vluxei32_v_i16m4(twiddles_base_ptr, v_tw_i_offsets, vl); + + // Perform high-precision rounding division on fpk + const int16_t scale = 16383; + const int32_t round_const = 16384; + vint32m8_t v_fpk_r_32 = __riscv_vsra_vx_i32m8( + __riscv_vadd_vx_i32m8(__riscv_vwmul_vx_i32m8(v_fpk_r, scale, vl), round_const, vl), 15, vl); + vint32m8_t v_fpk_i_32 = __riscv_vsra_vx_i32m8( + __riscv_vadd_vx_i32m8(__riscv_vwmul_vx_i32m8(v_fpk_i, scale, vl), round_const, vl), 15, vl); + vint16m4_t v_fpk_r_div2 = __riscv_vnclip_wx_i16m4(v_fpk_r_32, 0, __RISCV_VXRM_RNU, vl); + vint16m4_t v_fpk_i_div2 = __riscv_vnclip_wx_i16m4(v_fpk_i_32, 0, __RISCV_VXRM_RNU, vl); + + // Perform high-precision rounding division on fpnk (with negated imaginary part) + vint16m4_t v_fpnk_i_neg = __riscv_vneg_v_i16m4(v_fpnk_i_raw, vl); + vint32m8_t v_fpnk_r_32 = __riscv_vsra_vx_i32m8( + __riscv_vadd_vx_i32m8(__riscv_vwmul_vx_i32m8(v_fpnk_r_raw, scale, vl), round_const, vl), 15, vl); + vint32m8_t v_fpnk_i_32 = __riscv_vsra_vx_i32m8( + __riscv_vadd_vx_i32m8(__riscv_vwmul_vx_i32m8(v_fpnk_i_neg, scale, vl), round_const, vl), 15, vl); + vint16m4_t v_fpnk_r_div2 = __riscv_vnclip_wx_i16m4(v_fpnk_r_32, 0, __RISCV_VXRM_RNU, vl); + vint16m4_t v_fpnk_i_div2 = __riscv_vnclip_wx_i16m4(v_fpnk_i_32, 0, __RISCV_VXRM_RNU, vl); + + // Calculate intermediate values f1k (add) and f2k 
(subtract) + vint16m4_t v_f1k_r = __riscv_vadd_vv_i16m4(v_fpk_r_div2, v_fpnk_r_div2, vl); + vint16m4_t v_f1k_i = __riscv_vadd_vv_i16m4(v_fpk_i_div2, v_fpnk_i_div2, vl); + vint16m4_t v_f2k_r = __riscv_vsub_vv_i16m4(v_fpk_r_div2, v_fpnk_r_div2, vl); + vint16m4_t v_f2k_i = __riscv_vsub_vv_i16m4(v_fpk_i_div2, v_fpnk_i_div2, vl); + + // Perform complex multiplication + vint32m8_t v_ac = __riscv_vwmul_vv_i32m8(v_f2k_r, v_tw_r, vl); + vint32m8_t v_bd = __riscv_vwmul_vv_i32m8(v_f2k_i, v_tw_i, vl); + vint32m8_t v_ad = __riscv_vwmul_vv_i32m8(v_f2k_r, v_tw_i, vl); + vint32m8_t v_bc = __riscv_vwmul_vv_i32m8(v_f2k_i, v_tw_r, vl); + vint32m8_t v_tw_res_r_32 = __riscv_vssra_vx_i32m8(__riscv_vsub_vv_i32m8(v_ac, v_bd, vl), 15, __RISCV_VXRM_RNU, vl); + vint32m8_t v_tw_res_i_32 = __riscv_vssra_vx_i32m8(__riscv_vadd_vv_i32m8(v_ad, v_bc, vl), 15, __RISCV_VXRM_RNU, vl); + vint16m4_t v_tw_res_r = __riscv_vnclip_wx_i16m4(v_tw_res_r_32, 0, __RISCV_VXRM_RNU, vl); + vint16m4_t v_tw_res_i = __riscv_vnclip_wx_i16m4(v_tw_res_i_32, 0, __RISCV_VXRM_RNU, vl); + + // Calculate final output vectors + vint16m4_t v_out_k_r = __riscv_vsra_vx_i16m4(__riscv_vadd_vv_i16m4(v_f1k_r, v_tw_res_r, vl), 1, vl); + vint16m4_t v_out_k_i = __riscv_vsra_vx_i16m4(__riscv_vadd_vv_i16m4(v_f1k_i, v_tw_res_i, vl), 1, vl); + vint16m4_t v_out_nk_r = __riscv_vsra_vx_i16m4(__riscv_vsub_vv_i16m4(v_f1k_r, v_tw_res_r, vl), 1, vl); + vint16m4_t v_out_nk_i = __riscv_vsra_vx_i16m4(__riscv_vsub_vv_i16m4(v_tw_res_i, v_f1k_i, vl), 1, vl); + + // Store the results using a strided store + __riscv_vsse16_v_i16m4(&freqdata_base_ptr[2 * k], stride, v_out_k_r, vl); + __riscv_vsse16_v_i16m4(&freqdata_base_ptr[2 * k + 1], stride, v_out_k_i, vl); + + // Scatter the results using an indexed store + vuint32m8_t v_freq_r_offsets = __riscv_vwmulu_vx_u32m8(v_neg_k_indices, sizeof(kiss_fft_fixed16::kiss_fft_cpx), vl); + vuint32m8_t v_freq_i_offsets = __riscv_vadd_vx_u32m8(v_freq_r_offsets, sizeof(int16_t), vl); + 
__riscv_vsuxei32_v_i16m4(freqdata_base_ptr, v_freq_r_offsets, v_out_nk_r, vl); + __riscv_vsuxei32_v_i16m4(freqdata_base_ptr, v_freq_i_offsets, v_out_nk_i, vl); + + // Advance to the next vector chunk + k += vl; + } +} + +size_t RfftInt16GetNeededMemory(int32_t fft_length) { + size_t state_size = 0; + kiss_fft_fixed16::kiss_fftr_alloc(fft_length, 0, nullptr, &state_size); + return state_size; +} + +void* RfftInt16Init(int32_t fft_length, void* state, size_t state_size) { + return kiss_fft_fixed16::kiss_fftr_alloc(fft_length, 0, state, &state_size); +} + +void RfftInt16ApplyRVV(void* state, const int16_t* input, + Complex* output) { + kiss_fftr_rvv( + static_cast(state), + reinterpret_cast(input), + reinterpret_cast(output)); +} \ No newline at end of file diff --git a/tensorflow/lite/micro/kernels/riscv_vector/rfft_int16_rvv.h b/tensorflow/lite/micro/kernels/riscv_vector/signal/rfft_int16_rvv.h similarity index 100% rename from tensorflow/lite/micro/kernels/riscv_vector/rfft_int16_rvv.h rename to tensorflow/lite/micro/kernels/riscv_vector/signal/rfft_int16_rvv.h diff --git a/tensorflow/lite/micro/tools/make/targets/riscv32_vector_makefile.inc b/tensorflow/lite/micro/tools/make/targets/riscv32_vector_makefile.inc index 2d04670fc9f..ab3313e788f 100644 --- a/tensorflow/lite/micro/tools/make/targets/riscv32_vector_makefile.inc +++ b/tensorflow/lite/micro/tools/make/targets/riscv32_vector_makefile.inc @@ -67,8 +67,8 @@ MICROLITE_CC_SRCS += \ tensorflow/lite/micro/kernels/riscv_vector/fully_connected.cc \ tensorflow/lite/micro/kernels/riscv_vector/pooling.cc \ tensorflow/lite/micro/kernels/riscv_vector/pooling_rvv.cc \ - tensorflow/lite/micro/kernels/riscv_vector/rfft.cc\ - tensorflow/lite/micro/kernels/riscv_vector/rfft_int16_rvv.cc + tensorflow/lite/micro/kernels/riscv_vector/signal/rfft.cc\ + tensorflow/lite/micro/kernels/riscv_vector/signal/rfft_int16_rvv.cc # tensorflow/lite/micro/kernels/riscv_vector/softmax.cc \ EXCLUDED_SRCS := \ @@ -78,6 +78,7 @@ EXCLUDED_SRCS := 
\ tensorflow/lite/micro/kernels/pooling.cc\ tflite-micro/signal/micro/kernels/rfft.cc \ tflite-micro/signal/src/rfft_int16.cc \ + tflite-micro/signal/src/kiss_fft_wrappers/kiss_fft_int16.cc \ # tensorflow/lite/micro/kernels/softmax.cc \ From 31d59ca9bdbf1ea44147d9b9a8d9d5142e321064 Mon Sep 17 00:00:00 2001 From: JaimeHW Date: Tue, 18 Nov 2025 00:51:26 -0600 Subject: [PATCH 55/86] Initial vector optimized FilterBank kernel --- .../riscv_vector/signal/filter_bank.cc | 178 ++++++++++++++++++ .../riscv_vector/signal/filter_bank_rvv.cc | 76 ++++++++ .../riscv_vector/signal/filter_bank_rvv.h | 23 +++ .../make/targets/riscv32_vector_makefile.inc | 14 +- 4 files changed, 286 insertions(+), 5 deletions(-) create mode 100644 tensorflow/lite/micro/kernels/riscv_vector/signal/filter_bank.cc create mode 100644 tensorflow/lite/micro/kernels/riscv_vector/signal/filter_bank_rvv.cc create mode 100644 tensorflow/lite/micro/kernels/riscv_vector/signal/filter_bank_rvv.h diff --git a/tensorflow/lite/micro/kernels/riscv_vector/signal/filter_bank.cc b/tensorflow/lite/micro/kernels/riscv_vector/signal/filter_bank.cc new file mode 100644 index 00000000000..150373b7456 --- /dev/null +++ b/tensorflow/lite/micro/kernels/riscv_vector/signal/filter_bank.cc @@ -0,0 +1,178 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/lite/micro/kernels/riscv_vector/signal/filter_bank_rvv.h" + +#include + +#include "tensorflow/lite/kernels/internal/tensor_ctypes.h" +#include "tensorflow/lite/kernels/kernel_util.h" +#include "tensorflow/lite/micro/flatbuffer_utils.h" +#include "tensorflow/lite/micro/kernels/kernel_util.h" +#include "tensorflow/lite/micro/memory_helpers.h" +#include "tensorflow/lite/micro/micro_context.h" +#include "tensorflow/lite/micro/micro_utils.h" + +namespace tflite { +namespace { + +constexpr int kInputTensor = 0; +constexpr int kWeightTensor = 1; +constexpr int kUnweightTensor = 2; +constexpr int kChFreqStartsTensor = 3; +constexpr int kChWeightStartsTensor = 4; +constexpr int kChannelWidthsTensor = 5; +constexpr int kOutputTensor = 0; + +// Indices into the init flexbuffer's vector. +// The parameter's name is in the comment that follows. +// Elements in the vectors are ordered alphabetically by parameter name. 
+constexpr int kNumChannelsIndex = 0; // 'num_channels' + +struct TFLMSignalFilterBankParams { + FilterbankConfig config; + uint64_t* work_area; +}; + +void* FilterBankInit(TfLiteContext* context, const char* buffer, + size_t length) { + TFLITE_DCHECK(context->AllocatePersistentBuffer != nullptr); + + auto* params = static_cast( + context->AllocatePersistentBuffer(context, + sizeof(TFLMSignalFilterBankParams))); + if (params == nullptr) { + return nullptr; + } + + tflite::FlexbufferWrapper fbw(reinterpret_cast(buffer), + length); + params->config.num_channels = fbw.ElementAsInt32(kNumChannelsIndex); + + params->work_area = static_cast(context->AllocatePersistentBuffer( + context, (params->config.num_channels + 1) * sizeof(uint64_t))); + + if (params->work_area == nullptr) { + return nullptr; + } + + return params; +} + +TfLiteStatus FilterBankPrepare(TfLiteContext* context, TfLiteNode* node) { + TF_LITE_ENSURE_EQ(context, NumInputs(node), 6); + TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1); + + MicroContext* micro_context = GetMicroContext(context); + TfLiteTensor* input = + micro_context->AllocateTempInputTensor(node, kInputTensor); + TF_LITE_ENSURE(context, input != nullptr); + TF_LITE_ENSURE_EQ(context, NumDimensions(input), 1); + TF_LITE_ENSURE_TYPES_EQ(context, input->type, kTfLiteUInt32); + micro_context->DeallocateTempTfLiteTensor(input); + + input = micro_context->AllocateTempInputTensor(node, kWeightTensor); + TF_LITE_ENSURE(context, input != nullptr); + TF_LITE_ENSURE_EQ(context, NumDimensions(input), 1); + TF_LITE_ENSURE_TYPES_EQ(context, input->type, kTfLiteInt16); + micro_context->DeallocateTempTfLiteTensor(input); + + input = micro_context->AllocateTempInputTensor(node, kUnweightTensor); + TF_LITE_ENSURE(context, input != nullptr); + TF_LITE_ENSURE_EQ(context, NumDimensions(input), 1); + TF_LITE_ENSURE_TYPES_EQ(context, input->type, kTfLiteInt16); + micro_context->DeallocateTempTfLiteTensor(input); + + input = 
micro_context->AllocateTempInputTensor(node, kChFreqStartsTensor); + TF_LITE_ENSURE(context, input != nullptr); + TF_LITE_ENSURE_EQ(context, NumDimensions(input), 1); + TF_LITE_ENSURE_TYPES_EQ(context, input->type, kTfLiteInt16); + micro_context->DeallocateTempTfLiteTensor(input); + + input = micro_context->AllocateTempInputTensor(node, kChWeightStartsTensor); + TF_LITE_ENSURE(context, input != nullptr); + TF_LITE_ENSURE_EQ(context, NumDimensions(input), 1); + TF_LITE_ENSURE_TYPES_EQ(context, input->type, kTfLiteInt16); + micro_context->DeallocateTempTfLiteTensor(input); + + input = micro_context->AllocateTempInputTensor(node, kChannelWidthsTensor); + TF_LITE_ENSURE(context, input != nullptr); + TF_LITE_ENSURE_EQ(context, NumDimensions(input), 1); + TF_LITE_ENSURE_TYPES_EQ(context, input->type, kTfLiteInt16); + micro_context->DeallocateTempTfLiteTensor(input); + + TfLiteTensor* output = + micro_context->AllocateTempOutputTensor(node, kOutputTensor); + TF_LITE_ENSURE(context, output != nullptr); + TF_LITE_ENSURE_EQ(context, NumDimensions(output), 1); + TF_LITE_ENSURE_TYPES_EQ(context, output->type, kTfLiteUInt64); + micro_context->DeallocateTempTfLiteTensor(output); + + return kTfLiteOk; +} + +TfLiteStatus FilterBankEval(TfLiteContext* context, TfLiteNode* node) { + auto* params = reinterpret_cast(node->user_data); + + const TfLiteEvalTensor* input0 = + tflite::micro::GetEvalInput(context, node, kInputTensor); + const TfLiteEvalTensor* input1 = + tflite::micro::GetEvalInput(context, node, kWeightTensor); + const TfLiteEvalTensor* input2 = + tflite::micro::GetEvalInput(context, node, kUnweightTensor); + const TfLiteEvalTensor* input3 = + tflite::micro::GetEvalInput(context, node, kChFreqStartsTensor); + const TfLiteEvalTensor* input4 = + tflite::micro::GetEvalInput(context, node, kChWeightStartsTensor); + const TfLiteEvalTensor* input5 = + tflite::micro::GetEvalInput(context, node, kChannelWidthsTensor); + TfLiteEvalTensor* output = + 
tflite::micro::GetEvalOutput(context, node, kOutputTensor); + + params->config.weights = tflite::micro::GetTensorData(input1); + params->config.unweights = tflite::micro::GetTensorData(input2); + params->config.channel_frequency_starts = + tflite::micro::GetTensorData(input3); + params->config.channel_weight_starts = + tflite::micro::GetTensorData(input4); + params->config.channel_widths = tflite::micro::GetTensorData(input5); + + const uint32_t* input_data = tflite::micro::GetTensorData(input0); + uint64_t* output_data = tflite::micro::GetTensorData(output); + + FilterbankAccumulateChannelsRVV(¶ms->config, input_data, + params->work_area); + + size_t output_size; + TfLiteTypeSizeOf(output->type, &output_size); + output_size *= ElementCount(*output->dims); + // Discard channel 0, which is just scratch + memcpy(output_data, params->work_area + 1, output_size); + return kTfLiteOk; +} + +} // namespace + +namespace tflm_signal { + +TFLMRegistration* Register_FILTER_BANK() { + static TFLMRegistration r = tflite::micro::RegisterOp( + FilterBankInit, FilterBankPrepare, FilterBankEval); + return &r; +} + +} // namespace tflm_signal + +} // namespace tflite \ No newline at end of file diff --git a/tensorflow/lite/micro/kernels/riscv_vector/signal/filter_bank_rvv.cc b/tensorflow/lite/micro/kernels/riscv_vector/signal/filter_bank_rvv.cc new file mode 100644 index 00000000000..e6cac82020c --- /dev/null +++ b/tensorflow/lite/micro/kernels/riscv_vector/signal/filter_bank_rvv.cc @@ -0,0 +1,76 @@ +#include + +#include "tensorflow/lite/micro/kernels/riscv_vector/signal/filter_bank_rvv.h" +#include "tensorflow/lite/micro/micro_log.h" + +#define RVV_MAX_BUFFER_VL 64 + +void FilterbankAccumulateChannelsRVV(const FilterbankConfig* config, + const uint32_t* input, uint64_t* output) { + uint64_t weight_accumulator = 0; + uint64_t unweight_accumulator = 0; + + for (int i = 0; i < config->num_channels + 1; i++) { + const int16_t freq_start = config->channel_frequency_starts[i]; + const 
int16_t weight_start = config->channel_weight_starts[i]; + const int16_t channel_width = config->channel_widths[i]; + + int j = 0; + while (j < channel_width) { + size_t vl = __riscv_vsetvl_e32m4(channel_width - j); + + vuint32m4_t v_input = + __riscv_vle32_v_u32m4(&input[freq_start + j], vl); + + vuint16m2_t v_weights16 = __riscv_vle16_v_u16m2( + reinterpret_cast(&config->weights[weight_start + j]), + vl); + vuint16m2_t v_unweights16 = __riscv_vle16_v_u16m2( + reinterpret_cast(&config->unweights[weight_start + j]), + vl); + + // Widen 16-bit weights to 32-bit + vuint32m4_t v_weights32 = __riscv_vwaddu_vx_u32m4(v_weights16, 0, vl); + vuint32m4_t v_unweights32 = __riscv_vwaddu_vx_u32m4(v_unweights16, 0, vl); + + // Perform 32x32 -> high/low 32-bit multiplication + vuint32m4_t v_prod_w_low = + __riscv_vmul_vv_u32m4(v_input, v_weights32, vl); + vuint32m4_t v_prod_w_high = + __riscv_vmulhu_vv_u32m4(v_input, v_weights32, vl); + + vuint32m4_t v_prod_uw_low = + __riscv_vmul_vv_u32m4(v_input, v_unweights32, vl); + vuint32m4_t v_prod_uw_high = + __riscv_vmulhu_vv_u32m4(v_input, v_unweights32, vl); + + // Use fixed-size buffers for scalar reduction + uint32_t prod_w_low_buf[RVV_MAX_BUFFER_VL]; + uint32_t prod_w_high_buf[RVV_MAX_BUFFER_VL]; + __riscv_vse32_v_u32m4(prod_w_low_buf, v_prod_w_low, vl); + __riscv_vse32_v_u32m4(prod_w_high_buf, v_prod_w_high, vl); + + uint32_t prod_uw_low_buf[RVV_MAX_BUFFER_VL]; + uint32_t prod_uw_high_buf[RVV_MAX_BUFFER_VL]; + __riscv_vse32_v_u32m4(prod_uw_low_buf, v_prod_uw_low, vl); + __riscv_vse32_v_u32m4(prod_uw_high_buf, v_prod_uw_high, vl); + + // Reconstruct 64-bit products and accumulate + for (size_t k = 0; k < vl; k++) { + uint64_t prod_w = + ((uint64_t)prod_w_high_buf[k] << 32) | prod_w_low_buf[k]; + weight_accumulator += prod_w; + + uint64_t prod_uw = + ((uint64_t)prod_uw_high_buf[k] << 32) | prod_uw_low_buf[k]; + unweight_accumulator += prod_uw; + } + + j += vl; + } + + output[i] = weight_accumulator; + weight_accumulator = 
unweight_accumulator; + unweight_accumulator = 0; + } +} \ No newline at end of file diff --git a/tensorflow/lite/micro/kernels/riscv_vector/signal/filter_bank_rvv.h b/tensorflow/lite/micro/kernels/riscv_vector/signal/filter_bank_rvv.h new file mode 100644 index 00000000000..14cd5c58461 --- /dev/null +++ b/tensorflow/lite/micro/kernels/riscv_vector/signal/filter_bank_rvv.h @@ -0,0 +1,23 @@ +#ifndef SIGNAL_SRC_FILTER_BANK_H_ +#define SIGNAL_SRC_FILTER_BANK_H_ + +#include + +#include "tensorflow/lite/kernels/internal/common.h" + +struct FilterbankConfig { + int32_t num_channels; + const int16_t* channel_frequency_starts; + const int16_t* channel_weight_starts; + const int16_t* channel_widths; + const int16_t* weights; + const int16_t* unweights; + int32_t output_scale; + + int32_t input_correction_bits; +}; + +void FilterbankAccumulateChannelsRVV(const FilterbankConfig* config, + const uint32_t* input, uint64_t* output); + +#endif // SIGNAL_SRC_FILTER_BANK_H_ \ No newline at end of file diff --git a/tensorflow/lite/micro/tools/make/targets/riscv32_vector_makefile.inc b/tensorflow/lite/micro/tools/make/targets/riscv32_vector_makefile.inc index ab3313e788f..9e466b10616 100644 --- a/tensorflow/lite/micro/tools/make/targets/riscv32_vector_makefile.inc +++ b/tensorflow/lite/micro/tools/make/targets/riscv32_vector_makefile.inc @@ -4,7 +4,7 @@ TARGET_TOOLCHAIN_PREFIX := riscv32-unknown-elf- RISCV_ARCH := rv32imc_zve32x_zvl128b RISCV_ABI := ilp32 -RISCV_CODE_MODEL := medany +RISCV_CODE_MODEL := medlow # Allow additional flags on the command line for debugging. 
RISCV_EXTRA_CFLAGS := @@ -68,7 +68,9 @@ MICROLITE_CC_SRCS += \ tensorflow/lite/micro/kernels/riscv_vector/pooling.cc \ tensorflow/lite/micro/kernels/riscv_vector/pooling_rvv.cc \ tensorflow/lite/micro/kernels/riscv_vector/signal/rfft.cc\ - tensorflow/lite/micro/kernels/riscv_vector/signal/rfft_int16_rvv.cc + tensorflow/lite/micro/kernels/riscv_vector/signal/rfft_int16_rvv.cc \ + tensorflow/lite/micro/kernels/riscv_vector/signal/filter_bank.cc \ + tensorflow/lite/micro/kernels/riscv_vector/signal/filter_bank_rvv.cc \ # tensorflow/lite/micro/kernels/riscv_vector/softmax.cc \ EXCLUDED_SRCS := \ @@ -76,9 +78,11 @@ EXCLUDED_SRCS := \ tensorflow/lite/micro/kernels/depthwise_conv.cc \ tensorflow/lite/micro/kernels/fully_connected.cc \ tensorflow/lite/micro/kernels/pooling.cc\ - tflite-micro/signal/micro/kernels/rfft.cc \ - tflite-micro/signal/src/rfft_int16.cc \ - tflite-micro/signal/src/kiss_fft_wrappers/kiss_fft_int16.cc \ + signal/micro/kernels/rfft.cc \ + signal/src/rfft_int16.cc \ + signal/src/kiss_fft_wrappers/kiss_fft_int16.cc \ + signal/micro/kernels/filter_bank.cc \ + signal/src/filter_bank.cc \ # tensorflow/lite/micro/kernels/softmax.cc \ From 536315ffe2939c6a89a63563f8d6bcde8703545b Mon Sep 17 00:00:00 2001 From: JaimeHW Date: Tue, 18 Nov 2025 01:02:50 -0600 Subject: [PATCH 56/86] Update vector optimized FilterBank kernel --- .../riscv_vector/signal/filter_bank_rvv.cc | 125 ++++++++++-------- 1 file changed, 71 insertions(+), 54 deletions(-) diff --git a/tensorflow/lite/micro/kernels/riscv_vector/signal/filter_bank_rvv.cc b/tensorflow/lite/micro/kernels/riscv_vector/signal/filter_bank_rvv.cc index e6cac82020c..b896f9e9317 100644 --- a/tensorflow/lite/micro/kernels/riscv_vector/signal/filter_bank_rvv.cc +++ b/tensorflow/lite/micro/kernels/riscv_vector/signal/filter_bank_rvv.cc @@ -7,7 +7,6 @@ void FilterbankAccumulateChannelsRVV(const FilterbankConfig* config, const uint32_t* input, uint64_t* output) { - uint64_t weight_accumulator = 0; uint64_t 
unweight_accumulator = 0; for (int i = 0; i < config->num_channels + 1; i++) { @@ -15,62 +14,80 @@ void FilterbankAccumulateChannelsRVV(const FilterbankConfig* config, const int16_t weight_start = config->channel_weight_starts[i]; const int16_t channel_width = config->channel_widths[i]; - int j = 0; - while (j < channel_width) { - size_t vl = __riscv_vsetvl_e32m4(channel_width - j); - - vuint32m4_t v_input = - __riscv_vle32_v_u32m4(&input[freq_start + j], vl); - - vuint16m2_t v_weights16 = __riscv_vle16_v_u16m2( - reinterpret_cast(&config->weights[weight_start + j]), - vl); - vuint16m2_t v_unweights16 = __riscv_vle16_v_u16m2( - reinterpret_cast(&config->unweights[weight_start + j]), - vl); - - // Widen 16-bit weights to 32-bit - vuint32m4_t v_weights32 = __riscv_vwaddu_vx_u32m4(v_weights16, 0, vl); - vuint32m4_t v_unweights32 = __riscv_vwaddu_vx_u32m4(v_unweights16, 0, vl); - - // Perform 32x32 -> high/low 32-bit multiplication - vuint32m4_t v_prod_w_low = - __riscv_vmul_vv_u32m4(v_input, v_weights32, vl); - vuint32m4_t v_prod_w_high = - __riscv_vmulhu_vv_u32m4(v_input, v_weights32, vl); - - vuint32m4_t v_prod_uw_low = - __riscv_vmul_vv_u32m4(v_input, v_unweights32, vl); - vuint32m4_t v_prod_uw_high = - __riscv_vmulhu_vv_u32m4(v_input, v_unweights32, vl); - - // Use fixed-size buffers for scalar reduction - uint32_t prod_w_low_buf[RVV_MAX_BUFFER_VL]; - uint32_t prod_w_high_buf[RVV_MAX_BUFFER_VL]; - __riscv_vse32_v_u32m4(prod_w_low_buf, v_prod_w_low, vl); - __riscv_vse32_v_u32m4(prod_w_high_buf, v_prod_w_high, vl); - - uint32_t prod_uw_low_buf[RVV_MAX_BUFFER_VL]; - uint32_t prod_uw_high_buf[RVV_MAX_BUFFER_VL]; - __riscv_vse32_v_u32m4(prod_uw_low_buf, v_prod_uw_low, vl); - __riscv_vse32_v_u32m4(prod_uw_high_buf, v_prod_uw_high, vl); - - // Reconstruct 64-bit products and accumulate - for (size_t k = 0; k < vl; k++) { - uint64_t prod_w = - ((uint64_t)prod_w_high_buf[k] << 32) | prod_w_low_buf[k]; - weight_accumulator += prod_w; - - uint64_t prod_uw = - 
((uint64_t)prod_uw_high_buf[k] << 32) | prod_uw_low_buf[k]; - unweight_accumulator += prod_uw; + uint64_t channel_w_acc = unweight_accumulator; + uint64_t channel_uw_acc = 0; + + if (channel_width > 0) { + size_t vl_max_for_channel = __riscv_vsetvl_e32m4(channel_width); + + vuint32m4_t v_acc_w_low = __riscv_vmv_v_x_u32m4(0, vl_max_for_channel); + vuint32m4_t v_acc_w_high = __riscv_vmv_v_x_u32m4(0, vl_max_for_channel); + vuint32m4_t v_acc_uw_low = __riscv_vmv_v_x_u32m4(0, vl_max_for_channel); + vuint32m4_t v_acc_uw_high = __riscv_vmv_v_x_u32m4(0, vl_max_for_channel); + + int j = 0; + while (j < channel_width) { + size_t vl = __riscv_vsetvl_e32m4(channel_width - j); + + vuint32m4_t v_input = + __riscv_vle32_v_u32m4(&input[freq_start + j], vl); + + vuint16m2_t v_weights16 = __riscv_vle16_v_u16m2( + reinterpret_cast(&config->weights[weight_start + j]), + vl); + vuint16m2_t v_unweights16 = __riscv_vle16_v_u16m2( + reinterpret_cast(&config->unweights[weight_start + j]), + vl); + + vuint32m4_t v_weights32 = __riscv_vwaddu_vx_u32m4(v_weights16, 0, vl); + vuint32m4_t v_unweights32 = + __riscv_vwaddu_vx_u32m4(v_unweights16, 0, vl); + + vuint32m4_t v_prod_w_low = + __riscv_vmul_vv_u32m4(v_input, v_weights32, vl); + vuint32m4_t v_prod_w_high = + __riscv_vmulhu_vv_u32m4(v_input, v_weights32, vl); + vuint32m4_t v_prod_uw_low = + __riscv_vmul_vv_u32m4(v_input, v_unweights32, vl); + vuint32m4_t v_prod_uw_high = + __riscv_vmulhu_vv_u32m4(v_input, v_unweights32, vl); + + vuint32m4_t v_next_acc_w_low = + __riscv_vadd_vv_u32m4(v_acc_w_low, v_prod_w_low, vl); + vuint32m4_t v_next_acc_uw_low = + __riscv_vadd_vv_u32m4(v_acc_uw_low, v_prod_uw_low, vl); + + vbool8_t v_carry_w = + __riscv_vmsltu_vv_u32m4_b8(v_next_acc_w_low, v_acc_w_low, vl); + vbool8_t v_carry_uw = + __riscv_vmsltu_vv_u32m4_b8(v_next_acc_uw_low, v_acc_uw_low, vl); + + v_acc_w_high = __riscv_vadc_vvm_u32m4(v_acc_w_high, v_prod_w_high, v_carry_w, vl); + v_acc_uw_high = __riscv_vadc_vvm_u32m4(v_acc_uw_high, v_prod_uw_high, 
v_carry_uw, vl); + + v_acc_w_low = v_next_acc_w_low; + v_acc_uw_low = v_next_acc_uw_low; + + j += vl; } - j += vl; + uint32_t acc_w_low_buf[RVV_MAX_BUFFER_VL], acc_w_high_buf[RVV_MAX_BUFFER_VL]; + uint32_t acc_uw_low_buf[RVV_MAX_BUFFER_VL], acc_uw_high_buf[RVV_MAX_BUFFER_VL]; + + __riscv_vse32_v_u32m4(acc_w_low_buf, v_acc_w_low, vl_max_for_channel); + __riscv_vse32_v_u32m4(acc_w_high_buf, v_acc_w_high, vl_max_for_channel); + __riscv_vse32_v_u32m4(acc_uw_low_buf, v_acc_uw_low, vl_max_for_channel); + __riscv_vse32_v_u32m4(acc_uw_high_buf, v_acc_uw_high, vl_max_for_channel); + + for (size_t k = 0; k < (size_t)channel_width; ++k) { + channel_w_acc += + ((uint64_t)acc_w_high_buf[k] << 32) | acc_w_low_buf[k]; + channel_uw_acc += + ((uint64_t)acc_uw_high_buf[k] << 32) | acc_uw_low_buf[k]; + } } - output[i] = weight_accumulator; - weight_accumulator = unweight_accumulator; - unweight_accumulator = 0; + output[i] = channel_w_acc; + unweight_accumulator = channel_uw_acc; } } \ No newline at end of file From 3c21233304de5f2dae0a53f1c60c5cb85107c5e7 Mon Sep 17 00:00:00 2001 From: JaimeHW Date: Tue, 18 Nov 2025 01:12:52 -0600 Subject: [PATCH 57/86] Update vector optimized FilterBank kernel --- .../riscv_vector/signal/filter_bank_rvv.cc | 198 ++++++++++-------- 1 file changed, 113 insertions(+), 85 deletions(-) diff --git a/tensorflow/lite/micro/kernels/riscv_vector/signal/filter_bank_rvv.cc b/tensorflow/lite/micro/kernels/riscv_vector/signal/filter_bank_rvv.cc index b896f9e9317..ad419a9930c 100644 --- a/tensorflow/lite/micro/kernels/riscv_vector/signal/filter_bank_rvv.cc +++ b/tensorflow/lite/micro/kernels/riscv_vector/signal/filter_bank_rvv.cc @@ -3,91 +3,119 @@ #include "tensorflow/lite/micro/kernels/riscv_vector/signal/filter_bank_rvv.h" #include "tensorflow/lite/micro/micro_log.h" -#define RVV_MAX_BUFFER_VL 64 - void FilterbankAccumulateChannelsRVV(const FilterbankConfig* config, - const uint32_t* input, uint64_t* output) { - uint64_t unweight_accumulator = 0; - - for (int 
i = 0; i < config->num_channels + 1; i++) { - const int16_t freq_start = config->channel_frequency_starts[i]; - const int16_t weight_start = config->channel_weight_starts[i]; - const int16_t channel_width = config->channel_widths[i]; - - uint64_t channel_w_acc = unweight_accumulator; - uint64_t channel_uw_acc = 0; - - if (channel_width > 0) { - size_t vl_max_for_channel = __riscv_vsetvl_e32m4(channel_width); - - vuint32m4_t v_acc_w_low = __riscv_vmv_v_x_u32m4(0, vl_max_for_channel); - vuint32m4_t v_acc_w_high = __riscv_vmv_v_x_u32m4(0, vl_max_for_channel); - vuint32m4_t v_acc_uw_low = __riscv_vmv_v_x_u32m4(0, vl_max_for_channel); - vuint32m4_t v_acc_uw_high = __riscv_vmv_v_x_u32m4(0, vl_max_for_channel); - - int j = 0; - while (j < channel_width) { - size_t vl = __riscv_vsetvl_e32m4(channel_width - j); - - vuint32m4_t v_input = - __riscv_vle32_v_u32m4(&input[freq_start + j], vl); - - vuint16m2_t v_weights16 = __riscv_vle16_v_u16m2( - reinterpret_cast(&config->weights[weight_start + j]), - vl); - vuint16m2_t v_unweights16 = __riscv_vle16_v_u16m2( - reinterpret_cast(&config->unweights[weight_start + j]), - vl); - - vuint32m4_t v_weights32 = __riscv_vwaddu_vx_u32m4(v_weights16, 0, vl); - vuint32m4_t v_unweights32 = - __riscv_vwaddu_vx_u32m4(v_unweights16, 0, vl); - - vuint32m4_t v_prod_w_low = - __riscv_vmul_vv_u32m4(v_input, v_weights32, vl); - vuint32m4_t v_prod_w_high = - __riscv_vmulhu_vv_u32m4(v_input, v_weights32, vl); - vuint32m4_t v_prod_uw_low = - __riscv_vmul_vv_u32m4(v_input, v_unweights32, vl); - vuint32m4_t v_prod_uw_high = - __riscv_vmulhu_vv_u32m4(v_input, v_unweights32, vl); - - vuint32m4_t v_next_acc_w_low = - __riscv_vadd_vv_u32m4(v_acc_w_low, v_prod_w_low, vl); - vuint32m4_t v_next_acc_uw_low = - __riscv_vadd_vv_u32m4(v_acc_uw_low, v_prod_uw_low, vl); - - vbool8_t v_carry_w = - __riscv_vmsltu_vv_u32m4_b8(v_next_acc_w_low, v_acc_w_low, vl); - vbool8_t v_carry_uw = - __riscv_vmsltu_vv_u32m4_b8(v_next_acc_uw_low, v_acc_uw_low, vl); - - v_acc_w_high = 
__riscv_vadc_vvm_u32m4(v_acc_w_high, v_prod_w_high, v_carry_w, vl); - v_acc_uw_high = __riscv_vadc_vvm_u32m4(v_acc_uw_high, v_prod_uw_high, v_carry_uw, vl); - - v_acc_w_low = v_next_acc_w_low; - v_acc_uw_low = v_next_acc_uw_low; - - j += vl; - } - - uint32_t acc_w_low_buf[RVV_MAX_BUFFER_VL], acc_w_high_buf[RVV_MAX_BUFFER_VL]; - uint32_t acc_uw_low_buf[RVV_MAX_BUFFER_VL], acc_uw_high_buf[RVV_MAX_BUFFER_VL]; - - __riscv_vse32_v_u32m4(acc_w_low_buf, v_acc_w_low, vl_max_for_channel); - __riscv_vse32_v_u32m4(acc_w_high_buf, v_acc_w_high, vl_max_for_channel); - __riscv_vse32_v_u32m4(acc_uw_low_buf, v_acc_uw_low, vl_max_for_channel); - __riscv_vse32_v_u32m4(acc_uw_high_buf, v_acc_uw_high, vl_max_for_channel); - - for (size_t k = 0; k < (size_t)channel_width; ++k) { - channel_w_acc += - ((uint64_t)acc_w_high_buf[k] << 32) | acc_w_low_buf[k]; - channel_uw_acc += - ((uint64_t)acc_uw_high_buf[k] << 32) | acc_uw_low_buf[k]; - } + const uint32_t* input, uint64_t* output) +{ + // Initialize unweighted accumulator for the first channel + uint64_t unweight_accumulator = 0; + + // Loop over each channel + for (int i = 0; i < config->num_channels + 1; i++) + { + // Get parameters for the current channel + const int16_t freq_start = config->channel_frequency_starts[i]; + const int16_t weight_start = config->channel_weight_starts[i]; + const int16_t channel_width = config->channel_widths[i]; + + // Initialize scalar accumulators for this channel + uint64_t channel_w_acc = unweight_accumulator; + uint64_t channel_uw_acc = 0; + + // Process channel only if it has non-zero width + if (channel_width > 0) + { + // Set max vector length for the channel + size_t vl_max = __riscv_vsetvl_e32m4(channel_width); + + // Initialize vector accumulators for 64-bit sums (low and high parts) + vuint32m4_t v_acc_w_low = __riscv_vmv_v_x_u32m4(0, vl_max); + vuint32m4_t v_acc_w_high = __riscv_vmv_v_x_u32m4(0, vl_max); + vuint32m4_t v_acc_uw_low = __riscv_vmv_v_x_u32m4(0, vl_max); + vuint32m4_t 
v_acc_uw_high = __riscv_vmv_v_x_u32m4(0, vl_max); + + // Initialize scalar counters for total carries + size_t w_carry_count = 0; + size_t uw_carry_count = 0; + + // Process the channel width in vector-sized chunks (stripmining) + int j = 0; + while (j < channel_width) + { + // Set vector length for the current strip + size_t vl = __riscv_vsetvl_e32m4(channel_width - j); + + // Load vector of input data + vuint32m4_t v_input = + __riscv_vle32_v_u32m4(&input[freq_start + j], vl); + + // Load 16-bit weights and unweights + vuint16m2_t v_weights16 = __riscv_vle16_v_u16m2( + reinterpret_cast(&config->weights[weight_start + j]), vl); + vuint16m2_t v_unweights16 = __riscv_vle16_v_u16m2( + reinterpret_cast(&config->unweights[weight_start + j]), vl); + + // Widen weights and unweights to 32-bit + vuint32m4_t v_weights32 = __riscv_vwaddu_vx_u32m4(v_weights16, 0, vl); + vuint32m4_t v_unweights32 = __riscv_vwaddu_vx_u32m4(v_unweights16, 0, vl); + + // Perform 32x32 multiply, producing 64-bit results as low/high pairs + vuint32m4_t v_prod_w_low = __riscv_vmul_vv_u32m4(v_input, v_weights32, vl); + vuint32m4_t v_prod_w_high = __riscv_vmulhu_vv_u32m4(v_input, v_weights32, vl); + vuint32m4_t v_prod_uw_low = __riscv_vmul_vv_u32m4(v_input, v_unweights32, vl); + vuint32m4_t v_prod_uw_high = __riscv_vmulhu_vv_u32m4(v_input, v_unweights32, vl); + + // Add the low 32-bit parts of the products + vuint32m4_t v_next_acc_w_low = __riscv_vadd_vv_u32m4(v_acc_w_low, v_prod_w_low, vl); + vuint32m4_t v_next_acc_uw_low = __riscv_vadd_vv_u32m4(v_acc_uw_low, v_prod_uw_low, vl); + + // Detect carries from the low-part addition + vbool8_t v_carry_w = __riscv_vmsltu_vv_u32m4_b8(v_next_acc_w_low, v_acc_w_low, vl); + vbool8_t v_carry_uw = __riscv_vmsltu_vv_u32m4_b8(v_next_acc_uw_low, v_acc_uw_low, vl); + + // Count the number of carries that occurred in this iteration + w_carry_count += __riscv_vcpop_m_b8(v_carry_w, vl); + uw_carry_count += __riscv_vcpop_m_b8(v_carry_uw, vl); + + // Add the high 32-bit 
parts of the products + v_acc_w_high = __riscv_vadd_vv_u32m4(v_acc_w_high, v_prod_w_high, vl); + v_acc_uw_high = __riscv_vadd_vv_u32m4(v_acc_uw_high, v_prod_uw_high, vl); + + // Update the low-part accumulators + v_acc_w_low = v_next_acc_w_low; + v_acc_uw_low = v_next_acc_uw_low; + + // Advance stripmining index + j += vl; + } + + // Initialize a zero vector for reduction + vuint32m1_t v_zero = __riscv_vmv_v_x_u32m1(0, vl_max); + + // Reduce the 32-bit vector accumulators to scalar sums + vuint32m1_t v_sum_w_low = __riscv_vredsum_vs_u32m4_u32m1(v_acc_w_low, v_zero, vl_max); + vuint32m1_t v_sum_uw_low = __riscv_vredsum_vs_u32m4_u32m1(v_acc_uw_low, v_zero, vl_max); + vuint32m1_t v_sum_w_high = __riscv_vredsum_vs_u32m4_u32m1(v_acc_w_high, v_zero, vl_max); + vuint32m1_t v_sum_uw_high = __riscv_vredsum_vs_u32m4_u32m1(v_acc_uw_high, v_zero, vl_max); + + // Extract scalar results from vector registers + uint32_t final_w_low = __riscv_vmv_x_s_u32m1_u32(v_sum_w_low); + uint32_t final_uw_low = __riscv_vmv_x_s_u32m1_u32(v_sum_uw_low); + uint32_t final_w_high = __riscv_vmv_x_s_u32m1_u32(v_sum_w_high); + uint32_t final_uw_high = __riscv_vmv_x_s_u32m1_u32(v_sum_uw_high); + + // Reconstruct the final 64-bit sum, adding the total carry count to the high part + uint64_t final_w = ((uint64_t)(final_w_high + w_carry_count) << 32) | final_w_low; + uint64_t final_uw = ((uint64_t)(final_uw_high + uw_carry_count) << 32) | final_uw_low; + + // Add the vector reduction result to the channel's scalar accumulator + channel_w_acc += final_w; + channel_uw_acc += final_uw; + } + + // Store the final weighted result for this channel + output[i] = channel_w_acc; + + // The unweighted sum from this channel becomes the starting accumulator for the next + unweight_accumulator = channel_uw_acc; } - - output[i] = channel_w_acc; - unweight_accumulator = channel_uw_acc; - } } \ No newline at end of file From 64c8a1803fef8bfc88c20ba503826149c3998712 Mon Sep 17 00:00:00 2001 From: JaimeHW Date: Tue, 18 
Nov 2025 02:44:30 -0600 Subject: [PATCH 58/86] Update micro_speech_test for more accurate benchmarking --- .../micro/examples/micro_speech/Makefile.inc | 2 +- .../micro_speech/micro_speech_test2.cc | 163 +++++++++--------- 2 files changed, 78 insertions(+), 87 deletions(-) diff --git a/tensorflow/lite/micro/examples/micro_speech/Makefile.inc b/tensorflow/lite/micro/examples/micro_speech/Makefile.inc index a1b5b565cf5..d8a93991ef7 100644 --- a/tensorflow/lite/micro/examples/micro_speech/Makefile.inc +++ b/tensorflow/lite/micro/examples/micro_speech/Makefile.inc @@ -1,6 +1,6 @@ MICRO_SPEECH_TEST_SRCS := \ -$(TENSORFLOW_ROOT)tensorflow/lite/micro/examples/micro_speech/micro_speech_test.cc +$(TENSORFLOW_ROOT)tensorflow/lite/micro/examples/micro_speech/micro_speech_test2.cc MICRO_SPEECH_TEST_HDRS := \ $(TENSORFLOW_ROOT)tensorflow/lite/micro/examples/micro_speech/micro_model_settings.h \ diff --git a/tensorflow/lite/micro/examples/micro_speech/micro_speech_test2.cc b/tensorflow/lite/micro/examples/micro_speech/micro_speech_test2.cc index 1839fcef2b6..75d0ad07518 100644 --- a/tensorflow/lite/micro/examples/micro_speech/micro_speech_test2.cc +++ b/tensorflow/lite/micro/examples/micro_speech/micro_speech_test2.cc @@ -1,20 +1,7 @@ -/* Copyright 2024 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-==============================================================================*/ - #include +#include #include +#include // Required for atoi #include #include "tensorflow/lite/core/c/common.h" @@ -22,11 +9,8 @@ limitations under the License. #include "tensorflow/lite/micro/examples/micro_speech/models/audio_preprocessor_int8_model_data.h" #include "tensorflow/lite/micro/examples/micro_speech/models/micro_speech_quantized_model_data.h" #include "tensorflow/lite/micro/examples/micro_speech/testdata/no_1000ms_audio_data.h" -#include "tensorflow/lite/micro/examples/micro_speech/testdata/no_30ms_audio_data.h" -#include "tensorflow/lite/micro/examples/micro_speech/testdata/noise_1000ms_audio_data.h" #include "tensorflow/lite/micro/examples/micro_speech/testdata/silence_1000ms_audio_data.h" #include "tensorflow/lite/micro/examples/micro_speech/testdata/yes_1000ms_audio_data.h" -#include "tensorflow/lite/micro/examples/micro_speech/testdata/yes_30ms_audio_data.h" #include "tensorflow/lite/micro/micro_interpreter.h" #include "tensorflow/lite/micro/micro_log.h" #include "tensorflow/lite/micro/micro_mutable_op_resolver.h" @@ -37,7 +21,7 @@ namespace { // Arena size is a guesstimate, followed by use of // MicroInterpreter::arena_used_bytes() on both the AudioPreprocessor and // MicroSpeech models and using the larger of the two results. -constexpr size_t kArenaSize = 28584; // xtensa p6 +constexpr size_t kArenaSize = 28584; alignas(16) uint8_t g_arena[kArenaSize]; using Features = int8_t[kFeatureCount][kFeatureSize]; @@ -51,6 +35,7 @@ constexpr int kAudioSampleStrideCount = using MicroSpeechOpResolver = tflite::MicroMutableOpResolver<4>; using AudioPreprocessorOpResolver = tflite::MicroMutableOpResolver<18>; +// Registers the ops used by the MicroSpeech model. 
TfLiteStatus RegisterOps(MicroSpeechOpResolver& op_resolver) { TF_LITE_ENSURE_STATUS(op_resolver.AddReshape()); TF_LITE_ENSURE_STATUS(op_resolver.AddFullyConnected()); @@ -59,6 +44,7 @@ TfLiteStatus RegisterOps(MicroSpeechOpResolver& op_resolver) { return kTfLiteOk; } +// Registers the ops used by the AudioPreprocessor model. TfLiteStatus RegisterOps(AudioPreprocessorOpResolver& op_resolver) { TF_LITE_ENSURE_STATUS(op_resolver.AddReshape()); TF_LITE_ENSURE_STATUS(op_resolver.AddCast()); @@ -81,108 +67,113 @@ TfLiteStatus RegisterOps(AudioPreprocessorOpResolver& op_resolver) { return kTfLiteOk; } -TfLiteStatus LoadMicroSpeechModelAndPerformInference( - const Features& features, const char* expected_label) { - // Map the model into a usable data structure. This doesn't involve any - // copying or parsing, it's a very lightweight operation. - const tflite::Model* model = - tflite::GetModel(g_micro_speech_quantized_model_data); - - MicroSpeechOpResolver op_resolver; - RegisterOps(op_resolver); - - tflite::MicroInterpreter interpreter(model, op_resolver, g_arena, kArenaSize); - interpreter.AllocateTensors(); - - TfLiteTensor* input = interpreter.input(0); - - TfLiteTensor* output = interpreter.output(0); - - float output_scale = output->params.scale; - int output_zero_point = output->params.zero_point; - - for (int i = 0; i < 4; i++) { - std::copy_n(&features[0][0], kFeatureElementCount, - tflite::GetTensorData(input)); - interpreter.Invoke(); - } - - // Dequantize output values - volatile float category_predictions[kCategoryCount]; - for (int i = 0; i < kCategoryCount; i++) { - category_predictions[i] = - (tflite::GetTensorData(output)[i] - output_zero_point) * - output_scale; - - if (category_predictions[i] > -1000.0f) { - // Dummy read to satisfy compiler - } - } - - return kTfLiteOk; -} - +// Helper function to generate a single feature slice. 
TfLiteStatus GenerateSingleFeature(const int16_t* audio_data, const int audio_data_size, int8_t* feature_output, tflite::MicroInterpreter* interpreter) { TfLiteTensor* input = interpreter->input(0); - TfLiteTensor* output = interpreter->output(0); - std::copy_n(audio_data, audio_data_size, tflite::GetTensorData(input)); - interpreter->Invoke(); + if (interpreter->Invoke() != kTfLiteOk) { + return kTfLiteError; + } + TfLiteTensor* output = interpreter->output(0); std::copy_n(tflite::GetTensorData(output), kFeatureSize, feature_output); - return kTfLiteOk; } +// Generates the full feature data from a single audio clip. TfLiteStatus GenerateFeatures(const int16_t* audio_data, const size_t audio_data_size, Features* features_output) { - // Map the model into a usable data structure. This doesn't involve any - // copying or parsing, it's a very lightweight operation. const tflite::Model* model = tflite::GetModel(g_audio_preprocessor_int8_model_data); - AudioPreprocessorOpResolver op_resolver; - RegisterOps(op_resolver); + TF_LITE_ENSURE_STATUS(RegisterOps(op_resolver)); tflite::MicroInterpreter interpreter(model, op_resolver, g_arena, kArenaSize); - interpreter.AllocateTensors(); + TF_LITE_ENSURE_STATUS(interpreter.AllocateTensors()); size_t remaining_samples = audio_data_size; size_t feature_index = 0; while (remaining_samples >= kAudioSampleDurationCount && feature_index < kFeatureCount) { - GenerateSingleFeature(audio_data, kAudioSampleDurationCount, - (*features_output)[feature_index], &interpreter); + TF_LITE_ENSURE_STATUS( + GenerateSingleFeature(audio_data, kAudioSampleDurationCount, + (*features_output)[feature_index], &interpreter)); feature_index++; audio_data += kAudioSampleStrideCount; remaining_samples -= kAudioSampleStrideCount; } - - return kTfLiteOk; -} - -TfLiteStatus TestAudioSample(const char* label, const int16_t* audio_data, - const size_t audio_data_size) { - GenerateFeatures(audio_data, audio_data_size, &g_features); - 
LoadMicroSpeechModelAndPerformInference(g_features, label); return kTfLiteOk; } } // namespace -int main () { - TestAudioSample("no", g_no_1000ms_audio_data, g_no_1000ms_audio_data_size); +int main(int argc, char** argv) { + // ==================================================================== + // 1. PARSE COMMAND-LINE ARGUMENTS + // ==================================================================== + if (argc != 2) { + printf("ERROR: Incorrect usage.\n"); + printf("Usage: %s \n", argv[0]); + return 1; + } + + int num_invocations = atoi(argv[1]); + if (num_invocations <= 0) { + printf("ERROR: Number of invocations must be greater than 0.\n"); + return 1; + } + + // ==================================================================== + // 2. PERFORM ONE-TIME SETUP + // This is the "startup cost" that the delta measurement will cancel out. + // ==================================================================== + printf("Performing one-time setup...\n"); + + // Generate a single, representative feature set from an audio file. + // The "yes" audio file is a good choice for a typical input. + if (GenerateFeatures(g_yes_1000ms_audio_data, g_yes_1000ms_audio_data_size, + &g_features) != kTfLiteOk) { + printf("ERROR: Feature generation failed.\n"); + return 1; + } + + // Set up the MicroSpeech interpreter. + const tflite::Model* model = + tflite::GetModel(g_micro_speech_quantized_model_data); + MicroSpeechOpResolver op_resolver; + if (RegisterOps(op_resolver) != kTfLiteOk) { + printf("ERROR: Failed to register ops.\n"); + return 1; + } + + tflite::MicroInterpreter interpreter(model, op_resolver, g_arena, kArenaSize); + if (interpreter.AllocateTensors() != kTfLiteOk) { + printf("ERROR: AllocateTensors() failed.\n"); + return 1; + } - TestAudioSample("yes", g_yes_1000ms_audio_data, g_yes_1000ms_audio_data_size); + // Get the input tensor and copy the feature data into it. 
+ TfLiteTensor* input = interpreter.input(0); + std::copy_n(&g_features[0][0], kFeatureElementCount, + tflite::GetTensorData(input)); + + printf("Setup complete.\n"); + + // ==================================================================== + printf("Running %d invocations...\n", num_invocations); + + for (int i = 0; i < num_invocations; ++i) { + if (interpreter.Invoke() != kTfLiteOk) { + return 1; + } + } - TestAudioSample("silence", g_silence_1000ms_audio_data, - g_silence_1000ms_audio_data_size); + printf("Finished all invocations successfully.\n"); - TestAudioSample("silence", g_noise_1000ms_audio_data, - g_noise_1000ms_audio_data_size); + return 0; } \ No newline at end of file From 4307f3c4ee9b6677199fb3f9232b598fddc7b1d5 Mon Sep 17 00:00:00 2001 From: JaimeHW Date: Tue, 18 Nov 2025 02:45:29 -0600 Subject: [PATCH 59/86] Revert target makefile RISCV_CODE_MODEL --- .../lite/micro/tools/make/targets/riscv32_vector_makefile.inc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/lite/micro/tools/make/targets/riscv32_vector_makefile.inc b/tensorflow/lite/micro/tools/make/targets/riscv32_vector_makefile.inc index 9e466b10616..6306b2607da 100644 --- a/tensorflow/lite/micro/tools/make/targets/riscv32_vector_makefile.inc +++ b/tensorflow/lite/micro/tools/make/targets/riscv32_vector_makefile.inc @@ -4,7 +4,7 @@ TARGET_TOOLCHAIN_PREFIX := riscv32-unknown-elf- RISCV_ARCH := rv32imc_zve32x_zvl128b RISCV_ABI := ilp32 -RISCV_CODE_MODEL := medlow +RISCV_CODE_MODEL := medany # Allow additional flags on the command line for debugging. 
RISCV_EXTRA_CFLAGS := From 9ee7feff3241dcb8cc6d6812d31fa491c9c0be8a Mon Sep 17 00:00:00 2001 From: JaimeHW Date: Tue, 18 Nov 2025 08:20:09 -0600 Subject: [PATCH 60/86] New person_detection2.cc for benchmarking --- .../examples/person_detection/Makefile.inc | 2 +- .../person_detection_test2.cc | 100 ++++++++++++++++++ 2 files changed, 101 insertions(+), 1 deletion(-) create mode 100644 tensorflow/lite/micro/examples/person_detection/person_detection_test2.cc diff --git a/tensorflow/lite/micro/examples/person_detection/Makefile.inc b/tensorflow/lite/micro/examples/person_detection/Makefile.inc index c142c7ddc10..c06e75d97ad 100644 --- a/tensorflow/lite/micro/examples/person_detection/Makefile.inc +++ b/tensorflow/lite/micro/examples/person_detection/Makefile.inc @@ -5,7 +5,7 @@ person_detection_MODEL_HDRS := \ $(TENSORFLOW_ROOT)tensorflow/lite/micro/examples/person_detection/model_settings.h person_detection_TEST_SRCS := \ -$(TENSORFLOW_ROOT)tensorflow/lite/micro/examples/person_detection/person_detection_test.cc \ +$(TENSORFLOW_ROOT)tensorflow/lite/micro/examples/person_detection/person_detection_test2.cc \ $(person_detection_MODEL_SRCS) person_detection_TEST_HDRS := \ diff --git a/tensorflow/lite/micro/examples/person_detection/person_detection_test2.cc b/tensorflow/lite/micro/examples/person_detection/person_detection_test2.cc new file mode 100644 index 00000000000..57fed75773e --- /dev/null +++ b/tensorflow/lite/micro/examples/person_detection/person_detection_test2.cc @@ -0,0 +1,100 @@ +/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include +#include +#include + +#include "tensorflow/lite/c/common.h" +#include "tensorflow/lite/micro/examples/person_detection/model_settings.h" +#include "tensorflow/lite/micro/examples/person_detection/testdata/person_image_data.h" +#include "tensorflow/lite/micro/micro_interpreter.h" +#include "tensorflow/lite/micro/micro_log.h" +#include "tensorflow/lite/micro/micro_mutable_op_resolver.h" +#include "tensorflow/lite/micro/models/person_detect_model_data.h" +#include "tensorflow/lite/micro/testing/micro_test.h" +#include "tensorflow/lite/schema/schema_generated.h" + +// Create an area of memory to use for input, output, and intermediate arrays. 
+#if defined(XTENSA) && defined(VISION_P6) +constexpr int tensor_arena_size = 352 * 1024; +#else +constexpr int tensor_arena_size = 136 * 1024; +#endif // defined(XTENSA) && defined(VISION_P6) +uint8_t tensor_arena[tensor_arena_size]; + +int main(int argc, char** argv) { + // Parse command-line arguments + if (argc != 2) { + printf("ERROR: Incorrect usage.\n"); + printf("Usage: %s \n", argv[0]); + return 1; + } + + int num_invocations = atoi(argv[1]); + if (num_invocations <= 0) { + printf("ERROR: Number of invocations must be greater than 0.\n"); + return 1; + } + + // This is the "startup cost" that the delta measurement will cancel out + printf("Performing one-time setup...\n"); + + // Map the model into a usable data structure + const tflite::Model* model = ::tflite::GetModel(g_person_detect_model_data); + if (model->version() != TFLITE_SCHEMA_VERSION) { + // Don't care + return 1; + } + + // Pull in only the operation implementations we need + tflite::MicroMutableOpResolver<5> micro_op_resolver; + micro_op_resolver.AddAveragePool2D(tflite::Register_AVERAGE_POOL_2D_INT8()); + micro_op_resolver.AddConv2D(tflite::Register_CONV_2D_INT8()); + micro_op_resolver.AddDepthwiseConv2D( + tflite::Register_DEPTHWISE_CONV_2D_INT8()); + micro_op_resolver.AddReshape(); + micro_op_resolver.AddSoftmax(tflite::Register_SOFTMAX_INT8()); + + // Build an interpreter to run the model with + tflite::MicroInterpreter interpreter(model, micro_op_resolver, tensor_arena, + tensor_arena_size); + if (interpreter.AllocateTensors() != kTfLiteOk) { + printf("ERROR: AllocateTensors() failed.\n"); + return 1; + } + + // Get information about the model's input tensor + TfLiteTensor* input = interpreter.input(0); + + // Copy a representative image into the input tensor + memcpy(input->data.int8, g_person_image_data, input->bytes); + + printf("Setup complete.\n"); + + // Run the benchmark loop + printf("Running %d invocations...\n", num_invocations); + + for (int i = 0; i < num_invocations; ++i) { 
+ if (interpreter.Invoke() != kTfLiteOk) { + printf("ERROR: Invoke() failed on iteration %d.\n", i); + return 1; + } + } + + printf("Finished all invocations successfully.\n"); + + return 0; +} \ No newline at end of file From 7dea3b9b2fc87adde2cb8d842b18eab8bd396a7c Mon Sep 17 00:00:00 2001 From: JaimeHW Date: Tue, 18 Nov 2025 08:20:46 -0600 Subject: [PATCH 61/86] Format comments --- .../micro_speech/micro_speech_test2.cc | 19 +++++++------------ .../kernels/riscv_vector/requantize_rvv.h | 18 +++++++++--------- 2 files changed, 16 insertions(+), 21 deletions(-) diff --git a/tensorflow/lite/micro/examples/micro_speech/micro_speech_test2.cc b/tensorflow/lite/micro/examples/micro_speech/micro_speech_test2.cc index 75d0ad07518..462cd4c8396 100644 --- a/tensorflow/lite/micro/examples/micro_speech/micro_speech_test2.cc +++ b/tensorflow/lite/micro/examples/micro_speech/micro_speech_test2.cc @@ -1,7 +1,7 @@ #include #include #include -#include // Required for atoi +#include #include #include "tensorflow/lite/core/c/common.h" @@ -113,9 +113,7 @@ TfLiteStatus GenerateFeatures(const int16_t* audio_data, } // namespace int main(int argc, char** argv) { - // ==================================================================== - // 1. PARSE COMMAND-LINE ARGUMENTS - // ==================================================================== + // Parse command-line argument if (argc != 2) { printf("ERROR: Incorrect usage.\n"); printf("Usage: %s \n", argv[0]); @@ -128,21 +126,18 @@ int main(int argc, char** argv) { return 1; } - // ==================================================================== - // 2. PERFORM ONE-TIME SETUP - // This is the "startup cost" that the delta measurement will cancel out. - // ==================================================================== + // This is the "startup cost" that the delta measurement will cancel out printf("Performing one-time setup...\n"); - // Generate a single, representative feature set from an audio file. 
- // The "yes" audio file is a good choice for a typical input. + // Generate a single, representative feature set from an audio file + // The "yes" audio file is a good choice for a typical input if (GenerateFeatures(g_yes_1000ms_audio_data, g_yes_1000ms_audio_data_size, &g_features) != kTfLiteOk) { printf("ERROR: Feature generation failed.\n"); return 1; } - // Set up the MicroSpeech interpreter. + // Set up the MicroSpeech interpreter const tflite::Model* model = tflite::GetModel(g_micro_speech_quantized_model_data); MicroSpeechOpResolver op_resolver; @@ -157,7 +152,7 @@ int main(int argc, char** argv) { return 1; } - // Get the input tensor and copy the feature data into it. + // Get the input tensor and copy the feature data into it TfLiteTensor* input = interpreter.input(0); std::copy_n(&g_features[0][0], kFeatureElementCount, tflite::GetTensorData(input)); diff --git a/tensorflow/lite/micro/kernels/riscv_vector/requantize_rvv.h b/tensorflow/lite/micro/kernels/riscv_vector/requantize_rvv.h index 678c26e35a7..f6498dfebc5 100644 --- a/tensorflow/lite/micro/kernels/riscv_vector/requantize_rvv.h +++ b/tensorflow/lite/micro/kernels/riscv_vector/requantize_rvv.h @@ -63,14 +63,14 @@ inline vint32m4_t RequantizeVectorPerChannelS32( const int32_t output_offset, const int32_t activation_min, const int32_t activation_max, const size_t vl) { - // Perform 32x32 -> 64-bit multiplication, getting high and low parts. + // Perform 32x32 -> 64-bit multiplication, getting high and low parts vint32m4_t v_prod_hi = __riscv_vmulh_vv_i32m4(v_acc, v_multiplier, vl); vint32m4_t v_prod_lo = __riscv_vmul_vv_i32m4(v_acc, v_multiplier, vl); - // Calculate the effective right shift for TFLM's fixed-point scheme. + // Calculate the effective right shift for TFLM's fixed-point scheme vint32m4_t v_effective_shift = __riscv_vrsub_vx_i32m4(v_shift, 31, vl); - // Create masks to separate lanes into right-shift and left-shift paths. 
+ // Create masks to separate lanes into right-shift and left-shift paths vbool8_t v_mask_right_shift = __riscv_vmsgt_vx_i32m4_b8(v_effective_shift, 0, vl); vbool8_t v_mask_left_shift = __riscv_vmnot_m_b8(v_mask_right_shift, vl); @@ -78,7 +78,7 @@ inline vint32m4_t RequantizeVectorPerChannelS32( // Path 1: Right Shift (for lanes where effective_shift > 0) vint32m4_t v_res_right; { - // Calculate the 64-bit rounding value: (1LL << (effective_shift - 1)). + // Calculate the 64-bit rounding value: (1LL << (effective_shift - 1)) vint32m4_t v_shift_minus_1 = __riscv_vsub_vx_i32m4_m( v_mask_right_shift, v_effective_shift, 1, vl); vuint32m4_t v_shift_minus_1_u = @@ -103,7 +103,7 @@ inline vint32m4_t RequantizeVectorPerChannelS32( vl), v_mask_round_ge_32, vl); - // Add the 64-bit rounding value to the 64-bit product using 32-bit ops. + // Add the 64-bit rounding value to the 64-bit product using 32-bit ops vuint32m4_t v_prod_lo_u = __riscv_vreinterpret_v_i32m4_u32m4(v_prod_lo); vuint32m4_t v_sum_lo_u = __riscv_vadd_vv_u32m4_m( v_mask_right_shift, v_prod_lo_u, v_rounding_lo_u, vl); @@ -114,7 +114,7 @@ inline vint32m4_t RequantizeVectorPerChannelS32( __riscv_vreinterpret_v_u32m4_i32m4(v_rounding_hi_u), vl); v_rounded_hi = __riscv_vadd_vx_i32m4_m(v_carry, v_rounded_hi, 1, vl); - // Emulate a 64-bit arithmetic right shift using two 32-bit sub-paths. + // Emulate a 64-bit arithmetic right shift using two 32-bit sub-paths vbool8_t v_mask_shift_lt_32 = __riscv_vmslt_vx_i32m4_b8_m( v_mask_right_shift, v_effective_shift, 32, vl); vbool8_t v_mask_shift_ge_32 = __riscv_vmandn_mm_b8( @@ -142,17 +142,17 @@ inline vint32m4_t RequantizeVectorPerChannelS32( // Path 2: Left Shift (for lanes where effective_shift <= 0) vint32m4_t v_res_left; { - // Calculate the positive left shift amount. 
+ // Calculate the positive left shift amount vint32m4_t v_left_shift_amount = __riscv_vneg_v_i32m4_m(v_mask_left_shift, v_effective_shift, vl); - // Perform the left shift on the low 32 bits of the product. + // Perform the left shift on the low 32 bits of the product v_res_left = __riscv_vsll_vv_i32m4_m( v_mask_left_shift, v_prod_lo, __riscv_vreinterpret_v_i32m4_u32m4(v_left_shift_amount), vl); } - // Merge the results from the right and left shift paths. + // Merge the results from the right and left shift paths vint32m4_t v_res32 = __riscv_vmerge_vvm_i32m4(v_res_left, v_res_right, v_mask_right_shift, vl); From 4255b946cdfa3e2d8befcb00499b87a70ab91861 Mon Sep 17 00:00:00 2001 From: JaimeHW Date: Tue, 18 Nov 2025 09:14:17 -0600 Subject: [PATCH 62/86] Update micro_speech_test2 --- .../micro_speech/micro_speech_test2.cc | 126 +++++++++--------- 1 file changed, 62 insertions(+), 64 deletions(-) diff --git a/tensorflow/lite/micro/examples/micro_speech/micro_speech_test2.cc b/tensorflow/lite/micro/examples/micro_speech/micro_speech_test2.cc index 462cd4c8396..499cfb8aca1 100644 --- a/tensorflow/lite/micro/examples/micro_speech/micro_speech_test2.cc +++ b/tensorflow/lite/micro/examples/micro_speech/micro_speech_test2.cc @@ -67,49 +67,6 @@ TfLiteStatus RegisterOps(AudioPreprocessorOpResolver& op_resolver) { return kTfLiteOk; } -// Helper function to generate a single feature slice. -TfLiteStatus GenerateSingleFeature(const int16_t* audio_data, - const int audio_data_size, - int8_t* feature_output, - tflite::MicroInterpreter* interpreter) { - TfLiteTensor* input = interpreter->input(0); - std::copy_n(audio_data, audio_data_size, - tflite::GetTensorData(input)); - if (interpreter->Invoke() != kTfLiteOk) { - return kTfLiteError; - } - TfLiteTensor* output = interpreter->output(0); - std::copy_n(tflite::GetTensorData(output), kFeatureSize, - feature_output); - return kTfLiteOk; -} - -// Generates the full feature data from a single audio clip. 
-TfLiteStatus GenerateFeatures(const int16_t* audio_data, - const size_t audio_data_size, - Features* features_output) { - const tflite::Model* model = - tflite::GetModel(g_audio_preprocessor_int8_model_data); - AudioPreprocessorOpResolver op_resolver; - TF_LITE_ENSURE_STATUS(RegisterOps(op_resolver)); - - tflite::MicroInterpreter interpreter(model, op_resolver, g_arena, kArenaSize); - TF_LITE_ENSURE_STATUS(interpreter.AllocateTensors()); - - size_t remaining_samples = audio_data_size; - size_t feature_index = 0; - while (remaining_samples >= kAudioSampleDurationCount && - feature_index < kFeatureCount) { - TF_LITE_ENSURE_STATUS( - GenerateSingleFeature(audio_data, kAudioSampleDurationCount, - (*features_output)[feature_index], &interpreter)); - feature_index++; - audio_data += kAudioSampleStrideCount; - remaining_samples -= kAudioSampleStrideCount; - } - return kTfLiteOk; -} - } // namespace int main(int argc, char** argv) { @@ -126,44 +83,85 @@ int main(int argc, char** argv) { return 1; } - // This is the "startup cost" that the delta measurement will cancel out - printf("Performing one-time setup...\n"); + // One-time setup for both models + printf("Performing one-time setup for both models...\n"); - // Generate a single, representative feature set from an audio file - // The "yes" audio file is a good choice for a typical input - if (GenerateFeatures(g_yes_1000ms_audio_data, g_yes_1000ms_audio_data_size, - &g_features) != kTfLiteOk) { - printf("ERROR: Feature generation failed.\n"); + // Set up the AudioPreprocessor interpreter + const tflite::Model* preprocessor_model = + tflite::GetModel(g_audio_preprocessor_int8_model_data); + AudioPreprocessorOpResolver preprocessor_op_resolver; + if (RegisterOps(preprocessor_op_resolver) != kTfLiteOk) { + printf("ERROR: Failed to register preprocessor ops.\n"); return 1; } // Set up the MicroSpeech interpreter - const tflite::Model* model = + const tflite::Model* speech_model = 
tflite::GetModel(g_micro_speech_quantized_model_data); - MicroSpeechOpResolver op_resolver; - if (RegisterOps(op_resolver) != kTfLiteOk) { - printf("ERROR: Failed to register ops.\n"); + MicroSpeechOpResolver speech_op_resolver; + if (RegisterOps(speech_op_resolver) != kTfLiteOk) { + printf("ERROR: Failed to register speech ops.\n"); return 1; } - tflite::MicroInterpreter interpreter(model, op_resolver, g_arena, kArenaSize); - if (interpreter.AllocateTensors() != kTfLiteOk) { - printf("ERROR: AllocateTensors() failed.\n"); + // Create BOTH interpreters first, sharing the same arena. + tflite::MicroInterpreter preprocessor_interpreter( + preprocessor_model, preprocessor_op_resolver, g_arena, kArenaSize); + tflite::MicroInterpreter speech_interpreter( + speech_model, speech_op_resolver, g_arena, kArenaSize); + + // Allocate tensors for the first model. + if (preprocessor_interpreter.AllocateTensors() != kTfLiteOk) { + printf("ERROR: Preprocessor AllocateTensors() failed.\n"); + return 1; + } + // Now, the second interpreter will automatically allocate its memory *after* + // the first one in the shared arena. 
+ if (speech_interpreter.AllocateTensors() != kTfLiteOk) { + printf("ERROR: Speech AllocateTensors() failed.\n"); return 1; } - // Get the input tensor and copy the feature data into it - TfLiteTensor* input = interpreter.input(0); - std::copy_n(&g_features[0][0], kFeatureElementCount, - tflite::GetTensorData(input)); + // Get pointers to the input and output tensors of both models + TfLiteTensor* preprocessor_input = preprocessor_interpreter.input(0); // <-- TYPO FIXED HERE + TfLiteTensor* preprocessor_output = preprocessor_interpreter.output(0); + TfLiteTensor* speech_input = speech_interpreter.input(0); printf("Setup complete.\n"); - // ==================================================================== - printf("Running %d invocations...\n", num_invocations); + printf("Running %d end-to-end invocations...\n", num_invocations); for (int i = 0; i < num_invocations; ++i) { - if (interpreter.Invoke() != kTfLiteOk) { + // Generate Features + const int16_t* audio_data = g_yes_1000ms_audio_data; + size_t remaining_samples = g_yes_1000ms_audio_data_size; + size_t feature_index = 0; + + while (remaining_samples >= kAudioSampleDurationCount && + feature_index < kFeatureCount) + { + std::copy_n(audio_data, kAudioSampleDurationCount, + tflite::GetTensorData(preprocessor_input)); + + if (preprocessor_interpreter.Invoke() != kTfLiteOk) { + printf("ERROR: Preprocessor Invoke() failed.\n"); + return 1; + } + + std::copy_n(tflite::GetTensorData(preprocessor_output), kFeatureSize, + g_features[feature_index]); + + feature_index++; + audio_data += kAudioSampleStrideCount; + remaining_samples -= kAudioSampleStrideCount; + } + + // Classify Features + std::copy_n(&g_features[0][0], kFeatureElementCount, + tflite::GetTensorData(speech_input)); + + if (speech_interpreter.Invoke() != kTfLiteOk) { + printf("ERROR: Speech Invoke() failed.\n"); return 1; } } From 065051d0757e36fdcf406794ccdf1347ef29312e Mon Sep 17 00:00:00 2001 From: JaimeHW Date: Tue, 18 Nov 2025 22:37:29 -0600 
Subject: [PATCH 63/86] FilterBank: Accumulate carries into a vector register using masked addition and reduce to scalar only once per channel --- .../micro/examples/micro_speech/Makefile.inc | 2 +- .../riscv_vector/signal/filter_bank_rvv.cc | 22 ++++++++++++------- 2 files changed, 15 insertions(+), 9 deletions(-) diff --git a/tensorflow/lite/micro/examples/micro_speech/Makefile.inc b/tensorflow/lite/micro/examples/micro_speech/Makefile.inc index d8a93991ef7..a1b5b565cf5 100644 --- a/tensorflow/lite/micro/examples/micro_speech/Makefile.inc +++ b/tensorflow/lite/micro/examples/micro_speech/Makefile.inc @@ -1,6 +1,6 @@ MICRO_SPEECH_TEST_SRCS := \ -$(TENSORFLOW_ROOT)tensorflow/lite/micro/examples/micro_speech/micro_speech_test2.cc +$(TENSORFLOW_ROOT)tensorflow/lite/micro/examples/micro_speech/micro_speech_test.cc MICRO_SPEECH_TEST_HDRS := \ $(TENSORFLOW_ROOT)tensorflow/lite/micro/examples/micro_speech/micro_model_settings.h \ diff --git a/tensorflow/lite/micro/kernels/riscv_vector/signal/filter_bank_rvv.cc b/tensorflow/lite/micro/kernels/riscv_vector/signal/filter_bank_rvv.cc index ad419a9930c..b38e803e327 100644 --- a/tensorflow/lite/micro/kernels/riscv_vector/signal/filter_bank_rvv.cc +++ b/tensorflow/lite/micro/kernels/riscv_vector/signal/filter_bank_rvv.cc @@ -33,9 +33,9 @@ void FilterbankAccumulateChannelsRVV(const FilterbankConfig* config, vuint32m4_t v_acc_uw_low = __riscv_vmv_v_x_u32m4(0, vl_max); vuint32m4_t v_acc_uw_high = __riscv_vmv_v_x_u32m4(0, vl_max); - // Initialize scalar counters for total carries - size_t w_carry_count = 0; - size_t uw_carry_count = 0; + // Initialize vector accumulators for carries (Optimization: avoid vcpop in loop) + vuint32m4_t v_carry_w_acc = __riscv_vmv_v_x_u32m4(0, vl_max); + vuint32m4_t v_carry_uw_acc = __riscv_vmv_v_x_u32m4(0, vl_max); // Process the channel width in vector-sized chunks (stripmining) int j = 0; @@ -72,9 +72,9 @@ void FilterbankAccumulateChannelsRVV(const FilterbankConfig* config, vbool8_t v_carry_w = 
__riscv_vmsltu_vv_u32m4_b8(v_next_acc_w_low, v_acc_w_low, vl); vbool8_t v_carry_uw = __riscv_vmsltu_vv_u32m4_b8(v_next_acc_uw_low, v_acc_uw_low, vl); - // Count the number of carries that occurred in this iteration - w_carry_count += __riscv_vcpop_m_b8(v_carry_w, vl); - uw_carry_count += __riscv_vcpop_m_b8(v_carry_uw, vl); + // Optimization: Accumulate carries into vector register instead of scalar vcpop + v_carry_w_acc = __riscv_vadd_vx_u32m4_m(v_carry_w, v_carry_w_acc, 1, vl); + v_carry_uw_acc = __riscv_vadd_vx_u32m4_m(v_carry_uw, v_carry_uw_acc, 1, vl); // Add the high 32-bit parts of the products v_acc_w_high = __riscv_vadd_vv_u32m4(v_acc_w_high, v_prod_w_high, vl); @@ -97,13 +97,19 @@ void FilterbankAccumulateChannelsRVV(const FilterbankConfig* config, vuint32m1_t v_sum_w_high = __riscv_vredsum_vs_u32m4_u32m1(v_acc_w_high, v_zero, vl_max); vuint32m1_t v_sum_uw_high = __riscv_vredsum_vs_u32m4_u32m1(v_acc_uw_high, v_zero, vl_max); - // Extract scalar results from vector registers + // Reduce the carry accumulators + vuint32m1_t v_sum_carry_w = __riscv_vredsum_vs_u32m4_u32m1(v_carry_w_acc, v_zero, vl_max); + vuint32m1_t v_sum_carry_uw = __riscv_vredsum_vs_u32m4_u32m1(v_carry_uw_acc, v_zero, vl_max); + + // Extract scalar results uint32_t final_w_low = __riscv_vmv_x_s_u32m1_u32(v_sum_w_low); uint32_t final_uw_low = __riscv_vmv_x_s_u32m1_u32(v_sum_uw_low); uint32_t final_w_high = __riscv_vmv_x_s_u32m1_u32(v_sum_w_high); uint32_t final_uw_high = __riscv_vmv_x_s_u32m1_u32(v_sum_uw_high); + uint32_t w_carry_count = __riscv_vmv_x_s_u32m1_u32(v_sum_carry_w); + uint32_t uw_carry_count = __riscv_vmv_x_s_u32m1_u32(v_sum_carry_uw); - // Reconstruct the final 64-bit sum, adding the total carry count to the high part + // Reconstruct the final 64-bit sum uint64_t final_w = ((uint64_t)(final_w_high + w_carry_count) << 32) | final_w_low; uint64_t final_uw = ((uint64_t)(final_uw_high + uw_carry_count) << 32) | final_uw_low; From 3affc81e7a66ecc3d4d9eef92697a4320b4d6ccf Mon Sep 
17 00:00:00 2001 From: JaimeHW Date: Tue, 18 Nov 2025 22:50:09 -0600 Subject: [PATCH 64/86] Optimize RFFT by replacing gather/scatter with strided loads --- .../riscv_vector/signal/rfft_int16_rvv.cc | 43 ++++++++----------- 1 file changed, 19 insertions(+), 24 deletions(-) diff --git a/tensorflow/lite/micro/kernels/riscv_vector/signal/rfft_int16_rvv.cc b/tensorflow/lite/micro/kernels/riscv_vector/signal/rfft_int16_rvv.cc index 78665010511..269e55d446f 100644 --- a/tensorflow/lite/micro/kernels/riscv_vector/signal/rfft_int16_rvv.cc +++ b/tensorflow/lite/micro/kernels/riscv_vector/signal/rfft_int16_rvv.cc @@ -762,13 +762,16 @@ void kiss_fftr_rvv(kiss_fft_fixed16::kiss_fftr_cfg st, const kiss_fft_scalar* ti freqdata[0].i = 0; freqdata[ncfft].i = 0; - // Initialize pointers and loop variables for the main vector processing loop + // Initialize pointers and loop variables size_t k = 1; const size_t loop_end = ncfft / 2; const int16_t* tmpbuf_base_ptr = (const int16_t*)st->tmpbuf; const int16_t* twiddles_base_ptr = (const int16_t*)st->super_twiddles; int16_t* freqdata_base_ptr = (int16_t*)freqdata; + + // Stride for complex numbers (R, I) is 4 bytes (2 * int16) ptrdiff_t stride = sizeof(kiss_fft_fixed16::kiss_fft_cpx); + ptrdiff_t neg_stride = -stride; // Main loop to process FFT bins in vector chunks while (k <= loop_end) @@ -776,27 +779,20 @@ void kiss_fftr_rvv(kiss_fft_fixed16::kiss_fftr_cfg st, const kiss_fft_scalar* ti // Set the vector length (vl) for the current iteration size_t vl = __riscv_vsetvl_e16m4(loop_end - k + 1); - // Generate index vectors for accessing fpk, fpnk, and twiddles - vuint16m4_t v_k_indices = __riscv_vid_v_u16m4(vl); - v_k_indices = __riscv_vadd_vx_u16m4(v_k_indices, k, vl); - vuint16m4_t v_neg_k_indices = __riscv_vrsub_vx_u16m4(v_k_indices, ncfft, vl); - vuint16m4_t v_twiddle_indices = __riscv_vsub_vx_u16m4(v_k_indices, 1, vl); - - // Load the 'fpk' vector using a strided load + // fpk indices: k, k+1, ... 
vint16m4_t v_fpk_r = __riscv_vlse16_v_i16m4(&tmpbuf_base_ptr[2 * k], stride, vl); vint16m4_t v_fpk_i = __riscv_vlse16_v_i16m4(&tmpbuf_base_ptr[2 * k + 1], stride, vl); - // Gather the 'fpnk' vector using indexed loads - vuint32m8_t v_tmp_r_offsets = __riscv_vwmulu_vx_u32m8(v_neg_k_indices, sizeof(kiss_fft_fixed16::kiss_fft_cpx), vl); - vuint32m8_t v_tmp_i_offsets = __riscv_vadd_vx_u32m8(v_tmp_r_offsets, sizeof(int16_t), vl); - vint16m4_t v_fpnk_r_raw = __riscv_vluxei32_v_i16m4(tmpbuf_base_ptr, v_tmp_r_offsets, vl); - vint16m4_t v_fpnk_i_raw = __riscv_vluxei32_v_i16m4(tmpbuf_base_ptr, v_tmp_i_offsets, vl); + // fpnk indices: N-k, N-(k+1), ... + const int16_t* fpnk_ptr = &tmpbuf_base_ptr[2 * (ncfft - k)]; + vint16m4_t v_fpnk_r_raw = __riscv_vlse16_v_i16m4(fpnk_ptr, neg_stride, vl); + vint16m4_t v_fpnk_i_raw = __riscv_vlse16_v_i16m4(fpnk_ptr + 1, neg_stride, vl); - // Gather the twiddle factors using indexed loads - vuint32m8_t v_tw_r_offsets = __riscv_vwmulu_vx_u32m8(v_twiddle_indices, sizeof(kiss_fft_fixed16::kiss_fft_cpx), vl); - vuint32m8_t v_tw_i_offsets = __riscv_vadd_vx_u32m8(v_tw_r_offsets, sizeof(int16_t), vl); - vint16m4_t v_tw_r = __riscv_vluxei32_v_i16m4(twiddles_base_ptr, v_tw_r_offsets, vl); - vint16m4_t v_tw_i = __riscv_vluxei32_v_i16m4(twiddles_base_ptr, v_tw_i_offsets, vl); + // Twiddle indices: k-1, k, ... 
+ // Must use strided load to extract only Reals or only Imags from the interleaved array + const int16_t* tw_ptr = &twiddles_base_ptr[2 * (k - 1)]; + vint16m4_t v_tw_r = __riscv_vlse16_v_i16m4(tw_ptr, stride, vl); + vint16m4_t v_tw_i = __riscv_vlse16_v_i16m4(tw_ptr + 1, stride, vl); // Perform high-precision rounding division on fpk const int16_t scale = 16383; @@ -839,15 +835,14 @@ void kiss_fftr_rvv(kiss_fft_fixed16::kiss_fftr_cfg st, const kiss_fft_scalar* ti vint16m4_t v_out_nk_r = __riscv_vsra_vx_i16m4(__riscv_vsub_vv_i16m4(v_f1k_r, v_tw_res_r, vl), 1, vl); vint16m4_t v_out_nk_i = __riscv_vsra_vx_i16m4(__riscv_vsub_vv_i16m4(v_tw_res_i, v_f1k_i, vl), 1, vl); - // Store the results using a strided store + // Store the results using a strided store (Forward) __riscv_vsse16_v_i16m4(&freqdata_base_ptr[2 * k], stride, v_out_k_r, vl); __riscv_vsse16_v_i16m4(&freqdata_base_ptr[2 * k + 1], stride, v_out_k_i, vl); - // Scatter the results using an indexed store - vuint32m8_t v_freq_r_offsets = __riscv_vwmulu_vx_u32m8(v_neg_k_indices, sizeof(kiss_fft_fixed16::kiss_fft_cpx), vl); - vuint32m8_t v_freq_i_offsets = __riscv_vadd_vx_u32m8(v_freq_r_offsets, sizeof(int16_t), vl); - __riscv_vsuxei32_v_i16m4(freqdata_base_ptr, v_freq_r_offsets, v_out_nk_r, vl); - __riscv_vsuxei32_v_i16m4(freqdata_base_ptr, v_freq_i_offsets, v_out_nk_i, vl); + // Store the results using a strided store (Reverse) + int16_t* out_nk_ptr = &freqdata_base_ptr[2 * (ncfft - k)]; + __riscv_vsse16_v_i16m4(out_nk_ptr, neg_stride, v_out_nk_r, vl); + __riscv_vsse16_v_i16m4(out_nk_ptr + 1, neg_stride, v_out_nk_i, vl); // Advance to the next vector chunk k += vl; From 6ae426dfc704567c80dbcea61c18ea51a60c086e Mon Sep 17 00:00:00 2001 From: JaimeHW Date: Tue, 18 Nov 2025 23:16:55 -0600 Subject: [PATCH 65/86] SoftMax: Fix vector-vector merge intrinsic usage and fix 64-bit emulation logic --- .../micro/kernels/riscv_vector/softmax.cc | 5 +- .../micro/kernels/riscv_vector/softmax_rvv.h | 623 ++++++++---------- 
.../make/targets/riscv32_vector_makefile.inc | 4 +- 3 files changed, 289 insertions(+), 343 deletions(-) diff --git a/tensorflow/lite/micro/kernels/riscv_vector/softmax.cc b/tensorflow/lite/micro/kernels/riscv_vector/softmax.cc index e4afcdd0744..c66afe58a65 100644 --- a/tensorflow/lite/micro/kernels/riscv_vector/softmax.cc +++ b/tensorflow/lite/micro/kernels/riscv_vector/softmax.cc @@ -29,19 +29,20 @@ limitations under the License. #include "tensorflow/lite/micro/kernels/riscv_vector/softmax_rvv.h" namespace tflite { + namespace { void SoftmaxQuantized(const TfLiteEvalTensor* input, TfLiteEvalTensor* output, const SoftmaxParams& op_data) { if (input->type == kTfLiteInt8) { if (output->type == kTfLiteInt16) { - SoftmaxInt8RVV( + SoftmaxRVV( op_data, tflite::micro::GetTensorShape(input), tflite::micro::GetTensorData(input), tflite::micro::GetTensorShape(output), tflite::micro::GetTensorData(output)); } else { - SoftmaxInt8RVV( + SoftmaxRVV( op_data, tflite::micro::GetTensorShape(input), tflite::micro::GetTensorData(input), tflite::micro::GetTensorShape(output), diff --git a/tensorflow/lite/micro/kernels/riscv_vector/softmax_rvv.h b/tensorflow/lite/micro/kernels/riscv_vector/softmax_rvv.h index 9187f175e16..3dab3ef0439 100644 --- a/tensorflow/lite/micro/kernels/riscv_vector/softmax_rvv.h +++ b/tensorflow/lite/micro/kernels/riscv_vector/softmax_rvv.h @@ -2,9 +2,10 @@ #define TENSORFLOW_LITE_MICRO_KERNELS_RISCV_VECTOR_SOFTMAX_RVV_H_ #include -#include + #include #include +#include #include "tensorflow/lite/kernels/internal/common.h" #include "tensorflow/lite/kernels/internal/cppmath.h" @@ -12,464 +13,408 @@ #include "tensorflow/lite/micro/kernels/softmax.h" #include "tensorflow/lite/micro/micro_log.h" -// Vectorized absolute value for signed 32-bit integers -inline vint32m4_t vabs_i32m4(vint32m4_t v_in, size_t vl) +inline vint32m4_t SaturatingLeftShift_vx_i32m4(vint32m4_t v_in, int shift, + size_t vl) { - // Create a mask for elements that are less than zero - vbool8_t 
v_neg_mask = __riscv_vmslt_vx_i32m4_b8(v_in, 0, vl); - - // Negate the elements that are negative by calculating (0 - v_in) - vint32m4_t v_negated = __riscv_vrsub_vx_i32m4(v_in, 0, vl); - - // Use the mask to merge the original values (where mask is false) with the negated values - return __riscv_vmerge_vvm_i32m4(v_in, v_negated, v_neg_mask, vl); -} + // Return early if shift is zero or negative + if (shift <= 0) return v_in; -// Vectorized Saturating Rounding Doubling High Multiply (Vector-Vector) -inline vint32m4_t SRDMH_vv_i32m4(vint32m4_t v_a, vint32m4_t v_b, size_t vl) -{ - // Define scalar constants for saturation and rounding - const int32_t s_int32_min = INT32_MIN; - const int32_t s_int32_max = INT32_MAX; - const int32_t s_rounding_nudge = (INT32_C(1) << 30); - - // Create a mask for the specific overflow case: INT32_MIN * INT32_MIN - vbool8_t v_min_mask_a = __riscv_vmseq_vx_i32m4_b8(v_a, s_int32_min, vl); - vbool8_t v_min_mask_b = __riscv_vmseq_vx_i32m4_b8(v_b, s_int32_min, vl); - vbool8_t v_overflow_mask = __riscv_vmand_mm_b8(v_min_mask_a, v_min_mask_b, vl); - - // Perform a 32x32 -> 64-bit multiplication, storing high and low parts - vint32m4_t v_prod_hi = __riscv_vmulh_vv_i32m4(v_a, v_b, vl); - vint32m4_t v_prod_lo = __riscv_vmul_vv_i32m4(v_a, v_b, vl); - vuint32m4_t v_prod_lo_u = __riscv_vreinterpret_v_i32m4_u32m4(v_prod_lo); - - // Add the rounding nudge and detect if a carry-out occurred - vuint32m4_t v_sum_lo_u = __riscv_vadd_vx_u32m4(v_prod_lo_u, s_rounding_nudge, vl); - vbool8_t v_carry_mask = __riscv_vmsltu_vv_u32m4_b8(v_sum_lo_u, v_prod_lo_u, vl); - - // Add the carry to the high part of the product - vint32m4_t v_sum_hi = __riscv_vadd_vx_i32m4_m(v_carry_mask, v_prod_hi, 1, vl); - - // Combine the high and low parts to form the doubled result and apply saturation - vint32m4_t v_result_hi_part = __riscv_vsll_vx_i32m4(v_sum_hi, 1, vl); - vuint32m4_t v_result_lo_part_u = __riscv_vsrl_vx_u32m4(v_sum_lo_u, 31, vl); - vint32m4_t v_result_before_sat = 
__riscv_vor_vv_i32m4( - v_result_hi_part, - __riscv_vreinterpret_v_u32m4_i32m4(v_result_lo_part_u), vl); - - // Apply saturation for the INT32_MIN * INT32_MIN case - return __riscv_vmerge_vxm_i32m4(v_result_before_sat, s_int32_max, v_overflow_mask, vl); + // Handle extreme shifts that always saturate + if (shift >= 31) + { + // Create mask for negative values + vbool8_t v_neg = __riscv_vmslt_vx_i32m4_b8(v_in, 0, vl); + + // Set positive max and merge with negative min + vint32m4_t v_max = __riscv_vmv_v_x_i32m4(INT32_MAX, vl); + return __riscv_vmerge_vxm_i32m4(v_max, INT32_MIN, v_neg, vl); + } + + // Perform the logical left shift + vint32m4_t v_shifted = __riscv_vsll_vx_i32m4(v_in, shift, vl); + + // Verify overflow by shifting back and comparing + vint32m4_t v_unshifted = __riscv_vsra_vx_i32m4(v_shifted, shift, vl); + vbool8_t v_no_overflow = __riscv_vmseq_vv_i32m4_b8(v_in, v_unshifted, vl); + + // Select saturating constants based on sign + vbool8_t v_neg = __riscv_vmslt_vx_i32m4_b8(v_in, 0, vl); + vint32m4_t v_sat = __riscv_vmerge_vxm_i32m4( + __riscv_vmv_v_x_i32m4(INT32_MAX, vl), INT32_MIN, v_neg, vl); + + // Merge valid results with saturated results + return __riscv_vmerge_vvm_i32m4(v_sat, v_shifted, v_no_overflow, vl); } -// Vectorized Saturating Rounding Doubling High Multiply (Vector-Scalar) -inline vint32m4_t SRDMH_vx_i32m4(vint32m4_t v_a, int32_t s_b, size_t vl) +inline vint32m4_t MultiplyByQuantizedMultiplierGreaterThanOne_32bit_vx_i32m4( + vint32m4_t v_x, int32_t multiplier, int left_shift, size_t vl) { - // Define scalar constants for saturation and rounding - const int32_t s_int32_min = INT32_MIN; - const int32_t s_int32_max = INT32_MAX; - const int32_t s_rounding_nudge = (INT32_C(1) << 30); - - // Create a mask for the specific overflow case: v_a[i] == INT32_MIN and s_b == INT32_MIN - vbool8_t v_overflow_mask; - if (s_b == s_int32_min) + // Calculate low 32 bits of product + vint32m4_t v_lo = __riscv_vmul_vx_i32m4(v_x, multiplier, vl); + + // 
Calculate high 32 bits of product + vint32m4_t v_hi = __riscv_vmulh_vx_i32m4(v_x, multiplier, vl); + + // Determine effective right shift amount + int total_right_shift = 31 - left_shift; + + // Calculate rounding nudge + int32_t nudge = 1 << (total_right_shift - 1); + + // Add nudge to low part treating as unsigned + vuint32m4_t v_lo_u = __riscv_vreinterpret_v_i32m4_u32m4(v_lo); + vuint32m4_t v_lo_plus_nudge = __riscv_vadd_vx_u32m4(v_lo_u, nudge, vl); + + // Detect carry from low part addition + vbool8_t v_carry = __riscv_vmsltu_vx_u32m4_b8(v_lo_plus_nudge, nudge, vl); + + // Apply carry to high part + vint32m4_t v_hi_rounded = __riscv_vadd_vx_i32m4_m(v_carry, v_hi, 1, vl); + + // Calculate shift amounts for recombination + int shift_hi = left_shift + 1; + int shift_lo = total_right_shift; + + // Shift high part (handling mod 32 behavior) + vint32m4_t v_res_from_hi; + if (shift_hi < 32) { - v_overflow_mask = __riscv_vmseq_vx_i32m4_b8(v_a, s_int32_min, vl); + v_res_from_hi = __riscv_vsll_vx_i32m4(v_hi_rounded, shift_hi, vl); } else { - vint32m4_t v_zero = __riscv_vmv_v_x_i32m4(0, vl); - v_overflow_mask = __riscv_vmseq_vx_i32m4_b8(v_zero, 1, vl); // Always false + v_res_from_hi = __riscv_vmv_v_x_i32m4(0, vl); } - // Perform a 32x32 -> 64-bit multiplication, storing high and low parts - vint32m4_t v_prod_hi = __riscv_vmulh_vx_i32m4(v_a, s_b, vl); - vint32m4_t v_prod_lo = __riscv_vmul_vx_i32m4(v_a, s_b, vl); - vuint32m4_t v_prod_lo_u = __riscv_vreinterpret_v_i32m4_u32m4(v_prod_lo); - - // Add the rounding nudge and detect if a carry-out occurred - vuint32m4_t v_sum_lo_u = __riscv_vadd_vx_u32m4(v_prod_lo_u, s_rounding_nudge, vl); - vbool8_t v_carry_mask = __riscv_vmsltu_vv_u32m4_b8(v_sum_lo_u, v_prod_lo_u, vl); - - // Add the carry to the high part of the product - vint32m4_t v_sum_hi = __riscv_vadd_vx_i32m4_m(v_carry_mask, v_prod_hi, 1, vl); - - // Combine the high and low parts to form the doubled result - vint32m4_t v_result_hi_part = __riscv_vsll_vx_i32m4(v_sum_hi, 
1, vl); - vuint32m4_t v_result_lo_part_u = __riscv_vsrl_vx_u32m4(v_sum_lo_u, 31, vl); - vint32m4_t v_result_before_sat = __riscv_vor_vv_i32m4( - v_result_hi_part, - __riscv_vreinterpret_v_u32m4_i32m4(v_result_lo_part_u), vl); - - // Apply saturation for the INT32_MIN * INT32_MIN case - return __riscv_vmerge_vxm_i32m4(v_result_before_sat, s_int32_max, v_overflow_mask, vl); + // Shift low part + vuint32m4_t v_res_from_lo = + __riscv_vsrl_vx_u32m4(v_lo_plus_nudge, shift_lo, vl); + + // Combine results + return __riscv_vor_vv_i32m4( + v_res_from_hi, __riscv_vreinterpret_v_u32m4_i32m4(v_res_from_lo), vl); } -// Vectorized Saturating Rounding Multiply by Power-of-Two inline vint32m4_t SRMPOT_vx_i32m4(vint32m4_t v_vec, int shift, size_t vl) { - // If shift is zero, return the original vector - if (shift == 0) - { - return v_vec; - } - - // This section handles left shifts (positive shift values) - if (shift > 0) - { - // Define scalar constants for saturation and shifting - const int32_t s_shift = shift; - const int32_t s_max_val = INT32_MAX; - const int32_t s_min_val = INT32_MIN; - - // Handle extreme shifts that always result in saturation - if (s_shift >= 31) + // Return early if shift is zero + if (shift == 0) return v_vec; + + // Handle positive shifts using saturating left shift + if (shift > 0) { - vint32m4_t v_zero = __riscv_vmv_v_x_i32m4(0, vl); - vbool8_t v_pos_mask = __riscv_vmsgt_vx_i32m4_b8(v_vec, 0, vl); - vbool8_t v_neg_mask = __riscv_vmslt_vx_i32m4_b8(v_vec, 0, vl); - vint32m4_t v_saturated = __riscv_vmerge_vxm_i32m4(v_zero, s_max_val, v_pos_mask, vl); - return __riscv_vmerge_vxm_i32m4(v_saturated, s_min_val, v_neg_mask, vl); + return SaturatingLeftShift_vx_i32m4(v_vec, shift, vl); } - - // Calculate thresholds for overflow detection - const int32_t pos_threshold = (INT32_C(1) << (31 - s_shift)); - const int32_t neg_threshold = -pos_threshold; - - // Create masks for positive and negative overflow - vbool8_t v_pos_ovfl_mask = 
__riscv_vmsgt_vx_i32m4_b8(v_vec, pos_threshold - 1, vl); - vbool8_t v_neg_ovfl_mask = __riscv_vmslt_vx_i32m4_b8(v_vec, neg_threshold, vl); - - // Perform the left shift - vint32m4_t v_shifted = __riscv_vsll_vx_i32m4(v_vec, s_shift, vl); - - // Merge the shifted result with saturated values based on overflow masks - vint32m4_t v_result = __riscv_vmerge_vxm_i32m4(v_shifted, s_max_val, v_pos_ovfl_mask, vl); - return __riscv_vmerge_vxm_i32m4(v_result, s_min_val, v_neg_ovfl_mask, vl); - - } - else - { - // This section handles right shifts (negative shift values) with rounding - const int exponent = -shift; - if (exponent <= 0) return v_vec; - - // Handle extreme shifts that result in 0 or -1 - if (exponent > 31) + else { - vint32m4_t v_zero = __riscv_vmv_v_x_i32m4(0, vl); - vbool8_t v_neg_mask = __riscv_vmslt_vx_i32m4_b8(v_vec, 0, vl); - return __riscv_vmerge_vxm_i32m4(v_zero, -1, v_neg_mask, vl); + // Perform rounding arithmetic right shift + return __riscv_vssra_vx_i32m4(v_vec, -shift, __RISCV_VXRM_RNU, vl); } - - // Calculate the rounding threshold ("round half away from zero") - const int32_t s_mask = (INT32_C(1) << exponent) - 1; - const int32_t s_threshold_base = s_mask >> 1; - vbool8_t v_is_negative_mask = __riscv_vmslt_vx_i32m4_b8(v_vec, 0, vl); - vint32m4_t v_threshold = __riscv_vmv_v_x_i32m4(s_threshold_base, vl); - v_threshold = __riscv_vadd_vx_i32m4_m(v_is_negative_mask, v_threshold, 1, vl); - - // Check if the remainder requires rounding up - vint32m4_t v_remainder = __riscv_vand_vx_i32m4(v_vec, s_mask, vl); - vint32m4_t v_abs_remainder = vabs_i32m4(v_remainder, vl); - vbool8_t v_should_round_mask = __riscv_vmsgt_vv_i32m4_b8(v_abs_remainder, v_threshold, vl); - - // Perform the arithmetic right shift - vint32m4_t v_shifted = __riscv_vsra_vx_i32m4(v_vec, exponent, vl); - - // Add 1 to the result if rounding is needed - return __riscv_vadd_vx_i32m4_m(v_should_round_mask, v_shifted, 1, vl); - } } -// Vectorized MultiplyByQuantizedMultiplier for multipliers > 
1 (Vector-Scalar) -inline vint32m4_t MultiplyByQuantizedMultiplierGreaterThanOne_vx_i32m4( - vint32m4_t v_x, int32_t quantized_multiplier, int left_shift, size_t vl) -{ - // Apply the left shift to the input vector - vint32m4_t v_shifted_x = __riscv_vsll_vx_i32m4(v_x, left_shift, vl); - - // Perform the saturating rounding doubling high multiply - return SRDMH_vx_i32m4(v_shifted_x, quantized_multiplier, vl); -} - -// Vectorized fixed-point implementation of exp(x) for negative q526 inputs vint32m4_t vectorized_exp_on_negative_values(vint32m4_t v_a_q5_26, size_t vl) { - // Define fixed-point constants for input and output formats - const int kInputIntegerBits = 5; - const int kInputFractionalBits = 26; + // Define fixed-point constants + const int kInputFractionalBits = 26; const int kOutputFractionalBits = 31; - - // Define constants for range reduction (exp(x) = exp(x/4) * exp(3x/4)) - const int32_t s_kOneQuarter_q5_26 = INT32_C(1) << (kInputFractionalBits - 2); + const int32_t s_kOneQuarter_q5_26 = INT32_C(1) + << (kInputFractionalBits - 2); const int32_t s_mask_val = s_kOneQuarter_q5_26 - 1; - // Define constants for Taylor series approximation of exp(x) around -1/8 - const int32_t s_result_one_q0_31 = INT32_MAX; - const int32_t s_exp_neg_1_8_q0_31 = 1895147668; - const int32_t s_one_third_q0_31 = 715827883; - const int32_t s_one_24th_q0_31 = 89478485; - const int32_t s_one_eighth_q0_31 = INT32_C(1) << (kOutputFractionalBits - 3); + // Define Taylor Series Constants (Q0.31) + const int32_t s_result_one_q0_31 = INT32_MAX; + const int32_t s_exp_neg_1_8_q0_31 = 1895147668; + const int32_t s_one_third_q0_31 = 715827883; + const int32_t s_one_24th_q0_31 = 89478485; + const int32_t s_one_eighth_q0_31 = INT32_C(1) + << (kOutputFractionalBits - 3); - // Perform range reduction to map the input to the [-1/4, 0] interval + // Perform range reduction masking vint32m4_t v_a_masked = __riscv_vand_vx_i32m4(v_a_q5_26, s_mask_val, vl); - vint32m4_t v_a_mod_q_m_q_q5_26 = 
__riscv_vsub_vx_i32m4(v_a_masked, s_kOneQuarter_q5_26, vl); - vint32m4_t v_remainder_q5_26 = __riscv_vsub_vv_i32m4(v_a_mod_q_m_q_q5_26, v_a_q5_26, vl); - // Rescale for Taylor series input + // Subtract quarter constant + vint32m4_t v_a_mod_q_m_q_q5_26 = + __riscv_vsub_vx_i32m4(v_a_masked, s_kOneQuarter_q5_26, vl); + + // Rescale from Q5.26 to Q0.31 const int rescale_shift = kOutputFractionalBits - kInputFractionalBits; - vint32m4_t v_a_input_taylor_q0_31 = SRMPOT_vx_i32m4(v_a_mod_q_m_q_q5_26, rescale_shift, vl); + vint32m4_t v_a_input_taylor_q0_31 = + SRMPOT_vx_i32m4(v_a_mod_q_m_q_q5_26, rescale_shift, vl); - // Center the input around -1/8 for better Taylor series accuracy - vint32m4_t v_y = __riscv_vadd_vx_i32m4(v_a_input_taylor_q0_31, s_one_eighth_q0_31, vl); + // Center input around -1/8 + vint32m4_t v_y = + __riscv_vadd_vx_i32m4(v_a_input_taylor_q0_31, s_one_eighth_q0_31, vl); - // Calculate polynomial terms: y^2, y^3, y^4 - vint32m4_t v_y2 = SRDMH_vv_i32m4(v_y, v_y, vl); - vint32m4_t v_y3 = SRDMH_vv_i32m4(v_y2, v_y, vl); - vint32m4_t v_y4 = SRDMH_vv_i32m4(v_y2, v_y2, vl); + // Calculate polynomial terms using 32-bit saturating multiply + vint32m4_t v_y2 = __riscv_vsmul_vv_i32m4(v_y, v_y, __RISCV_VXRM_RNU, vl); + vint32m4_t v_y3 = __riscv_vsmul_vv_i32m4(v_y2, v_y, __RISCV_VXRM_RNU, vl); + vint32m4_t v_y4 = __riscv_vsmul_vv_i32m4(v_y2, v_y2, __RISCV_VXRM_RNU, vl); - // Calculate scaled polynomial terms: y^2/2, y^3/6, y^4/24 + // Calculate coefficients vint32m4_t v_term_y2_over_2 = SRMPOT_vx_i32m4(v_y2, -1, vl); - vint32m4_t v_term_y3_over_3 = SRDMH_vx_i32m4(v_y3, s_one_third_q0_31, vl); + vint32m4_t v_term_y3_over_3 = + __riscv_vsmul_vx_i32m4(v_y3, s_one_third_q0_31, __RISCV_VXRM_RNU, vl); vint32m4_t v_term_y3_over_6 = SRMPOT_vx_i32m4(v_term_y3_over_3, -1, vl); - vint32m4_t v_term_y4_over_24 = SRDMH_vx_i32m4(v_y4, s_one_24th_q0_31, vl); + vint32m4_t v_term_y4_over_24 = + __riscv_vsmul_vx_i32m4(v_y4, s_one_24th_q0_31, __RISCV_VXRM_RNU, vl); - // Sum the 
polynomial terms: y + y^2/2 + y^3/6 + y^4/24 + // Sum polynomial terms vint32m4_t v_poly_sum = __riscv_vadd_vv_i32m4(v_y, v_term_y2_over_2, vl); v_poly_sum = __riscv_vadd_vv_i32m4(v_poly_sum, v_term_y3_over_6, vl); v_poly_sum = __riscv_vadd_vv_i32m4(v_poly_sum, v_term_y4_over_24, vl); - - // Calculate the final result for the interval: exp(-1/8) * (1 + poly_sum) - vint32m4_t v_const_term_vec = __riscv_vmv_v_x_i32m4(s_exp_neg_1_8_q0_31, vl); - vint32m4_t v_mul_term = SRDMH_vv_i32m4(v_poly_sum, v_const_term_vec, vl); - vint32m4_t v_interval_result_q0_31 = __riscv_vadd_vv_i32m4(v_mul_term, v_const_term_vec, vl); - - // Reconstruct the full result using a barrel shifter based on the remainder - vint32m4_t v_current_result = v_interval_result_q0_31; - const int32_t s_mult_exp_neg_1_4 = 1672461947; - const int32_t s_mult_exp_neg_1_2 = 1302514674; - const int32_t s_mult_exp_neg_1 = 790015084; - const int32_t s_mult_exp_neg_2 = 290630308; - const int32_t s_mult_exp_neg_4 = 39332535; - const int32_t s_mult_exp_neg_8 = 720401; - const int32_t s_mult_exp_neg_16 = 242; - - // Macro to conditionally apply multipliers based on remainder bits - #define APPLY_BARREL_SHIFT(exponent, multiplier_q0_31) \ - do \ - { \ - if (kInputIntegerBits > exponent) \ - { \ - const int shift_amount = kInputFractionalBits + exponent; \ - if (shift_amount >= 0 && shift_amount < 32) \ - { \ - int32_t bit_mask_val = INT32_C(1) << shift_amount; \ - vint32m4_t v_rem_masked = __riscv_vand_vx_i32m4(v_remainder_q5_26, bit_mask_val, vl); \ - vbool8_t v_apply_mask = __riscv_vmsne_vx_i32m4_b8(v_rem_masked, 0, vl); \ - vint32m4_t v_multiplied = SRDMH_vx_i32m4(v_current_result, multiplier_q0_31, vl); \ - v_current_result = __riscv_vmerge_vvm_i32m4(v_current_result, v_multiplied, v_apply_mask, vl); \ - } \ - } \ - } while(0) - - // Apply barrel shifter for each power-of-two component - APPLY_BARREL_SHIFT(-2, s_mult_exp_neg_1_4); - APPLY_BARREL_SHIFT(-1, s_mult_exp_neg_1_2); - APPLY_BARREL_SHIFT( 0, 
s_mult_exp_neg_1); - APPLY_BARREL_SHIFT( 1, s_mult_exp_neg_2); - APPLY_BARREL_SHIFT( 2, s_mult_exp_neg_4); - APPLY_BARREL_SHIFT( 3, s_mult_exp_neg_8); - APPLY_BARREL_SHIFT( 4, s_mult_exp_neg_16); - - #undef APPLY_BARREL_SHIFT - - // Handle the case where input is 0, for which exp(0) = 1 - vint32m4_t v_final_result = v_current_result; - vbool8_t v_zero_mask = __riscv_vmseq_vx_i32m4_b8(v_a_q5_26, 0, vl); - v_final_result = __riscv_vmerge_vxm_i32m4(v_final_result, s_result_one_q0_31, v_zero_mask, vl); - return v_final_result; + // Apply constant term + vint32m4_t v_mul_term = __riscv_vsmul_vx_i32m4( + v_poly_sum, s_exp_neg_1_8_q0_31, __RISCV_VXRM_RNU, vl); + vint32m4_t v_current_result = + __riscv_vadd_vx_i32m4(v_mul_term, s_exp_neg_1_8_q0_31, vl); + + // Calculate remainder for barrel shifter + vint32m4_t v_remainder_q5_26 = + __riscv_vsub_vv_i32m4(v_a_mod_q_m_q_q5_26, v_a_q5_26, vl); + + // Multipliers for reconstruction + const int32_t multipliers[] = {1672461947, 1302514674, 790015084, 290630308, + 39332535, 720401, 242}; + + // Apply barrel shifter using unrolled loop + for (int i = 0; i < 7; ++i) + { + int exponent = i - 2; + int shift_amount = 26 + exponent; + if (shift_amount >= 0 && shift_amount < 32) + { + int32_t mask = 1 << shift_amount; + int32_t mult = multipliers[i]; + + vint32m4_t v_rem_masked = + __riscv_vand_vx_i32m4(v_remainder_q5_26, mask, vl); + vbool8_t v_apply = __riscv_vmsne_vx_i32m4_b8(v_rem_masked, 0, vl); + + vint32m4_t v_multiplied = __riscv_vsmul_vx_i32m4( + v_current_result, mult, __RISCV_VXRM_RNU, vl); + v_current_result = __riscv_vmerge_vvm_i32m4( + v_current_result, v_multiplied, v_apply, vl); + } + } + + // Handle zero input case + vbool8_t v_zero_mask = __riscv_vmseq_vx_i32m4_b8(v_a_q5_26, 0, vl); + return __riscv_vmerge_vxm_i32m4(v_current_result, s_result_one_q0_31, + v_zero_mask, vl); } -// Main RVV-accelerated Softmax kernel function -template +template void SoftmaxRVV(const tflite::SoftmaxParams& params, const 
tflite::RuntimeShape& input_shape, const InputT* input_data, - const tflite::RuntimeShape& output_shape, - OutputT* output_data) + const tflite::RuntimeShape& output_shape, OutputT* output_data) { // Extract quantization parameters const int32_t input_beta_multiplier = params.input_multiplier; const int32_t input_beta_left_shift = params.input_left_shift; const int diff_min = params.diff_min; - - // Define fixed-point constants for accumulation and output + + // Define fixed-point constants static const int kAccumulationIntegerBits = 12; - static const int kAccumulationFractionalBits = 32 - 1 - kAccumulationIntegerBits; + static const int kAccumulationFractionalBits = + 32 - 1 - kAccumulationIntegerBits; static const int kExpOutputFractionalBits = 31; - + // Extract shape dimensions const int trailing_dim = input_shape.DimensionsCount() - 1; - const int outer_size = tflite::MatchingFlatSizeSkipDim(input_shape, trailing_dim, output_shape); - const int depth = tflite::MatchingDim(input_shape, trailing_dim, output_shape, trailing_dim); + const int outer_size = tflite::MatchingFlatSizeSkipDim( + input_shape, trailing_dim, output_shape); + const int depth = tflite::MatchingDim(input_shape, trailing_dim, + output_shape, trailing_dim); const size_t depth_sz = static_cast(depth); - // Loop over each row in the outer dimensions + // Loop over outer dimensions for (int i = 0; i < outer_size; ++i) { const InputT* current_input_data = input_data + i * depth; OutputT* current_output_data = output_data + i * depth; - // Find the maximum value in the current row for numerical stability + // Find maximum value in the row InputT max_in_row = std::numeric_limits::min(); const InputT* ptr_max = current_input_data; - ptrdiff_t n = depth_sz; + size_t n = depth_sz; while (n > 0) { size_t vl = __riscv_vsetvl_e8m1(n); if constexpr (std::is_signed_v) { - vint8m1_t v_input = __riscv_vle8_v_i8m1(reinterpret_cast(ptr_max), vl); - vint8m1_t v_scalar = __riscv_vmv_v_x_i8m1(max_in_row, vl); - 
vint8m1_t v_red = __riscv_vredmax_vs_i8m1_i8m1(v_input, v_scalar, vl); - max_in_row = std::max(max_in_row, __riscv_vmv_x_s_i8m1_i8(v_red)); + vint8m1_t v_in = __riscv_vle8_v_i8m1( + reinterpret_cast(ptr_max), vl); + vint8m1_t v_red = __riscv_vredmax_vs_i8m1_i8m1( + v_in, __riscv_vmv_v_x_i8m1(max_in_row, vl), vl); + max_in_row = + std::max(max_in_row, __riscv_vmv_x_s_i8m1_i8(v_red)); } else { - vuint8m1_t v_input = __riscv_vle8_v_u8m1(reinterpret_cast(ptr_max), vl); - vuint8m1_t v_scalar = __riscv_vmv_v_x_u8m1(max_in_row, vl); - vuint8m1_t v_red = __riscv_vredmaxu_vs_u8m1_u8m1(v_input, v_scalar, vl); - max_in_row = std::max(max_in_row, __riscv_vmv_x_s_u8m1_u8(v_red)); + vuint8m1_t v_in = __riscv_vle8_v_u8m1( + reinterpret_cast(ptr_max), vl); + vuint8m1_t v_red = __riscv_vredmaxu_vs_u8m1_u8m1( + v_in, __riscv_vmv_v_x_u8m1(max_in_row, vl), vl); + max_in_row = std::max(max_in_row, + (InputT)__riscv_vmv_x_s_u8m1_u8(v_red)); } ptr_max += vl; n -= vl; } const int32_t max_in_row_s32 = static_cast(max_in_row); - // Calculate the sum of exponentials of (input - max) - size_t vl_temp_sum = __riscv_vsetvl_e32m1(1); - vint32m1_t v_sum_acc_m1 = __riscv_vmv_v_x_i32m1(0, vl_temp_sum); + // Accumulate sum of exponentials size_t current_c = 0; + vint32m1_t v_sum_acc = __riscv_vmv_v_x_i32m1(0, 1); + while (current_c < depth_sz) { size_t vl = __riscv_vsetvl_e32m4(depth_sz - current_c); - // Load 8-bit input data and widen to 32-bit + // Load and widen input without 64-bit instructions vint32m4_t v_input_s32; if constexpr (std::is_signed_v) { - vint8m1_t v_input_s8 = __riscv_vle8_v_i8m1(reinterpret_cast(current_input_data + current_c), vl); - vint16m2_t v_input_s16 = __riscv_vsext_vf2_i16m2(v_input_s8, vl); - v_input_s32 = __riscv_vwadd_vx_i32m4(v_input_s16, 0, vl); + vint8m1_t v_in = __riscv_vle8_v_i8m1( + reinterpret_cast(current_input_data + + current_c), + vl); + vint16m2_t v_in_16 = __riscv_vsext_vf2_i16m2(v_in, vl); + v_input_s32 = __riscv_vsext_vf2_i32m4(v_in_16, vl); } else { - 
vuint8m1_t v_input_u8 = __riscv_vle8_v_u8m1(reinterpret_cast(current_input_data + current_c), vl); - vuint16m2_t v_input_u16 = __riscv_vwaddu_vx_u16m2(v_input_u8, 0, vl); - vuint32m4_t v_input_u32 = __riscv_vwaddu_vx_u32m4(v_input_u16, 0, vl); - v_input_s32 = __riscv_vreinterpret_v_u32m4_i32m4(v_input_u32); + vuint8m1_t v_in = __riscv_vle8_v_u8m1( + reinterpret_cast(current_input_data + + current_c), + vl); + vuint16m2_t v_in_16 = __riscv_vzext_vf2_u16m2(v_in, vl); + vuint32m4_t v_in_32 = __riscv_vzext_vf2_u32m4(v_in_16, vl); + v_input_s32 = __riscv_vreinterpret_v_u32m4_i32m4(v_in_32); } - // Calculate the difference and create a mask for values >= diff_min - vint32m4_t v_diff_s32 = __riscv_vsub_vx_i32m4(v_input_s32, max_in_row_s32, vl); - vbool8_t v_diff_mask = __riscv_vmsge_vx_i32m4_b8(v_diff_s32, diff_min, vl); - - // Rescale the difference for the exp function - vint32m4_t v_diff_rescaled_q5_26 = MultiplyByQuantizedMultiplierGreaterThanOne_vx_i32m4( - v_diff_s32, input_beta_multiplier, input_beta_left_shift, vl); - - // Calculate the exponential of the rescaled difference - vint32m4_t v_exp_val_q0_31 = vectorized_exp_on_negative_values(v_diff_rescaled_q5_26, vl); - - // Rescale the exponential result to the accumulation format - const int rescale_shift = kAccumulationFractionalBits - kExpOutputFractionalBits; - vint32m4_t v_exp_term_q12_19 = SRMPOT_vx_i32m4(v_exp_val_q0_31, rescale_shift, vl); - - // Mask out values that were below the diff_min threshold and accumulate - vint32m4_t v_zero_q12_19 = __riscv_vmv_v_x_i32m4(0, vl); - vint32m4_t v_exp_term_masked = __riscv_vmerge_vvm_i32m4(v_zero_q12_19, v_exp_term_q12_19, v_diff_mask, vl); - v_sum_acc_m1 = __riscv_vredsum_vs_i32m4_i32m1(v_exp_term_masked, v_sum_acc_m1, vl); - + // Calculate difference from max + vint32m4_t v_diff = + __riscv_vsub_vx_i32m4(v_input_s32, max_in_row_s32, vl); + vbool8_t v_mask = __riscv_vmsge_vx_i32m4_b8(v_diff, diff_min, vl); + + // Scale difference using custom 32-bit implementation + 
vint32m4_t v_diff_scaled = + MultiplyByQuantizedMultiplierGreaterThanOne_32bit_vx_i32m4( + v_diff, input_beta_multiplier, input_beta_left_shift, vl); + + // Calculate exponential + vint32m4_t v_exp = vectorized_exp_on_negative_values(v_diff_scaled, vl); + + // Rescale result + vint32m4_t v_exp_rescaled = __riscv_vssra_vx_i32m4( + v_exp, kExpOutputFractionalBits - kAccumulationFractionalBits, + __RISCV_VXRM_RNU, vl); + + // Merge and accumulate + vint32m4_t v_add_val = __riscv_vmerge_vvm_i32m4( + __riscv_vmv_v_x_i32m4(0, vl), v_exp_rescaled, v_mask, vl); + v_sum_acc = + __riscv_vredsum_vs_i32m4_i32m1(v_add_val, v_sum_acc, vl); + current_c += vl; } - int32_t sum_of_exps_raw = __riscv_vmv_x_s_i32m1_i32(v_sum_acc_m1); + int32_t sum_of_exps = __riscv_vmv_x_s_i32m1_i32(v_sum_acc); - // Calculate the reciprocal of the sum of exponentials + // Calculate reciprocal int num_bits_over_unit; - int32_t reciprocal_raw_q0_31 = tflite::GetReciprocal(sum_of_exps_raw, kAccumulationIntegerBits, &num_bits_over_unit); - - // Calculate the final output shift exponent + int32_t reciprocal = tflite::GetReciprocal( + sum_of_exps, kAccumulationIntegerBits, &num_bits_over_unit); const int exponent = num_bits_over_unit + 31 - (sizeof(OutputT) * 8); - const int32_t output_min_s32 = static_cast(std::numeric_limits::min()); - const int32_t output_max_s32 = static_cast(std::numeric_limits::max()); + const int32_t output_min = + static_cast(std::numeric_limits::min()); + const int32_t output_max = + static_cast(std::numeric_limits::max()); - // Compute and store the final output values + // Compute final output current_c = 0; while (current_c < depth_sz) { size_t vl = __riscv_vsetvl_e32m4(depth_sz - current_c); - // Reload and widen the input data + // Reload and widen input vint32m4_t v_input_s32; if constexpr (std::is_signed_v) { - vint8m1_t v_input_s8 = __riscv_vle8_v_i8m1(reinterpret_cast(current_input_data + current_c), vl); - vint16m2_t v_input_s16 = __riscv_vsext_vf2_i16m2(v_input_s8, vl); 
- v_input_s32 = __riscv_vwadd_vx_i32m4(v_input_s16, 0, vl); + vint8m1_t v_in = __riscv_vle8_v_i8m1( + reinterpret_cast(current_input_data + current_c), vl); + v_input_s32 = __riscv_vsext_vf2_i32m4( + __riscv_vsext_vf2_i16m2(v_in, vl), vl); } else { - vuint8m1_t v_input_u8 = __riscv_vle8_v_u8m1(reinterpret_cast(current_input_data + current_c), vl); - vuint16m2_t v_input_u16 = __riscv_vwaddu_vx_u16m2(v_input_u8, 0, vl); - vuint32m4_t v_input_u32 = __riscv_vwaddu_vx_u32m4(v_input_u16, 0, vl); - v_input_s32 = __riscv_vreinterpret_v_u32m4_i32m4(v_input_u32); + vuint8m1_t v_in = __riscv_vle8_v_u8m1( + reinterpret_cast(current_input_data + current_c), vl); + v_input_s32 = __riscv_vreinterpret_v_u32m4_i32m4( + __riscv_vzext_vf2_u32m4(__riscv_vzext_vf2_u16m2(v_in, vl), vl)); } - - // Recompute the difference, mask, and exponential - vint32m4_t v_diff_s32 = __riscv_vsub_vx_i32m4(v_input_s32, max_in_row_s32, vl); - vbool8_t v_diff_mask = __riscv_vmsge_vx_i32m4_b8(v_diff_s32, diff_min, vl); - vint32m4_t v_diff_rescaled_q5_26 = MultiplyByQuantizedMultiplierGreaterThanOne_vx_i32m4( - v_diff_s32, input_beta_multiplier, input_beta_left_shift, vl); - vint32m4_t v_exp_in_q0_31 = vectorized_exp_on_negative_values(v_diff_rescaled_q5_26, vl); - - // Multiply the exponential by the reciprocal to get the normalized result - vint32m4_t v_product_raw_q0_31 = SRDMH_vx_i32m4(v_exp_in_q0_31, reciprocal_raw_q0_31, vl); - - // Rescale the output and add the output offset (zero point) - vint32m4_t v_unsat_output = SRMPOT_vx_i32m4(v_product_raw_q0_31, -exponent, vl); - vint32m4_t v_shifted_output = __riscv_vadd_vx_i32m4(v_unsat_output, output_min_s32, vl); - - // Clamp the result to the output data type's range - vint32m4_t v_clamped_output = __riscv_vmax_vx_i32m4(__riscv_vmin_vx_i32m4(v_shifted_output, output_max_s32, vl), output_min_s32, vl); - - // Apply the diff_min mask one last time - vint32m4_t v_output_min_vec = __riscv_vmv_v_x_i32m4(output_min_s32, vl); - vint32m4_t v_final_s32 = 
__riscv_vmerge_vvm_i32m4(v_output_min_vec, v_clamped_output, v_diff_mask, vl); - - // Narrow the 32-bit results down to the output type and store + + // Recompute difference and mask + vint32m4_t v_diff = + __riscv_vsub_vx_i32m4(v_input_s32, max_in_row_s32, vl); + vbool8_t v_mask = __riscv_vmsge_vx_i32m4_b8(v_diff, diff_min, vl); + + // Scale and exponentiate + vint32m4_t v_diff_scaled = + MultiplyByQuantizedMultiplierGreaterThanOne_32bit_vx_i32m4( + v_diff, input_beta_multiplier, input_beta_left_shift, vl); + vint32m4_t v_exp = vectorized_exp_on_negative_values(v_diff_scaled, vl); + + // Multiply by reciprocal using 32-bit saturating multiply + vint32m4_t v_prod = __riscv_vsmul_vx_i32m4(v_exp, reciprocal, + __RISCV_VXRM_RNU, vl); + + // Perform final shift and add offset + vint32m4_t v_out_shifted = __riscv_vssra_vx_i32m4( + v_prod, exponent, __RISCV_VXRM_RNU, vl); + vint32m4_t v_out_final = + __riscv_vadd_vx_i32m4(v_out_shifted, output_min, vl); + + // Clamp result + v_out_final = __riscv_vmax_vx_i32m4(v_out_final, output_min, vl); + v_out_final = __riscv_vmin_vx_i32m4(v_out_final, output_max, vl); + + // Apply mask using vector merge + v_out_final = __riscv_vmerge_vvm_i32m4( + __riscv_vmv_v_x_i32m4(output_min, vl), v_out_final, v_mask, vl); + + // Narrow and store result if constexpr (sizeof(OutputT) == 1) { if constexpr (std::is_signed_v) { - vint16m2_t v_temp_s16 = __riscv_vncvt_x_x_w_i16m2(v_final_s32, vl); - vint8m1_t v_final_output = __riscv_vncvt_x_x_w_i8m1(v_temp_s16, vl); - __riscv_vse8_v_i8m1(reinterpret_cast(current_output_data + current_c), v_final_output, vl); + vint8m1_t v_store = __riscv_vncvt_x_x_w_i8m1( + __riscv_vncvt_x_x_w_i16m2(v_out_final, vl), vl); + __riscv_vse8_v_i8m1(reinterpret_cast( + current_output_data + current_c), + v_store, vl); } else { - vuint32m4_t v_final_u32 = __riscv_vreinterpret_v_i32m4_u32m4(v_final_s32); - vuint16m2_t v_temp_u16 = __riscv_vncvt_x_x_w_u16m2(v_final_u32, vl); - vuint8m1_t v_final_output = 
__riscv_vncvt_x_x_w_u8m1(v_temp_u16, vl); - __riscv_vse8_v_u8m1(reinterpret_cast(current_output_data + current_c), v_final_output, vl); + vuint8m1_t v_store = __riscv_vncvt_x_x_w_u8m1( + __riscv_vncvt_x_x_w_u16m2( + __riscv_vreinterpret_v_i32m4_u32m4(v_out_final), + vl), + vl); + __riscv_vse8_v_u8m1(reinterpret_cast( + current_output_data + current_c), + v_store, vl); } } else { - vint16m2_t v_final_output = __riscv_vncvt_x_x_w_i16m2(v_final_s32, vl); - __riscv_vse16_v_i16m2(reinterpret_cast(current_output_data + current_c), v_final_output, vl); + vint16m2_t v_store = __riscv_vncvt_x_x_w_i16m2(v_out_final, vl); + __riscv_vse16_v_i16m2( + reinterpret_cast(current_output_data + current_c), + v_store, vl); } - current_c += vl; } } diff --git a/tensorflow/lite/micro/tools/make/targets/riscv32_vector_makefile.inc b/tensorflow/lite/micro/tools/make/targets/riscv32_vector_makefile.inc index 6306b2607da..3cff6d110ca 100644 --- a/tensorflow/lite/micro/tools/make/targets/riscv32_vector_makefile.inc +++ b/tensorflow/lite/micro/tools/make/targets/riscv32_vector_makefile.inc @@ -71,7 +71,7 @@ MICROLITE_CC_SRCS += \ tensorflow/lite/micro/kernels/riscv_vector/signal/rfft_int16_rvv.cc \ tensorflow/lite/micro/kernels/riscv_vector/signal/filter_bank.cc \ tensorflow/lite/micro/kernels/riscv_vector/signal/filter_bank_rvv.cc \ -# tensorflow/lite/micro/kernels/riscv_vector/softmax.cc \ + tensorflow/lite/micro/kernels/riscv_vector/softmax.cc \ EXCLUDED_SRCS := \ tensorflow/lite/micro/kernels/conv.cc \ @@ -83,6 +83,6 @@ EXCLUDED_SRCS := \ signal/src/kiss_fft_wrappers/kiss_fft_int16.cc \ signal/micro/kernels/filter_bank.cc \ signal/src/filter_bank.cc \ -# tensorflow/lite/micro/kernels/softmax.cc \ + tensorflow/lite/micro/kernels/softmax.cc \ From bbe6a3ecdeddaa55515c92764f95ab3f408f8aae Mon Sep 17 00:00:00 2001 From: JaimeHW Date: Wed, 19 Nov 2025 00:46:50 -0600 Subject: [PATCH 66/86] Vector optimized FilterBankLog kernel --- .../riscv_vector/signal/filter_bank_log.cc | 114 ++++++++++ 
.../signal/filter_bank_log_rvv.cc | 204 ++++++++++++++++++ .../riscv_vector/signal/filter_bank_log_rvv.h | 10 + .../riscv_vector/signal/filter_bank_rvv.h | 6 +- .../make/targets/riscv32_vector_makefile.inc | 4 + 5 files changed, 335 insertions(+), 3 deletions(-) create mode 100644 tensorflow/lite/micro/kernels/riscv_vector/signal/filter_bank_log.cc create mode 100644 tensorflow/lite/micro/kernels/riscv_vector/signal/filter_bank_log_rvv.cc create mode 100644 tensorflow/lite/micro/kernels/riscv_vector/signal/filter_bank_log_rvv.h diff --git a/tensorflow/lite/micro/kernels/riscv_vector/signal/filter_bank_log.cc b/tensorflow/lite/micro/kernels/riscv_vector/signal/filter_bank_log.cc new file mode 100644 index 00000000000..eeee7cc2797 --- /dev/null +++ b/tensorflow/lite/micro/kernels/riscv_vector/signal/filter_bank_log.cc @@ -0,0 +1,114 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include + +#include "tensorflow/lite/kernels/internal/tensor_ctypes.h" +#include "tensorflow/lite/kernels/kernel_util.h" +#include "tensorflow/lite/micro/flatbuffer_utils.h" +#include "tensorflow/lite/micro/kernels/kernel_util.h" +#include "tensorflow/lite/micro/memory_helpers.h" +#include "tensorflow/lite/micro/micro_context.h" +#include "tensorflow/lite/micro/micro_utils.h" + +#include "tensorflow/lite/micro/kernels/riscv_vector/signal/filter_bank_log_rvv.h" + +namespace tflite { +namespace { + +constexpr int kInputTensor = 0; +constexpr int kOutputTensor = 0; + +// Indices into the init flexbuffer's vector. +// The parameter's name is in the comment that follows. +// Elements in the vectors are ordered alphabetically by parameter name. +constexpr int kInputCorrectionBitsIndex = 0; // 'input_correction_bits' +constexpr int kOutputScaleIndex = 1; // 'output_scale' + +struct TFLMSignalLogParams { + int input_correction_bits; + int output_scale; +}; + +void* FilterBankLogInit(TfLiteContext* context, const char* buffer, + size_t length) { + TFLITE_DCHECK(context->AllocatePersistentBuffer != nullptr); + + auto* params = static_cast( + context->AllocatePersistentBuffer(context, sizeof(TFLMSignalLogParams))); + + if (params == nullptr) { + return nullptr; + } + tflite::FlexbufferWrapper fbw(reinterpret_cast(buffer), + length); + + params->input_correction_bits = fbw.ElementAsInt32(kInputCorrectionBitsIndex); + params->output_scale = fbw.ElementAsInt32(kOutputScaleIndex); + return params; +} + +TfLiteStatus FilterBankLogPrepare(TfLiteContext* context, TfLiteNode* node) { + TF_LITE_ENSURE_EQ(context, NumInputs(node), 1); + TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1); + + MicroContext* micro_context = GetMicroContext(context); + TfLiteTensor* input = + micro_context->AllocateTempInputTensor(node, kInputTensor); + TfLiteTensor* output = + micro_context->AllocateTempOutputTensor(node, 
kOutputTensor); + TF_LITE_ENSURE(context, input != nullptr); + TF_LITE_ENSURE(context, output != nullptr); + + TF_LITE_ENSURE_EQ(context, NumDimensions(input), 1); + TF_LITE_ENSURE_EQ(context, NumDimensions(output), 1); + + TF_LITE_ENSURE_TYPES_EQ(context, input->type, kTfLiteUInt32); + TF_LITE_ENSURE_TYPES_EQ(context, output->type, kTfLiteInt16); + + micro_context->DeallocateTempTfLiteTensor(input); + micro_context->DeallocateTempTfLiteTensor(output); + return kTfLiteOk; +} + +TfLiteStatus FilterBankLogEval(TfLiteContext* context, TfLiteNode* node) { + auto* params = reinterpret_cast(node->user_data); + + const TfLiteEvalTensor* input = + tflite::micro::GetEvalInput(context, node, kInputTensor); + TfLiteEvalTensor* output = + tflite::micro::GetEvalOutput(context, node, kOutputTensor); + + const uint32_t* input_data = tflite::micro::GetTensorData(input); + int16_t* output_data = tflite::micro::GetTensorData(output); + int num_channels = input->dims->data[0]; + FilterbankLogRVV(input_data, num_channels, params->output_scale, + params->input_correction_bits, output_data); + return kTfLiteOk; +} + +} // namespace + +namespace tflm_signal { + +TFLMRegistration* Register_FILTER_BANK_LOG() { + static TFLMRegistration r = tflite::micro::RegisterOp( + FilterBankLogInit, FilterBankLogPrepare, FilterBankLogEval); + return &r; +} + +} // namespace tflm_signal + +} // namespace tflite \ No newline at end of file diff --git a/tensorflow/lite/micro/kernels/riscv_vector/signal/filter_bank_log_rvv.cc b/tensorflow/lite/micro/kernels/riscv_vector/signal/filter_bank_log_rvv.cc new file mode 100644 index 00000000000..d66ed46cbb8 --- /dev/null +++ b/tensorflow/lite/micro/kernels/riscv_vector/signal/filter_bank_log_rvv.cc @@ -0,0 +1,204 @@ +#include + +#include "tensorflow/lite/kernels/internal/common.h" + +const uint16_t kLogLut[] = { + 0, 224, 442, 654, 861, 1063, 1259, 1450, 1636, 1817, 1992, 2163, + 2329, 2490, 2646, 2797, 2944, 3087, 3224, 3358, 3487, 3611, 3732, 3848, + 3960, 
4068, 4172, 4272, 4368, 4460, 4549, 4633, 4714, 4791, 4864, 4934, + 5001, 5063, 5123, 5178, 5231, 5280, 5326, 5368, 5408, 5444, 5477, 5507, + 5533, 5557, 5578, 5595, 5610, 5622, 5631, 5637, 5640, 5641, 5638, 5633, + 5626, 5615, 5602, 5586, 5568, 5547, 5524, 5498, 5470, 5439, 5406, 5370, + 5332, 5291, 5249, 5203, 5156, 5106, 5054, 5000, 4944, 4885, 4825, 4762, + 4697, 4630, 4561, 4490, 4416, 4341, 4264, 4184, 4103, 4020, 3935, 3848, + 3759, 3668, 3575, 3481, 3384, 3286, 3186, 3084, 2981, 2875, 2768, 2659, + 2549, 2437, 2323, 2207, 2090, 1971, 1851, 1729, 1605, 1480, 1353, 1224, + 1094, 963, 830, 695, 559, 421, 282, 142, 0, 0}; + +inline vuint32m4_t MulHighFixedPoint16_UU_vx_u32m4(vuint32m4_t v_a, uint32_t b, size_t vl) +{ + // load scalar and perform low and high multiplication + vuint32m4_t v_b = __riscv_vmv_v_x_u32m4(b, vl); + vuint32m4_t v_lo = __riscv_vmul_vv_u32m4(v_a, v_b, vl); + vuint32m4_t v_hi = __riscv_vmulhu_vv_u32m4(v_a, v_b, vl); + + // Add rounding constant 32768 to the low part + vuint32m4_t v_round = __riscv_vmv_v_x_u32m4(32768, vl); + vuint32m4_t v_lo_rounded = __riscv_vadd_vv_u32m4(v_lo, v_round, vl); + + // Detect carry from the low part addition and propagate to high part + vbool8_t v_carry = __riscv_vmsltu_vv_u32m4_b8(v_lo_rounded, v_lo, vl); + v_hi = __riscv_vadd_vx_u32m4_m(v_carry, v_hi, 1, vl); + + // Combine high shifted left and low shifted right + return __riscv_vor_vv_u32m4( + __riscv_vsll_vx_u32m4(v_hi, 16, vl), + __riscv_vsrl_vx_u32m4(v_lo_rounded, 16, vl), vl); +} + +inline vint32m4_t MulHighFixedPoint16_SU_vx_i32m4(vuint32m4_t v_unsigned, int32_t signed_scalar, size_t vl) +{ + // Load signed scalar and perform low and high multiplication + vint32m4_t v_signed_scalar = __riscv_vmv_v_x_i32m4(signed_scalar, vl); + vint32m4_t v_lo = __riscv_vmul_vv_i32m4(v_signed_scalar, __riscv_vreinterpret_v_u32m4_i32m4(v_unsigned), vl); + vint32m4_t v_hi = __riscv_vmulhsu_vv_i32m4(v_signed_scalar, v_unsigned, vl); + + // Add rounding constant 32768 to 
the low part + vint32m4_t v_round = __riscv_vmv_v_x_i32m4(32768, vl); + vint32m4_t v_lo_rounded = __riscv_vadd_vv_i32m4(v_lo, v_round, vl); + + // Detect carry treating low part as unsigned and propagate to high part + vbool8_t v_carry = __riscv_vmsltu_vv_u32m4_b8( + __riscv_vreinterpret_v_i32m4_u32m4(v_lo_rounded), + __riscv_vreinterpret_v_i32m4_u32m4(v_lo), vl); + v_hi = __riscv_vadd_vx_i32m4_m(v_carry, v_hi, 1, vl); + + // Combine high shifted left and low shifted right + vuint32m4_t v_lo_shifted = __riscv_vsrl_vx_u32m4( + __riscv_vreinterpret_v_i32m4_u32m4(v_lo_rounded), 16, vl); + + return __riscv_vor_vv_i32m4( + __riscv_vsll_vx_i32m4(v_hi, 16, vl), + __riscv_vreinterpret_v_u32m4_i32m4(v_lo_shifted), vl); +} + +inline vuint32m4_t VectorLog2Int_u32m4(vuint32m4_t v_in, size_t vl) +{ + // Initialize result vector to zero + vuint32m4_t v_result = __riscv_vmv_v_x_u32m4(0, vl); + vuint32m4_t v_tmp; + vbool8_t v_mask; + + // Check bit 16 and update result and input + v_tmp = __riscv_vsrl_vx_u32m4(v_in, 16, vl); + v_mask = __riscv_vmsne_vx_u32m4_b8(v_tmp, 0, vl); + v_result = __riscv_vadd_vx_u32m4_m(v_mask, v_result, 16, vl); + v_in = __riscv_vmerge_vvm_u32m4(v_in, v_tmp, v_mask, vl); + + // Check bit 8 and update result and input + v_tmp = __riscv_vsrl_vx_u32m4(v_in, 8, vl); + v_mask = __riscv_vmsne_vx_u32m4_b8(v_tmp, 0, vl); + v_result = __riscv_vadd_vx_u32m4_m(v_mask, v_result, 8, vl); + v_in = __riscv_vmerge_vvm_u32m4(v_in, v_tmp, v_mask, vl); + + // Check bit 4 and update result and input + v_tmp = __riscv_vsrl_vx_u32m4(v_in, 4, vl); + v_mask = __riscv_vmsne_vx_u32m4_b8(v_tmp, 0, vl); + v_result = __riscv_vadd_vx_u32m4_m(v_mask, v_result, 4, vl); + v_in = __riscv_vmerge_vvm_u32m4(v_in, v_tmp, v_mask, vl); + + // Check bit 2 and update result and input + v_tmp = __riscv_vsrl_vx_u32m4(v_in, 2, vl); + v_mask = __riscv_vmsne_vx_u32m4_b8(v_tmp, 0, vl); + v_result = __riscv_vadd_vx_u32m4_m(v_mask, v_result, 2, vl); + v_in = __riscv_vmerge_vvm_u32m4(v_in, v_tmp, v_mask, 
vl); + + // Check bit 1 and update result + v_tmp = __riscv_vsrl_vx_u32m4(v_in, 1, vl); + v_mask = __riscv_vmsne_vx_u32m4_b8(v_tmp, 0, vl); + v_result = __riscv_vadd_vx_u32m4_m(v_mask, v_result, 1, vl); + + return v_result; +} + +void FilterbankLogRVV(const uint32_t* input, int num_channels, + int32_t output_scale, uint32_t correction_bits, + int16_t* output) +{ + const uint32_t kLogScaleLog2 = 16; + const uint32_t kLogCoeff = 45426; + + int i = 0; + while (i < num_channels) + { + // Set vector length for 32-bit elements and group multiplier 4 + size_t vl = __riscv_vsetvl_e32m4(num_channels - i); + + // Load input, shift by correction bits, and determine active elements + vuint32m4_t v_input = __riscv_vle32_v_u32m4(input + i, vl); + vuint32m4_t v_scaled = __riscv_vsll_vx_u32m4(v_input, correction_bits, vl); + vbool8_t v_active = __riscv_vmsgtu_vx_u32m4_b8(v_scaled, 1, vl); + + // Calculate integer part of log2 + vuint32m4_t v_integer = VectorLog2Int_u32m4(v_scaled, vl); + + // Calculate shift amount to align MSB to bit 16 + vint32m4_t v_shift_amt = __riscv_vrsub_vx_i32m4( + __riscv_vreinterpret_v_u32m4_i32m4(v_integer), 16, vl); + + // Create mask for left shifting vs right shifting + vbool8_t v_shift_left_mask = __riscv_vmsgt_vx_i32m4_b8(v_shift_amt, 0, vl); + vuint32m4_t v_shift_u32 = __riscv_vreinterpret_v_i32m4_u32m4(v_shift_amt); + + // Perform shifts and merge results based on direction mask + vuint32m4_t v_aligned_left = __riscv_vsll_vv_u32m4(v_scaled, v_shift_u32, vl); + vuint32m4_t v_aligned_right = __riscv_vsrl_vv_u32m4( + v_scaled, __riscv_vneg_v_u32m4(v_shift_u32, vl), vl); + vuint32m4_t v_aligned = __riscv_vmerge_vvm_u32m4( + v_aligned_right, v_aligned_left, v_shift_left_mask, vl); + + // Extract fractional part by keeping bottom 16 bits + vuint32m4_t v_frac = __riscv_vand_vx_u32m4(v_aligned, 0xFFFF, vl); + + // Calculate base segment for LUT lookup + vuint32m4_t v_base_seg = __riscv_vsrl_vx_u32m4(v_frac, 9, vl); + vuint16m2_t v_base_seg_u16 = 
__riscv_vncvt_x_x_w_u16m2(v_base_seg, vl); + + // Calculate offsets for c0 and c1 coefficients + vuint16m2_t v_offset_c0 = __riscv_vsll_vx_u16m2(v_base_seg_u16, 1, vl); + vuint16m2_t v_offset_c1 = __riscv_vadd_vx_u16m2(v_offset_c0, 2, vl); + + // Switch to 16-bit element width for gather load to ensure correct data width + vl = __riscv_vsetvl_e16m2(vl); + vuint16m2_t v_c0_u16 = __riscv_vluxei16_v_u16m2(kLogLut, v_offset_c0, vl); + vuint16m2_t v_c1_u16 = __riscv_vluxei16_v_u16m2(kLogLut, v_offset_c1, vl); + + // Switch back to 32-bit element width for remaining computation + vl = __riscv_vsetvl_e32m4(vl); + + // Widen loaded 16-bit coefficients to 32-bit + vuint32m4_t v_c0 = __riscv_vwaddu_vx_u32m4(v_c0_u16, 0, vl); + vuint32m4_t v_c1 = __riscv_vwaddu_vx_u32m4(v_c1_u16, 0, vl); + + // Calculate linear interpolation + vuint32m4_t v_seg_base = __riscv_vand_vx_u32m4(v_frac, 0xFE00, vl); + vuint32m4_t v_dist = __riscv_vsub_vv_u32m4(v_frac, v_seg_base, vl); + + // Compute difference between coefficients and multiply by distance + vint32m4_t v_diff = __riscv_vsub_vv_i32m4( + __riscv_vreinterpret_v_u32m4_i32m4(v_c1), + __riscv_vreinterpret_v_u32m4_i32m4(v_c0), vl); + vint32m4_t v_rel_pos = __riscv_vmul_vv_i32m4( + v_diff, __riscv_vreinterpret_v_u32m4_i32m4(v_dist), vl); + v_rel_pos = __riscv_vsra_vx_i32m4(v_rel_pos, kLogScaleLog2, vl); + + // Add interpolation result to base coefficient and fraction + vuint32m4_t v_final_frac = __riscv_vadd_vv_u32m4(v_frac, v_c0, vl); + v_final_frac = __riscv_vadd_vv_u32m4( + v_final_frac, __riscv_vreinterpret_v_i32m4_u32m4(v_rel_pos), vl); + + // Construct final log2 value + vuint32m4_t v_log2 = __riscv_vsll_vx_u32m4(v_integer, 16, vl); + v_log2 = __riscv_vadd_vv_u32m4(v_log2, v_final_frac, vl); + + // Convert Log2 to LogE using fixed point multiplication + vuint32m4_t v_loge = MulHighFixedPoint16_UU_vx_u32m4(v_log2, kLogCoeff, vl); + + // Apply output scaling + vint32m4_t v_loge_scaled = MulHighFixedPoint16_SU_vx_i32m4(v_loge, 
output_scale, vl); + + // Saturate result to 16-bit max value + vint32m4_t v_sat_val = __riscv_vmv_v_x_i32m4(INT16_MAX, vl); + vint32m4_t v_result = __riscv_vmin_vv_i32m4(v_loge_scaled, v_sat_val, vl); + + // Zero out inactive elements where input was less than or equal to 1 + vint32m4_t v_zero = __riscv_vmv_v_x_i32m4(0, vl); + v_result = __riscv_vmerge_vvm_i32m4(v_zero, v_result, v_active, vl); + + // Narrow 32-bit result to 16-bit and store + vint16m2_t v_res_i16 = __riscv_vncvt_x_x_w_i16m2(v_result, vl); + __riscv_vse16_v_i16m2(output + i, v_res_i16, vl); + + i += vl; + } +} \ No newline at end of file diff --git a/tensorflow/lite/micro/kernels/riscv_vector/signal/filter_bank_log_rvv.h b/tensorflow/lite/micro/kernels/riscv_vector/signal/filter_bank_log_rvv.h new file mode 100644 index 00000000000..2cb7c974328 --- /dev/null +++ b/tensorflow/lite/micro/kernels/riscv_vector/signal/filter_bank_log_rvv.h @@ -0,0 +1,10 @@ +#ifndef TENSORFLOW_LITE_MICRO_KERNELS_RISCV_VECTOR_FILTER_BANK_LOG_RVV_H_ +#define TENSORFLOW_LITE_MICRO_KERNELS_RISCV_VECTOR_FILTER_BANK_LOG_RVV_H_ + +#include "tensorflow/lite/kernels/internal/common.h" + +void FilterbankLogRVV(const uint32_t* input, int num_channels, + int32_t output_scale, uint32_t correction_bits, + int16_t* output); + +#endif \ No newline at end of file diff --git a/tensorflow/lite/micro/kernels/riscv_vector/signal/filter_bank_rvv.h b/tensorflow/lite/micro/kernels/riscv_vector/signal/filter_bank_rvv.h index 14cd5c58461..8f03a819f68 100644 --- a/tensorflow/lite/micro/kernels/riscv_vector/signal/filter_bank_rvv.h +++ b/tensorflow/lite/micro/kernels/riscv_vector/signal/filter_bank_rvv.h @@ -1,5 +1,5 @@ -#ifndef SIGNAL_SRC_FILTER_BANK_H_ -#define SIGNAL_SRC_FILTER_BANK_H_ +#ifndef TENSORFLOW_LITE_MICRO_KERNELS_RISCV_VECTOR_FILTER_BANK_RVV_H_ +#define TENSORFLOW_LITE_MICRO_KERNELS_RISCV_VECTOR_FILTER_BANK_RVV_H_ #include @@ -20,4 +20,4 @@ struct FilterbankConfig { void FilterbankAccumulateChannelsRVV(const FilterbankConfig* config, 
const uint32_t* input, uint64_t* output); -#endif // SIGNAL_SRC_FILTER_BANK_H_ \ No newline at end of file +#endif // TENSORFLOW_LITE_MICRO_KERNELS_RISCV_VECTOR_FILTER_BANK_RVV_H_ \ No newline at end of file diff --git a/tensorflow/lite/micro/tools/make/targets/riscv32_vector_makefile.inc b/tensorflow/lite/micro/tools/make/targets/riscv32_vector_makefile.inc index 3cff6d110ca..24fb8f132d7 100644 --- a/tensorflow/lite/micro/tools/make/targets/riscv32_vector_makefile.inc +++ b/tensorflow/lite/micro/tools/make/targets/riscv32_vector_makefile.inc @@ -71,6 +71,8 @@ MICROLITE_CC_SRCS += \ tensorflow/lite/micro/kernels/riscv_vector/signal/rfft_int16_rvv.cc \ tensorflow/lite/micro/kernels/riscv_vector/signal/filter_bank.cc \ tensorflow/lite/micro/kernels/riscv_vector/signal/filter_bank_rvv.cc \ + tensorflow/lite/micro/kernels/riscv_vector/signal/filter_bank_log.cc \ + tensorflow/lite/micro/kernels/riscv_vector/signal/filter_bank_log_rvv.cc \ tensorflow/lite/micro/kernels/riscv_vector/softmax.cc \ EXCLUDED_SRCS := \ @@ -83,6 +85,8 @@ EXCLUDED_SRCS := \ signal/src/kiss_fft_wrappers/kiss_fft_int16.cc \ signal/micro/kernels/filter_bank.cc \ signal/src/filter_bank.cc \ + signal/src/filter_bank_log.cc \ + signal/micro/kernels/filter_bank_log.cc \ tensorflow/lite/micro/kernels/softmax.cc \ From 2190e8a84eecc89f61a20ea5f349fbad5886fe7c Mon Sep 17 00:00:00 2001 From: JaimeHW Date: Wed, 19 Nov 2025 01:01:38 -0600 Subject: [PATCH 67/86] Optimize FilterbankLogRVV with branchless normalization and fix LUT gather offsets --- .../signal/filter_bank_log_rvv.cc | 166 +++++++++--------- 1 file changed, 84 insertions(+), 82 deletions(-) diff --git a/tensorflow/lite/micro/kernels/riscv_vector/signal/filter_bank_log_rvv.cc b/tensorflow/lite/micro/kernels/riscv_vector/signal/filter_bank_log_rvv.cc index d66ed46cbb8..3ba9c9a37db 100644 --- a/tensorflow/lite/micro/kernels/riscv_vector/signal/filter_bank_log_rvv.cc +++ b/tensorflow/lite/micro/kernels/riscv_vector/signal/filter_bank_log_rvv.cc @@ 
-2,6 +2,9 @@ #include "tensorflow/lite/kernels/internal/common.h" +#include +#include "tensorflow/lite/kernels/internal/common.h" + const uint16_t kLogLut[] = { 0, 224, 442, 654, 861, 1063, 1259, 1450, 1636, 1817, 1992, 2163, 2329, 2490, 2646, 2797, 2944, 3087, 3224, 3358, 3487, 3611, 3732, 3848, @@ -15,9 +18,54 @@ const uint16_t kLogLut[] = { 2549, 2437, 2323, 2207, 2090, 1971, 1851, 1729, 1605, 1480, 1353, 1224, 1094, 963, 830, 695, 559, 421, 282, 142, 0, 0}; +inline vuint32m4_t VectorLog2Int_Zve32x(vuint32m4_t v_in, size_t vl) +{ + // Initialize variables + vuint32m4_t v_result = __riscv_vmv_v_x_u32m4(0, vl); + vuint32m4_t v_tmp; + vbool8_t v_mask; + vuint32m4_t v_added; + + // Check bit 16 + v_tmp = __riscv_vsrl_vx_u32m4(v_in, 16, vl); + v_mask = __riscv_vmsne_vx_u32m4_b8(v_tmp, 0, vl); + v_added = __riscv_vadd_vx_u32m4(v_result, 16, vl); + v_result = __riscv_vmerge_vvm_u32m4(v_result, v_added, v_mask, vl); + v_in = __riscv_vmerge_vvm_u32m4(v_in, v_tmp, v_mask, vl); + + // Check bit 8 + v_tmp = __riscv_vsrl_vx_u32m4(v_in, 8, vl); + v_mask = __riscv_vmsne_vx_u32m4_b8(v_tmp, 0, vl); + v_added = __riscv_vadd_vx_u32m4(v_result, 8, vl); + v_result = __riscv_vmerge_vvm_u32m4(v_result, v_added, v_mask, vl); + v_in = __riscv_vmerge_vvm_u32m4(v_in, v_tmp, v_mask, vl); + + // Check bit 4 + v_tmp = __riscv_vsrl_vx_u32m4(v_in, 4, vl); + v_mask = __riscv_vmsne_vx_u32m4_b8(v_tmp, 0, vl); + v_added = __riscv_vadd_vx_u32m4(v_result, 4, vl); + v_result = __riscv_vmerge_vvm_u32m4(v_result, v_added, v_mask, vl); + v_in = __riscv_vmerge_vvm_u32m4(v_in, v_tmp, v_mask, vl); + + // Check bit 2 + v_tmp = __riscv_vsrl_vx_u32m4(v_in, 2, vl); + v_mask = __riscv_vmsne_vx_u32m4_b8(v_tmp, 0, vl); + v_added = __riscv_vadd_vx_u32m4(v_result, 2, vl); + v_result = __riscv_vmerge_vvm_u32m4(v_result, v_added, v_mask, vl); + v_in = __riscv_vmerge_vvm_u32m4(v_in, v_tmp, v_mask, vl); + + // Check bit 1 + v_tmp = __riscv_vsrl_vx_u32m4(v_in, 1, vl); + v_mask = __riscv_vmsne_vx_u32m4_b8(v_tmp, 0, vl); 
+ v_added = __riscv_vadd_vx_u32m4(v_result, 1, vl); + v_result = __riscv_vmerge_vvm_u32m4(v_result, v_added, v_mask, vl); + + return v_result; +} + inline vuint32m4_t MulHighFixedPoint16_UU_vx_u32m4(vuint32m4_t v_a, uint32_t b, size_t vl) { - // load scalar and perform low and high multiplication + // Load scalar and perform low and high multiplication vuint32m4_t v_b = __riscv_vmv_v_x_u32m4(b, vl); vuint32m4_t v_lo = __riscv_vmul_vv_u32m4(v_a, v_b, vl); vuint32m4_t v_hi = __riscv_vmulhu_vv_u32m4(v_a, v_b, vl); @@ -26,20 +74,24 @@ inline vuint32m4_t MulHighFixedPoint16_UU_vx_u32m4(vuint32m4_t v_a, uint32_t b, vuint32m4_t v_round = __riscv_vmv_v_x_u32m4(32768, vl); vuint32m4_t v_lo_rounded = __riscv_vadd_vv_u32m4(v_lo, v_round, vl); - // Detect carry from the low part addition and propagate to high part + // Propagate carry to high part if overflow occurred vbool8_t v_carry = __riscv_vmsltu_vv_u32m4_b8(v_lo_rounded, v_lo, vl); - v_hi = __riscv_vadd_vx_u32m4_m(v_carry, v_hi, 1, vl); + vuint32m4_t v_hi_plus_1 = __riscv_vadd_vx_u32m4(v_hi, 1, vl); + v_hi = __riscv_vmerge_vvm_u32m4(v_hi, v_hi_plus_1, v_carry, vl); // Combine high shifted left and low shifted right - return __riscv_vor_vv_u32m4( - __riscv_vsll_vx_u32m4(v_hi, 16, vl), - __riscv_vsrl_vx_u32m4(v_lo_rounded, 16, vl), vl); + vuint32m4_t v_hi_shifted = __riscv_vsll_vx_u32m4(v_hi, 16, vl); + vuint32m4_t v_lo_shifted = __riscv_vsrl_vx_u32m4(v_lo_rounded, 16, vl); + + return __riscv_vor_vv_u32m4(v_hi_shifted, v_lo_shifted, vl); } inline vint32m4_t MulHighFixedPoint16_SU_vx_i32m4(vuint32m4_t v_unsigned, int32_t signed_scalar, size_t vl) { - // Load signed scalar and perform low and high multiplication + // Load scalar vint32m4_t v_signed_scalar = __riscv_vmv_v_x_i32m4(signed_scalar, vl); + + // Perform low and high multiplication vint32m4_t v_lo = __riscv_vmul_vv_i32m4(v_signed_scalar, __riscv_vreinterpret_v_u32m4_i32m4(v_unsigned), vl); vint32m4_t v_hi = __riscv_vmulhsu_vv_i32m4(v_signed_scalar, v_unsigned, vl); 
@@ -47,11 +99,12 @@ inline vint32m4_t MulHighFixedPoint16_SU_vx_i32m4(vuint32m4_t v_unsigned, int32_ vint32m4_t v_round = __riscv_vmv_v_x_i32m4(32768, vl); vint32m4_t v_lo_rounded = __riscv_vadd_vv_i32m4(v_lo, v_round, vl); - // Detect carry treating low part as unsigned and propagate to high part + // Propagate carry to high part using unsigned comparison for safety vbool8_t v_carry = __riscv_vmsltu_vv_u32m4_b8( __riscv_vreinterpret_v_i32m4_u32m4(v_lo_rounded), __riscv_vreinterpret_v_i32m4_u32m4(v_lo), vl); - v_hi = __riscv_vadd_vx_i32m4_m(v_carry, v_hi, 1, vl); + vint32m4_t v_hi_plus_1 = __riscv_vadd_vx_i32m4(v_hi, 1, vl); + v_hi = __riscv_vmerge_vvm_i32m4(v_hi, v_hi_plus_1, v_carry, vl); // Combine high shifted left and low shifted right vuint32m4_t v_lo_shifted = __riscv_vsrl_vx_u32m4( @@ -62,109 +115,60 @@ inline vint32m4_t MulHighFixedPoint16_SU_vx_i32m4(vuint32m4_t v_unsigned, int32_ __riscv_vreinterpret_v_u32m4_i32m4(v_lo_shifted), vl); } -inline vuint32m4_t VectorLog2Int_u32m4(vuint32m4_t v_in, size_t vl) -{ - // Initialize result vector to zero - vuint32m4_t v_result = __riscv_vmv_v_x_u32m4(0, vl); - vuint32m4_t v_tmp; - vbool8_t v_mask; - - // Check bit 16 and update result and input - v_tmp = __riscv_vsrl_vx_u32m4(v_in, 16, vl); - v_mask = __riscv_vmsne_vx_u32m4_b8(v_tmp, 0, vl); - v_result = __riscv_vadd_vx_u32m4_m(v_mask, v_result, 16, vl); - v_in = __riscv_vmerge_vvm_u32m4(v_in, v_tmp, v_mask, vl); - - // Check bit 8 and update result and input - v_tmp = __riscv_vsrl_vx_u32m4(v_in, 8, vl); - v_mask = __riscv_vmsne_vx_u32m4_b8(v_tmp, 0, vl); - v_result = __riscv_vadd_vx_u32m4_m(v_mask, v_result, 8, vl); - v_in = __riscv_vmerge_vvm_u32m4(v_in, v_tmp, v_mask, vl); - - // Check bit 4 and update result and input - v_tmp = __riscv_vsrl_vx_u32m4(v_in, 4, vl); - v_mask = __riscv_vmsne_vx_u32m4_b8(v_tmp, 0, vl); - v_result = __riscv_vadd_vx_u32m4_m(v_mask, v_result, 4, vl); - v_in = __riscv_vmerge_vvm_u32m4(v_in, v_tmp, v_mask, vl); - - // Check bit 2 and 
update result and input - v_tmp = __riscv_vsrl_vx_u32m4(v_in, 2, vl); - v_mask = __riscv_vmsne_vx_u32m4_b8(v_tmp, 0, vl); - v_result = __riscv_vadd_vx_u32m4_m(v_mask, v_result, 2, vl); - v_in = __riscv_vmerge_vvm_u32m4(v_in, v_tmp, v_mask, vl); - - // Check bit 1 and update result - v_tmp = __riscv_vsrl_vx_u32m4(v_in, 1, vl); - v_mask = __riscv_vmsne_vx_u32m4_b8(v_tmp, 0, vl); - v_result = __riscv_vadd_vx_u32m4_m(v_mask, v_result, 1, vl); - - return v_result; -} - void FilterbankLogRVV(const uint32_t* input, int num_channels, int32_t output_scale, uint32_t correction_bits, int16_t* output) { - const uint32_t kLogScaleLog2 = 16; const uint32_t kLogCoeff = 45426; + const uint32_t kLogScaleLog2 = 16; int i = 0; while (i < num_channels) { - // Set vector length for 32-bit elements and group multiplier 4 + // Set vector length size_t vl = __riscv_vsetvl_e32m4(num_channels - i); - // Load input, shift by correction bits, and determine active elements + // Load input and shift by correction bits vuint32m4_t v_input = __riscv_vle32_v_u32m4(input + i, vl); vuint32m4_t v_scaled = __riscv_vsll_vx_u32m4(v_input, correction_bits, vl); + + // Identify active elements where input is greater than 1 vbool8_t v_active = __riscv_vmsgtu_vx_u32m4_b8(v_scaled, 1, vl); // Calculate integer part of log2 - vuint32m4_t v_integer = VectorLog2Int_u32m4(v_scaled, vl); + vuint32m4_t v_integer = VectorLog2Int_Zve32x(v_scaled, vl); - // Calculate shift amount to align MSB to bit 16 - vint32m4_t v_shift_amt = __riscv_vrsub_vx_i32m4( - __riscv_vreinterpret_v_u32m4_i32m4(v_integer), 16, vl); + // Normalize mantissa to [1.0, 2.0) in Q16 by aligning MSB to bit 31 then shifting down + vuint32m4_t v_shift_to_msb = __riscv_vrsub_vx_u32m4(v_integer, 31, vl); + vuint32m4_t v_norm = __riscv_vsll_vv_u32m4(v_scaled, v_shift_to_msb, vl); + vuint32m4_t v_aligned = __riscv_vsrl_vx_u32m4(v_norm, 15, vl); - // Create mask for left shifting vs right shifting - vbool8_t v_shift_left_mask = 
__riscv_vmsgt_vx_i32m4_b8(v_shift_amt, 0, vl); - vuint32m4_t v_shift_u32 = __riscv_vreinterpret_v_i32m4_u32m4(v_shift_amt); - - // Perform shifts and merge results based on direction mask - vuint32m4_t v_aligned_left = __riscv_vsll_vv_u32m4(v_scaled, v_shift_u32, vl); - vuint32m4_t v_aligned_right = __riscv_vsrl_vv_u32m4( - v_scaled, __riscv_vneg_v_u32m4(v_shift_u32, vl), vl); - vuint32m4_t v_aligned = __riscv_vmerge_vvm_u32m4( - v_aligned_right, v_aligned_left, v_shift_left_mask, vl); - - // Extract fractional part by keeping bottom 16 bits + // Extract fractional part (lower 16 bits) vuint32m4_t v_frac = __riscv_vand_vx_u32m4(v_aligned, 0xFFFF, vl); - // Calculate base segment for LUT lookup + // Calculate base segment index for LUT vuint32m4_t v_base_seg = __riscv_vsrl_vx_u32m4(v_frac, 9, vl); vuint16m2_t v_base_seg_u16 = __riscv_vncvt_x_x_w_u16m2(v_base_seg, vl); - // Calculate offsets for c0 and c1 coefficients + // Calculate offsets for LUT access multiplying indices by 2 for byte addressing vuint16m2_t v_offset_c0 = __riscv_vsll_vx_u16m2(v_base_seg_u16, 1, vl); vuint16m2_t v_offset_c1 = __riscv_vadd_vx_u16m2(v_offset_c0, 2, vl); - // Switch to 16-bit element width for gather load to ensure correct data width + // Gather LUT coefficients using 16-bit element width vl = __riscv_vsetvl_e16m2(vl); vuint16m2_t v_c0_u16 = __riscv_vluxei16_v_u16m2(kLogLut, v_offset_c0, vl); vuint16m2_t v_c1_u16 = __riscv_vluxei16_v_u16m2(kLogLut, v_offset_c1, vl); - // Switch back to 32-bit element width for remaining computation + // Restore vector length and widen coefficients to 32-bit vl = __riscv_vsetvl_e32m4(vl); - - // Widen loaded 16-bit coefficients to 32-bit vuint32m4_t v_c0 = __riscv_vwaddu_vx_u32m4(v_c0_u16, 0, vl); vuint32m4_t v_c1 = __riscv_vwaddu_vx_u32m4(v_c1_u16, 0, vl); - // Calculate linear interpolation + // Calculate linear interpolation distance vuint32m4_t v_seg_base = __riscv_vand_vx_u32m4(v_frac, 0xFE00, vl); vuint32m4_t v_dist = 
__riscv_vsub_vv_u32m4(v_frac, v_seg_base, vl); - // Compute difference between coefficients and multiply by distance + // Calculate relative position from the difference between coefficients vint32m4_t v_diff = __riscv_vsub_vv_i32m4( __riscv_vreinterpret_v_u32m4_i32m4(v_c1), __riscv_vreinterpret_v_u32m4_i32m4(v_c0), vl); @@ -172,30 +176,28 @@ void FilterbankLogRVV(const uint32_t* input, int num_channels, v_diff, __riscv_vreinterpret_v_u32m4_i32m4(v_dist), vl); v_rel_pos = __riscv_vsra_vx_i32m4(v_rel_pos, kLogScaleLog2, vl); - // Add interpolation result to base coefficient and fraction + // Combine base fractional part, coefficient, and interpolation result vuint32m4_t v_final_frac = __riscv_vadd_vv_u32m4(v_frac, v_c0, vl); v_final_frac = __riscv_vadd_vv_u32m4( v_final_frac, __riscv_vreinterpret_v_i32m4_u32m4(v_rel_pos), vl); - // Construct final log2 value + // Combine integer part and fractional part to form final log2 value vuint32m4_t v_log2 = __riscv_vsll_vx_u32m4(v_integer, 16, vl); v_log2 = __riscv_vadd_vv_u32m4(v_log2, v_final_frac, vl); // Convert Log2 to LogE using fixed point multiplication vuint32m4_t v_loge = MulHighFixedPoint16_UU_vx_u32m4(v_log2, kLogCoeff, vl); - // Apply output scaling + // Apply output scaling and saturate to 16-bit range vint32m4_t v_loge_scaled = MulHighFixedPoint16_SU_vx_i32m4(v_loge, output_scale, vl); - - // Saturate result to 16-bit max value vint32m4_t v_sat_val = __riscv_vmv_v_x_i32m4(INT16_MAX, vl); vint32m4_t v_result = __riscv_vmin_vv_i32m4(v_loge_scaled, v_sat_val, vl); - // Zero out inactive elements where input was less than or equal to 1 + // Zero out inactive elements vint32m4_t v_zero = __riscv_vmv_v_x_i32m4(0, vl); v_result = __riscv_vmerge_vvm_i32m4(v_zero, v_result, v_active, vl); - // Narrow 32-bit result to 16-bit and store + // Narrow result to 16-bit and store to memory vint16m2_t v_res_i16 = __riscv_vncvt_x_x_w_i16m2(v_result, vl); __riscv_vse16_v_i16m2(output + i, v_res_i16, vl); From 
007ce6ecb49865d382479b26dc39ebc25b53d2d3 Mon Sep 17 00:00:00 2001 From: JaimeHW Date: Wed, 19 Nov 2025 01:03:49 -0600 Subject: [PATCH 68/86] Remove redundant included headers --- .../micro/kernels/riscv_vector/signal/filter_bank_log_rvv.cc | 3 --- 1 file changed, 3 deletions(-) diff --git a/tensorflow/lite/micro/kernels/riscv_vector/signal/filter_bank_log_rvv.cc b/tensorflow/lite/micro/kernels/riscv_vector/signal/filter_bank_log_rvv.cc index 3ba9c9a37db..f84650eb2d9 100644 --- a/tensorflow/lite/micro/kernels/riscv_vector/signal/filter_bank_log_rvv.cc +++ b/tensorflow/lite/micro/kernels/riscv_vector/signal/filter_bank_log_rvv.cc @@ -2,9 +2,6 @@ #include "tensorflow/lite/kernels/internal/common.h" -#include -#include "tensorflow/lite/kernels/internal/common.h" - const uint16_t kLogLut[] = { 0, 224, 442, 654, 861, 1063, 1259, 1450, 1636, 1817, 1992, 2163, 2329, 2490, 2646, 2797, 2944, 3087, 3224, 3358, 3487, 3611, 3732, 3848, From 06331334b0a6de59d0ede292745807cd8bf4338b Mon Sep 17 00:00:00 2001 From: JaimeHW Date: Wed, 19 Nov 2025 01:27:40 -0600 Subject: [PATCH 69/86] Optimize FilterbankLogRVV kernel using widening multiply and arithmetic decomposition --- .../signal/filter_bank_log_rvv.cc | 105 ++++++------------ 1 file changed, 36 insertions(+), 69 deletions(-) diff --git a/tensorflow/lite/micro/kernels/riscv_vector/signal/filter_bank_log_rvv.cc b/tensorflow/lite/micro/kernels/riscv_vector/signal/filter_bank_log_rvv.cc index f84650eb2d9..b9453f1d5a4 100644 --- a/tensorflow/lite/micro/kernels/riscv_vector/signal/filter_bank_log_rvv.cc +++ b/tensorflow/lite/micro/kernels/riscv_vector/signal/filter_bank_log_rvv.cc @@ -2,6 +2,8 @@ #include "tensorflow/lite/kernels/internal/common.h" +constexpr uint16_t kLogCoeff = 45426; + const uint16_t kLogLut[] = { 0, 224, 442, 654, 861, 1063, 1259, 1450, 1636, 1817, 1992, 2163, 2329, 2490, 2646, 2797, 2944, 3087, 3224, 3358, 3487, 3611, 3732, 3848, @@ -21,68 +23,39 @@ inline vuint32m4_t VectorLog2Int_Zve32x(vuint32m4_t v_in, size_t 
vl) vuint32m4_t v_result = __riscv_vmv_v_x_u32m4(0, vl); vuint32m4_t v_tmp; vbool8_t v_mask; - vuint32m4_t v_added; // Check bit 16 v_tmp = __riscv_vsrl_vx_u32m4(v_in, 16, vl); v_mask = __riscv_vmsne_vx_u32m4_b8(v_tmp, 0, vl); - v_added = __riscv_vadd_vx_u32m4(v_result, 16, vl); - v_result = __riscv_vmerge_vvm_u32m4(v_result, v_added, v_mask, vl); + v_result = __riscv_vadd_vx_u32m4_mu(v_mask, v_result, v_result, 16, vl); v_in = __riscv_vmerge_vvm_u32m4(v_in, v_tmp, v_mask, vl); // Check bit 8 v_tmp = __riscv_vsrl_vx_u32m4(v_in, 8, vl); v_mask = __riscv_vmsne_vx_u32m4_b8(v_tmp, 0, vl); - v_added = __riscv_vadd_vx_u32m4(v_result, 8, vl); - v_result = __riscv_vmerge_vvm_u32m4(v_result, v_added, v_mask, vl); + v_result = __riscv_vadd_vx_u32m4_mu(v_mask, v_result, v_result, 8, vl); v_in = __riscv_vmerge_vvm_u32m4(v_in, v_tmp, v_mask, vl); // Check bit 4 v_tmp = __riscv_vsrl_vx_u32m4(v_in, 4, vl); v_mask = __riscv_vmsne_vx_u32m4_b8(v_tmp, 0, vl); - v_added = __riscv_vadd_vx_u32m4(v_result, 4, vl); - v_result = __riscv_vmerge_vvm_u32m4(v_result, v_added, v_mask, vl); + v_result = __riscv_vadd_vx_u32m4_mu(v_mask, v_result, v_result, 4, vl); v_in = __riscv_vmerge_vvm_u32m4(v_in, v_tmp, v_mask, vl); // Check bit 2 v_tmp = __riscv_vsrl_vx_u32m4(v_in, 2, vl); v_mask = __riscv_vmsne_vx_u32m4_b8(v_tmp, 0, vl); - v_added = __riscv_vadd_vx_u32m4(v_result, 2, vl); - v_result = __riscv_vmerge_vvm_u32m4(v_result, v_added, v_mask, vl); + v_result = __riscv_vadd_vx_u32m4_mu(v_mask, v_result, v_result, 2, vl); v_in = __riscv_vmerge_vvm_u32m4(v_in, v_tmp, v_mask, vl); // Check bit 1 v_tmp = __riscv_vsrl_vx_u32m4(v_in, 1, vl); v_mask = __riscv_vmsne_vx_u32m4_b8(v_tmp, 0, vl); - v_added = __riscv_vadd_vx_u32m4(v_result, 1, vl); - v_result = __riscv_vmerge_vvm_u32m4(v_result, v_added, v_mask, vl); + v_result = __riscv_vadd_vx_u32m4_mu(v_mask, v_result, v_result, 1, vl); return v_result; } -inline vuint32m4_t MulHighFixedPoint16_UU_vx_u32m4(vuint32m4_t v_a, uint32_t b, size_t vl) -{ - // 
Load scalar and perform low and high multiplication - vuint32m4_t v_b = __riscv_vmv_v_x_u32m4(b, vl); - vuint32m4_t v_lo = __riscv_vmul_vv_u32m4(v_a, v_b, vl); - vuint32m4_t v_hi = __riscv_vmulhu_vv_u32m4(v_a, v_b, vl); - - // Add rounding constant 32768 to the low part - vuint32m4_t v_round = __riscv_vmv_v_x_u32m4(32768, vl); - vuint32m4_t v_lo_rounded = __riscv_vadd_vv_u32m4(v_lo, v_round, vl); - - // Propagate carry to high part if overflow occurred - vbool8_t v_carry = __riscv_vmsltu_vv_u32m4_b8(v_lo_rounded, v_lo, vl); - vuint32m4_t v_hi_plus_1 = __riscv_vadd_vx_u32m4(v_hi, 1, vl); - v_hi = __riscv_vmerge_vvm_u32m4(v_hi, v_hi_plus_1, v_carry, vl); - - // Combine high shifted left and low shifted right - vuint32m4_t v_hi_shifted = __riscv_vsll_vx_u32m4(v_hi, 16, vl); - vuint32m4_t v_lo_shifted = __riscv_vsrl_vx_u32m4(v_lo_rounded, 16, vl); - - return __riscv_vor_vv_u32m4(v_hi_shifted, v_lo_shifted, vl); -} - inline vint32m4_t MulHighFixedPoint16_SU_vx_i32m4(vuint32m4_t v_unsigned, int32_t signed_scalar, size_t vl) { // Load scalar @@ -116,9 +89,6 @@ void FilterbankLogRVV(const uint32_t* input, int num_channels, int32_t output_scale, uint32_t correction_bits, int16_t* output) { - const uint32_t kLogCoeff = 45426; - const uint32_t kLogScaleLog2 = 16; - int i = 0; while (i < num_channels) { @@ -152,41 +122,38 @@ void FilterbankLogRVV(const uint32_t* input, int num_channels, vuint16m2_t v_offset_c1 = __riscv_vadd_vx_u16m2(v_offset_c0, 2, vl); // Gather LUT coefficients using 16-bit element width - vl = __riscv_vsetvl_e16m2(vl); - vuint16m2_t v_c0_u16 = __riscv_vluxei16_v_u16m2(kLogLut, v_offset_c0, vl); - vuint16m2_t v_c1_u16 = __riscv_vluxei16_v_u16m2(kLogLut, v_offset_c1, vl); + size_t vl_u16 = __riscv_vsetvl_e16m2(vl); + vuint16m2_t v_c0_u16 = __riscv_vluxei16_v_u16m2(kLogLut, v_offset_c0, vl_u16); + vuint16m2_t v_c1_u16 = __riscv_vluxei16_v_u16m2(kLogLut, v_offset_c1, vl_u16); - // Restore vector length and widen coefficients to 32-bit + // Calculate dist and 
diff in 16-bit + vuint16m2_t v_seg_base = __riscv_vand_vx_u16m2(__riscv_vncvt_x_x_w_u16m2(v_frac, vl), 0xFE00, vl_u16); + vuint16m2_t v_dist = __riscv_vsub_vv_u16m2(__riscv_vncvt_x_x_w_u16m2(v_frac, vl), v_seg_base, vl_u16); + vint16m2_t v_diff = __riscv_vsub_vv_i16m2(__riscv_vreinterpret_v_u16m2_i16m2(v_c1_u16), __riscv_vreinterpret_v_u16m2_i16m2(v_c0_u16), vl_u16); + + // Restore vector length for 32-bit operations vl = __riscv_vsetvl_e32m4(vl); - vuint32m4_t v_c0 = __riscv_vwaddu_vx_u32m4(v_c0_u16, 0, vl); - vuint32m4_t v_c1 = __riscv_vwaddu_vx_u32m4(v_c1_u16, 0, vl); - - // Calculate linear interpolation distance - vuint32m4_t v_seg_base = __riscv_vand_vx_u32m4(v_frac, 0xFE00, vl); - vuint32m4_t v_dist = __riscv_vsub_vv_u32m4(v_frac, v_seg_base, vl); - - // Calculate relative position from the difference between coefficients - vint32m4_t v_diff = __riscv_vsub_vv_i32m4( - __riscv_vreinterpret_v_u32m4_i32m4(v_c1), - __riscv_vreinterpret_v_u32m4_i32m4(v_c0), vl); - vint32m4_t v_rel_pos = __riscv_vmul_vv_i32m4( - v_diff, __riscv_vreinterpret_v_u32m4_i32m4(v_dist), vl); - v_rel_pos = __riscv_vsra_vx_i32m4(v_rel_pos, kLogScaleLog2, vl); - - // Combine base fractional part, coefficient, and interpolation result - vuint32m4_t v_final_frac = __riscv_vadd_vv_u32m4(v_frac, v_c0, vl); - v_final_frac = __riscv_vadd_vv_u32m4( - v_final_frac, __riscv_vreinterpret_v_i32m4_u32m4(v_rel_pos), vl); - - // Combine integer part and fractional part to form final log2 value - vuint32m4_t v_log2 = __riscv_vsll_vx_u32m4(v_integer, 16, vl); - v_log2 = __riscv_vadd_vv_u32m4(v_log2, v_final_frac, vl); - - // Convert Log2 to LogE using fixed point multiplication - vuint32m4_t v_loge = MulHighFixedPoint16_UU_vx_u32m4(v_log2, kLogCoeff, vl); - - // Apply output scaling and saturate to 16-bit range + + // Calculate interpolation using widening multiply + vint32m4_t v_rel_pos = __riscv_vwmul_vv_i32m4(v_diff, __riscv_vreinterpret_v_u16m2_i16m2(v_dist), vl); + v_rel_pos = 
__riscv_vsra_vx_i32m4(v_rel_pos, 16, vl); + + // Widen coefficient c0 and combine parts + vuint32m4_t v_c0_32 = __riscv_vwaddu_vx_u32m4(v_c0_u16, 0, vl); + vuint32m4_t v_final_frac = __riscv_vadd_vv_u32m4(v_frac, v_c0_32, vl); + v_final_frac = __riscv_vadd_vv_u32m4(v_final_frac, __riscv_vreinterpret_v_i32m4_u32m4(v_rel_pos), vl); + + // Calculate LogE using arithmetic decomposition + vuint32m4_t v_loge_int = __riscv_vmul_vx_u32m4(v_integer, kLogCoeff, vl); + vuint32m4_t v_loge_frac = __riscv_vmul_vx_u32m4(v_final_frac, kLogCoeff, vl); + v_loge_frac = __riscv_vadd_vx_u32m4(v_loge_frac, 32768, vl); + v_loge_frac = __riscv_vsrl_vx_u32m4(v_loge_frac, 16, vl); + vuint32m4_t v_loge = __riscv_vadd_vv_u32m4(v_loge_int, v_loge_frac, vl); + + // Apply output scaling vint32m4_t v_loge_scaled = MulHighFixedPoint16_SU_vx_i32m4(v_loge, output_scale, vl); + + // Saturate to 16-bit range vint32m4_t v_sat_val = __riscv_vmv_v_x_i32m4(INT16_MAX, vl); vint32m4_t v_result = __riscv_vmin_vv_i32m4(v_loge_scaled, v_sat_val, vl); From 06dd4c8b108cf264e616acab3493b2db04f6ee25 Mon Sep 17 00:00:00 2001 From: JaimeHW Date: Wed, 19 Nov 2025 01:32:27 -0600 Subject: [PATCH 70/86] FilterbankLogRVV: Use widening instructions and fix signed scaling logic --- .../signal/filter_bank_log_rvv.cc | 139 ++++++++---------- 1 file changed, 63 insertions(+), 76 deletions(-) diff --git a/tensorflow/lite/micro/kernels/riscv_vector/signal/filter_bank_log_rvv.cc b/tensorflow/lite/micro/kernels/riscv_vector/signal/filter_bank_log_rvv.cc index b9453f1d5a4..15ad105dc4b 100644 --- a/tensorflow/lite/micro/kernels/riscv_vector/signal/filter_bank_log_rvv.cc +++ b/tensorflow/lite/micro/kernels/riscv_vector/signal/filter_bank_log_rvv.cc @@ -4,7 +4,8 @@ constexpr uint16_t kLogCoeff = 45426; -const uint16_t kLogLut[] = { +const uint16_t kLogLut[] = +{ 0, 224, 442, 654, 861, 1063, 1259, 1450, 1636, 1817, 1992, 2163, 2329, 2490, 2646, 2797, 2944, 3087, 3224, 3358, 3487, 3611, 3732, 3848, 3960, 4068, 4172, 4272, 4368, 4460, 
4549, 4633, 4714, 4791, 4864, 4934, @@ -15,7 +16,8 @@ const uint16_t kLogLut[] = { 4697, 4630, 4561, 4490, 4416, 4341, 4264, 4184, 4103, 4020, 3935, 3848, 3759, 3668, 3575, 3481, 3384, 3286, 3186, 3084, 2981, 2875, 2768, 2659, 2549, 2437, 2323, 2207, 2090, 1971, 1851, 1729, 1605, 1480, 1353, 1224, - 1094, 963, 830, 695, 559, 421, 282, 142, 0, 0}; + 1094, 963, 830, 695, 559, 421, 282, 142, 0, 0 +}; inline vuint32m4_t VectorLog2Int_Zve32x(vuint32m4_t v_in, size_t vl) { @@ -56,35 +58,6 @@ inline vuint32m4_t VectorLog2Int_Zve32x(vuint32m4_t v_in, size_t vl) return v_result; } -inline vint32m4_t MulHighFixedPoint16_SU_vx_i32m4(vuint32m4_t v_unsigned, int32_t signed_scalar, size_t vl) -{ - // Load scalar - vint32m4_t v_signed_scalar = __riscv_vmv_v_x_i32m4(signed_scalar, vl); - - // Perform low and high multiplication - vint32m4_t v_lo = __riscv_vmul_vv_i32m4(v_signed_scalar, __riscv_vreinterpret_v_u32m4_i32m4(v_unsigned), vl); - vint32m4_t v_hi = __riscv_vmulhsu_vv_i32m4(v_signed_scalar, v_unsigned, vl); - - // Add rounding constant 32768 to the low part - vint32m4_t v_round = __riscv_vmv_v_x_i32m4(32768, vl); - vint32m4_t v_lo_rounded = __riscv_vadd_vv_i32m4(v_lo, v_round, vl); - - // Propagate carry to high part using unsigned comparison for safety - vbool8_t v_carry = __riscv_vmsltu_vv_u32m4_b8( - __riscv_vreinterpret_v_i32m4_u32m4(v_lo_rounded), - __riscv_vreinterpret_v_i32m4_u32m4(v_lo), vl); - vint32m4_t v_hi_plus_1 = __riscv_vadd_vx_i32m4(v_hi, 1, vl); - v_hi = __riscv_vmerge_vvm_i32m4(v_hi, v_hi_plus_1, v_carry, vl); - - // Combine high shifted left and low shifted right - vuint32m4_t v_lo_shifted = __riscv_vsrl_vx_u32m4( - __riscv_vreinterpret_v_i32m4_u32m4(v_lo_rounded), 16, vl); - - return __riscv_vor_vv_i32m4( - __riscv_vsll_vx_i32m4(v_hi, 16, vl), - __riscv_vreinterpret_v_u32m4_i32m4(v_lo_shifted), vl); -} - void FilterbankLogRVV(const uint32_t* input, int num_channels, int32_t output_scale, uint32_t correction_bits, int16_t* output) @@ -92,32 +65,24 @@ 
void FilterbankLogRVV(const uint32_t* input, int num_channels, int i = 0; while (i < num_channels) { - // Set vector length + // Set vector length and load input size_t vl = __riscv_vsetvl_e32m4(num_channels - i); - - // Load input and shift by correction bits vuint32m4_t v_input = __riscv_vle32_v_u32m4(input + i, vl); vuint32m4_t v_scaled = __riscv_vsll_vx_u32m4(v_input, correction_bits, vl); - - // Identify active elements where input is greater than 1 vbool8_t v_active = __riscv_vmsgtu_vx_u32m4_b8(v_scaled, 1, vl); // Calculate integer part of log2 vuint32m4_t v_integer = VectorLog2Int_Zve32x(v_scaled, vl); - // Normalize mantissa to [1.0, 2.0) in Q16 by aligning MSB to bit 31 then shifting down - vuint32m4_t v_shift_to_msb = __riscv_vrsub_vx_u32m4(v_integer, 31, vl); - vuint32m4_t v_norm = __riscv_vsll_vv_u32m4(v_scaled, v_shift_to_msb, vl); - vuint32m4_t v_aligned = __riscv_vsrl_vx_u32m4(v_norm, 15, vl); - - // Extract fractional part (lower 16 bits) - vuint32m4_t v_frac = __riscv_vand_vx_u32m4(v_aligned, 0xFFFF, vl); + // Normalize mantissa to [1.0, 2.0) in Q16 + vuint32m4_t v_shift_norm = __riscv_vrsub_vx_u32m4(v_integer, 31, vl); + vuint32m4_t v_norm = __riscv_vsll_vv_u32m4(v_scaled, v_shift_norm, vl); + vuint32m4_t v_frac = __riscv_vsrl_vx_u32m4(v_norm, 15, vl); + v_frac = __riscv_vand_vx_u32m4(v_frac, 0xFFFF, vl); - // Calculate base segment index for LUT + // Calculate base segment index and offsets for LUT access vuint32m4_t v_base_seg = __riscv_vsrl_vx_u32m4(v_frac, 9, vl); vuint16m2_t v_base_seg_u16 = __riscv_vncvt_x_x_w_u16m2(v_base_seg, vl); - - // Calculate offsets for LUT access multiplying indices by 2 for byte addressing vuint16m2_t v_offset_c0 = __riscv_vsll_vx_u16m2(v_base_seg_u16, 1, vl); vuint16m2_t v_offset_c1 = __riscv_vadd_vx_u16m2(v_offset_c0, 2, vl); @@ -126,42 +91,64 @@ void FilterbankLogRVV(const uint32_t* input, int num_channels, vuint16m2_t v_c0_u16 = __riscv_vluxei16_v_u16m2(kLogLut, v_offset_c0, vl_u16); vuint16m2_t v_c1_u16 = 
__riscv_vluxei16_v_u16m2(kLogLut, v_offset_c1, vl_u16); - // Calculate dist and diff in 16-bit - vuint16m2_t v_seg_base = __riscv_vand_vx_u16m2(__riscv_vncvt_x_x_w_u16m2(v_frac, vl), 0xFE00, vl_u16); - vuint16m2_t v_dist = __riscv_vsub_vv_u16m2(__riscv_vncvt_x_x_w_u16m2(v_frac, vl), v_seg_base, vl_u16); - vint16m2_t v_diff = __riscv_vsub_vv_i16m2(__riscv_vreinterpret_v_u16m2_i16m2(v_c1_u16), __riscv_vreinterpret_v_u16m2_i16m2(v_c0_u16), vl_u16); + // Calculate interpolation distance and difference + vuint16m2_t v_seg_base = __riscv_vand_vx_u16m2( + __riscv_vncvt_x_x_w_u16m2(v_frac, vl), 0xFE00, vl_u16); + vuint16m2_t v_dist = __riscv_vsub_vv_u16m2( + __riscv_vncvt_x_x_w_u16m2(v_frac, vl), v_seg_base, vl_u16); + vint16m2_t v_diff = __riscv_vsub_vv_i16m2( + __riscv_vreinterpret_v_u16m2_i16m2(v_c1_u16), + __riscv_vreinterpret_v_u16m2_i16m2(v_c0_u16), vl_u16); - // Restore vector length for 32-bit operations + // Restore vector length and widen for interpolation vl = __riscv_vsetvl_e32m4(vl); - - // Calculate interpolation using widening multiply - vint32m4_t v_rel_pos = __riscv_vwmul_vv_i32m4(v_diff, __riscv_vreinterpret_v_u16m2_i16m2(v_dist), vl); + vint32m4_t v_rel_pos = __riscv_vwmul_vv_i32m4( + v_diff, __riscv_vreinterpret_v_u16m2_i16m2(v_dist), vl); v_rel_pos = __riscv_vsra_vx_i32m4(v_rel_pos, 16, vl); - // Widen coefficient c0 and combine parts - vuint32m4_t v_c0_32 = __riscv_vwaddu_vx_u32m4(v_c0_u16, 0, vl); - vuint32m4_t v_final_frac = __riscv_vadd_vv_u32m4(v_frac, v_c0_32, vl); - v_final_frac = __riscv_vadd_vv_u32m4(v_final_frac, __riscv_vreinterpret_v_i32m4_u32m4(v_rel_pos), vl); - - // Calculate LogE using arithmetic decomposition - vuint32m4_t v_loge_int = __riscv_vmul_vx_u32m4(v_integer, kLogCoeff, vl); - vuint32m4_t v_loge_frac = __riscv_vmul_vx_u32m4(v_final_frac, kLogCoeff, vl); - v_loge_frac = __riscv_vadd_vx_u32m4(v_loge_frac, 32768, vl); - v_loge_frac = __riscv_vsrl_vx_u32m4(v_loge_frac, 16, vl); - vuint32m4_t v_loge = 
__riscv_vadd_vv_u32m4(v_loge_int, v_loge_frac, vl); - - // Apply output scaling - vint32m4_t v_loge_scaled = MulHighFixedPoint16_SU_vx_i32m4(v_loge, output_scale, vl); - - // Saturate to 16-bit range - vint32m4_t v_sat_val = __riscv_vmv_v_x_i32m4(INT16_MAX, vl); - vint32m4_t v_result = __riscv_vmin_vv_i32m4(v_loge_scaled, v_sat_val, vl); - - // Zero out inactive elements + // Combine interpolated result with base coefficient and fraction + vint32m4_t v_tmp = __riscv_vwadd_wv_i32m4( + v_rel_pos, __riscv_vreinterpret_v_u16m2_i16m2(v_c0_u16), vl); + vint32m4_t v_final_frac_part = __riscv_vadd_vv_i32m4( + v_tmp, __riscv_vreinterpret_v_u32m4_i32m4(v_frac), vl); + + // Convert Log2 to LogE using fixed point multiplication + vuint32m4_t v_term1 = __riscv_vmul_vx_u32m4(v_integer, kLogCoeff, vl); + vuint32m4_t v_frac_u32 = __riscv_vreinterpret_v_i32m4_u32m4(v_final_frac_part); + vuint32m4_t v_term2_u = __riscv_vmul_vx_u32m4(v_frac_u32, kLogCoeff, vl); + v_term2_u = __riscv_vadd_vx_u32m4(v_term2_u, 32768, vl); + v_term2_u = __riscv_vsrl_vx_u32m4(v_term2_u, 16, vl); + vuint32m4_t v_loge = __riscv_vadd_vv_u32m4(v_term1, v_term2_u, vl); + + // Apply output scaling using signed arithmetic + vint32m4_t v_loge_i = __riscv_vreinterpret_v_u32m4_i32m4(v_loge); + vint32m4_t v_scale = __riscv_vmv_v_x_i32m4(output_scale, vl); + vint32m4_t v_lo = __riscv_vmul_vv_i32m4(v_loge_i, v_scale, vl); + vint32m4_t v_hi = __riscv_vmulh_vv_i32m4(v_loge_i, v_scale, vl); + + // Add rounding constant and propagate carry + vint32m4_t v_round = __riscv_vmv_v_x_i32m4(32768, vl); + vint32m4_t v_lo_rounded = __riscv_vadd_vv_i32m4(v_lo, v_round, vl); + vbool8_t v_carry = __riscv_vmsltu_vv_u32m4_b8( + __riscv_vreinterpret_v_i32m4_u32m4(v_lo_rounded), + __riscv_vreinterpret_v_i32m4_u32m4(v_lo), vl); + v_hi = __riscv_vadd_vx_i32m4_mu(v_carry, v_hi, v_hi, 1, vl); + + // Combine high shifted left and low shifted right + vint32m4_t v_res = __riscv_vor_vv_i32m4( + __riscv_vsll_vx_i32m4(v_hi, 16, vl), + 
__riscv_vreinterpret_v_u32m4_i32m4( + __riscv_vsrl_vx_u32m4( + __riscv_vreinterpret_v_i32m4_u32m4(v_lo_rounded), 16, vl)), + vl); + + // Saturate result to 16-bit range + vint32m4_t v_sat_limit = __riscv_vmv_v_x_i32m4(INT16_MAX, vl); + vint32m4_t v_result = __riscv_vmin_vv_i32m4(v_res, v_sat_limit, vl); + + // Zero out inactive elements and store result vint32m4_t v_zero = __riscv_vmv_v_x_i32m4(0, vl); v_result = __riscv_vmerge_vvm_i32m4(v_zero, v_result, v_active, vl); - - // Narrow result to 16-bit and store to memory vint16m2_t v_res_i16 = __riscv_vncvt_x_x_w_i16m2(v_result, vl); __riscv_vse16_v_i16m2(output + i, v_res_i16, vl); From e3298e3777bf6746922b11fa009c97abec6ef89c Mon Sep 17 00:00:00 2001 From: JaimeHW Date: Wed, 19 Nov 2025 01:50:44 -0600 Subject: [PATCH 71/86] Update FilterBankLogRVV --- .../signal/filter_bank_log_rvv.cc | 77 +++++++++---------- 1 file changed, 38 insertions(+), 39 deletions(-) diff --git a/tensorflow/lite/micro/kernels/riscv_vector/signal/filter_bank_log_rvv.cc b/tensorflow/lite/micro/kernels/riscv_vector/signal/filter_bank_log_rvv.cc index 15ad105dc4b..7a5711ce616 100644 --- a/tensorflow/lite/micro/kernels/riscv_vector/signal/filter_bank_log_rvv.cc +++ b/tensorflow/lite/micro/kernels/riscv_vector/signal/filter_bank_log_rvv.cc @@ -26,31 +26,31 @@ inline vuint32m4_t VectorLog2Int_Zve32x(vuint32m4_t v_in, size_t vl) vuint32m4_t v_tmp; vbool8_t v_mask; - // Check bit 16 + // Check bit 16 and update result and input v_tmp = __riscv_vsrl_vx_u32m4(v_in, 16, vl); v_mask = __riscv_vmsne_vx_u32m4_b8(v_tmp, 0, vl); v_result = __riscv_vadd_vx_u32m4_mu(v_mask, v_result, v_result, 16, vl); v_in = __riscv_vmerge_vvm_u32m4(v_in, v_tmp, v_mask, vl); - // Check bit 8 + // Check bit 8 and update result and input v_tmp = __riscv_vsrl_vx_u32m4(v_in, 8, vl); v_mask = __riscv_vmsne_vx_u32m4_b8(v_tmp, 0, vl); v_result = __riscv_vadd_vx_u32m4_mu(v_mask, v_result, v_result, 8, vl); v_in = __riscv_vmerge_vvm_u32m4(v_in, v_tmp, v_mask, vl); - // Check bit 4 + 
// Check bit 4 and update result and input v_tmp = __riscv_vsrl_vx_u32m4(v_in, 4, vl); v_mask = __riscv_vmsne_vx_u32m4_b8(v_tmp, 0, vl); v_result = __riscv_vadd_vx_u32m4_mu(v_mask, v_result, v_result, 4, vl); v_in = __riscv_vmerge_vvm_u32m4(v_in, v_tmp, v_mask, vl); - // Check bit 2 + // Check bit 2 and update result and input v_tmp = __riscv_vsrl_vx_u32m4(v_in, 2, vl); v_mask = __riscv_vmsne_vx_u32m4_b8(v_tmp, 0, vl); v_result = __riscv_vadd_vx_u32m4_mu(v_mask, v_result, v_result, 2, vl); v_in = __riscv_vmerge_vvm_u32m4(v_in, v_tmp, v_mask, vl); - // Check bit 1 + // Check bit 1 and update result v_tmp = __riscv_vsrl_vx_u32m4(v_in, 1, vl); v_mask = __riscv_vmsne_vx_u32m4_b8(v_tmp, 0, vl); v_result = __riscv_vadd_vx_u32m4_mu(v_mask, v_result, v_result, 1, vl); @@ -65,18 +65,24 @@ void FilterbankLogRVV(const uint32_t* input, int num_channels, int i = 0; while (i < num_channels) { - // Set vector length and load input + // Set vector length size_t vl = __riscv_vsetvl_e32m4(num_channels - i); + + // Load input and shift by correction bits vuint32m4_t v_input = __riscv_vle32_v_u32m4(input + i, vl); vuint32m4_t v_scaled = __riscv_vsll_vx_u32m4(v_input, correction_bits, vl); + + // Identify active elements where input is greater than 1 vbool8_t v_active = __riscv_vmsgtu_vx_u32m4_b8(v_scaled, 1, vl); // Calculate integer part of log2 vuint32m4_t v_integer = VectorLog2Int_Zve32x(v_scaled, vl); - // Normalize mantissa to [1.0, 2.0) in Q16 + // Normalize mantissa to [1.0, 2.0) in Q16 by aligning MSB to bit 31 then shifting down vuint32m4_t v_shift_norm = __riscv_vrsub_vx_u32m4(v_integer, 31, vl); vuint32m4_t v_norm = __riscv_vsll_vv_u32m4(v_scaled, v_shift_norm, vl); + + // Extract fractional part (Q15) vuint32m4_t v_frac = __riscv_vsrl_vx_u32m4(v_norm, 15, vl); v_frac = __riscv_vand_vx_u32m4(v_frac, 0xFFFF, vl); @@ -91,16 +97,15 @@ void FilterbankLogRVV(const uint32_t* input, int num_channels, vuint16m2_t v_c0_u16 = __riscv_vluxei16_v_u16m2(kLogLut, v_offset_c0, vl_u16); 
vuint16m2_t v_c1_u16 = __riscv_vluxei16_v_u16m2(kLogLut, v_offset_c1, vl_u16); - // Calculate interpolation distance and difference - vuint16m2_t v_seg_base = __riscv_vand_vx_u16m2( - __riscv_vncvt_x_x_w_u16m2(v_frac, vl), 0xFE00, vl_u16); - vuint16m2_t v_dist = __riscv_vsub_vv_u16m2( - __riscv_vncvt_x_x_w_u16m2(v_frac, vl), v_seg_base, vl_u16); + // Calculate interpolation distance and coefficient difference vint16m2_t v_diff = __riscv_vsub_vv_i16m2( __riscv_vreinterpret_v_u16m2_i16m2(v_c1_u16), __riscv_vreinterpret_v_u16m2_i16m2(v_c0_u16), vl_u16); + vuint16m2_t v_frac_u16 = __riscv_vncvt_x_x_w_u16m2(v_frac, vl); + vuint16m2_t v_seg_base_val = __riscv_vand_vx_u16m2(v_frac_u16, 0xFE00, vl_u16); + vuint16m2_t v_dist = __riscv_vsub_vv_u16m2(v_frac_u16, v_seg_base_val, vl_u16); - // Restore vector length and widen for interpolation + // Restore vector length and calculate widening multiplication for interpolation vl = __riscv_vsetvl_e32m4(vl); vint32m4_t v_rel_pos = __riscv_vwmul_vv_i32m4( v_diff, __riscv_vreinterpret_v_u16m2_i16m2(v_dist), vl); @@ -120,37 +125,31 @@ void FilterbankLogRVV(const uint32_t* input, int num_channels, v_term2_u = __riscv_vsrl_vx_u32m4(v_term2_u, 16, vl); vuint32m4_t v_loge = __riscv_vadd_vv_u32m4(v_term1, v_term2_u, vl); - // Apply output scaling using signed arithmetic + // Apply output scaling using signed multiplication vint32m4_t v_loge_i = __riscv_vreinterpret_v_u32m4_i32m4(v_loge); - vint32m4_t v_scale = __riscv_vmv_v_x_i32m4(output_scale, vl); - vint32m4_t v_lo = __riscv_vmul_vv_i32m4(v_loge_i, v_scale, vl); - vint32m4_t v_hi = __riscv_vmulh_vv_i32m4(v_loge_i, v_scale, vl); - - // Add rounding constant and propagate carry - vint32m4_t v_round = __riscv_vmv_v_x_i32m4(32768, vl); - vint32m4_t v_lo_rounded = __riscv_vadd_vv_i32m4(v_lo, v_round, vl); - vbool8_t v_carry = __riscv_vmsltu_vv_u32m4_b8( - __riscv_vreinterpret_v_i32m4_u32m4(v_lo_rounded), - __riscv_vreinterpret_v_i32m4_u32m4(v_lo), vl); + vint32m4_t v_lo = 
__riscv_vmul_vx_i32m4(v_loge_i, output_scale, vl); + vint32m4_t v_hi = __riscv_vmulh_vx_i32m4(v_loge_i, output_scale, vl); + + // Add rounding constant and handle carry propagation + vint32m4_t v_lo_rounded = __riscv_vadd_vx_i32m4(v_lo, 32768, vl); + vbool8_t v_carry = __riscv_vmsltu_vx_u32m4_b8( + __riscv_vreinterpret_v_i32m4_u32m4(v_lo_rounded), 32768, vl); v_hi = __riscv_vadd_vx_i32m4_mu(v_carry, v_hi, v_hi, 1, vl); // Combine high shifted left and low shifted right + vint32m4_t v_lo_shifted = __riscv_vreinterpret_v_u32m4_i32m4( + __riscv_vsrl_vx_u32m4( + __riscv_vreinterpret_v_i32m4_u32m4(v_lo_rounded), 16, vl)); vint32m4_t v_res = __riscv_vor_vv_i32m4( - __riscv_vsll_vx_i32m4(v_hi, 16, vl), - __riscv_vreinterpret_v_u32m4_i32m4( - __riscv_vsrl_vx_u32m4( - __riscv_vreinterpret_v_i32m4_u32m4(v_lo_rounded), 16, vl)), - vl); - - // Saturate result to 16-bit range - vint32m4_t v_sat_limit = __riscv_vmv_v_x_i32m4(INT16_MAX, vl); - vint32m4_t v_result = __riscv_vmin_vv_i32m4(v_res, v_sat_limit, vl); - - // Zero out inactive elements and store result - vint32m4_t v_zero = __riscv_vmv_v_x_i32m4(0, vl); - v_result = __riscv_vmerge_vvm_i32m4(v_zero, v_result, v_active, vl); - vint16m2_t v_res_i16 = __riscv_vncvt_x_x_w_i16m2(v_result, vl); - __riscv_vse16_v_i16m2(output + i, v_res_i16, vl); + __riscv_vsll_vx_i32m4(v_hi, 16, vl), v_lo_shifted, vl); + + // Narrow to 16-bit with saturation + vint16m2_t v_res_i16 = __riscv_vnclip_wx_i16m2(v_res, 0, __RISCV_VXRM_RNU, vl); + + // Zero out inactive channels and store result + vint16m2_t v_zero = __riscv_vmv_v_x_i16m2(0, vl); + vint16m2_t v_final = __riscv_vmerge_vvm_i16m2(v_zero, v_res_i16, v_active, vl); + __riscv_vse16_v_i16m2(output + i, v_final, vl); i += vl; } From 0a69b8d18250384ec426f1bfd982ed25f10ac230 Mon Sep 17 00:00:00 2001 From: JaimeHW Date: Wed, 19 Nov 2025 01:53:55 -0600 Subject: [PATCH 72/86] Update FilterBankLogRVV --- .../signal/filter_bank_log_rvv.cc | 64 ++++++++++--------- 1 file changed, 34 insertions(+), 30 
deletions(-) diff --git a/tensorflow/lite/micro/kernels/riscv_vector/signal/filter_bank_log_rvv.cc b/tensorflow/lite/micro/kernels/riscv_vector/signal/filter_bank_log_rvv.cc index 7a5711ce616..235513e46b7 100644 --- a/tensorflow/lite/micro/kernels/riscv_vector/signal/filter_bank_log_rvv.cc +++ b/tensorflow/lite/micro/kernels/riscv_vector/signal/filter_bank_log_rvv.cc @@ -19,6 +19,10 @@ const uint16_t kLogLut[] = 1094, 963, 830, 695, 559, 421, 282, 142, 0, 0 }; +// Calculate Integer Log2 using binary search (SIMD compatible). +// This manual implementation is required because the target architecture +// (rv32imc_zve32x_zvl128b) does not support the 'zvbb' extension +// which provides the hardware '__riscv_vclz' instruction. inline vuint32m4_t VectorLog2Int_Zve32x(vuint32m4_t v_in, size_t vl) { // Initialize variables @@ -62,50 +66,47 @@ void FilterbankLogRVV(const uint32_t* input, int num_channels, int32_t output_scale, uint32_t correction_bits, int16_t* output) { - int i = 0; - while (i < num_channels) - { - // Set vector length - size_t vl = __riscv_vsetvl_e32m4(num_channels - i); + const uint32_t* p_src = input; + int16_t* p_dst = output; + int remaining = num_channels; - // Load input and shift by correction bits - vuint32m4_t v_input = __riscv_vle32_v_u32m4(input + i, vl); + while (remaining > 0) + { + // Set vector length and load input + size_t vl = __riscv_vsetvl_e32m4(remaining); + vuint32m4_t v_input = __riscv_vle32_v_u32m4(p_src, vl); vuint32m4_t v_scaled = __riscv_vsll_vx_u32m4(v_input, correction_bits, vl); - - // Identify active elements where input is greater than 1 vbool8_t v_active = __riscv_vmsgtu_vx_u32m4_b8(v_scaled, 1, vl); // Calculate integer part of log2 vuint32m4_t v_integer = VectorLog2Int_Zve32x(v_scaled, vl); - // Normalize mantissa to [1.0, 2.0) in Q16 by aligning MSB to bit 31 then shifting down + // Normalize mantissa to [1.0, 2.0) in Q16 vuint32m4_t v_shift_norm = __riscv_vrsub_vx_u32m4(v_integer, 31, vl); vuint32m4_t v_norm = 
__riscv_vsll_vv_u32m4(v_scaled, v_shift_norm, vl); - - // Extract fractional part (Q15) vuint32m4_t v_frac = __riscv_vsrl_vx_u32m4(v_norm, 15, vl); v_frac = __riscv_vand_vx_u32m4(v_frac, 0xFFFF, vl); // Calculate base segment index and offsets for LUT access vuint32m4_t v_base_seg = __riscv_vsrl_vx_u32m4(v_frac, 9, vl); vuint16m2_t v_base_seg_u16 = __riscv_vncvt_x_x_w_u16m2(v_base_seg, vl); - vuint16m2_t v_offset_c0 = __riscv_vsll_vx_u16m2(v_base_seg_u16, 1, vl); - vuint16m2_t v_offset_c1 = __riscv_vadd_vx_u16m2(v_offset_c0, 2, vl); + vuint16m2_t v_offset = __riscv_vsll_vx_u16m2(v_base_seg_u16, 1, vl); // Gather LUT coefficients using 16-bit element width size_t vl_u16 = __riscv_vsetvl_e16m2(vl); - vuint16m2_t v_c0_u16 = __riscv_vluxei16_v_u16m2(kLogLut, v_offset_c0, vl_u16); - vuint16m2_t v_c1_u16 = __riscv_vluxei16_v_u16m2(kLogLut, v_offset_c1, vl_u16); + vuint16m2_t v_c0_u16 = __riscv_vluxei16_v_u16m2(kLogLut, v_offset, vl_u16); + v_offset = __riscv_vadd_vx_u16m2(v_offset, 2, vl); + vuint16m2_t v_c1_u16 = __riscv_vluxei16_v_u16m2(kLogLut, v_offset, vl_u16); - // Calculate interpolation distance and coefficient difference + // Calculate interpolation distance and difference vint16m2_t v_diff = __riscv_vsub_vv_i16m2( __riscv_vreinterpret_v_u16m2_i16m2(v_c1_u16), __riscv_vreinterpret_v_u16m2_i16m2(v_c0_u16), vl_u16); vuint16m2_t v_frac_u16 = __riscv_vncvt_x_x_w_u16m2(v_frac, vl); - vuint16m2_t v_seg_base_val = __riscv_vand_vx_u16m2(v_frac_u16, 0xFE00, vl_u16); - vuint16m2_t v_dist = __riscv_vsub_vv_u16m2(v_frac_u16, v_seg_base_val, vl_u16); + vuint16m2_t v_seg_base = __riscv_vand_vx_u16m2(v_frac_u16, 0xFE00, vl_u16); + vuint16m2_t v_dist = __riscv_vsub_vv_u16m2(v_frac_u16, v_seg_base, vl_u16); - // Restore vector length and calculate widening multiplication for interpolation + // Restore vector length and widen for interpolation vl = __riscv_vsetvl_e32m4(vl); vint32m4_t v_rel_pos = __riscv_vwmul_vv_i32m4( v_diff, __riscv_vreinterpret_v_u16m2_i16m2(v_dist), vl); @@ 
-125,32 +126,35 @@ void FilterbankLogRVV(const uint32_t* input, int num_channels, v_term2_u = __riscv_vsrl_vx_u32m4(v_term2_u, 16, vl); vuint32m4_t v_loge = __riscv_vadd_vv_u32m4(v_term1, v_term2_u, vl); - // Apply output scaling using signed multiplication + // Apply output scaling using signed arithmetic vint32m4_t v_loge_i = __riscv_vreinterpret_v_u32m4_i32m4(v_loge); vint32m4_t v_lo = __riscv_vmul_vx_i32m4(v_loge_i, output_scale, vl); vint32m4_t v_hi = __riscv_vmulh_vx_i32m4(v_loge_i, output_scale, vl); - // Add rounding constant and handle carry propagation + // Add rounding constant and propagate carry vint32m4_t v_lo_rounded = __riscv_vadd_vx_i32m4(v_lo, 32768, vl); vbool8_t v_carry = __riscv_vmsltu_vx_u32m4_b8( __riscv_vreinterpret_v_i32m4_u32m4(v_lo_rounded), 32768, vl); v_hi = __riscv_vadd_vx_i32m4_mu(v_carry, v_hi, v_hi, 1, vl); // Combine high shifted left and low shifted right - vint32m4_t v_lo_shifted = __riscv_vreinterpret_v_u32m4_i32m4( - __riscv_vsrl_vx_u32m4( - __riscv_vreinterpret_v_i32m4_u32m4(v_lo_rounded), 16, vl)); vint32m4_t v_res = __riscv_vor_vv_i32m4( - __riscv_vsll_vx_i32m4(v_hi, 16, vl), v_lo_shifted, vl); + __riscv_vsll_vx_i32m4(v_hi, 16, vl), + __riscv_vreinterpret_v_u32m4_i32m4( + __riscv_vsrl_vx_u32m4( + __riscv_vreinterpret_v_i32m4_u32m4(v_lo_rounded), 16, vl)), + vl); - // Narrow to 16-bit with saturation + // Saturate result to 16-bit range vint16m2_t v_res_i16 = __riscv_vnclip_wx_i16m2(v_res, 0, __RISCV_VXRM_RNU, vl); - // Zero out inactive channels and store result + // Zero out inactive elements and store result vint16m2_t v_zero = __riscv_vmv_v_x_i16m2(0, vl); vint16m2_t v_final = __riscv_vmerge_vvm_i16m2(v_zero, v_res_i16, v_active, vl); - __riscv_vse16_v_i16m2(output + i, v_final, vl); + __riscv_vse16_v_i16m2(p_dst, v_final, vl); - i += vl; + p_src += vl; + p_dst += vl; + remaining -= vl; } } \ No newline at end of file From 41bdbd8818dc7ed9b49fbddfd0244ad8978c9afc Mon Sep 17 00:00:00 2001 From: JaimeHW Date: Fri, 21 Nov 
2025 07:23:31 -0600 Subject: [PATCH 73/86] Fix register spilling in FilterBank --- .../riscv_vector/signal/filter_bank_rvv.cc | 127 +++++++++--------- 1 file changed, 60 insertions(+), 67 deletions(-) diff --git a/tensorflow/lite/micro/kernels/riscv_vector/signal/filter_bank_rvv.cc b/tensorflow/lite/micro/kernels/riscv_vector/signal/filter_bank_rvv.cc index b38e803e327..a314361cd27 100644 --- a/tensorflow/lite/micro/kernels/riscv_vector/signal/filter_bank_rvv.cc +++ b/tensorflow/lite/micro/kernels/riscv_vector/signal/filter_bank_rvv.cc @@ -24,63 +24,66 @@ void FilterbankAccumulateChannelsRVV(const FilterbankConfig* config, // Process channel only if it has non-zero width if (channel_width > 0) { - // Set max vector length for the channel - size_t vl_max = __riscv_vsetvl_e32m4(channel_width); + // Optimization: Use LMUL=2 to fit all variables in registers and avoid spilling + size_t vl_max = __riscv_vsetvl_e32m2(channel_width); - // Initialize vector accumulators for 64-bit sums (low and high parts) - vuint32m4_t v_acc_w_low = __riscv_vmv_v_x_u32m4(0, vl_max); - vuint32m4_t v_acc_w_high = __riscv_vmv_v_x_u32m4(0, vl_max); - vuint32m4_t v_acc_uw_low = __riscv_vmv_v_x_u32m4(0, vl_max); - vuint32m4_t v_acc_uw_high = __riscv_vmv_v_x_u32m4(0, vl_max); + // Initialize vector accumulators for 64-bit sums + vuint32m2_t v_acc_w_low = __riscv_vmv_v_x_u32m2(0, vl_max); + vuint32m2_t v_acc_w_high = __riscv_vmv_v_x_u32m2(0, vl_max); + vuint32m2_t v_acc_uw_low = __riscv_vmv_v_x_u32m2(0, vl_max); + vuint32m2_t v_acc_uw_high = __riscv_vmv_v_x_u32m2(0, vl_max); - // Initialize vector accumulators for carries (Optimization: avoid vcpop in loop) - vuint32m4_t v_carry_w_acc = __riscv_vmv_v_x_u32m4(0, vl_max); - vuint32m4_t v_carry_uw_acc = __riscv_vmv_v_x_u32m4(0, vl_max); - - // Process the channel width in vector-sized chunks (stripmining) + // Process the channel width in vector-sized chunks int j = 0; while (j < channel_width) { // Set vector length for the current strip - size_t 
vl = __riscv_vsetvl_e32m4(channel_width - j); + size_t vl = __riscv_vsetvl_e32m2(channel_width - j); // Load vector of input data - vuint32m4_t v_input = - __riscv_vle32_v_u32m4(&input[freq_start + j], vl); - - // Load 16-bit weights and unweights - vuint16m2_t v_weights16 = __riscv_vle16_v_u16m2( - reinterpret_cast(&config->weights[weight_start + j]), vl); - vuint16m2_t v_unweights16 = __riscv_vle16_v_u16m2( - reinterpret_cast(&config->unweights[weight_start + j]), vl); - - // Widen weights and unweights to 32-bit - vuint32m4_t v_weights32 = __riscv_vwaddu_vx_u32m4(v_weights16, 0, vl); - vuint32m4_t v_unweights32 = __riscv_vwaddu_vx_u32m4(v_unweights16, 0, vl); - - // Perform 32x32 multiply, producing 64-bit results as low/high pairs - vuint32m4_t v_prod_w_low = __riscv_vmul_vv_u32m4(v_input, v_weights32, vl); - vuint32m4_t v_prod_w_high = __riscv_vmulhu_vv_u32m4(v_input, v_weights32, vl); - vuint32m4_t v_prod_uw_low = __riscv_vmul_vv_u32m4(v_input, v_unweights32, vl); - vuint32m4_t v_prod_uw_high = __riscv_vmulhu_vv_u32m4(v_input, v_unweights32, vl); - - // Add the low 32-bit parts of the products - vuint32m4_t v_next_acc_w_low = __riscv_vadd_vv_u32m4(v_acc_w_low, v_prod_w_low, vl); - vuint32m4_t v_next_acc_uw_low = __riscv_vadd_vv_u32m4(v_acc_uw_low, v_prod_uw_low, vl); - - // Detect carries from the low-part addition - vbool8_t v_carry_w = __riscv_vmsltu_vv_u32m4_b8(v_next_acc_w_low, v_acc_w_low, vl); - vbool8_t v_carry_uw = __riscv_vmsltu_vv_u32m4_b8(v_next_acc_uw_low, v_acc_uw_low, vl); - - // Optimization: Accumulate carries into vector register instead of scalar vcpop - v_carry_w_acc = __riscv_vadd_vx_u32m4_m(v_carry_w, v_carry_w_acc, 1, vl); - v_carry_uw_acc = __riscv_vadd_vx_u32m4_m(v_carry_uw, v_carry_uw_acc, 1, vl); - - // Add the high 32-bit parts of the products - v_acc_w_high = __riscv_vadd_vv_u32m4(v_acc_w_high, v_prod_w_high, vl); - v_acc_uw_high = __riscv_vadd_vv_u32m4(v_acc_uw_high, v_prod_uw_high, vl); - - // Update the low-part accumulators + 
vuint32m2_t v_input = __riscv_vle32_v_u32m2(&input[freq_start + j], vl); + + // Load Weights and Unweights + vint16m1_t v_weights16 = __riscv_vle16_v_i16m1( + reinterpret_cast(&config->weights[weight_start + j]), vl); + vint16m1_t v_unweights16 = __riscv_vle16_v_i16m1( + reinterpret_cast(&config->unweights[weight_start + j]), vl); + + // Sign-extend weights to 32-bit + vint32m2_t v_weights32 = __riscv_vsext_vf2_i32m2(v_weights16, vl); + vint32m2_t v_unweights32 = __riscv_vsext_vf2_i32m2(v_unweights16, vl); + + // Reinterpret weights as unsigned bits for vmul + vuint32m2_t v_weights32_u = __riscv_vreinterpret_v_i32m2_u32m2(v_weights32); + vuint32m2_t v_unweights32_u = __riscv_vreinterpret_v_i32m2_u32m2(v_unweights32); + + // Low part multiply + vuint32m2_t v_prod_w_low = __riscv_vmul_vv_u32m2(v_input, v_weights32_u, vl); + vuint32m2_t v_prod_uw_low = __riscv_vmul_vv_u32m2(v_input, v_unweights32_u, vl); + + // High part multiply + vint32m2_t v_prod_w_high_i = __riscv_vmulhsu_vv_i32m2(v_weights32, v_input, vl); + vint32m2_t v_prod_uw_high_i = __riscv_vmulhsu_vv_i32m2(v_unweights32, v_input, vl); + vuint32m2_t v_prod_w_high = __riscv_vreinterpret_v_i32m2_u32m2(v_prod_w_high_i); + vuint32m2_t v_prod_uw_high = __riscv_vreinterpret_v_i32m2_u32m2(v_prod_uw_high_i); + + // Accumulate Low part + vuint32m2_t v_next_acc_w_low = __riscv_vadd_vv_u32m2(v_acc_w_low, v_prod_w_low, vl); + vuint32m2_t v_next_acc_uw_low = __riscv_vadd_vv_u32m2(v_acc_uw_low, v_prod_uw_low, vl); + + // Detect Carries (if result < accumulator, we wrapped) + vbool16_t v_carry_w = __riscv_vmsltu_vv_u32m2_b16(v_next_acc_w_low, v_acc_w_low, vl); + vbool16_t v_carry_uw = __riscv_vmsltu_vv_u32m2_b16(v_next_acc_uw_low, v_acc_uw_low, vl); + + // Accumulate High part + v_acc_w_high = __riscv_vadd_vv_u32m2(v_acc_w_high, v_prod_w_high, vl); + v_acc_uw_high = __riscv_vadd_vv_u32m2(v_acc_uw_high, v_prod_uw_high, vl); + + // Apply Carry: Add 1 to high accumulator where carry is set + v_acc_w_high = 
__riscv_vadd_vx_u32m2_mu(v_carry_w, v_acc_w_high, v_acc_w_high, 1, vl); + v_acc_uw_high = __riscv_vadd_vx_u32m2_mu(v_carry_uw, v_acc_uw_high, v_acc_uw_high, 1, vl); + + // Update low accumulator v_acc_w_low = v_next_acc_w_low; v_acc_uw_low = v_next_acc_uw_low; @@ -92,30 +95,20 @@ void FilterbankAccumulateChannelsRVV(const FilterbankConfig* config, vuint32m1_t v_zero = __riscv_vmv_v_x_u32m1(0, vl_max); // Reduce the 32-bit vector accumulators to scalar sums - vuint32m1_t v_sum_w_low = __riscv_vredsum_vs_u32m4_u32m1(v_acc_w_low, v_zero, vl_max); - vuint32m1_t v_sum_uw_low = __riscv_vredsum_vs_u32m4_u32m1(v_acc_uw_low, v_zero, vl_max); - vuint32m1_t v_sum_w_high = __riscv_vredsum_vs_u32m4_u32m1(v_acc_w_high, v_zero, vl_max); - vuint32m1_t v_sum_uw_high = __riscv_vredsum_vs_u32m4_u32m1(v_acc_uw_high, v_zero, vl_max); - - // Reduce the carry accumulators - vuint32m1_t v_sum_carry_w = __riscv_vredsum_vs_u32m4_u32m1(v_carry_w_acc, v_zero, vl_max); - vuint32m1_t v_sum_carry_uw = __riscv_vredsum_vs_u32m4_u32m1(v_carry_uw_acc, v_zero, vl_max); + vuint32m1_t v_sum_w_low = __riscv_vredsum_vs_u32m2_u32m1(v_acc_w_low, v_zero, vl_max); + vuint32m1_t v_sum_w_high = __riscv_vredsum_vs_u32m2_u32m1(v_acc_w_high, v_zero, vl_max); + vuint32m1_t v_sum_uw_low = __riscv_vredsum_vs_u32m2_u32m1(v_acc_uw_low, v_zero, vl_max); + vuint32m1_t v_sum_uw_high = __riscv_vredsum_vs_u32m2_u32m1(v_acc_uw_high, v_zero, vl_max); // Extract scalar results uint32_t final_w_low = __riscv_vmv_x_s_u32m1_u32(v_sum_w_low); - uint32_t final_uw_low = __riscv_vmv_x_s_u32m1_u32(v_sum_uw_low); uint32_t final_w_high = __riscv_vmv_x_s_u32m1_u32(v_sum_w_high); + uint32_t final_uw_low = __riscv_vmv_x_s_u32m1_u32(v_sum_uw_low); uint32_t final_uw_high = __riscv_vmv_x_s_u32m1_u32(v_sum_uw_high); - uint32_t w_carry_count = __riscv_vmv_x_s_u32m1_u32(v_sum_carry_w); - uint32_t uw_carry_count = __riscv_vmv_x_s_u32m1_u32(v_sum_carry_uw); - - // Reconstruct the final 64-bit sum - uint64_t final_w = ((uint64_t)(final_w_high + 
w_carry_count) << 32) | final_w_low; - uint64_t final_uw = ((uint64_t)(final_uw_high + uw_carry_count) << 32) | final_uw_low; - // Add the vector reduction result to the channel's scalar accumulator - channel_w_acc += final_w; - channel_uw_acc += final_uw; + // Reconstruct the final 64-bit sum and add to channel accumulator + channel_w_acc += ((uint64_t)final_w_high << 32) | final_w_low; + channel_uw_acc += ((uint64_t)final_uw_high << 32) | final_uw_low; } // Store the final weighted result for this channel From 7592a96ea718147b08107bc5cadd98a365e5249a Mon Sep 17 00:00:00 2001 From: JaimeHW Date: Fri, 21 Nov 2025 08:10:49 -0600 Subject: [PATCH 74/86] Optimize register usage for convolution and fullyconnected kernels --- .../micro/kernels/riscv_vector/conv_rvv.cc | 91 ++++----- .../riscv_vector/fully_connected_rvv.cc | 75 ++++---- .../kernels/riscv_vector/requantize_rvv.h | 176 +++++++++--------- 3 files changed, 181 insertions(+), 161 deletions(-) diff --git a/tensorflow/lite/micro/kernels/riscv_vector/conv_rvv.cc b/tensorflow/lite/micro/kernels/riscv_vector/conv_rvv.cc index 591e583e8c3..e0cf889ecf8 100644 --- a/tensorflow/lite/micro/kernels/riscv_vector/conv_rvv.cc +++ b/tensorflow/lite/micro/kernels/riscv_vector/conv_rvv.cc @@ -109,23 +109,23 @@ void ConvPerChannelRVV(const ConvParams& params, size_t current_out_x = 0; while (current_out_x < static_cast(output_width)) { - // Set vector length for this iteration - size_t vl = __riscv_vsetvl_e32m4(output_width - current_out_x); + // Set vector length for this iteration (LMUL=2 optimization) + size_t vl = __riscv_vsetvl_e32m2(output_width - current_out_x); // Initialize accumulator vector with bias - vint32m4_t v_acc_s32 = bias_data ? __riscv_vmv_v_x_i32m4(bias_val, vl) - : __riscv_vmv_v_x_i32m4(0, vl); + vint32m2_t v_acc_s32 = bias_data ? 
__riscv_vmv_v_x_i32m2(bias_val, vl) + : __riscv_vmv_v_x_i32m2(0, vl); // Calculate base input x coordinates for the vector lanes - vuint32m4_t v_idx = __riscv_vid_v_u32m4(vl); - vint32m4_t v_out_x = __riscv_vreinterpret_v_u32m4_i32m4(__riscv_vadd_vx_u32m4(v_idx, static_cast(current_out_x), vl)); - vint32m4_t v_in_x_origin_base = __riscv_vsub_vx_i32m4(__riscv_vmul_vx_i32m4(v_out_x, stride_width, vl), pad_width, vl); + vuint32m2_t v_idx = __riscv_vid_v_u32m2(vl); + vint32m2_t v_out_x = __riscv_vreinterpret_v_u32m2_i32m2(__riscv_vadd_vx_u32m2(v_idx, static_cast(current_out_x), vl)); + vint32m2_t v_in_x_origin_base = __riscv_vsub_vx_i32m2(__riscv_vmul_vx_i32m2(v_out_x, stride_width, vl), pad_width, vl); // Loop over filter height for (int filter_y = 0; filter_y < filter_height; ++filter_y) { const int in_y = in_y_origin + dilation_height_factor * filter_y; - if (in_y < 0 || in_y >= input_height) continue; // Simplified boundary check + if (in_y < 0 || in_y >= input_height) continue; const int8_t* filter_y_base = filter_oc_base + (filter_y * filter_h_stride); @@ -134,12 +134,12 @@ void ConvPerChannelRVV(const ConvParams& params, { const int in_x_offset = dilation_width_factor * filter_x; const int8_t* filter_patch_base = filter_y_base + (filter_x * filter_w_stride); - vint32m4_t v_in_x = __riscv_vadd_vx_i32m4(v_in_x_origin_base, in_x_offset, vl); + vint32m2_t v_in_x = __riscv_vadd_vx_i32m2(v_in_x_origin_base, in_x_offset, vl); // Create mask for valid input coordinates - vbool8_t v_mask_ge_zero = __riscv_vmsge_vx_i32m4_b8(v_in_x, 0, vl); - vbool8_t v_mask_lt_width = __riscv_vmslt_vx_i32m4_b8(v_in_x, input_width, vl); - vbool8_t v_active_lane_mask_b8 = __riscv_vmand_mm_b8(v_mask_ge_zero, v_mask_lt_width, vl); + vbool16_t v_mask_ge_zero = __riscv_vmsge_vx_i32m2_b16(v_in_x, 0, vl); + vbool16_t v_mask_lt_width = __riscv_vmslt_vx_i32m2_b16(v_in_x, input_width, vl); + vbool16_t v_active_lane_mask = __riscv_vmand_mm_b16(v_mask_ge_zero, v_mask_lt_width, vl); // Calculate base 
input pointer and stride for vector load int32_t base_in_x_for_vector0 = static_cast(current_out_x) * stride_width - pad_width + in_x_offset; @@ -153,16 +153,22 @@ void ConvPerChannelRVV(const ConvParams& params, int8_t s_filter_val_s8 = filter_patch_base[ic * filter_ch_stride]; int16_t s_filter_val_s16 = static_cast(s_filter_val_s8); const int8_t* input_ic_ptr = input_base_for_y_x_patch + (ic * input_ch_stride); - vint8m1_t v_input_s8 = __riscv_vlse8_v_i8m1_m(v_active_lane_mask_b8, input_ic_ptr, input_x_stride_bytes, vl); - vint16m2_t v_input_s16 = __riscv_vsext_vf2_i16m2_m(v_active_lane_mask_b8, v_input_s8, vl); - vint16m2_t v_input_plus_offset_s16 = __riscv_vadd_vx_i16m2_m(v_active_lane_mask_b8, v_input_s16, s_input_offset_s16, vl); - v_acc_s32 = __riscv_vwmacc_vx_i32m4_m(v_active_lane_mask_b8, v_acc_s32, s_filter_val_s16, v_input_plus_offset_s16, vl); + + // Load inputs: Use mf2 to match m2 element count (32bit vs 8bit ratio is 4) + vint8mf2_t v_input_s8 = __riscv_vlse8_v_i8mf2_m(v_active_lane_mask, input_ic_ptr, input_x_stride_bytes, vl); + + // Widen to 16-bit (m1) + vint16m1_t v_input_s16 = __riscv_vsext_vf2_i16m1_m(v_active_lane_mask, v_input_s8, vl); + vint16m1_t v_input_plus_offset_s16 = __riscv_vadd_vx_i16m1_m(v_active_lane_mask, v_input_s16, s_input_offset_s16, vl); + + // Widen accumulate into 32-bit (m2) + v_acc_s32 = __riscv_vwmacc_vx_i32m2_m(v_active_lane_mask, v_acc_s32, s_filter_val_s16, v_input_plus_offset_s16, vl); } } } - // Requantize the accumulated values in a single function call. 
- vint32m4_t v_res32 = RequantizeVectorPerTensorS32( + // Requantize the accumulated values (vint32m2_t) + vint32m2_t v_res32 = RequantizeVectorPerTensorS32( v_acc_s32, scalar_multiplier, effective_right_shift, @@ -171,13 +177,13 @@ void ConvPerChannelRVV(const ConvParams& params, s_output_activation_max_s32, vl); - // Narrow result to int16 and then int8 with saturation - vint16m2_t v_res16 = __riscv_vnclip_wx_i16m2(v_res32, 0, __RISCV_VXRM_RNU, vl); - vint8m1_t v_out_s8 = __riscv_vnclip_wx_i8m1(v_res16, 0, __RISCV_VXRM_RNU, vl); + // Narrow result to int16 (m1) and then int8 (mf2) with saturation + vint16m1_t v_res16 = __riscv_vnclip_wx_i16m1(v_res32, 0, __RISCV_VXRM_RNU, vl); + vint8mf2_t v_out_s8 = __riscv_vnclip_wx_i8mf2(v_res16, 0, __RISCV_VXRM_RNU, vl); // Store results vector (strided) int8_t* output_strip_base_ptr = output_channel_base + current_out_x * output_w_stride; - __riscv_vsse8_v_i8m1(output_strip_base_ptr, output_x_stride_bytes, v_out_s8, vl); + __riscv_vsse8_v_i8mf2(output_strip_base_ptr, output_x_stride_bytes, v_out_s8, vl); // Advance output x pointer current_out_x += vl; @@ -278,17 +284,17 @@ void DepthwiseConvPerChannelRVV(const DepthwiseParams& params, size_t current_out_x = 0; while (current_out_x < static_cast(output_width)) { - // Set vector length for this iteration - size_t vl = __riscv_vsetvl_e32m4(output_width - current_out_x); + // Set vector length for this iteration (LMUL=2) + size_t vl = __riscv_vsetvl_e32m2(output_width - current_out_x); // Initialize accumulator vector with bias - vint32m4_t v_acc_s32 = bias_data ? __riscv_vmv_v_x_i32m4(bias_val, vl) - : __riscv_vmv_v_x_i32m4(0, vl); + vint32m2_t v_acc_s32 = bias_data ? 
__riscv_vmv_v_x_i32m2(bias_val, vl) + : __riscv_vmv_v_x_i32m2(0, vl); // Calculate base input x coordinates for the vector lanes - vuint32m4_t v_idx = __riscv_vid_v_u32m4(vl); - vint32m4_t v_out_x = __riscv_vreinterpret_v_u32m4_i32m4(__riscv_vadd_vx_u32m4(v_idx, static_cast(current_out_x), vl)); - vint32m4_t v_in_x_origin_base = __riscv_vsub_vx_i32m4(__riscv_vmul_vx_i32m4(v_out_x, stride_width, vl), pad_width, vl); + vuint32m2_t v_idx = __riscv_vid_v_u32m2(vl); + vint32m2_t v_out_x = __riscv_vreinterpret_v_u32m2_i32m2(__riscv_vadd_vx_u32m2(v_idx, static_cast(current_out_x), vl)); + vint32m2_t v_in_x_origin_base = __riscv_vsub_vx_i32m2(__riscv_vmul_vx_i32m2(v_out_x, stride_width, vl), pad_width, vl); // Loop over filter height for (int filter_y = 0; filter_y < filter_height; ++filter_y) @@ -302,15 +308,15 @@ void DepthwiseConvPerChannelRVV(const DepthwiseParams& params, for (int filter_x = 0; filter_x < filter_width; ++filter_x) { const int in_x_offset = dilation_width_factor * filter_x; - vint32m4_t v_in_x = __riscv_vadd_vx_i32m4(v_in_x_origin_base, in_x_offset, vl); + vint32m2_t v_in_x = __riscv_vadd_vx_i32m2(v_in_x_origin_base, in_x_offset, vl); // Create mask for valid input coordinates - vbool8_t v_mask_ge_zero = __riscv_vmsge_vx_i32m4_b8(v_in_x, 0, vl); - vbool8_t v_mask_lt_width = __riscv_vmslt_vx_i32m4_b8(v_in_x, input_width, vl); - vbool8_t v_active_lane_mask_b8 = __riscv_vmand_mm_b8(v_mask_ge_zero, v_mask_lt_width, vl); + vbool16_t v_mask_ge_zero = __riscv_vmsge_vx_i32m2_b16(v_in_x, 0, vl); + vbool16_t v_mask_lt_width = __riscv_vmslt_vx_i32m2_b16(v_in_x, input_width, vl); + vbool16_t v_active_lane_mask = __riscv_vmand_mm_b16(v_mask_ge_zero, v_mask_lt_width, vl); // Optimization: skip MAC if all lanes are masked off - if (__riscv_vfirst_m_b8(v_active_lane_mask_b8, vl) == -1) continue; + if (__riscv_vfirst_m_b16(v_active_lane_mask, vl) == -1) continue; const int8_t* filter_ptr = filter_y_base + filter_x * filter_w_stride + output_channel * filter_ch_stride; 
int16_t s_filter_val_s16 = static_cast(*filter_ptr); @@ -320,15 +326,16 @@ void DepthwiseConvPerChannelRVV(const DepthwiseParams& params, input_batch_base + in_y * input_h_stride + base_in_x_for_vector0 * input_w_stride + in_channel * input_ch_stride; ptrdiff_t input_x_stride_bytes = static_cast(stride_width) * input_w_stride * sizeof(int8_t); - vint8m1_t v_input_s8 = __riscv_vlse8_v_i8m1_m(v_active_lane_mask_b8, input_base_ptr, input_x_stride_bytes, vl); - vint16m2_t v_input_s16 = __riscv_vsext_vf2_i16m2_m(v_active_lane_mask_b8, v_input_s8, vl); - vint16m2_t v_input_plus_offset_s16 = __riscv_vadd_vx_i16m2_m(v_active_lane_mask_b8, v_input_s16, s_input_offset_s16, vl); - v_acc_s32 = __riscv_vwmacc_vx_i32m4_m(v_active_lane_mask_b8, v_acc_s32, s_filter_val_s16, v_input_plus_offset_s16, vl); + // Load input: mf2 -> m1 -> m2 accumulate + vint8mf2_t v_input_s8 = __riscv_vlse8_v_i8mf2_m(v_active_lane_mask, input_base_ptr, input_x_stride_bytes, vl); + vint16m1_t v_input_s16 = __riscv_vsext_vf2_i16m1_m(v_active_lane_mask, v_input_s8, vl); + vint16m1_t v_input_plus_offset_s16 = __riscv_vadd_vx_i16m1_m(v_active_lane_mask, v_input_s16, s_input_offset_s16, vl); + v_acc_s32 = __riscv_vwmacc_vx_i32m2_m(v_active_lane_mask, v_acc_s32, s_filter_val_s16, v_input_plus_offset_s16, vl); } } // Requantize the accumulated values in a single function call. 
- vint32m4_t v_res32 = RequantizeVectorPerTensorS32( + vint32m2_t v_res32 = RequantizeVectorPerTensorS32( v_acc_s32, scalar_multiplier, effective_right_shift, @@ -338,12 +345,12 @@ void DepthwiseConvPerChannelRVV(const DepthwiseParams& params, vl); // Narrow result to int16 and then int8 with saturation - vint16m2_t v_res16 = __riscv_vnclip_wx_i16m2(v_res32, 0, __RISCV_VXRM_RNU, vl); - vint8m1_t v_out_s8 = __riscv_vnclip_wx_i8m1(v_res16, 0, __RISCV_VXRM_RNU, vl); + vint16m1_t v_res16 = __riscv_vnclip_wx_i16m1(v_res32, 0, __RISCV_VXRM_RNU, vl); + vint8mf2_t v_out_s8 = __riscv_vnclip_wx_i8mf2(v_res16, 0, __RISCV_VXRM_RNU, vl); // Store results vector (strided) int8_t* output_strip_base_ptr = output_channel_row_base + current_out_x * output_w_stride; - __riscv_vsse8_v_i8m1(output_strip_base_ptr, output_x_stride_bytes, v_out_s8, vl); + __riscv_vsse8_v_i8mf2(output_strip_base_ptr, output_x_stride_bytes, v_out_s8, vl); // Advance output x pointer current_out_x += vl; diff --git a/tensorflow/lite/micro/kernels/riscv_vector/fully_connected_rvv.cc b/tensorflow/lite/micro/kernels/riscv_vector/fully_connected_rvv.cc index 3e1110596a1..cfdfc12b893 100644 --- a/tensorflow/lite/micro/kernels/riscv_vector/fully_connected_rvv.cc +++ b/tensorflow/lite/micro/kernels/riscv_vector/fully_connected_rvv.cc @@ -17,7 +17,8 @@ void FullyConnectedPerChannelRVV(const FullyConnectedParams& params, const RuntimeShape& bias_shape, const int32_t* bias_data, const RuntimeShape& output_shape, - int8_t* output_data) { + int8_t* output_data) +{ // Extract quantization parameters const int32_t input_offset = params.input_offset; const int32_t output_offset = params.output_offset; @@ -42,13 +43,13 @@ void FullyConnectedPerChannelRVV(const FullyConnectedParams& params, size_t current_out_c = 0; while (current_out_c < static_cast(output_depth)) { - // Set vector length for this iteration - size_t vl = __riscv_vsetvl_e32m4(output_depth - current_out_c); + // Set vector length for this iteration (LMUL=2) + 
size_t vl = __riscv_vsetvl_e32m2(output_depth - current_out_c); // Initialize accumulator vector with biases - vint32m4_t v_acc_s32 = bias_data - ? __riscv_vle32_v_i32m4(bias_data + current_out_c, vl) - : __riscv_vmv_v_x_i32m4(0, vl); + vint32m2_t v_acc_s32 = bias_data + ? __riscv_vle32_v_i32m2(bias_data + current_out_c, vl) + : __riscv_vmv_v_x_i32m2(0, vl); // Main MAC loop to compute dot products for (int d = 0; d < accum_depth; ++d) @@ -56,27 +57,33 @@ void FullyConnectedPerChannelRVV(const FullyConnectedParams& params, int16_t s_input_val_s16 = static_cast(input_batch_ptr[d]) + s_input_offset_s16; const int8_t* filter_col_ptr = filter_data + d + current_out_c * accum_depth; ptrdiff_t filter_stride = accum_depth * sizeof(int8_t); - vint8m1_t v_filter_s8 = __riscv_vlse8_v_i8m1(filter_col_ptr, filter_stride, vl); - vint16m2_t v_filter_s16 = __riscv_vsext_vf2_i16m2(v_filter_s8, vl); - v_acc_s32 = __riscv_vwmacc_vx_i32m4(v_acc_s32, s_input_val_s16, v_filter_s16, vl); + + // Load filter: mf2 (matches element count of m2 32-bit) + vint8mf2_t v_filter_s8 = __riscv_vlse8_v_i8mf2(filter_col_ptr, filter_stride, vl); + + // Widen to m1 + vint16m1_t v_filter_s16 = __riscv_vsext_vf2_i16m1(v_filter_s8, vl); + + // Widen accumulate to m2 + v_acc_s32 = __riscv_vwmacc_vx_i32m2(v_acc_s32, s_input_val_s16, v_filter_s16, vl); } // Load per-channel requantization parameters into vectors - vint32m4_t v_multiplier = __riscv_vle32_v_i32m4(output_multiplier + current_out_c, vl); - vint32m4_t v_shift = __riscv_vle32_v_i32m4( + vint32m2_t v_multiplier = __riscv_vle32_v_i32m2(output_multiplier + current_out_c, vl); + vint32m2_t v_shift = __riscv_vle32_v_i32m2( reinterpret_cast(output_shift) + current_out_c, vl); - // Requantize the accumulated values using the fully vectorized helper. 
- vint32m4_t v_res32 = RequantizeVectorPerChannelS32( + // Requantize + vint32m2_t v_res32 = RequantizeVectorPerChannelS32( v_acc_s32, v_multiplier, v_shift, output_offset, output_activation_min, output_activation_max, vl); - // Narrow the 32-bit results to 16-bit, then 8-bit with saturation - vint16m2_t v_res16 = __riscv_vnclip_wx_i16m2(v_res32, 0, __RISCV_VXRM_RNU, vl); - vint8m1_t v_out_s8 = __riscv_vnclip_wx_i8m1(v_res16, 0, __RISCV_VXRM_RNU, vl); + // Narrow result + vint16m1_t v_res16 = __riscv_vnclip_wx_i16m1(v_res32, 0, __RISCV_VXRM_RNU, vl); + vint8mf2_t v_out_s8 = __riscv_vnclip_wx_i8mf2(v_res16, 0, __RISCV_VXRM_RNU, vl); - // Store the final 8-bit output vector - __riscv_vse8_v_i8m1(output_batch_ptr + current_out_c, v_out_s8, vl); + // Store result + __riscv_vse8_v_i8mf2(output_batch_ptr + current_out_c, v_out_s8, vl); // Advance to the next block of output channels current_out_c += vl; @@ -124,28 +131,30 @@ void FullyConnectedRVV(const FullyConnectedParams& params, size_t current_out_c = 0; while (current_out_c < static_cast(output_depth)) { - // Set vector length for processing multiple output channels - size_t vl = __riscv_vsetvl_e32m4(output_depth - current_out_c); + // Set vector length for processing multiple output channels (LMUL=2) + size_t vl = __riscv_vsetvl_e32m2(output_depth - current_out_c); // Initialize accumulator vector with biases - vint32m4_t v_acc_s32 = bias_data - ? __riscv_vle32_v_i32m4(bias_data + current_out_c, vl) - : __riscv_vmv_v_x_i32m4(0, vl); + vint32m2_t v_acc_s32 = bias_data + ? 
__riscv_vle32_v_i32m2(bias_data + current_out_c, vl) + : __riscv_vmv_v_x_i32m2(0, vl); - // Loop over accumulation depth to compute 'vl' dot products in parallel + // Loop over accumulation depth to compute dot products in parallel for (int d = 0; d < accum_depth; ++d) { int16_t s_input_val_s16 = static_cast(input_batch_ptr[d]) + s_input_offset_s16; const int8_t* filter_col_ptr = filter_data + current_out_c * accum_depth + d; ptrdiff_t filter_stride = accum_depth * sizeof(int8_t); - vint8m1_t v_filter_s8 = __riscv_vlse8_v_i8m1(filter_col_ptr, filter_stride, vl); - vint16m2_t v_filter_s16 = __riscv_vsext_vf2_i16m2(v_filter_s8, vl); - vint16m2_t v_filter_plus_offset_s16 = __riscv_vadd_vx_i16m2(v_filter_s16, s_filter_offset_s16, vl); - v_acc_s32 = __riscv_vwmacc_vx_i32m4(v_acc_s32, s_input_val_s16, v_filter_plus_offset_s16, vl); + + // Load: mf2 -> m1 -> m1 (+offset) -> m2 (accumulate) + vint8mf2_t v_filter_s8 = __riscv_vlse8_v_i8mf2(filter_col_ptr, filter_stride, vl); + vint16m1_t v_filter_s16 = __riscv_vsext_vf2_i16m1(v_filter_s8, vl); + vint16m1_t v_filter_plus_offset_s16 = __riscv_vadd_vx_i16m1(v_filter_s16, s_filter_offset_s16, vl); + v_acc_s32 = __riscv_vwmacc_vx_i32m2(v_acc_s32, s_input_val_s16, v_filter_plus_offset_s16, vl); } const int effective_right_shift = 31 - output_shift; - vint32m4_t v_res32 = RequantizeVectorPerTensorS32( + vint32m2_t v_res32 = RequantizeVectorPerTensorS32( v_acc_s32, output_multiplier, effective_right_shift, @@ -154,10 +163,10 @@ void FullyConnectedRVV(const FullyConnectedParams& params, output_activation_max, vl); - // Narrow result to int8 and store - vint16m2_t v_res16 = __riscv_vnclip_wx_i16m2(v_res32, 0, __RISCV_VXRM_RNU, vl); - vint8m1_t v_out_s8 = __riscv_vnclip_wx_i8m1(v_res16, 0, __RISCV_VXRM_RNU, vl); - __riscv_vse8_v_i8m1(output_batch_ptr + current_out_c, v_out_s8, vl); + // Narrow result + vint16m1_t v_res16 = __riscv_vnclip_wx_i16m1(v_res32, 0, __RISCV_VXRM_RNU, vl); + vint8mf2_t v_out_s8 = 
__riscv_vnclip_wx_i8mf2(v_res16, 0, __RISCV_VXRM_RNU, vl); + __riscv_vse8_v_i8mf2(output_batch_ptr + current_out_c, v_out_s8, vl); // Advance to the next block of output channels current_out_c += vl; diff --git a/tensorflow/lite/micro/kernels/riscv_vector/requantize_rvv.h b/tensorflow/lite/micro/kernels/riscv_vector/requantize_rvv.h index f6498dfebc5..b8e49d1e6ad 100644 --- a/tensorflow/lite/micro/kernels/riscv_vector/requantize_rvv.h +++ b/tensorflow/lite/micro/kernels/riscv_vector/requantize_rvv.h @@ -1,8 +1,8 @@ #ifndef TENSORFLOW_LITE_MICRO_KERNELS_RISCV_VECTOR_REQUANTIZE_RVV_H_ #define TENSORFLOW_LITE_MICRO_KERNELS_RISCV_VECTOR_REQUANTIZE_RVV_H_ -inline vint32m4_t RequantizeVectorPerTensorS32( - vint32m4_t v_acc, const int32_t multiplier, const int effective_right_shift, +inline vint32m2_t RequantizeVectorPerTensorS32( + vint32m2_t v_acc, const int32_t multiplier, const int effective_right_shift, const int32_t output_offset, const int32_t activation_min, const int32_t activation_max, const size_t vl) { @@ -15,153 +15,157 @@ inline vint32m4_t RequantizeVectorPerTensorS32( const int32_t rounding_hi = static_cast((rounding_val >> 32)); // Multiply accumulator by scalar multiplier (results in 64b intermediate) - vint32m4_t v_prod_lo = __riscv_vmul_vx_i32m4(v_acc, multiplier, vl); - vint32m4_t v_prod_hi = __riscv_vmulh_vx_i32m4(v_acc, multiplier, vl); + // Uses m2 intrinsics + vint32m2_t v_prod_lo = __riscv_vmul_vx_i32m2(v_acc, multiplier, vl); + vint32m2_t v_prod_hi = __riscv_vmulh_vx_i32m2(v_acc, multiplier, vl); // Add 64b rounding value using 32b operations with carry - vuint32m4_t v_prod_lo_u = __riscv_vreinterpret_v_i32m4_u32m4(v_prod_lo); - vuint32m4_t v_sum_lo_u = __riscv_vadd_vx_u32m4(v_prod_lo_u, rounding_lo, vl); - vbool8_t v_carry = __riscv_vmsltu_vx_u32m4_b8(v_sum_lo_u, rounding_lo, vl); - vint32m4_t v_rounded_hi = __riscv_vadd_vx_i32m4(v_prod_hi, rounding_hi, vl); - v_rounded_hi = __riscv_vadd_vx_i32m4_m(v_carry, v_rounded_hi, 1, vl); - vint32m4_t 
v_rounded_lo = __riscv_vreinterpret_v_u32m4_i32m4(v_sum_lo_u); + vuint32m2_t v_prod_lo_u = __riscv_vreinterpret_v_i32m2_u32m2(v_prod_lo); + vuint32m2_t v_sum_lo_u = __riscv_vadd_vx_u32m2(v_prod_lo_u, rounding_lo, vl); + vbool16_t v_carry = __riscv_vmsltu_vx_u32m2_b16(v_sum_lo_u, rounding_lo, vl); + vint32m2_t v_rounded_hi = __riscv_vadd_vx_i32m2(v_prod_hi, rounding_hi, vl); + v_rounded_hi = __riscv_vadd_vx_i32m2_m(v_carry, v_rounded_hi, 1, vl); + vint32m2_t v_rounded_lo = __riscv_vreinterpret_v_u32m2_i32m2(v_sum_lo_u); // Perform 64b arithmetic right shift using 32b vector shifts - vint32m4_t v_res32; + vint32m2_t v_res32; if (effective_right_shift == 0) { v_res32 = v_rounded_lo; } else if (effective_right_shift > 0 && effective_right_shift < 32) { - vuint32m4_t v_lo_usrl = __riscv_vsrl_vx_u32m4( - __riscv_vreinterpret_v_i32m4_u32m4(v_rounded_lo), + vuint32m2_t v_lo_usrl = __riscv_vsrl_vx_u32m2( + __riscv_vreinterpret_v_i32m2_u32m2(v_rounded_lo), effective_right_shift, vl); - vint32m4_t v_hi_sll = __riscv_vsll_vx_i32m4( + vint32m2_t v_hi_sll = __riscv_vsll_vx_i32m2( v_rounded_hi, 32 - effective_right_shift, vl); - v_res32 = __riscv_vreinterpret_v_u32m4_i32m4(__riscv_vor_vv_u32m4( - v_lo_usrl, __riscv_vreinterpret_v_i32m4_u32m4(v_hi_sll), vl)); + v_res32 = __riscv_vreinterpret_v_u32m2_i32m2(__riscv_vor_vv_u32m2( + v_lo_usrl, __riscv_vreinterpret_v_i32m2_u32m2(v_hi_sll), vl)); } else { const int shift_hi = std::min(31, effective_right_shift - 32); - v_res32 = __riscv_vsra_vx_i32m4(v_rounded_hi, shift_hi, vl); + v_res32 = __riscv_vsra_vx_i32m2(v_rounded_hi, shift_hi, vl); } // Add output offset - v_res32 = __riscv_vadd_vx_i32m4(v_res32, output_offset, vl); + v_res32 = __riscv_vadd_vx_i32m2(v_res32, output_offset, vl); // Clamp to activation bounds - v_res32 = __riscv_vmax_vx_i32m4(v_res32, activation_min, vl); - v_res32 = __riscv_vmin_vx_i32m4(v_res32, activation_max, vl); + v_res32 = __riscv_vmax_vx_i32m2(v_res32, activation_min, vl); + v_res32 = 
__riscv_vmin_vx_i32m2(v_res32, activation_max, vl); return v_res32; } -inline vint32m4_t RequantizeVectorPerChannelS32( - vint32m4_t v_acc, vint32m4_t v_multiplier, vint32m4_t v_shift, +inline vint32m2_t RequantizeVectorPerChannelS32( + vint32m2_t v_acc, vint32m2_t v_multiplier, vint32m2_t v_shift, const int32_t output_offset, const int32_t activation_min, const int32_t activation_max, const size_t vl) { - // Perform 32x32 -> 64-bit multiplication, getting high and low parts - vint32m4_t v_prod_hi = __riscv_vmulh_vv_i32m4(v_acc, v_multiplier, vl); - vint32m4_t v_prod_lo = __riscv_vmul_vv_i32m4(v_acc, v_multiplier, vl); - - // Calculate the effective right shift for TFLM's fixed-point scheme - vint32m4_t v_effective_shift = __riscv_vrsub_vx_i32m4(v_shift, 31, vl); - - // Create masks to separate lanes into right-shift and left-shift paths - vbool8_t v_mask_right_shift = - __riscv_vmsgt_vx_i32m4_b8(v_effective_shift, 0, vl); - vbool8_t v_mask_left_shift = __riscv_vmnot_m_b8(v_mask_right_shift, vl); - - // Path 1: Right Shift (for lanes where effective_shift > 0) - vint32m4_t v_res_right; + // Perform 32x32 -> 64-bit multiplication + vint32m2_t v_prod_hi = __riscv_vmulh_vv_i32m2(v_acc, v_multiplier, vl); + vint32m2_t v_prod_lo = __riscv_vmul_vv_i32m2(v_acc, v_multiplier, vl); + + // Calculate effective right shift + vint32m2_t v_effective_shift = __riscv_vrsub_vx_i32m2(v_shift, 31, vl); + + // Create masks + vbool16_t v_mask_right_shift = + __riscv_vmsgt_vx_i32m2_b16(v_effective_shift, 0, vl); + vbool16_t v_mask_left_shift = __riscv_vmnot_m_b16(v_mask_right_shift, vl); + + // Path 1: Right Shift + // Initialize to 0 to avoid "maybe-uninitialized" warnings + vint32m2_t v_res_right = __riscv_vmv_v_x_i32m2(0, vl); + + // Optimization: check if any lane needs right shift + if (__riscv_vfirst_m_b16(v_mask_right_shift, vl) >= 0) { - // Calculate the 64-bit rounding value: (1LL << (effective_shift - 1)) - vint32m4_t v_shift_minus_1 = __riscv_vsub_vx_i32m4_m( + vint32m2_t 
v_shift_minus_1 = __riscv_vsub_vx_i32m2_m( v_mask_right_shift, v_effective_shift, 1, vl); - vuint32m4_t v_shift_minus_1_u = - __riscv_vreinterpret_v_i32m4_u32m4(v_shift_minus_1); - vbool8_t v_mask_round_lt_32 = __riscv_vmsltu_vx_u32m4_b8_m( + vuint32m2_t v_shift_minus_1_u = + __riscv_vreinterpret_v_i32m2_u32m2(v_shift_minus_1); + vbool16_t v_mask_round_lt_32 = __riscv_vmsltu_vx_u32m2_b16_m( v_mask_right_shift, v_shift_minus_1_u, 32, vl); - vbool8_t v_mask_round_ge_32 = __riscv_vmandn_mm_b8( + vbool16_t v_mask_round_ge_32 = __riscv_vmandn_mm_b16( v_mask_right_shift, v_mask_round_lt_32, vl); - vuint32m4_t v_one_u = __riscv_vmv_v_x_u32m4(1, vl); - vuint32m4_t v_zero_u = __riscv_vmv_v_x_u32m4(0, vl); - vuint32m4_t v_rounding_lo_u = __riscv_vmerge_vvm_u32m4( + vuint32m2_t v_one_u = __riscv_vmv_v_x_u32m2(1, vl); + vuint32m2_t v_zero_u = __riscv_vmv_v_x_u32m2(0, vl); + vuint32m2_t v_rounding_lo_u = __riscv_vmerge_vvm_u32m2( v_zero_u, - __riscv_vsll_vv_u32m4_m(v_mask_round_lt_32, v_one_u, + __riscv_vsll_vv_u32m2_m(v_mask_round_lt_32, v_one_u, v_shift_minus_1_u, vl), v_mask_round_lt_32, vl); - vuint32m4_t v_rounding_hi_u = __riscv_vmerge_vvm_u32m4( + vuint32m2_t v_rounding_hi_u = __riscv_vmerge_vvm_u32m2( v_zero_u, - __riscv_vsll_vv_u32m4_m( + __riscv_vsll_vv_u32m2_m( v_mask_round_ge_32, v_one_u, - __riscv_vsub_vx_u32m4_m(v_mask_round_ge_32, v_shift_minus_1_u, + __riscv_vsub_vx_u32m2_m(v_mask_round_ge_32, v_shift_minus_1_u, 32, vl), vl), v_mask_round_ge_32, vl); - // Add the 64-bit rounding value to the 64-bit product using 32-bit ops - vuint32m4_t v_prod_lo_u = __riscv_vreinterpret_v_i32m4_u32m4(v_prod_lo); - vuint32m4_t v_sum_lo_u = __riscv_vadd_vv_u32m4_m( + vuint32m2_t v_prod_lo_u = __riscv_vreinterpret_v_i32m2_u32m2(v_prod_lo); + vuint32m2_t v_sum_lo_u = __riscv_vadd_vv_u32m2_m( v_mask_right_shift, v_prod_lo_u, v_rounding_lo_u, vl); - vbool8_t v_carry = __riscv_vmsltu_vv_u32m4_b8_m( + vbool16_t v_carry = __riscv_vmsltu_vv_u32m2_b16_m( v_mask_right_shift, v_sum_lo_u, 
v_prod_lo_u, vl); - vint32m4_t v_rounded_hi = __riscv_vadd_vv_i32m4_m( + vint32m2_t v_rounded_hi = __riscv_vadd_vv_i32m2_m( v_mask_right_shift, v_prod_hi, - __riscv_vreinterpret_v_u32m4_i32m4(v_rounding_hi_u), vl); - v_rounded_hi = __riscv_vadd_vx_i32m4_m(v_carry, v_rounded_hi, 1, vl); + __riscv_vreinterpret_v_u32m2_i32m2(v_rounding_hi_u), vl); + v_rounded_hi = __riscv_vadd_vx_i32m2_m(v_carry, v_rounded_hi, 1, vl); - // Emulate a 64-bit arithmetic right shift using two 32-bit sub-paths - vbool8_t v_mask_shift_lt_32 = __riscv_vmslt_vx_i32m4_b8_m( + vbool16_t v_mask_shift_lt_32 = __riscv_vmslt_vx_i32m2_b16_m( v_mask_right_shift, v_effective_shift, 32, vl); - vbool8_t v_mask_shift_ge_32 = __riscv_vmandn_mm_b8( + vbool16_t v_mask_shift_ge_32 = __riscv_vmandn_mm_b16( v_mask_right_shift, v_mask_shift_lt_32, vl); - vuint32m4_t v_shift_u = - __riscv_vreinterpret_v_i32m4_u32m4(v_effective_shift); - vuint32m4_t v_lo_part = __riscv_vsrl_vv_u32m4_m( + vuint32m2_t v_shift_u = + __riscv_vreinterpret_v_i32m2_u32m2(v_effective_shift); + vuint32m2_t v_lo_part = __riscv_vsrl_vv_u32m2_m( v_mask_shift_lt_32, v_sum_lo_u, v_shift_u, vl); - vuint32m4_t v_hi_part = __riscv_vsll_vv_u32m4_m( + vuint32m2_t v_hi_part = __riscv_vsll_vv_u32m2_m( v_mask_shift_lt_32, - __riscv_vreinterpret_v_i32m4_u32m4(v_rounded_hi), - __riscv_vrsub_vx_u32m4_m(v_mask_shift_lt_32, v_shift_u, 32, vl), + __riscv_vreinterpret_v_i32m2_u32m2(v_rounded_hi), + __riscv_vrsub_vx_u32m2_m(v_mask_shift_lt_32, v_shift_u, 32, vl), vl); - vint32m4_t v_res_lt_32 = __riscv_vreinterpret_v_u32m4_i32m4( - __riscv_vor_vv_u32m4_m(v_mask_shift_lt_32, v_lo_part, v_hi_part, vl)); - vint32m4_t v_res_ge_32 = __riscv_vsra_vv_i32m4_m( + vint32m2_t v_res_lt_32 = __riscv_vreinterpret_v_u32m2_i32m2( + __riscv_vor_vv_u32m2_m(v_mask_shift_lt_32, v_lo_part, v_hi_part, vl)); + vint32m2_t v_res_ge_32 = __riscv_vsra_vv_i32m2_m( v_mask_shift_ge_32, v_rounded_hi, - __riscv_vreinterpret_v_i32m4_u32m4(__riscv_vsub_vx_i32m4_m( + 
__riscv_vreinterpret_v_i32m2_u32m2(__riscv_vsub_vx_i32m2_m( v_mask_shift_ge_32, v_effective_shift, 32, vl)), vl); - v_res_right = __riscv_vmerge_vvm_i32m4(v_res_ge_32, v_res_lt_32, + v_res_right = __riscv_vmerge_vvm_i32m2(v_res_ge_32, v_res_lt_32, v_mask_shift_lt_32, vl); } - // Path 2: Left Shift (for lanes where effective_shift <= 0) - vint32m4_t v_res_left; + // Path 2: Left Shift + // Initialize to 0 to avoid "maybe-uninitialized" warnings + vint32m2_t v_res_left = __riscv_vmv_v_x_i32m2(0, vl); + + if (__riscv_vfirst_m_b16(v_mask_left_shift, vl) >= 0) { - // Calculate the positive left shift amount - vint32m4_t v_left_shift_amount = - __riscv_vneg_v_i32m4_m(v_mask_left_shift, v_effective_shift, vl); + vint32m2_t v_left_shift_amount = + __riscv_vneg_v_i32m2_m(v_mask_left_shift, v_effective_shift, vl); - // Perform the left shift on the low 32 bits of the product - v_res_left = __riscv_vsll_vv_i32m4_m( + v_res_left = __riscv_vsll_vv_i32m2_m( v_mask_left_shift, v_prod_lo, - __riscv_vreinterpret_v_i32m4_u32m4(v_left_shift_amount), vl); + __riscv_vreinterpret_v_i32m2_u32m2(v_left_shift_amount), vl); } - // Merge the results from the right and left shift paths - vint32m4_t v_res32 = - __riscv_vmerge_vvm_i32m4(v_res_left, v_res_right, v_mask_right_shift, vl); + // Merge results + // Lanes with mask_right=1 take v_res_right, mask_right=0 (left) take v_res_left + vint32m2_t v_res32 = + __riscv_vmerge_vvm_i32m2(v_res_left, v_res_right, v_mask_right_shift, vl); - // Add the final output offset. - v_res32 = __riscv_vadd_vx_i32m4(v_res32, output_offset, vl); + // Add output offset + v_res32 = __riscv_vadd_vx_i32m2(v_res32, output_offset, vl); - // Clamp the results to the activation range. 
- v_res32 = __riscv_vmax_vx_i32m4(v_res32, activation_min, vl); - v_res32 = __riscv_vmin_vx_i32m4(v_res32, activation_max, vl); + // Clamp to activation bounds + v_res32 = __riscv_vmax_vx_i32m2(v_res32, activation_min, vl); + v_res32 = __riscv_vmin_vx_i32m2(v_res32, activation_max, vl); return v_res32; } From c79cb3286e47b1ac91d9c4109f75097bd280bfd0 Mon Sep 17 00:00:00 2001 From: JaimeHW Date: Fri, 21 Nov 2025 08:24:13 -0600 Subject: [PATCH 75/86] Softmax: Optimize register usage --- .../micro/kernels/riscv_vector/softmax_rvv.h | 282 +++++++----------- 1 file changed, 112 insertions(+), 170 deletions(-) diff --git a/tensorflow/lite/micro/kernels/riscv_vector/softmax_rvv.h b/tensorflow/lite/micro/kernels/riscv_vector/softmax_rvv.h index 3dab3ef0439..28ab9e42567 100644 --- a/tensorflow/lite/micro/kernels/riscv_vector/softmax_rvv.h +++ b/tensorflow/lite/micro/kernels/riscv_vector/softmax_rvv.h @@ -13,8 +13,7 @@ #include "tensorflow/lite/micro/kernels/softmax.h" #include "tensorflow/lite/micro/micro_log.h" -inline vint32m4_t SaturatingLeftShift_vx_i32m4(vint32m4_t v_in, int shift, - size_t vl) +inline vint32m2_t SaturatingLeftShift_vx_i32m2(vint32m2_t v_in, int shift, size_t vl) { // Return early if shift is zero or negative if (shift <= 0) return v_in; @@ -22,38 +21,35 @@ inline vint32m4_t SaturatingLeftShift_vx_i32m4(vint32m4_t v_in, int shift, // Handle extreme shifts that always saturate if (shift >= 31) { - // Create mask for negative values - vbool8_t v_neg = __riscv_vmslt_vx_i32m4_b8(v_in, 0, vl); - - // Set positive max and merge with negative min - vint32m4_t v_max = __riscv_vmv_v_x_i32m4(INT32_MAX, vl); - return __riscv_vmerge_vxm_i32m4(v_max, INT32_MIN, v_neg, vl); + vbool16_t v_neg = __riscv_vmslt_vx_i32m2_b16(v_in, 0, vl); + vint32m2_t v_max = __riscv_vmv_v_x_i32m2(INT32_MAX, vl); + return __riscv_vmerge_vxm_i32m2(v_max, INT32_MIN, v_neg, vl); } // Perform the logical left shift - vint32m4_t v_shifted = __riscv_vsll_vx_i32m4(v_in, shift, vl); + vint32m2_t 
v_shifted = __riscv_vsll_vx_i32m2(v_in, shift, vl); // Verify overflow by shifting back and comparing - vint32m4_t v_unshifted = __riscv_vsra_vx_i32m4(v_shifted, shift, vl); - vbool8_t v_no_overflow = __riscv_vmseq_vv_i32m4_b8(v_in, v_unshifted, vl); + vint32m2_t v_unshifted = __riscv_vsra_vx_i32m2(v_shifted, shift, vl); + vbool16_t v_no_overflow = __riscv_vmseq_vv_i32m2_b16(v_in, v_unshifted, vl); // Select saturating constants based on sign - vbool8_t v_neg = __riscv_vmslt_vx_i32m4_b8(v_in, 0, vl); - vint32m4_t v_sat = __riscv_vmerge_vxm_i32m4( - __riscv_vmv_v_x_i32m4(INT32_MAX, vl), INT32_MIN, v_neg, vl); + vbool16_t v_neg = __riscv_vmslt_vx_i32m2_b16(v_in, 0, vl); + vint32m2_t v_sat = __riscv_vmerge_vxm_i32m2( + __riscv_vmv_v_x_i32m2(INT32_MAX, vl), INT32_MIN, v_neg, vl); // Merge valid results with saturated results - return __riscv_vmerge_vvm_i32m4(v_sat, v_shifted, v_no_overflow, vl); + return __riscv_vmerge_vvm_i32m2(v_sat, v_shifted, v_no_overflow, vl); } -inline vint32m4_t MultiplyByQuantizedMultiplierGreaterThanOne_32bit_vx_i32m4( - vint32m4_t v_x, int32_t multiplier, int left_shift, size_t vl) +inline vint32m2_t MultiplyByQuantizedMultiplierGreaterThanOne_32bit_vx_i32m2( + vint32m2_t v_x, int32_t multiplier, int left_shift, size_t vl) { // Calculate low 32 bits of product - vint32m4_t v_lo = __riscv_vmul_vx_i32m4(v_x, multiplier, vl); + vint32m2_t v_lo = __riscv_vmul_vx_i32m2(v_x, multiplier, vl); // Calculate high 32 bits of product - vint32m4_t v_hi = __riscv_vmulh_vx_i32m4(v_x, multiplier, vl); + vint32m2_t v_hi = __riscv_vmulh_vx_i32m2(v_x, multiplier, vl); // Determine effective right shift amount int total_right_shift = 31 - left_shift; @@ -62,40 +58,39 @@ inline vint32m4_t MultiplyByQuantizedMultiplierGreaterThanOne_32bit_vx_i32m4( int32_t nudge = 1 << (total_right_shift - 1); // Add nudge to low part treating as unsigned - vuint32m4_t v_lo_u = __riscv_vreinterpret_v_i32m4_u32m4(v_lo); - vuint32m4_t v_lo_plus_nudge = __riscv_vadd_vx_u32m4(v_lo_u, 
nudge, vl); + vuint32m2_t v_lo_u = __riscv_vreinterpret_v_i32m2_u32m2(v_lo); + vuint32m2_t v_lo_plus_nudge = __riscv_vadd_vx_u32m2(v_lo_u, nudge, vl); // Detect carry from low part addition - vbool8_t v_carry = __riscv_vmsltu_vx_u32m4_b8(v_lo_plus_nudge, nudge, vl); + vbool16_t v_carry = __riscv_vmsltu_vx_u32m2_b16(v_lo_plus_nudge, nudge, vl); // Apply carry to high part - vint32m4_t v_hi_rounded = __riscv_vadd_vx_i32m4_m(v_carry, v_hi, 1, vl); + vint32m2_t v_hi_rounded = __riscv_vadd_vx_i32m2_m(v_carry, v_hi, 1, vl); // Calculate shift amounts for recombination int shift_hi = left_shift + 1; int shift_lo = total_right_shift; // Shift high part (handling mod 32 behavior) - vint32m4_t v_res_from_hi; + vint32m2_t v_res_from_hi; if (shift_hi < 32) { - v_res_from_hi = __riscv_vsll_vx_i32m4(v_hi_rounded, shift_hi, vl); + v_res_from_hi = __riscv_vsll_vx_i32m2(v_hi_rounded, shift_hi, vl); } else { - v_res_from_hi = __riscv_vmv_v_x_i32m4(0, vl); + v_res_from_hi = __riscv_vmv_v_x_i32m2(0, vl); } // Shift low part - vuint32m4_t v_res_from_lo = - __riscv_vsrl_vx_u32m4(v_lo_plus_nudge, shift_lo, vl); + vuint32m2_t v_res_from_lo = __riscv_vsrl_vx_u32m2(v_lo_plus_nudge, shift_lo, vl); // Combine results - return __riscv_vor_vv_i32m4( - v_res_from_hi, __riscv_vreinterpret_v_u32m4_i32m4(v_res_from_lo), vl); + return __riscv_vor_vv_i32m2( + v_res_from_hi, __riscv_vreinterpret_v_u32m2_i32m2(v_res_from_lo), vl); } -inline vint32m4_t SRMPOT_vx_i32m4(vint32m4_t v_vec, int shift, size_t vl) +inline vint32m2_t SRMPOT_vx_i32m2(vint32m2_t v_vec, int shift, size_t vl) { // Return early if shift is zero if (shift == 0) return v_vec; @@ -103,22 +98,21 @@ inline vint32m4_t SRMPOT_vx_i32m4(vint32m4_t v_vec, int shift, size_t vl) // Handle positive shifts using saturating left shift if (shift > 0) { - return SaturatingLeftShift_vx_i32m4(v_vec, shift, vl); + return SaturatingLeftShift_vx_i32m2(v_vec, shift, vl); } else { // Perform rounding arithmetic right shift - return 
__riscv_vssra_vx_i32m4(v_vec, -shift, __RISCV_VXRM_RNU, vl); + return __riscv_vssra_vx_i32m2(v_vec, -shift, __RISCV_VXRM_RNU, vl); } } -vint32m4_t vectorized_exp_on_negative_values(vint32m4_t v_a_q5_26, size_t vl) +vint32m2_t vectorized_exp_on_negative_values(vint32m2_t v_a_q5_26, size_t vl) { // Define fixed-point constants const int kInputFractionalBits = 26; const int kOutputFractionalBits = 31; - const int32_t s_kOneQuarter_q5_26 = INT32_C(1) - << (kInputFractionalBits - 2); + const int32_t s_kOneQuarter_q5_26 = INT32_C(1) << (kInputFractionalBits - 2); const int32_t s_mask_val = s_kOneQuarter_q5_26 - 1; // Define Taylor Series Constants (Q0.31) @@ -126,56 +120,46 @@ vint32m4_t vectorized_exp_on_negative_values(vint32m4_t v_a_q5_26, size_t vl) const int32_t s_exp_neg_1_8_q0_31 = 1895147668; const int32_t s_one_third_q0_31 = 715827883; const int32_t s_one_24th_q0_31 = 89478485; - const int32_t s_one_eighth_q0_31 = INT32_C(1) - << (kOutputFractionalBits - 3); + const int32_t s_one_eighth_q0_31 = INT32_C(1) << (kOutputFractionalBits - 3); // Perform range reduction masking - vint32m4_t v_a_masked = __riscv_vand_vx_i32m4(v_a_q5_26, s_mask_val, vl); + vint32m2_t v_a_masked = __riscv_vand_vx_i32m2(v_a_q5_26, s_mask_val, vl); // Subtract quarter constant - vint32m4_t v_a_mod_q_m_q_q5_26 = - __riscv_vsub_vx_i32m4(v_a_masked, s_kOneQuarter_q5_26, vl); + vint32m2_t v_a_mod_q_m_q_q5_26 = __riscv_vsub_vx_i32m2(v_a_masked, s_kOneQuarter_q5_26, vl); // Rescale from Q5.26 to Q0.31 const int rescale_shift = kOutputFractionalBits - kInputFractionalBits; - vint32m4_t v_a_input_taylor_q0_31 = - SRMPOT_vx_i32m4(v_a_mod_q_m_q_q5_26, rescale_shift, vl); + vint32m2_t v_a_input_taylor_q0_31 = SRMPOT_vx_i32m2(v_a_mod_q_m_q_q5_26, rescale_shift, vl); // Center input around -1/8 - vint32m4_t v_y = - __riscv_vadd_vx_i32m4(v_a_input_taylor_q0_31, s_one_eighth_q0_31, vl); + vint32m2_t v_y = __riscv_vadd_vx_i32m2(v_a_input_taylor_q0_31, s_one_eighth_q0_31, vl); // Calculate polynomial terms 
using 32-bit saturating multiply - vint32m4_t v_y2 = __riscv_vsmul_vv_i32m4(v_y, v_y, __RISCV_VXRM_RNU, vl); - vint32m4_t v_y3 = __riscv_vsmul_vv_i32m4(v_y2, v_y, __RISCV_VXRM_RNU, vl); - vint32m4_t v_y4 = __riscv_vsmul_vv_i32m4(v_y2, v_y2, __RISCV_VXRM_RNU, vl); + vint32m2_t v_y2 = __riscv_vsmul_vv_i32m2(v_y, v_y, __RISCV_VXRM_RNU, vl); + vint32m2_t v_y3 = __riscv_vsmul_vv_i32m2(v_y2, v_y, __RISCV_VXRM_RNU, vl); + vint32m2_t v_y4 = __riscv_vsmul_vv_i32m2(v_y2, v_y2, __RISCV_VXRM_RNU, vl); // Calculate coefficients - vint32m4_t v_term_y2_over_2 = SRMPOT_vx_i32m4(v_y2, -1, vl); - vint32m4_t v_term_y3_over_3 = - __riscv_vsmul_vx_i32m4(v_y3, s_one_third_q0_31, __RISCV_VXRM_RNU, vl); - vint32m4_t v_term_y3_over_6 = SRMPOT_vx_i32m4(v_term_y3_over_3, -1, vl); - vint32m4_t v_term_y4_over_24 = - __riscv_vsmul_vx_i32m4(v_y4, s_one_24th_q0_31, __RISCV_VXRM_RNU, vl); + vint32m2_t v_term_y2_over_2 = SRMPOT_vx_i32m2(v_y2, -1, vl); + vint32m2_t v_term_y3_over_3 = __riscv_vsmul_vx_i32m2(v_y3, s_one_third_q0_31, __RISCV_VXRM_RNU, vl); + vint32m2_t v_term_y3_over_6 = SRMPOT_vx_i32m2(v_term_y3_over_3, -1, vl); + vint32m2_t v_term_y4_over_24 = __riscv_vsmul_vx_i32m2(v_y4, s_one_24th_q0_31, __RISCV_VXRM_RNU, vl); // Sum polynomial terms - vint32m4_t v_poly_sum = __riscv_vadd_vv_i32m4(v_y, v_term_y2_over_2, vl); - v_poly_sum = __riscv_vadd_vv_i32m4(v_poly_sum, v_term_y3_over_6, vl); - v_poly_sum = __riscv_vadd_vv_i32m4(v_poly_sum, v_term_y4_over_24, vl); + vint32m2_t v_poly_sum = __riscv_vadd_vv_i32m2(v_y, v_term_y2_over_2, vl); + v_poly_sum = __riscv_vadd_vv_i32m2(v_poly_sum, v_term_y3_over_6, vl); + v_poly_sum = __riscv_vadd_vv_i32m2(v_poly_sum, v_term_y4_over_24, vl); // Apply constant term - vint32m4_t v_mul_term = __riscv_vsmul_vx_i32m4( - v_poly_sum, s_exp_neg_1_8_q0_31, __RISCV_VXRM_RNU, vl); - vint32m4_t v_current_result = - __riscv_vadd_vx_i32m4(v_mul_term, s_exp_neg_1_8_q0_31, vl); + vint32m2_t v_mul_term = __riscv_vsmul_vx_i32m2(v_poly_sum, s_exp_neg_1_8_q0_31, 
__RISCV_VXRM_RNU, vl); + vint32m2_t v_current_result = __riscv_vadd_vx_i32m2(v_mul_term, s_exp_neg_1_8_q0_31, vl); // Calculate remainder for barrel shifter - vint32m4_t v_remainder_q5_26 = - __riscv_vsub_vv_i32m4(v_a_mod_q_m_q_q5_26, v_a_q5_26, vl); + vint32m2_t v_remainder_q5_26 = __riscv_vsub_vv_i32m2(v_a_mod_q_m_q_q5_26, v_a_q5_26, vl); // Multipliers for reconstruction - const int32_t multipliers[] = {1672461947, 1302514674, 790015084, 290630308, - 39332535, 720401, 242}; + const int32_t multipliers[] = {1672461947, 1302514674, 790015084, 290630308, 39332535, 720401, 242}; // Apply barrel shifter using unrolled loop for (int i = 0; i < 7; ++i) @@ -187,21 +171,17 @@ vint32m4_t vectorized_exp_on_negative_values(vint32m4_t v_a_q5_26, size_t vl) int32_t mask = 1 << shift_amount; int32_t mult = multipliers[i]; - vint32m4_t v_rem_masked = - __riscv_vand_vx_i32m4(v_remainder_q5_26, mask, vl); - vbool8_t v_apply = __riscv_vmsne_vx_i32m4_b8(v_rem_masked, 0, vl); + vint32m2_t v_rem_masked = __riscv_vand_vx_i32m2(v_remainder_q5_26, mask, vl); + vbool16_t v_apply = __riscv_vmsne_vx_i32m2_b16(v_rem_masked, 0, vl); - vint32m4_t v_multiplied = __riscv_vsmul_vx_i32m4( - v_current_result, mult, __RISCV_VXRM_RNU, vl); - v_current_result = __riscv_vmerge_vvm_i32m4( - v_current_result, v_multiplied, v_apply, vl); + vint32m2_t v_multiplied = __riscv_vsmul_vx_i32m2(v_current_result, mult, __RISCV_VXRM_RNU, vl); + v_current_result = __riscv_vmerge_vvm_i32m2(v_current_result, v_multiplied, v_apply, vl); } } // Handle zero input case - vbool8_t v_zero_mask = __riscv_vmseq_vx_i32m4_b8(v_a_q5_26, 0, vl); - return __riscv_vmerge_vxm_i32m4(v_current_result, s_result_one_q0_31, - v_zero_mask, vl); + vbool16_t v_zero_mask = __riscv_vmseq_vx_i32m2_b16(v_a_q5_26, 0, vl); + return __riscv_vmerge_vxm_i32m2(v_current_result, s_result_one_q0_31, v_zero_mask, vl); } template @@ -217,16 +197,13 @@ void SoftmaxRVV(const tflite::SoftmaxParams& params, // Define fixed-point constants static const int 
kAccumulationIntegerBits = 12; - static const int kAccumulationFractionalBits = - 32 - 1 - kAccumulationIntegerBits; + static const int kAccumulationFractionalBits = 32 - 1 - kAccumulationIntegerBits; static const int kExpOutputFractionalBits = 31; // Extract shape dimensions const int trailing_dim = input_shape.DimensionsCount() - 1; - const int outer_size = tflite::MatchingFlatSizeSkipDim( - input_shape, trailing_dim, output_shape); - const int depth = tflite::MatchingDim(input_shape, trailing_dim, - output_shape, trailing_dim); + const int outer_size = tflite::MatchingFlatSizeSkipDim(input_shape, trailing_dim, output_shape); + const int depth = tflite::MatchingDim(input_shape, trailing_dim, output_shape, trailing_dim); const size_t depth_sz = static_cast(depth); // Loop over outer dimensions @@ -241,24 +218,19 @@ void SoftmaxRVV(const tflite::SoftmaxParams& params, size_t n = depth_sz; while (n > 0) { + // Keep m1 for Max finding (low register pressure) size_t vl = __riscv_vsetvl_e8m1(n); if constexpr (std::is_signed_v) { - vint8m1_t v_in = __riscv_vle8_v_i8m1( - reinterpret_cast(ptr_max), vl); - vint8m1_t v_red = __riscv_vredmax_vs_i8m1_i8m1( - v_in, __riscv_vmv_v_x_i8m1(max_in_row, vl), vl); - max_in_row = - std::max(max_in_row, __riscv_vmv_x_s_i8m1_i8(v_red)); + vint8m1_t v_in = __riscv_vle8_v_i8m1(reinterpret_cast(ptr_max), vl); + vint8m1_t v_red = __riscv_vredmax_vs_i8m1_i8m1(v_in, __riscv_vmv_v_x_i8m1(max_in_row, vl), vl); + max_in_row = std::max(max_in_row, __riscv_vmv_x_s_i8m1_i8(v_red)); } else { - vuint8m1_t v_in = __riscv_vle8_v_u8m1( - reinterpret_cast(ptr_max), vl); - vuint8m1_t v_red = __riscv_vredmaxu_vs_u8m1_u8m1( - v_in, __riscv_vmv_v_x_u8m1(max_in_row, vl), vl); - max_in_row = std::max(max_in_row, - (InputT)__riscv_vmv_x_s_u8m1_u8(v_red)); + vuint8m1_t v_in = __riscv_vle8_v_u8m1(reinterpret_cast(ptr_max), vl); + vuint8m1_t v_red = __riscv_vredmaxu_vs_u8m1_u8m1(v_in, __riscv_vmv_v_x_u8m1(max_in_row, vl), vl); + max_in_row = std::max(max_in_row, 
(InputT)__riscv_vmv_x_s_u8m1_u8(v_red)); } ptr_max += vl; n -= vl; @@ -271,53 +243,45 @@ void SoftmaxRVV(const tflite::SoftmaxParams& params, while (current_c < depth_sz) { - size_t vl = __riscv_vsetvl_e32m4(depth_sz - current_c); + // OPT: Use m2 to reduce register pressure in the exp() call + size_t vl = __riscv_vsetvl_e32m2(depth_sz - current_c); // Load and widen input without 64-bit instructions - vint32m4_t v_input_s32; + vint32m2_t v_input_s32; if constexpr (std::is_signed_v) { - vint8m1_t v_in = __riscv_vle8_v_i8m1( - reinterpret_cast(current_input_data + - current_c), - vl); - vint16m2_t v_in_16 = __riscv_vsext_vf2_i16m2(v_in, vl); - v_input_s32 = __riscv_vsext_vf2_i32m4(v_in_16, vl); + // Load mf2 (8-bit) matches m2 (32-bit) element count + vint8mf2_t v_in = __riscv_vle8_v_i8mf2(reinterpret_cast(current_input_data + current_c), vl); + vint16m1_t v_in_16 = __riscv_vsext_vf2_i16m1(v_in, vl); + v_input_s32 = __riscv_vsext_vf2_i32m2(v_in_16, vl); } else { - vuint8m1_t v_in = __riscv_vle8_v_u8m1( - reinterpret_cast(current_input_data + - current_c), - vl); - vuint16m2_t v_in_16 = __riscv_vzext_vf2_u16m2(v_in, vl); - vuint32m4_t v_in_32 = __riscv_vzext_vf2_u32m4(v_in_16, vl); - v_input_s32 = __riscv_vreinterpret_v_u32m4_i32m4(v_in_32); + vuint8mf2_t v_in = __riscv_vle8_v_u8mf2(reinterpret_cast(current_input_data + current_c), vl); + vuint16m1_t v_in_16 = __riscv_vzext_vf2_u16m1(v_in, vl); + vuint32m2_t v_in_32 = __riscv_vzext_vf2_u32m2(v_in_16, vl); + v_input_s32 = __riscv_vreinterpret_v_u32m2_i32m2(v_in_32); } // Calculate difference from max - vint32m4_t v_diff = - __riscv_vsub_vx_i32m4(v_input_s32, max_in_row_s32, vl); - vbool8_t v_mask = __riscv_vmsge_vx_i32m4_b8(v_diff, diff_min, vl); + vint32m2_t v_diff = __riscv_vsub_vx_i32m2(v_input_s32, max_in_row_s32, vl); + vbool16_t v_mask = __riscv_vmsge_vx_i32m2_b16(v_diff, diff_min, vl); // Scale difference using custom 32-bit implementation - vint32m4_t v_diff_scaled = - 
MultiplyByQuantizedMultiplierGreaterThanOne_32bit_vx_i32m4( + vint32m2_t v_diff_scaled = MultiplyByQuantizedMultiplierGreaterThanOne_32bit_vx_i32m2( v_diff, input_beta_multiplier, input_beta_left_shift, vl); // Calculate exponential - vint32m4_t v_exp = vectorized_exp_on_negative_values(v_diff_scaled, vl); + vint32m2_t v_exp = vectorized_exp_on_negative_values(v_diff_scaled, vl); // Rescale result - vint32m4_t v_exp_rescaled = __riscv_vssra_vx_i32m4( - v_exp, kExpOutputFractionalBits - kAccumulationFractionalBits, - __RISCV_VXRM_RNU, vl); + vint32m2_t v_exp_rescaled = __riscv_vssra_vx_i32m2(v_exp, kExpOutputFractionalBits - kAccumulationFractionalBits, __RISCV_VXRM_RNU, vl); // Merge and accumulate - vint32m4_t v_add_val = __riscv_vmerge_vvm_i32m4( - __riscv_vmv_v_x_i32m4(0, vl), v_exp_rescaled, v_mask, vl); - v_sum_acc = - __riscv_vredsum_vs_i32m4_i32m1(v_add_val, v_sum_acc, vl); + vint32m2_t v_add_val = __riscv_vmerge_vvm_i32m2(__riscv_vmv_v_x_i32m2(0, vl), v_exp_rescaled, v_mask, vl); + + // Reduce m2 vector to scalar + v_sum_acc = __riscv_vredsum_vs_i32m2_i32m1(v_add_val, v_sum_acc, vl); current_c += vl; } @@ -325,95 +289,73 @@ void SoftmaxRVV(const tflite::SoftmaxParams& params, // Calculate reciprocal int num_bits_over_unit; - int32_t reciprocal = tflite::GetReciprocal( - sum_of_exps, kAccumulationIntegerBits, &num_bits_over_unit); + int32_t reciprocal = tflite::GetReciprocal(sum_of_exps, kAccumulationIntegerBits, &num_bits_over_unit); const int exponent = num_bits_over_unit + 31 - (sizeof(OutputT) * 8); - const int32_t output_min = - static_cast(std::numeric_limits::min()); - const int32_t output_max = - static_cast(std::numeric_limits::max()); + const int32_t output_min = static_cast(std::numeric_limits::min()); + const int32_t output_max = static_cast(std::numeric_limits::max()); // Compute final output current_c = 0; while (current_c < depth_sz) { - size_t vl = __riscv_vsetvl_e32m4(depth_sz - current_c); + // OPT: m2 + size_t vl = 
__riscv_vsetvl_e32m2(depth_sz - current_c); // Reload and widen input - vint32m4_t v_input_s32; + vint32m2_t v_input_s32; if constexpr (std::is_signed_v) { - vint8m1_t v_in = __riscv_vle8_v_i8m1( - reinterpret_cast(current_input_data + current_c), vl); - v_input_s32 = __riscv_vsext_vf2_i32m4( - __riscv_vsext_vf2_i16m2(v_in, vl), vl); + vint8mf2_t v_in = __riscv_vle8_v_i8mf2(reinterpret_cast(current_input_data + current_c), vl); + v_input_s32 = __riscv_vsext_vf2_i32m2(__riscv_vsext_vf2_i16m1(v_in, vl), vl); } else { - vuint8m1_t v_in = __riscv_vle8_v_u8m1( - reinterpret_cast(current_input_data + current_c), vl); - v_input_s32 = __riscv_vreinterpret_v_u32m4_i32m4( - __riscv_vzext_vf2_u32m4(__riscv_vzext_vf2_u16m2(v_in, vl), vl)); + vuint8mf2_t v_in = __riscv_vle8_v_u8mf2(reinterpret_cast(current_input_data + current_c), vl); + v_input_s32 = __riscv_vreinterpret_v_u32m2_i32m2(__riscv_vzext_vf2_u32m2(__riscv_vzext_vf2_u16m1(v_in, vl), vl)); } // Recompute difference and mask - vint32m4_t v_diff = - __riscv_vsub_vx_i32m4(v_input_s32, max_in_row_s32, vl); - vbool8_t v_mask = __riscv_vmsge_vx_i32m4_b8(v_diff, diff_min, vl); + vint32m2_t v_diff = __riscv_vsub_vx_i32m2(v_input_s32, max_in_row_s32, vl); + vbool16_t v_mask = __riscv_vmsge_vx_i32m2_b16(v_diff, diff_min, vl); // Scale and exponentiate - vint32m4_t v_diff_scaled = - MultiplyByQuantizedMultiplierGreaterThanOne_32bit_vx_i32m4( + vint32m2_t v_diff_scaled = MultiplyByQuantizedMultiplierGreaterThanOne_32bit_vx_i32m2( v_diff, input_beta_multiplier, input_beta_left_shift, vl); - vint32m4_t v_exp = vectorized_exp_on_negative_values(v_diff_scaled, vl); + vint32m2_t v_exp = vectorized_exp_on_negative_values(v_diff_scaled, vl); // Multiply by reciprocal using 32-bit saturating multiply - vint32m4_t v_prod = __riscv_vsmul_vx_i32m4(v_exp, reciprocal, - __RISCV_VXRM_RNU, vl); + vint32m2_t v_prod = __riscv_vsmul_vx_i32m2(v_exp, reciprocal, __RISCV_VXRM_RNU, vl); // Perform final shift and add offset - vint32m4_t v_out_shifted 
= __riscv_vssra_vx_i32m4( - v_prod, exponent, __RISCV_VXRM_RNU, vl); - vint32m4_t v_out_final = - __riscv_vadd_vx_i32m4(v_out_shifted, output_min, vl); + vint32m2_t v_out_shifted = __riscv_vssra_vx_i32m2(v_prod, exponent, __RISCV_VXRM_RNU, vl); + vint32m2_t v_out_final = __riscv_vadd_vx_i32m2(v_out_shifted, output_min, vl); // Clamp result - v_out_final = __riscv_vmax_vx_i32m4(v_out_final, output_min, vl); - v_out_final = __riscv_vmin_vx_i32m4(v_out_final, output_max, vl); + v_out_final = __riscv_vmax_vx_i32m2(v_out_final, output_min, vl); + v_out_final = __riscv_vmin_vx_i32m2(v_out_final, output_max, vl); // Apply mask using vector merge - v_out_final = __riscv_vmerge_vvm_i32m4( - __riscv_vmv_v_x_i32m4(output_min, vl), v_out_final, v_mask, vl); + v_out_final = __riscv_vmerge_vvm_i32m2(__riscv_vmv_v_x_i32m2(output_min, vl), v_out_final, v_mask, vl); // Narrow and store result if constexpr (sizeof(OutputT) == 1) { if constexpr (std::is_signed_v) { - vint8m1_t v_store = __riscv_vncvt_x_x_w_i8m1( - __riscv_vncvt_x_x_w_i16m2(v_out_final, vl), vl); - __riscv_vse8_v_i8m1(reinterpret_cast( - current_output_data + current_c), - v_store, vl); + // Narrow: m2 -> m1 -> mf2 + vint8mf2_t v_store = __riscv_vncvt_x_x_w_i8mf2(__riscv_vncvt_x_x_w_i16m1(v_out_final, vl), vl); + __riscv_vse8_v_i8mf2(reinterpret_cast(current_output_data + current_c), v_store, vl); } else { - vuint8m1_t v_store = __riscv_vncvt_x_x_w_u8m1( - __riscv_vncvt_x_x_w_u16m2( - __riscv_vreinterpret_v_i32m4_u32m4(v_out_final), - vl), - vl); - __riscv_vse8_v_u8m1(reinterpret_cast( - current_output_data + current_c), - v_store, vl); + vuint8mf2_t v_store = __riscv_vncvt_x_x_w_u8mf2(__riscv_vncvt_x_x_w_u16m1(__riscv_vreinterpret_v_i32m2_u32m2(v_out_final), vl), vl); + __riscv_vse8_v_u8mf2(reinterpret_cast(current_output_data + current_c), v_store, vl); } } else { - vint16m2_t v_store = __riscv_vncvt_x_x_w_i16m2(v_out_final, vl); - __riscv_vse16_v_i16m2( - reinterpret_cast(current_output_data + current_c), - 
v_store, vl); + vint16m1_t v_store = __riscv_vncvt_x_x_w_i16m1(v_out_final, vl); + __riscv_vse16_v_i16m1(reinterpret_cast(current_output_data + current_c), v_store, vl); } current_c += vl; } From e694a1d03d50bd310f00860ae616cbf2f0e18f1c Mon Sep 17 00:00:00 2001 From: JaimeHW Date: Fri, 21 Nov 2025 09:41:31 -0600 Subject: [PATCH 76/86] RFFT: Switch to LMUL=2 to reduce register pressure --- .../riscv_vector/signal/rfft_int16_rvv.cc | 543 +++++++++--------- 1 file changed, 272 insertions(+), 271 deletions(-) diff --git a/tensorflow/lite/micro/kernels/riscv_vector/signal/rfft_int16_rvv.cc b/tensorflow/lite/micro/kernels/riscv_vector/signal/rfft_int16_rvv.cc index 269e55d446f..740730a02a8 100644 --- a/tensorflow/lite/micro/kernels/riscv_vector/signal/rfft_int16_rvv.cc +++ b/tensorflow/lite/micro/kernels/riscv_vector/signal/rfft_int16_rvv.cc @@ -40,74 +40,74 @@ static void kf_bfly2_rvv(kiss_fft_fixed16::kiss_fft_cpx* Fout, size_t k = 0; while (k < m) { - // Set the vector length for this iteration - size_t vl = __riscv_vsetvl_e16m4(m - k); + // Set the vector length for this iteration (LMUL=2) + size_t vl = __riscv_vsetvl_e16m2(m - k); // Load input data vectors - vint16m4_t v_fout_r = - __riscv_vlse16_v_i16m4(Fout_base + 2 * k, cpx_stride, vl); - vint16m4_t v_fout_i = - __riscv_vlse16_v_i16m4(Fout_base + 2 * k + 1, cpx_stride, vl); - vint16m4_t v_fout2_r = - __riscv_vlse16_v_i16m4(Fout2_base + 2 * k, cpx_stride, vl); - vint16m4_t v_fout2_i = - __riscv_vlse16_v_i16m4(Fout2_base + 2 * k + 1, cpx_stride, vl); + vint16m2_t v_fout_r = + __riscv_vlse16_v_i16m2(Fout_base + 2 * k, cpx_stride, vl); + vint16m2_t v_fout_i = + __riscv_vlse16_v_i16m2(Fout_base + 2 * k + 1, cpx_stride, vl); + vint16m2_t v_fout2_r = + __riscv_vlse16_v_i16m2(Fout2_base + 2 * k, cpx_stride, vl); + vint16m2_t v_fout2_i = + __riscv_vlse16_v_i16m2(Fout2_base + 2 * k + 1, cpx_stride, vl); // Load twiddle factor vectors - vint16m4_t v_tw_r = - __riscv_vlse16_v_i16m4(tw1_base + (k * fstride * 2), tw_stride, 
vl); - vint16m4_t v_tw_i = - __riscv_vlse16_v_i16m4(tw1_base + (k * fstride * 2) + 1, tw_stride, vl); + vint16m2_t v_tw_r = + __riscv_vlse16_v_i16m2(tw1_base + (k * fstride * 2), tw_stride, vl); + vint16m2_t v_tw_i = + __riscv_vlse16_v_i16m2(tw1_base + (k * fstride * 2) + 1, tw_stride, vl); // Perform rounding division by 2 on input data - vint32m8_t v_fout_r_32 = __riscv_vsra_vx_i32m8( - __riscv_vadd_vx_i32m8(__riscv_vwmul_vx_i32m8(v_fout_r, scale, vl), + vint32m4_t v_fout_r_32 = __riscv_vsra_vx_i32m4( + __riscv_vadd_vx_i32m4(__riscv_vwmul_vx_i32m4(v_fout_r, scale, vl), round_const, vl), 15, vl); - vint32m8_t v_fout_i_32 = __riscv_vsra_vx_i32m8( - __riscv_vadd_vx_i32m8(__riscv_vwmul_vx_i32m8(v_fout_i, scale, vl), + vint32m4_t v_fout_i_32 = __riscv_vsra_vx_i32m4( + __riscv_vadd_vx_i32m4(__riscv_vwmul_vx_i32m4(v_fout_i, scale, vl), round_const, vl), 15, vl); - vint16m4_t v_fout_r_div2 = - __riscv_vnclip_wx_i16m4(v_fout_r_32, 0, __RISCV_VXRM_RNU, vl); - vint16m4_t v_fout_i_div2 = - __riscv_vnclip_wx_i16m4(v_fout_i_32, 0, __RISCV_VXRM_RNU, vl); - vint32m8_t v_fout2_r_32 = __riscv_vsra_vx_i32m8( - __riscv_vadd_vx_i32m8(__riscv_vwmul_vx_i32m8(v_fout2_r, scale, vl), + vint16m2_t v_fout_r_div2 = + __riscv_vnclip_wx_i16m2(v_fout_r_32, 0, __RISCV_VXRM_RNU, vl); + vint16m2_t v_fout_i_div2 = + __riscv_vnclip_wx_i16m2(v_fout_i_32, 0, __RISCV_VXRM_RNU, vl); + vint32m4_t v_fout2_r_32 = __riscv_vsra_vx_i32m4( + __riscv_vadd_vx_i32m4(__riscv_vwmul_vx_i32m4(v_fout2_r, scale, vl), round_const, vl), 15, vl); - vint32m8_t v_fout2_i_32 = __riscv_vsra_vx_i32m8( - __riscv_vadd_vx_i32m8(__riscv_vwmul_vx_i32m8(v_fout2_i, scale, vl), + vint32m4_t v_fout2_i_32 = __riscv_vsra_vx_i32m4( + __riscv_vadd_vx_i32m4(__riscv_vwmul_vx_i32m4(v_fout2_i, scale, vl), round_const, vl), 15, vl); - vint16m4_t v_fout2_r_div2 = - __riscv_vnclip_wx_i16m4(v_fout2_r_32, 0, __RISCV_VXRM_RNU, vl); - vint16m4_t v_fout2_i_div2 = - __riscv_vnclip_wx_i16m4(v_fout2_i_32, 0, __RISCV_VXRM_RNU, vl); + vint16m2_t 
v_fout2_r_div2 = + __riscv_vnclip_wx_i16m2(v_fout2_r_32, 0, __RISCV_VXRM_RNU, vl); + vint16m2_t v_fout2_i_div2 = + __riscv_vnclip_wx_i16m2(v_fout2_i_32, 0, __RISCV_VXRM_RNU, vl); // Perform complex multiplication: t = Fout2 * tw - vint32m8_t v_ac = __riscv_vwmul_vv_i32m8(v_fout2_r_div2, v_tw_r, vl); - vint32m8_t v_bd = __riscv_vwmul_vv_i32m8(v_fout2_i_div2, v_tw_i, vl); - vint32m8_t v_ad = __riscv_vwmul_vv_i32m8(v_fout2_r_div2, v_tw_i, vl); - vint32m8_t v_bc = __riscv_vwmul_vv_i32m8(v_fout2_i_div2, v_tw_r, vl); - vint32m8_t v_t_r_32 = __riscv_vssra_vx_i32m8( - __riscv_vsub_vv_i32m8(v_ac, v_bd, vl), 15, __RISCV_VXRM_RNU, vl); - vint32m8_t v_t_i_32 = __riscv_vssra_vx_i32m8( - __riscv_vadd_vv_i32m8(v_ad, v_bc, vl), 15, __RISCV_VXRM_RNU, vl); - vint16m4_t v_t_r = __riscv_vnclip_wx_i16m4(v_t_r_32, 0, __RISCV_VXRM_RNU, vl); - vint16m4_t v_t_i = __riscv_vnclip_wx_i16m4(v_t_i_32, 0, __RISCV_VXRM_RNU, vl); + vint32m4_t v_ac = __riscv_vwmul_vv_i32m4(v_fout2_r_div2, v_tw_r, vl); + vint32m4_t v_bd = __riscv_vwmul_vv_i32m4(v_fout2_i_div2, v_tw_i, vl); + vint32m4_t v_ad = __riscv_vwmul_vv_i32m4(v_fout2_r_div2, v_tw_i, vl); + vint32m4_t v_bc = __riscv_vwmul_vv_i32m4(v_fout2_i_div2, v_tw_r, vl); + vint32m4_t v_t_r_32 = __riscv_vssra_vx_i32m4( + __riscv_vsub_vv_i32m4(v_ac, v_bd, vl), 15, __RISCV_VXRM_RNU, vl); + vint32m4_t v_t_i_32 = __riscv_vssra_vx_i32m4( + __riscv_vadd_vv_i32m4(v_ad, v_bc, vl), 15, __RISCV_VXRM_RNU, vl); + vint16m2_t v_t_r = __riscv_vnclip_wx_i16m2(v_t_r_32, 0, __RISCV_VXRM_RNU, vl); + vint16m2_t v_t_i = __riscv_vnclip_wx_i16m2(v_t_i_32, 0, __RISCV_VXRM_RNU, vl); // Calculate butterfly outputs: Fout = Fout + t and Fout2 = Fout - t - vint16m4_t v_res_fout2_r = __riscv_vsub_vv_i16m4(v_fout_r_div2, v_t_r, vl); - vint16m4_t v_res_fout2_i = __riscv_vsub_vv_i16m4(v_fout_i_div2, v_t_i, vl); - vint16m4_t v_res_fout_r = __riscv_vadd_vv_i16m4(v_fout_r_div2, v_t_r, vl); - vint16m4_t v_res_fout_i = __riscv_vadd_vv_i16m4(v_fout_i_div2, v_t_i, vl); + vint16m2_t v_res_fout2_r 
= __riscv_vsub_vv_i16m2(v_fout_r_div2, v_t_r, vl); + vint16m2_t v_res_fout2_i = __riscv_vsub_vv_i16m2(v_fout_i_div2, v_t_i, vl); + vint16m2_t v_res_fout_r = __riscv_vadd_vv_i16m2(v_fout_r_div2, v_t_r, vl); + vint16m2_t v_res_fout_i = __riscv_vadd_vv_i16m2(v_fout_i_div2, v_t_i, vl); // Store results - __riscv_vsse16_v_i16m4(Fout_base + 2 * k, cpx_stride, v_res_fout_r, vl); - __riscv_vsse16_v_i16m4(Fout_base + 2 * k + 1, cpx_stride, v_res_fout_i, vl); - __riscv_vsse16_v_i16m4(Fout2_base + 2 * k, cpx_stride, v_res_fout2_r, vl); - __riscv_vsse16_v_i16m4(Fout2_base + 2 * k + 1, cpx_stride, v_res_fout2_i, vl); + __riscv_vsse16_v_i16m2(Fout_base + 2 * k, cpx_stride, v_res_fout_r, vl); + __riscv_vsse16_v_i16m2(Fout_base + 2 * k + 1, cpx_stride, v_res_fout_i, vl); + __riscv_vsse16_v_i16m2(Fout2_base + 2 * k, cpx_stride, v_res_fout2_r, vl); + __riscv_vsse16_v_i16m2(Fout2_base + 2 * k + 1, cpx_stride, v_res_fout2_i, vl); // Advance loop counter k += vl; @@ -141,175 +141,175 @@ static void kf_bfly4_rvv(kiss_fft_fixed16::kiss_fft_cpx* Fout, size_t k = 0; while (k < m) { - // Set the vector length for this iteration - size_t vl = __riscv_vsetvl_e16m2(m - k); + // Set the vector length for this iteration (LMUL=1) + size_t vl = __riscv_vsetvl_e16m1(m - k); // Load input data vectors - vint16m2_t v_f0_r = - __riscv_vlse16_v_i16m2(Fout0_base + 2 * k, cpx_stride, vl); - vint16m2_t v_f0_i = - __riscv_vlse16_v_i16m2(Fout0_base + 2 * k + 1, cpx_stride, vl); - vint16m2_t v_f1_r = - __riscv_vlse16_v_i16m2(Fout1_base + 2 * k, cpx_stride, vl); - vint16m2_t v_f1_i = - __riscv_vlse16_v_i16m2(Fout1_base + 2 * k + 1, cpx_stride, vl); - vint16m2_t v_f2_r = - __riscv_vlse16_v_i16m2(Fout2_base + 2 * k, cpx_stride, vl); - vint16m2_t v_f2_i = - __riscv_vlse16_v_i16m2(Fout2_base + 2 * k + 1, cpx_stride, vl); - vint16m2_t v_f3_r = - __riscv_vlse16_v_i16m2(Fout3_base + 2 * k, cpx_stride, vl); - vint16m2_t v_f3_i = - __riscv_vlse16_v_i16m2(Fout3_base + 2 * k + 1, cpx_stride, vl); + vint16m1_t v_f0_r = + 
__riscv_vlse16_v_i16m1(Fout0_base + 2 * k, cpx_stride, vl); + vint16m1_t v_f0_i = + __riscv_vlse16_v_i16m1(Fout0_base + 2 * k + 1, cpx_stride, vl); + vint16m1_t v_f1_r = + __riscv_vlse16_v_i16m1(Fout1_base + 2 * k, cpx_stride, vl); + vint16m1_t v_f1_i = + __riscv_vlse16_v_i16m1(Fout1_base + 2 * k + 1, cpx_stride, vl); + vint16m1_t v_f2_r = + __riscv_vlse16_v_i16m1(Fout2_base + 2 * k, cpx_stride, vl); + vint16m1_t v_f2_i = + __riscv_vlse16_v_i16m1(Fout2_base + 2 * k + 1, cpx_stride, vl); + vint16m1_t v_f3_r = + __riscv_vlse16_v_i16m1(Fout3_base + 2 * k, cpx_stride, vl); + vint16m1_t v_f3_i = + __riscv_vlse16_v_i16m1(Fout3_base + 2 * k + 1, cpx_stride, vl); // Perform rounding division by 4 on input data - vint16m2_t v_f0d_r = __riscv_vnclip_wx_i16m2( - __riscv_vsra_vx_i32m4( - __riscv_vadd_vx_i32m4( - __riscv_vwmul_vx_i32m4(v_f0_r, scale, vl), round_const, vl), + vint16m1_t v_f0d_r = __riscv_vnclip_wx_i16m1( + __riscv_vsra_vx_i32m2( + __riscv_vadd_vx_i32m2( + __riscv_vwmul_vx_i32m2(v_f0_r, scale, vl), round_const, vl), 15, vl), 0, __RISCV_VXRM_RNU, vl); - vint16m2_t v_f0d_i = __riscv_vnclip_wx_i16m2( - __riscv_vsra_vx_i32m4( - __riscv_vadd_vx_i32m4( - __riscv_vwmul_vx_i32m4(v_f0_i, scale, vl), round_const, vl), + vint16m1_t v_f0d_i = __riscv_vnclip_wx_i16m1( + __riscv_vsra_vx_i32m2( + __riscv_vadd_vx_i32m2( + __riscv_vwmul_vx_i32m2(v_f0_i, scale, vl), round_const, vl), 15, vl), 0, __RISCV_VXRM_RNU, vl); - vint16m2_t v_f1d_r = __riscv_vnclip_wx_i16m2( - __riscv_vsra_vx_i32m4( - __riscv_vadd_vx_i32m4( - __riscv_vwmul_vx_i32m4(v_f1_r, scale, vl), round_const, vl), + vint16m1_t v_f1d_r = __riscv_vnclip_wx_i16m1( + __riscv_vsra_vx_i32m2( + __riscv_vadd_vx_i32m2( + __riscv_vwmul_vx_i32m2(v_f1_r, scale, vl), round_const, vl), 15, vl), 0, __RISCV_VXRM_RNU, vl); - vint16m2_t v_f1d_i = __riscv_vnclip_wx_i16m2( - __riscv_vsra_vx_i32m4( - __riscv_vadd_vx_i32m4( - __riscv_vwmul_vx_i32m4(v_f1_i, scale, vl), round_const, vl), + vint16m1_t v_f1d_i = __riscv_vnclip_wx_i16m1( + 
__riscv_vsra_vx_i32m2( + __riscv_vadd_vx_i32m2( + __riscv_vwmul_vx_i32m2(v_f1_i, scale, vl), round_const, vl), 15, vl), 0, __RISCV_VXRM_RNU, vl); - vint16m2_t v_f2d_r = __riscv_vnclip_wx_i16m2( - __riscv_vsra_vx_i32m4( - __riscv_vadd_vx_i32m4( - __riscv_vwmul_vx_i32m4(v_f2_r, scale, vl), round_const, vl), + vint16m1_t v_f2d_r = __riscv_vnclip_wx_i16m1( + __riscv_vsra_vx_i32m2( + __riscv_vadd_vx_i32m2( + __riscv_vwmul_vx_i32m2(v_f2_r, scale, vl), round_const, vl), 15, vl), 0, __RISCV_VXRM_RNU, vl); - vint16m2_t v_f2d_i = __riscv_vnclip_wx_i16m2( - __riscv_vsra_vx_i32m4( - __riscv_vadd_vx_i32m4( - __riscv_vwmul_vx_i32m4(v_f2_i, scale, vl), round_const, vl), + vint16m1_t v_f2d_i = __riscv_vnclip_wx_i16m1( + __riscv_vsra_vx_i32m2( + __riscv_vadd_vx_i32m2( + __riscv_vwmul_vx_i32m2(v_f2_i, scale, vl), round_const, vl), 15, vl), 0, __RISCV_VXRM_RNU, vl); - vint16m2_t v_f3d_r = __riscv_vnclip_wx_i16m2( - __riscv_vsra_vx_i32m4( - __riscv_vadd_vx_i32m4( - __riscv_vwmul_vx_i32m4(v_f3_r, scale, vl), round_const, vl), + vint16m1_t v_f3d_r = __riscv_vnclip_wx_i16m1( + __riscv_vsra_vx_i32m2( + __riscv_vadd_vx_i32m2( + __riscv_vwmul_vx_i32m2(v_f3_r, scale, vl), round_const, vl), 15, vl), 0, __RISCV_VXRM_RNU, vl); - vint16m2_t v_f3d_i = __riscv_vnclip_wx_i16m2( - __riscv_vsra_vx_i32m4( - __riscv_vadd_vx_i32m4( - __riscv_vwmul_vx_i32m4(v_f3_i, scale, vl), round_const, vl), + vint16m1_t v_f3d_i = __riscv_vnclip_wx_i16m1( + __riscv_vsra_vx_i32m2( + __riscv_vadd_vx_i32m2( + __riscv_vwmul_vx_i32m2(v_f3_i, scale, vl), round_const, vl), 15, vl), 0, __RISCV_VXRM_RNU, vl); // Load twiddle factor vectors - vint16m2_t v_tw1_r = - __riscv_vlse16_v_i16m2(tw_base + (k * fstride * 2), tw1_stride, vl); - vint16m2_t v_tw1_i = - __riscv_vlse16_v_i16m2(tw_base + (k * fstride * 2) + 1, tw1_stride, vl); - vint16m2_t v_tw2_r = - __riscv_vlse16_v_i16m2(tw_base + (k * fstride * 4), tw2_stride, vl); - vint16m2_t v_tw2_i = - __riscv_vlse16_v_i16m2(tw_base + (k * fstride * 4) + 1, tw2_stride, vl); - 
vint16m2_t v_tw3_r = - __riscv_vlse16_v_i16m2(tw_base + (k * fstride * 6), tw3_stride, vl); - vint16m2_t v_tw3_i = - __riscv_vlse16_v_i16m2(tw_base + (k * fstride * 6) + 1, tw3_stride, vl); + vint16m1_t v_tw1_r = + __riscv_vlse16_v_i16m1(tw_base + (k * fstride * 2), tw1_stride, vl); + vint16m1_t v_tw1_i = + __riscv_vlse16_v_i16m1(tw_base + (k * fstride * 2) + 1, tw1_stride, vl); + vint16m1_t v_tw2_r = + __riscv_vlse16_v_i16m1(tw_base + (k * fstride * 4), tw2_stride, vl); + vint16m1_t v_tw2_i = + __riscv_vlse16_v_i16m1(tw_base + (k * fstride * 4) + 1, tw2_stride, vl); + vint16m1_t v_tw3_r = + __riscv_vlse16_v_i16m1(tw_base + (k * fstride * 6), tw3_stride, vl); + vint16m1_t v_tw3_i = + __riscv_vlse16_v_i16m1(tw_base + (k * fstride * 6) + 1, tw3_stride, vl); // Perform complex multiplications - vint16m2_t v_s0_r, v_s0_i, v_s1_r, v_s1_i, v_s2_r, v_s2_i; + vint16m1_t v_s0_r, v_s0_i, v_s1_r, v_s1_i, v_s2_r, v_s2_i; do { - vint32m4_t ac = __riscv_vwmul_vv_i32m4(v_f1d_r, v_tw1_r, vl); - vint32m4_t bd = __riscv_vwmul_vv_i32m4(v_f1d_i, v_tw1_i, vl); - vint32m4_t ad = __riscv_vwmul_vv_i32m4(v_f1d_r, v_tw1_i, vl); - vint32m4_t bc = __riscv_vwmul_vv_i32m4(v_f1d_i, v_tw1_r, vl); - v_s0_r = __riscv_vnclip_wx_i16m2(__riscv_vssra_vx_i32m4( - __riscv_vsub_vv_i32m4(ac, bd, vl), 15, __RISCV_VXRM_RNU, vl), + vint32m2_t ac = __riscv_vwmul_vv_i32m2(v_f1d_r, v_tw1_r, vl); + vint32m2_t bd = __riscv_vwmul_vv_i32m2(v_f1d_i, v_tw1_i, vl); + vint32m2_t ad = __riscv_vwmul_vv_i32m2(v_f1d_r, v_tw1_i, vl); + vint32m2_t bc = __riscv_vwmul_vv_i32m2(v_f1d_i, v_tw1_r, vl); + v_s0_r = __riscv_vnclip_wx_i16m1(__riscv_vssra_vx_i32m2( + __riscv_vsub_vv_i32m2(ac, bd, vl), 15, __RISCV_VXRM_RNU, vl), 0, __RISCV_VXRM_RNU, vl); - v_s0_i = __riscv_vnclip_wx_i16m2(__riscv_vssra_vx_i32m4( - __riscv_vadd_vv_i32m4(ad, bc, vl), 15, __RISCV_VXRM_RNU, vl), + v_s0_i = __riscv_vnclip_wx_i16m1(__riscv_vssra_vx_i32m2( + __riscv_vadd_vv_i32m2(ad, bc, vl), 15, __RISCV_VXRM_RNU, vl), 0, __RISCV_VXRM_RNU, vl); } while (0); do 
{ - vint32m4_t ac = __riscv_vwmul_vv_i32m4(v_f2d_r, v_tw2_r, vl); - vint32m4_t bd = __riscv_vwmul_vv_i32m4(v_f2d_i, v_tw2_i, vl); - vint32m4_t ad = __riscv_vwmul_vv_i32m4(v_f2d_r, v_tw2_i, vl); - vint32m4_t bc = __riscv_vwmul_vv_i32m4(v_f2d_i, v_tw2_r, vl); - v_s1_r = __riscv_vnclip_wx_i16m2(__riscv_vssra_vx_i32m4( - __riscv_vsub_vv_i32m4(ac, bd, vl), 15, __RISCV_VXRM_RNU, vl), + vint32m2_t ac = __riscv_vwmul_vv_i32m2(v_f2d_r, v_tw2_r, vl); + vint32m2_t bd = __riscv_vwmul_vv_i32m2(v_f2d_i, v_tw2_i, vl); + vint32m2_t ad = __riscv_vwmul_vv_i32m2(v_f2d_r, v_tw2_i, vl); + vint32m2_t bc = __riscv_vwmul_vv_i32m2(v_f2d_i, v_tw2_r, vl); + v_s1_r = __riscv_vnclip_wx_i16m1(__riscv_vssra_vx_i32m2( + __riscv_vsub_vv_i32m2(ac, bd, vl), 15, __RISCV_VXRM_RNU, vl), 0, __RISCV_VXRM_RNU, vl); - v_s1_i = __riscv_vnclip_wx_i16m2(__riscv_vssra_vx_i32m4( - __riscv_vadd_vv_i32m4(ad, bc, vl), 15, __RISCV_VXRM_RNU, vl), + v_s1_i = __riscv_vnclip_wx_i16m1(__riscv_vssra_vx_i32m2( + __riscv_vadd_vv_i32m2(ad, bc, vl), 15, __RISCV_VXRM_RNU, vl), 0, __RISCV_VXRM_RNU, vl); } while (0); do { - vint32m4_t ac = __riscv_vwmul_vv_i32m4(v_f3d_r, v_tw3_r, vl); - vint32m4_t bd = __riscv_vwmul_vv_i32m4(v_f3d_i, v_tw3_i, vl); - vint32m4_t ad = __riscv_vwmul_vv_i32m4(v_f3d_r, v_tw3_i, vl); - vint32m4_t bc = __riscv_vwmul_vv_i32m4(v_f3d_i, v_tw3_r, vl); - v_s2_r = __riscv_vnclip_wx_i16m2(__riscv_vssra_vx_i32m4( - __riscv_vsub_vv_i32m4(ac, bd, vl), 15, __RISCV_VXRM_RNU, vl), + vint32m2_t ac = __riscv_vwmul_vv_i32m2(v_f3d_r, v_tw3_r, vl); + vint32m2_t bd = __riscv_vwmul_vv_i32m2(v_f3d_i, v_tw3_i, vl); + vint32m2_t ad = __riscv_vwmul_vv_i32m2(v_f3d_r, v_tw3_i, vl); + vint32m2_t bc = __riscv_vwmul_vv_i32m2(v_f3d_i, v_tw3_r, vl); + v_s2_r = __riscv_vnclip_wx_i16m1(__riscv_vssra_vx_i32m2( + __riscv_vsub_vv_i32m2(ac, bd, vl), 15, __RISCV_VXRM_RNU, vl), 0, __RISCV_VXRM_RNU, vl); - v_s2_i = __riscv_vnclip_wx_i16m2(__riscv_vssra_vx_i32m4( - __riscv_vadd_vv_i32m4(ad, bc, vl), 15, __RISCV_VXRM_RNU, vl), + v_s2_i = 
__riscv_vnclip_wx_i16m1(__riscv_vssra_vx_i32m2( + __riscv_vadd_vv_i32m2(ad, bc, vl), 15, __RISCV_VXRM_RNU, vl), 0, __RISCV_VXRM_RNU, vl); } while (0); // Calculate intermediate butterfly values - vint16m2_t v_s5_r = __riscv_vsub_vv_i16m2(v_f0d_r, v_s1_r, vl); - vint16m2_t v_s5_i = __riscv_vsub_vv_i16m2(v_f0d_i, v_s1_i, vl); - vint16m2_t v_f0d_plus_s1_r = __riscv_vadd_vv_i16m2(v_f0d_r, v_s1_r, vl); - vint16m2_t v_f0d_plus_s1_i = __riscv_vadd_vv_i16m2(v_f0d_i, v_s1_i, vl); - vint16m2_t v_s3_r = __riscv_vadd_vv_i16m2(v_s0_r, v_s2_r, vl); - vint16m2_t v_s3_i = __riscv_vadd_vv_i16m2(v_s0_i, v_s2_i, vl); - vint16m2_t v_s4_r = __riscv_vsub_vv_i16m2(v_s0_r, v_s2_r, vl); - vint16m2_t v_s4_i = __riscv_vsub_vv_i16m2(v_s0_i, v_s2_i, vl); - vint16m2_t v_res_f0_r = __riscv_vadd_vv_i16m2(v_f0d_plus_s1_r, v_s3_r, vl); - vint16m2_t v_res_f0_i = __riscv_vadd_vv_i16m2(v_f0d_plus_s1_i, v_s3_i, vl); - vint16m2_t v_res_f2_r = __riscv_vsub_vv_i16m2(v_f0d_plus_s1_r, v_s3_r, vl); - vint16m2_t v_res_f2_i = __riscv_vsub_vv_i16m2(v_f0d_plus_s1_i, v_s3_i, vl); + vint16m1_t v_s5_r = __riscv_vsub_vv_i16m1(v_f0d_r, v_s1_r, vl); + vint16m1_t v_s5_i = __riscv_vsub_vv_i16m1(v_f0d_i, v_s1_i, vl); + vint16m1_t v_f0d_plus_s1_r = __riscv_vadd_vv_i16m1(v_f0d_r, v_s1_r, vl); + vint16m1_t v_f0d_plus_s1_i = __riscv_vadd_vv_i16m1(v_f0d_i, v_s1_i, vl); + vint16m1_t v_s3_r = __riscv_vadd_vv_i16m1(v_s0_r, v_s2_r, vl); + vint16m1_t v_s3_i = __riscv_vadd_vv_i16m1(v_s0_i, v_s2_i, vl); + vint16m1_t v_s4_r = __riscv_vsub_vv_i16m1(v_s0_r, v_s2_r, vl); + vint16m1_t v_s4_i = __riscv_vsub_vv_i16m1(v_s0_i, v_s2_i, vl); + vint16m1_t v_res_f0_r = __riscv_vadd_vv_i16m1(v_f0d_plus_s1_r, v_s3_r, vl); + vint16m1_t v_res_f0_i = __riscv_vadd_vv_i16m1(v_f0d_plus_s1_i, v_s3_i, vl); + vint16m1_t v_res_f2_r = __riscv_vsub_vv_i16m1(v_f0d_plus_s1_r, v_s3_r, vl); + vint16m1_t v_res_f2_i = __riscv_vsub_vv_i16m1(v_f0d_plus_s1_i, v_s3_i, vl); // Calculate final results, handling inverse case - vint16m2_t v_res_f1_r, v_res_f1_i, 
v_res_f3_r, v_res_f3_i; + vint16m1_t v_res_f1_r, v_res_f1_i, v_res_f3_r, v_res_f3_i; if (st->inverse) { - v_res_f1_r = __riscv_vsub_vv_i16m2(v_s5_r, v_s4_i, vl); - v_res_f1_i = __riscv_vadd_vv_i16m2(v_s5_i, v_s4_r, vl); - v_res_f3_r = __riscv_vadd_vv_i16m2(v_s5_r, v_s4_i, vl); - v_res_f3_i = __riscv_vsub_vv_i16m2(v_s5_i, v_s4_r, vl); + v_res_f1_r = __riscv_vsub_vv_i16m1(v_s5_r, v_s4_i, vl); + v_res_f1_i = __riscv_vadd_vv_i16m1(v_s5_i, v_s4_r, vl); + v_res_f3_r = __riscv_vadd_vv_i16m1(v_s5_r, v_s4_i, vl); + v_res_f3_i = __riscv_vsub_vv_i16m1(v_s5_i, v_s4_r, vl); } else { - v_res_f1_r = __riscv_vadd_vv_i16m2(v_s5_r, v_s4_i, vl); - v_res_f1_i = __riscv_vsub_vv_i16m2(v_s5_i, v_s4_r, vl); - v_res_f3_r = __riscv_vsub_vv_i16m2(v_s5_r, v_s4_i, vl); - v_res_f3_i = __riscv_vadd_vv_i16m2(v_s5_i, v_s4_r, vl); + v_res_f1_r = __riscv_vadd_vv_i16m1(v_s5_r, v_s4_i, vl); + v_res_f1_i = __riscv_vsub_vv_i16m1(v_s5_i, v_s4_r, vl); + v_res_f3_r = __riscv_vsub_vv_i16m1(v_s5_r, v_s4_i, vl); + v_res_f3_i = __riscv_vadd_vv_i16m1(v_s5_i, v_s4_r, vl); } // Store final results - __riscv_vsse16_v_i16m2(Fout0_base + 2 * k, cpx_stride, v_res_f0_r, vl); - __riscv_vsse16_v_i16m2(Fout0_base + 2 * k + 1, cpx_stride, v_res_f0_i, vl); - __riscv_vsse16_v_i16m2(Fout1_base + 2 * k, cpx_stride, v_res_f1_r, vl); - __riscv_vsse16_v_i16m2(Fout1_base + 2 * k + 1, cpx_stride, v_res_f1_i, vl); - __riscv_vsse16_v_i16m2(Fout2_base + 2 * k, cpx_stride, v_res_f2_r, vl); - __riscv_vsse16_v_i16m2(Fout2_base + 2 * k + 1, cpx_stride, v_res_f2_i, vl); - __riscv_vsse16_v_i16m2(Fout3_base + 2 * k, cpx_stride, v_res_f3_r, vl); - __riscv_vsse16_v_i16m2(Fout3_base + 2 * k + 1, cpx_stride, v_res_f3_i, vl); + __riscv_vsse16_v_i16m1(Fout0_base + 2 * k, cpx_stride, v_res_f0_r, vl); + __riscv_vsse16_v_i16m1(Fout0_base + 2 * k + 1, cpx_stride, v_res_f0_i, vl); + __riscv_vsse16_v_i16m1(Fout1_base + 2 * k, cpx_stride, v_res_f1_r, vl); + __riscv_vsse16_v_i16m1(Fout1_base + 2 * k + 1, cpx_stride, v_res_f1_i, vl); + 
__riscv_vsse16_v_i16m1(Fout2_base + 2 * k, cpx_stride, v_res_f2_r, vl); + __riscv_vsse16_v_i16m1(Fout2_base + 2 * k + 1, cpx_stride, v_res_f2_i, vl); + __riscv_vsse16_v_i16m1(Fout3_base + 2 * k, cpx_stride, v_res_f3_r, vl); + __riscv_vsse16_v_i16m1(Fout3_base + 2 * k + 1, cpx_stride, v_res_f3_i, vl); // Advance loop counter k += vl; @@ -337,98 +337,98 @@ static void kf_bfly3_rvv(kiss_fft_fixed16::kiss_fft_cpx* Fout, size_t k = 0; while (k < m) { - // Set the vector length for this iteration - size_t vl = __riscv_vsetvl_e16m2(m - k); + // Set the vector length for this iteration (LMUL=1) + size_t vl = __riscv_vsetvl_e16m1(m - k); // Load input data vectors - vint16m2_t v_f0_r = - __riscv_vlse16_v_i16m2(Fout0_base + 2 * k, cpx_stride, vl); - vint16m2_t v_f0_i = - __riscv_vlse16_v_i16m2(Fout0_base + 2 * k + 1, cpx_stride, vl); - vint16m2_t v_f1_r = - __riscv_vlse16_v_i16m2(Fout1_base + 2 * k, cpx_stride, vl); - vint16m2_t v_f1_i = - __riscv_vlse16_v_i16m2(Fout1_base + 2 * k + 1, cpx_stride, vl); - vint16m2_t v_f2_r = - __riscv_vlse16_v_i16m2(Fout2_base + 2 * k, cpx_stride, vl); - vint16m2_t v_f2_i = - __riscv_vlse16_v_i16m2(Fout2_base + 2 * k + 1, cpx_stride, vl); + vint16m1_t v_f0_r = + __riscv_vlse16_v_i16m1(Fout0_base + 2 * k, cpx_stride, vl); + vint16m1_t v_f0_i = + __riscv_vlse16_v_i16m1(Fout0_base + 2 * k + 1, cpx_stride, vl); + vint16m1_t v_f1_r = + __riscv_vlse16_v_i16m1(Fout1_base + 2 * k, cpx_stride, vl); + vint16m1_t v_f1_i = + __riscv_vlse16_v_i16m1(Fout1_base + 2 * k + 1, cpx_stride, vl); + vint16m1_t v_f2_r = + __riscv_vlse16_v_i16m1(Fout2_base + 2 * k, cpx_stride, vl); + vint16m1_t v_f2_i = + __riscv_vlse16_v_i16m1(Fout2_base + 2 * k + 1, cpx_stride, vl); // Load twiddle factor vectors - vint16m2_t v_tw1_r = - __riscv_vlse16_v_i16m2(tw1_base + (k * fstride * 2), tw1_stride, vl); - vint16m2_t v_tw1_i = - __riscv_vlse16_v_i16m2(tw1_base + (k * fstride * 2) + 1, tw1_stride, vl); - vint16m2_t v_tw2_r = - __riscv_vlse16_v_i16m2(tw2_base + (k * fstride * 4), 
tw2_stride, vl); - vint16m2_t v_tw2_i = - __riscv_vlse16_v_i16m2(tw2_base + (k * fstride * 4) + 1, tw2_stride, vl); + vint16m1_t v_tw1_r = + __riscv_vlse16_v_i16m1(tw1_base + (k * fstride * 2), tw1_stride, vl); + vint16m1_t v_tw1_i = + __riscv_vlse16_v_i16m1(tw1_base + (k * fstride * 2) + 1, tw1_stride, vl); + vint16m1_t v_tw2_r = + __riscv_vlse16_v_i16m1(tw2_base + (k * fstride * 4), tw2_stride, vl); + vint16m1_t v_tw2_i = + __riscv_vlse16_v_i16m1(tw2_base + (k * fstride * 4) + 1, tw2_stride, vl); // Perform complex multiplications: v_s0 = v_f1 * v_tw1 - vint32m4_t v_ac0 = __riscv_vwmul_vv_i32m4(v_f1_r, v_tw1_r, vl); - vint32m4_t v_bd0 = __riscv_vwmul_vv_i32m4(v_f1_i, v_tw1_i, vl); - vint32m4_t v_ad0 = __riscv_vwmul_vv_i32m4(v_f1_r, v_tw1_i, vl); - vint32m4_t v_bc0 = __riscv_vwmul_vv_i32m4(v_f1_i, v_tw1_r, vl); - vint16m2_t v_s0_r = __riscv_vnclip_wx_i16m2( - __riscv_vssra_vx_i32m4(__riscv_vsub_vv_i32m4(v_ac0, v_bd0, vl), 15, + vint32m2_t v_ac0 = __riscv_vwmul_vv_i32m2(v_f1_r, v_tw1_r, vl); + vint32m2_t v_bd0 = __riscv_vwmul_vv_i32m2(v_f1_i, v_tw1_i, vl); + vint32m2_t v_ad0 = __riscv_vwmul_vv_i32m2(v_f1_r, v_tw1_i, vl); + vint32m2_t v_bc0 = __riscv_vwmul_vv_i32m2(v_f1_i, v_tw1_r, vl); + vint16m1_t v_s0_r = __riscv_vnclip_wx_i16m1( + __riscv_vssra_vx_i32m2(__riscv_vsub_vv_i32m2(v_ac0, v_bd0, vl), 15, __RISCV_VXRM_RNU, vl), 0, __RISCV_VXRM_RNU, vl); - vint16m2_t v_s0_i = __riscv_vnclip_wx_i16m2( - __riscv_vssra_vx_i32m4(__riscv_vadd_vv_i32m4(v_ad0, v_bc0, vl), 15, + vint16m1_t v_s0_i = __riscv_vnclip_wx_i16m1( + __riscv_vssra_vx_i32m2(__riscv_vadd_vv_i32m2(v_ad0, v_bc0, vl), 15, __RISCV_VXRM_RNU, vl), 0, __RISCV_VXRM_RNU, vl); - // Perform complex multiplications: v_s1 = v_f2 * v_tw2 - vint32m4_t v_ac1 = __riscv_vwmul_vv_i32m4(v_f2_r, v_tw2_r, vl); - vint32m4_t v_bd1 = __riscv_vwmul_vv_i32m4(v_f2_i, v_tw2_i, vl); - vint32m4_t v_ad1 = __riscv_vwmul_vv_i32m4(v_f2_r, v_tw2_i, vl); - vint32m4_t v_bc1 = __riscv_vwmul_vv_i32m4(v_f2_i, v_tw2_r, vl); - vint16m2_t v_s1_r = 
__riscv_vnclip_wx_i16m2( - __riscv_vssra_vx_i32m4(__riscv_vsub_vv_i32m4(v_ac1, v_bd1, vl), 15, + // Perform complex multiplications + vint32m2_t v_ac1 = __riscv_vwmul_vv_i32m2(v_f2_r, v_tw2_r, vl); + vint32m2_t v_bd1 = __riscv_vwmul_vv_i32m2(v_f2_i, v_tw2_i, vl); + vint32m2_t v_ad1 = __riscv_vwmul_vv_i32m2(v_f2_r, v_tw2_i, vl); + vint32m2_t v_bc1 = __riscv_vwmul_vv_i32m2(v_f2_i, v_tw2_r, vl); + vint16m1_t v_s1_r = __riscv_vnclip_wx_i16m1( + __riscv_vssra_vx_i32m2(__riscv_vsub_vv_i32m2(v_ac1, v_bd1, vl), 15, __RISCV_VXRM_RNU, vl), 0, __RISCV_VXRM_RNU, vl); - vint16m2_t v_s1_i = __riscv_vnclip_wx_i16m2( - __riscv_vssra_vx_i32m4(__riscv_vadd_vv_i32m4(v_ad1, v_bc1, vl), 15, + vint16m1_t v_s1_i = __riscv_vnclip_wx_i16m1( + __riscv_vssra_vx_i32m2(__riscv_vadd_vv_i32m2(v_ad1, v_bc1, vl), 15, __RISCV_VXRM_RNU, vl), 0, __RISCV_VXRM_RNU, vl); // Calculate intermediate butterfly values - vint16m2_t v_s_add_r = __riscv_vadd_vv_i16m2(v_s0_r, v_s1_r, vl); - vint16m2_t v_s_add_i = __riscv_vadd_vv_i16m2(v_s0_i, v_s1_i, vl); - vint16m2_t v_s_sub_r = __riscv_vsub_vv_i16m2(v_s0_r, v_s1_r, vl); - vint16m2_t v_s_sub_i = __riscv_vsub_vv_i16m2(v_s0_i, v_s1_i, vl); + vint16m1_t v_s_add_r = __riscv_vadd_vv_i16m1(v_s0_r, v_s1_r, vl); + vint16m1_t v_s_add_i = __riscv_vadd_vv_i16m1(v_s0_i, v_s1_i, vl); + vint16m1_t v_s_sub_r = __riscv_vsub_vv_i16m1(v_s0_r, v_s1_r, vl); + vint16m1_t v_s_sub_i = __riscv_vsub_vv_i16m1(v_s0_i, v_s1_i, vl); // Calculate Fout0 = Fout0 + s_add - vint16m2_t v_res_f0_r = __riscv_vadd_vv_i16m2(v_f0_r, v_s_add_r, vl); - vint16m2_t v_res_f0_i = __riscv_vadd_vv_i16m2(v_f0_i, v_s_add_i, vl); + vint16m1_t v_res_f0_r = __riscv_vadd_vv_i16m1(v_f0_r, v_s_add_r, vl); + vint16m1_t v_res_f0_i = __riscv_vadd_vv_i16m1(v_f0_i, v_s_add_i, vl); // Calculate remaining outputs using rotations - vint16m2_t v_s_add_r_neg_half = - __riscv_vneg_v_i16m2(__riscv_vsra_vx_i16m2(v_s_add_r, 1, vl), vl); - vint16m2_t v_s_add_i_neg_half = - __riscv_vneg_v_i16m2(__riscv_vsra_vx_i16m2(v_s_add_i, 1, 
vl), vl); - vint32m4_t v_s_sub_i_mul_tw3i = __riscv_vwmul_vx_i32m4(v_s_sub_i, tw3i, vl); - vint32m4_t v_s_sub_r_mul_tw3i = __riscv_vwmul_vx_i32m4(v_s_sub_r, tw3i, vl); - vint16m2_t v_s_sub_i_scaled = __riscv_vnclip_wx_i16m2( - __riscv_vssra_vx_i32m4(v_s_sub_i_mul_tw3i, 15, __RISCV_VXRM_RNU, vl), 0, + vint16m1_t v_s_add_r_neg_half = + __riscv_vneg_v_i16m1(__riscv_vsra_vx_i16m1(v_s_add_r, 1, vl), vl); + vint16m1_t v_s_add_i_neg_half = + __riscv_vneg_v_i16m1(__riscv_vsra_vx_i16m1(v_s_add_i, 1, vl), vl); + vint32m2_t v_s_sub_i_mul_tw3i = __riscv_vwmul_vx_i32m2(v_s_sub_i, tw3i, vl); + vint32m2_t v_s_sub_r_mul_tw3i = __riscv_vwmul_vx_i32m2(v_s_sub_r, tw3i, vl); + vint16m1_t v_s_sub_i_scaled = __riscv_vnclip_wx_i16m1( + __riscv_vssra_vx_i32m2(v_s_sub_i_mul_tw3i, 15, __RISCV_VXRM_RNU, vl), 0, __RISCV_VXRM_RNU, vl); - vint16m2_t v_s_sub_r_scaled = __riscv_vnclip_wx_i16m2( - __riscv_vssra_vx_i32m4(v_s_sub_r_mul_tw3i, 15, __RISCV_VXRM_RNU, vl), 0, + vint16m1_t v_s_sub_r_scaled = __riscv_vnclip_wx_i16m1( + __riscv_vssra_vx_i32m2(v_s_sub_r_mul_tw3i, 15, __RISCV_VXRM_RNU, vl), 0, __RISCV_VXRM_RNU, vl); - vint16m2_t v_tmp_r1 = __riscv_vadd_vv_i16m2(v_f0_r, v_s_add_r_neg_half, vl); - vint16m2_t v_res_f1_r = __riscv_vsub_vv_i16m2(v_tmp_r1, v_s_sub_i_scaled, vl); - vint16m2_t v_tmp_i1 = __riscv_vadd_vv_i16m2(v_f0_i, v_s_add_i_neg_half, vl); - vint16m2_t v_res_f1_i = __riscv_vadd_vv_i16m2(v_tmp_i1, v_s_sub_r_scaled, vl); - vint16m2_t v_res_f2_r = __riscv_vadd_vv_i16m2(v_tmp_r1, v_s_sub_i_scaled, vl); - vint16m2_t v_res_f2_i = __riscv_vsub_vv_i16m2(v_tmp_i1, v_s_sub_r_scaled, vl); + vint16m1_t v_tmp_r1 = __riscv_vadd_vv_i16m1(v_f0_r, v_s_add_r_neg_half, vl); + vint16m1_t v_res_f1_r = __riscv_vsub_vv_i16m1(v_tmp_r1, v_s_sub_i_scaled, vl); + vint16m1_t v_tmp_i1 = __riscv_vadd_vv_i16m1(v_f0_i, v_s_add_i_neg_half, vl); + vint16m1_t v_res_f1_i = __riscv_vadd_vv_i16m1(v_tmp_i1, v_s_sub_r_scaled, vl); + vint16m1_t v_res_f2_r = __riscv_vadd_vv_i16m1(v_tmp_r1, v_s_sub_i_scaled, vl); + 
vint16m1_t v_res_f2_i = __riscv_vsub_vv_i16m1(v_tmp_i1, v_s_sub_r_scaled, vl); // Store results - __riscv_vsse16_v_i16m2(Fout0_base + 2 * k, cpx_stride, v_res_f0_r, vl); - __riscv_vsse16_v_i16m2(Fout0_base + 2 * k + 1, cpx_stride, v_res_f0_i, vl); - __riscv_vsse16_v_i16m2(Fout1_base + 2 * k, cpx_stride, v_res_f1_r, vl); - __riscv_vsse16_v_i16m2(Fout1_base + 2 * k + 1, cpx_stride, v_res_f1_i, vl); - __riscv_vsse16_v_i16m2(Fout2_base + 2 * k, cpx_stride, v_res_f2_r, vl); - __riscv_vsse16_v_i16m2(Fout2_base + 2 * k + 1, cpx_stride, v_res_f2_i, vl); + __riscv_vsse16_v_i16m1(Fout0_base + 2 * k, cpx_stride, v_res_f0_r, vl); + __riscv_vsse16_v_i16m1(Fout0_base + 2 * k + 1, cpx_stride, v_res_f0_i, vl); + __riscv_vsse16_v_i16m1(Fout1_base + 2 * k, cpx_stride, v_res_f1_r, vl); + __riscv_vsse16_v_i16m1(Fout1_base + 2 * k + 1, cpx_stride, v_res_f1_i, vl); + __riscv_vsse16_v_i16m1(Fout2_base + 2 * k, cpx_stride, v_res_f2_r, vl); + __riscv_vsse16_v_i16m1(Fout2_base + 2 * k + 1, cpx_stride, v_res_f2_i, vl); // Advance loop counter k += vl; @@ -777,72 +777,73 @@ void kiss_fftr_rvv(kiss_fft_fixed16::kiss_fftr_cfg st, const kiss_fft_scalar* ti while (k <= loop_end) { // Set the vector length (vl) for the current iteration - size_t vl = __riscv_vsetvl_e16m4(loop_end - k + 1); + // Optimization: Reduced to m2 to prevent register spilling + size_t vl = __riscv_vsetvl_e16m2(loop_end - k + 1); // fpk indices: k, k+1, ... - vint16m4_t v_fpk_r = __riscv_vlse16_v_i16m4(&tmpbuf_base_ptr[2 * k], stride, vl); - vint16m4_t v_fpk_i = __riscv_vlse16_v_i16m4(&tmpbuf_base_ptr[2 * k + 1], stride, vl); + vint16m2_t v_fpk_r = __riscv_vlse16_v_i16m2(&tmpbuf_base_ptr[2 * k], stride, vl); + vint16m2_t v_fpk_i = __riscv_vlse16_v_i16m2(&tmpbuf_base_ptr[2 * k + 1], stride, vl); // fpnk indices: N-k, N-(k+1), ... 
const int16_t* fpnk_ptr = &tmpbuf_base_ptr[2 * (ncfft - k)]; - vint16m4_t v_fpnk_r_raw = __riscv_vlse16_v_i16m4(fpnk_ptr, neg_stride, vl); - vint16m4_t v_fpnk_i_raw = __riscv_vlse16_v_i16m4(fpnk_ptr + 1, neg_stride, vl); + vint16m2_t v_fpnk_r_raw = __riscv_vlse16_v_i16m2(fpnk_ptr, neg_stride, vl); + vint16m2_t v_fpnk_i_raw = __riscv_vlse16_v_i16m2(fpnk_ptr + 1, neg_stride, vl); // Twiddle indices: k-1, k, ... // Must use strided load to extract only Reals or only Imags from the interleaved array const int16_t* tw_ptr = &twiddles_base_ptr[2 * (k - 1)]; - vint16m4_t v_tw_r = __riscv_vlse16_v_i16m4(tw_ptr, stride, vl); - vint16m4_t v_tw_i = __riscv_vlse16_v_i16m4(tw_ptr + 1, stride, vl); + vint16m2_t v_tw_r = __riscv_vlse16_v_i16m2(tw_ptr, stride, vl); + vint16m2_t v_tw_i = __riscv_vlse16_v_i16m2(tw_ptr + 1, stride, vl); // Perform high-precision rounding division on fpk const int16_t scale = 16383; const int32_t round_const = 16384; - vint32m8_t v_fpk_r_32 = __riscv_vsra_vx_i32m8( - __riscv_vadd_vx_i32m8(__riscv_vwmul_vx_i32m8(v_fpk_r, scale, vl), round_const, vl), 15, vl); - vint32m8_t v_fpk_i_32 = __riscv_vsra_vx_i32m8( - __riscv_vadd_vx_i32m8(__riscv_vwmul_vx_i32m8(v_fpk_i, scale, vl), round_const, vl), 15, vl); - vint16m4_t v_fpk_r_div2 = __riscv_vnclip_wx_i16m4(v_fpk_r_32, 0, __RISCV_VXRM_RNU, vl); - vint16m4_t v_fpk_i_div2 = __riscv_vnclip_wx_i16m4(v_fpk_i_32, 0, __RISCV_VXRM_RNU, vl); + vint32m4_t v_fpk_r_32 = __riscv_vsra_vx_i32m4( + __riscv_vadd_vx_i32m4(__riscv_vwmul_vx_i32m4(v_fpk_r, scale, vl), round_const, vl), 15, vl); + vint32m4_t v_fpk_i_32 = __riscv_vsra_vx_i32m4( + __riscv_vadd_vx_i32m4(__riscv_vwmul_vx_i32m4(v_fpk_i, scale, vl), round_const, vl), 15, vl); + vint16m2_t v_fpk_r_div2 = __riscv_vnclip_wx_i16m2(v_fpk_r_32, 0, __RISCV_VXRM_RNU, vl); + vint16m2_t v_fpk_i_div2 = __riscv_vnclip_wx_i16m2(v_fpk_i_32, 0, __RISCV_VXRM_RNU, vl); // Perform high-precision rounding division on fpnk (with negated imaginary part) - vint16m4_t v_fpnk_i_neg = 
__riscv_vneg_v_i16m4(v_fpnk_i_raw, vl); - vint32m8_t v_fpnk_r_32 = __riscv_vsra_vx_i32m8( - __riscv_vadd_vx_i32m8(__riscv_vwmul_vx_i32m8(v_fpnk_r_raw, scale, vl), round_const, vl), 15, vl); - vint32m8_t v_fpnk_i_32 = __riscv_vsra_vx_i32m8( - __riscv_vadd_vx_i32m8(__riscv_vwmul_vx_i32m8(v_fpnk_i_neg, scale, vl), round_const, vl), 15, vl); - vint16m4_t v_fpnk_r_div2 = __riscv_vnclip_wx_i16m4(v_fpnk_r_32, 0, __RISCV_VXRM_RNU, vl); - vint16m4_t v_fpnk_i_div2 = __riscv_vnclip_wx_i16m4(v_fpnk_i_32, 0, __RISCV_VXRM_RNU, vl); + vint16m2_t v_fpnk_i_neg = __riscv_vneg_v_i16m2(v_fpnk_i_raw, vl); + vint32m4_t v_fpnk_r_32 = __riscv_vsra_vx_i32m4( + __riscv_vadd_vx_i32m4(__riscv_vwmul_vx_i32m4(v_fpnk_r_raw, scale, vl), round_const, vl), 15, vl); + vint32m4_t v_fpnk_i_32 = __riscv_vsra_vx_i32m4( + __riscv_vadd_vx_i32m4(__riscv_vwmul_vx_i32m4(v_fpnk_i_neg, scale, vl), round_const, vl), 15, vl); + vint16m2_t v_fpnk_r_div2 = __riscv_vnclip_wx_i16m2(v_fpnk_r_32, 0, __RISCV_VXRM_RNU, vl); + vint16m2_t v_fpnk_i_div2 = __riscv_vnclip_wx_i16m2(v_fpnk_i_32, 0, __RISCV_VXRM_RNU, vl); // Calculate intermediate values f1k (add) and f2k (subtract) - vint16m4_t v_f1k_r = __riscv_vadd_vv_i16m4(v_fpk_r_div2, v_fpnk_r_div2, vl); - vint16m4_t v_f1k_i = __riscv_vadd_vv_i16m4(v_fpk_i_div2, v_fpnk_i_div2, vl); - vint16m4_t v_f2k_r = __riscv_vsub_vv_i16m4(v_fpk_r_div2, v_fpnk_r_div2, vl); - vint16m4_t v_f2k_i = __riscv_vsub_vv_i16m4(v_fpk_i_div2, v_fpnk_i_div2, vl); + vint16m2_t v_f1k_r = __riscv_vadd_vv_i16m2(v_fpk_r_div2, v_fpnk_r_div2, vl); + vint16m2_t v_f1k_i = __riscv_vadd_vv_i16m2(v_fpk_i_div2, v_fpnk_i_div2, vl); + vint16m2_t v_f2k_r = __riscv_vsub_vv_i16m2(v_fpk_r_div2, v_fpnk_r_div2, vl); + vint16m2_t v_f2k_i = __riscv_vsub_vv_i16m2(v_fpk_i_div2, v_fpnk_i_div2, vl); // Perform complex multiplication - vint32m8_t v_ac = __riscv_vwmul_vv_i32m8(v_f2k_r, v_tw_r, vl); - vint32m8_t v_bd = __riscv_vwmul_vv_i32m8(v_f2k_i, v_tw_i, vl); - vint32m8_t v_ad = __riscv_vwmul_vv_i32m8(v_f2k_r, v_tw_i, vl); 
- vint32m8_t v_bc = __riscv_vwmul_vv_i32m8(v_f2k_i, v_tw_r, vl); - vint32m8_t v_tw_res_r_32 = __riscv_vssra_vx_i32m8(__riscv_vsub_vv_i32m8(v_ac, v_bd, vl), 15, __RISCV_VXRM_RNU, vl); - vint32m8_t v_tw_res_i_32 = __riscv_vssra_vx_i32m8(__riscv_vadd_vv_i32m8(v_ad, v_bc, vl), 15, __RISCV_VXRM_RNU, vl); - vint16m4_t v_tw_res_r = __riscv_vnclip_wx_i16m4(v_tw_res_r_32, 0, __RISCV_VXRM_RNU, vl); - vint16m4_t v_tw_res_i = __riscv_vnclip_wx_i16m4(v_tw_res_i_32, 0, __RISCV_VXRM_RNU, vl); + vint32m4_t v_ac = __riscv_vwmul_vv_i32m4(v_f2k_r, v_tw_r, vl); + vint32m4_t v_bd = __riscv_vwmul_vv_i32m4(v_f2k_i, v_tw_i, vl); + vint32m4_t v_ad = __riscv_vwmul_vv_i32m4(v_f2k_r, v_tw_i, vl); + vint32m4_t v_bc = __riscv_vwmul_vv_i32m4(v_f2k_i, v_tw_r, vl); + vint32m4_t v_tw_res_r_32 = __riscv_vssra_vx_i32m4(__riscv_vsub_vv_i32m4(v_ac, v_bd, vl), 15, __RISCV_VXRM_RNU, vl); + vint32m4_t v_tw_res_i_32 = __riscv_vssra_vx_i32m4(__riscv_vadd_vv_i32m4(v_ad, v_bc, vl), 15, __RISCV_VXRM_RNU, vl); + vint16m2_t v_tw_res_r = __riscv_vnclip_wx_i16m2(v_tw_res_r_32, 0, __RISCV_VXRM_RNU, vl); + vint16m2_t v_tw_res_i = __riscv_vnclip_wx_i16m2(v_tw_res_i_32, 0, __RISCV_VXRM_RNU, vl); // Calculate final output vectors - vint16m4_t v_out_k_r = __riscv_vsra_vx_i16m4(__riscv_vadd_vv_i16m4(v_f1k_r, v_tw_res_r, vl), 1, vl); - vint16m4_t v_out_k_i = __riscv_vsra_vx_i16m4(__riscv_vadd_vv_i16m4(v_f1k_i, v_tw_res_i, vl), 1, vl); - vint16m4_t v_out_nk_r = __riscv_vsra_vx_i16m4(__riscv_vsub_vv_i16m4(v_f1k_r, v_tw_res_r, vl), 1, vl); - vint16m4_t v_out_nk_i = __riscv_vsra_vx_i16m4(__riscv_vsub_vv_i16m4(v_tw_res_i, v_f1k_i, vl), 1, vl); + vint16m2_t v_out_k_r = __riscv_vsra_vx_i16m2(__riscv_vadd_vv_i16m2(v_f1k_r, v_tw_res_r, vl), 1, vl); + vint16m2_t v_out_k_i = __riscv_vsra_vx_i16m2(__riscv_vadd_vv_i16m2(v_f1k_i, v_tw_res_i, vl), 1, vl); + vint16m2_t v_out_nk_r = __riscv_vsra_vx_i16m2(__riscv_vsub_vv_i16m2(v_f1k_r, v_tw_res_r, vl), 1, vl); + vint16m2_t v_out_nk_i = 
__riscv_vsra_vx_i16m2(__riscv_vsub_vv_i16m2(v_tw_res_i, v_f1k_i, vl), 1, vl); // Store the results using a strided store (Forward) - __riscv_vsse16_v_i16m4(&freqdata_base_ptr[2 * k], stride, v_out_k_r, vl); - __riscv_vsse16_v_i16m4(&freqdata_base_ptr[2 * k + 1], stride, v_out_k_i, vl); + __riscv_vsse16_v_i16m2(&freqdata_base_ptr[2 * k], stride, v_out_k_r, vl); + __riscv_vsse16_v_i16m2(&freqdata_base_ptr[2 * k + 1], stride, v_out_k_i, vl); // Store the results using a strided store (Reverse) int16_t* out_nk_ptr = &freqdata_base_ptr[2 * (ncfft - k)]; - __riscv_vsse16_v_i16m4(out_nk_ptr, neg_stride, v_out_nk_r, vl); - __riscv_vsse16_v_i16m4(out_nk_ptr + 1, neg_stride, v_out_nk_i, vl); + __riscv_vsse16_v_i16m2(out_nk_ptr, neg_stride, v_out_nk_r, vl); + __riscv_vsse16_v_i16m2(out_nk_ptr + 1, neg_stride, v_out_nk_i, vl); // Advance to the next vector chunk k += vl; From ca9555feb27f8b109821dafe4aa50584f9b83ee8 Mon Sep 17 00:00:00 2001 From: JaimeHW Date: Mon, 5 Jan 2026 22:27:21 -0600 Subject: [PATCH 77/86] Cleanup --- .../micro/examples/micro_speech/Makefile.inc | 2 +- .../micro_speech/micro_speech_test.cc | 2 +- .../micro_speech/micro_speech_test2.cc | 172 ------------------ .../examples/person_detection/Makefile.inc | 4 +- .../person_detection/person_detection_test.cc | 2 +- .../person_detection_test2.cc | 100 ---------- .../lite/micro/testing/test_with_spike.sh | 29 --- .../make/targets/riscv32_vector_makefile.inc | 2 +- 8 files changed, 6 insertions(+), 307 deletions(-) delete mode 100644 tensorflow/lite/micro/examples/micro_speech/micro_speech_test2.cc delete mode 100644 tensorflow/lite/micro/examples/person_detection/person_detection_test2.cc delete mode 100755 tensorflow/lite/micro/testing/test_with_spike.sh diff --git a/tensorflow/lite/micro/examples/micro_speech/Makefile.inc b/tensorflow/lite/micro/examples/micro_speech/Makefile.inc index a1b5b565cf5..67ce420a6a1 100644 --- a/tensorflow/lite/micro/examples/micro_speech/Makefile.inc +++ 
b/tensorflow/lite/micro/examples/micro_speech/Makefile.inc @@ -61,4 +61,4 @@ list_micro_speech_example_sources: @echo $(MICRO_SPEECH_SRCS) list_micro_speech_example_headers: - @echo $(MICRO_SPEECH_HDRS) + @echo $(MICRO_SPEECH_HDRS) \ No newline at end of file diff --git a/tensorflow/lite/micro/examples/micro_speech/micro_speech_test.cc b/tensorflow/lite/micro/examples/micro_speech/micro_speech_test.cc index 0191958b82b..9b88efa90ea 100644 --- a/tensorflow/lite/micro/examples/micro_speech/micro_speech_test.cc +++ b/tensorflow/lite/micro/examples/micro_speech/micro_speech_test.cc @@ -277,4 +277,4 @@ TF_LITE_MICRO_TEST(NoiseTest) { g_noise_1000ms_audio_data_size); } -TF_LITE_MICRO_TESTS_END +TF_LITE_MICRO_TESTS_END \ No newline at end of file diff --git a/tensorflow/lite/micro/examples/micro_speech/micro_speech_test2.cc b/tensorflow/lite/micro/examples/micro_speech/micro_speech_test2.cc deleted file mode 100644 index 499cfb8aca1..00000000000 --- a/tensorflow/lite/micro/examples/micro_speech/micro_speech_test2.cc +++ /dev/null @@ -1,172 +0,0 @@ -#include -#include -#include -#include -#include - -#include "tensorflow/lite/core/c/common.h" -#include "tensorflow/lite/micro/examples/micro_speech/micro_model_settings.h" -#include "tensorflow/lite/micro/examples/micro_speech/models/audio_preprocessor_int8_model_data.h" -#include "tensorflow/lite/micro/examples/micro_speech/models/micro_speech_quantized_model_data.h" -#include "tensorflow/lite/micro/examples/micro_speech/testdata/no_1000ms_audio_data.h" -#include "tensorflow/lite/micro/examples/micro_speech/testdata/silence_1000ms_audio_data.h" -#include "tensorflow/lite/micro/examples/micro_speech/testdata/yes_1000ms_audio_data.h" -#include "tensorflow/lite/micro/micro_interpreter.h" -#include "tensorflow/lite/micro/micro_log.h" -#include "tensorflow/lite/micro/micro_mutable_op_resolver.h" -#include "tensorflow/lite/micro/testing/micro_test.h" - -namespace { - -// Arena size is a guesstimate, followed by use of -// 
MicroInterpreter::arena_used_bytes() on both the AudioPreprocessor and -// MicroSpeech models and using the larger of the two results. -constexpr size_t kArenaSize = 28584; -alignas(16) uint8_t g_arena[kArenaSize]; - -using Features = int8_t[kFeatureCount][kFeatureSize]; -Features g_features; - -constexpr int kAudioSampleDurationCount = - kFeatureDurationMs * kAudioSampleFrequency / 1000; -constexpr int kAudioSampleStrideCount = - kFeatureStrideMs * kAudioSampleFrequency / 1000; - -using MicroSpeechOpResolver = tflite::MicroMutableOpResolver<4>; -using AudioPreprocessorOpResolver = tflite::MicroMutableOpResolver<18>; - -// Registers the ops used by the MicroSpeech model. -TfLiteStatus RegisterOps(MicroSpeechOpResolver& op_resolver) { - TF_LITE_ENSURE_STATUS(op_resolver.AddReshape()); - TF_LITE_ENSURE_STATUS(op_resolver.AddFullyConnected()); - TF_LITE_ENSURE_STATUS(op_resolver.AddDepthwiseConv2D()); - TF_LITE_ENSURE_STATUS(op_resolver.AddSoftmax()); - return kTfLiteOk; -} - -// Registers the ops used by the AudioPreprocessor model. 
-TfLiteStatus RegisterOps(AudioPreprocessorOpResolver& op_resolver) { - TF_LITE_ENSURE_STATUS(op_resolver.AddReshape()); - TF_LITE_ENSURE_STATUS(op_resolver.AddCast()); - TF_LITE_ENSURE_STATUS(op_resolver.AddStridedSlice()); - TF_LITE_ENSURE_STATUS(op_resolver.AddConcatenation()); - TF_LITE_ENSURE_STATUS(op_resolver.AddMul()); - TF_LITE_ENSURE_STATUS(op_resolver.AddAdd()); - TF_LITE_ENSURE_STATUS(op_resolver.AddDiv()); - TF_LITE_ENSURE_STATUS(op_resolver.AddMinimum()); - TF_LITE_ENSURE_STATUS(op_resolver.AddMaximum()); - TF_LITE_ENSURE_STATUS(op_resolver.AddWindow()); - TF_LITE_ENSURE_STATUS(op_resolver.AddFftAutoScale()); - TF_LITE_ENSURE_STATUS(op_resolver.AddRfft()); - TF_LITE_ENSURE_STATUS(op_resolver.AddEnergy()); - TF_LITE_ENSURE_STATUS(op_resolver.AddFilterBank()); - TF_LITE_ENSURE_STATUS(op_resolver.AddFilterBankSquareRoot()); - TF_LITE_ENSURE_STATUS(op_resolver.AddFilterBankSpectralSubtraction()); - TF_LITE_ENSURE_STATUS(op_resolver.AddPCAN()); - TF_LITE_ENSURE_STATUS(op_resolver.AddFilterBankLog()); - return kTfLiteOk; -} - -} // namespace - -int main(int argc, char** argv) { - // Parse command-line argument - if (argc != 2) { - printf("ERROR: Incorrect usage.\n"); - printf("Usage: %s \n", argv[0]); - return 1; - } - - int num_invocations = atoi(argv[1]); - if (num_invocations <= 0) { - printf("ERROR: Number of invocations must be greater than 0.\n"); - return 1; - } - - // One-time setup for both models - printf("Performing one-time setup for both models...\n"); - - // Set up the AudioPreprocessor interpreter - const tflite::Model* preprocessor_model = - tflite::GetModel(g_audio_preprocessor_int8_model_data); - AudioPreprocessorOpResolver preprocessor_op_resolver; - if (RegisterOps(preprocessor_op_resolver) != kTfLiteOk) { - printf("ERROR: Failed to register preprocessor ops.\n"); - return 1; - } - - // Set up the MicroSpeech interpreter - const tflite::Model* speech_model = - tflite::GetModel(g_micro_speech_quantized_model_data); - MicroSpeechOpResolver 
speech_op_resolver; - if (RegisterOps(speech_op_resolver) != kTfLiteOk) { - printf("ERROR: Failed to register speech ops.\n"); - return 1; - } - - // Create BOTH interpreters first, sharing the same arena. - tflite::MicroInterpreter preprocessor_interpreter( - preprocessor_model, preprocessor_op_resolver, g_arena, kArenaSize); - tflite::MicroInterpreter speech_interpreter( - speech_model, speech_op_resolver, g_arena, kArenaSize); - - // Allocate tensors for the first model. - if (preprocessor_interpreter.AllocateTensors() != kTfLiteOk) { - printf("ERROR: Preprocessor AllocateTensors() failed.\n"); - return 1; - } - // Now, the second interpreter will automatically allocate its memory *after* - // the first one in the shared arena. - if (speech_interpreter.AllocateTensors() != kTfLiteOk) { - printf("ERROR: Speech AllocateTensors() failed.\n"); - return 1; - } - - // Get pointers to the input and output tensors of both models - TfLiteTensor* preprocessor_input = preprocessor_interpreter.input(0); // <-- TYPO FIXED HERE - TfLiteTensor* preprocessor_output = preprocessor_interpreter.output(0); - TfLiteTensor* speech_input = speech_interpreter.input(0); - - printf("Setup complete.\n"); - - printf("Running %d end-to-end invocations...\n", num_invocations); - - for (int i = 0; i < num_invocations; ++i) { - // Generate Features - const int16_t* audio_data = g_yes_1000ms_audio_data; - size_t remaining_samples = g_yes_1000ms_audio_data_size; - size_t feature_index = 0; - - while (remaining_samples >= kAudioSampleDurationCount && - feature_index < kFeatureCount) - { - std::copy_n(audio_data, kAudioSampleDurationCount, - tflite::GetTensorData(preprocessor_input)); - - if (preprocessor_interpreter.Invoke() != kTfLiteOk) { - printf("ERROR: Preprocessor Invoke() failed.\n"); - return 1; - } - - std::copy_n(tflite::GetTensorData(preprocessor_output), kFeatureSize, - g_features[feature_index]); - - feature_index++; - audio_data += kAudioSampleStrideCount; - remaining_samples -= 
kAudioSampleStrideCount; - } - - // Classify Features - std::copy_n(&g_features[0][0], kFeatureElementCount, - tflite::GetTensorData(speech_input)); - - if (speech_interpreter.Invoke() != kTfLiteOk) { - printf("ERROR: Speech Invoke() failed.\n"); - return 1; - } - } - - printf("Finished all invocations successfully.\n"); - - return 0; -} \ No newline at end of file diff --git a/tensorflow/lite/micro/examples/person_detection/Makefile.inc b/tensorflow/lite/micro/examples/person_detection/Makefile.inc index c06e75d97ad..a0d5adc800c 100644 --- a/tensorflow/lite/micro/examples/person_detection/Makefile.inc +++ b/tensorflow/lite/micro/examples/person_detection/Makefile.inc @@ -5,7 +5,7 @@ person_detection_MODEL_HDRS := \ $(TENSORFLOW_ROOT)tensorflow/lite/micro/examples/person_detection/model_settings.h person_detection_TEST_SRCS := \ -$(TENSORFLOW_ROOT)tensorflow/lite/micro/examples/person_detection/person_detection_test2.cc \ +$(TENSORFLOW_ROOT)tensorflow/lite/micro/examples/person_detection/person_detection_test.cc \ $(person_detection_MODEL_SRCS) person_detection_TEST_HDRS := \ @@ -82,4 +82,4 @@ list_person_detection_example_sources: @echo $(person_detection_SRCS) list_person_detection_example_headers: - @echo $(person_detection_HDRS) + @echo $(person_detection_HDRS) \ No newline at end of file diff --git a/tensorflow/lite/micro/examples/person_detection/person_detection_test.cc b/tensorflow/lite/micro/examples/person_detection/person_detection_test.cc index 679c26e56ca..4d6c0fce513 100644 --- a/tensorflow/lite/micro/examples/person_detection/person_detection_test.cc +++ b/tensorflow/lite/micro/examples/person_detection/person_detection_test.cc @@ -128,4 +128,4 @@ TF_LITE_MICRO_TEST(TestInvoke) { MicroPrintf("Ran successfully\n"); } -TF_LITE_MICRO_TESTS_END +TF_LITE_MICRO_TESTS_END \ No newline at end of file diff --git a/tensorflow/lite/micro/examples/person_detection/person_detection_test2.cc 
b/tensorflow/lite/micro/examples/person_detection/person_detection_test2.cc deleted file mode 100644 index 57fed75773e..00000000000 --- a/tensorflow/lite/micro/examples/person_detection/person_detection_test2.cc +++ /dev/null @@ -1,100 +0,0 @@ -/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -#include -#include -#include - -#include "tensorflow/lite/c/common.h" -#include "tensorflow/lite/micro/examples/person_detection/model_settings.h" -#include "tensorflow/lite/micro/examples/person_detection/testdata/person_image_data.h" -#include "tensorflow/lite/micro/micro_interpreter.h" -#include "tensorflow/lite/micro/micro_log.h" -#include "tensorflow/lite/micro/micro_mutable_op_resolver.h" -#include "tensorflow/lite/micro/models/person_detect_model_data.h" -#include "tensorflow/lite/micro/testing/micro_test.h" -#include "tensorflow/lite/schema/schema_generated.h" - -// Create an area of memory to use for input, output, and intermediate arrays. 
-#if defined(XTENSA) && defined(VISION_P6) -constexpr int tensor_arena_size = 352 * 1024; -#else -constexpr int tensor_arena_size = 136 * 1024; -#endif // defined(XTENSA) && defined(VISION_P6) -uint8_t tensor_arena[tensor_arena_size]; - -int main(int argc, char** argv) { - // Parse command-line arguments - if (argc != 2) { - printf("ERROR: Incorrect usage.\n"); - printf("Usage: %s \n", argv[0]); - return 1; - } - - int num_invocations = atoi(argv[1]); - if (num_invocations <= 0) { - printf("ERROR: Number of invocations must be greater than 0.\n"); - return 1; - } - - // This is the "startup cost" that the delta measurement will cancel out - printf("Performing one-time setup...\n"); - - // Map the model into a usable data structure - const tflite::Model* model = ::tflite::GetModel(g_person_detect_model_data); - if (model->version() != TFLITE_SCHEMA_VERSION) { - // Don't care - return 1; - } - - // Pull in only the operation implementations we need - tflite::MicroMutableOpResolver<5> micro_op_resolver; - micro_op_resolver.AddAveragePool2D(tflite::Register_AVERAGE_POOL_2D_INT8()); - micro_op_resolver.AddConv2D(tflite::Register_CONV_2D_INT8()); - micro_op_resolver.AddDepthwiseConv2D( - tflite::Register_DEPTHWISE_CONV_2D_INT8()); - micro_op_resolver.AddReshape(); - micro_op_resolver.AddSoftmax(tflite::Register_SOFTMAX_INT8()); - - // Build an interpreter to run the model with - tflite::MicroInterpreter interpreter(model, micro_op_resolver, tensor_arena, - tensor_arena_size); - if (interpreter.AllocateTensors() != kTfLiteOk) { - printf("ERROR: AllocateTensors() failed.\n"); - return 1; - } - - // Get information about the model's input tensor - TfLiteTensor* input = interpreter.input(0); - - // Copy a representative image into the input tensor - memcpy(input->data.int8, g_person_image_data, input->bytes); - - printf("Setup complete.\n"); - - // Run the benchmark loop - printf("Running %d invocations...\n", num_invocations); - - for (int i = 0; i < num_invocations; ++i) { 
- if (interpreter.Invoke() != kTfLiteOk) { - printf("ERROR: Invoke() failed on iteration %d.\n", i); - return 1; - } - } - - printf("Finished all invocations successfully.\n"); - - return 0; -} \ No newline at end of file diff --git a/tensorflow/lite/micro/testing/test_with_spike.sh b/tensorflow/lite/micro/testing/test_with_spike.sh deleted file mode 100755 index 35d20e57456..00000000000 --- a/tensorflow/lite/micro/testing/test_with_spike.sh +++ /dev/null @@ -1,29 +0,0 @@ -#!/bin/bash -e - -# Parameters: -# ${1} suffix for qemu binary (e.g. to use qemu-arm ${1} should be arm -# ${2} architecture to pass to qemu (e.g. cortex-m3) -# ${3} cross-compiled binary to be emulated -# ${4} - String that is checked for pass/fail. -# ${5} - target (cortex_m_qemu etc.) - -SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" -TFLM_ROOT_DIR=${SCRIPT_DIR}/../../../../ - -TEST_TMPDIR=/tmp/test_${5} -MICRO_LOG_PATH=${TEST_TMPDIR}/${3} -MICRO_LOG_FILENAME=${MICRO_LOG_PATH}/logs.txt - -mkdir -p ${MICRO_LOG_PATH} -spike --isa=rv32gcv ~/rv32imc_zve32x_zvl128b/riscv32-unknown-elf/bin/pk ${1} 2>&1 | tee ${MICRO_LOG_FILENAME} -if [[ ${2} != "non_test_binary" ]] -then - if grep -q "${2}" ${MICRO_LOG_FILENAME} - then - echo "Pass" - exit 0 - else - echo "Fail" - exit 1 - fi -fi diff --git a/tensorflow/lite/micro/tools/make/targets/riscv32_vector_makefile.inc b/tensorflow/lite/micro/tools/make/targets/riscv32_vector_makefile.inc index 24fb8f132d7..ba3e4a2cc33 100644 --- a/tensorflow/lite/micro/tools/make/targets/riscv32_vector_makefile.inc +++ b/tensorflow/lite/micro/tools/make/targets/riscv32_vector_makefile.inc @@ -9,7 +9,7 @@ RISCV_CODE_MODEL := medany # Allow additional flags on the command line for debugging. 
RISCV_EXTRA_CFLAGS := -TARGET_DEFAULT_TOOLCHAIN_ROOT := $(HOME)/rv32imc_zve32x_zvl128b/bin/ +TARGET_DEFAULT_TOOLCHAIN_ROOT := $(DOWNLOADS_DIR)/riscv_toolchain/bin/ TARGET_TOOLCHAIN_ROOT := $(TARGET_DEFAULT_TOOLCHAIN_ROOT) ifeq ($(TARGET_TOOLCHAIN_ROOT), $(TARGET_DEFAULT_TOOLCHAIN_ROOT)) $(eval $(call add_third_party_download,$(RISCV_TOOLCHAIN_URL),$(RISCV_TOOLCHAIN_MD5),riscv_toolchain,)) From 03bb47ed529fcc61a5410fc70e26e3b78bf11bfb Mon Sep 17 00:00:00 2001 From: JaimeHW Date: Mon, 5 Jan 2026 22:34:17 -0600 Subject: [PATCH 78/86] Cleanup headers --- tensorflow/lite/micro/kernels/riscv_vector/conv_rvv.h | 3 +-- .../lite/micro/kernels/riscv_vector/fully_connected_rvv.h | 2 +- tensorflow/lite/micro/kernels/riscv_vector/pooling_rvv.h | 2 +- tensorflow/lite/micro/kernels/riscv_vector/requantize_rvv.h | 2 +- .../micro/kernels/riscv_vector/signal/filter_bank_log_rvv.h | 6 +++--- .../micro/kernels/riscv_vector/signal/filter_bank_rvv.h | 6 +++--- .../lite/micro/kernels/riscv_vector/signal/rfft_int16_rvv.h | 6 +++--- tensorflow/lite/micro/kernels/riscv_vector/softmax_rvv.h | 2 +- 8 files changed, 14 insertions(+), 15 deletions(-) diff --git a/tensorflow/lite/micro/kernels/riscv_vector/conv_rvv.h b/tensorflow/lite/micro/kernels/riscv_vector/conv_rvv.h index 68dd6109781..0dfdebddf3e 100644 --- a/tensorflow/lite/micro/kernels/riscv_vector/conv_rvv.h +++ b/tensorflow/lite/micro/kernels/riscv_vector/conv_rvv.h @@ -1,4 +1,3 @@ -// tensorflow/lite/micro/kernels/riscv_vector/conv_rvv.h #ifndef TENSORFLOW_LITE_MICRO_KERNELS_RISCV_VECTOR_CONV_RVV_H_ #define TENSORFLOW_LITE_MICRO_KERNELS_RISCV_VECTOR_CONV_RVV_H_ @@ -29,4 +28,4 @@ void DepthwiseConvPerChannelRVV(const DepthwiseParams& params, const RuntimeShape& bias_shape, const int32_t* bias_data, const RuntimeShape& output_shape, int8_t* output_data); -#endif \ No newline at end of file +#endif // TENSORFLOW_LITE_MICRO_KERNELS_RISCV_VECTOR_CONV_RVV_H_ \ No newline at end of file diff --git 
a/tensorflow/lite/micro/kernels/riscv_vector/fully_connected_rvv.h b/tensorflow/lite/micro/kernels/riscv_vector/fully_connected_rvv.h index d3694a65c8e..48d00580b7a 100644 --- a/tensorflow/lite/micro/kernels/riscv_vector/fully_connected_rvv.h +++ b/tensorflow/lite/micro/kernels/riscv_vector/fully_connected_rvv.h @@ -30,4 +30,4 @@ void FullyConnectedRVV( const RuntimeShape& output_shape, int8_t* output_data); -#endif \ No newline at end of file +#endif // TENSORFLOW_LITE_MICRO_KERNELS_RISCV_VECTOR_FULLY_CONNECTED_RVV_H_ \ No newline at end of file diff --git a/tensorflow/lite/micro/kernels/riscv_vector/pooling_rvv.h b/tensorflow/lite/micro/kernels/riscv_vector/pooling_rvv.h index a5c818aacef..69c05065106 100644 --- a/tensorflow/lite/micro/kernels/riscv_vector/pooling_rvv.h +++ b/tensorflow/lite/micro/kernels/riscv_vector/pooling_rvv.h @@ -14,4 +14,4 @@ void MaxPool16BitRVV(const PoolParams& params, const RuntimeShape& input_shape, const int16_t* input_data, const RuntimeShape& output_shape, int16_t* output_data); -#endif \ No newline at end of file +#endif // TENSORFLOW_LITE_MICRO_KERNELS_RISCV_VECTOR_POOLING_RVV_H_ \ No newline at end of file diff --git a/tensorflow/lite/micro/kernels/riscv_vector/requantize_rvv.h b/tensorflow/lite/micro/kernels/riscv_vector/requantize_rvv.h index b8e49d1e6ad..d1812fcd85e 100644 --- a/tensorflow/lite/micro/kernels/riscv_vector/requantize_rvv.h +++ b/tensorflow/lite/micro/kernels/riscv_vector/requantize_rvv.h @@ -170,4 +170,4 @@ inline vint32m2_t RequantizeVectorPerChannelS32( return v_res32; } -#endif \ No newline at end of file +#endif // TENSORFLOW_LITE_MICRO_KERNELS_RISCV_VECTOR_REQUANTIZE_RVV_H_ \ No newline at end of file diff --git a/tensorflow/lite/micro/kernels/riscv_vector/signal/filter_bank_log_rvv.h b/tensorflow/lite/micro/kernels/riscv_vector/signal/filter_bank_log_rvv.h index 2cb7c974328..cff55f8c932 100644 --- a/tensorflow/lite/micro/kernels/riscv_vector/signal/filter_bank_log_rvv.h +++ 
b/tensorflow/lite/micro/kernels/riscv_vector/signal/filter_bank_log_rvv.h @@ -1,5 +1,5 @@ -#ifndef TENSORFLOW_LITE_MICRO_KERNELS_RISCV_VECTOR_FILTER_BANK_LOG_RVV_H_ -#define TENSORFLOW_LITE_MICRO_KERNELS_RISCV_VECTOR_FILTER_BANK_LOG_RVV_H_ +#ifndef TENSORFLOW_LITE_MICRO_KERNELS_RISCV_VECTOR_SIGNAL_FILTER_BANK_LOG_RVV_H_ +#define TENSORFLOW_LITE_MICRO_KERNELS_RISCV_VECTOR_SIGNAL_FILTER_BANK_LOG_RVV_H_ #include "tensorflow/lite/kernels/internal/common.h" @@ -7,4 +7,4 @@ void FilterbankLogRVV(const uint32_t* input, int num_channels, int32_t output_scale, uint32_t correction_bits, int16_t* output); -#endif \ No newline at end of file +#endif // TENSORFLOW_LITE_MICRO_KERNELS_RISCV_VECTOR_SIGNAL_FILTER_BANK_LOG_RVV_H_ \ No newline at end of file diff --git a/tensorflow/lite/micro/kernels/riscv_vector/signal/filter_bank_rvv.h b/tensorflow/lite/micro/kernels/riscv_vector/signal/filter_bank_rvv.h index 8f03a819f68..c513e24dbea 100644 --- a/tensorflow/lite/micro/kernels/riscv_vector/signal/filter_bank_rvv.h +++ b/tensorflow/lite/micro/kernels/riscv_vector/signal/filter_bank_rvv.h @@ -1,5 +1,5 @@ -#ifndef TENSORFLOW_LITE_MICRO_KERNELS_RISCV_VECTOR_FILTER_BANK_RVV_H_ -#define TENSORFLOW_LITE_MICRO_KERNELS_RISCV_VECTOR_FILTER_BANK_RVV_H_ +#ifndef TENSORFLOW_LITE_MICRO_KERNELS_RISCV_VECTOR_SIGNAL_FILTER_BANK_RVV_H_ +#define TENSORFLOW_LITE_MICRO_KERNELS_RISCV_VECTOR_SIGNAL_FILTER_BANK_RVV_H_ #include @@ -20,4 +20,4 @@ struct FilterbankConfig { void FilterbankAccumulateChannelsRVV(const FilterbankConfig* config, const uint32_t* input, uint64_t* output); -#endif // TENSORFLOW_LITE_MICRO_KERNELS_RISCV_VECTOR_FILTER_BANK_RVV_H_ \ No newline at end of file +#endif // TENSORFLOW_LITE_MICRO_KERNELS_RISCV_VECTOR_SIGNAL_FILTER_BANK_RVV_H_ \ No newline at end of file diff --git a/tensorflow/lite/micro/kernels/riscv_vector/signal/rfft_int16_rvv.h b/tensorflow/lite/micro/kernels/riscv_vector/signal/rfft_int16_rvv.h index 169bf899f93..dc9bef662e9 100644 --- 
a/tensorflow/lite/micro/kernels/riscv_vector/signal/rfft_int16_rvv.h +++ b/tensorflow/lite/micro/kernels/riscv_vector/signal/rfft_int16_rvv.h @@ -1,5 +1,5 @@ -#ifndef TENSORFLOW_LITE_MICRO_KERNELS_RISCV_VECTOR_RFFT_RVV_H_ -#define TENSORFLOW_LITE_MICRO_KERNELS_RISCV_VECTOR_RFFT_RVV_H_ +#ifndef TENSORFLOW_LITE_MICRO_KERNELS_RISCV_VECTOR_SIGNAL_RFFT_RVV_H_ +#define TENSORFLOW_LITE_MICRO_KERNELS_RISCV_VECTOR_SIGNAL_RFFT_RVV_H_ #include "tensorflow/lite/kernels/internal/common.h" @@ -10,4 +10,4 @@ void* RfftInt16Init(int32_t fft_length, void* state, size_t state_size); void RfftInt16ApplyRVV(void* state, const int16_t* input, Complex* output); -#endif \ No newline at end of file +#endif // TENSORFLOW_LITE_MICRO_KERNELS_RISCV_VECTOR_SIGNAL_RFFT_RVV_H_ \ No newline at end of file diff --git a/tensorflow/lite/micro/kernels/riscv_vector/softmax_rvv.h b/tensorflow/lite/micro/kernels/riscv_vector/softmax_rvv.h index 28ab9e42567..28f8fed7500 100644 --- a/tensorflow/lite/micro/kernels/riscv_vector/softmax_rvv.h +++ b/tensorflow/lite/micro/kernels/riscv_vector/softmax_rvv.h @@ -362,4 +362,4 @@ void SoftmaxRVV(const tflite::SoftmaxParams& params, } } -#endif \ No newline at end of file +#endif // TENSORFLOW_LITE_MICRO_KERNELS_RISCV_VECTOR_SOFTMAX_RVV_H_ \ No newline at end of file From 523685437d4f82ed7e1270e99637dbf81d31f7f3 Mon Sep 17 00:00:00 2001 From: JaimeHW Date: Mon, 5 Jan 2026 22:44:41 -0600 Subject: [PATCH 79/86] Update .gitignore --- .gitignore | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/.gitignore b/.gitignore index 90e4d97bc46..c8c180931da 100644 --- a/.gitignore +++ b/.gitignore @@ -5,10 +5,9 @@ *audio_frontend* *google* *__pycache__* +.venv venv gen -.venv -tensorflow/lite/micro/examples/micro_speech2 # Ignore the directory in which `clangd` stores its local index. 
/.cache/ From e6338a24aca894964805875722779d064276a5af Mon Sep 17 00:00:00 2001 From: JaimeHW Date: Mon, 5 Jan 2026 22:45:30 -0600 Subject: [PATCH 80/86] Remove PEANUT-README.md --- PEANUT-README.md | 26 -------------------------- 1 file changed, 26 deletions(-) delete mode 100644 PEANUT-README.md diff --git a/PEANUT-README.md b/PEANUT-README.md deleted file mode 100644 index 8da7a460d3f..00000000000 --- a/PEANUT-README.md +++ /dev/null @@ -1,26 +0,0 @@ -# TFLite-Micro with Vector Intrinsics - -This is Peanut Microsystems' fork of tflite-micro to optimize bottleneck operations using vector intrinsics. - -## Building - -Follow the guide in the *toolchains* repository for a guide on how to build and run *tflite-micro*. Instead of using the *riscv32_generic_makefile.inc*, use *riscv32_vector_makefile.inc* to build with vector intrinsics. Also, use the *rv32gcv* ISA for Spike. This is a superset of the instructions we intend to support. - -To run with informative Peanut Microsystems-specific logs, add a PEANUT_MICRO_LOG flag in the PLATFORM_FLAGS of the *riscv32_vector_makefile.inc*: - - PLATFORM_FLAGS = \ - -march=$(RISCV_ARCH) \ - ... \ - -DPEANUT_MICRO_LOG - -The main purpose for this flag is to sanity-check which implementations are used and to determine model architectures, including input and output shapes. - -## Testing - -To test, follow the same steps as above, but instead of *hello_world*, run - - make -f tensorflow/lite/micro/tools/make/Makefile TARGET=riscv32_vector test - -## Issues - -Sometimes, when modifying the kernels, the compiler/build system will use objects from the previous compilation, meaning the new code will not run. Make sure to sanity check that your code is actually being used. 
From 0be2e65c71220833d7f0029d17150aabb8e1798d Mon Sep 17 00:00:00 2001 From: JaimeHW Date: Mon, 5 Jan 2026 22:48:35 -0600 Subject: [PATCH 81/86] Restore person detections main.cc --- .../micro/examples/person_detection/main.cc | 19 ++----------------- 1 file changed, 2 insertions(+), 17 deletions(-) diff --git a/tensorflow/lite/micro/examples/person_detection/main.cc b/tensorflow/lite/micro/examples/person_detection/main.cc index 4b3e54e4105..0d2bf49fa6e 100644 --- a/tensorflow/lite/micro/examples/person_detection/main.cc +++ b/tensorflow/lite/micro/examples/person_detection/main.cc @@ -13,9 +13,6 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#include -#include - #include "tensorflow/lite/micro/examples/person_detection/main_functions.h" // This is the default main used on systems that have the standard C entry @@ -23,20 +20,8 @@ limitations under the License. // requirements for entry code (like an app_main function) should specialize // this main.cc file in a target-specific subfolder. 
int main(int argc, char* argv[]) { - if (argc != 2) { - fprintf(stderr, "Usage: %s \n", argv[0]); - return 1; // Indicate an error - } - - int loop_count = atoi(argv[1]); - if (loop_count <= 0) { - fprintf(stderr, "Error: Please provide a positive number of iterations.\n"); - return 1; // Indicate an error - } - setup(); - - for (int i = 0; i < loop_count; ++i) { + while (true) { loop(); } -} +} \ No newline at end of file From 6d71d64f7c1ec362e97096cd340b977105587be7 Mon Sep 17 00:00:00 2001 From: JaimeHW Date: Mon, 5 Jan 2026 22:49:51 -0600 Subject: [PATCH 82/86] Add new line EOF --- .../micro/examples/person_detection/person_detection_test.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/lite/micro/examples/person_detection/person_detection_test.cc b/tensorflow/lite/micro/examples/person_detection/person_detection_test.cc index 4d6c0fce513..679c26e56ca 100644 --- a/tensorflow/lite/micro/examples/person_detection/person_detection_test.cc +++ b/tensorflow/lite/micro/examples/person_detection/person_detection_test.cc @@ -128,4 +128,4 @@ TF_LITE_MICRO_TEST(TestInvoke) { MicroPrintf("Ran successfully\n"); } -TF_LITE_MICRO_TESTS_END \ No newline at end of file +TF_LITE_MICRO_TESTS_END From a73a8ec679ef5f5962583e611e04eef368f7f7b6 Mon Sep 17 00:00:00 2001 From: JaimeHW Date: Mon, 5 Jan 2026 22:55:09 -0600 Subject: [PATCH 83/86] Restore end-of-file new lines --- .../kernels/internal/reference/integer_ops/depthwise_conv.h | 2 +- tensorflow/lite/micro/examples/micro_speech/Makefile.inc | 3 ++- .../lite/micro/examples/micro_speech/micro_speech_test.cc | 2 +- tensorflow/lite/micro/examples/person_detection/Makefile.inc | 2 +- tensorflow/lite/micro/examples/person_detection/main.cc | 2 +- tensorflow/lite/micro/kernels/conv.cc | 2 +- tensorflow/lite/micro/kernels/conv_test.h | 2 +- tensorflow/lite/micro/kernels/depthwise_conv.cc | 2 +- tensorflow/lite/micro/kernels/fully_connected.cc | 2 +- 9 files changed, 10 insertions(+), 9 deletions(-) diff 
--git a/tensorflow/lite/kernels/internal/reference/integer_ops/depthwise_conv.h b/tensorflow/lite/kernels/internal/reference/integer_ops/depthwise_conv.h index 87030d9fbea..7676fce0f4d 100644 --- a/tensorflow/lite/kernels/internal/reference/integer_ops/depthwise_conv.h +++ b/tensorflow/lite/kernels/internal/reference/integer_ops/depthwise_conv.h @@ -288,4 +288,4 @@ inline void DepthwiseConvHybridPerChannel( } // namespace reference_integer_ops } // namespace tflite -#endif // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_DEPTHWISE_CONV_H_ \ No newline at end of file +#endif // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_DEPTHWISE_CONV_H_ diff --git a/tensorflow/lite/micro/examples/micro_speech/Makefile.inc b/tensorflow/lite/micro/examples/micro_speech/Makefile.inc index 67ce420a6a1..d710d0ad273 100644 --- a/tensorflow/lite/micro/examples/micro_speech/Makefile.inc +++ b/tensorflow/lite/micro/examples/micro_speech/Makefile.inc @@ -61,4 +61,5 @@ list_micro_speech_example_sources: @echo $(MICRO_SPEECH_SRCS) list_micro_speech_example_headers: - @echo $(MICRO_SPEECH_HDRS) \ No newline at end of file + @echo $(MICRO_SPEECH_HDRS) + \ No newline at end of file diff --git a/tensorflow/lite/micro/examples/micro_speech/micro_speech_test.cc b/tensorflow/lite/micro/examples/micro_speech/micro_speech_test.cc index 9b88efa90ea..0191958b82b 100644 --- a/tensorflow/lite/micro/examples/micro_speech/micro_speech_test.cc +++ b/tensorflow/lite/micro/examples/micro_speech/micro_speech_test.cc @@ -277,4 +277,4 @@ TF_LITE_MICRO_TEST(NoiseTest) { g_noise_1000ms_audio_data_size); } -TF_LITE_MICRO_TESTS_END \ No newline at end of file +TF_LITE_MICRO_TESTS_END diff --git a/tensorflow/lite/micro/examples/person_detection/Makefile.inc b/tensorflow/lite/micro/examples/person_detection/Makefile.inc index a0d5adc800c..c142c7ddc10 100644 --- a/tensorflow/lite/micro/examples/person_detection/Makefile.inc +++ b/tensorflow/lite/micro/examples/person_detection/Makefile.inc @@ -82,4 
+82,4 @@ list_person_detection_example_sources: @echo $(person_detection_SRCS) list_person_detection_example_headers: - @echo $(person_detection_HDRS) \ No newline at end of file + @echo $(person_detection_HDRS) diff --git a/tensorflow/lite/micro/examples/person_detection/main.cc b/tensorflow/lite/micro/examples/person_detection/main.cc index 0d2bf49fa6e..b53d3665eb4 100644 --- a/tensorflow/lite/micro/examples/person_detection/main.cc +++ b/tensorflow/lite/micro/examples/person_detection/main.cc @@ -24,4 +24,4 @@ int main(int argc, char* argv[]) { while (true) { loop(); } -} \ No newline at end of file +} diff --git a/tensorflow/lite/micro/kernels/conv.cc b/tensorflow/lite/micro/kernels/conv.cc index 794da09bb29..7be915ab51e 100644 --- a/tensorflow/lite/micro/kernels/conv.cc +++ b/tensorflow/lite/micro/kernels/conv.cc @@ -194,4 +194,4 @@ TFLMRegistration Register_CONV_2D() { return tflite::micro::RegisterOp(ConvInit, ConvPrepare, ConvEval); } -} // namespace tflite \ No newline at end of file +} // namespace tflite diff --git a/tensorflow/lite/micro/kernels/conv_test.h b/tensorflow/lite/micro/kernels/conv_test.h index 7fa7ac2009a..642f4c76d7a 100644 --- a/tensorflow/lite/micro/kernels/conv_test.h +++ b/tensorflow/lite/micro/kernels/conv_test.h @@ -226,4 +226,4 @@ TfLiteStatus TestConvQuantizedPerChannelCompressed( } // namespace testing } // namespace tflite -#endif // TENSORFLOW_LITE_MICRO_KERNELS_CONV_TEST_H_ \ No newline at end of file +#endif // TENSORFLOW_LITE_MICRO_KERNELS_CONV_TEST_H_ diff --git a/tensorflow/lite/micro/kernels/depthwise_conv.cc b/tensorflow/lite/micro/kernels/depthwise_conv.cc index fb5cc3878ae..489e83f94f2 100644 --- a/tensorflow/lite/micro/kernels/depthwise_conv.cc +++ b/tensorflow/lite/micro/kernels/depthwise_conv.cc @@ -187,4 +187,4 @@ TFLMRegistration Register_DEPTHWISE_CONV_2D() { DepthwiseConvEval); } -} // namespace tflite \ No newline at end of file +} // namespace tflite diff --git a/tensorflow/lite/micro/kernels/fully_connected.cc 
b/tensorflow/lite/micro/kernels/fully_connected.cc index b632457aaa9..6bf7665a81f 100644 --- a/tensorflow/lite/micro/kernels/fully_connected.cc +++ b/tensorflow/lite/micro/kernels/fully_connected.cc @@ -360,4 +360,4 @@ TFLMInferenceRegistration RegisterInference_FULLY_CONNECTED() { return tflite::micro::RegisterOp(FullyConnectedEval); } -} // namespace tflite \ No newline at end of file +} // namespace tflite From 0cfa2f2cbaa50ff8fd4e065718ff0fbb18de3363 Mon Sep 17 00:00:00 2001 From: JaimeHW Date: Mon, 5 Jan 2026 22:56:24 -0600 Subject: [PATCH 84/86] Remove new line at end of micro speech Makefile --- tensorflow/lite/micro/examples/micro_speech/Makefile.inc | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tensorflow/lite/micro/examples/micro_speech/Makefile.inc b/tensorflow/lite/micro/examples/micro_speech/Makefile.inc index d710d0ad273..67ce420a6a1 100644 --- a/tensorflow/lite/micro/examples/micro_speech/Makefile.inc +++ b/tensorflow/lite/micro/examples/micro_speech/Makefile.inc @@ -61,5 +61,4 @@ list_micro_speech_example_sources: @echo $(MICRO_SPEECH_SRCS) list_micro_speech_example_headers: - @echo $(MICRO_SPEECH_HDRS) - \ No newline at end of file + @echo $(MICRO_SPEECH_HDRS) \ No newline at end of file From b5820fa7c1e8ce51c7c5817e1d656f2f695fb4b2 Mon Sep 17 00:00:00 2001 From: JaimeHW Date: Mon, 5 Jan 2026 22:57:39 -0600 Subject: [PATCH 85/86] Restore --- tensorflow/lite/micro/examples/micro_speech/Makefile.inc | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tensorflow/lite/micro/examples/micro_speech/Makefile.inc b/tensorflow/lite/micro/examples/micro_speech/Makefile.inc index 67ce420a6a1..d710d0ad273 100644 --- a/tensorflow/lite/micro/examples/micro_speech/Makefile.inc +++ b/tensorflow/lite/micro/examples/micro_speech/Makefile.inc @@ -61,4 +61,5 @@ list_micro_speech_example_sources: @echo $(MICRO_SPEECH_SRCS) list_micro_speech_example_headers: - @echo $(MICRO_SPEECH_HDRS) \ No newline at end of file + @echo 
$(MICRO_SPEECH_HDRS) + \ No newline at end of file From b079dcac5aa5861faf1c33650e8040f5f7a58a3c Mon Sep 17 00:00:00 2001 From: JaimeHW Date: Mon, 5 Jan 2026 22:58:07 -0600 Subject: [PATCH 86/86] Remove --- tensorflow/lite/micro/examples/micro_speech/Makefile.inc | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tensorflow/lite/micro/examples/micro_speech/Makefile.inc b/tensorflow/lite/micro/examples/micro_speech/Makefile.inc index d710d0ad273..67ce420a6a1 100644 --- a/tensorflow/lite/micro/examples/micro_speech/Makefile.inc +++ b/tensorflow/lite/micro/examples/micro_speech/Makefile.inc @@ -61,5 +61,4 @@ list_micro_speech_example_sources: @echo $(MICRO_SPEECH_SRCS) list_micro_speech_example_headers: - @echo $(MICRO_SPEECH_HDRS) - \ No newline at end of file + @echo $(MICRO_SPEECH_HDRS) \ No newline at end of file