Skip to content

Commit 54fa441

Browse files
committed
add wsync test
test: add regression test for WSYNC pipeline synchronization This test validates the warp-draining semantics of the WSYNC instruction, ensuring it correctly stalls the warp until all previously issued instructions are fully committed. Test methodology: 1. Dynamic Baseline Calibration: Measures the inherent cycle overhead of `vx_rdcycle_sync()` with an empty pipeline to prevent false positives caused by measurement latency. 2. Pipeline Clogging: Floods the execution units with 32 long-latency integer divisions (`divu`) to create a severe instruction backlog. 3. Strict Compiler Barriers: Uses `__asm__ volatile("":::"memory")` clobbers to prevent the compiler from hoisting the raw MCYCLE read ahead of the division workload. 4. Race Condition Validation: Captures an unsynchronized raw cycle immediately followed by a synchronized cycle. The test asserts that the cycle gap strictly exceeds the baseline overhead plus a safe latency margin. If WSYNC fails to stall the warp, the gap will fall below the threshold, triggering a failure report on the host.
1 parent bdba9c2 commit 54fa441

4 files changed

Lines changed: 297 additions & 0 deletions

File tree

tests/regression/wsync/Makefile

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
# Build rules for the WSYNC regression test (host driver + device kernel).
ROOT_DIR := $(realpath ../../..)
include $(ROOT_DIR)/config.mk

PROJECT := wsync

# Test sources live under the Vortex source tree, not the build tree.
SRC_DIR := $(VORTEX_HOME)/tests/regression/$(PROJECT)

# Host-side test driver.
SRCS := $(SRC_DIR)/main.cpp

# Device-side kernel compiled to kernel.vxbin.
VX_SRCS := $(SRC_DIR)/kernel.cpp

# Default runtime options: 1024 measurement iterations (-i).
OPTS ?= -i1024

include ../common.mk

tests/regression/wsync/common.h

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
#ifndef _COMMON_H_
#define _COMMON_H_

#include <stdint.h>

// Per-lane measurement record written back to device memory by the kernel
// and downloaded by the host for validation. Field order/layout is shared
// between host and kernel builds — do not reorder.
typedef struct {
  uint32_t failures;        // count of iterations whose drain gap was too small
  uint32_t first_iteration; // iteration index of first failure (valid if failures > 0)
  uint32_t baseline_gap;    // calibrated vx_rdcycle_sync overhead on an empty pipeline
  uint32_t raw_cycle;       // unsynchronized MCYCLE sample at the first failure
  uint32_t sync_cycle;      // synchronized cycle sample at the first failure
  uint32_t gap;             // sync_cycle - raw_cycle at the first failure
  uint32_t checksum;        // XOR accumulator keeping the div workload observable
} lane_result_t;

// Kernel arguments uploaded by the host (address placed in MSCRATCH).
typedef struct {
  uint32_t num_threads;  // number of lanes (hardware warp size)
  uint32_t iterations;   // measurement iterations per lane
  uint64_t results_addr; // device address of the lane_result_t[num_threads] array
} kernel_arg_t;

#endif

tests/regression/wsync/kernel.cpp

Lines changed: 126 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,126 @@
1+
#include <vx_intrinsics.h>
2+
#include <vx_spawn.h>
3+
#include "common.h"
4+
5+
// Minimum extra drain cycles required beyond the calibrated baseline for a
// WSYNC-synchronized read to be considered a genuine pipeline drain.
static constexpr uint32_t kMinDrainExtra = 16;

// 32-bit avalanche finalizer (SplitMix-style xor-shift/multiply rounds):
// spreads every input bit across the whole output word.
static inline uint32_t mix32(uint32_t v) {
  v = (v ^ (v >> 16)) * 0x7feb352du;
  v = (v ^ (v >> 15)) * 0x846ca68bu;
  return v ^ (v >> 16);
}
15+
16+
typedef struct {
17+
uint32_t numerators[8];
18+
uint32_t denominators[8];
19+
} div_inputs_t;
20+
21+
static inline void fill_div_inputs(div_inputs_t* inputs, uint32_t seed) {
22+
inputs->numerators[0] = mix32(seed ^ 0x13579bdfu);
23+
inputs->numerators[1] = mix32(seed ^ 0x2468ace0u);
24+
inputs->numerators[2] = mix32(seed ^ 0xfdb97531u);
25+
inputs->numerators[3] = mix32(seed ^ 0x89abcdefu);
26+
inputs->denominators[0] = mix32(seed ^ 0x31415926u) | 1u;
27+
inputs->denominators[1] = mix32(seed ^ 0x27182818u) | 1u;
28+
inputs->denominators[2] = mix32(seed ^ 0xfeedfaceu) | 1u;
29+
inputs->denominators[3] = mix32(seed ^ 0xc001d00du) | 1u;
30+
31+
seed = mix32(seed ^ 0x9e3779b9u);
32+
inputs->numerators[4] = mix32(seed ^ 0xa5a5a5a5u);
33+
inputs->numerators[5] = mix32(seed ^ 0x5a5a5a5au);
34+
inputs->numerators[6] = mix32(seed ^ 0xdeadc0deu);
35+
inputs->numerators[7] = mix32(seed ^ 0xbaadf00du);
36+
inputs->denominators[4] = mix32(seed ^ 0x01234567u) | 1u;
37+
inputs->denominators[5] = mix32(seed ^ 0x76543210u) | 1u;
38+
inputs->denominators[6] = mix32(seed ^ 0x0f0f0f0fu) | 1u;
39+
inputs->denominators[7] = mix32(seed ^ 0xf0f0f0f0u) | 1u;
40+
}
41+
42+
// Issues eight long-latency unsigned divisions via inline asm so the compiler
// cannot constant-fold, hoist, or eliminate them, keeping the execution units
// busy to build an instruction backlog ahead of the WSYNC measurement.
// Returns the XOR of all quotients so the results stay live.
static inline uint32_t div_batch(const div_inputs_t& inputs) {
  uint32_t r0, r1, r2, r3;

  // "=&r" (early-clobber) keeps each result register distinct from the input
  // operands, so the four divu instructions are serially independent.
  __asm__ volatile(
      "divu %0, %4, %8\n\t"
      "divu %1, %5, %9\n\t"
      "divu %2, %6, %10\n\t"
      "divu %3, %7, %11\n\t"
      : "=&r"(r0), "=&r"(r1), "=&r"(r2), "=&r"(r3)
      : "r"(inputs.numerators[0]), "r"(inputs.numerators[1]),
        "r"(inputs.numerators[2]), "r"(inputs.numerators[3]),
        "r"(inputs.denominators[0]), "r"(inputs.denominators[1]),
        "r"(inputs.denominators[2]), "r"(inputs.denominators[3]));

  // Second batch of four divisions using the re-seeded operand half.
  uint32_t q0, q1, q2, q3;
  __asm__ volatile(
      "divu %0, %4, %8\n\t"
      "divu %1, %5, %9\n\t"
      "divu %2, %6, %10\n\t"
      "divu %3, %7, %11\n\t"
      : "=&r"(q0), "=&r"(q1), "=&r"(q2), "=&r"(q3)
      : "r"(inputs.numerators[4]), "r"(inputs.numerators[5]),
        "r"(inputs.numerators[6]), "r"(inputs.numerators[7]),
        "r"(inputs.denominators[4]), "r"(inputs.denominators[5]),
        "r"(inputs.denominators[6]), "r"(inputs.denominators[7]));

  // Fold all eight quotients so no division is dead code.
  return r0 ^ r1 ^ r2 ^ r3 ^ q0 ^ q1 ^ q2 ^ q3;
}
70+
71+
// Per-lane WSYNC validation kernel.
// 1) Calibrates the inherent overhead of vx_rdcycle_sync() on a drained
//    pipeline (baseline_gap).
// 2) Each iteration floods the pipeline with 32 divu instructions, samples an
//    unsynchronized MCYCLE, then a synchronized cycle; if WSYNC really drains
//    the backlog, (sync - raw) must exceed baseline_gap + kMinDrainExtra.
// Results are written to results[tid] for host-side validation.
// NOTE(review): statement order and the empty-asm "memory" barriers are load-
// bearing — they stop the compiler hoisting the raw MCYCLE read above the
// division workload. Do not reorder.
void kernel_body(kernel_arg_t* __UNIFORM__ arg) {
  uint32_t tid = threadIdx.x;
  uint32_t iterations = arg->iterations;
  auto results = reinterpret_cast<lane_result_t*>(arg->results_addr);
  lane_result_t result = {};

  // Baseline calibration: drain the pipeline first so the measured gap is
  // pure measurement overhead, not leftover backlog.
  vx_wsync();
  uint32_t baseline_raw = csr_read(VX_CSR_MCYCLE);
  __asm__ volatile("" : : "r"(baseline_raw) : "memory");
  uint32_t baseline_sync = static_cast<uint32_t>(vx_rdcycle_sync());
  __asm__ volatile("" : : "r"(baseline_sync) : "memory");
  result.baseline_gap = baseline_sync - baseline_raw;

  for (uint32_t iter = 0; iter < iterations; ++iter) {
    // Per-lane, per-iteration seed so batches differ across lanes/iterations.
    uint32_t seed = mix32((tid + 1) * 0x10001u + iter * 0x45d9f3bu);
    div_inputs_t batch0, batch1, batch2, batch3;
    fill_div_inputs(&batch0, seed);
    fill_div_inputs(&batch1, seed ^ 0x9e3779b9u);
    fill_div_inputs(&batch2, seed ^ 0x85ebca6bu);
    fill_div_inputs(&batch3, seed ^ 0xc2b2ae35u);

    // Queue older long-latency warp instructions, then compare an unsynchronized
    // cycle sample against a synchronized sample that drains the backlog first.
    uint32_t batch_checksum = div_batch(batch0)
        ^ div_batch(batch1)
        ^ div_batch(batch2)
        ^ div_batch(batch3);
    __asm__ volatile("" : : "r"(batch_checksum) : "memory");
    uint32_t raw_cycle = csr_read(VX_CSR_MCYCLE);
    __asm__ volatile("" : : "r"(raw_cycle) : "memory");
    uint32_t sync_cycle = static_cast<uint32_t>(vx_rdcycle_sync());
    // Unsigned subtraction: wraparound of the 32-bit counter still yields the
    // correct small delta.
    uint32_t gap = sync_cycle - raw_cycle;
    __asm__ volatile("" : : "r"(sync_cycle) : "memory");

    // Fold all observed values so nothing in the loop is dead code.
    result.checksum ^= batch_checksum ^ raw_cycle ^ sync_cycle ^ gap;

    // A gap at or below baseline + margin means WSYNC did not stall for the
    // division backlog; record details of the first such iteration only.
    if (gap <= result.baseline_gap + kMinDrainExtra) {
      if (0 == result.failures) {
        result.first_iteration = iter;
        result.raw_cycle = raw_cycle;
        result.sync_cycle = sync_cycle;
        result.gap = gap;
      }
      ++result.failures;
    }
  }

  results[tid] = result;
}
120+
121+
int main() {
122+
auto arg = (kernel_arg_t*)csr_read(VX_CSR_MSCRATCH);
123+
uint32_t grid_dim = 1;
124+
uint32_t block_dim = arg->num_threads;
125+
return vx_spawn_threads(1, &grid_dim, &block_dim, (vx_kernel_func_cb)kernel_body, arg);
126+
}

tests/regression/wsync/main.cpp

Lines changed: 135 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,135 @@
1+
#include <cstdlib>
2+
#include <iostream>
3+
#include <unistd.h>
4+
#include <vortex.h>
5+
#include <vector>
6+
#include "common.h"
7+
8+
// Checks a Vortex runtime call: on a nonzero return, prints the failing
// expression and its status, releases device resources, and exits.
#define RT_CHECK(_expr) \
  do { \
    int _ret = _expr; \
    if (0 == _ret) \
      break; \
    printf("Error: '%s' returned %d!\n", #_expr, (int)_ret); \
    cleanup(); \
    exit(-1); \
  } while (false)

// Command-line configurable settings (see parse_args).
const char* kernel_file = "kernel.vxbin";
uint32_t iterations = 1024;

// Global device handles so cleanup() can release them from any failure point.
vx_device_h device = nullptr;
vx_buffer_h results_buffer = nullptr;
vx_buffer_h krnl_buffer = nullptr;
vx_buffer_h args_buffer = nullptr;
kernel_arg_t kernel_arg = {};
26+
27+
// Prints the test banner and command-line usage.
static void show_usage() {
  std::cout << "Vortex WSYNC Test." << std::endl
            << "Usage: [-i iterations] [-k kernel] [-h help]" << std::endl;
}
31+
32+
static void parse_args(int argc, char** argv) {
33+
int c;
34+
while ((c = getopt(argc, argv, "i:k:h")) != -1) {
35+
switch (c) {
36+
case 'i':
37+
iterations = std::atoi(optarg);
38+
break;
39+
case 'k':
40+
kernel_file = optarg;
41+
break;
42+
case 'h':
43+
show_usage();
44+
exit(0);
45+
break;
46+
default:
47+
show_usage();
48+
exit(-1);
49+
}
50+
}
51+
}
52+
53+
void cleanup() {
54+
if (device) {
55+
vx_mem_free(results_buffer);
56+
vx_mem_free(krnl_buffer);
57+
vx_mem_free(args_buffer);
58+
vx_dev_close(device);
59+
}
60+
}
61+
62+
int main(int argc, char* argv[]) {
63+
parse_args(argc, argv);
64+
65+
if (0 == iterations) {
66+
std::cout << "Error: iterations must be greater than zero" << std::endl;
67+
return -1;
68+
}
69+
70+
std::cout << "open device connection" << std::endl;
71+
RT_CHECK(vx_dev_open(&device));
72+
73+
uint64_t num_threads = 0;
74+
RT_CHECK(vx_dev_caps(device, VX_CAPS_NUM_THREADS, &num_threads));
75+
76+
kernel_arg.num_threads = static_cast<uint32_t>(num_threads);
77+
kernel_arg.iterations = iterations;
78+
79+
std::cout << "warp size: " << kernel_arg.num_threads << std::endl;
80+
std::cout << "iterations: " << kernel_arg.iterations << std::endl;
81+
82+
std::cout << "allocate device memory" << std::endl;
83+
uint32_t results_size = kernel_arg.num_threads * sizeof(lane_result_t);
84+
RT_CHECK(vx_mem_alloc(device, results_size, VX_MEM_READ_WRITE, &results_buffer));
85+
RT_CHECK(vx_mem_address(results_buffer, &kernel_arg.results_addr));
86+
87+
std::vector<lane_result_t> results(kernel_arg.num_threads);
88+
89+
RT_CHECK(vx_copy_to_dev(results_buffer, results.data(), 0, results_size));
90+
91+
std::cout << "upload kernel" << std::endl;
92+
RT_CHECK(vx_upload_kernel_file(device, kernel_file, &krnl_buffer));
93+
94+
std::cout << "upload args" << std::endl;
95+
RT_CHECK(vx_upload_bytes(device, &kernel_arg, sizeof(kernel_arg_t), &args_buffer));
96+
97+
std::cout << "start device" << std::endl;
98+
RT_CHECK(vx_start(device, krnl_buffer, args_buffer));
99+
100+
std::cout << "wait for completion" << std::endl;
101+
RT_CHECK(vx_ready_wait(device, VX_MAX_TIMEOUT));
102+
103+
std::cout << "download results" << std::endl;
104+
RT_CHECK(vx_copy_from_dev(results.data(), results_buffer, 0, results_size));
105+
106+
std::cout << "cleanup" << std::endl;
107+
cleanup();
108+
109+
uint32_t errors = 0;
110+
for (uint32_t lane = 0; lane < kernel_arg.num_threads; ++lane) {
111+
auto& result = results[lane];
112+
if (0 == result.failures) {
113+
continue;
114+
}
115+
116+
if (0 == errors) {
117+
std::cout << "first failure: lane=" << lane
118+
<< ", iteration=" << result.first_iteration
119+
<< ", baseline_gap=" << result.baseline_gap
120+
<< ", raw_cycle=" << result.raw_cycle
121+
<< ", sync_cycle=" << result.sync_cycle
122+
<< ", gap=" << result.gap << std::endl;
123+
}
124+
errors += result.failures;
125+
}
126+
127+
if (0 != errors) {
128+
std::cout << "WSYNC timing mismatches: " << errors << std::endl;
129+
std::cout << "FAILED!" << std::endl;
130+
return errors;
131+
}
132+
133+
std::cout << "PASSED!" << std::endl;
134+
return 0;
135+
}

0 commit comments

Comments
 (0)