Skip to content

Commit 54fa441

Browse files
committed
add wsync test
test: add regression test for WSYNC pipeline synchronization This test validates the warp-draining semantics of the WSYNC instruction, ensuring it correctly stalls the warp until all previously issued instructions are fully committed. Test methodology: 1. Dynamic Baseline Calibration: Measures the inherent cycle overhead of `vx_rdcycle_sync()` with an empty pipeline to prevent false positives caused by measurement latency. 2. Pipeline Clogging: Floods the execution units with 32 long-latency integer divisions (`divu`) to create a severe instruction backlog. 3. Strict Compiler Barriers: Uses `__asm__ volatile("":::"memory")` clobbers to prevent the compiler from hoisting the raw MCYCLE read ahead of the division workload. 4. Race Condition Validation: Captures an unsynchronized raw cycle immediately followed by a synchronized cycle. The test asserts that the cycle gap strictly exceeds the baseline overhead plus a safe latency margin. If WSYNC fails to stall the warp, the gap will fall below the threshold, triggering a failure report on the host.
1 parent bdba9c2 commit 54fa441

4 files changed

Lines changed: 297 additions & 0 deletions

File tree

tests/regression/wsync/Makefile

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
# Build rules for the WSYNC regression test (host driver + device kernel).
ROOT_DIR := $(realpath ../../..)
include $(ROOT_DIR)/config.mk

PROJECT := wsync

# Test sources live under the Vortex source tree, not the build tree.
SRC_DIR := $(VORTEX_HOME)/tests/regression/$(PROJECT)

# Host-side test driver.
SRCS := $(SRC_DIR)/main.cpp

# Device-side kernel compiled to kernel.vxbin.
VX_SRCS := $(SRC_DIR)/kernel.cpp

# Default runtime options: 1024 measurement iterations (-i).
OPTS ?= -i1024

include ../common.mk

tests/regression/wsync/common.h

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
#ifndef _COMMON_H_
#define _COMMON_H_

#include <stdint.h>

// Per-lane measurement record written back to device memory by the kernel
// and downloaded by the host for validation. Field order/layout is shared
// between host and kernel builds — do not reorder.
typedef struct {
  uint32_t failures;        // count of iterations whose drain gap was too small
  uint32_t first_iteration; // iteration index of first failure (valid if failures > 0)
  uint32_t baseline_gap;    // calibrated vx_rdcycle_sync overhead on an empty pipeline
  uint32_t raw_cycle;       // unsynchronized MCYCLE sample at the first failure
  uint32_t sync_cycle;      // synchronized cycle sample at the first failure
  uint32_t gap;             // sync_cycle - raw_cycle at the first failure
  uint32_t checksum;        // XOR accumulator keeping the div workload observable
} lane_result_t;

// Kernel arguments uploaded by the host (address placed in MSCRATCH).
typedef struct {
  uint32_t num_threads;  // number of lanes (hardware warp size)
  uint32_t iterations;   // measurement iterations per lane
  uint64_t results_addr; // device address of the lane_result_t[num_threads] array
} kernel_arg_t;

#endif

tests/regression/wsync/kernel.cpp

Lines changed: 126 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,126 @@
1+
#include <vx_intrinsics.h>
2+
#include <vx_spawn.h>
3+
#include "common.h"
4+
5+
// Minimum extra drain cycles required beyond the calibrated baseline for a
// WSYNC-synchronized read to be considered a genuine pipeline drain.
static constexpr uint32_t kMinDrainExtra = 16;

// 32-bit avalanche finalizer (SplitMix-style xor-shift/multiply rounds):
// spreads every input bit across the whole output word.
static inline uint32_t mix32(uint32_t v) {
  v = (v ^ (v >> 16)) * 0x7feb352du;
  v = (v ^ (v >> 15)) * 0x846ca68bu;
  return v ^ (v >> 16);
}
15+
16+
typedef struct {
17+
uint32_t numerators[8];
18+
uint32_t denominators[8];
19+
} div_inputs_t;
20+
21+
static inline void fill_div_inputs(div_inputs_t* inputs, uint32_t seed) {
22+
inputs->numerators[0] = mix32(seed ^ 0x13579bdfu);
23+
inputs->numerators[1] = mix32(seed ^ 0x2468ace0u);
24+
inputs->numerators[2] = mix32(seed ^ 0xfdb97531u);
25+
inputs->numerators[3] = mix32(seed ^ 0x89abcdefu);
26+
inputs->denominators[0] = mix32(seed ^ 0x31415926u) | 1u;
27+
inputs->denominators[1] = mix32(seed ^ 0x27182818u) | 1u;
28+
inputs->denominators[2] = mix32(seed ^ 0xfeedfaceu) | 1u;
29+
inputs->denominators[3] = mix32(seed ^ 0xc001d00du) | 1u;
30+
31+
seed = mix32(seed ^ 0x9e3779b9u);
32+
inputs->numerators[4] = mix32(seed ^ 0xa5a5a5a5u);
33+
inputs->numerators[5] = mix32(seed ^ 0x5a5a5a5au);
34+
inputs->numerators[6] = mix32(seed ^ 0xdeadc0deu);
35+
inputs->numerators[7] = mix32(seed ^ 0xbaadf00du);
36+
inputs->denominators[4] = mix32(seed ^ 0x01234567u) | 1u;
37+
inputs->denominators[5] = mix32(seed ^ 0x76543210u) | 1u;
38+
inputs->denominators[6] = mix32(seed ^ 0x0f0f0f0fu) | 1u;
39+
inputs->denominators[7] = mix32(seed ^ 0xf0f0f0f0u) | 1u;
40+
}
41+
42+
// Issues eight long-latency unsigned divisions via inline asm so the compiler
// cannot constant-fold, hoist, or eliminate them, keeping the execution units
// busy to build an instruction backlog ahead of the WSYNC measurement.
// Returns the XOR of all quotients so the results stay live.
static inline uint32_t div_batch(const div_inputs_t& inputs) {
  uint32_t r0, r1, r2, r3;

  // "=&r" (early-clobber) keeps each result register distinct from the input
  // operands, so the four divu instructions are serially independent.
  __asm__ volatile(
      "divu %0, %4, %8\n\t"
      "divu %1, %5, %9\n\t"
      "divu %2, %6, %10\n\t"
      "divu %3, %7, %11\n\t"
      : "=&r"(r0), "=&r"(r1), "=&r"(r2), "=&r"(r3)
      : "r"(inputs.numerators[0]), "r"(inputs.numerators[1]),
        "r"(inputs.numerators[2]), "r"(inputs.numerators[3]),
        "r"(inputs.denominators[0]), "r"(inputs.denominators[1]),
        "r"(inputs.denominators[2]), "r"(inputs.denominators[3]));

  // Second batch of four divisions using the re-seeded operand half.
  uint32_t q0, q1, q2, q3;
  __asm__ volatile(
      "divu %0, %4, %8\n\t"
      "divu %1, %5, %9\n\t"
      "divu %2, %6, %10\n\t"
      "divu %3, %7, %11\n\t"
      : "=&r"(q0), "=&r"(q1), "=&r"(q2), "=&r"(q3)
      : "r"(inputs.numerators[4]), "r"(inputs.numerators[5]),
        "r"(inputs.numerators[6]), "r"(inputs.numerators[7]),
        "r"(inputs.denominators[4]), "r"(inputs.denominators[5]),
        "r"(inputs.denominators[6]), "r"(inputs.denominators[7]));

  // Fold all eight quotients so no division is dead code.
  return r0 ^ r1 ^ r2 ^ r3 ^ q0 ^ q1 ^ q2 ^ q3;
}
70+
71+
// Per-lane WSYNC validation kernel.
// 1) Calibrates the inherent overhead of vx_rdcycle_sync() on a drained
//    pipeline (baseline_gap).
// 2) Each iteration floods the pipeline with 32 divu instructions, samples an
//    unsynchronized MCYCLE, then a synchronized cycle; if WSYNC really drains
//    the backlog, (sync - raw) must exceed baseline_gap + kMinDrainExtra.
// Results are written to results[tid] for host-side validation.
// NOTE(review): statement order and the empty-asm "memory" barriers are load-
// bearing — they stop the compiler hoisting the raw MCYCLE read above the
// division workload. Do not reorder.
void kernel_body(kernel_arg_t* __UNIFORM__ arg) {
  uint32_t tid = threadIdx.x;
  uint32_t iterations = arg->iterations;
  auto results = reinterpret_cast<lane_result_t*>(arg->results_addr);
  lane_result_t result = {};

  // Baseline calibration: drain the pipeline first so the measured gap is
  // pure measurement overhead, not leftover backlog.
  vx_wsync();
  uint32_t baseline_raw = csr_read(VX_CSR_MCYCLE);
  __asm__ volatile("" : : "r"(baseline_raw) : "memory");
  uint32_t baseline_sync = static_cast<uint32_t>(vx_rdcycle_sync());
  __asm__ volatile("" : : "r"(baseline_sync) : "memory");
  result.baseline_gap = baseline_sync - baseline_raw;

  for (uint32_t iter = 0; iter < iterations; ++iter) {
    // Per-lane, per-iteration seed so batches differ across lanes/iterations.
    uint32_t seed = mix32((tid + 1) * 0x10001u + iter * 0x45d9f3bu);
    div_inputs_t batch0, batch1, batch2, batch3;
    fill_div_inputs(&batch0, seed);
    fill_div_inputs(&batch1, seed ^ 0x9e3779b9u);
    fill_div_inputs(&batch2, seed ^ 0x85ebca6bu);
    fill_div_inputs(&batch3, seed ^ 0xc2b2ae35u);

    // Queue older long-latency warp instructions, then compare an unsynchronized
    // cycle sample against a synchronized sample that drains the backlog first.
    uint32_t batch_checksum = div_batch(batch0)
        ^ div_batch(batch1)
        ^ div_batch(batch2)
        ^ div_batch(batch3);
    __asm__ volatile("" : : "r"(batch_checksum) : "memory");
    uint32_t raw_cycle = csr_read(VX_CSR_MCYCLE);
    __asm__ volatile("" : : "r"(raw_cycle) : "memory");
    uint32_t sync_cycle = static_cast<uint32_t>(vx_rdcycle_sync());
    // Unsigned subtraction: wraparound of the 32-bit counter still yields the
    // correct small delta.
    uint32_t gap = sync_cycle - raw_cycle;
    __asm__ volatile("" : : "r"(sync_cycle) : "memory");

    // Fold all observed values so nothing in the loop is dead code.
    result.checksum ^= batch_checksum ^ raw_cycle ^ sync_cycle ^ gap;

    // A gap at or below baseline + margin means WSYNC did not stall for the
    // division backlog; record details of the first such iteration only.
    if (gap <= result.baseline_gap + kMinDrainExtra) {
      if (0 == result.failures) {
        result.first_iteration = iter;
        result.raw_cycle = raw_cycle;
        result.sync_cycle = sync_cycle;
        result.gap = gap;
      }
      ++result.failures;
    }
  }

  results[tid] = result;
}
120+
121+
int main() {
122+
auto arg = (kernel_arg_t*)csr_read(VX_CSR_MSCRATCH);
123+
uint32_t grid_dim = 1;
124+
uint32_t block_dim = arg->num_threads;
125+
return vx_spawn_threads(1, &grid_dim, &block_dim, (vx_kernel_func_cb)kernel_body, arg);
126+
}

tests/regression/wsync/main.cpp

Lines changed: 135 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,135 @@
1+
#include <cstdlib>
2+
#include <iostream>
3+
#include <unistd.h>
4+
#include <vortex.h>
5+
#include <vector>
6+
#include "common.h"
7+
8+
// Checks a Vortex runtime call: on a nonzero return, prints the failing
// expression and its status, releases device resources, and exits.
#define RT_CHECK(_expr) \
  do { \
    int _ret = _expr; \
    if (0 == _ret) \
      break; \
    printf("Error: '%s' returned %d!\n", #_expr, (int)_ret); \
    cleanup(); \
    exit(-1); \
  } while (false)

// Command-line configurable settings (see parse_args).
const char* kernel_file = "kernel.vxbin";
uint32_t iterations = 1024;

// Global device handles so cleanup() can release them from any failure point.
vx_device_h device = nullptr;
vx_buffer_h results_buffer = nullptr;
vx_buffer_h krnl_buffer = nullptr;
vx_buffer_h args_buffer = nullptr;
kernel_arg_t kernel_arg = {};
26+
27+
// Prints the test banner and command-line usage.
static void show_usage() {
  std::cout << "Vortex WSYNC Test." << std::endl
            << "Usage: [-i iterations] [-k kernel] [-h help]" << std::endl;
}
31+
32+
static void parse_args(int argc, char** argv) {
33+
int c;
34+
while ((c = getopt(argc, argv, "i:k:h")) != -1) {
35+
switch (c) {
36+
case 'i':
37+
iterations = std::atoi(optarg);
38+
break;
39+
case 'k':
40+
kernel_file = optarg;
41+
break;
42+
case 'h':
43+
show_usage();
44+
exit(0);
45+
break;
46+
default:
47+
show_usage();
48+
exit(-1);
49+
}
50+
}
51+
}
52+
53+
void cleanup() {
54+
if (device) {
55+
vx_mem_free(results_buffer);
56+
vx_mem_free(krnl_buffer);
57+
vx_mem_free(args_buffer);
58+
vx_dev_close(device);
59+
}
60+
}
61+
62+
int main(int argc, char* argv[]) {
63+
parse_args(argc, argv);
64+
65+
if (0 == iterations) {
66+
std::cout << "Error: iterations must be greater than zero" << std::endl;
67+
return -1;
68+
}
69+
70+
std::cout << "open device connection" << std::endl;
71+
RT_CHECK(vx_dev_open(&device));
72+
73+
uint64_t num_threads = 0;
74+
RT_CHECK(vx_dev_caps(device, VX_CAPS_NUM_THREADS, &num_threads));
75+
76+
kernel_arg.num_threads = static_cast<uint32_t>(num_threads);
77+
kernel_arg.iterations = iterations;
78+
79+
std::cout << "warp size: " << kernel_arg.num_threads << std::endl;
80+
std::cout << "iterations: " << kernel_arg.iterations << std::endl;
81+
82+
std::cout << "allocate device memory" << std::endl;
83+
uint32_t results_size = kernel_arg.num_threads * sizeof(lane_result_t);
84+
RT_CHECK(vx_mem_alloc(device, results_size, VX_MEM_READ_WRITE, &results_buffer));
85+
RT_CHECK(vx_mem_address(results_buffer, &kernel_arg.results_addr));
86+
87+
std::vector<lane_result_t> results(kernel_arg.num_threads);
88+
89+
RT_CHECK(vx_copy_to_dev(results_buffer, results.data(), 0, results_size));
90+
91+
std::cout << "upload kernel" << std::endl;
92+
RT_CHECK(vx_upload_kernel_file(device, kernel_file, &krnl_buffer));
93+
94+
std::cout << "upload args" << std::endl;
95+
RT_CHECK(vx_upload_bytes(device, &kernel_arg, sizeof(kernel_arg_t), &args_buffer));
96+
97+
std::cout << "start device" << std::endl;
98+
RT_CHECK(vx_start(device, krnl_buffer, args_buffer));
99+
100+
std::cout << "wait for completion" << std::endl;
101+
RT_CHECK(vx_ready_wait(device, VX_MAX_TIMEOUT));
102+
103+
std::cout << "download results" << std::endl;
104+
RT_CHECK(vx_copy_from_dev(results.data(), results_buffer, 0, results_size));
105+
106+
std::cout << "cleanup" << std::endl;
107+
cleanup();
108+
109+
uint32_t errors = 0;
110+
for (uint32_t lane = 0; lane < kernel_arg.num_threads; ++lane) {
111+
auto& result = results[lane];
112+
if (0 == result.failures) {
113+
continue;
114+
}
115+
116+
if (0 == errors) {
117+
std::cout << "first failure: lane=" << lane
118+
<< ", iteration=" << result.first_iteration
119+
<< ", baseline_gap=" << result.baseline_gap
120+
<< ", raw_cycle=" << result.raw_cycle
121+
<< ", sync_cycle=" << result.sync_cycle
122+
<< ", gap=" << result.gap << std::endl;
123+
}
124+
errors += result.failures;
125+
}
126+
127+
if (0 != errors) {
128+
std::cout << "WSYNC timing mismatches: " << errors << std::endl;
129+
std::cout << "FAILED!" << std::endl;
130+
return errors;
131+
}
132+
133+
std::cout << "PASSED!" << std::endl;
134+
return 0;
135+
}

0 commit comments

Comments
 (0)