Skip to content

Commit 453d1ec

Browse files
author
Randy L
committed
vecadd working:
fix bugs from merging kmu onto bug fixes
1 parent ac6ce45 commit 453d1ec

8 files changed

Lines changed: 49 additions & 26 deletions

File tree

hw/VX_config.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,7 @@ VM_ENABLED = "expr: 1 if $VM_ENABLE else 0"
3232
EXT_D_ENABLE = "expr: $XLEN_64"
3333
FLEN = "expr: 64 if $EXT_D_ENABLE else 32"
3434

35-
KMU_ENABLE = false
35+
KMU_ENABLE = true
3636
KMU_ENABLED = "expr: 1 if $KMU_ENABLE else 0"
3737

3838
# extensions

kernel/include/vx_spawn.h

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
#define __VX_SPAWN_H__
1616

1717
#include <vx_intrinsics.h>
18+
#include <vx_print.h>
1819
#include <stdint.h>
1920

2021
#ifdef __cplusplus
@@ -30,10 +31,7 @@ typedef union {
3031
uint32_t m[3];
3132
} dim3_t;
3233

33-
extern __thread dim3_t blockIdx;
34-
extern __thread dim3_t threadIdx;
3534
extern dim3_t gridDim;
36-
extern dim3_t blockDim;
3735

3836
extern __thread uint32_t __local_group_id;
3937
extern uint32_t __warps_per_group;

kernel/src/vx_start.S

Lines changed: 6 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -23,22 +23,8 @@ _start:
2323

2424
#ifdef KMU_ENABLE
2525

26-
# set global pointer register
27-
la gp, __global_pointer
28-
29-
# set stack pointer register
30-
LOAD_IMMEDIATE64(sp, STACK_BASE_ADDR)
31-
csrr t0, VX_CSR_MHARTID
32-
sll t1, t0, STACK_LOG2_SIZE
33-
sub sp, sp, t1
34-
35-
# set thread pointer register
36-
# use address space after BSS region
37-
la t1, __tbss_size
38-
mul t0, t0, t1
39-
la tp, _end
40-
add tp, tp, t0
41-
ret
26+
# initialize per-thread registers
27+
jal init_regs
4228

4329
# initialize TLS for all warps
4430
call __init_tls
@@ -53,12 +39,8 @@ _start:
5339
#endif
5440

5541
# call main program routine
56-
csrr a0, VX_CSR_MSCRATCH
5742
call main
5843

59-
# Moving to single thread
60-
# vx_tmc(0 == vx_warp_id());
61-
6244
# call exit routine
6345
tail exit
6446
.size _start, .-_start
@@ -108,6 +90,8 @@ _start:
10890
tail exit
10991
.size _start, .-_start
11092

93+
#endif
94+
11195
.section .text
11296
.type init_regs, @function
11397
.local init_regs
@@ -132,6 +116,8 @@ init_regs:
132116
add tp, tp, t0
133117
ret
134118

119+
#ifndef KMU_ENABLE
120+
135121
.section .text
136122
.type init_regs_all, @function
137123
.local init_regs_all

riscv-openocd

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
Subproject commit eb01c632a4bb1c07d2bddb008d6987c809f1c496

runtime/rtlsim/vortex.cpp

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020
#include <stdint.h>
2121
#include <stdio.h>
2222
#include <stdlib.h>
23+
#include <string.h>
2324
#include <assert.h>
2425
#include <iostream>
2526
#include <future>
@@ -211,6 +212,27 @@ class vx_device {
211212
this->dcr_write(VX_DCR_BASE_STARTUP_ARG0, args_addr & 0xffffffff);
212213
this->dcr_write(VX_DCR_BASE_STARTUP_ARG1, args_addr >> 32);
213214

215+
// read block and grid dimensions from kernel arguments
216+
uint32_t block_dim[3] = {1, 1, 1};
217+
uint32_t grid_dim[3] = {1, 1, 1};
218+
if (args_addr != 0) {
219+
// Read first 24 bytes of kernel arguments (block_dim[3] + grid_dim[3])
220+
struct {
221+
uint32_t block_dim[3];
222+
uint32_t grid_dim[3];
223+
} args_dims;
224+
this->download(&args_dims, args_addr, sizeof(args_dims));
225+
memcpy(block_dim, args_dims.block_dim, sizeof(block_dim));
226+
memcpy(grid_dim, args_dims.grid_dim, sizeof(grid_dim));
227+
}
228+
229+
this->dcr_write(VX_DCR_BASE_GRID_DIM0, grid_dim[0]);
230+
this->dcr_write(VX_DCR_BASE_GRID_DIM1, grid_dim[1]);
231+
this->dcr_write(VX_DCR_BASE_GRID_DIM2, grid_dim[2]);
232+
this->dcr_write(VX_DCR_BASE_BLOCK_DIM0, block_dim[0]);
233+
this->dcr_write(VX_DCR_BASE_BLOCK_DIM1, block_dim[1]);
234+
this->dcr_write(VX_DCR_BASE_BLOCK_DIM2, block_dim[2]);
235+
214236
// start new run
215237
future_ = std::async(std::launch::async, [&]{
216238
processor_.run();

tests/regression/vecadd/common.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,8 @@
66
#endif
77

88
typedef struct {
9+
uint32_t block_dim[3];
10+
uint32_t grid_dim[3];
911
uint32_t num_points;
1012
uint64_t src0_addr;
1113
uint64_t src1_addr;

tests/regression/vecadd/kernel.cpp

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -42,10 +42,13 @@ int main() {
4242
// Calculate global thread ID
4343
// threadIdx.x gives the flat thread index within the CTA (warp_local_id * NUM_THREADS + thread_id)
4444
// globalId = blockIdx.x * blockDim.x + threadIdx.x
45-
uint32_t globalId = blockIdx.x * blockDim.x + threadIdx.x;
45+
uint32_t bidx = blockIdx.x;
46+
uint32_t tidx = threadIdx.x;
47+
uint32_t bdim = blockDim.x;
48+
uint32_t globalId = bidx * bdim + tidx;
4649

4750
vx_printf("block id x: %d, threadIdx.x: %d, global id: %d\n",
48-
blockIdx.x, threadIdx.x, globalId);
51+
bidx, tidx, globalId);
4952

5053
auto src0_ptr = reinterpret_cast<TYPE*>(arg->src0_addr);
5154
auto src1_ptr = reinterpret_cast<TYPE*>(arg->src1_addr);

tests/regression/vecadd/main.cpp

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -134,6 +134,17 @@ int main(int argc, char *argv[]) {
134134
std::cout << "data type: " << Comparator<TYPE>::type_str() << std::endl;
135135
std::cout << "buffer size: " << buf_size << " bytes" << std::endl;
136136

137+
// Set block dimensions to 8 (warp width)
138+
const uint32_t block_size = 16;
139+
kernel_arg.block_dim[0] = block_size;
140+
kernel_arg.block_dim[1] = 1;
141+
kernel_arg.block_dim[2] = 1;
142+
143+
// Calculate grid dimensions based on number of points
144+
kernel_arg.grid_dim[0] = (num_points + block_size - 1) / block_size; // Round up
145+
kernel_arg.grid_dim[1] = 1;
146+
kernel_arg.grid_dim[2] = 1;
147+
137148
kernel_arg.num_points = num_points;
138149

139150
// allocate device memory

0 commit comments

Comments
 (0)