From 0f59e2fe03dee808506c41100f2c7198babe22d9 Mon Sep 17 00:00:00 2001 From: Max Wipfli Date: Sat, 28 Feb 2026 21:52:23 +0100 Subject: [PATCH 01/11] [LSQ] Add fallback logic to always issue oldest load or store --- .../vhdl_gen/generators/lsq.py | 93 +++++++++++++++++-- 1 file changed, 85 insertions(+), 8 deletions(-) diff --git a/tools/backend/lsq-generator-python/vhdl_gen/generators/lsq.py b/tools/backend/lsq-generator-python/vhdl_gen/generators/lsq.py index a1f5eddd94..ff806ef93b 100644 --- a/tools/backend/lsq-generator-python/vhdl_gen/generators/lsq.py +++ b/tools/backend/lsq-generator-python/vhdl_gen/generators/lsq.py @@ -581,9 +581,22 @@ def generate(self, lsq_submodules, path_rtl) -> None: # Multiple store channels not yet implemented assert (self.configs.numStMem == 1) + # current store request index store_idx = LogicVec(ctx, 'store_idx', 'w', self.configs.stqAddrW) + # whether the current store request is valid, including address and data + store_req_valid = Logic(ctx, 'store_req_valid', 'w') + # whether the current store request has conflicts with any previous loads + store_conflict = Logic(ctx, 'store_conflict', 'w') + # whether the to-be-issued store entry is older than each of the load entries + store_is_older_arr = LogicArray(ctx, 'store_is_older_arr', 'w', self.configs.numLdqEntries) + # store request enable (after fallback logic) store_en = Logic(ctx, 'store_en', 'w') + # Fallback load/store signals + fallback_load_is_oldest = Logic(ctx, 'fallback_load_is_oldest', 'w') + fallback_load_idx_oh = LogicVec(ctx, 'fallback_load_idx_oh', 'w', self.configs.numLdqEntries) + fallback_load_en = Logic(ctx, 'fallback_load_en', 'w') + # Matrix Generation ld_st_conflict = LogicVecArray( ctx, 'ld_st_conflict', 'w', self.configs.numLdqEntries, self.configs.numStqEntries) @@ -711,11 +724,14 @@ def generate(self, lsq_submodules, path_rtl) -> None: ctx, 'load_conflict', 'w', self.configs.numLdqEntries) load_req_valid = LogicArray( ctx, 'load_req_valid', 'w', 
self.configs.numLdqEntries) + load_req_valid_p0 = LogicArray( + ctx, 'load_req_valid_p0', pipe0_type, self.configs.numLdqEntries) can_load = LogicArray( ctx, 'can_load', 'w', self.configs.numLdqEntries) can_load_p0 = LogicArray( ctx, 'can_load_p0', pipe0_type, self.configs.numLdqEntries) if self.configs.pipe0: + load_req_valid_p0.regInit(init=[0]*self.configs.numLdqEntries) can_load_p0.regInit(init=[0]*self.configs.numLdqEntries) # The load conflicts with any store @@ -727,6 +743,8 @@ def generate(self, lsq_submodules, path_rtl) -> None: for i in range(0, self.configs.numLdqEntries): arch += Op(ctx, load_req_valid[i], ldq_alloc_pcomp[i], 'and', ldq_addr_valid_pcomp[i]) + for i in range(0, self.configs.numLdqEntries): + arch += Op(ctx, load_req_valid_p0[i], load_req_valid[i]) # Generate list for loads that does not face dependency issue for i in range(0, self.configs.numLdqEntries): arch += Op(ctx, can_load_p0[i], 'not', @@ -743,10 +761,15 @@ def generate(self, lsq_submodules, path_rtl) -> None: can_load_list = [] can_load_list.append(can_load) - for w in range(0, self.configs.numLdMem): + + # temporary (pre-fallback) signals + load_idx_tmp_oh = LogicVecArray(ctx, 'load_idx_tmp_oh', 'w', self.configs.numLdMem, self.configs.numLdqEntries) + load_en_tmp = LogicArray(ctx, 'load_en_tmp', 'w', self.configs.numLdMem) + + for w in range(self.configs.numLdMem): arch += CyclicPriorityMasking( - ctx, load_idx_oh[w], can_load_list[w], ldq_head_oh_p0) - arch += Reduce(ctx, load_en[w], can_load_list[w], 'or') + ctx, load_idx_tmp_oh[w], can_load_list[w], ldq_head_oh_p0) + arch += Reduce(ctx, load_en_tmp[w], can_load_list[w], 'or') if (w+1 != self.configs.numLdMem): load_idx_oh_LogicArray = LogicArray( ctx, f'load_idx_oh_Array_{w+1}', 'w', self.configs.numLdqEntries) @@ -758,6 +781,40 @@ def generate(self, lsq_submodules, path_rtl) -> None: arch += Op(ctx, can_load_list[w+1][i], 'not', load_idx_oh_LogicArray[i], 'and', can_load_list[w][i]) + for w in range(self.configs.numLdMem): 
+ if w != self.configs.numLdMem - 1: + # non-last load port: use _tmp signals directly + arch += Op(ctx, load_idx_oh[w], load_idx_tmp_oh[w]) + arch += Op(ctx, load_en[w], load_en_tmp[w]) + else: + # last load port: use fallback load (if any) as the first priority, then service + # other loads (from _tmp) + arch += Op(ctx, load_idx_oh[w], fallback_load_idx_oh, 'when', fallback_load_en, 'else', load_idx_tmp_oh[w]) + arch += Op(ctx, load_en[w], fallback_load_en, 'or', load_en_tmp[w]) + + # Fallback Load / Store + + # Find the oldest unissue load. This may still (a) be younger than the oldest store, (b) be + # not allocated, or (c) have an invalid address. + ldq_issue_not = LogicArray(ctx, 'ldq_issue_not', 'w', self.configs.numLdqEntries) + oldest_unissued_load_oh = LogicArray(ctx, 'oldest_unissued_load_oh', 'w', self.configs.numLdqEntries) + for i in range(0, self.configs.numLdqEntries): + arch += Op(ctx, ldq_issue_not[i], 'not', ldq_issue[i]) + arch += CyclicPriorityMasking(ctx, oldest_unissued_load_oh, ldq_issue_not, ldq_head_oh_p0) + + # If the fallback load is older than the fallback store, this contains a single bit set (at the fallback load entry). + # Otherwise (fallback store is oldest), this is all zeros. + fallback_load_is_oldest_oh = LogicArray(ctx, 'fallback_load_is_oldest_oh', 'w', self.configs.numLdqEntries) + for i in range(self.configs.numLdqEntries): + arch += Op(ctx, fallback_load_is_oldest_oh[i], 'not', store_is_older_arr[i], 'and', oldest_unissued_load_oh[i]) + # Whether the fallback load is the oldest. 
+ arch += Reduce(ctx, fallback_load_is_oldest, fallback_load_is_oldest_oh, 'or') + + # fallback load: needs to be the oldest AND actually ready to be issued (allocated and address valid) + for i in range(self.configs.numLdqEntries): + arch += Op(ctx, (fallback_load_idx_oh, i), load_req_valid_p0[i], 'and', fallback_load_is_oldest_oh[i]) + arch += Reduce(ctx, fallback_load_en, fallback_load_idx_oh, 'or') + # Store # When pipelining (pipe0) is enabled, this uses look-ahead to the next store entry to reduce the critical path. # Both the current and next stores are checked for validity and conflicts, and the result is multiplexed "late @@ -771,9 +828,11 @@ def generate(self, lsq_submodules, path_rtl) -> None: store_conflict = Logic(ctx, 'store_conflict', 'w') store_req_valid_p0 = Logic(ctx, 'store_req_valid_p0', pipe0_type) + store_is_older_arr_p0 = LogicArray(ctx, 'store_is_older_arr_p0', pipe0_type, self.configs.numLdqEntries) st_ld_conflict_p0 = LogicVec(ctx, 'st_ld_conflict_p0', pipe0_type, self.configs.numLdqEntries) if self.configs.pipe0: store_req_valid_p0.regInit(init=0) + store_is_older_arr_p0.regInit() st_ld_conflict_p0.regInit() # next issue pointer (needed for look-ahead when pipelining is enabled) @@ -783,10 +842,12 @@ def generate(self, lsq_submodules, path_rtl) -> None: # checks for current and next (if needed) store entry store_req_valid_curr = Logic(ctx, 'store_req_valid_curr', 'w') + store_is_older_arr_curr = LogicArray(ctx, 'store_is_older_arr_curr', 'w', self.configs.numLdqEntries) st_ld_conflict_curr = LogicVec(ctx, 'st_ld_conflict_curr', 'w', self.configs.numLdqEntries) if self.configs.pipe0: # with pipelining: also compute for the next entry store_req_valid_next = Logic(ctx, 'store_req_valid_next', 'w') + store_is_older_arr_next = LogicArray(ctx, 'store_is_older_arr_next', 'w', self.configs.numLdqEntries) st_ld_conflict_next = LogicVec(ctx, 'st_ld_conflict_next', 'w', self.configs.numLdqEntries) # validity lookup @@ -795,6 +856,14 @@ def 
generate(self, lsq_submodules, path_rtl) -> None: # with pipelining: also compute for the next entry arch += MuxLookUp(ctx, store_req_valid_next, store_req_valid_arr, stq_issue_next) + # extract column from order matrix + for i in range(self.configs.numLdqEntries): + arch += Op(ctx, store_is_older_arr_curr[i], MuxIndex(store_is_older_pcomp[i], stq_issue)) + if self.configs.pipe0: + for i in range(self.configs.numLdqEntries): + # with pipelining: also compute for the next entry + arch += Op(ctx, store_is_older_arr_next[i], MuxIndex(store_is_older_pcomp[i], stq_issue_next)) + # A store conflicts with a load when: # 1. The load entry is valid, and # 2. The load entry hasn't completed (received data from memory), and @@ -806,8 +875,7 @@ def generate(self, lsq_submodules, path_rtl) -> None: (st_ld_conflict_curr, i), (ldq_alloc_pcomp, i), 'and', 'not', (load_completed, i), 'and', - 'not', MuxIndex( - store_is_older_pcomp[i], stq_issue), 'and', + 'not', store_is_older_arr_curr[i], 'and', '(', MuxIndex( addr_same_pcomp[i], stq_issue), 'or', 'not', (ldq_addr_valid_pcomp, i), ')' ) @@ -818,8 +886,7 @@ def generate(self, lsq_submodules, path_rtl) -> None: (st_ld_conflict_next, i), (ldq_alloc_pcomp, i), 'and', 'not', (load_completed, i), 'and', - 'not', MuxIndex( - store_is_older_pcomp[i], stq_issue_next), 'and', + 'not', store_is_older_arr_next[i], 'and', '(', MuxIndex( addr_same_pcomp[i], stq_issue_next), 'or', 'not', (ldq_addr_valid_pcomp, i), ')' ) @@ -831,15 +898,25 @@ def generate(self, lsq_submodules, path_rtl) -> None: 'when', stq_issue_en, 'else', st_ld_conflict_curr) arch += Op(ctx, store_req_valid_p0, store_req_valid_next, 'when', stq_issue_en, 'else', store_req_valid_curr) + for i in range(self.configs.numLdqEntries): + arch += Op(ctx, store_is_older_arr_p0[i], store_is_older_arr_next[i], 'when', + stq_issue_en, 'else', store_is_older_arr_curr[i]) else: # without pipelining: only consider current store entry arch += Op(ctx, st_ld_conflict_p0, st_ld_conflict_curr) 
arch += Op(ctx, store_req_valid_p0, store_req_valid_curr) + for i in range(self.configs.numLdqEntries): + arch += Op(ctx, store_is_older_arr_p0[i], store_is_older_arr_curr[i]) # The store conflicts with any load arch += Reduce(ctx, store_conflict, st_ld_conflict_p0, 'or') - arch += Op(ctx, store_en, 'not', store_conflict, 'and', store_req_valid_p0) + # The store can be issued when it is valid AND (no conflict OR it is older than the fallback load). + arch += Op(ctx, store_en, store_req_valid_p0, 'and', '(', 'not', store_conflict, 'or', 'not', fallback_load_is_oldest, ')') arch += Op(ctx, store_idx, stq_issue) + # needed for fallback logic + # FIXME: conditionally enable based on fallback issue flag + for i in range(self.configs.numLdqEntries): + arch += Op(ctx, store_is_older_arr[i], store_is_older_arr_p0[i]) # Bypass bypass_idx_oh_p0 = LogicVecArray( From 4791aebb429d7a86e482d59da04343e133284bd0 Mon Sep 17 00:00:00 2001 From: Max Wipfli Date: Mon, 2 Mar 2026 14:11:57 +0100 Subject: [PATCH 02/11] [LSQ] Add reset to some registers --- tools/backend/lsq-generator-python/vhdl_gen/generators/lsq.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/backend/lsq-generator-python/vhdl_gen/generators/lsq.py b/tools/backend/lsq-generator-python/vhdl_gen/generators/lsq.py index ff806ef93b..f8a88f3e6b 100644 --- a/tools/backend/lsq-generator-python/vhdl_gen/generators/lsq.py +++ b/tools/backend/lsq-generator-python/vhdl_gen/generators/lsq.py @@ -482,7 +482,7 @@ def generate(self, lsq_submodules, path_rtl) -> None: # Load Queue Entries ldq_alloc.regInit(init=[0]*self.configs.numLdqEntries) - ldq_issue.regInit() + ldq_issue.regInit(init=[0]*self.configs.numLdqEntries) if (self.configs.ldpAddrW > 0): ldq_port_idx.regInit(ldq_wen) ldq_addr_valid.regInit() @@ -756,7 +756,7 @@ def generate(self, lsq_submodules, path_rtl) -> None: ldq_head_oh_p0 = LogicVec( ctx, 'ldq_head_oh_p0', pipe0_type, self.configs.numLdqEntries) if self.configs.pipe0: - 
ldq_head_oh_p0.regInit() + ldq_head_oh_p0.regInit(init=0) arch += Op(ctx, ldq_head_oh_p0, ldq_head_oh) can_load_list = [] From ea2be9933ad51e463ba7669d5e074355e97d4029 Mon Sep 17 00:00:00 2001 From: Max Wipfli Date: Mon, 2 Mar 2026 14:23:15 +0100 Subject: [PATCH 03/11] [integration-test] Add simple memory dependency test --- .../test_memory_deps/test_memory_deps.c | 32 +++++++++++++++++++ .../test_memory_deps/test_memory_deps.h | 9 ++++++ tools/integration/TEST_SUITE.cpp | 3 +- 3 files changed, 43 insertions(+), 1 deletion(-) create mode 100644 integration-test/test_memory_deps/test_memory_deps.c create mode 100644 integration-test/test_memory_deps/test_memory_deps.h diff --git a/integration-test/test_memory_deps/test_memory_deps.c b/integration-test/test_memory_deps/test_memory_deps.c new file mode 100644 index 0000000000..2b2a64118e --- /dev/null +++ b/integration-test/test_memory_deps/test_memory_deps.c @@ -0,0 +1,32 @@ +//===- test_memory_deps.c -----------------------------------------*- C -*-===// + +#include "test_memory_deps.h" +#include "dynamatic/Integration.h" +#include <stdlib.h> + +void test_memory_deps(in_int_t load_addrs[1000], in_int_t store_addrs[1000], inout_int_t data[1000], in_int_t n) { + int sum = 0; + for (int i = 0; i < n; ++i) { + sum += data[load_addrs[i]]; + data[store_addrs[i]] = i; + } + data[0] = sum; +} + +int main(void) { + in_int_t load_addrs[1000]; + in_int_t store_addrs[1000]; + inout_int_t data[1000]; + + in_int_t n = 1000; + for (int i = 0; i < n; ++i) { + // addresses alternate randomly between 1 and 2, creating RAW and WAR hazards + load_addrs[i] = (rand() % 4) + 1; + load_addrs[i] = (rand() % 4) + 1; + store_addrs[i] = (i == 0) ?
1 : 2; + data[i] = i; + } + + CALL_KERNEL(test_memory_deps, load_addrs, store_addrs, data, n); + return 0; +} diff --git a/integration-test/test_memory_deps/test_memory_deps.h b/integration-test/test_memory_deps/test_memory_deps.h new file mode 100644 index 0000000000..74a5719499 --- /dev/null +++ b/integration-test/test_memory_deps/test_memory_deps.h @@ -0,0 +1,9 @@ +#ifndef TEST_MEMORY_DEPS_TEST_MEMORY_DEPS_H +#define TEST_MEMORY_DEPS_TEST_MEMORY_DEPS_H + +typedef int in_int_t; +typedef int inout_int_t; + +void test_memory_deps(in_int_t load_addrs[1000], in_int_t store_addrs[1000], inout_int_t data[1000], in_int_t n); + +#endif // TEST_MEMORY_DEPS_TEST_MEMORY_DEPS_H diff --git a/tools/integration/TEST_SUITE.cpp b/tools/integration/TEST_SUITE.cpp index f58c5c9595..9493f29c46 100644 --- a/tools/integration/TEST_SUITE.cpp +++ b/tools/integration/TEST_SUITE.cpp @@ -284,6 +284,7 @@ INSTANTIATE_TEST_SUITE_P( "sumi3_mem", "symm_float", "syr2k_float", + "test_memory_deps", "test_stdint", "threshold", "triangular", @@ -329,7 +330,7 @@ INSTANTIATE_TEST_SUITE_P( "matvec" ), [](const auto &info) { return info.param; }); -#endif +#endif INSTANTIATE_TEST_SUITE_P( MemoryBenchmarks, MemoryFixture, From 17b0ebb67fe274c22e7cb9c389279dc08438b364 Mon Sep 17 00:00:00 2001 From: Max Wipfli Date: Mon, 2 Mar 2026 14:24:37 +0100 Subject: [PATCH 04/11] [LSQ] Wait for outstanding stores (loads) for fallback loads (stores) --- .../vhdl_gen/generators/lsq.py | 67 ++++++++++++++----- 1 file changed, 49 insertions(+), 18 deletions(-) diff --git a/tools/backend/lsq-generator-python/vhdl_gen/generators/lsq.py b/tools/backend/lsq-generator-python/vhdl_gen/generators/lsq.py index f8a88f3e6b..bec1671f20 100644 --- a/tools/backend/lsq-generator-python/vhdl_gen/generators/lsq.py +++ b/tools/backend/lsq-generator-python/vhdl_gen/generators/lsq.py @@ -593,9 +593,9 @@ def generate(self, lsq_submodules, path_rtl) -> None: store_en = Logic(ctx, 'store_en', 'w') # Fallback load/store signals - 
fallback_load_is_oldest = Logic(ctx, 'fallback_load_is_oldest', 'w') fallback_load_idx_oh = LogicVec(ctx, 'fallback_load_idx_oh', 'w', self.configs.numLdqEntries) fallback_load_en = Logic(ctx, 'fallback_load_en', 'w') + fallback_store_en_if_valid = Logic(ctx, 'fallback_store_en_if_valid', 'w') # Matrix Generation ld_st_conflict = LogicVecArray( @@ -720,20 +720,28 @@ def generate(self, lsq_submodules, path_rtl) -> None: # Load + ldq_alloc_p0 = LogicArray( + ctx, 'ldq_alloc_p0', pipe0_type, self.configs.numLdqEntries) + ldq_addr_valid_p0 = LogicArray( + ctx, 'ldq_addr_valid_p0', pipe0_type, self.configs.numLdqEntries) load_conflict = LogicArray( ctx, 'load_conflict', 'w', self.configs.numLdqEntries) load_req_valid = LogicArray( ctx, 'load_req_valid', 'w', self.configs.numLdqEntries) - load_req_valid_p0 = LogicArray( - ctx, 'load_req_valid_p0', pipe0_type, self.configs.numLdqEntries) can_load = LogicArray( ctx, 'can_load', 'w', self.configs.numLdqEntries) can_load_p0 = LogicArray( ctx, 'can_load_p0', pipe0_type, self.configs.numLdqEntries) if self.configs.pipe0: - load_req_valid_p0.regInit(init=[0]*self.configs.numLdqEntries) + ldq_alloc_p0.regInit(init=[0]*self.configs.numLdqEntries) + ldq_addr_valid_p0.regInit(init=[0]*self.configs.numLdqEntries) can_load_p0.regInit(init=[0]*self.configs.numLdqEntries) + # Pipeline + for i in range(0, self.configs.numLdqEntries): + arch += Op(ctx, ldq_alloc_p0[i], ldq_alloc_pcomp[i]) + for i in range(0, self.configs.numLdqEntries): + arch += Op(ctx, ldq_addr_valid_p0[i], ldq_addr_valid_pcomp[i]) # The load conflicts with any store for i in range(0, self.configs.numLdqEntries): arch += Reduce(ctx, @@ -743,8 +751,6 @@ def generate(self, lsq_submodules, path_rtl) -> None: for i in range(0, self.configs.numLdqEntries): arch += Op(ctx, load_req_valid[i], ldq_alloc_pcomp[i], 'and', ldq_addr_valid_pcomp[i]) - for i in range(0, self.configs.numLdqEntries): - arch += Op(ctx, load_req_valid_p0[i], load_req_valid[i]) # Generate list for loads 
that does not face dependency issue for i in range(0, self.configs.numLdqEntries): arch += Op(ctx, can_load_p0[i], 'not', @@ -794,27 +800,52 @@ def generate(self, lsq_submodules, path_rtl) -> None: # Fallback Load / Store - # Find the oldest unissue load. This may still (a) be younger than the oldest store, (b) be - # not allocated, or (c) have an invalid address. - ldq_issue_not = LogicArray(ctx, 'ldq_issue_not', 'w', self.configs.numLdqEntries) - oldest_unissued_load_oh = LogicArray(ctx, 'oldest_unissued_load_oh', 'w', self.configs.numLdqEntries) + # The fallback load candidate is the oldest allocated and un-issued load. + ldq_alloc_no_issue = LogicArray(ctx, 'ldq_alloc_no_issue', 'w', self.configs.numLdqEntries) + fallback_load_candidate_oh = LogicArray(ctx, 'fallback_load_candidate_oh', 'w', self.configs.numLdqEntries) for i in range(0, self.configs.numLdqEntries): - arch += Op(ctx, ldq_issue_not[i], 'not', ldq_issue[i]) - arch += CyclicPriorityMasking(ctx, oldest_unissued_load_oh, ldq_issue_not, ldq_head_oh_p0) + arch += Op(ctx, ldq_alloc_no_issue[i], ldq_alloc_p0[i], 'and', 'not', ldq_issue[i]) + arch += CyclicPriorityMasking(ctx, fallback_load_candidate_oh, ldq_alloc_no_issue, ldq_head_oh_p0) + + # The fallback store canddiate is the oldest allocated and un-issued store. This is simply + # the store at the store queue issue pointer (if allocated). We do not explicitly track the + # fallback store candidate, but rather just keep the relevant row/column from the order + # matrix. - # If the fallback load is older than the fallback store, this contains a single bit set (at the fallback load entry). - # Otherwise (fallback store is oldest), this is all zeros. + # If the oldest load is older than the oldest store, this contains a single bit set (at the oldest load entry). + # Otherwise (oldest store is oldest overall), this is all zeros. 
fallback_load_is_oldest_oh = LogicArray(ctx, 'fallback_load_is_oldest_oh', 'w', self.configs.numLdqEntries) for i in range(self.configs.numLdqEntries): - arch += Op(ctx, fallback_load_is_oldest_oh[i], 'not', store_is_older_arr[i], 'and', oldest_unissued_load_oh[i]) + arch += Op(ctx, fallback_load_is_oldest_oh[i], fallback_load_candidate_oh[i], 'and', 'not', store_is_older_arr[i]) + # Whether the fallback load is the oldest. + fallback_load_is_oldest = Logic(ctx, 'fallback_load_is_oldest', 'w') arch += Reduce(ctx, fallback_load_is_oldest, fallback_load_is_oldest_oh, 'or') - # fallback load: needs to be the oldest AND actually ready to be issued (allocated and address valid) + # FIXME: What if we are about to issue a store? We shouldn't issue a fallback load in the same cycle. + store_outstanding = Logic(ctx, 'store_outstanding', 'w') + arch += Op(ctx, store_outstanding, "'1'", 'when', '(', stq_issue, '/=', stq_resp, ')', 'else ', "'0'") + + # FIXME: What if we are about to issue a load? We shouldn't issue a fallback store in the same cycle. + load_oustanding_arr = LogicArray(ctx, 'load_outstanding_arr', 'w', self.configs.numLdqEntries) + load_outstanding = Logic(ctx, 'load_outstanding', 'w') for i in range(self.configs.numLdqEntries): - arch += Op(ctx, (fallback_load_idx_oh, i), load_req_valid_p0[i], 'and', fallback_load_is_oldest_oh[i]) + arch += Op(ctx, load_oustanding_arr[i], "'1'", 'when', '(', ldq_issue[i], 'and', 'not', ldq_data_valid[i], ')', 'else', "'0'") + arch += Reduce(ctx, load_outstanding, load_oustanding_arr, 'or') + + # We can issue the fallback load candidate if: + # - It is older than the oldest store (implicit in fallback_load_is_oldest_oh[]). + # - It has a valid address. + # - There are no outstanding stores. 
+ for i in range(self.configs.numLdqEntries): + arch += Op(ctx, (fallback_load_idx_oh, i), fallback_load_is_oldest_oh[i], 'and', ldq_addr_valid_p0[i], 'and', 'not', store_outstanding) arch += Reduce(ctx, fallback_load_en, fallback_load_idx_oh, 'or') + # We can issue the fallback store candidate (if it is valid) if: + # - It is older than the oldest store (NOT fallback_load_is_oldest). + # - There are no outstanding loads. + arch += Op(ctx, fallback_store_en_if_valid, 'not', fallback_load_is_oldest, 'and', 'not', load_outstanding) + # Store # When pipelining (pipe0) is enabled, this uses look-ahead to the next store entry to reduce the critical path. # Both the current and next stores are checked for validity and conflicts, and the result is multiplexed "late @@ -911,7 +942,7 @@ def generate(self, lsq_submodules, path_rtl) -> None: # The store conflicts with any load arch += Reduce(ctx, store_conflict, st_ld_conflict_p0, 'or') # The store can be issued when it is valid AND (no conflict OR it is older than the fallback load). 
- arch += Op(ctx, store_en, store_req_valid_p0, 'and', '(', 'not', store_conflict, 'or', 'not', fallback_load_is_oldest, ')') + arch += Op(ctx, store_en, store_req_valid_p0, 'and', '(', 'not', store_conflict, 'or', fallback_store_en_if_valid, ')') arch += Op(ctx, store_idx, stq_issue) # needed for fallback logic # FIXME: conditionally enable based on fallback issue flag From 36887c218b749c9de90f9c91860ae09dc3e8de3d Mon Sep 17 00:00:00 2001 From: Max Wipfli Date: Tue, 3 Mar 2026 13:03:55 +0100 Subject: [PATCH 05/11] [LSQ] Use flag to enable/disable fallback issue --- .../lsq-generator-python/vhdl_gen/configs.py | 13 ++ .../vhdl_gen/generators/lsq.py | 135 ++++++++++-------- 2 files changed, 85 insertions(+), 63 deletions(-) diff --git a/tools/backend/lsq-generator-python/vhdl_gen/configs.py b/tools/backend/lsq-generator-python/vhdl_gen/configs.py index a83ae2de0d..fc873d4c29 100644 --- a/tools/backend/lsq-generator-python/vhdl_gen/configs.py +++ b/tools/backend/lsq-generator-python/vhdl_gen/configs.py @@ -65,6 +65,9 @@ class Configs: stResp: bool = False # Whether store response channel in store access port is enabled gaMulti: bool = False # Whether multiple groups are allowed to request an allocation at the same cycle bypass: bool = True # Whether bypassing (store-to-load forwarding) is enabled + # guarantees execution of the oldest pending memory operation (load or store) in the presence of false conflicts + # (which can happen with approximate address comparison) + fallbackIssue: bool = False def __init__(self, config: dict) -> None: self.name = config["name"] @@ -82,7 +85,10 @@ def __init__(self, config: dict) -> None: self.stResp = bool(config["stResp"]) self.gaMulti = bool(config["groupMulti"]) + + # TODO: set based on requested LSQ model self.bypass = True + self.fallbackIssue = False self.gaNumLoads = config["numLoads"] self.gaNumStores = config["numStores"] @@ -120,3 +126,10 @@ def __init__(self, config: dict) -> None: assert (len(self.gaLdOrder) == 
self.numGroups) assert (len(self.gaLdPortIdx) == self.numGroups) assert (len(self.gaStPortIdx) == self.numGroups) + + if self.fallbackIssue: + assert not self.bypass, "Fallback issue is not compatible with bypassing." + # TODO: To properly support multiple load channels, we need to ensure that the fallback load does not + # duplicate a load issued by another load channel in the same cycle. Multiple load channels are not + # currently used by Dynamatic, so this is left as future work. + assert self.numLdMem == 1, "Fallback issue is only supported for single load port configuration." diff --git a/tools/backend/lsq-generator-python/vhdl_gen/generators/lsq.py b/tools/backend/lsq-generator-python/vhdl_gen/generators/lsq.py index bec1671f20..f19501d282 100644 --- a/tools/backend/lsq-generator-python/vhdl_gen/generators/lsq.py +++ b/tools/backend/lsq-generator-python/vhdl_gen/generators/lsq.py @@ -587,15 +587,17 @@ def generate(self, lsq_submodules, path_rtl) -> None: store_req_valid = Logic(ctx, 'store_req_valid', 'w') # whether the current store request has conflicts with any previous loads store_conflict = Logic(ctx, 'store_conflict', 'w') - # whether the to-be-issued store entry is older than each of the load entries - store_is_older_arr = LogicArray(ctx, 'store_is_older_arr', 'w', self.configs.numLdqEntries) + if self.configs.fallbackIssue: + # whether the to-be-issued store entry is older than each of the load entries + store_is_older_arr = LogicArray(ctx, 'store_is_older_arr', 'w', self.configs.numLdqEntries) # store request enable (after fallback logic) store_en = Logic(ctx, 'store_en', 'w') # Fallback load/store signals - fallback_load_idx_oh = LogicVec(ctx, 'fallback_load_idx_oh', 'w', self.configs.numLdqEntries) - fallback_load_en = Logic(ctx, 'fallback_load_en', 'w') - fallback_store_en_if_valid = Logic(ctx, 'fallback_store_en_if_valid', 'w') + if self.configs.fallbackIssue: + fallback_load_idx_oh = LogicVec(ctx, 'fallback_load_idx_oh', 'w',
self.configs.numLdqEntries) + fallback_load_en = Logic(ctx, 'fallback_load_en', 'w') + fallback_store_en_if_valid = Logic(ctx, 'fallback_store_en_if_valid', 'w') # Matrix Generation ld_st_conflict = LogicVecArray( @@ -788,63 +790,64 @@ def generate(self, lsq_submodules, path_rtl) -> None: load_idx_oh_LogicArray[i], 'and', can_load_list[w][i]) for w in range(self.configs.numLdMem): - if w != self.configs.numLdMem - 1: + last_load_port = (w == self.configs.numLdMem - 1) + if self.configs.fallbackIssue and last_load_port: + # last load port: use fallback load (if any) as the first priority, then service other loads (from _tmp) + arch += Op(ctx, load_idx_oh[w], fallback_load_idx_oh, 'when', fallback_load_en, 'else', load_idx_tmp_oh[w]) + arch += Op(ctx, load_en[w], fallback_load_en, 'or', load_en_tmp[w]) + else: # non-last load port: use _tmp signals directly arch += Op(ctx, load_idx_oh[w], load_idx_tmp_oh[w]) arch += Op(ctx, load_en[w], load_en_tmp[w]) - else: - # last load port: use fallback load (if any) as the first priority, then service - # other loads (from _tmp) - arch += Op(ctx, load_idx_oh[w], fallback_load_idx_oh, 'when', fallback_load_en, 'else', load_idx_tmp_oh[w]) - arch += Op(ctx, load_en[w], fallback_load_en, 'or', load_en_tmp[w]) - # Fallback Load / Store + if self.configs.fallbackIssue: + # Fallback Load / Store - # The fallback load candidate is the oldest allocated and un-issued load. - ldq_alloc_no_issue = LogicArray(ctx, 'ldq_alloc_no_issue', 'w', self.configs.numLdqEntries) - fallback_load_candidate_oh = LogicArray(ctx, 'fallback_load_candidate_oh', 'w', self.configs.numLdqEntries) - for i in range(0, self.configs.numLdqEntries): - arch += Op(ctx, ldq_alloc_no_issue[i], ldq_alloc_p0[i], 'and', 'not', ldq_issue[i]) - arch += CyclicPriorityMasking(ctx, fallback_load_candidate_oh, ldq_alloc_no_issue, ldq_head_oh_p0) + # The fallback load candidate is the oldest allocated and un-issued load. 
+ ldq_alloc_no_issue = LogicArray(ctx, 'ldq_alloc_no_issue', 'w', self.configs.numLdqEntries) + fallback_load_candidate_oh = LogicArray(ctx, 'fallback_load_candidate_oh', 'w', self.configs.numLdqEntries) + for i in range(0, self.configs.numLdqEntries): + arch += Op(ctx, ldq_alloc_no_issue[i], ldq_alloc_p0[i], 'and', 'not', ldq_issue[i]) + arch += CyclicPriorityMasking(ctx, fallback_load_candidate_oh, ldq_alloc_no_issue, ldq_head_oh_p0) - # The fallback store canddiate is the oldest allocated and un-issued store. This is simply - # the store at the store queue issue pointer (if allocated). We do not explicitly track the - # fallback store candidate, but rather just keep the relevant row/column from the order - # matrix. + # The fallback store candidate is the oldest allocated and un-issued store. This is simply + # the store at the store queue issue pointer (if allocated). We do not explicitly track the + # fallback store candidate, but rather just keep the relevant row/column from the order + # matrix. - # If the oldest load is older than the oldest store, this contains a single bit set (at the oldest load entry). - # Otherwise (oldest store is oldest overall), this is all zeros. - fallback_load_is_oldest_oh = LogicArray(ctx, 'fallback_load_is_oldest_oh', 'w', self.configs.numLdqEntries) - for i in range(self.configs.numLdqEntries): - arch += Op(ctx, fallback_load_is_oldest_oh[i], fallback_load_candidate_oh[i], 'and', 'not', store_is_older_arr[i]) + # If the oldest load is older than the oldest store, this contains a single bit set (at the oldest load entry). + # Otherwise (oldest store is oldest overall), this is all zeros. + fallback_load_is_oldest_oh = LogicArray(ctx, 'fallback_load_is_oldest_oh', 'w', self.configs.numLdqEntries) + for i in range(self.configs.numLdqEntries): + arch += Op(ctx, fallback_load_is_oldest_oh[i], fallback_load_candidate_oh[i], 'and', 'not', store_is_older_arr[i]) - # Whether the fallback load is the oldest.
- fallback_load_is_oldest = Logic(ctx, 'fallback_load_is_oldest', 'w') - arch += Reduce(ctx, fallback_load_is_oldest, fallback_load_is_oldest_oh, 'or') + # Whether the fallback load is the oldest. + fallback_load_is_oldest = Logic(ctx, 'fallback_load_is_oldest', 'w') + arch += Reduce(ctx, fallback_load_is_oldest, fallback_load_is_oldest_oh, 'or') - # FIXME: What if we are about to issue a store? We shouldn't issue a fallback load in the same cycle. - store_outstanding = Logic(ctx, 'store_outstanding', 'w') - arch += Op(ctx, store_outstanding, "'1'", 'when', '(', stq_issue, '/=', stq_resp, ')', 'else ', "'0'") + # FIXME: What if we are about to issue a store? We shouldn't issue a fallback load in the same cycle. + store_outstanding = Logic(ctx, 'store_outstanding', 'w') + arch += Op(ctx, store_outstanding, "'1'", 'when', '(', stq_issue, '/=', stq_resp, ')', 'else ', "'0'") - # FIXME: What if we are about to issue a load? We shouldn't issue a fallback store in the same cycle. - load_oustanding_arr = LogicArray(ctx, 'load_outstanding_arr', 'w', self.configs.numLdqEntries) - load_outstanding = Logic(ctx, 'load_outstanding', 'w') - for i in range(self.configs.numLdqEntries): - arch += Op(ctx, load_oustanding_arr[i], "'1'", 'when', '(', ldq_issue[i], 'and', 'not', ldq_data_valid[i], ')', 'else', "'0'") - arch += Reduce(ctx, load_outstanding, load_oustanding_arr, 'or') + # FIXME: What if we are about to issue a load? We shouldn't issue a fallback store in the same cycle. 
+ load_oustanding_arr = LogicArray(ctx, 'load_outstanding_arr', 'w', self.configs.numLdqEntries) + load_outstanding = Logic(ctx, 'load_outstanding', 'w') + for i in range(self.configs.numLdqEntries): + arch += Op(ctx, load_oustanding_arr[i], "'1'", 'when', '(', ldq_issue[i], 'and', 'not', ldq_data_valid[i], ')', 'else', "'0'") + arch += Reduce(ctx, load_outstanding, load_oustanding_arr, 'or') - # We can issue the fallback load candidate if: - # - It is older than the oldest store (implicit in fallback_load_is_oldest_oh[]). - # - It has a valid address. - # - There are no outstanding stores. - for i in range(self.configs.numLdqEntries): - arch += Op(ctx, (fallback_load_idx_oh, i), fallback_load_is_oldest_oh[i], 'and', ldq_addr_valid_p0[i], 'and', 'not', store_outstanding) - arch += Reduce(ctx, fallback_load_en, fallback_load_idx_oh, 'or') + # We can issue the fallback load candidate if: + # - It is older than the oldest store (implicit in fallback_load_is_oldest_oh[]). + # - It has a valid address. + # - There are no outstanding stores. + for i in range(self.configs.numLdqEntries): + arch += Op(ctx, (fallback_load_idx_oh, i), fallback_load_is_oldest_oh[i], 'and', ldq_addr_valid_p0[i], 'and', 'not', store_outstanding) + arch += Reduce(ctx, fallback_load_en, fallback_load_idx_oh, 'or') - # We can issue the fallback store candidate (if it is valid) if: - # - It is older than the oldest store (NOT fallback_load_is_oldest). - # - There are no outstanding loads. - arch += Op(ctx, fallback_store_en_if_valid, 'not', fallback_load_is_oldest, 'and', 'not', load_outstanding) + # We can issue the fallback store candidate (if it is valid) if: + # - It is older than the oldest load (NOT fallback_load_is_oldest). + # - There are no outstanding loads. + arch += Op(ctx, fallback_store_en_if_valid, 'not', fallback_load_is_oldest, 'and', 'not', load_outstanding) # Store # When pipelining (pipe0) is enabled, this uses look-ahead to the next store entry to reduce the critical path.
@@ -859,12 +862,14 @@ def generate(self, lsq_submodules, path_rtl) -> None: store_conflict = Logic(ctx, 'store_conflict', 'w') store_req_valid_p0 = Logic(ctx, 'store_req_valid_p0', pipe0_type) - store_is_older_arr_p0 = LogicArray(ctx, 'store_is_older_arr_p0', pipe0_type, self.configs.numLdqEntries) st_ld_conflict_p0 = LogicVec(ctx, 'st_ld_conflict_p0', pipe0_type, self.configs.numLdqEntries) if self.configs.pipe0: store_req_valid_p0.regInit(init=0) - store_is_older_arr_p0.regInit() st_ld_conflict_p0.regInit() + if self.configs.fallbackIssue: + store_is_older_arr_p0 = LogicArray(ctx, 'store_is_older_arr_p0', pipe0_type, self.configs.numLdqEntries) + if self.configs.pipe0: + store_is_older_arr_p0.regInit() # next issue pointer (needed for look-ahead when pipelining is enabled) if self.configs.pipe0: @@ -929,25 +934,29 @@ def generate(self, lsq_submodules, path_rtl) -> None: 'when', stq_issue_en, 'else', st_ld_conflict_curr) arch += Op(ctx, store_req_valid_p0, store_req_valid_next, 'when', stq_issue_en, 'else', store_req_valid_curr) - for i in range(self.configs.numLdqEntries): - arch += Op(ctx, store_is_older_arr_p0[i], store_is_older_arr_next[i], 'when', - stq_issue_en, 'else', store_is_older_arr_curr[i]) + if self.configs.fallbackIssue: + for i in range(self.configs.numLdqEntries): + arch += Op(ctx, store_is_older_arr_p0[i], store_is_older_arr_next[i], 'when', + stq_issue_en, 'else', store_is_older_arr_curr[i]) else: # without pipelining: only consider current store entry arch += Op(ctx, st_ld_conflict_p0, st_ld_conflict_curr) arch += Op(ctx, store_req_valid_p0, store_req_valid_curr) - for i in range(self.configs.numLdqEntries): - arch += Op(ctx, store_is_older_arr_p0[i], store_is_older_arr_curr[i]) + if self.configs.fallbackIssue: + for i in range(self.configs.numLdqEntries): + arch += Op(ctx, store_is_older_arr_p0[i], store_is_older_arr_curr[i]) # The store conflicts with any load arch += Reduce(ctx, store_conflict, st_ld_conflict_p0, 'or') - # The store can be 
issued when it is valid AND (no conflict OR it is older than the fallback load). - arch += Op(ctx, store_en, store_req_valid_p0, 'and', '(', 'not', store_conflict, 'or', fallback_store_en_if_valid, ')') arch += Op(ctx, store_idx, stq_issue) - # needed for fallback logic - # FIXME: conditionally enable based on fallback issue flag - for i in range(self.configs.numLdqEntries): - arch += Op(ctx, store_is_older_arr[i], store_is_older_arr_p0[i]) + # The store can be issued when it is valid AND (no conflict OR it is older than the fallback load). + if self.configs.fallbackIssue: + arch += Op(ctx, store_en, store_req_valid_p0, 'and', '(', 'not', store_conflict, 'or', fallback_store_en_if_valid, ')') + # ordering information needed by fallback issue logic + for i in range(self.configs.numLdqEntries): + arch += Op(ctx, store_is_older_arr[i], store_is_older_arr_p0[i]) + else: + arch += Op(ctx, store_en, store_req_valid_p0, 'and', 'not', store_conflict) # Bypass bypass_idx_oh_p0 = LogicVecArray( From 539041b80a168109b66ccbc1f4af304f09f2f075 Mon Sep 17 00:00:00 2001 From: Max Wipfli Date: Tue, 3 Mar 2026 13:42:15 +0100 Subject: [PATCH 06/11] [LSQ] Fix typo in signal name --- .../backend/lsq-generator-python/vhdl_gen/generators/lsq.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tools/backend/lsq-generator-python/vhdl_gen/generators/lsq.py b/tools/backend/lsq-generator-python/vhdl_gen/generators/lsq.py index f19501d282..488d0b257b 100644 --- a/tools/backend/lsq-generator-python/vhdl_gen/generators/lsq.py +++ b/tools/backend/lsq-generator-python/vhdl_gen/generators/lsq.py @@ -830,11 +830,11 @@ def generate(self, lsq_submodules, path_rtl) -> None: arch += Op(ctx, store_outstanding, "'1'", 'when', '(', stq_issue, '/=', stq_resp, ')', 'else ', "'0'") # FIXME: What if we are about to issue a load? We shouldn't issue a fallback store in the same cycle. 
- load_oustanding_arr = LogicArray(ctx, 'load_outstanding_arr', 'w', self.configs.numLdqEntries) + load_outstanding_arr = LogicArray(ctx, 'load_outstanding_arr', 'w', self.configs.numLdqEntries) load_outstanding = Logic(ctx, 'load_outstanding', 'w') for i in range(self.configs.numLdqEntries): - arch += Op(ctx, load_oustanding_arr[i], "'1'", 'when', '(', ldq_issue[i], 'and', 'not', ldq_data_valid[i], ')', 'else', "'0'") - arch += Reduce(ctx, load_outstanding, load_oustanding_arr, 'or') + arch += Op(ctx, load_outstanding_arr[i], "'1'", 'when', '(', ldq_issue[i], 'and', 'not', ldq_data_valid[i], ')', 'else', "'0'") + arch += Reduce(ctx, load_outstanding, load_outstanding_arr, 'or') # We can issue the fallback load candidate if: # - It is older than the oldest store (implicit in fallback_load_is_oldest_oh[]). From 10ca1f89aac0e17f24cf4e5874f15f3d7accf95b Mon Sep 17 00:00:00 2001 From: Max Wipfli Date: Wed, 4 Mar 2026 09:27:21 +0100 Subject: [PATCH 07/11] [LSQ] Add comment to explain concurrent issue with fallback --- .../lsq-generator-python/vhdl_gen/generators/lsq.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/tools/backend/lsq-generator-python/vhdl_gen/generators/lsq.py b/tools/backend/lsq-generator-python/vhdl_gen/generators/lsq.py index 488d0b257b..285afa91f2 100644 --- a/tools/backend/lsq-generator-python/vhdl_gen/generators/lsq.py +++ b/tools/backend/lsq-generator-python/vhdl_gen/generators/lsq.py @@ -825,11 +825,17 @@ def generate(self, lsq_submodules, path_rtl) -> None: fallback_load_is_oldest = Logic(ctx, 'fallback_load_is_oldest', 'w') arch += Reduce(ctx, fallback_load_is_oldest, fallback_load_is_oldest_oh, 'or') - # FIXME: What if we are about to issue a store? We shouldn't issue a fallback load in the same cycle. + # NOTE: For both the outstanding loads and stores, we only need to consider loads/store which were issued + # previously. 
If a load (store) is issued through the regular path in the same cycle as the fallback store + # (load), it cannot conflict with the fallback store (load). This is because: + # 1. The regular load (store) must be younger than the fallback store (load) by construction. + # 2. The regular load (store) has been dependency-checked against the fallback store (load) before being + # issued. + # 3. Thus, the fallback store (load) and the regular load (store) must have different addresses. + store_outstanding = Logic(ctx, 'store_outstanding', 'w') arch += Op(ctx, store_outstanding, "'1'", 'when', '(', stq_issue, '/=', stq_resp, ')', 'else ', "'0'") - # FIXME: What if we are about to issue a load? We shouldn't issue a fallback store in the same cycle. load_outstanding_arr = LogicArray(ctx, 'load_outstanding_arr', 'w', self.configs.numLdqEntries) load_outstanding = Logic(ctx, 'load_outstanding', 'w') for i in range(self.configs.numLdqEntries): From c42f38b3d142744eb84ed0c21ebc647063eb914d Mon Sep 17 00:00:00 2001 From: Max Wipfli Date: Fri, 6 Mar 2026 17:12:21 +0100 Subject: [PATCH 08/11] [LSQ] Separate load and store paths of fallback issue --- .../lsq-generator-python/vhdl_gen/configs.py | 10 ++- .../vhdl_gen/generators/lsq.py | 64 ++++++++++--------- 2 files changed, 42 insertions(+), 32 deletions(-) diff --git a/tools/backend/lsq-generator-python/vhdl_gen/configs.py b/tools/backend/lsq-generator-python/vhdl_gen/configs.py index fc873d4c29..4d520defa1 100644 --- a/tools/backend/lsq-generator-python/vhdl_gen/configs.py +++ b/tools/backend/lsq-generator-python/vhdl_gen/configs.py @@ -65,9 +65,11 @@ class Configs: stResp: bool = False # Whether store response channel in store access port is enabled gaMulti: bool = False # Whether multiple groups are allowed to request an allocation at the same cycle bypass: bool = True # Whether bypassing (store-to-load forwarding) is enabled + # guarantees execution of the oldest pending memory operation (load or store) in the presence of 
false conflicts # (which can happen with approximate address comparison) - fallbackIssue: bool = False + fallbackIssueLoad: bool = False + fallbackIssueStore: bool = False def __init__(self, config: dict) -> None: self.name = config["name"] @@ -88,7 +90,8 @@ def __init__(self, config: dict) -> None: # TODO: set based on requested LSQ model self.bypass = True - self.fallbackIssue = False + self.fallbackIssueLoad = False + self.fallbackIssueStore = False self.gaNumLoads = config["numLoads"] self.gaNumStores = config["numStores"] @@ -127,8 +130,9 @@ def __init__(self, config: dict) -> None: assert (len(self.gaLdPortIdx) == self.numGroups) assert (len(self.gaStPortIdx) == self.numGroups) - if self.fallbackIssue: + if self.fallbackIssueLoad or self.fallbackIssueStore: assert not self.bypass, "Fallback issue is not compatible with bypassing." + if self.fallbackIssueLoad: # TODO: To properly support multiple load channels, we need to ensure that the fallback load is not # duplicated a load # issued by another load channel in the same cycle. Multiple load channels are not # currently used by Dynamatic, so this is left as future work. 
diff --git a/tools/backend/lsq-generator-python/vhdl_gen/generators/lsq.py b/tools/backend/lsq-generator-python/vhdl_gen/generators/lsq.py index 285afa91f2..a52a2d3409 100644 --- a/tools/backend/lsq-generator-python/vhdl_gen/generators/lsq.py +++ b/tools/backend/lsq-generator-python/vhdl_gen/generators/lsq.py @@ -587,16 +587,17 @@ def generate(self, lsq_submodules, path_rtl) -> None: store_req_valid = Logic(ctx, 'store_req_valid', 'w') # whether the current store request has conflicts with any previous loads store_conflict = Logic(ctx, 'store_conflict', 'w') - if self.configs.fallbackIssue: + if self.configs.fallbackIssueLoad or self.configs.fallbackIssueStore: # whether the to-be-issued store entry is older than each of the load entries store_is_older_arr = LogicArray(ctx, 'store_is_older_arr', 'w', self.configs.numLdqEntries) # store request enable (after fallback logic) store_en = Logic(ctx, 'store_en', 'w') # Fallback load/store signals - if self.configs.fallbackIssue: + if self.configs.fallbackIssueLoad: fallback_load_idx_oh = LogicVec(ctx, 'fallback_load_idx_oh', 'w', self.configs.numLdqEntries) fallback_load_en = Logic(ctx, 'fallback_load_en', 'w') + if self.configs.fallbackIssueStore: fallback_store_en_if_valid = Logic(ctx, 'fallback_store_en_if_valid', 'w') # Matrix Generation @@ -791,7 +792,7 @@ def generate(self, lsq_submodules, path_rtl) -> None: for w in range(self.configs.numLdMem): last_load_port = (w == self.configs.numLdMem - 1) - if self.configs.fallbackIssue and last_load_port: + if self.configs.fallbackIssueLoad and last_load_port: # last load port: use fallback load (if any) as the first priority, then service other loads (from _tmp) arch += Op(ctx, load_idx_oh[w], fallback_load_idx_oh, 'when', fallback_load_en, 'else', load_idx_tmp_oh[w]) arch += Op(ctx, load_en[w], fallback_load_en, 'or', load_en_tmp[w]) @@ -800,7 +801,7 @@ def generate(self, lsq_submodules, path_rtl) -> None: arch += Op(ctx, load_idx_oh[w], load_idx_tmp_oh[w]) arch += 
Op(ctx, load_en[w], load_en_tmp[w]) - if self.configs.fallbackIssue: + if self.configs.fallbackIssueLoad or self.configs.fallbackIssueStore: # Fallback Load / Store # The fallback load candidate is the oldest allocated and un-issued load. @@ -833,27 +834,30 @@ def generate(self, lsq_submodules, path_rtl) -> None: # issued. # 3. Thus, the fallback store (load) and the regular load (store) must have different addresses. - store_outstanding = Logic(ctx, 'store_outstanding', 'w') - arch += Op(ctx, store_outstanding, "'1'", 'when', '(', stq_issue, '/=', stq_resp, ')', 'else ', "'0'") + if self.configs.fallbackIssueLoad: + store_outstanding = Logic(ctx, 'store_outstanding', 'w') + arch += Op(ctx, store_outstanding, "'1'", 'when', '(', stq_issue, '/=', stq_resp, ')', 'else ', "'0'") - load_outstanding_arr = LogicArray(ctx, 'load_outstanding_arr', 'w', self.configs.numLdqEntries) - load_outstanding = Logic(ctx, 'load_outstanding', 'w') - for i in range(self.configs.numLdqEntries): - arch += Op(ctx, load_outstanding_arr[i], "'1'", 'when', '(', ldq_issue[i], 'and', 'not', ldq_data_valid[i], ')', 'else', "'0'") - arch += Reduce(ctx, load_outstanding, load_outstanding_arr, 'or') + # We can issue the fallback load candidate if: + # - It is older than the oldest store (implicit in fallback_load_is_oldest_oh[]). + # - It has a valid address. + # - There are no outstanding stores. + for i in range(self.configs.numLdqEntries): + arch += Op(ctx, (fallback_load_idx_oh, i), fallback_load_is_oldest_oh[i], 'and', ldq_addr_valid_p0[i], 'and', 'not', store_outstanding) + arch += Reduce(ctx, fallback_load_en, fallback_load_idx_oh, 'or') - # We can issue the fallback load candidate if: - # - It is older than the oldest store (implicit in fallback_load_is_oldest_oh[]). - # - It has a valid address. - # - There are no outstanding stores. 
- for i in range(self.configs.numLdqEntries): - arch += Op(ctx, (fallback_load_idx_oh, i), fallback_load_is_oldest_oh[i], 'and', ldq_addr_valid_p0[i], 'and', 'not', store_outstanding) - arch += Reduce(ctx, fallback_load_en, fallback_load_idx_oh, 'or') + if self.configs.fallbackIssueStore: + # FIXME: This can reuse code from the dependency-check refactor which is part of another PR. + load_outstanding_arr = LogicArray(ctx, 'load_outstanding_arr', 'w', self.configs.numLdqEntries) + load_outstanding = Logic(ctx, 'load_outstanding', 'w') + for i in range(self.configs.numLdqEntries): + arch += Op(ctx, load_outstanding_arr[i], "'1'", 'when', '(', ldq_issue[i], 'and', 'not', ldq_data_valid[i], ')', 'else', "'0'") + arch += Reduce(ctx, load_outstanding, load_outstanding_arr, 'or') - # We can issue the fallback store candidate (if it is valid) if: - # - It is older than the oldest store (NOT fallback_load_is_oldest). - # - There are no outstanding loads. - arch += Op(ctx, fallback_store_en_if_valid, 'not', fallback_load_is_oldest, 'and', 'not', load_outstanding) + # We can issue the fallback store candidate (if it is valid) if: + # - It is older than the oldest store (NOT fallback_load_is_oldest). + # - There are no outstanding loads. + arch += Op(ctx, fallback_store_en_if_valid, 'not', fallback_load_is_oldest, 'and', 'not', load_outstanding) # Store # When pipelining (pipe0) is enabled, this uses look-ahead to the next store entry to reduce the critical path. 
@@ -872,7 +876,7 @@ def generate(self, lsq_submodules, path_rtl) -> None: if self.configs.pipe0: store_req_valid_p0.regInit(init=0) st_ld_conflict_p0.regInit() - if self.configs.fallbackIssue: + if self.configs.fallbackIssueLoad or self.configs.fallbackIssueStore: store_is_older_arr_p0 = LogicArray(ctx, 'store_is_older_arr_p0', pipe0_type, self.configs.numLdqEntries) if self.configs.pipe0: store_is_older_arr_p0.regInit() @@ -940,7 +944,7 @@ def generate(self, lsq_submodules, path_rtl) -> None: 'when', stq_issue_en, 'else', st_ld_conflict_curr) arch += Op(ctx, store_req_valid_p0, store_req_valid_next, 'when', stq_issue_en, 'else', store_req_valid_curr) - if self.configs.fallbackIssue: + if self.configs.fallbackIssueLoad or self.configs.fallbackIssueStore: for i in range(self.configs.numLdqEntries): arch += Op(ctx, store_is_older_arr_p0[i], store_is_older_arr_next[i], 'when', stq_issue_en, 'else', store_is_older_arr_curr[i]) @@ -948,7 +952,7 @@ def generate(self, lsq_submodules, path_rtl) -> None: # without pipelining: only consider current store entry arch += Op(ctx, st_ld_conflict_p0, st_ld_conflict_curr) arch += Op(ctx, store_req_valid_p0, store_req_valid_curr) - if self.configs.fallbackIssue: + if self.configs.fallbackIssueLoad or self.configs.fallbackIssueStore: for i in range(self.configs.numLdqEntries): arch += Op(ctx, store_is_older_arr_p0[i], store_is_older_arr_curr[i]) @@ -956,14 +960,16 @@ def generate(self, lsq_submodules, path_rtl) -> None: arch += Reduce(ctx, store_conflict, st_ld_conflict_p0, 'or') arch += Op(ctx, store_idx, stq_issue) # The store can be issued when it is valid AND (no conflict OR it is older than the fallback load). 
- if self.configs.fallbackIssue: + if self.configs.fallbackIssueStore: arch += Op(ctx, store_en, store_req_valid_p0, 'and', '(', 'not', store_conflict, 'or', fallback_store_en_if_valid, ')') - # ordering information needed by fallback issue logic - for i in range(self.configs.numLdqEntries): - arch += Op(ctx, store_is_older_arr[i], store_is_older_arr_p0[i]) else: arch += Op(ctx, store_en, store_req_valid_p0, 'and', 'not', store_conflict) + # ordering information needed by fallback issue logic + if self.configs.fallbackIssueLoad or self.configs.fallbackIssueStore: + for i in range(self.configs.numLdqEntries): + arch += Op(ctx, store_is_older_arr[i], store_is_older_arr_p0[i]) + # Bypass bypass_idx_oh_p0 = LogicVecArray( ctx, 'bypass_idx_oh_p0', pipe0_type, self.configs.numLdqEntries, self.configs.numStqEntries) From 5f4da8783bbea25958fc824bbf41ace13072a468 Mon Sep 17 00:00:00 2001 From: Max Wipfli Date: Fri, 6 Mar 2026 17:35:49 +0100 Subject: [PATCH 09/11] [integration-test] Fix formatting --- integration-test/test_memory_deps/test_memory_deps.c | 6 ++++-- integration-test/test_memory_deps/test_memory_deps.h | 3 ++- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/integration-test/test_memory_deps/test_memory_deps.c b/integration-test/test_memory_deps/test_memory_deps.c index 2b2a64118e..1935e6bf9c 100644 --- a/integration-test/test_memory_deps/test_memory_deps.c +++ b/integration-test/test_memory_deps/test_memory_deps.c @@ -4,7 +4,8 @@ #include "dynamatic/Integration.h" #include -void test_memory_deps(in_int_t load_addrs[1000], in_int_t store_addrs[1000], inout_int_t data[1000], in_int_t n) { +void test_memory_deps(in_int_t load_addrs[1000], in_int_t store_addrs[1000], + inout_int_t data[1000], in_int_t n) { int sum = 0; for (int i = 0; i < n; ++i) { sum += data[load_addrs[i]]; @@ -20,7 +21,8 @@ int main(void) { in_int_t n = 1000; for (int i = 0; i < n; ++i) { - // addresses alternate randomly between 1 and 2, creating RAW and WAR hazards + // addresses 
alternate randomly between 1 and 2, creating RAW and WAR + // hazards load_addrs[i] = (rand() % 4) + 1; load_addrs[i] = (rand() % 4) + 1; store_addrs[i] = (i == 0) ? 1 : 2; diff --git a/integration-test/test_memory_deps/test_memory_deps.h b/integration-test/test_memory_deps/test_memory_deps.h index 74a5719499..6bc07a1cdd 100644 --- a/integration-test/test_memory_deps/test_memory_deps.h +++ b/integration-test/test_memory_deps/test_memory_deps.h @@ -4,6 +4,7 @@ typedef int in_int_t; typedef int inout_int_t; -void test_memory_deps(in_int_t load_addrs[1000], in_int_t store_addrs[1000], inout_int_t data[1000], in_int_t n); +void test_memory_deps(in_int_t load_addrs[1000], in_int_t store_addrs[1000], + inout_int_t data[1000], in_int_t n); #endif // TEST_MEMORY_DEPS_TEST_MEMORY_DEPS_H From 4c3b8dc4fb65e6e0e253740d8c571253d9e0c669 Mon Sep 17 00:00:00 2001 From: Max Wipfli Date: Tue, 10 Mar 2026 10:02:51 +0100 Subject: [PATCH 10/11] [LSQ] Address review comments --- .../lsq-generator-python/vhdl_gen/configs.py | 4 +-- .../vhdl_gen/generators/lsq.py | 29 +++++++++++-------- 2 files changed, 19 insertions(+), 14 deletions(-) diff --git a/tools/backend/lsq-generator-python/vhdl_gen/configs.py b/tools/backend/lsq-generator-python/vhdl_gen/configs.py index 4d520defa1..932dfb81f7 100644 --- a/tools/backend/lsq-generator-python/vhdl_gen/configs.py +++ b/tools/backend/lsq-generator-python/vhdl_gen/configs.py @@ -134,6 +134,6 @@ def __init__(self, config: dict) -> None: assert not self.bypass, "Fallback issue is not compatible with bypassing." if self.fallbackIssueLoad: # TODO: To properly support multiple load channels, we need to ensure that the fallback load is not - # duplicated a load # issued by another load channel in the same cycle. Multiple load channels are not + # duplicated a load issued by another load channel in the same cycle. Multiple load channels are not # currently used by Dynamatic, so this is left as future work. 
- assert self.numLdMem == 1, "Fallback issue is only supported for single load port configuration." + assert self.numLdMem == 1, "Fallback issue is only supported for single load channel configuration." diff --git a/tools/backend/lsq-generator-python/vhdl_gen/generators/lsq.py b/tools/backend/lsq-generator-python/vhdl_gen/generators/lsq.py index a52a2d3409..80590e0e35 100644 --- a/tools/backend/lsq-generator-python/vhdl_gen/generators/lsq.py +++ b/tools/backend/lsq-generator-python/vhdl_gen/generators/lsq.py @@ -589,6 +589,7 @@ def generate(self, lsq_submodules, path_rtl) -> None: store_conflict = Logic(ctx, 'store_conflict', 'w') if self.configs.fallbackIssueLoad or self.configs.fallbackIssueStore: # whether the to-be-issued store entry is older than each of the load entries + # only needed for fallback logic, as regular store dependency checking uses "internal" signals instead store_is_older_arr = LogicArray(ctx, 'store_is_older_arr', 'w', self.configs.numLdqEntries) # store request enable (after fallback logic) store_en = Logic(ctx, 'store_en', 'w') @@ -791,13 +792,18 @@ def generate(self, lsq_submodules, path_rtl) -> None: load_idx_oh_LogicArray[i], 'and', can_load_list[w][i]) for w in range(self.configs.numLdMem): - last_load_port = (w == self.configs.numLdMem - 1) - if self.configs.fallbackIssueLoad and last_load_port: - # last load port: use fallback load (if any) as the first priority, then service other loads (from _tmp) + # We use the last load memory channel for potential fallback loads for two reasons: + # - It is the last channel to be utilized for regular loads, thus it is least likely we need to preempt a + # regular load for a fallback load. + # - If all load channels could be occupied by regular loads, we are preempting the youngest load. This + # probably has the least performance impact. 
+ last_load_channel = (w == self.configs.numLdMem - 1) + if self.configs.fallbackIssueLoad and last_load_channel: + # last channel: use fallback load (if any) as the first priority, then service other loads (from _tmp) arch += Op(ctx, load_idx_oh[w], fallback_load_idx_oh, 'when', fallback_load_en, 'else', load_idx_tmp_oh[w]) arch += Op(ctx, load_en[w], fallback_load_en, 'or', load_en_tmp[w]) else: - # non-last load port: use _tmp signals directly + # non-last load channel: use _tmp signals directly arch += Op(ctx, load_idx_oh[w], load_idx_tmp_oh[w]) arch += Op(ctx, load_en[w], load_en_tmp[w]) @@ -811,10 +817,9 @@ def generate(self, lsq_submodules, path_rtl) -> None: arch += Op(ctx, ldq_alloc_no_issue[i], ldq_alloc_p0[i], 'and', 'not', ldq_issue[i]) arch += CyclicPriorityMasking(ctx, fallback_load_candidate_oh, ldq_alloc_no_issue, ldq_head_oh_p0) - # The fallback store canddiate is the oldest allocated and un-issued store. This is simply - # the store at the store queue issue pointer (if allocated). We do not explicitly track the - # fallback store candidate, but rather just keep the relevant row/column from the order - # matrix. + # The fallback store candidate is the oldest allocated and un-issued store. This is simply the store at the + # store queue issue pointer (if allocated). We do not explicitly track the fallback store candidate, but + # rather just keep the relevant row/column from the order matrix. # If the oldest load is older than the oldest store, this contains a single bit set (at the oldest load entry). # Otherwise (oldest store is oldest overall), this is all zeros. @@ -822,10 +827,6 @@ def generate(self, lsq_submodules, path_rtl) -> None: for i in range(self.configs.numLdqEntries): arch += Op(ctx, fallback_load_is_oldest_oh[i], fallback_load_candidate_oh[i], 'and', 'not', store_is_older_arr[i]) - # Whether the fallback load is the oldest. 
- fallback_load_is_oldest = Logic(ctx, 'fallback_load_is_oldest', 'w') - arch += Reduce(ctx, fallback_load_is_oldest, fallback_load_is_oldest_oh, 'or') - # NOTE: For both the outstanding loads and stores, we only need to consider loads/store which were issued # previously. If a load (store) is issued through the regular path in the same cycle as the fallback store # (load), it cannot conflict with the fallback store (load). This is because: @@ -847,6 +848,10 @@ def generate(self, lsq_submodules, path_rtl) -> None: arch += Reduce(ctx, fallback_load_en, fallback_load_idx_oh, 'or') if self.configs.fallbackIssueStore: + # whether the fallback load is the oldest + fallback_load_is_oldest = Logic(ctx, 'fallback_load_is_oldest', 'w') + arch += Reduce(ctx, fallback_load_is_oldest, fallback_load_is_oldest_oh, 'or') + # FIXME: This can reuse code from the dependency-check refactor which is part of another PR. load_outstanding_arr = LogicArray(ctx, 'load_outstanding_arr', 'w', self.configs.numLdqEntries) load_outstanding = Logic(ctx, 'load_outstanding', 'w') From b7e56173b815d91ff33de3d10d8bde52329d85fc Mon Sep 17 00:00:00 2001 From: Max Wipfli Date: Tue, 10 Mar 2026 16:39:46 +0100 Subject: [PATCH 11/11] [LSQ] Fix merge conflict --- tools/backend/lsq-generator-python/vhdl_gen/generators/lsq.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tools/backend/lsq-generator-python/vhdl_gen/generators/lsq.py b/tools/backend/lsq-generator-python/vhdl_gen/generators/lsq.py index 80590e0e35..9b8b855955 100644 --- a/tools/backend/lsq-generator-python/vhdl_gen/generators/lsq.py +++ b/tools/backend/lsq-generator-python/vhdl_gen/generators/lsq.py @@ -875,7 +875,6 @@ def generate(self, lsq_submodules, path_rtl) -> None: for i in range(self.configs.numStqEntries): arch += Op(ctx, store_req_valid_arr[i], stq_alloc_pcomp[i], 'and', stq_addr_valid_pcomp[i], 'and', stq_data_valid_pcomp[i]) - store_conflict = Logic(ctx, 'store_conflict', 'w') store_req_valid_p0 = Logic(ctx, 'store_req_valid_p0', 
pipe0_type) st_ld_conflict_p0 = LogicVec(ctx, 'st_ld_conflict_p0', pipe0_type, self.configs.numLdqEntries) if self.configs.pipe0: