diff --git a/integration-test/test_memory_deps/test_memory_deps.c b/integration-test/test_memory_deps/test_memory_deps.c new file mode 100644 index 0000000000..1935e6bf9c --- /dev/null +++ b/integration-test/test_memory_deps/test_memory_deps.c @@ -0,0 +1,36 @@ +//===- test_memory_deps.c -----------------------------------------*- C -*-===// + +#include "test_memory_deps.h" +#include "dynamatic/Integration.h" +#include <stdlib.h> + +/// Kernel under test: for each i in [0, n), adds data[load_addrs[i]] to a running +/// sum and then stores i to data[store_addrs[i]]; the sum is finally written to +/// data[0]. Load and store addresses may alias across iterations. +void test_memory_deps(in_int_t load_addrs[1000], in_int_t store_addrs[1000], + inout_int_t data[1000], in_int_t n) { + int sum = 0; + for (int i = 0; i < n; ++i) { + sum += data[load_addrs[i]]; + data[store_addrs[i]] = i; + } + data[0] = sum; +} + +int main(void) { + in_int_t load_addrs[1000]; + in_int_t store_addrs[1000]; + inout_int_t data[1000]; + + in_int_t n = 1000; + for (int i = 0; i < n; ++i) { + // load addresses are random in [1, 4]; the store address is 1 for i == 0 and + // 2 afterwards, so loads and stores alias, creating RAW and WAR hazards + load_addrs[i] = (rand() % 4) + 1; + store_addrs[i] = (i == 0) ? 
1 : 2; + data[i] = i; + } + + CALL_KERNEL(test_memory_deps, load_addrs, store_addrs, data, n); + return 0; +} diff --git a/integration-test/test_memory_deps/test_memory_deps.h b/integration-test/test_memory_deps/test_memory_deps.h new file mode 100644 index 0000000000..6bc07a1cdd --- /dev/null +++ b/integration-test/test_memory_deps/test_memory_deps.h @@ -0,0 +1,10 @@ +#ifndef TEST_MEMORY_DEPS_TEST_MEMORY_DEPS_H +#define TEST_MEMORY_DEPS_TEST_MEMORY_DEPS_H + +typedef int in_int_t; +typedef int inout_int_t; + +void test_memory_deps(in_int_t load_addrs[1000], in_int_t store_addrs[1000], + inout_int_t data[1000], in_int_t n); + +#endif // TEST_MEMORY_DEPS_TEST_MEMORY_DEPS_H diff --git a/tools/backend/lsq-generator-python/vhdl_gen/configs.py b/tools/backend/lsq-generator-python/vhdl_gen/configs.py index a83ae2de0d..932dfb81f7 100644 --- a/tools/backend/lsq-generator-python/vhdl_gen/configs.py +++ b/tools/backend/lsq-generator-python/vhdl_gen/configs.py @@ -66,6 +66,11 @@ class Configs: gaMulti: bool = False # Whether multiple groups are allowed to request an allocation at the same cycle bypass: bool = True # Whether bypassing (store-to-load forwarding) is enabled + # guarantees execution of the oldest pending memory operation (load or store) in the presence of false conflicts + # (which can happen with approximate address comparison) + fallbackIssueLoad: bool = False + fallbackIssueStore: bool = False + def __init__(self, config: dict) -> None: self.name = config["name"] self.dataW = config["dataWidth"] @@ -82,7 +87,11 @@ def __init__(self, config: dict) -> None: self.stResp = bool(config["stResp"]) self.gaMulti = bool(config["groupMulti"]) + + # TODO: set based on requested LSQ model self.bypass = True + self.fallbackIssueLoad = False + self.fallbackIssueStore = False self.gaNumLoads = config["numLoads"] self.gaNumStores = config["numStores"] @@ -120,3 +129,11 @@ def __init__(self, config: dict) -> None: assert (len(self.gaLdOrder) == self.numGroups) assert 
(len(self.gaLdPortIdx) == self.numGroups) assert (len(self.gaStPortIdx) == self.numGroups) + + if self.fallbackIssueLoad or self.fallbackIssueStore: + assert not self.bypass, "Fallback issue is not compatible with bypassing." + if self.fallbackIssueLoad: + # TODO: To properly support multiple load channels, we need to ensure that the fallback load does not + # duplicate a load issued by another load channel in the same cycle. Multiple load channels are not + # currently used by Dynamatic, so this is left as future work. + assert self.numLdMem == 1, "Fallback issue is only supported for single load channel configuration." diff --git a/tools/backend/lsq-generator-python/vhdl_gen/generators/lsq.py b/tools/backend/lsq-generator-python/vhdl_gen/generators/lsq.py index a1f5eddd94..9b8b855955 100644 --- a/tools/backend/lsq-generator-python/vhdl_gen/generators/lsq.py +++ b/tools/backend/lsq-generator-python/vhdl_gen/generators/lsq.py @@ -482,7 +482,7 @@ def generate(self, lsq_submodules, path_rtl) -> None: # Load Queue Entries ldq_alloc.regInit(init=[0]*self.configs.numLdqEntries) - ldq_issue.regInit() + ldq_issue.regInit(init=[0]*self.configs.numLdqEntries) if (self.configs.ldpAddrW > 0): ldq_port_idx.regInit(ldq_wen) ldq_addr_valid.regInit() @@ -581,9 +581,26 @@ def generate(self, lsq_submodules, path_rtl) -> None: # Multiple store channels not yet implemented assert (self.configs.numStMem == 1) + # current store request index store_idx = LogicVec(ctx, 'store_idx', 'w', self.configs.stqAddrW) + # whether the current store request is valid, including address and data + store_req_valid = Logic(ctx, 'store_req_valid', 'w') + # whether the current store request has conflicts with any previous loads + store_conflict = Logic(ctx, 'store_conflict', 'w') + if self.configs.fallbackIssueLoad or self.configs.fallbackIssueStore: + # whether the to-be-issued store entry is older than each of the load entries + # only needed for fallback logic, as regular store dependency checking uses 
"internal" signals instead + store_is_older_arr = LogicArray(ctx, 'store_is_older_arr', 'w', self.configs.numLdqEntries) + # store request enable (after fallback logic) store_en = Logic(ctx, 'store_en', 'w') + # Fallback load/store signals + if self.configs.fallbackIssueLoad: + fallback_load_idx_oh = LogicVec(ctx, 'fallback_load_idx_oh', 'w', self.configs.numLdqEntries) + fallback_load_en = Logic(ctx, 'fallback_load_en', 'w') + if self.configs.fallbackIssueStore: + fallback_store_en_if_valid = Logic(ctx, 'fallback_store_en_if_valid', 'w') + # Matrix Generation ld_st_conflict = LogicVecArray( ctx, 'ld_st_conflict', 'w', self.configs.numLdqEntries, self.configs.numStqEntries) @@ -707,6 +724,10 @@ def generate(self, lsq_submodules, path_rtl) -> None: # Load + ldq_alloc_p0 = LogicArray( + ctx, 'ldq_alloc_p0', pipe0_type, self.configs.numLdqEntries) + ldq_addr_valid_p0 = LogicArray( + ctx, 'ldq_addr_valid_p0', pipe0_type, self.configs.numLdqEntries) load_conflict = LogicArray( ctx, 'load_conflict', 'w', self.configs.numLdqEntries) load_req_valid = LogicArray( @@ -716,8 +737,15 @@ def generate(self, lsq_submodules, path_rtl) -> None: can_load_p0 = LogicArray( ctx, 'can_load_p0', pipe0_type, self.configs.numLdqEntries) if self.configs.pipe0: + ldq_alloc_p0.regInit(init=[0]*self.configs.numLdqEntries) + ldq_addr_valid_p0.regInit(init=[0]*self.configs.numLdqEntries) can_load_p0.regInit(init=[0]*self.configs.numLdqEntries) + # Pipeline + for i in range(0, self.configs.numLdqEntries): + arch += Op(ctx, ldq_alloc_p0[i], ldq_alloc_pcomp[i]) + for i in range(0, self.configs.numLdqEntries): + arch += Op(ctx, ldq_addr_valid_p0[i], ldq_addr_valid_pcomp[i]) # The load conflicts with any store for i in range(0, self.configs.numLdqEntries): arch += Reduce(ctx, @@ -738,15 +766,20 @@ def generate(self, lsq_submodules, path_rtl) -> None: ldq_head_oh_p0 = LogicVec( ctx, 'ldq_head_oh_p0', pipe0_type, self.configs.numLdqEntries) if self.configs.pipe0: - ldq_head_oh_p0.regInit() + 
ldq_head_oh_p0.regInit(init=0) arch += Op(ctx, ldq_head_oh_p0, ldq_head_oh) can_load_list = [] can_load_list.append(can_load) - for w in range(0, self.configs.numLdMem): + + # temporary (pre-fallback) signals + load_idx_tmp_oh = LogicVecArray(ctx, 'load_idx_tmp_oh', 'w', self.configs.numLdMem, self.configs.numLdqEntries) + load_en_tmp = LogicArray(ctx, 'load_en_tmp', 'w', self.configs.numLdMem) + + for w in range(self.configs.numLdMem): arch += CyclicPriorityMasking( - ctx, load_idx_oh[w], can_load_list[w], ldq_head_oh_p0) - arch += Reduce(ctx, load_en[w], can_load_list[w], 'or') + ctx, load_idx_tmp_oh[w], can_load_list[w], ldq_head_oh_p0) + arch += Reduce(ctx, load_en_tmp[w], can_load_list[w], 'or') if (w+1 != self.configs.numLdMem): load_idx_oh_LogicArray = LogicArray( ctx, f'load_idx_oh_Array_{w+1}', 'w', self.configs.numLdqEntries) @@ -758,6 +791,79 @@ def generate(self, lsq_submodules, path_rtl) -> None: arch += Op(ctx, can_load_list[w+1][i], 'not', load_idx_oh_LogicArray[i], 'and', can_load_list[w][i]) + for w in range(self.configs.numLdMem): + # We use the last load memory channel for potential fallback loads for two reasons: + # - It is the last channel to be utilized for regular loads, thus it is least likely we need to preempt a + # regular load for a fallback load. + # - If all load channels could be occupied by regular loads, we are preempting the youngest load. This + # probably has the least performance impact. 
+ last_load_channel = (w == self.configs.numLdMem - 1) + if self.configs.fallbackIssueLoad and last_load_channel: + # last channel: use fallback load (if any) as the first priority, then service other loads (from _tmp) + arch += Op(ctx, load_idx_oh[w], fallback_load_idx_oh, 'when', fallback_load_en, 'else', load_idx_tmp_oh[w]) + arch += Op(ctx, load_en[w], fallback_load_en, 'or', load_en_tmp[w]) + else: + # non-last load channel: use _tmp signals directly + arch += Op(ctx, load_idx_oh[w], load_idx_tmp_oh[w]) + arch += Op(ctx, load_en[w], load_en_tmp[w]) + + if self.configs.fallbackIssueLoad or self.configs.fallbackIssueStore: + # Fallback Load / Store + + # The fallback load candidate is the oldest allocated and un-issued load. + ldq_alloc_no_issue = LogicArray(ctx, 'ldq_alloc_no_issue', 'w', self.configs.numLdqEntries) + fallback_load_candidate_oh = LogicArray(ctx, 'fallback_load_candidate_oh', 'w', self.configs.numLdqEntries) + for i in range(0, self.configs.numLdqEntries): + arch += Op(ctx, ldq_alloc_no_issue[i], ldq_alloc_p0[i], 'and', 'not', ldq_issue[i]) + arch += CyclicPriorityMasking(ctx, fallback_load_candidate_oh, ldq_alloc_no_issue, ldq_head_oh_p0) + + # The fallback store candidate is the oldest allocated and un-issued store. This is simply the store at the + # store queue issue pointer (if allocated). We do not explicitly track the fallback store candidate, but + # rather just keep the relevant row/column from the order matrix. + + # If the oldest load is older than the oldest store, this contains a single bit set (at the oldest load entry). + # Otherwise (oldest store is oldest overall), this is all zeros. 
+ fallback_load_is_oldest_oh = LogicArray(ctx, 'fallback_load_is_oldest_oh', 'w', self.configs.numLdqEntries) + for i in range(self.configs.numLdqEntries): + arch += Op(ctx, fallback_load_is_oldest_oh[i], fallback_load_candidate_oh[i], 'and', 'not', store_is_older_arr[i]) + + # NOTE: For both the outstanding loads and stores, we only need to consider loads/stores which were issued + # previously. If a load (store) is issued through the regular path in the same cycle as the fallback store + # (load), it cannot conflict with the fallback store (load). This is because: + # 1. The regular load (store) must be younger than the fallback store (load) by construction. + # 2. The regular load (store) has been dependency-checked against the fallback store (load) before being + # issued. + # 3. Thus, the fallback store (load) and the regular load (store) must have different addresses. + + if self.configs.fallbackIssueLoad: + store_outstanding = Logic(ctx, 'store_outstanding', 'w') + arch += Op(ctx, store_outstanding, "'1'", 'when', '(', stq_issue, '/=', stq_resp, ')', 'else ', "'0'") + + # We can issue the fallback load candidate if: + # - It is older than the oldest store (implicit in fallback_load_is_oldest_oh[]). + # - It has a valid address. + # - There are no outstanding stores. + for i in range(self.configs.numLdqEntries): + arch += Op(ctx, (fallback_load_idx_oh, i), fallback_load_is_oldest_oh[i], 'and', ldq_addr_valid_p0[i], 'and', 'not', store_outstanding) + arch += Reduce(ctx, fallback_load_en, fallback_load_idx_oh, 'or') + + if self.configs.fallbackIssueStore: + # whether the fallback load is the oldest + fallback_load_is_oldest = Logic(ctx, 'fallback_load_is_oldest', 'w') + arch += Reduce(ctx, fallback_load_is_oldest, fallback_load_is_oldest_oh, 'or') + + # FIXME: This can reuse code from the dependency-check refactor which is part of another PR. 
+ load_outstanding_arr = LogicArray(ctx, 'load_outstanding_arr', 'w', self.configs.numLdqEntries) + load_outstanding = Logic(ctx, 'load_outstanding', 'w') + for i in range(self.configs.numLdqEntries): + arch += Op(ctx, load_outstanding_arr[i], "'1'", 'when', '(', ldq_issue[i], 'and', 'not', ldq_data_valid[i], ')', 'else', "'0'") + arch += Reduce(ctx, load_outstanding, load_outstanding_arr, 'or') + + # We can issue the fallback store candidate (if it is valid) if: + # - It is older than the oldest load (NOT fallback_load_is_oldest). + # - There are no outstanding loads. + arch += Op(ctx, fallback_store_en_if_valid, 'not', fallback_load_is_oldest, 'and', 'not', load_outstanding) + # Store # When pipelining (pipe0) is enabled, this uses look-ahead to the next store entry to reduce the critical path. # Both the current and next stores are checked for validity and conflicts, and the result is multiplexed "late @@ -769,12 +875,15 @@ def generate(self, lsq_submodules, path_rtl) -> None: for i in range(self.configs.numStqEntries): arch += Op(ctx, store_req_valid_arr[i], stq_alloc_pcomp[i], 'and', stq_addr_valid_pcomp[i], 'and', stq_data_valid_pcomp[i]) - store_conflict = Logic(ctx, 'store_conflict', 'w') store_req_valid_p0 = Logic(ctx, 'store_req_valid_p0', pipe0_type) st_ld_conflict_p0 = LogicVec(ctx, 'st_ld_conflict_p0', pipe0_type, self.configs.numLdqEntries) if self.configs.pipe0: store_req_valid_p0.regInit(init=0) st_ld_conflict_p0.regInit() + if self.configs.fallbackIssueLoad or self.configs.fallbackIssueStore: + store_is_older_arr_p0 = LogicArray(ctx, 'store_is_older_arr_p0', pipe0_type, self.configs.numLdqEntries) + if self.configs.pipe0: + store_is_older_arr_p0.regInit() # next issue pointer (needed for look-ahead when pipelining is enabled) if self.configs.pipe0: @@ -783,10 +892,12 @@ def generate(self, lsq_submodules, path_rtl) -> None: # checks for current and next (if needed) store entry store_req_valid_curr = Logic(ctx, 'store_req_valid_curr', 'w') + 
store_is_older_arr_curr = LogicArray(ctx, 'store_is_older_arr_curr', 'w', self.configs.numLdqEntries) st_ld_conflict_curr = LogicVec(ctx, 'st_ld_conflict_curr', 'w', self.configs.numLdqEntries) if self.configs.pipe0: # with pipelining: also compute for the next entry store_req_valid_next = Logic(ctx, 'store_req_valid_next', 'w') + store_is_older_arr_next = LogicArray(ctx, 'store_is_older_arr_next', 'w', self.configs.numLdqEntries) st_ld_conflict_next = LogicVec(ctx, 'st_ld_conflict_next', 'w', self.configs.numLdqEntries) # validity lookup @@ -795,6 +906,14 @@ def generate(self, lsq_submodules, path_rtl) -> None: # with pipelining: also compute for the next entry arch += MuxLookUp(ctx, store_req_valid_next, store_req_valid_arr, stq_issue_next) + # extract column from order matrix + for i in range(self.configs.numLdqEntries): + arch += Op(ctx, store_is_older_arr_curr[i], MuxIndex(store_is_older_pcomp[i], stq_issue)) + if self.configs.pipe0: + for i in range(self.configs.numLdqEntries): + # with pipelining: also compute for the next entry + arch += Op(ctx, store_is_older_arr_next[i], MuxIndex(store_is_older_pcomp[i], stq_issue_next)) + # A store conflicts with a load when: # 1. The load entry is valid, and # 2. 
The load entry hasn't completed (received data from memory), and @@ -806,8 +925,7 @@ def generate(self, lsq_submodules, path_rtl) -> None: (st_ld_conflict_curr, i), (ldq_alloc_pcomp, i), 'and', 'not', (load_completed, i), 'and', - 'not', MuxIndex( - store_is_older_pcomp[i], stq_issue), 'and', + 'not', store_is_older_arr_curr[i], 'and', '(', MuxIndex( addr_same_pcomp[i], stq_issue), 'or', 'not', (ldq_addr_valid_pcomp, i), ')' ) @@ -818,8 +936,7 @@ def generate(self, lsq_submodules, path_rtl) -> None: (st_ld_conflict_next, i), (ldq_alloc_pcomp, i), 'and', 'not', (load_completed, i), 'and', - 'not', MuxIndex( - store_is_older_pcomp[i], stq_issue_next), 'and', + 'not', store_is_older_arr_next[i], 'and', '(', MuxIndex( addr_same_pcomp[i], stq_issue_next), 'or', 'not', (ldq_addr_valid_pcomp, i), ')' ) @@ -831,15 +948,31 @@ def generate(self, lsq_submodules, path_rtl) -> None: 'when', stq_issue_en, 'else', st_ld_conflict_curr) arch += Op(ctx, store_req_valid_p0, store_req_valid_next, 'when', stq_issue_en, 'else', store_req_valid_curr) + if self.configs.fallbackIssueLoad or self.configs.fallbackIssueStore: + for i in range(self.configs.numLdqEntries): + arch += Op(ctx, store_is_older_arr_p0[i], store_is_older_arr_next[i], 'when', + stq_issue_en, 'else', store_is_older_arr_curr[i]) else: # without pipelining: only consider current store entry arch += Op(ctx, st_ld_conflict_p0, st_ld_conflict_curr) arch += Op(ctx, store_req_valid_p0, store_req_valid_curr) + if self.configs.fallbackIssueLoad or self.configs.fallbackIssueStore: + for i in range(self.configs.numLdqEntries): + arch += Op(ctx, store_is_older_arr_p0[i], store_is_older_arr_curr[i]) # The store conflicts with any load arch += Reduce(ctx, store_conflict, st_ld_conflict_p0, 'or') - arch += Op(ctx, store_en, 'not', store_conflict, 'and', store_req_valid_p0) arch += Op(ctx, store_idx, stq_issue) + # The store can be issued when it is valid AND (no conflict OR, with fallback store issue enabled, it is older than the fallback load and no loads are outstanding). 
+ if self.configs.fallbackIssueStore: + arch += Op(ctx, store_en, store_req_valid_p0, 'and', '(', 'not', store_conflict, 'or', fallback_store_en_if_valid, ')') + else: + arch += Op(ctx, store_en, store_req_valid_p0, 'and', 'not', store_conflict) + + # ordering information needed by fallback issue logic + if self.configs.fallbackIssueLoad or self.configs.fallbackIssueStore: + for i in range(self.configs.numLdqEntries): + arch += Op(ctx, store_is_older_arr[i], store_is_older_arr_p0[i]) # Bypass bypass_idx_oh_p0 = LogicVecArray( diff --git a/tools/integration/TEST_SUITE.cpp b/tools/integration/TEST_SUITE.cpp index f58c5c9595..9493f29c46 100644 --- a/tools/integration/TEST_SUITE.cpp +++ b/tools/integration/TEST_SUITE.cpp @@ -284,6 +284,7 @@ INSTANTIATE_TEST_SUITE_P( "sumi3_mem", "symm_float", "syr2k_float", + "test_memory_deps", "test_stdint", "threshold", "triangular", @@ -329,7 +330,7 @@ INSTANTIATE_TEST_SUITE_P( "matvec" ), [](const auto &info) { return info.param; }); -#endif +#endif INSTANTIATE_TEST_SUITE_P( MemoryBenchmarks, MemoryFixture,