From 0f59e2fe03dee808506c41100f2c7198babe22d9 Mon Sep 17 00:00:00 2001
From: Max Wipfli <mwipfli@ethz.ch>
Date: Sat, 28 Feb 2026 21:52:23 +0100
Subject: [PATCH 01/11] [LSQ] Add fallback logic to always issue oldest load or
 store

---
 .../vhdl_gen/generators/lsq.py                | 93 +++++++++++++++++--
 1 file changed, 85 insertions(+), 8 deletions(-)

diff --git a/tools/backend/lsq-generator-python/vhdl_gen/generators/lsq.py b/tools/backend/lsq-generator-python/vhdl_gen/generators/lsq.py
index a1f5eddd94..ff806ef93b 100644
--- a/tools/backend/lsq-generator-python/vhdl_gen/generators/lsq.py
+++ b/tools/backend/lsq-generator-python/vhdl_gen/generators/lsq.py
@@ -581,9 +581,22 @@ def generate(self, lsq_submodules, path_rtl) -> None:
 
         # Multiple store channels not yet implemented
         assert (self.configs.numStMem == 1)
+        # current store request index
         store_idx = LogicVec(ctx, 'store_idx', 'w', self.configs.stqAddrW)
+        # whether the current store request is valid, including address and data
+        store_req_valid = Logic(ctx, 'store_req_valid', 'w')
+        # whether the current store request has conflicts with any previous loads
+        store_conflict = Logic(ctx, 'store_conflict', 'w')
+        # whether the to-be-issued store entry is older than each of the load entries
+        store_is_older_arr = LogicArray(ctx, 'store_is_older_arr', 'w', self.configs.numLdqEntries)
+        # store request enable (after fallback logic)
         store_en = Logic(ctx, 'store_en', 'w')
 
+        # Fallback load/store signals
+        fallback_load_is_oldest = Logic(ctx, 'fallback_load_is_oldest', 'w')
+        fallback_load_idx_oh = LogicVec(ctx, 'fallback_load_idx_oh', 'w', self.configs.numLdqEntries)
+        fallback_load_en = Logic(ctx, 'fallback_load_en', 'w')
+
         # Matrix Generation
         ld_st_conflict = LogicVecArray(
             ctx, 'ld_st_conflict', 'w', self.configs.numLdqEntries, self.configs.numStqEntries)
@@ -711,11 +724,14 @@ def generate(self, lsq_submodules, path_rtl) -> None:
             ctx, 'load_conflict', 'w', self.configs.numLdqEntries)
         load_req_valid = LogicArray(
             ctx, 'load_req_valid', 'w', self.configs.numLdqEntries)
+        load_req_valid_p0 = LogicArray(
+            ctx, 'load_req_valid_p0', pipe0_type, self.configs.numLdqEntries)
         can_load = LogicArray(
             ctx, 'can_load', 'w', self.configs.numLdqEntries)
         can_load_p0 = LogicArray(
             ctx, 'can_load_p0', pipe0_type, self.configs.numLdqEntries)
         if self.configs.pipe0:
+            load_req_valid_p0.regInit(init=[0]*self.configs.numLdqEntries)
             can_load_p0.regInit(init=[0]*self.configs.numLdqEntries)
 
         # The load conflicts with any store
@@ -727,6 +743,8 @@ def generate(self, lsq_submodules, path_rtl) -> None:
         for i in range(0, self.configs.numLdqEntries):
             arch += Op(ctx, load_req_valid[i], ldq_alloc_pcomp[i],
                        'and', ldq_addr_valid_pcomp[i])
+        for i in range(0, self.configs.numLdqEntries):
+            arch += Op(ctx, load_req_valid_p0[i], load_req_valid[i])
         # Generate list for loads that does not face dependency issue
         for i in range(0, self.configs.numLdqEntries):
             arch += Op(ctx, can_load_p0[i], 'not',
@@ -743,10 +761,15 @@ def generate(self, lsq_submodules, path_rtl) -> None:
 
         can_load_list = []
         can_load_list.append(can_load)
-        for w in range(0, self.configs.numLdMem):
+
+        # temporary (pre-fallback) signals
+        load_idx_tmp_oh = LogicVecArray(ctx, 'load_idx_tmp_oh', 'w', self.configs.numLdMem, self.configs.numLdqEntries)
+        load_en_tmp = LogicArray(ctx, 'load_en_tmp', 'w', self.configs.numLdMem)
+
+        for w in range(self.configs.numLdMem):
             arch += CyclicPriorityMasking(
-                ctx, load_idx_oh[w], can_load_list[w], ldq_head_oh_p0)
-            arch += Reduce(ctx, load_en[w], can_load_list[w], 'or')
+                ctx, load_idx_tmp_oh[w], can_load_list[w], ldq_head_oh_p0)
+            arch += Reduce(ctx, load_en_tmp[w], can_load_list[w], 'or')
             if (w+1 != self.configs.numLdMem):
                 load_idx_oh_LogicArray = LogicArray(
                     ctx, f'load_idx_oh_Array_{w+1}', 'w', self.configs.numLdqEntries)
@@ -758,6 +781,40 @@ def generate(self, lsq_submodules, path_rtl) -> None:
                     arch += Op(ctx, can_load_list[w+1][i], 'not',
                                load_idx_oh_LogicArray[i], 'and', can_load_list[w][i])
 
+        for w in range(self.configs.numLdMem):
+            if w != self.configs.numLdMem - 1:
+                # non-last load port: use _tmp signals directly
+                arch += Op(ctx, load_idx_oh[w], load_idx_tmp_oh[w])
+                arch += Op(ctx, load_en[w], load_en_tmp[w])
+            else:
+                # last load port: use fallback load (if any) as the first priority, then service
+                # other loads (from _tmp)
+                arch += Op(ctx, load_idx_oh[w], fallback_load_idx_oh, 'when', fallback_load_en, 'else', load_idx_tmp_oh[w])
+                arch += Op(ctx, load_en[w], fallback_load_en, 'or', load_en_tmp[w])
+
+        # Fallback Load / Store
+
+        # Find the oldest unissue load. This may still (a) be younger than the oldest store, (b) be
+        # not allocated, or (c) have an invalid address.
+        ldq_issue_not = LogicArray(ctx, 'ldq_issue_not', 'w', self.configs.numLdqEntries)
+        oldest_unissued_load_oh = LogicArray(ctx, 'oldest_unissued_load_oh', 'w', self.configs.numLdqEntries)
+        for i in range(0, self.configs.numLdqEntries):
+            arch += Op(ctx, ldq_issue_not[i], 'not', ldq_issue[i])
+        arch += CyclicPriorityMasking(ctx, oldest_unissued_load_oh, ldq_issue_not, ldq_head_oh_p0)
+
+        # If the fallback load is older than the fallback store, this contains a single bit set (at the fallback load entry).
+        # Otherwise (fallback store is oldest), this is all zeros.
+        fallback_load_is_oldest_oh = LogicArray(ctx, 'fallback_load_is_oldest_oh', 'w', self.configs.numLdqEntries)
+        for i in range(self.configs.numLdqEntries):
+            arch += Op(ctx, fallback_load_is_oldest_oh[i], 'not', store_is_older_arr[i], 'and', oldest_unissued_load_oh[i])
+        # Whether the fallback load is the oldest.
+        arch += Reduce(ctx, fallback_load_is_oldest, fallback_load_is_oldest_oh, 'or')
+
+        # fallback load: needs to be the oldest AND actually ready to be issued (allocated and address valid)
+        for i in range(self.configs.numLdqEntries):
+            arch += Op(ctx, (fallback_load_idx_oh, i), load_req_valid_p0[i], 'and', fallback_load_is_oldest_oh[i])
+        arch += Reduce(ctx, fallback_load_en, fallback_load_idx_oh, 'or')
+
         # Store
         # When pipelining (pipe0) is enabled, this uses look-ahead to the next store entry to reduce the critical path.
         # Both the current and next stores are checked for validity and conflicts, and the result is multiplexed "late
@@ -771,9 +828,11 @@ def generate(self, lsq_submodules, path_rtl) -> None:
 
         store_conflict = Logic(ctx, 'store_conflict', 'w')
         store_req_valid_p0 = Logic(ctx, 'store_req_valid_p0', pipe0_type)
+        store_is_older_arr_p0 = LogicArray(ctx, 'store_is_older_arr_p0', pipe0_type, self.configs.numLdqEntries)
         st_ld_conflict_p0 = LogicVec(ctx, 'st_ld_conflict_p0', pipe0_type, self.configs.numLdqEntries)
         if self.configs.pipe0:
             store_req_valid_p0.regInit(init=0)
+            store_is_older_arr_p0.regInit()
             st_ld_conflict_p0.regInit()
 
         # next issue pointer (needed for look-ahead when pipelining is enabled)
@@ -783,10 +842,12 @@ def generate(self, lsq_submodules, path_rtl) -> None:
 
         # checks for current and next (if needed) store entry
         store_req_valid_curr = Logic(ctx, 'store_req_valid_curr', 'w')
+        store_is_older_arr_curr = LogicArray(ctx, 'store_is_older_arr_curr', 'w', self.configs.numLdqEntries)
         st_ld_conflict_curr = LogicVec(ctx, 'st_ld_conflict_curr', 'w', self.configs.numLdqEntries)
         if self.configs.pipe0:
             # with pipelining: also compute for the next entry
             store_req_valid_next = Logic(ctx, 'store_req_valid_next', 'w')
+            store_is_older_arr_next = LogicArray(ctx, 'store_is_older_arr_next', 'w', self.configs.numLdqEntries)
             st_ld_conflict_next = LogicVec(ctx, 'st_ld_conflict_next', 'w', self.configs.numLdqEntries)
 
         # validity lookup
@@ -795,6 +856,14 @@ def generate(self, lsq_submodules, path_rtl) -> None:
             # with pipelining: also compute for the next entry
             arch += MuxLookUp(ctx, store_req_valid_next, store_req_valid_arr, stq_issue_next)
 
+        # extract column from order matrix
+        for i in range(self.configs.numLdqEntries):
+            arch += Op(ctx, store_is_older_arr_curr[i], MuxIndex(store_is_older_pcomp[i], stq_issue))
+        if self.configs.pipe0:
+            for i in range(self.configs.numLdqEntries):
+                # with pipelining: also compute for the next entry
+                arch += Op(ctx, store_is_older_arr_next[i], MuxIndex(store_is_older_pcomp[i], stq_issue_next))
+
         # A store conflicts with a load when:
         # 1. The load entry is valid, and
         # 2. The load entry hasn't completed (received data from memory), and
@@ -806,8 +875,7 @@ def generate(self, lsq_submodules, path_rtl) -> None:
                        (st_ld_conflict_curr, i),
                        (ldq_alloc_pcomp, i), 'and',
                        'not', (load_completed, i), 'and',
-                       'not', MuxIndex(
-                           store_is_older_pcomp[i], stq_issue), 'and',
+                       'not', store_is_older_arr_curr[i], 'and',
                        '(', MuxIndex(
                            addr_same_pcomp[i], stq_issue), 'or', 'not', (ldq_addr_valid_pcomp, i), ')'
                        )
@@ -818,8 +886,7 @@ def generate(self, lsq_submodules, path_rtl) -> None:
                            (st_ld_conflict_next, i),
                            (ldq_alloc_pcomp, i), 'and',
                            'not', (load_completed, i), 'and',
-                           'not', MuxIndex(
-                               store_is_older_pcomp[i], stq_issue_next), 'and',
+                           'not', store_is_older_arr_next[i], 'and',
                            '(', MuxIndex(
                                addr_same_pcomp[i], stq_issue_next), 'or', 'not', (ldq_addr_valid_pcomp, i), ')'
                            )
@@ -831,15 +898,25 @@ def generate(self, lsq_submodules, path_rtl) -> None:
                        'when', stq_issue_en, 'else', st_ld_conflict_curr)
             arch += Op(ctx, store_req_valid_p0, store_req_valid_next, 'when',
                        stq_issue_en, 'else', store_req_valid_curr)
+            for i in range(self.configs.numLdqEntries):
+                arch += Op(ctx, store_is_older_arr_p0[i], store_is_older_arr_next[i], 'when',
+                           stq_issue_en, 'else', store_is_older_arr_curr[i])
         else:
             # without pipelining: only consider current store entry
             arch += Op(ctx, st_ld_conflict_p0, st_ld_conflict_curr)
             arch += Op(ctx, store_req_valid_p0, store_req_valid_curr)
+            for i in range(self.configs.numLdqEntries):
+                arch += Op(ctx, store_is_older_arr_p0[i], store_is_older_arr_curr[i])
 
         # The store conflicts with any load
         arch += Reduce(ctx, store_conflict, st_ld_conflict_p0, 'or')
-        arch += Op(ctx, store_en, 'not', store_conflict, 'and', store_req_valid_p0)
+        # The store can be issued when it is valid AND (no conflict OR it is older than the fallback load).
+        arch += Op(ctx, store_en, store_req_valid_p0, 'and', '(', 'not', store_conflict, 'or', 'not', fallback_load_is_oldest, ')')
         arch += Op(ctx, store_idx, stq_issue)
+        # needed for fallback logic
+        # FIXME: conditionally enable based on fallback issue flag
+        for i in range(self.configs.numLdqEntries):
+            arch += Op(ctx, store_is_older_arr[i], store_is_older_arr_p0[i])
 
         # Bypass
         bypass_idx_oh_p0 = LogicVecArray(

From 4791aebb429d7a86e482d59da04343e133284bd0 Mon Sep 17 00:00:00 2001
From: Max Wipfli <mwipfli@ethz.ch>
Date: Mon, 2 Mar 2026 14:11:57 +0100
Subject: [PATCH 02/11] [LSQ] Add reset to some registers

---
 tools/backend/lsq-generator-python/vhdl_gen/generators/lsq.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tools/backend/lsq-generator-python/vhdl_gen/generators/lsq.py b/tools/backend/lsq-generator-python/vhdl_gen/generators/lsq.py
index ff806ef93b..f8a88f3e6b 100644
--- a/tools/backend/lsq-generator-python/vhdl_gen/generators/lsq.py
+++ b/tools/backend/lsq-generator-python/vhdl_gen/generators/lsq.py
@@ -482,7 +482,7 @@ def generate(self, lsq_submodules, path_rtl) -> None:
 
         # Load Queue Entries
         ldq_alloc.regInit(init=[0]*self.configs.numLdqEntries)
-        ldq_issue.regInit()
+        ldq_issue.regInit(init=[0]*self.configs.numLdqEntries)
         if (self.configs.ldpAddrW > 0):
             ldq_port_idx.regInit(ldq_wen)
         ldq_addr_valid.regInit()
@@ -756,7 +756,7 @@ def generate(self, lsq_submodules, path_rtl) -> None:
         ldq_head_oh_p0 = LogicVec(
             ctx, 'ldq_head_oh_p0', pipe0_type, self.configs.numLdqEntries)
         if self.configs.pipe0:
-            ldq_head_oh_p0.regInit()
+            ldq_head_oh_p0.regInit(init=0)
         arch += Op(ctx, ldq_head_oh_p0, ldq_head_oh)
 
         can_load_list = []

From ea2be9933ad51e463ba7669d5e074355e97d4029 Mon Sep 17 00:00:00 2001
From: Max Wipfli <mwipfli@ethz.ch>
Date: Mon, 2 Mar 2026 14:23:15 +0100
Subject: [PATCH 03/11] [integration-test] Add simple memory dependency test

---
 .../test_memory_deps/test_memory_deps.c       | 32 +++++++++++++++++++
 .../test_memory_deps/test_memory_deps.h       |  9 ++++++
 tools/integration/TEST_SUITE.cpp              |  3 +-
 3 files changed, 43 insertions(+), 1 deletion(-)
 create mode 100644 integration-test/test_memory_deps/test_memory_deps.c
 create mode 100644 integration-test/test_memory_deps/test_memory_deps.h

diff --git a/integration-test/test_memory_deps/test_memory_deps.c b/integration-test/test_memory_deps/test_memory_deps.c
new file mode 100644
index 0000000000..2b2a64118e
--- /dev/null
+++ b/integration-test/test_memory_deps/test_memory_deps.c
@@ -0,0 +1,32 @@
+//===- test_memory_deps.c -----------------------------------------*- C -*-===//
+
+#include "test_memory_deps.h"
+#include "dynamatic/Integration.h"
+#include <stdlib.h>
+
+void test_memory_deps(in_int_t load_addrs[1000], in_int_t store_addrs[1000], inout_int_t data[1000], in_int_t n) {
+  int sum = 0;
+  for (int i = 0; i < n; ++i) {
+    sum += data[load_addrs[i]];
+    data[store_addrs[i]] = i;
+  }
+  data[0] = sum;
+}
+
+int main(void) {
+  in_int_t load_addrs[1000];
+  in_int_t store_addrs[1000];
+  inout_int_t data[1000];
+
+  in_int_t n = 1000;
+  for (int i = 0; i < n; ++i) {
+    // loads hit random addresses in [1, 4]; stores hit 1 once, then 2 (RAW and WAR hazards)
+    load_addrs[i] = (rand() % 4) + 1;
+    load_addrs[i] = (rand() % 4) + 1;
+    store_addrs[i] = (i == 0) ? 1 : 2;
+    data[i] = i;
+  }
+
+  CALL_KERNEL(test_memory_deps, load_addrs, store_addrs, data, n);
+  return 0;
+}
diff --git a/integration-test/test_memory_deps/test_memory_deps.h b/integration-test/test_memory_deps/test_memory_deps.h
new file mode 100644
index 0000000000..74a5719499
--- /dev/null
+++ b/integration-test/test_memory_deps/test_memory_deps.h
@@ -0,0 +1,9 @@
+#ifndef TEST_MEMORY_DEPS_TEST_MEMORY_DEPS_H
+#define TEST_MEMORY_DEPS_TEST_MEMORY_DEPS_H
+
+typedef int in_int_t;
+typedef int inout_int_t;
+
+void test_memory_deps(in_int_t load_addrs[1000], in_int_t store_addrs[1000], inout_int_t data[1000], in_int_t n);
+
+#endif // TEST_MEMORY_DEPS_TEST_MEMORY_DEPS_H
diff --git a/tools/integration/TEST_SUITE.cpp b/tools/integration/TEST_SUITE.cpp
index f58c5c9595..9493f29c46 100644
--- a/tools/integration/TEST_SUITE.cpp
+++ b/tools/integration/TEST_SUITE.cpp
@@ -284,6 +284,7 @@ INSTANTIATE_TEST_SUITE_P(
       "sumi3_mem",
       "symm_float",
       "syr2k_float",
+      "test_memory_deps",
       "test_stdint",
       "threshold",
       "triangular",
@@ -329,7 +330,7 @@ INSTANTIATE_TEST_SUITE_P(
       "matvec"
       ),
       [](const auto &info) { return info.param; });
-#endif 
+#endif
 
 INSTANTIATE_TEST_SUITE_P(
     MemoryBenchmarks, MemoryFixture,

From 17b0ebb67fe274c22e7cb9c389279dc08438b364 Mon Sep 17 00:00:00 2001
From: Max Wipfli <mwipfli@ethz.ch>
Date: Mon, 2 Mar 2026 14:24:37 +0100
Subject: [PATCH 04/11] [LSQ] Wait for outstanding stores (loads) for fallback
 loads (stores)

---
 .../vhdl_gen/generators/lsq.py                | 67 ++++++++++++++-----
 1 file changed, 49 insertions(+), 18 deletions(-)

diff --git a/tools/backend/lsq-generator-python/vhdl_gen/generators/lsq.py b/tools/backend/lsq-generator-python/vhdl_gen/generators/lsq.py
index f8a88f3e6b..bec1671f20 100644
--- a/tools/backend/lsq-generator-python/vhdl_gen/generators/lsq.py
+++ b/tools/backend/lsq-generator-python/vhdl_gen/generators/lsq.py
@@ -593,9 +593,9 @@ def generate(self, lsq_submodules, path_rtl) -> None:
         store_en = Logic(ctx, 'store_en', 'w')
 
         # Fallback load/store signals
-        fallback_load_is_oldest = Logic(ctx, 'fallback_load_is_oldest', 'w')
         fallback_load_idx_oh = LogicVec(ctx, 'fallback_load_idx_oh', 'w', self.configs.numLdqEntries)
         fallback_load_en = Logic(ctx, 'fallback_load_en', 'w')
+        fallback_store_en_if_valid = Logic(ctx, 'fallback_store_en_if_valid', 'w')
 
         # Matrix Generation
         ld_st_conflict = LogicVecArray(
@@ -720,20 +720,28 @@ def generate(self, lsq_submodules, path_rtl) -> None:
 
         # Load
 
+        ldq_alloc_p0 = LogicArray(
+            ctx, 'ldq_alloc_p0', pipe0_type, self.configs.numLdqEntries)
+        ldq_addr_valid_p0 = LogicArray(
+            ctx, 'ldq_addr_valid_p0', pipe0_type, self.configs.numLdqEntries)
         load_conflict = LogicArray(
             ctx, 'load_conflict', 'w', self.configs.numLdqEntries)
         load_req_valid = LogicArray(
             ctx, 'load_req_valid', 'w', self.configs.numLdqEntries)
-        load_req_valid_p0 = LogicArray(
-            ctx, 'load_req_valid_p0', pipe0_type, self.configs.numLdqEntries)
         can_load = LogicArray(
             ctx, 'can_load', 'w', self.configs.numLdqEntries)
         can_load_p0 = LogicArray(
             ctx, 'can_load_p0', pipe0_type, self.configs.numLdqEntries)
         if self.configs.pipe0:
-            load_req_valid_p0.regInit(init=[0]*self.configs.numLdqEntries)
+            ldq_alloc_p0.regInit(init=[0]*self.configs.numLdqEntries)
+            ldq_addr_valid_p0.regInit(init=[0]*self.configs.numLdqEntries)
             can_load_p0.regInit(init=[0]*self.configs.numLdqEntries)
 
+        # Pipeline
+        for i in range(0, self.configs.numLdqEntries):
+            arch += Op(ctx, ldq_alloc_p0[i], ldq_alloc_pcomp[i])
+        for i in range(0, self.configs.numLdqEntries):
+            arch += Op(ctx, ldq_addr_valid_p0[i], ldq_addr_valid_pcomp[i])
         # The load conflicts with any store
         for i in range(0, self.configs.numLdqEntries):
             arch += Reduce(ctx,
@@ -743,8 +751,6 @@ def generate(self, lsq_submodules, path_rtl) -> None:
         for i in range(0, self.configs.numLdqEntries):
             arch += Op(ctx, load_req_valid[i], ldq_alloc_pcomp[i],
                        'and', ldq_addr_valid_pcomp[i])
-        for i in range(0, self.configs.numLdqEntries):
-            arch += Op(ctx, load_req_valid_p0[i], load_req_valid[i])
         # Generate list for loads that does not face dependency issue
         for i in range(0, self.configs.numLdqEntries):
             arch += Op(ctx, can_load_p0[i], 'not',
@@ -794,27 +800,52 @@ def generate(self, lsq_submodules, path_rtl) -> None:
 
         # Fallback Load / Store
 
-        # Find the oldest unissue load. This may still (a) be younger than the oldest store, (b) be
-        # not allocated, or (c) have an invalid address.
-        ldq_issue_not = LogicArray(ctx, 'ldq_issue_not', 'w', self.configs.numLdqEntries)
-        oldest_unissued_load_oh = LogicArray(ctx, 'oldest_unissued_load_oh', 'w', self.configs.numLdqEntries)
+        # The fallback load candidate is the oldest allocated and un-issued load.
+        ldq_alloc_no_issue = LogicArray(ctx, 'ldq_alloc_no_issue', 'w', self.configs.numLdqEntries)
+        fallback_load_candidate_oh = LogicArray(ctx, 'fallback_load_candidate_oh', 'w', self.configs.numLdqEntries)
         for i in range(0, self.configs.numLdqEntries):
-            arch += Op(ctx, ldq_issue_not[i], 'not', ldq_issue[i])
-        arch += CyclicPriorityMasking(ctx, oldest_unissued_load_oh, ldq_issue_not, ldq_head_oh_p0)
+            arch += Op(ctx, ldq_alloc_no_issue[i], ldq_alloc_p0[i], 'and', 'not', ldq_issue[i])
+        arch += CyclicPriorityMasking(ctx, fallback_load_candidate_oh, ldq_alloc_no_issue, ldq_head_oh_p0)
+
+        # The fallback store canddiate is the oldest allocated and un-issued store. This is simply
+        # the store at the store queue issue pointer (if allocated). We do not explicitly track the
+        # fallback store candidate, but rather just keep the relevant row/column from the order
+        # matrix.
 
-        # If the fallback load is older than the fallback store, this contains a single bit set (at the fallback load entry).
-        # Otherwise (fallback store is oldest), this is all zeros.
+        # If the oldest load is older than the oldest store, this contains a single bit set (at the oldest load entry).
+        # Otherwise (oldest store is oldest overall), this is all zeros.
         fallback_load_is_oldest_oh = LogicArray(ctx, 'fallback_load_is_oldest_oh', 'w', self.configs.numLdqEntries)
         for i in range(self.configs.numLdqEntries):
-            arch += Op(ctx, fallback_load_is_oldest_oh[i], 'not', store_is_older_arr[i], 'and', oldest_unissued_load_oh[i])
+            arch += Op(ctx, fallback_load_is_oldest_oh[i], fallback_load_candidate_oh[i], 'and', 'not', store_is_older_arr[i])
+
         # Whether the fallback load is the oldest.
+        fallback_load_is_oldest = Logic(ctx, 'fallback_load_is_oldest', 'w')
         arch += Reduce(ctx, fallback_load_is_oldest, fallback_load_is_oldest_oh, 'or')
 
-        # fallback load: needs to be the oldest AND actually ready to be issued (allocated and address valid)
+        # FIXME: What if we are about to issue a store? We shouldn't issue a fallback load in the same cycle.
+        store_outstanding = Logic(ctx, 'store_outstanding', 'w')
+        arch += Op(ctx, store_outstanding, "'1'", 'when', '(', stq_issue, '/=', stq_resp, ')', 'else ', "'0'")
+
+        # FIXME: What if we are about to issue a load? We shouldn't issue a fallback store in the same cycle.
+        load_oustanding_arr = LogicArray(ctx, 'load_outstanding_arr', 'w', self.configs.numLdqEntries)
+        load_outstanding = Logic(ctx, 'load_outstanding', 'w')
         for i in range(self.configs.numLdqEntries):
-            arch += Op(ctx, (fallback_load_idx_oh, i), load_req_valid_p0[i], 'and', fallback_load_is_oldest_oh[i])
+            arch += Op(ctx, load_oustanding_arr[i], "'1'", 'when', '(', ldq_issue[i], 'and', 'not', ldq_data_valid[i], ')', 'else', "'0'")
+        arch += Reduce(ctx, load_outstanding, load_oustanding_arr, 'or')
+
+        # We can issue the fallback load candidate if:
+        # - It is older than the oldest store (implicit in fallback_load_is_oldest_oh[]).
+        # - It has a valid address.
+        # - There are no outstanding stores.
+        for i in range(self.configs.numLdqEntries):
+            arch += Op(ctx, (fallback_load_idx_oh, i), fallback_load_is_oldest_oh[i], 'and', ldq_addr_valid_p0[i], 'and', 'not', store_outstanding)
         arch += Reduce(ctx, fallback_load_en, fallback_load_idx_oh, 'or')
 
+        # We can issue the fallback store candidate (if it is valid) if:
+        # - It is older than the oldest store (NOT fallback_load_is_oldest).
+        # - There are no outstanding loads.
+        arch += Op(ctx, fallback_store_en_if_valid, 'not', fallback_load_is_oldest, 'and', 'not', load_outstanding)
+
         # Store
         # When pipelining (pipe0) is enabled, this uses look-ahead to the next store entry to reduce the critical path.
         # Both the current and next stores are checked for validity and conflicts, and the result is multiplexed "late
@@ -911,7 +942,7 @@ def generate(self, lsq_submodules, path_rtl) -> None:
         # The store conflicts with any load
         arch += Reduce(ctx, store_conflict, st_ld_conflict_p0, 'or')
         # The store can be issued when it is valid AND (no conflict OR it is older than the fallback load).
-        arch += Op(ctx, store_en, store_req_valid_p0, 'and', '(', 'not', store_conflict, 'or', 'not', fallback_load_is_oldest, ')')
+        arch += Op(ctx, store_en, store_req_valid_p0, 'and', '(', 'not', store_conflict, 'or', fallback_store_en_if_valid, ')')
         arch += Op(ctx, store_idx, stq_issue)
         # needed for fallback logic
         # FIXME: conditionally enable based on fallback issue flag

From 36887c218b749c9de90f9c91860ae09dc3e8de3d Mon Sep 17 00:00:00 2001
From: Max Wipfli <mwipfli@ethz.ch>
Date: Tue, 3 Mar 2026 13:03:55 +0100
Subject: [PATCH 05/11] [LSQ] Use flag to enable/disable fallback issue

---
 .../lsq-generator-python/vhdl_gen/configs.py  |  13 ++
 .../vhdl_gen/generators/lsq.py                | 135 ++++++++++--------
 2 files changed, 85 insertions(+), 63 deletions(-)

diff --git a/tools/backend/lsq-generator-python/vhdl_gen/configs.py b/tools/backend/lsq-generator-python/vhdl_gen/configs.py
index a83ae2de0d..fc873d4c29 100644
--- a/tools/backend/lsq-generator-python/vhdl_gen/configs.py
+++ b/tools/backend/lsq-generator-python/vhdl_gen/configs.py
@@ -65,6 +65,9 @@ class Configs:
     stResp:        bool = False     # Whether store response channel in store access port is enabled
     gaMulti:       bool = False     # Whether multiple groups are allowed to request an allocation at the same cycle
     bypass:        bool = True      # Whether bypassing (store-to-load forwarding) is enabled
+    # guarantees execution of the oldest pending memory operation (load or store) in the presence of false conflicts
+    # (which can happen with approximate address comparison)
+    fallbackIssue: bool = False
 
     def __init__(self, config: dict) -> None:
         self.name = config["name"]
@@ -82,7 +85,10 @@ def __init__(self, config: dict) -> None:
 
         self.stResp = bool(config["stResp"])
         self.gaMulti = bool(config["groupMulti"])
+
+        # TODO: set based on requested LSQ model
         self.bypass = True
+        self.fallbackIssue = False
 
         self.gaNumLoads = config["numLoads"]
         self.gaNumStores = config["numStores"]
@@ -120,3 +126,10 @@ def __init__(self, config: dict) -> None:
         assert (len(self.gaLdOrder) == self.numGroups)
         assert (len(self.gaLdPortIdx) == self.numGroups)
         assert (len(self.gaStPortIdx) == self.numGroups)
+
+        if self.fallbackIssue:
+            assert not self.bypass, "Fallback issue is not compatible with bypassing."
+            # TODO: To properly support multiple load channels, we need to ensure that the fallback load is not
+            # a duplicate of a load issued by another load channel in the same cycle. Multiple load channels are not
+            # currently used by Dynamatic, so this is left as future work.
+            assert self.numLdMem == 1, "Fallback issue is only supported for single load port configuration."
diff --git a/tools/backend/lsq-generator-python/vhdl_gen/generators/lsq.py b/tools/backend/lsq-generator-python/vhdl_gen/generators/lsq.py
index bec1671f20..f19501d282 100644
--- a/tools/backend/lsq-generator-python/vhdl_gen/generators/lsq.py
+++ b/tools/backend/lsq-generator-python/vhdl_gen/generators/lsq.py
@@ -587,15 +587,17 @@ def generate(self, lsq_submodules, path_rtl) -> None:
         store_req_valid = Logic(ctx, 'store_req_valid', 'w')
         # whether the current store request has conflicts with any previous loads
         store_conflict = Logic(ctx, 'store_conflict', 'w')
-        # whether the to-be-issued store entry is older than each of the load entries
-        store_is_older_arr = LogicArray(ctx, 'store_is_older_arr', 'w', self.configs.numLdqEntries)
+        if self.configs.fallbackIssue:
+            # whether the to-be-issued store entry is older than each of the load entries
+            store_is_older_arr = LogicArray(ctx, 'store_is_older_arr', 'w', self.configs.numLdqEntries)
         # store request enable (after fallback logic)
         store_en = Logic(ctx, 'store_en', 'w')
 
         # Fallback load/store signals
-        fallback_load_idx_oh = LogicVec(ctx, 'fallback_load_idx_oh', 'w', self.configs.numLdqEntries)
-        fallback_load_en = Logic(ctx, 'fallback_load_en', 'w')
-        fallback_store_en_if_valid = Logic(ctx, 'fallback_store_en_if_valid', 'w')
+        if self.configs.fallbackIssue:
+            fallback_load_idx_oh = LogicVec(ctx, 'fallback_load_idx_oh', 'w', self.configs.numLdqEntries)
+            fallback_load_en = Logic(ctx, 'fallback_load_en', 'w')
+            fallback_store_en_if_valid = Logic(ctx, 'fallback_store_en_if_valid', 'w')
 
         # Matrix Generation
         ld_st_conflict = LogicVecArray(
@@ -788,63 +790,64 @@ def generate(self, lsq_submodules, path_rtl) -> None:
                                load_idx_oh_LogicArray[i], 'and', can_load_list[w][i])
 
         for w in range(self.configs.numLdMem):
-            if w != self.configs.numLdMem - 1:
+            last_load_port = (w == self.configs.numLdMem - 1)
+            if self.configs.fallbackIssue and last_load_port:
+                # last load port: use fallback load (if any) as the first priority, then service other loads (from _tmp)
+                arch += Op(ctx, load_idx_oh[w], fallback_load_idx_oh, 'when', fallback_load_en, 'else', load_idx_tmp_oh[w])
+                arch += Op(ctx, load_en[w], fallback_load_en, 'or', load_en_tmp[w])
+            else:
                 # non-last load port: use _tmp signals directly
                 arch += Op(ctx, load_idx_oh[w], load_idx_tmp_oh[w])
                 arch += Op(ctx, load_en[w], load_en_tmp[w])
-            else:
-                # last load port: use fallback load (if any) as the first priority, then service
-                # other loads (from _tmp)
-                arch += Op(ctx, load_idx_oh[w], fallback_load_idx_oh, 'when', fallback_load_en, 'else', load_idx_tmp_oh[w])
-                arch += Op(ctx, load_en[w], fallback_load_en, 'or', load_en_tmp[w])
 
-        # Fallback Load / Store
+        if self.configs.fallbackIssue:
+            # Fallback Load / Store
 
-        # The fallback load candidate is the oldest allocated and un-issued load.
-        ldq_alloc_no_issue = LogicArray(ctx, 'ldq_alloc_no_issue', 'w', self.configs.numLdqEntries)
-        fallback_load_candidate_oh = LogicArray(ctx, 'fallback_load_candidate_oh', 'w', self.configs.numLdqEntries)
-        for i in range(0, self.configs.numLdqEntries):
-            arch += Op(ctx, ldq_alloc_no_issue[i], ldq_alloc_p0[i], 'and', 'not', ldq_issue[i])
-        arch += CyclicPriorityMasking(ctx, fallback_load_candidate_oh, ldq_alloc_no_issue, ldq_head_oh_p0)
+            # The fallback load candidate is the oldest allocated and un-issued load.
+            ldq_alloc_no_issue = LogicArray(ctx, 'ldq_alloc_no_issue', 'w', self.configs.numLdqEntries)
+            fallback_load_candidate_oh = LogicArray(ctx, 'fallback_load_candidate_oh', 'w', self.configs.numLdqEntries)
+            for i in range(0, self.configs.numLdqEntries):
+                arch += Op(ctx, ldq_alloc_no_issue[i], ldq_alloc_p0[i], 'and', 'not', ldq_issue[i])
+            arch += CyclicPriorityMasking(ctx, fallback_load_candidate_oh, ldq_alloc_no_issue, ldq_head_oh_p0)
 
-        # The fallback store canddiate is the oldest allocated and un-issued store. This is simply
-        # the store at the store queue issue pointer (if allocated). We do not explicitly track the
-        # fallback store candidate, but rather just keep the relevant row/column from the order
-        # matrix.
+            # The fallback store canddiate is the oldest allocated and un-issued store. This is simply
+            # the store at the store queue issue pointer (if allocated). We do not explicitly track the
+            # fallback store candidate, but rather just keep the relevant row/column from the order
+            # matrix.
 
-        # If the oldest load is older than the oldest store, this contains a single bit set (at the oldest load entry).
-        # Otherwise (oldest store is oldest overall), this is all zeros.
-        fallback_load_is_oldest_oh = LogicArray(ctx, 'fallback_load_is_oldest_oh', 'w', self.configs.numLdqEntries)
-        for i in range(self.configs.numLdqEntries):
-            arch += Op(ctx, fallback_load_is_oldest_oh[i], fallback_load_candidate_oh[i], 'and', 'not', store_is_older_arr[i])
+            # If the oldest load is older than the oldest store, this contains a single bit set (at the oldest load entry).
+            # Otherwise (oldest store is oldest overall), this is all zeros.
+            fallback_load_is_oldest_oh = LogicArray(ctx, 'fallback_load_is_oldest_oh', 'w', self.configs.numLdqEntries)
+            for i in range(self.configs.numLdqEntries):
+                arch += Op(ctx, fallback_load_is_oldest_oh[i], fallback_load_candidate_oh[i], 'and', 'not', store_is_older_arr[i])
 
-        # Whether the fallback load is the oldest.
-        fallback_load_is_oldest = Logic(ctx, 'fallback_load_is_oldest', 'w')
-        arch += Reduce(ctx, fallback_load_is_oldest, fallback_load_is_oldest_oh, 'or')
+            # Whether the fallback load is the oldest.
+            fallback_load_is_oldest = Logic(ctx, 'fallback_load_is_oldest', 'w')
+            arch += Reduce(ctx, fallback_load_is_oldest, fallback_load_is_oldest_oh, 'or')
 
-        # FIXME: What if we are about to issue a store? We shouldn't issue a fallback load in the same cycle.
-        store_outstanding = Logic(ctx, 'store_outstanding', 'w')
-        arch += Op(ctx, store_outstanding, "'1'", 'when', '(', stq_issue, '/=', stq_resp, ')', 'else ', "'0'")
+            # FIXME: What if we are about to issue a store? We shouldn't issue a fallback load in the same cycle.
+            store_outstanding = Logic(ctx, 'store_outstanding', 'w')
+            arch += Op(ctx, store_outstanding, "'1'", 'when', '(', stq_issue, '/=', stq_resp, ')', 'else ', "'0'")
 
-        # FIXME: What if we are about to issue a load? We shouldn't issue a fallback store in the same cycle.
-        load_oustanding_arr = LogicArray(ctx, 'load_outstanding_arr', 'w', self.configs.numLdqEntries)
-        load_outstanding = Logic(ctx, 'load_outstanding', 'w')
-        for i in range(self.configs.numLdqEntries):
-            arch += Op(ctx, load_oustanding_arr[i], "'1'", 'when', '(', ldq_issue[i], 'and', 'not', ldq_data_valid[i], ')', 'else', "'0'")
-        arch += Reduce(ctx, load_outstanding, load_oustanding_arr, 'or')
+            # FIXME: What if we are about to issue a load? We shouldn't issue a fallback store in the same cycle.
+            load_oustanding_arr = LogicArray(ctx, 'load_outstanding_arr', 'w', self.configs.numLdqEntries)
+            load_outstanding = Logic(ctx, 'load_outstanding', 'w')
+            for i in range(self.configs.numLdqEntries):
+                arch += Op(ctx, load_oustanding_arr[i], "'1'", 'when', '(', ldq_issue[i], 'and', 'not', ldq_data_valid[i], ')', 'else', "'0'")
+            arch += Reduce(ctx, load_outstanding, load_oustanding_arr, 'or')
 
-        # We can issue the fallback load candidate if:
-        # - It is older than the oldest store (implicit in fallback_load_is_oldest_oh[]).
-        # - It has a valid address.
-        # - There are no outstanding stores.
-        for i in range(self.configs.numLdqEntries):
-            arch += Op(ctx, (fallback_load_idx_oh, i), fallback_load_is_oldest_oh[i], 'and', ldq_addr_valid_p0[i], 'and', 'not', store_outstanding)
-        arch += Reduce(ctx, fallback_load_en, fallback_load_idx_oh, 'or')
+            # We can issue the fallback load candidate if:
+            # - It is older than the oldest store (implicit in fallback_load_is_oldest_oh[]).
+            # - It has a valid address.
+            # - There are no outstanding stores.
+            for i in range(self.configs.numLdqEntries):
+                arch += Op(ctx, (fallback_load_idx_oh, i), fallback_load_is_oldest_oh[i], 'and', ldq_addr_valid_p0[i], 'and', 'not', store_outstanding)
+            arch += Reduce(ctx, fallback_load_en, fallback_load_idx_oh, 'or')
 
-        # We can issue the fallback store candidate (if it is valid) if:
-        # - It is older than the oldest store (NOT fallback_load_is_oldest).
-        # - There are no outstanding loads.
-        arch += Op(ctx, fallback_store_en_if_valid, 'not', fallback_load_is_oldest, 'and', 'not', load_outstanding)
+            # We can issue the fallback store candidate (if it is valid) if:
+            # - It is older than the oldest store (NOT fallback_load_is_oldest).
+            # - There are no outstanding loads.
+            arch += Op(ctx, fallback_store_en_if_valid, 'not', fallback_load_is_oldest, 'and', 'not', load_outstanding)
 
         # Store
         # When pipelining (pipe0) is enabled, this uses look-ahead to the next store entry to reduce the critical path.
@@ -859,12 +862,14 @@ def generate(self, lsq_submodules, path_rtl) -> None:
 
         store_conflict = Logic(ctx, 'store_conflict', 'w')
         store_req_valid_p0 = Logic(ctx, 'store_req_valid_p0', pipe0_type)
-        store_is_older_arr_p0 = LogicArray(ctx, 'store_is_older_arr_p0', pipe0_type, self.configs.numLdqEntries)
         st_ld_conflict_p0 = LogicVec(ctx, 'st_ld_conflict_p0', pipe0_type, self.configs.numLdqEntries)
         if self.configs.pipe0:
             store_req_valid_p0.regInit(init=0)
-            store_is_older_arr_p0.regInit()
             st_ld_conflict_p0.regInit()
+        if self.configs.fallbackIssue:
+            store_is_older_arr_p0 = LogicArray(ctx, 'store_is_older_arr_p0', pipe0_type, self.configs.numLdqEntries)
+            if self.configs.pipe0:
+                store_is_older_arr_p0.regInit()
 
         # next issue pointer (needed for look-ahead when pipelining is enabled)
         if self.configs.pipe0:
@@ -929,25 +934,29 @@ def generate(self, lsq_submodules, path_rtl) -> None:
                        'when', stq_issue_en, 'else', st_ld_conflict_curr)
             arch += Op(ctx, store_req_valid_p0, store_req_valid_next, 'when',
                        stq_issue_en, 'else', store_req_valid_curr)
-            for i in range(self.configs.numLdqEntries):
-                arch += Op(ctx, store_is_older_arr_p0[i], store_is_older_arr_next[i], 'when',
-                           stq_issue_en, 'else', store_is_older_arr_curr[i])
+            if self.configs.fallbackIssue:
+                for i in range(self.configs.numLdqEntries):
+                    arch += Op(ctx, store_is_older_arr_p0[i], store_is_older_arr_next[i], 'when',
+                               stq_issue_en, 'else', store_is_older_arr_curr[i])
         else:
             # without pipelining: only consider current store entry
             arch += Op(ctx, st_ld_conflict_p0, st_ld_conflict_curr)
             arch += Op(ctx, store_req_valid_p0, store_req_valid_curr)
-            for i in range(self.configs.numLdqEntries):
-                arch += Op(ctx, store_is_older_arr_p0[i], store_is_older_arr_curr[i])
+            if self.configs.fallbackIssue:
+                for i in range(self.configs.numLdqEntries):
+                    arch += Op(ctx, store_is_older_arr_p0[i], store_is_older_arr_curr[i])
 
         # The store conflicts with any load
         arch += Reduce(ctx, store_conflict, st_ld_conflict_p0, 'or')
-        # The store can be issued when it is valid AND (no conflict OR it is older than the fallback load).
-        arch += Op(ctx, store_en, store_req_valid_p0, 'and', '(', 'not', store_conflict, 'or', fallback_store_en_if_valid, ')')
         arch += Op(ctx, store_idx, stq_issue)
-        # needed for fallback logic
-        # FIXME: conditionally enable based on fallback issue flag
-        for i in range(self.configs.numLdqEntries):
-            arch += Op(ctx, store_is_older_arr[i], store_is_older_arr_p0[i])
+        # The store can be issued when it is valid AND (no conflict OR it is older than the fallback load).
+        if self.configs.fallbackIssue:
+            arch += Op(ctx, store_en, store_req_valid_p0, 'and', '(', 'not', store_conflict, 'or', fallback_store_en_if_valid, ')')
+            # ordering information needed by fallback issue logic
+            for i in range(self.configs.numLdqEntries):
+                arch += Op(ctx, store_is_older_arr[i], store_is_older_arr_p0[i])
+        else:
+            arch += Op(ctx, store_en, store_req_valid_p0, 'and', 'not', store_conflict)
 
         # Bypass
         bypass_idx_oh_p0 = LogicVecArray(

From 539041b80a168109b66ccbc1f4af304f09f2f075 Mon Sep 17 00:00:00 2001
From: Max Wipfli <mwipfli@ethz.ch>
Date: Tue, 3 Mar 2026 13:42:15 +0100
Subject: [PATCH 06/11] [LSQ] Fix typo in signal name

---
 .../backend/lsq-generator-python/vhdl_gen/generators/lsq.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/tools/backend/lsq-generator-python/vhdl_gen/generators/lsq.py b/tools/backend/lsq-generator-python/vhdl_gen/generators/lsq.py
index f19501d282..488d0b257b 100644
--- a/tools/backend/lsq-generator-python/vhdl_gen/generators/lsq.py
+++ b/tools/backend/lsq-generator-python/vhdl_gen/generators/lsq.py
@@ -830,11 +830,11 @@ def generate(self, lsq_submodules, path_rtl) -> None:
             arch += Op(ctx, store_outstanding, "'1'", 'when', '(', stq_issue, '/=', stq_resp, ')', 'else ', "'0'")
 
             # FIXME: What if we are about to issue a load? We shouldn't issue a fallback store in the same cycle.
-            load_oustanding_arr = LogicArray(ctx, 'load_outstanding_arr', 'w', self.configs.numLdqEntries)
+            load_outstanding_arr = LogicArray(ctx, 'load_outstanding_arr', 'w', self.configs.numLdqEntries)
             load_outstanding = Logic(ctx, 'load_outstanding', 'w')
             for i in range(self.configs.numLdqEntries):
-                arch += Op(ctx, load_oustanding_arr[i], "'1'", 'when', '(', ldq_issue[i], 'and', 'not', ldq_data_valid[i], ')', 'else', "'0'")
-            arch += Reduce(ctx, load_outstanding, load_oustanding_arr, 'or')
+                arch += Op(ctx, load_outstanding_arr[i], "'1'", 'when', '(', ldq_issue[i], 'and', 'not', ldq_data_valid[i], ')', 'else', "'0'")
+            arch += Reduce(ctx, load_outstanding, load_outstanding_arr, 'or')
 
             # We can issue the fallback load candidate if:
             # - It is older than the oldest store (implicit in fallback_load_is_oldest_oh[]).

From 10ca1f89aac0e17f24cf4e5874f15f3d7accf95b Mon Sep 17 00:00:00 2001
From: Max Wipfli <mwipfli@ethz.ch>
Date: Wed, 4 Mar 2026 09:27:21 +0100
Subject: [PATCH 07/11] [LSQ] Add comment to explain concurrent issue with
 fallback

---
 .../lsq-generator-python/vhdl_gen/generators/lsq.py    | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/tools/backend/lsq-generator-python/vhdl_gen/generators/lsq.py b/tools/backend/lsq-generator-python/vhdl_gen/generators/lsq.py
index 488d0b257b..285afa91f2 100644
--- a/tools/backend/lsq-generator-python/vhdl_gen/generators/lsq.py
+++ b/tools/backend/lsq-generator-python/vhdl_gen/generators/lsq.py
@@ -825,11 +825,17 @@ def generate(self, lsq_submodules, path_rtl) -> None:
             fallback_load_is_oldest = Logic(ctx, 'fallback_load_is_oldest', 'w')
             arch += Reduce(ctx, fallback_load_is_oldest, fallback_load_is_oldest_oh, 'or')
 
-            # FIXME: What if we are about to issue a store? We shouldn't issue a fallback load in the same cycle.
+            # NOTE: For both the outstanding loads and stores, we only need to consider loads/stores that were issued
+            # previously. If a load (store) is issued through the regular path in the same cycle as the fallback store
+            # (load), it cannot conflict with the fallback store (load). This is because:
+            # 1. The regular load (store) must be younger than the fallback store (load) by construction.
+            # 2. The regular load (store) has been dependency-checked against the fallback store (load) before being
+            #    issued.
+            # 3. Thus, the fallback store (load) and the regular load (store) must have different addresses.
+
             store_outstanding = Logic(ctx, 'store_outstanding', 'w')
             arch += Op(ctx, store_outstanding, "'1'", 'when', '(', stq_issue, '/=', stq_resp, ')', 'else ', "'0'")
 
-            # FIXME: What if we are about to issue a load? We shouldn't issue a fallback store in the same cycle.
             load_outstanding_arr = LogicArray(ctx, 'load_outstanding_arr', 'w', self.configs.numLdqEntries)
             load_outstanding = Logic(ctx, 'load_outstanding', 'w')
             for i in range(self.configs.numLdqEntries):

From c42f38b3d142744eb84ed0c21ebc647063eb914d Mon Sep 17 00:00:00 2001
From: Max Wipfli <mwipfli@ethz.ch>
Date: Fri, 6 Mar 2026 17:12:21 +0100
Subject: [PATCH 08/11] [LSQ] Separate load and store paths of fallback issue

---
 .../lsq-generator-python/vhdl_gen/configs.py  | 10 ++-
 .../vhdl_gen/generators/lsq.py                | 64 ++++++++++---------
 2 files changed, 42 insertions(+), 32 deletions(-)

diff --git a/tools/backend/lsq-generator-python/vhdl_gen/configs.py b/tools/backend/lsq-generator-python/vhdl_gen/configs.py
index fc873d4c29..4d520defa1 100644
--- a/tools/backend/lsq-generator-python/vhdl_gen/configs.py
+++ b/tools/backend/lsq-generator-python/vhdl_gen/configs.py
@@ -65,9 +65,11 @@ class Configs:
     stResp:        bool = False     # Whether store response channel in store access port is enabled
     gaMulti:       bool = False     # Whether multiple groups are allowed to request an allocation at the same cycle
     bypass:        bool = True      # Whether bypassing (store-to-load forwarding) is enabled
+
     # guarantees execution of the oldest pending memory operation (load or store) in the presence of false conflicts
     # (which can happen with approximate address comparison)
-    fallbackIssue: bool = False
+    fallbackIssueLoad: bool = False
+    fallbackIssueStore: bool = False
 
     def __init__(self, config: dict) -> None:
         self.name = config["name"]
@@ -88,7 +90,8 @@ def __init__(self, config: dict) -> None:
 
         # TODO: set based on requested LSQ model
         self.bypass = True
-        self.fallbackIssue = False
+        self.fallbackIssueLoad = False
+        self.fallbackIssueStore = False
 
         self.gaNumLoads = config["numLoads"]
         self.gaNumStores = config["numStores"]
@@ -127,8 +130,9 @@ def __init__(self, config: dict) -> None:
         assert (len(self.gaLdPortIdx) == self.numGroups)
         assert (len(self.gaStPortIdx) == self.numGroups)
 
-        if self.fallbackIssue:
+        if self.fallbackIssueLoad or self.fallbackIssueStore:
             assert not self.bypass, "Fallback issue is not compatible with bypassing."
+        if self.fallbackIssueLoad:
             # TODO: To properly support multiple load channels, we need to ensure that the fallback load is not
             # duplicated a load # issued by another load channel in the same cycle. Multiple load channels are not
             # currently used by Dynamatic, so this is left as future work.
diff --git a/tools/backend/lsq-generator-python/vhdl_gen/generators/lsq.py b/tools/backend/lsq-generator-python/vhdl_gen/generators/lsq.py
index 285afa91f2..a52a2d3409 100644
--- a/tools/backend/lsq-generator-python/vhdl_gen/generators/lsq.py
+++ b/tools/backend/lsq-generator-python/vhdl_gen/generators/lsq.py
@@ -587,16 +587,17 @@ def generate(self, lsq_submodules, path_rtl) -> None:
         store_req_valid = Logic(ctx, 'store_req_valid', 'w')
         # whether the current store request has conflicts with any previous loads
         store_conflict = Logic(ctx, 'store_conflict', 'w')
-        if self.configs.fallbackIssue:
+        if self.configs.fallbackIssueLoad or self.configs.fallbackIssueStore:
             # whether the to-be-issued store entry is older than each of the load entries
             store_is_older_arr = LogicArray(ctx, 'store_is_older_arr', 'w', self.configs.numLdqEntries)
         # store request enable (after fallback logic)
         store_en = Logic(ctx, 'store_en', 'w')
 
         # Fallback load/store signals
-        if self.configs.fallbackIssue:
+        if self.configs.fallbackIssueLoad:
             fallback_load_idx_oh = LogicVec(ctx, 'fallback_load_idx_oh', 'w', self.configs.numLdqEntries)
             fallback_load_en = Logic(ctx, 'fallback_load_en', 'w')
+        if self.configs.fallbackIssueStore:
             fallback_store_en_if_valid = Logic(ctx, 'fallback_store_en_if_valid', 'w')
 
         # Matrix Generation
@@ -791,7 +792,7 @@ def generate(self, lsq_submodules, path_rtl) -> None:
 
         for w in range(self.configs.numLdMem):
             last_load_port = (w == self.configs.numLdMem - 1)
-            if self.configs.fallbackIssue and last_load_port:
+            if self.configs.fallbackIssueLoad and last_load_port:
                 # last load port: use fallback load (if any) as the first priority, then service other loads (from _tmp)
                 arch += Op(ctx, load_idx_oh[w], fallback_load_idx_oh, 'when', fallback_load_en, 'else', load_idx_tmp_oh[w])
                 arch += Op(ctx, load_en[w], fallback_load_en, 'or', load_en_tmp[w])
@@ -800,7 +801,7 @@ def generate(self, lsq_submodules, path_rtl) -> None:
                 arch += Op(ctx, load_idx_oh[w], load_idx_tmp_oh[w])
                 arch += Op(ctx, load_en[w], load_en_tmp[w])
 
-        if self.configs.fallbackIssue:
+        if self.configs.fallbackIssueLoad or self.configs.fallbackIssueStore:
             # Fallback Load / Store
 
             # The fallback load candidate is the oldest allocated and un-issued load.
@@ -833,27 +834,30 @@ def generate(self, lsq_submodules, path_rtl) -> None:
             #    issued.
             # 3. Thus, the fallback store (load) and the regular load (store) must have different addresses.
 
-            store_outstanding = Logic(ctx, 'store_outstanding', 'w')
-            arch += Op(ctx, store_outstanding, "'1'", 'when', '(', stq_issue, '/=', stq_resp, ')', 'else ', "'0'")
+            if self.configs.fallbackIssueLoad:
+                store_outstanding = Logic(ctx, 'store_outstanding', 'w')
+                arch += Op(ctx, store_outstanding, "'1'", 'when', '(', stq_issue, '/=', stq_resp, ')', 'else ', "'0'")
 
-            load_outstanding_arr = LogicArray(ctx, 'load_outstanding_arr', 'w', self.configs.numLdqEntries)
-            load_outstanding = Logic(ctx, 'load_outstanding', 'w')
-            for i in range(self.configs.numLdqEntries):
-                arch += Op(ctx, load_outstanding_arr[i], "'1'", 'when', '(', ldq_issue[i], 'and', 'not', ldq_data_valid[i], ')', 'else', "'0'")
-            arch += Reduce(ctx, load_outstanding, load_outstanding_arr, 'or')
+                # We can issue the fallback load candidate if:
+                # - It is older than the oldest store (implicit in fallback_load_is_oldest_oh[]).
+                # - It has a valid address.
+                # - There are no outstanding stores.
+                for i in range(self.configs.numLdqEntries):
+                    arch += Op(ctx, (fallback_load_idx_oh, i), fallback_load_is_oldest_oh[i], 'and', ldq_addr_valid_p0[i], 'and', 'not', store_outstanding)
+                arch += Reduce(ctx, fallback_load_en, fallback_load_idx_oh, 'or')
 
-            # We can issue the fallback load candidate if:
-            # - It is older than the oldest store (implicit in fallback_load_is_oldest_oh[]).
-            # - It has a valid address.
-            # - There are no outstanding stores.
-            for i in range(self.configs.numLdqEntries):
-                arch += Op(ctx, (fallback_load_idx_oh, i), fallback_load_is_oldest_oh[i], 'and', ldq_addr_valid_p0[i], 'and', 'not', store_outstanding)
-            arch += Reduce(ctx, fallback_load_en, fallback_load_idx_oh, 'or')
+            if self.configs.fallbackIssueStore:
+                # FIXME: This can reuse code from the dependency-check refactor which is part of another PR.
+                load_outstanding_arr = LogicArray(ctx, 'load_outstanding_arr', 'w', self.configs.numLdqEntries)
+                load_outstanding = Logic(ctx, 'load_outstanding', 'w')
+                for i in range(self.configs.numLdqEntries):
+                    arch += Op(ctx, load_outstanding_arr[i], "'1'", 'when', '(', ldq_issue[i], 'and', 'not', ldq_data_valid[i], ')', 'else', "'0'")
+                arch += Reduce(ctx, load_outstanding, load_outstanding_arr, 'or')
 
-            # We can issue the fallback store candidate (if it is valid) if:
-            # - It is older than the oldest store (NOT fallback_load_is_oldest).
-            # - There are no outstanding loads.
-            arch += Op(ctx, fallback_store_en_if_valid, 'not', fallback_load_is_oldest, 'and', 'not', load_outstanding)
+                # We can issue the fallback store candidate (if it is valid) if:
+                # - It is older than the oldest load (NOT fallback_load_is_oldest).
+                # - There are no outstanding loads.
+                arch += Op(ctx, fallback_store_en_if_valid, 'not', fallback_load_is_oldest, 'and', 'not', load_outstanding)
 
         # Store
         # When pipelining (pipe0) is enabled, this uses look-ahead to the next store entry to reduce the critical path.
@@ -872,7 +876,7 @@ def generate(self, lsq_submodules, path_rtl) -> None:
         if self.configs.pipe0:
             store_req_valid_p0.regInit(init=0)
             st_ld_conflict_p0.regInit()
-        if self.configs.fallbackIssue:
+        if self.configs.fallbackIssueLoad or self.configs.fallbackIssueStore:
             store_is_older_arr_p0 = LogicArray(ctx, 'store_is_older_arr_p0', pipe0_type, self.configs.numLdqEntries)
             if self.configs.pipe0:
                 store_is_older_arr_p0.regInit()
@@ -940,7 +944,7 @@ def generate(self, lsq_submodules, path_rtl) -> None:
                        'when', stq_issue_en, 'else', st_ld_conflict_curr)
             arch += Op(ctx, store_req_valid_p0, store_req_valid_next, 'when',
                        stq_issue_en, 'else', store_req_valid_curr)
-            if self.configs.fallbackIssue:
+            if self.configs.fallbackIssueLoad or self.configs.fallbackIssueStore:
                 for i in range(self.configs.numLdqEntries):
                     arch += Op(ctx, store_is_older_arr_p0[i], store_is_older_arr_next[i], 'when',
                                stq_issue_en, 'else', store_is_older_arr_curr[i])
@@ -948,7 +952,7 @@ def generate(self, lsq_submodules, path_rtl) -> None:
             # without pipelining: only consider current store entry
             arch += Op(ctx, st_ld_conflict_p0, st_ld_conflict_curr)
             arch += Op(ctx, store_req_valid_p0, store_req_valid_curr)
-            if self.configs.fallbackIssue:
+            if self.configs.fallbackIssueLoad or self.configs.fallbackIssueStore:
                 for i in range(self.configs.numLdqEntries):
                     arch += Op(ctx, store_is_older_arr_p0[i], store_is_older_arr_curr[i])
 
@@ -956,14 +960,16 @@ def generate(self, lsq_submodules, path_rtl) -> None:
         arch += Reduce(ctx, store_conflict, st_ld_conflict_p0, 'or')
         arch += Op(ctx, store_idx, stq_issue)
         # The store can be issued when it is valid AND (no conflict OR it is older than the fallback load).
-        if self.configs.fallbackIssue:
+        if self.configs.fallbackIssueStore:
             arch += Op(ctx, store_en, store_req_valid_p0, 'and', '(', 'not', store_conflict, 'or', fallback_store_en_if_valid, ')')
-            # ordering information needed by fallback issue logic
-            for i in range(self.configs.numLdqEntries):
-                arch += Op(ctx, store_is_older_arr[i], store_is_older_arr_p0[i])
         else:
             arch += Op(ctx, store_en, store_req_valid_p0, 'and', 'not', store_conflict)
 
+        # ordering information needed by fallback issue logic
+        if self.configs.fallbackIssueLoad or self.configs.fallbackIssueStore:
+            for i in range(self.configs.numLdqEntries):
+                arch += Op(ctx, store_is_older_arr[i], store_is_older_arr_p0[i])
+
         # Bypass
         bypass_idx_oh_p0 = LogicVecArray(
             ctx, 'bypass_idx_oh_p0', pipe0_type, self.configs.numLdqEntries, self.configs.numStqEntries)

From 5f4da8783bbea25958fc824bbf41ace13072a468 Mon Sep 17 00:00:00 2001
From: Max Wipfli <mwipfli@ethz.ch>
Date: Fri, 6 Mar 2026 17:35:49 +0100
Subject: [PATCH 09/11] [integration-test] Fix formatting

---
 integration-test/test_memory_deps/test_memory_deps.c | 6 ++++--
 integration-test/test_memory_deps/test_memory_deps.h | 3 ++-
 2 files changed, 6 insertions(+), 3 deletions(-)

diff --git a/integration-test/test_memory_deps/test_memory_deps.c b/integration-test/test_memory_deps/test_memory_deps.c
index 2b2a64118e..1935e6bf9c 100644
--- a/integration-test/test_memory_deps/test_memory_deps.c
+++ b/integration-test/test_memory_deps/test_memory_deps.c
@@ -4,7 +4,8 @@
 #include "dynamatic/Integration.h"
 #include <stdlib.h>
 
-void test_memory_deps(in_int_t load_addrs[1000], in_int_t store_addrs[1000], inout_int_t data[1000], in_int_t n) {
+void test_memory_deps(in_int_t load_addrs[1000], in_int_t store_addrs[1000],
+                      inout_int_t data[1000], in_int_t n) {
   int sum = 0;
   for (int i = 0; i < n; ++i) {
     sum += data[load_addrs[i]];
@@ -20,7 +21,8 @@ int main(void) {
 
   in_int_t n = 1000;
   for (int i = 0; i < n; ++i) {
-    // addresses alternate randomly between 1 and 2, creating RAW and WAR hazards
+    // addresses alternate randomly between 1 and 2, creating RAW and WAR
+    // hazards
     load_addrs[i] = (rand() % 4) + 1;
     load_addrs[i] = (rand() % 4) + 1;
     store_addrs[i] = (i == 0) ? 1 : 2;
diff --git a/integration-test/test_memory_deps/test_memory_deps.h b/integration-test/test_memory_deps/test_memory_deps.h
index 74a5719499..6bc07a1cdd 100644
--- a/integration-test/test_memory_deps/test_memory_deps.h
+++ b/integration-test/test_memory_deps/test_memory_deps.h
@@ -4,6 +4,7 @@
 typedef int in_int_t;
 typedef int inout_int_t;
 
-void test_memory_deps(in_int_t load_addrs[1000], in_int_t store_addrs[1000], inout_int_t data[1000], in_int_t n);
+void test_memory_deps(in_int_t load_addrs[1000], in_int_t store_addrs[1000],
+                      inout_int_t data[1000], in_int_t n);
 
 #endif // TEST_MEMORY_DEPS_TEST_MEMORY_DEPS_H

From 4c3b8dc4fb65e6e0e253740d8c571253d9e0c669 Mon Sep 17 00:00:00 2001
From: Max Wipfli <mwipfli@ethz.ch>
Date: Tue, 10 Mar 2026 10:02:51 +0100
Subject: [PATCH 10/11] [LSQ] Address review comments

---
 .../lsq-generator-python/vhdl_gen/configs.py  |  4 +--
 .../vhdl_gen/generators/lsq.py                | 29 +++++++++++--------
 2 files changed, 19 insertions(+), 14 deletions(-)

diff --git a/tools/backend/lsq-generator-python/vhdl_gen/configs.py b/tools/backend/lsq-generator-python/vhdl_gen/configs.py
index 4d520defa1..932dfb81f7 100644
--- a/tools/backend/lsq-generator-python/vhdl_gen/configs.py
+++ b/tools/backend/lsq-generator-python/vhdl_gen/configs.py
@@ -134,6 +134,6 @@ def __init__(self, config: dict) -> None:
             assert not self.bypass, "Fallback issue is not compatible with bypassing."
         if self.fallbackIssueLoad:
             # TODO: To properly support multiple load channels, we need to ensure that the fallback load is not
-            # duplicated a load # issued by another load channel in the same cycle. Multiple load channels are not
+            # a duplicate of a load issued by another load channel in the same cycle. Multiple load channels are not
             # currently used by Dynamatic, so this is left as future work.
-            assert self.numLdMem == 1, "Fallback issue is only supported for single load port configuration."
+            assert self.numLdMem == 1, "Fallback issue is only supported for single load channel configuration."
diff --git a/tools/backend/lsq-generator-python/vhdl_gen/generators/lsq.py b/tools/backend/lsq-generator-python/vhdl_gen/generators/lsq.py
index a52a2d3409..80590e0e35 100644
--- a/tools/backend/lsq-generator-python/vhdl_gen/generators/lsq.py
+++ b/tools/backend/lsq-generator-python/vhdl_gen/generators/lsq.py
@@ -589,6 +589,7 @@ def generate(self, lsq_submodules, path_rtl) -> None:
         store_conflict = Logic(ctx, 'store_conflict', 'w')
         if self.configs.fallbackIssueLoad or self.configs.fallbackIssueStore:
             # whether the to-be-issued store entry is older than each of the load entries
+            # only needed for fallback logic, as regular store dependency checking uses "internal" signals instead
             store_is_older_arr = LogicArray(ctx, 'store_is_older_arr', 'w', self.configs.numLdqEntries)
         # store request enable (after fallback logic)
         store_en = Logic(ctx, 'store_en', 'w')
@@ -791,13 +792,18 @@ def generate(self, lsq_submodules, path_rtl) -> None:
                                load_idx_oh_LogicArray[i], 'and', can_load_list[w][i])
 
         for w in range(self.configs.numLdMem):
-            last_load_port = (w == self.configs.numLdMem - 1)
-            if self.configs.fallbackIssueLoad and last_load_port:
-                # last load port: use fallback load (if any) as the first priority, then service other loads (from _tmp)
+            # We use the last load memory channel for potential fallback loads for two reasons:
+            # - It is the last channel to be utilized for regular loads, thus it is least likely we need to preempt a
+            #   regular load for a fallback load.
+            # - If all load channels could be occupied by regular loads, we are preempting the youngest load. This
+            #   probably has the least performance impact.
+            last_load_channel = (w == self.configs.numLdMem - 1)
+            if self.configs.fallbackIssueLoad and last_load_channel:
+                # last channel: use fallback load (if any) as the first priority, then service other loads (from _tmp)
                 arch += Op(ctx, load_idx_oh[w], fallback_load_idx_oh, 'when', fallback_load_en, 'else', load_idx_tmp_oh[w])
                 arch += Op(ctx, load_en[w], fallback_load_en, 'or', load_en_tmp[w])
             else:
-                # non-last load port: use _tmp signals directly
+                # non-last load channel: use _tmp signals directly
                 arch += Op(ctx, load_idx_oh[w], load_idx_tmp_oh[w])
                 arch += Op(ctx, load_en[w], load_en_tmp[w])
 
@@ -811,10 +817,9 @@ def generate(self, lsq_submodules, path_rtl) -> None:
                 arch += Op(ctx, ldq_alloc_no_issue[i], ldq_alloc_p0[i], 'and', 'not', ldq_issue[i])
             arch += CyclicPriorityMasking(ctx, fallback_load_candidate_oh, ldq_alloc_no_issue, ldq_head_oh_p0)
 
-            # The fallback store canddiate is the oldest allocated and un-issued store. This is simply
-            # the store at the store queue issue pointer (if allocated). We do not explicitly track the
-            # fallback store candidate, but rather just keep the relevant row/column from the order
-            # matrix.
+            # The fallback store candidate is the oldest allocated and un-issued store. This is simply the store at the
+            # store queue issue pointer (if allocated). We do not explicitly track the fallback store candidate, but
+            # rather just keep the relevant row/column from the order matrix.
 
             # If the oldest load is older than the oldest store, this contains a single bit set (at the oldest load entry).
             # Otherwise (oldest store is oldest overall), this is all zeros.
@@ -822,10 +827,6 @@ def generate(self, lsq_submodules, path_rtl) -> None:
             for i in range(self.configs.numLdqEntries):
                 arch += Op(ctx, fallback_load_is_oldest_oh[i], fallback_load_candidate_oh[i], 'and', 'not', store_is_older_arr[i])
 
-            # Whether the fallback load is the oldest.
-            fallback_load_is_oldest = Logic(ctx, 'fallback_load_is_oldest', 'w')
-            arch += Reduce(ctx, fallback_load_is_oldest, fallback_load_is_oldest_oh, 'or')
-
             # NOTE: For both the outstanding loads and stores, we only need to consider loads/store which were issued
             # previously. If a load (store) is issued through the regular path in the same cycle as the fallback store
             # (load), it cannot conflict with the fallback store (load). This is because:
@@ -847,6 +848,10 @@ def generate(self, lsq_submodules, path_rtl) -> None:
                 arch += Reduce(ctx, fallback_load_en, fallback_load_idx_oh, 'or')
 
             if self.configs.fallbackIssueStore:
+                # whether the fallback load is the oldest
+                fallback_load_is_oldest = Logic(ctx, 'fallback_load_is_oldest', 'w')
+                arch += Reduce(ctx, fallback_load_is_oldest, fallback_load_is_oldest_oh, 'or')
+
                 # FIXME: This can reuse code from the dependency-check refactor which is part of another PR.
                 load_outstanding_arr = LogicArray(ctx, 'load_outstanding_arr', 'w', self.configs.numLdqEntries)
                 load_outstanding = Logic(ctx, 'load_outstanding', 'w')

From b7e56173b815d91ff33de3d10d8bde52329d85fc Mon Sep 17 00:00:00 2001
From: Max Wipfli <mwipfli@ethz.ch>
Date: Tue, 10 Mar 2026 16:39:46 +0100
Subject: [PATCH 11/11] [LSQ] Fix merge conflict

---
 tools/backend/lsq-generator-python/vhdl_gen/generators/lsq.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/tools/backend/lsq-generator-python/vhdl_gen/generators/lsq.py b/tools/backend/lsq-generator-python/vhdl_gen/generators/lsq.py
index 80590e0e35..9b8b855955 100644
--- a/tools/backend/lsq-generator-python/vhdl_gen/generators/lsq.py
+++ b/tools/backend/lsq-generator-python/vhdl_gen/generators/lsq.py
@@ -875,7 +875,6 @@ def generate(self, lsq_submodules, path_rtl) -> None:
         for i in range(self.configs.numStqEntries):
             arch += Op(ctx, store_req_valid_arr[i], stq_alloc_pcomp[i], 'and', stq_addr_valid_pcomp[i], 'and', stq_data_valid_pcomp[i])
 
-        store_conflict = Logic(ctx, 'store_conflict', 'w')
         store_req_valid_p0 = Logic(ctx, 'store_req_valid_p0', pipe0_type)
         st_ld_conflict_p0 = LogicVec(ctx, 'st_ld_conflict_p0', pipe0_type, self.configs.numLdqEntries)
         if self.configs.pipe0: