diff --git a/conv_1d_bc.cpp b/conv_1d_bc.cpp
deleted file mode 100644
index 8efbcd282..000000000
--- a/conv_1d_bc.cpp
+++ /dev/null
@@ -1,264 +0,0 @@
-#ifndef __VIVADO_SYNTH__
-#include <fstream>
-using namespace std;
-
-  // Debug utility
-  ofstream* global_debug_handle;
-
-#endif //__VIVADO_SYNTH__
-#include "accumulate_3.h"
-
-#include "hw_classes.h"
-
-struct M_get_input_0_to_M_compute_output_3_cache {
-	// RAM Box: {[0, 9]}
-	// Capacity: 3
-	// # of read delays: 3
-	fifo<hw_uint<32> , 3> f;
-	inline hw_uint<32>  peek(const int offset) {
-#ifdef __VIVADO_SYNTH__
-#endif //__VIVADO_SYNTH__
-    return f.peek(2 - offset);
-  }
-
-
-
-	inline void push(const hw_uint<32>  value) {
-#ifdef __VIVADO_SYNTH__
-#endif //__VIVADO_SYNTH__
-    return f.push(value);
-  }
-
-};
-
-struct M_get_input_0_merged_banks_2_cache {
-	// RAM Box: {[0, 9]}
-	// Capacity: 2
-	// # of read delays: 2
-	hw_uint<32>  f0;
-	hw_uint<32>  f2;
-
-
-	inline hw_uint<32>  peek_0() {
-		return f0;
-	}
-
-	inline hw_uint<32>  peek_1() {
-		return f2;
-	}
-
-
-
-	inline void push(const hw_uint<32>  value) {
-#ifdef __VIVADO_SYNTH__
-#endif //__VIVADO_SYNTH__
-    // cap: 1 reading from capacity: 1
-    f2 = f0;
-    // cap: 1
-    f0 = value;
-	}
-
-};
-
-struct M_cache {
-  M_get_input_0_to_M_compute_output_3_cache M_get_input_0_to_M_compute_output_3;
-  M_get_input_0_merged_banks_2_cache M_get_input_0_merged_banks_2;
-};
-
-
-
-inline void M_get_input_0_write(hw_uint<32> & M_get_input_0, M_cache& M, int root, int p) {
-  M.M_get_input_0_to_M_compute_output_3.push(M_get_input_0);
-  M.M_get_input_0_merged_banks_2.push(M_get_input_0);
-}
-
-inline hw_uint<32>  M_compute_output_3_select(M_cache& M, int root, int c) {
-#ifdef __VIVADO_SYNTH__
-#endif //__VIVADO_SYNTH__
-  // M_compute_output_3 read pattern: { compute_output[root = 0, c] -> M[c] : 0 <= c <= 8; compute_output[root = 0, c = 9] -> M[9] }
-  // Read schedule : { compute_output[root = 0, c] -> [2 + c, 1] : 0 <= c <= 9 }
-  // Write schedule: { get_input[root = 0, p] -> [p, 0] : 0 <= p <= 9 }
-  // DD fold: { compute_output[root, c] -> 2 : root = 0 and 0 <= c <= 7; compute_output[root, c] -> 1 : root = 0 and c = 8 }
-  auto value_M_get_input_0 = M.M_get_input_0_to_M_compute_output_3.peek(/* one reader or all rams */ (-8 + c == 0) ? (1) : (7 - c >= 0) ? (2) : 0);
-  return value_M_get_input_0;
-#ifndef __VIVADO_SYNTH__
-	cout << "Error: Unsupported offsets: " << " root = " << root  << " c = " << c  << endl;
-	assert(false);
-	return 0;
-#endif //__VIVADO_SYNTH__
-}
-
-inline hw_uint<32>  M_compute_output_4_select(M_cache& M, int root, int c) {
-#ifdef __VIVADO_SYNTH__
-#endif //__VIVADO_SYNTH__
-  // M_compute_output_4 read pattern: { compute_output[root = 0, c] -> M[9] : 8 <= c <= 9; compute_output[root = 0, c] -> M[1 + c] : 0 <= c <= 7 }
-  // Read schedule : { compute_output[root = 0, c] -> [2 + c, 1] : 0 <= c <= 9 }
-  // Write schedule: { get_input[root = 0, p] -> [p, 0] : 0 <= p <= 9 }
-  // DD fold: { compute_output[root, c] -> 1 : root = 0 and 0 <= c <= 7 }
-  auto value_M_get_input_0 = M.M_get_input_0_merged_banks_2.peek(/* Needs general delay string */ (7 - c >= 0) ? (1) : 0);
-  return value_M_get_input_0;
-#ifndef __VIVADO_SYNTH__
-	cout << "Error: Unsupported offsets: " << " root = " << root  << " c = " << c  << endl;
-	assert(false);
-	return 0;
-#endif //__VIVADO_SYNTH__
-}
-
-inline hw_uint<32>  M_compute_output_5_select(M_cache& M, int root, int c) {
-#ifdef __VIVADO_SYNTH__
-#endif //__VIVADO_SYNTH__
-  // M_compute_output_5 read pattern: { compute_output[root = 0, c] -> M[9] : 7 <= c <= 9; compute_output[root = 0, c] -> M[2 + c] : 0 <= c <= 6 }
-  // Read schedule : { compute_output[root = 0, c] -> [2 + c, 1] : 0 <= c <= 9 }
-  // Write schedule: { get_input[root = 0, p] -> [p, 0] : 0 <= p <= 9 }
-  // DD fold: {  }
-  auto value_M_get_input_0 = M.M_get_input_0_merged_banks_2.peek_0();
-  return value_M_get_input_0;
-#ifndef __VIVADO_SYNTH__
-	cout << "Error: Unsupported offsets: " << " root = " << root  << " c = " << c  << endl;
-	assert(false);
-	return 0;
-#endif //__VIVADO_SYNTH__
-}
-
-// # of bundles = 3
-// M_get_input_0
-//	M_get_input_0
-inline void M_M_get_input_0_bundle_write(hw_uint<32>& M_get_input_0, M_cache& M, int root, int p) {
-	hw_uint<32>  M_get_input_0_res = M_get_input_0.extract<0, 31>();
-	M_get_input_0_write(M_get_input_0_res, M, root, p);
-}
-
-// compute_output_read
-//	M_compute_output_3
-//	M_compute_output_4
-//	M_compute_output_5
-inline hw_uint<96> M_compute_output_read_bundle_read(M_cache& M, int root, int c) {
-  // # of ports in bundle: 3
-    // M_compute_output_3
-    // M_compute_output_4
-    // M_compute_output_5
-
-	hw_uint<96> result;
-	hw_uint<32>  M_compute_output_3_res = M_compute_output_3_select(M, root, c);
-	set_at<0, 96>(result, M_compute_output_3_res);
-	hw_uint<32>  M_compute_output_4_res = M_compute_output_4_select(M, root, c);
-	set_at<32, 96>(result, M_compute_output_4_res);
-	hw_uint<32>  M_compute_output_5_res = M_compute_output_5_select(M, root, c);
-	set_at<64, 96>(result, M_compute_output_5_res);
-	return result;
-}
-
-// get_input_write
-//	M_get_input_0
-inline void M_get_input_write_bundle_write(hw_uint<32>& get_input_write, M_cache& M, int root, int p) {
-	hw_uint<32>  M_get_input_0_res = get_input_write.extract<0, 31>();
-	M_get_input_0_write(M_get_input_0_res, M, root, p);
-}
-
-
-
-// Operation logic
-inline void get_input(HWStream<hw_uint<32> >& /* buffer_args num ports = 1 */in, M_cache& M, int root, int p) {
-	// Consume: in
-	auto in_p_value = in.read();
-	// Produce: M
-	M_get_input_write_bundle_write(in_p_value, M, root, p);
-
-#ifndef __VIVADO_SYNTH__
-#endif //__VIVADO_SYNTH__
-
-}
-
-inline void compute_output(HWStream<hw_uint<32> >& /* buffer_args num ports = 1 */out, int root, int c) {
-	auto compute_result = accumulate_3();
-	// Produce: out
-	out.write(compute_result);
-
-#ifndef __VIVADO_SYNTH__
-#endif //__VIVADO_SYNTH__
-
-}
-
-// Driver function
-void conv_1d_bc(HWStream<hw_uint<32> >& /* no bundle get_args num ports = 1 */in, HWStream<hw_uint<32> >& /* no bundle get_args num ports = 1 */out, int num_epochs) {
-
-#ifndef __VIVADO_SYNTH__
-  ofstream debug_file("conv_1d_bc_debug.csv");
-  global_debug_handle = &debug_file;
-#endif //__VIVADO_SYNTH__
-  M_cache M;
-#ifdef __VIVADO_SYNTH__
-#endif //__VIVADO_SYNTH__
-#ifdef __VIVADO_SYNTH__
-#pragma HLS inline recursive
-#endif // __VIVADO_SYNTH__
-
-  for (int epoch = 0; epoch < num_epochs; epoch++) {
-	for (int c0 = 0; c0 <= 11; c0 += 1) {
-	  if (c0 <= 9)
-	get_input(in, M, 0, c0);
-	  if (c0 >= 2)
-	compute_output(M, out, 0, c0 - 2);
-	}
-	
-  }
-
-#ifndef __VIVADO_SYNTH__
-  debug_file.close();
-#endif //__VIVADO_SYNTH__
-}
-
-void conv_1d_bc(HWStream<hw_uint<32> >& /* no bundle get_args num ports = 1 */in, HWStream<hw_uint<32> >& /* no bundle get_args num ports = 1 */out) {
-
-  conv_1d_bc(in, out, 1);
-}
-#ifdef __VIVADO_SYNTH__
-const int get_input_read_num_transfers = 0;
-const int compute_output_write_num_transfers = 0;
-
-
-extern "C" {
-
-static void read_get_input_read(hw_uint<32>* input, HWStream<hw_uint<32> >& v, const int size) {
-  hw_uint<32> burst_reg;
-  int num_transfers = get_input_read_num_transfers*size;
-  for (int i = 0; i < num_transfers; i++) {
-    #pragma HLS pipeline II=1
-    burst_reg = input[i];
-    v.write(burst_reg);
-  }
-}
-
-static void write_compute_output_write(hw_uint<32>* output, HWStream<hw_uint<32> >& v, const int size) {
-  hw_uint<32> burst_reg;
-  int num_transfers = compute_output_write_num_transfers*size;
-  for (int i = 0; i < num_transfers; i++) {
-    #pragma HLS pipeline II=1
-    burst_reg = v.read();
-    output[i] = burst_reg;
-  }
-}
-
-void conv_1d_bc_accel(hw_uint<32>* get_input_read, hw_uint<32>* compute_output_write, const int size) { 
-#pragma HLS dataflow
-#pragma HLS INTERFACE m_axi port = get_input_read offset = slave depth = 65536 bundle = gmem0
-#pragma HLS INTERFACE m_axi port = compute_output_write offset = slave depth = 65536 bundle = gmem1
-
-#pragma HLS INTERFACE s_axilite port = get_input_read bundle = control
-#pragma HLS INTERFACE s_axilite port = compute_output_write bundle = control
-#pragma HLS INTERFACE s_axilite port = size bundle = control
-#pragma HLS INTERFACE s_axilite port = return bundle = control
-
-  static HWStream<hw_uint<32> > get_input_read_channel;
-  static HWStream<hw_uint<32> > compute_output_write_channel;
-
-  read_get_input_read(get_input_read, get_input_read_channel, size);
-
-  conv_1d_bc(get_input_read_channel, compute_output_write_channel, size);
-
-  write_compute_output_write(compute_output_write, compute_output_write_channel, size);
-}
-
-}
-#endif //__VIVADO_SYNTH__
-
diff --git a/conv_2d_bc.cpp b/conv_2d_bc.cpp
new file mode 100644
index 000000000..1ccc864c8
--- /dev/null
+++ b/conv_2d_bc.cpp
@@ -0,0 +1,790 @@
+#ifndef __VIVADO_SYNTH__
+#include <fstream>
+using namespace std;
+
+  // Debug utility: output-stream handle declared only when __VIVADO_SYNTH__ is not defined (i.e. not for synthesis builds).
+  ofstream* global_debug_handle;
+
+#endif //__VIVADO_SYNTH__
+#include "conv_3x3.h"
+
+#include "hw_classes.h"
+
+struct I_write_0_merged_banks_19_cache {
+	// RAM Box: {[0, 63], [0, 63]}
+	// Capacity: 66
+	// # of read delays: 4
+	// One delay line shared by several read ports of I; push() shifts f0 -> f2 -> f3 -> f4 -> f6.
+	hw_uint<32>  f0; // word from the latest push()
+	hw_uint<32>  f2; // word f0 held before the latest push()
+	fifo<hw_uint<32> , 62> f3; // 62-entry fifo fed from f2
+	hw_uint<32>  f4; // f3.back() as sampled by the latest push()
+	hw_uint<32>  f6; // word f4 held before the latest push()
+
+	// Fixed taps, one per stage that has a dedicated register (or the fifo end).
+	inline hw_uint<32>  peek_0() {
+		return f0;
+	}
+
+	inline hw_uint<32>  peek_1() {
+		return f2;
+	}
+
+	inline hw_uint<32>  peek_63() {
+		return f3.back();
+	}
+
+	inline hw_uint<32>  peek_64() {
+		return f4;
+	}
+
+	inline hw_uint<32>  peek_65() {
+		return f6;
+	}
+
+	// Run-time tap: the word written `offset` push() calls ago, 0 <= offset <= 65.
+	// I_read_0_7/8/10/11_select call peek(int) on this bank, which the fixed taps
+	// alone do not provide. Offsets 0, 1, 64 and 65 map to the registers; offsets
+	// 2..63 live in f3 and use the "capacity - 1 - offset" slot rule of the
+	// single-reader caches, shifted by the two registers ahead of f3:
+	// 61 - (offset - 2) == 63 - offset, so offset 63 reads slot 0.
+	// NOTE(review): assumes fifo slot 0 is the entry back() returns - confirm in hw_classes.h.
+	inline hw_uint<32>  peek(const int offset) {
+		if (offset == 0) { return f0; }
+		if (offset == 1) { return f2; }
+		if (offset == 64) { return f4; }
+		if (offset == 65) { return f6; }
+		return f3.peek(63 - offset);
+	}
+
+	// Advances the chain by one word. Stages are updated from the far end first
+	// so each one is copied onward before it is overwritten.
+	inline void push(const hw_uint<32>  value) {
+		f6 = f4;
+		f4 = f3.back();
+		f3.push(f2);
+		f2 = f0;
+		f0 = value;
+	}
+};
+
+struct I_write_0_to_I_read_0_10_1_cache {
+	// Delay buffer wrapping one 62-entry fifo of hw_uint<32> words.
+	// RAM Box: {[0, 63], [0, 63]}
+	// Capacity: 62
+	// # of read delays: 62
+	fifo<hw_uint<32> , 62> f;
+
+	// Reads fifo slot 61 - offset, so offset 0 selects slot 61 and offset 61 selects slot 0.
+	// NOTE(review): offset presumably counts push() calls back from the newest
+	// word - confirm against fifo::peek in hw_classes.h.
+	inline hw_uint<32>  peek(const int offset) {
+		const int slot = 61 - offset;
+		return f.peek(slot);
+	}
+
+	// Hands `value` to the fifo; nothing is returned.
+	inline void push(const hw_uint<32>  value) {
+		f.push(value);
+	}
+};
+
+struct I_write_0_to_I_read_0_11_5_cache {
+	// Delay buffer wrapping one 62-entry fifo of hw_uint<32> words.
+	// RAM Box: {[0, 63], [0, 63]}
+	// Capacity: 62
+	// # of read delays: 62
+	fifo<hw_uint<32> , 62> f;
+
+	// Reads fifo slot 61 - offset, so offset 0 selects slot 61 and offset 61 selects slot 0.
+	// NOTE(review): offset presumably counts push() calls back from the newest
+	// word - confirm against fifo::peek in hw_classes.h.
+	inline hw_uint<32>  peek(const int offset) {
+		const int slot = 61 - offset;
+		return f.peek(slot);
+	}
+
+	// Hands `value` to the fifo; nothing is returned.
+	inline void push(const hw_uint<32>  value) {
+		f.push(value);
+	}
+};
+
+struct I_write_0_to_I_read_0_3_8_cache {
+	// Delay buffer wrapping one 131-entry fifo of hw_uint<32> words.
+	// RAM Box: {[0, 63], [0, 63]}
+	// Capacity: 131
+	// # of read delays: 67
+	fifo<hw_uint<32> , 131> f;
+
+	// Reads fifo slot 130 - offset, so offset 0 selects slot 130 and offset 130 selects slot 0.
+	// NOTE(review): offset presumably counts push() calls back from the newest
+	// word - confirm against fifo::peek in hw_classes.h.
+	inline hw_uint<32>  peek(const int offset) {
+		const int slot = 130 - offset;
+		return f.peek(slot);
+	}
+
+	// Hands `value` to the fifo; nothing is returned.
+	inline void push(const hw_uint<32>  value) {
+		f.push(value);
+	}
+};
+
+struct I_write_0_to_I_read_0_3_9_cache {
+	// Delay buffer wrapping one 64-entry fifo of hw_uint<32> words.
+	// RAM Box: {[0, 63], [0, 63]}
+	// Capacity: 64
+	// # of read delays: 64
+	fifo<hw_uint<32> , 64> f;
+
+	// Reads fifo slot 63 - offset, so offset 0 selects slot 63 and offset 63 selects slot 0.
+	// NOTE(review): offset presumably counts push() calls back from the newest
+	// word - confirm against fifo::peek in hw_classes.h.
+	inline hw_uint<32>  peek(const int offset) {
+		const int slot = 63 - offset;
+		return f.peek(slot);
+	}
+
+	// Hands `value` to the fifo; nothing is returned.
+	inline void push(const hw_uint<32>  value) {
+		f.push(value);
+	}
+};
+
+struct I_write_0_to_I_read_0_3_10_cache {
+	// Delay buffer wrapping one 129-entry fifo of hw_uint<32> words.
+	// RAM Box: {[0, 63], [0, 63]}
+	// Capacity: 129
+	// # of read delays: 66
+	fifo<hw_uint<32> , 129> f;
+
+	// Reads fifo slot 128 - offset, so offset 0 selects slot 128 and offset 128 selects slot 0.
+	// NOTE(review): offset presumably counts push() calls back from the newest
+	// word - confirm against fifo::peek in hw_classes.h.
+	inline hw_uint<32>  peek(const int offset) {
+		const int slot = 128 - offset;
+		return f.peek(slot);
+	}
+
+	// Hands `value` to the fifo; nothing is returned.
+	inline void push(const hw_uint<32>  value) {
+		f.push(value);
+	}
+};
+
+struct I_write_0_to_I_read_0_4_12_cache {
+	// Delay buffer wrapping one 64-entry fifo of hw_uint<32> words.
+	// RAM Box: {[0, 63], [0, 63]}
+	// Capacity: 64
+	// # of read delays: 64
+	fifo<hw_uint<32> , 64> f;
+
+	// Reads fifo slot 63 - offset, so offset 0 selects slot 63 and offset 63 selects slot 0.
+	// NOTE(review): offset presumably counts push() calls back from the newest
+	// word - confirm against fifo::peek in hw_classes.h.
+	inline hw_uint<32>  peek(const int offset) {
+		const int slot = 63 - offset;
+		return f.peek(slot);
+	}
+
+	// Hands `value` to the fifo; nothing is returned.
+	inline void push(const hw_uint<32>  value) {
+		f.push(value);
+	}
+};
+
+struct I_write_0_to_I_read_0_4_13_cache {
+	// Delay buffer wrapping one 67-entry fifo of hw_uint<32> words.
+	// RAM Box: {[0, 63], [0, 63]}
+	// Capacity: 67
+	// # of read delays: 3
+	fifo<hw_uint<32> , 67> f;
+
+	// Reads fifo slot 66 - offset, so offset 0 selects slot 66 and offset 66 selects slot 0.
+	// NOTE(review): offset presumably counts push() calls back from the newest
+	// word - confirm against fifo::peek in hw_classes.h.
+	inline hw_uint<32>  peek(const int offset) {
+		const int slot = 66 - offset;
+		return f.peek(slot);
+	}
+
+	// Hands `value` to the fifo; nothing is returned.
+	inline void push(const hw_uint<32>  value) {
+		f.push(value);
+	}
+};
+
+struct I_write_0_to_I_read_0_5_16_cache {
+	// Delay buffer wrapping one 64-entry fifo of hw_uint<32> words.
+	// RAM Box: {[0, 63], [0, 63]}
+	// Capacity: 64
+	// # of read delays: 64
+	fifo<hw_uint<32> , 64> f;
+
+	// Reads fifo slot 63 - offset, so offset 0 selects slot 63 and offset 63 selects slot 0.
+	// NOTE(review): offset presumably counts push() calls back from the newest
+	// word - confirm against fifo::peek in hw_classes.h.
+	inline hw_uint<32>  peek(const int offset) {
+		const int slot = 63 - offset;
+		return f.peek(slot);
+	}
+
+	// Hands `value` to the fifo; nothing is returned.
+	inline void push(const hw_uint<32>  value) {
+		f.push(value);
+	}
+};
+
+struct I_write_0_to_I_read_0_5_17_cache {
+	// Delay buffer wrapping one 3-entry fifo of hw_uint<32> words.
+	// RAM Box: {[0, 63], [0, 63]}
+	// Capacity: 3
+	// # of read delays: 3
+	fifo<hw_uint<32> , 3> f;
+
+	// Reads fifo slot 2 - offset, so offset 0 selects slot 2 and offset 2 selects slot 0.
+	// NOTE(review): offset presumably counts push() calls back from the newest
+	// word - confirm against fifo::peek in hw_classes.h.
+	inline hw_uint<32>  peek(const int offset) {
+		const int slot = 2 - offset;
+		return f.peek(slot);
+	}
+
+	// Hands `value` to the fifo; nothing is returned.
+	inline void push(const hw_uint<32>  value) {
+		f.push(value);
+	}
+};
+
+struct I_write_0_to_I_read_0_6_20_cache {
+	// Delay buffer wrapping one 129-entry fifo of hw_uint<32> words.
+	// RAM Box: {[0, 63], [0, 63]}
+	// Capacity: 129
+	// # of read delays: 66
+	fifo<hw_uint<32> , 129> f;
+
+	// Reads fifo slot 128 - offset, so offset 0 selects slot 128 and offset 128 selects slot 0.
+	// NOTE(review): offset presumably counts push() calls back from the newest
+	// word - confirm against fifo::peek in hw_classes.h.
+	inline hw_uint<32>  peek(const int offset) {
+		const int slot = 128 - offset;
+		return f.peek(slot);
+	}
+
+	// Hands `value` to the fifo; nothing is returned.
+	inline void push(const hw_uint<32>  value) {
+		f.push(value);
+	}
+};
+
+struct I_write_0_to_I_read_0_6_21_cache {
+	// Delay buffer wrapping one 130-entry fifo of hw_uint<32> words.
+	// RAM Box: {[0, 63], [0, 63]}
+	// Capacity: 130
+	// # of read delays: 66
+	fifo<hw_uint<32> , 130> f;
+
+	// Reads fifo slot 129 - offset, so offset 0 selects slot 129 and offset 129 selects slot 0.
+	// NOTE(review): offset presumably counts push() calls back from the newest
+	// word - confirm against fifo::peek in hw_classes.h.
+	inline hw_uint<32>  peek(const int offset) {
+		const int slot = 129 - offset;
+		return f.peek(slot);
+	}
+
+	// Hands `value` to the fifo; nothing is returned.
+	inline void push(const hw_uint<32>  value) {
+		f.push(value);
+	}
+};
+
+struct I_write_0_to_I_read_0_6_23_cache {
+	// Delay buffer wrapping one 63-entry fifo of hw_uint<32> words.
+	// RAM Box: {[0, 63], [0, 63]}
+	// Capacity: 63
+	// # of read delays: 63
+	fifo<hw_uint<32> , 63> f;
+
+	// Reads fifo slot 62 - offset, so offset 0 selects slot 62 and offset 62 selects slot 0.
+	// NOTE(review): offset presumably counts push() calls back from the newest
+	// word - confirm against fifo::peek in hw_classes.h.
+	inline hw_uint<32>  peek(const int offset) {
+		const int slot = 62 - offset;
+		return f.peek(slot);
+	}
+
+	// Hands `value` to the fifo; nothing is returned.
+	inline void push(const hw_uint<32>  value) {
+		f.push(value);
+	}
+};
+
+struct I_write_0_to_I_read_0_7_25_cache {
+	// Delay buffer wrapping one 63-entry fifo of hw_uint<32> words.
+	// RAM Box: {[0, 63], [0, 63]}
+	// Capacity: 63
+	// # of read delays: 63
+	fifo<hw_uint<32> , 63> f;
+
+	// Reads fifo slot 62 - offset, so offset 0 selects slot 62 and offset 62 selects slot 0.
+	// NOTE(review): offset presumably counts push() calls back from the newest
+	// word - confirm against fifo::peek in hw_classes.h.
+	inline hw_uint<32>  peek(const int offset) {
+		const int slot = 62 - offset;
+		return f.peek(slot);
+	}
+
+	// Hands `value` to the fifo; nothing is returned.
+	inline void push(const hw_uint<32>  value) {
+		f.push(value);
+	}
+};
+
+struct I_write_0_to_I_read_0_8_29_cache {
+	// Delay buffer wrapping one 63-entry fifo of hw_uint<32> words.
+	// RAM Box: {[0, 63], [0, 63]}
+	// Capacity: 63
+	// # of read delays: 63
+	fifo<hw_uint<32> , 63> f;
+
+	// Reads fifo slot 62 - offset, so offset 0 selects slot 62 and offset 62 selects slot 0.
+	// NOTE(review): offset presumably counts push() calls back from the newest
+	// word - confirm against fifo::peek in hw_classes.h.
+	inline hw_uint<32>  peek(const int offset) {
+		const int slot = 62 - offset;
+		return f.peek(slot);
+	}
+
+	// Hands `value` to the fifo; nothing is returned.
+	inline void push(const hw_uint<32>  value) {
+		f.push(value);
+	}
+};
+
+struct I_write_0_to_I_read_0_9_32_cache {
+	// Delay buffer wrapping one 129-entry fifo of hw_uint<32> words.
+	// RAM Box: {[0, 63], [0, 63]}
+	// Capacity: 129
+	// # of read delays: 66
+	fifo<hw_uint<32> , 129> f;
+
+	// Reads fifo slot 128 - offset, so offset 0 selects slot 128 and offset 128 selects slot 0.
+	// NOTE(review): offset presumably counts push() calls back from the newest
+	// word - confirm against fifo::peek in hw_classes.h.
+	inline hw_uint<32>  peek(const int offset) {
+		const int slot = 128 - offset;
+		return f.peek(slot);
+	}
+
+	// Hands `value` to the fifo; nothing is returned.
+	inline void push(const hw_uint<32>  value) {
+		f.push(value);
+	}
+};
+
+struct I_write_0_to_I_read_0_9_33_cache {
+	// Delay buffer wrapping one 129-entry fifo of hw_uint<32> words.
+	// RAM Box: {[0, 63], [0, 63]}
+	// Capacity: 129
+	// # of read delays: 65
+	fifo<hw_uint<32> , 129> f;
+
+	// Reads fifo slot 128 - offset, so offset 0 selects slot 128 and offset 128 selects slot 0.
+	// NOTE(review): offset presumably counts push() calls back from the newest
+	// word - confirm against fifo::peek in hw_classes.h.
+	inline hw_uint<32>  peek(const int offset) {
+		const int slot = 128 - offset;
+		return f.peek(slot);
+	}
+
+	// Hands `value` to the fifo; nothing is returned.
+	inline void push(const hw_uint<32>  value) {
+		f.push(value);
+	}
+};
+
+struct I_write_0_to_I_read_0_9_35_cache {
+	// Delay buffer wrapping one 62-entry fifo of hw_uint<32> words.
+	// RAM Box: {[0, 63], [0, 63]}
+	// Capacity: 62
+	// # of read delays: 62
+	fifo<hw_uint<32> , 62> f;
+
+	// Reads fifo slot 61 - offset, so offset 0 selects slot 61 and offset 61 selects slot 0.
+	// NOTE(review): offset presumably counts push() calls back from the newest
+	// word - confirm against fifo::peek in hw_classes.h.
+	inline hw_uint<32>  peek(const int offset) {
+		const int slot = 61 - offset;
+		return f.peek(slot);
+	}
+
+	// Hands `value` to the fifo; nothing is returned.
+	inline void push(const hw_uint<32>  value) {
+		f.push(value);
+	}
+};
+
+struct I_cache { // Aggregate of all 18 delay-line banks of buffer I; I_write_0_write pushes each incoming word into every member.
+  I_write_0_merged_banks_19_cache I_write_0_merged_banks_19; // register/fifo chain with fixed taps
+  I_write_0_to_I_read_0_10_1_cache I_write_0_to_I_read_0_10_1; // 62-entry fifo
+  I_write_0_to_I_read_0_11_5_cache I_write_0_to_I_read_0_11_5; // 62-entry fifo
+  I_write_0_to_I_read_0_3_8_cache I_write_0_to_I_read_0_3_8; // 131-entry fifo
+  I_write_0_to_I_read_0_3_9_cache I_write_0_to_I_read_0_3_9; // 64-entry fifo
+  I_write_0_to_I_read_0_3_10_cache I_write_0_to_I_read_0_3_10; // 129-entry fifo
+  I_write_0_to_I_read_0_4_12_cache I_write_0_to_I_read_0_4_12; // 64-entry fifo
+  I_write_0_to_I_read_0_4_13_cache I_write_0_to_I_read_0_4_13; // 67-entry fifo
+  I_write_0_to_I_read_0_5_16_cache I_write_0_to_I_read_0_5_16; // 64-entry fifo
+  I_write_0_to_I_read_0_5_17_cache I_write_0_to_I_read_0_5_17; // 3-entry fifo
+  I_write_0_to_I_read_0_6_20_cache I_write_0_to_I_read_0_6_20; // 129-entry fifo
+  I_write_0_to_I_read_0_6_21_cache I_write_0_to_I_read_0_6_21; // 130-entry fifo
+  I_write_0_to_I_read_0_6_23_cache I_write_0_to_I_read_0_6_23; // 63-entry fifo
+  I_write_0_to_I_read_0_7_25_cache I_write_0_to_I_read_0_7_25; // 63-entry fifo
+  I_write_0_to_I_read_0_8_29_cache I_write_0_to_I_read_0_8_29; // 63-entry fifo
+  I_write_0_to_I_read_0_9_32_cache I_write_0_to_I_read_0_9_32; // 129-entry fifo
+  I_write_0_to_I_read_0_9_33_cache I_write_0_to_I_read_0_9_33; // 129-entry fifo
+  I_write_0_to_I_read_0_9_35_cache I_write_0_to_I_read_0_9_35; // 62-entry fifo
+};
+
+// Broadcasts one incoming word to every bank of I, in declaration order; root, pr and pc are not used.
+inline void I_write_0_write(hw_uint<32> & I_write_0, I_cache& I, int root, int pr, int pc) {
+  hw_uint<32>& word = I_write_0;
+  I.I_write_0_merged_banks_19.push(word);
+  I.I_write_0_to_I_read_0_10_1.push(word);
+  I.I_write_0_to_I_read_0_11_5.push(word);
+  I.I_write_0_to_I_read_0_3_8.push(word);
+  I.I_write_0_to_I_read_0_3_9.push(word);
+  I.I_write_0_to_I_read_0_3_10.push(word);
+  I.I_write_0_to_I_read_0_4_12.push(word);
+  I.I_write_0_to_I_read_0_4_13.push(word);
+  I.I_write_0_to_I_read_0_5_16.push(word);
+  I.I_write_0_to_I_read_0_5_17.push(word);
+  I.I_write_0_to_I_read_0_6_20.push(word);
+  I.I_write_0_to_I_read_0_6_21.push(word);
+  I.I_write_0_to_I_read_0_6_23.push(word);
+  I.I_write_0_to_I_read_0_7_25.push(word);
+  I.I_write_0_to_I_read_0_8_29.push(word);
+  I.I_write_0_to_I_read_0_9_32.push(word);
+  I.I_write_0_to_I_read_0_9_33.push(word);
+  I.I_write_0_to_I_read_0_9_35.push(word);
+}
+
+inline hw_uint<32>  I_read_0_10_select(I_cache& I, int root, int lr, int lc) {
+  // Returns the merged-bank word at the delay computed below for port I_read_0_10; `root` is not used.
+  // I_read_0_10 read pattern: { read_0[root = 0, lr, lc] -> I[63, 63] : 62 <= lr <= 63 and 61 <= lc <= 63; read_0[root = 0, lr, lc] -> I[2 + lc, 63] : 62 <= lr <= 63 and 0 <= lc <= 60; read_0[root = 0, lr, lc] -> I[63, 1 + lr] : 0 <= lr <= 61 and 61 <= lc <= 63; read_0[root = 0, lr, lc] -> I[2 + lc, 1 + lr] : 0 <= lr <= 61 and 0 <= lc <= 60 }
+  // Read schedule : { read_0[root = 0, lr, lc] -> [2 + lr, 2 + lc, 1] : 0 <= lr <= 63 and 0 <= lc <= 63 }
+  // Write schedule: { write[root = 0, pr, pc] -> [pr, pc, 0] : 0 <= pr <= 63 and 0 <= pc <= 63 }
+  // DD fold: { read_0[root, lr, lc] -> (61 - lc) : root = 0 and 62 <= lr <= 63 and 0 <= lc <= 60; read_0[root, lr, lc] -> 64 : root = 0 and lc = 63 and 0 <= lr <= 61; read_0[root, lr, lc] -> 64 : root = 0 and lc = 61 and 0 <= lr <= 61; read_0[root, lr, lc] -> (2 + lc) : root = 0 and lc = 62 and 0 <= lr <= 61; read_0[root, lr, lc] -> 64 : root = 0 and 0 <= lr <= 61 and 0 <= lc <= 60 }
+  // `delay` mirrors the DD fold above and is 0 for every (lr, lc) the fold does not list.
+  // The tap relies on I_write_0_merged_banks_19_cache providing peek(int); delays here stay within 0..64.
+
+  const int delay = ((-63 + lc == 0 && 61 - lr >= 0) || (61 - lc >= 0 && 61 - lr >= 0)) ? (64)
+    : (-62 + lc == 0 && 61 - lr >= 0) ? (64)
+    : (-62 + lr >= 0 && 60 - lc >= 0) ? ((61 - lc))
+    : 0;
+  return I.I_write_0_merged_banks_19.peek(/* Needs general delay string */ delay);
+}
+
+inline hw_uint<32>  I_read_0_11_select(I_cache& I, int root, int lr, int lc) {
+  // Returns the merged-bank word at the delay computed below for port I_read_0_11; `root` is not used.
+  // I_read_0_11 read pattern: { read_0[root = 0, lr, lc] -> I[63, 63] : 61 <= lr <= 63 and 61 <= lc <= 63; read_0[root = 0, lr, lc] -> I[2 + lc, 63] : 61 <= lr <= 63 and 0 <= lc <= 60; read_0[root = 0, lr, lc] -> I[63, 2 + lr] : 0 <= lr <= 60 and 61 <= lc <= 63; read_0[root = 0, lr, lc] -> I[2 + lc, 2 + lr] : 0 <= lr <= 60 and 0 <= lc <= 60 }
+  // Read schedule : { read_0[root = 0, lr, lc] -> [2 + lr, 2 + lc, 1] : 0 <= lr <= 63 and 0 <= lc <= 63 }
+  // Write schedule: { write[root = 0, pr, pc] -> [pr, pc, 0] : 0 <= pr <= 63 and 0 <= pc <= 63 }
+  // DD fold: { read_0[root, lr, lc] -> (61 - lc) : root = 0 and 62 <= lr <= 63 and 0 <= lc <= 60 }
+  // `delay` mirrors the DD fold above and is 0 for every (lr, lc) the fold does not list,
+  // i.e. outside rows 62..63 the port reads the word pushed most recently.
+  // The tap relies on I_write_0_merged_banks_19_cache providing peek(int); delays here stay within 0..61.
+
+  const int delay = (-62 + lr >= 0 && 60 - lc >= 0) ? ((61 - lc))
+    : 0;
+
+  return I.I_write_0_merged_banks_19.peek(/* Needs general delay string */ delay);
+}
+
+inline hw_uint<32>  I_read_0_3_select(I_cache& I, int root, int lr, int lc) {
+  // Returns the word bank I_write_0_to_I_read_0_3_8 holds at the delay computed below (mirrors the DD fold, 0 elsewhere); `root` is not used.
+  // I_read_0_3 read pattern: { read_0[root = 0, lr, lc] -> I[lc, lr] : 0 <= lr <= 62 and 0 <= lc <= 62; read_0[root = 0, lr = 63, lc] -> I[lc, 63] : 0 <= lc <= 62; read_0[root = 0, lr, lc = 63] -> I[63, lr] : 0 <= lr <= 62; read_0[root = 0, lr = 63, lc = 63] -> I[63, 63] }
+  // Read schedule : { read_0[root = 0, lr, lc] -> [2 + lr, 2 + lc, 1] : 0 <= lr <= 63 and 0 <= lc <= 63 }
+  // Write schedule: { write[root = 0, pr, pc] -> [pr, pc, 0] : 0 <= pr <= 63 and 0 <= pc <= 63 }
+  // DD fold: { read_0[root, lr, lc] -> (63 - lc) : root = 0 and lr = 63 and 0 <= lc <= 62; read_0[root, lr, lc] -> 128 : root = 0 and lc = 63 and 0 <= lr <= 61; read_0[root, lr, lc] -> 64 : root = 0 and lr = 62 and lc = 63; read_0[root, lr, lc] -> 130 : root = 0 and 0 <= lr <= 61 and 0 <= lc <= 61; read_0[root, lr, lc] -> 129 : root = 0 and lc = 62 and 0 <= lr <= 61; read_0[root, lr, lc] -> (127 - lc) : root = 0 and lr = 62 and 0 <= lc <= 62 }
+  const int delay = (-63 + lc == 0 && -62 + lr == 0) ? (64)
+    : (-63 + lc == 0 && 61 - lr >= 0) ? (128)
+    : (-62 + lc == 0 && 61 - lr >= 0) ? (129)
+    : (61 - lc >= 0 && 61 - lr >= 0) ? (130)
+    : (-63 + lr == 0 && 62 - lc >= 0) ? ((63 - lc))
+    : (-62 + lr == 0 && 62 - lc >= 0) ? ((127 - lc))
+    : 0;
+  return I.I_write_0_to_I_read_0_3_8.peek(/* one reader or all rams */ delay);
+}
+
+inline hw_uint<32>  I_read_0_4_select(I_cache& I, int root, int lr, int lc) {
+  // Returns the word of I at the delay computed below for port I_read_0_4 (mirrors the DD fold, 0 elsewhere); `root` is not used.
+  // I_read_0_4 read pattern: { read_0[root = 0, lr, lc] -> I[lc, 63] : 62 <= lr <= 63 and 0 <= lc <= 62; read_0[root = 0, lr, lc] -> I[lc, 1 + lr] : 0 <= lr <= 61 and 0 <= lc <= 62; read_0[root = 0, lr, lc = 63] -> I[63, 63] : 62 <= lr <= 63; read_0[root = 0, lr, lc = 63] -> I[63, 1 + lr] : 0 <= lr <= 61 }
+  // Read schedule : { read_0[root = 0, lr, lc] -> [2 + lr, 2 + lc, 1] : 0 <= lr <= 63 and 0 <= lc <= 63 }
+  // Write schedule: { write[root = 0, pr, pc] -> [pr, pc, 0] : 0 <= pr <= 63 and 0 <= pc <= 63 }
+  // DD fold: { read_0[root, lr, lc] -> (63 - lc) : root = 0 and 62 <= lr <= 63 and 0 <= lc <= 62; read_0[root, lr, lc] -> 64 : root = 0 and lc = 63 and 0 <= lr <= 61; read_0[root, lr, lc] -> 66 : root = 0 and 0 <= lr <= 61 and 0 <= lc <= 61; read_0[root, lr, lc] -> 65 : root = 0 and lc = 62 and 0 <= lr <= 61 }
+  // The delay reaches 66, so the tap needs a bank at least 67 deep: I_write_0_to_I_read_0_4_12 is 64 deep and its peek() forms 63 - offset, which is negative for 64..66.
+  // I_write_0_write pushes every word into all banks, so the 67-deep I_write_0_to_I_read_0_4_13 (peek() forms 66 - offset) holds the same stream with enough history.
+  const int delay = (-63 + lc == 0 && 61 - lr >= 0) ? (64)
+    : (-62 + lc == 0 && 61 - lr >= 0) ? (65)
+    : (61 - lc >= 0 && 61 - lr >= 0) ? (66)
+    : (-62 + lr >= 0 && 62 - lc >= 0) ? ((63 - lc))
+    : 0;
+  return I.I_write_0_to_I_read_0_4_13.peek(/* one reader or all rams */ delay);
+}
+
+inline hw_uint<32>  I_read_0_5_select(I_cache& I, int root, int lr, int lc) {
+  // Returns the word bank I_write_0_to_I_read_0_5_16 holds at the delay computed below; `root` is not used.
+  // I_read_0_5 read pattern: { read_0[root = 0, lr, lc] -> I[lc, 63] : 61 <= lr <= 63 and 0 <= lc <= 62; read_0[root = 0, lr, lc] -> I[lc, 2 + lr] : 0 <= lr <= 60 and 0 <= lc <= 62; read_0[root = 0, lr, lc = 63] -> I[63, 63] : 61 <= lr <= 63; read_0[root = 0, lr, lc = 63] -> I[63, 2 + lr] : 0 <= lr <= 60 }
+  // Read schedule : { read_0[root = 0, lr, lc] -> [2 + lr, 2 + lc, 1] : 0 <= lr <= 63 and 0 <= lc <= 63 }
+  // Write schedule: { write[root = 0, pr, pc] -> [pr, pc, 0] : 0 <= pr <= 63 and 0 <= pc <= 63 }
+  // DD fold: { read_0[root, lr, lc] -> (63 - lc) : root = 0 and 62 <= lr <= 63 and 0 <= lc <= 62; read_0[root, lr, lc] -> 2 : root = 0 and lr = 61 and 0 <= lc <= 61; read_0[root, lr, lc] -> 1 : root = 0 and lr = 61 and lc = 62; read_0[root, lr, lc] -> 2 : root = 0 and 0 <= lr <= 60 and 0 <= lc <= 61; read_0[root, lr, lc] -> 1 : root = 0 and lc = 62 and 0 <= lr <= 60 }
+  // `delay` mirrors the DD fold above and is 0 for every (lr, lc) the fold does not list.
+  // Largest value is 63 (lr >= 62, lc == 0), which the 64-entry bank maps to fifo slot 0.
+
+  const int delay = (-62 + lc == 0 && 61 - lr >= 0) ? (1)
+    : (61 - lc >= 0 && 61 - lr >= 0) ? (2)
+    : (-62 + lr >= 0 && 62 - lc >= 0) ? ((63 - lc))
+    : 0;
+  return I.I_write_0_to_I_read_0_5_16.peek(/* one reader or all rams */ delay);
+}
+
+inline hw_uint<32>  I_read_0_6_select(I_cache& I, int root, int lr, int lc) {
+  // I_read_0_6 read pattern: { read_0[root = 0, lr, lc] -> I[63, lr] : 0 <= lr <= 62 and 62 <= lc <= 63; read_0[root = 0, lr, lc] -> I[1 + lc, lr] : 0 <= lr <= 62 and 0 <= lc <= 61; read_0[root = 0, lr = 63, lc] -> I[63, 63] : 62 <= lc <= 63; read_0[root = 0, lr = 63, lc] -> I[1 + lc, 63] : 0 <= lc <= 61 }
+  // Read schedule : { read_0[root = 0, lr, lc] -> [2 + lr, 2 + lc, 1] : 0 <= lr <= 63 and 0 <= lc <= 63 }
+  // Write schedule: { write[root = 0, pr, pc] -> [pr, pc, 0] : 0 <= pr <= 63 and 0 <= pc <= 63 }
+  // DD fold: { read_0[root, lr, lc] -> (62 - lc) : root = 0 and lr = 63 and 0 <= lc <= 61; read_0[root, lr, lc] -> 128 : root = 0 and 0 <= lr <= 61 and 62 <= lc <= 63; read_0[root, lr, lc] -> 64 : root = 0 and lr = 62 and 62 <= lc <= 63; read_0[root, lr, lc] -> 129 : root = 0 and 0 <= lr <= 61 and 0 <= lc <= 61; read_0[root, lr, lc] -> (126 - lc) : root = 0 and lr = 62 and 0 <= lc <= 61 }
+  // Returns the word of I at `delay` (mirrors the DD fold, 0 elsewhere) for port I_read_0_6; `root` is not used. The delay reaches 129, so the tap needs a bank at least 130 deep: I_write_0_to_I_read_0_6_20 is 129 deep and its peek() forms 128 - offset, i.e. -1 for 129.
+  // I_write_0_write pushes every word into all banks, so the 130-deep I_write_0_to_I_read_0_6_21 (peek() forms 129 - offset) holds the same stream with enough history.
+  const int delay = (-62 + lr == 0 && -62 + lc >= 0) ? (64)
+    : (-62 + lc >= 0 && 61 - lr >= 0) ? (128)
+    : (61 - lc >= 0 && 61 - lr >= 0) ? (129)
+    : (-63 + lr == 0 && 61 - lc >= 0) ? ((62 - lc))
+    : (-62 + lr == 0 && 61 - lc >= 0) ? ((126 - lc))
+    : 0;
+  return I.I_write_0_to_I_read_0_6_21.peek(/* one reader or all rams */ delay);
+}
+
+inline hw_uint<32>  I_read_0_7_select(I_cache& I, int root, int lr, int lc) {
+  // Returns the merged-bank word at the delay computed below for port I_read_0_7; `root` is not used.
+  // I_read_0_7 read pattern: { read_0[root = 0, lr, lc] -> I[63, 63] : 62 <= lr <= 63 and 62 <= lc <= 63; read_0[root = 0, lr, lc] -> I[1 + lc, 63] : 62 <= lr <= 63 and 0 <= lc <= 61; read_0[root = 0, lr, lc] -> I[63, 1 + lr] : 0 <= lr <= 61 and 62 <= lc <= 63; read_0[root = 0, lr, lc] -> I[1 + lc, 1 + lr] : 0 <= lr <= 61 and 0 <= lc <= 61 }
+  // Read schedule : { read_0[root = 0, lr, lc] -> [2 + lr, 2 + lc, 1] : 0 <= lr <= 63 and 0 <= lc <= 63 }
+  // Write schedule: { write[root = 0, pr, pc] -> [pr, pc, 0] : 0 <= pr <= 63 and 0 <= pc <= 63 }
+  // DD fold: { read_0[root, lr, lc] -> (62 - lc) : root = 0 and 62 <= lr <= 63 and 0 <= lc <= 61; read_0[root, lr, lc] -> 64 : root = 0 and 0 <= lr <= 61 and 62 <= lc <= 63; read_0[root, lr, lc] -> 65 : root = 0 and 0 <= lr <= 61 and 0 <= lc <= 61 }
+  // `delay` mirrors the DD fold above and is 0 for every (lr, lc) the fold does not list.
+  // The tap relies on I_write_0_merged_banks_19_cache providing peek(int); delays here stay within 0..65.
+
+  const int delay = (-62 + lc >= 0 && 61 - lr >= 0) ? (64)
+    : (61 - lc >= 0 && 61 - lr >= 0) ? (65)
+    : (-62 + lr >= 0 && 61 - lc >= 0) ? ((62 - lc))
+    : 0;
+  return I.I_write_0_merged_banks_19.peek(/* Needs general delay string */ delay);
+}
+
+inline hw_uint<32>  I_read_0_8_select(I_cache& I, int root, int lr, int lc) {
+  // Returns the merged-bank word at the delay computed below for port I_read_0_8; `root` is not used.
+  // I_read_0_8 read pattern: { read_0[root = 0, lr, lc] -> I[63, 63] : 61 <= lr <= 63 and 62 <= lc <= 63; read_0[root = 0, lr, lc] -> I[1 + lc, 63] : 61 <= lr <= 63 and 0 <= lc <= 61; read_0[root = 0, lr, lc] -> I[63, 2 + lr] : 0 <= lr <= 60 and 62 <= lc <= 63; read_0[root = 0, lr, lc] -> I[1 + lc, 2 + lr] : 0 <= lr <= 60 and 0 <= lc <= 61 }
+  // Read schedule : { read_0[root = 0, lr, lc] -> [2 + lr, 2 + lc, 1] : 0 <= lr <= 63 and 0 <= lc <= 63 }
+  // Write schedule: { write[root = 0, pr, pc] -> [pr, pc, 0] : 0 <= pr <= 63 and 0 <= pc <= 63 }
+  // DD fold: { read_0[root, lr, lc] -> (62 - lc) : root = 0 and 62 <= lr <= 63 and 0 <= lc <= 61; read_0[root, lr, lc] -> 1 : root = 0 and lr = 61 and 0 <= lc <= 61; read_0[root, lr, lc] -> 1 : root = 0 and 0 <= lr <= 60 and 0 <= lc <= 61 }
+  // `delay` mirrors the DD fold above and is 0 for every (lr, lc) the fold does not list.
+  // The tap relies on I_write_0_merged_banks_19_cache providing peek(int); delays here stay within 0..62.
+
+  const int delay = (61 - lc >= 0 && 61 - lr >= 0) ? (1)
+    : (-62 + lr >= 0 && 61 - lc >= 0) ? ((62 - lc))
+    : 0;
+
+  return I.I_write_0_merged_banks_19.peek(/* Needs general delay string */ delay);
+}
+
+inline hw_uint<32>  I_read_0_9_select(I_cache& I, int root, int lr, int lc) {
+  // Returns the word bank I_write_0_to_I_read_0_9_32 holds at the delay computed below (mirrors the DD fold, 0 elsewhere); `root` is not used.
+  // I_read_0_9 read pattern: { read_0[root = 0, lr, lc] -> I[63, lr] : 0 <= lr <= 62 and 61 <= lc <= 63; read_0[root = 0, lr, lc] -> I[2 + lc, lr] : 0 <= lr <= 62 and 0 <= lc <= 60; read_0[root = 0, lr = 63, lc] -> I[63, 63] : 61 <= lc <= 63; read_0[root = 0, lr = 63, lc] -> I[2 + lc, 63] : 0 <= lc <= 60 }
+  // Read schedule : { read_0[root = 0, lr, lc] -> [2 + lr, 2 + lc, 1] : 0 <= lr <= 63 and 0 <= lc <= 63 }
+  // Write schedule: { write[root = 0, pr, pc] -> [pr, pc, 0] : 0 <= pr <= 63 and 0 <= pc <= 63 }
+  // DD fold: { read_0[root, lr, lc] -> (61 - lc) : root = 0 and lr = 63 and 0 <= lc <= 60; read_0[root, lr, lc] -> 128 : root = 0 and lc = 63 and 0 <= lr <= 61; read_0[root, lr, lc] -> 128 : root = 0 and lc = 61 and 0 <= lr <= 61; read_0[root, lr, lc] -> (66 + lc) : root = 0 and lc = 62 and 0 <= lr <= 61; read_0[root, lr, lc] -> 64 : root = 0 and lr = 62 and ((61 <= lc <= 62) or lc = 63); read_0[root, lr, lc] -> 128 : root = 0 and 0 <= lr <= 61 and 0 <= lc <= 60; read_0[root, lr, lc] -> (125 - lc) : root = 0 and lr = 62 and 0 <= lc <= 60 }
+  // Largest delay is 128, which the 129-entry bank maps to fifo slot 0.
+  const int delay = (-62 + lr == 0 && -61 + lc >= 0) ? (64)
+    : ((-63 + lc == 0 && 61 - lr >= 0) || (61 - lc >= 0 && 61 - lr >= 0)) ? (128)
+    : (-63 + lr == 0 && 60 - lc >= 0) ? ((61 - lc))
+    : (-62 + lc == 0 && 61 - lr >= 0) ? (128)
+    : (-62 + lr == 0 && 60 - lc >= 0) ? ((125 - lc))
+    : 0;
+  return I.I_write_0_to_I_read_0_9_32.peek(/* one reader or all rams */ delay);
+}
+
+// # of bundles = 3
+// Bundle I_write_0: wraps the single write port listed on the next line.
+//	I_write_0
+inline void I_I_write_0_bundle_write(hw_uint<32>& I_write_0, I_cache& I, int root, int pr, int pc) { // Forwards one 32-bit word, with its iteration indices, to the port-level writer I_write_0_write.
+	hw_uint<32>  I_write_0_res = I_write_0.extract<0, 31>(); // take the <0, 31> slice of the incoming word (presumably bits 0..31, i.e. the whole operand)
+	I_write_0_write(I_write_0_res, I, root, pr, pc); // I_write_0_write is defined earlier in this file, outside this excerpt
+}
+
+// Bundle read_0_read: gathers the nine read ports listed here into one 288-bit word.
+//	I_read_0_3
+//	I_read_0_4
+//	I_read_0_5
+//	I_read_0_6
+//	I_read_0_7
+//	I_read_0_8
+//	I_read_0_9
+//	I_read_0_10
+//	I_read_0_11
+inline hw_uint<288> I_read_0_read_bundle_read(I_cache& I, int root, int lr, int lc) {
+  // # of ports in bundle: 9 (stored below in this order, at offsets 32 apart)
+    // I_read_0_3
+    // I_read_0_4
+    // I_read_0_5
+    // I_read_0_6
+    // I_read_0_7
+    // I_read_0_8
+    // I_read_0_9
+    // I_read_0_10
+    // I_read_0_11
+	// Every select call receives the same (I, root, lr, lc); only the set_at offset (0, 32, ..., 256) differs.
+	hw_uint<288> result;
+	hw_uint<32>  I_read_0_3_res = I_read_0_3_select(I, root, lr, lc);
+	set_at<0, 288>(result, I_read_0_3_res);
+	hw_uint<32>  I_read_0_4_res = I_read_0_4_select(I, root, lr, lc);
+	set_at<32, 288>(result, I_read_0_4_res);
+	hw_uint<32>  I_read_0_5_res = I_read_0_5_select(I, root, lr, lc);
+	set_at<64, 288>(result, I_read_0_5_res);
+	hw_uint<32>  I_read_0_6_res = I_read_0_6_select(I, root, lr, lc);
+	set_at<96, 288>(result, I_read_0_6_res);
+	hw_uint<32>  I_read_0_7_res = I_read_0_7_select(I, root, lr, lc);
+	set_at<128, 288>(result, I_read_0_7_res);
+	hw_uint<32>  I_read_0_8_res = I_read_0_8_select(I, root, lr, lc);
+	set_at<160, 288>(result, I_read_0_8_res);
+	hw_uint<32>  I_read_0_9_res = I_read_0_9_select(I, root, lr, lc);
+	set_at<192, 288>(result, I_read_0_9_res);
+	hw_uint<32>  I_read_0_10_res = I_read_0_10_select(I, root, lr, lc);
+	set_at<224, 288>(result, I_read_0_10_res);
+	hw_uint<32>  I_read_0_11_res = I_read_0_11_select(I, root, lr, lc);
+	set_at<256, 288>(result, I_read_0_11_res);
+	return result;
+}
+
+// Bundle write_write: wraps the single write port listed on the next line.
+//	I_write_0
+inline void I_write_write_bundle_write(hw_uint<32>& write_write, I_cache& I, int root, int pr, int pc) { // Same body as I_I_write_0_bundle_write; only the name of the word parameter differs.
+	hw_uint<32>  I_write_0_res = write_write.extract<0, 31>(); // take the <0, 31> slice of the incoming word (presumably bits 0..31, i.e. the whole operand)
+	I_write_0_write(I_write_0_res, I, root, pr, pc); // forward the slice and the iteration indices to the port-level writer
+}
+
+
+
+// Operation logic
+inline void write(HWStream<hw_uint<32> >& /* buffer_args num ports = 1 */in, I_cache& I, int root, int pr, int pc) { // One instance of the `write` operation: takes a word from stream `in` and hands it to buffer I.
+	// Consume: in (exactly one word is read from the stream per call)
+	auto in_pc_c__pr_value = in.read();
+	// Produce: I (the word goes to the write_write bundle together with root/pr/pc)
+	I_write_write_bundle_write(in_pc_c__pr_value, I, root, pr, pc);
+
+#ifndef __VIVADO_SYNTH__
+#endif //__VIVADO_SYNTH__
+
+}
+
+inline void read_0(I_cache& I, HWStream<hw_uint<32> >& /* buffer_args num ports = 1 */out, int root, int lr, int lc) { // One instance of the `read_0` operation; the I parameter matches the five-argument call in the driver.
+	// Consume: I (all nine read ports of the read_0_read bundle, packed into one 288-bit word)
+	auto I_read_0_read_value = I_read_0_read_bundle_read(I, root, lr, lc);
+	auto compute_result = conv_3_3(I_read_0_read_value); // NOTE(review): assumes conv_3_3 accepts the 288-bit bundle word -- confirm against its header
+	// Produce: out (one result word per call)
+	out.write(compute_result);
+#ifndef __VIVADO_SYNTH__
+#endif //__VIVADO_SYNTH__
+}
+
+// Driver function: runs the write / read_0 loop nest over a local buffer I, num_epochs times.
+void conv_2d_bc(HWStream<hw_uint<32> >& /* no bundle get_args num ports = 1 */in, HWStream<hw_uint<32> >& /* no bundle get_args num ports = 1 */out, int num_epochs) {
+  // `in` is handed to write(), `out` to read_0(). I is declared once, outside the epoch loop, so it is shared by all epochs.
+#ifndef __VIVADO_SYNTH__
+  ofstream debug_file("conv_2d_bc_debug.csv");
+  global_debug_handle = &debug_file; // the global handle points at this local stream, so it is only valid while this call is running
+#endif //__VIVADO_SYNTH__
+  I_cache I;
+#ifdef __VIVADO_SYNTH__
+#endif //__VIVADO_SYNTH__
+#ifdef __VIVADO_SYNTH__
+#pragma HLS inline recursive
+#endif // __VIVADO_SYNTH__
+  // c0 and c1 each run 0..65: write() is issued only while both are <= 63, read_0() only once both are >= 2.
+  for (int epoch = 0; epoch < num_epochs; epoch++) {
+	for (int c0 = 0; c0 <= 65; c0 += 1) {
+	  if (c0 >= 2) {
+	    if (c0 <= 63)
+	      for (int c1 = 0; c1 <= 1; c1 += 1)
+	write(in, I, 0, c0, c1);
+	    for (int c1 = 2; c1 <= 65; c1 += 1) {
+	      if (c0 <= 63 && c1 <= 63)
+	write(in, I, 0, c0, c1); // guarded by the un-braced `if` directly above
+	read_0(I, out, 0, c0 - 2, c1 - 2); // not guarded: runs on every iteration of this c1 loop, with both indices shifted back by 2
+	    }
+	  } else {
+	    for (int c1 = 0; c1 <= 63; c1 += 1)
+	write(in, I, 0, c0, c1); // c0 = 0 and 1: writes only, no read_0 calls yet
+	  }
+	}
+	
+  }
+
+#ifndef __VIVADO_SYNTH__
+  debug_file.close();
+#endif //__VIVADO_SYNTH__
+}
+
+void conv_2d_bc(HWStream<hw_uint<32> >& /* no bundle get_args num ports = 1 */in, HWStream<hw_uint<32> >& /* no bundle get_args num ports = 1 */out) { // Convenience overload: runs the three-argument driver for a single epoch.
+
+  conv_2d_bc(in, out, 1); // third argument is num_epochs
+}
+#ifdef __VIVADO_SYNTH__
+const int write_read_num_transfers = 0; // multiplied by `size` in read_write_read to get its loop bound. NOTE(review): with 0 that loop runs zero times for every size -- confirm the generator should emit 0 here.
+const int read_0_write_num_transfers = 0; // multiplied by `size` in write_read_0_write to get its loop bound. NOTE(review): likewise 0, so no words are copied to the output array -- confirm.
+
+
+extern "C" {
+
+static void read_write_read(hw_uint<32>* input, HWStream<hw_uint<32> >& v, const int size) { // Copies words input[0 .. num_transfers-1] into stream `v`, in index order.
+  hw_uint<32> burst_reg; // holds each word between the array load and the stream write
+  int num_transfers = write_read_num_transfers*size; // loop bound = file-level constant * size
+  for (int i = 0; i < num_transfers; i++) {
+    #pragma HLS pipeline II=1
+    burst_reg = input[i];
+    v.write(burst_reg);
+  }
+}
+
+static void write_read_0_write(hw_uint<32>* output, HWStream<hw_uint<32> >& v, const int size) { // Drains stream `v` into output[0 .. num_transfers-1], in index order.
+  hw_uint<32> burst_reg; // holds each word between the stream read and the array store
+  int num_transfers = read_0_write_num_transfers*size; // loop bound = file-level constant * size
+  for (int i = 0; i < num_transfers; i++) {
+    #pragma HLS pipeline II=1
+    burst_reg = v.read();
+    output[i] = burst_reg;
+  }
+}
+
+void conv_2d_bc_accel(hw_uint<32>* write_read, hw_uint<32>* read_0_write, const int size) { // Synthesis-only top level: load from write_read, run conv_2d_bc, store to read_0_write.
+#pragma HLS dataflow
+#pragma HLS INTERFACE m_axi port = write_read offset = slave depth = 65536 bundle = gmem0
+#pragma HLS INTERFACE m_axi port = read_0_write offset = slave depth = 65536 bundle = gmem1
+  // Both array pointers, `size` and the return are additionally placed on the s_axilite bundle named `control`:
+#pragma HLS INTERFACE s_axilite port = write_read bundle = control
+#pragma HLS INTERFACE s_axilite port = read_0_write bundle = control
+#pragma HLS INTERFACE s_axilite port = size bundle = control
+#pragma HLS INTERFACE s_axilite port = return bundle = control
+  // Streams connecting the three calls below; both are declared static.
+  static HWStream<hw_uint<32> > write_read_channel;
+  static HWStream<hw_uint<32> > read_0_write_channel;
+  // Step 1: input array -> write_read_channel.
+  read_write_read(write_read, write_read_channel, size);
+  // Step 2: compute; `size` is passed as the driver's num_epochs argument.
+  conv_2d_bc(write_read_channel, read_0_write_channel, size);
+  // Step 3: read_0_write_channel -> output array.
+  write_read_0_write(read_0_write, read_0_write_channel, size);
+}
+
+}
+#endif //__VIVADO_SYNTH__
+
diff --git a/ubuffer.cpp b/ubuffer.cpp
index a9aa49601..53eb8f3cc 100644
--- a/ubuffer.cpp
+++ b/ubuffer.cpp
@@ -227,12 +227,15 @@ void generate_bank(CodegenOptions& options,
     out << "\t// # of read delays: " << read_delays.size() << endl;
 
     read_delays = sort_unique(read_delays);
-
+    // cout << "PEEK num readers " << num_readers << endl;
+    // cout << "PEEK options.all_rams " << options.all_rams << endl;
     if (num_readers == 1 || options.all_rams) {
       int partition_capacity = 1 + maxdelay;
       out << "\tfifo<" << pt_type_string << ", " << partition_capacity << "> f" << ";" << endl;
+      // cout << "peek1" << endl;
       out << "\tinline " + pt_type_string + " peek(const int offset) {" << endl;
       ignore_inter_deps(out, "f");
+      // cout << "peek2" << endl;
       out << tab(2) << "return f.peek(" << partition_capacity - 1 << " - offset);" << endl;
       out << tab(1) << "}" << endl << endl;
 
@@ -286,7 +289,7 @@ void generate_bank(CodegenOptions& options,
       //}
 
       //assert(capacities.size() == partitions.size());
-
+      // cout<<"num partitions "<<partitions.size()<<endl;
       out << endl << endl;
       int nind = 0;
       for (auto p : partitions) {
@@ -294,6 +297,7 @@ void generate_bank(CodegenOptions& options,
         //int capacity = capacities.at(nind);
         int capacity = p.second;
         assert(dv >= 0);
+        // cout << "peek3" << endl;
         out << "\tinline " << pt_type_string << " peek_" << to_string(dv) << "() {" << endl;
         if (capacity > 1) {
           ignore_inter_deps(out, p.first);
@@ -519,11 +523,15 @@ void generate_code_prefix(CodegenOptions& options,
     UBuffer& buf) {
 
   //banking and merge pass
+  // cout << "before generate bank and merge " << endl;
   buf.generate_bank_and_merge(options);
 
   //string inpt = buf.get_in_port();
   out << "#include \"hw_classes.h\"" << endl << endl;
+  cout << "before get banks " << endl;
   for (auto b : buf.get_banks()) {
+    // cout << "BANK NAME " << b.name << endl;
+    // cout<< "BANK MERGED READERS " << b.num_readers << endl;
     generate_bank(options, out, b);
   }
 
@@ -547,9 +555,11 @@ void generate_code_prefix(CodegenOptions& options,
     concat(args, dimension_var_decls(inpt, buf));
     string var_args = comma_list(dimension_var_args(inpt, buf));
 
+    // write func for every input port that gets called in this bundle
     out << "inline void " << inpt << "_write(";
     out << comma_list(args) << ") {" << endl;
 
+    // copy and broadcast whenever write to port is done
     //Different ram type, different address
     for (auto sb : buf.receiver_banks(inpt)) {
       //if (sb.tp == BANK_TYPE_STACK) {
@@ -583,6 +593,8 @@ void generate_code_prefix(CodegenOptions& options,
       pieces_dom = unn(pieces_dom, to_uset(p.first));
     }
 
+    // cout<<"DOMAIN "<<str(out_domain)<<endl;
+
     bool pieces_are_complete =
       subset(to_uset(out_domain), (pieces_dom));
     int ub = int_upper_bound(qpd);
@@ -660,37 +672,49 @@ void generate_code_prefix(CodegenOptions& options,
     out << tab(1) << "// DD fold: " << str(dd_fold) << endl;
     string delay_expr = evaluate_dd(buf, outpt, inpt);
     string value_str = "";
+    // cout<<"PEEK inpt "<< inpt<<endl;
+    // cout<<"output "<<outpt<<endl;
     bool opt_const = is_optimizable_constant_dd(inpt, outpt, buf);
     if (options.inner_bank_offset_mode == INNER_BANK_OFFSET_LINEAR) {
       string linear_addr = buf.generate_linearize_ram_addr(outpt);
       value_str = bank + ".read(/*ram type address*/ "+ linear_addr + ")";
     }
     else if (options.inner_bank_offset_mode == INNER_BANK_OFFSET_STACK) {
+      // std::cout << "PEEK 4 options all rams " << options.all_rams << endl;
+      // std::cout << "PEEK 4 num readers " << buf.get_bank(bank).num_readers << endl;
+      // std::cout << "PEEK 4 opt const " << opt_const<< endl;
       if (options.all_rams || buf.get_bank(bank).num_readers == 1) {
+        // cout << "peek4" << endl;
         value_str = bank + ".peek(/* one reader or all rams */ " + delay_expr + ")";
       } else if (opt_const) {
         if (!options.all_rams && is_number(dx)) {
           assert(safe_stoi(dx) >= 0);
+          // cout << "peek5" << endl;
           value_str = bank + ".peek_" + dx + "()";
         } else {
+          // cout << "peek6" << endl;
           value_str = bank + ".peek" + "( /* is opt const */ " + delay_expr + ")";
         }
       } else if (pieces.size() == 0 && !options.all_rams) {
+        // cout << "peek7" << endl;
         value_str = bank + ".peek_0()";
       } else if (pieces.size() == 1 &&
           isl_set_is_subset(cpy(out_domain), cpy(pieces[0].first))) {
         string dx = codegen_c(pieces[0].second);
         if (!options.all_rams && is_number(dx)) {
           assert(safe_stoi(dx) >= 0);
+          // cout << "peek8" << endl;
           value_str = bank + ".peek_" + dx + "()";
         } else {
+          // cout << "peek9" << endl;
           value_str = bank + ".peek" + "(/* is one piece but not a number */" + dx + ")";
         }
       } else {
+        // cout << "peek10" << endl;
         value_str = bank + ".peek" + "(/* Needs general delay string */ " + delay_expr + ")";
       }
     }
-
+    // cout<<"value_str "<<value_str<<endl;
     return buf.name + "." + value_str;
   }
 
@@ -726,7 +750,7 @@ void generate_code_prefix(CodegenOptions& options,
       in_ports_to_conditions[inpt] =
         codegen_c(overlapped_read_condition);
     }
-
+    // cout<<"possible_ports.size "<<possible_ports.size()<<endl;
     if (possible_ports.size() == 1) {
       string inpt = possible_ports.at(0);
       string peeked_val = delay_string(options, out, inpt, outpt, buf);
@@ -854,6 +878,7 @@ void generate_code_prefix(CodegenOptions& options,
   }
 
   void generate_hls_code(CodegenOptions& options, std::ostream& out, UBuffer& buf) {
+    // cout << "generate hls code " << endl;
     generate_code_prefix(options, out, buf);
 
     for (auto outpt : buf.get_out_ports()) {
@@ -1137,13 +1162,15 @@ void generate_code_prefix(CodegenOptions& options,
     int num_readers = 0;
 
     auto in_actions = domain.at(inpt);
-    //cout << "\t in action : " << str(in_actions) << endl;
+    // cout << "\t in action : " << str(in_actions) << endl;
     auto lex_max_events = get_lexmax_events(outpt);
-    //cout << "\t lexmax result: " << str(lex_max_events) << endl;
+    // cout << "\t lexmax result: " << str(lex_max_events) << endl;
     auto act_dom =
       ::domain(its_range(lex_max_events, to_uset(in_actions)));
 
-    //cout <<"\t act dom: " << str(act_dom) << endl;
+    // cout <<"\t act dom: " << str(act_dom) << endl;
+
+    // cout << "COMPUTE BANK INFO " << !isl_union_set_is_empty(act_dom) << endl;
 
     if (!isl_union_set_is_empty(act_dom)) {
       num_readers++;
@@ -1151,7 +1178,7 @@ void generate_code_prefix(CodegenOptions& options,
       int qpd = compute_dd_bound(outpt, inpt, true);
       int lb = compute_dd_bound(outpt, inpt, false);
 
-      //cout << "ub: " << qpd << ", lb: " << lb << endl;
+      cout << "ub: " << qpd << ", lb: " << lb << endl;
 
       for (int i = lb; i < qpd + 1; i++) {
         read_delays.push_back(i);
@@ -1161,14 +1188,31 @@ void generate_code_prefix(CodegenOptions& options,
 
     string pt_type_string = port_type_string();
     string name = inpt + "_to_" + outpt;
-    //cout << "inpt  = " << inpt << endl;
-    //cout << "outpt = " << outpt << endl;
-    //cout << "name of bank = " << name << endl;
+    cout << "inpt  = " << inpt << endl;
+    cout << "outpt = " << outpt << endl;
+    cout << "name of bank = " << name << endl;
 
     auto rddom =
       unn(range(access_map.at(inpt)),
           range(access_map.at(outpt)));
-    //cout << "Read domain for bank: " << str(rddom) << endl;
+    cout << "Read domain for bank: " << str(rddom) << endl;
+    cout<<"access map "<<str(access_map.at(outpt))<<endl;
+    for(auto m : get_maps(access_map.at(outpt))){
+      cout<<"Map"<<endl;
+      cout<<tab(1)<<str(m)<<endl;
+      for(auto m_ : get_basic_maps(m)){
+        cout<<tab(2)<<str(m_)<<endl;
+      }
+    }
+
+    isl_union_map* test =access_map.at(outpt);
+    auto maptest = to_map(test);
+    cout<<"access map output "<< domain_name(maptest)<<endl;
+    cout<<"access map output "<< range_name(maptest)<<endl;
+    /*for(auto mapi : maptest){
+      cout<<str(mapi.first)<<endl;
+    }
+    */
     //Box mem_box = extract_box(rddom);
 
     //initial the delay map
@@ -1181,6 +1225,7 @@ void generate_code_prefix(CodegenOptions& options,
   }
 
   void UBuffer::merge_bank(CodegenOptions& options, string inpt, vector<stack_bank> mergeable) {
+    cout << "merge bank called " << endl;
     if (!options.conditional_merge){
       stack_bank merged;
       merged.tp = BANK_TYPE_STACK;
@@ -1191,6 +1236,7 @@ void generate_code_prefix(CodegenOptions& options,
       merged.pt_type_string =
         mergeable.at(0).pt_type_string;
       merged.num_readers = mergeable.size();
+      // cout << "MERGED NUM READERS " << merged.num_readers << endl;
       merged.maxdelay = -1;
       for (auto m : mergeable) {
         //cout << "merge: " << m.name << endl;
@@ -1204,8 +1250,9 @@ void generate_code_prefix(CodegenOptions& options,
         }
       }
       merged.read_delays = sort_unique(merged.read_delays);
-
+cout << "mergeable size " << mergeable.size() << endl;
       for (auto to_replace : mergeable) {
+        cout << "replace bank called" << endl;
         replace_bank(to_replace, merged);
       }
     }
@@ -1215,9 +1262,6 @@ void generate_code_prefix(CodegenOptions& options,
       sort(mergeable.begin(), mergeable.end(), [](const bank& l, const bank& r) {
           return l.maxdelay > r.maxdelay;
           });
-      for (auto merge_bank : mergeable) {
-        //cout << merge_bank.name << " with delay : " << merge_bank.maxdelay << endl;
-      }
 
       while(mergeable.size()) {
         //keep pop port to merged bank and replace origin bank
@@ -1243,7 +1287,7 @@ void generate_code_prefix(CodegenOptions& options,
           merged.rddom = unn(merged.rddom, m.rddom);
           merged.maxdelay = m.maxdelay;
           merged.read_delays.push_back(m.maxdelay);
-          cout << m.maxdelay <<", " << merged.maxdelay << endl;
+          //cout << m.maxdelay <<", " << merged.maxdelay << endl;
 
           //get the next data
           mergeable.pop_back();
@@ -1257,6 +1301,7 @@ void generate_code_prefix(CodegenOptions& options,
 
         for (auto to_replace : replace_candidates) {
           cout << to_replace.name << endl;
+          cout << "replace bank called 2 " << endl;
           replace_bank(to_replace, merged);
         }
         cout << "Create a new bank !"<< endl;
@@ -1287,7 +1332,7 @@ void generate_code_prefix(CodegenOptions& options,
       for (auto outpt : get_out_ports()) {
         auto overlap =
           its(range(access_map.at(inpt)), range(access_map.at(outpt)));
-
+cout<<"access map in "<<str(access_map.at(inpt))<<" out "<<str(access_map.at(outpt))<<endl;
         if (!empty(overlap)) {
           stack_bank bank = compute_bank_info(inpt, outpt);
           add_bank_between(inpt, outpt, bank);
@@ -1295,43 +1340,82 @@ void generate_code_prefix(CodegenOptions& options,
       }
     }
 
+    int counter = 0;
+    cout << "num inpt ports " << get_in_ports().size() << endl;
     for (auto inpt : get_in_ports()) {
       // try to turn the banks for this inpt into one big linebuffer
       vector<stack_bank> receivers = receiver_banks(inpt);
-      //cout << "Receiver banks for " << inpt << endl;
       vector<stack_bank> mergeable;
+      cout << "num receivers " << receivers.size() << endl;
       for (auto bnk : receivers) {
-        //cout << tab(1) << bnk.name << ", # read offsets: " << bnk.read_delays.size() << endl;
-        //cout << tab(2) << "# receivers: " << receivers.size() << endl;
 
-        if (options.debug_options.expect_all_linebuffers) {
-          //assert(receivers.size() == 1 || bnk.read_delays.size() == 2);
-          assert(bnk.read_delays.size() == 2);
-        }
-        if (bnk.read_delays.size() == 2) {
-          assert(bnk.read_delays[0] == 0);
-          mergeable.push_back(bnk);
-        }
+        if (bnk.read_delays.size() != 2) {
+          auto outpt_vect = bnk.get_out_ports();
+          auto outpt = outpt_vect[0];
+         
+          cout << "before splitting banks" << endl;
+/*          cout << " SCHEDULE : " << str(schedule.at(outpt)) << endl;
+          for (auto s : get_maps(schedule.at(outpt))) {
+            for (auto s_ : get_basic_maps(s)) {
+            cout << tab(1) << str(s_) << endl;
+            }
+          }*/
+          remove_bank(outpt);
+
+          vector<stack_bank> split_banks; 
+          for (auto m : get_maps(access_map.at(outpt))) {
+            for (auto m_ : get_basic_maps(m)) {
+              string new_output = outpt + "_" + to_string(counter);
+              access_map.insert(std::pair<std::string, umap*>(new_output, to_umap(to_map(m_))));
+              schedule.insert(std::pair<std::string, isl_union_map*>(new_output, schedule.at(outpt)));
+              //cout << "ACCESS MAP INSERT " << endl;
+	
+              stack_bank b_ = compute_bank_info(inpt, new_output);
+              add_bank_between(inpt, outpt, b_);
+              if (b_.read_delays.size() == 2) {
+                mergeable.push_back(b_);
+              }
+//              access_map.erase(new_output);
+//              schedule.erase(new_output);
+              //for (int i = 0; i < b_.read_delays.size(); i++) {
+              //  cout << "counter: " << counter << " " << " NEW BANK READ DELAYS: " << b_.read_delays[i] << endl;
+              //}
+              //cout << "ACCESS MAP: " << str(m_) << endl; 
+              counter++;
+            }
+          }
 
+        } else { 
+          if (options.debug_options.expect_all_linebuffers) {
+            //assert(receivers.size() == 1 || bnk.read_delays.size() == 2);
+            assert(bnk.read_delays.size() == 2);
+          }
+          if (bnk.read_delays.size() == 2) {
+            assert(bnk.read_delays[0] == 0);
+            mergeable.push_back(bnk);
+          }
+        }
       }
 
-      if (mergeable.size() > 0) {
-        merge_bank(options, inpt, mergeable);
-        auto banks = get_banks();
-        //cout << "finished create bank!" << endl;
-        //for (bank bk : banks) {
-        //cout << bk.name << " has delays: ";//<< bk.read_delays << endl;
-        //cout << tab(1);
-        //for (int dl: bk.read_delays) {
-        //cout << dl << "," ;
-        //}
-        //cout << endl;
-        //for (auto dl: bk.delay_map) {
-        //cout <<tab(1)<< dl.first << ":" << dl.second <<endl; ;
-        //}
-
-        //}
-      }
+        if (mergeable.size() > 0) {
+cout << "before merge bank call" << endl;
+          merge_bank(options, inpt, mergeable);
+//          auto banks = get_banks();
+        }
+          //cout << "finished create bank!" << endl;
+          //for (bank bk : banks) {
+          //cout << bk.name << " has delays: ";//<< bk.read_delays << endl;
+          //cout << tab(1);
+          //for (int dl: bk.read_delays) {
+          //cout << dl << "," ;
+          //}
+          //cout << endl;
+          //for (auto dl: bk.delay_map) {
+          //cout <<tab(1)<< dl.first << ":" << dl.second <<endl; ;
+          //}
+
+          //}
+ //     }
     }
   }
 
@@ -1614,7 +1698,7 @@ void generate_code_prefix(CodegenOptions& options,
         break;
       }
     }
-    cout << pt_name << endl;
+    //cout << pt_name << endl;
     auto pt_map = to_map(access_map.at(pt_name));
     auto pt_range = range(pt_map);
     Box ret;
diff --git a/ubuffer.h b/ubuffer.h
index 0d85bc3cc..77edb9767 100644
--- a/ubuffer.h
+++ b/ubuffer.h
@@ -615,7 +615,7 @@ class UBuffer {
     std::map<string, isl_union_map*> schedule;
     std::map<string, vector<string> > port_bundles;
 
-    map<pair<string, string>, stack_bank > stack_banks;
+    map<pair<string, string>, std::vector<stack_bank> > stack_banks;
     map<string, selector> selectors;
 
     //lowering ubuffer to memtile
@@ -916,9 +916,12 @@ class UBuffer {
     }
 
     bank get_bank(const std::string& name) const {
+     cout << "bank name in get_bank " << name << endl;
       for (auto b : stack_banks) {
-        if (b.second.name == name) {
-          return b.second;
+        for (auto b_ : b.second) {
+          if (b_.name == name) {
+            return b_;
+          }
         }
       }
       cout << "Error: No such bank as: " << name << endl;
@@ -928,8 +931,10 @@ class UBuffer {
 
     string get_bank_input(const std::string& name) const {
       for (auto b : stack_banks) {
-        if (b.second.name == name) {
-          return b.first.first;
+        for (auto b_ : b.second) {
+          if (b_.name == name) {
+            return b.first.first;
+          }
         }
       }
       cout << "Error: No such bank as: " << name << endl;
@@ -940,8 +945,10 @@ class UBuffer {
     std::set<string> get_bank_inputs(const std::string& name) const {
       std::set<string> ret;
       for (auto b : stack_banks) {
-        if (b.second.name == name) {
-          ret.insert(b.first.first);
+        for (auto b_ : b.second) {
+          if (b_.name == name) {
+            ret.insert(b.first.first);
+          }
         }
       }
       return ret;
@@ -950,28 +957,36 @@ class UBuffer {
     std::set<string> get_bank_outputs(const std::string& name) const {
       std::set<string> ret;
       for (auto b : stack_banks) {
-        if (b.second.name == name) {
-          ret.insert(b.first.second);
+        for (auto b_ : b.second) { // each (input, output) key now maps to a vector of banks, so every element is checked
+          if (b_.name == name) {
+            ret.insert(b.first.second); // second component of the key is the output port; the set collapses repeated hits for one key
+          }
         }
       }
       return ret;
     }
 
     void replace_bank(stack_bank& target, stack_bank& replacement) {
+      // Overwrites stored banks whose name equals target.name with a copy of `replacement`.
       for (auto bnk : stack_banks) {
-        if (bnk.second.name == target.name) {
-          stack_banks[bnk.first] = replacement;
-          break;
+        for (int i = 0; i < bnk.second.size(); i++) {
+          auto b_ = bnk.second[i];
+          if (b_.name == target.name) {
+             stack_banks[bnk.first][i] = replacement;
+             // `bnk` is a by-value copy of the map entry, hence the write goes through stack_banks itself.
+            break; // NOTE(review): exits only the per-key loop; later keys are still scanned, whereas the removed code stopped at the first match overall -- confirm this is intended
+          }
         }
       }
     }
 
+    // Removes every (input, output) entry whose output port equals pt_name, together with all banks stored under it.
     void remove_bank(string pt_name) {
-        map<pair<string, string>, bank> replace;
+        map<pair<string, string>, std::vector<stack_bank>> replace; // filtered copy that becomes the new stack_banks
         for (auto bnk : stack_banks) {
-            if (bnk.first.second != pt_name) {
-                replace.insert(bnk);
-            }
+          if (bnk.first.second != pt_name) { // key is (input port, output port); keep entries for every other output port
+              replace.insert(bnk);
+          }
         }
         stack_banks = replace;
     }
@@ -986,34 +1001,54 @@ class UBuffer {
     vector<stack_bank> get_banks() {
       vector<stack_bank> bnk;
       std::set<string> done;
-      for (auto bs : stack_banks) {
-        if (!elem(bs.second.name, done)) {
-          bnk.push_back(bs.second);
-          done.insert(bs.second.name);
+      for (auto b : stack_banks) { // visits every (input, output) key; `b` is a copy of the entry
+        for (auto b_ : b.second) { // every bank stored under that key
+          cout << "bank name " << b_.name << endl; // printed for every stored bank on every call, including names already collected
+          if (!elem(b_.name, done)){ // keep only the first bank seen for each distinct name
+            bnk.push_back(b_);
+            done.insert(b_.name);
+          }
         }
       }
       return bnk;
     }
 
     void add_bank_between(const std::string& inpt, const std::string& outpt, stack_bank& bank) {
-      stack_banks[{inpt, outpt}] = bank;
+      // Appends a copy of `bank` to the banks recorded for the (inpt, outpt)
+      // port pair. Earlier banks for the same pair are kept, so one pair can
+      // hold several banks.
+      //
+      // operator[] creates an empty vector the first time a pair is seen, so
+      // the "pair already present" and "new pair" cases share one statement,
+      // and the stored vector is extended in place rather than being copied
+      // out, modified, and assigned back.
+      //
+      // inpt / outpt: names of the input and output port forming the map key.
+      stack_banks[{inpt, outpt}].push_back(bank);
     }
 
+    // Returns true when stack_banks holds an entry keyed by exactly (inpt, outpt); the entry's bank list is not inspected.
     bool has_bank_between(const std::string& inpt, const std::string& outpt) const {
       for (auto bs : stack_banks) {
         if (bs.first.first == inpt && bs.first.second == outpt) {
           return true;
         }
+       
       }
 
       return false;
     }
 
+    // returns name of ONE bank in between given input and output ports, even
+    // if there are multiple banks -- may want to consider extending this to
+    // returning ALL banks between given input and output port
     string bank_between(const std::string& inpt, const std::string& outpt) const {
 
       for (auto bs : stack_banks) {
         if (bs.first.first == inpt && bs.first.second == outpt) {
-          return bs.second.name;
+          auto first_bank = bs.second[0];
+          cout << "first bank name " << first_bank.name << endl;
+          return first_bank.name;
         }
       }
 
@@ -1022,23 +1057,51 @@ class UBuffer {
       return "";
     }
 
+   // Returns the names of all banks stored for the (inpt, outpt) port pair, in stored order.
+   std::vector<string> banks_between(const std::string& inpt, const std::string& outpt) const {
+      // Keyed lookup: avoids copying every map entry (each carries a vector of banks) just to compare its key.
+      const auto found = stack_banks.find({inpt, outpt});
+      if (found == stack_banks.end()) {
+        cout << "Error: No bank between: " << inpt << " and " << outpt << endl;
+        assert(false);
+        return {""};  // only reached when assertions are compiled out
+      }
+      std::vector<string> bank_names;
+      for (const auto& b_ : found->second) {
+        bank_names.push_back(b_.name);
+      }
+      return bank_names;
+    }
+
+
     bank get_bank_between(const std::string& inpt, const std::string& outpt) const {
       string bk_name = bank_between(inpt, outpt);
       return get_bank(bk_name);
     }
+ 
+   std::vector<stack_bank> get_banks_between(const std::string& inpt, const std::string& outpt) const {
+     // Collect, by name, every bank registered for the (inpt, outpt) port pair.
+     std::vector<bank> matches;
+     for (auto bank_name : banks_between(inpt, outpt)) {
+       matches.push_back(get_bank(bank_name));
+     }
+     return matches;
+  }
 
     vector<stack_bank> receiver_banks(const std::string& inpt) {
       vector<stack_bank> bnks;
       vector<string> done;
       for (auto bs : stack_banks) {
-        if (bs.first.first == inpt) {
+        for (auto b_ : bs.second) {
+          if (bs.first.first == inpt) {
+  
+            if (!elem(b_.name, done)) {
+              bnks.push_back(b_);
+              done.push_back(b_.name);
+            }
 
-          if (!elem(bs.second.name, done)) {
-            bnks.push_back(bs.second);
-            done.push_back(bs.second.name);
+            //assert(bnks.back().read_delays.size() == bs.second.read_delays.size());
           }
-
-          //assert(bnks.back().read_delays.size() == bs.second.read_delays.size());
         }
       }
       return bnks;
@@ -1085,7 +1148,8 @@ class UBuffer {
           for (auto outpt: get_out_ports()) {
               if (buf.has_bank_between(inpt, outpt)) {
                   stack_banks[make_pair(inpt, outpt)] =
-                      buf.get_bank_between(inpt, outpt);
+                      //{buf.get_bank_between(inpt, outpt)};
+                      buf.get_banks_between(inpt, outpt);
               }
           }
       }