diff --git a/test/unit/test_select_and_scatter.py b/test/unit/test_select_and_scatter.py
deleted file mode 100644
index e95beb2..0000000
--- a/test/unit/test_select_and_scatter.py
+++ /dev/null
@@ -1,76 +0,0 @@
-import pytest
-
-from nki_samples.reference.vision import select_and_scatter_kernel
-from neuronxcc.nki import benchmark, baremetal, simulate_kernel
-import neuronxcc.nki.language as nl
-import numpy as np
-
-bench_func = benchmark(warmup=5, iters=10)(select_and_scatter_kernel)
-
-np.random.seed(0)
-
-def cpu_golden_result(operand_tensor, source_tensor, window_dimensions=(3, 3), window_strides=(2, 2),padding=(1, 1)):
-    N, C, H, W = operand_tensor.shape  # batch, channel, height, width
-    sw_h, sw_w = window_dimensions  # set window dimensions to 3
-    stride_h, stride_w = window_strides # set window strides to 2
-    src_n, src_c, src_h, src_w = source_tensor.shape
-    padded_h = H + sum(padding)
-    padded_w = W + sum(padding)
-    assert N == src_n and C == src_c
-    assert (padded_h - sw_h) // stride_h + 1 == src_h
-    assert (padded_w - sw_w) // stride_w + 1 == src_w
-    assert H == W and src_h == src_w
-
-    assert operand_tensor.dtype == source_tensor.dtype
-    dtype = operand_tensor.dtype
-
-    output_shape = (N, C, H, W)
-
-    padded_operand_tensor = np.pad(operand_tensor, ((0, 0), (0, 0), padding, padding), 'constant')
-    output_tensor = np.zeros(output_shape, dtype)
-
-    for n in range(N):
-        for c in range(C):
-            for h in range(src_h):
-                for w in range(src_w):
-                    local_max_idx = np.argmax(padded_operand_tensor[n, c, h*stride_h:h*stride_h+sw_h, w*stride_w:w*stride_w+sw_w])
-                    local_h, local_w = local_max_idx // sw_w, local_max_idx % sw_w
-                    out_h = h * stride_h + local_h - padding[0]
-                    out_w = w * stride_w + local_w - padding[1]
-                    output_tensor[n, c, out_h, out_w] += source_tensor[n, c, h, w]
-    return output_tensor
-
-class TestSelectAndScatter:
-    @pytest.mark.parametrize("n, c, operand_h, operand_w, source_h, source_w, dtype, latency", [
- 	    [8, 64, 112, 112, 56, 56, np.float32, 4500],
- 	])
-    def test_select_and_scatter_for_perf(self, n, c, operand_h, operand_w, source_h, source_w, dtype, latency):
-        operand_dev = nl.static_cast(np.random.random_sample((n, c, operand_h, operand_w)), dtype)
-        source_dev = nl.static_cast(np.random.random_sample((n, c, source_h, source_w)), dtype)
-
-        bench_func(operand_dev, source_dev)
-        latency_res = bench_func.benchmark_result.nc_latency
-        p99 = latency_res.get_latency_percentile(50)
-
-    @pytest.mark.simulation
-    @pytest.mark.parametrize("n, c, operand_h, operand_w, source_h, source_w, dtype", [
- 	    [8, 64, 112, 112, 56, 56, np.float32],
- 	    [8, 64, 112, 112, 56, 56, nl.bfloat16],
- 	])
-    def test_select_and_scatter_for_numeric(self,simulation_only, n, c, operand_h, operand_w, source_h, source_w, dtype):
-        operand_dev = nl.static_cast(np.random.random_sample((n, c, operand_h, operand_w)), dtype)
-        source_dev = nl.static_cast(np.random.random_sample((n, c, source_h, source_w)), dtype)
-
-        sw = nl.static_cast(np.ndarray(shape=(n, c, source_h, source_w, 3, 3)), dtype)
-        operand_tensor = nl.static_cast(operand_dev, np.float32)
-        source_tensor = nl.static_cast(source_dev, np.float32)
-
-        numeric_func = baremetal(select_and_scatter_kernel)
-        if simulation_only:
-            output_dev = simulate_kernel(numeric_func, operand_dev, source_dev)
-        else:
-            output_dev = numeric_func(operand_dev, source_dev)
-        golden_result = cpu_golden_result(operand_tensor, source_tensor)
-        nki_result = nl.static_cast(output_dev, np.float32)
-
-        assert np.allclose(nki_result, golden_result, rtol=1e-2, atol=1e-2)