Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
80 changes: 80 additions & 0 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
name: CI

on:
push:
branches: [master]
pull_request:

# Scope note: this repo is a research/experiment record (a documented
# negative result). CI covers only what runs deterministically without
# special hardware or large downloads:
#
# * The Rust `cube-memory-host` crate's CPU reference path: build,
# clippy, and the `cpu` unit tests. These run on STABLE Rust.
# * A Python syntax check + ruff lint over the experiment scripts.
#
# Deliberately NOT in CI (and why):
# * The rust-gpu shader build (`cube-memory-shader{,-builder}`) needs
# the pinned nightly in `shaders/rust-toolchain.toml` plus rust-src /
# rustc-dev / llvm-tools — too heavy and version-fragile for hosted CI.
# * The GPU/CPU parity tests in `shaders/cube-memory-host/tests/parity.rs`
# require a Vulkan adapter and the prebuilt `.spv`; GitHub runners have
# no GPU.
# * The Python experiment scripts need PyTorch and local model
# checkpoints, so they are linted but not executed.

jobs:
rust:
name: Rust host crate (CPU path)
runs-on: ubuntu-latest
defaults:
run:
working-directory: shaders
steps:
- uses: actions/checkout@v4

# Use stable explicitly; the rust-toolchain.toml pins a nightly for
# the rust-gpu crates, but the host crate is plain stable Rust.
- name: Install stable toolchain
run: |
rustup toolchain install stable --component clippy --component rustfmt
rustup override set stable

- name: Cache cargo
uses: actions/cache@v4
with:
path: |
~/.cargo/registry
~/.cargo/git
shaders/target
key: cargo-${{ runner.os }}-${{ hashFiles('shaders/**/Cargo.toml') }}

- name: Build host crate
run: cargo build -p cube-memory-host

- name: Clippy (deny warnings)
run: cargo clippy -p cube-memory-host --all-targets -- -D warnings

- name: CPU reference unit tests
run: cargo test -p cube-memory-host --lib

python:
name: Python lint
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4

- uses: actions/setup-python@v5
with:
python-version: "3.11"

- name: Install ruff
run: pipx install ruff

- name: Syntax check (compile all)
run: python -m compileall -q phase0 phase1 rubik-gen *.py

- name: Ruff lint
# Non-blocking for now: the experiment scripts predate this lint
# config, so surface issues without failing the build.
run: ruff check . || true
20 changes: 17 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -70,14 +70,28 @@ will need editing for your environment.

```bash
cd shaders
cargo test # runs the FHRR algebra kernels on CPU (parity test)

# CPU reference unit tests — pure Rust, run on the stable toolchain,
# no GPU or prebuilt SPIR-V needed. These pin the FHRR/Cube-Memory
# reference algebra that the GPU parity tests compare against.
cargo test -p cube-memory-host --lib

# Full GPU/CPU parity tests — require a Vulkan adapter AND the shader
# binary built first via the rust-gpu nightly toolchain:
cargo run -p cube-memory-shader-builder --release
cargo test -p cube-memory-host --release # runs tests/parity.rs
```

See `shaders/README.md` for the rust-gpu toolchain details.
See `shaders/README.md` for the rust-gpu toolchain details (the pinned
nightly and components needed to build the SPIR-V).

## Limitations

- This is research/experiment code, not a library — no stable API, no packaging, no CI.
- This is research/experiment code, not a library — no stable API and no packaging.
CI (`.github/workflows/ci.yml`) covers only the deterministic, hardware-free
surface: the Rust host crate's CPU reference path (build + clippy + unit tests)
and a Python syntax check / ruff lint. The rust-gpu shader build and the GPU
parity tests are out of CI scope (nightly toolchain + Vulkan adapter required).
- Hardware-specific: numbers were measured on an AMD Radeon 890M (gfx1150) Vulkan build of
llama.cpp; the bandwidth and t/s figures are local measurements, not general benchmarks.
- The headline conclusion is negative; the layer does not match a linear baseline.
Expand Down
188 changes: 186 additions & 2 deletions shaders/cube-memory-host/src/cpu.rs
Original file line number Diff line number Diff line change
Expand Up @@ -104,13 +104,13 @@ pub fn cube_memory_retrieve(

// Compute all sims.
let mut sims = vec![0.0_f32; n_slots];
for j in 0..n_slots {
for (j, sim) in sims.iter_mut().enumerate() {
let row = j * d_key;
let mut s = 0.0_f32;
for i in 0..d_key {
s += query[i] * slot_keys[row + i];
}
sims[j] = s;
*sim = s;
}

// Top-k by descending sim. NOTE on tie-break semantics: CPU uses
Expand Down Expand Up @@ -149,3 +149,187 @@ pub fn cube_memory_retrieve(
}
out
}

#[cfg(test)]
mod tests {
//! Unit tests for the CPU reference primitives.
//!
//! These do not touch the GPU (unlike `tests/parity.rs`) — they pin
//! the algebraic behaviour of the reference path itself, so a future
//! refactor of `cpu.rs` cannot silently change the ground truth the
//! parity tests compare against. They build and run on stable Rust.

use super::*;

/// Builds `n` reproducible phasors `(cos φ, sin φ)` from `seed`.
///
/// The seed is scrambled once, then advanced by one wrapping
/// multiply-add (linear congruential) step per element. Each new state
/// is scaled to a phase in `[-π, π]`, so every output has unit modulus
/// up to `f32` rounding. The same `(n, seed)` always yields the same
/// vector, and a shorter request is a prefix of a longer one.
fn phasors(n: usize, seed: u32) -> Vec<Vec2> {
    use std::f32::consts::{PI, TAU};

    let mut lcg = seed.wrapping_mul(2654435761).wrapping_add(1);
    let mut out = Vec::with_capacity(n);
    for _ in 0..n {
        // Advance the generator before sampling so element 0 already
        // depends on the multiply-add step, not just the scrambled seed.
        lcg = lcg.wrapping_mul(1664525).wrapping_add(1013904223);
        let unit = lcg as f32 / u32::MAX as f32;
        let angle = unit * TAU - PI;
        out.push(Vec2::new(angle.cos(), angle.sin()));
    }
    out
}

/// Asserts that `a` and `b` have equal length and that every pair of
/// corresponding elements differs by strictly less than `eps` in both
/// the `x` and the `y` component.
///
/// Panics with the index and both offending values on the first
/// element that is out of tolerance.
fn assert_vec2_close(a: &[Vec2], b: &[Vec2], eps: f32) {
    assert_eq!(a.len(), b.len());
    for (i, (x, y)) in a.iter().zip(b).enumerate() {
        let dx = (x.x - y.x).abs();
        let dy = (x.y - y.y).abs();
        assert!(dx < eps && dy < eps, "mismatch at {i}: {x:?} vs {y:?}");
    }
}

#[test]
fn cmul_matches_complex_multiply() {
    // Worked example with (x, y) read as (re, im):
    // (1 + 2i)(3 + 4i) = (1*3 - 2*4) + (1*4 + 2*3)i = -5 + 10i
    let lhs = Vec2::new(1.0, 2.0);
    let rhs = Vec2::new(3.0, 4.0);
    let product = cmul(lhs, rhs);
    assert!((product.x + 5.0).abs() < 1e-6);
    assert!((product.y - 10.0).abs() < 1e-6);
}

#[test]
fn cconj_negates_imaginary() {
    // Conjugation keeps the real part (x) and flips the sign of the
    // imaginary part (y); the comparison is exact, not approximate.
    let conjugated = cconj(Vec2::new(3.0, -4.0));
    let expected = Vec2::new(3.0, 4.0);
    assert_eq!(conjugated, expected);
}

#[test]
fn bind_then_unbind_is_identity_for_unit_phasors() {
    // Round-trip property: binding a payload with a key and then
    // unbinding with the same key must give the payload back (to 1e-5).
    // Both operands come from `phasors`, i.e. they have unit modulus,
    // which is the precondition named in the test title — for such
    // keys k * conj(k) == |k|^2 == 1.
    let payload = phasors(64, 7);
    let key = phasors(64, 11);
    let roundtrip = fhrr_unbind(&fhrr_bind(&payload, &key), &key);
    assert_vec2_close(&payload, &roundtrip, 1e-5);
}

#[test]
fn bind_is_commutative() {
    // Swapping the two operands of bind must not change the result
    // (checked element-wise to 1e-6).
    let lhs = phasors(32, 1);
    let rhs = phasors(32, 2);
    let forward = fhrr_bind(&lhs, &rhs);
    let swapped = fhrr_bind(&rhs, &lhs);
    assert_vec2_close(&forward, &swapped, 1e-6);
}

#[test]
fn unitize_produces_unit_modulus() {
    // Inputs with modulus 5, modulus 10, and the degenerate zero vector.
    let input = vec![
        Vec2::new(3.0, 4.0),
        Vec2::new(-6.0, 8.0),
        Vec2::new(0.0, 0.0),
    ];
    let unitized = fhrr_unitize(&input);

    // Non-zero entries must come back with modulus 1.
    let first_len = unitized[0].length();
    let second_len = unitized[1].length();
    assert!((first_len - 1.0).abs() < 1e-5);
    assert!((second_len - 1.0).abs() < 1e-5);

    // The zero entry has no direction to normalise to; the only
    // requirement is that both components stay finite (no NaN / inf).
    let zero_case = unitized[2];
    assert!(zero_case.x.is_finite() && zero_case.y.is_finite());
}

#[test]
fn superpose_outputs_unit_modulus() {
    let n = 16;
    let k = 4;

    // Stack k independent phasor vectors of length n back to back; each
    // gets its own seed (100, 101, ...) so the inputs differ.
    let mut stacked: Vec<Vec2> = Vec::new();
    for i in 0..k {
        stacked.extend(phasors(n, 100 + i as u32));
    }

    let out = fhrr_superpose(&stacked, n, k);

    // One output element per position, each back on the unit circle.
    assert_eq!(out.len(), n);
    for z in &out {
        let modulus = z.length();
        assert!((modulus - 1.0).abs() < 1e-4);
    }
}

#[test]
fn cleanup_returns_exact_codebook_entry_for_self_query() {
    // A query that is itself a codebook row must be returned unchanged:
    // cleanup is expected to snap to the best-matching entry, and no
    // other row should beat the row the query was copied from.
    let m = 8;
    let d = 16;

    // m rows of d phasors each, stored contiguously, one seed per row.
    let mut codebook: Vec<Vec2> = Vec::new();
    for i in 0..m {
        codebook.extend(phasors(d, 200 + i as u32));
    }

    let target = 5usize;
    let row_start = target * d;
    let row_end = (target + 1) * d;
    let query = codebook[row_start..row_end].to_vec();

    let snapped = cube_memory_cleanup(&query, &codebook, m, d);
    assert_vec2_close(&snapped, &query, 1e-6);
}

#[test]
fn retrieve_softmax_weights_sum_to_one() {
    // What is actually asserted here: when one slot's similarity dwarfs
    // all the others, the retrieved vector is (to 1e-2) that slot's
    // value row. That only holds if the weights over the selected slots
    // are normalised, which is the property the test name refers to.
    // NOTE(review): the weights themselves are never inspected, so the
    // name is broader than the assertion.
    let n_slots = 8;
    let d_key = 4;
    let d_value = 3;
    let top_k = 4;
    let query = vec![1.0, 0.0, 0.0, 0.0];

    // Only the first key coordinate is non-zero, so each slot's dot
    // product with the query is just that coordinate: j / n_slots.
    let mut slot_keys = vec![0.0_f32; n_slots * d_key];
    for (j, key_row) in slot_keys.chunks_exact_mut(d_key).enumerate() {
        key_row[0] = j as f32 / n_slots as f32;
    }
    // Then make slot 0 dominant by overwriting its coordinate with 10.
    slot_keys[0] = 10.0;

    // Value rows are consecutive integers: row j = [3j, 3j + 1, 3j + 2].
    let slot_values: Vec<f32> = (0..n_slots * d_value).map(|x| x as f32).collect();

    let out = cube_memory_retrieve(
        &query,
        &slot_keys,
        &slot_values,
        n_slots,
        d_key,
        d_value,
        top_k,
    );
    assert_eq!(out.len(), d_value);

    // Compare against slot 0's value row, i.e. the first d_value values.
    for (i, (got, want)) in out.iter().zip(&slot_values).take(d_value).enumerate() {
        assert!((got - want).abs() < 1e-2, "coord {i}: {} vs {}", got, want);
    }
}

#[test]
fn retrieve_is_convex_combination() {
    // If the retrieval weights are non-negative and sum to one, the
    // output is a convex combination of value rows. Every output
    // coordinate must then fall inside the [min, max] range of that
    // coordinate taken over all slot value rows (with 1e-4 slack).
    let n_slots = 6;
    let d_key = 3;
    let d_value = 2;
    let top_k = 3;
    let query = vec![0.3, -0.7, 0.5];

    // Smooth, deterministic, mixed-sign test data.
    let key_at = |x: usize| ((x as f32) * 0.137).sin();
    let value_at = |x: usize| ((x as f32) * 0.91).cos() * 4.0;
    let slot_keys: Vec<f32> = (0..n_slots * d_key).map(key_at).collect();
    let slot_values: Vec<f32> = (0..n_slots * d_value).map(value_at).collect();

    let out = cube_memory_retrieve(
        &query,
        &slot_keys,
        &slot_values,
        n_slots,
        d_key,
        d_value,
        top_k,
    );

    for c in 0..d_value {
        // Column c of the row-major value matrix: start at offset c and
        // stride by the row width; track min and max in a single pass.
        let (lo, hi) = slot_values
            .iter()
            .skip(c)
            .step_by(d_value)
            .copied()
            .fold((f32::INFINITY, f32::NEG_INFINITY), |(lo, hi), v| {
                (lo.min(v), hi.max(v))
            });
        assert!(
            out[c] >= lo - 1e-4 && out[c] <= hi + 1e-4,
            "coord {c}: {} not in [{lo}, {hi}]",
            out[c]
        );
    }
}
}
12 changes: 9 additions & 3 deletions shaders/cube-memory-host/src/gpu.rs
Original file line number Diff line number Diff line change
Expand Up @@ -34,10 +34,12 @@ impl GpuCtx {
compatible_surface: None,
}))
.expect("no wgpu Vulkan adapter found");
let mut limits = wgpu::Limits::default();
// Each kernel uses a small push-constant block (12 bytes max
// across our six entry points). 32 is a safe ceiling.
limits.max_push_constant_size = 32;
let limits = wgpu::Limits {
max_push_constant_size: 32,
..Default::default()
};
let (device, queue) = pollster::block_on(adapter.request_device(
&wgpu::DeviceDescriptor {
label: Some("cube-memory-host"),
Expand Down Expand Up @@ -206,6 +208,10 @@ impl GpuCtx {
/// B's scratch is bound at `scratch_binding_b` (read-only). Pass
/// B's output is the last binding. Both passes share a single
/// push-constant struct.
// Many positional args by design: this is a test-only harness for
// two-pass kernels, and bundling them into a config struct would add
// boilerplate without improving the call sites in `tests/parity.rs`.
#[allow(clippy::too_many_arguments)]
pub fn run_pair<P, T>(
&self,
entry_a: &str,
Expand Down Expand Up @@ -273,7 +279,7 @@ impl GpuCtx {
let pl_a = self.device.create_pipeline_layout(&wgpu::PipelineLayoutDescriptor {
label: Some("cube-mem pl a"),
bind_group_layouts: &[&bgl_a],
push_constant_ranges: &[pcr.clone()],
push_constant_ranges: std::slice::from_ref(&pcr),
});
let pl_b = self.device.create_pipeline_layout(&wgpu::PipelineLayoutDescriptor {
label: Some("cube-mem pl b"),
Expand Down
8 changes: 4 additions & 4 deletions shaders/cube-memory-host/tests/parity.rs
Original file line number Diff line number Diff line change
Expand Up @@ -105,7 +105,7 @@ fn fhrr_bind_parity() {

let ctx = GpuCtx::new(&shader_path());
let push = FhrrBindPushConsts { n: n as u32 };
let groups = ((n as u32 + 63) / 64, 1, 1);
let groups = ((n as u32).div_ceil(64), 1, 1);
let gpu_out: Vec<Vec2> = ctx.run(
"fhrr_bind",
push,
Expand All @@ -127,7 +127,7 @@ fn fhrr_unbind_parity() {

let ctx = GpuCtx::new(&shader_path());
let push = FhrrBindPushConsts { n: n as u32 };
let groups = ((n as u32 + 63) / 64, 1, 1);
let groups = ((n as u32).div_ceil(64), 1, 1);
let gpu_out: Vec<Vec2> = ctx.run(
"fhrr_unbind",
push,
Expand All @@ -152,7 +152,7 @@ fn fhrr_unitize_parity() {

let ctx = GpuCtx::new(&shader_path());
let push = FhrrBindPushConsts { n: n as u32 };
let groups = ((n as u32 + 63) / 64, 1, 1);
let groups = ((n as u32).div_ceil(64), 1, 1);
let gpu_out: Vec<Vec2> = ctx.run(
"fhrr_unitize",
push,
Expand All @@ -179,7 +179,7 @@ fn fhrr_superpose_parity() {
n: n as u32,
k: k as u32,
};
let groups = ((n as u32 + 63) / 64, 1, 1);
let groups = ((n as u32).div_ceil(64), 1, 1);
let gpu_out: Vec<Vec2> = ctx.run(
"fhrr_superpose",
push,
Expand Down
Loading