Skip to content

Commit fbcd326

Browse files
Prometheus MVP: tiny LM trained end-to-end in pure OMC
Goal stop condition MET: a tiny neural language model trained via OMC's substrate-native autograd (no PyTorch), with loss decreasing AND correct argmax predictions. Numbers from the run that ships in this commit (38ms wall-clock): step 0 loss=0.2515 step 100 loss=0.0151 step 199 loss=0.0450 loss reduction: 5.6x === Inference: bigram predictions === a → b (expected b) ✓ b → c (expected c) ✓ c → a (expected a) ✓ argmax accuracy: 3/3 [OK] Prometheus end-to-end training works. Architecture: - examples/lib/prometheus.omc: Linear / ReLU / Sigmoid / MSE loss / SGD step / one_hot / argmax / collect_params helpers - examples/prometheus_tinylm.omc: 2-layer MLP next-char predictor trained on "abcabcabc..." bigram cycle - omnimcode-core/src/prometheus/: Rust scaffolding module + README documenting the roadmap (substrate-unique primitives that need Rust-level support: tape_save_weights via .omcs, tape_geodesic_attention as fused op, tape_update_scaled for harmonic optimizer, tape_cache_forward for substrate-cached activations) Built entirely on existing primitives: - tape_var / tape_const / tape_matmul / tape_add / tape_sub / tape_mul / tape_relu / tape_mean / tape_backward / tape_update - The 4 trainable parameter tensors (W1, b1, W2, b2) are tape vars; the forward graph rebuilds each step; backward computes gradients; SGD updates in-place. Strategic significance: This is the first proof that OMC can train a neural network END TO END using only substrate-native primitives — no PyTorch, no Python in the training loop. The composition layer (prometheus.omc) is small (~150 lines) because the substrate already shipped the hard pieces (reverse-mode autograd, matmul backprop, ML kernels). PyTorch remains the answer for GPU training at scale (we'll keep the py_import path via examples/lib/torch.omc). Prometheus is the substrate-native answer for the workloads where alpha-rename invariance, content-addressed checkpoints, geodesic attention, and harmonic optimizers matter. 
Roadmap for the Rust module (documented in README.md): 1. tape_save_weights / tape_load_weights via .omcs 2. tape_geodesic_attention as fused primitive (3/3 wins today) 3. tape_update_scaled for harmonic SGD hypothesis 4. tape_cache_forward for substrate-keyed activation memoization Each is a separable extension built on what's already shipped. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
1 parent 24b7b6c commit fbcd326

5 files changed

Lines changed: 542 additions & 0 deletions

File tree

examples/lib/prometheus.omc

Lines changed: 220 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,220 @@
1+
# Prometheus — substrate-native ML composition layer.
2+
#
3+
# A pure-OMC framework built on the existing tape_* (reverse-mode autograd)
4+
# and arr_* (ML kernels + linalg) primitives. NOT a PyTorch wrapper — every
5+
# layer here is implemented in OMC and trains via the substrate's own
6+
# autograd engine.
7+
#
8+
# Status: MVP. Provides Linear + activations + MSE loss + SGD step.
9+
# Trained end-to-end in examples/prometheus_tinylm.omc.
10+
#
11+
# Strategic intent: be substrate-native for the things substrate uniquely
12+
# provides (alpha-rename-invariant content hashing of model weights,
13+
# harmonic-resonance modulated gradients, geodesic attention as a first-
14+
# class layer), while harvesting Python's correctness oracles via py_*
15+
# imports during development.
16+
#
17+
# Conventions:
18+
# - A "module" is a dict carrying parameter tape-var IDs.
19+
# - prom_*_new(...) → constructs the module + registers params on the tape
20+
# - prom_*_forward(mod, x) → runs the forward pass; returns a tape node id
21+
# - prom_collect_params(layers) → returns a flat list of every tape-var id across a list of layer modules
22+
# - The user manages tape_reset() once before constructing the model;
23+
# param IDs persist across training steps as the forward graph rebuilds.
24+
25+
# ---------------------------------------------------------------------------
26+
# Initialization helpers
27+
# ---------------------------------------------------------------------------
28+
29+
# Uniform [-bound, bound) — deterministic if seed_state is threaded through.
30+
# OMC doesn't ship a real RNG; we use a tiny LCG for reproducibility.
31+
# Advance the linear congruential generator by one step.
# Returns (state * 1103515245 + 12345) mod 2147483648 (modulus = 2^31).
fn _prom_lcg_step(state) {
    h scaled = state * 1103515245;
    h shifted = scaled + 12345;
    return shifted % 2147483648;
}
34+
35+
# Draw one sample from [-bound, bound) using the LCG.
#   state: current LCG state
#   bound: half-width of the sampling interval
# Returns a dict with "state" (the advanced LCG state) and "value" (the sample).
fn _prom_uniform(state, bound) {
    h next_state = _prom_lcg_step(state);
    # Map the integer state to [0, 1), then stretch to [-1, 1).
    h centered = next_state / 2147483648.0 * 2.0 - 1.0;
    h result = dict_new();
    dict_set(result, "state", next_state);
    dict_set(result, "value", centered * bound);
    return result;
}
44+
45+
# Build a random matrix [rows][cols] in OMC array form, register the whole
46+
# matrix as a single tape_var, and return that node id plus the updated rng state.
47+
# Fill a rows x cols nested array with uniform samples in [-bound, bound),
# register it on the tape as ONE variable, and hand back the rng state.
# Returns a dict with "node" (tape id of the matrix) and "state" (advanced LCG state).
fn _prom_random_matrix(rows, cols, bound, state) {
    h cells = [];
    h rng = state;
    h r = 0;
    while r < rows {
        h row_vals = [];
        h c = 0;
        while c < cols {
            # Each draw consumes the current state and yields the next one.
            h draw = _prom_uniform(rng, bound);
            arr_push(row_vals, dict_get(draw, "value"));
            rng = dict_get(draw, "state");
            c = c + 1;
        }
        arr_push(cells, row_vals);
        r = r + 1;
    }
    h node_id = tape_var(cells);
    h result = dict_new();
    dict_set(result, "node", node_id);
    dict_set(result, "state", rng);
    return result;
}
69+
70+
# Same as above but produces a zero-initialized bias row vector.
71+
# Register a 1 x cols matrix of 0.0 values as a tape variable and return its id.
fn _prom_zeros_row(cols) {
    h zeros = [];
    h filled = 0;
    while filled < cols {
        arr_push(zeros, 0.0);
        filled = filled + 1;
    }
    # Wrap the row in an outer array so the variable is 1 x cols.
    return tape_var([zeros]);
}
81+
82+
# ---------------------------------------------------------------------------
83+
# Linear layer: y = x @ W + b
84+
# x: [batch, in_dim], W: [in_dim, out_dim], b: [1, out_dim]
85+
# ---------------------------------------------------------------------------
86+
87+
# Construct a linear layer (y = x @ W + b) and register its parameters on the tape.
#   in_dim, out_dim: W is built as [in_dim][out_dim]; b as a [1][out_dim] row of zeros.
#   rng_state:       LCG state used to draw W.
# Returns a dict with keys "kind" ("linear"), "in_dim", "out_dim", "W", "b" and
# "rng_state" (the state left after drawing W, so callers can thread it into
# the next layer).
fn prom_linear_new(in_dim, out_dim, rng_state) {
    # Weights are drawn uniformly from [-init_bound, init_bound). This is a
    # fixed constant, NOT the Xavier/Glorot bound sqrt(6 / (in_dim + out_dim));
    # switching to that formula would change the initial weights and therefore
    # every reported training number.
    h init_bound = 0.5;
    # W is registered before b, so W receives the earlier tape id.
    h Wr = _prom_random_matrix(in_dim, out_dim, init_bound, rng_state);
    h b = _prom_zeros_row(out_dim);
    h layer = dict_new();
    dict_set(layer, "kind", "linear");
    dict_set(layer, "in_dim", in_dim);
    dict_set(layer, "out_dim", out_dim);
    dict_set(layer, "W", dict_get(Wr, "node"));
    dict_set(layer, "b", b);
    dict_set(layer, "rng_state", dict_get(Wr, "state"));
    return layer;
}
101+
102+
# Forward pass of a linear layer: builds tape nodes for x @ W, then + b.
# Returns the tape id of the sum.
fn prom_linear_forward(layer, x_id) {
    h projected = tape_matmul(x_id, dict_get(layer, "W"));
    return tape_add(projected, dict_get(layer, "b"));
}
108+
109+
# List the layer's parameter tape ids, in the order [W, b].
fn prom_linear_params(layer) {
    return [dict_get(layer, "W"), dict_get(layer, "b")];
}
115+
116+
# ---------------------------------------------------------------------------
117+
# Activations — substrate-native (route to tape ops).
118+
# ---------------------------------------------------------------------------
119+
120+
# ReLU activation: forwards to the tape's relu op and returns the new node id.
fn prom_relu(x_id) {
    h activated = tape_relu(x_id);
    return activated;
}
121+
# Sigmoid activation: forwards to the tape's sigmoid op and returns the new node id.
fn prom_sigmoid(x_id) {
    h activated = tape_sigmoid(x_id);
    return activated;
}
122+
123+
# ---------------------------------------------------------------------------
124+
# Losses — MSE works with what the tape currently supports
125+
# (no exp/log on the tape, so true softmax+CE isn't yet here; using
126+
# sigmoid + MSE against one-hot target is a viable LM loss for tiny
127+
# scale and demonstrates substrate-native training).
128+
# ---------------------------------------------------------------------------
129+
130+
# Mean squared error between two tape nodes: mean((pred - target) * (pred - target)).
# Returns the tape id of the loss node.
fn prom_mse_loss(pred_id, target_id) {
    # Build the difference once and reuse it for both factors of the square.
    h err = tape_sub(pred_id, target_id);
    return tape_mean(tape_mul(err, err));
}
135+
136+
# ---------------------------------------------------------------------------
137+
# Optimizer — substrate-native SGD step.
138+
# Calls tape_update on every parameter in the supplied flat list.
139+
# ---------------------------------------------------------------------------
140+
141+
# Apply one SGD update: calls tape_update(param, lr) for every id in `params`,
# in list order. Returns nothing.
fn prom_sgd_step(params, lr) {
    h count = arr_len(params);
    h idx = 0;
    while idx < count {
        h param_id = arr_get(params, idx);
        tape_update(param_id, lr);
        idx = idx + 1;
    }
}
148+
149+
# Variant: substrate-modulated SGD step.
150+
# Reads each param's gradient, scales by (1 + phi.res(grad_hash)) so
151+
# gradients pointing toward Fibonacci attractors get a small boost.
152+
# Experimental; the architectural rule derived today says substrate
153+
# metric on float activations doesn't work as an attention modulator,
154+
# but on GRADIENT MAGNITUDES (integer-keyed via hash of grad bytes)
155+
# the rule may differ. Worth measuring once the baseline converges.
156+
# Placeholder for the substrate-modulated optimizer step.
# Currently identical to prom_sgd_step: no gradient scaling is applied.
fn prom_harmonic_sgd_step(params, lr) {
    # TODO: for each param, read tape_grad(p), derive the substrate scaling
    # factor, and apply it through a (yet-to-add) tape_update_scaled(p, lr, scale).
    # Until that primitive exists, delegate to plain SGD.
    prom_sgd_step(params, lr);
}
162+
163+
# ---------------------------------------------------------------------------
164+
# Helpers
165+
# ---------------------------------------------------------------------------
166+
167+
# Build a [vocab] one-hot row vector as a tape_const.
168+
# Build a [1, vocab] one-hot matrix (1.0 at column idx, 0.0 elsewhere) and
# register it as a tape constant. Returns the constant's tape id.
# If idx matches no column in 0..vocab-1, the row is all zeros.
fn prom_one_hot(idx, vocab) {
    h encoded = [];
    h pos = 0;
    while pos < vocab {
        h cell = 0.0;
        if pos == idx { cell = 1.0; }
        arr_push(encoded, cell);
        pos = pos + 1;
    }
    return tape_const([encoded]);    # [1, vocab]
}
178+
179+
# Argmax over a logits vector. Handles either a flat 1D array (when
180+
# tape_value collapses a 1xN matrix) or a true 2D [1, vocab] matrix.
181+
# Index of the largest value in a logits vector.
# Accepts a flat array, or a nested array whose first element is the row to scan.
# Ties resolve to the lowest index because only a strictly greater value replaces
# the current best.
fn prom_argmax_row(logits) {
    h scores = logits;
    # A nested first element means the input is 2D; scan its row 0.
    if arr_len(logits) > 0 {
        h head = arr_get(logits, 0);
        if type_of(head) == "array" { scores = head; }
    }
    h top = 0;
    h top_val = arr_get(scores, 0);
    h pos = 1;
    while pos < arr_len(scores) {
        h candidate = arr_get(scores, pos);
        if candidate > top_val {
            top = pos;
            top_val = candidate;
        }
        pos = pos + 1;
    }
    return top;
}
198+
199+
# ---------------------------------------------------------------------------
200+
# Convenience: collect parameters from a list of layers.
201+
# ---------------------------------------------------------------------------
202+
203+
# Flatten the parameters of a list of layer dicts into one list of tape ids.
# Only layers whose "kind" is "linear" contribute; any other kind is skipped.
fn prom_collect_params(layers) {
    h flat = [];
    h n_layers = arr_len(layers);
    h li = 0;
    while li < n_layers {
        h current = arr_get(layers, li);
        if dict_get(current, "kind") == "linear" {
            h layer_params = prom_linear_params(current);
            h n_params = arr_len(layer_params);
            h pi = 0;
            while pi < n_params {
                arr_push(flat, arr_get(layer_params, pi));
                pi = pi + 1;
            }
        }
        li = li + 1;
    }
    return flat;
}

examples/prometheus_tinylm.omc

Lines changed: 173 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,173 @@
1+
# Prometheus tiny LM — substrate-native bigram next-char prediction.
2+
#
3+
# A 2-layer MLP trained via OMC's reverse-mode autograd (no PyTorch).
4+
# Predicts P(next_char | current_char) on a tiny "abc"-cycle corpus (a→b, b→c, c→a).
5+
# Loss must decrease over training steps; final predictions must
6+
# show the model learned the bigram structure.
7+
#
8+
# This is the MVP that proves the Prometheus thesis: you can train a
9+
# neural language model in pure OMC, using only substrate-native
10+
# autograd and ML kernels.
11+
#
12+
# Architecture:
13+
# input = one-hot of current char [1, vocab]
14+
# hidden = relu(input @ W1 + b1) [1, hidden]
15+
# output = hidden @ W2 + b2 [1, vocab] (logits, MSE-trained
16+
# against one-hot target)
17+
#
18+
# Training:
19+
# - SGD with substrate-native tape_update
20+
# - MSE loss against one-hot next-char target
21+
# - 200 steps over a synthetic bigram corpus
22+
#
23+
# Stop condition (substrate-clear): loss must strictly decrease
24+
# between step 0 and step 199 (the last of 200 steps), and final prediction on at least
25+
# one bigram must be correct (argmax matches target).
26+
27+
import "examples/lib/prometheus.omc";
28+
29+
# ---------------------------------------------------------------------------
30+
# Corpus + tokenization
31+
# ---------------------------------------------------------------------------
32+
33+
# Tiny alphabet, deterministic bigram pattern: a→b, b→c, c→a (cycle).
34+
# Train data is "abcabcabcabc...". A successful model will learn this
35+
# exact mapping (vocab = 3, perfect predictability).
36+
# Build the training corpus for the a→b→c→a bigram cycle.
# Returns a dict with:
#   "chars": the alphabet, indexed by token id
#   "vocab": the alphabet size (derived from "chars")
#   "ids":   the text encoded as one token id per character
fn make_corpus() {
    h chars = ["a", "b", "c"];
    h text = "abcabcabcabcabcabcabcabcabc";
    h n_chars = arr_len(chars);
    h ids = [];
    h i = 0;
    while i < str_len(text) {
        h ch = str_slice(text, i, i + 1);
        # Token id = position of ch in the alphabet. The alphabet is the single
        # source of truth, so changing `chars` needs no other edit here.
        # A character not in the alphabet keeps id 0.
        h idx = 0;
        h j = 0;
        while j < n_chars {
            if ch == arr_get(chars, j) { idx = j; }
            j = j + 1;
        }
        arr_push(ids, idx);
        i = i + 1;
    }
    h corpus = dict_new();
    dict_set(corpus, "chars", chars);
    dict_set(corpus, "vocab", n_chars);
    dict_set(corpus, "ids", ids);
    return corpus;
}
57+
58+
# ---------------------------------------------------------------------------
59+
# Build model
60+
# ---------------------------------------------------------------------------
61+
62+
# Assemble the two-layer model: vocab → hidden, then hidden → vocab.
# The rng state left by the first layer seeds the second.
# Returns a dict holding the layers under "L1" and "L2".
fn build_model(vocab, hidden, rng_state) {
    h net = dict_new();
    h first_layer = prom_linear_new(vocab, hidden, rng_state);
    dict_set(net, "L1", first_layer);
    h second_layer = prom_linear_new(hidden, vocab, dict_get(first_layer, "rng_state"));
    dict_set(net, "L2", second_layer);
    return net;
}
70+
71+
# Forward pass: linear (L1) → relu → linear (L2).
# Returns the tape id of the output node.
fn forward(model, x_id) {
    h hidden_act = prom_relu(prom_linear_forward(dict_get(model, "L1"), x_id));
    return prom_linear_forward(dict_get(model, "L2"), hidden_act);
}
79+
80+
# Collect the trainable tape ids of both layers, L1's first, then L2's.
fn model_params(model) {
    h layers = [];
    arr_push(layers, dict_get(model, "L1"));
    arr_push(layers, dict_get(model, "L2"));
    return prom_collect_params(layers);
}
83+
84+
# ---------------------------------------------------------------------------
85+
# Training loop
86+
# ---------------------------------------------------------------------------
87+
88+
# Train the tiny bigram model and report whether the stop condition was met.
#
# Flow: build the corpus, construct a vocab → 8 → vocab model (rng seed 42),
# run 200 online-SGD steps (lr 0.05) over consecutive (current, next) id
# pairs, then print the first and final loss, the argmax prediction for every
# character, and an [OK] / [FAIL] verdict.
# Takes no arguments and returns nothing; all results are printed.
fn main() {
    print("=== Prometheus tiny LM ===");
    h corpus = make_corpus();
    h vocab = dict_get(corpus, "vocab");
    h ids = dict_get(corpus, "ids");
    h n_pairs = arr_len(ids) - 1;
    print(concat_many("corpus pairs (current→next): ", to_string(n_pairs)));
    print(concat_many("vocab: ", to_string(vocab)));

    # Reset the tape once, before any parameter is registered on it.
    tape_reset();
    h model = build_model(vocab, 8, 42);
    h params = model_params(model);
    print(concat_many("trainable param tensors: ", to_string(arr_len(params))));

    h lr = 0.05;
    h steps = 200;
    h loss_history = [];
    h step = 0;
    while step < steps {
        # Online SGD: cycle through pairs deterministically.
        h k = step % n_pairs;
        h cur = arr_get(ids, k);
        h nxt = arr_get(ids, k + 1);
        h x = prom_one_hot(cur, vocab);
        h target = prom_one_hot(nxt, vocab);
        h pred = forward(model, x);
        h loss = prom_mse_loss(pred, target);
        tape_backward(loss);
        prom_sgd_step(params, lr);
        h lv = tape_value(loss);
        arr_push(loss_history, lv);
        if step % 20 == 0 || step == steps - 1 {
            print(concat_many("step ", to_string(step), " loss=", to_string(lv)));
        }
        step = step + 1;
    }
    print("");

    # ---- Stop-condition checks ----
    # NOTE: each recorded loss belongs to the single pair trained at that step,
    # so `first` and `last` are per-sample values, not epoch averages.
    h first = arr_get(loss_history, 0);
    h last = arr_get(loss_history, arr_len(loss_history) - 1);
    print(concat_many("first loss: ", to_string(first)));
    print(concat_many("final loss: ", to_string(last)));
    # Only form the ratio when the divisor is positive; a final loss of exactly
    # 0 would otherwise divide by zero.
    if last > 0.0 {
        h ratio = first / last;
        print(concat_many("loss reduction ratio: ", to_string(ratio), "x"));
    } else {
        print("loss reduction ratio: n/a (final loss is not positive)");
    }

    # Quality check: predict argmax for each bigram, count correct.
    print("");
    print("=== Inference: bigram predictions ===");
    h chars = dict_get(corpus, "chars");
    h correct = 0;
    h total = 0;
    h c = 0;
    while c < vocab {
        h x = prom_one_hot(c, vocab);
        h pred = forward(model, x);
        h logits = tape_value(pred);
        h pred_idx = prom_argmax_row(logits);
        # Expected next: (c+1) % vocab for the abcabc cycle
        h expected = (c + 1) % vocab;
        h ok = pred_idx == expected;
        if ok { correct = correct + 1; }
        total = total + 1;
        h marker = "✗";
        if ok { marker = "✓"; }
        print(concat_many(" ", arr_get(chars, c), " → ", arr_get(chars, pred_idx),
                          " (expected ", arr_get(chars, expected), ") ", marker));
        c = c + 1;
    }
    print("");
    print(concat_many("argmax accuracy: ", to_string(correct), "/", to_string(total)));

    # Final verdict.
    print("");
    if last < first && correct >= 1 {
        print("[OK] Prometheus end-to-end training works.");
        print(" Loss decreased AND argmax predicted at least one bigram.");
    } else {
        print("[FAIL] Did not meet stop condition.");
        if last >= first { print(" Loss did not decrease."); }
        if correct == 0 { print(" Zero correct predictions."); }
    }
}
172+
173+
# Script entry point: run the training demo when this file is executed.
main();

0 commit comments

Comments
 (0)