RandomCoder-lab
diff --git a/‎examples/lib/prometheus.omc‎
Lines changed: 220 additions & 0 deletions b/‎examples/lib/prometheus.omc‎
Lines changed: 220 additions & 0 deletions
diff --git a/‎examples/prometheus_tinylm.omc‎
Lines changed: 173 additions & 0 deletions b/‎examples/prometheus_tinylm.omc‎
Lines changed: 173 additions & 0 deletions
@@ -0,0 +1,220 @@
+# Prometheus — substrate-native ML composition layer.
+#
+# A pure-OMC framework built on the existing tape_* (reverse-mode autograd)
+# and arr_* (ML kernels + linalg) primitives. NOT a PyTorch wrapper — every
+# layer here is implemented in OMC and trains via the substrate's own
+# autograd engine.
+#
+# Status: MVP. Provides Linear + activations + MSE loss + SGD step.
+# Trained end-to-end in examples/prometheus_tinylm.omc.
+#
+# Strategic intent: be substrate-native for the things substrate uniquely
+# provides (alpha-rename-invariant content hashing of model weights,
+# harmonic-resonance modulated gradients, geodesic attention as a first-
+# class layer), while harvesting Python's correctness oracles via py_*
+# imports during development.
+#
+# Conventions:
+#   - A "module" is a dict carrying parameter tape-var IDs.
+#   - prom_*_new(...)         → constructs the module + registers params on the tape
+#   - prom_*_forward(mod, x)  → runs the forward pass; returns a tape node id
+#   - prom_collect_params(layers) → returns a flat list of every tape-var id in the given list of layer modules
+#   - The user manages tape_reset() once before constructing the model;
+#     param IDs persist across training steps as the forward graph rebuilds.
+
+# ---------------------------------------------------------------------------
+# Initialization helpers
+# ---------------------------------------------------------------------------
+
+# Random sampling helpers. OMC doesn't ship a real RNG; we use a tiny LCG.
+# Results are reproducible as long as the returned state is threaded into the next call.
+# Advance the LCG by one step: (state * 1103515245 + 12345) mod 2147483648.
+# Returns the new state.
+fn _prom_lcg_step(state) {
+    h scaled = state * 1103515245 + 12345;
+    return scaled % 2147483648;
+}
+
+# Draw one pseudo-random value from the LCG, scaled into [-bound, bound)
+# (for a non-negative input state).
+# Returns a dict: "state" = the advanced LCG state (feed it to the next draw),
+# "value" = the sample.
+fn _prom_uniform(state, bound) {
+    h next_state = _prom_lcg_step(state);
+    # next_state / 2^31 is the unit sample; stretch it to [-1, 1), then scale.
+    h sample = ((next_state / 2147483648.0) * 2.0 - 1.0) * bound;
+    h draw = dict_new();
+    dict_set(draw, "state", next_state);
+    dict_set(draw, "value", sample);
+    return draw;
+}
+
+# Build a rows x cols matrix of draws from _prom_uniform, filled in row-major
+# order, and register the WHOLE matrix on the tape as one tape_var.
+# Returns a dict: "node" = the tape-var id, "state" = the advanced rng state.
+fn _prom_random_matrix(rows, cols, bound, state) {
+    h grid = [];
+    h rng = state;
+    h r = 0;
+    while r < rows {
+        h cells = [];
+        h c = 0;
+        while c < cols {
+            h draw = _prom_uniform(rng, bound);
+            arr_push(cells, dict_get(draw, "value"));
+            rng = dict_get(draw, "state");   # thread the state into the next draw
+            c = c + 1;
+        }
+        arr_push(grid, cells);
+        r = r + 1;
+    }
+    h var_id = tape_var(grid);
+    h packed = dict_new();
+    dict_set(packed, "node", var_id);
+    dict_set(packed, "state", rng);
+    return packed;
+}
+
+# Zero-initialized bias: a 1 x cols matrix of 0.0 registered on the tape as a
+# single tape_var. Returns the tape-var id.
+fn _prom_zeros_row(cols) {
+    h zeros = [];
+    h filled = 0;
+    while filled < cols {
+        arr_push(zeros, 0.0);
+        filled = filled + 1;
+    }
+    return tape_var([zeros]);   # wrap the row so the shape is 1 x cols
+}
+
+# ---------------------------------------------------------------------------
+# Linear layer: y = x @ W + b
+# x: [batch, in_dim],  W: [in_dim, out_dim],  b: [1, out_dim]
+# ---------------------------------------------------------------------------
+
+# Construct a Linear layer and register its parameters on the tape.
+#   in_dim, out_dim: W is [in_dim, out_dim]; b is [1, out_dim], all zeros.
+#   rng_state:       LCG state used to draw the initial weights.
+# Returns a dict with "kind" ("linear"), "in_dim", "out_dim", the tape-var ids
+# "W" and "b", and "rng_state" (the LCG state left after drawing W, so the
+# caller can seed the next layer with it).
+fn prom_linear_new(in_dim, out_dim, rng_state) {
+    # Fixed init bound: weights are drawn from [-0.5, 0.5) regardless of layer
+    # size. NOTE(review): this is NOT the Xavier bound sqrt(6 / (in + out)).
+    h init_bound = 0.5;
+    # W is created before b; keep this order so tape registration is unchanged.
+    h weights = _prom_random_matrix(in_dim, out_dim, init_bound, rng_state);
+    h bias_id = _prom_zeros_row(out_dim);
+    h weight_id = dict_get(weights, "node");
+    h next_rng = dict_get(weights, "state");
+    h layer = dict_new();
+    dict_set(layer, "kind", "linear");
+    dict_set(layer, "in_dim", in_dim);
+    dict_set(layer, "out_dim", out_dim);
+    dict_set(layer, "W", weight_id);
+    dict_set(layer, "b", bias_id);
+    dict_set(layer, "rng_state", next_rng);
+    return layer;
+}
+
+# Forward pass of a Linear layer: tape_add(tape_matmul(x, W), b).
+# x_id is the tape node of the input; returns the tape node of the output.
+fn prom_linear_forward(layer, x_id) {
+    h projected = tape_matmul(x_id, dict_get(layer, "W"));
+    return tape_add(projected, dict_get(layer, "b"));
+}
+
+# Tape-var ids of this layer's trainable parameters, in the order [W, b].
+fn prom_linear_params(layer) {
+    return [dict_get(layer, "W"), dict_get(layer, "b")];
+}
+
+# ---------------------------------------------------------------------------
+# Activations — substrate-native (route to tape ops).
+# ---------------------------------------------------------------------------
+
+# ReLU activation: hands the tape node to the tape_relu primitive and returns
+# the resulting tape node.
+fn prom_relu(x_id) {
+    return tape_relu(x_id);
+}
+# Sigmoid activation: hands the tape node to the tape_sigmoid primitive and
+# returns the resulting tape node.
+fn prom_sigmoid(x_id) {
+    return tape_sigmoid(x_id);
+}
+
+# ---------------------------------------------------------------------------
+# Losses — MSE works with what the tape currently supports
+# (no exp/log on the tape, so true softmax+CE isn't yet here; using
+# sigmoid + MSE against one-hot target is a viable LM loss for tiny
+# scale and demonstrates substrate-native training).
+# ---------------------------------------------------------------------------
+
+# MSE loss between two tape nodes: tape_mean of the squared difference
+# (pred - target). Returns the tape node of the loss.
+fn prom_mse_loss(pred_id, target_id) {
+    h residual = tape_sub(pred_id, target_id);
+    return tape_mean(tape_mul(residual, residual));
+}
+
+# ---------------------------------------------------------------------------
+# Optimizer — substrate-native SGD step.
+# Calls tape_update on every parameter in the supplied flat list.
+# ---------------------------------------------------------------------------
+
+# One SGD update: calls tape_update(param, lr) for every tape-var id in
+# params, in list order. The tinylm example runs tape_backward on the loss
+# before calling this.
+fn prom_sgd_step(params, lr) {
+    h n_params = arr_len(params);
+    h k = 0;
+    while k < n_params {
+        tape_update(arr_get(params, k), lr);
+        k = k + 1;
+    }
+}
+
+# Variant: substrate-modulated SGD step. EXPERIMENTAL and not yet implemented:
+# today this behaves exactly like prom_sgd_step(params, lr).
+# Intended design: read each param's gradient and scale the update by
+# (1 + phi.res(grad_hash)), so gradients pointing toward Fibonacci attractors
+# get a small boost.
+# Rationale: an earlier finding in this project says the substrate metric on
+# float activations doesn't work as an attention modulator, but on GRADIENT
+# MAGNITUDES (integer-keyed via a hash of the grad bytes) the rule may differ.
+# Worth measuring once the baseline converges.
+fn prom_harmonic_sgd_step(params, lr) {
+    # TODO: read tape_grad(p), compute the substrate scaling factor, then
+    # call a (yet-to-add) tape_update_scaled(p, lr, scale). Until then,
+    # fall back to plain SGD.
+    prom_sgd_step(params, lr);
+}
+
+# ---------------------------------------------------------------------------
+# Helpers
+# ---------------------------------------------------------------------------
+
+# One-hot encode idx as a [1, vocab] tape_const: 1.0 in column idx, 0.0 in
+# every other column. An idx that matches no column in 0..vocab-1 yields an
+# all-zero row. Returns the tape node id.
+fn prom_one_hot(idx, vocab) {
+    h encoded = [];
+    h col = 0;
+    while col < vocab {
+        h cell_val = 0.0;
+        if col == idx { cell_val = 1.0; }
+        arr_push(encoded, cell_val);
+        col = col + 1;
+    }
+    return tape_const([encoded]);    # wrap the row so the shape is [1, vocab]
+}
+
+# Index of the largest entry in a logits row; ties resolve to the lowest
+# index. Accepts a flat 1D array (e.g. when tape_value collapses a 1xN
+# matrix) or a 2D matrix, in which case only row 0 is scanned.
+# The scanned row is expected to be non-empty.
+fn prom_argmax_row(logits) {
+    h scores = logits;
+    if arr_len(logits) > 0 {
+        h head = arr_get(logits, 0);
+        # An array as the first element means the input is 2D: scan row 0.
+        if type_of(head) == "array" { scores = head; }
+    }
+    h n_scores = arr_len(scores);
+    h top_idx = 0;
+    h top_val = arr_get(scores, 0);
+    h k = 1;
+    while k < n_scores {
+        h cand = arr_get(scores, k);
+        # Strict comparison keeps the earliest index on ties.
+        if cand > top_val {
+            top_idx = k;
+            top_val = cand;
+        }
+        k = k + 1;
+    }
+    return top_idx;
+}
+
+# ---------------------------------------------------------------------------
+# Convenience: collect parameters from a list of layers.
+# ---------------------------------------------------------------------------
+
+# Flatten the parameter ids of every layer in `layers` into one list, in
+# layer order. Only layers whose "kind" is "linear" contribute; a layer of
+# any other kind is skipped without error.
+fn prom_collect_params(layers) {
+    h flat = [];
+    h n_layers = arr_len(layers);
+    h l_idx = 0;
+    while l_idx < n_layers {
+        h current = arr_get(layers, l_idx);
+        if dict_get(current, "kind") == "linear" {
+            h layer_ids = prom_linear_params(current);
+            h p_idx = 0;
+            while p_idx < arr_len(layer_ids) {
+                arr_push(flat, arr_get(layer_ids, p_idx));
+                p_idx = p_idx + 1;
+            }
+        }
+        l_idx = l_idx + 1;
+    }
+    return flat;
+}
@@ -0,0 +1,173 @@
+# Prometheus tiny LM — substrate-native bigram next-char prediction.
+#
+# A 2-layer MLP trained via OMC's reverse-mode autograd (no PyTorch).
+# Predicts P(next_char | current_char) on a tiny ABA-pattern corpus.
+# Loss must decrease over training steps; final predictions must
+# show the model learned the bigram structure.
+#
+# This is the MVP that proves the Prometheus thesis: you can train a
+# neural language model in pure OMC, using only substrate-native
+# autograd and ML kernels.
+#
+# Architecture:
+#   input  = one-hot of current char  [1, vocab]
+#   hidden = relu(input @ W1 + b1)    [1, hidden]
+#   output = hidden @ W2 + b2         [1, vocab]   (logits, MSE-trained
+#                                                   against one-hot target)
+#
+# Training:
+#   - SGD with substrate-native tape_update
+#   - MSE loss against one-hot next-char target
+#   - 200 steps over a synthetic bigram corpus
+#
+# Stop condition (substrate-clear): the loss recorded at the last training
+#   step (step 199) must be strictly lower than the loss at step 0, and the
+#   final argmax prediction must be correct for at least one bigram.
+
+import "examples/lib/prometheus.omc";
+
+# ---------------------------------------------------------------------------
+# Corpus + tokenization
+# ---------------------------------------------------------------------------
+
+# Build the toy training corpus.
+# Tiny alphabet, deterministic bigram pattern: a→b, b→c, c→a (cycle).
+# Train data is "abcabcabcabc...". A successful model will learn this
+# exact mapping (perfect predictability).
+# Returns a dict:
+#   "chars" - the alphabet; a token id is the char's index in this list
+#   "vocab" - alphabet size, derived from "chars"
+#   "ids"   - the text encoded as a list of token ids
+# Token ids are looked up in "chars" (first match wins), so the alphabet has a
+# single source of truth. A char that is not in the alphabet is skipped
+# instead of being silently encoded as id 0.
+fn make_corpus() {
+    h chars = ["a", "b", "c"];
+    h vocab = arr_len(chars);
+    h text = "abcabcabcabcabcabcabcabcabc";
+    h ids = [];
+    h i = 0;
+    while i < str_len(text) {
+        h ch = str_slice(text, i, i + 1);
+        # Linear search for ch in the alphabet.
+        h idx = 0;
+        h found = 0;
+        h j = 0;
+        while j < vocab {
+            if found == 0 && ch == arr_get(chars, j) {
+                idx = j;
+                found = 1;
+            }
+            j = j + 1;
+        }
+        if found == 1 { arr_push(ids, idx); }
+        i = i + 1;
+    }
+    h corpus = dict_new();
+    dict_set(corpus, "chars", chars);
+    dict_set(corpus, "vocab", vocab);
+    dict_set(corpus, "ids", ids);
+    return corpus;
+}
+
+# ---------------------------------------------------------------------------
+# Build model
+# ---------------------------------------------------------------------------
+
+# Assemble the 2-layer MLP: Linear(vocab -> hidden), then Linear(hidden -> vocab).
+# The LCG state left by the first layer seeds the second, so both layers
+# draw from one continuous random sequence.
+# Returns a dict holding the two layer modules under "L1" and "L2".
+fn build_model(vocab, hidden, rng_state) {
+    h in_layer = prom_linear_new(vocab, hidden, rng_state);
+    h carried_rng = dict_get(in_layer, "rng_state");
+    h out_layer = prom_linear_new(hidden, vocab, carried_rng);
+    h net = dict_new();
+    dict_set(net, "L1", in_layer);
+    dict_set(net, "L2", out_layer);
+    return net;
+}
+
+# Forward pass of the MLP: L2(relu(L1(x))). x_id is the input tape node;
+# returns the tape node of the output logits (no activation on the output).
+fn forward(model, x_id) {
+    h hidden_pre = prom_linear_forward(dict_get(model, "L1"), x_id);
+    h hidden_act = prom_relu(hidden_pre);
+    return prom_linear_forward(dict_get(model, "L2"), hidden_act);
+}
+
+# Flat list of every trainable tape-var id in the model: L1's params first,
+# then L2's.
+fn model_params(model) {
+    h layer_list = [dict_get(model, "L1"), dict_get(model, "L2")];
+    return prom_collect_params(layer_list);
+}
+
+# ---------------------------------------------------------------------------
+# Training loop
+# ---------------------------------------------------------------------------
+
+# End-to-end demo: build the corpus and model, train with online SGD for a
+# fixed number of steps, then report the loss change and the argmax
+# prediction for each vocab entry. Prints an [OK] / [FAIL] verdict; takes no
+# arguments and returns nothing.
+fn main() {
+    print("=== Prometheus tiny LM ===");
+    h corpus = make_corpus();
+    h vocab = dict_get(corpus, "vocab");
+    h ids = dict_get(corpus, "ids");
+    # Each training example is an adjacent (current, next) pair of ids.
+    h n_pairs = arr_len(ids) - 1;
+    print(concat_many("corpus pairs (current→next): ", to_string(n_pairs)));
+    print(concat_many("vocab: ", to_string(vocab)));
+
+    # The tape is reset once, before the parameters are registered; it is not
+    # reset again during training or inference.
+    # NOTE(review): every forward pass below adds nodes to the same tape —
+    # confirm the substrate tolerates this growth for longer runs.
+    tape_reset();
+    # Hidden width 8, LCG seed 42.
+    h model = build_model(vocab, 8, 42);
+    h params = model_params(model);
+    print(concat_many("trainable param tensors: ", to_string(arr_len(params))));
+
+    h lr = 0.05;
+    h steps = 200;
+    h loss_history = [];
+    h step = 0;
+    while step < steps {
+        # Online SGD: cycle through pairs deterministically.
+        h k = step % n_pairs;
+        h cur = arr_get(ids, k);
+        h nxt = arr_get(ids, k + 1);
+        h x = prom_one_hot(cur, vocab);
+        h target = prom_one_hot(nxt, vocab);
+        h pred = forward(model, x);
+        h loss = prom_mse_loss(pred, target);
+        tape_backward(loss);
+        prom_sgd_step(params, lr);
+        # NOTE(review): the loss is read after the parameter update;
+        # presumably it still holds the forward-pass value — confirm.
+        h lv = tape_value(loss);
+        arr_push(loss_history, lv);
+        # Log every 20th step and the final step.
+        if step % 20 == 0 || step == steps - 1 {
+            print(concat_many("step ", to_string(step), "  loss=", to_string(lv)));
+        }
+        step = step + 1;
+    }
+    print("");
+
+    # ---- Stop-condition checks ----
+
+    # first/last are single-example losses taken at step 0 and at the final
+    # step; the two steps may use different (current, next) pairs.
+    h first = arr_get(loss_history, 0);
+    h last = arr_get(loss_history, arr_len(loss_history) - 1);
+    print(concat_many("first loss: ", to_string(first)));
+    print(concat_many("final loss: ", to_string(last)));
+    # NOTE(review): divides by the final loss with no guard for last == 0.
+    h ratio = first / last;
+    print(concat_many("loss reduction ratio: ", to_string(ratio), "x"));
+
+    # Quality check: predict argmax for each bigram, count correct.
+    print("");
+    print("=== Inference: bigram predictions ===");
+    h chars = dict_get(corpus, "chars");
+    h correct = 0;
+    h total = 0;
+    h c = 0;
+    while c < vocab {
+        h x = prom_one_hot(c, vocab);
+        h pred = forward(model, x);
+        h logits = tape_value(pred);
+        h pred_idx = prom_argmax_row(logits);
+        # Expected next: (c+1) % vocab for the abcabc cycle
+        h expected = (c + 1) % vocab;
+        h ok = pred_idx == expected;
+        if ok { correct = correct + 1; }
+        total = total + 1;
+        h marker = "✗";
+        if ok { marker = "✓"; }
+        print(concat_many("  ", arr_get(chars, c), " → ", arr_get(chars, pred_idx),
+                          "  (expected ", arr_get(chars, expected), ") ", marker));
+        c = c + 1;
+    }
+    print("");
+    print(concat_many("argmax accuracy: ", to_string(correct), "/", to_string(total)));
+
+    # Final verdict: pass only if the loss strictly decreased AND at least one
+    # bigram was predicted correctly; otherwise report which check failed.
+    print("");
+    if last < first && correct >= 1 {
+        print("[OK] Prometheus end-to-end training works.");
+        print("     Loss decreased AND argmax predicted at least one bigram.");
+    } else {
+        print("[FAIL] Did not meet stop condition.");
+        if last >= first { print("       Loss did not decrease."); }
+        if correct == 0 { print("       Zero correct predictions."); }
+    }
+}
+
+# Script entry point: run the demo when this file is executed.
+main();