code_intel: omc_code_diff + omc_code_metrics + 14 tests

RandomCoder-lab · claude · RandomCoder-lab · commit f2d6c02fdb2a · 2026-05-16T11:36:20.000-05:00
omc_code_diff(a, b) — structural diff between two programs after
canonicalization. Returns {added, removed, modified, unchanged} as
function-name arrays. Alpha-renames don't show up as modifications
because hashes are computed on the canonical form. The LLM's "what
did my edit actually change?" check.

omc_code_metrics(code) — one-shot bulk metrics:
  {complexity, ast_size, ast_depth, source_bytes, token_count,
   compression_ratio}. Saves N round-trips through MCP for the
common case where the LLM wants all stats at once.

14 new tests cover:
  - diff: identical / added / removed / modified / alpha-rename
  - metrics: all fields present, complexity ordering, token_count
    positive, compression_ratio positive
  - composition: diff-then-metrics workflow
  - summary classes/imports/stmt_count completeness
  - canonical idempotence (canonicalize(canonical) == canonical)

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
diff --git a/OMC_REFERENCE.md b/OMC_REFERENCE.md
@@ -2,9 +2,9 @@
 
 Auto-generated from `omnimcode-core/src/docs.rs`. Run `omc --gen-docs > OMC_REFERENCE.md` to regenerate.
 
-**Total documented builtins**: 538
+**Total documented builtins**: 540
 
-**OMC-unique**: 60 (no direct Python/NumPy equivalent — these are why you reach for OMC over numpy)
+**OMC-unique**: 61 (no direct Python/NumPy equivalent — these are why you reach for OMC over numpy)
 
 ---
 
@@ -25,7 +25,7 @@ Auto-generated from `omnimcode-core/src/docs.rs`. Run `omc --gen-docs > OMC_REFE
 - [exceptions](#exceptions) (2 builtins)
 - [introspection](#introspection) (22 builtins)
 - [tokenizer](#tokenizer) (16 builtins)
-- [code_intel](#code_intel) (14 builtins)
+- [code_intel](#code_intel) (16 builtins)
 - [math](#math) (58 builtins)
 - [dicts](#dicts) (26 builtins)
 - [test_runner](#test_runner) (8 builtins)
@@ -4555,6 +4555,26 @@ Hash blended with substrate-resonance of the hash itself — OMC-only dual-band
 omc_hbit_hash("h x = 1;")  // substrate-weighted int
 ```
 
+### `omc_code_diff` 🔱 *OMC-unique*
+
+**Signature**: `(a: string, b: string) -> dict`
+
+Structural diff between two programs (after canonicalization). {added, removed, modified, unchanged} as function-name arrays.
+
+```omc
+omc_code_diff(old, new)  // {modified: ["loss"], ...}
+```
+
+### `omc_code_metrics`
+
+**Signature**: `(code: string) -> dict`
+
+Bulk metrics: {complexity, ast_size, ast_depth, source_bytes, token_count, compression_ratio}. One call instead of N.
+
+```omc
+omc_code_metrics(src)  // all stats at once
+```
+
 ---
 
 ## math
diff --git a/examples/tests/test_code_intel_extras.omc b/examples/tests/test_code_intel_extras.omc
@@ -0,0 +1,118 @@
+# Additional code_intel coverage including diff + metrics.
+
+# Equality assertion helper: when `actual` differs from `expected`,
+# records a test failure whose text is `msg` followed by both values
+# rendered with to_string. Does nothing when the values are equal.
+fn assert_eq(actual, expected, msg) {
+    if actual != expected {
+        test_record_failure(msg + ": expected " + to_string(expected) + " got " + to_string(actual));
+    }
+}
+
+# Truthiness assertion helper: records `msg` as a test failure when `cond` is false.
+fn assert_true(cond, msg) { if !cond { test_record_failure(msg); } }
+
+# omc_code_diff
+# Diffing a program against an identical copy: nothing added, removed or
+# modified, and the single function `f` is listed as unchanged.
+fn test_diff_identical() {
+    h d = omc_code_diff(
+        "fn f(x) { return x; }",
+        "fn f(x) { return x; }"
+    );
+    assert_eq(arr_len(dict_get(d, "added")), 0, "no additions");
+    assert_eq(arr_len(dict_get(d, "removed")), 0, "no removals");
+    assert_eq(arr_len(dict_get(d, "modified")), 0, "no mods");
+    assert_eq(arr_len(dict_get(d, "unchanged")), 1, "one unchanged");
+}
+
+# The second program gains a function `g` that the first lacks, so the
+# "added" list must contain exactly one entry.
+fn test_diff_added() {
+    h d = omc_code_diff(
+        "fn f(x) { return x; }",
+        "fn f(x) { return x; } fn g(x) { return x; }"
+    );
+    assert_eq(arr_len(dict_get(d, "added")), 1, "g added");
+}
+
+# Mirror of the "added" case: `g` exists only in the first program, so
+# the "removed" list must contain exactly one entry.
+fn test_diff_removed() {
+    h d = omc_code_diff(
+        "fn f(x) { return x; } fn g(x) { return x; }",
+        "fn f(x) { return x; }"
+    );
+    assert_eq(arr_len(dict_get(d, "removed")), 1, "g removed");
+}
+
+# `f` keeps its name but its body changes (`x` -> `x + 1`), so it must
+# be reported in the "modified" list.
+fn test_diff_modified() {
+    h d = omc_code_diff(
+        "fn f(x) { return x; }",
+        "fn f(x) { return x + 1; }"
+    );
+    assert_eq(arr_len(dict_get(d, "modified")), 1, "f modified");
+}
+
+# Renaming the parameter (`x` -> `a`) without changing the body's shape
+# must count as unchanged, not modified.
+fn test_diff_alpha_rename_unchanged() {
+    h d = omc_code_diff(
+        "fn f(x) { return x; }",
+        "fn f(a) { return a; }"
+    );
+    assert_eq(arr_len(dict_get(d, "unchanged")), 1, "alpha-rename is unchanged");
+    assert_eq(arr_len(dict_get(d, "modified")), 0, "no actual mods");
+}
+
+# omc_code_metrics
+# The metrics dict for a non-empty program must expose all six
+# documented keys; only presence is checked here, not the values.
+fn test_metrics_has_all_fields() {
+    h m = omc_code_metrics("fn f(x) { return x; }");
+    assert_true(dict_has(m, "complexity"), "has complexity");
+    assert_true(dict_has(m, "ast_size"), "has ast_size");
+    assert_true(dict_has(m, "ast_depth"), "has ast_depth");
+    assert_true(dict_has(m, "source_bytes"), "has source_bytes");
+    assert_true(dict_has(m, "token_count"), "has token_count");
+    assert_true(dict_has(m, "compression_ratio"), "has compression_ratio");
+}
+
+# Ordering check: a function with two nested `if` branches must report
+# a strictly higher complexity than a straight-line function.
+fn test_metrics_complexity_grows() {
+    h simple = omc_code_metrics("fn f(x) { return x; }");
+    h branchy = omc_code_metrics("fn f(x) { if x > 0 { if x > 1 { return 1; } return 0; } return 0; }");
+    assert_true(dict_get(branchy, "complexity") > dict_get(simple, "complexity"),
+        "complexity orders");
+}
+
+# A non-empty program must report a token_count greater than zero.
+fn test_metrics_token_count_positive() {
+    h m = omc_code_metrics("fn f(x) { return x; }");
+    assert_true(dict_get(m, "token_count") > 0, "non-zero tokens");
+}
+
+# A non-empty program must report a compression_ratio greater than 0.0.
+fn test_metrics_compression_ratio_positive() {
+    h m = omc_code_metrics("fn f(x) { return x; }");
+    assert_true(dict_get(m, "compression_ratio") > 0.0, "non-zero ratio");
+}
+
+# Composition tests
+# Runs both builtins back to back on the same pair of programs. Only the
+# "added" list (for `g`) and a complexity floor of 1.0 on the new program
+# are asserted; the body change to `f` is not checked here.
+fn test_diff_then_metrics() {
+    # The basic LLM workflow: diff to see what changed, then metrics
+    # on the new version.
+    h old = "fn f(x) { return x; }";
+    h new = "fn f(x) { return x + 1; } fn g(x) { return x * 2; }";
+    h d = omc_code_diff(old, new);
+    assert_eq(arr_len(dict_get(d, "added")), 1, "added g");
+    h m = omc_code_metrics(new);
+    assert_true(dict_get(m, "complexity") >= 1.0, "metrics on new");
+}
+
+# Larger summary integration test
+# A program with one class and one function: the summary's "classes"
+# array must hold exactly the class name "Foo".
+fn test_summary_has_classes() {
+    h s = omc_code_summary("class Foo { x; y; } fn f() {}");
+    assert_eq(arr_len(dict_get(s, "classes")), 1, "one class");
+    assert_eq(arr_get(dict_get(s, "classes"), 0), "Foo", "class name");
+}
+
+# A single `import` statement must yield one entry in the summary's "imports" array.
+fn test_summary_imports() {
+    h s = omc_code_summary("import \"foo\"; fn main() {}");
+    assert_eq(arr_len(dict_get(s, "imports")), 1, "one import");
+}
+
+# Three top-level function definitions must be counted as stmt_count == 3.
+fn test_summary_stmt_count() {
+    h s = omc_code_summary("fn a() {} fn b() {} fn c() {}");
+    assert_eq(dict_get(s, "stmt_count"), 3, "three stmts");
+}
+
+# Canonicalization is idempotent: feeding canonical output back through
+# omc_code_canonical must return it unchanged.
+fn test_canonical_idempotent() {
+    h c1 = omc_code_canonical("fn f(x) { return x; }");
+    h c2 = omc_code_canonical(c1);
+    assert_eq(c1, c2, "canonical is idempotent");
+}
diff --git a/omnimcode-core/src/code_intel.rs b/omnimcode-core/src/code_intel.rs
@@ -391,6 +391,67 @@ pub fn substrate_fingerprint(source: &str) -> Result<i64, String> {
     tokenizer::crt_pack(&streams, &moduli)
 }
 
+/// Structural diff between two programs: which functions appear only
+/// in A, only in B, in both but with different bodies, or in both with
+/// the same body. Compared after canonicalization so renames don't show
+/// up as diffs.
+///
+/// Every field is a list of function names; [`diff`] sorts each list
+/// before returning.
+#[derive(Clone, Debug, Default)]
+pub struct CodeDiff {
+    /// Names present only in the second program (`b`).
+    pub added: Vec<String>,
+    /// Names present only in the first program (`a`).
+    pub removed: Vec<String>,
+    /// Names present in both programs whose canonical hashes differ.
+    pub modified: Vec<String>,
+    /// Names present in both programs whose canonical hashes are equal.
+    pub unchanged: Vec<String>,
+}
+
+/// Computes the function-level [`CodeDiff`] between programs `a` and `b`.
+///
+/// Both sources are summarised, then functions are matched by name and
+/// compared by canonical hash. A name found only in `a` is `removed`,
+/// only in `b` is `added`; a shared name is `unchanged` when the hashes
+/// match and `modified` otherwise. Each list is sorted before returning.
+///
+/// # Errors
+/// Returns the error from `summarise` if either source fails to summarise
+/// (`a` is processed first).
+pub fn diff(a: &str, b: &str) -> Result<CodeDiff, String> {
+    use std::collections::HashMap;
+
+    let before = summarise(a)?;
+    let after = summarise(b)?;
+
+    // name -> canonical hash lookups; a later definition with the same
+    // name overwrites an earlier one.
+    let mut before_hashes: HashMap<&str, i64> = HashMap::new();
+    for func in &before.functions {
+        before_hashes.insert(func.name.as_str(), func.canonical_hash);
+    }
+    let mut after_hashes: HashMap<&str, i64> = HashMap::new();
+    for func in &after.functions {
+        after_hashes.insert(func.name.as_str(), func.canonical_hash);
+    }
+
+    let mut report = CodeDiff::default();
+
+    // Classify everything defined in `a` by looking it up in `b`.
+    for func in &before.functions {
+        let bucket = match after_hashes.get(func.name.as_str()) {
+            Some(&other) if other == func.canonical_hash => &mut report.unchanged,
+            Some(_) => &mut report.modified,
+            None => &mut report.removed,
+        };
+        bucket.push(func.name.clone());
+    }
+
+    // Whatever `b` defines that `a` never mentioned is new.
+    report.added.extend(
+        after
+            .functions
+            .iter()
+            .filter(|func| !before_hashes.contains_key(func.name.as_str()))
+            .map(|func| func.name.clone()),
+    );
+
+    for names in [
+        &mut report.added,
+        &mut report.removed,
+        &mut report.modified,
+        &mut report.unchanged,
+    ] {
+        names.sort();
+    }
+    Ok(report)
+}
+
+/// Bulk metrics for `source`, gathered with a single call.
+///
+/// On success the returned map always contains these six keys:
+/// `complexity`, `ast_size`, `ast_depth`, `source_bytes`, `token_count`
+/// and `compression_ratio` (source bytes divided by token count).
+/// `complexity`, `ast_size` and `ast_depth` each come from their own
+/// helper call on the raw source.
+///
+/// # Errors
+/// Propagates the first error from `complexity`, `ast_size` or
+/// `ast_depth`, in that order.
+pub fn quick_metrics(source: &str) -> Result<std::collections::BTreeMap<String, f64>, String> {
+    let mut out = std::collections::BTreeMap::new();
+    out.insert("complexity".to_string(), complexity(source)? as f64);
+    out.insert("ast_size".to_string(), ast_size(source)? as f64);
+    out.insert("ast_depth".to_string(), ast_depth(source)? as f64);
+    let source_bytes = source.len() as f64;
+    out.insert("source_bytes".to_string(), source_bytes);
+    let token_count = crate::tokenizer::encode(source).len() as f64;
+    out.insert("token_count".to_string(), token_count);
+    // The divisor is clamped to at least 1.0, so no emptiness guard is
+    // needed: empty input gives 0.0 rather than a missing key, keeping
+    // the documented six-key shape for every successful call.
+    out.insert("compression_ratio".to_string(), source_bytes / token_count.max(1.0));
+    Ok(out)
+}
+
 #[cfg(test)]
 mod tests {
     use super::*;
diff --git a/omnimcode-core/src/docs.rs b/omnimcode-core/src/docs.rs
@@ -1059,6 +1059,20 @@ pub const BUILTINS: &[BuiltinDoc] = &[
         example: "omc_search_builtins(\"softmax\")  // [\"arr_softmax\"]",
         unique_to_omc: false,
     },
+    BuiltinDoc {
+        name: "omc_code_diff", category: "code_intel",
+        signature: "(a: string, b: string) -> dict",
+        description: "Structural diff between two programs (after canonicalization). {added, removed, modified, unchanged} as function-name arrays.",
+        example: "omc_code_diff(old, new)  // {modified: [\"loss\"], ...}",
+        unique_to_omc: true,
+    },
+    BuiltinDoc {
+        name: "omc_code_metrics", category: "code_intel",
+        signature: "(code: string) -> dict",
+        description: "Bulk metrics: {complexity, ast_size, ast_depth, source_bytes, token_count, compression_ratio}. One call instead of N.",
+        example: "omc_code_metrics(src)  // all stats at once",
+        unique_to_omc: false,
+    },
     // ---- Auto-generated docs for previously-undocumented builtins ----
     // Each entry covers one runtime builtin that lacked introspection.
     // Stubs are conservative — refine as you learn the actual signatures.
diff --git a/omnimcode-core/src/interpreter.rs b/omnimcode-core/src/interpreter.rs
@@ -7705,6 +7705,48 @@ impl Interpreter {
                     .collect();
                 Ok(Value::Array(HArray::from_vec(out)))
             }
+            "omc_code_diff" => {
+                // Builtin wrapper around code_intel::diff. Evaluates both
+                // arguments to their display strings, diffs them, and
+                // returns a dict with the keys added / removed /
+                // modified / unchanged, each an array of function names.
+                if args.len() < 2 {
+                    return Err("omc_code_diff requires (a, b)".to_string());
+                }
+                let old_src = self.eval_expr(&args[0])?.to_display_string();
+                let new_src = self.eval_expr(&args[1])?.to_display_string();
+                let report = crate::code_intel::diff(&old_src, &new_src)
+                    .map_err(|e| format!("omc_code_diff: {}", e))?;
+                // Shared conversion: a list of names becomes an array of
+                // string values.
+                let to_array = |names: Vec<String>| {
+                    Value::Array(HArray::from_vec(
+                        names.into_iter().map(Value::String).collect()
+                    ))
+                };
+                let mut fields = std::collections::BTreeMap::new();
+                fields.insert("added".to_string(), to_array(report.added));
+                fields.insert("removed".to_string(), to_array(report.removed));
+                fields.insert("modified".to_string(), to_array(report.modified));
+                fields.insert("unchanged".to_string(), to_array(report.unchanged));
+                Ok(Value::dict_from(fields))
+            }
+            "omc_code_metrics" => {
+                // Builtin wrapper around code_intel::quick_metrics: one
+                // call hands back every metric as a dict of floats, so
+                // callers going through the MCP server avoid a separate
+                // round-trip per metric.
+                if args.is_empty() {
+                    return Err("omc_code_metrics requires (code)".to_string());
+                }
+                let code = self.eval_expr(&args[0])?.to_display_string();
+                let metrics = crate::code_intel::quick_metrics(&code)
+                    .map_err(|e| format!("omc_code_metrics: {}", e))?;
+                // Wrap each f64 metric as an HFloat value under its key.
+                let fields: std::collections::BTreeMap<String, Value> = metrics
+                    .into_iter()
+                    .map(|(key, number)| (key, Value::HFloat(number)))
+                    .collect();
+                Ok(Value::dict_from(fields))
+            }
             "omc_search_builtins" => {
                 // Substring search across name + description. Returns
                 // matching names. Useful when you don't know what