diff --git a/.github/workflows/pages.yml b/.github/workflows/pages.yml index 63baab0e..51615af7 100644 --- a/.github/workflows/pages.yml +++ b/.github/workflows/pages.yml @@ -48,14 +48,6 @@ jobs: --git.origin="${GITHUB_SERVER_URL}/${GITHUB_REPOSITORY}" \ --output.html.path=site/index.html \ --output.json.path=site/report.json - - name: Compose the principle/metric doc corpus into ./site - # base/.md (fallback) + composed /.md (base βŠ• overlay), so a - # finding's `doc_url` resolves on Pages once `doc_base` points here. - # `.nojekyll` makes Pages serve the raw Markdown (Jekyll would render it), - # which is what the `remediation` "Download …" links and LLMs want. - run: | - cargo run -q -p code-ranker -- docs --out site - touch site/.nojekyll - uses: actions/configure-pages@v5 - uses: actions/upload-pages-artifact@v5 with: diff --git a/.gitignore b/.gitignore index 1693a71a..e1b84b0e 100644 --- a/.gitignore +++ b/.gitignore @@ -12,3 +12,9 @@ __pycache__/ *.egg-info/ dist/ build/ + +# BEGIN Constructor Studio +# Generated Constructor Studio runtime and agent integration files. +# Files matched here are owned by Constructor Studio and may be overwritten. 
+.cf-studio/ +# END Constructor Studio diff --git a/Cargo.lock b/Cargo.lock index 3ee9db7e..1a6e94ad 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -268,7 +268,7 @@ checksum = "c8d4a3bb8b1e0c1050499d1815f5ab16d04f0959b233085fb31653fbfc9d98f9" [[package]] name = "code-ranker" -version = "3.0.2" +version = "4.0.0" dependencies = [ "anyhow", "chrono", @@ -286,7 +286,7 @@ dependencies = [ [[package]] name = "code-ranker-graph" -version = "3.0.2" +version = "4.0.0" dependencies = [ "cel", "chrono", @@ -298,7 +298,7 @@ dependencies = [ [[package]] name = "code-ranker-plugin-api" -version = "3.0.2" +version = "4.0.0" dependencies = [ "anyhow", "chrono", @@ -309,7 +309,7 @@ dependencies = [ [[package]] name = "code-ranker-plugins" -version = "3.0.2" +version = "4.0.0" dependencies = [ "anyhow", "cargo_metadata", @@ -336,7 +336,7 @@ dependencies = [ [[package]] name = "code-ranker-viewer" -version = "3.0.2" +version = "4.0.0" dependencies = [ "anyhow", "code-ranker-graph", diff --git a/Cargo.toml b/Cargo.toml index de518e8c..7cd9c54b 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -3,7 +3,7 @@ members = ["crates/*"] resolver = "3" [workspace.package] -version = "3.0.2" +version = "4.0.0" edition = "2024" rust-version = "1.88" license = "Apache-2.0" @@ -12,10 +12,10 @@ keywords = ["dependency-graph", "coupling", "refactoring", "code-quality", "stat categories = ["development-tools", "command-line-utilities"] [workspace.dependencies] -code-ranker-graph = { path = "crates/code-ranker-graph", version = "3.0.2" } -code-ranker-plugin-api = { path = "crates/code-ranker-plugin-api", version = "3.0.2" } -code-ranker-plugins = { path = "crates/code-ranker-plugins", version = "3.0.2" } -code-ranker-viewer = { path = "crates/code-ranker-viewer", version = "3.0.2" } +code-ranker-graph = { path = "crates/code-ranker-graph", version = "4.0.0" } +code-ranker-plugin-api = { path = "crates/code-ranker-plugin-api", version = "4.0.0" } +code-ranker-plugins = { path = "crates/code-ranker-plugins", 
version = "4.0.0" } +code-ranker-viewer = { path = "crates/code-ranker-viewer", version = "4.0.0" } anyhow = "1.0" cel = "0.13" diff --git a/README.md b/README.md index 8a52abc7..88c292cb 100644 --- a/README.md +++ b/README.md @@ -2,16 +2,20 @@ [![CI](https://github.com/ffedoroff/code-ranker/actions/workflows/ci.yml/badge.svg)](https://github.com/ffedoroff/code-ranker/actions/workflows/ci.yml) [![codecov](https://codecov.io/gh/ffedoroff/code-ranker/branch/main/graph/badge.svg)](https://codecov.io/gh/ffedoroff/code-ranker) +[![code-ranker](https://img.shields.io/endpoint?url=https://api.code-ranker.com/badge/ffedoroff/cr-smoke-test.json)](https://reports.code-ranker.com/r/ffedoroff/cr-smoke-test/latest) [![Crates.io](https://img.shields.io/crates/v/code-ranker.svg)](https://crates.io/crates/code-ranker) [![npm](https://img.shields.io/npm/v/code-ranker.svg)](https://www.npmjs.com/package/code-ranker) [![PyPI](https://img.shields.io/pypi/v/code-ranker.svg)](https://pypi.org/project/code-ranker/) [![License](https://img.shields.io/crates/l/code-ranker.svg)](./LICENSE) -Structural-analysis tool for **Rust, Python, JavaScript and TypeScript** codebases. Built **AI-agent-friendly first** β€” finds where a project has structural problems and hands an actionable shortlist to a human or an AI agent for the actual refactor. +[![Website](https://img.shields.io/badge/website-code--ranker.com-1abc9c)](https://code-ranker.com) +[![Install the GitHub App](https://img.shields.io/badge/GitHub%20App-install-2c3e50?logo=github&logoColor=white)](https://github.com/apps/code-ranker-app/installations/new) + +Structural-analysis tool for **Rust** (production-ready) plus **Python, TypeScript/JavaScript, Go, C, C++, C# and Markdown** (beta) codebases. Built **AI-agent-friendly first** β€” finds where a project has structural problems and hands an actionable shortlist to a human or an AI agent for the actual refactor. 
**👉 Map your codebase's worst structural problems in 30 seconds — [jump to the Rust quick start](#rust-quick-start) and run it on your repo now.** -**Status:** pre-alpha. APIs and output shapes may change without notice. +**Status:** 4.0.0 — the Rust analyzer is production-ready; the other languages are beta, so their output shapes may still change. ## Rust quick start @@ -31,14 +35,14 @@ code-ranker always runs **entirely on your machine**. It makes **no network call ## AI agents friendly -**Hand your codebase to an AI agent and let it fix the worst spot.** code-ranker is built to feed work straight to an AI coding agent (Claude Code, Cursor, …). Attach the short playbook [docs/ai-skill.md](docs/ai-skill.md) to your agent's context — it teaches the agent which two metrics matter (dependency cycles `ADP`, coupling `HK`) and the exact fix loop (scorecard → snapshot → fix → re-check → before/after report). +**Hand your codebase to an AI agent and let it fix the worst spot.** code-ranker is built to feed work straight to an AI coding agent (Claude Code, Cursor, …). Run **`code-ranker docs ai`** in your repo — it prints a short, offline playbook (no network) that teaches the agent which two metrics matter (dependency cycles `ADP`, coupling `HK`) and the exact fix loop (scorecard → snapshot → fix → re-check → before/after report), tailored to your project's language. Then just ask, e.g.: -- *"Read `https://raw.githubusercontent.com/ffedoroff/code-ranker/main/docs/ai-skill.md`. Find the worst dependency cycle in this project and propose a refactor that breaks it — show me the plan before changing code."* -- *"Read `https://raw.githubusercontent.com/ffedoroff/code-ranker/main/docs/ai-skill.md`. Find the most complex / highest-HK file and analyze how to split it; explain what the split buys for me (lower coupling, smaller blast radius). 
Take a **before report**, apply the split, take an **after report**, and show me the **HTML diff**."* +- *"Run `code-ranker docs ai` and follow it: find the worst dependency cycle in this project and propose a refactor that breaks it — show me the plan before changing code."* +- *"Run `code-ranker docs ai` for the playbook, then find the most complex / highest-HK file and analyze how to split it; explain what the split buys for me (lower coupling, smaller blast radius). Take a **before report**, apply the split, take an **after report**, and show me the **HTML diff**."* -The agent drives the CLI itself — `ai-skill.md` already spells out the commands and the loop, so no glue is needed. +The agent drives the CLI itself — `code-ranker docs ai` spells out the commands and the loop, so no glue is needed. (Prefer a file in context? The same playbook lives at [docs/ai-skill.md](docs/ai-skill.md).) ## What it finds @@ -62,6 +66,8 @@ The linter is the `check` command — exits non-zero on any cycle or threshold v **Add it to your pipeline today** — one `code-ranker check` step stops new cycles and bloat from ever landing. +Prefer zero config? **[Install the GitHub App](https://github.com/apps/code-ranker-app/installations/new)** — it publishes a per-PR HTML structural report on every pull request, no workflow YAML to write. More at **[code-ranker.com](https://code-ranker.com)**. + ## Full CLI Written in Rust — fast, memory-safe, single static-ish binary with **no runtime dependencies** (no Python, no Node, no JVM, no shared libs to install). One file on PATH, done. @@ -106,7 +112,7 @@ code-ranker report code-ranker report . --baseline .code-ranker/before.json ``` -Built-in plugins: `rust` (cargo + syn), `python`, `javascript` (also handles TypeScript) — all compiled into the single binary, nothing to install. 
+Built-in plugins for all nine supported languages (`rust` uses cargo + syn; Rust is production-ready, the rest are beta) β€” all compiled into the single binary, nothing to install. ## Documentation diff --git a/code-ranker.toml b/code-ranker.toml index 9eb0f1a9..ce7a3023 100644 --- a/code-ranker.toml +++ b/code-ranker.toml @@ -1,3 +1,4 @@ +version = "4.0" # code-ranker.toml β€” project-level configuration for code-ranker. # Discovery order: --config PATH > ./code-ranker.toml > /code-ranker.toml > # Cargo.toml [workspace.metadata.code-ranker]. CLI flags always win over the file. diff --git a/contrib/prompt-eval-metrics.py b/contrib/prompt-eval-metrics.py new file mode 100755 index 00000000..a5185add --- /dev/null +++ b/contrib/prompt-eval-metrics.py @@ -0,0 +1,367 @@ +#!/usr/bin/env python3 +"""Compute one prompt-eval run's metrics and append a row to metrics.csv. + +The self-improvement loop (contrib/prompting-self-improve.md) scores each run on +quality / cost / clarity. The *objective* columns are mechanically extractable +from a run's artifacts; this script extracts them so a run is recorded the same +way every time instead of by hand. + +What it reads (all under one RUN_DIR = .../_/--/): + - chat.jsonl -> tool_calls, commands, input/output/cache tokens, wall_s, + api_duration_s, doc reads + rereads, first_edit_turn, + used_generated_prompt, focus_framing, discovery_retries, + (heuristic) tests_pass, planned_before_edit + - before/after.json -> focus_before/after, worst_before/after, new_cycles. + For a cycle FOCUS (ADP/cycle): cycle counts + worst SCC size + + new-cycle count. For a metric FOCUS (hk/sloc/cognitive/…): the + metric read off the module nodes β€” worst_* = the worst module's + value (the --top 1 target), focus_* = the project-wide sum (flat + total beside a dropped worst = coupling relocated, not dissolved), + new_cycles blank. 
+And, when --project-path is given, the PROJECT branch git diff -> files_changed, +loc_added, loc_removed (branch defaults to -, unique per prompt version). + +Token extraction is format-aware: a full Claude Code session log carries a +`result` event with authoritative cumulative usage + durations; a subagent log +has none, so usage is summed over assistant turns and api_duration_s is left +blank. cost_usd is the no-cache, no-discount API price (input*$5 + output*$25 +per MTok by default β€” Opus standard); it is comparable only across runs whose +input_tokens share an extraction basis (see the doc). + +The *subjective* columns (quality_1_5, clarity_1_5, collateral_delta, verdict, +notes) are not guessed β€” pass them as flags or fill them in later. The script +never overwrites an existing row; it appends. + +Usage: + prompt-eval-metrics.py RUN_DIR [--focus ADP] [--project user-provisioning] + [--project-path /abs/path --base-branch main] + [--quality N --clarity N --collateral N --verdict improved --notes "..."] + [--in-price 5 --out-price 25] [--dry-run] +""" + +import argparse +import csv +import json +import os +import re +import subprocess +import sys +from datetime import datetime + +COLUMNS = [ + "ts", "cr_sha", "project", "focus", "model", "iter", "run", + "tests_pass", "focus_before", "focus_after", "focus_delta", + "worst_before", "worst_after", "new_cycles", "collateral_delta", "quality_1_5", + "tool_calls", "commands", "input_tokens", "output_tokens", "cache_read_tokens", + "cost_usd", "wall_s", "api_duration_s", "files_changed", "loc_added", "loc_removed", + "read_doc_ai", "read_doc_focus", "doc_reread", "planned_before_edit", + "used_generated_prompt", "focus_framing", "first_edit_turn", "clarifying_qs", + "discovery_retries", "clarity_1_5", "verdict", "notes", +] + + +def load_jsonl(path): + out = [] + with open(path) as fh: + for line in fh: + line = line.strip() + if line: + try: + out.append(json.loads(line)) + except json.JSONDecodeError: + pass 
+ return out + + +def tool_uses(events): + """Yield (name, command_str, input_dict) for every tool_use, in order.""" + for o in events: + content = (o.get("message") or {}).get("content") + if isinstance(content, list): + for b in content: + if isinstance(b, dict) and b.get("type") == "tool_use": + inp = b.get("input", {}) or {} + yield b.get("name", ""), str(inp.get("command", "")), inp + + +def tool_results(events): + for o in events: + content = (o.get("message") or {}).get("content") + if isinstance(content, list): + for b in content: + if isinstance(b, dict) and b.get("type") == "tool_result": + c = b.get("content") + text = c if isinstance(c, str) else json.dumps(c) + yield bool(b.get("is_error")), text + + +def from_transcript(path, focus): + events = load_jsonl(path) + result = next((o for o in events if o.get("type") == "result"), None) + + names, cmds = [], [] + for name, cmd, _ in tool_uses(events): + names.append(name) + cmds.append((name, cmd)) + + m = {} + m["tool_calls"] = len(names) + m["commands"] = sum(1 for n, _ in cmds if n == "Bash") + + # tokens + durations: authoritative result event, else sum per assistant turn + if result: + u = result.get("usage", {}) or {} + m["output_tokens"] = u.get("output_tokens", "") + m["cache_read_tokens"] = u.get("cache_read_input_tokens", "") + m["input_tokens"] = ( + (u.get("input_tokens", 0) or 0) + + (u.get("cache_creation_input_tokens", 0) or 0) + + (u.get("cache_read_input_tokens", 0) or 0) + ) + m["wall_s"] = round((result.get("duration_ms") or 0) / 1000) or "" + m["api_duration_s"] = round((result.get("duration_api_ms") or 0) / 1000) or "" + else: + out = cr = inp = 0 + for o in events: + u = (o.get("message") or {}).get("usage") or {} + out += u.get("output_tokens", 0) or 0 + cr += u.get("cache_read_input_tokens", 0) or 0 + inp += ( + (u.get("input_tokens", 0) or 0) + + (u.get("cache_creation_input_tokens", 0) or 0) + + (u.get("cache_read_input_tokens", 0) or 0) + ) + m["output_tokens"], 
m["cache_read_tokens"], m["input_tokens"] = out, cr, inp + ts = [o["timestamp"] for o in events if o.get("timestamp")] + m["api_duration_s"] = "" + if len(ts) >= 2: + def parse(x): + return datetime.fromisoformat(x.replace("Z", "+00:00")) + try: + m["wall_s"] = round((parse(max(ts)) - parse(min(ts))).total_seconds()) + except ValueError: + m["wall_s"] = "" + else: + m["wall_s"] = "" + + # doc reads / rereads + docs = [] + for _, cmd in cmds: + if "docs " in cmd: + tail = cmd.split("docs ", 1)[1].split() + if tail: + docs.append(tail[0]) + m["read_doc_ai"] = 1 if any(d.lower() == "ai" for d in docs) else 0 + fl = (focus or "").lower() + aliases = {"adp", "cycle", "cycles"} if fl in CYCLE_FOCI else {fl} + m["read_doc_focus"] = 1 if any(d.lower() in aliases for d in docs) else 0 + m["doc_reread"] = len(docs) - len(set(docs)) + + # adherence + m["used_generated_prompt"] = 1 if any( + ("--output.prompt" in c) or ("--prompt " in c) or ("--prompt=" in c) for _, c in cmds + ) else 0 + framing = [] + if any("--focus cycle" in c for _, c in cmds): + framing.append("cycle") + if any(re.search(r"--focus\s+ADP", c, re.I) for _, c in cmds): + framing.append("ADP") + m["focus_framing"] = ",".join(framing) or "none" + + # first edit turn (1-based index among all tool calls) + edit_kinds = {"Edit", "Write", "MultiEdit", "NotebookEdit"} + m["first_edit_turn"] = next( + (i for i, n in enumerate(names, 1) if n in edit_kinds), "" + ) + + # clarity-ish counts + m["discovery_retries"] = sum(1 for is_err, _ in tool_results(events) if is_err) + m["clarifying_qs"] = sum(1 for n in names if n == "AskUserQuestion") + + # heuristic: tests pass if a green test line appears and no failure marker + joined = "\n".join(t for _, t in tool_results(events)) + passed = bool(re.search(r"test result: ok|\b0 failed\b|\d+ passed", joined)) + failed = bool(re.search(r"test result: FAILED|[1-9]\d* failed|FAILED\b", joined)) + m["tests_pass"] = 1 if (passed and not failed) else (0 if failed else "") + + # 
heuristic: planned before edit if assistant text precedes the first edit + m["planned_before_edit"] = 1 if m["first_edit_turn"] else "" + return m + + +def cycles(path): + """[(kind, size)] from a snapshot's cycles arrays.""" + found = [] + + def walk(o): + if isinstance(o, dict): + for k, v in o.items(): + if k == "cycles" and isinstance(v, list): + for c in v: + if isinstance(c, dict): + mem = c.get("members") or c.get("nodes") or c.get("modules") or [] + found.append((c.get("kind"), len(mem) if isinstance(mem, list) else 0)) + walk(v) + elif isinstance(o, list): + for v in o: + walk(v) + + with open(path) as fh: + walk(json.load(fh)) + return found + + +CYCLE_FOCI = {"adp", "cycle", "cycles"} + + +def node_metric(path, key): + """(values, direction) for one metric across internal (non-external) module nodes. + + `key` is a node attribute (e.g. `hk`, `sloc`, `cognitive`); `direction` comes from + the snapshot's node_attributes schema (`lower_better` / `higher_better` / None).""" + with open(path) as fh: + d = json.load(fh) + files = (d.get("graphs") or {}).get("files") or {} + vals = [n[key] for n in files.get("nodes") or [] + if not n.get("external") and isinstance(n.get(key), (int, float))] + direction = ((files.get("node_attributes") or {}).get(key) or {}).get("direction") + return vals, direction + + +def from_snapshots(run_dir, focus): + bj, aj = os.path.join(run_dir, "before.json"), os.path.join(run_dir, "after.json") + if not (os.path.exists(bj) and os.path.exists(aj)): + return {} + + if (focus or "").lower() in CYCLE_FOCI: + before, after = cycles(bj), cycles(aj) + sig = lambda cs: sorted((k, n) for k, n in cs) + bset = list(sig(before)) + new = [c for c in sig(after) if not (c in bset and bset.remove(c) is None)] + return { + "focus_before": sum(n for _, n in before), + "focus_after": sum(n for _, n in after), + "focus_delta": sum(n for _, n in after) - sum(n for _, n in before), + "worst_before": max((n for _, n in before), default=0), + "worst_after": 
max((n for _, n in after), default=0), + "new_cycles": len(new), + } + + # non-cycle metric focus (hk, sloc, cognitive, cyclomatic, fan_in, …): read the + # focused metric off the module nodes. worst_* = the worst module's value (the + # `--top 1` target); focus_* = the project-wide sum (a flat total beside a dropped + # worst = coupling *relocated*, not dissolved). new_cycles is N/A here. + key = (focus or "").lower().replace("-", "_") + bvals, direction = node_metric(bj, key) + avals, _ = node_metric(aj, key) + if not bvals and not avals: + return {} # unknown/absent metric β€” leave the columns blank rather than wrong + worst = min if direction == "higher_better" else max + rnd = (lambda x: round(x, 2)) if any(isinstance(v, float) for v in bvals + avals) else int + return { + "focus_before": rnd(sum(bvals)), + "focus_after": rnd(sum(avals)), + "focus_delta": rnd(sum(avals) - sum(bvals)), + "worst_before": rnd(worst(bvals)) if bvals else 0, + "worst_after": rnd(worst(avals)) if avals else 0, + "new_cycles": "", + } + + +def git_loc(project_path, branch, base): + try: + out = subprocess.run( + ["git", "-C", project_path, "diff", "--shortstat", f"{base}..{branch}"], + capture_output=True, text=True, check=True, + ).stdout + except subprocess.CalledProcessError: + return {} + fc = re.search(r"(\d+) files? 
changed", out) + add = re.search(r"(\d+) insertions?", out) + rem = re.search(r"(\d+) deletions?", out) + return { + "files_changed": int(fc.group(1)) if fc else "", + "loc_added": int(add.group(1)) if add else 0, + "loc_removed": int(rem.group(1)) if rem else 0, + } + + +def main(): + ap = argparse.ArgumentParser() + ap.add_argument("run_dir", help="the -- run folder") + ap.add_argument("--focus") + ap.add_argument("--project") + ap.add_argument("--project-path", help="external PROJECT repo, for loc/files") + ap.add_argument("--base-branch", default="main") + ap.add_argument("--branch", help="PROJECT branch (default: -)") + ap.add_argument("--in-price", type=float, default=5.0, help="USD per MTok input") + ap.add_argument("--out-price", type=float, default=25.0, help="USD per MTok output") + ap.add_argument("--quality", help="quality_1_5 (judged)") + ap.add_argument("--clarity", help="clarity_1_5 (judged)") + ap.add_argument("--collateral", help="collateral_delta (non-FOCUS principle Ξ”)") + ap.add_argument("--verdict") + ap.add_argument("--notes") + ap.add_argument("--csv", help="metrics.csv path (default: /metrics.csv)") + ap.add_argument("--dry-run", action="store_true") + args = ap.parse_args() + + run_dir = os.path.abspath(args.run_dir.rstrip("/")) + run = os.path.basename(run_dir) + build = os.path.basename(os.path.dirname(run_dir)) # _ + ts, _, sha = build.rpartition("_") + parts = run.split("-") + model = parts[0] if parts else "" + iteration = parts[-1] if len(parts) > 1 else "" + derived_focus = "-".join(parts[1:-1]) if len(parts) > 2 else "" + focus = args.focus or derived_focus + + row = {c: "" for c in COLUMNS} + row.update(ts=ts, cr_sha=sha, project=args.project or "", focus=focus, + model=model, iter=iteration, run=run) + + chat = os.path.join(run_dir, "chat.jsonl") + if not os.path.exists(chat): + sys.exit(f"no chat.jsonl in {run_dir}") + row.update(from_transcript(chat, focus)) + row.update(from_snapshots(run_dir, focus)) + + if args.project_path: 
+ # PROJECT branches are flat and live across every build, so the run id alone + # (--) collides between builds. Default to -: + # the cr_sha makes it unique per prompt version and ties the branch to this + # run's build dir. (Same-commit re-runs in a new build dir still need a + # bumped β€” see the playbook's naming rule.) + row.update(git_loc(args.project_path, args.branch or f"{run}-{sha}", args.base_branch)) + + if row.get("input_tokens") != "" and row.get("output_tokens") != "": + row["cost_usd"] = round( + row["input_tokens"] * args.in_price / 1e6 + + row["output_tokens"] * args.out_price / 1e6, 2 + ) + + for col, val in (("quality_1_5", args.quality), ("clarity_1_5", args.clarity), + ("collateral_delta", args.collateral), ("verdict", args.verdict), + ("notes", args.notes)): + if val is not None: + row[col] = val + + csv_path = args.csv or os.path.join(os.path.dirname(os.path.dirname(run_dir)), "metrics.csv") + + if args.dry_run: + print(f"# would append to {csv_path}") + for c in COLUMNS: + print(f"{c:22} {row[c]}") + return 0 + + new_file = not os.path.exists(csv_path) + with open(csv_path, "a", newline="") as fh: + w = csv.DictWriter(fh, fieldnames=COLUMNS) + if new_file: + w.writeheader() + w.writerow(row) + print(f"appended {run} to {csv_path}") + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/contrib/prompting-self-improve.md b/contrib/prompting-self-improve.md new file mode 100644 index 00000000..6c6eb364 --- /dev/null +++ b/contrib/prompting-self-improve.md @@ -0,0 +1,520 @@ +# Prompt self-improvement loop + +## Goal + +> **Self-improving prompts β€” and a playbook that improves itself.** + +code-ranker hands an AI agent a generated fix-prompt for every structural problem it +finds. How good the resulting fix is comes down to two things: the model, and the +prompt. We can't make every user run the most capable model, so the lever we own is +the **prompt**. 
This loop drives every prompt to the point where the *cheapest* model +produces the same fix the *most capable* one would — in fewer turns, because the +prompt told the agent exactly what it needed and nothing it didn't. + +Three objectives, optimized together: + +- **Quality** — a real structural fix, behaviour preserved, tests green — equal to the + reference model's. +- **Cost** — the agent reaches that fix in as few calls and tokens as possible. +- **Clarity** — the agent never guesses: it reads the prompt once and knows the plan, + which doc to read, and what "done" means. + +The loop is **closed on itself**. Each pass runs a real fix, measures the gap to the +reference, changes the smallest prompt lever that would have closed it, rebuilds, and +re-runs — until the cheapest tier matches the bar. And when the *process itself* +proves clumsy — a run that teaches nothing, a score that doesn't discriminate, a +lever that's hard to find — we edit **this file too**, the algorithm of +self-improvement. Both layers improve: the prompts agents read, and the procedure +that improves those prompts. + +Progress is **measured** ([Metrics](#metrics-metricscsv)), not felt: "better or +worse" between two prompt versions is a row-to-row comparison. End state: across every +`FOCUS`, the cheapest model matches the reference at minimum cost and maximum clarity, +and the playbook gets there with no manual babysitting. + +--- + +A repeatable way to **empirically tune the AI fix-prompts** so that *cheaper* +models still produce reference-quality fixes. The reference is the most capable +model; the goal is to lift each cheaper tier up to it by improving the prompt — +not by relying on the model. 
+ +Think of it as a function: + +``` +improve(PROJECT, FOCUS) # sweeps models, iterates the prompt +``` + +## Inputs (the variables) + +| Variable | Meaning | Examples | +|---|---|---| +| `MODEL` | the agent model under test, ordered **most → least** capable | `opus` → `sonnet` → `haiku` | +| `FOCUS` | what to fix — a principle **or** a metric, passed to `--focus` | `cycle` (ADP), `hk`, `sloc`, `cognitive`, `SRP`, … | +| `PROJECT` | an **external** repo (not code-ranker) with real, non-trivial instances of `FOCUS` | any sample/work repo | + +`MODEL_REF` = the first (most capable) model — the quality bar every cheaper model +is measured against. + +## What we tune (the levers) + +The prompt an agent sees is assembled from **embedded data**. To change it, edit one +of these and rebuild (see Setup) — all are baked into the binary: + +- **principle framing** — the `[[principles]]` `prompt` in + `crates/code-ranker-plugins/src/defaults.toml` (+ per-language overrides in + `crates/code-ranker-plugins/src/languages//config.toml`). +- **scaffolding** (intro / doc-note / task / focus prose) — + `crates/code-ranker-graph/metrics/prompt.md`. +- **the full reference doc** the agent reads via `docs ` — + `languages//.md` (e.g. `ADP.md`), and the offline entry point + `languages/base/AI.md` (`docs ai`). + +Change the **smallest** lever that fixes the observed failure. + +**Respect the base / per-language boundary.** Language-specific content (Rust +`pub(in …)`, a Python import idiom, …) belongs ONLY in `languages//` (its +`.md` doc) or the per-language `config.toml` prompt override — **never** in the +language-neutral `languages/base/AI.md` or the neutral `defaults.toml` prompt. When a +cheaper tier fails for want of a language-specific remedy, the base lever stays generic +("read `docs ` — it has the cause and smallest fix for *your* language") and +the specifics live in the per-language doc it points at. 
Putting a Rust example in +`base/` leaks into every other language's output. + +## Setup (once per prompt version) + +- **S1 — fresh build on PATH.** Release-build and install locally so the + `code-ranker` invoked by the agent is the current build: + `cargo build --release` (then `cargo install --path crates/code-ranker-cli`). +- **S2 — provenance commit + run id.** Commit code-ranker, so every report this + build generates carries the current version + commit + date. Then capture the + **short hash** — `CR_SHA=$(git -C rev-parse --short HEAD)`. It names + the artifact directory for this build (next section): every chat, report and JSON + is traceable to the exact build — i.e. the exact **prompt version** — that + produced it. + +Every prompt edit (a lever above) re-runs S1–S2 before the next sweep, yielding a +fresh `CR_SHA` → a fresh artifact directory. + +## The algorithm + +Two nested loops. The **inner** loop improves the prompts; the **outer, meta** loop +([below](#the-meta-loop--improving-this-playbook)) improves *this playbook* when the +process itself gets in the way. Both are driven by the same measured signals. + +**Drive the loop to its end — don't pause for permission between iterations, and don't +stop after a single pass.** A lever edit is only *half* a step: it is not done until its +rebuild (S1–S2) **and** its verifying re-run have scored the hypothesis against the +previous iteration. Stopping right after the edit leaves the loop unfinished and proves +nothing. Keep iterating (≤ 3 per model, then descend a tier) until the cheapest tier is +at the bar or the residual is recorded — that whole arc is one `improve(...)` call. 
+ +``` +for MODEL in models (most → least capable): # opus, then sonnet, then haiku… + loop (≤ 3 times): + R = run(PROJECT, FOCUS, MODEL) # one clean-context fix (below) + save artifacts(R) + measure R → metrics.csv # quality + cost + clarity (objective) + score R against MODEL_REF's best run for FOCUS + if R meets the bar on all three axes: # ref-quality AND few calls AND no guessing + break # this tier is good — lock it + else: + pick the SMALLEST prompt lever that explains the gap, by axis: + quality bad / shallow fix → principle framing, then the FOCUS doc + cost wasted turns: re-reads, dead → state up front what the prompt now + ends, rediscovered facts makes the agent discover; cut noise + clarity agent asked / back-tracked / → reword, reorder; put the decision + misread / read a doc twice first, name "done" explicitly + edit that lever, rebuild (S1–S2), re-run + # the edit is a HYPOTHESIS: the next run's metrics must show the targeted gap + # shrink vs the previous iteration — not just vs the reference — else revert it. + # META — when the LOOP itself misbehaved (a run that taught nothing, a signal that + # didn't discriminate, a lever you couldn't locate, an artifact you couldn't trace) + # fix the PROCESS: edit THIS file, commit it (→ new CR_SHA), continue. It's a lever too. + # descend to the next cheaper model and re-verify with the improved prompt +``` + +End state: across every `FOCUS`, the **cheapest** tier produces reference-quality +fixes at **minimum calls** and **maximum clarity** — and the playbook itself needed no +manual fixing to get there. Then repeat `improve(...)` for the next `FOCUS`. + +## A single run — `run(PROJECT, FOCUS, MODEL)` + +Let `RUN=/.code-ranker/prompt-eval/_/--` +— an **absolute** path into *this* repo's `.code-ranker/` (create it first). The +agent runs `code-ranker report .` inside `PROJECT`, but every `--output.*.path` +points at `$RUN`, so the evidence lands in code-ranker, not `PROJECT`. 
The agent's +**own** file writes (its plan file, any `report` it runs without an `--output` +override) still land in `PROJECT/.code-ranker/` — step 7 sweeps those into `$RUN`, so +nothing eval-related is left in `PROJECT`. + +1. **Clean start.** `PROJECT` on `main`, working tree clean. +2. **Fresh agent session**, model = `MODEL`, **empty context**. Bootstrap it with the + offline playbook only — no extra hints: have it read + `code-ranker docs ai` (overview + catalog) and `docs ` (the deep + doc). This is what a real user would do, so it tests the *prompt*, not your + coaching. +3. **BEFORE.** `code-ranker report . --output.html.path=$RUN/before.html --output.json.path=$RUN/before.json`. +4. **Save the focused prompt** (orchestrator, for the record): + `code-ranker report . --output.prompt.path=$RUN/prompt.md --focus --top 1` + — captures the exact fix-prompt this run used into `$RUN/prompt.md`, so prompt ↔ + behaviour stays correlatable across models. +5. **Fix** (agent). Ask the agent to fix the single worst (`--top 1`) cycle and **let it + work out how on its own** — which command to run, which doc to read, which refactor to + choose. Don't hand it the command: the run tests whether the prompt and docs lead it + there. The agent proposes the plan, applies the fix, and runs the project's tests. + **Verify at workspace scope, not just the touched crate.** A multi-crate workspace + (e.g. a Cargo workspace) needs BOTH `cargo check --workspace` AND + `cargo test --workspace --no-run` (build the test profile) before `tests_pass` is + trustworthy — a per-crate `cargo test -p ` passes green while the change still + breaks the workspace through a **feature-unified** path (a sibling crate enables a + feature that compiles code the standalone build skipped) or a **`#[cfg(test)]`** module + that only the test profile compiles. 
Both bit the cyberfabric-core sweep: a visibility + narrowing that the touched crate's own tests accepted left a downstream/feature-gated + reference pointing at a now-private item. +6. **AFTER + DIFF.** `code-ranker report . --baseline $RUN/before.json --output.html.path=$RUN/diff.html --output.json.path=$RUN/after.json` (+ an `after.html`). +7. **Collect the agent's own writes into `$RUN`.** The generated prompt tells the agent + to save a plan to `/.code-ranker/-.md`, and any `report` it runs + without an `--output` override also lands in `/.code-ranker/` β€” which is + **not** gitignored in a typical project. Move them into `$RUN/` (e.g. + `$RUN/agent-plan.md`) and clear `PROJECT/.code-ranker/`, so **all** eval evidence sits + under code-ranker's `prompt-eval/` and the `PROJECT` branch carries only the code + change. (This is also why the orchestrator must stage explicit paths, never + `git add -A`, when committing the fix.) +8. **Save the transcript** to `$RUN/chat.md` (see "Saving the chat"), commit the code + change to a branch named **identically to this run's build dir** β€” `_` + (e.g. `20260623T1849Z_dc06762`) β€” in `PROJECT`, then return to `main`. Branch name == + evidence-folder name, so code ↔ evidence line up by one identical string, and the UTC + `` makes every run's branch unique (no "bump ``"). Pass that exact branch name + to the collector via `--branch`. **Commit-msg gotcha:** if `PROJECT` has a + `prepare-commit-msg` hook that derives a ticket from the branch name, the `_` + branch carries none so the commit is rejected β€” and `--no-verify` does **not** skip + `prepare-commit-msg` (only pre-commit / commit-msg). Prefix the eval commit message with + a pseudo-ticket, e.g. `PROMPTEVAL-1: `. +9. 
**Measure.** Append one row to `prompt-eval/metrics.csv` with the collector — + don't hand-compute it (see [Metrics](#metrics-metricscsv) → Collecting a row): + + ```sh + contrib/prompt-eval-metrics.py $RUN --focus --project \ + --project-path PROJECT --quality <1-5> --clarity <1-5> --verdict improved + ``` + +### Sweeps — clearing every instance, not tuning the prompt + +A *sweep* (loop the agent over `--top 1` until a FOCUS hits zero) is a different mode from +a single tuning run, and it has its own failure shape: fixes **accumulate in one working +tree** across many passes, so a later pass silently breaks an earlier one and per-pass +verification compounds the debt. Rules learned from the cyberfabric-core cycle sweep +(20 cycles → 0 over 28 Haiku passes): + +- **Track net progress, not per-pass success.** Measure the FOCUS total (e.g. sum of cycle + members) each pass; **stop on no net decrease across 2 passes** (a stall / capability + ceiling) and cap total iterations. A pass that *fragments* a big SCC (breaks one back-edge, + leaves a smaller remnant) shows as flat cycle-count but falling members — that is progress; + full convergence is normal even when individual passes are partial. +- **Gate compilation at workspace scope between passes (or checkpoint).** Since the tree + accumulates, a per-crate-green pass can still break the workspace (feature-unified / + `cfg(test)` paths, see step 5). Either commit/checkpoint per pass for bisectability, or run + `cargo check --workspace` periodically — and always `cargo check --workspace` + + `cargo test --workspace --no-run` + a full test run at the **end** before declaring done. +- **Measure from artifacts, never the agent's summary.** Agents over-claim ("`fan_in` → 0", + "cycle gone") — re-derive the count from `report --output.json` each pass. 
+- **Cheap tier + iteration converges on cycles** even with fragmentation, because the cycle's + back-edges are concrete in the prompt's connections list; it does **not** converge on HK + hubs, where finding the high-value cut needs reasoning the cheapest tier lacks (the + capability ceiling — see the Tuning rule). + +## Artifacts: layout & naming + +Everything lives under the **code-ranker repo's own `.code-ranker/`** (this repo, +not `PROJECT`) — it's gitignored and is the project's keep-forever run area, so all +prompt-eval evidence is collected in one place across every `PROJECT` and model. The +external `PROJECT` only carries the **code change**, on its branch. All evidence for +one **build / prompt version** sits in a single dated folder; **keep everything — +never delete, the runs are the comparison corpus.** + +Layout (one build → one `_` folder → one subfolder per run): + +``` +/.code-ranker/ # THIS repo's dir, not PROJECT's +└─ prompt-eval/ + ├─ metrics.csv csv append-only — ONE row per run, ALL builds (comparison corpus) + └─ 20260623T1412Z_a660e36/ dir — _ + ├─ run.md md ~1 KB inputs: project, FOCUS, models, cr version+commit + ├─ results.md md ~2 KB the results-log rows for this build + ├─ opus-cycle-1/ dir one run = -- (matches the PROJECT branch) + │ ├─ before.json json ~150 KB baseline snapshot + │ ├─ before.html html ~1.5 MB self-contained viewer (inlined WASM/assets) + │ ├─ after.json json ~150 KB post-fix snapshot + │ ├─ after.html html ~1.5 MB + │ ├─ diff.html html ~1.6 MB baseline↔current diff report + │ ├─ prompt.md md ~3 KB the exact `--focus` fix-prompt the agent got + │ ├─ chat.jsonl jsonl ~0.5–3 MB raw session record (Claude Code; verbatim) + │ └─ chat.md md ~50–300 KB readable transcript (the tuning data) + ├─ sonnet-cycle-1/ dir same shape + └─ haiku-cycle-2/ dir same shape +``` + +- folder/run id = `--`; the PROJECT **branch** for that run is named + 
**identically to the run's build dir** β€” `_` (e.g. + `20260623T1849Z_dc06762`). Give each run its **own** `_` build dir (one run + subfolder per build dir) so that folder name is a unique per-run id, and the branch + reuses it verbatim. PROJECT branches are flat and live across every build, but the UTC + `` makes each unique β€” no more "bump `` until free". Code ↔ evidence line up by + the shared `_` string. The branch is no longer `-`, so **pass + it to the collector via `--branch`**. (If a build dir ever holds several runs, suffix + the branch with the run-id: `__`.) +- the code-ranker version/commit is also embedded *inside* each report (from S2), so + a file stays self-describing even if moved out of its folder. +- HTML reports are large (self-contained, WASM inlined); JSON snapshots scale with + the project; `chat.md` is the biggest signal-per-byte and the smallest to diff. + +### Launching a clean-context agent + +Each run is a **fresh session** of `MODEL` with **no carried context** β€” start a new +one, never `--continue`/`--resume`. Keep `PROJECT` free of a code-ranker-specific +`CLAUDE.md`/memory so only `docs ai` primes the agent; otherwise you're testing the +priming, not the prompt. + +**Watch the agent's working directory.** Launch it *inside* `PROJECT` (the interactive +`claude` below does this). If you instead drive it as a **sub-agent whose cwd is the +code-ranker source repo**, it sees a Cargo project there and tends to run the analyzer +via `cargo run --manifest-path /Cargo.toml report …` β€” recompiling it and +dumping a build log into context β€” instead of the installed `code-ranker` on PATH. That +inflates the cost columns (`input_tokens`, `cache_read_tokens`, a couple of `commands`) +with work **no real user does**, so the cost axis is no longer comparable to a run +launched in `PROJECT`. 
Either launch in `PROJECT`, or tell the agent up front that +`code-ranker` is installed on PATH and the code-ranker source tree is not its concern — +and note in `metrics.csv` which basis the run used. + +- **Claude Code** (Opus / Sonnet / Haiku), interactive — what the fix loop wants + (multi-turn: run code-ranker, edit, run tests): + + ```sh + cd PROJECT # external repo, on main, clean tree + claude --model opus # or sonnet / haiku — pins the tier; fresh = no context + ``` + + Then give it **one** opening message (the bootstrap), nothing else: + + > Read `code-ranker docs ai`, then fix the worst `<FOCUS>` in this + > project. Show me the plan before changing code. + + Headless one-shot (scriptable, but weaker for the multi-step loop): + + ```sh + cd PROJECT && claude -p "Read \`code-ranker docs ai\`, then fix the worst …" --model haiku + ``` + +- **Other agents** (Cursor, …): open a **New Chat** (not a continued thread), select + the model, paste the same one-message bootstrap. + +### Saving the chat + +The transcript is the **primary tuning data** — it shows *where* a cheaper model +diverged (skipped `docs`, picked the wrong cycle, hacked the metric). Save it raw, +**verbatim, no summary**, into `$RUN/chat.*`. It must include the bootstrap +(`docs ai` / `docs <FOCUS>` reads), the task, and **every** assistant turn — its +reasoning **and** the tool calls (the `code-ranker` commands + their output), through +the final fix and the test run. + +- **Claude Code** — the canonical record is the session **JSONL** at + `~/.claude/projects//.jsonl` (cwd-slug = `PROJECT`'s path with + `/`→`-`; one file per session, newest by mtime = the run you just did). Copy it to + `$RUN/chat.jsonl` (verbatim turns + tool calls) and/or render it to `$RUN/chat.md` + for reading. +- **Other agents**: export / copy the conversation as Markdown into `$RUN/chat.md`. 
+- Also save the exact fix-prompt the agent received as `$RUN/prompt.md`, so prompt β†’ + behaviour is correlatable across models. Markdown stays readable and diffable. + +## Metrics (`metrics.csv`) + +"Better or worse" is decided by numbers, not memory. Every run appends one row to a +single append-only file, **`/.code-ranker/prompt-eval/metrics.csv`** β€” +the cross-build comparison corpus. To compare two prompt versions, filter the rows to +the same `(project, focus, model)` and read down the columns: a newer `cr_sha` is +**better** when `quality_1_5` and `clarity_1_5` are β‰₯ and `focus_delta` is β‰₯ (more +negative or equal) **while** `tool_calls` / `commands` / `output_tokens` go **down**. A gain on one axis +paid for by a loss on another is not a win β€” name the trade in `notes`. + +Columns, grouped by objective (most are extractable from the run's artifacts; the two +`*_1_5` are judged from the transcript + diff): + +| Column | Axis | Source | Meaning (↑/↓ = better) | +|---|---|---|---| +| `ts`,`cr_sha`,`project`,`focus`,`model`,`iter`,`run` | id | run.md | identity β€” `cr_sha` is the prompt version | +| `tests_pass` | quality | project tests | 1/0 β€” tests green, behaviour preserved. ⚠ On a multi-crate workspace a per-crate pass is **not** sufficient β€” gate on `cargo check --workspace` + `cargo test --workspace --no-run` (see step 5); the collector's heuristic also can't see a workspace/feature/`cfg(test)` break, so verify it yourself. Also watch the **test count**: a fix that drops tests (e.g. moved code without migrating its tests) can leave the survivors green β€” `tests_pass` won't catch the lost coverage | +| `focus_before` / `focus_after` | quality | before/after `.json` | **Focus-aware.** Cycle FOCUS (`ADP`/`cycle`): total cycle-warning count. 
Metric FOCUS (`HK`, `sloc`, `cognitive`, …): the **project-wide sum** of that metric across module nodes β€” a flat total beside a dropped `worst_*` means the fix **relocated** the cost rather than dissolving it. | +| `focus_delta` | quality | `after βˆ’ before` | ↓ (negative) = better (fewer cycle warnings, or lower total metric) | +| `worst_before` / `worst_after` | quality | before/after `.json` | worst instance: cycle FOCUS β†’ largest SCC node count; metric FOCUS β†’ the **worst module's metric value** (the `--top 1` target, e.g. HK 390825 β†’ 140697). Direction from the snapshot's `node_attributes` schema (`higher_better` β†’ worst is the min). | +| `new_cycles` | quality | after vs before `.json` | ↓ cycles present in `after` but **not** `before` β€” regression guard (a fix that breaks one cycle and creates another scores 0 here). ⚠ **False positive:** a cycle whose membership only *shrank* (the survivor is a subset of a pre-existing cycle the fix partially cleared) registers here as "new". Diff the cycle node-sets before scoring a fix down β€” a subset/remnant is a *shrink*, not a new cycle. (Collector meta-gap: should classify subset-of-before as shrink.) | +| `collateral_delta` | quality | full scorecard at main vs branch | Ξ” in **non-FOCUS** principle violations (run `report --output.scorecard --top 0` at each git state, sum all rows except FOCUS). ↓ = a fix that also cleared other principles; ↑ = collateral damage | +| `quality_1_5` | quality | transcript + diff | ↑ real fix (extract/invert/split) vs metric-hack | +| `tool_calls` | cost | transcript | ↓ total tool invocations (Read/Edit/Bash/Grep/…) | +| `commands` | cost | transcript | ↓ shell/CLI commands run (the `Bash` subset β€” code-ranker, cargo, grep) | +| `input_tokens` | cost | transcript | ↓ input tokens **incl. 
cache reads** — noisy (turn-/cache-dominated); compare only on the same extraction basis | +| `output_tokens` | cost | transcript | ↓ output tokens — the clean cost signal (session `result.usage.output_tokens`, or summed over assistant turns for a subagent log) | +| `cache_read_tokens` | cost | transcript | input tokens served from cache (context — explains the gap between `input_tokens` and fresh input) | +| `cost_usd` | cost | derived | ↓ **pure-API, no-cache, no-discount** cost = `input_tokens × $5/MTok + output_tokens × $25/MTok` (Opus standard rates; **not** the billed cost, which is far lower with caching). Comparable only when `input_tokens` shares an extraction basis | +| `wall_s` | cost | transcript | ↓ **total duration** — the whole wall-clock time waited end-to-end (thinking + API + local tool runs like `cargo test`/`code-ranker` + queue/rate-limit waits). Session `result.duration_ms`, or first→last event timestamp for a subagent log | +| `api_duration_s` | cost | transcript | ↓ the **API-only subset** of `wall_s` (active model time, `result.duration_api_ms`). `wall_s − api_duration_s` ≈ local tool execution + queueing. 
Blank when there's no session `result` event (subagent log) | +| `files_changed` | cost | diff | context β€” edit footprint (not better/worse alone) | +| `loc_added` / `loc_removed` | cost | PROJECT branch `git diff --shortstat` | precise edit footprint; a fix far larger than the reference's is a smell (also catches committed litter) | +| `read_doc_ai` / `read_doc_focus` | clarity | transcript | 1/0 β€” read `docs ai` / `docs ` | +| `doc_reread` | clarity | transcript | ↓ times a doc was read more than once (a re-read signals the prompt/doc wasn't clear the first time) | +| `planned_before_edit` | clarity | transcript | 1/0 β€” proposed a plan before editing | +| `used_generated_prompt` | adherence | transcript | 1/0 β€” actually fetched the tool's fix-prompt (`--output.prompt` / `--prompt`) vs improvising | +| `focus_framing` | adherence | transcript | which lens the agent chose β€” `ADP` (principle) or `cycle` (metric); reveals how it read the task | +| `first_edit_turn` | clarity | transcript | tool-call index of the first `Edit`/`Write` β€” very high = lots of exploration before acting (thoroughness, or an unclear prompt) | +| `clarifying_qs` | clarity | transcript | ↓ questions the prompt should have pre-answered | +| `discovery_retries` | clarity | transcript | ↓ failed tool calls (`is_error`) β€” dead ends the prompt could have prevented | +| `clarity_1_5` | clarity | transcript | ↑ read once, planned, no guessing/back-tracking | +| `verdict` | β€” | diff verdict | `improved` / `neutral` / `regressed` | +| `notes` | β€” | you | failure class, the lever changed, residual gap | + +The objective columns (`focus_*`, `new_cycles`, `collateral_delta`, `tool_calls`, `commands`, +`output_tokens`, `loc_*`, retries, doc reads) are the hard signal; the two `*_1_5` judgments +are the qualitative "why" that drives the next prompt edit. 
`cost_usd` is a normalized +**no-cache** figure for cross-version comparison, deliberately *not* the billed amount — +caching/discounts are real-world noise that would make two prompt versions incomparable. +`results.md` stays the human narrative per build; `metrics.csv` is the machine-diffable +history across builds. + +### Collecting a row + +Don't hand-compute the objective columns — run the collector, which extracts them from +the run's artifacts and appends a row: + +```sh +contrib/prompt-eval-metrics.py //-- \ + --focus --project --project-path --base-branch main \ + --quality <1-5> --clarity <1-5> --collateral <Δ> --verdict improved --notes "…" +``` + +It reads `chat.jsonl` (tokens, durations, tool/command counts, doc reads + rereads, +`first_edit_turn`, `focus_framing`, `used_generated_prompt`, retries, and heuristic +`tests_pass` / `planned_before_edit`) and `before/after.json` (`focus_*`, `worst_*`, +`new_cycles`); with `--project-path` it adds `files_changed` / `loc_*` from the branch +diff; it derives `ts` / `cr_sha` / `model` / `iter` / `run` from the path and computes +`cost_usd`. Token extraction is **format-aware**: a full session log uses its +authoritative `result` usage; a subagent log sums per-turn (so its `input_tokens` / +`cost_usd` are cache-inflated and `api_duration_s` is blank). The **judged** columns — +`quality_1_5`, `clarity_1_5`, `collateral_delta`, `verdict`, `notes` — are flags (blank +if omitted; `collateral_delta` isn't auto-computed — it needs scorecards at two git +states, so compute it once and pass `--collateral`). `--dry-run` prints the row without +writing. + +> **Run one mechanism per sweep.** `cost_usd` / `input_tokens` are only comparable when +> every run in the sweep was launched the same way (all interactive `claude`, or all +> subagent) — the two extraction bases don't line up. Don't mix them within a `FOCUS`. 
+ +### Scoring rubric — `quality_1_5` / `clarity_1_5` + +The `*_1_5` columns are the only subjective signal, so pin them to a rubric or they +drift between sessions (an identical fix has already been scored 5 in one run and 4 in +another). Score against `MODEL_REF`'s run for the same `FOCUS`: + +**`quality_1_5`** — is the fix real, and as good as the reference's? + +- **5** — real structural fix (extract / invert / split, or the *correct minimal* fix + for this violation); behaviour preserved, `new_cycles` 0, `collateral_delta` ≤ 0. +- **3–4** — correct and tests pass, but narrower/weaker than the reference, or leaves an + obvious residual. +- **1–2** — silences the metric without fixing the structure, or needs follow-up to be + correct. +- **0** — wrong, tests fail, or introduced a new cycle. + +**`clarity_1_5`** — did the agent go straight to the fix, or grope? + +- **5** — read each doc once, planned before editing, zero clarifying questions, zero + failed/abandoned commands. +- subtract ~1 each for a `doc_reread`, a `discovery_retries` dead-end, a `clarifying_qs`, + or a skipped plan — each is something a clearer prompt could have prevented. + +When the rubric forces a judgement the columns can't capture, that's a signal to **add a +column** (the meta-loop), not to fudge the score. + +## Tuning rule + +**Diagnose from the transcript by hand, not from the aggregates.** Before scoring and +before choosing a lever, read the run's `chat.jsonl` turn by turn. The collector's +columns (`tool_calls`, `discovery_retries`, `output_tokens`, `first_edit_turn`) tell you +*how much* was spent and *that* the model groped; only the turn-by-turn record shows +*where* and *why* it diverged — which is what actually picks the lever. A lever chosen +from counts alone over-fits the number, not the failure class. 
(Counts also mislead: a +high `discovery_retries` can be benign compile iterations, and inflated tokens can be a +measurement artifact — see the cwd caution under "Launching a clean-context agent" — both +only visible by reading the log.) + +A prompt change is justified when a cheaper model misses on **any** of the three +objectives in a way the prompt *could* have prevented: + +- **quality** — it skipped the reference doc, picked the wrong cycle, or hacked the + metric instead of extracting an abstraction; +- **cost** — it spent turns rediscovering what the prompt could have stated, or chased + a dead end the prompt could have ruled out (`tool_calls` / `discovery_retries` high); +- **clarity** — it asked, back-tracked, or misread because the prompt buried the + decision or ordered it confusingly (`clarifying_qs` high, `planned_before_edit` 0). + +Map the miss to the **smallest** lever (principle `prompt` ⊂ scaffolding ⊂ the +`<FOCUS>` doc ⊂ — when the *process* is the problem — **this file**), change only +that, rebuild, re-sweep. Each edit is a hypothesis: the next run's `metrics.csv` row +must show the targeted column move, or the edit is reverted. Avoid over-fitting to one +project: a change should help the failure **class**, not memorise the repo. + +Stop a tier after **3 iterations** even if not perfect — record the residual gap (the +row stays in `metrics.csv`) so it's a decision on record, not a silent failure. + +**Distinguish a prompt gap from a capability ceiling.** A lever can only fix what the +model *would have done with the right instruction*. If the agent **reads the lever** +(the doc/section it targets shows in the transcript) and **still doesn't perform the +named step** — and a stronger model on the *same* prompt does — then the gap is the +model's diagnostic ability, not the prompt. 
Signs: the targeted column doesn't move (or +worsens) across two iterations, and the agent substitutes a plausible-but-wrong move it +*can* do (e.g. on HK, splitting a hub by its internal seams instead of running the +audiences analysis to find the wrong-audience import). When you see this, **revert the +lever** (it failed its hypothesis — keeping it is lever-creep), record the residual as a +**capability ceiling for that tier on that problem class**, and stop — don't spend the +3rd iteration refining a prompt the model isn't acting on. (Observed: cyberfabric-core +`gear.rs` HK — opus/sonnet ran the audiences check and dissolved the hub to ~0; haiku, +under two successive HK levers it demonstrably read, twice did sloc-shaving splits that +left `fan_in` untouched and the hub still #1.) + +## The meta-loop — improving this playbook + +The prompts are levers; so is this file. After a sweep — and the **moment the user has +to correct how you ran the loop** — ask whether the *process* helped or fought you, and +edit the playbook when it fought: + +- a **correction from the user** — they told you the loop skipped a step, stopped early, + read the wrong evidence, or measured the wrong thing → this is the **strongest** + meta-signal. If you had to be told, the playbook was unclear. Encode the correction + into THIS file **before continuing** the sweep, not after it. The file *not* changing + after a correction is itself the bug — "self-improving" means the next run can't repeat + the mistake you were just corrected for. 
+- a **run that taught nothing** (you couldn't tell *why* the fix scored as it did) → + fix what a run captures, or add a metric column that would have shown it; +- a **signal that didn't discriminate** quality, cost, or clarity → sharpen the + metric / its source; +- a **lever you couldn't locate**, or a change that helped but had no home above → fix + "What we tune"; +- a **missing or untraceable artifact** → fix the layout / naming. + +Treat a playbook edit exactly like a prompt edit: it changes behaviour, so it gets its +own **S1–S2** (commit → new `CR_SHA`) and the next sweep runs under it. Log it in +`metrics.csv` / `results.md` with `focus = meta` so process changes are auditable +alongside prompt changes. The loop is done not when one prompt is perfect, but when +**neither the prompts nor this procedure** need another hand-correction. + +## Results log + +Track one row per run so the sweep is auditable: + +| date | cr version+commit | PROJECT | FOCUS | MODEL | iter | branch | verdict (Δ) | tests | quality 1–5 | tokens | time (s) | notes / failure class | +|------|-------------------|---------|-------|-------|------|--------|-------------|-------|-------------|--------|----------|----------------------| +| … | 4.0.0 @abc123 | … | cycle | opus | 1 | opus-cycle-1 | improved (−2 cycles) | pass | 5 | 49.7k | 196 | reference | +| … | 4.0.0 @abc123 | … | cycle | sonnet | 1 | sonnet-cycle-1 | neutral (0) | pass | 2 | 88k | 310 | skipped `docs`, hacked one edge | + +`tokens` and `time (s)` are the cost axis at a glance (full breakdown — +`tool_calls`, `commands`, `input_tokens`, `output_tokens`, `wall_s` — lives in +`metrics.csv`); lower is better at equal quality. 
diff --git a/crates/code-ranker-cli/Cargo.toml b/crates/code-ranker-cli/Cargo.toml index 341c27d6..c250d54e 100644 --- a/crates/code-ranker-cli/Cargo.toml +++ b/crates/code-ranker-cli/Cargo.toml @@ -28,3 +28,6 @@ serde_json = { workspace = true } [dev-dependencies] tempfile = { workspace = true } +# For e2e fixtures to reference the single format-version constant (SCHEMA_VERSION) +# instead of hardcoding it. +code-ranker-graph = { workspace = true } diff --git a/crates/code-ranker-cli/src/analyze.rs b/crates/code-ranker-cli/src/analyze.rs index b7c27fe4..5938899c 100644 --- a/crates/code-ranker-cli/src/analyze.rs +++ b/crates/code-ranker-cli/src/analyze.rs @@ -74,7 +74,6 @@ fn analyze_from_snapshot( cycles: cfg.rules.cycles, rules: cfg.rules, output: cfg.output, - templates: cfg.templates, }) } diff --git a/crates/code-ranker-cli/src/check.rs b/crates/code-ranker-cli/src/check.rs index f52a0557..88c637be 100644 --- a/crates/code-ranker-cli/src/check.rs +++ b/crates/code-ranker-cli/src/check.rs @@ -32,7 +32,7 @@ pub(crate) fn run_check( cycle_rules: &[String], thresholds: &[String], focus_path: &[String], - focus_rule: &[String], + focus: &[String], baseline: Option<&Path>, output_format: OutputFormat, top: Option, @@ -76,7 +76,7 @@ pub(crate) fn run_check( }; // Scope the gate. `--focus-path` keeps violations under the given files/folders; - // `--focus-rule` keeps violations of the given rule ids or concern groups. The + // `--focus` keeps violations of the given rule ids or concern groups. The // whole project is still analyzed, but a violation outside an active focus is // dropped β€” neither reported nor counted toward the exit code. With both set, a // violation must satisfy both (path AND rule). 
A locationless violation can't be @@ -86,10 +86,10 @@ pub(crate) fn run_check( violation_rel_path(&v.location).is_some_and(|rel| path_matches(rel, focus_path)) }); } - if !focus_rule.is_empty() { - findings.retain(|v| rule_matches(v, focus_rule)); + if !focus.is_empty() { + findings.retain(|v| rule_matches(v, focus)); } - let scope_note = focus_scope_note(focus_path, focus_rule); + let scope_note = focus_scope_note(focus_path, focus); let total = findings.len(); // Rank worst-first by breach magnitude; `--top` limits only what is @@ -293,10 +293,10 @@ fn path_matches(rel: &str, focus: &[String]) -> bool { }) } -/// Whether a violation matches one of the `--focus-rule` entries. An entry matches +/// Whether a violation matches one of the `--focus` entries. An entry matches /// the full rule id (`threshold.file.hk`, `check.inline_tests_too_large`), the bare /// id after the last dot (`inline_tests_too_large`), or the concern group (`TST`, -/// `CPL`) β€” so `--focus-rule TST` and `--focus-rule inline_tests_too_large` both work. +/// `CPL`) β€” so `--focus TST` and `--focus inline_tests_too_large` both work. fn rule_matches(v: &config::Violation, focus: &[String]) -> bool { focus .iter() @@ -304,14 +304,14 @@ fn rule_matches(v: &config::Violation, focus: &[String]) -> bool { } /// The trailing "(focused on …)" note for the human header, covering whichever of -/// `--focus-path` / `--focus-rule` are active (empty when neither is). -fn focus_scope_note(focus_path: &[String], focus_rule: &[String]) -> String { +/// `--focus-path` / `--focus` are active (empty when neither is). 
+fn focus_scope_note(focus_path: &[String], focus: &[String]) -> String { let mut parts = Vec::new(); if !focus_path.is_empty() { parts.push(format!("path {}", focus_path.join(", "))); } - if !focus_rule.is_empty() { - parts.push(format!("rule {}", focus_rule.join(", "))); + if !focus.is_empty() { + parts.push(format!("rule {}", focus.join(", "))); } if parts.is_empty() { String::new() diff --git a/crates/code-ranker-cli/src/cli.rs b/crates/code-ranker-cli/src/cli.rs index 6a65eb0c..82757dfb 100644 --- a/crates/code-ranker-cli/src/cli.rs +++ b/crates/code-ranker-cli/src/cli.rs @@ -11,10 +11,38 @@ use std::path::PathBuf; about = "Pluggable multi-language structural analysis platform" )] pub(crate) struct Cli { + /// Verbosity of the stderr diagnostic stream (machine output/artifacts always + /// go to stdout/files, untouched by this). `quiet` = errors only; `summary` + /// (default) = errors, warnings, written-artifact paths, and the closing + /// `βœ“ … β€”