diff --git a/Makefile b/Makefile index c817b2d..9393c50 100644 --- a/Makefile +++ b/Makefile @@ -88,6 +88,8 @@ validate-cli: printf '%s\n' '{"prompt":"Write one short sentence about Paris.","model_id":"gpt2-small","steering":{"feature_id":"10200","layer":"6-res-jb","strength":5}}' >/tmp/agent-machine-steer-request.json $(PYCLI) steer stub-response /tmp/agent-machine-steer-request.json --pretty >/tmp/agent-machine-pycli-steer-stub-response.json $(BOOTSTRAP_CLI) steer stub-response /tmp/agent-machine-steer-request.json --pretty >/tmp/agent-machine-bootstrap-steer-stub-response.json + $(PYCLI) steer preflight --sourceset gpt2-small.res-jb --pretty >/tmp/agent-machine-pycli-steer-preflight.json + $(BOOTSTRAP_CLI) steer preflight --sourceset gpt2-small.res-jb --pretty >/tmp/agent-machine-bootstrap-steer-preflight.json $(PYCLI) version $(PYCLI) paths --format json $(PYCLI) doctor --format json diff --git a/docs/index.md b/docs/index.md index a0796ac..65d6a53 100644 --- a/docs/index.md +++ b/docs/index.md @@ -14,6 +14,7 @@ Agent Machine is a bootstrap runtime-control substrate for SourceOS agent worklo | [Local SAE steering inference readiness](inference-local-steering.md) | Inspection record for Neuronpedia-compatible local steering readiness and current gaps. | | [Local /steer endpoint contract](local-steer-endpoint.md) | Noetica-compatible local steering endpoint contract and stub behavior. | | [Steering sourceset registry](steering-sourcesets.md) | Registered model/SAE sourceset records for local steering work. | +| [GPT-2 Small steering activation path](steering-activation-path.md) | Fail-closed real-path entrypoint and remaining blockers for controlled activation. | ## Architecture diff --git a/docs/steering-activation-path.md b/docs/steering-activation-path.md new file mode 100644 index 0000000..e80b81d --- /dev/null +++ b/docs/steering-activation-path.md @@ -0,0 +1,68 @@ +# GPT-2 Small Controlled Steering Activation Path + +Status: Issue #34 implementation-prep. This document records the first controlled activation entrypoint for `gpt2-small.res-jb` and the remaining blockers before #34 can close. + +## Scope + +This path is GPT-2 Small only. + +Gemma sourcesets remain out of scope for #34 closure because Gemma model access depends on operator terms/access verification. + +## Commands + +Preflight readiness: + +```bash +agent-machine steer preflight --sourceset gpt2-small.res-jb --pretty +``` + +Serve sourceset-aware local endpoint in fail-closed mode: + +```bash +agent-machine steer serve --sourceset gpt2-small.res-jb --host 127.0.0.1 --port 8080 +``` + +The existing contract stub remains available: + +```bash +agent-machine steer serve-stub --host 127.0.0.1 --port 8080 --status not_configured +``` + +## Current behavior + +`steer preflight` resolves the registered `SteeringSourceset`, checks optional runtime dependency presence, and reports missing activation prerequisites. + +`steer serve --sourceset ...` starts a local `/steer` endpoint using the registered sourceset posture. Until all prerequisites are present, it returns a valid Noetica-compatible `SteeringResult` with: + +```json +{ + "status": "not_configured" +} +``` + +It must not return `status: "applied"` until a real forward pass and feature injection succeed. + +## Remaining blockers before #34 can close + +- optional ML dependencies installed from `requirements-steering.txt` +- verified GPT-2 Small model artifacts +- verified SAE artifacts for SAELens release `gpt2-small-res-jb`, SAE id `blocks.6.hook_resid_pre` +- digest locks for model and SAE artifacts +- storage receipt for the resolved artifact locations +- policy admission and agent-registry grant records +- real activation injection implementation +- local smoke record showing `status: applied`, baseline, steered output, and evidence hash + +## Noetica integration target + +Once the real path is ready, Noetica should work without code changes by setting: + +```bash +NEURONPEDIA_BASE_URL=http://localhost:8080 +``` + +Then Noetica `/api/steer` should call Agent Machine `/steer` and receive `status: applied` only after real activation succeeds. + +## Boundary + +This document and the current `serve --sourceset` entrypoint do not close #34. They add the fail-closed entrypoint and preflight surface needed before the real activation injection implementation lands. diff --git a/requirements-steering.txt b/requirements-steering.txt new file mode 100644 index 0000000..08ed744 --- /dev/null +++ b/requirements-steering.txt @@ -0,0 +1,10 @@ +# Optional runtime dependencies for local SAE steering activation. +# These are intentionally not part of requirements-dev.txt or default validation. +# Install only on an operator machine intended to run the local steering server. + +torch +transformers +transformer-lens +sae-lens +safetensors +huggingface_hub diff --git a/src/agent_machine/cli.py b/src/agent_machine/cli.py index 724b68b..1ee63af 100644 --- a/src/agent_machine/cli.py +++ b/src/agent_machine/cli.py @@ -289,30 +289,18 @@ def resolve_activation_policy_and_grant(args: argparse.Namespace, agentpod: dict """Resolve activation policy/grant from explicit files or local policy store.""" policy_json = args.policy_json grant_json = args.grant_json - - # Backward-compatible shorthand: - # agent-machine activate evaluate --policy-dir examples ... - # argparse first assigns the single optional positional to policy_json, so we - # reinterpret it as grant_json when a policy store/resolver option is present. resolver_requested = bool(args.policy_file or args.policy_dir or args.policy_id or args.expected_status) if grant_json is None and policy_json is not None and resolver_requested: grant_json = policy_json policy_json = None - if grant_json is None: raise AssertionError( "grant JSON is required. Use either ` ` " "or ` --policy-dir `" ) - if policy_json is not None: return load_json(policy_json), load_json(grant_json) - - policies = policy_fabric.load_policy_admissions( - files=args.policy_file, - directories=args.policy_dir, - root=REPO_ROOT, - ) + policies = policy_fabric.load_policy_admissions(files=args.policy_file, directories=args.policy_dir, root=REPO_ROOT) policy = policy_fabric.resolve_policy_admission( policies=policies, agentpod_id=str(agentpod.get("id")), @@ -334,10 +322,7 @@ def cmd_activate_evaluate(args: argparse.Namespace) -> int: policy_fabric = import_renderer(lambda: __import__("agent_machine.policy_fabric", fromlist=["_unused"])) agentpod = load_json(args.agentpod_json) policy, grant = resolve_activation_policy_and_grant(args, agentpod, policy_fabric) - storage_receipts = activation.load_storage_receipts( - files=args.storage_receipt_file, - directories=args.storage_receipt_dir, - ) + storage_receipts = activation.load_storage_receipts(files=args.storage_receipt_file, directories=args.storage_receipt_dir) storage_receipt_refs = list(args.storage_receipt_ref or []) if not storage_receipt_refs and storage_receipts: storage_receipt_refs = [str(receipt.get("id")) for receipt in storage_receipts] @@ -364,10 +349,7 @@ def cmd_steer_stub_response(args: argparse.Namespace) -> int: steering_stub = __import__("agent_machine.steering_stub", fromlist=["_unused"]) request = steering_stub.load_steer_request(str(args.request_json)) result = steering_stub.build_stub_steer_result(request, status=args.status) - if args.pretty: - print(json.dumps(result, indent=2, sort_keys=True)) - else: - print(json.dumps(result, sort_keys=True, separators=(",", ":"))) + print(json.dumps(result, indent=2 if args.pretty else None, sort_keys=True)) return 0 @@ -376,6 +358,18 @@ def cmd_steer_serve_stub(args: argparse.Namespace) -> int: return int(steering_stub.serve_stub(host=args.host, port=args.port, status=args.status)) +def cmd_steer_preflight(args: argparse.Namespace) -> int: + steering_runtime = __import__("agent_machine.steering_runtime", fromlist=["_unused"]) + result = steering_runtime.runtime_preflight(args.sourceset) + print(json.dumps(result, indent=2 if args.pretty else None, sort_keys=True)) + return 0 + + +def cmd_steer_serve(args: argparse.Namespace) -> int: + steering_runtime = __import__("agent_machine.steering_runtime", fromlist=["_unused"]) + return int(steering_runtime.serve_sourceset(args.sourceset, host=args.host, port=args.port)) + + def build_parser() -> argparse.ArgumentParser: parser = argparse.ArgumentParser(description="Agent Machine Python CLI") subcommands = parser.add_subparsers(dest="command", required=True) @@ -459,7 +453,7 @@ def build_parser() -> argparse.ArgumentParser: activate_evaluate.add_argument("--pretty", action="store_true") activate_evaluate.set_defaults(func=cmd_activate_evaluate) - steer = subcommands.add_parser("steer", help="Inspect or serve local steering endpoint stubs") + steer = subcommands.add_parser("steer", help="Inspect or serve local steering endpoints") steer_subcommands = steer.add_subparsers(dest="steer_command", required=True) stub_response = steer_subcommands.add_parser("stub-response", help="Render a Noetica-compatible steering stub response") @@ -474,6 +468,17 @@ def build_parser() -> argparse.ArgumentParser: serve_stub.add_argument("--status", choices=["not_configured", "noop"], default="not_configured") serve_stub.set_defaults(func=cmd_steer_serve_stub) + preflight = steer_subcommands.add_parser("preflight", help="Inspect readiness for a registered steering sourceset") + preflight.add_argument("--sourceset", required=True) + preflight.add_argument("--pretty", action="store_true") + preflight.set_defaults(func=cmd_steer_preflight) + + serve = steer_subcommands.add_parser("serve", help="Serve sourceset-aware local /steer endpoint in fail-closed mode") + serve.add_argument("--sourceset", required=True) + serve.add_argument("--host", default="127.0.0.1") + serve.add_argument("--port", type=int, default=8080) + serve.set_defaults(func=cmd_steer_serve) + return parser diff --git a/src/agent_machine/steering_runtime.py b/src/agent_machine/steering_runtime.py new file mode 100644 index 0000000..b4473d2 --- /dev/null +++ b/src/agent_machine/steering_runtime.py @@ -0,0 +1,173 @@ +"""Controlled local steering runtime preflight and fail-closed server. + +This module owns the first real-path entrypoint for Issue #34. It intentionally +keeps activation fail-closed until optional ML dependencies, verified artifacts, +storage receipts, policy admission, and grants are present. It never returns +``status: applied`` unless a future implementation successfully runs a real +activation-injection path. +""" + +from __future__ import annotations + +import importlib.util +import json +import sys +from http.server import BaseHTTPRequestHandler, ThreadingHTTPServer +from pathlib import Path +from typing import Any + +from agent_machine.contracts import load_json +from agent_machine.paths import repo_root_from_file +from agent_machine.steering_stub import SteeringStubError, build_stub_steer_result + +REPO_ROOT = repo_root_from_file(__file__) +SOURCESET_DIR = REPO_ROOT / "examples" / "steering-sourcesets" +OPTIONAL_RUNTIME_MODULES = { + "torch": "torch", + "transformers": "transformers", + "transformer_lens": "transformer-lens", + "sae_lens": "sae-lens", + "safetensors": "safetensors", + "huggingface_hub": "huggingface_hub", +} + + +class SteeringRuntimeError(AssertionError): + """Raised when the real steering runtime cannot proceed safely.""" + + +def load_sourceset(sourceset_id: str) -> dict[str, Any]: + """Load a registered SteeringSourceset by sourcesetId.""" + for path in sorted(SOURCESET_DIR.glob("*.steering-sourceset.json")): + payload = load_json(path) + if payload.get("sourcesetId") == sourceset_id: + return payload + raise SteeringRuntimeError(f"sourceset not registered: {sourceset_id}") + + +def runtime_preflight(sourceset_id: str) -> dict[str, Any]: + """Return a secret-free readiness record for the real steering path.""" + try: + sourceset = load_sourceset(sourceset_id) + except SteeringRuntimeError as exc: + return { + "ok": False, + "status": "not_configured", + "sourceset": sourceset_id, + "registered": False, + "ready": False, + "missing": [str(exc)], + "activationImplemented": False, + } + + missing: list[str] = [] + optional_dependencies = {} + for module_name, package_name in OPTIONAL_RUNTIME_MODULES.items(): + available = importlib.util.find_spec(module_name) is not None + optional_dependencies[package_name] = available + if not available: + missing.append(f"optional runtime dependency missing: {package_name}") + + activation = sourceset.get("activation", {}) if isinstance(sourceset, dict) else {} + for item in activation.get("missing", []) if isinstance(activation, dict) else []: + if isinstance(item, str) and item not in missing: + missing.append(item) + + ready = not missing and bool(activation.get("loadableToday")) and bool(activation.get("activationImplemented")) + + return { + "ok": True, + "status": "available" if ready else "not_configured", + "sourceset": sourceset_id, + "registered": True, + "sourcesetStatus": sourceset.get("status"), + "ready": ready, + "model": sourceset.get("model", {}).get("source", {}).get("repo"), + "sae": sourceset.get("sae", {}).get("source", {}).get("repo"), + "hook": sourceset.get("sae", {}).get("hook"), + "optionalDependencies": optional_dependencies, + "missing": missing, + "activationImplemented": False, + "downloadsPerformed": False, + "message": ( + "Real steering activation is not ready; serve --sourceset will return status=not_configured " + "until optional dependencies, verified artifacts, storage receipts, policy/grant admission, and activation injection are present." + ), + } + + +def serve_sourceset(sourceset_id: str, host: str = "127.0.0.1", port: int = 8080) -> int: + """Serve the sourceset-aware local steering endpoint in fail-closed mode.""" + preflight = runtime_preflight(sourceset_id) + + class Handler(BaseHTTPRequestHandler): + server_version = "AgentMachineSteerRuntime/0.1" + + def do_GET(self) -> None: # noqa: N802 - BaseHTTPRequestHandler API + if self.path not in {"/health", "/ready"}: + self.send_json({"error": "not_found"}, status_code=404) + return + self.send_json({"ok": True, "kind": "NeuronpediaCompatibleLocalSteerRuntime", "preflight": preflight}) + + def do_POST(self) -> None: # noqa: N802 - BaseHTTPRequestHandler API + if self.path != "/steer": + self.send_json({"error": "not_found"}, status_code=404) + return + try: + payload = self.read_json() + result = build_fail_closed_result(payload, preflight) + except (json.JSONDecodeError, UnicodeDecodeError, SteeringStubError, SteeringRuntimeError) as exc: + self.send_json({"error": "invalid_steer_request", "message": str(exc)}, status_code=400) + return + self.send_json(result) + + def read_json(self) -> dict[str, Any]: + length_header = self.headers.get("content-length") + if not length_header: + raise SteeringRuntimeError("missing content-length") + length = int(length_header) + if length > 1_048_576: + raise SteeringRuntimeError("request body exceeds 1 MiB") + raw = self.rfile.read(length) + payload = json.loads(raw.decode("utf-8")) + if not isinstance(payload, dict): + raise SteeringRuntimeError("steer request root must be a JSON object") + return payload + + def send_json(self, payload: dict[str, Any], status_code: int = 200) -> None: + body = json.dumps(payload, sort_keys=True).encode("utf-8") + self.send_response(status_code) + self.send_header("content-type", "application/json") + self.send_header("content-length", str(len(body))) + self.end_headers() + self.wfile.write(body) + + def log_message(self, format: str, *args: Any) -> None: # noqa: A002 - inherited name + print(f"agent-machine steer runtime: {self.address_string()} - {format % args}", file=sys.stderr) + + print( + f"agent-machine steer runtime serving http://{host}:{port}/steer sourceset={sourceset_id} ready={preflight.get('ready')}", + file=sys.stderr, + ) + if not preflight.get("ready"): + print(json.dumps({"warning": "real activation not ready", "preflight": preflight}, sort_keys=True), file=sys.stderr) + server = ThreadingHTTPServer((host, port), Handler) + try: + server.serve_forever() + except KeyboardInterrupt: + print("agent-machine steer runtime stopped", file=sys.stderr) + finally: + server.server_close() + return 0 + + +def build_fail_closed_result(payload: dict[str, Any], preflight: dict[str, Any]) -> dict[str, Any]: + """Return status=not_configured unless a future real path proves readiness.""" + result = build_stub_steer_result(payload, status="not_configured") + missing = preflight.get("missing", []) + missing_text = "; ".join(str(item) for item in missing[:8]) + result["diff_summary"] = ( + "Agent Machine real steering path is not configured for applied activation. " + f"Sourceset={preflight.get('sourceset')} ready={preflight.get('ready')}. Missing: {missing_text or 'unknown'}" + ) + return result