From 63d661a30d153d20dd1e0bad72138ba943ce5e83 Mon Sep 17 00:00:00 2001 From: Josh Mabry Date: Sat, 16 May 2026 21:29:36 -0700 Subject: [PATCH] =?UTF-8?q?feat:=20standalone=20CLI=20+=20test=20harness?= =?UTF-8?q?=20+=20Di=C3=A1taxis=20docs=20+=20benchmarks?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Add `decision-record` CLI (second bin alongside MCP server) that drives the full planning pipeline against any OpenAI-compatible endpoint (OPENAI_API_KEY + OPENAI_BASE_URL — works with OpenAI, OpenRouter, Ollama, vLLM, LiteLLM). Phase state machine, sub-agents (scoping, deciding, lens-rotating skeptic, decomposer), checkpointed control flow, PRD ingestion, resume support. - Add reusable test harness: event-driven MCP stdio client, disposable tmp-project helper, script-replay mock OpenAI client. 50 tests across unit (gate eval + schemas, 48 tests) and flow (full pipeline + skeptic-block path, 2 tests). All green in 210ms. - Reorganize docs/ into Diátaxis quadrants: tutorials/, how-to/, reference/, explanation/. New first-user tutorial walks the roguelike benchmark prompt end-to-end. Five how-to guides cover install, run, providers, Linear handoff, and gate calibration. Four reference pages document the CLI, MCP tools, data model, and gate matrix. Three explanation pages cover design rationale, the five-phase pipeline, and Joel's canonical material (preserved from upstream-canon). - Add benchmarks/ with the roguelike-ai-poc canonical prompt + reference artifacts + a run.sh for regression checks as the system evolves. - Add GitHub Actions CI (.github/workflows/test.yml) that runs typecheck, build, unit tests, and flow tests on a Node 20 + 22 matrix for every push to main and every PR targeting main. - Minor fixes: add semver regex to PipelineState.schema_version; add cli.ts entry to tsup config; bug fix in orchestrator where pre-advance gate check treated sign-off as a blocker. 
Co-Authored-By: Claude Opus 4.7 --- .github/workflows/test.yml | 41 ++ CITATION.cff | 4 +- CONTRIBUTING.md | 2 +- LICENSE | 4 +- README.md | 27 +- benchmarks/README.md | 32 ++ benchmarks/roguelike-ai-poc/prompt.md | 63 +++ ...01-choose-the-implementation-language.json | 115 +++++ ...0001-choose-the-implementation-language.md | 120 +++++ ...the-world-representation-and-renderer.json | 85 ++++ ...e-the-world-representation-and-renderer.md | 92 ++++ ...0003-define-the-agent-action-contract.json | 83 ++++ .../0003-define-the-agent-action-contract.md | 90 ++++ ...-tick-loop-and-termination-conditions.json | 68 +++ ...he-tick-loop-and-termination-conditions.md | 74 +++ .../roguelike-ai-poc/reference/events.jsonl | 33 ++ .../roguelike-ai-poc/reference/index.html | 231 +++++++++ .../roguelike-ai-poc/reference/project.json | 64 +++ .../roguelike-ai-poc/reference/project.md | 64 +++ .../tasks/T0001-bootstrap-repository.json | 30 ++ .../tasks/T0001-bootstrap-repository.md | 23 + ...nt-world-module-tile-grid-entity-dict.json | 32 ++ ...ment-world-module-tile-grid-entity-dict.md | 23 + .../tasks/T0003-implement-frame-renderer.json | 32 ++ .../tasks/T0003-implement-frame-renderer.md | 23 + .../T0004-implement-openai-agent-client.json | 34 ++ .../T0004-implement-openai-agent-client.md | 24 + ...ction-handlers-and-termination-checks.json | 33 ++ ...-action-handlers-and-termination-checks.md | 24 + ...06-implement-the-tick-based-game-loop.json | 35 ++ ...0006-implement-the-tick-based-game-loop.md | 23 + .../T0007-implement-cli-entry-script.json | 33 ++ .../tasks/T0007-implement-cli-entry-script.md | 23 + benchmarks/roguelike-ai-poc/run.sh | 35 ++ docs/README.md | 50 ++ docs/architecture.md | 197 -------- docs/explanation/design-rationale.md | 104 +++++ docs/explanation/the-five-phases.md | 133 ++++++ .../why-decision-records.md} | 0 docs/how-to/calibrate-gates.md | 79 ++++ docs/how-to/configure-providers.md | 103 ++++ docs/how-to/handoff-to-linear.md | 83 ++++ 
docs/how-to/install.md | 80 ++++ docs/how-to/run-the-cli.md | 114 +++++ docs/quickstart.md | 80 ---- docs/reference/cli.md | 108 +++++ docs/reference/data-model.md | 152 ++++++ docs/reference/gates.md | 78 ++++ docs/reference/mcp-tools.md | 188 ++++++++ docs/tutorials/your-first-plan.md | 164 +++++++ docs/usage.md | 145 ------ server/package-lock.json | 22 + server/package.json | 8 +- server/src/cli.ts | 2 + server/src/cli/agents/deciding.ts | 56 +++ server/src/cli/agents/decomposer.ts | 70 +++ server/src/cli/agents/scoping.ts | 58 +++ server/src/cli/agents/skeptic.ts | 103 ++++ server/src/cli/checkpoints.ts | 82 ++++ server/src/cli/index.ts | 232 ++++++++++ server/src/cli/orchestrator.ts | 415 +++++++++++++++++ server/src/cli/prd.ts | 36 ++ server/src/llm/agent.ts | 161 +++++++ server/src/llm/client.ts | 34 ++ server/src/llm/tools.ts | 94 ++++ server/src/schemas/index.ts | 2 +- server/tests/flow-poc-pipeline.test.ts | 406 ++++++++++++++++ server/tests/helpers/index.ts | 2 + server/tests/helpers/mcp-client.ts | 194 ++++++++ server/tests/helpers/mock-openai.ts | 82 ++++ server/tests/helpers/tmp-project.ts | 44 ++ server/tests/unit-gate.test.ts | 438 ++++++++++++++++++ server/tests/unit-schemas.test.ts | 273 +++++++++++ server/tsup.config.ts | 2 +- 74 files changed, 5957 insertions(+), 436 deletions(-) create mode 100644 .github/workflows/test.yml create mode 100644 benchmarks/README.md create mode 100644 benchmarks/roguelike-ai-poc/prompt.md create mode 100644 benchmarks/roguelike-ai-poc/reference/decisions/0001-choose-the-implementation-language.json create mode 100644 benchmarks/roguelike-ai-poc/reference/decisions/0001-choose-the-implementation-language.md create mode 100644 benchmarks/roguelike-ai-poc/reference/decisions/0002-define-the-world-representation-and-renderer.json create mode 100644 benchmarks/roguelike-ai-poc/reference/decisions/0002-define-the-world-representation-and-renderer.md create mode 100644 
benchmarks/roguelike-ai-poc/reference/decisions/0003-define-the-agent-action-contract.json create mode 100644 benchmarks/roguelike-ai-poc/reference/decisions/0003-define-the-agent-action-contract.md create mode 100644 benchmarks/roguelike-ai-poc/reference/decisions/0004-define-the-tick-loop-and-termination-conditions.json create mode 100644 benchmarks/roguelike-ai-poc/reference/decisions/0004-define-the-tick-loop-and-termination-conditions.md create mode 100644 benchmarks/roguelike-ai-poc/reference/events.jsonl create mode 100644 benchmarks/roguelike-ai-poc/reference/index.html create mode 100644 benchmarks/roguelike-ai-poc/reference/project.json create mode 100644 benchmarks/roguelike-ai-poc/reference/project.md create mode 100644 benchmarks/roguelike-ai-poc/reference/tasks/T0001-bootstrap-repository.json create mode 100644 benchmarks/roguelike-ai-poc/reference/tasks/T0001-bootstrap-repository.md create mode 100644 benchmarks/roguelike-ai-poc/reference/tasks/T0002-implement-world-module-tile-grid-entity-dict.json create mode 100644 benchmarks/roguelike-ai-poc/reference/tasks/T0002-implement-world-module-tile-grid-entity-dict.md create mode 100644 benchmarks/roguelike-ai-poc/reference/tasks/T0003-implement-frame-renderer.json create mode 100644 benchmarks/roguelike-ai-poc/reference/tasks/T0003-implement-frame-renderer.md create mode 100644 benchmarks/roguelike-ai-poc/reference/tasks/T0004-implement-openai-agent-client.json create mode 100644 benchmarks/roguelike-ai-poc/reference/tasks/T0004-implement-openai-agent-client.md create mode 100644 benchmarks/roguelike-ai-poc/reference/tasks/T0005-implement-action-handlers-and-termination-checks.json create mode 100644 benchmarks/roguelike-ai-poc/reference/tasks/T0005-implement-action-handlers-and-termination-checks.md create mode 100644 benchmarks/roguelike-ai-poc/reference/tasks/T0006-implement-the-tick-based-game-loop.json create mode 100644 
benchmarks/roguelike-ai-poc/reference/tasks/T0006-implement-the-tick-based-game-loop.md create mode 100644 benchmarks/roguelike-ai-poc/reference/tasks/T0007-implement-cli-entry-script.json create mode 100644 benchmarks/roguelike-ai-poc/reference/tasks/T0007-implement-cli-entry-script.md create mode 100755 benchmarks/roguelike-ai-poc/run.sh create mode 100644 docs/README.md delete mode 100644 docs/architecture.md create mode 100644 docs/explanation/design-rationale.md create mode 100644 docs/explanation/the-five-phases.md rename docs/{upstream-canon.md => explanation/why-decision-records.md} (100%) create mode 100644 docs/how-to/calibrate-gates.md create mode 100644 docs/how-to/configure-providers.md create mode 100644 docs/how-to/handoff-to-linear.md create mode 100644 docs/how-to/install.md create mode 100644 docs/how-to/run-the-cli.md delete mode 100644 docs/quickstart.md create mode 100644 docs/reference/cli.md create mode 100644 docs/reference/data-model.md create mode 100644 docs/reference/gates.md create mode 100644 docs/reference/mcp-tools.md create mode 100644 docs/tutorials/your-first-plan.md delete mode 100644 docs/usage.md create mode 100644 server/src/cli.ts create mode 100644 server/src/cli/agents/deciding.ts create mode 100644 server/src/cli/agents/decomposer.ts create mode 100644 server/src/cli/agents/scoping.ts create mode 100644 server/src/cli/agents/skeptic.ts create mode 100644 server/src/cli/checkpoints.ts create mode 100644 server/src/cli/index.ts create mode 100644 server/src/cli/orchestrator.ts create mode 100644 server/src/cli/prd.ts create mode 100644 server/src/llm/agent.ts create mode 100644 server/src/llm/client.ts create mode 100644 server/src/llm/tools.ts create mode 100644 server/tests/flow-poc-pipeline.test.ts create mode 100644 server/tests/helpers/index.ts create mode 100644 server/tests/helpers/mcp-client.ts create mode 100644 server/tests/helpers/mock-openai.ts create mode 100644 server/tests/helpers/tmp-project.ts create mode 
100644 server/tests/unit-gate.test.ts create mode 100644 server/tests/unit-schemas.test.ts diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml new file mode 100644 index 0000000..a264d3c --- /dev/null +++ b/.github/workflows/test.yml @@ -0,0 +1,41 @@ +name: test + +on: + push: + branches: [main] + pull_request: + branches: [main] + +jobs: + test: + runs-on: ubuntu-latest + defaults: + run: + working-directory: server + strategy: + matrix: + node-version: [20, 22] + steps: + - uses: actions/checkout@v4 + + - name: Set up Node ${{ matrix.node-version }} + uses: actions/setup-node@v4 + with: + node-version: ${{ matrix.node-version }} + cache: npm + cache-dependency-path: server/package-lock.json + + - name: Install dependencies + run: npm ci + + - name: Type check + run: npm run typecheck + + - name: Build + run: npm run build + + - name: Unit tests + run: npm run test:unit + + - name: Flow tests + run: npm run test:flow diff --git a/CITATION.cff b/CITATION.cff index ff3db7e..0adcd7b 100644 --- a/CITATION.cff +++ b/CITATION.cff @@ -33,5 +33,5 @@ references: repository-code: 'https://github.com/joelparkerhenderson/decision-record/' abstract: >- The canonical concept, template, and teamwork model for decision - records — preserved in this fork at docs/upstream-canon.md and - templates/canonical.md. + records — preserved in this fork at docs/explanation/why-decision-records.md + and templates/canonical.md. diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 31a3c63..0853e98 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -24,7 +24,7 @@ This repo is the planning system itself. We deliberately stop at the handoff — ## Attribution -The conceptual core derives from Joel Parker Henderson's [canonical decision-record repo](https://github.com/joelparkerhenderson/decision-record). Preserve attribution to upstream in any rework of `docs/upstream-canon.md` or `templates/canonical.md`. 
+The conceptual core derives from Joel Parker Henderson's [canonical decision-record repo](https://github.com/joelparkerhenderson/decision-record). Preserve attribution to upstream in any rework of `docs/explanation/why-decision-records.md` or `templates/canonical.md`. ## License diff --git a/LICENSE b/LICENSE index 30603ec..04d47e0 100644 --- a/LICENSE +++ b/LICENSE @@ -22,8 +22,8 @@ SOFTWARE. --- -The preserved canonical material in `docs/upstream-canon.md` and the -canonical decision record template at `templates/canonical.md` derive from +The preserved canonical material in `docs/explanation/why-decision-records.md` +and the canonical decision record template at `templates/canonical.md` derive from the upstream work of Joel Parker Henderson: <https://github.com/joelparkerhenderson/decision-record>. That material should be attributed to its original author; see CITATION.cff. diff --git a/README.md b/README.md index 8a8a886..4a14326 100644 --- a/README.md +++ b/README.md @@ -4,7 +4,7 @@ This repository is a Claude Code plugin + bundled MCP server. It runs inside a fresh or template repo, partners with a human and an AI agent, and produces an executable MVP plan: a scoped manifest, a set of accepted decision records, and a dependency-aware task graph. Output goes to Linear (primary) or stays as filesystem artifacts (fallback). -This project is a derivative of [Joel Parker Henderson's canonical decision-record repo](https://github.com/joelparkerhenderson/decision-record). The canonical explanation of what a DR is and why it matters is preserved at [`docs/upstream-canon.md`](docs/upstream-canon.md). What this fork adds is **enforcement**: workflows, tools, and a state machine that make DRs a non-skippable part of planning with an agentic system. +This project is a derivative of [Joel Parker Henderson's canonical decision-record repo](https://github.com/joelparkerhenderson/decision-record). 
The canonical explanation of what a DR is and why it matters is preserved at [`docs/explanation/why-decision-records.md`](docs/explanation/why-decision-records.md). What this fork adds is **enforcement**: workflows, tools, and a state machine that make DRs a non-skippable part of planning with an agentic system. ## What you get @@ -17,7 +17,16 @@ This project is a derivative of [Joel Parker Henderson's canonical decision-reco ## Status -Active development — first usable cut is in. The pipeline is functional end-to-end (intake → scope → decisions → tasks → handoff to filesystem or Linear). See [`docs/quickstart.md`](docs/quickstart.md) for the five-minute walkthrough, [`docs/usage.md`](docs/usage.md) for the full interaction model, and [`docs/architecture.md`](docs/architecture.md) for the data model. +Active development — first usable cut is in. The pipeline is functional end-to-end (intake → scope → decisions → tasks → handoff to filesystem or Linear). A standalone CLI (`decision-record`) ships alongside the Claude Code plugin and MCP server. + +## Documentation + +Docs follow the [Diátaxis](https://diataxis.fr) framework — start at [`docs/README.md`](docs/README.md) to orient. + +- **Brand new?** → [`docs/tutorials/your-first-plan.md`](docs/tutorials/your-first-plan.md) is a 15-minute end-to-end walkthrough. +- **How do I do X?** → [`docs/how-to/`](docs/how-to/) (install, run the CLI, configure providers, hand off to Linear, calibrate gates). +- **What's the exact spec?** → [`docs/reference/`](docs/reference/) (CLI flags, MCP tools, data model, gates). +- **Why is it built this way?** → [`docs/explanation/`](docs/explanation/) (design rationale, the five phases, why decision records). ## How it's structured @@ -58,18 +67,26 @@ npm install npm run build ``` -Then either link as a Claude Code plugin (symlink the repo into `~/.claude/plugins/decision-record/`) or run the MCP server standalone via `node /path/to/decision-record/server/dist/index.js`. 
Full instructions: [`docs/quickstart.md`](docs/quickstart.md). +Then either: +- Use the **standalone CLI**: `export OPENAI_API_KEY=… && node dist/cli.js --idea "your idea here"` +- Use the **Claude Code plugin**: symlink the repo into `~/.claude/plugins/decision-record/` and run `/plan` inside Claude Code. + +Full install instructions: [`docs/how-to/install.md`](docs/how-to/install.md). First-run walkthrough: [`docs/tutorials/your-first-plan.md`](docs/tutorials/your-first-plan.md). (A published marketplace release is on the roadmap.) +## Benchmarks + +We use a canonical prompt — an AI-driven roguelike POC — to spot regressions as the system evolves. See [`benchmarks/`](benchmarks/) for the prompt, expected output shape, and a `run.sh` to re-run it. + ## Contributing See [CONTRIBUTING.md](CONTRIBUTING.md). Issues and pull requests welcome. ## Acknowledgments -The conceptual core — what a decision record is, the canonical template structure, the teamwork model around DRs — is the work of [Joel Parker Henderson](https://joelparkerhenderson.com). See [`docs/upstream-canon.md`](docs/upstream-canon.md) for the preserved canonical material, and [CITATION.cff](CITATION.cff) for citation metadata. +The conceptual core — what a decision record is, the canonical template structure, the teamwork model around DRs — is the work of [Joel Parker Henderson](https://joelparkerhenderson.com). See [`docs/explanation/why-decision-records.md`](docs/explanation/why-decision-records.md) for the preserved canonical material, and [CITATION.cff](CITATION.cff) for citation metadata. ## License -[MIT](LICENSE) — for the code, schemas, and tooling in this repository. The preserved canonical content in `docs/upstream-canon.md` and the canonical template at `templates/canonical.md` derive from upstream and should be attributed to Joel Parker Henderson per CITATION.cff. +[MIT](LICENSE) — for the code, schemas, and tooling in this repository. 
The preserved canonical content in `docs/explanation/why-decision-records.md` and the canonical template at `templates/canonical.md` derive from upstream and should be attributed to Joel Parker Henderson per CITATION.cff. diff --git a/benchmarks/README.md b/benchmarks/README.md new file mode 100644 index 0000000..a5416ca --- /dev/null +++ b/benchmarks/README.md @@ -0,0 +1,32 @@ +# Benchmarks + +Canonical prompts we run against the decision-record planning pipeline to catch regressions as the system evolves. + +| Benchmark | Prompt | Effort | Purpose | +|---|---|---|---| +| [roguelike-ai-poc](roguelike-ai-poc/) | AI-driven roguelike where the agent plays the game | `poc` | Exercises all five pipeline phases on a small, well-bounded problem. The original dogfood case. | + +## How to run a benchmark + +```bash +cd benchmarks/roguelike-ai-poc +./run.sh +``` + +Each benchmark has: + +- `prompt.md` — the exact idea, effort level, and what "good output" looks like +- `reference/` — a baseline artifact snapshot from a canonical run +- `run.sh` — one-shot runner that fires the CLI against a fresh tmp dir + +## What we look for when comparing runs + +Each benchmark's `prompt.md` defines its own success criteria. Generally: + +- Pipeline reaches `handed-off` +- Decision count and shape match expectations for the effort tier +- Tasks are vertical slices, every leaf has a decision ref, graph validates +- Render artifacts are emitted (Markdown + HTML) +- Event log is coherent + +These benchmarks are **not unit tests** — they're regression observability. Different runs will produce slightly different plans and that's by design. Treat the reference as "shape we expect," not "bytes we require." 
diff --git a/benchmarks/roguelike-ai-poc/prompt.md b/benchmarks/roguelike-ai-poc/prompt.md new file mode 100644 index 0000000..745bdb9 --- /dev/null +++ b/benchmarks/roguelike-ai-poc/prompt.md @@ -0,0 +1,63 @@ +# Benchmark: roguelike-ai-poc + +This is the canonical benchmark for the decision-record planning pipeline. We re-run it as the system evolves to spot regressions in plan quality, gate behavior, agent prompts, and rendering. + +## The prompt + +**Idea (free-form):** + +> A minimal roguelike where the player primes an AI agent with a strategy, then the agent autonomously navigates a single ASCII-rendered room over a tick system until it wins the objective or dies. Goal: prove the agent-as-player concept with the smallest viable surface area. + +**Effort level:** `poc` + +## Invocation + +```bash +decision-record \ + --title "AI-driven roguelike POC" \ + --description "$(cat <<'EOF' +A minimal roguelike where the player primes an AI agent with a strategy, then the agent autonomously navigates a single ASCII-rendered room over a tick system until it wins the objective or dies. Goal: prove the agent-as-player concept with the smallest viable surface area. +EOF +)" \ + --effort poc \ + --cwd ./tmp-roguelike-bench \ + --yes +``` + +Or the one-shot wrapper: `./run.sh` (creates a fresh tmp dir, runs the CLI, prints where the artifacts landed). + +## What "good output" looks like + +A run is healthy if the produced plan: + +- **Pipeline reaches `handed-off`** — every gate passes, sign-offs recorded, project finalized. +- **3-5 significant decisions** are proposed and accepted — language, world representation, agent action contract, tick-loop control. (Not 1; not 12.) +- **5-8 vertical-slice tasks** — bootstrap → world → renderer → agent client → action handlers → game loop → CLI entry. Every leaf ≤ 16h (poc cap). Every task references at least one accepted DR. 
+- **The seed library is consulted** for at least the language decision (`dr_seed_search` + `dr_seed_load` on `language-choice`). +- **Graph validates clean** — no cycles, no orphan deps, no missing decision refs. +- **Artifacts emitted** — `dr/project.json`, `dr/decisions/*.json`, `dr/tasks/*.json`, rendered `.md` siblings, `dr/index.html`. `.dr/events.jsonl` contains a coherent audit trail. + +## Reference snapshot + +`./reference/` holds the artifacts from the canonical run produced by hand-driving the MCP tools (2026-05-16, the dogfood test that originally produced this benchmark). Treat it as a "this is what good looks like" baseline, not a strict equality target — different agent runs will pick slightly different positions, phrasing, and task decomposition, and that's fine. + +When comparing a new run against `./reference/`: + +- **Same final phase, gate decisions, event mix** → no regression. +- **More/fewer decisions or tasks** → check whether the new run is denser/sparser appropriately or whether the agent over- or under-decomposed. +- **Different selected positions** → fine if defensible; concerning if the argument is weaker. +- **Missing seed usage** → bug or prompt drift; the agent should reach for `language-choice` here. +- **Tasks without decision refs** → regression. Every task must link to a DR. +- **Validation failures** → regression. The graph must validate. 
+ +## What this benchmark exercises + +| Surface | Coverage | +|---|---| +| Phase machine | All five transitions: intake → scoping → deciding → decomposing → handing-off → handed-off | +| Seed library | At least one `dr_seed_load` (language-choice) | +| Decision lifecycle | propose → update with position + argument → accept (no review under poc preset) | +| Task graph | Multi-node dependency chain with decision_refs | +| Gates | `min_tasks=3`, `max_task_estimate_hours=16`, `require_human_signoff_phases=['handing-off']` | +| Render | Markdown per record + static HTML index | +| Handoff | Filesystem path (Linear path is exercised by separate live test) | diff --git a/benchmarks/roguelike-ai-poc/reference/decisions/0001-choose-the-implementation-language.json b/benchmarks/roguelike-ai-poc/reference/decisions/0001-choose-the-implementation-language.json new file mode 100644 index 0000000..f07d744 --- /dev/null +++ b/benchmarks/roguelike-ai-poc/reference/decisions/0001-choose-the-implementation-language.json @@ -0,0 +1,115 @@ +{ + "id": "0001-choose-the-implementation-language", + "number": 1, + "slug": "choose-the-implementation-language", + "title": "Choose the implementation language", + "status": "accepted", + "template_variant": "architecture", + "created_at": "2026-05-17T04:13:38.681Z", + "updated_at": "2026-05-17T04:13:38.685Z", + "summary": "Decide the primary implementation language for the project.", + "issue": "Every other foundational decision (runtime, package manager, framework choices, testing tools) flows from the language choice. Picking this early and explicitly avoids drift.", + "assumptions": [ + "Team has existing language strengths to lean on.", + "Project lifespan is long enough that hiring and onboarding matter.", + "Ecosystem maturity matters for the project's domain." 
+ ], + "constraints": [ + "Team's current expertise.", + "Target runtime environments (browser, server, native, embedded).", + "Performance and memory budgets.", + "Licensing or compliance restrictions on language ecosystems." + ], + "positions": [ + { + "title": "TypeScript", + "description": "Strongly typed JavaScript. Best for full-stack web work, ubiquitous tooling.", + "pros": [ + "Ubiquitous in web", + "Strong types catch errors early", + "Massive ecosystem", + "Frontend/backend code sharing" + ], + "cons": [ + "Build step overhead", + "Type system can be over-engineered", + "Slower than native languages for hot paths" + ], + "links": [] + }, + { + "title": "Python", + "description": "Dynamic, batteries-included. Best for data work, scripting, ML, fast prototypes.", + "pros": [ + "Excellent ML/data ecosystem", + "Fast to write", + "Readable", + "Huge stdlib" + ], + "cons": [ + "Slow runtime without C extensions", + "GIL limits concurrency", + "Dynamic typing → runtime errors" + ], + "links": [] + }, + { + "title": "Go", + "description": "Statically typed, compiled, built for concurrent services.", + "pros": [ + "Simple language", + "Single binary deployment", + "Strong concurrency primitives", + "Fast compile times" + ], + "cons": [ + "Generics still maturing", + "Verbose error handling", + "Less rich third-party ecosystem than JS/Python" + ], + "links": [] + }, + { + "title": "Rust", + "description": "Memory-safe systems language. Best for performance-critical or systems work.", + "pros": [ + "No GC, predictable performance", + "Memory safety", + "Excellent tooling (cargo)", + "Strong types" + ], + "cons": [ + "Steep learning curve", + "Slower to ship initial features", + "Compile times can be long" + ], + "links": [] + } + ], + "opinions": [], + "argument": "Python is fastest to write for a single-script game-loop POC. The OpenAI SDK + a tiny terminal renderer fit naturally; no build step or transpile loop slows iteration. 
Team is comfortable with Python and the project never needs to leave a single repo.", + "selected_position": "Python", + "implications": [ + "Use the official openai Python SDK for agent calls.", + "Single-file or small-module layout; no package manager beyond pip/uv.", + "Pin to Python 3.11+ for ergonomic match-statement parsing of agent actions." + ], + "depends_on": [], + "related_decisions": [], + "related_artifacts": [], + "review": [], + "sign_off": { + "by": "human", + "actor": "kj", + "at": "2026-05-17T04:13:38.685Z", + "notes": "poc preset, no review required" + }, + "seed_origin": "language-choice", + "tags": [ + "foundation", + "poc", + "foundation", + "architecture", + "stack" + ] +} diff --git a/benchmarks/roguelike-ai-poc/reference/decisions/0001-choose-the-implementation-language.md b/benchmarks/roguelike-ai-poc/reference/decisions/0001-choose-the-implementation-language.md new file mode 100644 index 0000000..8a3a4b3 --- /dev/null +++ b/benchmarks/roguelike-ai-poc/reference/decisions/0001-choose-the-implementation-language.md @@ -0,0 +1,120 @@ +# 0001-choose-the-implementation-language — Choose the implementation language + +| Field | Value | +| --- | --- | +| Status | `accepted` | +| Template | `architecture` | +| Updated | 2026-05-17T04:13:38.685Z | +| Selected | **Python** | +| Depends on | _(none)_ | + +## Summary + +Decide the primary implementation language for the project. + +## Issue + +Every other foundational decision (runtime, package manager, framework choices, testing tools) flows from the language choice. Picking this early and explicitly avoids drift. + +## Assumptions + +- Team has existing language strengths to lean on. +- Project lifespan is long enough that hiring and onboarding matter. +- Ecosystem maturity matters for the project's domain. + +## Constraints + +- Team's current expertise. +- Target runtime environments (browser, server, native, embedded). +- Performance and memory budgets. 
+- Licensing or compliance restrictions on language ecosystems. + +## Positions + +### TypeScript + +Strongly typed JavaScript. Best for full-stack web work, ubiquitous tooling. + +**Pros** + +- Ubiquitous in web +- Strong types catch errors early +- Massive ecosystem +- Frontend/backend code sharing + +**Cons** + +- Build step overhead +- Type system can be over-engineered +- Slower than native languages for hot paths + +### Python ✅ + +Dynamic, batteries-included. Best for data work, scripting, ML, fast prototypes. + +**Pros** + +- Excellent ML/data ecosystem +- Fast to write +- Readable +- Huge stdlib + +**Cons** + +- Slow runtime without C extensions +- GIL limits concurrency +- Dynamic typing → runtime errors + +### Go + +Statically typed, compiled, built for concurrent services. + +**Pros** + +- Simple language +- Single binary deployment +- Strong concurrency primitives +- Fast compile times + +**Cons** + +- Generics still maturing +- Verbose error handling +- Less rich third-party ecosystem than JS/Python + +### Rust + +Memory-safe systems language. Best for performance-critical or systems work. + +**Pros** + +- No GC, predictable performance +- Memory safety +- Excellent tooling (cargo) +- Strong types + +**Cons** + +- Steep learning curve +- Slower to ship initial features +- Compile times can be long + +## Argument + +Python is fastest to write for a single-script game-loop POC. The OpenAI SDK + a tiny terminal renderer fit naturally; no build step or transpile loop slows iteration. Team is comfortable with Python and the project never needs to leave a single repo. + +## Implications + +- Use the official openai Python SDK for agent calls. +- Single-file or small-module layout; no package manager beyond pip/uv. +- Pin to Python 3.11+ for ergonomic match-statement parsing of agent actions. 
+ +## Sign-off + +- **By:** kj (human) +- **At:** 2026-05-17T04:13:38.685Z +- **Notes:** poc preset, no review required + +--- + +_Instantiated from seed: `language-choice`_ diff --git a/benchmarks/roguelike-ai-poc/reference/decisions/0002-define-the-world-representation-and-renderer.json b/benchmarks/roguelike-ai-poc/reference/decisions/0002-define-the-world-representation-and-renderer.json new file mode 100644 index 0000000..7afe41a --- /dev/null +++ b/benchmarks/roguelike-ai-poc/reference/decisions/0002-define-the-world-representation-and-renderer.json @@ -0,0 +1,85 @@ +{ + "id": "0002-define-the-world-representation-and-renderer", + "number": 2, + "slug": "define-the-world-representation-and-renderer", + "title": "Define the world representation and renderer", + "status": "accepted", + "template_variant": "data-model", + "created_at": "2026-05-17T04:13:38.686Z", + "updated_at": "2026-05-17T04:13:38.688Z", + "summary": "How the room is stored in memory and rendered to the terminal each tick.", + "issue": "The world is small (one 10×10 room) but the representation must support: easy frame rendering, fast collision/hazard checks, and a stable serialization that the agent can read on each tick. Pick a model now so the action handlers and renderer can converge.", + "assumptions": [ + "10×10 fixed grid", + "Single player entity", + "Static tiles set at startup", + "Frame fits in a single terminal redraw" + ], + "constraints": [ + "Frame must be readable both by humans and the LLM", + "No external graphics libraries" + ], + "positions": [ + { + "title": "Nested list of chars", + "description": "world: list[list[str]] indexed by [y][x]. 
Player position stored separately.", + "pros": [ + "Simplest possible", + "Trivial to mutate", + "Renders by row-join" + ], + "cons": [ + "No type safety on tile semantics", + "Have to scan grid for entity positions" + ], + "links": [] + }, + { + "title": "Tile-grid + entity dict", + "description": "static_tiles: list[list[str]] for walls/floor/hazard/exit; entities: dict[id, {pos, hp, glyph}] overlaid at render time.", + "pros": [ + "Separates static map from dynamic state", + "Easy to add entities later if needed", + "Clean serialization to JSON" + ], + "cons": [ + "Two structures to keep consistent", + "Slightly more code" + ], + "links": [] + }, + { + "title": "Single 2D numpy array + glyph table", + "description": "Each cell is an int; render by mapping ints to glyphs.", + "pros": [ + "Compact", + "Fast", + "Numpy is familiar" + ], + "cons": [ + "Numpy is overkill for 10×10", + "Adds a dep we do not otherwise need", + "Less Pythonic for tiny data" + ], + "links": [] + } + ], + "opinions": [], + "argument": "Static map + entity overlay is the simplest model that survives the day-2 question can we add a second entity? without a rewrite. It serializes naturally to JSON for the LLM payload and keeps render code in one row-join.", + "selected_position": "Tile-grid + entity dict", + "implications": [ + "Tile glyphs: # wall, . floor, X hazard, > exit; entities overlay (@ for player).", + "Each tick the renderer composes static_tiles + entity glyphs at their positions.", + "JSON state sent to the agent: { frame: [], hp, tick, exit_pos, player_pos }." 
+ ], + "depends_on": [], + "related_decisions": [], + "related_artifacts": [], + "review": [], + "sign_off": { + "by": "human", + "actor": "kj", + "at": "2026-05-17T04:13:38.688Z" + }, + "tags": [] +} diff --git a/benchmarks/roguelike-ai-poc/reference/decisions/0002-define-the-world-representation-and-renderer.md b/benchmarks/roguelike-ai-poc/reference/decisions/0002-define-the-world-representation-and-renderer.md new file mode 100644 index 0000000..dfbf675 --- /dev/null +++ b/benchmarks/roguelike-ai-poc/reference/decisions/0002-define-the-world-representation-and-renderer.md @@ -0,0 +1,92 @@ +# 0002-define-the-world-representation-and-renderer — Define the world representation and renderer + +| Field | Value | +| --- | --- | +| Status | `accepted` | +| Template | `data-model` | +| Updated | 2026-05-17T04:13:38.688Z | +| Selected | **Tile-grid + entity dict** | +| Depends on | _(none)_ | + +## Summary + +How the room is stored in memory and rendered to the terminal each tick. + +## Issue + +The world is small (one 10×10 room) but the representation must support: easy frame rendering, fast collision/hazard checks, and a stable serialization that the agent can read on each tick. Pick a model now so the action handlers and renderer can converge. + +## Assumptions + +- 10×10 fixed grid +- Single player entity +- Static tiles set at startup +- Frame fits in a single terminal redraw + +## Constraints + +- Frame must be readable both by humans and the LLM +- No external graphics libraries + +## Positions + +### Nested list of chars + +world: list[list[str]] indexed by [y][x]. Player position stored separately. + +**Pros** + +- Simplest possible +- Trivial to mutate +- Renders by row-join + +**Cons** + +- No type safety on tile semantics +- Have to scan grid for entity positions + +### Tile-grid + entity dict ✅ + +static_tiles: list[list[str]] for walls/floor/hazard/exit; entities: dict[id, {pos, hp, glyph}] overlaid at render time. 
+ +**Pros** + +- Separates static map from dynamic state +- Easy to add entities later if needed +- Clean serialization to JSON + +**Cons** + +- Two structures to keep consistent +- Slightly more code + +### Single 2D numpy array + glyph table + +Each cell is an int; render by mapping ints to glyphs. + +**Pros** + +- Compact +- Fast +- Numpy is familiar + +**Cons** + +- Numpy is overkill for 10×10 +- Adds a dep we do not otherwise need +- Less Pythonic for tiny data + +## Argument + +Static map + entity overlay is the simplest model that survives the day-2 question can we add a second entity? without a rewrite. It serializes naturally to JSON for the LLM payload and keeps render code in one row-join. + +## Implications + +- Tile glyphs: # wall, . floor, X hazard, > exit; entities overlay (@ for player). +- Each tick the renderer composes static_tiles + entity glyphs at their positions. +- JSON state sent to the agent: { frame: [], hp, tick, exit_pos, player_pos }. + +## Sign-off + +- **By:** kj (human) +- **At:** 2026-05-17T04:13:38.688Z diff --git a/benchmarks/roguelike-ai-poc/reference/decisions/0003-define-the-agent-action-contract.json b/benchmarks/roguelike-ai-poc/reference/decisions/0003-define-the-agent-action-contract.json new file mode 100644 index 0000000..0e98040 --- /dev/null +++ b/benchmarks/roguelike-ai-poc/reference/decisions/0003-define-the-agent-action-contract.json @@ -0,0 +1,83 @@ +{ + "id": "0003-define-the-agent-action-contract", + "number": 3, + "slug": "define-the-agent-action-contract", + "title": "Define the agent action contract", + "status": "accepted", + "template_variant": "architecture", + "created_at": "2026-05-17T04:13:38.689Z", + "updated_at": "2026-05-17T04:13:38.690Z", + "summary": "How the LLM receives the world state per tick and how it returns the chosen action.", + "issue": "The agent must produce a structured, validated action every tick. 
We need the protocol pinned so the game loop never has to guess what the agent meant.", + "assumptions": [ + "OpenAI-compatible API is the LLM transport", + "Strategy prompt is supplied once at startup", + "Per-tick latency budget ~2-5s is acceptable" + ], + "constraints": [ + "Action set is small (move N/S/E/W + noop)", + "Agent must not stall the game with malformed output", + "Must be debuggable from logs" + ], + "positions": [ + { + "title": "Plain-text response parsing", + "description": "Agent returns N/S/E/W/noop as plain text; we parse first token.", + "pros": [ + "Lowest token cost", + "Works with any model" + ], + "cons": [ + "Brittle to extra punctuation/prose", + "No reasoning surface", + "Hard to audit why" + ], + "links": [] + }, + { + "title": "Tool-call (function calling) with one tool: do_action(direction)", + "description": "Define a single OpenAI tool; agent invokes it once per tick with a strict enum direction.", + "pros": [ + "Schema-validated", + "Free reasoning text alongside the call", + "Easy to extend with new actions later" + ], + "cons": [ + "Slightly more tokens per call", + "Requires a model that supports function calling" + ], + "links": [] + }, + { + "title": "JSON-only response with output_config", + "description": "Force agent to emit {\"action\":\"N\",\"reason\":\"…\"} via structured outputs.", + "pros": [ + "Schema-validated", + "Reasoning captured in same payload" + ], + "cons": [ + "Some providers do not honor strict mode", + "Slightly more setup than tool-call" + ], + "links": [] + } + ], + "opinions": [], + "argument": "Tool-calling is the cleanest contract: the model gets free-form reasoning in `content` AND a strict-enum action in `tool_calls`. We can log both, and extending to new actions later is just adding enum values. 
Plain-text parsing trades 100 tokens of savings for a constant brittleness tax.", + "selected_position": "Tool-call (function calling) with one tool: do_action(direction)", + "implications": [ + "Define tool `do_action` with input_schema requiring `direction` in {N,S,E,W,noop}.", + "Use tool_choice=\"required\" each tick to force a call.", + "Log the assistant message text (the reasoning) alongside the chosen direction for replay/debug." + ], + "depends_on": [], + "related_decisions": [], + "related_artifacts": [], + "review": [], + "sign_off": { + "by": "human", + "actor": "kj", + "at": "2026-05-17T04:13:38.690Z" + }, + "tags": [] +} diff --git a/benchmarks/roguelike-ai-poc/reference/decisions/0003-define-the-agent-action-contract.md b/benchmarks/roguelike-ai-poc/reference/decisions/0003-define-the-agent-action-contract.md new file mode 100644 index 0000000..1bd6e3a --- /dev/null +++ b/benchmarks/roguelike-ai-poc/reference/decisions/0003-define-the-agent-action-contract.md @@ -0,0 +1,90 @@ +# 0003-define-the-agent-action-contract — Define the agent action contract + +| Field | Value | +| --- | --- | +| Status | `accepted` | +| Template | `architecture` | +| Updated | 2026-05-17T04:13:38.690Z | +| Selected | **Tool-call (function calling) with one tool: do_action(direction)** | +| Depends on | _(none)_ | + +## Summary + +How the LLM receives the world state per tick and how it returns the chosen action. + +## Issue + +The agent must produce a structured, validated action every tick. We need the protocol pinned so the game loop never has to guess what the agent meant. 
+ +## Assumptions + +- OpenAI-compatible API is the LLM transport +- Strategy prompt is supplied once at startup +- Per-tick latency budget ~2-5s is acceptable + +## Constraints + +- Action set is small (move N/S/E/W + noop) +- Agent must not stall the game with malformed output +- Must be debuggable from logs + +## Positions + +### Plain-text response parsing + +Agent returns N/S/E/W/noop as plain text; we parse first token. + +**Pros** + +- Lowest token cost +- Works with any model + +**Cons** + +- Brittle to extra punctuation/prose +- No reasoning surface +- Hard to audit why + +### Tool-call (function calling) with one tool: do_action(direction) ✅ + +Define a single OpenAI tool; agent invokes it once per tick with a strict enum direction. + +**Pros** + +- Schema-validated +- Free reasoning text alongside the call +- Easy to extend with new actions later + +**Cons** + +- Slightly more tokens per call +- Requires a model that supports function calling + +### JSON-only response with output_config + +Force agent to emit {"action":"N","reason":"…"} via structured outputs. + +**Pros** + +- Schema-validated +- Reasoning captured in same payload + +**Cons** + +- Some providers do not honor strict mode +- Slightly more setup than tool-call + +## Argument + +Tool-calling is the cleanest contract: the model gets free-form reasoning in `content` AND a strict-enum action in `tool_calls`. We can log both, and extending to new actions later is just adding enum values. Plain-text parsing trades 100 tokens of savings for a constant brittleness tax. + +## Implications + +- Define tool `do_action` with input_schema requiring `direction` in {N,S,E,W,noop}. +- Use tool_choice="required" each tick to force a call. +- Log the assistant message text (the reasoning) alongside the chosen direction for replay/debug. 
+ +## Sign-off + +- **By:** kj (human) +- **At:** 2026-05-17T04:13:38.690Z diff --git a/benchmarks/roguelike-ai-poc/reference/decisions/0004-define-the-tick-loop-and-termination-conditions.json b/benchmarks/roguelike-ai-poc/reference/decisions/0004-define-the-tick-loop-and-termination-conditions.json new file mode 100644 index 0000000..4f6becd --- /dev/null +++ b/benchmarks/roguelike-ai-poc/reference/decisions/0004-define-the-tick-loop-and-termination-conditions.json @@ -0,0 +1,68 @@ +{ + "id": "0004-define-the-tick-loop-and-termination-conditions", + "number": 4, + "slug": "define-the-tick-loop-and-termination-conditions", + "title": "Define the tick loop and termination conditions", + "status": "accepted", + "template_variant": "architecture", + "created_at": "2026-05-17T04:13:38.691Z", + "updated_at": "2026-05-17T04:13:38.692Z", + "summary": "How the game advances tick by tick, when it stops, and how the user observes it.", + "issue": "With an LLM in the loop, each tick is slow (~2-5s). We need a predictable loop with hard stops so the POC always terminates and is always watchable.", + "assumptions": [ + "One-player synchronous game", + "User runs the script in a terminal and watches frames", + "LLM calls happen on the same thread" + ], + "constraints": [ + "Must terminate on win, death, or step limit", + "Frame must visibly update each tick", + "Must not deadlock on a stuck agent" + ], + "positions": [ + { + "title": "Synchronous loop with step cap", + "description": "while not terminal: render → ask agent → apply → check win/death. 
Hard cap at N steps (e.g., 50).", + "pros": [ + "Simplest mental model", + "Easy to log", + "Predictable termination" + ], + "cons": [ + "UI freezes during LLM call (acceptable for POC)" + ], + "links": [] + }, + { + "title": "Async loop with timeout per tick", + "description": "Wrap each agent call in a 10s timeout; on timeout, treat as noop.", + "pros": [ + "Robust to slow API", + "Game keeps moving" + ], + "cons": [ + "More complex", + "Asyncio inside a CLI script is heavier than warranted" + ], + "links": [] + } + ], + "opinions": [], + "argument": "For a single-window terminal demo, synchronous is fine. Adding asyncio doubles the code size for no demo-visible benefit. The step cap protects against an agent that wanders forever and ensures every run terminates.", + "selected_position": "Synchronous loop with step cap", + "implications": [ + "Step cap = 50; on cap, exit with status \"timeout\" and final HP.", + "Use time.sleep(0.05) after each render so the user can see the frames advance.", + "Loop logs each tick to stdout: frame, action, reasoning, hp, tick#." 
+ ], + "depends_on": [], + "related_decisions": [], + "related_artifacts": [], + "review": [], + "sign_off": { + "by": "human", + "actor": "kj", + "at": "2026-05-17T04:13:38.692Z" + }, + "tags": [] +} diff --git a/benchmarks/roguelike-ai-poc/reference/decisions/0004-define-the-tick-loop-and-termination-conditions.md b/benchmarks/roguelike-ai-poc/reference/decisions/0004-define-the-tick-loop-and-termination-conditions.md new file mode 100644 index 0000000..0d83a25 --- /dev/null +++ b/benchmarks/roguelike-ai-poc/reference/decisions/0004-define-the-tick-loop-and-termination-conditions.md @@ -0,0 +1,74 @@ +# 0004-define-the-tick-loop-and-termination-conditions — Define the tick loop and termination conditions + +| Field | Value | +| --- | --- | +| Status | `accepted` | +| Template | `architecture` | +| Updated | 2026-05-17T04:13:38.692Z | +| Selected | **Synchronous loop with step cap** | +| Depends on | _(none)_ | + +## Summary + +How the game advances tick by tick, when it stops, and how the user observes it. + +## Issue + +With an LLM in the loop, each tick is slow (~2-5s). We need a predictable loop with hard stops so the POC always terminates and is always watchable. + +## Assumptions + +- One-player synchronous game +- User runs the script in a terminal and watches frames +- LLM calls happen on the same thread + +## Constraints + +- Must terminate on win, death, or step limit +- Frame must visibly update each tick +- Must not deadlock on a stuck agent + +## Positions + +### Synchronous loop with step cap ✅ + +while not terminal: render → ask agent → apply → check win/death. Hard cap at N steps (e.g., 50). + +**Pros** + +- Simplest mental model +- Easy to log +- Predictable termination + +**Cons** + +- UI freezes during LLM call (acceptable for POC) + +### Async loop with timeout per tick + +Wrap each agent call in a 10s timeout; on timeout, treat as noop. 
+ +**Pros** + +- Robust to slow API +- Game keeps moving + +**Cons** + +- More complex +- Asyncio inside a CLI script is heavier than warranted + +## Argument + +For a single-window terminal demo, synchronous is fine. Adding asyncio doubles the code size for no demo-visible benefit. The step cap protects against an agent that wanders forever and ensures every run terminates. + +## Implications + +- Step cap = 50; on cap, exit with status "timeout" and final HP. +- Use time.sleep(0.05) after each render so the user can see the frames advance. +- Loop logs each tick to stdout: frame, action, reasoning, hp, tick#. + +## Sign-off + +- **By:** kj (human) +- **At:** 2026-05-17T04:13:38.692Z diff --git a/benchmarks/roguelike-ai-poc/reference/events.jsonl b/benchmarks/roguelike-ai-poc/reference/events.jsonl new file mode 100644 index 0000000..42ab62f --- /dev/null +++ b/benchmarks/roguelike-ai-poc/reference/events.jsonl @@ -0,0 +1,33 @@ +{"at":"2026-05-17T04:12:02.030Z","actor":"agent","kind":"project_initialized","entity_kind":"project","entity_id":"ai-driven-roguelike-poc","payload":{"effort_level":"poc"}} +{"at":"2026-05-17T04:12:40.988Z","actor":"agent","kind":"phase_advanced","entity_kind":"phase","entity_id":"scoping","payload":{"from":"intake","to":"scoping"}} +{"at":"2026-05-17T04:12:40.991Z","actor":"agent","kind":"scope_updated","entity_kind":"project","entity_id":"ai-driven-roguelike-poc","payload":{"scope":{"in_scope":["A 10×10 ASCII-rendered single room with walls (#), floor (.), player (@), exit (>), and a hazard tile (X)","Tick-based game loop: each tick prints the frame, then queries the agent for one action","A small action vocabulary: move N/S/E/W and noop","Player has HP; stepping on hazard removes HP; reaching exit = win, HP=0 = death","Strategy prompt provided once at startup, fed to the agent as system prompt for every tick","LLM agent receives current frame + HP + tick number, returns a single action"],"out_of_scope":["Multiple rooms, dungeon 
generation, procedural levels","Combat with enemies, NPCs, monsters","Inventory, items, equipment","Save/load, persistence","Visual UI beyond ASCII to terminal","Multiplayer, networking","Self-improving agent loops or RL training"],"success_criteria":["A user can run a single command, supply a strategy prompt, and watch the agent play until win or death","Win and death paths both observed in manual playtests","Different strategy prompts produce visibly different agent behavior","End-to-end run completes in under 60 seconds wall time on a typical OpenAI API call"],"nice_to_have":["Configurable room layout from a text file","Replay log written to disk for post-hoc inspection","A few preset strategy prompts to demo (cautious, greedy, exploratory)"]}}} +{"at":"2026-05-17T04:12:40.991Z","actor":"agent","kind":"phase_advanced","entity_kind":"phase","entity_id":"deciding","payload":{"from":"scoping","to":"deciding"}} +{"at":"2026-05-17T04:13:38.681Z","actor":"agent","kind":"seed_loaded","entity_kind":"decision","entity_id":"0001-choose-the-implementation-language","payload":{"seed_name":"language-choice"}} +{"at":"2026-05-17T04:13:38.684Z","actor":"agent","kind":"decision_updated","entity_kind":"decision","entity_id":"0001-choose-the-implementation-language","payload":{"changed":["argument","selected_position","implications"]}} +{"at":"2026-05-17T04:13:38.685Z","actor":"human","actor_name":"kj","kind":"decision_accepted","entity_kind":"decision","entity_id":"0001-choose-the-implementation-language"} +{"at":"2026-05-17T04:13:38.686Z","actor":"agent","kind":"decision_proposed","entity_kind":"decision","entity_id":"0002-define-the-world-representation-and-renderer","payload":{"template_variant":"data-model"}} +{"at":"2026-05-17T04:13:38.687Z","actor":"agent","kind":"decision_updated","entity_kind":"decision","entity_id":"0002-define-the-world-representation-and-renderer","payload":{"changed":["argument","selected_position","implications"]}} 
+{"at":"2026-05-17T04:13:38.688Z","actor":"human","actor_name":"kj","kind":"decision_accepted","entity_kind":"decision","entity_id":"0002-define-the-world-representation-and-renderer"} +{"at":"2026-05-17T04:13:38.689Z","actor":"agent","kind":"decision_proposed","entity_kind":"decision","entity_id":"0003-define-the-agent-action-contract","payload":{"template_variant":"architecture"}} +{"at":"2026-05-17T04:13:38.689Z","actor":"agent","kind":"decision_updated","entity_kind":"decision","entity_id":"0003-define-the-agent-action-contract","payload":{"changed":["argument","selected_position","implications"]}} +{"at":"2026-05-17T04:13:38.690Z","actor":"human","actor_name":"kj","kind":"decision_accepted","entity_kind":"decision","entity_id":"0003-define-the-agent-action-contract"} +{"at":"2026-05-17T04:13:38.691Z","actor":"agent","kind":"decision_proposed","entity_kind":"decision","entity_id":"0004-define-the-tick-loop-and-termination-conditions","payload":{"template_variant":"architecture"}} +{"at":"2026-05-17T04:13:38.692Z","actor":"agent","kind":"decision_updated","entity_kind":"decision","entity_id":"0004-define-the-tick-loop-and-termination-conditions","payload":{"changed":["argument","selected_position","implications"]}} +{"at":"2026-05-17T04:13:38.692Z","actor":"human","actor_name":"kj","kind":"decision_accepted","entity_kind":"decision","entity_id":"0004-define-the-tick-loop-and-termination-conditions"} +{"at":"2026-05-17T04:13:38.694Z","actor":"agent","kind":"phase_advanced","entity_kind":"phase","entity_id":"decomposing","payload":{"from":"deciding","to":"decomposing"}} +{"at":"2026-05-17T04:14:22.524Z","actor":"agent","kind":"task_proposed","entity_kind":"task","entity_id":"T0001-bootstrap-repository","payload":{"decision_refs":["0001-choose-the-implementation-language"],"depends_on":[]}} 
+{"at":"2026-05-17T04:14:22.526Z","actor":"agent","kind":"task_proposed","entity_kind":"task","entity_id":"T0002-implement-world-module-tile-grid-entity-dict","payload":{"decision_refs":["0002-define-the-world-representation-and-renderer"],"depends_on":["T0001-bootstrap-repository"]}} +{"at":"2026-05-17T04:14:22.527Z","actor":"agent","kind":"task_proposed","entity_kind":"task","entity_id":"T0003-implement-frame-renderer","payload":{"decision_refs":["0002-define-the-world-representation-and-renderer"],"depends_on":["T0002-implement-world-module-tile-grid-entity-dict"]}} +{"at":"2026-05-17T04:14:22.528Z","actor":"agent","kind":"task_proposed","entity_kind":"task","entity_id":"T0004-implement-openai-agent-client","payload":{"decision_refs":["0003-define-the-agent-action-contract"],"depends_on":["T0001-bootstrap-repository"]}} +{"at":"2026-05-17T04:14:22.529Z","actor":"agent","kind":"task_proposed","entity_kind":"task","entity_id":"T0005-implement-action-handlers-and-termination-checks","payload":{"decision_refs":["0002-define-the-world-representation-and-renderer"],"depends_on":["T0002-implement-world-module-tile-grid-entity-dict"]}} +{"at":"2026-05-17T04:14:22.530Z","actor":"agent","kind":"task_proposed","entity_kind":"task","entity_id":"T0006-implement-the-tick-based-game-loop","payload":{"decision_refs":["0004-define-the-tick-loop-and-termination-conditions","0002-define-the-world-representation-and-renderer"],"depends_on":["T0003-implement-frame-renderer","T0004-implement-openai-agent-client","T0005-implement-action-handlers-and-termination-checks"]}} +{"at":"2026-05-17T04:14:22.532Z","actor":"agent","kind":"task_proposed","entity_kind":"task","entity_id":"T0007-implement-cli-entry-script","payload":{"decision_refs":["0001-choose-the-implementation-language","0004-define-the-tick-loop-and-termination-conditions"],"depends_on":["T0006-implement-the-tick-based-game-loop"]}} 
+{"at":"2026-05-17T04:14:22.534Z","actor":"agent","kind":"graph_validated","payload":{"valid":true,"task_count":7,"error_count":0,"warning_count":0}} +{"at":"2026-05-17T04:14:30.972Z","actor":"agent","kind":"graph_validated","payload":{"valid":true,"task_count":7,"error_count":0,"warning_count":0}} +{"at":"2026-05-17T04:14:37.477Z","actor":"agent","kind":"graph_validated","payload":{"valid":true,"task_count":7,"error_count":0,"warning_count":0}} +{"at":"2026-05-17T04:14:44.523Z","actor":"human","actor_name":"kj","kind":"phase_advanced","entity_kind":"phase","entity_id":"handing-off","payload":{"from":"decomposing","to":"handing-off","notes":"All decisions accepted, graph validates clean."}} +{"at":"2026-05-17T04:14:44.523Z","actor":"human","actor_name":"kj","kind":"sign_off_recorded","entity_kind":"phase","entity_id":"handing-off"} +{"at":"2026-05-17T04:14:44.538Z","actor":"agent","kind":"render_run","payload":{"decisions":4,"tasks":7}} +{"at":"2026-05-17T04:14:44.540Z","actor":"human","actor_name":"kj","kind":"export_started","entity_kind":"project","entity_id":"ai-driven-roguelike-poc","payload":{"target":"filesystem"}} +{"at":"2026-05-17T04:14:44.540Z","actor":"human","actor_name":"kj","kind":"export_completed","entity_kind":"project","entity_id":"ai-driven-roguelike-poc","payload":{"target":"filesystem","issue_count":7,"document_count":4}} +{"at":"2026-05-17T04:14:44.544Z","actor":"agent","kind":"render_run","payload":{"decisions":4,"tasks":7}} diff --git a/benchmarks/roguelike-ai-poc/reference/index.html b/benchmarks/roguelike-ai-poc/reference/index.html new file mode 100644 index 0000000..75276fc --- /dev/null +++ b/benchmarks/roguelike-ai-poc/reference/index.html @@ -0,0 +1,231 @@ + + + + + +AI-driven roguelike POC — Decision Record + + + +
+ +
+
ai-driven-roguelike-poc
+

AI-driven roguelike POC

+
+ Phase: handed-off + Effort: poc + Updated: 2026-05-17T04:14:44.540Z + Decisions: 4 (4 accepted) + Tasks: 7 (0 done) +
+
+ +

A minimal roguelike where the player primes an AI agent with a strategy, then the agent autonomously navigates a single ASCII-rendered room over a tick system until it wins the objective or dies. Goal: prove the agent-as-player concept with the smallest viable surface area.

+ +
+

Scope

+
+
+

In scope

+
  • A 10×10 ASCII-rendered single room with walls (#), floor (.), player (@), exit (>), and a hazard tile (X)
  • Tick-based game loop: each tick prints the frame, then queries the agent for one action
  • A small action vocabulary: move N/S/E/W and noop
  • Player has HP; stepping on hazard removes HP; reaching exit = win, HP=0 = death
  • Strategy prompt provided once at startup, fed to the agent as system prompt for every tick
  • LLM agent receives current frame + HP + tick number, returns a single action
+
+

Success criteria

+
  • A user can run a single command, supply a strategy prompt, and watch the agent play until win or death
  • Win and death paths both observed in manual playtests
  • Different strategy prompts produce visibly different agent behavior
  • End-to-end run completes in under 60 seconds wall time on a typical OpenAI API call
+
+

Out of scope

+
  • Multiple rooms, dungeon generation, procedural levels
  • Combat with enemies, NPCs, monsters
  • Inventory, items, equipment
  • Save/load, persistence
  • Visual UI beyond ASCII to terminal
  • Multiplayer, networking
  • Self-improving agent loops or RL training
+
+

Nice to have

+
  • Configurable room layout from a text file
  • Replay log written to disk for post-hoc inspection
  • A few preset strategy prompts to demo (cautious, greedy, exploratory)
+
+
+
+
+

Handed off

+
+ Target: filesystem + At: 2026-05-17T04:14:44.540Z + + +
+
+ +

Decisions

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
| ID | Title | Status | Selected | Depends on |
| 0001-choose-the-implementation-language | Choose the implementation language [architecture] | accepted | Python | |
| 0002-define-the-world-representation-and-renderer | Define the world representation and renderer [data-model] | accepted | Tile-grid + entity dict | |
| 0003-define-the-agent-action-contract | Define the agent action contract [architecture] | accepted | Tool-call (function calling) with one tool: do_action(direction) | |
| 0004-define-the-tick-loop-and-termination-conditions | Define the tick loop and termination conditions [architecture] | accepted | Synchronous loop with step cap | |
+ +

Task graph

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
| ID | Title | Status | Pri | Estimate | Depends on | Decision refs |
| T0001-bootstrap-repository | Bootstrap repository | ready | p0 | 1h | | 0001-choose-the-implementation-language |
| T0002-implement-world-module-tile-grid-entity-dict | Implement world module (tile grid + entity dict) | open | p0 | 2h | T0001-bootstrap-repository | 0002-define-the-world-representation-and-renderer |
| T0003-implement-frame-renderer | Implement frame renderer | open | p0 | 1h | T0002-implement-world-module-tile-grid-entity-dict | 0002-define-the-world-representation-and-renderer |
| T0004-implement-openai-agent-client | Implement OpenAI agent client | open | p0 | 2h | T0001-bootstrap-repository | 0003-define-the-agent-action-contract |
| T0005-implement-action-handlers-and-termination-checks | Implement action handlers and termination checks | open | p0 | 1h | T0002-implement-world-module-tile-grid-entity-dict | 0002-define-the-world-representation-and-renderer |
| T0006-implement-the-tick-based-game-loop | Implement the tick-based game loop | open | p0 | 2h | T0003-implement-frame-renderer T0004-implement-openai-agent-client T0005-implement-action-handlers-and-termination-checks | 0004-define-the-tick-loop-and-termination-conditions 0002-define-the-world-representation-and-renderer |
| T0007-implement-cli-entry-script | Implement CLI entry script | open | p0 | 1h | T0006-implement-the-tick-based-game-loop | 0001-choose-the-implementation-language 0004-define-the-tick-loop-and-termination-conditions |
+ +
+ Generated by decision-record · + Last render: 2026-05-17T04:14:44.544Z +
+ +
+ + \ No newline at end of file diff --git a/benchmarks/roguelike-ai-poc/reference/project.json b/benchmarks/roguelike-ai-poc/reference/project.json new file mode 100644 index 0000000..3b4c9fb --- /dev/null +++ b/benchmarks/roguelike-ai-poc/reference/project.json @@ -0,0 +1,64 @@ +{ + "id": "ai-driven-roguelike-poc", + "title": "AI-driven roguelike POC", + "description": "A minimal roguelike where the player primes an AI agent with a strategy, then the agent autonomously navigates a single ASCII-rendered room over a tick system until it wins the objective or dies. Goal: prove the agent-as-player concept with the smallest viable surface area.", + "created_at": "2026-05-17T04:12:02.030Z", + "updated_at": "2026-05-17T04:14:44.540Z", + "effort_level": "poc", + "status": "handed-off", + "scope": { + "in_scope": [ + "A 10×10 ASCII-rendered single room with walls (#), floor (.), player (@), exit (>), and a hazard tile (X)", + "Tick-based game loop: each tick prints the frame, then queries the agent for one action", + "A small action vocabulary: move N/S/E/W and noop", + "Player has HP; stepping on hazard removes HP; reaching exit = win, HP=0 = death", + "Strategy prompt provided once at startup, fed to the agent as system prompt for every tick", + "LLM agent receives current frame + HP + tick number, returns a single action" + ], + "out_of_scope": [ + "Multiple rooms, dungeon generation, procedural levels", + "Combat with enemies, NPCs, monsters", + "Inventory, items, equipment", + "Save/load, persistence", + "Visual UI beyond ASCII to terminal", + "Multiplayer, networking", + "Self-improving agent loops or RL training" + ], + "success_criteria": [ + "A user can run a single command, supply a strategy prompt, and watch the agent play until win or death", + "Win and death paths both observed in manual playtests", + "Different strategy prompts produce visibly different agent behavior", + "End-to-end run completes in under 60 seconds wall time on a typical OpenAI API call" + 
], + "nice_to_have": [ + "Configurable room layout from a text file", + "Replay log written to disk for post-hoc inspection", + "A few preset strategy prompts to demo (cautious, greedy, exploratory)" + ] + }, + "sign_offs": [ + { + "phase": "handing-off", + "by": "human", + "actor": "kj", + "at": "2026-05-17T04:14:44.523Z", + "notes": "All decisions accepted, graph validates clean." + }, + { + "phase": "handing-off", + "by": "human", + "actor": "kj", + "at": "2026-05-17T04:14:44.540Z" + } + ], + "handoff": { + "target": "filesystem", + "exported_at": "2026-05-17T04:14:44.540Z", + "issue_count": 7, + "document_count": 4 + }, + "gate_config": { + "preset": "poc" + }, + "tags": [] +} diff --git a/benchmarks/roguelike-ai-poc/reference/project.md b/benchmarks/roguelike-ai-poc/reference/project.md new file mode 100644 index 0000000..538b476 --- /dev/null +++ b/benchmarks/roguelike-ai-poc/reference/project.md @@ -0,0 +1,64 @@ +# AI-driven roguelike POC + +| Field | Value | +| --- | --- | +| ID | `ai-driven-roguelike-poc` | +| Status | `handed-off` | +| Effort level | `poc` | +| Created | 2026-05-17T04:12:02.030Z | +| Updated | 2026-05-17T04:14:44.540Z | +| Decisions | 4 | +| Tasks | 7 | + +## Description + +A minimal roguelike where the player primes an AI agent with a strategy, then the agent autonomously navigates a single ASCII-rendered room over a tick system until it wins the objective or dies. Goal: prove the agent-as-player concept with the smallest viable surface area. 
+ +## Scope + +**In scope** + +- A 10×10 ASCII-rendered single room with walls (#), floor (.), player (@), exit (>), and a hazard tile (X) +- Tick-based game loop: each tick prints the frame, then queries the agent for one action +- A small action vocabulary: move N/S/E/W and noop +- Player has HP; stepping on hazard removes HP; reaching exit = win, HP=0 = death +- Strategy prompt provided once at startup, fed to the agent as system prompt for every tick +- LLM agent receives current frame + HP + tick number, returns a single action + +**Success criteria** + +- A user can run a single command, supply a strategy prompt, and watch the agent play until win or death +- Win and death paths both observed in manual playtests +- Different strategy prompts produce visibly different agent behavior +- End-to-end run completes in under 60 seconds wall time on a typical OpenAI API call + +**Out of scope** + +- Multiple rooms, dungeon generation, procedural levels +- Combat with enemies, NPCs, monsters +- Inventory, items, equipment +- Save/load, persistence +- Visual UI beyond ASCII to terminal +- Multiplayer, networking +- Self-improving agent loops or RL training + +**Nice to have** + +- Configurable room layout from a text file +- Replay log written to disk for post-hoc inspection +- A few preset strategy prompts to demo (cautious, greedy, exploratory) + +## Sign-offs + +- **handing-off** by kj (human) at 2026-05-17T04:14:44.523Z — All decisions accepted, graph validates clean. 
+ +- **handing-off** by kj (human) at 2026-05-17T04:14:44.540Z + +## Handoff + +| Field | Value | +| --- | --- | +| Target | `filesystem` | +| Exported at | 2026-05-17T04:14:44.540Z | +| Target ID | — | +| Target URL | — | diff --git a/benchmarks/roguelike-ai-poc/reference/tasks/T0001-bootstrap-repository.json b/benchmarks/roguelike-ai-poc/reference/tasks/T0001-bootstrap-repository.json new file mode 100644 index 0000000..c433a10 --- /dev/null +++ b/benchmarks/roguelike-ai-poc/reference/tasks/T0001-bootstrap-repository.json @@ -0,0 +1,30 @@ +{ + "id": "T0001-bootstrap-repository", + "number": 1, + "slug": "bootstrap-repository", + "title": "Bootstrap repository", + "description": "Initialize the Python project layout: pyproject.toml or requirements.txt with openai pin, a src/ module path, a README stub, and a .gitignore. Verify a `python -c \"import openai\"` succeeds in a fresh venv.", + "status": "ready", + "estimate": { + "unit": "hours", + "value": 1, + "confidence": "high" + }, + "acceptance_criteria": [ + "pyproject.toml or requirements.txt committed", + "openai SDK installable in a venv", + "README explains 30-second quickstart", + "python -c \"from src import __init__\" runs" + ], + "depends_on": [], + "decision_refs": [ + "0001-choose-the-implementation-language" + ], + "priority": "p0", + "labels": [ + "foundation" + ], + "assignee_hint": "agent", + "created_at": "2026-05-17T04:14:22.524Z", + "updated_at": "2026-05-17T04:14:22.524Z" +} diff --git a/benchmarks/roguelike-ai-poc/reference/tasks/T0001-bootstrap-repository.md b/benchmarks/roguelike-ai-poc/reference/tasks/T0001-bootstrap-repository.md new file mode 100644 index 0000000..09effaa --- /dev/null +++ b/benchmarks/roguelike-ai-poc/reference/tasks/T0001-bootstrap-repository.md @@ -0,0 +1,23 @@ +# T0001-bootstrap-repository — Bootstrap repository + +| Field | Value | +| --- | --- | +| Status | `ready` | +| Priority | `p0` | +| Estimate | 1 hours (high confidence) | +| Depends on | _(none)_ | +| 
Decision refs | `0001-choose-the-implementation-language` — Choose the implementation language | +| Assignee hint | agent | +| Labels | `foundation` | +| Updated | 2026-05-17T04:14:22.524Z | + +## Description + +Initialize the Python project layout: pyproject.toml or requirements.txt with openai pin, a src/ module path, a README stub, and a .gitignore. Verify a `python -c "import openai"` succeeds in a fresh venv. + +## Acceptance criteria + +- [ ] pyproject.toml or requirements.txt committed +- [ ] openai SDK installable in a venv +- [ ] README explains 30-second quickstart +- [ ] python -c "from src import __init__" runs diff --git a/benchmarks/roguelike-ai-poc/reference/tasks/T0002-implement-world-module-tile-grid-entity-dict.json b/benchmarks/roguelike-ai-poc/reference/tasks/T0002-implement-world-module-tile-grid-entity-dict.json new file mode 100644 index 0000000..c7a6c75 --- /dev/null +++ b/benchmarks/roguelike-ai-poc/reference/tasks/T0002-implement-world-module-tile-grid-entity-dict.json @@ -0,0 +1,32 @@ +{ + "id": "T0002-implement-world-module-tile-grid-entity-dict", + "number": 2, + "slug": "implement-world-module-tile-grid-entity-dict", + "title": "Implement world module (tile grid + entity dict)", + "description": "Build src/world.py: World dataclass with static_tiles: list[list[str]] and entities: dict[str, dict]. Provide constructors for a default 10×10 room (walls border, one hazard, one exit). 
Pure data and helpers; no rendering, no game logic.", + "status": "open", + "estimate": { + "unit": "hours", + "value": 2, + "confidence": "med" + }, + "acceptance_criteria": [ + "World.default_room() returns a valid 10x10 with #, ., X, > tiles", + "entities dict contains a player at a known spawn", + "is_walkable(x,y) returns False for walls, True for floor and hazard", + "unit test: default room is fully walkable from spawn to exit" + ], + "depends_on": [ + "T0001-bootstrap-repository" + ], + "decision_refs": [ + "0002-define-the-world-representation-and-renderer" + ], + "priority": "p0", + "labels": [ + "core" + ], + "assignee_hint": "agent", + "created_at": "2026-05-17T04:14:22.526Z", + "updated_at": "2026-05-17T04:14:22.526Z" +} diff --git a/benchmarks/roguelike-ai-poc/reference/tasks/T0002-implement-world-module-tile-grid-entity-dict.md b/benchmarks/roguelike-ai-poc/reference/tasks/T0002-implement-world-module-tile-grid-entity-dict.md new file mode 100644 index 0000000..ff06ca3 --- /dev/null +++ b/benchmarks/roguelike-ai-poc/reference/tasks/T0002-implement-world-module-tile-grid-entity-dict.md @@ -0,0 +1,23 @@ +# T0002-implement-world-module-tile-grid-entity-dict — Implement world module (tile grid + entity dict) + +| Field | Value | +| --- | --- | +| Status | `open` | +| Priority | `p0` | +| Estimate | 2 hours (med confidence) | +| Depends on | `T0001-bootstrap-repository` | +| Decision refs | `0002-define-the-world-representation-and-renderer` — Define the world representation and renderer | +| Assignee hint | agent | +| Labels | `core` | +| Updated | 2026-05-17T04:14:22.526Z | + +## Description + +Build src/world.py: World dataclass with static_tiles: list[list[str]] and entities: dict[str, dict]. Provide constructors for a default 10×10 room (walls border, one hazard, one exit). Pure data and helpers; no rendering, no game logic. 
+ +## Acceptance criteria + +- [ ] World.default_room() returns a valid 10x10 with #, ., X, > tiles +- [ ] entities dict contains a player at a known spawn +- [ ] is_walkable(x,y) returns False for walls, True for floor and hazard +- [ ] unit test: default room is fully walkable from spawn to exit diff --git a/benchmarks/roguelike-ai-poc/reference/tasks/T0003-implement-frame-renderer.json b/benchmarks/roguelike-ai-poc/reference/tasks/T0003-implement-frame-renderer.json new file mode 100644 index 0000000..0caf6b1 --- /dev/null +++ b/benchmarks/roguelike-ai-poc/reference/tasks/T0003-implement-frame-renderer.json @@ -0,0 +1,32 @@ +{ + "id": "T0003-implement-frame-renderer", + "number": 3, + "slug": "implement-frame-renderer", + "title": "Implement frame renderer", + "description": "Build src/render.py: render_frame(world) -> list[str]. Compose static_tiles + entity glyphs (entity overrides tile). Provide a small HUD line below the frame showing tick number, HP, and last action. Return as list of strings so the game loop can join + print or send to LLM.", + "status": "open", + "estimate": { + "unit": "hours", + "value": 1, + "confidence": "high" + }, + "acceptance_criteria": [ + "render_frame returns 10 strings of length 10", + "player @ is visible at its current position", + "HUD line includes tick, hp, last_action", + "manual visual check: frame looks like a roguelike room" + ], + "depends_on": [ + "T0002-implement-world-module-tile-grid-entity-dict" + ], + "decision_refs": [ + "0002-define-the-world-representation-and-renderer" + ], + "priority": "p0", + "labels": [ + "core" + ], + "assignee_hint": "agent", + "created_at": "2026-05-17T04:14:22.527Z", + "updated_at": "2026-05-17T04:14:22.527Z" +} diff --git a/benchmarks/roguelike-ai-poc/reference/tasks/T0003-implement-frame-renderer.md b/benchmarks/roguelike-ai-poc/reference/tasks/T0003-implement-frame-renderer.md new file mode 100644 index 0000000..8bfc535 --- /dev/null +++ 
b/benchmarks/roguelike-ai-poc/reference/tasks/T0003-implement-frame-renderer.md @@ -0,0 +1,23 @@ +# T0003-implement-frame-renderer — Implement frame renderer + +| Field | Value | +| --- | --- | +| Status | `open` | +| Priority | `p0` | +| Estimate | 1 hours (high confidence) | +| Depends on | `T0002-implement-world-module-tile-grid-entity-dict` | +| Decision refs | `0002-define-the-world-representation-and-renderer` — Define the world representation and renderer | +| Assignee hint | agent | +| Labels | `core` | +| Updated | 2026-05-17T04:14:22.527Z | + +## Description + +Build src/render.py: render_frame(world) -> list[str]. Compose static_tiles + entity glyphs (entity overrides tile). Provide a small HUD line below the frame showing tick number, HP, and last action. Return as list of strings so the game loop can join + print or send to LLM. + +## Acceptance criteria + +- [ ] render_frame returns 10 strings of length 10 +- [ ] player @ is visible at its current position +- [ ] HUD line includes tick, hp, last_action +- [ ] manual visual check: frame looks like a roguelike room diff --git a/benchmarks/roguelike-ai-poc/reference/tasks/T0004-implement-openai-agent-client.json b/benchmarks/roguelike-ai-poc/reference/tasks/T0004-implement-openai-agent-client.json new file mode 100644 index 0000000..cdc8821 --- /dev/null +++ b/benchmarks/roguelike-ai-poc/reference/tasks/T0004-implement-openai-agent-client.json @@ -0,0 +1,34 @@ +{ + "id": "T0004-implement-openai-agent-client", + "number": 4, + "slug": "implement-openai-agent-client", + "title": "Implement OpenAI agent client", + "description": "Build src/agent.py: AgentClient class with constructor(strategy_prompt, model, api_key). Single method choose_action(world_state_json, tick, hp) → (direction, reasoning). Uses tool-calling with one tool do_action(direction in {N,S,E,W,noop}); tool_choice=\"required\". 
Returns the chosen direction and the assistant message content as reasoning.", + "status": "open", + "estimate": { + "unit": "hours", + "value": 2, + "confidence": "med" + }, + "acceptance_criteria": [ + "AgentClient instantiates without making a call", + "choose_action returns a valid direction enum", + "reasoning is captured as a string (may be empty)", + "malformed responses raise a clear error (does not silently noop)", + "strategy_prompt is in the system role on every call" + ], + "depends_on": [ + "T0001-bootstrap-repository" + ], + "decision_refs": [ + "0003-define-the-agent-action-contract" + ], + "priority": "p0", + "labels": [ + "llm", + "core" + ], + "assignee_hint": "agent", + "created_at": "2026-05-17T04:14:22.528Z", + "updated_at": "2026-05-17T04:14:22.528Z" +} diff --git a/benchmarks/roguelike-ai-poc/reference/tasks/T0004-implement-openai-agent-client.md b/benchmarks/roguelike-ai-poc/reference/tasks/T0004-implement-openai-agent-client.md new file mode 100644 index 0000000..0244119 --- /dev/null +++ b/benchmarks/roguelike-ai-poc/reference/tasks/T0004-implement-openai-agent-client.md @@ -0,0 +1,24 @@ +# T0004-implement-openai-agent-client — Implement OpenAI agent client + +| Field | Value | +| --- | --- | +| Status | `open` | +| Priority | `p0` | +| Estimate | 2 hours (med confidence) | +| Depends on | `T0001-bootstrap-repository` | +| Decision refs | `0003-define-the-agent-action-contract` — Define the agent action contract | +| Assignee hint | agent | +| Labels | `llm`, `core` | +| Updated | 2026-05-17T04:14:22.528Z | + +## Description + +Build src/agent.py: AgentClient class with constructor(strategy_prompt, model, api_key). Single method choose_action(world_state_json, tick, hp) → (direction, reasoning). Uses tool-calling with one tool do_action(direction in {N,S,E,W,noop}); tool_choice="required". Returns the chosen direction and the assistant message content as reasoning. 
+ +## Acceptance criteria + +- [ ] AgentClient instantiates without making a call +- [ ] choose_action returns a valid direction enum +- [ ] reasoning is captured as a string (may be empty) +- [ ] malformed responses raise a clear error (does not silently noop) +- [ ] strategy_prompt is in the system role on every call diff --git a/benchmarks/roguelike-ai-poc/reference/tasks/T0005-implement-action-handlers-and-termination-checks.json b/benchmarks/roguelike-ai-poc/reference/tasks/T0005-implement-action-handlers-and-termination-checks.json new file mode 100644 index 0000000..20ad30f --- /dev/null +++ b/benchmarks/roguelike-ai-poc/reference/tasks/T0005-implement-action-handlers-and-termination-checks.json @@ -0,0 +1,33 @@ +{ + "id": "T0005-implement-action-handlers-and-termination-checks", + "number": 5, + "slug": "implement-action-handlers-and-termination-checks", + "title": "Implement action handlers and termination checks", + "description": "Build src/actions.py: apply_action(world, direction) -> ActionResult. Moves the player one cell if walkable; otherwise noop. Compute side effects: HP-1 when stepping onto hazard, win flag when player_pos == exit_pos, dead flag when HP <= 0. 
Return ActionResult dataclass with new_world, hp_delta, terminal, terminal_reason.", + "status": "open", + "estimate": { + "unit": "hours", + "value": 1, + "confidence": "high" + }, + "acceptance_criteria": [ + "Moving into a wall is a noop with no HP change", + "Moving onto hazard triggers hp_delta = -1", + "Moving onto exit triggers terminal=\"win\"", + "HP reaching 0 triggers terminal=\"death\"", + "Unit tests for each transition" + ], + "depends_on": [ + "T0002-implement-world-module-tile-grid-entity-dict" + ], + "decision_refs": [ + "0002-define-the-world-representation-and-renderer" + ], + "priority": "p0", + "labels": [ + "core" + ], + "assignee_hint": "agent", + "created_at": "2026-05-17T04:14:22.529Z", + "updated_at": "2026-05-17T04:14:22.529Z" +} diff --git a/benchmarks/roguelike-ai-poc/reference/tasks/T0005-implement-action-handlers-and-termination-checks.md b/benchmarks/roguelike-ai-poc/reference/tasks/T0005-implement-action-handlers-and-termination-checks.md new file mode 100644 index 0000000..5ad2496 --- /dev/null +++ b/benchmarks/roguelike-ai-poc/reference/tasks/T0005-implement-action-handlers-and-termination-checks.md @@ -0,0 +1,24 @@ +# T0005-implement-action-handlers-and-termination-checks — Implement action handlers and termination checks + +| Field | Value | +| --- | --- | +| Status | `open` | +| Priority | `p0` | +| Estimate | 1 hours (high confidence) | +| Depends on | `T0002-implement-world-module-tile-grid-entity-dict` | +| Decision refs | `0002-define-the-world-representation-and-renderer` — Define the world representation and renderer | +| Assignee hint | agent | +| Labels | `core` | +| Updated | 2026-05-17T04:14:22.529Z | + +## Description + +Build src/actions.py: apply_action(world, direction) -> ActionResult. Moves the player one cell if walkable; otherwise noop. Compute side effects: HP-1 when stepping onto hazard, win flag when player_pos == exit_pos, dead flag when HP <= 0. 
Return ActionResult dataclass with new_world, hp_delta, terminal, terminal_reason. + +## Acceptance criteria + +- [ ] Moving into a wall is a noop with no HP change +- [ ] Moving onto hazard triggers hp_delta = -1 +- [ ] Moving onto exit triggers terminal="win" +- [ ] HP reaching 0 triggers terminal="death" +- [ ] Unit tests for each transition diff --git a/benchmarks/roguelike-ai-poc/reference/tasks/T0006-implement-the-tick-based-game-loop.json b/benchmarks/roguelike-ai-poc/reference/tasks/T0006-implement-the-tick-based-game-loop.json new file mode 100644 index 0000000..129cd6b --- /dev/null +++ b/benchmarks/roguelike-ai-poc/reference/tasks/T0006-implement-the-tick-based-game-loop.json @@ -0,0 +1,35 @@ +{ + "id": "T0006-implement-the-tick-based-game-loop", + "number": 6, + "slug": "implement-the-tick-based-game-loop", + "title": "Implement the tick-based game loop", + "description": "Build src/loop.py: run_game(world, agent_client, max_steps=50). Each iteration: render frame, call agent_client.choose_action, apply action, check terminal, sleep 0.05s, repeat. Logs each tick: tick#, frame, action, reasoning excerpt, hp. 
Exits on terminal or step cap; returns final state + reason.", + "status": "open", + "estimate": { + "unit": "hours", + "value": 2, + "confidence": "med" + }, + "acceptance_criteria": [ + "Loop terminates on win, death, or step cap (≤50)", + "Each tick prints the frame and HUD to stdout", + "Final summary line shows reason and step count", + "No exceptions leak from agent timeouts/errors (logged and treated as noop)" + ], + "depends_on": [ + "T0003-implement-frame-renderer", + "T0004-implement-openai-agent-client", + "T0005-implement-action-handlers-and-termination-checks" + ], + "decision_refs": [ + "0004-define-the-tick-loop-and-termination-conditions", + "0002-define-the-world-representation-and-renderer" + ], + "priority": "p0", + "labels": [ + "core" + ], + "assignee_hint": "agent", + "created_at": "2026-05-17T04:14:22.530Z", + "updated_at": "2026-05-17T04:14:22.530Z" +} diff --git a/benchmarks/roguelike-ai-poc/reference/tasks/T0006-implement-the-tick-based-game-loop.md b/benchmarks/roguelike-ai-poc/reference/tasks/T0006-implement-the-tick-based-game-loop.md new file mode 100644 index 0000000..3338646 --- /dev/null +++ b/benchmarks/roguelike-ai-poc/reference/tasks/T0006-implement-the-tick-based-game-loop.md @@ -0,0 +1,23 @@ +# T0006-implement-the-tick-based-game-loop — Implement the tick-based game loop + +| Field | Value | +| --- | --- | +| Status | `open` | +| Priority | `p0` | +| Estimate | 2 hours (med confidence) | +| Depends on | `T0003-implement-frame-renderer`, `T0004-implement-openai-agent-client`, `T0005-implement-action-handlers-and-termination-checks` | +| Decision refs | `0004-define-the-tick-loop-and-termination-conditions` — Define the tick loop and termination conditions; `0002-define-the-world-representation-and-renderer` — Define the world representation and renderer | +| Assignee hint | agent | +| Labels | `core` | +| Updated | 2026-05-17T04:14:22.530Z | + +## Description + +Build src/loop.py: run_game(world, agent_client, max_steps=50). 
Each iteration: render frame, call agent_client.choose_action, apply action, check terminal, sleep 0.05s, repeat. Logs each tick: tick#, frame, action, reasoning excerpt, hp. Exits on terminal or step cap; returns final state + reason. + +## Acceptance criteria + +- [ ] Loop terminates on win, death, or step cap (≤50) +- [ ] Each tick prints the frame and HUD to stdout +- [ ] Final summary line shows reason and step count +- [ ] No exceptions leak from agent timeouts/errors (logged and treated as noop) diff --git a/benchmarks/roguelike-ai-poc/reference/tasks/T0007-implement-cli-entry-script.json b/benchmarks/roguelike-ai-poc/reference/tasks/T0007-implement-cli-entry-script.json new file mode 100644 index 0000000..030f430 --- /dev/null +++ b/benchmarks/roguelike-ai-poc/reference/tasks/T0007-implement-cli-entry-script.json @@ -0,0 +1,33 @@ +{ + "id": "T0007-implement-cli-entry-script", + "number": 7, + "slug": "implement-cli-entry-script", + "title": "Implement CLI entry script", + "description": "Build src/__main__.py: argparse for --strategy (or read from stdin), --model (default gpt-4o), --max-steps (default 50). Construct AgentClient, build default room, call run_game. Print the final outcome. 
Document the env vars (OPENAI_API_KEY) and a sample invocation in README.", + "status": "open", + "estimate": { + "unit": "hours", + "value": 1, + "confidence": "high" + }, + "acceptance_criteria": [ + "python -m src --strategy \"cautious explorer\" runs end-to-end", + "README has a complete example invocation", + "--help prints usage", + "Exit code 0 on win/timeout, 1 on death (so scripts can chain)" + ], + "depends_on": [ + "T0006-implement-the-tick-based-game-loop" + ], + "decision_refs": [ + "0001-choose-the-implementation-language", + "0004-define-the-tick-loop-and-termination-conditions" + ], + "priority": "p0", + "labels": [ + "cli" + ], + "assignee_hint": "agent", + "created_at": "2026-05-17T04:14:22.532Z", + "updated_at": "2026-05-17T04:14:22.532Z" +} diff --git a/benchmarks/roguelike-ai-poc/reference/tasks/T0007-implement-cli-entry-script.md b/benchmarks/roguelike-ai-poc/reference/tasks/T0007-implement-cli-entry-script.md new file mode 100644 index 0000000..ba9f268 --- /dev/null +++ b/benchmarks/roguelike-ai-poc/reference/tasks/T0007-implement-cli-entry-script.md @@ -0,0 +1,23 @@ +# T0007-implement-cli-entry-script — Implement CLI entry script + +| Field | Value | +| --- | --- | +| Status | `open` | +| Priority | `p0` | +| Estimate | 1 hours (high confidence) | +| Depends on | `T0006-implement-the-tick-based-game-loop` | +| Decision refs | `0001-choose-the-implementation-language` — Choose the implementation language; `0004-define-the-tick-loop-and-termination-conditions` — Define the tick loop and termination conditions | +| Assignee hint | agent | +| Labels | `cli` | +| Updated | 2026-05-17T04:14:22.532Z | + +## Description + +Build src/__main__.py: argparse for --strategy (or read from stdin), --model (default gpt-4o), --max-steps (default 50). Construct AgentClient, build default room, call run_game. Print the final outcome. Document the env vars (OPENAI_API_KEY) and a sample invocation in README. 
+ +## Acceptance criteria + +- [ ] python -m src --strategy "cautious explorer" runs end-to-end +- [ ] README has a complete example invocation +- [ ] --help prints usage +- [ ] Exit code 0 on win/timeout, 1 on death (so scripts can chain) diff --git a/benchmarks/roguelike-ai-poc/run.sh b/benchmarks/roguelike-ai-poc/run.sh new file mode 100755 index 0000000..67915d1 --- /dev/null +++ b/benchmarks/roguelike-ai-poc/run.sh @@ -0,0 +1,35 @@ +#!/usr/bin/env bash +# Run the roguelike-ai-poc benchmark prompt against a fresh tmp dir. +# Requires OPENAI_API_KEY in the environment. +# Usage: +# ./run.sh # run with defaults +# OUT=./my-output ./run.sh # specify output dir +# MODEL=gpt-4o-mini ./run.sh # override model + +set -euo pipefail + +if [[ -z "${OPENAI_API_KEY:-}" ]]; then + echo "OPENAI_API_KEY not set — refusing to run." >&2 + exit 2 +fi + +HERE="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +REPO_ROOT="$(cd "$HERE/../.." && pwd)" +OUT="${OUT:-$(mktemp -d -t dr-bench-roguelike-XXXX)}" + +DESCRIPTION="A minimal roguelike where the player primes an AI agent with a strategy, then the agent autonomously navigates a single ASCII-rendered room over a tick system until it wins the objective or dies. Goal: prove the agent-as-player concept with the smallest viable surface area." + +cd "$REPO_ROOT/server" +[[ -f dist/cli.js ]] || npm run build >&2 + +node dist/cli.js \ + --title "AI-driven roguelike POC" \ + --description "$DESCRIPTION" \ + --effort poc \ + --cwd "$OUT" \ + --yes \ + ${MODEL:+--model "$MODEL"} + +echo "" +echo "── Benchmark artifacts at: $OUT" +echo "Compare with: $HERE/reference/" diff --git a/docs/README.md b/docs/README.md new file mode 100644 index 0000000..2063fb4 --- /dev/null +++ b/docs/README.md @@ -0,0 +1,50 @@ +# Documentation + +The decision-record docs follow the [Diátaxis](https://diataxis.fr) framework — four kinds of documentation, each serving a different need. 
+ +| You want to… | Read | +|---|---| +| **Learn** by following a guided first run | [Tutorials](tutorials/) | +| **Accomplish** a specific task | [How-to guides](how-to/) | +| **Look up** facts about a flag, tool, or schema | [Reference](reference/) | +| **Understand** the design — why things are the way they are | [Explanation](explanation/) | + +## Start here + +**Brand new?** → [Your first plan](tutorials/your-first-plan.md) (15 minutes, end-to-end). + +**Already installed and want to do a thing?** → [How-to guides](how-to/). + +**Need the exact spec?** → [Reference](reference/). + +**Want the rationale?** → [Explanation](explanation/) — especially [why decision records](explanation/why-decision-records.md) and [design rationale](explanation/design-rationale.md). + +## Index + +### Tutorials +- [Your first plan](tutorials/your-first-plan.md) — run the roguelike benchmark prompt end-to-end + +### How-to guides +- [Install the plugin or CLI](how-to/install.md) +- [Run the CLI](how-to/run-the-cli.md) — idea, PRD, resume +- [Configure LLM providers](how-to/configure-providers.md) — OpenAI, OpenRouter, Ollama, vLLM, LiteLLM +- [Hand off to Linear](how-to/handoff-to-linear.md) +- [Calibrate gates](how-to/calibrate-gates.md) — `poc` / `mvp` / `full` + overrides + +### Reference +- [CLI](reference/cli.md) — every flag, env var, exit code +- [MCP tools](reference/mcp-tools.md) — full tool surface +- [Data model](reference/data-model.md) — entities, fields, types +- [Gates](reference/gates.md) — per-phase gate matrix + +### Explanation +- [Why decision records?](explanation/why-decision-records.md) — Joel Parker Henderson's canonical material +- [Design rationale](explanation/design-rationale.md) — why filesystem, why hard gates, why lens-rotating skeptic +- [The five phases](explanation/the-five-phases.md) — what each phase does and why this shape + +## Outside the docs tree + +- [Repo README](../README.md) — overview, status, install summary +-
[CONTRIBUTING](../CONTRIBUTING.md) — how to contribute seeds, templates, and code +- [Benchmarks](../benchmarks/) — canonical prompts we use to spot regressions +- [Schemas](../schemas/) — JSON Schema source of truth for every entity diff --git a/docs/architecture.md b/docs/architecture.md deleted file mode 100644 index 60ed315..0000000 --- a/docs/architecture.md +++ /dev/null @@ -1,197 +0,0 @@ -# Architecture - -The decision-record plugin is two pieces: - -1. **An MCP server** (`server/`) — TypeScript, speaks the Model Context Protocol over stdio. Stateless aside from in-flight handling; durable state lives on disk in the target repo. -2. **A Claude Code plugin** (`.claude-plugin/`, `commands/`, `agents/`) — declares the slash command and sub-agents that drive the pipeline through MCP tool calls. - -This document covers the data model, the gate machine, and the rationale for each design choice. - -## Data model - -JSON Schema source of truth lives in [`schemas/`](../schemas/). Zod mirrors live in [`server/src/schemas/index.ts`](../server/src/schemas/index.ts). - -### Entity overview - -| Entity | Cardinality | File | Source of truth | -| --- | --- | --- | --- | -| Project | 1 per repo | `dr/project.json` | This file | -| PipelineState | 1 per repo | `.dr/state.json` | This file | -| Event | many, append-only | `.dr/events.jsonl` | This file (one entry per line) | -| Decision | 0..N | `dr/decisions/.json` | This file | -| Task | 0..N | `dr/tasks/.json` | This file | - -Markdown renderings (`*.md`, `index.html`) are **derived** — regenerated by `dr_render` from the JSON. Never edit them directly; they'll be overwritten. - -### Project - -The MVP manifest. Captures intent, scope, status, effort calibration, and post-handoff metadata. 
- -Key fields: -- `id` — stable kebab-case slug -- `status` — current phase (intake/scoping/deciding/decomposing/handing-off/handed-off) -- `effort_level` — `poc | mvp | full`; calibrates gate strictness -- `scope` — `{ in_scope, out_of_scope, success_criteria, nice_to_have }` -- `sign_offs` — array of phase-level sign-offs (`{ phase, by, actor, at, notes }`) -- `handoff` — populated at `handing-off → handed-off` with target + identifiers -- `gate_config` — `{ preset, overrides }`; overrides take precedence per-knob - -### Decision (DR) - -A single significant choice. Mirrors Joel Parker Henderson's canonical template structure (issue, assumptions, constraints, positions, opinions, argument, implications, related) with a few additions for the pipeline: - -- `template_variant` — `canonical | lightweight | scoping | vendor | architecture | data-model`. Affects rendering and (eventually) which sections are required. -- `status` — `rfc | proposed | accepted | rejected | deprecated | superseded`. Only `accepted` satisfies the deciding gate. -- `selected_position` — title of the winning position (must exist in `positions`). -- `review[]` — antagonistic-review passes (`reviewer`, `lens`, `verdict`, `score`, `concerns`). -- `sign_off` — final acceptance record (`by`, `actor`, `at`, `notes`). -- `depends_on[]` — IDs of decisions that must be accepted first. -- `seed_origin` — name of the seed template this DR was instantiated from, if any. - -### Task - -A beads-style work unit. Pre-handoff only — post-handoff lifecycle lives in Linear (or wherever else). 
- -- `status` — `open | ready | in_progress | done | blocked | deferred` -- `priority` — `p0..p3` -- `estimate` — `{ unit: 'hours'|'days', value, confidence }` -- `acceptance_criteria` — concrete done-when statements -- `depends_on[]` — task IDs that must complete first -- `decision_refs[]` — DR IDs this task implements (traceability) -- `external_ref` — set at handoff to the target system's identifier - -### PipelineState - -Internal state. Never edited by hand. - -- `phase` — same as `Project.status` but read-only from the pipeline's perspective -- `effective_gate_config` — materialized gate (preset merged with overrides) for fast lookup -- `next_decision_seq` / `next_task_seq` — monotonically-increasing counters -- `pending_questions[]` — open questions the agent has surfaced -- `gate_failures[]` — history of failed `dr_advance` attempts (useful for the agent to remember what to fix) - -### Event - -One JSONL line per state change. The events log is append-only and is the audit trail. - -Event kinds include: `project_initialized`, `phase_advanced`, `phase_advance_blocked`, `scope_updated`, `decision_proposed`, `decision_updated`, `decision_reviewed`, `decision_accepted`, `decision_rejected`, `task_proposed`, `task_updated`, `task_status_changed`, `graph_validated`, `gate_check_passed`, `gate_check_failed`, `question_asked`, `question_answered`, `seed_loaded`, `render_run`, `export_started`, `export_completed`, `export_failed`, `sign_off_recorded`. - -A future UI can replay this stream to reconstruct any historical state. - -## Gate machine - -The pipeline is a state machine with hard gates. Phases: - -``` -intake → scoping → deciding → decomposing → handing-off → handed-off -``` - -`dr_advance` is the only way to transition. The server evaluates the gate for the *next* phase against the current state. If all gate checks pass and any required sign-off is provided, the phase changes and an event is emitted. Otherwise, gate-failure reasons come back unchanged. 
- -### Per-phase checks - -| Phase advancing to | Checks | -| --- | --- | -| `scoping` | Project has title and description | -| `deciding` | `scope.in_scope` non-empty; `scope.success_criteria` non-empty; if review_required_phases includes 'scoping', a scoping-variant DR has a passing review | -| `decomposing` | ≥ min_decisions; if `decisions_required_status === 'accepted'`, no decisions in `proposed`/`rfc`; if `review_required_per_decision`, every accepted decision has a passing review; if `review_required_phases` includes 'deciding', at least one decision has a passing review; no dangling decision dependencies | -| `handing-off` | ≥ min_tasks; no dangling task dependencies; no cycles; every task has an estimate ≤ max_task_estimate_hours; every task's `decision_refs` resolve | -| `handed-off` | `project.handoff` exists (run dr_export_filesystem or dr_export_linear first) | - -### Sign-off requirement - -Each phase transition can require human sign-off via `require_human_signoff_phases`. When set, `dr_advance` only proceeds if you pass `sign_off_by: 'human'`. The agent cannot self-approve a human-required gate. 
- -## Gate configuration - -Three preset tiers calibrate strictness: - -``` -poc: - decisions_required_status: accepted - review_required_phases: [] - review_required_per_decision: false - max_task_estimate_hours: 16 - require_human_signoff_phases: [handing-off] - min_decisions: 0 - min_tasks: 3 - -mvp: - decisions_required_status: accepted - review_required_phases: [scoping, decomposing] - review_required_per_decision: false - max_task_estimate_hours: 8 - require_human_signoff_phases: [scoping, decomposing, handing-off] - min_decisions: 3 - min_tasks: 8 - -full: - decisions_required_status: accepted - review_required_phases: [scoping, deciding, decomposing] - review_required_per_decision: true - max_task_estimate_hours: 4 - require_human_signoff_phases: [scoping, deciding, decomposing, handing-off] - min_decisions: 6 - min_tasks: 15 -``` - -`gate_overrides` on the project let you tune individual knobs without changing preset: - -```json -{ - "preset": "mvp", - "overrides": { - "min_tasks": 5, - "review_required_phases": ["scoping"] - } -} -``` - -The materialized result lives at `state.effective_gate_config` for fast lookup. - -## MCP tool surface - -| Group | Tools | -| --- | --- | -| Pipeline | `dr_init`, `dr_status`, `dr_advance`, `dr_update_project`, `dr_update_scope` | -| Decisions | `dr_propose_decision`, `dr_update_decision`, `dr_review_decision`, `dr_accept_decision`, `dr_reject_decision`, `dr_list_decisions`, `dr_get_decision`, `dr_ready_decisions` | -| Tasks | `dr_propose_task`, `dr_update_task`, `dr_set_task_status`, `dr_list_tasks`, `dr_get_task`, `dr_ready_tasks`, `dr_validate_graph` | -| Seeds | `dr_seed_search`, `dr_seed_list`, `dr_seed_get`, `dr_seed_load` | -| Render | `dr_render` | -| Handoff | `dr_export_filesystem`, `dr_export_linear` | - -All tools accept `cwd` (target repo) and default to `process.cwd()` when omitted. 
- -## Why this shape - -### Why filesystem instead of SQLite - -[beads_rust](https://github.com/Dicklesworthstone/beads_rust) uses SQLite + JSONL. We picked filesystem-only because the user prefers data-driven artifacts that are git-diffable and human-readable, and because the working set is small (tens of decisions, dozens of tasks). The JSONL event log gives us the audit trail without the SQLite dependency. - -### Why TypeScript - -Best fit for a Claude Code plugin. Easy to iterate on prompts and templates. Smaller install footprint than a Python/Rust toolchain. We can revisit if performance ever matters (it won't at this scale). - -### Why hard gates instead of soft suggestions - -Soft gates degrade. People learn to skip them. By making the wizard refuse to emit a "ship-ready plan" until criteria are met, the artifact becomes trustworthy: if it exists, it's complete. - -### Why per-project calibration - -Not every project deserves a SWOT analysis. The POC preset removes ceremony for hack-day work; the Full preset keeps it for regulated or production-grade work. The user picks at init time. - -### Why state-driven over form-driven - -A rigid form would force the wizard to ask the same questions in the same order regardless of project shape. State-driven means: the agent reads what's in the state, identifies what's missing for the gate, and picks the next question. This is the pattern Automaker's resume-check uses ([reference here](https://github.com/protoLabsAI/automaker)). - -### Why antagonistic review - -Decisions made fast without pushback ossify. The `dr-skeptic` agent forces a structured "what could go wrong here?" pass before accepting. Inspired by Automaker's two-reviewer pattern (Ava operational + Jon strategic). - -### Why Linear as the primary handoff target - -The user works in Linear. Linear's official MCP server is mature; Linear's data model (Project + Issue + Project-Update + Initiative + Milestone) maps cleanly to our manifest + tasks. 
Other target adapters (Plane, GitHub Projects, Jira) can be added by following the `handoff/linear.ts` pattern. - -## Versioning - -`PipelineState.schema_version` is the durable contract. We bump it on breaking layout changes. The server refuses to mutate older versions until migrated. There's no migration tooling yet — when we cross 1.0, we'll add it. diff --git a/docs/explanation/design-rationale.md b/docs/explanation/design-rationale.md new file mode 100644 index 0000000..7b759a7 --- /dev/null +++ b/docs/explanation/design-rationale.md @@ -0,0 +1,104 @@ +# Design rationale + +The decisions behind how this system is built. Use these when you want to understand "why this way and not the obvious other way." + +## Hard gates instead of soft suggestions + +Soft gates degrade. People learn to skip them, the optional becomes invisible, and within a few iterations the artifact stops being trustworthy. We made every phase transition refusal-by-default: if a gate fails, the wizard returns reasons, does not advance, and there is no `--force`. The artifact's value is the assurance that everything it claims is real. + +Consequence: when a gate is too strict, you change the gate, not bypass it. The `gate_config.overrides` mechanism is the official escape hatch — explicit and recorded. + +## Five phases, exactly + +Intake → Scoping → Deciding → Decomposing → Handoff is the smallest sequence that gives each artifact a clean home and makes ordering load-bearing: + +- **Intake** captures the seed. +- **Scoping** sets the perimeter before decisions are made (so decisions can be evaluated against scope). +- **Deciding** resolves significant choices before tasks are written (so tasks can reference decisions for traceability). +- **Decomposing** turns decisions into work (so the work shape follows from the choices). +- **Handoff** finalizes (so the artifact has a clear "done" state). + +We tried collapsing decisions and decomposition. 
The decomposer ended up making decisions in passing — implicit, unreviewed, untraceable. Splitting the phases forced decisions to be first-class. + +## File-system, not a database + +Beads_rust uses SQLite + JSONL. We went filesystem-only: + +- The working set is small (tens of decisions, dozens of tasks). +- JSON files diff well in git; engineers can read them without tooling. +- A future UI can read the same files; no schema migration tax. +- The JSONL event log gives us the audit trail without the DB dependency. + +The trade-off: queries are O(N) directory scans. Acceptable at our scale. If we ever need cross-project indexing or multi-user concurrency, we revisit. + +## TypeScript everywhere + +Single language across the MCP server, CLI, and tests. Best fit for the Claude Code plugin ecosystem. The `openai` SDK is mature in TypeScript. Iterating on prompts and templates is fast. We considered Rust to match beads_rust's philosophy — rejected because we iterate on prompts more than perf, and a 100KB CLI bundle is fine. + +## OpenAI-compatible, single provider + +We initially planned dual backends (Anthropic SDK + openai SDK). Cut to OpenAI-compat only because: + +- A single SDK is half the surface area to maintain. +- `OPENAI_BASE_URL` already covers Anthropic-via-OpenRouter, local Ollama/vLLM, LiteLLM proxies, and most enterprise gateways. +- The agents do straightforward tool calling; nothing requires a vendor-specific SDK feature. + +If we ever need Anthropic-native features (cache_control, adaptive thinking), we add a thin adapter — but we don't anticipate it. + +## Antagonistic review with lens rotation + +We use a `dr-skeptic` sub-agent that reviews decisions through one specific lens (operational, strategic, security, cost, user-impact) per invocation. For the `full` preset, every decision runs through all five lenses. 
+ +Inspired by Automaker's two-reviewer pattern (Ava operational + Jon strategic), but generalized: the lens menu is open-ended, and each lens is its own scoped prompt instead of a single reviewer trying to hold all perspectives at once. A focused agent finds more concrete concerns than a broad one. + +The skeptic doesn't have to win. A human can override `block` verdicts with explicit sign-off. But the lens output is recorded on the DR forever — visible to anyone who reads it later. + +## State-driven, not form-driven + +The wizard's job is to read the current state, identify what's missing for the next gate, and pick the next action. It is not a fixed Q&A sequence. This matches Automaker's resume-check pattern — drop in mid-pipeline, the wizard recovers gracefully. + +Practical consequence: every wizard invocation starts with `dr_status`. There's no implicit conversation state in the agent loop; everything is on disk. + +## Pre-MVP only, deliberately + +The pipeline stops at `handed-off`. We don't track post-handoff execution. That belongs in whatever execution system the team uses — Linear, Plane, GitHub Projects, etc. + +Why: planning tools that grow into execution tools accumulate scope until they're nothing in particular. By stopping at handoff, the boundary is clear: the plan is the artifact; execution is somebody else's tool. + +## Per-project gate calibration + +A weekend hack does not need the same gates as a regulated production rollout. Three presets (`poc`, `mvp`, `full`) calibrate strictness; per-knob overrides handle the edge cases. Picked at init. + +This was the user feedback that shaped the gate machine: the same hard-gate philosophy can apply to wildly different project shapes, as long as the strictness scales. + +## Seed library + +A small set (currently nine) of canned decisions for territory the agent will repeatedly see: language, runtime, data store, auth, deployment, CI/CD, testing, observability, scope-statement. 
Each is a starter — the agent loads it and customizes for the project. + +Why ship these: avoids the agent rediscovering the same trade-offs each project. The seed encodes prior pattern-matching as a starting point, not a final answer. The user can fork the seed library and add their team's defaults. + +## Linear as the primary handoff target + +The user's primary use case is Linear; the data model maps cleanly. We use Linear's GraphQL API directly with an API key, not their MCP server, because: + +- We need precise control over the project/issue/relation creation sequence. +- The GraphQL API is mature and well-documented. +- Adding MCP-server-as-downstream adds an extra dependency layer for a one-shot operation. + +Other handoff targets follow the `server/src/handoff/linear.ts` pattern: `buildExportPlan` (pure, testable) + per-target API calls. + +## What we explicitly didn't build + +- **A web UI** — the data model is UI-ready (JSON-everywhere, JSONL event log) but we ship Markdown + static HTML for now. UI work would dwarf the pipeline work. +- **Real-time multi-user collaboration** — single-user, single-machine. The artifact is git-tracked; that's how teams share. +- **A built-in LLM** — we depend on OpenAI-compat endpoints. No model bundling. +- **Reconciliation for partial Linear exports** — a known follow-up. For now, a failed export means deleting the partial Linear project and re-running. +- **A CI integration** — beyond the test suite. The plugin produces artifacts; what teams do with them in CI is up to the team. + +## Open questions + +- Does the lens-rotating skeptic produce meaningfully better decisions than a single skeptic? Needs benchmark data over time. +- Is the nine-seed library the right size? Probably grows. +- Should `handed-off` have a "re-open for amendment" path? Currently it's a terminal state. + +We track these by re-running benchmarks as the system changes. 
diff --git a/docs/explanation/the-five-phases.md b/docs/explanation/the-five-phases.md new file mode 100644 index 0000000..b1352a0 --- /dev/null +++ b/docs/explanation/the-five-phases.md @@ -0,0 +1,133 @@ +# The five phases + +The pipeline has exactly five phases between an idea and a ship-ready plan. Each phase has a single job; each transition is gated. + +``` +intake → scoping → deciding → decomposing → handing-off → handed-off +``` + +This page explains what each phase accomplishes and why it exists. + +## Intake + +**Job:** Capture the idea. + +**Inputs:** a one-line idea, an optional PRD, an effort-level choice. + +**Outputs:** a `Project` object with title, description, effort_level, and an empty everything-else. + +**Gate to next phase:** title and description non-empty. + +**Why it exists:** to write the seed down. Until the idea has an `id` on disk, the wizard has nothing to read on subsequent turns. Intake is mechanical and fast. + +## Scoping + +**Job:** Pin the MVP perimeter. + +**Inputs:** the project description, optionally a PRD, optionally a `scope-statement` seed. + +**Outputs:** + +- `project.scope.in_scope` — capabilities the MVP MUST ship +- `project.scope.out_of_scope` — explicit non-goals (this is the load-bearing list) +- `project.scope.success_criteria` — measurable signals +- `project.scope.nice_to_have` — optional capabilities +- Under `mvp`/`full` presets: a `scope-statement` DR with a selected shape (lean / walking-skeleton / polished) and an argument + +**Gate to next phase:** `in_scope` and `success_criteria` non-empty. Under `mvp`/`full`, the scope DR has a passing review. + +**Why it exists:** without explicit scope, decisions and tasks expand silently. Pinning scope first means every decision evaluated against it has a clear target. The `out_of_scope` list, in particular, is the thing that prevents scope creep later — if it's not on the in_scope list, it's not in the plan. + +## Deciding + +**Job:** Resolve significant decisions. 
+ +**Inputs:** the scoped project. Each decision area is a "would otherwise be re-litigated" choice — language, data store, auth, deployment target, agent contract, etc. + +**Outputs:** a set of `Decision` records, each with: + +- An issue framing +- 2–4 positions with pros/cons +- A `selected_position` and an `argument` +- Under `full` preset: one `Review` entry per lens (operational, strategic, security, cost, user-impact) +- Final `status: accepted` with a `sign_off` + +**Gate to next phase:** ≥ `min_decisions` count; every decision either `accepted` or `rejected` (no in-flight `proposed`); per-decision review passed if `review_required_per_decision`; no dangling decision dependencies. + +**Why it exists:** decisions made implicitly during decomposition are untraceable. Forcing them into first-class records means future-you (or future-them) can see why the team chose X. The `seed_origin` field also lets the agent learn from past projects without redeciding the obvious. + +## Decomposing + +**Job:** Turn decisions into a task graph. + +**Inputs:** accepted decisions + scope. Each task is a vertical slice that ships some user-visible behavior end-to-end, sized to fit under the preset's `max_task_estimate_hours`. + +**Outputs:** a set of `Task` records, each with: + +- A title and description +- Acceptance criteria (concrete done-when statements) +- An estimate (hours/days + confidence) +- `decision_refs` linking back to the decisions it implements +- `depends_on` for ordering + +**Gate to next phase:** ≥ `min_tasks`; no cycles; no orphan dependencies; every estimate within budget; every `decision_refs` resolves; under `mvp`/`full`, the decomposing phase has been reviewed. + +**Why it exists:** without explicit dependencies, the team works in arbitrary order and discovers blockers late. The dependency graph makes the order legible. The `decision_refs` make traceability automatic — if a decision changes, you can find every task affected. 
+ +## Handing off + +**Job:** Finalize the plan into a target system. + +**Inputs:** the validated decision + task graph; a handoff target (Linear or filesystem). + +**Outputs:** + +- For Linear: a Linear Project, an Issue per decision (labeled `decision`), an Issue per task with priority/estimate/acceptance criteria, `blocks` relations for `depends_on`. Each task's local JSON gets an `external_ref` for traceability. +- For filesystem: the `dr/` tree is finalized, `project.json.handoff` is set, mutations are halted. + +**Gate to next phase:** `project.handoff` set; sign-off provided. + +**Why it exists:** to mark the plan as complete and hand it to the execution system. After this point, the pipeline considers the work done; ongoing changes happen wherever the engineering team works. + +## Handed off (terminal) + +**Job:** Hold the final state. + +**Inputs:** the finished pipeline. + +**Outputs:** none. This is a terminal state — `dr_advance` from `handed-off` returns null. + +**Why it exists:** the pipeline has a clear "done." There is no post-handoff lifecycle in this system; that belongs in Linear/Plane/wherever. + +## Why exactly these five + +We tried a few alternative shapes: + +- **Three phases** (idea → plan → handoff) — too coarse; the agent had to make scope decisions and task decisions in the same step, and they collapsed into each other. +- **Seven phases** (adding "research" before scope and "verification" before handoff) — felt heavier than the workload warranted. The agent can pull research into scoping; verification is what the gates already do. +- **No explicit handoff phase** (just an export tool) — the export ended up being the implicit handoff, but without a phase boundary the gate machine couldn't enforce sign-off and completeness. + +The current shape is the smallest that gives each artifact a single owner and makes every transition load-bearing. + +## What happens between phases + +Between phases, the wizard: + +1. 
Reads the current state with `dr_status`. +2. Evaluates the gate to the next phase. +3. If passing and no human sign-off is required, calls `dr_advance` directly. +4. If passing and human sign-off is required, prompts the user (or auto-confirms under `--yes`). +5. If failing, surfaces the gate reasons and tries to make the agent fix them — usually by running the phase's sub-agent again. + +The phase machine is therefore not just "what's the next thing" — it's "what gate is blocking us, and what work closes that gate." + +## State-driven progression + +Critically: phase progression is **state-driven, not turn-driven**. The wizard doesn't say "we just finished scoping so I'll move to deciding." It says "scope is non-empty, the scope DR is reviewed, the gate passes, so I'll advance." This means: + +- The wizard can resume cleanly mid-phase. +- Partial work isn't wasted. +- A human can edit `project.json` between sessions and the wizard adapts. +- Phase order is enforced by the gate machine, not by the agent's memory. + +That's the underlying primitive that makes the rest work. diff --git a/docs/upstream-canon.md b/docs/explanation/why-decision-records.md similarity index 100% rename from docs/upstream-canon.md rename to docs/explanation/why-decision-records.md diff --git a/docs/how-to/calibrate-gates.md b/docs/how-to/calibrate-gates.md new file mode 100644 index 0000000..6bffb99 --- /dev/null +++ b/docs/how-to/calibrate-gates.md @@ -0,0 +1,79 @@ +# Calibrate gates + +The pipeline is hard-gated — every phase transition checks a set of conditions, and refuses to advance if they're not met. The strictness of those conditions is set per-project by an **effort level** preset, with optional per-knob overrides. 
+ +## Choose a preset + +```bash +decision-record --idea "…" --effort poc # loosest +decision-record --idea "…" --effort mvp # default +decision-record --idea "…" --effort full # strictest +``` + +| Knob | `poc` | `mvp` (default) | `full` | +|---|---|---|---| +| Minimum decisions to advance from deciding | 0 | 3 | 6 | +| Minimum tasks to advance from decomposing | 3 | 8 | 15 | +| Max hours per leaf task | 16 | 8 | 4 | +| Phases that require reviewed scope/decisions/decomp | (none) | scoping, decomposing | scoping, deciding, decomposing | +| Every DR reviewed individually (lens-rotating skeptic) | no | no | **yes** | +| Phases that require human sign-off | handing-off | scoping, decomposing, handing-off | scoping, deciding, decomposing, handing-off | + +**When to use each:** + +- **`poc`** — weekend hacks, prototypes, internal-only spikes. Minimal ceremony. +- **`mvp`** (default) — a real product slice. Scope and decomposition get scrutiny; individual decisions don't get a full review pass. +- **`full`** — production work, regulated domains, anything where reading the decisions in six months matters. Every DR is reviewed by the lens-rotating skeptic before acceptance. + +## Override individual knobs + +Sometimes a preset is close but one knob is off. Override at init time: + +```bash +# Use MVP defaults but require only 5 tasks instead of 8 +decision-record --idea "…" --effort mvp \ + # (override flags coming — for now use the MCP dr_update_project tool after init) +``` + +> The CLI does not currently expose per-knob overrides as flags. You can override them by calling `dr_update_project` via the MCP server, or by editing `dr/project.json` directly (then re-running with `--resume`). A `--gate-override key=value` flag is a planned addition. + +### Override schema + +`project.json` has a `gate_config.overrides` object. 
Any knob you set there wins over the preset: + +```json +{ + "gate_config": { + "preset": "mvp", + "overrides": { + "min_tasks": 5, + "review_required_per_decision": true, + "max_task_estimate_hours": 6 + } + } +} +``` + +Available override knobs: + +| Key | Type | Effect | +|---|---|---| +| `decisions_required_status` | `"accepted"` \| `"any"` | What DR status counts toward the deciding gate. Use `"any"` to allow rejection without re-deciding. | +| `review_required_phases` | `string[]` | Phases at which an antagonistic review must happen before advance. | +| `review_required_per_decision` | `boolean` | If true, every DR needs a passing review before acceptance. | +| `max_task_estimate_hours` | `number` | Leaf task estimate ceiling. | +| `require_human_signoff_phases` | `string[]` | Phases that need human (not agent) sign-off to advance. | +| `min_decisions` | `integer` | Minimum decisions to advance from deciding. | +| `min_tasks` | `integer` | Minimum tasks to advance from decomposing. | + +## Inspect the effective config + +```bash +cat /.dr/state.json | jq '.effective_gate_config' +``` + +The `effective_gate_config` is the materialized preset + overrides; it's what the gate evaluator actually checks against. Edit `project.json` overrides, then re-run with `--resume` to see the change. + +## Why hard gates? + +Soft gates degrade. People learn to skip them. By refusing to emit a "ship-ready plan" until the criteria are met, the resulting artifact becomes trustworthy: if it exists, it's complete. See [the design rationale](../explanation/design-rationale.md) for the longer version. diff --git a/docs/how-to/configure-providers.md b/docs/how-to/configure-providers.md new file mode 100644 index 0000000..19e68f5 --- /dev/null +++ b/docs/how-to/configure-providers.md @@ -0,0 +1,103 @@ +# Configure LLM providers + +The CLI uses the **OpenAI-compatible** API surface. Anything that speaks that protocol works — OpenAI itself, OpenRouter, Ollama, vLLM, LiteLLM, etc. 
+ +## OpenAI (the default) + +```bash +export OPENAI_API_KEY=sk-… +decision-record --idea "…" +``` + +Default model: `gpt-4o`. Override per-call: + +```bash +decision-record --idea "…" --model gpt-4o-mini +``` + +Or persistently: + +```bash +export OPENAI_MODEL=gpt-4o-mini +``` + +## OpenRouter + +[OpenRouter](https://openrouter.ai/) proxies many providers behind a single OpenAI-compatible endpoint. + +```bash +export OPENAI_API_KEY=sk-or-… +export OPENAI_BASE_URL=https://openrouter.ai/api/v1 +export OPENAI_MODEL=anthropic/claude-sonnet-4-6 +decision-record --idea "…" +``` + +## Ollama (local) + +[Ollama](https://ollama.com/) serves an OpenAI-compatible endpoint on `:11434`. + +```bash +ollama pull llama3.1:70b # one time +ollama serve # if not already running +``` + +```bash +export OPENAI_API_KEY=ollama # any non-empty string works +export OPENAI_BASE_URL=http://localhost:11434/v1 +export OPENAI_MODEL=llama3.1:70b +decision-record --idea "…" +``` + +> **Tool calling matters.** The agents rely on the model emitting tool calls. Verify your local model supports OpenAI-style function calling before running a full pipeline. Smaller models often struggle here. + +## vLLM (self-hosted) + +[vLLM](https://github.com/vllm-project/vllm) exposes an OpenAI-compatible server. + +```bash +python -m vllm.entrypoints.openai.api_server \ + --model meta-llama/Llama-3.1-70B-Instruct \ + --port 8000 +``` + +```bash +export OPENAI_API_KEY=any-string +export OPENAI_BASE_URL=http://localhost:8000/v1 +export OPENAI_MODEL=meta-llama/Llama-3.1-70B-Instruct +``` + +## LiteLLM proxy + +[LiteLLM](https://github.com/BerriAI/litellm) is a universal proxy that converts many providers to OpenAI format. 
Once running: + +```bash +export OPENAI_API_KEY=sk-litellm-… +export OPENAI_BASE_URL=http://localhost:4000/v1 +export OPENAI_MODEL=gpt-4o # the alias you defined in litellm config +``` + +## Per-invocation overrides + +All env vars have CLI equivalents that take precedence: + +```bash +decision-record \ + --api-key sk-… \ + --base-url https://openrouter.ai/api/v1 \ + --model anthropic/claude-opus-4-7 \ + --idea "…" +``` + +## Choosing a model + +The agents do a lot of tool calling and structured reasoning. Models that work well: + +| Model | Notes | +|---|---| +| `gpt-4o` | Default; reliable tool calling, good reasoning | +| `gpt-4o-mini` | Faster and cheaper; works for `poc` and many `mvp` projects | +| Claude Sonnet 4.6 via OpenRouter | Strong on long-form reasoning and skeptic critique | +| Claude Opus 4.7 via OpenRouter | Highest-quality decisions and decompositions; slower and pricier | +| Local Llama 3.1 70B+ | Workable if your tooling supports function calling; weaker on subtle critique | + +Pick based on the project's criticality. POC throwaway → `gpt-4o-mini`. Production decision that other people will read → `gpt-4o` or Sonnet/Opus. diff --git a/docs/how-to/handoff-to-linear.md b/docs/how-to/handoff-to-linear.md new file mode 100644 index 0000000..0364f20 --- /dev/null +++ b/docs/how-to/handoff-to-linear.md @@ -0,0 +1,83 @@ +# Hand off to Linear + +When the pipeline reaches the handoff phase, the wizard can push the finished plan into Linear — a Project containing one Issue per task and one Issue (labeled `decision`) per accepted DR, with `blocks` relations matching task dependencies. + +## One-time setup + +1. **Get a Linear API key.** + Settings → API → Personal API keys → "New". Copy the `lin_api_…` value. + +2. **Find your team ID.** + Two easy ways: + - In Linear, open any issue → look at the URL: `linear.app//issue/` — the `TEAM` prefix is the team key, not the ID. 
To get the UUID, use Linear's GraphQL API explorer or [`linear teams`](https://linear.app/docs/cli) in their CLI.
If you want to script a preview only, invoke the MCP tool directly: + +```bash +node dist/index.js # start the MCP server, then call dr_export_linear with dry_run=true +``` + +Or just run with `--yes` and watch the dry-run output before answering the confirm prompt (when not in autonomous mode). + +## Filesystem only + +If `LINEAR_API_KEY` is **not** set in the environment, the wizard skips the Linear branch entirely and exports to filesystem. The plan is still complete and shippable — engineers can pick it up from `dr/` directly and create issues themselves wherever they want. + +## When it fails partway + +The current Linear export is one-shot, not idempotent. If a `dr_export_linear` call fails after creating some issues: + +1. The wizard logs `export_failed` to `events.jsonl` and exits with code 1. +2. **No reconciliation logic** — the partial Linear project exists, but a re-run will create a fresh project alongside it. +3. Delete the partial project in Linear, then re-run with `--resume`. + +A reconciliation pass that detects and continues partial exports is a known follow-up. + +## Other handoff targets + +The data model is target-agnostic. To support Plane, GitHub Projects, Jira, etc., follow the pattern in `server/src/handoff/linear.ts` — a `buildExportPlan` function plus per-target API calls. PRs welcome. diff --git a/docs/how-to/install.md b/docs/how-to/install.md new file mode 100644 index 0000000..0a5e640 --- /dev/null +++ b/docs/how-to/install.md @@ -0,0 +1,80 @@ +# Install + +Two ways to use decision-record: + +1. **Standalone CLI** — fast to set up, no Claude Code dependency. +2. **Claude Code plugin** — adds the `/plan` slash command and registers the MCP server with Claude Code. + +Both share the same MCP server binary, the same artifacts on disk, and the same gate machine. 
+ +## Standalone CLI + +```bash +git clone https://github.com/protoLabsAI/decision-record.git +cd decision-record/server +npm install +npm run build +``` + +The build produces `dist/cli.js` (CLI) and `dist/index.js` (MCP server). Run the CLI directly: + +```bash +export OPENAI_API_KEY=sk-… +node dist/cli.js --help +``` + +Optionally, symlink it onto your PATH: + +```bash +ln -s "$(pwd)/dist/cli.js" /usr/local/bin/decision-record +chmod +x /usr/local/bin/decision-record +decision-record --help +``` + +A published-to-npm release is on the roadmap — once shipped, `npx @protolabs/decision-record-server` will work without the clone. + +## Claude Code plugin + +The repo root contains a `.claude-plugin/plugin.json` and an `.mcp.json` that point Claude Code at the bundled server. To install locally: + +```bash +git clone https://github.com/protoLabsAI/decision-record.git +cd decision-record/server +npm install +npm run build +cd .. + +# Symlink into the Claude plugins directory +ln -s "$(pwd)" ~/.claude/plugins/decision-record +``` + +Restart Claude Code. You should see: + +- The `/plan` slash command available +- The `decision-record` MCP server listed in `/mcp` +- The `dr-wizard`, `dr-skeptic`, `dr-decomposer` sub-agents available + +Trigger a session: + +``` +/plan a CLI tool that converts QuickBooks CSV exports +``` + +A marketplace-published version is planned. When available, `/plugin install decision-record` will do everything above. 
+ +## Verify + +```bash +# Standalone +node dist/cli.js --version +# decision-record 0.1.0 + +# Plugin (inside Claude Code) +/mcp +# should list `decision-record` with green status +``` + +## Next + +- [Run the CLI](run-the-cli.md) — first invocation patterns +- [Configure LLM providers](configure-providers.md) — OpenAI, OpenRouter, Ollama, vLLM, LiteLLM diff --git a/docs/how-to/run-the-cli.md b/docs/how-to/run-the-cli.md new file mode 100644 index 0000000..40bebbe --- /dev/null +++ b/docs/how-to/run-the-cli.md @@ -0,0 +1,114 @@ +# Run the CLI + +Four common invocation patterns: + +## 1. One-line idea + +```bash +decision-record --idea "a CLI tool that normalizes accounting exports" +``` + +The wizard will derive a title from the idea text. The rest of the pipeline runs in the current directory (`.dr/` and `dr/` will appear there). + +To target a different directory: + +```bash +decision-record --idea "…" --cwd ~/dev/my-project +``` + +## 2. From a PRD file + +```bash +decision-record --prd ./docs/idea.md --cwd ~/dev/my-project +``` + +The PRD reader looks for: + +- The first `# heading` → title hint +- The first non-heading paragraph → description hint + +The full PRD text is passed to the scoping agent as context. Combine `--prd` with `--idea` if you want to override the title hint: + +```bash +decision-record --prd ./docs/idea.md --idea "ledger CLI" --cwd … +``` + +## 3. Resume an in-progress project + +If the CLI is interrupted (or you came back later), pick up where you left off: + +```bash +decision-record --resume --cwd ~/dev/my-project +``` + +The wizard reads `.dr/state.json`, sees what phase you were in, and continues from there. State is durable across sessions. + +## 4. Fully autonomous + +The `--yes` flag bypasses every interactive checkpoint: + +```bash +decision-record --idea "…" --effort poc --yes +``` + +Useful for CI, scripted runs, or benchmarks. 
**Read what gets produced** — the wizard will not stop to ask, including at gates that normally require human sign-off. + +## Common flags + +| Flag | Meaning | +|---|---| +| `--idea TEXT` | Free-form one-line idea | +| `--title TEXT` | Explicit project title (overrides derivation) | +| `--description TEXT` | Explicit description | +| `--prd PATH` | Read a Markdown PRD as scope context | +| `--cwd PATH` | Target project directory (default: `process.cwd()`) | +| `--effort poc\|mvp\|full` | Gate strictness preset (default: `mvp`) | +| `--model NAME` | Override `OPENAI_MODEL` | +| `--api-key KEY` | Override `OPENAI_API_KEY` | +| `--base-url URL` | Override `OPENAI_BASE_URL` | +| `--resume` | Skip intake; resume the project in `--cwd` | +| `--yes`, `-y` | Bypass interactive checkpoints | +| `--verbose`, `-v` | Stream agent reasoning + tool calls to stderr | +| `--help`, `-h` | Show full help | +| `--version` | Print version | + +Full flag reference: [`docs/reference/cli.md`](../reference/cli.md). 
+ +## Watching the wizard work + +Use `--verbose` (or `-v`) to see agent reasoning and every MCP tool call: + +```bash +decision-record --idea "…" --effort poc --verbose +``` + +Output goes to **stderr**, so stdout stays clean for piping. To capture the progress log, redirect stderr to a file: + +```bash +decision-record --idea "…" --yes 2>plan.log +``` + +## Exit codes + +| Code | Meaning | +|---|---| +| `0` | Pipeline completed successfully (reached `handed-off`) | +| `1` | A phase failed (gate failure, agent error, export failure) | +| `2` | Bad arguments or missing env (`OPENAI_API_KEY`) | + +## What lands on disk + +``` +<--cwd>/ +├── .dr/ # internal (gitignored automatically) +│ ├── state.json # pipeline state +│ └── events.jsonl # append-only event log +└── dr/ # tracked — commit this + ├── project.json # MVP manifest + ├── project.md # rendered view + ├── decisions/ # one .json + .md per DR + ├── tasks/ # one .json + .md per task + └── index.html # rendered project overview +``` + +JSON is the source of truth; `.md` and `index.html` are regenerated by the wizard. The `.dr/` directory's `.gitignore` is created automatically. diff --git a/docs/quickstart.md b/docs/quickstart.md deleted file mode 100644 index 4bbe69c..0000000 --- a/docs/quickstart.md +++ /dev/null @@ -1,80 +0,0 @@ -# Quickstart - -A five-minute walkthrough of taking an idea to a ship-ready MVP plan. - -## Prerequisites - -- Claude Code installed -- Node 20+ -- (Optional) A Linear account and a personal API token if you want to push the final plan to Linear - -## Install (local dev) - -```bash -git clone https://github.com/protoLabsAI/decision-record.git -cd decision-record/server -npm install -npm run build -``` - -Then either: - -- **As a Claude Code plugin** — symlink the `decision-record` directory into `~/.claude/plugins/decision-record/`, restart Claude Code, and the `/plan` command + the `decision-record` MCP server should be available. 
-- **As a bare MCP server** — point any MCP client at `node /path/to/decision-record/server/dist/index.js`. - -## Run - -In a target repository (the project you want to plan): - -``` -/plan a small CLI that converts QuickBooks CSV exports to a normalized ledger format -``` - -You'll see the `dr-wizard` agent take over. It will: - -1. Confirm the title, description, and effort level (default: `mvp`). -2. Run `dr_init`, creating `.dr/` and `dr/` in your target repo. -3. Advance to scoping and start asking about MVP boundaries. - -## What you'll do, in order - -1. **Scope it.** Three or four bullets each for in-scope, out-of-scope, and success criteria. The wizard will push back if you're vague. -2. **Decide.** The wizard surfaces 3-6 significant decisions (language, data store, deployment, etc.), pulling from the seed library where it can. You pick a position and write a brief argument for each. The `dr-skeptic` agent will review them. -3. **Decompose.** The `dr-decomposer` agent proposes a beads-style task graph. You review, refine, and lock it. -4. **Hand off.** Push to Linear (with `LINEAR_API_KEY` and a team ID) or finalize to the filesystem. - -When the wizard reports `Phase: handed-off`, you have a complete plan. Open `dr/index.html` to see it rendered. 
- -## What you get - -In your target repo: - -``` -.dr/ -├── state.json # pipeline state -└── events.jsonl # audit log -dr/ -├── project.json # the MVP manifest -├── project.md # human-readable view -├── decisions/ -│ ├── 0001-*.json -│ └── 0001-*.md -├── tasks/ -│ ├── T0001-*.json -│ └── T0001-*.md -└── index.html # rendered project overview -``` - -If you handed off to Linear, you also get: - -- A Linear Project named after your manifest -- An Issue per decision (labeled `decision`) -- An Issue per task (with priority, estimate, and labels) -- `blocks` relations matching task dependencies - -## Common follow-ups - -- **Re-render after manual edits to JSON:** run the wizard again (`/plan`) and ask it to call `dr_render`. -- **Resume an interrupted session:** just run `/plan` again. The wizard's first action is `dr_status`, which picks up where you left off. -- **Loosen / tighten gates:** the wizard understands `gate_overrides` — ask it to "change `min_tasks` to 5" or similar. -- **Add a new seed:** drop a JSON file in `server/seed/` following the shape of the existing entries; the wizard will find it on next search. diff --git a/docs/reference/cli.md b/docs/reference/cli.md new file mode 100644 index 0000000..13149c2 --- /dev/null +++ b/docs/reference/cli.md @@ -0,0 +1,108 @@ +# CLI reference + +``` +decision-record [options] +``` + +## Synopsis + +```bash +decision-record [--idea TEXT | --prd PATH | --resume] [options] +``` + +## Description + +Run the decision-record planning pipeline against a target project directory. By default, starts a new project from an idea string; with `--resume`, continues an existing project; with `--prd`, reads scope context from a Markdown file. + +The CLI orchestrates a phase state machine (intake → scoping → deciding → decomposing → handing-off → handed-off), running LLM-driven sub-agents for the actual planning work and stopping at human sign-off gates when configured. 
+ +## Options + +### Project input + +| Flag | Type | Default | Description | +|---|---|---|---| +| `--idea TEXT` | string | — | Free-form one-line idea. Used to derive title + description. | +| `--title TEXT` | string | derived from `--idea` or `--prd` | Explicit project title. Max 120 chars. | +| `--description TEXT` | string | derived from `--idea` or `--prd` | Explicit project description. | +| `--prd PATH` | string | — | Markdown PRD file; first H1 used as title hint, first paragraph as description hint, full text passed to scoping agent. | + +A positional argument can substitute for `--idea` if no other input flag is given. + +### Pipeline behavior + +| Flag | Type | Default | Description | +|---|---|---|---| +| `--cwd PATH` | string | `process.cwd()` | Target project directory. State lands under `.dr/` and `dr/`. | +| `--effort poc\|mvp\|full` | string | `mvp` | Gate strictness preset. See [Calibrate gates](../how-to/calibrate-gates.md). | +| `--resume` | flag | false | Skip intake; pick up the existing project in `--cwd`. | +| `--yes`, `-y` | flag | false | Bypass interactive checkpoints (fully autonomous). | +| `--verbose`, `-v` | flag | false | Stream agent reasoning and tool calls to stderr. | + +### LLM connection + +| Flag | Type | Default | Description | +|---|---|---|---| +| `--model NAME` | string | `$OPENAI_MODEL` or `gpt-4o` | OpenAI-compat model name. | +| `--api-key KEY` | string | `$OPENAI_API_KEY` | OpenAI-compat API key. | +| `--base-url URL` | string | `$OPENAI_BASE_URL` or OpenAI default | OpenAI-compat base URL (for OpenRouter, Ollama, vLLM, LiteLLM, etc.). | + +### Informational + +| Flag | Description | +|---|---| +| `--help`, `-h` | Print help and exit. | +| `--version` | Print version (`decision-record X.Y.Z`) and exit. | + +## Environment variables + +| Variable | Required | Description | +|---|---|---| +| `OPENAI_API_KEY` | yes (unless `--api-key`) | API key for the LLM endpoint. 
| +| `OPENAI_BASE_URL` | no | OpenAI-compatible base URL. Defaults to OpenAI's. | +| `OPENAI_MODEL` | no | Default model. Defaults to `gpt-4o`. | +| `LINEAR_API_KEY` | no | Enables the Linear handoff branch in the handoff phase. | +| `LINEAR_TEAM_ID` | no | Pre-fills the team ID prompt at Linear handoff. | +| `DR_LOG_LEVEL` | no | `debug` \| `info` \| `warn` \| `error`. Default `info`. Applies to the MCP server's stderr logs. | +| `DR_SEED_DIR` | no | Override the seed library directory. Defaults to the bundled `server/seed/`. | + +## Exit codes + +| Code | Meaning | +|---|---| +| `0` | Pipeline completed successfully (final phase is `handed-off`, or the user declined to advance at a checkpoint and that was a clean stop). | +| `1` | A phase failed: gate failure, agent error, validation failure, export failure. | +| `2` | Bad arguments, missing required env (`OPENAI_API_KEY`), or precondition not met (e.g., `--resume` against a directory with no project). | + +## Output + +- **stdout** — minimal; mostly empty except for `--version` output and terminal summaries. +- **stderr** — all wizard progress, agent summaries, and checkpoint prompts. Redirect with `2>file` if you want to capture it. 
+ +## Examples + +```bash +# Minimal — uses cwd, derives title from idea +decision-record --idea "a CLI to dedupe contact lists" + +# Specify everything explicitly +decision-record \ + --title "Contact deduper" \ + --description "A CLI that reads CSVs of contacts and merges fuzzy duplicates" \ + --effort mvp \ + --cwd ~/dev/dedup \ + --model gpt-4o \ + --yes + +# From a PRD +decision-record --prd ./docs/idea.md --cwd ~/dev/my-project + +# Resume after a break +decision-record --resume --cwd ~/dev/my-project + +# Use OpenRouter +decision-record \ + --idea "…" \ + --base-url https://openrouter.ai/api/v1 \ + --model anthropic/claude-sonnet-4-6 +``` diff --git a/docs/reference/data-model.md b/docs/reference/data-model.md new file mode 100644 index 0000000..420fc42 --- /dev/null +++ b/docs/reference/data-model.md @@ -0,0 +1,152 @@ +# Data model + +The pipeline stores five entity types. JSON Schemas are the source of truth in [`../../schemas/`](../../schemas/); the Zod mirrors used at runtime live in [`server/src/schemas/index.ts`](../../server/src/schemas/index.ts). + +## Filesystem layout + +``` +/ +├── .dr/ # internal, gitignored by default +│ ├── state.json # PipelineState +│ ├── events.jsonl # Event (one per line, append-only) +│ └── cache/ # derived artifacts +└── dr/ # tracked + ├── project.json # Project + ├── project.md # rendered, derived + ├── decisions/ + │ ├── 0001-*.json # Decision + │ └── 0001-*.md # rendered, derived + ├── tasks/ + │ ├── T0001-*.json # Task + │ └── T0001-*.md # rendered, derived + └── index.html # rendered, derived +``` + +JSON is source of truth; `.md` and `index.html` are regenerated by `dr_render`. + +## Project (`dr/project.json`) + +The MVP manifest. + +| Field | Type | Notes | +|---|---|---| +| `id` | string (kebab-slug) | Derived from title at init. | +| `title` | string (1–120) | | +| `description` | string? | | +| `created_at`, `updated_at` | ISO datetime | | +| `effort_level` | `"poc" \| "mvp" \| "full"` | Calibrates gates. 
| +| `status` | phase enum | `intake \| scoping \| deciding \| decomposing \| handing-off \| handed-off`. | +| `scope` | object? | `{ in_scope, out_of_scope, success_criteria, nice_to_have }` — each is `string[]`. | +| `sign_offs` | array | `{ phase, by, actor?, at, notes? }`. | +| `handoff` | object? | `{ target, target_id?, target_url?, exported_at, issue_count?, document_count? }`. Set after `dr_export_*`. | +| `gate_config` | object | `{ preset, overrides? }`. See [Gates](gates.md). | +| `tags` | string[] | | + +## Decision (`dr/decisions/.json`) + +A single significant choice with context, alternatives, and rationale. + +| Field | Type | Notes | +|---|---|---| +| `id` | `"0001-slug"` | Composite identifier. | +| `number` | integer ≥1 | Monotonic per project. | +| `slug` | string | Kebab-case. | +| `title` | string (1–80) | Imperative. | +| `status` | enum | `rfc \| proposed \| accepted \| rejected \| deprecated \| superseded`. | +| `template_variant` | enum | `canonical \| lightweight \| scoping \| vendor \| architecture \| data-model`. | +| `created_at`, `updated_at` | ISO datetime | | +| `summary` | string? | One-line. | +| `issue` | string? | Why this decision needs to be made. | +| `assumptions` | string[] | | +| `constraints` | string[] | | +| `positions` | Position[] | Candidate options. | +| `opinions` | Opinion[] | Stakeholder views. | +| `argument` | string? | Rationale for the selected position. | +| `selected_position` | string? | Must match a Position title. | +| `implications` | string[] | | +| `depends_on` | DecisionId[] | Must be `accepted` before this can be. | +| `related_decisions` | DecisionId[] | Referenced but not blocking. | +| `related_artifacts` | string[] | URLs or repo paths. | +| `review` | Review[] | Antagonistic-review entries. | +| `sign_off` | object? | `{ by, actor?, at, notes? }`. Set when accepted. | +| `superseded_by` | DecisionId? | If `status === "superseded"`. | +| `seed_origin` | string? 
| Seed name if instantiated from one. | +| `tags` | string[] | | + +### Position + +`{ title, description?, pros, cons, cost?, links }`. Each list defaults to `[]`. + +### Opinion + +`{ author, by: "agent" | "human", at, body, position_ref? }`. + +### Review + +`{ reviewer, lens, verdict: "pass" | "block", score (1-5)?, concerns, at }`. Lenses: `operational \| strategic \| security \| cost \| user-impact`. + +## Task (`dr/tasks/.json`) + +A beads-style work unit. + +| Field | Type | Notes | +|---|---|---| +| `id` | `"T0001-slug"` | Composite identifier. | +| `number` | integer ≥1 | Monotonic per project. | +| `slug` | string | Kebab-case. | +| `title` | string (1–120) | | +| `description` | string? | | +| `status` | enum | `open \| ready \| in_progress \| done \| blocked \| deferred`. | +| `estimate` | object? | `{ unit: "hours" \| "days", value, confidence?: "low" \| "med" \| "high" }`. | +| `acceptance_criteria` | string[] | | +| `depends_on` | TaskId[] | Must be `done` before this can start. | +| `decision_refs` | DecisionId[] | Decisions this task implements. | +| `priority` | `"p0" \| "p1" \| "p2" \| "p3"` | Default `p2`. | +| `labels` | string[] | | +| `assignee_hint` | `"agent" \| "human" \| "either"`? | | +| `external_ref` | object? | Set at handoff. `{ system: "linear" \| "github" \| "plane" \| "jira" \| "other", id, url? }`. | +| `created_at`, `updated_at` | ISO datetime | | + +## PipelineState (`.dr/state.json`) + +Internal pipeline bookkeeping. Never edit by hand. + +| Field | Type | Notes | +|---|---|---| +| `schema_version` | semver string | Bumped on breaking layout changes. | +| `project_id` | string | Matches `project.json.id`. | +| `phase` | phase enum | Mirrors `project.status` but the pipeline writes this. | +| `effective_gate_config` | object | Materialized preset + overrides. | +| `next_decision_seq`, `next_task_seq` | integer ≥1 | Monotonic counters. | +| `pending_questions` | array | Open questions the agent surfaced. 
| +| `gate_failures` | array | History of failed advance attempts (for debugging). | +| `last_event_at`, `last_render_at` | ISO datetime? | | + +## Event (`.dr/events.jsonl`) + +One JSON line per pipeline action. Append-only audit log. + +| Field | Type | Notes | +|---|---|---| +| `at` | ISO datetime | | +| `actor` | `"agent" \| "human" \| "system"` | | +| `actor_name` | string? | | +| `kind` | enum | See below. | +| `entity_kind` | `"project" \| "decision" \| "task" \| "phase" \| "question"`? | | +| `entity_id` | string? | | +| `payload` | object? | Event-specific. | +| `correlation_id` | string? | Groups related events. | + +### Event kinds + +`project_initialized`, `phase_advanced`, `phase_advance_blocked`, `scope_updated`, `decision_proposed`, `decision_updated`, `decision_reviewed`, `decision_accepted`, `decision_rejected`, `task_proposed`, `task_updated`, `task_status_changed`, `graph_validated`, `gate_check_passed`, `gate_check_failed`, `question_asked`, `question_answered`, `seed_loaded`, `render_run`, `export_started`, `export_completed`, `export_failed`, `sign_off_recorded`. + +## ID conventions + +| Entity | Format | Example | +|---|---|---| +| Decision | `<4-digit>-<slug>` | `0003-define-the-agent-action-contract` | +| Task | `T<4-digit>-<slug>` | `T0006-implement-the-tick-based-game-loop` | +| Project | kebab-slug | `ai-driven-roguelike-poc` | + +Slugs are 2–64 chars, lower-case alphanumerics + dashes, no leading/trailing dash. diff --git a/docs/reference/gates.md b/docs/reference/gates.md new file mode 100644 index 0000000..e3dad85 --- /dev/null +++ b/docs/reference/gates.md @@ -0,0 +1,78 @@ +# Gates reference + +Every phase transition is checked by a set of gate conditions. The full evaluator lives at [`server/src/gateEval.ts`](../../server/src/gateEval.ts). This page documents what each gate checks and what each preset sets. 
+ +## Phase machine + +``` +intake ─→ scoping ─→ deciding ─→ decomposing ─→ handing-off ─→ handed-off +``` + +`dr_advance` is the only way to move forward. It evaluates the gate for the **next** phase against current state, and either transitions (and emits `phase_advanced`) or records a `phase_advance_blocked` event with reasons. + +## What each gate checks + +| Advancing to | Conditions | +|---|---| +| `scoping` | Project title non-empty; description non-empty. | +| `deciding` | `scope.in_scope` non-empty; `scope.success_criteria` non-empty; if `review_required_phases` includes `"scoping"`, a `scoping`-variant DR has a passing review. | +| `decomposing` | Number of decisions ≥ `min_decisions`; if `decisions_required_status === "accepted"`, no decisions in `proposed`/`rfc`; if `review_required_per_decision`, every accepted decision has a passing review; if `review_required_phases` includes `"deciding"`, at least one decision has a passing review; no decisions reference missing dependency IDs. | +| `handing-off` | Number of tasks ≥ `min_tasks`; no tasks reference missing dependency tasks; no cycles in the task dependency graph; every task has an estimate ≤ `max_task_estimate_hours` (days are normalized to hours at 8h/day); every task's `decision_refs` resolve. | +| `handed-off` | `project.handoff` is set (i.e., `dr_export_filesystem` or `dr_export_linear` has run). | + +## Sign-off check (overlay) + +If the next phase is in the project's `require_human_signoff_phases`, the gate also requires `dr_advance` to be called with `sign_off_by: "human"`. Without it, the gate fails with a clear "Sign-off gate" reason. + +The orchestrator (CLI + dr-wizard) handles this automatically: it pauses at the relevant checkpoint, asks the user, then calls `dr_advance` with sign-off. Manual MCP callers must remember. 
+ +## Preset matrix + +| Knob | `poc` | `mvp` | `full` | +|---|---|---|---| +| `decisions_required_status` | `accepted` | `accepted` | `accepted` | +| `review_required_phases` | `[]` | `["scoping", "decomposing"]` | `["scoping", "deciding", "decomposing"]` | +| `review_required_per_decision` | `false` | `false` | **`true`** | +| `max_task_estimate_hours` | `16` | `8` | `4` | +| `require_human_signoff_phases` | `["handing-off"]` | `["scoping", "decomposing", "handing-off"]` | `["scoping", "deciding", "decomposing", "handing-off"]` | +| `min_decisions` | `0` | `3` | `6` | +| `min_tasks` | `3` | `8` | `15` | + +## Override knobs + +Per-project overrides at `project.json → gate_config.overrides` take precedence per-key over the preset. Any of the seven keys above can be overridden; omitted keys inherit the preset. + +```json +{ + "gate_config": { + "preset": "mvp", + "overrides": { + "min_tasks": 5, + "review_required_per_decision": true + } + } +} +``` + +The materialized result is at `state.effective_gate_config` — that's what the evaluator actually reads. + +## Inspecting gate state + +```bash +# Current evaluation against the next phase +node dist/index.js # then call dr_status + +# Or directly: +cat <cwd>/.dr/state.json | jq '.effective_gate_config' +cat <cwd>/dr/project.json | jq '.gate_config' +``` + +`dr_status` returns a `gate_to_next` block: `{ pass, reasons[], next_phase }`. Read the reasons — they name the specific failing knob and the specific shortfall. + +## Why hard gates + +The system refuses to advance when gates fail. There is no `--force` flag, no admin override. + +The trade-off is intentional. Soft gates degrade — people learn to skip them, and the artifact stops being trustworthy. With hard gates, the rule is: if a plan exists and reached `handed-off`, every gate it crossed actually passed. The plan is real. + +If a gate is too strict, change the gate (override the knob in `project.json`). Don't bypass it. 
diff --git a/docs/reference/mcp-tools.md b/docs/reference/mcp-tools.md new file mode 100644 index 0000000..e5a1c47 --- /dev/null +++ b/docs/reference/mcp-tools.md @@ -0,0 +1,188 @@ +# MCP tools + +The MCP server exposes the planning pipeline as a set of tools an agent can call. The CLI uses the same registry in-process; nothing is CLI-only. + +Every tool accepts `cwd?: string` (the target project directory; defaults to the server's `process.cwd()`). + +## Pipeline tools + +### `dr_init` + +Initialize the pipeline in a target repo. Creates `.dr/` and `dr/` layout, writes `state.json` and `project.json`. Fails if already initialized. + +| Input | Type | Notes | +|---|---|---| +| `title` | string | Project title. | +| `description` | string? | Intake description. | +| `effort_level` | `"poc" \| "mvp" \| "full"` | Default `mvp`. | +| `gate_overrides` | object? | Per-knob preset overrides. See [Gates reference](gates.md). | +| `tags` | string[] | Free-form. | +| `project_id` | string? | Override the derived slug. | + +Returns: `{ project_id, paths, project, state, next_phase }`. + +### `dr_status` + +Read pipeline status. Returns project metadata, current phase, gate evaluation against the next phase (what's blocking advance), counts, pending questions, effective gate config. + +### `dr_advance` + +Advance to the next pipeline phase if the gate passes. Records a sign-off and emits `phase_advanced`. If the gate fails, returns reasons without changing phase. + +| Input | Type | Notes | +|---|---|---| +| `sign_off_by` | `"agent" \| "human"`? | Required when the next phase has human sign-off requirement. | +| `sign_off_actor` | string? | Identifier of the signing actor. | +| `sign_off_notes` | string? | Free-form notes attached to the sign-off. | + +### `dr_update_project` + +Patch project metadata: `title`, `description`, `tags`, and `gate_overrides`. Cannot change the `effort_level` preset (re-init for that). 
+ +### `dr_update_scope` + +Replace any/all of `in_scope`, `out_of_scope`, `success_criteria`, `nice_to_have`. Each list is fully replaced when provided; omitted lists are unchanged. + +## Decision tools + +### `dr_propose_decision` + +Create a new decision record (`status: "proposed"`). + +| Input | Type | Notes | +|---|---|---| +| `title` | string | Short imperative, max 80 chars. | +| `template_variant` | `"canonical" \| "lightweight" \| "scoping" \| "vendor" \| "architecture" \| "data-model"` | Default `canonical`. | +| `summary`, `issue`, `assumptions`, `constraints`, `positions`, `depends_on`, `tags`, `seed_origin`, `slug` | various | Optional initial content. | + +### `dr_update_decision` + +Patch any field. Pass only the fields you want to change. `add_opinion` appends a single opinion entry. + +### `dr_review_decision` + +Record an antagonistic-review pass. + +| Input | Type | Notes | +|---|---|---| +| `id` | string | Decision id. | +| `reviewer` | string | e.g., `"dr-skeptic"`. | +| `lens` | `"operational" \| "strategic" \| "security" \| "cost" \| "user-impact"` | The review lens. | +| `verdict` | `"pass" \| "block"` | | +| `score` | number (1–5) | Optional. | +| `concerns` | string[] | Crisp one-line concerns. | + +### `dr_accept_decision` + +Move a decision to `accepted` and record sign-off. Requires `selected_position` and `argument` set. Requires a passing review if `review_required_per_decision` is true. Rejects if any blocking deps are unmet. + +### `dr_reject_decision` + +Move a decision to `rejected` with a reason and sign-off. + +### `dr_list_decisions` + +Filter by `status[]` and/or `template_variant[]`. Returns summaries. + +### `dr_get_decision` + +Fetch the full content of a decision by id. + +### `dr_ready_decisions` + +Return decisions whose `depends_on` are all `accepted` (or which have no deps). Used by the agent to pick the next DR to work on. + +## Task tools + +### `dr_propose_task` + +Create a new task node. 
Status defaults to `ready` if no deps, `open` otherwise. + +| Input | Type | Notes | +|---|---|---| +| `title`, `description` | string | | +| `depends_on` | string[] | Task IDs. | +| `decision_refs` | string[] | Decision IDs the task implements. | +| `estimate` | `{ unit: "hours" \| "days", value, confidence? }` | | +| `acceptance_criteria` | string[] | | +| `priority` | `"p0" \| "p1" \| "p2" \| "p3"` | Default `p2`. | +| `labels` | string[] | | +| `assignee_hint` | `"agent" \| "human" \| "either"` | | + +### `dr_update_task` + +Patch fields. Use `dr_set_task_status` to change lifecycle state. + +### `dr_set_task_status` + +Change status: `open`, `ready`, `in_progress`, `done`, `blocked`, `deferred`. + +### `dr_list_tasks`, `dr_get_task` + +Filter / fetch. + +### `dr_ready_tasks` + +Tasks whose deps are all `done` (or no deps), sorted by priority. The beads-style "what's next" query. + +### `dr_validate_graph` + +Validate the full task graph: no cycles, no orphan dependencies, all estimates ≤ `max_task_estimate_hours`, all `decision_refs` resolve. Emits `graph_validated`. Returns `{ valid, errors[], warnings[], cycles[], orphans[], oversized[], missing_decision_refs[] }`. + +## Seed library tools + +### `dr_seed_search` + +Keyword search over the bundled seed library. + +| Input | Type | Notes | +|---|---|---| +| `query` | string | Matches on name, title, keywords, tags. | +| `limit` | integer | Default 5. | + +### `dr_seed_list` + +List every seed. + +### `dr_seed_get` + +Fetch one seed's full content (including `notes_for_agent`). + +### `dr_seed_load` + +Instantiate a seed as a `proposed` DR. Pre-fills positions, assumptions, constraints, implications. + +| Input | Type | Notes | +|---|---|---| +| `seed_name` | string | E.g., `"language-choice"`. | +| `title_override` | string? | Project-specific title. | +| `slug_override` | string? | | +| `depends_on` | string[] | Decision IDs this DR depends on. 
| +| `tags` | string[] | | + +## Render + +### `dr_render` + +Regenerate Markdown + `index.html` from JSON. Idempotent. + +## Handoff + +### `dr_export_filesystem` + +Finalize the project to filesystem only. Records handoff metadata, transitions to `handed-off`, prevents further phase changes. Requires the project to be in `handing-off` phase. + +### `dr_export_linear` + +Push to Linear via the GraphQL API. Creates a Project, Issues per decision (labeled `decision`) and per task, with `blocks` relations matching `depends_on`. Supports `dry_run: true` to preview without calling Linear. + +| Input | Type | Notes | +|---|---|---| +| `team_id` | string | Linear team UUID. | +| `api_key` | string? | Defaults to `$LINEAR_API_KEY`. | +| `dry_run` | boolean | Default `false`. | +| `sign_off_by`, `sign_off_actor`, `sign_off_notes` | various | Sign-off metadata. | + +## Where the schemas live + +Every tool's input is validated by Zod at the server. JSON Schema mirrors for external consumers live in [`../../schemas/`](../../schemas/). The Zod source of truth is at [`server/src/schemas/index.ts`](../../server/src/schemas/index.ts). diff --git a/docs/tutorials/your-first-plan.md b/docs/tutorials/your-first-plan.md new file mode 100644 index 0000000..7f60435 --- /dev/null +++ b/docs/tutorials/your-first-plan.md @@ -0,0 +1,164 @@ +# Your first plan + +By the end of this tutorial you will have used decision-record to turn a one-line idea into a complete, scoped, decision-backed, task-decomposed MVP plan — and you will have looked at every artifact the system produces. This takes about 15 minutes. + +We will use the **roguelike-ai-poc** benchmark idea — a small but real planning problem — so you can see the system handle something other than `hello world`. + +## Before you start + +You need: + +1. **Node 20 or later** installed (`node --version` should print `v20.x` or higher). +2. 
**An OpenAI-compatible API key.** This can be: + - An OpenAI API key (`OPENAI_API_KEY=sk-…`), or + - Any compatible endpoint — set `OPENAI_BASE_URL` and `OPENAI_MODEL`. See [Configure LLM providers](../how-to/configure-providers.md). +3. **The repo cloned and built:** + ```bash + git clone https://github.com/protoLabsAI/decision-record.git + cd decision-record/server + npm install + npm run build + ``` + +You do **not** need the Claude Code plugin installed for this tutorial. We will run the CLI directly. + +## Step 1: Pick a working directory + +The system writes artifacts into a target project directory. We will create a fresh one: + +```bash +mkdir -p ~/dev/my-first-plan +``` + +Everything that follows lands in there. Nothing is written into the decision-record repo itself. + +## Step 2: Run the CLI + +From the `decision-record/server/` directory: + +```bash +export OPENAI_API_KEY=sk-… # if you haven't already + +node dist/cli.js \ + --idea "a CLI tool that converts QuickBooks CSV exports into a normalized double-entry ledger" \ + --effort poc \ + --cwd ~/dev/my-first-plan +``` + +You can also drop the `--idea` flag entirely and run interactively — but for a guided first run, this is cleaner. + +## Step 3: Watch the wizard work + +The CLI will print colored progress to stderr as each phase runs. You will see something like: + +``` +━━━ decision-record v0.1.0 ━━━ + Target: /Users/you/dev/my-first-plan + Model: gpt-4o +━━━ Phase: Intake ━━━ +✓ Initialized 'a-cli-tool-that-converts-quickbooks-csv-export…' at effort_level=poc +✓ Advanced: intake → scoping +━━━ Phase: Scoping ━━━ + Running scoping agent… +✓ Scoping agent finished (3 tool calls). +──────────────────────────────────────────────────────────── +Scope set. 
in_scope: read QuickBooks CSV, parse rows… +… +──────────────────────────────────────────────────────────── +✓ Advanced: scoping → deciding +━━━ Phase: Deciding ━━━ + Running deciding agent (proposing decisions)… +… +━━━ Antagonistic review: 4 decisions × 5 lenses ━━━ + operational: pass (4/5) + strategic: pass (4/5) +… +✓ Accepted 0001-… +… +━━━ Phase: Decomposing ━━━ + Running decomposer agent (building task graph)… +✓ Decomposer finished (28 tool calls). Graph validates. +… +━━━ Phase: Handoff ━━━ +✓ Artifacts rendered. +> LINEAR_API_KEY detected. Push the plan to Linear? [Y/n] [auto-yes] +✓ Plan finalized to filesystem. +✓ Pipeline complete. Final phase: handed-off +``` + +Each phase shows what it did. Read the summaries — they tell you what the agent decided. + +> **About checkpoints:** Under the `poc` preset, only the **handoff** transition requires human sign-off. The sample output above was captured with `--yes`, so the wizard auto-confirms (shown as `[auto-yes]`); if you run without `--yes`, you are prompted before each gate that needs sign-off. See [Calibrate gates](../how-to/calibrate-gates.md) for the difference between `poc`, `mvp`, and `full`. + +## Step 4: Look at what got produced + +```bash +ls ~/dev/my-first-plan/dr/ +``` + +You should see: + +``` +project.json # the MVP manifest — scope, status, sign-offs +project.md # human-readable view of project.json +decisions/ # one .json + .md per decision +tasks/ # one .json + .md per task +index.html # rendered overview — open in a browser +``` + +Open `~/dev/my-first-plan/dr/index.html` in a browser. You will see the full plan: scope, decisions with their selected positions, and the task graph. + +```bash +open ~/dev/my-first-plan/dr/index.html # macOS +xdg-open ~/dev/my-first-plan/dr/index.html # Linux +``` + +## Step 5: Inspect a decision + +Pick one. 
For example: + +```bash +cat ~/dev/my-first-plan/dr/decisions/0001-*.md +``` + +You will see the full record: issue, positions considered, the selected position, the argument for why it won, the implications, and five lens reviews from the skeptic. + +```bash +cat ~/dev/my-first-plan/dr/decisions/0001-*.json | jq . +``` + +Same content, machine-readable. + +## Step 6: Inspect a task + +```bash +cat ~/dev/my-first-plan/dr/tasks/T0001-*.md +``` + +Tasks have: title, description, acceptance criteria (as a checkbox list), estimate, dependencies, and the decisions they implement (`decision_refs`). A developer can pick up T0001 and ship it. + +## Step 7: Look at the audit log + +```bash +tail ~/dev/my-first-plan/.dr/events.jsonl | jq . +``` + +Every action the wizard took — phase advances, decisions proposed, reviews completed, tasks created, exports — is recorded as one JSON line. This is your replay log; it never gets rewritten. + +## You are done + +You ran a complete planning pipeline end-to-end. 
From a one-line idea you produced: + +- A scoped MVP manifest with success criteria and explicit non-goals +- A set of accepted decisions, each with reviewed rationale +- A dependency-aware task graph linked back to those decisions +- Rendered Markdown and HTML for human review +- An immutable event log + +## Next steps + +- **Hand off to Linear instead of filesystem** — [How-to: Hand off to Linear](../how-to/handoff-to-linear.md) +- **Run with a PRD instead of a one-liner** — [How-to: Run the CLI](../how-to/run-the-cli.md) +- **Use a different model** — [How-to: Configure LLM providers](../how-to/configure-providers.md) +- **Understand what just happened** — [Explanation: The five phases](../explanation/the-five-phases.md) and [Design rationale](../explanation/design-rationale.md) +- **Look up a specific flag** — [Reference: CLI](../reference/cli.md) diff --git a/docs/usage.md b/docs/usage.md deleted file mode 100644 index 30be959..0000000 --- a/docs/usage.md +++ /dev/null @@ -1,145 +0,0 @@ -# Usage - -A walk-through of how an `idea → ship-ready MVP plan` session goes with this plugin. - -## Setup - -### Install the plugin (when published) - -```bash -# In Claude Code -/plugin install decision-record -``` - -Until the plugin lands in a marketplace, you can use it locally: - -```bash -git clone https://github.com/protoLabsAI/decision-record.git -cd decision-record/server -npm install -npm run build -``` - -Then point Claude Code at the local plugin (settings → plugins, or symlink into `~/.claude/plugins/`). - -### Optional: configure Linear handoff - -If you want to push the final plan to Linear, set a personal API token in the environment of whichever shell launches the MCP server: - -```bash -export LINEAR_API_KEY=lin_api_xxx -``` - -You'll pass your Linear team ID per-export at handoff time. Find it in Linear (Settings → API → Personal API keys; team IDs visible in the GraphQL explorer or team URL). 
- -Without Linear, everything still works — the plugin will hand off to the filesystem. - -## Running the pipeline - -In a target repository (fresh or template), open Claude Code and run: - -``` -/plan -``` - -Optionally pass a one-line idea: - -``` -/plan a CLI tool that converts CSV exports from QuickBooks into a normalized ledger format -``` - -The `dr-wizard` agent runs. It reads pipeline state from `.dr/state.json` (or initializes if missing) and drives forward one phase at a time. - -## The five phases - -### 1. Intake - -The wizard captures the raw idea: a title, a one-paragraph description, and an effort level. - -- **POC** — single-day spike. Light gates: ≥3 tasks, no required reviews, only the handoff requires human sign-off. -- **MVP** (default) — a few weeks of work. Gates: scope and decomposing reviewed, ≥3 decisions, ≥8 tasks, ≤8h per leaf task. -- **Full** — production-quality. Every gate reviewed, every DR reviewed individually, ≥6 decisions, ≥15 tasks, ≤4h per leaf task. - -You can override individual knobs at init or via `dr_update_project` — see [architecture.md#gate-configuration](architecture.md#gate-configuration). - -### 2. Scoping - -The most important phase, often skipped to everyone's regret. The wizard pushes you to commit to: - -- **In scope** — what the MVP MUST do. -- **Out of scope** — what it explicitly WON'T do. -- **Success criteria** — measurable signals it worked. -- **Nice to have** — optional capabilities (won't block ship). - -In MVP and Full presets, the wizard also instantiates a `scope-statement` DR — a formal decision record about the scope choice (lean MVP vs walking-skeleton vs polished). The DR gets a human sign-off before advancing. - -### 3. Deciding - -The wizard surfaces *which decisions need to be made* for this project. It uses two signals: - -- **Seed library** — common decisions (language, runtime, auth, data store, CI/CD, etc.). 
The wizard searches with `dr_seed_search`, finds matches, and instantiates them with `dr_seed_load`. -- **Project-specific decisions** — anything the seed library doesn't cover gets proposed fresh. - -For each decision, the wizard asks one question at a time, drives you to pick a position, write a brief argument, and (in MVP/Full presets) requests an antagonistic review from `dr-skeptic` before acceptance. - -Decisions can depend on each other (e.g., "runtime target" depends on "language choice"). The wizard calls `dr_ready_decisions` to find what's unblocked next. - -You leave this phase when every significant decision is `accepted` (or explicitly `rejected`), and the wizard advances with your sign-off. - -### 4. Decomposing - -The wizard delegates to `dr-decomposer`, which: - -1. Reads the project, scope, and accepted DRs. -2. Proposes a beads-style task graph — tasks with titles, descriptions, acceptance criteria, estimates, dependencies, and `decision_refs` linking back to the DRs they implement. -3. Calls `dr_validate_graph` to confirm: no cycles, no orphan deps, no oversized estimates, every `decision_refs` resolves. - -You then review with the wizard: split tasks that are too big, merge tasks that are too small, fix anything missing. When the graph is clean, advance with your sign-off. - -### 5. Handing off - -The wizard renders the artifacts (`dr_render` regenerates Markdown + the static `index.html`) and asks where to hand off: - -**Linear (preferred)** — provide your team ID. The wizard: -- First runs `dr_export_linear { dry_run: true }` to show you the plan. -- On your confirm, runs without dry_run: creates a Linear Project, an Issue per decision (labeled `decision`), an Issue per task, and `blocks` relations matching `depends_on`. -- Updates each task's `external_ref` so the local file knows the Linear identifier. - -**Filesystem only** — `dr_export_filesystem` finalizes the plan in place. The team picks up where they want. 
- -The project transitions to `handed-off`. The plugin's work is done; ongoing project management lives wherever you want. - -## Resuming an in-progress project - -Just run `/plan` again. The wizard's first move is `dr_status`, which discovers the existing project and jumps to the right phase. The state in `.dr/` is durable across sessions — restart-safe, agent-restart safe, machine-reboot safe. - -## Inspecting state - -```bash -# Read project -cat dr/project.json | jq - -# Read events (everything that's happened) -tail -f .dr/events.jsonl | jq - -# Re-render artifacts -# (in Claude Code:) -# Use the dr_render MCP tool, or just run /plan and let the wizard refresh. - -# Open the rendered index -open dr/index.html -``` - -## Common situations - -**"The wizard wants me to write more decisions, but my project is simple."** -You're probably running with the wrong effort level. Re-init with `effort_level: 'poc'`, or override `min_decisions` via `dr_update_project`'s `gate_overrides`. - -**"`dr_advance` keeps failing with vague reasons."** -The wizard returns the gate failures verbatim. Read them. They name the specific knob and the specific shortfall. - -**"I want to change my mind about a decision after acceptance."** -You can re-open a decision by marking it `superseded` and pointing it at a new DR. The old DR stays on file (immutability matters); the new one carries the current state. - -**"My Linear export failed partway."** -Linear creates issues incrementally — partial state may exist. Either delete the partial project in Linear and re-run, or fix the underlying issue and call `dr_export_linear` again (Note: the current implementation doesn't reconcile — a fresh export creates a fresh project. PR welcome.). 
diff --git a/server/package-lock.json b/server/package-lock.json index de0c7f9..3ac4ccc 100644 --- a/server/package-lock.json +++ b/server/package-lock.json @@ -11,6 +11,7 @@ "dependencies": { "@modelcontextprotocol/sdk": "^1.0.0", "nanoid": "^5.0.0", + "openai": "^6.38.0", "zod": "^3.23.0" }, "bin": { @@ -1913,6 +1914,27 @@ "wrappy": "1" } }, + "node_modules/openai": { + "version": "6.38.0", + "resolved": "https://registry.npmjs.org/openai/-/openai-6.38.0.tgz", + "integrity": "sha512-AoMplt2UalrpgUDMh3L09QWjNRlgJPipclQvA6sYAaeF6nHNBMgmikAZGmcYLn8on4d9sQY9Q8bOLfrBS7Lc8g==", + "license": "Apache-2.0", + "bin": { + "openai": "bin/cli" + }, + "peerDependencies": { + "ws": "^8.18.0", + "zod": "^3.25 || ^4.0" + }, + "peerDependenciesMeta": { + "ws": { + "optional": true + }, + "zod": { + "optional": true + } + } + }, "node_modules/parseurl": { "version": "1.3.3", "resolved": "https://registry.npmjs.org/parseurl/-/parseurl-1.3.3.tgz", diff --git a/server/package.json b/server/package.json index 1ae0fca..2bd82ce 100644 --- a/server/package.json +++ b/server/package.json @@ -5,7 +5,8 @@ "license": "MIT", "type": "module", "bin": { - "decision-record-mcp": "dist/index.js" + "decision-record-mcp": "dist/index.js", + "decision-record": "dist/cli.js" }, "main": "dist/index.js", "exports": { @@ -22,7 +23,9 @@ "dev": "tsx watch src/index.ts", "start": "node dist/index.js", "typecheck": "tsc --noEmit", - "test": "tsx --test src/**/*.test.ts" + "test": "node --no-warnings --import tsx/esm --test tests/unit-*.test.ts tests/flow-*.test.ts", + "test:unit": "node --import tsx/esm --test tests/unit-*.test.ts", + "test:flow": "node --import tsx/esm --test tests/flow-*.test.ts" }, "engines": { "node": ">=20" @@ -30,6 +33,7 @@ "dependencies": { "@modelcontextprotocol/sdk": "^1.0.0", "nanoid": "^5.0.0", + "openai": "^6.38.0", "zod": "^3.23.0" }, "devDependencies": { diff --git a/server/src/cli.ts b/server/src/cli.ts new file mode 100644 index 0000000..5da4fea 
--- /dev/null +++ b/server/src/cli.ts @@ -0,0 +1,2 @@ +// CLI entrypoint — imports cli/index.ts for its side effects (it runs main() on load) so tsup builds it as a separate bundle. +import "./cli/index.js"; diff --git a/server/src/cli/agents/deciding.ts b/server/src/cli/agents/deciding.ts new file mode 100644 index 0000000..08a9c17 --- /dev/null +++ b/server/src/cli/agents/deciding.ts @@ -0,0 +1,56 @@ +import OpenAI from "openai"; +import { LLMConfig } from "../../llm/client.js"; +import { runAgentTurn } from "../../llm/agent.js"; + +const SYSTEM = `You are the deciding phase of an idea-to-MVP planning pipeline. + +Your one job: identify every significant decision this project needs to make, propose options, pick winners, and record them. You do NOT accept decisions — the orchestrator does that after running antagonistic review. You leave them as 'proposed' with a selected_position and argument. + +Workflow: +1. Call \`dr_status\` to read the project's current state, including scope and any pre-existing decisions. +2. Call \`dr_list_decisions\` to see what's already on file. +3. For each project, identify 3-8 significant decisions (or however many the gate requires — see status.effective_gate_config.min_decisions). Significant means: would otherwise be re-litigated, has multiple defensible options, and load-bearing for the MVP. + + For each decision: + a. **Check the seed library first.** Call \`dr_seed_search\` with a query relevant to the decision topic (e.g., 'language', 'data store', 'auth'). If a seed matches, use \`dr_seed_load\` to instantiate it — this gives you well-thought-out starter content. + b. **If no seed matches**, call \`dr_propose_decision\` with title, issue, 2-4 positions (each with title, description, pros, cons), assumptions, and constraints. + c. **Pick a position.** Call \`dr_update_decision\` with selected_position (matching one of the position titles) and a 1-2 sentence argument for why it wins. + +4. 
After each decision is selected, the orchestrator runs antagonistic review. If a review blocks, you may be called again to revise — but for now, don't accept anything. + +Constraints: +- Stay inside the project's scope. Don't propose decisions about out-of-scope capabilities. +- One DR per significant choice. Don't fragment one decision into many tiny ones. +- Set \`depends_on\` when a decision logically follows another (e.g., 'runtime target' depends on 'language choice'). + +When you've covered all the decisions you think this project needs, return a brief plain-text summary: +- Total decisions proposed (count). +- A line per decision: \` → <selected_position>\`. +- Any decisions you intentionally left out (and why). + +Be decisive. The human reviews at the next checkpoint.`; + +export interface DecidingResult { + summary: string; + toolCallCount: number; +} + +export async function runDecidingAgent( + client: OpenAI, + config: LLMConfig, + cwd: string, + verbose: boolean +): Promise<DecidingResult> { + const turn = await runAgentTurn( + { + client, + config, + system: SYSTEM, + toolContext: { cwd }, + verbose, + maxIterations: 60, // many decisions = many tool calls + }, + "Please identify and propose all the decisions this project needs to make. Use dr_status to read scope first." + ); + return { summary: turn.text, toolCallCount: turn.toolCalls.length }; +} diff --git a/server/src/cli/agents/decomposer.ts b/server/src/cli/agents/decomposer.ts new file mode 100644 index 0000000..4fdc652 --- /dev/null +++ b/server/src/cli/agents/decomposer.ts @@ -0,0 +1,70 @@ +import OpenAI from "openai"; +import { LLMConfig } from "../../llm/client.js"; +import { runAgentTurn } from "../../llm/agent.js"; + +const SYSTEM = `You are the decomposing phase of an idea-to-MVP planning pipeline. You turn accepted decisions into a beads-style task graph. + +Workflow: +1. 
Call \`dr_status\` to read the project's scope and gate config — specifically \`effective_gate_config.max_task_estimate_hours\` and \`min_tasks\`. +2. Call \`dr_list_decisions\` with \`status: ['accepted']\` and read full content via \`dr_get_decision\` for any that look load-bearing. +3. Plan the graph end-to-end: + - Start with foundations (repo bootstrap, dependencies, config). + - Build up to user-visible features. + - Each task is atomic — under \`max_task_estimate_hours\` of work. + - Each task has acceptance_criteria (concrete done-when statements). + - Each task has decision_refs (which DRs it implements). + - Each task has depends_on for ordering. +4. Create tasks via \`dr_propose_task\`. Order matters — create dependencies before dependents. +5. Call \`dr_validate_graph\`. If it returns errors (cycles, orphans, oversized estimates, missing refs), fix them by calling \`dr_update_task\` and re-validating until clean. + +Principles: +- **Vertical slices, not horizontal layers.** A task that ships a feature end-to-end is better than three tasks that each touch one layer but ship nothing alone. +- **Every task has decision_refs.** If you can't link a task to an accepted DR, the project's decisions are incomplete — flag it in your summary. +- **Stay in scope.** Out-of-scope items must NOT become tasks. If something seems necessary but isn't in_scope, raise it in your summary — don't quietly add it. +- **Estimate honestly.** When unsure, set \`confidence: 'low'\` rather than padding hours. + +After the graph validates, return a plain-text summary: +- Total tasks (count). +- A line per task: \`<id> — <title> (<estimate>) [pri:<priority>] depends on: <ids> | implements: <decision ids>\`. +- The critical path (a chain of tasks that must complete in order). 
+- Any tasks you couldn't link to a decision (flagged for the human).`; + +export interface DecomposerResult { + summary: string; + toolCallCount: number; + validationPassed: boolean; +} + +export async function runDecomposerAgent( + client: OpenAI, + config: LLMConfig, + cwd: string, + verbose: boolean +): Promise<DecomposerResult> { + const turn = await runAgentTurn( + { + client, + config, + system: SYSTEM, + toolContext: { cwd }, + verbose, + maxIterations: 100, // task graph creation can need many calls + }, + "Please decompose the accepted decisions into a beads-style task graph. End by validating the graph." + ); + + // Check whether the last dr_validate_graph call passed. + const validateCalls = turn.toolCalls.filter((c) => c.name === "dr_validate_graph"); + const last = validateCalls[validateCalls.length - 1]; + const validationPassed = last + ? (() => { + try { + const parsed = JSON.parse(last.resultText) as { ok?: boolean; data?: { valid?: boolean } }; + return Boolean(parsed.ok && parsed.data?.valid); + } catch { + return false; + } + })() + : false; + return { summary: turn.text, toolCallCount: turn.toolCalls.length, validationPassed }; +} diff --git a/server/src/cli/agents/scoping.ts b/server/src/cli/agents/scoping.ts new file mode 100644 index 0000000..4f8434e --- /dev/null +++ b/server/src/cli/agents/scoping.ts @@ -0,0 +1,58 @@ +import OpenAI from "openai"; +import { LLMConfig } from "../../llm/client.js"; +import { runAgentTurn } from "../../llm/agent.js"; + +const SYSTEM = `You are the scoping phase of an idea-to-MVP planning pipeline. + +Your one job: turn a project description into a sharp MVP scope, written into the project's state. + +You have access to MCP tools. Use them. Specifically: +1. Call \`dr_status\` first to learn the project's title, description, effort_level, and current scope (which may be partially populated already). +2. Read any PRD context the user supplies in the initial message. +3. 
Synthesize four lists: + - **in_scope**: 3-5 must-ship capabilities. Concrete, not aspirational. + - **success_criteria**: 2-4 measurable signals the MVP worked. + - **out_of_scope**: 2-5 deliberately deferred capabilities. Be explicit about what you're NOT building. + - **nice_to_have**: 0-3 optional items that may slip in if scope allows. +4. Call \`dr_update_scope\` once with all four lists. +5. If the project's effort_level is 'mvp' or 'full', also instantiate the \`scope-statement\` seed DR: + - Call \`dr_seed_load\` with seed_name='scope-statement'. + - Choose a position based on the project's nature: 'Lean MVP', 'Walking-skeleton MVP', or 'Polished MVP'. + - Call \`dr_update_decision\` to set \`selected_position\` and \`argument\` (one paragraph: why this shape fits this project). + - Do NOT accept it yet — leave status 'proposed'. The orchestrator handles acceptance after review. + +Once you've made every tool call, return a brief plain-text summary: +- The chosen scope as four bullet lists. +- For mvp/full presets: the scoping DR id and the selected position. + +Be decisive. Don't hedge. The orchestrator will surface your output to the human for sign-off; revisions happen there, not here.`; + +export interface ScopingResult { + summary: string; + toolCallCount: number; +} + +export async function runScopingAgent( + client: OpenAI, + config: LLMConfig, + cwd: string, + prdContext: string | null, + verbose: boolean +): Promise<ScopingResult> { + const userMessage = prdContext + ? `Please scope this project. The project state already has a title and description; use dr_status to read them. Additional PRD context:\n\n${prdContext}` + : "Please scope this project. 
Read the project's current state with dr_status and produce the four-list scope."; + + const turn = await runAgentTurn( + { + client, + config, + system: SYSTEM, + toolContext: { cwd }, + verbose, + maxIterations: 16, + }, + userMessage + ); + return { summary: turn.text, toolCallCount: turn.toolCalls.length }; +} diff --git a/server/src/cli/agents/skeptic.ts b/server/src/cli/agents/skeptic.ts new file mode 100644 index 0000000..e98daf7 --- /dev/null +++ b/server/src/cli/agents/skeptic.ts @@ -0,0 +1,103 @@ +import OpenAI from "openai"; +import { LLMConfig } from "../../llm/client.js"; +import { runAgentTurn } from "../../llm/agent.js"; + +const LENSES = ["operational", "strategic", "security", "cost", "user-impact"] as const; +export type Lens = (typeof LENSES)[number]; + +export const ALL_LENSES = LENSES; + +function systemFor(lens: Lens): string { + const lensGuidance: Record<Lens, string> = { + operational: + "Can the team actually maintain this? What's the on-call cost? What breaks at 3am? Who owns each operational concern?", + strategic: + "Does this advance the business goal? Is it differentiated? Is the timing right? What's the opportunity cost?", + security: + "What's the attack surface? What data is exposed? What new compliance hooks? What's the worst-case breach impact?", + cost: + "Total cost of ownership over 12 months. Hidden costs (people, time, licenses). Migration costs if we're wrong.", + "user-impact": + "How does this feel to the user? Does it create friction? Could it break trust? Is the upgrade/migration painful?", + }; + + return `You are dr-skeptic — an antagonistic reviewer applying the ${lens} lens. + +${lensGuidance[lens]} + +Your job: stress-test the decision. Find what's wrong before it's locked in. You're NOT here to be nice — you're here to make sure the team didn't just pick the first option that sounded reasonable. + +Workflow: +1. Call \`dr_get_decision\` with the decision id you're given. +2. 
Examine: title, issue, assumptions, constraints, positions, selected_position, argument, implications. +3. Stress-test the argument through the ${lens} lens: + - What assumptions are unstated? + - What positions were dismissed without serious consideration? + - What edge cases would break this choice? + - What's the cost of being wrong, and how easily is the decision reversible? +4. Call \`dr_review_decision\` with: + - \`reviewer: 'dr-skeptic'\` + - \`lens: '${lens}'\` + - \`verdict: 'pass' | 'block'\` + - \`score: 1-5\` (1=blocking concerns, 5=enthusiastic) + - \`concerns: [...]\` (crisp one-line statements — concrete, actionable, not vague) + +Pass only if you genuinely tried to break the decision and failed. If \`argument\` is empty or weak, score it low and demand more. + +After the tool call, return one or two sentences summarizing your verdict.`; +} + +export interface SkepticReview { + lens: Lens; + verdict: "pass" | "block"; + score: number; + concerns: string[]; + summary: string; +} + +export async function runSkepticAgent( + client: OpenAI, + config: LLMConfig, + cwd: string, + decisionId: string, + lens: Lens, + verbose: boolean +): Promise<SkepticReview> { + const turn = await runAgentTurn( + { + client, + config, + system: systemFor(lens), + toolContext: { cwd }, + verbose, + maxIterations: 8, + toolFilter: { + include: ["dr_get_decision", "dr_review_decision", "dr_list_decisions"], + }, + }, + `Review decision \`${decisionId}\` through the ${lens} lens. 
Record your verdict via dr_review_decision.` + ); + + const reviewCall = turn.toolCalls.find((c) => c.name === "dr_review_decision"); + if (!reviewCall) { + return { + lens, + verdict: "block", + score: 1, + concerns: ["Skeptic agent did not call dr_review_decision — review missing."], + summary: turn.text || "Skeptic produced no output.", + }; + } + const args = reviewCall.args as { + verdict?: "pass" | "block"; + score?: number; + concerns?: string[]; + }; + return { + lens, + verdict: args.verdict ?? "block", + score: args.score ?? 0, + concerns: args.concerns ?? [], + summary: turn.text, + }; +} diff --git a/server/src/cli/checkpoints.ts b/server/src/cli/checkpoints.ts new file mode 100644 index 0000000..35f2e6c --- /dev/null +++ b/server/src/cli/checkpoints.ts @@ -0,0 +1,82 @@ +import { createInterface } from "node:readline/promises"; + +const GREEN = "\x1b[32m"; +const YELLOW = "\x1b[33m"; +const RED = "\x1b[31m"; +const BLUE = "\x1b[34m"; +const DIM = "\x1b[2m"; +const BOLD = "\x1b[1m"; +const RESET = "\x1b[0m"; + +export interface CheckpointOptions { + /** Skip interactive prompt and auto-confirm (for --yes / fully autonomous mode). */ + autoYes: boolean; +} + +export async function confirm( + prompt: string, + options: CheckpointOptions, + defaultYes = true +): Promise<boolean> { + if (options.autoYes) { + process.stderr.write(`${BLUE}>${RESET} ${prompt} ${DIM}[auto-yes]${RESET}\n`); + return true; + } + const rl = createInterface({ input: process.stdin, output: process.stderr }); + try { + const hint = defaultYes ? 
"[Y/n]" : "[y/N]"; + const answer = (await rl.question(`${BLUE}>${RESET} ${prompt} ${hint} `)) + .trim() + .toLowerCase(); + if (answer === "") return defaultYes; + return answer === "y" || answer === "yes"; + } finally { + rl.close(); + } +} + +export async function ask( + prompt: string, + options: CheckpointOptions, + fallback = "" +): Promise<string> { + if (options.autoYes) { + process.stderr.write(`${BLUE}>${RESET} ${prompt} ${DIM}[auto: '${fallback}']${RESET}\n`); + return fallback; + } + const rl = createInterface({ input: process.stdin, output: process.stderr }); + try { + const answer = await rl.question(`${BLUE}>${RESET} ${prompt} `); + return answer.trim() || fallback; + } finally { + rl.close(); + } +} + +export function header(text: string): void { + process.stderr.write(`\n${BOLD}${BLUE}━━━ ${text} ━━━${RESET}\n`); +} + +export function info(text: string): void { + process.stderr.write(`${DIM}${text}${RESET}\n`); +} + +export function success(text: string): void { + process.stderr.write(`${GREEN}✓${RESET} ${text}\n`); +} + +export function warn(text: string): void { + process.stderr.write(`${YELLOW}!${RESET} ${text}\n`); +} + +export function error(text: string): void { + process.stderr.write(`${RED}✗${RESET} ${text}\n`); +} + +export function bullet(text: string): void { + process.stderr.write(` ${DIM}•${RESET} ${text}\n`); +} + +export function divider(): void { + process.stderr.write(`${DIM}${"─".repeat(60)}${RESET}\n`); +} diff --git a/server/src/cli/index.ts b/server/src/cli/index.ts new file mode 100644 index 0000000..b3c1d90 --- /dev/null +++ b/server/src/cli/index.ts @@ -0,0 +1,232 @@ +import { resolve } from "node:path"; +import { makeClient, resolveConfig } from "../llm/client.js"; +import { registerAllTools } from "../tools/index.js"; +import { runPipeline } from "./orchestrator.js"; +import { readPRD, PRDDigest } from "./prd.js"; +import { error, header, info } from "./checkpoints.js"; + +interface ParsedArgs { + idea?: string; + title?: 
string; + description?: string; + prdPath?: string; + cwd: string; + effortLevel: "poc" | "mvp" | "full"; + model?: string; + apiKey?: string; + baseURL?: string; + resume: boolean; + autoYes: boolean; + verbose: boolean; + help: boolean; + version: boolean; +} + +const VERSION = "0.1.0"; + +const HELP = `decision-record — idea-to-MVP planning CLI + +Usage: + decision-record [options] Start a new project (interactive) + decision-record --idea "..." Start with a free-form idea + decision-record --prd <file> Start from a PRD markdown file + decision-record --resume Resume the project in --cwd (or process.cwd()) + +Options: + --idea TEXT Free-form one-line idea (will derive title + description). + --title TEXT Explicit project title. + --description TEXT Explicit project description. + --prd PATH Read a Markdown PRD as scope context. Combinable with --idea. + --cwd PATH Target project directory (default: cwd). State lands under .dr/ and dr/. + --effort poc|mvp|full Gate strictness preset (default: mvp). + --model NAME LLM model name (default: $OPENAI_MODEL or gpt-4o). + --api-key KEY OpenAI-compat API key (default: $OPENAI_API_KEY). + --base-url URL OpenAI-compat base URL (default: $OPENAI_BASE_URL or api.openai.com). + --resume Skip intake; pick up the existing project in --cwd. + --yes, -y Bypass interactive checkpoints (fully autonomous). + --verbose, -v Stream agent reasoning and tool calls to stderr. + --help, -h Show this help. + --version Print version. + +Environment: + OPENAI_API_KEY Required unless --api-key is passed. + OPENAI_BASE_URL Optional. Set for OpenRouter, vLLM, Ollama, LiteLLM, etc. + OPENAI_MODEL Optional. Default model name. + LINEAR_API_KEY Optional. Enables Linear handoff target. + LINEAR_TEAM_ID Optional. Pre-fills the Linear team ID prompt. 
+ +Examples: + decision-record --idea "a CLI for QuickBooks CSV → ledger normalization" --effort poc + decision-record --prd ./docs/idea.md --effort mvp --yes + decision-record --cwd ./my-project --resume +`; + +function parseArgs(argv: string[]): ParsedArgs { + const out: ParsedArgs = { + cwd: process.cwd(), + effortLevel: "mvp", + resume: false, + autoYes: false, + verbose: false, + help: false, + version: false, + }; + for (let i = 0; i < argv.length; i++) { + const a = argv[i]; + const next = () => { + const v = argv[++i]; + if (v === undefined) throw new Error(`Missing value for ${a}`); + return v; + }; + switch (a) { + case "--idea": + out.idea = next(); + break; + case "--title": + out.title = next(); + break; + case "--description": + out.description = next(); + break; + case "--prd": + out.prdPath = next(); + break; + case "--cwd": + out.cwd = resolve(next()); + break; + case "--effort": { + const v = next(); + if (v !== "poc" && v !== "mvp" && v !== "full") { + throw new Error(`--effort must be poc | mvp | full (got ${v})`); + } + out.effortLevel = v; + break; + } + case "--model": + out.model = next(); + break; + case "--api-key": + out.apiKey = next(); + break; + case "--base-url": + out.baseURL = next(); + break; + case "--resume": + out.resume = true; + break; + case "--yes": + case "-y": + out.autoYes = true; + break; + case "--verbose": + case "-v": + out.verbose = true; + break; + case "--help": + case "-h": + out.help = true; + break; + case "--version": + out.version = true; + break; + default: + // First positional is treated as --idea when --idea isn't set. + if (a && !a.startsWith("--") && !out.idea && !out.title) { + out.idea = a; + } else if (a) { + throw new Error(`Unknown argument: ${a}`); + } + } + } + return out; +} + +async function main(): Promise<number> { + let args: ParsedArgs; + try { + args = parseArgs(process.argv.slice(2)); + } catch (err) { + error(err instanceof Error ? 
err.message : String(err)); + process.stderr.write(HELP); + return 2; + } + if (args.help) { + process.stdout.write(HELP); + return 0; + } + if (args.version) { + process.stdout.write(`decision-record ${VERSION}\n`); + return 0; + } + + registerAllTools(); + + let prd: PRDDigest | null = null; + if (args.prdPath) { + try { + prd = await readPRD(args.prdPath); + info(`Loaded PRD: ${args.prdPath} (${prd.raw.length} chars).`); + } catch (err) { + error(`Could not read PRD at ${args.prdPath}: ${err instanceof Error ? err.message : String(err)}`); + return 1; + } + } + + let title = args.title; + let description = args.description; + if (!args.resume) { + if (!title && prd?.title_hint) title = prd.title_hint; + if (!title && args.idea) { + title = args.idea.length > 80 ? args.idea.slice(0, 77) + "…" : args.idea; + } + if (!description) { + if (args.idea) description = args.idea; + else if (prd?.description_hint) description = prd.description_hint; + } + } + + let config; + let client; + try { + config = resolveConfig({ + ...(args.model !== undefined && { model: args.model }), + ...(args.apiKey !== undefined && { apiKey: args.apiKey }), + ...(args.baseURL !== undefined && { baseURL: args.baseURL }), + }); + client = makeClient(config); + } catch (err) { + error(err instanceof Error ? err.message : String(err)); + return 2; + } + + header(`decision-record v${VERSION}`); + info(`Target: ${args.cwd}`); + info(`Model: ${config.model}${config.baseURL ? 
` @ ${config.baseURL}` : ""}`); + if (args.autoYes) info("Mode: autonomous (--yes; checkpoints bypassed)"); + + const outcome = await runPipeline( + { + cwd: args.cwd, + client, + config, + autoYes: args.autoYes, + verbose: args.verbose, + }, + { + ...(title !== undefined && { title }), + ...(description !== undefined && { description }), + effortLevel: args.effortLevel, + prd, + resume: args.resume, + } + ); + + return outcome.exitCode; +} + +main() + .then((code) => process.exit(code)) + .catch((err) => { + error(err instanceof Error ? err.message : String(err)); + process.exit(1); + }); diff --git a/server/src/cli/orchestrator.ts b/server/src/cli/orchestrator.ts new file mode 100644 index 0000000..4828a0f --- /dev/null +++ b/server/src/cli/orchestrator.ts @@ -0,0 +1,415 @@ +import OpenAI from "openai"; +import { LLMConfig } from "../llm/client.js"; +import { executeAgentTool } from "../llm/tools.js"; +import { + CheckpointOptions, + ask, + bullet, + confirm, + divider, + error, + header, + info, + success, + warn, +} from "./checkpoints.js"; +import { PRDDigest } from "./prd.js"; +import { runScopingAgent } from "./agents/scoping.js"; +import { runDecidingAgent } from "./agents/deciding.js"; +import { ALL_LENSES, runSkepticAgent } from "./agents/skeptic.js"; +import { runDecomposerAgent } from "./agents/decomposer.js"; + +export interface OrchestratorOptions extends CheckpointOptions { + cwd: string; + client: OpenAI; + config: LLMConfig; + verbose: boolean; +} + +export interface RunOutcome { + exitCode: number; + finalPhase: string; +} + +export async function runPipeline( + opts: OrchestratorOptions, + ctx: { + title?: string; + description?: string; + effortLevel?: "poc" | "mvp" | "full"; + prd?: PRDDigest | null; + resume: boolean; + } +): Promise<RunOutcome> { + // 1. 
Resume check + const status = await callTool(opts.cwd, "dr_status", {}); + const hasProject = status.ok; + + if (hasProject) { + if (!ctx.resume) { + warn( + `A project is already initialized in ${opts.cwd}. Treating this as a resume.` + ); + } else { + info(`Resuming existing project in ${opts.cwd}.`); + } + } else { + if (ctx.resume) { + error(`No project found in ${opts.cwd}. Nothing to resume.`); + return { exitCode: 2, finalPhase: "(none)" }; + } + if (!ctx.title) { + error("Title is required to start a new project (pass --title or --idea)."); + return { exitCode: 2, finalPhase: "(none)" }; + } + header("Phase: Intake"); + const initRes = await callTool(opts.cwd, "dr_init", { + title: ctx.title, + description: ctx.description ?? "", + effort_level: ctx.effortLevel ?? "mvp", + }); + if (!initRes.ok) { + error(`dr_init failed: ${(initRes.errors ?? []).join("; ")}`); + return { exitCode: 1, finalPhase: "intake" }; + } + const initData = initRes.data as { project: { id: string; effort_level: string } }; + success(`Initialized '${initData.project.id}' at effort_level=${initData.project.effort_level}`); + } + + // 2. Walk forward through phases. + while (true) { + const cur = await getStatus(opts.cwd); + const phase = cur.state.phase as string; + const nextPhase = cur.state.next_phase as string | null; + if (!nextPhase || phase === "handed-off") { + success(`Pipeline complete. Final phase: ${phase}`); + return { exitCode: 0, finalPhase: phase }; + } + + info(`Current phase: ${phase} → next: ${nextPhase}`); + let workResult: { exitCode: number } | null = null; + switch (phase) { + case "intake": + workResult = await advanceIntake(opts, cur, nextPhase as string); + break; + case "scoping": + workResult = await advanceScoping(opts, ctx.prd ?? 
null);
+        break;
+      case "deciding":
+        workResult = await advanceDeciding(opts);
+        break;
+      case "decomposing":
+        workResult = await advanceDecomposing(opts);
+        break;
+      case "handing-off":
+        workResult = await advanceHandoff(opts);
+        break;
+      default:
+        error(`Unknown phase '${phase}'`);
+        return { exitCode: 1, finalPhase: phase };
+    }
+    // Stop on failure, or when a step exits 0 without changing phase (declined checkpoint): looping would re-run it.
+    const exitCode = workResult?.exitCode ?? 0;
+    if (exitCode !== 0 || (await getStatus(opts.cwd)).state.phase === phase) {
+      return { exitCode, finalPhase: phase };
+    }
+  }
+}
+
+/** Advance intake → scoping, asking for sign-off when the gate config lists `nextPhase` as requiring it. */
+async function advanceIntake(
+  opts: OrchestratorOptions,
+  status: StatusData,
+  nextPhase: string
+): Promise<{ exitCode: number }> {
+  return advancePhase(opts, "intake → scoping", needsHumanSignoffFor(status, nextPhase));
+}
+
+async function advanceScoping(
+  opts: OrchestratorOptions,
+  prd: PRDDigest | null
+): Promise<{ exitCode: number }> {
+  header("Phase: Scoping");
+  info("Running scoping agent…");
+  const prdContext = prd
+    ? `PRD (excerpt):\n${prd.raw.slice(0, 4000)}${prd.raw.length > 4000 ? "\n…[truncated]" : ""}`
+    : null;
+  const result = await runScopingAgent(opts.client, opts.config, opts.cwd, prdContext, opts.verbose);
+  success(`Scoping agent finished (${result.toolCallCount} tool calls).`);
+  divider();
+  process.stderr.write(result.summary + "\n");
+  divider();
+
+  const project = (await callTool(opts.cwd, "dr_status", {})).data as StatusData;
+  const failures = realGateFailures(project);
+  if (failures.length > 0) {
+    warn("Scoping gate is not yet passable. 
The agent's output was:"); + for (const r of failures) bullet(r); + return { exitCode: 1 }; + } + return advancePhase(opts, "scoping → deciding", needsHumanSignoffFor(project, "deciding")); +} + +async function advanceDeciding(opts: OrchestratorOptions): Promise<{ exitCode: number }> { + header("Phase: Deciding"); + info("Running deciding agent (proposing decisions)…"); + const result = await runDecidingAgent(opts.client, opts.config, opts.cwd, opts.verbose); + success(`Deciding agent finished (${result.toolCallCount} tool calls).`); + divider(); + process.stderr.write(result.summary + "\n"); + divider(); + + // Lens-rotating review for every proposed decision. + const proposed = await listDecisions(opts.cwd, "proposed"); + if (proposed.length === 0) { + warn("No decisions in 'proposed' state to review."); + } else { + header(`Antagonistic review: ${proposed.length} decisions × ${ALL_LENSES.length} lenses`); + for (const d of proposed) { + info(`Reviewing ${d.id} — ${d.title}`); + const lensVerdicts: { lens: string; verdict: string; score: number }[] = []; + let anyBlock = false; + for (const lens of ALL_LENSES) { + const review = await runSkepticAgent( + opts.client, + opts.config, + opts.cwd, + d.id, + lens, + opts.verbose + ); + lensVerdicts.push({ lens, verdict: review.verdict, score: review.score }); + if (review.verdict === "block") { + anyBlock = true; + warn(` ${lens}: BLOCK (${review.score}/5) — ${review.concerns.join("; ")}`); + } else { + info(` ${lens}: pass (${review.score}/5)`); + } + } + if (anyBlock) { + warn(`${d.id} has blocking concerns. Will not auto-accept.`); + const decision = await ask( + `Override and accept ${d.id} anyway? 
(type 'accept' to override, anything else to reject)`,
+          opts,
+          "reject"
+        );
+        // Report what the tool actually did: a failed accept/reject must not be announced as done.
+        if (decision === "accept") {
+          const res = await callTool(opts.cwd, "dr_accept_decision", {
+            id: d.id,
+            sign_off_by: "human",
+            sign_off_actor: "cli-user",
+            sign_off_notes: "Accepted with blocking review concerns overridden.",
+          });
+          if (res.ok) success(`Accepted ${d.id} with human override.`);
+          else warn(`Could not accept ${d.id}: ${(res.errors ?? []).join("; ")}`);
+        } else {
+          const res = await callTool(opts.cwd, "dr_reject_decision", {
+            id: d.id,
+            reason: "Skeptic review blocked; not overridden.",
+            sign_off_by: "human",
+            sign_off_actor: "cli-user",
+          });
+          if (res.ok) warn(`Rejected ${d.id}.`);
+          else warn(`Could not reject ${d.id}: ${(res.errors ?? []).join("; ")}`);
+        }
+      } else {
+        const accept = await callTool(opts.cwd, "dr_accept_decision", {
+          id: d.id,
+          sign_off_by: "human",
+          sign_off_actor: "cli-user",
+          sign_off_notes: `All ${ALL_LENSES.length} lens reviews passed.`,
+        });
+        if (accept.ok) success(`Accepted ${d.id}.`);
+        else warn(`Could not accept ${d.id}: ${(accept.errors ?? []).join("; ")}`);
+      }
+    }
+  }
+
+  const status = await getStatus(opts.cwd);
+  const failures = realGateFailures(status);
+  if (failures.length > 0) {
+    warn("Deciding gate still failing:");
+    for (const r of failures) bullet(r);
+    return { exitCode: 1 };
+  }
+  return advancePhase(opts, "deciding → decomposing", needsHumanSignoffFor(status, "decomposing"));
+}
+
+async function advanceDecomposing(opts: OrchestratorOptions): Promise<{ exitCode: number }> {
+  header("Phase: Decomposing");
+  info("Running decomposer agent (building task graph)…");
+  const result = await runDecomposerAgent(opts.client, opts.config, opts.cwd, opts.verbose);
+  if (result.validationPassed) {
+    success(`Decomposer finished (${result.toolCallCount} tool calls). 
Graph validates.`); + } else { + warn(`Decomposer finished (${result.toolCallCount} tool calls) but graph did not validate.`); + } + divider(); + process.stderr.write(result.summary + "\n"); + divider(); + + const status = await getStatus(opts.cwd); + const failures = realGateFailures(status); + if (failures.length > 0) { + warn("Decomposing gate still failing:"); + for (const r of failures) bullet(r); + return { exitCode: 1 }; + } + return advancePhase(opts, "decomposing → handing-off", needsHumanSignoffFor(status, "handing-off")); +} + +async function advanceHandoff(opts: OrchestratorOptions): Promise<{ exitCode: number }> { + header("Phase: Handoff"); + info("Rendering Markdown + HTML artifacts…"); + const renderRes = await callTool(opts.cwd, "dr_render", {}); + if (!renderRes.ok) { + error(`Render failed: ${(renderRes.errors ?? []).join("; ")}`); + return { exitCode: 1 }; + } + success("Artifacts rendered."); + + const linearAvailable = Boolean(process.env.LINEAR_API_KEY); + let target: "linear" | "filesystem" = "filesystem"; + if (linearAvailable) { + const wantsLinear = await confirm( + "LINEAR_API_KEY detected. Push the plan to Linear?", + opts, + true + ); + target = wantsLinear ? "linear" : "filesystem"; + } + + if (target === "linear") { + const teamId = await ask( + "Linear team ID:", + opts, + process.env.LINEAR_TEAM_ID ?? "" + ); + if (!teamId) { + error("Linear team ID is required."); + return { exitCode: 2 }; + } + info("Running dry-run preview…"); + const dry = await callTool(opts.cwd, "dr_export_linear", { + team_id: teamId, + dry_run: true, + }); + if (!dry.ok) { + error(`Linear dry-run failed: ${(dry.errors ?? 
[]).join("; ")}`); + return { exitCode: 1 }; + } + const totals = (dry.data as { totals: { issues: number; decisions: number; tasks: number } }).totals; + info(`Dry-run plan: ${totals.issues} issues (${totals.decisions} decisions + ${totals.tasks} tasks)`); + const proceed = await confirm("Push to Linear now?", opts, true); + if (!proceed) { + warn("Linear push cancelled. Project remains in 'handing-off'."); + return { exitCode: 0 }; + } + const push = await callTool(opts.cwd, "dr_export_linear", { + team_id: teamId, + dry_run: false, + sign_off_by: "human", + sign_off_actor: "cli-user", + }); + if (!push.ok) { + error(`Linear export failed: ${(push.errors ?? []).join("; ")}`); + return { exitCode: 1 }; + } + const data = push.data as { linear_project: { url?: string }; issues_created: number }; + success(`Pushed ${data.issues_created} issues to Linear.`); + if (data.linear_project.url) info(`Project URL: ${data.linear_project.url}`); + } else { + const proceed = await confirm("Finalize plan to filesystem?", opts, true); + if (!proceed) { + warn("Filesystem export cancelled. Project remains in 'handing-off'."); + return { exitCode: 0 }; + } + const fs = await callTool(opts.cwd, "dr_export_filesystem", { + sign_off_by: "human", + sign_off_actor: "cli-user", + }); + if (!fs.ok) { + error(`Filesystem export failed: ${(fs.errors ?? []).join("; ")}`); + return { exitCode: 1 }; + } + success("Plan finalized to filesystem."); + } + // Re-render so artifacts reflect the final 'handed-off' state. + await callTool(opts.cwd, "dr_render", {}); + return { exitCode: 0 }; +} + +async function advancePhase( + opts: OrchestratorOptions, + label: string, + needsHumanSignoff: boolean +): Promise<{ exitCode: number }> { + if (needsHumanSignoff) { + divider(); + info(`Next transition (${label}) requires human sign-off.`); + const proceed = await confirm("Advance?", opts, true); + if (!proceed) { + warn(`Halting before ${label}. 
Re-run to resume.`); + return { exitCode: 0 }; + } + } + const args = needsHumanSignoff + ? { sign_off_by: "human", sign_off_actor: "cli-user" } + : {}; + const adv = await callTool(opts.cwd, "dr_advance", args); + if (!adv.ok) { + error(`dr_advance failed for ${label}:`); + for (const r of adv.errors ?? []) bullet(r); + return { exitCode: 1 }; + } + success(`Advanced: ${label}`); + return { exitCode: 0 }; +} + +function needsHumanSignoffFor(status: StatusData, nextPhase: string): boolean { + return status.effective_gate_config.require_human_signoff_phases.includes(nextPhase); +} + +/** + * Return gate-failure reasons excluding sign-off failures. + * The orchestrator handles sign-off itself in advancePhase, so a sign-off-only + * "failure" from dr_status (which is called without sign-off context) is not + * a real blocker. + */ +function realGateFailures(status: StatusData): string[] { + return status.gate_to_next.reasons.filter((r) => !r.startsWith("Sign-off gate:")); +} + +interface StatusData { + state: { phase: string; next_phase: string | null }; + gate_to_next: { pass: boolean; reasons: string[]; next_phase: string | null }; + effective_gate_config: { + require_human_signoff_phases: string[]; + [k: string]: unknown; + }; + counts: { decisions: number; tasks: number }; +} + +async function getStatus(cwd: string): Promise<StatusData> { + const res = await callTool(cwd, "dr_status", {}); + if (!res.ok) throw new Error(`dr_status failed: ${(res.errors ?? []).join("; ")}`); + return res.data as StatusData; +} + +async function listDecisions( + cwd: string, + status: "rfc" | "proposed" | "accepted" | "rejected" | "deprecated" | "superseded" +): Promise<{ id: string; title: string }[]> { + const res = await callTool(cwd, "dr_list_decisions", { status: [status] }); + if (!res.ok) return []; + return ((res.data as { decisions?: { id: string; title: string }[] }).decisions) ?? 
[]; +} + +async function callTool( + cwd: string, + name: string, + args: Record<string, unknown> +): Promise<{ ok: boolean; data?: unknown; errors?: string[] }> { + return executeAgentTool(name, { ...args, cwd }, { cwd }); +} diff --git a/server/src/cli/prd.ts b/server/src/cli/prd.ts new file mode 100644 index 0000000..930f54c --- /dev/null +++ b/server/src/cli/prd.ts @@ -0,0 +1,36 @@ +import { readFile } from "node:fs/promises"; + +export interface PRDDigest { + /** Raw PRD content. */ + raw: string; + /** First H1 if present — used as a title hint. */ + title_hint?: string; + /** First paragraph after title — used as a description hint. */ + description_hint?: string; +} + +export async function readPRD(path: string): Promise<PRDDigest> { + const raw = await readFile(path, "utf8"); + return digest(raw); +} + +export function digest(raw: string): PRDDigest { + const lines = raw.split("\n"); + let title_hint: string | undefined; + for (const line of lines) { + const trimmed = line.trim(); + if (trimmed.startsWith("# ") && !trimmed.startsWith("##")) { + title_hint = trimmed.replace(/^#+\s*/, "").trim(); + break; + } + } + // Take first non-heading, non-empty paragraph as description hint + let description_hint: string | undefined; + const blocks = raw.split(/\n\s*\n/).map((b) => b.trim()).filter((b) => b.length > 0); + for (const block of blocks) { + if (block.startsWith("#")) continue; + description_hint = block.length > 800 ? 
block.slice(0, 800) + "…" : block; + break; + } + return { raw, ...(title_hint && { title_hint }), ...(description_hint && { description_hint }) }; +} diff --git a/server/src/llm/agent.ts b/server/src/llm/agent.ts new file mode 100644 index 0000000..931ccbc --- /dev/null +++ b/server/src/llm/agent.ts @@ -0,0 +1,161 @@ +import OpenAI from "openai"; +import { LLMConfig } from "./client.js"; +import { + executeAgentTool, + listOpenAITools, + ToolFilter, + ToolInvocationContext, +} from "./tools.js"; +import { log } from "../log.js"; + +export interface AgentOptions { + client: OpenAI; + config: LLMConfig; + system: string; + toolFilter?: ToolFilter; + toolContext: ToolInvocationContext; + /** Max tool-use iterations before giving up. */ + maxIterations?: number; + /** Stream agent reasoning to stderr. */ + verbose?: boolean; +} + +export interface AgentTurn { + /** Final assistant text after the loop ends. */ + text: string; + /** Tool calls executed during the loop. */ + toolCalls: { name: string; args: Record<string, unknown>; resultText: string }[]; + /** Reason the loop terminated. */ + stopReason: "end_turn" | "max_iterations" | "refusal" | "length"; + /** Total iterations consumed. */ + iterations: number; + /** Approximate token usage (sum across all turns). */ + usage: { prompt: number; completion: number }; +} + +/** Run a single agent turn — initial user message plus full tool-using loop until the model has nothing more to do. */ +export async function runAgentTurn( + options: AgentOptions, + userMessage: string +): Promise<AgentTurn> { + const messages: OpenAI.Chat.ChatCompletionMessageParam[] = [ + { role: "system", content: options.system }, + { role: "user", content: userMessage }, + ]; + return runAgentLoop(options, messages); +} + +/** Continue an agent conversation with a new user message. Messages are mutated in place. 
*/ +export async function continueAgentConversation( + options: AgentOptions, + messages: OpenAI.Chat.ChatCompletionMessageParam[], + userMessage: string +): Promise<AgentTurn> { + messages.push({ role: "user", content: userMessage }); + return runAgentLoop(options, messages); +} + +async function runAgentLoop( + options: AgentOptions, + messages: OpenAI.Chat.ChatCompletionMessageParam[] +): Promise<AgentTurn> { + const tools = listOpenAITools(options.toolFilter); + const maxIter = options.maxIterations ?? 32; + const toolCalls: AgentTurn["toolCalls"] = []; + const usage = { prompt: 0, completion: 0 }; + + for (let i = 0; i < maxIter; i++) { + const completion = await options.client.chat.completions.create({ + model: options.config.model, + messages, + tools: tools.length > 0 ? tools : undefined, + max_tokens: options.config.maxTokens, + temperature: options.config.temperature, + }); + if (completion.usage) { + usage.prompt += completion.usage.prompt_tokens; + usage.completion += completion.usage.completion_tokens; + } + const choice = completion.choices[0]; + if (!choice) { + throw new Error("LLM returned no choices"); + } + const msg = choice.message; + messages.push(msg as OpenAI.Chat.ChatCompletionMessageParam); + + if (options.verbose && msg.content) { + process.stderr.write(`\n[agent] ${msg.content}\n`); + } + + if (choice.finish_reason === "length") { + return { + text: msg.content ?? "", + toolCalls, + stopReason: "length", + iterations: i + 1, + usage, + }; + } + if (choice.finish_reason === "content_filter") { + return { + text: msg.content ?? "[content filtered]", + toolCalls, + stopReason: "refusal", + iterations: i + 1, + usage, + }; + } + const calls = msg.tool_calls ?? []; + if (calls.length === 0) { + return { + text: msg.content ?? 
"", + toolCalls, + stopReason: "end_turn", + iterations: i + 1, + usage, + }; + } + + for (const call of calls) { + if (call.type !== "function") continue; + const name = call.function.name; + const argsStr = call.function.arguments; + if (options.verbose) { + process.stderr.write(`[agent→${name}] ${argsStr}\n`); + } + const result = await executeAgentTool(name, argsStr, options.toolContext); + const resultText = JSON.stringify(result, null, 2); + toolCalls.push({ + name, + args: safeJson(argsStr), + resultText, + }); + messages.push({ + role: "tool", + tool_call_id: call.id, + content: resultText, + }); + if (options.verbose) { + const head = resultText.length > 300 ? resultText.slice(0, 300) + "…" : resultText; + process.stderr.write(`[tool→${name}] ${head}\n`); + } + } + } + + log.warn(`Agent loop hit max_iterations=${maxIter} without ending`); + return { + text: "[agent stopped: max iterations reached]", + toolCalls, + stopReason: "max_iterations", + iterations: maxIter, + usage, + }; +} + +function safeJson(s: string): Record<string, unknown> { + try { + return JSON.parse(s); + } catch { + return { _raw: s }; + } +} diff --git a/server/src/llm/client.ts b/server/src/llm/client.ts new file mode 100644 index 0000000..ac71d27 --- /dev/null +++ b/server/src/llm/client.ts @@ -0,0 +1,34 @@ +import OpenAI from "openai"; + +export interface LLMConfig { + apiKey?: string; + baseURL?: string; + model: string; + maxTokens?: number; + temperature?: number; +} + +export function resolveConfig(overrides: Partial<LLMConfig> = {}): LLMConfig { + const apiKey = overrides.apiKey ?? process.env.OPENAI_API_KEY; + if (!apiKey) { + throw new Error( + "OPENAI_API_KEY is required (or pass --api-key). Set OPENAI_BASE_URL for non-default endpoints (Ollama, vLLM, OpenRouter, LiteLLM, etc.)." + ); + } + const baseURL = overrides.baseURL ?? process.env.OPENAI_BASE_URL; + const model = overrides.model ?? process.env.OPENAI_MODEL ?? 
"gpt-4o"; + return { + apiKey, + baseURL, + model, + maxTokens: overrides.maxTokens, + temperature: overrides.temperature, + }; +} + +export function makeClient(config: LLMConfig): OpenAI { + return new OpenAI({ + apiKey: config.apiKey, + baseURL: config.baseURL, + }); +} diff --git a/server/src/llm/tools.ts b/server/src/llm/tools.ts new file mode 100644 index 0000000..431bfee --- /dev/null +++ b/server/src/llm/tools.ts @@ -0,0 +1,94 @@ +import { getTool, listTools } from "../tools/registry.js"; +import { zodToJsonSchema } from "../jsonSchema.js"; +import { z } from "zod"; +import OpenAI from "openai"; + +export interface ToolFilter { + /** If set, only tools whose name is in this list are exposed. */ + include?: string[]; + /** If set, tools whose name is in this list are hidden. */ + exclude?: string[]; +} + +export function listOpenAITools(filter: ToolFilter = {}): OpenAI.Chat.ChatCompletionTool[] { + return listTools() + .filter((t) => (filter.include ? filter.include.includes(t.name) : true)) + .filter((t) => (filter.exclude ? !filter.exclude.includes(t.name) : true)) + .map((t) => ({ + type: "function", + function: { + name: t.name, + description: t.description, + parameters: zodToJsonSchema(t.inputSchema) as Record<string, unknown>, + }, + })); +} + +export interface ToolInvocationContext { + /** Target project cwd. Injected into every tool call that accepts `cwd`. */ + cwd: string; +} + +export interface ToolCallResult { + ok: boolean; + data?: unknown; + errors?: string[]; + warnings?: string[]; +} + +/** + * Execute a tool by name with the agent's chosen input. Injects `cwd` from the + * orchestrator's context if the tool accepts it and the agent didn't supply one. + * Validation errors are returned as ok:false so the agent can recover. 
+ */
+export async function executeAgentTool(
+  name: string,
+  rawArgs: string | Record<string, unknown>,
+  ctx: ToolInvocationContext
+): Promise<ToolCallResult> {
+  const tool = getTool(name);
+  if (!tool) {
+    return { ok: false, errors: [`Unknown tool: ${name}`] };
+  }
+  let parsed: unknown;
+  try {
+    parsed = typeof rawArgs === "string" ? JSON.parse(rawArgs) : rawArgs;
+  } catch (err) {
+    return { ok: false, errors: [`Failed to parse tool arguments as JSON: ${err instanceof Error ? err.message : String(err)}`] };
+  }
+  // Valid JSON need not be an object: `"cwd" in null` (or in a primitive) would throw outside any try block.
+  if (typeof parsed !== "object" || parsed === null || Array.isArray(parsed)) {
+    return { ok: false, errors: ["Tool arguments must be a JSON object"] };
+  }
+
+  // Inject cwd automatically when the tool has a `cwd` field in its schema
+  // and the agent didn't pass one. Work on a copy so the caller's object is not mutated.
+  const args: Record<string, unknown> = { ...parsed };
+  if (toolAcceptsCwd(tool.inputSchema) && !("cwd" in args)) {
+    args.cwd = ctx.cwd;
+  }
+
+  try {
+    const validated = tool.inputSchema.parse(args);
+    const result = await tool.handler(validated);
+    return result as ToolCallResult;
+  } catch (err) {
+    if (err instanceof z.ZodError) {
+      return {
+        ok: false,
+        errors: err.errors.map((e) => `${e.path.join(".") || "(root)"}: ${e.message}`),
+      };
+    }
+    return {
+      ok: false,
+      errors: [err instanceof Error ? 
err.message : String(err)], + }; + } +} + +function toolAcceptsCwd(schema: z.ZodTypeAny): boolean { + const def = (schema as unknown as { _def: { typeName: string; shape?: () => Record<string, unknown> } })._def; + if (def.typeName !== "ZodObject") return false; + const obj = schema as z.ZodObject<z.ZodRawShape>; + return "cwd" in obj.shape; +} diff --git a/server/src/schemas/index.ts b/server/src/schemas/index.ts index 9e0acb5..9fc5fdb 100644 --- a/server/src/schemas/index.ts +++ b/server/src/schemas/index.ts @@ -254,7 +254,7 @@ export const GateFailureSchema = z.object({ export type GateFailure = z.infer<typeof GateFailureSchema>; export const PipelineStateSchema = z.object({ - schema_version: z.string(), + schema_version: z.string().regex(/^[0-9]+\.[0-9]+\.[0-9]+$/, "must be semver"), project_id: SlugSchema, phase: PhaseSchema, effective_gate_config: EffectiveGateConfigSchema, diff --git a/server/tests/flow-poc-pipeline.test.ts b/server/tests/flow-poc-pipeline.test.ts new file mode 100644 index 0000000..5203f4a --- /dev/null +++ b/server/tests/flow-poc-pipeline.test.ts @@ -0,0 +1,406 @@ +import { describe, it, before, after } from "node:test"; +import assert from "node:assert/strict"; +import { existsSync, readFileSync, readdirSync } from "node:fs"; +import { join } from "node:path"; +import { makeTmpProject } from "./helpers/tmp-project.js"; +import { makeMockOpenAI, ScriptedResponse } from "./helpers/mock-openai.js"; +import { registerAllTools } from "../src/tools/index.js"; +import { runPipeline } from "../src/cli/orchestrator.js"; + +/** + * End-to-end pipeline test using a scripted mock LLM. + * + * This test drives the full intake → scoping → deciding → decomposing → handoff + * flow without any real API calls. The mock LLM is told exactly what tool calls to + * make at each phase, and we assert the artifacts on disk match expectations. 
+ */ +describe("Flow: POC happy path (mock LLM)", () => { + let toolsRegistered = false; + + before(() => { + if (!toolsRegistered) { + registerAllTools(); + toolsRegistered = true; + } + }); + + it("runs intake → scoping → deciding → decomposing → handoff (filesystem)", async () => { + const project = makeTmpProject("dr-flow-poc-"); + try { + const script: ScriptedResponse[] = [ + // ── Scoping agent ────────────────────────────────────────────── + // Turn 1: read status + { toolCalls: [{ name: "dr_status", args: {} }] }, + // Turn 2: set scope + { + toolCalls: [ + { + name: "dr_update_scope", + args: { + in_scope: ["thing A", "thing B"], + success_criteria: ["it works", "it ships"], + out_of_scope: ["far-future feature"], + nice_to_have: [], + }, + }, + ], + }, + // Turn 3: final summary + { text: "Scope set. in_scope: A, B. success: it works, it ships." }, + + // ── Deciding agent ───────────────────────────────────────────── + // Turn 1: read status + { toolCalls: [{ name: "dr_status", args: {} }] }, + // Turn 2: search seeds + { toolCalls: [{ name: "dr_seed_search", args: { query: "language" } }] }, + // Turn 3: load seed + { + toolCalls: [{ name: "dr_seed_load", args: { seed_name: "language-choice" } }], + }, + // Turn 4: pick a position + argument + { + toolCalls: [ + { + name: "dr_update_decision", + args: { + id: "0001-choose-the-primary-implementation-language", + selected_position: "TypeScript", + argument: "Team has deep TS expertise and the project is web-facing.", + }, + }, + ], + }, + // Turn 5: final summary + { text: "Decided: 0001-* → TypeScript." 
}, + + // ── Skeptic (5 lenses × 1 decision = 5 invocations × 2 turns each) ── + // Each skeptic invocation: 1 review tool call + 1 summary + // operational + { + toolCalls: [ + { + name: "dr_review_decision", + args: { + id: "0001-choose-the-primary-implementation-language", + reviewer: "dr-skeptic", + lens: "operational", + verdict: "pass", + score: 4, + concerns: [], + }, + }, + ], + }, + { text: "Operational review: pass (4/5)." }, + // strategic + { + toolCalls: [ + { + name: "dr_review_decision", + args: { + id: "0001-choose-the-primary-implementation-language", + reviewer: "dr-skeptic", + lens: "strategic", + verdict: "pass", + score: 4, + concerns: [], + }, + }, + ], + }, + { text: "Strategic review: pass." }, + // security + { + toolCalls: [ + { + name: "dr_review_decision", + args: { + id: "0001-choose-the-primary-implementation-language", + reviewer: "dr-skeptic", + lens: "security", + verdict: "pass", + score: 5, + concerns: [], + }, + }, + ], + }, + { text: "Security review: pass." }, + // cost + { + toolCalls: [ + { + name: "dr_review_decision", + args: { + id: "0001-choose-the-primary-implementation-language", + reviewer: "dr-skeptic", + lens: "cost", + verdict: "pass", + score: 4, + concerns: [], + }, + }, + ], + }, + { text: "Cost review: pass." }, + // user-impact + { + toolCalls: [ + { + name: "dr_review_decision", + args: { + id: "0001-choose-the-primary-implementation-language", + reviewer: "dr-skeptic", + lens: "user-impact", + verdict: "pass", + score: 5, + concerns: [], + }, + }, + ], + }, + { text: "User-impact review: pass." 
}, + + // ── Decomposer agent ─────────────────────────────────────────── + { toolCalls: [{ name: "dr_status", args: {} }] }, + { toolCalls: [{ name: "dr_list_decisions", args: { status: ["accepted"] } }] }, + { + toolCalls: [ + { + name: "dr_propose_task", + args: { + title: "Bootstrap repository", + description: "Init repo, install deps, scaffold config.", + acceptance_criteria: ["repo initialized", "tsconfig in place"], + estimate: { unit: "hours", value: 2, confidence: "high" }, + decision_refs: ["0001-choose-the-primary-implementation-language"], + priority: "p0", + }, + }, + ], + }, + { + toolCalls: [ + { + name: "dr_propose_task", + args: { + title: "Implement core feature", + description: "Build the main thing.", + acceptance_criteria: ["feature works", "tests pass"], + estimate: { unit: "hours", value: 6, confidence: "med" }, + depends_on: ["T0001-bootstrap-repository"], + decision_refs: ["0001-choose-the-primary-implementation-language"], + priority: "p0", + }, + }, + ], + }, + { + toolCalls: [ + { + name: "dr_propose_task", + args: { + title: "Ship and document", + description: "Build artifact and write README.", + acceptance_criteria: ["binary built", "README complete"], + estimate: { unit: "hours", value: 2, confidence: "high" }, + depends_on: ["T0002-implement-core-feature"], + decision_refs: ["0001-choose-the-primary-implementation-language"], + priority: "p1", + }, + }, + ], + }, + { toolCalls: [{ name: "dr_validate_graph", args: {} }] }, + { text: "3 tasks: bootstrap → implement → ship. Graph validates." 
}, + ]; + + const client = makeMockOpenAI(script); + + const outcome = await runPipeline( + { + cwd: project.cwd, + client, + config: { apiKey: "mock", model: "mock" }, + autoYes: true, + verbose: false, + }, + { + title: "Flow POC Test", + description: "A test project for the flow harness.", + effortLevel: "poc", + prd: null, + resume: false, + } + ); + + assert.equal(outcome.exitCode, 0, "pipeline should exit cleanly"); + assert.equal(outcome.finalPhase, "handed-off", "should reach handed-off"); + + // Artifacts on disk + assert.ok(project.exists("dr/project.json"), "project.json exists"); + assert.ok(project.exists(".dr/state.json"), "state.json exists"); + assert.ok(project.exists("dr/index.html"), "index.html rendered"); + + const projectJson = project.readJson<{ + status: string; + handoff?: { target: string }; + scope?: { in_scope: string[] }; + }>("dr/project.json"); + assert.equal(projectJson.status, "handed-off"); + assert.equal(projectJson.handoff?.target, "filesystem"); + assert.deepEqual(projectJson.scope?.in_scope, ["thing A", "thing B"]); + + const decisions = project.list("dr/decisions").filter((f) => f.endsWith(".json")); + assert.equal(decisions.length, 1, "exactly one decision"); + const decision = project.readJson<{ status: string; review: unknown[] }>( + join("dr/decisions", decisions[0]!) + ); + assert.equal(decision.status, "accepted"); + assert.equal(decision.review.length, 5, "5 lens reviews recorded"); + + const tasks = project.list("dr/tasks").filter((f) => f.endsWith(".json")); + assert.equal(tasks.length, 3, "three tasks"); + + // Event log — verify all major lifecycle events were captured. + // Note: this test uses a seed-loaded decision, which emits 'seed_loaded' + // instead of 'decision_proposed'. 
// Flow test: the first scripted skeptic review returns verdict "block", autoYes
// supplies no override, and the test asserts that the decision ends up in the
// "rejected" list. The mock script below is consumed strictly in order, one
// entry per chat.completions.create call.
it("rejects a decision when skeptic blocks and no override given", async () => {
  const project = makeTmpProject("dr-flow-block-");
  try {
    // Pre-initialize via direct tool calls so we land mid-pipeline quickly.
    // Presumably the two dr_advance calls move intake -> scoping -> deciding; confirm against the orchestrator.
    const { executeAgentTool } = await import("../src/llm/tools.js");
    await executeAgentTool(
      "dr_init",
      { title: "Block Test", description: "test", effort_level: "poc" },
      { cwd: project.cwd }
    );
    await executeAgentTool("dr_advance", {}, { cwd: project.cwd });
    await executeAgentTool(
      "dr_update_scope",
      { in_scope: ["x"], success_criteria: ["y"] },
      { cwd: project.cwd }
    );
    await executeAgentTool("dr_advance", {}, { cwd: project.cwd });

    const script: ScriptedResponse[] = [
      // Deciding agent: inspect status, propose one decision, select position "A", then finish with text.
      { toolCalls: [{ name: "dr_status", args: {} }] },
      {
        toolCalls: [
          {
            name: "dr_propose_decision",
            args: {
              title: "Pick a thing",
              issue: "We need to pick a thing.",
              positions: [{ title: "A" }, { title: "B" }],
            },
          },
        ],
      },
      {
        toolCalls: [
          {
            name: "dr_update_decision",
            args: { id: "0001-pick-a-thing", selected_position: "A", argument: "Because A." },
          },
        ],
      },
      { text: "Decided A." },

      // 5 skeptic reviews — first one blocks
      {
        toolCalls: [
          {
            name: "dr_review_decision",
            args: {
              id: "0001-pick-a-thing",
              reviewer: "dr-skeptic",
              lens: "operational",
              verdict: "block",
              score: 2,
              concerns: ["this would burn the team out"],
            },
          },
        ],
      },
      { text: "Operational: block." },
      // Subsequent lenses still run: four more (review tool call, text) pairs.
      // NOTE(review): all four reuse lens "strategic" — confirm whether distinct lenses were intended.
      ...Array.from({ length: 4 }, () => [
        {
          toolCalls: [
            {
              name: "dr_review_decision",
              args: {
                id: "0001-pick-a-thing",
                reviewer: "dr-skeptic",
                lens: "strategic",
                verdict: "pass",
                score: 3,
                concerns: [],
              },
            },
          ],
        },
        { text: "pass." },
      ]).flat(),
      // After rejection, the orchestrator advances to decomposing (poc min_decisions=0).
      // Script the decomposer to do nothing. The original author expected the gate to
      // fail on min_tasks (exit code 1); the assertion at the end accepts 0 or 1.
      { toolCalls: [{ name: "dr_status", args: {} }] },
      { toolCalls: [{ name: "dr_list_decisions", args: { status: ["accepted"] } }] },
      { text: "No accepted decisions; producing no tasks." },
    ];

    const client = makeMockOpenAI(script);

    // autoYes: true means the override prompt receives "" (fallback "reject"),
    // so the orchestrator will reject the blocked decision.
    const outcome = await runPipeline(
      {
        cwd: project.cwd,
        client,
        config: { apiKey: "mock", model: "mock" },
        autoYes: true,
        verbose: false,
      },
      { resume: true, prd: null }
    );

    // What this test pins down is the decision's final state: exactly one decision,
    // 0001-pick-a-thing, must be listed as 'rejected'. Whether the later gate passes
    // or stalls is deliberately not asserted precisely (exit code 0 or 1 is accepted).
    // The second dynamic import resolves the same module as the one at the top of the test.
    const { executeAgentTool: tool2 } = await import("../src/llm/tools.js");
    const listRes = await tool2(
      "dr_list_decisions",
      { status: ["rejected"] },
      { cwd: project.cwd }
    );
    const rejected = (listRes.data as { decisions: { id: string }[] }).decisions;
    assert.equal(rejected.length, 1, "the blocked decision should be rejected");
    assert.equal(rejected[0]?.id, "0001-pick-a-thing");
    assert.ok([0, 1].includes(outcome.exitCode), "pipeline should exit cleanly or stall");
  } finally {
    // Always remove the temporary project directory, even when an assertion throws.
    project.dispose();
  }
});
/**
 * Line-delimited JSON-RPC client that drives the MCP server over stdio in tests.
 *
 * Spawns `node <serverPath>`, writes one JSON request per line to the child's
 * stdin, and matches newline-terminated JSON messages from its stdout to pending
 * requests by numeric `id`. Every request is guarded by `timeoutMs`.
 */
export class McpClient {
  private proc: ChildProcessWithoutNullStreams;
  private nextId = 1;
  private pending = new Map<number, PendingCall>();
  /** Ids of requests (the handshake) whose result is NOT a tool-content envelope. */
  private rawResultIds = new Set<number>();
  private buf = "";
  private readonly timeoutMs: number;
  private closed = false;
  /** Settles once the child has exited or failed to spawn, so close() never waits on an event that already fired. */
  private readonly exited: Promise<void>;

  /**
   * @param opts Optional server path, per-call timeout, stderr forwarding and extra environment.
   */
  constructor(opts: McpClientOptions = {}) {
    this.timeoutMs = opts.timeoutMs ?? 8000;
    const serverPath = opts.serverPath ?? DEFAULT_SERVER_PATH;
    this.proc = spawn("node", [serverPath], {
      stdio: ["pipe", "pipe", "pipe"],
      env: { ...process.env, ...(opts.env ?? {}) },
    });
    // Decode at the stream level: a per-chunk toString() corrupts multi-byte
    // UTF-8 characters that happen to straddle two chunks.
    this.proc.stdout.setEncoding("utf8");
    this.proc.stdout.on("data", (chunk: string) => this.onStdout(chunk));
    this.proc.stderr.on("data", (d) => {
      if (opts.verboseStderr) process.stderr.write(d);
    });
    // Without a listener, a write to a dead child (EPIPE) is an uncaught exception.
    this.proc.stdin.on("error", (err: Error) => {
      this.failPending(new Error(`MCP server stdin error: ${err.message}`));
    });
    this.exited = new Promise<void>((done) => {
      this.proc.once("exit", () => {
        this.failPending(new Error("MCP server exited before responding"));
        done();
      });
      this.proc.on("error", (err: Error) => {
        this.failPending(new Error(`MCP server process error: ${err.message}`));
        // A failed spawn has no pid and never emits "exit"; settle here so close() cannot hang.
        if (this.proc.pid === undefined) done();
      });
    });
  }

  /** Marks the client unusable and rejects every in-flight request with `error`. */
  private failPending(error: Error): void {
    this.closed = true;
    for (const call of this.pending.values()) {
      clearTimeout(call.timeout);
      call.reject(error);
    }
    this.pending.clear();
    this.rawResultIds.clear();
  }

  /** Buffers stdout, splits it on newlines and settles the pending request matching each message id. */
  private onStdout(chunk: string): void {
    this.buf += chunk;
    let idx: number;
    while ((idx = this.buf.indexOf("\n")) >= 0) {
      const line = this.buf.slice(0, idx).trim();
      this.buf = this.buf.slice(idx + 1);
      if (!line) continue;
      let msg: {
        id?: number;
        result?: { content?: { text: string }[]; isError?: boolean };
        error?: { message: string };
      };
      try {
        msg = JSON.parse(line);
      } catch {
        // Deliberately tolerant: anything on stdout that is not a JSON line is ignored.
        continue;
      }
      // Notifications and server-initiated requests carry no numeric id we are waiting on.
      if (typeof msg.id !== "number") continue;
      const pending = this.pending.get(msg.id);
      if (!pending) continue;
      this.pending.delete(msg.id);
      clearTimeout(pending.timeout);
      const wantsRawResult = this.rawResultIds.delete(msg.id);
      if (msg.error) {
        pending.reject(new Error(`JSON-RPC error: ${msg.error.message}`));
        continue;
      }
      if (wantsRawResult) {
        // The initialize result has no `content` array; hand the result back untouched
        // instead of rejecting it as a malformed tool response.
        pending.resolve({ ok: true, data: msg.result });
        continue;
      }
      const text = msg.result?.content?.[0]?.text;
      if (text === undefined) {
        pending.reject(new Error("Tool response had no content text"));
        continue;
      }
      try {
        pending.resolve(JSON.parse(text) as ToolResponse);
      } catch {
        pending.resolve({ ok: false, errors: ["non-JSON response"], data: text } as ToolResponse);
      }
    }
  }

  /** Writes a JSON-RPC request and returns its id. Throws when the client is closed. */
  private send(method: string, params: Record<string, unknown>): number {
    if (this.closed) throw new Error("MCP client is closed");
    const id = this.nextId++;
    this.proc.stdin.write(
      JSON.stringify({ jsonrpc: "2.0", id, method, params }) + "\n"
    );
    return id;
  }

  /** Writes a JSON-RPC notification (no id, no response expected). */
  private notify(method: string): void {
    if (this.closed) throw new Error("MCP client is closed");
    this.proc.stdin.write(JSON.stringify({ jsonrpc: "2.0", method }) + "\n");
  }

  /**
   * Sends a request and registers it as pending under a timeout.
   * With `rawResult`, the JSON-RPC `result` is returned as `data` without tool-envelope parsing.
   */
  private request(
    method: string,
    params: Record<string, unknown>,
    timeoutMessage: string,
    rawResult = false
  ): Promise<ToolResponse> {
    return new Promise<ToolResponse>((resolveFn, rejectFn) => {
      // A throw from send() (closed client) becomes a rejection of this promise.
      const id = this.send(method, params);
      if (rawResult) this.rawResultIds.add(id);
      const timeout = setTimeout(() => {
        this.pending.delete(id);
        this.rawResultIds.delete(id);
        rejectFn(new Error(timeoutMessage));
      }, this.timeoutMs);
      this.pending.set(id, { resolve: resolveFn, reject: rejectFn, timeout });
    });
  }

  /**
   * Performs the MCP handshake: `initialize` request, then the
   * `notifications/initialized` notification the protocol requires from the client.
   * Rejects on timeout, JSON-RPC error, or server exit.
   */
  async initialize(): Promise<void> {
    await this.request(
      "initialize",
      {
        protocolVersion: "2024-11-05",
        capabilities: {},
        clientInfo: { name: "dr-test-harness", version: "0" },
      },
      "initialize timed out",
      true
    );
    this.notify("notifications/initialized");
  }

  /**
   * Invokes a tool via `tools/call` and resolves with the parsed tool response.
   * Rejects on timeout, JSON-RPC error, missing content text, or server exit.
   */
  async call<T = unknown>(
    tool: string,
    args: Record<string, unknown> = {}
  ): Promise<ToolResponse<T>> {
    const res = await this.request(
      "tools/call",
      { name: tool, arguments: args },
      `tool '${tool}' timed out after ${this.timeoutMs}ms`
    );
    return res as ToolResponse<T>;
  }

  /** Same as call(), but throws when ok=false (test ergonomics). */
  async callOk<T = unknown>(
    tool: string,
    args: Record<string, unknown> = {}
  ): Promise<T> {
    const res = await this.call<T>(tool, args);
    if (!res.ok) {
      throw new Error(
        `Expected ok call for ${tool}, got errors: ${(res.errors ?? []).join("; ")}`
      );
    }
    return res.data as T;
  }

  /** Same as call(), but throws when ok=true (used to assert gate failures). */
  async callFail(
    tool: string,
    args: Record<string, unknown> = {}
  ): Promise<string[]> {
    const res = await this.call(tool, args);
    if (res.ok) {
      throw new Error(
        `Expected ${tool} to fail, but it succeeded with: ${JSON.stringify(res.data).slice(0, 200)}`
      );
    }
    return res.errors ?? [];
  }

  /**
   * Stops the server with SIGTERM (when it is still running) and waits until it is gone.
   * Safe to call repeatedly and after the server has already exited or failed to spawn.
   */
  async close(): Promise<void> {
    this.closed = true;
    const { pid, exitCode, signalCode } = this.proc;
    if (pid !== undefined && exitCode === null && signalCode === null) {
      this.proc.kill("SIGTERM");
    }
    await this.exited;
  }
}
/**
 * Placeholder for asserting that a mock's script was fully consumed.
 *
 * NOTE(review): this returns `expectedTotal` unchanged and never inspects
 * `client`, so it cannot report how many scripted responses are left.
 * TODO: have makeMockOpenAI expose its consumed-call count before relying on
 * this helper in an assertion.
 *
 * @param client Mock client; currently unused.
 * @param expectedTotal Number of scripted responses the caller expects.
 * @returns `expectedTotal`, as-is.
 */
export function remainingMockCalls(client: OpenAI, expectedTotal: number): number {
  // For tests that want to assert the script was fully consumed.
  return expectedTotal;
}
/** Project fixture: poc effort level, "intake" status; `overrides` replace individual fields. */
function makeProject(overrides: Partial<Project> = {}): Project {
  const defaults: Project = {
    id: "test-project",
    title: "Test Project",
    description: "An idea worth shipping.",
    created_at: NOW,
    updated_at: NOW,
    effort_level: "poc",
    status: "intake",
    sign_offs: [],
    gate_config: { preset: "poc" },
    tags: [],
  };
  return { ...defaults, ...overrides };
}

/** PipelineState fixture: "intake" phase with the poc preset as effective gate config. */
function makeState(overrides: Partial<PipelineState> = {}): PipelineState {
  const defaults: PipelineState = {
    schema_version: SCHEMA_VERSION,
    project_id: "test-project",
    phase: "intake",
    effective_gate_config: presetFor("poc"),
    next_decision_seq: 1,
    next_task_seq: 1,
    pending_questions: [],
    gate_failures: [],
  };
  return { ...defaults, ...overrides };
}

/** Decision fixture: an accepted canonical decision with position "A" selected and no reviews. */
function makeDecision(overrides: Partial<Decision> = {}): Decision {
  const defaults: Decision = {
    id: "0001-test",
    number: 1,
    slug: "test",
    title: "Test decision",
    status: "accepted",
    template_variant: "canonical",
    created_at: NOW,
    updated_at: NOW,
    assumptions: [],
    constraints: [],
    positions: [{ title: "A", pros: [], cons: [], links: [] }],
    opinions: [],
    selected_position: "A",
    argument: "Because A.",
    implications: [],
    depends_on: [],
    related_decisions: [],
    related_artifacts: [],
    review: [],
    tags: [],
  };
  return { ...defaults, ...overrides };
}

/** Task fixture: a "ready" p2 task estimated at 2 hours with no dependencies or decision refs. */
function makeTask(overrides: Partial<Task> = {}): Task {
  const defaults: Task = {
    id: "T0001-test",
    number: 1,
    slug: "test",
    title: "Test task",
    status: "ready",
    estimate: { unit: "hours", value: 2 },
    acceptance_criteria: ["criteria 1"],
    depends_on: [],
    decision_refs: [],
    priority: "p2",
    labels: [],
    created_at: NOW,
    updated_at: NOW,
  };
  return { ...defaults, ...overrides };
}
3); + assert.equal(cfg.min_tasks, 8); + assert.equal(cfg.max_task_estimate_hours, 8); + assert.equal(cfg.review_required_per_decision, false); + assert.deepEqual(cfg.review_required_phases, ["scoping", "decomposing"]); + }); + + it("applies overrides per-knob without affecting other preset values", () => { + const cfg = resolveEffectiveGateConfig({ + preset: "mvp", + overrides: { min_tasks: 5, review_required_per_decision: true }, + }); + assert.equal(cfg.min_tasks, 5); + assert.equal(cfg.review_required_per_decision, true); + assert.equal(cfg.min_decisions, 3, "min_decisions still preset default"); + assert.equal(cfg.max_task_estimate_hours, 8, "max_task_estimate_hours still preset default"); + }); + + it("preset 'poc' is loosest, 'full' is strictest", () => { + const poc = presetFor("poc"); + const mvp = presetFor("mvp"); + const full = presetFor("full"); + assert.ok(poc.min_tasks <= mvp.min_tasks); + assert.ok(mvp.min_tasks <= full.min_tasks); + assert.ok(poc.min_decisions <= mvp.min_decisions); + assert.ok(mvp.min_decisions <= full.min_decisions); + assert.ok(poc.max_task_estimate_hours >= mvp.max_task_estimate_hours); + assert.ok(mvp.max_task_estimate_hours >= full.max_task_estimate_hours); + }); +}); + +describe("nextPhaseOf", () => { + it("walks the linear pipeline", () => { + assert.equal(nextPhaseOf("intake"), "scoping"); + assert.equal(nextPhaseOf("scoping"), "deciding"); + assert.equal(nextPhaseOf("deciding"), "decomposing"); + assert.equal(nextPhaseOf("decomposing"), "handing-off"); + assert.equal(nextPhaseOf("handing-off"), "handed-off"); + assert.equal(nextPhaseOf("handed-off"), null); + }); +}); + +describe("evaluateAdvance: intake → scoping", () => { + it("passes with title + description", () => { + const project = makeProject(); + const state = makeState({ phase: "intake" }); + const result = evaluateAdvance(project, state, [], [], null); + assert.equal(result.pass, true); + assert.equal(result.next_phase, "scoping"); + }); + + it("blocks when 
description empty", () => { + const project = makeProject({ description: "" }); + const state = makeState({ phase: "intake" }); + const result = evaluateAdvance(project, state, [], [], null); + assert.equal(result.pass, false); + assert.ok( + result.reasons.some((r) => r.includes("description")), + `expected description-blocked reason; got: ${result.reasons.join(" | ")}` + ); + }); +}); + +describe("evaluateAdvance: scoping → deciding", () => { + it("passes with non-empty in_scope and success_criteria (poc)", () => { + const project = makeProject({ + status: "scoping", + scope: { + in_scope: ["thing 1"], + success_criteria: ["measurable 1"], + out_of_scope: [], + nice_to_have: [], + }, + }); + const state = makeState({ phase: "scoping" }); + const result = evaluateAdvance(project, state, [], [], null); + assert.equal(result.pass, true); + }); + + it("blocks when in_scope is empty", () => { + const project = makeProject({ + status: "scoping", + scope: { + in_scope: [], + success_criteria: ["x"], + out_of_scope: [], + nice_to_have: [], + }, + }); + const state = makeState({ phase: "scoping" }); + const result = evaluateAdvance(project, state, [], [], null); + assert.equal(result.pass, false); + assert.ok(result.reasons.some((r) => r.includes("in_scope"))); + }); + + it("blocks when success_criteria is empty", () => { + const project = makeProject({ + status: "scoping", + scope: { + in_scope: ["x"], + success_criteria: [], + out_of_scope: [], + nice_to_have: [], + }, + }); + const state = makeState({ phase: "scoping" }); + const result = evaluateAdvance(project, state, [], [], null); + assert.equal(result.pass, false); + assert.ok(result.reasons.some((r) => r.includes("success_criteria"))); + }); + + it("under mvp preset, requires a scoping DR with passing review", () => { + const project = makeProject({ + effort_level: "mvp", + status: "scoping", + scope: { + in_scope: ["x"], + success_criteria: ["y"], + out_of_scope: [], + nice_to_have: [], + }, + gate_config: { 
preset: "mvp" }, + }); + const state = makeState({ + phase: "scoping", + effective_gate_config: presetFor("mvp"), + }); + const noScopingDr = evaluateAdvance( + project, + state, + [], + [], + { by: "human" } + ); + assert.equal(noScopingDr.pass, false); + assert.ok(noScopingDr.reasons.some((r) => r.includes("scoping decision"))); + + const unreviewedScopingDr = makeDecision({ + id: "0001-scope", + slug: "scope", + template_variant: "scoping", + status: "proposed", + review: [], + }); + const stillBlocked = evaluateAdvance( + project, + state, + [unreviewedScopingDr], + [], + { by: "human" } + ); + assert.equal(stillBlocked.pass, false); + assert.ok(stillBlocked.reasons.some((r) => r.includes("no passing review"))); + + const reviewedScopingDr = makeDecision({ + id: "0001-scope", + slug: "scope", + template_variant: "scoping", + status: "proposed", + review: [ + { + reviewer: "dr-skeptic", + lens: "operational", + verdict: "pass", + score: 4, + concerns: [], + at: NOW, + }, + ], + }); + const passes = evaluateAdvance( + project, + state, + [reviewedScopingDr], + [], + { by: "human" } + ); + assert.equal(passes.pass, true, `expected pass, got: ${passes.reasons.join("; ")}`); + }); +}); + +describe("evaluateAdvance: deciding → decomposing", () => { + it("blocks when fewer decisions than min_decisions", () => { + const project = makeProject({ status: "deciding", effort_level: "mvp" }); + const state = makeState({ phase: "deciding", effective_gate_config: presetFor("mvp") }); + const result = evaluateAdvance(project, state, [makeDecision()], [], { by: "human" }); + assert.equal(result.pass, false); + assert.ok(result.reasons.some((r) => r.includes("decisions"))); + }); + + it("blocks when any decision is still 'proposed'", () => { + const project = makeProject({ status: "deciding" }); + const state = makeState({ phase: "deciding" }); + const ds = [ + makeDecision({ id: "0001-a", slug: "a" }), + makeDecision({ id: "0002-b", slug: "b", status: "proposed", 
selected_position: undefined, argument: undefined }), + ]; + const result = evaluateAdvance(project, state, ds, [], { by: "human" }); + assert.equal(result.pass, false); + assert.ok(result.reasons.some((r) => r.includes("not 'accepted'"))); + }); + + it("passes when all decisions accepted and deps resolved (poc)", () => { + const project = makeProject({ status: "deciding" }); + const state = makeState({ phase: "deciding" }); + const ds = [makeDecision()]; + const result = evaluateAdvance(project, state, ds, [], { by: "human" }); + assert.equal(result.pass, true, `expected pass, got: ${result.reasons.join("; ")}`); + }); + + it("blocks when decision dependencies are missing", () => { + const project = makeProject({ status: "deciding" }); + const state = makeState({ phase: "deciding" }); + const ds = [ + makeDecision({ id: "0001-a", slug: "a", depends_on: ["0999-missing"] }), + ]; + const result = evaluateAdvance(project, state, ds, [], { by: "human" }); + assert.equal(result.pass, false); + assert.ok(result.reasons.some((r) => r.includes("missing dependencies"))); + }); + + it("under full preset, requires every accepted decision to have a passing review", () => { + const project = makeProject({ + status: "deciding", + effort_level: "full", + gate_config: { preset: "full" }, + }); + const state = makeState({ + phase: "deciding", + effective_gate_config: presetFor("full"), + }); + // 6 accepted decisions; min_decisions = 6 for full + const ds = Array.from({ length: 6 }, (_, i) => + makeDecision({ + id: `${String(i + 1).padStart(4, "0")}-d${i}`, + slug: `d${i}`, + number: i + 1, + }) + ); + const noReview = evaluateAdvance(project, state, ds, [], { by: "human" }); + assert.equal(noReview.pass, false); + assert.ok( + noReview.reasons.some((r) => r.includes("lack a passing review")), + `expected per-decision-review blocker; got: ${noReview.reasons.join(" | ")}` + ); + }); +}); + +describe("evaluateAdvance: decomposing → handing-off", () => { + it("passes with deps 
satisfied and estimates in budget", () => { + const project = makeProject({ status: "decomposing" }); + const state = makeState({ phase: "decomposing" }); + const tasks = [ + makeTask({ id: "T0001-a", slug: "a", number: 1, decision_refs: [] }), + makeTask({ id: "T0002-b", slug: "b", number: 2, depends_on: ["T0001-a"] }), + makeTask({ id: "T0003-c", slug: "c", number: 3, depends_on: ["T0002-b"] }), + ]; + const result = evaluateAdvance(project, state, [makeDecision()], tasks, { by: "human" }); + assert.equal(result.pass, true, `expected pass, got: ${result.reasons.join("; ")}`); + }); + + it("blocks on cycles", () => { + const project = makeProject({ status: "decomposing" }); + const state = makeState({ phase: "decomposing" }); + const tasks = [ + makeTask({ id: "T0001-a", slug: "a", number: 1, depends_on: ["T0003-c"] }), + makeTask({ id: "T0002-b", slug: "b", number: 2, depends_on: ["T0001-a"] }), + makeTask({ id: "T0003-c", slug: "c", number: 3, depends_on: ["T0002-b"] }), + ]; + const result = evaluateAdvance(project, state, [makeDecision()], tasks, { by: "human" }); + assert.equal(result.pass, false); + assert.ok(result.reasons.some((r) => r.includes("cycles"))); + }); + + it("blocks on orphan dependencies", () => { + const project = makeProject({ status: "decomposing" }); + const state = makeState({ phase: "decomposing" }); + const tasks = [ + makeTask({ id: "T0001-a", slug: "a", number: 1, depends_on: ["T0999-missing"] }), + makeTask({ id: "T0002-b", slug: "b", number: 2 }), + makeTask({ id: "T0003-c", slug: "c", number: 3 }), + ]; + const result = evaluateAdvance(project, state, [makeDecision()], tasks, { by: "human" }); + assert.equal(result.pass, false); + assert.ok(result.reasons.some((r) => r.includes("missing dependencies"))); + }); + + it("blocks when task estimate exceeds max", () => { + const project = makeProject({ status: "decomposing" }); + const state = makeState({ phase: "decomposing" }); + const tasks = [ + makeTask({ id: "T0001-a", slug: "a", 
number: 1, estimate: { unit: "hours", value: 100 } }), + makeTask({ id: "T0002-b", slug: "b", number: 2 }), + makeTask({ id: "T0003-c", slug: "c", number: 3 }), + ]; + const result = evaluateAdvance(project, state, [makeDecision()], tasks, { by: "human" }); + assert.equal(result.pass, false); + assert.ok(result.reasons.some((r) => r.includes("estimate"))); + }); + + it("blocks when task has no estimate", () => { + const project = makeProject({ status: "decomposing" }); + const state = makeState({ phase: "decomposing" }); + const tasks = [ + makeTask({ id: "T0001-a", slug: "a", number: 1 }), + makeTask({ id: "T0002-b", slug: "b", number: 2 }), + makeTask({ id: "T0003-c", slug: "c", number: 3, estimate: undefined }), + ]; + const result = evaluateAdvance(project, state, [makeDecision()], tasks, { by: "human" }); + assert.equal(result.pass, false); + assert.ok(result.reasons.some((r) => r.includes("missing or oversized"))); + }); + + it("blocks when task references a missing decision", () => { + const project = makeProject({ status: "decomposing" }); + const state = makeState({ phase: "decomposing" }); + const tasks = [ + makeTask({ id: "T0001-a", slug: "a", number: 1, decision_refs: ["0999-missing"] }), + makeTask({ id: "T0002-b", slug: "b", number: 2 }), + makeTask({ id: "T0003-c", slug: "c", number: 3 }), + ]; + const result = evaluateAdvance(project, state, [makeDecision()], tasks, { by: "human" }); + assert.equal(result.pass, false); + assert.ok(result.reasons.some((r) => r.includes("missing decisions"))); + }); +}); + +describe("evaluateAdvance: sign-off requirement", () => { + it("requires human sign-off for handing-off under poc preset", () => { + const project = makeProject({ status: "decomposing" }); + const state = makeState({ phase: "decomposing" }); + const tasks = [ + makeTask({ id: "T0001-a", slug: "a", number: 1 }), + makeTask({ id: "T0002-b", slug: "b", number: 2 }), + makeTask({ id: "T0003-c", slug: "c", number: 3 }), + ]; + const agentOnly = 
evaluateAdvance(project, state, [makeDecision()], tasks, { + by: "agent", + }); + assert.equal(agentOnly.pass, false); + assert.ok(agentOnly.reasons.some((r) => r.includes("human sign-off"))); + + const human = evaluateAdvance(project, state, [makeDecision()], tasks, { by: "human" }); + assert.equal(human.pass, true, `expected pass, got: ${human.reasons.join("; ")}`); + }); +}); diff --git a/server/tests/unit-schemas.test.ts b/server/tests/unit-schemas.test.ts new file mode 100644 index 0000000..3ab2764 --- /dev/null +++ b/server/tests/unit-schemas.test.ts @@ -0,0 +1,273 @@ +import { describe, it } from "node:test"; +import assert from "node:assert/strict"; +import { + DecisionIdSchema, + DecisionSchema, + EventSchema, + GateConfigSchema, + PipelineStateSchema, + ProjectSchema, + SCHEMA_VERSION, + SlugSchema, + TaskIdSchema, + TaskSchema, +} from "../src/schemas/index.js"; + +const NOW = "2026-05-17T00:00:00.000Z"; + +describe("SlugSchema", () => { + it("accepts well-formed kebab-case", () => { + assert.doesNotThrow(() => SlugSchema.parse("project-name")); + assert.doesNotThrow(() => SlugSchema.parse("a1")); + assert.doesNotThrow(() => SlugSchema.parse("multi-word-thing")); + }); + + it("rejects upper-case, underscores, leading/trailing dashes", () => { + assert.throws(() => SlugSchema.parse("Project")); + assert.throws(() => SlugSchema.parse("snake_case")); + assert.throws(() => SlugSchema.parse("-leading")); + assert.throws(() => SlugSchema.parse("trailing-")); + assert.throws(() => SlugSchema.parse("")); + }); +}); + +describe("DecisionIdSchema", () => { + it("requires 0000-slug shape", () => { + assert.doesNotThrow(() => DecisionIdSchema.parse("0001-language-choice")); + assert.doesNotThrow(() => DecisionIdSchema.parse("9999-ab")); + }); + + it("rejects malformed prefixes", () => { + assert.throws(() => DecisionIdSchema.parse("1-foo")); + assert.throws(() => DecisionIdSchema.parse("0001")); + assert.throws(() => DecisionIdSchema.parse("T0001-foo")); + 
assert.throws(() => DecisionIdSchema.parse("0001-")); + }); +}); + +describe("TaskIdSchema", () => { + it("requires T0000-slug shape", () => { + assert.doesNotThrow(() => TaskIdSchema.parse("T0001-bootstrap")); + }); + + it("rejects decision-style IDs", () => { + assert.throws(() => TaskIdSchema.parse("0001-foo")); + assert.throws(() => TaskIdSchema.parse("t0001-foo")); + }); +}); + +describe("GateConfigSchema", () => { + it("accepts preset-only", () => { + assert.doesNotThrow(() => GateConfigSchema.parse({ preset: "poc" })); + assert.doesNotThrow(() => GateConfigSchema.parse({ preset: "mvp" })); + assert.doesNotThrow(() => GateConfigSchema.parse({ preset: "full" })); + }); + + it("accepts preset + overrides", () => { + const parsed = GateConfigSchema.parse({ + preset: "mvp", + overrides: { min_tasks: 5, review_required_per_decision: true }, + }); + assert.equal(parsed.overrides?.min_tasks, 5); + assert.equal(parsed.overrides?.review_required_per_decision, true); + }); + + it("rejects unknown preset values", () => { + assert.throws(() => GateConfigSchema.parse({ preset: "rapid" })); + }); +}); + +describe("ProjectSchema", () => { + const validProject = { + id: "demo", + title: "Demo", + description: "", + created_at: NOW, + updated_at: NOW, + effort_level: "poc" as const, + status: "intake" as const, + sign_offs: [], + gate_config: { preset: "poc" as const }, + tags: [], + }; + + it("round-trips a minimal project", () => { + const parsed = ProjectSchema.parse(validProject); + assert.equal(parsed.id, "demo"); + assert.equal(parsed.status, "intake"); + }); + + it("rejects unknown status values", () => { + assert.throws(() => ProjectSchema.parse({ ...validProject, status: "launching" })); + }); + + it("rejects bogus id slugs", () => { + assert.throws(() => ProjectSchema.parse({ ...validProject, id: "Invalid_Id" })); + }); + + it("rejects invalid effort_level", () => { + assert.throws(() => ProjectSchema.parse({ ...validProject, effort_level: "rapid" })); + }); +}); + 
+describe("DecisionSchema", () => { + const validDecision = { + id: "0001-xx", + number: 1, + slug: "xx", + title: "X", + status: "proposed" as const, + template_variant: "canonical" as const, + created_at: NOW, + updated_at: NOW, + }; + + it("accepts minimal valid decision", () => { + const parsed = DecisionSchema.parse(validDecision); + assert.equal(parsed.id, "0001-xx"); + assert.deepEqual(parsed.positions, []); + assert.deepEqual(parsed.review, []); + }); + + it("rejects mismatched id format", () => { + assert.throws(() => DecisionSchema.parse({ ...validDecision, id: "T0001-xx" })); + }); + + it("rejects invalid template_variant", () => { + assert.throws(() => + DecisionSchema.parse({ ...validDecision, template_variant: "novel" }) + ); + }); + + it("parses full structure with positions, review, sign_off", () => { + const full = { + ...validDecision, + status: "accepted" as const, + positions: [{ title: "A", pros: ["fast"], cons: [], links: [] }], + selected_position: "A", + argument: "speed matters", + implications: ["follow-up"], + review: [ + { + reviewer: "dr-skeptic", + lens: "operational" as const, + verdict: "pass" as const, + score: 5, + concerns: [], + at: NOW, + }, + ], + sign_off: { by: "human" as const, at: NOW }, + }; + const parsed = DecisionSchema.parse(full); + assert.equal(parsed.selected_position, "A"); + assert.equal(parsed.review[0]?.verdict, "pass"); + assert.equal(parsed.sign_off?.by, "human"); + }); +}); + +describe("TaskSchema", () => { + const validTask = { + id: "T0001-xx", + number: 1, + slug: "xx", + title: "X task", + status: "open" as const, + acceptance_criteria: [], + depends_on: [], + decision_refs: [], + priority: "p2" as const, + labels: [], + created_at: NOW, + updated_at: NOW, + }; + + it("round-trips a minimal task", () => { + const parsed = TaskSchema.parse(validTask); + assert.equal(parsed.status, "open"); + assert.equal(parsed.priority, "p2"); + }); + + it("accepts estimate with confidence", () => { + const parsed = 
TaskSchema.parse({ + ...validTask, + estimate: { unit: "hours", value: 4, confidence: "med" }, + }); + assert.equal(parsed.estimate?.confidence, "med"); + }); + + it("rejects negative estimate", () => { + assert.throws(() => + TaskSchema.parse({ + ...validTask, + estimate: { unit: "hours", value: -1 }, + }) + ); + }); + + it("rejects unknown priority", () => { + assert.throws(() => TaskSchema.parse({ ...validTask, priority: "p4" })); + }); +}); + +describe("PipelineStateSchema", () => { + const validState = { + schema_version: SCHEMA_VERSION, + project_id: "demo", + phase: "intake" as const, + effective_gate_config: { + decisions_required_status: "accepted" as const, + review_required_phases: [], + review_required_per_decision: false, + max_task_estimate_hours: 16, + require_human_signoff_phases: ["handing-off"], + min_decisions: 0, + min_tasks: 3, + }, + next_decision_seq: 1, + next_task_seq: 1, + pending_questions: [], + gate_failures: [], + }; + + it("round-trips and defaults", () => { + const parsed = PipelineStateSchema.parse(validState); + assert.equal(parsed.phase, "intake"); + assert.equal(parsed.next_decision_seq, 1); + }); + + it("rejects non-semver schema_version", () => { + assert.throws(() => + PipelineStateSchema.parse({ ...validState, schema_version: "0.1" }) + ); + }); +}); + +describe("EventSchema", () => { + it("accepts a minimal event", () => { + const parsed = EventSchema.parse({ + at: NOW, + actor: "agent", + kind: "project_initialized", + }); + assert.equal(parsed.kind, "project_initialized"); + }); + + it("accepts a payload of arbitrary shape", () => { + const parsed = EventSchema.parse({ + at: NOW, + actor: "human", + kind: "decision_accepted", + entity_kind: "decision", + entity_id: "0001-x", + payload: { reason: "fine", nested: { key: "value" } }, + }); + assert.equal(parsed.payload?.["reason"], "fine"); + }); + + it("rejects unknown event kinds", () => { + assert.throws(() => + EventSchema.parse({ at: NOW, actor: "agent", kind: 
"totally_made_up" }) + ); + }); +}); diff --git a/server/tsup.config.ts b/server/tsup.config.ts index b32b759..ce9b473 100644 --- a/server/tsup.config.ts +++ b/server/tsup.config.ts @@ -1,7 +1,7 @@ import { defineConfig } from "tsup"; export default defineConfig({ - entry: ["src/index.ts"], + entry: ["src/index.ts", "src/cli.ts"], format: ["esm"], target: "node20", clean: true,