From 253e43e9a5d52ef9eeaa5270b5abc90029fab890 Mon Sep 17 00:00:00 2001 From: Michael Sitarzewski Date: Wed, 18 Feb 2026 00:17:02 -0600 Subject: [PATCH] =?UTF-8?q?Epistemic=20confidence=20Phase=20A=20=E2=80=94?= =?UTF-8?q?=20honest=20confidence=20scoring?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Confidence now reflects inherent uncertainty of the question domain, not just challenge quality. Rigor (renamed from old confidence) measures challenge genuineness [0.5–1.0]; confidence = min(domain_cap, rigor) where domain caps are factual=0.95, technical=0.90, creative=0.85, judgment=0.80, strategic=0.70. Adds calibration module (ECE metric), duh calibration CLI, GET /api/calibration endpoint, and calibration dashboard in web UI. Full-stack propagation of rigor field across ORM, handlers, CLI, API, WebSocket, MCP, and frontend (47 source files + 5 memory-bank files). 1586 Python + 126 Vitest = 1712 tests passing. Co-Authored-By: Claude Opus 4.6 --- memory-bank/activeContext.md | 148 ++--- memory-bank/decisions.md | 32 +- memory-bank/progress.md | 21 +- memory-bank/tasks/2026-02/README.md | 37 ++ memory-bank/toc.md | 4 +- src/duh/api/routes/ask.py | 15 +- src/duh/api/routes/crud.py | 68 ++ src/duh/api/routes/threads.py | 3 + src/duh/api/routes/ws.py | 5 +- src/duh/calibration.py | 150 +++++ src/duh/cli/app.py | 608 ++++++++++++++---- src/duh/cli/display.py | 18 +- src/duh/consensus/handlers.py | 40 +- src/duh/consensus/machine.py | 4 + src/duh/consensus/scheduler.py | 14 +- src/duh/consensus/synthesis.py | 7 + src/duh/consensus/voting.py | 2 + src/duh/mcp/server.py | 7 +- src/duh/memory/context.py | 2 +- src/duh/memory/migrations.py | 34 + src/duh/memory/models.py | 1 + src/duh/memory/repository.py | 2 + tests/integration/test_consensus_loop.py | 4 +- tests/sycophancy/test_confidence_impact.py | 28 +- tests/sycophancy/test_known_flaws.py | 4 +- tests/unit/test_api_crud.py | 91 +++ tests/unit/test_api_ws.py | 5 +- tests/unit/test_calibration.py | 162 +++++ tests/unit/test_cli.py | 10 +- tests/unit/test_cli_batch.py | 16 +- tests/unit/test_cli_calibration.py | 154 +++++ tests/unit/test_cli_decompose.py | 2 +- tests/unit/test_cli_display.py | 19 +- tests/unit/test_cli_tools.py | 4 +- tests/unit/test_cli_voting.py | 4 +- tests/unit/test_commit_handler.py | 34 +- tests/unit/test_confidence_scoring.py | 163 +++++ tests/unit/test_context_builder.py | 4 +- tests/unit/test_mcp_server.py | 4 +- tests/unit/test_scheduler.py | 3 +- web/src/App.tsx | 2 + .../__tests__/consensus-components.test.tsx | 7 + web/src/__tests__/stores.test.ts | 83 +++ web/src/api/client.ts | 16 + web/src/api/types.ts | 27 + .../calibration/CalibrationDashboard.tsx | 243 +++++++ web/src/components/calibration/index.ts | 1 + .../components/consensus/ConfidenceMeter.tsx | 8 +- .../consensus/ConsensusComplete.tsx | 13 +- .../components/consensus/ConsensusPanel.tsx | 4 +- .../decision-space/DecisionCloud.tsx | 1 + web/src/components/layout/Sidebar.tsx | 1 + web/src/components/shared/ExportMenu.tsx | 6 +- web/src/components/threads/ThreadDetail.tsx | 14 +- web/src/components/threads/TurnCard.tsx | 5 +- web/src/pages/CalibrationPage.tsx | 12 + web/src/pages/index.ts | 1 + web/src/stores/calibration.ts | 57 ++ web/src/stores/consensus.ts | 8 + web/src/stores/index.ts | 1 + web/tsconfig.tsbuildinfo | 2 +- 61 files changed, 2118 insertions(+), 327 deletions(-) create mode 100644 src/duh/calibration.py create mode 100644 src/duh/memory/migrations.py create mode 100644 tests/unit/test_calibration.py create mode 100644 tests/unit/test_cli_calibration.py create mode 100644 tests/unit/test_confidence_scoring.py create mode 100644 web/src/components/calibration/CalibrationDashboard.tsx create mode 100644 web/src/components/calibration/index.ts create mode 100644 web/src/pages/CalibrationPage.tsx create mode 100644 web/src/stores/calibration.ts diff --git a/memory-bank/activeContext.md b/memory-bank/activeContext.md index 8d150a2..040cda1 100644 --- a/memory-bank/activeContext.md +++ b/memory-bank/activeContext.md @@ -1,104 +1,68 @@ # Active Context -**Last Updated**: 2026-02-17 -**Current Phase**: v0.5 + Export Feature -**Next Action**: Merge v0.5.0 to main. Export to Markdown & PDF feature implemented. - -## Next Task: Model Selection Controls + Provider Updates - -### Context -Users can't control which models participate in consensus. `select_proposer()` picks highest `output_cost_per_mtok`, `select_challengers()` picks next-costliest. Problems: no user control (`ConsensusConfig.panel` exists but unused), Google catalog outdated, Perplexity should be challengers-only (search-grounded), Anthropic missing `claude-sonnet-4-6`. - -### Changes (6 steps) - -1. **Update provider model catalogs** - - `src/duh/providers/google.py:34-67` — Gemini 3 GA + early-access models (web search for latest) - - `src/duh/providers/anthropic.py:36-61` — Add `claude-sonnet-4-6` - - `src/duh/providers/perplexity.py:35-60` — Verify current model IDs/pricing - -2. **Add `proposer_eligible` flag to ModelInfo** - - `src/duh/providers/base.py:28-45` — Add `proposer_eligible: bool = True` - - `src/duh/providers/perplexity.py` — Set `proposer_eligible=False` (challengers only, user decision) - -3. **Wire `ConsensusConfig.panel` + update selection functions** - - `src/duh/consensus/handlers.py:185-202` (`select_proposer`) — Accept optional `panel`, filter to `proposer_eligible=True` - - `src/duh/consensus/handlers.py:322-356` (`select_challengers`) — Accept optional `panel` - - `src/duh/cli/app.py:236-246`, `src/duh/api/routes/ws.py:108,128`, `src/duh/api/routes/ask.py` — Pass panel - -4. **Add CLI flags**: `--proposer MODEL_REF`, `--challengers MODEL_REF,MODEL_REF`, `--panel MODEL_REF,...` - - `src/duh/cli/app.py` (ask command) - -5. **Add to REST API**: Optional `panel`, `proposer`, `challengers` fields in ask request body - - `src/duh/api/routes/ask.py` - -6. **Tests**: Update `test_propose_handler.py`, `test_challenge_handler.py` for panel filtering + proposer_eligible. Test CLI flags. Fix any tests with hardcoded model catalogs. - -7. **Documentation + CLI help** - - `docs/cli/ask.md` — Document `--proposer`, `--challengers`, `--panel` flags - - `docs/api-reference.md` — Document panel/proposer/challengers in `/api/ask` - - `docs/concepts/providers-and-models.md` — Update model lists, model selection explanation - - `docs/getting-started/configuration.md` — Document `[consensus] panel` config - - `docs/reference/config-reference.md` — Add panel, proposer_strategy fields - - `src/duh/cli/app.py` — Update Click help strings for new flags - - `docs/index.md` — Update feature list if needed - -### Current model cost ranking (for reference) -| Model | output_cost | Provider | -|-------|------------|----------| -| Opus 4.6 | $25.00 | anthropic | -| Sonar Pro | $15.00 | perplexity | -| Sonnet 4.5 | $15.00 | anthropic | -| GPT-5.2 | $14.00 | openai | -| Gemini 3 Pro | $12.00 | google | -| Gemini 2.5 Pro | $10.00 | google | -| Mistral Medium | $8.10 | mistral | -| o3 | $8.00 | openai | -| Sonar Deep Research | $8.00 | perplexity | -| Mistral Large | $6.00 | mistral | -| Haiku 4.5 | $5.00 | anthropic | +**Last Updated**: 2026-02-18 +**Current Phase**: Epistemic Confidence (Phase A) — on branch `epistemic-confidence-phase-a` +**Next Action**: Commit, push, create PR to merge to main. + +## What Just Shipped: Epistemic Confidence Phase A + +### Core Change +Confidence scoring is now **epistemic** — it reflects inherent uncertainty of the question domain, not just challenge quality. + +**Before**: `confidence = _compute_confidence(challenges)` — measured rigor only (0.5–1.0 based on sycophancy ratio). +**After**: Two separate scores: +- **Rigor** (renamed from old confidence) — how genuine the challenges were (0.5–1.0) +- **Confidence** — `min(domain_cap(intent), rigor)` — rigor clamped by question type ceiling + +### Domain Caps +| Intent | Cap | Rationale | +|--------|-----|-----------| +| factual | 0.95 | Verifiable answers, near-certain | +| technical | 0.90 | Strong consensus possible | +| creative | 0.85 | Subjective, multiple valid answers | +| judgment | 0.80 | Requires weighing trade-offs | +| strategic | 0.70 | Inherent future uncertainty | +| unknown/None | 0.85 | Default conservative cap | + +### Files Changed (47 files, +997, -230) +**New files:** +- `src/duh/calibration.py` — ECE (Expected Calibration Error) computation +- `src/duh/memory/migrations.py` — SQLite schema migration (adds rigor column) +- `tests/unit/test_calibration.py` — 15 calibration tests +- `tests/unit/test_confidence_scoring.py` — 20 epistemic confidence tests +- `tests/unit/test_cli_calibration.py` — 4 CLI calibration tests +- `web/src/components/calibration/CalibrationDashboard.tsx` — Calibration viz +- `web/src/pages/CalibrationPage.tsx` — Calibration page +- `web/src/stores/calibration.ts` — Calibration Zustand store + +**Modified across full stack:** +- `consensus/handlers.py` — Renamed `_compute_confidence` → `_compute_rigor`, added `_domain_cap()`, `DOMAIN_CAPS`, epistemic formula +- `consensus/machine.py` — Added `rigor` to ConsensusContext, RoundResult +- `consensus/scheduler.py` — Propagates rigor through subtask results +- `consensus/synthesis.py` — Averages rigor across subtask results +- `consensus/voting.py` — Added rigor to VoteResult, VotingAggregation +- `memory/models.py` — Added `rigor` column to Decision ORM +- `memory/repository.py` — Accepts `rigor` param in `save_decision()` +- `memory/context.py` — Shows rigor in context builder output +- `cli/app.py` — All output paths show rigor; new `duh calibration` command; PDF export enhanced +- `cli/display.py` — `show_commit()` and `show_final_decision()` show rigor +- `api/routes/crud.py` — `GET /api/calibration` endpoint; rigor in decision space +- `api/routes/ask.py`, `ws.py`, `threads.py` — Propagate rigor +- `mcp/server.py` — Propagates rigor +- Frontend: ConfidenceMeter, ConsensusComplete, ConsensusPanel, ThreadDetail, TurnCard, ExportMenu, Sidebar, DecisionCloud, stores updated --- ## Current State -- **v0.5 + Export feature on branch `v0.5.0`.** All v0.5 tasks done + export feature added. -- **6 providers shipping**: Anthropic (3 models), OpenAI (3 models), Google (4 models), Mistral (4 models), Perplexity (3 models) — 17 total. -- **1539 Python unit/load tests + 122 Vitest tests** (1661 total), ruff clean. -- **~60 Python source files + 67 frontend source files** (~127 total). -- REST API, WebSocket streaming, MCP server, Python client library, web UI all built. -- Multi-user auth (JWT + RBAC), PostgreSQL support, Prometheus metrics, backup/restore, Playwright E2E. -- CLI commands: `duh ask`, `duh recall`, `duh threads`, `duh show`, `duh models`, `duh cost`, `duh serve`, `duh mcp`, `duh batch`, `duh export`, `duh feedback`, `duh backup`, `duh restore`, `duh user-create`, `duh user-list`. -- Export: `duh export --format pdf/markdown --content full/decision --no-dissent -o file` -- Docs: production-deployment.md, monitoring.md, authentication.md added. -- MkDocs docs site: https://msitarzewski.github.io/duh/ -- GitHub repo: https://github.com/msitarzewski/duh +- **Branch `epistemic-confidence-phase-a`** — all changes uncommitted, ready to commit. +- **1586 Python tests + 126 Vitest tests** (1712 total), ruff clean, mypy strict clean. +- **~62 Python source files + 70 frontend source files** (~132 total). +- All previous features intact (v0.1–v0.5 + export). -## v0.5 Delivered - -**Theme**: Production hardening, multi-user, enterprise readiness. -**18 tasks across 7 phases** — all complete. - -### What Shipped -- User accounts + JWT auth + RBAC (admin/contributor/viewer) — `api/auth.py`, `api/rbac.py`, `models.py:User` -- PostgreSQL support (asyncpg) with connection pooling (`pool_pre_ping`, compound indexes) -- Perplexity provider adapter (6th provider, search-grounded) — `providers/perplexity.py` -- Prometheus metrics (`/api/metrics`) + extended health checks (`/api/health/detailed`) -- Backup/restore CLI (`duh backup`, `duh restore`) with SQLite copy + JSON export/import -- Playwright E2E browser tests (`web/e2e/`) -- Per-user + per-provider rate limiting (middleware keys by user_id > api_key > IP) -- Production deployment documentation (3 new guides) -- 26 multi-user integration tests + 12 load tests (latency, concurrency, rate limiting) -- Alembic migration `005_v05_users.py` (users table, user_id FKs on threads/decisions/api_keys) +## Next Task: Model Selection Controls + Provider Updates -### New Source Files (v0.5) -- `src/duh/api/auth.py` — JWT authentication endpoints -- `src/duh/api/rbac.py` — Role-based access control -- `src/duh/api/metrics.py` — Prometheus metrics endpoint -- `src/duh/api/health.py` — Extended health checks -- `src/duh/memory/backup.py` — Backup/restore utilities -- `src/duh/providers/perplexity.py` — Perplexity provider adapter -- `alembic/versions/005_v05_users.py` — User migration -- `docs/guides/production-deployment.md`, `authentication.md`, `monitoring.md` +Deferred from before Phase A. See `progress.md` for details. ## Open Questions (Still Unresolved) diff --git a/memory-bank/decisions.md b/memory-bank/decisions.md index 9d49d9d..b2eaef2 100644 --- a/memory-bank/decisions.md +++ b/memory-bank/decisions.md @@ -1,6 +1,6 @@ # Architectural Decisions -**Last Updated**: 2026-02-17 +**Last Updated**: 2026-02-18 --- @@ -324,3 +324,33 @@ - Remove `create_all` entirely — breaks in-memory test fixtures that don't run alembic **Consequences**: Tests continue to work (in-memory SQLite still uses `create_all`). Production databases must run `alembic upgrade head` after code updates. This was already the expected workflow but is now enforced. **References**: `src/duh/cli/app.py:101-104` + +--- + +## 2026-02-18: Epistemic Confidence — Separate Rigor from Confidence + +**Status**: Approved +**Context**: The original `_compute_confidence()` in `handlers.py` measured challenge quality (ratio of genuine vs sycophantic challenges), producing a score in [0.5, 1.0]. This was misleading: a factual question ("What is the capital of France?") and a strategic question ("Will AI replace software engineers by 2035?") could both score 1.0 confidence if all challenges were genuine. But inherently uncertain questions should never report near-certain confidence. +**Decision**: Split into two metrics: +- **Rigor** (renamed from old confidence): measures challenge quality, [0.5, 1.0] +- **Confidence** (epistemic): `min(domain_cap(intent), rigor)` — rigor clamped by a per-domain ceiling based on question intent (factual=0.95, technical=0.90, creative=0.85, judgment=0.80, strategic=0.70, default=0.85). +**Alternatives**: +- Single blended score (simpler, but hides the two distinct signals) +- User-configurable caps (more flexible, but adds UX complexity without clear benefit) +- LLM-estimated confidence (model judges own uncertainty — unreliable, circular) +**Consequences**: Confidence scores are now more honest. Strategic questions max out at 70% even with perfect rigor. Rigor is preserved as a separate signal for calibration analysis. Requires `rigor` column added to Decision model. Full-stack change: ORM, handlers, CLI, API, WebSocket, MCP, frontend all updated. +**References**: `src/duh/consensus/handlers.py:641-670`, `src/duh/calibration.py` + +--- + +## 2026-02-18: Lightweight SQLite Migrations (Not Alembic) + +**Status**: Approved +**Context**: Adding the `rigor` column to the `decisions` table requires a migration for existing file-based SQLite databases. Alembic handles PostgreSQL migrations, but for SQLite (the default local dev DB), running `alembic upgrade head` is a friction point for casual users. +**Decision**: Created `src/duh/memory/migrations.py` with `ensure_schema()` that runs on startup for file-based SQLite only. Uses `PRAGMA table_info()` to detect missing columns and `ALTER TABLE` to add them. In-memory SQLite uses `create_all` (unchanged). PostgreSQL uses Alembic (unchanged). +**Alternatives**: +- Alembic-only (requires users to run migration command) +- create_all for all databases (can't alter existing tables) +- Manual migration instructions in docs (user friction) +**Consequences**: File-based SQLite databases auto-migrate on startup. Zero friction for local users. PostgreSQL still requires `alembic upgrade head`. Lightweight and self-contained. +**References**: `src/duh/memory/migrations.py`, `src/duh/cli/app.py:107-110` diff --git a/memory-bank/progress.md b/memory-bank/progress.md index 7aa3754..5d6b98a 100644 --- a/memory-bank/progress.md +++ b/memory-bank/progress.md @@ -1,10 +1,26 @@ # Progress -**Last Updated**: 2026-02-17 +**Last Updated**: 2026-02-18 --- -## Current State: v0.5 COMPLETE — Production Hardening & Multi-User +## Current State: Epistemic Confidence Phase A COMPLETE + +### Epistemic Confidence Phase A + +- **Renamed `_compute_confidence` → `_compute_rigor`** — old "confidence" measured challenge quality, now called "rigor" +- **Added `rigor` field** to Decision ORM model, ConsensusContext, RoundResult, SubtaskResult, VoteResult, VotingAggregation, SynthesisResult +- **Domain caps** — confidence capped by question intent: factual (0.95), technical (0.90), creative (0.85), judgment (0.80), strategic (0.70), default (0.85) +- **Epistemic formula**: `confidence = min(domain_cap(intent), rigor)` — rigor clamped by domain ceiling +- **Calibration module** — `src/duh/calibration.py` computes ECE (Expected Calibration Error) from decisions with outcomes +- **`duh calibration` CLI command** — shows calibration analysis with bucket breakdown +- **`GET /api/calibration` endpoint** — serves calibration data with category/date filters +- **Calibration frontend** — CalibrationDashboard, CalibrationPage, calibration Zustand store +- **SQLite migration** — `src/duh/memory/migrations.py` adds rigor column on startup for file-based SQLite +- **Full-stack propagation** — rigor shown in CLI, API, WebSocket, MCP, frontend across all views +- **Enhanced PDF export** — research-paper quality: header/footer, TOC, provider callouts, confidence meter, Unicode TTF +- 1586 Python tests + 126 Vitest tests (1712 total), ruff clean, mypy strict clean +- New files: calibration.py, migrations.py, test_calibration.py, test_confidence_scoring.py, test_cli_calibration.py, CalibrationDashboard.tsx, CalibrationPage.tsx, calibration.ts ### v0.5 Additions @@ -151,3 +167,4 @@ Phase 0 benchmark framework — fully functional, pilot-tested on 5 questions. | 2026-02-17 | v0.5 T14-T18 (Phase 7: Ship) — multi-user integration tests, load tests, docs, migration finalized, version bump | Done | | 2026-02-17 | v0.5.0 — "It Scales" | **Complete** | | 2026-02-17 | Export to Markdown & PDF (CLI + API + Web UI) | Done | +| 2026-02-18 | Epistemic Confidence Phase A (rigor + domain caps + calibration) | Done | diff --git a/memory-bank/tasks/2026-02/README.md b/memory-bank/tasks/2026-02/README.md index 151d440..9850c2a 100644 --- a/memory-bank/tasks/2026-02/README.md +++ b/memory-bank/tasks/2026-02/README.md @@ -462,3 +462,40 @@ - Manual override classes: `.theme-dark` / `.theme-light` on any ancestor element - Light mode code block overrides in `animations.css` - Variables: backgrounds (5), text (3), primary accent, semantic colors (3), borders (3), glass (2), layout (3), typography (1) + +--- + +## Epistemic Confidence Phase A — "Honest Confidence" + +### 2026-02-18: Epistemic Confidence Scoring +- Renamed `_compute_confidence()` → `_compute_rigor()` — old metric now properly named +- Added `DOMAIN_CAPS` dict and `_domain_cap(intent)` lookup +- New formula: `confidence = min(domain_cap(intent), rigor)` +- Domain caps: factual (0.95), technical (0.90), creative (0.85), judgment (0.80), strategic (0.70), default (0.85) +- `handle_commit()` now always attempts taxonomy classification to get intent for capping +- Files: `src/duh/consensus/handlers.py` + +### 2026-02-18: Rigor Field Propagation (Full Stack) +- Added `rigor: float` to Decision ORM, ConsensusContext, RoundResult, SubtaskResult, VoteResult, VotingAggregation, SynthesisResult +- Updated save_decision(), scheduler, synthesis, voting to propagate rigor +- Updated all CLI outputs (ask, recall, show, export JSON/markdown/PDF) +- Updated API responses (crud, ask, ws, threads) and MCP server +- Updated display (show_commit, show_final_decision) +- Updated context builder to show rigor alongside confidence +- Frontend: ConfidenceMeter, ConsensusComplete, ConsensusPanel, ThreadDetail, TurnCard, ExportMenu, DecisionCloud, stores +- Files: 47 files changed, +997 insertions, -230 deletions + +### 2026-02-18: SQLite Schema Migration +- Created `src/duh/memory/migrations.py` — `ensure_schema()` adds rigor column on startup +- Runs for file-based SQLite only (PRAGMA table_info check → ALTER TABLE) +- In-memory SQLite: create_all handles it. PostgreSQL: Alembic handles it. +- Wired into `_create_db()` in `cli/app.py` + +### 2026-02-18: Calibration Module + CLI + API + Frontend +- Created `src/duh/calibration.py` — `compute_calibration()` buckets decisions by confidence, computes ECE +- `CalibrationBucket` and `CalibrationResult` dataclasses +- `duh calibration [--category CAT]` CLI command +- `GET /api/calibration` endpoint with category/since/until filters +- Frontend: CalibrationDashboard (metric cards + bar chart + bucket table), CalibrationPage, calibration Zustand store +- Tests: 15 calibration tests, 20 confidence scoring tests, 4 CLI calibration tests +- **Total: 1586 Python + 126 Vitest = 1712 tests** diff --git a/memory-bank/toc.md b/memory-bank/toc.md index 7a9f592..cc4a0b7 100644 --- a/memory-bank/toc.md +++ b/memory-bank/toc.md @@ -3,8 +3,8 @@ ## Core Files - [projectbrief.md](./projectbrief.md) — Vision, tenets, architecture, build sequence - [techContext.md](./techContext.md) — Tech stack decisions with rationale (Python, Docker, SQLAlchemy, frontend, tools, etc.) -- [decisions.md](./decisions.md) — Architectural decisions with context, alternatives, and consequences (18 ADRs) -- [activeContext.md](./activeContext.md) — Current state, v0.5 complete, ready to merge to main +- [decisions.md](./decisions.md) — Architectural decisions with context, alternatives, and consequences (20 ADRs) +- [activeContext.md](./activeContext.md) — Current state, epistemic confidence Phase A complete - [progress.md](./progress.md) — Milestone tracking, what's built, what's next - [competitive-landscape.md](./competitive-landscape.md) — Research on existing tools, frameworks, and academic work - [quick-start.md](./quick-start.md) — Session entry point, v0.5 complete, key file references diff --git a/src/duh/api/routes/ask.py b/src/duh/api/routes/ask.py index 0be2ba6..3d926bf 100644 --- a/src/duh/api/routes/ask.py +++ b/src/duh/api/routes/ask.py @@ -29,6 +29,7 @@ class AskRequest(BaseModel): class AskResponse(BaseModel): decision: str confidence: float + rigor: float = 0.0 dissent: str | None = None cost: float thread_id: str | None = None @@ -82,7 +83,7 @@ async def _handle_consensus( # type: ignore[no-untyped-def] """Run the consensus protocol.""" from duh.cli.app import _run_consensus - decision, confidence, dissent, cost = await _run_consensus( + decision, confidence, rigor, dissent, cost = await _run_consensus( body.question, config, pm, @@ -95,7 +96,7 @@ async def _handle_consensus( # type: ignore[no-untyped-def] if db_factory is not None: try: thread_id = await _persist_result( - db_factory, body.question, decision, confidence, dissent + db_factory, body.question, decision, confidence, dissent, rigor=rigor ) except Exception: logger.exception("Failed to persist consensus thread") @@ -103,6 +104,7 @@ async def _handle_consensus( # type: ignore[no-untyped-def] return AskResponse( decision=decision, confidence=confidence, + rigor=rigor, dissent=dissent, cost=cost, thread_id=thread_id, @@ -118,6 +120,7 @@ async def _handle_voting(body: AskRequest, config, pm) -> AskResponse: # type: return AskResponse( decision=result.decision, confidence=result.confidence, + rigor=result.rigor, cost=pm.total_cost, protocol_used="voting", ) @@ -150,12 +153,13 @@ async def _handle_decompose(body: AskRequest, config, pm) -> AskResponse: # typ if len(subtask_specs) == 1: from duh.cli.app import _run_consensus - decision, confidence, dissent, cost = await _run_consensus( + decision, confidence, rigor, dissent, cost = await _run_consensus( body.question, config, pm ) return AskResponse( decision=decision, confidence=confidence, + rigor=rigor, dissent=dissent, cost=cost, protocol_used="decompose", @@ -168,6 +172,7 @@ async def _handle_decompose(body: AskRequest, config, pm) -> AskResponse: # typ return AskResponse( decision=synthesis_result.content, confidence=synthesis_result.confidence, + rigor=synthesis_result.rigor, cost=pm.total_cost, protocol_used="decompose", ) @@ -179,6 +184,8 @@ async def _persist_result( decision: str, confidence: float, dissent: str | None, + *, + rigor: float = 0.0, ) -> str: """Persist a consensus result to the database. @@ -192,7 +199,7 @@ async def _persist_result( thread.status = "complete" turn = await repo.create_turn(thread.id, 1, "COMMIT") await repo.save_decision( - turn.id, thread.id, decision, confidence, dissent=dissent + turn.id, thread.id, decision, confidence, rigor=rigor, dissent=dissent ) await session.commit() return str(thread.id) diff --git a/src/duh/api/routes/crud.py b/src/duh/api/routes/crud.py index 8bb8c14..f2254a0 100644 --- a/src/duh/api/routes/crud.py +++ b/src/duh/api/routes/crud.py @@ -196,6 +196,72 @@ async def cost(request: Request) -> CostResponse: ) +# -- GET /api/calibration --------------------------------------------------- + + +class CalibrationBucketResponse(BaseModel): + range_lo: float + range_hi: float + count: int + with_outcomes: int + success: int + failure: int + partial: int + accuracy: float + mean_confidence: float + + +class CalibrationResponse(BaseModel): + buckets: list[CalibrationBucketResponse] + total_decisions: int + total_with_outcomes: int + overall_accuracy: float + ece: float + + +@router.get("/calibration", response_model=CalibrationResponse) +async def calibration( + request: Request, + category: str | None = None, + since: str | None = None, + until: str | None = None, +) -> CalibrationResponse: + """Confidence calibration analysis.""" + from duh.calibration import compute_calibration + from duh.memory.repository import MemoryRepository + + db_factory = request.app.state.db_factory + async with db_factory() as session: + repo = MemoryRepository(session) + decisions = await repo.get_all_decisions_for_space( + category=category, + since=since, + until=until, + ) + + result = compute_calibration(decisions) + return CalibrationResponse( + buckets=[ + CalibrationBucketResponse( + range_lo=b.range_lo, + range_hi=b.range_hi, + count=b.count, + with_outcomes=b.with_outcomes, + success=b.success, + failure=b.failure, + partial=b.partial, + accuracy=b.accuracy, + mean_confidence=b.mean_confidence, + ) + for b in result.buckets + ], + total_decisions=result.total_decisions, + total_with_outcomes=result.total_with_outcomes, + overall_accuracy=result.overall_accuracy, + ece=result.ece, + ) + + # -- GET /api/decisions/space ----------------------------------------------- @@ -204,6 +270,7 @@ class SpaceDecisionResponse(BaseModel): thread_id: str question: str confidence: float + rigor: float = 0.0 intent: str | None = None category: str | None = None genus: str | None = None @@ -270,6 +337,7 @@ async def decision_space( thread_id=d.thread_id, question=question, confidence=d.confidence, + rigor=d.rigor, intent=d.intent, category=d.category, genus=d.genus, diff --git a/src/duh/api/routes/threads.py b/src/duh/api/routes/threads.py index 9b4f0a9..7222b4d 100644 --- a/src/duh/api/routes/threads.py +++ b/src/duh/api/routes/threads.py @@ -23,6 +23,7 @@ class ContributionResponse(BaseModel): class DecisionResponse(BaseModel): content: str confidence: float + rigor: float = 0.0 dissent: str | None = None @@ -125,6 +126,7 @@ async def get_thread(thread_id: str, request: Request) -> ThreadDetailResponse: dec = DecisionResponse( content=turn.decision.content, confidence=turn.decision.confidence, + rigor=turn.decision.rigor, dissent=turn.decision.dissent, ) turns.append( @@ -179,6 +181,7 @@ async def get_shared_thread(share_token: str, request: Request) -> ThreadDetailR dec = DecisionResponse( content=turn.decision.content, confidence=turn.decision.confidence, + rigor=turn.decision.rigor, dissent=turn.decision.dissent, ) turns.append( diff --git a/src/duh/api/routes/ws.py b/src/duh/api/routes/ws.py index e9d1bda..36ff8ff 100644 --- a/src/duh/api/routes/ws.py +++ b/src/duh/api/routes/ws.py @@ -186,11 +186,12 @@ async def _stream_consensus( # COMMIT sm.transition(ConsensusState.COMMIT) - await handle_commit(ctx) + await handle_commit(ctx, pm) await ws.send_json( { "type": "commit", "confidence": ctx.confidence, + "rigor": ctx.rigor, "dissent": ctx.dissent, "round": ctx.current_round, } @@ -217,6 +218,7 @@ async def _stream_consensus( "type": "complete", "decision": ctx.decision or "", "confidence": ctx.confidence, + "rigor": ctx.rigor, "dissent": ctx.dissent, "cost": pm.total_cost, "thread_id": thread_id, @@ -258,6 +260,7 @@ async def _persist_consensus( thread.id, rr.decision, rr.confidence, + rigor=rr.rigor, dissent=rr.dissent, ) diff --git a/src/duh/calibration.py b/src/duh/calibration.py new file mode 100644 index 0000000..542d6da --- /dev/null +++ b/src/duh/calibration.py @@ -0,0 +1,150 @@ +"""Confidence calibration analysis. + +Computes calibration metrics for decisions with tracked outcomes. +Buckets decisions by confidence range and compares predicted +confidence against actual accuracy (ECE metric). +""" + +from __future__ import annotations + +from dataclasses import dataclass, field +from typing import TYPE_CHECKING + +if TYPE_CHECKING: + from collections.abc import Sequence + + from duh.memory.models import Decision + + +@dataclass(frozen=True) +class CalibrationBucket: + """One confidence range bucket with accuracy stats.""" + + range_lo: float + range_hi: float + count: int + with_outcomes: int + success: int + failure: int + partial: int + accuracy: float + mean_confidence: float + + +@dataclass(frozen=True) +class CalibrationResult: + """Full calibration analysis result.""" + + buckets: list[CalibrationBucket] = field(default_factory=list) + total_decisions: int = 0 + total_with_outcomes: int = 0 + overall_accuracy: float = 0.0 + ece: float = 0.0 + + +def compute_calibration( + decisions: Sequence[Decision], + *, + n_buckets: int = 10, +) -> CalibrationResult: + """Compute calibration metrics from decisions with outcomes. + + Buckets decisions by confidence into ``n_buckets`` equal-width bins. + For each bucket, accuracy = (success + 0.5 * partial) / with_outcomes. + ECE is the weighted average of |accuracy - mean_confidence| across + non-empty buckets. + + Args: + decisions: Sequence of Decision model instances (with .outcome loaded). + n_buckets: Number of equal-width confidence bins (default 10). + + Returns: + CalibrationResult with per-bucket stats and overall ECE. + """ + if n_buckets < 1: + n_buckets = 1 + + # Initialize per-bucket accumulators + bucket_counts = [0] * n_buckets + bucket_with_outcomes = [0] * n_buckets + bucket_success = [0] * n_buckets + bucket_failure = [0] * n_buckets + bucket_partial = [0] * n_buckets + bucket_conf_sum = [0.0] * n_buckets + + total = len(decisions) + + for d in decisions: + # Determine bucket index from confidence + idx = int(d.confidence * n_buckets) + if idx >= n_buckets: + idx = n_buckets - 1 + if idx < 0: + idx = 0 + + bucket_counts[idx] += 1 + bucket_conf_sum[idx] += d.confidence + + if d.outcome is not None: + result = d.outcome.result + bucket_with_outcomes[idx] += 1 + if result == "success": + bucket_success[idx] += 1 + elif result == "failure": + bucket_failure[idx] += 1 + elif result == "partial": + bucket_partial[idx] += 1 + + # Build bucket objects + width = 1.0 / n_buckets + buckets: list[CalibrationBucket] = [] + total_with_outcomes = 0 + total_accuracy_sum = 0.0 + ece_sum = 0.0 + ece_weight_sum = 0 + + for i in range(n_buckets): + lo = round(i * width, 10) + hi = round((i + 1) * width, 10) + count = bucket_counts[i] + with_out = bucket_with_outcomes[i] + s = bucket_success[i] + f = bucket_failure[i] + p = bucket_partial[i] + + mean_conf = bucket_conf_sum[i] / count if count > 0 else (lo + hi) / 2 + accuracy = (s + 0.5 * p) / with_out if with_out > 0 else 0.0 + + buckets.append( + CalibrationBucket( + range_lo=lo, + range_hi=hi, + count=count, + with_outcomes=with_out, + success=s, + failure=f, + partial=p, + accuracy=accuracy, + mean_confidence=mean_conf, + ) + ) + + total_with_outcomes += with_out + total_accuracy_sum += s + 0.5 * p + + if with_out > 0: + ece_sum += with_out * abs(accuracy - mean_conf) + ece_weight_sum += with_out + + overall_accuracy = ( + total_accuracy_sum / total_with_outcomes if total_with_outcomes > 0 else 0.0 + ) + ece = ece_sum / ece_weight_sum if ece_weight_sum > 0 else 0.0 + + return CalibrationResult( + buckets=buckets, + total_decisions=total, + total_with_outcomes=total_with_outcomes, + overall_accuracy=overall_accuracy, + ece=ece, + ) diff --git a/src/duh/cli/app.py b/src/duh/cli/app.py index 3a85774..56ddc91 100644 --- a/src/duh/cli/app.py +++ b/src/duh/cli/app.py @@ -10,6 +10,7 @@ import json as json_mod import sys import time +from datetime import UTC from pathlib import Path from typing import TYPE_CHECKING @@ -104,6 +105,10 @@ def _enable_fks(dbapi_conn, connection_record): # type: ignore[no-untyped-def] if is_memory: async with engine.begin() as conn: await conn.run_sync(Base.metadata.create_all) + elif url.startswith("sqlite"): + from duh.memory.migrations import ensure_schema + + await ensure_schema(engine) factory = async_sessionmaker(engine, expire_on_commit=False) return factory, engine @@ -204,10 +209,10 @@ async def _run_consensus( panel: list[str] | None = None, proposer_override: str | None = None, challengers_override: list[str] | None = None, -) -> tuple[str, float, str | None, float]: +) -> tuple[str, float, float, str | None, float]: """Run the full consensus loop. - Returns (decision, confidence, dissent, total_cost). + Returns (decision, confidence, rigor, dissent, total_cost). """ from duh.consensus.convergence import check_convergence from duh.consensus.handlers import ( @@ -275,9 +280,9 @@ async def _run_consensus( # COMMIT sm.transition(ConsensusState.COMMIT) - await handle_commit(ctx) + await handle_commit(ctx, pm) if display: - display.show_commit(ctx.confidence, ctx.dissent) + display.show_commit(ctx.confidence, ctx.rigor, ctx.dissent) display.round_footer( ctx.current_round, config.general.max_rounds, @@ -303,6 +308,7 @@ async def _run_consensus( return ( ctx.decision or "", ctx.confidence, + ctx.rigor, ctx.dissent, pm.total_cost, ) @@ -442,12 +448,12 @@ def ask( _error(str(e)) return # unreachable - decision, confidence, dissent, cost = result + decision, confidence, rigor, dissent, cost = result from duh.cli.display import ConsensusDisplay display = ConsensusDisplay() - display.show_final_decision(decision, confidence, cost, dissent) + display.show_final_decision(decision, confidence, rigor, cost, dissent) async def _ask_async( @@ -457,7 +463,7 @@ async def _ask_async( panel: list[str] | None = None, proposer_override: str | None = None, challengers_override: list[str] | None = None, -) -> tuple[str, float, str | None, float]: +) -> tuple[str, float, float, str | None, float]: """Async implementation for the ask command.""" from duh.cli.display import ConsensusDisplay @@ -528,6 +534,7 @@ async def _ask_voting_async( thread.id, result.decision, result.confidence, + rigor=result.rigor, ) await session.commit() await engine.dispose() @@ -563,10 +570,10 @@ async def _ask_auto_async( display = ConsensusDisplay() display.start() - decision, confidence, dissent, cost = await _run_consensus( + decision, confidence, rigor, dissent, cost = await _run_consensus( question, config, pm, display=display ) - display.show_final_decision(decision, confidence, cost, dissent) + display.show_final_decision(decision, confidence, rigor, cost, dissent) async def _ask_decompose_async( @@ -639,8 +646,8 @@ async def _ask_decompose_async( # Single-subtask optimization: skip synthesis if len(subtask_specs) == 1: result = await _run_consensus(question, config, pm, display=display) - decision, confidence, dissent, cost = result - display.show_final_decision(decision, confidence, cost, dissent) + decision, confidence, rigor, dissent, cost = result + display.show_final_decision(decision, confidence, rigor, cost, dissent) await engine.dispose() return @@ -660,6 +667,7 @@ async def _ask_decompose_async( display.show_final_decision( synthesis_result.content, synthesis_result.confidence, + synthesis_result.rigor, pm.total_cost, None, ) @@ -711,7 +719,9 @@ async def _recall_async(config: DuhConfig, query: str, limit: int) -> None: latest = thread.decisions[-1] snippet = latest.content[:120].replace("\n", " ") click.echo(f" Decision: {snippet}...") - click.echo(f" Confidence: {latest.confidence:.0%}") + click.echo( + f" Confidence: {latest.confidence:.0%} Rigor: {latest.rigor:.0%}" + ) click.echo() @@ -830,7 +840,10 @@ async def _show_async(config: DuhConfig, thread_id: str) -> None: click.echo(f" {contrib.content}") click.echo() if turn.decision: - click.echo(f" Decision (confidence {turn.decision.confidence:.0%}):") + click.echo( + f" Decision (confidence {turn.decision.confidence:.0%}," + f" rigor {turn.decision.rigor:.0%}):" + ) click.echo(f" {turn.decision.content}") if turn.decision.dissent: click.echo(f" Dissent: {turn.decision.dissent}") @@ -1086,6 +1099,7 @@ def _format_thread_json( decision_data = { "content": turn.decision.content, "confidence": turn.decision.confidence, + "rigor": turn.decision.rigor, "dissent": turn.decision.dissent, } @@ -1153,7 +1167,8 @@ def _format_thread_markdown( lines.append(final_decision.content) lines.append("") conf_pct = f"{final_decision.confidence:.0%}" - lines.append(f"Confidence: {conf_pct}") + rigor_pct = f"{final_decision.rigor:.0%}" + lines.append(f"Confidence: {conf_pct} Rigor: {rigor_pct}") lines.append("") if include_dissent and final_decision.dissent: @@ -1220,19 +1235,30 @@ def _format_thread_pdf( content: str = "full", include_dissent: bool = True, ) -> bytes: - """Format a thread as PDF for export. + """Format a thread as a research-paper quality PDF. - Args: - content: "full" for complete report, "decision" for decision only. - include_dissent: Whether to include the dissent section. + Features: repeating header/footer, TOC with bookmarks, provider-colored + callout boxes, confidence meter, and full Unicode via TTF fonts (with + graceful fallback to core Helvetica). """ import html as html_mod import re + from datetime import datetime from fpdf import FPDF # type: ignore[import-untyped] total_cost = sum(c.cost_usd for turn in thread.turns for c in turn.contributions) + total_input = sum( + c.input_tokens for turn in thread.turns for c in turn.contributions + ) + total_output = sum( + c.output_tokens for turn in thread.turns for c in turn.contributions + ) created = thread.created_at.strftime("%Y-%m-%d") + exported = datetime.now(tz=UTC).strftime("%Y-%m-%d") + model_refs = sorted( + {c.model_ref for turn in thread.turns for c in turn.contributions} + ) final_decision = None for turn in reversed(thread.turns): @@ -1240,23 +1266,93 @@ def _format_thread_pdf( final_decision = turn.decision break - def _pdf_safe(text: str) -> str: - """Replace Unicode chars unsupported by core PDF fonts.""" - for char, repl in ( - ("\u2014", "--"), - ("\u2013", "-"), - ("\u2018", "'"), - ("\u2019", "'"), - ("\u201c", '"'), - ("\u201d", '"'), - ("\u2026", "..."), - ("\u2022", "*"), - ("\u00a0", " "), - ("\u2192", "->"), - ("\u2190", "<-"), - ): - text = text.replace(char, repl) - return text.encode("latin-1", errors="replace").decode("latin-1") + # ── Provider color map ────────────────────────────────────── + provider_colors: dict[str, tuple[int, int, int]] = { + "anthropic": (204, 107, 43), + "openai": (16, 163, 127), + "google": (66, 133, 244), + "mistral": (131, 56, 236), + "perplexity": (0, 160, 160), + } + default_color = (120, 120, 120) + + def _provider_color(model_ref: str) -> tuple[int, int, int]: + provider = model_ref.split(":")[0].lower() if ":" in model_ref else "" + return provider_colors.get(provider, default_color) + + # ── PDF subclass with header/footer ───────────────────────── + + class ConsensusReport(FPDF): # type: ignore[misc] + """FPDF subclass with repeating header and footer.""" + + def __init__(self) -> None: + super().__init__() + self._use_ttf = False + self._font_family = "Helvetica" + self._mono_family = "Courier" + + def _setup_fonts(self) -> None: + """Try to load a TTF font for Unicode support.""" + import os + + search_paths = [ + "/System/Library/Fonts/Helvetica.ttc", + "/System/Library/Fonts/HelveticaNeue.ttc", + "/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf", + "/usr/share/fonts/TTF/DejaVuSans.ttf", + ] + for path in search_paths: + if os.path.isfile(path): + try: + self.add_font("DuhSans", "", path) + self.add_font("DuhSans", "B", path) + self.add_font("DuhSans", "I", path) + self._use_ttf = True + self._font_family = "DuhSans" + break + except Exception: + continue + + def header(self) -> None: + self.set_font(self._font_family, "", 8) + self.set_text_color(160, 160, 160) + self.cell(0, 5, "duh consensus report", align="L") + self.cell(0, 5, exported, align="R", new_x="LMARGIN", new_y="NEXT") + self.set_draw_color(220, 220, 220) + self.line(10, self.get_y(), 200, self.get_y()) + self.ln(4) + + def footer(self) -> None: + self.set_y(-15) + self.set_font(self._font_family, "", 8) + self.set_text_color(160, 160, 160) + self.set_draw_color(220, 220, 220) + self.line(10, self.get_y(), 200, self.get_y()) + self.ln(2) + self.cell(0, 5, f"Page {self.page_no()}/{{nb}}", align="C") + self.cell(0, 5, f"duh v{__version__}", align="R") + + def _safe(self, text: str) -> str: + """Make text safe for the current font encoding.""" + if self._use_ttf: + return text + for char, repl in ( + ("\u2014", "--"), + ("\u2013", "-"), + ("\u2018", "'"), + ("\u2019", "'"), + ("\u201c", '"'), + ("\u201d", '"'), + ("\u2026", "..."), + ("\u2022", "*"), + ("\u00a0", " "), + ("\u2192", "->"), + ("\u2190", "<-"), + ): + text = text.replace(char, repl) + return text.encode("latin-1", errors="replace").decode("latin-1") + + # ── Markdown rendering helpers ────────────────────────────── def _inline_fmt(text: str) -> str: """Convert inline markdown (bold, italic, code) to HTML.""" @@ -1265,7 +1361,8 @@ def _inline_fmt(text: str) -> str: for part in parts: if part.startswith("`") and part.endswith("`"): result.append( - f"{html_mod.escape(part[1:-1])}" + f"" + f"{html_mod.escape(part[1:-1])}" ) else: escaped = html_mod.escape(part) @@ -1309,7 +1406,6 @@ def _md_to_html(md: str) -> str: in_list = False continue - # Headers -> bold paragraph m = re.match(r"^#{1,6}\s+(.+)$", stripped) if m: if in_list: @@ -1318,7 +1414,6 @@ def _md_to_html(md: str) -> str: parts.append(f"

{_inline_fmt(m.group(1))}

") continue - # Unordered list m = re.match(r"^[-*]\s+(.+)$", stripped) if m: if not in_list or list_tag != "ul": @@ -1330,7 +1425,6 @@ def _md_to_html(md: str) -> str: parts.append(f"
  • {_inline_fmt(m.group(1))}
  • ") continue - # Ordered list m = re.match(r"^\d+[.)]\s+(.+)$", stripped) if m: if not in_list or list_tag != "ol": @@ -1342,7 +1436,6 @@ def _md_to_html(md: str) -> str: parts.append(f"
  • {_inline_fmt(m.group(1))}
  • ") continue - # Regular text if in_list: parts.append(f"") in_list = False @@ -1357,122 +1450,277 @@ def _md_to_html(md: str) -> str: def _write_md(md_text: str) -> None: """Render markdown content as formatted PDF.""" - pdf.write_html(_pdf_safe(_md_to_html(md_text))) + pdf.write_html(pdf._safe(_md_to_html(md_text))) + + # ── Callout box helper ────────────────────────────────────── + + def _draw_accent_bar( + start_y: float, end_y: float, color: tuple[int, int, int] + ) -> None: + """Draw a thick colored accent bar on the left margin.""" + saved_draw = (pdf.draw_color.r, pdf.draw_color.g, pdf.draw_color.b) + saved_width = pdf.line_width + pdf.set_draw_color(*color) + pdf.set_line_width(2.5) + x = pdf.l_margin - 1 + # Clamp to page content area + top = max(start_y, pdf.t_margin) + bot = min(end_y, pdf.h - pdf.b_margin) + if bot > top: + pdf.line(x, top, x, bot) + pdf.set_draw_color(*saved_draw) + pdf.set_line_width(saved_width) + + def _callout_box( + model_ref: str, + role: str, + body: str, + *, + accent: tuple[int, int, int] | None = None, + ) -> None: + """Draw a colored callout box with provider accent line.""" + color = accent or _provider_color(model_ref) + start_y = pdf.get_y() + + # Indent content to leave room for accent bar + saved_margin = pdf.l_margin + pdf.set_left_margin(saved_margin + 6) + pdf.set_x(pdf.l_margin) + + # Header: model + role + pdf.set_font(pdf._font_family, "B", 9) + pdf.set_text_color(*color) + pdf.cell(0, 5, pdf._safe(f"{model_ref} | {role.upper()}")) + pdf.ln(5) + + # Body + pdf.set_text_color(40, 40, 40) + pdf.set_font(pdf._font_family, "", 10) + _write_md(body) + pdf.ln(2) + + end_y = pdf.get_y() + + # Draw accent bar on left edge (doesn't overlap text) + _draw_accent_bar(start_y, end_y, color) + + # Restore margin + pdf.set_left_margin(saved_margin) + pdf.ln(4) - pdf = FPDF() + # ── Build the PDF ─────────────────────────────────────────── + + pdf = ConsensusReport() + pdf._setup_fonts() + pdf.alias_nb_pages() + pdf.set_auto_page_break(auto=True, margin=20) + pdf.set_text_color(40, 40, 40) + + # -- Title page / header area -- pdf.add_page() - pdf.set_auto_page_break(auto=True, margin=15) - font = "Helvetica" - # Title - pdf.set_font(font, "B", 16) - pdf.multi_cell(0, 10, _pdf_safe(f"Consensus: {thread.question}")) - pdf.ln(5) + pdf.set_font(pdf._font_family, "B", 20) + pdf.multi_cell(0, 10, pdf._safe(thread.question)) + pdf.ln(3) + + # Metadata line + pdf.set_font(pdf._font_family, "", 9) + pdf.set_text_color(130, 130, 130) + meta_parts = [ + f"Thread {thread.id[:8]}", + f"Created {created}", + f"{len(model_refs)} model{'s' if len(model_refs) != 1 else ''}", + ] + if total_cost > 0: + meta_parts.append(f"Cost ${total_cost:.4f}") + pdf.cell(0, 5, pdf._safe(" | ".join(meta_parts))) + pdf.ln(6) + + # Horizontal rule + pdf.set_draw_color(200, 200, 200) + pdf.set_line_width(0.5) + pdf.line(10, pdf.get_y(), 200, pdf.get_y()) + pdf.ln(6) + pdf.set_text_color(40, 40, 40) + pdf.set_line_width(0.2) + + # -- TOC placeholder -- + if content == "full": + pdf.insert_toc_placeholder( + render_toc, + pages=1, + ) - # Decision + # -- Decision section -- if final_decision: - pdf.set_font(font, "B", 13) + pdf.start_section("Decision") + pdf.set_font(pdf._font_family, "B", 15) pdf.cell(0, 8, "Decision") - pdf.ln() - pdf.ln(2) - pdf.set_font(font, "", 11) - _write_md(final_decision.content) - pdf.ln(3) - conf_pct = f"{final_decision.confidence:.0%}" - pdf.set_font(font, "I", 10) - pdf.cell(0, 6, f"Confidence: {conf_pct}") pdf.ln(8) + decision_start_y = pdf.get_y() + + # Indent for accent bar + pdf.set_left_margin(16) + pdf.set_x(16) + + # Decision content + pdf.set_font(pdf._font_family, "", 11) + pdf.set_text_color(40, 40, 40) + _write_md(final_decision.content) + pdf.ln(4) + + # Confidence meter + conf_pct = final_decision.confidence + pdf.set_font(pdf._font_family, "B", 10) + pdf.cell(30, 6, pdf._safe(f"Confidence: {conf_pct:.0%}")) + bar_x = pdf.get_x() + 2 + bar_y = pdf.get_y() + 1 + bar_w = 60 + bar_h = 4 + pdf.set_fill_color(230, 230, 230) + pdf.rect(bar_x, bar_y, bar_w, bar_h, style="F") + g = int(100 + 155 * conf_pct) + pdf.set_fill_color(40, min(g, 200), 80) + pdf.rect(bar_x, bar_y, bar_w * conf_pct, bar_h, style="F") + pdf.ln(10) + + # Rigor meter + rigor_pct = final_decision.rigor + pdf.set_font(pdf._font_family, "B", 10) + pdf.cell(30, 6, pdf._safe(f"Rigor: {rigor_pct:.0%}")) + bar_x = pdf.get_x() + 2 + bar_y = pdf.get_y() + 1 + pdf.set_fill_color(230, 230, 230) + pdf.rect(bar_x, bar_y, bar_w, bar_h, style="F") + g = int(100 + 155 * rigor_pct) + pdf.set_fill_color(40, min(g, 200), 80) + pdf.rect(bar_x, bar_y, bar_w * rigor_pct, bar_h, style="F") + pdf.ln(10) + + # Draw green accent bar + _draw_accent_bar(decision_start_y, pdf.get_y(), (40, 160, 80)) + pdf.set_left_margin(10) + + # Dissent if include_dissent and final_decision.dissent: - pdf.set_font(font, "B", 13) + pdf.start_section("Dissent", level=1) + pdf.set_font(pdf._font_family, "B", 13) + pdf.set_text_color(40, 40, 40) pdf.cell(0, 8, "Dissent") - pdf.ln() - pdf.ln(2) - pdf.set_font(font, "", 11) + pdf.ln(6) + + dissent_start_y = pdf.get_y() + pdf.set_left_margin(16) + pdf.set_x(16) + + pdf.set_font(pdf._font_family, "I", 10) + pdf.set_text_color(100, 100, 100) _write_md(final_decision.dissent) - pdf.ln(5) + pdf.ln(4) + + # Amber accent bar + _draw_accent_bar(dissent_start_y, pdf.get_y(), (200, 140, 80)) + pdf.set_left_margin(10) + pdf.set_text_color(40, 40, 40) + # -- Consensus process -- if content == "full": - # Separator - pdf.set_draw_color(180, 180, 180) + pdf.set_draw_color(200, 200, 200) pdf.line(10, pdf.get_y(), 200, pdf.get_y()) - pdf.ln(5) + pdf.ln(6) - pdf.set_font(font, "B", 13) + pdf.start_section("Consensus Process") + pdf.set_font(pdf._font_family, "B", 15) pdf.cell(0, 8, "Consensus Process") - pdf.ln() - pdf.ln(3) + pdf.ln(8) for turn in thread.turns: - pdf.set_font(font, "B", 12) - pdf.cell(0, 7, f"Round {turn.round_number}") - pdf.ln() - pdf.ln(2) + section_title = f"Round {turn.round_number}" + pdf.start_section(section_title, level=1) + pdf.set_font(pdf._font_family, "B", 13) + pdf.set_text_color(80, 80, 80) + pdf.cell(0, 7, section_title) + pdf.ln(7) + pdf.set_text_color(40, 40, 40) + + for c in turn.contributions: + _callout_box(c.model_ref, c.role, c.content) + + # Votes + if votes: + pdf.start_section("Votes", level=1) + pdf.set_font(pdf._font_family, "B", 13) + pdf.cell(0, 8, "Votes") + pdf.ln(6) - proposers = [c for c in turn.contributions if c.role == "proposer"] - challengers = [c for c in turn.contributions if c.role == "challenger"] - revisers = [c for c in turn.contributions if c.role == "reviser"] - others = [ - c - for c in turn.contributions - if c.role not in ("proposer", "challenger", "reviser") - ] + for v in votes: + color = _provider_color(v.model_ref) + pdf.set_font(pdf._font_family, "B", 10) + pdf.set_text_color(*color) + pdf.cell(55, 6, pdf._safe(v.model_ref)) + pdf.set_font(pdf._font_family, "", 10) + pdf.set_text_color(60, 60, 60) + pdf.cell(0, 6, pdf._safe(v.content)) + pdf.ln(6) + + pdf.ln(4) + pdf.set_text_color(40, 40, 40) + + # -- Appendix: metadata footer ─────────────────────────────── + pdf.ln(4) + pdf.set_draw_color(200, 200, 200) + pdf.line(10, pdf.get_y(), 200, pdf.get_y()) + pdf.ln(4) + + pdf.set_font(pdf._font_family, "", 8) + pdf.set_text_color(140, 140, 140) + footer_parts = [ + f"Cost: ${total_cost:.4f}", + f"Tokens: {total_input:,} in / {total_output:,} out", + f"Models: {', '.join(model_refs)}", + ] + pdf.cell(0, 4, pdf._safe(" | ".join(footer_parts))) + pdf.set_text_color(40, 40, 40) - for p in proposers: - pdf.set_font(font, "B", 11) - pdf.cell(0, 6, f"Proposal ({p.model_ref})") - pdf.ln() - pdf.set_font(font, "", 10) - _write_md(p.content) - pdf.ln(3) + return bytes(pdf.output()) - if challengers: - pdf.set_font(font, "B", 11) - pdf.cell(0, 6, "Challenges") - pdf.ln() - for ch in challengers: - pdf.set_font(font, "B", 10) - pdf.cell(0, 5, f"{ch.model_ref}:") - pdf.ln() - pdf.set_font(font, "", 10) - _write_md(ch.content) - pdf.ln(2) - for r in revisers: - pdf.set_font(font, "B", 11) - pdf.cell(0, 6, f"Revision ({r.model_ref})") - pdf.ln() - pdf.set_font(font, "", 10) - _write_md(r.content) - pdf.ln(3) +def render_toc(pdf: object, outline: list[object]) -> None: + """Render a table of contents page for the PDF. - for o in others: - role_label = o.role.capitalize() - pdf.set_font(font, "B", 11) - pdf.cell(0, 6, f"{role_label} ({o.model_ref})") - pdf.ln() - pdf.set_font(font, "", 10) - _write_md(o.content) - pdf.ln(3) + Called by fpdf2's ``insert_toc_placeholder`` mechanism. + """ + from fpdf import FPDF - if votes: - pdf.set_font(font, "B", 11) - pdf.cell(0, 6, "Votes") - pdf.ln() - for v in votes: - pdf.set_font(font, "", 10) - pdf.cell(0, 5, _pdf_safe(f"{v.model_ref}: {v.content}")) - pdf.ln() - pdf.ln(3) + assert isinstance(pdf, FPDF) + font = getattr(pdf, "_font_family", "Helvetica") + pdf.set_font(font, "B", 15) + pdf.set_text_color(40, 40, 40) + pdf.cell(0, 10, "Table of Contents") + pdf.ln(10) - # Footer - pdf.set_draw_color(180, 180, 180) - pdf.line(10, pdf.get_y(), 200, pdf.get_y()) - pdf.ln(3) - pdf.set_font(font, "I", 9) - pdf.cell(0, 5, f"duh v{__version__} | {created} | Cost: ${total_cost:.4f}") + for entry in outline: + level = getattr(entry, "level", 0) + name = getattr(entry, "name", "") + page_number = getattr(entry, "page_number", 0) + link = getattr(entry, "link", None) - return bytes(pdf.output()) + indent = 4 * level + pdf.set_x(pdf.l_margin + indent) + + if level == 0: + pdf.set_font(font, "B", 11) + else: + pdf.set_font(font, "", 10) + + pdf.set_text_color(60, 60, 60) + w = pdf.w - pdf.l_margin - pdf.r_margin - indent - 15 + # Use safe method if available + safe = getattr(pdf, "_safe", lambda t: t) + pdf.cell(w, 6, safe(name), link=link) + pdf.cell(15, 6, str(page_number), align="R") + pdf.ln(6) # ── models ─────────────────────────────────────────────────────── @@ -1581,6 +1829,99 @@ async def _cost_async(config: DuhConfig) -> None: click.echo(f" {model_ref}: ${model_cost:.4f} ({call_count} calls)") +# ── calibration ────────────────────────────────────────────────── + + +@cli.command() +@click.option("--category", default=None, help="Filter by decision category.") +@click.option("--since", default=None, help="Only decisions after this date (ISO).") +@click.option("--until", default=None, help="Only decisions before this date (ISO).") +@click.pass_context +def calibration( + ctx: click.Context, + category: str | None, + since: str | None, + until: str | None, +) -> None: + """Show confidence calibration analysis. + + Compares predicted confidence against actual outcomes to + measure how well-calibrated the consensus engine is. + """ + config = _load_config(ctx.obj["config_path"]) + try: + asyncio.run(_calibration_async(config, category, since, until)) + except DuhError as e: + _error(str(e)) + + +async def _calibration_async( + config: DuhConfig, + category: str | None, + since: str | None, + until: str | None, +) -> None: + """Async implementation for the calibration command.""" + from duh.calibration import compute_calibration + from duh.memory.repository import MemoryRepository + + factory, engine = await _create_db(config) + async with factory() as session: + repo = MemoryRepository(session) + decisions = await repo.get_all_decisions_for_space( + category=category, + since=since, + until=until, + ) + + await engine.dispose() + + result = compute_calibration(decisions) + + if result.total_decisions == 0: + click.echo("No decisions found.") + return + + click.echo(f"Total decisions: {result.total_decisions}") + click.echo(f"With outcomes: {result.total_with_outcomes}") + click.echo(f"Overall accuracy: {result.overall_accuracy:.1%}") + click.echo(f"ECE: {result.ece:.4f}") + + if result.ece < 0.05: + rating = "excellent" + elif result.ece < 0.10: + rating = "good" + elif result.ece < 0.20: + rating = "fair" + else: + rating = "poor" + click.echo(f"Calibration: {rating}") + + if result.total_with_outcomes > 0: + click.echo() + click.echo( + f"{'Range':<12} {'Count':>6} {'Outcomes':>9} " + f"{'Accuracy':>9} {'Conf':>6} {'Gap':>6}" + ) + for b in result.buckets: + if b.count == 0: + continue + lo_pct = f"{b.range_lo:.0%}" + hi_pct = f"{b.range_hi:.0%}" + label = f"{lo_pct}-{hi_pct}" + acc_str = f"{b.accuracy:.1%}" if b.with_outcomes > 0 else "-" + conf_str = f"{b.mean_confidence:.1%}" + gap_str = ( + f"{abs(b.accuracy - b.mean_confidence):.1%}" + if b.with_outcomes > 0 + else "-" + ) + click.echo( + f"{label:<12} {b.count:>6} {b.with_outcomes:>9} " + f"{acc_str:>9} {conf_str:>6} {gap_str:>6}" + ) + + # ── backup ─────────────────────────────────────────────────────── @@ -1916,8 +2257,9 @@ async def _batch_async( vr = await run_voting(question, pm, aggregation=aggregation) decision = vr.decision or "" confidence = vr.confidence + rigor = vr.rigor else: - decision, confidence, _dissent, _cost = await _run_consensus( + decision, confidence, rigor, _dissent, _cost = await _run_consensus( question, config, pm ) @@ -1929,13 +2271,14 @@ async def _batch_async( "question": question, "decision": decision, "confidence": confidence, + "rigor": rigor, "cost": round(q_cost, 4), } ) if output_fmt == "text": click.echo(f"Decision: {decision[:200]}") - click.echo(f"Confidence: {confidence:.0%}") + click.echo(f"Confidence: {confidence:.0%} Rigor: {rigor:.0%}") click.echo(f"Cost: ${q_cost:.4f}") except Exception as e: @@ -1946,6 +2289,7 @@ async def _batch_async( "question": question, "error": str(e), "confidence": 0.0, + "rigor": 0.0, "cost": round(q_cost, 4), } ) diff --git a/src/duh/cli/display.py b/src/duh/cli/display.py index 27ec841..ac34bba 100644 --- a/src/duh/cli/display.py +++ b/src/duh/cli/display.py @@ -154,10 +154,10 @@ def show_revise(self, model_ref: str, content: str) -> None: ) ) - def show_commit(self, confidence: float, dissent: str | None) -> None: + def show_commit(self, confidence: float, rigor: float, dissent: str | None) -> None: """Display commit result line.""" check = "[bold green]\\u2713[/bold green]" - line = f"{check} COMMIT Confidence: {confidence:.0%}" + line = f"{check} COMMIT Confidence: {confidence:.0%} Rigor: {rigor:.0%}" if dissent is None: line += " (no dissent)" self._console.print(line) @@ -265,7 +265,8 @@ def show_subtask_progress(self, subtask_result: SubtaskResult) -> None: check = "[bold green]\\u2713[/bold green]" self._console.print( f"{check} [{subtask_result.label}] " - f"Confidence: {subtask_result.confidence:.0%}" + f"Confidence: {subtask_result.confidence:.0%} " + f"Rigor: {subtask_result.rigor:.0%}" ) self._console.print( Panel( @@ -288,7 +289,10 @@ def show_synthesis(self, synthesis_result: SynthesisResult) -> None: border_style="bright_white", ) ) - self._console.print(f"Aggregate confidence: {synthesis_result.confidence:.0%}") + self._console.print( + f"Aggregate confidence: {synthesis_result.confidence:.0%}" + f" | Rigor: {synthesis_result.rigor:.0%}" + ) # ── Voting ───────────────────────────────────────────────── @@ -325,6 +329,7 @@ def show_voting_result(self, result: VotingAggregation, cost: float) -> None: self._console.print( f"Strategy: {result.strategy} | " f"Confidence: {result.confidence:.0%} | " + f"Rigor: {result.rigor:.0%} | " f"Votes: {len(result.votes)} | " f"Cost: ${cost:.4f}" ) @@ -358,6 +363,7 @@ def show_final_decision( self, decision: str, confidence: float, + rigor: float, cost: float, dissent: str | None, ) -> None: @@ -371,7 +377,9 @@ def show_final_decision( border_style="bright_white", ) ) - self._console.print(f"Confidence: {confidence:.0%} | Cost: ${cost:.4f}") + self._console.print( + f"Confidence: {confidence:.0%} | Rigor: {rigor:.0%} | Cost: ${cost:.4f}" + ) if dissent: self._console.print() diff --git a/src/duh/consensus/handlers.py b/src/duh/consensus/handlers.py index 4f2ed8b..6dde013 100644 --- a/src/duh/consensus/handlers.py +++ b/src/duh/consensus/handlers.py @@ -625,10 +625,10 @@ async def handle_revise( # ── COMMIT helpers + handler ───────────────────────────────── -def _compute_confidence(challenges: list[ChallengeResult]) -> float: - """Compute confidence score from challenge quality. +def _compute_rigor(challenges: list[ChallengeResult]) -> float: + """Compute rigor score from challenge quality. - Genuine (non-sycophantic) challenges improve confidence because + Genuine (non-sycophantic) challenges improve rigor because they indicate the revision was rigorously tested. Returns a float in [0.5, 1.0]: @@ -641,6 +641,26 @@ def _compute_confidence(challenges: list[ChallengeResult]) -> float: return 0.5 + (genuine / len(challenges)) * 0.5 +# Domain caps for epistemic confidence scoring. +# Caps confidence based on question intent to reflect inherent +# uncertainty of different question types. +DOMAIN_CAPS: dict[str, float] = { + "factual": 0.95, + "technical": 0.90, + "creative": 0.85, + "judgment": 0.80, + "strategic": 0.70, +} +_DEFAULT_DOMAIN_CAP = 0.85 + + +def _domain_cap(intent: str | None) -> float: + """Return the confidence ceiling for a given question intent.""" + if intent is None: + return _DEFAULT_DOMAIN_CAP + return DOMAIN_CAPS.get(intent, _DEFAULT_DOMAIN_CAP) + + def _extract_dissent(challenges: list[ChallengeResult]) -> str | None: """Extract dissent from non-sycophantic challenges. @@ -693,14 +713,22 @@ async def handle_commit( raise ConsensusError(msg) ctx.decision = ctx.revision - ctx.confidence = _compute_confidence(ctx.challenges) + ctx.rigor = _compute_rigor(ctx.challenges) ctx.dissent = _extract_dissent(ctx.challenges) - # Optional taxonomy classification - if classify and provider_manager is not None: + # Taxonomy classification (always attempt when provider available) + intent: str | None = None + if provider_manager is not None: taxonomy = await _classify_decision(ctx, provider_manager) if taxonomy: ctx.taxonomy = taxonomy + intent = taxonomy.get("intent") or None + elif classify: + # Legacy path: explicit classify without provider is a no-op + pass + + # Epistemic confidence = rigor clamped by domain ceiling + ctx.confidence = min(_domain_cap(intent), ctx.rigor) async def _classify_decision( diff --git a/src/duh/consensus/machine.py b/src/duh/consensus/machine.py index 264d18e..5ed42fd 100644 --- a/src/duh/consensus/machine.py +++ b/src/duh/consensus/machine.py @@ -54,6 +54,7 @@ class RoundResult: revision: str decision: str confidence: float + rigor: float = 0.0 dissent: str | None = None @@ -90,6 +91,7 @@ class ConsensusContext: revision_model: str | None = None decision: str | None = None confidence: float = 0.0 + rigor: float = 0.0 dissent: str | None = None converged: bool = False @@ -117,6 +119,7 @@ def _clear_round_data(self) -> None: self.revision_model = None self.decision = None self.confidence = 0.0 + self.rigor = 0.0 self.dissent = None self.converged = False @@ -131,6 +134,7 @@ def _archive_round(self) -> None: revision=self.revision or "", decision=self.decision or "", confidence=self.confidence, + rigor=self.rigor, dissent=self.dissent, ) ) diff --git a/src/duh/consensus/scheduler.py b/src/duh/consensus/scheduler.py index 3e79dbf..e9820c2 100644 --- a/src/duh/consensus/scheduler.py +++ b/src/duh/consensus/scheduler.py @@ -43,6 +43,7 @@ class SubtaskResult: label: str decision: str confidence: float + rigor: float = 0.0 cost: float = 0.0 @@ -52,7 +53,7 @@ async def _run_mini_consensus( *, max_rounds: int = 1, display: ConsensusDisplay | None = None, -) -> tuple[str, float]: +) -> tuple[str, float, float]: """Run a simplified single-round consensus for one subtask. Executes PROPOSE -> CHALLENGE -> REVISE -> COMMIT with the @@ -65,7 +66,7 @@ async def _run_mini_consensus( display: Optional display for real-time progress output. Returns: - (decision, confidence) tuple. + (decision, confidence, rigor) tuple. Raises: ConsensusError: If any handler phase fails. @@ -110,11 +111,11 @@ async def _run_mini_consensus( # COMMIT sm.transition(ConsensusState.COMMIT) - await handle_commit(ctx) + await handle_commit(ctx, provider_manager) if display: - display.show_commit(ctx.confidence, ctx.dissent) + display.show_commit(ctx.confidence, ctx.rigor, ctx.dissent) - return ctx.decision or "", ctx.confidence + return ctx.decision or "", ctx.confidence, ctx.rigor async def _execute_subtask( @@ -155,7 +156,7 @@ async def _execute_subtask( augmented_question += f"\n\nContext from prior subtasks:\n{dep_text}" cost_before = provider_manager.total_cost - decision, confidence = await _run_mini_consensus( + decision, confidence, rigor = await _run_mini_consensus( augmented_question, provider_manager, display=display ) subtask_cost = provider_manager.total_cost - cost_before @@ -164,6 +165,7 @@ async def _execute_subtask( label=subtask.label, decision=decision, confidence=confidence, + rigor=rigor, cost=subtask_cost, ) diff --git a/src/duh/consensus/synthesis.py b/src/duh/consensus/synthesis.py index 2f3ee25..9f042ea 100644 --- a/src/duh/consensus/synthesis.py +++ b/src/duh/consensus/synthesis.py @@ -24,6 +24,7 @@ class SynthesisResult: content: str confidence: float strategy: str + rigor: float = 0.0 def _build_merge_prompt( @@ -187,9 +188,15 @@ async def synthesize( # Aggregate confidence: weighted average of subtask confidences total_conf = sum(r.confidence for r in subtask_results) avg_confidence = total_conf / len(subtask_results) if subtask_results else 0.0 + avg_rigor = ( + sum(r.rigor for r in subtask_results) / len(subtask_results) + if subtask_results + else 0.0 + ) return SynthesisResult( content=response.content, confidence=avg_confidence, strategy=strategy, + rigor=avg_rigor, ) diff --git a/src/duh/consensus/voting.py b/src/duh/consensus/voting.py index e1df5e2..f9c0143 100644 --- a/src/duh/consensus/voting.py +++ b/src/duh/consensus/voting.py @@ -30,6 +30,7 @@ class VoteResult: model_ref: str content: str confidence: float = 0.0 + rigor: float = 0.5 @dataclass(frozen=True, slots=True) @@ -40,6 +41,7 @@ class VotingAggregation: decision: str strategy: str confidence: float + rigor: float = 0.5 # ── Internal helpers ───────────────────────────────────────────── diff --git a/src/duh/mcp/server.py b/src/duh/mcp/server.py index d61c7fe..f91663d 100644 --- a/src/duh/mcp/server.py +++ b/src/duh/mcp/server.py @@ -127,6 +127,7 @@ async def _handle_ask(args: dict) -> list[TextContent]: # type: ignore[type-arg { "decision": result.decision, "confidence": result.confidence, + "rigor": result.rigor, "votes": len(result.votes), "cost": pm.total_cost, } @@ -134,7 +135,9 @@ async def _handle_ask(args: dict) -> list[TextContent]: # type: ignore[type-arg ) ] else: - decision, confidence, dissent, cost = await _run_consensus(question, config, pm) + decision, confidence, rigor, dissent, cost = await _run_consensus( + question, config, pm + ) return [ TextContent( type="text", @@ -142,6 +145,7 @@ async def _handle_ask(args: dict) -> list[TextContent]: # type: ignore[type-arg { "decision": decision, "confidence": confidence, + "rigor": rigor, "dissent": dissent, "cost": cost, } @@ -179,6 +183,7 @@ async def _handle_recall(args: dict) -> list[TextContent]: # type: ignore[type- latest = thread.decisions[-1] entry["decision"] = latest.content[:200] entry["confidence"] = latest.confidence + entry["rigor"] = latest.rigor results.append(entry) await engine.dispose() diff --git a/src/duh/memory/context.py b/src/duh/memory/context.py index c679312..9c9ac66 100644 --- a/src/duh/memory/context.py +++ b/src/duh/memory/context.py @@ -76,7 +76,7 @@ def build_context( if decisions and remaining > 0: decision_parts: list[str] = [] for d in decisions: - part = f"- [{d.confidence:.0%} confidence] {d.content}" + part = f"- [{d.confidence:.0%} confidence, {d.rigor:.0%} rigor] {d.content}" if d.dissent: part += f"\n Dissent: {d.dissent}" outcome = outcome_map.get(d.id) diff --git a/src/duh/memory/migrations.py b/src/duh/memory/migrations.py new file mode 100644 index 0000000..1e7c159 --- /dev/null +++ b/src/duh/memory/migrations.py @@ -0,0 +1,34 @@ +"""Lightweight schema migrations for SQLite. + +Runs on startup for file-based SQLite databases to add new columns +that were added after the initial schema. In-memory SQLite uses +``create_all`` which handles new columns automatically. +""" + +from __future__ import annotations + +import logging +from typing import TYPE_CHECKING + +if TYPE_CHECKING: + from sqlalchemy.ext.asyncio import AsyncEngine + +logger = logging.getLogger(__name__) + + +async def ensure_schema(engine: AsyncEngine) -> None: + """Apply pending schema migrations. + + Currently handles: + - Adding ``rigor`` column to ``decisions`` table (Phase A). + """ + async with engine.begin() as conn: + # Check if rigor column exists + rows = await conn.exec_driver_sql("PRAGMA table_info(decisions)") + columns = {row[1] for row in rows} + + if "rigor" not in columns: + logger.info("Adding 'rigor' column to decisions table") + await conn.exec_driver_sql( + "ALTER TABLE decisions ADD COLUMN rigor FLOAT DEFAULT 0.0" + ) diff --git a/src/duh/memory/models.py b/src/duh/memory/models.py index 4ef6d7f..ff5de20 100644 --- a/src/duh/memory/models.py +++ b/src/duh/memory/models.py @@ -189,6 +189,7 @@ class Decision(Base): ) content: Mapped[str] = mapped_column(Text) confidence: Mapped[float] = mapped_column(Float, default=0.0) + rigor: Mapped[float] = mapped_column(Float, default=0.0) dissent: Mapped[str | None] = mapped_column(Text, nullable=True, default=None) intent: Mapped[str | None] = mapped_column(String(50), nullable=True, default=None) category: Mapped[str | None] = mapped_column( diff --git a/src/duh/memory/repository.py b/src/duh/memory/repository.py index 1f9ba31..d62911c 100644 --- a/src/duh/memory/repository.py +++ b/src/duh/memory/repository.py @@ -149,6 +149,7 @@ async def save_decision( content: str, confidence: float, *, + rigor: float = 0.0, dissent: str | None = None, intent: str | None = None, category: str | None = None, @@ -160,6 +161,7 @@ async def save_decision( thread_id=thread_id, content=content, confidence=confidence, + rigor=rigor, dissent=dissent, intent=intent, category=category, diff --git a/tests/integration/test_consensus_loop.py b/tests/integration/test_consensus_loop.py index 88f2af6..3420ab6 100644 --- a/tests/integration/test_consensus_loop.py +++ b/tests/integration/test_consensus_loop.py @@ -90,6 +90,7 @@ async def test_full_loop_to_complete(self, mock_provider: MockProvider) -> None: assert ctx.revision is not None assert ctx.decision is not None assert ctx.confidence > 0 + assert ctx.rigor > 0 assert ctx.current_round == 1 # Transition to COMPLETE (max_rounds=1) @@ -367,8 +368,9 @@ async def test_sycophantic_challenges_lower_confidence(self) -> None: await _run_single_round(ctx, sm, pm) - # Both challengers are sycophantic → confidence should be 0.5 + # All sycophantic → rigor=0.5, confidence=min(0.85,0.5) assert ctx.confidence < 1.0 + assert ctx.rigor < 1.0 # Dissent should be None (all sycophantic) assert ctx.dissent is None diff --git a/tests/sycophancy/test_confidence_impact.py b/tests/sycophancy/test_confidence_impact.py index 92281f0..c3cd701 100644 --- a/tests/sycophancy/test_confidence_impact.py +++ b/tests/sycophancy/test_confidence_impact.py @@ -1,13 +1,13 @@ -"""Tests for sycophancy impact on confidence scoring and dissent. +"""Tests for sycophancy impact on rigor scoring and dissent. Verifies the mathematical relationship between sycophantic challenge -counts and resulting confidence, and that dissent extraction correctly +counts and resulting rigor, and that dissent extraction correctly filters out sycophantic responses. """ from __future__ import annotations -from duh.consensus.handlers import _compute_confidence, _extract_dissent +from duh.consensus.handlers import _compute_rigor, _extract_dissent from duh.consensus.machine import ChallengeResult # ── Helpers ────────────────────────────────────────────────────── @@ -30,44 +30,44 @@ def _sycophantic( # ── Confidence computation ─────────────────────────────────────── -class TestConfidenceComputation: +class TestRigorComputation: def test_all_genuine_two_challengers(self) -> None: """2/2 genuine → 0.5 + (2/2)*0.5 = 1.0.""" challenges = [_genuine("a"), _genuine("b")] - assert _compute_confidence(challenges) == 1.0 + assert _compute_rigor(challenges) == 1.0 def test_all_sycophantic_two_challengers(self) -> None: """0/2 genuine → 0.5 + (0/2)*0.5 = 0.5.""" challenges = [_sycophantic("a"), _sycophantic("b")] - assert _compute_confidence(challenges) == 0.5 + assert _compute_rigor(challenges) == 0.5 def test_one_genuine_one_sycophantic(self) -> None: """1/2 genuine → 0.5 + (1/2)*0.5 = 0.75.""" challenges = [_genuine("a"), _sycophantic("b")] - assert _compute_confidence(challenges) == 0.75 + assert _compute_rigor(challenges) == 0.75 def test_empty_challenges(self) -> None: """No challenges → 0.5 (untested).""" - assert _compute_confidence([]) == 0.5 + assert _compute_rigor([]) == 0.5 def test_single_genuine(self) -> None: """1/1 genuine → 1.0.""" - assert _compute_confidence([_genuine()]) == 1.0 + assert _compute_rigor([_genuine()]) == 1.0 def test_single_sycophantic(self) -> None: """0/1 genuine → 0.5.""" - assert _compute_confidence([_sycophantic()]) == 0.5 + assert _compute_rigor([_sycophantic()]) == 0.5 def test_three_challengers_two_genuine(self) -> None: """2/3 genuine → 0.5 + (2/3)*0.5 ≈ 0.833.""" challenges = [_genuine("a"), _genuine("b"), _sycophantic("c")] - result = _compute_confidence(challenges) + result = _compute_rigor(challenges) assert abs(result - (0.5 + (2 / 3) * 0.5)) < 1e-10 def test_three_challengers_one_genuine(self) -> None: """1/3 genuine → 0.5 + (1/3)*0.5 ≈ 0.667.""" challenges = [_genuine("a"), _sycophantic("b"), _sycophantic("c")] - result = _compute_confidence(challenges) + result = _compute_rigor(challenges) assert abs(result - (0.5 + (1 / 3) * 0.5)) < 1e-10 def test_confidence_always_between_half_and_one(self) -> None: @@ -79,7 +79,7 @@ def test_confidence_always_between_half_and_one(self) -> None: challenges = [_genuine(f"g{i}") for i in range(n_genuine)] + [ _sycophantic(f"s{i}") for i in range(n_syc) ] - conf = _compute_confidence(challenges) + conf = _compute_rigor(challenges) assert 0.5 <= conf <= 1.0, f"{n_genuine}g/{n_syc}s → {conf}" def test_confidence_monotonic_with_genuine_ratio(self) -> None: @@ -91,7 +91,7 @@ def test_confidence_monotonic_with_genuine_ratio(self) -> None: challenges = [_genuine(f"g{i}") for i in range(n_genuine)] + [ _sycophantic(f"s{i}") for i in range(n_syc) ] - conf = _compute_confidence(challenges) + conf = _compute_rigor(challenges) assert conf >= prev, f"Not monotonic at {n_genuine}/{total}" prev = conf diff --git a/tests/sycophancy/test_known_flaws.py b/tests/sycophancy/test_known_flaws.py index 5e84746..fb3d6c2 100644 --- a/tests/sycophancy/test_known_flaws.py +++ b/tests/sycophancy/test_known_flaws.py @@ -84,7 +84,9 @@ async def test_full_loop_genuine_high_confidence( sm.transition(ConsensusState.COMMIT) await handle_commit(ctx) - assert ctx.confidence == 1.0 + # rigor=1.0 (all genuine), but no pm → default cap 0.85 + assert ctx.rigor == 1.0 + assert ctx.confidence == 0.85 async def test_genuine_challenges_produce_dissent( self, known_flaw_genuine_provider: MockProvider diff --git a/tests/unit/test_api_crud.py b/tests/unit/test_api_crud.py index b111870..14f41bd 100644 --- a/tests/unit/test_api_crud.py +++ b/tests/unit/test_api_crud.py @@ -393,3 +393,94 @@ async def test_by_model_ordering(self) -> None: assert len(data["by_model"]) == 2 assert data["by_model"][0]["model_ref"] == "expensive:model" assert data["by_model"][1]["model_ref"] == "cheap:model" + + +# -- Helpers for calibration ----------------------------------------------- + + +async def _seed_decision_with_outcome( + app: FastAPI, + confidence: float, + outcome_result: str | None = None, + *, + category: str | None = None, +) -> tuple[str, str]: + """Seed a thread with decision and optional outcome. + + Returns (thread_id, decision_id). + """ + async with app.state.db_factory() as session: + repo = MemoryRepository(session) + thread = await repo.create_thread("Calibration question") + turn = await repo.create_turn(thread.id, 1, "COMMIT") + decision = await repo.save_decision( + turn.id, thread.id, "Decision content", confidence, category=category + ) + if outcome_result is not None: + await repo.save_outcome(decision.id, thread.id, outcome_result) + await session.commit() + return thread.id, decision.id + + +# -- TestCalibration ----------------------------------------------------------- + + +class TestCalibration: + async def test_empty_returns_zeros(self) -> None: + app = await _make_app() + client = TestClient(app, raise_server_exceptions=False) + resp = client.get("/api/calibration") + assert resp.status_code == 200 + data = resp.json() + assert data["total_decisions"] == 0 + assert data["total_with_outcomes"] == 0 + assert data["overall_accuracy"] == 0.0 + assert data["ece"] == 0.0 + assert len(data["buckets"]) == 10 + + async def test_with_outcomes(self) -> None: + app = await _make_app() + await _seed_decision_with_outcome(app, 0.9, "success") + await _seed_decision_with_outcome(app, 0.9, "success") + await _seed_decision_with_outcome(app, 0.3, "failure") + + client = TestClient(app, raise_server_exceptions=False) + resp = client.get("/api/calibration") + assert resp.status_code == 200 + data = resp.json() + assert data["total_decisions"] == 3 + assert data["total_with_outcomes"] == 3 + assert data["overall_accuracy"] > 0.0 + assert data["ece"] >= 0.0 + + async def test_category_filter(self) -> None: + app = await _make_app() + await _seed_decision_with_outcome(app, 0.8, "success", category="tech") + await _seed_decision_with_outcome(app, 0.5, "failure", category="other") + + client = TestClient(app, raise_server_exceptions=False) + resp = client.get("/api/calibration", params={"category": "tech"}) + assert resp.status_code == 200 + data = resp.json() + assert data["total_decisions"] == 1 + assert data["total_with_outcomes"] == 1 + + async def test_bucket_structure(self) -> None: + app = await _make_app() + await _seed_decision_with_outcome(app, 0.5, "success") + + client = TestClient(app, raise_server_exceptions=False) + resp = client.get("/api/calibration") + assert resp.status_code == 200 + data = resp.json() + buckets = data["buckets"] + assert len(buckets) == 10 + + # Check bucket 5 (0.5-0.6) has the decision + b5 = buckets[5] + assert b5["count"] == 1 + assert b5["with_outcomes"] == 1 + assert b5["success"] == 1 + assert b5["accuracy"] == 1.0 + assert b5["range_lo"] == 0.5 + assert b5["range_hi"] == 0.6 diff --git a/tests/unit/test_api_ws.py b/tests/unit/test_api_ws.py index 0c330d8..212f0da 100644 --- a/tests/unit/test_api_ws.py +++ b/tests/unit/test_api_ws.py @@ -97,8 +97,9 @@ async def mock_revise(ctx, pm, **kwargs): ctx.revision_model = ctx.proposal_model return _make_response(revision) - async def mock_commit(ctx, **kwargs): + async def mock_commit(ctx, *args, **kwargs): ctx.decision = ctx.revision + ctx.rigor = 1.0 ctx.confidence = confidence ctx.dissent = dissent @@ -240,6 +241,7 @@ def test_complete_event_has_decision_confidence_cost(self): complete = next(e for e in events if e["type"] == "complete") assert complete["decision"] == "Final answer" assert complete["confidence"] == 0.85 + assert complete["rigor"] == 1.0 assert complete["dissent"] == "Minor dissent" assert "cost" in complete @@ -256,6 +258,7 @@ def test_commit_event_has_confidence_and_dissent(self): commit = next(e for e in events if e["type"] == "commit") assert commit["confidence"] == 0.9 + assert commit["rigor"] == 1.0 assert commit["dissent"] == "Some dissent" assert "round" in commit diff --git a/tests/unit/test_calibration.py b/tests/unit/test_calibration.py new file mode 100644 index 0000000..1375e09 --- /dev/null +++ b/tests/unit/test_calibration.py @@ -0,0 +1,162 @@ +"""Tests for duh.calibration — confidence calibration analysis.""" + +from __future__ import annotations + +from types import SimpleNamespace + +import pytest + +from duh.calibration import compute_calibration + + +def _decision(confidence: float, outcome: str | None = None) -> SimpleNamespace: + """Create a fake Decision with optional Outcome for testing.""" + out = None + if outcome is not None: + out = SimpleNamespace(result=outcome) + return SimpleNamespace(confidence=confidence, outcome=out) + + +class TestComputeCalibration: + def test_empty_input(self) -> None: + result = compute_calibration([]) + assert result.total_decisions == 0 + assert result.total_with_outcomes == 0 + assert result.overall_accuracy == 0.0 + assert result.ece == 0.0 + assert len(result.buckets) == 10 + + def test_no_outcomes(self) -> None: + decisions = [_decision(0.5), _decision(0.8), _decision(0.3)] + result = compute_calibration(decisions) # type: ignore[arg-type] + assert result.total_decisions == 3 + assert result.total_with_outcomes == 0 + assert result.overall_accuracy == 0.0 + assert result.ece == 0.0 + + def test_single_success(self) -> None: + decisions = [_decision(0.9, "success")] + result = compute_calibration(decisions) # type: ignore[arg-type] + assert result.total_decisions == 1 + assert result.total_with_outcomes == 1 + assert result.overall_accuracy == 1.0 + # Bucket 9 (0.9-1.0): accuracy=1.0, mean_conf=0.9, |1.0-0.9|=0.1 + assert result.ece == pytest.approx(0.1) + + def test_single_failure(self) -> None: + decisions = [_decision(0.7, "failure")] + result = compute_calibration(decisions) # type: ignore[arg-type] + assert result.total_decisions == 1 + assert result.total_with_outcomes == 1 + assert result.overall_accuracy == 0.0 + # Bucket 7 (0.7-0.8): accuracy=0.0, mean_conf=0.7, |0.0-0.7|=0.7 + assert result.ece == pytest.approx(0.7) + + def test_partial_counts_as_half(self) -> None: + decisions = [_decision(0.5, "partial")] + result = compute_calibration(decisions) # type: ignore[arg-type] + assert result.total_with_outcomes == 1 + assert result.overall_accuracy == 0.5 + # Bucket 5 (0.5-0.6): accuracy=0.5, mean_conf=0.5, |0.5-0.5|=0.0 + assert result.ece == pytest.approx(0.0) + + def test_perfect_calibration(self) -> None: + """When accuracy matches confidence, ECE should be near 0.""" + # Put 10 decisions at confidence 0.85: + # 8 or 9 successes needed for accuracy ~0.85 + # With 10 decisions: 8 success + 1 partial + 1 failure + # accuracy = (8 + 0.5) / 10 = 0.85 matches mean_conf=0.85 + decisions = ( + [_decision(0.85, "success")] * 8 + + [_decision(0.85, "partial")] + + [_decision(0.85, "failure")] + ) + result = compute_calibration(decisions) # type: ignore[arg-type] + assert result.total_decisions == 10 + assert result.total_with_outcomes == 10 + assert result.overall_accuracy == pytest.approx(0.85) + assert result.ece == pytest.approx(0.0) + + def test_overconfident(self) -> None: + """High confidence but all failures = high ECE.""" + decisions = [_decision(0.95, "failure")] * 10 + result = compute_calibration(decisions) # type: ignore[arg-type] + assert result.overall_accuracy == 0.0 + assert result.ece > 0.8 # ~0.95 + + def test_multiple_buckets(self) -> None: + decisions = [ + _decision(0.15, "success"), + _decision(0.15, "failure"), + _decision(0.85, "success"), + _decision(0.85, "success"), + ] + result = compute_calibration(decisions) # type: ignore[arg-type] + assert result.total_decisions == 4 + assert result.total_with_outcomes == 4 + + # Bucket 1 (0.1-0.2): 2 with_outcomes, 1 success, accuracy=0.5 + bucket1 = result.buckets[1] + assert bucket1.count == 2 + assert bucket1.with_outcomes == 2 + assert bucket1.success == 1 + assert bucket1.accuracy == pytest.approx(0.5) + + # Bucket 8 (0.8-0.9): 2 with_outcomes, 2 success, accuracy=1.0 + bucket8 = result.buckets[8] + assert bucket8.count == 2 + assert bucket8.with_outcomes == 2 + assert bucket8.success == 2 + assert bucket8.accuracy == pytest.approx(1.0) + + def test_custom_n_buckets(self) -> None: + decisions = [_decision(0.5, "success")] + result = compute_calibration(decisions, n_buckets=5) # type: ignore[arg-type] + assert len(result.buckets) == 5 + # confidence 0.5 -> bucket index 2 (0.4-0.6) + assert result.buckets[2].count == 1 + assert result.buckets[2].with_outcomes == 1 + + def test_boundary_zero(self) -> None: + """Confidence 0.0 goes into the first bucket.""" + decisions = [_decision(0.0, "failure")] + result = compute_calibration(decisions) # type: ignore[arg-type] + assert result.buckets[0].count == 1 + assert result.buckets[0].with_outcomes == 1 + assert result.buckets[0].accuracy == 0.0 + + def test_boundary_one(self) -> None: + """Confidence 1.0 goes into the last bucket.""" + decisions = [_decision(1.0, "success")] + result = compute_calibration(decisions) # type: ignore[arg-type] + assert result.buckets[9].count == 1 + assert result.buckets[9].with_outcomes == 1 + assert result.buckets[9].accuracy == 1.0 + + def test_boundary_exact_tenth(self) -> None: + """Confidence exactly 0.1 goes into bucket 1 (0.1-0.2).""" + decisions = [_decision(0.1, "success")] + result = compute_calibration(decisions) # type: ignore[arg-type] + assert result.buckets[1].count == 1 + + def test_overall_accuracy(self) -> None: + decisions = [ + _decision(0.5, "success"), + _decision(0.5, "failure"), + _decision(0.5, "partial"), + _decision(0.5, "success"), + ] + # accuracy = (2 + 0.5) / 4 = 0.625 + result = compute_calibration(decisions) # type: ignore[arg-type] + assert result.overall_accuracy == pytest.approx(0.625) + + def test_mixed_with_and_without_outcomes(self) -> None: + decisions = [ + _decision(0.5, "success"), + _decision(0.5), # no outcome + _decision(0.5, "failure"), + ] + result = compute_calibration(decisions) # type: ignore[arg-type] + assert result.total_decisions == 3 + assert result.total_with_outcomes == 2 + assert result.overall_accuracy == pytest.approx(0.5) diff --git a/tests/unit/test_cli.py b/tests/unit/test_cli.py index 172d73f..b6bef3b 100644 --- a/tests/unit/test_cli.py +++ b/tests/unit/test_cli.py @@ -72,6 +72,7 @@ def test_displays_decision( mock_run.return_value = ( "Use SQLite for v0.1.", 1.0, + 1.0, None, 0.0042, ) @@ -97,6 +98,7 @@ def test_displays_dissent( mock_run.return_value = ( "Use SQLite.", 0.75, + 1.0, "[model-a]: PostgreSQL would be better for scale.", 0.01, ) @@ -119,7 +121,7 @@ def test_no_dissent_when_none( from duh.config.schema import DuhConfig mock_config.return_value = DuhConfig() - mock_run.return_value = ("Answer.", 1.0, None, 0.0) + mock_run.return_value = ("Answer.", 1.0, 1.0, None, 0.0) result = runner.invoke(cli, ["ask", "Question?"]) @@ -138,7 +140,7 @@ def test_rounds_option( config = DuhConfig() mock_config.return_value = config - mock_run.return_value = ("Answer.", 1.0, None, 0.0) + mock_run.return_value = ("Answer.", 1.0, 1.0, None, 0.0) result = runner.invoke(cli, ["ask", "--rounds", "5", "Question?"]) @@ -432,7 +434,7 @@ async def _seed() -> str: assert "Use PostgreSQL" in result.output assert "[CHALLENGER] mock:challenger-1" in result.output assert "SQLite is simpler" in result.output - assert "Decision (confidence 85%)" in result.output + assert "Decision (confidence 85%, rigor 0%)" in result.output assert "Use SQLite for v0.1." in result.output assert "Dissent: PostgreSQL for future scale." in result.output asyncio.run(engine.dispose()) @@ -639,7 +641,7 @@ def test_ask_full_loop(self, runner: CliRunner) -> None: async def fake_ask( question: str, cfg: Any, **kwargs: Any - ) -> tuple[str, float, str | None, float]: + ) -> tuple[str, float, float, str | None, float]: pm = ProviderManager() await pm.register(provider) from duh.cli.app import _run_consensus diff --git a/tests/unit/test_cli_batch.py b/tests/unit/test_cli_batch.py index 0f35a94..b9c2956 100644 --- a/tests/unit/test_cli_batch.py +++ b/tests/unit/test_cli_batch.py @@ -452,10 +452,10 @@ async def fake_consensus( pm: Any, display: Any = None, tool_registry: Any = None, - ) -> tuple[str, float, str | None, float]: + ) -> tuple[str, float, float, str | None, float]: nonlocal consensus_called consensus_called = True - return ("Use SQLite.", 0.85, None, 0.01) + return ("Use SQLite.", 0.85, 1.0, None, 0.01) with ( patch("duh.cli.app.load_config", return_value=config), @@ -546,8 +546,8 @@ async def fake_consensus( pm: Any, display: Any = None, tool_registry: Any = None, - ) -> tuple[str, float, str | None, float]: - return ("Answer.", 0.9, None, 0.01) + ) -> tuple[str, float, float, str | None, float]: + return ("Answer.", 0.9, 1.0, None, 0.01) with ( patch("duh.cli.app.load_config", return_value=config), @@ -601,12 +601,12 @@ async def fake_consensus( pm: Any, display: Any = None, tool_registry: Any = None, - ) -> tuple[str, float, str | None, float]: + ) -> tuple[str, float, float, str | None, float]: nonlocal call_count call_count += 1 if question == "Q2": raise RuntimeError("Provider timeout") - return ("Answer.", 0.9, None, 0.01) + return ("Answer.", 0.9, 1.0, None, 0.01) with ( patch("duh.cli.app.load_config", return_value=config), @@ -650,10 +650,10 @@ async def fake_consensus( pm: Any, display: Any = None, tool_registry: Any = None, - ) -> tuple[str, float, str | None, float]: + ) -> tuple[str, float, float, str | None, float]: if question == "Q2": raise RuntimeError("Model unavailable") - return ("Answer.", 0.9, None, 0.01) + return ("Answer.", 0.9, 1.0, None, 0.01) with ( patch("duh.cli.app.load_config", return_value=config), diff --git a/tests/unit/test_cli_calibration.py b/tests/unit/test_cli_calibration.py new file mode 100644 index 0000000..e2618de --- /dev/null +++ b/tests/unit/test_cli_calibration.py @@ -0,0 +1,154 @@ +"""Tests for the duh calibration CLI command.""" + +from __future__ import annotations + +import asyncio +from typing import Any +from unittest.mock import AsyncMock, patch + +import pytest +from click.testing import CliRunner + +from duh.cli.app import cli + + +@pytest.fixture +def runner() -> CliRunner: + return CliRunner() + + +# ── DB helpers (same pattern as test_cli_export.py) ────────────────── + + +def _make_db() -> tuple[Any, Any]: + """Create an in-memory SQLite engine + sessionmaker synchronously.""" + from sqlalchemy import event + from sqlalchemy.ext.asyncio import async_sessionmaker, create_async_engine + from sqlalchemy.pool import StaticPool + + engine = create_async_engine( + "sqlite+aiosqlite://", + poolclass=StaticPool, + connect_args={"check_same_thread": False}, + ) + + @event.listens_for(engine.sync_engine, "connect") + def _enable_fks(dbapi_conn, connection_record): # type: ignore[no-untyped-def] + cursor = dbapi_conn.cursor() + cursor.execute("PRAGMA foreign_keys=ON") + cursor.close() + + asyncio.run(_init_tables(engine)) + factory = async_sessionmaker(engine, expire_on_commit=False) + return factory, engine + + +async def _init_tables(engine: Any) -> None: + from duh.memory.models import Base + + async with engine.begin() as conn: + await conn.run_sync(Base.metadata.create_all) + + +def _mem_config() -> Any: + from duh.config.schema import DuhConfig + + return DuhConfig( + database={"url": "sqlite+aiosqlite://"}, # type: ignore[arg-type] + ) + + +async def _seed_decision_with_outcome( + factory: Any, + confidence: float, + outcome_result: str | None = None, +) -> str: + """Seed a thread + turn + decision, optionally with an outcome.""" + from duh.memory.repository import MemoryRepository + + async with factory() as session: + repo = MemoryRepository(session) + thread = await repo.create_thread("Test question") + turn = await repo.create_turn(thread.id, 1, "COMMIT") + decision = await repo.save_decision( + turn.id, thread.id, "Some decision", confidence + ) + if outcome_result is not None: + await repo.save_outcome(decision.id, thread.id, outcome_result) + await session.commit() + return thread.id + + +# ── Tests ──────────────────────────────────────────────────────── + + +class TestCalibrationCLI: + def test_no_decisions(self, runner: CliRunner) -> None: + factory, engine = _make_db() + config = _mem_config() + + with ( + patch("duh.cli.app._load_config", return_value=config), + patch("duh.cli.app._create_db", new_callable=AsyncMock) as mock_db, + ): + mock_db.return_value = (factory, engine) + result = runner.invoke(cli, ["calibration"]) + + assert result.exit_code == 0 + assert "No decisions found" in result.output + + def test_with_outcomes(self, runner: CliRunner) -> None: + factory, engine = _make_db() + config = _mem_config() + + # Seed some decisions with outcomes + asyncio.run(_seed_decision_with_outcome(factory, 0.9, "success")) + asyncio.run(_seed_decision_with_outcome(factory, 0.9, "success")) + asyncio.run(_seed_decision_with_outcome(factory, 0.5, "failure")) + + with ( + patch("duh.cli.app._load_config", return_value=config), + patch("duh.cli.app._create_db", new_callable=AsyncMock) as mock_db, + ): + mock_db.return_value = (factory, engine) + result = runner.invoke(cli, ["calibration"]) + + assert result.exit_code == 0 + assert "Total decisions: 3" in result.output + assert "With outcomes: 3" in result.output + assert "ECE:" in result.output + assert "Calibration:" in result.output + + def test_without_outcomes(self, runner: CliRunner) -> None: + factory, engine = _make_db() + config = _mem_config() + + # Seed decisions without outcomes + asyncio.run(_seed_decision_with_outcome(factory, 0.8)) + asyncio.run(_seed_decision_with_outcome(factory, 0.6)) + + with ( + patch("duh.cli.app._load_config", return_value=config), + patch("duh.cli.app._create_db", new_callable=AsyncMock) as mock_db, + ): + mock_db.return_value = (factory, engine) + result = runner.invoke(cli, ["calibration"]) + + assert result.exit_code == 0 + assert "Total decisions: 2" in result.output + assert "With outcomes: 0" in result.output + assert "Overall accuracy: 0.0%" in result.output + + def test_category_filter(self, runner: CliRunner) -> None: + factory, engine = _make_db() + config = _mem_config() + + with ( + patch("duh.cli.app._load_config", return_value=config), + patch("duh.cli.app._create_db", new_callable=AsyncMock) as mock_db, + ): + mock_db.return_value = (factory, engine) + result = runner.invoke(cli, ["calibration", "--category", "tech"]) + + assert result.exit_code == 0 + assert "No decisions found" in result.output diff --git a/tests/unit/test_cli_decompose.py b/tests/unit/test_cli_decompose.py index 52b0bfa..6bb3764 100644 --- a/tests/unit/test_cli_decompose.py +++ b/tests/unit/test_cli_decompose.py @@ -225,7 +225,7 @@ def test_complete_decompose_flow(self) -> None: strategy="merge", ) display.show_synthesis(synthesis) - display.show_final_decision("Use SQLite for v0.1.", 0.85, 0.042, None) + display.show_final_decision("Use SQLite for v0.1.", 0.85, 1.0, 0.042, None) out = _output(buf) assert "DECOMPOSE" in out diff --git a/tests/unit/test_cli_display.py b/tests/unit/test_cli_display.py index 21e97db..5c72456 100644 --- a/tests/unit/test_cli_display.py +++ b/tests/unit/test_cli_display.py @@ -230,20 +230,20 @@ def test_truncates_long_content(self) -> None: class TestShowCommit: def test_shows_confidence(self) -> None: display, buf = _make_display() - display.show_commit(0.85, "Some dissent here.") + display.show_commit(0.85, 1.0, "Some dissent here.") out = _output(buf) assert "COMMIT" in out assert "85%" in out def test_shows_no_dissent_marker(self) -> None: display, buf = _make_display() - display.show_commit(1.0, None) + display.show_commit(1.0, 1.0, None) out = _output(buf) assert "no dissent" in out def test_confidence_formatting(self) -> None: display, buf = _make_display() - display.show_commit(0.5, "dissent text") + display.show_commit(0.5, 1.0, "dissent text") out = _output(buf) assert "50%" in out @@ -291,14 +291,14 @@ def test_footer_different_values(self) -> None: class TestShowFinalDecision: def test_shows_decision_text(self) -> None: display, buf = _make_display() - display.show_final_decision("Use SQLite for v0.1.", 0.85, 0.0042, None) + display.show_final_decision("Use SQLite for v0.1.", 0.85, 1.0, 0.0042, None) out = _output(buf) assert "Use SQLite for v0.1." in out assert "Decision" in out def test_shows_confidence_and_cost(self) -> None: display, buf = _make_display() - display.show_final_decision("Answer.", 1.0, 0.0042, None) + display.show_final_decision("Answer.", 1.0, 1.0, 0.0042, None) out = _output(buf) assert "Confidence: 100%" in out assert "Cost: $0.0042" in out @@ -308,6 +308,7 @@ def test_shows_dissent_when_present(self) -> None: display.show_final_decision( "Answer.", 0.75, + 1.0, 0.01, "[model-a]: PostgreSQL would be better for scale.", ) @@ -317,14 +318,14 @@ def test_shows_dissent_when_present(self) -> None: def test_no_dissent_panel_when_none(self) -> None: display, buf = _make_display() - display.show_final_decision("Answer.", 1.0, 0.0, None) + display.show_final_decision("Answer.", 1.0, 1.0, 0.0, None) out = _output(buf) assert "Dissent" not in out def test_decision_not_truncated(self) -> None: display, buf = _make_display() long_decision = "x" * 1000 - display.show_final_decision(long_decision, 0.9, 0.05, None) + display.show_final_decision(long_decision, 0.9, 1.0, 0.05, None) out = _output(buf) # Final decision should NOT be truncated assert "..." not in out @@ -356,10 +357,10 @@ def test_complete_round_display(self) -> None: ] ) display.show_revise("mock:model-a", "Revised with challenges.") - display.show_commit(0.75, "Some dissent.") + display.show_commit(0.75, 1.0, "Some dissent.") display.round_footer(1, 2, 3, 0.05) display.show_final_decision( - "Final consensus answer.", 0.75, 0.05, "Some dissent." + "Final consensus answer.", 0.75, 1.0, 0.05, "Some dissent." ) out = _output(buf) diff --git a/tests/unit/test_cli_tools.py b/tests/unit/test_cli_tools.py index 1973dca..3e24d94 100644 --- a/tests/unit/test_cli_tools.py +++ b/tests/unit/test_cli_tools.py @@ -242,7 +242,7 @@ def test_tools_enabled_passes_registry( config = DuhConfig(tools=ToolsConfig(enabled=True)) mock_config.return_value = config mock_providers.return_value.list_all_models.return_value = ["model1"] - mock_consensus.return_value = ("Answer", 0.9, None, 0.01) + mock_consensus.return_value = ("Answer", 0.9, 1.0, None, 0.01) runner.invoke(cli, ["ask", "test question"]) @@ -263,7 +263,7 @@ def test_tools_disabled_passes_none( config = DuhConfig(tools=ToolsConfig(enabled=False)) mock_config.return_value = config mock_providers.return_value.list_all_models.return_value = ["model1"] - mock_consensus.return_value = ("Answer", 0.9, None, 0.01) + mock_consensus.return_value = ("Answer", 0.9, 1.0, None, 0.01) runner.invoke(cli, ["ask", "test question"]) diff --git a/tests/unit/test_cli_voting.py b/tests/unit/test_cli_voting.py index 02b031d..a57b303 100644 --- a/tests/unit/test_cli_voting.py +++ b/tests/unit/test_cli_voting.py @@ -147,7 +147,7 @@ def test_default_protocol_is_consensus( from duh.config.schema import DuhConfig mock_config.return_value = DuhConfig() - mock_run.return_value = ("Answer.", 1.0, None, 0.0) + mock_run.return_value = ("Answer.", 1.0, 1.0, None, 0.0) result = runner.invoke(cli, ["ask", "Question?"]) assert result.exit_code == 0 @@ -293,7 +293,7 @@ async def _seed() -> str: assert "Use Django" in result.output assert "mock:model-b" in result.output assert "Use FastAPI" in result.output - assert "Decision (confidence 80%)" in result.output + assert "Decision (confidence 80%, rigor 0%)" in result.output asyncio.run(engine.dispose()) def test_show_without_votes(self, runner: CliRunner) -> None: diff --git a/tests/unit/test_commit_handler.py b/tests/unit/test_commit_handler.py index df56f1d..9351288 100644 --- a/tests/unit/test_commit_handler.py +++ b/tests/unit/test_commit_handler.py @@ -1,4 +1,4 @@ -"""Tests for the COMMIT handler: confidence, dissent, context.""" +"""Tests for the COMMIT handler: rigor, confidence, dissent, context.""" from __future__ import annotations @@ -7,7 +7,7 @@ import pytest from duh.consensus.handlers import ( - _compute_confidence, + _compute_rigor, _extract_dissent, handle_commit, ) @@ -61,30 +61,30 @@ def _commit_ctx(**kwargs: object) -> ConsensusContext: # ── Confidence computation ─────────────────────────────────────── -class TestComputeConfidence: +class TestComputeRigor: def test_all_genuine(self) -> None: challenges = [ ChallengeResult("m1", "real issue"), ChallengeResult("m2", "another issue"), ] - assert _compute_confidence(challenges) == 1.0 + assert _compute_rigor(challenges) == 1.0 def test_all_sycophantic(self) -> None: challenges = [ ChallengeResult("m1", "great answer", sycophantic=True), ChallengeResult("m2", "looks good", sycophantic=True), ] - assert _compute_confidence(challenges) == 0.5 + assert _compute_rigor(challenges) == 0.5 def test_mixed(self) -> None: challenges = [ ChallengeResult("m1", "real issue"), ChallengeResult("m2", "great answer", sycophantic=True), ] - assert _compute_confidence(challenges) == 0.75 + assert _compute_rigor(challenges) == 0.75 def test_empty(self) -> None: - assert _compute_confidence([]) == 0.5 + assert _compute_rigor([]) == 0.5 # ── Dissent extraction ─────────────────────────────────────────── @@ -147,12 +147,20 @@ async def test_decision_equals_revision(self) -> None: assert ctx.decision == revision - async def test_confidence_computed(self) -> None: + async def test_rigor_computed(self) -> None: ctx = _commit_ctx() # Default challenges are all genuine await handle_commit(ctx) - assert ctx.confidence == 1.0 + assert ctx.rigor == 1.0 + + async def test_confidence_capped_by_domain(self) -> None: + ctx = _commit_ctx() + # All genuine → rigor=1.0, but no pm → no classification → cap=0.85 + await handle_commit(ctx) + + assert ctx.rigor == 1.0 + assert ctx.confidence == 0.85 # min(0.85, 1.0) async def test_confidence_with_sycophantic(self) -> None: ctx = _commit_ctx() @@ -162,7 +170,8 @@ async def test_confidence_with_sycophantic(self) -> None: ] await handle_commit(ctx) - assert ctx.confidence == 0.75 + assert ctx.rigor == 0.75 + assert ctx.confidence == 0.75 # min(0.85, 0.75) = rigor is lower async def test_dissent_preserved(self) -> None: ctx = _commit_ctx() @@ -235,7 +244,8 @@ async def test_full_commit_flow(self) -> None: await handle_commit(ctx) assert ctx.decision == "Use SQLite instead" - assert ctx.confidence == 1.0 + assert ctx.rigor == 1.0 + assert ctx.confidence == 0.85 # no pm → default cap assert ctx.dissent is not None assert "Too complex" in ctx.dissent @@ -295,6 +305,7 @@ async def test_decision_db_round_trip(self, db_session: AsyncSession) -> None: thread_id=thread.id, content=ctx.decision or "", confidence=ctx.confidence, + rigor=ctx.rigor, dissent=ctx.dissent, ) await db_session.commit() @@ -305,6 +316,7 @@ async def test_decision_db_round_trip(self, db_session: AsyncSession) -> None: loaded = decisions[0] assert loaded.content == ctx.decision assert loaded.confidence == ctx.confidence + assert loaded.rigor == ctx.rigor assert loaded.dissent == ctx.dissent assert loaded.turn_id == turn.id assert loaded.thread_id == thread.id diff --git a/tests/unit/test_confidence_scoring.py b/tests/unit/test_confidence_scoring.py new file mode 100644 index 0000000..675e602 --- /dev/null +++ b/tests/unit/test_confidence_scoring.py @@ -0,0 +1,163 @@ +"""Tests for the epistemic confidence scoring system. + +Tests the renamed _compute_rigor(), new _domain_cap(), and the +combined confidence = min(domain_cap, rigor) formula. +""" + +from __future__ import annotations + +import pytest + +from duh.consensus.handlers import ( + DOMAIN_CAPS, + _compute_rigor, + _domain_cap, +) +from duh.consensus.machine import ChallengeResult + +# ── Rigor computation (renamed from _compute_confidence) ───── + + +class TestComputeRigor: + def test_all_genuine(self) -> None: + challenges = [ + ChallengeResult("m1", "real issue"), + ChallengeResult("m2", "another issue"), + ] + assert _compute_rigor(challenges) == 1.0 + + def test_all_sycophantic(self) -> None: + challenges = [ + ChallengeResult("m1", "great", sycophantic=True), + ChallengeResult("m2", "good", sycophantic=True), + ] + assert _compute_rigor(challenges) == 0.5 + + def test_mixed(self) -> None: + challenges = [ + ChallengeResult("m1", "real issue"), + ChallengeResult("m2", "great", sycophantic=True), + ] + assert _compute_rigor(challenges) == 0.75 + + def test_empty(self) -> None: + assert _compute_rigor([]) == 0.5 + + def test_range_always_half_to_one(self) -> None: + for n_genuine in range(5): + for n_syc in range(5): + if n_genuine + n_syc == 0: + continue + challenges = [ + ChallengeResult(f"g{i}", "issue") for i in range(n_genuine) + ] + [ + ChallengeResult(f"s{i}", "good", sycophantic=True) + for i in range(n_syc) + ] + rigor = _compute_rigor(challenges) + assert 0.5 <= rigor <= 1.0, f"{n_genuine}g/{n_syc}s -> {rigor}" + + +# ── Domain cap lookup ──────────────────────────────────────── + + +class TestDomainCap: + def test_factual(self) -> None: + assert _domain_cap("factual") == 0.95 + + def test_technical(self) -> None: + assert _domain_cap("technical") == 0.90 + + def test_creative(self) -> None: + assert _domain_cap("creative") == 0.85 + + def test_judgment(self) -> None: + assert _domain_cap("judgment") == 0.80 + + def test_strategic(self) -> None: + assert _domain_cap("strategic") == 0.70 + + def test_unknown_intent(self) -> None: + assert _domain_cap("nonexistent") == 0.85 + + def test_none_intent(self) -> None: + assert _domain_cap(None) == 0.85 + + def test_all_caps_below_one(self) -> None: + for intent, cap in DOMAIN_CAPS.items(): + assert cap < 1.0, f"{intent} cap {cap} >= 1.0" + + def test_all_caps_above_zero(self) -> None: + for intent, cap in DOMAIN_CAPS.items(): + assert cap > 0.0, f"{intent} cap {cap} <= 0.0" + + +# ── Combined epistemic confidence ──────────────────────────── + + +class TestEpistemicConfidence: + """Test the formula: confidence = min(domain_cap, rigor).""" + + def test_factual_all_genuine(self) -> None: + """Capital of France: rigor=1.0, cap=0.95 -> confidence=0.95.""" + rigor = _compute_rigor( + [ + ChallengeResult("m1", "real issue"), + ChallengeResult("m2", "another issue"), + ] + ) + assert rigor == 1.0 + cap = _domain_cap("factual") + confidence = min(cap, rigor) + assert confidence == 0.95 + + def test_strategic_all_genuine(self) -> None: + """Will X happen by 2035: rigor=1.0, cap=0.70 -> confidence=0.70.""" + rigor = _compute_rigor( + [ + ChallengeResult("m1", "real issue"), + ChallengeResult("m2", "another issue"), + ] + ) + assert rigor == 1.0 + cap = _domain_cap("strategic") + confidence = min(cap, rigor) + assert confidence == 0.70 + + def test_rigor_below_cap(self) -> None: + """When rigor < cap, confidence = rigor.""" + rigor = _compute_rigor( + [ + ChallengeResult("m1", "issue"), + ChallengeResult("m2", "great", sycophantic=True), + ] + ) + assert rigor == 0.75 + cap = _domain_cap("factual") + confidence = min(cap, rigor) + assert confidence == 0.75 # rigor is the binding constraint + + def test_unknown_domain_capped(self) -> None: + """Unknown intent uses default cap of 0.85.""" + rigor = 1.0 + cap = _domain_cap(None) + assert min(cap, rigor) == 0.85 + + @pytest.mark.parametrize( + ("intent", "expected_cap"), + [ + ("factual", 0.95), + ("technical", 0.90), + ("creative", 0.85), + ("judgment", 0.80), + ("strategic", 0.70), + (None, 0.85), + ], + ) + def test_max_confidence_per_intent( + self, intent: str | None, expected_cap: float + ) -> None: + """With perfect rigor, confidence equals the domain cap.""" + rigor = 1.0 # all genuine challenges + confidence = min(_domain_cap(intent), rigor) + assert confidence == expected_cap diff --git a/tests/unit/test_context_builder.py b/tests/unit/test_context_builder.py index 507021b..44adae7 100644 --- a/tests/unit/test_context_builder.py +++ b/tests/unit/test_context_builder.py @@ -30,12 +30,14 @@ def __init__( self, content: str, confidence: float = 1.0, + rigor: float = 0.0, dissent: str | None = None, ) -> None: _FakeDecision._counter += 1 self.id = f"fake-decision-{_FakeDecision._counter}" self.content = content self.confidence = confidence + self.rigor = rigor self.dissent = dissent @@ -93,7 +95,7 @@ def test_decision_confidence_formatted(self) -> None: _FakeDecision("Use Redis", confidence=0.75), ] result = build_context(None, decisions) # type: ignore[arg-type] - assert "75% confidence" in result + assert "75% confidence, 0% rigor" in result def test_decisions_with_dissent(self) -> None: decisions = [ diff --git a/tests/unit/test_mcp_server.py b/tests/unit/test_mcp_server.py index 4d28aa2..01dd139 100644 --- a/tests/unit/test_mcp_server.py +++ b/tests/unit/test_mcp_server.py @@ -177,7 +177,7 @@ async def test_consensus_protocol(self) -> None: patch( "duh.cli.app._run_consensus", new_callable=AsyncMock, - return_value=("Use SQLite.", 0.9, "minor dissent", 0.05), + return_value=("Use SQLite.", 0.9, 1.0, "minor dissent", 0.05), ), ): result = await _handle_ask({"question": "What DB?", "rounds": 2}) @@ -185,6 +185,7 @@ async def test_consensus_protocol(self) -> None: data = json.loads(result[0].text) assert data["decision"] == "Use SQLite." assert data["confidence"] == 0.9 + assert data["rigor"] == 1.0 assert data["dissent"] == "minor dissent" assert data["cost"] == 0.05 @@ -205,6 +206,7 @@ class FakeAggregation: decision: str strategy: str confidence: float + rigor: float = 0.5 fake_result = FakeAggregation( votes=(FakeVote("m1", "Use X", 0.9),), diff --git a/tests/unit/test_scheduler.py b/tests/unit/test_scheduler.py index 8394dd1..5596c66 100644 --- a/tests/unit/test_scheduler.py +++ b/tests/unit/test_scheduler.py @@ -75,12 +75,13 @@ async def test_returns_decision_and_confidence(self) -> None: pm = ProviderManager() await pm.register(provider) - decision, confidence = await _run_mini_consensus( + decision, confidence, rigor = await _run_mini_consensus( "What database should I use?", pm ) assert isinstance(decision, str) assert len(decision) > 0 assert 0.0 <= confidence <= 1.0 + assert 0.5 <= rigor <= 1.0 async def test_runs_all_four_phases(self) -> None: from duh.providers.manager import ProviderManager diff --git a/web/src/App.tsx b/web/src/App.tsx index eec04fd..b7e9acb 100644 --- a/web/src/App.tsx +++ b/web/src/App.tsx @@ -6,6 +6,7 @@ import { ThreadsPage, ThreadDetailPage, DecisionSpacePage, + CalibrationPage, PreferencesPage, SharePage, } from '@/pages' @@ -21,6 +22,7 @@ export function App() { } /> } /> } /> + } /> } /> diff --git a/web/src/__tests__/consensus-components.test.tsx b/web/src/__tests__/consensus-components.test.tsx index 5262284..5109027 100644 --- a/web/src/__tests__/consensus-components.test.tsx +++ b/web/src/__tests__/consensus-components.test.tsx @@ -232,6 +232,7 @@ describe('generateExportMarkdown', () => { reviser: 'anthropic:claude-3', revision: 'Use SQLite for v0.1.', confidence: 0.85, + rigor: 0.78, dissent: 'PostgreSQL for scale.', }, ] @@ -241,6 +242,7 @@ describe('generateExportMarkdown', () => { 'Best database?', 'Use SQLite.', 0.85, + 0.78, 'PostgreSQL for scale.', 0.003, rounds, @@ -252,6 +254,7 @@ describe('generateExportMarkdown', () => { expect(md).toContain('## Decision') expect(md).toContain('Use SQLite.') expect(md).toContain('Confidence: 85%') + expect(md).toContain('Rigor: 78%') expect(md).toContain('## Dissent') expect(md).toContain('PostgreSQL for scale.') expect(md).not.toContain('## Consensus Process') @@ -263,6 +266,7 @@ describe('generateExportMarkdown', () => { 'Best database?', 'Use SQLite.', 0.85, + 0.78, 'PostgreSQL for scale.', 0.003, rounds, @@ -285,6 +289,7 @@ describe('generateExportMarkdown', () => { 'Best database?', 'Use SQLite.', 0.85, + 0.78, 'PostgreSQL for scale.', 0.003, rounds, @@ -302,6 +307,7 @@ describe('generateExportMarkdown', () => { 'Question', 'Answer', 0.9, + 0.82, null, 0.0512, [], @@ -317,6 +323,7 @@ describe('generateExportMarkdown', () => { null, 'Answer', 0.9, + 0.82, null, null, [], diff --git a/web/src/__tests__/stores.test.ts b/web/src/__tests__/stores.test.ts index e13124d..ead0eb0 100644 --- a/web/src/__tests__/stores.test.ts +++ b/web/src/__tests__/stores.test.ts @@ -19,6 +19,7 @@ vi.mock('@/api/client', () => ({ recall: vi.fn(), feedback: vi.fn(), decisionSpace: vi.fn(), + calibration: vi.fn(), }, })) @@ -26,6 +27,7 @@ import { useConsensusStore } from '@/stores/consensus' import { useThreadsStore } from '@/stores/threads' import { usePreferencesStore } from '@/stores/preferences' import { useDecisionSpaceStore } from '@/stores/decision-space' +import { useCalibrationStore } from '@/stores/calibration' import { api } from '@/api/client' const mockedApi = vi.mocked(api) @@ -46,6 +48,7 @@ describe('useConsensusStore', () => { expect(state.rounds).toEqual([]) expect(state.decision).toBeNull() expect(state.confidence).toBeNull() + expect(state.rigor).toBeNull() expect(state.dissent).toBeNull() expect(state.cost).toBeNull() }) @@ -317,6 +320,7 @@ describe('useDecisionSpaceStore', () => { thread_id: 't1', question: 'Q1', confidence: 0.85, + rigor: 0.72, intent: null, category: 'tech', genus: null, @@ -390,3 +394,82 @@ describe('useDecisionSpaceStore', () => { expect(useDecisionSpaceStore.getState().timelineSpeed).toBe(4) }) }) + +// ── Calibration Store ──────────────────────────────────── + +describe('useCalibrationStore', () => { + beforeEach(() => { + vi.clearAllMocks() + useCalibrationStore.setState({ + buckets: [], + totalDecisions: 0, + totalWithOutcomes: 0, + overallAccuracy: 0, + ece: 0, + loading: false, + error: null, + category: null, + }) + }) + + it('has correct initial state', () => { + const state = useCalibrationStore.getState() + expect(state.buckets).toEqual([]) + expect(state.totalDecisions).toBe(0) + expect(state.totalWithOutcomes).toBe(0) + expect(state.overallAccuracy).toBe(0) + expect(state.ece).toBe(0) + expect(state.loading).toBe(false) + expect(state.error).toBeNull() + expect(state.category).toBeNull() + }) + + it('fetchCalibration populates data', async () => { + const mockData = { + buckets: [ + { + range_lo: 0.0, + range_hi: 0.1, + count: 0, + with_outcomes: 0, + success: 0, + failure: 0, + partial: 0, + accuracy: 0, + mean_confidence: 0.05, + }, + ], + total_decisions: 5, + total_with_outcomes: 3, + overall_accuracy: 0.75, + ece: 0.08, + } + mockedApi.calibration.mockResolvedValue(mockData) + + await useCalibrationStore.getState().fetchCalibration() + const state = useCalibrationStore.getState() + expect(state.totalDecisions).toBe(5) + expect(state.totalWithOutcomes).toBe(3) + expect(state.overallAccuracy).toBe(0.75) + expect(state.ece).toBe(0.08) + expect(state.buckets).toEqual(mockData.buckets) + expect(state.loading).toBe(false) + }) + + it('fetchCalibration handles errors', async () => { + mockedApi.calibration.mockRejectedValue(new Error('Server error')) + + await useCalibrationStore.getState().fetchCalibration() + const state = useCalibrationStore.getState() + expect(state.error).toBe('Server error') + expect(state.loading).toBe(false) + }) + + it('setCategory updates category filter', () => { + useCalibrationStore.getState().setCategory('tech') + expect(useCalibrationStore.getState().category).toBe('tech') + + useCalibrationStore.getState().setCategory(null) + expect(useCalibrationStore.getState().category).toBeNull() + }) +}) diff --git a/web/src/api/client.ts b/web/src/api/client.ts index 094398a..9ded7df 100644 --- a/web/src/api/client.ts +++ b/web/src/api/client.ts @@ -1,6 +1,7 @@ import type { AskRequest, AskResponse, + CalibrationResponse, CostResponse, DecisionSpaceResponse, FeedbackRequest, @@ -110,6 +111,21 @@ export const api = { return request(`/decisions/space${suffix}`) }, + calibration(params?: { + category?: string + since?: string + until?: string + }): Promise { + const qs = new URLSearchParams() + if (params) { + for (const [k, v] of Object.entries(params)) { + if (v != null) qs.set(k, String(v)) + } + } + const suffix = qs.toString() ? `?${qs}` : '' + return request(`/calibration${suffix}`) + }, + getShare(shareToken: string): Promise { return request(`/share/${encodeURIComponent(shareToken)}`) }, diff --git a/web/src/api/types.ts b/web/src/api/types.ts index 0fcb67d..26ed5d6 100644 --- a/web/src/api/types.ts +++ b/web/src/api/types.ts @@ -19,6 +19,7 @@ export interface FeedbackRequest { export interface AskResponse { decision: string confidence: number + rigor: number dissent: string | null cost: number thread_id: string | null @@ -49,6 +50,7 @@ export interface Contribution { export interface Decision { content: string confidence: number + rigor: number dissent: string | null } @@ -130,6 +132,7 @@ export interface SpaceDecision { thread_id: string question: string confidence: number + rigor: number intent: string | null category: string | null genus: string | null @@ -148,6 +151,28 @@ export interface DecisionSpaceResponse { total: number } +// ── Calibration types ──────────────────────────────────── + +export interface CalibrationBucket { + range_lo: number + range_hi: number + count: number + with_outcomes: number + success: number + failure: number + partial: number + accuracy: number + mean_confidence: number +} + +export interface CalibrationResponse { + buckets: CalibrationBucket[] + total_decisions: number + total_with_outcomes: number + overall_accuracy: number + ece: number +} + // ── WebSocket event types ───────────────────────────────── export type WSEventType = @@ -183,6 +208,7 @@ export interface WSChallenge { export interface WSCommit { type: 'commit' confidence: number + rigor: number dissent: string | null round: number } @@ -191,6 +217,7 @@ export interface WSComplete { type: 'complete' decision: string confidence: number + rigor: number dissent: string | null cost: number thread_id: string | null diff --git a/web/src/components/calibration/CalibrationDashboard.tsx b/web/src/components/calibration/CalibrationDashboard.tsx new file mode 100644 index 0000000..502a25e --- /dev/null +++ b/web/src/components/calibration/CalibrationDashboard.tsx @@ -0,0 +1,243 @@ +import { useEffect } from 'react' +import { useCalibrationStore } from '@/stores' + +function eceRating(ece: number): { label: string; color: string } { + if (ece < 0.05) return { label: 'Excellent', color: 'var(--color-success, #22c55e)' } + if (ece < 0.1) return { label: 'Good', color: 'var(--color-primary)' } + if (ece < 0.2) return { label: 'Fair', color: 'var(--color-warning, #eab308)' } + return { label: 'Poor', color: 'var(--color-error, #ef4444)' } +} + +export function CalibrationDashboard() { + const { + buckets, + totalDecisions, + totalWithOutcomes, + overallAccuracy, + ece, + loading, + error, + fetchCalibration, + } = useCalibrationStore() + + useEffect(() => { + fetchCalibration() + }, [fetchCalibration]) + + if (loading) { + return ( +
    + Loading calibration data... +
    + ) + } + + if (error) { + return ( +
    + Error: {error} +
    + ) + } + + const rating = eceRating(ece) + + return ( +
    +

    + Confidence Calibration +

    +

    + Are confidence scores accurate? Compare predicted confidence against actual + outcomes. +

    + + {/* Metric cards */} +
    + + + 0 ? `${(overallAccuracy * 100).toFixed(1)}%` : '-'} + /> + 0 ? ece.toFixed(4) : '-'} + sublabel={totalWithOutcomes > 0 ? rating.label : undefined} + sublabelColor={totalWithOutcomes > 0 ? rating.color : undefined} + /> +
    + + {totalWithOutcomes === 0 && ( +
    + No outcomes recorded yet. Use{' '} + + duh feedback + {' '} + to record outcomes for your decisions. +
    + )} + + {totalWithOutcomes > 0 && ( + <> + {/* Calibration chart */} +
    +

    + Calibration Chart +

    +
    + {buckets.map((b) => { + const lo = Math.round(b.range_lo * 100) + const hi = Math.round(b.range_hi * 100) + const label = `${lo}-${hi}%` + const accWidth = + b.with_outcomes > 0 ? (b.accuracy * 100).toFixed(0) : '0' + const confWidth = (b.mean_confidence * 100).toFixed(0) + return ( +
    + + {label} + +
    + {/* Accuracy bar */} + {b.with_outcomes > 0 && ( +
    + )} + {/* Perfect calibration line */} +
    +
    + + {b.with_outcomes > 0 + ? `${(b.accuracy * 100).toFixed(0)}% (n=${b.with_outcomes})` + : '-'} + +
    + ) + })} +
    +
    + + + Actual accuracy + + + + Expected (mean confidence) + +
    +
    + + {/* Bucket table */} +
    +

    + Bucket Details +

    + + + + + + + + + + + + + + + + {buckets + .filter((b) => b.count > 0) + .map((b) => { + const lo = Math.round(b.range_lo * 100) + const hi = Math.round(b.range_hi * 100) + const gap = + b.with_outcomes > 0 + ? Math.abs(b.accuracy - b.mean_confidence) + : null + + return ( + + + + + + + + + + + + ) + })} + +
    RangeCountOutcomesSuccessPartialFailureAccuracyConfidenceGap
    {lo}-{hi}%{b.count}{b.with_outcomes}{b.success}{b.partial}{b.failure} + {b.with_outcomes > 0 + ? `${(b.accuracy * 100).toFixed(1)}%` + : '-'} + + {(b.mean_confidence * 100).toFixed(1)}% + + {gap != null ? `${(gap * 100).toFixed(1)}%` : '-'} +
    +
    + + )} +
    + ) +} + +function MetricCard({ + label, + value, + sublabel, + sublabelColor, +}: { + label: string + value: string + sublabel?: string + sublabelColor?: string +}) { + return ( +
    +
    + {label} +
    +
    + {value} +
    + {sublabel && ( +
    + {sublabel} +
    + )} +
    + ) +} diff --git a/web/src/components/calibration/index.ts b/web/src/components/calibration/index.ts new file mode 100644 index 0000000..2a81cc6 --- /dev/null +++ b/web/src/components/calibration/index.ts @@ -0,0 +1 @@ +export { CalibrationDashboard } from './CalibrationDashboard' diff --git a/web/src/components/consensus/ConfidenceMeter.tsx b/web/src/components/consensus/ConfidenceMeter.tsx index 865bdde..4971ca6 100644 --- a/web/src/components/consensus/ConfidenceMeter.tsx +++ b/web/src/components/consensus/ConfidenceMeter.tsx @@ -3,6 +3,7 @@ import { useEffect, useState } from 'react' interface ConfidenceMeterProps { value: number size?: number + label?: string } function getColor(value: number): string { @@ -12,7 +13,7 @@ function getColor(value: number): string { return 'var(--color-green)' } -export function ConfidenceMeter({ value, size = 64 }: ConfidenceMeterProps) { +export function ConfidenceMeter({ value, size = 64, label }: ConfidenceMeterProps) { const radius = (size - 8) / 2 const circumference = 2 * Math.PI * radius const targetOffset = circumference * (1 - value) @@ -54,6 +55,11 @@ export function ConfidenceMeter({ value, size = 64 }: ConfidenceMeterProps) { {(value * 100).toFixed(0)}% + {label && ( + + {label} + + )}
    ) } diff --git a/web/src/components/consensus/ConsensusComplete.tsx b/web/src/components/consensus/ConsensusComplete.tsx index b841c1c..88942b0 100644 --- a/web/src/components/consensus/ConsensusComplete.tsx +++ b/web/src/components/consensus/ConsensusComplete.tsx @@ -9,6 +9,7 @@ import type { RoundData } from '@/stores/consensus' interface ConsensusCompleteProps { decision: string confidence: number + rigor: number dissent: string | null cost: number | null } @@ -17,6 +18,7 @@ export function generateExportMarkdown( question: string | null, decision: string, confidence: number, + rigor: number, dissent: string | null, cost: number | null, rounds: RoundData[], @@ -29,7 +31,7 @@ export function generateExportMarkdown( lines.push('## Decision') lines.push(decision) lines.push('') - lines.push(`Confidence: ${Math.round(confidence * 100)}%`) + lines.push(`Confidence: ${Math.round(confidence * 100)}% Rigor: ${Math.round(rigor * 100)}%`) lines.push('') if (includeDissent && dissent) { @@ -86,7 +88,7 @@ function downloadFile(content: string | Blob, filename: string, mimeType: string URL.revokeObjectURL(url) } -export function ConsensusComplete({ decision, confidence, dissent, cost }: ConsensusCompleteProps) { +export function ConsensusComplete({ decision, confidence, rigor, dissent, cost }: ConsensusCompleteProps) { const [copied, setCopied] = useState(false) const [exportOpen, setExportOpen] = useState(false) const { question, rounds, threadId } = useConsensusStore() @@ -98,7 +100,7 @@ export function ConsensusComplete({ decision, confidence, dissent, cost }: Conse } const handleExportMarkdown = (content: 'full' | 'decision') => { - const md = generateExportMarkdown(question, decision, confidence, dissent, cost, rounds, content, true) + const md = generateExportMarkdown(question, decision, confidence, rigor, dissent, cost, rounds, content, true) downloadFile(md, `consensus-${content}.md`, 'text/markdown') setExportOpen(false) } @@ -121,7 +123,10 @@ export function ConsensusComplete({ decision, confidence, dissent, cost }: Conse CONSENSUS REACHED
    - +
    + + +
    {decision} diff --git a/web/src/components/consensus/ConsensusPanel.tsx b/web/src/components/consensus/ConsensusPanel.tsx index 5fb4386..2cc3e0a 100644 --- a/web/src/components/consensus/ConsensusPanel.tsx +++ b/web/src/components/consensus/ConsensusPanel.tsx @@ -8,7 +8,7 @@ import { CostTicker } from './CostTicker' export function ConsensusPanel() { const { status, error, currentPhase, currentRound, rounds, - decision, confidence, dissent, cost, + decision, confidence, rigor, dissent, cost, startConsensus, reset, } = useConsensusStore() @@ -73,6 +73,7 @@ export function ConsensusPanel() { {round.confidence !== null && (
    Confidence: {(round.confidence * 100).toFixed(0)}% + {round.rigor !== null && Rigor: {(round.rigor * 100).toFixed(0)}%} {round.dissent && Dissent noted}
    )} @@ -85,6 +86,7 @@ export function ConsensusPanel() { diff --git a/web/src/components/decision-space/DecisionCloud.tsx b/web/src/components/decision-space/DecisionCloud.tsx index 4208dde..7fdd137 100644 --- a/web/src/components/decision-space/DecisionCloud.tsx +++ b/web/src/components/decision-space/DecisionCloud.tsx @@ -158,6 +158,7 @@ export function DecisionCloud({ decisions, categories, genera, timelinePosition

    {hoveredDecision.question}

    {(hoveredDecision.confidence * 100).toFixed(0)}% + rigor {(hoveredDecision.rigor * 100).toFixed(0)}% {hoveredDecision.category && ( {hoveredDecision.category} )} diff --git a/web/src/components/layout/Sidebar.tsx b/web/src/components/layout/Sidebar.tsx index 45cfbba..143f751 100644 --- a/web/src/components/layout/Sidebar.tsx +++ b/web/src/components/layout/Sidebar.tsx @@ -4,6 +4,7 @@ const navItems = [ { path: '/', label: 'Consensus', icon: '\u2B21' }, { path: '/threads', label: 'Threads', icon: '\u2261' }, { path: '/space', label: 'Decision Space', icon: '\u25CE' }, + { path: '/calibration', label: 'Calibration', icon: '\u25C9' }, { path: '/preferences', label: 'Preferences', icon: '\u2699' }, ] diff --git a/web/src/components/shared/ExportMenu.tsx b/web/src/components/shared/ExportMenu.tsx index 4e900c2..b63469b 100644 --- a/web/src/components/shared/ExportMenu.tsx +++ b/web/src/components/shared/ExportMenu.tsx @@ -14,7 +14,7 @@ function generateMarkdown(thread: ThreadDetail, content: ContentMode, includeDis lines.push('') // Find last decision - let finalDecision: { content: string; confidence: number; dissent: string | null } | null = null + let finalDecision: { content: string; confidence: number; rigor: number; dissent: string | null } | null = null for (let i = thread.turns.length - 1; i >= 0; i--) { if (thread.turns[i]?.decision) { finalDecision = thread.turns[i]!.decision @@ -32,7 +32,7 @@ function generateMarkdown(thread: ThreadDetail, content: ContentMode, includeDis lines.push('## Decision') lines.push(finalDecision.content) lines.push('') - lines.push(`Confidence: ${Math.round(finalDecision.confidence * 100)}%`) + lines.push(`Confidence: ${Math.round(finalDecision.confidence * 100)}% Rigor: ${Math.round(finalDecision.rigor * 100)}%`) lines.push('') if (includeDissent && finalDecision.dissent) { @@ -148,7 +148,7 @@ export function ExportMenu({ thread }: ExportMenuProps) { {downloading ? 'Exporting...' : 'Export'} {open && ( -
    +
    )}
    - +
    + + +
    )} diff --git a/web/src/pages/CalibrationPage.tsx b/web/src/pages/CalibrationPage.tsx new file mode 100644 index 0000000..a169c05 --- /dev/null +++ b/web/src/pages/CalibrationPage.tsx @@ -0,0 +1,12 @@ +import { CalibrationDashboard } from '@/components/calibration' +import { PageTransition } from '@/components/shared' + +export function CalibrationPage() { + return ( + +
    + +
    +
    + ) +} diff --git a/web/src/pages/index.ts b/web/src/pages/index.ts index 353f229..9683931 100644 --- a/web/src/pages/index.ts +++ b/web/src/pages/index.ts @@ -4,3 +4,4 @@ export { ThreadDetailPage } from './ThreadDetailPage' export { DecisionSpacePage } from './DecisionSpacePage' export { PreferencesPage } from './PreferencesPage' export { SharePage } from './SharePage' +export { CalibrationPage } from './CalibrationPage' diff --git a/web/src/stores/calibration.ts b/web/src/stores/calibration.ts new file mode 100644 index 0000000..efce8f1 --- /dev/null +++ b/web/src/stores/calibration.ts @@ -0,0 +1,57 @@ +import { create } from 'zustand' +import { api } from '@/api/client' +import type { CalibrationBucket } from '@/api/types' + +interface CalibrationState { + buckets: CalibrationBucket[] + totalDecisions: number + totalWithOutcomes: number + overallAccuracy: number + ece: number + loading: boolean + error: string | null + + // Filters + category: string | null + + // Actions + fetchCalibration: () => Promise + setCategory: (category: string | null) => void +} + +export const useCalibrationStore = create((set, get) => ({ + buckets: [], + totalDecisions: 0, + totalWithOutcomes: 0, + overallAccuracy: 0, + ece: 0, + loading: false, + error: null, + + category: null, + + fetchCalibration: async () => { + set({ loading: true, error: null }) + try { + const { category } = get() + const params: { category?: string } = {} + if (category) params.category = category + + const data = await api.calibration(params) + set({ + buckets: data.buckets, + totalDecisions: data.total_decisions, + totalWithOutcomes: data.total_with_outcomes, + overallAccuracy: data.overall_accuracy, + ece: data.ece, + loading: false, + }) + } catch (e) { + set({ error: (e as Error).message, loading: false }) + } + }, + + setCategory: (category) => { + set({ category }) + }, +})) diff --git a/web/src/stores/consensus.ts b/web/src/stores/consensus.ts index cfc7b0e..3c2607a 100644 --- a/web/src/stores/consensus.ts +++ b/web/src/stores/consensus.ts @@ -23,6 +23,7 @@ export interface RoundData { reviser: string | null revision: string | null confidence: number | null + rigor: number | null dissent: string | null } @@ -42,6 +43,7 @@ interface ConsensusState { question: string | null decision: string | null confidence: number | null + rigor: number | null dissent: string | null cost: number | null threadId: string | null @@ -64,6 +66,7 @@ function createEmptyRound(round: number): RoundData { reviser: null, revision: null, confidence: null, + rigor: null, dissent: null, } } @@ -77,6 +80,7 @@ export const useConsensusStore = create((set, get) => ({ question: null, decision: null, confidence: null, + rigor: null, dissent: null, cost: null, threadId: null, @@ -91,6 +95,7 @@ export const useConsensusStore = create((set, get) => ({ question, decision: null, confidence: null, + rigor: null, dissent: null, cost: null, threadId: null, @@ -129,6 +134,7 @@ export const useConsensusStore = create((set, get) => ({ question: null, decision: null, confidence: null, + rigor: null, dissent: null, cost: null, threadId: null, @@ -216,6 +222,7 @@ function handleEvent( currentPhase: 'COMMIT' as ConsensusPhase, rounds: updateRound(state.rounds, idx, { confidence: event.confidence, + rigor: event.rigor, dissent: event.dissent, }), }) @@ -227,6 +234,7 @@ function handleEvent( status: 'complete', decision: event.decision, confidence: event.confidence, + rigor: event.rigor, dissent: event.dissent, cost: event.cost, threadId: event.thread_id ?? null, diff --git a/web/src/stores/index.ts b/web/src/stores/index.ts index cae5eed..f5babdb 100644 --- a/web/src/stores/index.ts +++ b/web/src/stores/index.ts @@ -4,3 +4,4 @@ export { useThreadsStore } from './threads' export { useDecisionSpaceStore } from './decision-space' export type { SpaceFilters } from './decision-space' export { usePreferencesStore } from './preferences' +export { useCalibrationStore } from './calibration' diff --git a/web/tsconfig.tsbuildinfo b/web/tsconfig.tsbuildinfo index cf66e67..4db173d 100644 --- a/web/tsconfig.tsbuildinfo +++ b/web/tsconfig.tsbuildinfo @@ -1 +1 @@ -{"root":["./src/app.tsx","./src/main.tsx","./src/test-setup.ts","./src/three-types.d.ts","./src/api/client.ts","./src/api/index.ts","./src/api/types.ts","./src/api/websocket.ts","./src/components/consensus/confidencemeter.tsx","./src/components/consensus/consensuscomplete.tsx","./src/components/consensus/consensuspanel.tsx","./src/components/consensus/costticker.tsx","./src/components/consensus/dissentbanner.tsx","./src/components/consensus/modelbadge.tsx","./src/components/consensus/phasecard.tsx","./src/components/consensus/questioninput.tsx","./src/components/consensus/streamingtext.tsx","./src/components/consensus/index.ts","./src/components/decision-space/decisioncloud.tsx","./src/components/decision-space/decisionspace.tsx","./src/components/decision-space/filterpanel.tsx","./src/components/decision-space/gridfloor.tsx","./src/components/decision-space/scatterfallback.tsx","./src/components/decision-space/scene3d.tsx","./src/components/decision-space/timelineslider.tsx","./src/components/decision-space/index.ts","./src/components/layout/shell.tsx","./src/components/layout/sidebar.tsx","./src/components/layout/topbar.tsx","./src/components/layout/index.ts","./src/components/preferences/preferencespanel.tsx","./src/components/preferences/index.ts","./src/components/shared/badge.tsx","./src/components/shared/errorboundary.tsx","./src/components/shared/exportmenu.tsx","./src/components/shared/glasspanel.tsx","./src/components/shared/glowbutton.tsx","./src/components/shared/gridoverlay.tsx","./src/components/shared/markdown.tsx","./src/components/shared/pagetransition.tsx","./src/components/shared/particlefield.tsx","./src/components/shared/skeleton.tsx","./src/components/shared/index.ts","./src/components/threads/threadbrowser.tsx","./src/components/threads/threadcard.tsx","./src/components/threads/threaddetail.tsx","./src/components/threads/threadfilters.tsx","./src/components/threads/threadsearch.tsx","./src/components/threads/turncard.tsx","./src/components/threads/index.ts","./src/hooks/index.ts","./src/hooks/usemediaquery.ts","./src/pages/consensuspage.tsx","./src/pages/decisionspacepage.tsx","./src/pages/preferencespage.tsx","./src/pages/sharepage.tsx","./src/pages/threaddetailpage.tsx","./src/pages/threadspage.tsx","./src/pages/index.ts","./src/stores/consensus.ts","./src/stores/decision-space.ts","./src/stores/index.ts","./src/stores/preferences.ts","./src/stores/threads.ts","./src/utils/colors.ts","./src/utils/index.ts"],"version":"5.9.3"} \ No newline at end of file +{"root":["./src/app.tsx","./src/main.tsx","./src/test-setup.ts","./src/three-types.d.ts","./src/api/client.ts","./src/api/index.ts","./src/api/types.ts","./src/api/websocket.ts","./src/components/calibration/calibrationdashboard.tsx","./src/components/calibration/index.ts","./src/components/consensus/confidencemeter.tsx","./src/components/consensus/consensuscomplete.tsx","./src/components/consensus/consensuspanel.tsx","./src/components/consensus/costticker.tsx","./src/components/consensus/dissentbanner.tsx","./src/components/consensus/modelbadge.tsx","./src/components/consensus/phasecard.tsx","./src/components/consensus/questioninput.tsx","./src/components/consensus/streamingtext.tsx","./src/components/consensus/index.ts","./src/components/decision-space/decisioncloud.tsx","./src/components/decision-space/decisionspace.tsx","./src/components/decision-space/filterpanel.tsx","./src/components/decision-space/gridfloor.tsx","./src/components/decision-space/scatterfallback.tsx","./src/components/decision-space/scene3d.tsx","./src/components/decision-space/timelineslider.tsx","./src/components/decision-space/index.ts","./src/components/layout/shell.tsx","./src/components/layout/sidebar.tsx","./src/components/layout/topbar.tsx","./src/components/layout/index.ts","./src/components/preferences/preferencespanel.tsx","./src/components/preferences/index.ts","./src/components/shared/badge.tsx","./src/components/shared/errorboundary.tsx","./src/components/shared/exportmenu.tsx","./src/components/shared/glasspanel.tsx","./src/components/shared/glowbutton.tsx","./src/components/shared/gridoverlay.tsx","./src/components/shared/markdown.tsx","./src/components/shared/pagetransition.tsx","./src/components/shared/particlefield.tsx","./src/components/shared/skeleton.tsx","./src/components/shared/index.ts","./src/components/threads/threadbrowser.tsx","./src/components/threads/threadcard.tsx","./src/components/threads/threaddetail.tsx","./src/components/threads/threadfilters.tsx","./src/components/threads/threadsearch.tsx","./src/components/threads/turncard.tsx","./src/components/threads/index.ts","./src/hooks/index.ts","./src/hooks/usemediaquery.ts","./src/pages/calibrationpage.tsx","./src/pages/consensuspage.tsx","./src/pages/decisionspacepage.tsx","./src/pages/preferencespage.tsx","./src/pages/sharepage.tsx","./src/pages/threaddetailpage.tsx","./src/pages/threadspage.tsx","./src/pages/index.ts","./src/stores/calibration.ts","./src/stores/consensus.ts","./src/stores/decision-space.ts","./src/stores/index.ts","./src/stores/preferences.ts","./src/stores/threads.ts","./src/utils/colors.ts","./src/utils/index.ts"],"version":"5.9.3"} \ No newline at end of file