From 433b44e02b9ea9037304853b503bca1e18032801 Mon Sep 17 00:00:00 2001 From: Josh Mabry Date: Mon, 27 Apr 2026 14:30:32 -0700 Subject: [PATCH 1/6] feat: ship default knowledge store + side-effect-verified eval harness MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The template now ships a working memory loop and end-to-end eval suite on day one so forks have a green baseline before they touch a single line of code. Closes #154. What lands: - knowledge/store.py — sqlite + FTS5 (LIKE fallback). One ``chunks`` table backs operator notes (memory_ingest), daily-log entries (daily_log), and conversation findings extracted by MemoryMiddleware (domain='finding'). Path resolves env > config > default with an automatic ~/.protoagent/ fallback when /sandbox isn't writable. - tools/lg_tools.py — five new memory tools (memory_ingest, memory_recall, memory_list, memory_stats, daily_log) bound to the store via a closure factory so tests get a fresh store per run. ``echo`` removed; ``get_all_tools(knowledge_store)`` actually uses its parameter now. - server.py — _build_knowledge_store() constructs the store and threads it through both initial init and the drawer reload path. Defaults flipped: knowledge_middleware + memory_middleware now ON by default (config/langgraph-config.yaml + graph/config.py). - evals/ — A2A client + runner + verify helpers + 15 starter cases (tasks.json) covering agent card discovery, bearer auth gating, abstention, every shipped tool, KB recall, a chained two-tool case, and KnowledgeMiddleware injection. Side-effect-verified: audit log + reply text + KB chunks all checked independently so hallucinated tool results get caught. - docs/guides/evals.md — full how-to. README/TEMPLATE/configuration/ starter-tools/first-agent updated to reflect the new defaults and the additional five memory tools. Co-Authored-By: Claude Opus 4.7 (1M context) --- README.md | 4 +- TEMPLATE.md | 21 +- config/langgraph-config.yaml | 20 +- docs/guides/customize-and-deploy.md | 2 +- docs/guides/evals.md | 151 +++++++++ docs/guides/fork-the-template.md | 2 +- docs/guides/index.md | 1 + docs/guides/subagents.md | 15 +- docs/reference/configuration.md | 25 +- docs/reference/starter-tools.md | 71 ++++- docs/tutorials/first-agent.md | 2 +- docs/tutorials/first-tool.md | 3 +- evals/README.md | 100 ++++++ evals/__init__.py | 0 evals/client.py | 262 ++++++++++++++++ evals/results/.gitkeep | 0 evals/runner.py | 307 +++++++++++++++++++ evals/tasks.json | 186 ++++++++++++ evals/verify.py | 176 +++++++++++ graph/config.py | 16 +- graph/subagents/config.py | 6 +- knowledge/__init__.py | 12 + knowledge/store.py | 456 ++++++++++++++++++++++++++++ server.py | 37 ++- tests/test_config_io.py | 8 +- tests/test_skill_curator.py | 2 +- tests/test_skill_emission.py | 20 +- tests/test_starter_tools.py | 10 - tools/lg_tools.py | 150 +++++++-- 29 files changed, 1976 insertions(+), 89 deletions(-) create mode 100644 docs/guides/evals.md create mode 100644 evals/README.md create mode 100644 evals/__init__.py create mode 100644 evals/client.py create mode 100644 evals/results/.gitkeep create mode 100644 evals/runner.py create mode 100644 evals/tasks.json create mode 100644 evals/verify.py create mode 100644 knowledge/__init__.py create mode 100644 knowledge/store.py diff --git a/README.md b/README.md index 1a1b34c..ef54b84 100644 --- a/README.md +++ b/README.md @@ -30,7 +30,9 @@ rename / release-pipeline wiring. | Agent runtime | `graph/agent.py`, `server.py` | LangGraph `create_agent()` wired to the A2A handler, with streaming token capture for cost-v1 | | LLM gateway | `graph/llm.py` | OpenAI-compatible client pointed at LiteLLM — swap models by editing the gateway config, not the fork | | Subagents | `graph/subagents/config.py` | DeerFlow-pattern delegation via a `task()` tool; one placeholder `worker` ships | -| Starter tools | `tools/lg_tools.py` | Free, keyless tools so a fresh fork can demo real behaviour: `echo`, `current_time`, `calculator` (safe AST eval), `web_search` (DuckDuckGo), `fetch_url` | +| Starter tools | `tools/lg_tools.py` | Keyless general tools (`current_time`, `calculator` safe AST eval, `web_search` via DuckDuckGo, `fetch_url`) plus memory tools (`memory_ingest`, `memory_recall`, `memory_list`, `memory_stats`, `daily_log`) bound to the bundled store | +| Knowledge store | `knowledge/store.py` | sqlite + FTS5 (LIKE fallback). One `chunks` table for operator notes, daily-log entries, and conversation findings. Default-on; turn off with `middleware.knowledge: false` | +| Eval harness | `evals/` | Side-effect-verified A2A test harness — audit log + reply text + KB state. `python -m evals.runner` against a running agent. See [Eval your fork](./docs/guides/evals.md) | | Tracing | `tracing.py` | Langfuse trace_session with distributed `a2a.trace` propagation and the OTel cross-context-detach filter | | Observability | `metrics.py`, `audit.py` | Prometheus metrics with per-agent prefix, JSONL audit log with trace IDs | | Output protocol | `graph/output_format.py` | `` / `` parsing so the model can think without it leaking to users | diff --git a/TEMPLATE.md b/TEMPLATE.md index 31b1eb2..08ff5b3 100644 --- a/TEMPLATE.md +++ b/TEMPLATE.md @@ -72,10 +72,10 @@ handler's output extraction depends on it. ## 4. Add your real tools `tools/lg_tools.py` ships with a small keyless starter set so a -fresh clone can demonstrate a real research loop: `echo`, -`current_time`, `calculator` (safe AST eval — no `eval()`), -`web_search` (DuckDuckGo via `ddgs`), and `fetch_url`. Keep the -ones you want, drop the rest, and add your own: +fresh clone can demonstrate a real research loop: `current_time`, +`calculator` (safe AST eval — no `eval()`), `web_search` (DuckDuckGo +via `ddgs`), and `fetch_url`. Keep the ones you want, drop the rest, +and add your own: ```python from langchain_core.tools import tool @@ -167,6 +167,19 @@ your fork. A useful pattern: - Extend `tests/test_a2a_integration.py` with assertions for your declared skills + extensions on the agent card +For end-to-end behaviour testing — "when the operator asks X, does +the right tool actually fire and the right row land in the KB?" — +the template ships an eval harness under `evals/`: + +```bash +python -m evals.runner # against a running agent +python -m evals.runner --category tool +``` + +See [Eval your fork](./docs/guides/evals.md) for what each case +asserts, how the three assertion channels work, and how to add +cases for your fork's new tools. + ## 9a. Understand the skill loop protoAgent's skill loop lets your agent learn from experience automatically. diff --git a/config/langgraph-config.yaml b/config/langgraph-config.yaml index c3f53e8..05bada2 100644 --- a/config/langgraph-config.yaml +++ b/config/langgraph-config.yaml @@ -22,14 +22,24 @@ model: subagents: worker: enabled: true - tools: [echo, current_time, calculator, web_search, fetch_url] + tools: + - current_time + - calculator + - web_search + - fetch_url + - memory_ingest + - memory_recall + - memory_list + - memory_stats + - daily_log max_turns: 20 middleware: - # The knowledge middleware requires a knowledge store. Leave false - # until you add one. Memory persistence is enabled by default and - # writes session summaries to /sandbox/memory/ without a store. - knowledge: false + # All three middlewares default ON. The knowledge middleware needs a + # store; the template constructs one automatically (see + # ``server.py::_build_knowledge_store``). Set ``knowledge: false`` if + # your fork is purely stateless. + knowledge: true audit: true memory: true diff --git a/docs/guides/customize-and-deploy.md b/docs/guides/customize-and-deploy.md index 17c24d9..81fdeec 100644 --- a/docs/guides/customize-and-deploy.md +++ b/docs/guides/customize-and-deploy.md @@ -66,7 +66,7 @@ Replace with the skills your agent actually advertises over A2A. The `name` and ## 5. (Optional) Add domain tools -`tools/lg_tools.py` ships with `echo`, `current_time`, `calculator`, `web_search`, `fetch_url`. Keep the ones you want, drop the rest, add your own. Update `get_all_tools()` at the bottom. Any tool returned from there becomes a checkbox in the wizard and drawer automatically. +`tools/lg_tools.py` ships with `current_time`, `calculator`, `web_search`, `fetch_url`. Keep the ones you want, drop the rest, add your own. Update `get_all_tools()` at the bottom. Any tool returned from there becomes a checkbox in the wizard and drawer automatically. ## 6. (Optional) Configure subagents diff --git a/docs/guides/evals.md b/docs/guides/evals.md new file mode 100644 index 0000000..38e1e33 --- /dev/null +++ b/docs/guides/evals.md @@ -0,0 +1,151 @@ +# Eval your fork + +The template ships an eval harness under `evals/` so a fresh fork has +a working test suite for its tools, memory, and A2A protocol surface +on day one. Cases assert across three independent channels — audit +log, reply text, and knowledge-store side effects — so a model that +hallucinates a tool result still gets caught. + +## When to read this + +- You forked the template and want a baseline pass-rate before you + ship. +- You added a new tool and want to lock in its intent — "when the + operator says X, fire tool Y". +- You changed a prompt or model and want to measure regression. + +## Run the suite + +```bash +# Agent running at $EVAL_BASE_URL (default http://localhost:7870) +# with the relevant auth env (A2A_AUTH_TOKEN and/or _API_KEY). + +python -m evals.runner +python -m evals.runner --category tool +python -m evals.runner --tasks current_time_intent,daily_log_intent +``` + +Reports land in `evals/results/run-.json`. The CLI prints a +pass/fail board; the JSON report carries reply previews and timing +for post-hoc inspection. + +## The three assertion channels + +``` +prompt → A2A → audit log (1) tools fired with expected outcome + → reply text (2) substrings present in reply + → KB chunks table (3) side effects landed correctly +``` + +A case passes only when every configured assertion holds. Most cases +should opt in to channels 1 and 3 — text patterns alone are brittle +to model paraphrasing and miss hallucinated tool results entirely. + +### Why side-effect verification beats text-only + +A model can produce "Logged: ..." in its reply without actually +calling `daily_log`. Substring matching passes, the DB stays empty, +and the bug ships. Reading `audit.jsonl` and the `chunks` table +afterward catches it. + +## The shape of a case + +```json +{ + "id": "unique-id", + "category": "tool", + "kind": "ask", + "name": "Asks for arithmetic → calculator", + "prompt": "How much is 17 times 23, plus 1?", + "expected_tools": ["calculator"], + "expected_patterns": ["392"], + "verify_kb": { + "find_chunk_containing": "EVAL-MARK-XYZ", + "domain": "context" + }, + "setup": [{"kb_ingest": {"content": "...", "domain": "...", "heading": "..."}}], + "teardown": [{"kb_delete_by_content": {"contains": "..."}}] +} +``` + +Three case `kind`s ship: + +- `agent_card` — fetch `/.well-known/agent-card.json` and assert on + the card's name, skill count, and declared extensions. +- `auth_check` — send a request with a deliberately bad bearer and + assert the server returns the expected status (401 by default). +- `ask` — the main shape. Sends `prompt`, then asserts on tool firing, + reply patterns, and KB state. + +## Prompt rule + +**The tool name never appears in the prompt.** Every prompt must be +plausibly typed by a real user. "Use `daily_log` to record..." tests +instruction-following, not tool selection. If the agent needs to +infer the tool from intent, that *is* the test. + +## Setup and teardown — start clean every time + +Each `ask` case can pre-seed state via `setup` blocks (BFCL's +`initial_config` pattern: direct DB writes the model never sees) and +clean up after itself with `teardown`. The fixture is invisible to +the agent — it discovers the seeded state via tools, exactly as a +real user would. + +`teardown` runs even when assertions fail, so case order doesn't +matter and a noisy failure doesn't poison the next run. + +Supported setup/teardown step kinds (extend `evals/verify.py` to add +more): + +| Step kind | Args | What it does | +|---|---|---| +| `kb_ingest` | `content`, `domain`, `heading?` | Insert a chunk | +| `kb_delete_by_content` | `contains` | Delete chunks where content LIKE `%contains%` | +| `kb_delete_by_heading` | `domain`, `heading` | Delete chunks matching (domain, heading) | + +## What forks should test by default + +The starter `tasks.json` covers: + +- Agent card discovery (name, skill count, `cost-v1` extension) +- Bearer auth gating +- Each shipped tool fires from a plausible operator prompt +- Memory ingest → recall round-trip +- KB-driven middleware injection (no tool call needed) +- A chained two-tool case (`daily_log` then `memory_recall`) + +When you add a tool, add at least one case for it. When you add a +skill to the agent card, extend the `card_discovery` case to assert +the new skill is advertised. + +## Running in CI + +The runner exits non-zero when any case fails, so it drops in cleanly: + +```yaml +- name: Boot agent + run: docker compose up -d agent + +- name: Wait for /health + run: ./scripts/wait-for-it.sh http://localhost:7870/.well-known/agent-card.json + +- name: Run evals + run: python -m evals.runner + env: + EVAL_BASE_URL: http://localhost:7870 + A2A_AUTH_TOKEN: ${{ secrets.AGENT_BEARER }} +``` + +For non-deterministic categories (any `tool` or `chained` case), aim +for an N-of-M majority threshold rather than 100% — the reference +implementation runs 3 attempts and gates at 2 passes for those +categories. Deterministic ones (`a2a-protocol`, `subsystem` with +seeded state) gate at 100%. + +## References + +- [`evals/README.md`](https://github.com/protoLabsAI/protoAgent/blob/main/evals/README.md) — quick reference for case authors +- Anthropic — [Demystifying evals for AI agents](https://www.anthropic.com/engineering/demystifying-evals-for-ai-agents) +- BFCL V3 — [Multi-Turn](https://gorilla.cs.berkeley.edu/blogs/13_bfcl_v3_multi_turn.html) +- [ToolSandbox](https://arxiv.org/html/2408.04682v1) — user simulator + milestones / minefields diff --git a/docs/guides/fork-the-template.md b/docs/guides/fork-the-template.md index eacbc92..d5472e4 100644 --- a/docs/guides/fork-the-template.md +++ b/docs/guides/fork-the-template.md @@ -43,7 +43,7 @@ Keep the `` / `` protocol block in `prompts.py` — the A2A ## 4. Replace the starter tools -`tools/lg_tools.py` ships with `echo`, `current_time`, `calculator`, `web_search`, `fetch_url`. Keep what you want, drop the rest, add your own. Update `get_all_tools()` at the bottom of the file. +`tools/lg_tools.py` ships with `current_time`, `calculator`, `web_search`, `fetch_url`. Keep what you want, drop the rest, add your own. Update `get_all_tools()` at the bottom of the file. See the [starter tools reference](/reference/starter-tools) for the shapes of the shipped ones. diff --git a/docs/guides/index.md b/docs/guides/index.md index 3e49012..65dc41e 100644 --- a/docs/guides/index.md +++ b/docs/guides/index.md @@ -9,4 +9,5 @@ Task-oriented procedures. Assumes you already have a running agent (see [Tutoria | [Add a custom skill](/guides/add-a-skill) | Your agent does new things and callers need to dispatch to them | | [Configure subagents](/guides/subagents) | You want specialized delegates beyond the placeholder `worker` | | [Wire Langfuse + Prometheus](/guides/observability) | You need traces and metrics in production | +| [Eval your fork](/guides/evals) | You want a baseline pass-rate for the tools / memory / A2A surface in your fork | | [Deploy via GHCR](/guides/deploy) | You're ready to ship and want auto-deploy wired up | diff --git a/docs/guides/subagents.md b/docs/guides/subagents.md index f903995..9fc1a9a 100644 --- a/docs/guides/subagents.md +++ b/docs/guides/subagents.md @@ -86,7 +86,16 @@ for name in ("worker", "researcher"): # ← add new names subagents: worker: enabled: true - tools: [echo, current_time, calculator, web_search, fetch_url] + tools: + - current_time + - calculator + - web_search + - fetch_url + - memory_ingest + - memory_recall + - memory_list + - memory_stats + - daily_log max_turns: 20 researcher: enabled: true @@ -117,8 +126,8 @@ If your agent is simple enough that subagents are pure overhead, flip `include_s ```python _graph = create_agent_graph( _graph_config, - knowledge_store=None, - include_subagents=False, # ← skip the task() tool and subagent machinery + knowledge_store=knowledge_store, # keep the bundled store wired up + include_subagents=False, # ← skip the task() tool and subagent machinery ) ``` diff --git a/docs/reference/configuration.md b/docs/reference/configuration.md index 3463dba..bd3f5be 100644 --- a/docs/reference/configuration.md +++ b/docs/reference/configuration.md @@ -17,13 +17,22 @@ model: subagents: worker: enabled: true - tools: [echo, current_time, calculator, web_search, fetch_url] + tools: + - current_time + - calculator + - web_search + - fetch_url + - memory_ingest + - memory_recall + - memory_list + - memory_stats + - daily_log max_turns: 20 middleware: - knowledge: false + knowledge: true audit: true - memory: false + memory: true knowledge: db_path: /sandbox/knowledge/agent.db @@ -59,9 +68,9 @@ Adding a new subagent name to the YAML requires matching entries in `graph/subag | Key | Default | What | |---|---|---| -| `knowledge` | `false` | Inject retrieved knowledge into state before LLM calls. Requires a knowledge store — leave off until you add one. | +| `knowledge` | `true` | Inject retrieved knowledge into state before LLM calls. Backed by the bundled `KnowledgeStore` (sqlite + FTS5). Set `false` for a stateless agent. | | `audit` | `true` | Append every tool call to `/sandbox/audit/audit.jsonl`. | -| `memory` | `false` | Memory middleware (experimental). Requires a knowledge store. | +| `memory` | `true` | Persist a session summary on terminal turn and asynchronously index conversation findings under `domain='finding'`. | ## `knowledge` @@ -69,8 +78,8 @@ Only read when `middleware.knowledge` is `true`. | Key | Default | What | |---|---|---| -| `db_path` | `/sandbox/knowledge/agent.db` | SQLite file path. | -| `embed_model` | `nomic-embed-text` | Embedding model. | +| `db_path` | `/sandbox/knowledge/agent.db` | SQLite file path. Falls back to `~/.protoagent/knowledge/agent.db` automatically when the configured path isn't writable (e.g. running locally without `/sandbox`). Override at runtime with `KNOWLEDGE_DB_PATH`. | +| `embed_model` | `nomic-embed-text` | Reserved for forks that bolt embeddings on top of the FTS5 baseline. The bundled store ignores it. | | `top_k` | `5` | Results per query fed into state. | -The template does not ship a knowledge store — the config keys are kept so a fork can flip the switch without rewiring every call site. +The bundled store is sqlite + FTS5 (with an automatic LIKE fallback when FTS5 isn't available). One `chunks` table; the `domain` column distinguishes operator-set notes (`memory_ingest`), daily-log entries (`daily_log`), and conversation findings extracted by `MemoryMiddleware` (`domain='finding'`). diff --git a/docs/reference/starter-tools.md b/docs/reference/starter-tools.md index e47f25e..c0918b5 100644 --- a/docs/reference/starter-tools.md +++ b/docs/reference/starter-tools.md @@ -1,15 +1,11 @@ # Starter tools -Five free, keyless tools ship in `tools/lg_tools.py`. They exist so a fresh template clone can demonstrate real behaviour immediately. Keep them, drop them, or swap them — `get_all_tools()` is the registry. +Nine tools ship in `tools/lg_tools.py`: -## `echo` +- Four keyless general-purpose tools — `current_time`, `calculator`, `web_search`, `fetch_url` — that work without any state. +- Five **memory tools** — `memory_ingest`, `memory_recall`, `memory_list`, `memory_stats`, `daily_log` — bound to the bundled `KnowledgeStore` (sqlite + FTS5, see [Configuration](/reference/configuration#knowledge)). -```python -@tool -async def echo(message: str) -> str -``` - -Returns `"echo: "`. The template-only sanity tool. Safe to delete once your real tools are wired. +`get_all_tools(knowledge_store)` is the registry. When `knowledge_store` is `None` (the store is disabled in config) the memory tools are omitted automatically. ## `current_time` @@ -106,6 +102,62 @@ Example Domain This domain is for use in documentation examples... ``` +## `memory_ingest` + +```python +@tool +async def memory_ingest(content: str, domain: str = "general", heading: str | None = None) -> str +``` + +Stores a chunk in the bundled `KnowledgeStore`. Use for things the operator wants you to remember across sessions — preferences, environment facts, decisions worth recalling later. + +`domain` is a logical bucket (`"preferences"`, `"context"`, `"general"`, …). `heading` is an optional short label that doubles as a stable de-dupe key. + +Returns `"Stored chunk 17 in 'preferences'."` on success, an error string when the store is unavailable. + +## `memory_recall` + +```python +@tool +async def memory_recall(query: str, k: int = 5) -> str +``` + +Top-k keyword search over the store via FTS5 (LIKE fallback). Returns one match per line: + +``` +[preferences] coffee: Operator's preferred coffee is a Gibraltar with oat milk. +[context] lab: Primary lab is Snickerdoodle in Spokane. +``` + +Returns `"No matches."` when nothing scores above the keyword threshold. + +## `memory_list` + +```python +@tool +async def memory_list(domain: str | None = None, limit: int = 10) -> str +``` + +Most-recent-first listing of stored chunks. Filter by domain when given. Useful for "what did I log today?" style queries. + +## `memory_stats` + +```python +@tool +async def memory_stats() -> str +``` + +Per-domain chunk counts plus a total. Useful for sanity-checking that ingest landed. + +## `daily_log` + +```python +@tool +async def daily_log(content: str) -> str +``` + +Convenience wrapper around `memory_ingest` that writes to `domain='daily-log'` with today's UTC date as the heading. Same-day entries cluster under the same heading for `memory_list(domain='daily-log')`. + ## Adding your own Follow the same pattern: @@ -132,7 +184,7 @@ Then append it to the list in `get_all_tools()`: ```python def get_all_tools(knowledge_store=None): - return [echo, current_time, calculator, web_search, fetch_url, my_tool] + return [current_time, calculator, web_search, fetch_url, my_tool] ``` See [Write your first tool](/tutorials/first-tool) for the full walkthrough. @@ -141,3 +193,4 @@ See [Write your first tool](/tutorials/first-tool) for the full walkthrough. - [Configure subagents](/guides/subagents) — tools are allowlisted per subagent - [Environment variables](/reference/environment-variables) — SSRF allowlist vars affect `fetch_url` +- [Eval your fork](/guides/evals) — the eval harness exercises every tool listed here end-to-end diff --git a/docs/tutorials/first-agent.md b/docs/tutorials/first-agent.md index 6082f66..ce12744 100644 --- a/docs/tutorials/first-agent.md +++ b/docs/tutorials/first-agent.md @@ -40,7 +40,7 @@ Walk through the four steps: 1. **Connect to your model.** Paste your API base URL (`https://api.openai.com/v1` for OpenAI direct, `http://localhost:4000/v1` for a local LiteLLM gateway) and API key. Click **Test connection & fetch models** — the dropdown fills with whatever the endpoint actually exposes. Pick one. 2. **Name your agent.** Short lowercase slug (e.g. `product-director`). Pick a persona preset — **Generic Assistant** is the safe default; **Research** / **Coding** / **Blank** are the alternatives — and click **Load preset into SOUL.md**. Edit the loaded text if you want to make it specific to your agent. -3. **Tools & middleware.** All five starter tools (`echo`, `current_time`, `calculator`, `web_search`, `fetch_url`) are enabled by default. Leave **Audit** and **Memory** middleware on. Leave **Knowledge** off — that needs an index the template doesn't ship with. +3. **Tools & middleware.** All nine starter tools (`current_time`, `calculator`, `web_search`, `fetch_url`, plus the memory tools `memory_ingest` / `memory_recall` / `memory_list` / `memory_stats` / `daily_log`) are enabled by default. Leave **Audit**, **Memory**, and **Knowledge** middleware on — the template ships a working sqlite + FTS5 store under `/sandbox/knowledge/agent.db` (falls back to `~/.protoagent/knowledge/agent.db` outside Docker). 4. **Optional — you, security, autostart.** Your name makes the agent address you directly. A2A auth token blank for local dev, set it before you expose the port. "Launch this agent automatically on login" installs a macOS LaunchAgent so the server is up after every reboot without remembering to `python server.py`. Hit **Launch agent**. The wizard closes, the chat UI appears, and the Configuration drawer on the right is now populated with your choices. diff --git a/docs/tutorials/first-tool.md b/docs/tutorials/first-tool.md index 9f10251..87502d0 100644 --- a/docs/tutorials/first-tool.md +++ b/docs/tutorials/first-tool.md @@ -37,7 +37,6 @@ Then register it in `get_all_tools()` at the bottom of the same file: ```python def get_all_tools(knowledge_store=None): return [ - echo, current_time, calculator, web_search, @@ -53,7 +52,7 @@ If you want the worker subagent to be able to call `git_sha`, add it to the allo ```python WORKER_CONFIG = SubagentConfig( # ... - tools=["echo", "current_time", "calculator", "web_search", "fetch_url", "git_sha"], + tools=["current_time", "calculator", "web_search", "fetch_url", "git_sha"], # ... ) ``` diff --git a/evals/README.md b/evals/README.md new file mode 100644 index 0000000..c8aff77 --- /dev/null +++ b/evals/README.md @@ -0,0 +1,100 @@ +# Evals + +Side-effect-verified eval harness. Each case sends a prompt over A2A +to a running agent and asserts on three independent channels: + +1. **Audit log** — every expected tool name fires with the expected + outcome (`AuditMiddleware` writes JSONL to `/sandbox/audit/audit.jsonl`). +2. **Reply text** — case-insensitive substring patterns appear in the + model's final reply. +3. **Knowledge store side effects** — the right rows actually land in + the `chunks` table after a memory-writing turn. + +A case passes only when every configured assertion holds. + +## Quickstart + +```bash +# Agent must be running at $EVAL_BASE_URL (default http://localhost:7870). +# Auth: set $A2A_AUTH_TOKEN if bearer is configured, $_API_KEY +# (or $EVAL_API_KEY) if X-API-Key auth is configured. Both are sent +# when both env vars exist. + +python -m evals.runner # all cases +python -m evals.runner --category tool # one category +python -m evals.runner --tasks current_time,daily_log +python -m evals.runner --base-url http://host:7870 +``` + +Reports land in `evals/results/run-.json` per run. + +## Categories + +| Category | What it covers | +|---|---| +| `a2a-protocol` | Agent card discovery, auth gating | +| `simple` | Direct LLM answers, no tool use | +| `abstention` | Don't reach for a tool when training data is enough | +| `tool` | Single-tool invocations across the starter set | +| `chained` | Multi-step reasoning that calls 2+ tools | +| `subsystem` | KnowledgeMiddleware retrieval, hot-memory injection | + +## File layout + +``` +evals/ + client.py A2A client (message/send + poll, message/stream, agent card, cancel) + runner.py CLI runner — print board, write JSON report + verify.py Audit-log + KB side-effect assertions, setup/teardown + tasks.json Cases — 15 covering the starter tools end-to-end + results/ Per-run reports +``` + +## Adding a case + +Append to `tasks.json`: + +```json +{ + "id": "unique-id", + "category": "tool", + "kind": "ask", + "name": "Human-readable description", + "prompt": "What you ask the agent (in real-user voice — never name the tool)", + "expected_tools": ["tool_name"], + "expected_patterns": ["substring-that-must-appear"], + "verify_kb": { + "find_chunk_containing": "EVAL-MARK-A1B2", + "domain": "context" + }, + "setup": [ + {"kb_ingest": {"content": "...", "domain": "context", "heading": "..."}} + ], + "teardown": [ + {"kb_delete_by_content": {"contains": "EVAL-MARK-A1B2"}} + ] +} +``` + +Use **unique markers** (`EVAL-MARK-XYZ`, `eval-chain-flag-q9`) in +prompts whenever you need a verifier to disambiguate from real +operator data. + +## Why side-effect verification + +When the model hallucinates a tool result (e.g. "Logged: ..." without +actually calling `daily_log`), text-only checks pass while the DB +stays empty. The audit-log + KB queries here catch it. + +## Prompt rule + +Every prompt must be plausibly typed by a real user. **The tool name +never appears.** If the agent has to infer the tool from intent, that +*is* the test — leaking the tool name into the prompt is testing +instruction-following, not tool selection. + +## References + +- Anthropic — [Demystifying evals for AI agents](https://www.anthropic.com/engineering/demystifying-evals-for-ai-agents) +- BFCL V3 — [Multi-Turn](https://gorilla.cs.berkeley.edu/blogs/13_bfcl_v3_multi_turn.html) +- [ToolSandbox](https://arxiv.org/html/2408.04682v1) diff --git a/evals/__init__.py b/evals/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/evals/client.py b/evals/client.py new file mode 100644 index 0000000..51eadfe --- /dev/null +++ b/evals/client.py @@ -0,0 +1,262 @@ +"""A2A client for the eval runner. + +Drives the running agent over the same JSON-RPC + SSE surface that +real A2A callers use: + +- ``agent_card()`` — GET ``/.well-known/agent-card.json`` +- ``ask()`` — ``message/send`` + ``tasks/get`` poll +- ``stream()`` — ``message/stream`` SSE +- ``cancel()`` — ``tasks/cancel`` + +Returns structured ``TaskResult`` objects the runner asserts against. + +Auth picks up both surfaces the template exposes (see ``server.py``): + +- ``Authorization: Bearer `` — wizard-set / ``A2A_AUTH_TOKEN`` env +- ``X-API-Key: `` — legacy, ``_API_KEY`` env + +Both headers are sent when the corresponding env var is set; the +running agent enforces whichever it is configured for. +""" + +from __future__ import annotations + +import asyncio +import json +import os +import time +import uuid +from dataclasses import dataclass, field +from typing import Any + +import httpx + + +@dataclass +class TaskResult: + task_id: str + state: str # completed / failed / canceled / timeout + text: str = "" # extracted user-facing reply + artifacts: list[dict] = field(default_factory=list) + usage: dict = field(default_factory=dict) + duration_ms: int = 0 + error: str | None = None + + +def _resolve_auth_env() -> tuple[str, str]: + """Return (bearer_token, api_key) from env. + + Bearer comes from ``A2A_AUTH_TOKEN`` (the env name the A2A handler + reads at boot). The API key is named after the agent — + ``_API_KEY`` — so a fork named ``quinn`` reads + ``QUINN_API_KEY``. ``EVAL_API_KEY`` is honored as an explicit + override so CI doesn't have to know the agent's slug. + """ + bearer = os.environ.get("A2A_AUTH_TOKEN", "") + + api_key = os.environ.get("EVAL_API_KEY", "") + if not api_key: + agent = os.environ.get("AGENT_NAME", "protoagent").upper() + api_key = os.environ.get(f"{agent}_API_KEY", "") + return bearer, api_key + + +class AgentClient: + """Thin A2A client tied to one agent instance.""" + + def __init__( + self, + base_url: str | None = None, + bearer: str | None = None, + api_key: str | None = None, + ): + self.base_url = ( + base_url + or os.environ.get("EVAL_BASE_URL") + or os.environ.get("AGENT_BASE_URL") + or "http://localhost:7870" + ).rstrip("/") + + env_bearer, env_api_key = _resolve_auth_env() + token = bearer if bearer is not None else env_bearer + x_api = api_key if api_key is not None else env_api_key + self.headers = {"Content-Type": "application/json"} + if token: + self.headers["Authorization"] = f"Bearer {token}" + if x_api: + self.headers["X-API-Key"] = x_api + + # ── Agent card ────────────────────────────────────────────────────────── + + async def agent_card(self) -> dict: + """Fetch the agent card. + + The template serves both ``/.well-known/agent-card.json`` (modern) + and ``/.well-known/agent.json`` (legacy). We try the modern path + first; fall back to the legacy path so this works against forks + that disabled one or the other. + """ + async with httpx.AsyncClient(timeout=10) as client: + for path in ("/.well-known/agent-card.json", "/.well-known/agent.json"): + r = await client.get(f"{self.base_url}{path}") + if r.status_code == 200: + return r.json() + r.raise_for_status() # surface the last error + return {} + + # ── message/send + poll ───────────────────────────────────────────────── + + async def ask(self, prompt: str, *, timeout_s: int = 90) -> TaskResult: + """Send + poll until terminal. Returns TaskResult with extracted text.""" + mid = str(uuid.uuid4()) + payload = { + "jsonrpc": "2.0", + "id": mid, + "method": "message/send", + "params": { + "message": { + "role": "user", + "parts": [{"kind": "text", "text": prompt}], + "messageId": mid, + } + }, + } + start = time.time() + async with httpx.AsyncClient(timeout=30) as client: + r = await client.post(f"{self.base_url}/a2a", headers=self.headers, json=payload) + r.raise_for_status() + resp = r.json() + if "error" in resp: + return TaskResult(task_id="", state="failed", error=str(resp["error"])) + task_id = resp.get("result", {}).get("id", "") + + deadline = start + timeout_s + while time.time() < deadline: + await asyncio.sleep(1.5) + poll = await client.post( + f"{self.base_url}/a2a", + headers=self.headers, + json={ + "jsonrpc": "2.0", + "id": "p", + "method": "tasks/get", + "params": {"id": task_id}, + }, + ) + poll.raise_for_status() + res = poll.json().get("result", {}) + state = (res.get("status") or {}).get("state", "") + if state in ("completed", "failed", "canceled"): + text, usage = _extract(res) + return TaskResult( + task_id=task_id, + state=state, + text=text, + artifacts=res.get("artifacts", []), + usage=usage, + duration_ms=int((time.time() - start) * 1000), + ) + return TaskResult( + task_id=task_id, state="timeout", + duration_ms=int((time.time() - start) * 1000), + ) + + # ── message/stream (SSE) ──────────────────────────────────────────────── + + async def stream(self, prompt: str, *, timeout_s: int = 90) -> tuple[list[dict], TaskResult | None]: + """Stream a turn over SSE. Returns (event_log, final TaskResult). + + Each event is a dict shaped ``{kind, result}``. Use this to assert + on the streaming protocol itself (status-update sequence, final + flag, artifact chunks). Most cases should use ``ask()`` instead. + """ + mid = str(uuid.uuid4()) + payload = { + "jsonrpc": "2.0", + "id": mid, + "method": "message/stream", + "params": { + "message": { + "role": "user", + "parts": [{"kind": "text", "text": prompt}], + "messageId": mid, + } + }, + } + events: list[dict] = [] + final: TaskResult | None = None + start = time.time() + async with httpx.AsyncClient(timeout=timeout_s) as client: + async with client.stream( + "POST", f"{self.base_url}/a2a", headers=self.headers, json=payload + ) as r: + if r.status_code >= 400: + body = await r.aread() + return events, TaskResult( + task_id="", state="failed", + error=f"HTTP {r.status_code}: {body.decode()[:300]}", + ) + async for line in r.aiter_lines(): + if not line or line.startswith(":"): + continue + if line.startswith("data:"): + raw = line[5:].strip() + if not raw: + continue + try: + data = json.loads(raw) + except json.JSONDecodeError: + events.append({"kind": "raw", "raw": raw}) + continue + result = (data.get("result") or {}) + kind = result.get("kind", "?") + events.append({"kind": kind, "result": result}) + if kind in ("status-update", "task") and result.get("final"): + text, usage = _extract(result) + final = TaskResult( + task_id=result.get("taskId") or result.get("id", ""), + state=(result.get("status") or {}).get("state", "unknown"), + text=text, + usage=usage, + duration_ms=int((time.time() - start) * 1000), + ) + break + return events, final + + # ── tasks/cancel ──────────────────────────────────────────────────────── + + async def cancel(self, task_id: str) -> dict: + async with httpx.AsyncClient(timeout=10) as client: + r = await client.post( + f"{self.base_url}/a2a", + headers=self.headers, + json={ + "jsonrpc": "2.0", + "id": "c", + "method": "tasks/cancel", + "params": {"id": task_id}, + }, + ) + return r.json() + + +def _extract(result: dict) -> tuple[str, dict]: + """Pull text + cost data out of an A2A result envelope.""" + text_parts: list[str] = [] + usage: dict = {} + artifacts = result.get("artifacts") or [] + for art in artifacts: + for p in art.get("parts", []): + if p.get("kind") == "text" and p.get("text"): + text_parts.append(p["text"]) + elif p.get("kind") == "data" and isinstance(p.get("data"), dict): + if "usage" in p["data"]: + usage = dict(p["data"]["usage"]) + if "durationMs" in p["data"]: + usage["durationMs"] = p["data"]["durationMs"] + status = result.get("status") or {} + msg = status.get("message") or {} + for p in msg.get("parts") or []: + if p.get("kind") == "text" and p.get("text"): + text_parts.append(p["text"]) + return "\n".join(text_parts).strip(), usage diff --git a/evals/results/.gitkeep b/evals/results/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/evals/runner.py b/evals/runner.py new file mode 100644 index 0000000..ad5154b --- /dev/null +++ b/evals/runner.py @@ -0,0 +1,307 @@ +"""Eval runner — executes ``tasks.json``, prints a pass/fail board, +writes a JSON report to ``evals/results/run-.json``. + +Usage: + +.. code:: bash + + # agent must be running at $EVAL_BASE_URL (default http://localhost:7870) + # auth: $A2A_AUTH_TOKEN and/or $_API_KEY (or $EVAL_API_KEY) + + python -m evals.runner # all cases + python -m evals.runner --category tool # one category + python -m evals.runner --tasks current_time,daily_log + python -m evals.runner --base-url http://host:7870 + +Cases are described in ``tasks.json``. Each case picks one of three +``kind`` runners: + +- ``agent_card`` — fetch ``/.well-known/agent-card.json`` and assert + on the returned card shape. +- ``auth_check`` — send a request with a known-bad bearer token and + assert the expected HTTP status. +- ``ask`` — send a prompt over A2A, optionally pre-seed the KB, then + assert against three independent channels: audit-log tool firing, + reply-text patterns, and KB side effects. + +A case passes only when all assertions hold. The ``detail`` column in +the pass/fail board names the missing assertion when one fails. +""" + +from __future__ import annotations + +import argparse +import asyncio +import json +import sys +import time +from dataclasses import asdict, dataclass, field +from datetime import datetime, timezone +from pathlib import Path + +# Allow ``python -m evals.runner`` and ``python evals/runner.py``. +_PROJECT_ROOT = Path(__file__).resolve().parent.parent +if str(_PROJECT_ROOT) not in sys.path: + sys.path.insert(0, str(_PROJECT_ROOT)) + +from evals.client import AgentClient, TaskResult +from evals import verify + + +@dataclass +class CaseResult: + id: str + category: str + name: str + passed: bool + detail: str + duration_ms: int = 0 + tokens: int = 0 + raw: dict = field(default_factory=dict) + + +# ── case runners ──────────────────────────────────────────────────────────── + + +async def _run_agent_card(client: AgentClient, case: dict) -> CaseResult: + expect = case.get("expect", {}) + try: + card = await client.agent_card() + except Exception as e: + return CaseResult(case["id"], case["category"], case["name"], False, f"fetch failed: {e}") + + problems: list[str] = [] + if "name" in expect and card.get("name") != expect["name"]: + problems.append(f"name={card.get('name')!r} expected {expect['name']!r}") + if "skills_min" in expect: + skills = card.get("skills") or [] + if len(skills) < expect["skills_min"]: + problems.append(f"only {len(skills)} skills, expected >= {expect['skills_min']}") + if "extensions_contain" in expect: + ext_uris = [ + e.get("uri", "") + for e in (card.get("capabilities") or {}).get("extensions") or [] + ] + for needle in expect["extensions_contain"]: + if not any(needle in u for u in ext_uris): + problems.append(f"missing extension matching {needle!r}; saw {ext_uris}") + if problems: + return CaseResult(case["id"], case["category"], case["name"], False, "; ".join(problems)) + return CaseResult(case["id"], case["category"], case["name"], True, "card OK") + + +async def _run_auth_check(client: AgentClient, case: dict) -> CaseResult: + """Verify the A2A endpoint rejects a bad bearer with the expected status.""" + import httpx + + expected_status = case.get("expect", {}).get("status", 401) + bad = case.get("bad_token", "") + headers = { + "Content-Type": "application/json", + "Authorization": f"Bearer {bad}", + # No X-API-Key — testing bearer alone. + } + payload = { + "jsonrpc": "2.0", + "id": "auth-check", + "method": "message/send", + "params": { + "message": { + "role": "user", + "parts": [{"kind": "text", "text": "ping"}], + "messageId": "auth-check", + } + }, + } + try: + async with httpx.AsyncClient(timeout=10) as c: + r = await c.post(f"{client.base_url}/a2a", headers=headers, json=payload) + except Exception as e: + return CaseResult(case["id"], case["category"], case["name"], False, f"request failed: {e}") + if r.status_code != expected_status: + return CaseResult( + case["id"], case["category"], case["name"], False, + f"got {r.status_code}, expected {expected_status}", + ) + return CaseResult( + case["id"], case["category"], case["name"], True, f"status={r.status_code}", + ) + + +async def _run_ask(client: AgentClient, case: dict) -> CaseResult: + # Pre-seed state via direct DB writes (model never sees this). + if "setup" in case: + err = verify.apply_setup(case["setup"]) + if err: + return CaseResult( + case["id"], case["category"], case["name"], False, + f"setup failed: {err}", + ) + + since = verify.audit_now() + result: TaskResult = await client.ask( + case["prompt"], timeout_s=case.get("timeout_s", 90), + ) + + if result.state != "completed": + if "teardown" in case: + verify.apply_teardown(case["teardown"]) + return CaseResult( + case["id"], case["category"], case["name"], False, + f"task state={result.state}; error={result.error or '(none)'}", + duration_ms=result.duration_ms, + raw={"text": result.text[:200]}, + ) + + problems: list[str] = [] + + # Tool firing assertions. + expected_tools = case.get("expected_tools") or [] + if expected_tools: + await asyncio.sleep(0.3) # let the audit log catch up + entries = verify.audit_entries_since(since) + require_success = case.get("tool_outcome", "success") == "success" + passed, detail = verify.assert_tools_fired( + entries, expected_tools, require_success=require_success, + ) + if not passed: + problems.append(detail) + + # Text pattern assertions (case-insensitive substrings). + text_lower = result.text.lower() + for pattern in case.get("expected_patterns") or []: + if pattern.lower() not in text_lower: + problems.append(f"missing pattern {pattern!r}") + + # KB side-effect assertions. + vk = case.get("verify_kb") or {} + if "find_chunk_containing" in vk: + chunk = verify.find_chunk_containing( + vk["find_chunk_containing"], domain=vk.get("domain"), + ) + if not chunk: + problems.append(f"no chunk containing {vk['find_chunk_containing']!r}") + + if "teardown" in case: + verify.apply_teardown(case["teardown"]) + + detail = ( + "; ".join(problems) if problems + else f"OK ({result.duration_ms}ms, {result.usage.get('total_tokens', '?')}t)" + ) + return CaseResult( + case["id"], case["category"], case["name"], + passed=not problems, + detail=detail, + duration_ms=result.duration_ms, + tokens=result.usage.get("total_tokens", 0) or 0, + raw={"reply": result.text[:300]}, + ) + + +# ── dispatch ──────────────────────────────────────────────────────────────── + + +_RUNNERS = { + "agent_card": _run_agent_card, + "auth_check": _run_auth_check, + "ask": _run_ask, +} + + +async def run_one(client: AgentClient, case: dict) -> CaseResult: + runner = _RUNNERS.get(case.get("kind", "ask")) + if runner is None: + return CaseResult( + case["id"], case.get("category", "?"), case.get("name", "?"), + False, f"unknown kind: {case.get('kind')}", + ) + try: + return await runner(client, case) + except Exception as e: + return CaseResult( + case["id"], case.get("category", "?"), case.get("name", "?"), + False, f"exception: {e!r}", + ) + + +# ── main ──────────────────────────────────────────────────────────────────── + + +def _print_board(results: list[CaseResult]) -> None: + width_id = max(len(r.id) for r in results) + width_cat = max(len(r.category) for r in results) + print() + print(f"{'ID'.ljust(width_id)} {'CAT'.ljust(width_cat)} RESULT TIME TOKENS DETAIL") + print("-" * 90) + pass_count = 0 + for r in results: + mark = "PASS" if r.passed else "FAIL" + if r.passed: + pass_count += 1 + time_s = f"{r.duration_ms}ms".rjust(6) + tokens = str(r.tokens).rjust(6) if r.tokens else " - " + print( + f"{r.id.ljust(width_id)} {r.category.ljust(width_cat)} " + f"{mark} {time_s} {tokens} {r.detail[:80]}" + ) + print("-" * 90) + print(f"\n{pass_count}/{len(results)} passed") + + +def _save_report(results: list[CaseResult], path: Path) -> None: + path.parent.mkdir(parents=True, exist_ok=True) + payload = { + "ts": datetime.now(timezone.utc).isoformat(), + "total": len(results), + "passed": sum(1 for r in results if r.passed), + "results": [asdict(r) for r in results], + } + path.write_text(json.dumps(payload, indent=2)) + print(f"\nReport: {path}") + + +async def main(): + p = argparse.ArgumentParser() + p.add_argument("--base-url", default=None) + p.add_argument("--tasks", default=None, help="comma-separated case IDs") + p.add_argument("--category", default=None) + p.add_argument("--out", default=None) + args = p.parse_args() + + tasks_path = Path(__file__).parent / "tasks.json" + cases = json.loads(tasks_path.read_text()) + + if args.tasks: + wanted = set(args.tasks.split(",")) + cases = [c for c in cases if c["id"] in wanted] + if args.category: + cases = [c for c in cases if c.get("category") == args.category] + + if not cases: + print("no cases match filters", file=sys.stderr) + return 2 + + client = AgentClient(base_url=args.base_url) + + print(f"Running {len(cases)} case(s) against {client.base_url}") + results: list[CaseResult] = [] + for case in cases: + sys.stdout.write(f" {case['id']}... ") + sys.stdout.flush() + result = await run_one(client, case) + sys.stdout.write(f"{'PASS' if result.passed else 'FAIL'} {result.detail[:60]}\n") + results.append(result) + + _print_board(results) + + out_path = Path(args.out) if args.out else ( + Path(__file__).parent / "results" / f"run-{int(time.time())}.json" + ) + _save_report(results, out_path) + + return 0 if all(r.passed for r in results) else 1 + + +if __name__ == "__main__": + raise SystemExit(asyncio.run(main())) diff --git a/evals/tasks.json b/evals/tasks.json new file mode 100644 index 0000000..14cdd16 --- /dev/null +++ b/evals/tasks.json @@ -0,0 +1,186 @@ +[ + { + "id": "card_discovery", + "category": "a2a-protocol", + "kind": "agent_card", + "name": "Agent card discovery", + "expect": { + "skills_min": 1, + "extensions_contain": ["cost-v1"] + } + }, + { + "id": "auth_negative", + "category": "a2a-protocol", + "kind": "auth_check", + "name": "Reject bad bearer when bearer auth is configured", + "bad_token": "definitely-not-the-real-token", + "expect": {"status": 401} + }, + + { + "id": "abstain_no_tool", + "category": "abstention", + "kind": "ask", + "name": "Don't reach for a tool when training data is fine", + "prompt": "What's the capital of France? One word.", + "expected_tools": [], + "expected_patterns": ["paris"] + }, + { + "id": "greeting", + "category": "simple", + "kind": "ask", + "name": "Direct greeting, no tool", + "prompt": "Hi.", + "expected_tools": [], + "expected_patterns": [] + }, + + { + "id": "current_time_intent", + "category": "tool", + "kind": "ask", + "name": "Asks about live time → current_time", + "prompt": "What time is it in UTC right now?", + "expected_tools": ["current_time"], + "expected_patterns": ["UTC"] + }, + { + "id": "calculator_intent", + "category": "tool", + "kind": "ask", + "name": "Asks for arithmetic → calculator", + "prompt": "How much is 17 times 23, plus 1?", + "expected_tools": ["calculator"], + "expected_patterns": ["392"] + }, + { + "id": "web_search_intent", + "category": "tool", + "kind": "ask", + "name": "Asks about recent news → web_search", + "prompt": "Anything notable in the news about Anthropic this week?", + "expected_tools": ["web_search"], + "expected_patterns": [] + }, + { + "id": "fetch_url_intent", + "category": "tool", + "kind": "ask", + "name": "Asks about a URL's content → fetch_url", + "prompt": "What's on https://example.com? Just the page title is fine.", + "expected_tools": ["fetch_url"], + "expected_patterns": ["example"] + }, + + { + "id": "memory_ingest_intent", + "category": "tool", + "kind": "ask", + "name": "Stores a stable preference → memory_ingest writes a chunk", + "prompt": "Remember that I prefer protoLabs Studio standups at 9am Eastern.", + "expected_tools": ["memory_ingest"], + "expected_patterns": [], + "verify_kb": { + "find_chunk_containing": "9am" + }, + "teardown": [ + {"kb_delete_by_content": {"contains": "9am"}} + ] + }, + { + "id": "daily_log_intent", + "category": "tool", + "kind": "ask", + "name": "Asks to log an event → daily_log writes today's chunk", + "prompt": "Log this for today: my standup just ended, team is unblocked on the auth migration.", + "expected_tools": ["daily_log"], + "expected_patterns": [], + "verify_kb": { + "find_chunk_containing": "auth migration", + "domain": "daily-log" + }, + "teardown": [ + {"kb_delete_by_content": {"contains": "auth migration"}} + ] + }, + { + "id": "memory_recall_intent", + "category": "tool", + "kind": "ask", + "name": "Asks about a stored fact → recall surfaces it", + "setup": [ + {"kb_ingest": { + "content": "Operator's primary lab is Snickerdoodle, located in Spokane.", + "domain": "context", + "heading": "lab" + }} + ], + "prompt": "Where's my primary lab and what's it called?", + "expected_tools": ["memory_recall"], + "expected_patterns": ["snickerdoodle", "spokane"], + "teardown": [ + {"kb_delete_by_heading": {"domain": "context", "heading": "lab"}} + ] + }, + { + "id": "memory_list_intent", + "category": "tool", + "kind": "ask", + "name": "Asks for recent log entries → memory_list", + "setup": [ + {"kb_ingest": {"content": "called the dentist", "domain": "daily-log", "heading": "today"}}, + {"kb_ingest": {"content": "merged the auth PR", "domain": "daily-log", "heading": "today"}} + ], + "prompt": "What did I do today? Summarize from the log.", + "expected_tools": ["memory_list"], + "expected_patterns": ["dentist"], + "teardown": [ + {"kb_delete_by_content": {"contains": "called the dentist"}}, + {"kb_delete_by_content": {"contains": "merged the auth PR"}} + ] + }, + { + "id": "memory_stats_intent", + "category": "tool", + "kind": "ask", + "name": "Asks how much is in memory → memory_stats", + "prompt": "How much have I got stored across each memory domain?", + "expected_tools": ["memory_stats"], + "expected_patterns": [] + }, + + { + "id": "log_then_recall_chain", + "category": "chained", + "kind": "ask", + "name": "Log an event, then recall it later in the same turn", + "prompt": "Log this for today: 'eval-chain-flag-q9: chained log+recall test'. After logging, search memory for that flag and quote it back.", + "expected_tools": ["daily_log", "memory_recall"], + "expected_patterns": ["eval-chain-flag-q9"], + "teardown": [ + {"kb_delete_by_content": {"contains": "eval-chain-flag-q9"}} + ] + }, + + { + "id": "knowledge_middleware_recall", + "category": "subsystem", + "kind": "ask", + "name": "KnowledgeMiddleware surfaces a stored fact without an explicit search", + "setup": [ + {"kb_ingest": { + "content": "Operator's preferred coffee is a Gibraltar with oat milk from Atticus.", + "domain": "preferences", + "heading": "coffee" + }} + ], + "prompt": "What's my usual coffee order?", + "expected_tools": [], + "expected_patterns": ["gibraltar", "oat"], + "teardown": [ + {"kb_delete_by_heading": {"domain": "preferences", "heading": "coffee"}} + ] + } +] diff --git a/evals/verify.py b/evals/verify.py new file mode 100644 index 0000000..5b1f8cc --- /dev/null +++ b/evals/verify.py @@ -0,0 +1,176 @@ +"""Side-effect verifiers for eval cases. + +Two channels: + +- **Audit log** — JSONL written by ``AuditMiddleware`` at + ``/sandbox/audit/audit.jsonl`` (override with ``AUDIT_PATH`` env). + ``audit_entries_since`` returns entries newer than a marker, and + ``assert_tools_fired`` confirms a tool name appears with the + expected outcome. +- **Knowledge store** — sqlite DB at ``KNOWLEDGE_DB_PATH`` (or the + template default). ``find_chunk_containing`` confirms a memory + write actually landed; ``setup_chunk`` / ``teardown`` mutate the + store directly so cases start from a known state. + +The store is opened read/write so setup steps can pre-seed (BFCL's +``initial_config`` pattern). The model never sees these direct writes +— it discovers them via ``memory_recall`` / ``memory_list`` tools as +real users would. +""" + +from __future__ import annotations + +import json +import logging +import os +from datetime import datetime, timezone +from pathlib import Path +from typing import Any + +log = logging.getLogger(__name__) + +# ── path resolution ───────────────────────────────────────────────────────── + + +def _audit_path() -> Path: + """Audit JSONL location. Falls back to the template's docker default.""" + raw = os.environ.get("AUDIT_PATH") or "/sandbox/audit/audit.jsonl" + p = Path(raw).expanduser() + if p.is_file(): + return p + # Local-dev fallback: same shape, but under the home dir. + fallback = Path.home() / ".protoagent" / "audit" / "audit.jsonl" + return fallback + + +def _kb_store(): + """Construct a ``KnowledgeStore`` against the configured path. + + Imported lazily so ``evals/verify.py`` can be loaded in a context + where ``knowledge/`` isn't on sys.path yet (the runner adjusts + sys.path before calling in). + """ + from knowledge import KnowledgeStore + return KnowledgeStore() # honors KNOWLEDGE_DB_PATH env + + +# ── audit log ─────────────────────────────────────────────────────────────── + + +def audit_now() -> str: + """ISO-8601 marker suitable as a 'since' input to ``audit_entries_since``.""" + return datetime.now(timezone.utc).isoformat() + + +def audit_entries_since(ts_iso: str) -> list[dict]: + """Return audit-log entries with ``ts`` strictly greater than ``ts_iso``.""" + p = _audit_path() + if not p.is_file(): + return [] + out: list[dict] = [] + with p.open() as fh: + for line in fh: + try: + entry = json.loads(line) + except json.JSONDecodeError: + continue + if entry.get("ts", "") > ts_iso: + out.append(entry) + return out + + +def assert_tools_fired( + audit_entries: list[dict], + expected: list[str], + *, + require_success: bool = True, +) -> tuple[bool, str]: + """Confirm each expected tool name appears in audit entries. + + Order doesn't matter — a tool that fires twice still satisfies one + expected entry, and extra entries (subset matching, BFCL-style) are + allowed. + + ``require_success=True`` (default) only counts ``success=True`` + entries — use this for happy-path cases. Pass ``require_success=False`` + when the case represents an error path that the agent should still + *attempt* (e.g. fetching a private URL the agent has no creds for). + """ + fired: dict[str, dict[str, int]] = {} + for e in audit_entries: + bucket = fired.setdefault(e.get("tool", "?"), {"ok": 0, "err": 0}) + bucket["ok" if e.get("success") else "err"] += 1 + + missing: list[str] = [] + for t in expected: + if t not in fired: + missing.append(t) + continue + if require_success and fired[t]["ok"] == 0: + missing.append(f"{t} (only errors)") + + if missing: + return False, f"missing tools: {missing}; saw: {dict(fired)}" + return True, f"saw: {dict(fired)}" + + +# ── knowledge store ───────────────────────────────────────────────────────── + + +def find_chunk_containing(text: str, *, domain: str | None = None) -> dict | None: + store = _kb_store() + chunk = store.find_chunk_containing(text, domain=domain) + return chunk.as_dict() if chunk else None + + +def chunks_in_domain(domain: str, *, limit: int = 50) -> list[dict]: + store = _kb_store() + return [c.as_dict() for c in store.list_chunks(domain=domain, limit=limit)] + + +# ── setup / teardown helpers ───────────────────────────────────────────────── + + +def apply_setup(steps: list[dict]) -> str | None: + """Apply a list of setup steps. Each step is a dict with one key. + + Supported step kinds: + + - ``kb_ingest``: ``{content, domain, heading?}`` + + Returns ``None`` on success, an error string on first failure. + """ + store = _kb_store() + for step in steps: + for kind, args in step.items(): + if kind == "kb_ingest": + if store.add_chunk( + args["content"], + domain=args.get("domain", "general"), + heading=args.get("heading"), + ) is None: + return f"kb_ingest failed for {args!r}" + else: + return f"unknown setup step: {kind}" + return None + + +def apply_teardown(steps: list[dict]) -> None: + """Best-effort teardown. Never raises so a setup failure or assertion + failure doesn't poison subsequent cases. + + Supported step kinds: + + - ``kb_delete_by_content``: ``{contains}`` + - ``kb_delete_by_heading``: ``{domain, heading}`` + """ + store = _kb_store() + for step in steps: + for kind, args in step.items(): + try: + if kind == "kb_delete_by_content": + store.delete_by_content(args["contains"]) + elif kind == "kb_delete_by_heading": + store.delete_by_heading(args["domain"], args["heading"]) + except Exception as exc: # pragma: no cover + log.debug("[verify] teardown step %s failed: %s", kind, exc) diff --git a/graph/config.py b/graph/config.py index 00ae1a8..a3df02b 100644 --- a/graph/config.py +++ b/graph/config.py @@ -37,16 +37,24 @@ class LangGraphConfig: # Subagents — template ships with one example (see graph/subagents/config.py). # Add fields here as you add entries to SUBAGENT_REGISTRY. worker: SubagentDef = field(default_factory=lambda: SubagentDef( - tools=["echo", "current_time", "calculator", "web_search", "fetch_url"], + tools=[ + "current_time", "calculator", "web_search", "fetch_url", + "memory_ingest", "memory_recall", "memory_list", "memory_stats", + "daily_log", + ], max_turns=20, )) # Middleware toggles - knowledge_middleware: bool = False # template ships no knowledge store + knowledge_middleware: bool = True audit_middleware: bool = True - memory_middleware: bool = False + memory_middleware: bool = True - # Knowledge store (opt-in — leave disabled until the fork ships one) + # Knowledge store — sqlite + FTS5, see ``knowledge/store.py``. + # The default path lives under ``/sandbox/`` to play well with the + # bundled Docker volume; the store falls back to + # ``~/.protoagent/knowledge/agent.db`` automatically when /sandbox + # is read-only or absent (e.g. local ``python server.py``). knowledge_db_path: str = "/sandbox/knowledge/agent.db" embed_model: str = "qwen3-embedding" knowledge_top_k: int = 5 diff --git a/graph/subagents/config.py b/graph/subagents/config.py index 554a321..560edc7 100644 --- a/graph/subagents/config.py +++ b/graph/subagents/config.py @@ -63,7 +63,11 @@ class SubagentConfig: Replace this prompt with domain-specific guidance once your agent has real specialized roles.""", - tools=["echo", "current_time", "calculator", "web_search", "fetch_url"], + tools=[ + "current_time", "calculator", "web_search", "fetch_url", + "memory_ingest", "memory_recall", "memory_list", "memory_stats", + "daily_log", + ], max_turns=20, ) diff --git a/knowledge/__init__.py b/knowledge/__init__.py new file mode 100644 index 0000000..1de93de --- /dev/null +++ b/knowledge/__init__.py @@ -0,0 +1,12 @@ +"""Knowledge store — sqlite-backed chunk storage for memory tools and middleware. + +The template ships this enabled by default so a fresh fork has a working +memory loop on day one (memory_ingest, memory_recall, daily_log) and the +eval harness can assert side effects against real DB state. + +See ``knowledge.store.KnowledgeStore`` for the public API. +""" + +from knowledge.store import KnowledgeStore, Chunk + +__all__ = ["KnowledgeStore", "Chunk"] diff --git a/knowledge/store.py b/knowledge/store.py new file mode 100644 index 0000000..a6aee7a --- /dev/null +++ b/knowledge/store.py @@ -0,0 +1,456 @@ +"""KnowledgeStore — sqlite-backed chunk storage with FTS5 search. + +The template's default knowledge surface. One ``chunks`` table holds +every piece of stored content (operator notes via ``memory_ingest``, +daily-log entries, conversation findings extracted by +``MemoryMiddleware``); the ``domain`` column distinguishes them. + +Search uses sqlite FTS5 when available (true on virtually all modern +sqlite builds). When FTS5 is missing — sandboxed sqlite, custom builds +— the store transparently falls back to ``LIKE`` keyword matching so +the API contract still holds. + +The store is path-aware and degradation-aware: + +- Honors ``KNOWLEDGE_DB_PATH`` env var → constructor argument → + config default ``/sandbox/knowledge/agent.db``. +- If the configured path is unwritable (running locally outside the + container, no /sandbox), falls back to ``~/.protoagent/knowledge/agent.db`` + so a fresh ``python server.py`` works without sudo. +- All write operations swallow ``sqlite3.OperationalError`` and log; + the store never crashes the agent loop on a corrupt or read-only DB. + +Forks that want embeddings on top of FTS5 can subclass and override +``search()`` — the middleware reads through that one method. +""" + +from __future__ import annotations + +import logging +import os +import re +import sqlite3 +from dataclasses import dataclass +from datetime import datetime, timezone +from pathlib import Path +from typing import Any + +log = logging.getLogger(__name__) + +DEFAULT_DB_PATH = "/sandbox/knowledge/agent.db" + + +@dataclass +class Chunk: + """One row from the chunks table — what callers see.""" + id: int + content: str + domain: str + heading: str | None + source: str | None + source_type: str | None + finding_type: str | None + created_at: str + updated_at: str + + def as_dict(self) -> dict[str, Any]: + return { + "id": self.id, + "content": self.content, + "domain": self.domain, + "heading": self.heading, + "source": self.source, + "source_type": self.source_type, + "finding_type": self.finding_type, + "created_at": self.created_at, + "updated_at": self.updated_at, + } + + +def _resolve_path(db_path: str | Path | None) -> Path: + """Pick a writable DB path. Env > arg > default; fall back to ~/.protoagent.""" + raw = os.environ.get("KNOWLEDGE_DB_PATH") or db_path or DEFAULT_DB_PATH + p = Path(str(raw)).expanduser() + try: + p.parent.mkdir(parents=True, exist_ok=True) + # Probe writability + probe = p.parent / ".write-probe" + probe.touch() + probe.unlink() + return p + except OSError: + fallback = Path.home() / ".protoagent" / "knowledge" / "agent.db" + fallback.parent.mkdir(parents=True, exist_ok=True) + log.info( + "[knowledge] %s not writable; using %s instead", + p, fallback, + ) + return fallback + + +def _now_iso() -> str: + return datetime.now(timezone.utc).isoformat() + + +def _has_fts5(db: sqlite3.Connection) -> bool: + try: + db.execute( + "CREATE VIRTUAL TABLE IF NOT EXISTS _fts5_probe USING fts5(x)" + ) + db.execute("DROP TABLE _fts5_probe") + return True + except sqlite3.OperationalError: + return False + + +_SCHEMA = """ +CREATE TABLE IF NOT EXISTS chunks ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + content TEXT NOT NULL, + domain TEXT NOT NULL DEFAULT 'general', + heading TEXT, + source TEXT, + source_type TEXT, + finding_type TEXT, + created_at TEXT NOT NULL, + updated_at TEXT NOT NULL +); + +CREATE INDEX IF NOT EXISTS idx_chunks_domain ON chunks(domain); +CREATE INDEX IF NOT EXISTS idx_chunks_created_at ON chunks(created_at); +""" + +_FTS_SCHEMA = """ +CREATE VIRTUAL TABLE IF NOT EXISTS chunks_fts USING fts5( + content, heading, content='chunks', content_rowid='id' +); + +CREATE TRIGGER IF NOT EXISTS chunks_ai AFTER INSERT ON chunks BEGIN + INSERT INTO chunks_fts(rowid, content, heading) + VALUES (new.id, new.content, new.heading); +END; + +CREATE TRIGGER IF NOT EXISTS chunks_ad AFTER DELETE ON chunks BEGIN + INSERT INTO chunks_fts(chunks_fts, rowid, content, heading) + VALUES('delete', old.id, old.content, old.heading); +END; + +CREATE TRIGGER IF NOT EXISTS chunks_au AFTER UPDATE ON chunks BEGIN + INSERT INTO chunks_fts(chunks_fts, rowid, content, heading) + VALUES('delete', old.id, old.content, old.heading); + INSERT INTO chunks_fts(rowid, content, heading) + VALUES (new.id, new.content, new.heading); +END; +""" + + +class KnowledgeStore: + """Default knowledge store. Sqlite + FTS5 (with LIKE fallback). + + Forks usually don't subclass this — extend ``add_chunk`` / + ``search`` directly when you need new fields, or wrap it with + your own embedding layer. + """ + + def __init__(self, db_path: str | Path | None = None): + self.path = _resolve_path(db_path) + self._fts_available: bool | None = None + self._init_db() + + # ── connection / schema ───────────────────────────────────────────────── + + def _connect(self) -> sqlite3.Connection: + db = sqlite3.connect(str(self.path)) + db.row_factory = sqlite3.Row + db.execute("PRAGMA journal_mode=WAL") + return db + + def _init_db(self) -> None: + try: + db = self._connect() + db.executescript(_SCHEMA) + self._fts_available = _has_fts5(db) + if self._fts_available: + db.executescript(_FTS_SCHEMA) + else: + log.info( + "[knowledge] FTS5 unavailable — search will use LIKE fallback" + ) + db.commit() + db.close() + except sqlite3.OperationalError as exc: + log.error("[knowledge] schema init failed at %s: %s", self.path, exc) + + # Convenience for middleware that wants the raw connection. Kept + # private so the public API stays small. + def _get_db(self) -> sqlite3.Connection | None: + try: + return self._connect() + except sqlite3.OperationalError as exc: + log.error("[knowledge] connect failed: %s", exc) + return None + + # ── writes ────────────────────────────────────────────────────────────── + + def add_chunk( + self, + content: str, + domain: str = "general", + heading: str | None = None, + *, + source: str | None = None, + source_type: str | None = None, + finding_type: str | None = None, + ) -> int | None: + """Insert a chunk. Returns the new row id, or None on failure.""" + if not content or not content.strip(): + return None + db = self._get_db() + if db is None: + return None + try: + now = _now_iso() + cur = db.execute( + "INSERT INTO chunks " + "(content, domain, heading, source, source_type, finding_type, " + "created_at, updated_at) VALUES (?, ?, ?, ?, ?, ?, ?, ?)", + (content, domain, heading, source, source_type, finding_type, now, now), + ) + db.commit() + return int(cur.lastrowid) + except sqlite3.OperationalError as exc: + log.error("[knowledge] add_chunk failed: %s", exc) + return None + finally: + db.close() + + def add_finding( + self, + content: str, + source: str = "conversation", + source_type: str = "chat", + finding_type: str = "insight", + ) -> int | None: + """Compatibility shim for ``MemoryMiddleware.after_agent``. + + Stored under ``domain='finding'`` so memory_list / memory_recall + can surface them alongside operator-set chunks. + """ + return self.add_chunk( + content, + domain="finding", + source=source, + source_type=source_type, + finding_type=finding_type, + ) + + # ── reads ─────────────────────────────────────────────────────────────── + + def search( + self, + query: str, + k: int = 5, + *, + domain: str | None = None, + ) -> list[dict[str, Any]]: + """Top-k chunks matching ``query``. Shape matches what the + ``KnowledgeMiddleware`` consumes: each result has ``table``, + ``preview``, plus the underlying chunk fields. + + Uses FTS5 when available, else a tokenized LIKE fallback. Returns + an empty list on no matches or DB failure (never raises). + """ + if not query or not query.strip(): + return [] + db = self._get_db() + if db is None: + return [] + try: + rows = self._search_fts(db, query, k, domain) if self._fts_available \ + else self._search_like(db, query, k, domain) + except sqlite3.OperationalError as exc: + log.warning("[knowledge] search failed: %s", exc) + rows = [] + finally: + db.close() + + results: list[dict[str, Any]] = [] + for r in rows: + preview = (r["heading"] + ": " if r["heading"] else "") + r["content"] + results.append({ + "table": "chunks", + "preview": preview[:240], + **dict(r), + }) + return results + + def _search_fts( + self, + db: sqlite3.Connection, + query: str, + k: int, + domain: str | None, + ) -> list[sqlite3.Row]: + # Sanitize to FTS5-safe tokens; OR them so a multi-word query + # matches any of the keywords (closer to LIKE behaviour). + tokens = [t for t in re.findall(r"[\w']+", query) if t] + if not tokens: + return [] + match = " OR ".join(tokens) + if domain: + return db.execute( + "SELECT c.* FROM chunks_fts f " + "JOIN chunks c ON c.id = f.rowid " + "WHERE chunks_fts MATCH ? AND c.domain = ? " + "ORDER BY rank LIMIT ?", + (match, domain, k), + ).fetchall() + return db.execute( + "SELECT c.* FROM chunks_fts f " + "JOIN chunks c ON c.id = f.rowid " + "WHERE chunks_fts MATCH ? " + "ORDER BY rank LIMIT ?", + (match, k), + ).fetchall() + + def _search_like( + self, + db: sqlite3.Connection, + query: str, + k: int, + domain: str | None, + ) -> list[sqlite3.Row]: + tokens = [t for t in re.findall(r"[\w']+", query) if t] + if not tokens: + return [] + # Score = number of tokens matched (rough recall-style ranking). + like_clauses = " + ".join( + "CASE WHEN content LIKE ? OR heading LIKE ? THEN 1 ELSE 0 END" + for _ in tokens + ) + params: list[Any] = [] + for t in tokens: + needle = f"%{t}%" + params.extend([needle, needle]) + sql = ( + f"SELECT *, ({like_clauses}) AS score FROM chunks " + "WHERE score > 0" + ) + if domain: + sql += " AND domain = ?" + params.append(domain) + sql += " ORDER BY score DESC, id DESC LIMIT ?" + params.append(k) + return db.execute(sql, params).fetchall() + + def list_chunks( + self, + domain: str | None = None, + limit: int = 50, + ) -> list[Chunk]: + """Most-recent-first chunk listing. Used by ``memory_list``.""" + db = self._get_db() + if db is None: + return [] + try: + if domain: + rows = db.execute( + "SELECT * FROM chunks WHERE domain = ? ORDER BY id DESC LIMIT ?", + (domain, limit), + ).fetchall() + else: + rows = db.execute( + "SELECT * FROM chunks ORDER BY id DESC LIMIT ?", + (limit,), + ).fetchall() + except sqlite3.OperationalError as exc: + log.warning("[knowledge] list_chunks failed: %s", exc) + rows = [] + finally: + db.close() + return [Chunk(**dict(r)) for r in rows] + + def stats(self) -> dict[str, int]: + """Return per-domain chunk counts plus a ``total`` key.""" + db = self._get_db() + if db is None: + return {"total": 0} + try: + rows = db.execute( + "SELECT domain, COUNT(*) AS n FROM chunks GROUP BY domain ORDER BY n DESC" + ).fetchall() + total = db.execute("SELECT COUNT(*) FROM chunks").fetchone()[0] + except sqlite3.OperationalError as exc: + log.warning("[knowledge] stats failed: %s", exc) + return {"total": 0} + finally: + db.close() + out = {r["domain"]: r["n"] for r in rows} + out["total"] = int(total) + return out + + # ── verification helpers (used by evals/verify.py) ────────────────────── + + def find_chunk_containing( + self, + text: str, + domain: str | None = None, + ) -> Chunk | None: + """Return the most-recent chunk whose content or heading contains ``text``. + + Used by the eval runner to assert side-effect outcomes after a + memory-writing turn. + """ + db = self._get_db() + if db is None: + return None + try: + sql = ( + "SELECT * FROM chunks " + "WHERE (content LIKE ? OR heading LIKE ?)" + ) + params: list[Any] = [f"%{text}%", f"%{text}%"] + if domain: + sql += " AND domain = ?" + params.append(domain) + sql += " ORDER BY id DESC LIMIT 1" + row = db.execute(sql, params).fetchone() + except sqlite3.OperationalError as exc: + log.warning("[knowledge] find_chunk_containing failed: %s", exc) + row = None + finally: + db.close() + return Chunk(**dict(row)) if row else None + + def delete_by_content(self, contains: str) -> int: + """Delete chunks whose content matches ``%contains%``. Returns count.""" + db = self._get_db() + if db is None: + return 0 + try: + cur = db.execute("DELETE FROM chunks WHERE content LIKE ?", (f"%{contains}%",)) + db.commit() + return int(cur.rowcount) + except sqlite3.OperationalError as exc: + log.warning("[knowledge] delete_by_content failed: %s", exc) + return 0 + finally: + db.close() + + def delete_by_heading(self, domain: str, heading: str) -> int: + """Delete chunks matching (domain, heading). Returns count.""" + db = self._get_db() + if db is None: + return 0 + try: + cur = db.execute( + "DELETE FROM chunks WHERE domain = ? AND heading = ?", + (domain, heading), + ) + db.commit() + return int(cur.rowcount) + except sqlite3.OperationalError as exc: + log.warning("[knowledge] delete_by_heading failed: %s", exc) + return 0 + finally: + db.close() diff --git a/server.py b/server.py index 75f9692..f1ecd46 100644 --- a/server.py +++ b/server.py @@ -90,8 +90,38 @@ def _init_langgraph_agent(): from graph.agent import create_agent_graph - _graph = create_agent_graph(_graph_config) - log.info("LangGraph agent initialized (model: %s)", _graph_config.model_name) + # Construct the default KnowledgeStore so memory tools (memory_ingest, + # memory_recall, daily_log) and KnowledgeMiddleware have something to + # bind to. Forks that don't want a store can set + # ``middleware.knowledge: false`` and remove the memory tools from + # the worker subagent — the store is still cheap to construct. + knowledge_store = _build_knowledge_store(_graph_config) + + _graph = create_agent_graph(_graph_config, knowledge_store=knowledge_store) + log.info( + "LangGraph agent initialized (model: %s, knowledge_db: %s)", + _graph_config.model_name, + getattr(knowledge_store, "path", "(disabled)"), + ) + + +def _build_knowledge_store(config): + """Return a ``KnowledgeStore`` bound to the configured DB path. + + Best-effort: any sqlite-level failure is logged and the store + falls back to ``~/.protoagent/knowledge/agent.db`` automatically + (see ``knowledge.store._resolve_path``). Returns ``None`` only when + knowledge is disabled in config — kept as a separate code path so + forks can audit when the agent is running KB-less. + """ + if not getattr(config, "knowledge_middleware", True): + return None + try: + from knowledge import KnowledgeStore + return KnowledgeStore(db_path=config.knowledge_db_path) + except Exception as exc: + log.warning("[server] knowledge store init failed: %s; running KB-less", exc) + return None def _reload_langgraph_agent() -> tuple[bool, str]: @@ -130,7 +160,8 @@ def _reload_langgraph_agent() -> tuple[bool, str]: # metrics / card / auth all de-sync from what's actually running. if is_setup_complete(): try: - new_graph = create_agent_graph(new_config) + new_store = _build_knowledge_store(new_config) + new_graph = create_agent_graph(new_config, knowledge_store=new_store) except Exception as e: log.exception("[reload] graph rebuild failed") return False, f"graph rebuild failed: {e}" diff --git a/tests/test_config_io.py b/tests/test_config_io.py index 25a7472..39fc017 100644 --- a/tests/test_config_io.py +++ b/tests/test_config_io.py @@ -105,11 +105,11 @@ def test_apply_updates_nested_worker(tmp_path: Path) -> None: config_io.apply_updates_to_yaml( doc, - {"subagents": {"worker": {"enabled": True, "tools": ["echo", "calculator"]}}}, + {"subagents": {"worker": {"enabled": True, "tools": ["current_time", "calculator"]}}}, ) assert doc["subagents"]["worker"]["enabled"] is True - assert list(doc["subagents"]["worker"]["tools"]) == ["echo", "calculator"] + assert list(doc["subagents"]["worker"]["tools"]) == ["current_time", "calculator"] # ── config_to_dict ─────────────────────────────────────────────────────────── @@ -325,9 +325,9 @@ def test_list_available_tools_returns_starter_set(): # Lock in the template's starter set — forks replace these but # the drawer's CheckboxGroup populates from this call, so the # contract is "return tool names in a stable list". - assert "echo" in names - assert "calculator" in names assert "current_time" in names + assert "calculator" in names + assert "fetch_url" in names assert all(isinstance(n, str) for n in names) diff --git a/tests/test_skill_curator.py b/tests/test_skill_curator.py index cb7bf43..3d8e211 100644 --- a/tests/test_skill_curator.py +++ b/tests/test_skill_curator.py @@ -52,7 +52,7 @@ def _make_skill( "name": name, "description": description, "prompt_template": f"Run the {name} workflow.", - "tools_used": ["echo"], + "tools_used": ["current_time"], "confidence": confidence, "created_at": _utc_iso(days_ago), } diff --git a/tests/test_skill_emission.py b/tests/test_skill_emission.py index 34b8f1c..6d9555b 100644 --- a/tests/test_skill_emission.py +++ b/tests/test_skill_emission.py @@ -88,14 +88,14 @@ def test_skill_datapart_serialization() -> None: name="dp-test", description="DataPart test", prompt_template="prompt", - tools_used=["echo"], + tools_used=["current_time"], source_session_id="s1", ) part = artifact.to_datapart() assert part["kind"] == "data" assert part["metadata"]["mimeType"] == SKILL_V1_MIME assert part["data"]["name"] == "dp-test" - assert part["data"]["tools_used"] == ["echo"] + assert part["data"]["tools_used"] == ["current_time"] # created_at must be present and parseable datetime.fromisoformat(part["data"]["created_at"]) @@ -125,7 +125,7 @@ def test_skill_artifact_validation_tools_not_list() -> None: with pytest.raises(TypeError, match="tools_used"): SkillV1Artifact( name="x", description="d", prompt_template="p", - tools_used="echo", # type: ignore[arg-type] + tools_used="current_time", # type: ignore[arg-type] ) @@ -250,7 +250,7 @@ def _run_emit_logic( def test_skill_emitted_when_emit_skill_true() -> None: """Skill artifact is emitted when emit_skill=True and subagent succeeds.""" msgs = [ - _make_ai_message_with_tool_calls(["echo"]), + _make_ai_message_with_tool_calls(["current_time"]), _make_ai_message_with_content("done"), ] _run_emit_logic( @@ -264,7 +264,7 @@ def test_skill_emitted_when_emit_skill_true() -> None: assert len(skills) == 1 skill = skills[0] assert skill.name == "my-task" - assert skill.tools_used == ["echo"] + assert skill.tools_used == ["current_time"] assert skill.prompt_template == "do the thing" assert "Captured workflow" in skill.description @@ -272,7 +272,7 @@ def test_skill_emitted_when_emit_skill_true() -> None: def test_no_emission_on_opt_out() -> None: """No skill artifact is emitted when emit_skill=False.""" msgs = [ - _make_ai_message_with_tool_calls(["echo"]), + _make_ai_message_with_tool_calls(["current_time"]), _make_ai_message_with_content("done"), ] _run_emit_logic( @@ -307,7 +307,7 @@ def test_no_emission_on_failure() -> None: def test_no_emission_when_config_disallows() -> None: """No skill artifact is emitted when allow_skill_emission=False.""" msgs = [ - _make_ai_message_with_tool_calls(["echo"]), + _make_ai_message_with_tool_calls(["current_time"]), _make_ai_message_with_content("done"), ] _run_emit_logic( @@ -323,8 +323,8 @@ def test_no_emission_when_config_disallows() -> None: def test_tool_tracking_metadata_captured() -> None: """tools_used in the artifact lists all tools invoked, deduplicated.""" msgs = [ - _make_ai_message_with_tool_calls(["echo", "calculator"]), - _make_ai_message_with_tool_calls(["echo"]), # duplicate — should appear once + _make_ai_message_with_tool_calls(["current_time", "calculator"]), + _make_ai_message_with_tool_calls(["current_time"]), # duplicate — should appear once _make_ai_message_with_content("result"), ] _run_emit_logic( @@ -336,7 +336,7 @@ def test_tool_tracking_metadata_captured() -> None: ) skills = get_pending_skills() assert len(skills) == 1 - assert skills[0].tools_used.count("echo") == 1 + assert skills[0].tools_used.count("current_time") == 1 assert "calculator" in skills[0].tools_used diff --git a/tests/test_starter_tools.py b/tests/test_starter_tools.py index fe4495b..f469365 100644 --- a/tests/test_starter_tools.py +++ b/tests/test_starter_tools.py @@ -114,13 +114,3 @@ async def test_fetch_url_rejects_non_http_scheme(): ): result = await fetch_url.ainvoke({"url": bad}) assert result.startswith("Error:"), f"accepted unsafe url: {bad!r}" - - -# ── echo — sanity ──────────────────────────────────────────────────────────── - - -@pytest.mark.asyncio -async def test_echo_sanity(): - from tools.lg_tools import echo - result = await echo.ainvoke({"message": "hello"}) - assert result == "echo: hello" diff --git a/tools/lg_tools.py b/tools/lg_tools.py index d8ce0f5..b59ccba 100644 --- a/tools/lg_tools.py +++ b/tools/lg_tools.py @@ -7,12 +7,20 @@ The template ships with a small starter set of free, keyless tools so a fresh clone can demonstrate real agent behaviour out of the box: -- ``echo`` — sanity check - ``current_time`` — wall-clock time in any IANA timezone - ``calculator`` — safe numeric expression evaluation - ``web_search`` — DuckDuckGo text search (via ``ddgs``, no API key) - ``fetch_url`` — fetch a URL and return cleaned text +Plus memory tools that bind to a ``KnowledgeStore`` (constructed in +``server.py`` and threaded through ``get_all_tools(knowledge_store)``): + +- ``memory_ingest`` — store a fact / preference / note +- ``memory_recall`` — search the store for relevant chunks +- ``memory_list`` — list recent chunks (optionally per domain) +- ``memory_stats`` — per-domain counts +- ``daily_log`` — convenience: write a daily-log chunk + Replace or extend this file with your agent's real tools and update ``get_all_tools()`` to return the full list. @@ -39,20 +47,6 @@ from langchain_core.tools import tool -# ── echo ───────────────────────────────────────────────────────────────────── - - -@tool -async def echo(message: str) -> str: - """Echo the input back with a prefix. Template-only sanity tool. - - Useful to verify the tool loop is wired end-to-end before real - tools are in place. Safe to delete once your fork has its own - tools. - """ - return f"echo: {message}" - - # ── current_time ───────────────────────────────────────────────────────────── @@ -273,16 +267,130 @@ def _extract_text_from_html(content: bytes) -> str: return "\n".join(lines) +# ── memory tools ───────────────────────────────────────────────────────────── +# +# Each memory tool is built by a factory that closes over the +# ``KnowledgeStore`` instance. Doing it this way (rather than module- +# level globals) keeps tests isolated — they pass a temp store and get +# a fresh tool list bound to it. Production constructs one store in +# ``server.py`` and reuses the bound tools for the lifetime of the +# process. + + +def _build_memory_tools(knowledge_store): + """Bind memory tools to a ``KnowledgeStore``. Returns a list.""" + from datetime import datetime, timezone + + @tool + async def memory_ingest( + content: str, + domain: str = "general", + heading: str | None = None, + ) -> str: + """Store a fact, preference, or note in long-term memory. + + Use this for things the operator wants you to remember across + sessions — preferences ("I take my coffee black"), facts about + the operator's environment, decisions worth recalling later. + + Args: + content: The text to remember. Be specific and self-contained; + the chunk is retrieved by keyword search. + domain: Logical bucket — ``"preferences"``, ``"context"``, + ``"general"``. Defaults to ``"general"``. + heading: Optional short label (e.g. ``"coffee"``) used as a + stable de-dupe key by the eval suite and curator. + + Returns ``"Stored chunk N in 'domain'."`` on success. + """ + chunk_id = knowledge_store.add_chunk(content, domain=domain, heading=heading) + if chunk_id is None: + return "Error: failed to store chunk (knowledge store unavailable)." + return f"Stored chunk {chunk_id} in {domain!r}." + + @tool + async def memory_recall(query: str, k: int = 5) -> str: + """Search long-term memory for chunks relevant to ``query``. + + Returns the top-k matches, one per line. Pull this when the + operator asks something where stored context is more reliable + than the model's own training data ("what's my coffee order?", + "remind me what we decided about the auth migration"). + + Returns ``"No matches."`` when the store is empty or nothing + scores above the keyword threshold. + """ + results = knowledge_store.search(query, k=k) + if not results: + return "No matches." + lines = [] + for r in results: + lines.append(f"[{r.get('domain', '?')}] {r['preview']}") + return "\n".join(lines) + + @tool + async def memory_list(domain: str | None = None, limit: int = 10) -> str: + """List the most recent chunks. Filter by domain when given. + + Useful when the operator asks for recent activity ("what did I + log today?") or wants to inspect what the agent has stored. + """ + chunks = knowledge_store.list_chunks(domain=domain, limit=limit) + if not chunks: + return f"No chunks in {domain or 'any domain'}." + lines = [] + for c in chunks: + head = f"[{c.domain}]" + if c.heading: + head += f" {c.heading}:" + preview = (c.content or "")[:200] + lines.append(f"{c.created_at} {head} {preview}") + return "\n".join(lines) + + @tool + async def memory_stats() -> str: + """Return chunk counts per domain. Useful for sanity checks.""" + s = knowledge_store.stats() + if s.get("total", 0) == 0: + return "Knowledge store is empty." + lines = [f"Total: {s['total']}"] + for k, v in s.items(): + if k == "total": + continue + lines.append(f" {k}: {v}") + return "\n".join(lines) + + @tool + async def daily_log(content: str) -> str: + """Append a daily-log entry for today. + + Stored under ``domain='daily-log'`` with today's UTC date as + the heading, so the same day's entries cluster together for + ``memory_list(domain='daily-log')`` queries. + """ + today = datetime.now(timezone.utc).date().isoformat() + chunk_id = knowledge_store.add_chunk( + content, domain="daily-log", heading=today, + ) + if chunk_id is None: + return "Error: failed to write daily log entry." + return f"Logged ({today}): {content[:120]}" + + return [memory_ingest, memory_recall, memory_list, memory_stats, daily_log] + + # ── registry ───────────────────────────────────────────────────────────────── def get_all_tools(knowledge_store=None): """Return every LangChain tool the lead agent + subagents can use. - ``knowledge_store`` is threaded through for agents that ship a - knowledge / memory subsystem (see ``graph/middleware/knowledge.py`` - for the hook-in pattern). The template doesn't ship a store — the - parameter is kept so adding one later doesn't require touching - every call site. + When ``knowledge_store`` is provided, the memory tools are bound + to it and included. Forks that disable the store can pass + ``knowledge_store=None`` and the lead agent runs with the four + keyless tools only. """ - return [echo, current_time, calculator, web_search, fetch_url] + tools = [current_time, calculator, web_search, fetch_url] + if knowledge_store is not None: + tools.extend(_build_memory_tools(knowledge_store)) + return tools From 4d0d4288e483fa464def76382f8bb4b469c97bc0 Mon Sep 17 00:00:00 2001 From: Josh Mabry Date: Mon, 27 Apr 2026 14:46:58 -0700 Subject: [PATCH 2/6] fix(review): address PR #155 CodeRabbit feedback MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Real bugs: - evals/runner.py: teardown now runs in a finally block so seeded KB rows get cleaned up even when the verifier or client.ask() raises. expected_tools=[] now means "assert no tools fired" (was conflated with "no key" via the `or []` short-circuit, making the abstention case a no-op). - evals/runner.py + tasks.json: added a `stream` runner kind so AgentClient.stream() is reachable from tasks.json — new streaming_status_updates case asserts the SSE event sequence. - knowledge/store.py: PRAGMA journal_mode=WAL is now best-effort (read-only DBs no longer break _connect). FTS5 rebuild after schema install so an existing chunks table populated before FTS was added gets indexed. find_chunk_containing/delete_by_content reject empty/whitespace-only inputs to prevent LIKE '%%' wildcards from matching every row. Hardening: - tools/lg_tools.py: clamp memory_recall(k) to [1, 20] and memory_list(limit) to [1, 200] so the agent can't request arbitrarily large slices of the KB. Doc cleanup: - docs/guides/subagents.md: LangGraphConfig snippet had a stale "echo" reference; replaced with the new memory-tool list. - docs/tutorials/first-tool.md: WORKER_CONFIG example now appends git_sha alongside the bundled defaults instead of replacing them and dropping the memory tools. - docs/reference/starter-tools.md: "adding your own" snippet now preserves the conditional _build_memory_tools(knowledge_store) extension. - tests/test_config_io.py: starter-tool contract assertion now also covers web_search. Co-Authored-By: Claude Opus 4.7 (1M context) --- docs/guides/subagents.md | 6 +- docs/reference/starter-tools.md | 7 +- docs/tutorials/first-tool.md | 9 +- evals/runner.py | 155 +++++++++++++++++++++----------- evals/tasks.json | 10 +++ knowledge/store.py | 33 ++++++- tests/test_config_io.py | 1 + tools/lg_tools.py | 10 ++- 8 files changed, 167 insertions(+), 64 deletions(-) diff --git a/docs/guides/subagents.md b/docs/guides/subagents.md index 9fc1a9a..031cff6 100644 --- a/docs/guides/subagents.md +++ b/docs/guides/subagents.md @@ -56,7 +56,11 @@ The template's `LangGraphConfig` (in `graph/config.py`) has a `worker` field. Ad class LangGraphConfig: # ... existing fields ... worker: SubagentDef = field(default_factory=lambda: SubagentDef( - tools=["echo", "current_time", "calculator", "web_search", "fetch_url"], + tools=[ + "current_time", "calculator", "web_search", "fetch_url", + "memory_ingest", "memory_recall", "memory_list", "memory_stats", + "daily_log", + ], max_turns=20, )) researcher: SubagentDef = field(default_factory=lambda: SubagentDef( diff --git a/docs/reference/starter-tools.md b/docs/reference/starter-tools.md index c0918b5..60d74d4 100644 --- a/docs/reference/starter-tools.md +++ b/docs/reference/starter-tools.md @@ -180,11 +180,14 @@ async def my_tool(required_arg: str, optional_arg: int = 5) -> str: return f"Success: {result}" ``` -Then append it to the list in `get_all_tools()`: +Then append it to the keyless tool list in `get_all_tools()` — keep the conditional `_build_memory_tools(knowledge_store)` extension below it so the bundled memory tools still ship when a store is configured: ```python def get_all_tools(knowledge_store=None): - return [current_time, calculator, web_search, fetch_url, my_tool] + tools = [current_time, calculator, web_search, fetch_url, my_tool] + if knowledge_store is not None: + tools.extend(_build_memory_tools(knowledge_store)) + return tools ``` See [Write your first tool](/tutorials/first-tool) for the full walkthrough. diff --git a/docs/tutorials/first-tool.md b/docs/tutorials/first-tool.md index 87502d0..056a8e4 100644 --- a/docs/tutorials/first-tool.md +++ b/docs/tutorials/first-tool.md @@ -47,12 +47,17 @@ def get_all_tools(knowledge_store=None): ## 2. Allow the subagent to use it (optional) -If you want the worker subagent to be able to call `git_sha`, add it to the allowlist in `graph/subagents/config.py`: +If you want the worker subagent to be able to call `git_sha`, add it to the allowlist in `graph/subagents/config.py`. Append rather than replace — dropping the bundled defaults removes the worker's memory tools: ```python WORKER_CONFIG = SubagentConfig( # ... - tools=["current_time", "calculator", "web_search", "fetch_url", "git_sha"], + tools=[ + "current_time", "calculator", "web_search", "fetch_url", + "memory_ingest", "memory_recall", "memory_list", "memory_stats", + "daily_log", + "git_sha", # ← new + ], # ... ) ``` diff --git a/evals/runner.py b/evals/runner.py index ad5154b..522f830 100644 --- a/evals/runner.py +++ b/evals/runner.py @@ -129,7 +129,26 @@ async def _run_auth_check(client: AgentClient, case: dict) -> CaseResult: async def _run_ask(client: AgentClient, case: dict) -> CaseResult: + """Send via ``message/send`` + poll. Teardown always runs.""" + return await _run_prompt_case(client, case, streaming=False) + + +async def _run_stream(client: AgentClient, case: dict) -> CaseResult: + """Send via ``message/stream`` + SSE. Same assertion shape as ``ask``, + plus an optional ``expected_event_kinds`` list that asserts the SSE + stream surfaced the named event kinds (``status-update``, ``task``, + etc.) at least once.""" + return await _run_prompt_case(client, case, streaming=True) + + +async def _run_prompt_case( + client: AgentClient, + case: dict, + *, + streaming: bool, +) -> CaseResult: # Pre-seed state via direct DB writes (model never sees this). + setup_applied = False if "setup" in case: err = verify.apply_setup(case["setup"]) if err: @@ -137,66 +156,93 @@ async def _run_ask(client: AgentClient, case: dict) -> CaseResult: case["id"], case["category"], case["name"], False, f"setup failed: {err}", ) + setup_applied = True - since = verify.audit_now() - result: TaskResult = await client.ask( - case["prompt"], timeout_s=case.get("timeout_s", 90), - ) + events: list[dict] = [] + result: TaskResult | None = None - if result.state != "completed": - if "teardown" in case: - verify.apply_teardown(case["teardown"]) - return CaseResult( - case["id"], case["category"], case["name"], False, - f"task state={result.state}; error={result.error or '(none)'}", - duration_ms=result.duration_ms, - raw={"text": result.text[:200]}, - ) + try: + since = verify.audit_now() - problems: list[str] = [] + if streaming: + events, result = await client.stream( + case["prompt"], timeout_s=case.get("timeout_s", 90), + ) + else: + result = await client.ask( + case["prompt"], timeout_s=case.get("timeout_s", 90), + ) - # Tool firing assertions. - expected_tools = case.get("expected_tools") or [] - if expected_tools: - await asyncio.sleep(0.3) # let the audit log catch up - entries = verify.audit_entries_since(since) - require_success = case.get("tool_outcome", "success") == "success" - passed, detail = verify.assert_tools_fired( - entries, expected_tools, require_success=require_success, + if result is None or result.state != "completed": + state = result.state if result else "no-final-event" + error = (result.error if result else None) or "(none)" + duration = result.duration_ms if result else 0 + text_preview = (result.text if result else "")[:200] + return CaseResult( + case["id"], case["category"], case["name"], False, + f"task state={state}; error={error}", + duration_ms=duration, + raw={"text": text_preview}, + ) + + problems: list[str] = [] + + # Tool firing assertions. ``expected_tools is not None`` so an + # explicit empty list asserts that *no* tools fired (abstention + # cases). Missing key skips the audit check entirely. + expected_tools = case.get("expected_tools") + if expected_tools is not None: + await asyncio.sleep(0.3) # let the audit log catch up + entries = verify.audit_entries_since(since) + require_success = case.get("tool_outcome", "success") == "success" + passed, detail = verify.assert_tools_fired( + entries, expected_tools, require_success=require_success, + ) + if not passed: + problems.append(detail) + + # Text pattern assertions (case-insensitive substrings). + text_lower = result.text.lower() + for pattern in case.get("expected_patterns") or []: + if pattern.lower() not in text_lower: + problems.append(f"missing pattern {pattern!r}") + + # KB side-effect assertions. + vk = case.get("verify_kb") or {} + if "find_chunk_containing" in vk: + chunk = verify.find_chunk_containing( + vk["find_chunk_containing"], domain=vk.get("domain"), + ) + if not chunk: + problems.append(f"no chunk containing {vk['find_chunk_containing']!r}") + + # Streaming-only: assert the SSE event sequence surfaced the + # expected kinds at least once. + if streaming: + seen_kinds = {e.get("kind") for e in events} + for kind in case.get("expected_event_kinds") or []: + if kind not in seen_kinds: + problems.append(f"missing SSE event kind {kind!r}; saw {sorted(seen_kinds)}") + + detail = ( + "; ".join(problems) if problems + else f"OK ({result.duration_ms}ms, {result.usage.get('total_tokens', '?')}t)" ) - if not passed: - problems.append(detail) - - # Text pattern assertions (case-insensitive substrings). - text_lower = result.text.lower() - for pattern in case.get("expected_patterns") or []: - if pattern.lower() not in text_lower: - problems.append(f"missing pattern {pattern!r}") - - # KB side-effect assertions. - vk = case.get("verify_kb") or {} - if "find_chunk_containing" in vk: - chunk = verify.find_chunk_containing( - vk["find_chunk_containing"], domain=vk.get("domain"), + return CaseResult( + case["id"], case["category"], case["name"], + passed=not problems, + detail=detail, + duration_ms=result.duration_ms, + tokens=result.usage.get("total_tokens", 0) or 0, + raw={"reply": result.text[:300]}, ) - if not chunk: - problems.append(f"no chunk containing {vk['find_chunk_containing']!r}") - - if "teardown" in case: - verify.apply_teardown(case["teardown"]) - - detail = ( - "; ".join(problems) if problems - else f"OK ({result.duration_ms}ms, {result.usage.get('total_tokens', '?')}t)" - ) - return CaseResult( - case["id"], case["category"], case["name"], - passed=not problems, - detail=detail, - duration_ms=result.duration_ms, - tokens=result.usage.get("total_tokens", 0) or 0, - raw={"reply": result.text[:300]}, - ) + finally: + # Teardown unconditionally — even when the task crashed or + # an assertion raised — so seeded KB rows never leak into the + # next case. + if setup_applied or "teardown" in case: + if "teardown" in case: + verify.apply_teardown(case["teardown"]) # ── dispatch ──────────────────────────────────────────────────────────────── @@ -206,6 +252,7 @@ async def _run_ask(client: AgentClient, case: dict) -> CaseResult: "agent_card": _run_agent_card, "auth_check": _run_auth_check, "ask": _run_ask, + "stream": _run_stream, } diff --git a/evals/tasks.json b/evals/tasks.json index 14cdd16..d4b5389 100644 --- a/evals/tasks.json +++ b/evals/tasks.json @@ -17,6 +17,16 @@ "bad_token": "definitely-not-the-real-token", "expect": {"status": 401} }, + { + "id": "streaming_status_updates", + "category": "a2a-protocol", + "kind": "stream", + "name": "message/stream surfaces status-update events ending in final=true", + "prompt": "Hi.", + "expected_tools": [], + "expected_patterns": [], + "expected_event_kinds": ["status-update"] + }, { "id": "abstain_no_tool", diff --git a/knowledge/store.py b/knowledge/store.py index a6aee7a..72d9353 100644 --- a/knowledge/store.py +++ b/knowledge/store.py @@ -162,7 +162,14 @@ def __init__(self, db_path: str | Path | None = None): def _connect(self) -> sqlite3.Connection: db = sqlite3.connect(str(self.path)) db.row_factory = sqlite3.Row - db.execute("PRAGMA journal_mode=WAL") + # WAL is best-effort — read-only sqlite files (e.g. immutable + # mounts) reject the PRAGMA. The connection stays usable for + # reads; only writes will fail later, and those go through + # the per-method OperationalError guards. + try: + db.execute("PRAGMA journal_mode=WAL") + except sqlite3.OperationalError as exc: + log.debug("[knowledge] PRAGMA journal_mode=WAL skipped: %s", exc) return db def _init_db(self) -> None: @@ -172,6 +179,16 @@ def _init_db(self) -> None: self._fts_available = _has_fts5(db) if self._fts_available: db.executescript(_FTS_SCHEMA) + # Re-index any pre-existing rows. The CREATE TRIGGER + # statements only fire on subsequent inserts, so a DB + # populated before FTS was added would have an empty + # virtual table without this rebuild. + try: + db.execute( + "INSERT INTO chunks_fts(chunks_fts) VALUES('rebuild')" + ) + except sqlite3.OperationalError as exc: + log.debug("[knowledge] FTS rebuild skipped: %s", exc) else: log.info( "[knowledge] FTS5 unavailable — search will use LIKE fallback" @@ -399,8 +416,12 @@ def find_chunk_containing( """Return the most-recent chunk whose content or heading contains ``text``. Used by the eval runner to assert side-effect outcomes after a - memory-writing turn. + memory-writing turn. Empty / whitespace-only ``text`` returns + ``None`` rather than building a ``LIKE '%%'`` predicate that + would match every row. """ + if not text or not text.strip(): + return None db = self._get_db() if db is None: return None @@ -423,7 +444,13 @@ def find_chunk_containing( return Chunk(**dict(row)) if row else None def delete_by_content(self, contains: str) -> int: - """Delete chunks whose content matches ``%contains%``. Returns count.""" + """Delete chunks whose content matches ``%contains%``. Returns count. + + Empty / whitespace-only ``contains`` is a no-op — the alternative + is ``DELETE WHERE content LIKE '%%'`` which wipes every row. + """ + if not contains or not contains.strip(): + return 0 db = self._get_db() if db is None: return 0 diff --git a/tests/test_config_io.py b/tests/test_config_io.py index 39fc017..caf0bb2 100644 --- a/tests/test_config_io.py +++ b/tests/test_config_io.py @@ -327,6 +327,7 @@ def test_list_available_tools_returns_starter_set(): # contract is "return tool names in a stable list". assert "current_time" in names assert "calculator" in names + assert "web_search" in names assert "fetch_url" in names assert all(isinstance(n, str) for n in names) diff --git a/tools/lg_tools.py b/tools/lg_tools.py index b59ccba..fd2af1a 100644 --- a/tools/lg_tools.py +++ b/tools/lg_tools.py @@ -277,6 +277,10 @@ def _extract_text_from_html(content: bytes) -> str: # process. +_MEMORY_RECALL_MAX_K = 20 +_MEMORY_LIST_MAX_LIMIT = 200 + + def _build_memory_tools(knowledge_store): """Bind memory tools to a ``KnowledgeStore``. Returns a list.""" from datetime import datetime, timezone @@ -320,7 +324,8 @@ async def memory_recall(query: str, k: int = 5) -> str: Returns ``"No matches."`` when the store is empty or nothing scores above the keyword threshold. """ - results = knowledge_store.search(query, k=k) + clamped_k = max(1, min(int(k), _MEMORY_RECALL_MAX_K)) + results = knowledge_store.search(query, k=clamped_k) if not results: return "No matches." lines = [] @@ -335,7 +340,8 @@ async def memory_list(domain: str | None = None, limit: int = 10) -> str: Useful when the operator asks for recent activity ("what did I log today?") or wants to inspect what the agent has stored. """ - chunks = knowledge_store.list_chunks(domain=domain, limit=limit) + clamped_limit = max(1, min(int(limit), _MEMORY_LIST_MAX_LIMIT)) + chunks = knowledge_store.list_chunks(domain=domain, limit=clamped_limit) if not chunks: return f"No chunks in {domain or 'any domain'}." lines = [] From cab3bd8d0f57d8df3ad98c25345cc36df2a11e68 Mon Sep 17 00:00:00 2001 From: Claude Date: Mon, 27 Apr 2026 22:02:53 +0000 Subject: [PATCH 3/6] fix(review-2): address round-2 PR #155 CodeRabbit feedback - evals/runner.py: collapse redundant nested teardown guard into a single `if "teardown" in case:` (SIM102); remove now-unused `setup_applied` flag - knowledge/store.py: use `datetime.UTC` alias (Python 3.11+, UP017) - tools/lg_tools.py: add `-> list` return annotation to `_build_memory_tools` (ANN202); replace explicit loop with list comprehension in `memory_recall` (PERF401) Co-Authored-By: Claude Opus 4.7 (1M context) https://claude.ai/code/session_01148o8ppbuQwuZBsVGTQWwQ --- evals/runner.py | 7 ++----- knowledge/store.py | 4 ++-- tools/lg_tools.py | 6 ++---- 3 files changed, 6 insertions(+), 11 deletions(-) diff --git a/evals/runner.py b/evals/runner.py index 522f830..d767f90 100644 --- a/evals/runner.py +++ b/evals/runner.py @@ -148,7 +148,6 @@ async def _run_prompt_case( streaming: bool, ) -> CaseResult: # Pre-seed state via direct DB writes (model never sees this). - setup_applied = False if "setup" in case: err = verify.apply_setup(case["setup"]) if err: @@ -156,7 +155,6 @@ async def _run_prompt_case( case["id"], case["category"], case["name"], False, f"setup failed: {err}", ) - setup_applied = True events: list[dict] = [] result: TaskResult | None = None @@ -240,9 +238,8 @@ async def _run_prompt_case( # Teardown unconditionally — even when the task crashed or # an assertion raised — so seeded KB rows never leak into the # next case. - if setup_applied or "teardown" in case: - if "teardown" in case: - verify.apply_teardown(case["teardown"]) + if "teardown" in case: + verify.apply_teardown(case["teardown"]) # ── dispatch ──────────────────────────────────────────────────────────────── diff --git a/knowledge/store.py b/knowledge/store.py index 72d9353..62c5f54 100644 --- a/knowledge/store.py +++ b/knowledge/store.py @@ -31,7 +31,7 @@ import re import sqlite3 from dataclasses import dataclass -from datetime import datetime, timezone +from datetime import UTC, datetime from pathlib import Path from typing import Any @@ -89,7 +89,7 @@ def _resolve_path(db_path: str | Path | None) -> Path: def _now_iso() -> str: - return datetime.now(timezone.utc).isoformat() + return datetime.now(UTC).isoformat() def _has_fts5(db: sqlite3.Connection) -> bool: diff --git a/tools/lg_tools.py b/tools/lg_tools.py index fd2af1a..161ddcb 100644 --- a/tools/lg_tools.py +++ b/tools/lg_tools.py @@ -281,7 +281,7 @@ def _extract_text_from_html(content: bytes) -> str: _MEMORY_LIST_MAX_LIMIT = 200 -def _build_memory_tools(knowledge_store): +def _build_memory_tools(knowledge_store) -> list: """Bind memory tools to a ``KnowledgeStore``. Returns a list.""" from datetime import datetime, timezone @@ -328,9 +328,7 @@ async def memory_recall(query: str, k: int = 5) -> str: results = knowledge_store.search(query, k=clamped_k) if not results: return "No matches." - lines = [] - for r in results: - lines.append(f"[{r.get('domain', '?')}] {r['preview']}") + lines = [f"[{r.get('domain', '?')}] {r['preview']}" for r in results] return "\n".join(lines) @tool From b3a9f1dae02730a13734614dfe8481717cc1c6c1 Mon Sep 17 00:00:00 2001 From: Josh Mabry Date: Mon, 27 Apr 2026 15:25:15 -0700 Subject: [PATCH 4/6] fix(review-3): address round-3 PR #155 CodeRabbit feedback MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Real bugs: - evals/runner.py: setup is now inside the try block so a partial setup failure (e.g. step 2 of 3 errors) still triggers the finally teardown — rows from steps that did succeed no longer leak into the next case. Was flagged as a duplicate from round 2. - knowledge/store.py: LIKE patterns now escape % and _ via ESCAPE '\' on every clause that takes user input (find_chunk_containing, delete_by_content, _search_like). A query for "100%" or "hello_world" no longer silently matches every row containing "100" or any single character between "hello" and "world". - knowledge/store.py: FTS5 MATCH tokens are now double-quoted via _fts_quote() so user-supplied query terms can't smuggle FTS5 operators (column filters, prefix wildcards, NEAR, AND/OR/NOT). Defence in depth — the [\w']+ tokenizer already filters most special chars. Hardening: - evals/runner.py: the fixed 0.3s asyncio.sleep waiting for the audit log to flush is gone. _await_audit_assertion now polls every 50ms up to a 2s deadline and returns as soon as the assertion passes — exits early on success, only burns the full deadline when the tool genuinely never fired. - evals/runner.py: _run_auth_check accepts case["headers"] so cases can override the default bearer-only header set and exercise X-API-Key auth scenarios (or both auths together). - knowledge/store.py: per-method exception handlers broadened from sqlite3.OperationalError to sqlite3.DatabaseError. Catches IntegrityError, ProgrammingError, and corruption variants too without crashing the agent loop. _has_fts5 (probe) and _connect (connection-time errors only) keep the narrower OperationalError. Co-Authored-By: Claude Opus 4.7 (1M context) --- evals/runner.py | 71 +++++++++++++++++++++++++++++++++--------- knowledge/store.py | 77 ++++++++++++++++++++++++++++++++++++---------- 2 files changed, 116 insertions(+), 32 deletions(-) diff --git a/evals/runner.py b/evals/runner.py index d767f90..2fa148f 100644 --- a/evals/runner.py +++ b/evals/runner.py @@ -91,7 +91,14 @@ async def _run_agent_card(client: AgentClient, case: dict) -> CaseResult: async def _run_auth_check(client: AgentClient, case: dict) -> CaseResult: - """Verify the A2A endpoint rejects a bad bearer with the expected status.""" + """Verify the A2A endpoint rejects a request with the expected status. + + Default behaviour exercises bearer auth alone using ``case["bad_token"]``. + Cases can override headers via ``case["headers"]`` to test other + auth surfaces — e.g. ``{"X-API-Key": "wrong"}`` for the legacy + X-API-Key path. ``Content-Type: application/json`` is always set + for the eval client; case headers override anything else. + """ import httpx expected_status = case.get("expect", {}).get("status", 401) @@ -99,8 +106,8 @@ async def _run_auth_check(client: AgentClient, case: dict) -> CaseResult: headers = { "Content-Type": "application/json", "Authorization": f"Bearer {bad}", - # No X-API-Key — testing bearer alone. } + headers.update(case.get("headers") or {}) payload = { "jsonrpc": "2.0", "id": "auth-check", @@ -141,25 +148,61 @@ async def _run_stream(client: AgentClient, case: dict) -> CaseResult: return await _run_prompt_case(client, case, streaming=True) +_AUDIT_POLL_DEADLINE_S = 2.0 +_AUDIT_POLL_INTERVAL_S = 0.05 + + +async def _await_audit_assertion( + since: str, + expected_tools: list[str], + *, + require_success: bool, +) -> tuple[list[dict], bool, str]: + """Poll the audit log until ``expected_tools`` have all fired (or the + deadline is hit). Returns ``(entries, passed, detail)``. + + Replaces a fixed ``asyncio.sleep`` — under audit-log contention the + fixed wait was sometimes shorter than the flush, causing flaky + tool-firing assertions. Polling exits as soon as the assertion + passes; the deadline only kicks in when the tool genuinely never + fired. + """ + deadline = asyncio.get_event_loop().time() + _AUDIT_POLL_DEADLINE_S + entries: list[dict] = [] + passed = False + detail = "" + while True: + entries = verify.audit_entries_since(since) + passed, detail = verify.assert_tools_fired( + entries, expected_tools, require_success=require_success, + ) + if passed or asyncio.get_event_loop().time() >= deadline: + return entries, passed, detail + await asyncio.sleep(_AUDIT_POLL_INTERVAL_S) + + async def _run_prompt_case( client: AgentClient, case: dict, *, streaming: bool, ) -> CaseResult: - # Pre-seed state via direct DB writes (model never sees this). - if "setup" in case: - err = verify.apply_setup(case["setup"]) - if err: - return CaseResult( - case["id"], case["category"], case["name"], False, - f"setup failed: {err}", - ) - events: list[dict] = [] result: TaskResult | None = None try: + # Pre-seed state via direct DB writes (model never sees this). + # Inside the ``try`` so a partial setup failure still triggers + # the ``finally`` teardown — otherwise rows from the steps that + # *did* succeed would leak into the next case. + if "setup" in case: + err = verify.apply_setup(case["setup"]) + if err: + return CaseResult( + case["id"], case["category"], case["name"], False, + f"setup failed: {err}", + ) + since = verify.audit_now() if streaming: @@ -190,11 +233,9 @@ async def _run_prompt_case( # cases). Missing key skips the audit check entirely. expected_tools = case.get("expected_tools") if expected_tools is not None: - await asyncio.sleep(0.3) # let the audit log catch up - entries = verify.audit_entries_since(since) require_success = case.get("tool_outcome", "success") == "success" - passed, detail = verify.assert_tools_fired( - entries, expected_tools, require_success=require_success, + entries, passed, detail = await _await_audit_assertion( + since, expected_tools, require_success=require_success, ) if not passed: problems.append(detail) diff --git a/knowledge/store.py b/knowledge/store.py index 62c5f54..473bedd 100644 --- a/knowledge/store.py +++ b/knowledge/store.py @@ -17,7 +17,8 @@ - If the configured path is unwritable (running locally outside the container, no /sandbox), falls back to ``~/.protoagent/knowledge/agent.db`` so a fresh ``python server.py`` works without sudo. -- All write operations swallow ``sqlite3.OperationalError`` and log; +- All write operations swallow ``sqlite3.DatabaseError`` (covers + OperationalError, IntegrityError, and corruption variants) and log; the store never crashes the agent loop on a corrupt or read-only DB. Forks that want embeddings on top of FTS5 can subclass and override @@ -92,6 +93,36 @@ def _now_iso() -> str: return datetime.now(UTC).isoformat() +# LIKE escaping — sqlite treats ``%`` and ``_`` as wildcards in LIKE +# patterns. Without escaping, a search for ``"100%"`` matches every row +# starting with ``"100"`` instead of literal "100%". We escape them +# alongside the escape char itself, then bind ``ESCAPE '\'`` on every +# LIKE clause that takes user input. +_LIKE_ESCAPE = "\\" + + +def _escape_like(text: str) -> str: + """Escape ``%``, ``_``, and the escape char for safe LIKE matching.""" + return ( + text + .replace(_LIKE_ESCAPE, _LIKE_ESCAPE + _LIKE_ESCAPE) + .replace("%", _LIKE_ESCAPE + "%") + .replace("_", _LIKE_ESCAPE + "_") + ) + + +def _fts_quote(token: str) -> str: + """Quote a token for FTS5 MATCH so it's treated as a literal phrase. + + FTS5 has its own query syntax (column filters, prefix wildcards, + NEAR, AND/OR/NOT operators). Wrapping each token in double quotes + forces FTS5 to interpret it as a phrase token, neutralising any + operator characters the user happened to type. Internal double + quotes are doubled per FTS5 phrase rules. + """ + return '"' + token.replace('"', '""') + '"' + + def _has_fts5(db: sqlite3.Connection) -> bool: try: db.execute( @@ -187,7 +218,7 @@ def _init_db(self) -> None: db.execute( "INSERT INTO chunks_fts(chunks_fts) VALUES('rebuild')" ) - except sqlite3.OperationalError as exc: + except sqlite3.DatabaseError as exc: log.debug("[knowledge] FTS rebuild skipped: %s", exc) else: log.info( @@ -195,7 +226,7 @@ def _init_db(self) -> None: ) db.commit() db.close() - except sqlite3.OperationalError as exc: + except sqlite3.DatabaseError as exc: log.error("[knowledge] schema init failed at %s: %s", self.path, exc) # Convenience for middleware that wants the raw connection. Kept @@ -235,7 +266,7 @@ def add_chunk( ) db.commit() return int(cur.lastrowid) - except sqlite3.OperationalError as exc: + except sqlite3.DatabaseError as exc: log.error("[knowledge] add_chunk failed: %s", exc) return None finally: @@ -285,7 +316,7 @@ def search( try: rows = self._search_fts(db, query, k, domain) if self._fts_available \ else self._search_like(db, query, k, domain) - except sqlite3.OperationalError as exc: + except sqlite3.DatabaseError as exc: log.warning("[knowledge] search failed: %s", exc) rows = [] finally: @@ -310,10 +341,14 @@ def _search_fts( ) -> list[sqlite3.Row]: # Sanitize to FTS5-safe tokens; OR them so a multi-word query # matches any of the keywords (closer to LIKE behaviour). + # Each token is double-quoted so FTS5 treats it as a literal + # phrase rather than parsing operators (column filters, prefix + # wildcards, NEAR, etc.) — even though ``[\w']+`` already + # filters most special chars, defence in depth is cheap. tokens = [t for t in re.findall(r"[\w']+", query) if t] if not tokens: return [] - match = " OR ".join(tokens) + match = " OR ".join(_fts_quote(t) for t in tokens) if domain: return db.execute( "SELECT c.* FROM chunks_fts f " @@ -341,14 +376,18 @@ def _search_like( if not tokens: return [] # Score = number of tokens matched (rough recall-style ranking). + # User-supplied tokens are LIKE-escaped so a query containing + # ``%`` or ``_`` doesn't silently match every row; ESCAPE is + # bound on each clause. like_clauses = " + ".join( - "CASE WHEN content LIKE ? OR heading LIKE ? THEN 1 ELSE 0 END" + "CASE WHEN content LIKE ? ESCAPE ? OR heading LIKE ? ESCAPE ? " + "THEN 1 ELSE 0 END" for _ in tokens ) params: list[Any] = [] for t in tokens: - needle = f"%{t}%" - params.extend([needle, needle]) + needle = f"%{_escape_like(t)}%" + params.extend([needle, _LIKE_ESCAPE, needle, _LIKE_ESCAPE]) sql = ( f"SELECT *, ({like_clauses}) AS score FROM chunks " "WHERE score > 0" @@ -380,7 +419,7 @@ def list_chunks( "SELECT * FROM chunks ORDER BY id DESC LIMIT ?", (limit,), ).fetchall() - except sqlite3.OperationalError as exc: + except sqlite3.DatabaseError as exc: log.warning("[knowledge] list_chunks failed: %s", exc) rows = [] finally: @@ -397,7 +436,7 @@ def stats(self) -> dict[str, int]: "SELECT domain, COUNT(*) AS n FROM chunks GROUP BY domain ORDER BY n DESC" ).fetchall() total = db.execute("SELECT COUNT(*) FROM chunks").fetchone()[0] - except sqlite3.OperationalError as exc: + except sqlite3.DatabaseError as exc: log.warning("[knowledge] stats failed: %s", exc) return {"total": 0} finally: @@ -426,17 +465,18 @@ def find_chunk_containing( if db is None: return None try: + needle = f"%{_escape_like(text)}%" sql = ( "SELECT * FROM chunks " - "WHERE (content LIKE ? OR heading LIKE ?)" + "WHERE (content LIKE ? ESCAPE ? OR heading LIKE ? ESCAPE ?)" ) - params: list[Any] = [f"%{text}%", f"%{text}%"] + params: list[Any] = [needle, _LIKE_ESCAPE, needle, _LIKE_ESCAPE] if domain: sql += " AND domain = ?" params.append(domain) sql += " ORDER BY id DESC LIMIT 1" row = db.execute(sql, params).fetchone() - except sqlite3.OperationalError as exc: + except sqlite3.DatabaseError as exc: log.warning("[knowledge] find_chunk_containing failed: %s", exc) row = None finally: @@ -455,10 +495,13 @@ def delete_by_content(self, contains: str) -> int: if db is None: return 0 try: - cur = db.execute("DELETE FROM chunks WHERE content LIKE ?", (f"%{contains}%",)) + cur = db.execute( + "DELETE FROM chunks WHERE content LIKE ? ESCAPE ?", + (f"%{_escape_like(contains)}%", _LIKE_ESCAPE), + ) db.commit() return int(cur.rowcount) - except sqlite3.OperationalError as exc: + except sqlite3.DatabaseError as exc: log.warning("[knowledge] delete_by_content failed: %s", exc) return 0 finally: @@ -476,7 +519,7 @@ def delete_by_heading(self, domain: str, heading: str) -> int: ) db.commit() return int(cur.rowcount) - except sqlite3.OperationalError as exc: + except sqlite3.DatabaseError as exc: log.warning("[knowledge] delete_by_heading failed: %s", exc) return 0 finally: From b713d8d0997320520d7a14e503fdb12d26fd282e Mon Sep 17 00:00:00 2001 From: Claude Date: Mon, 27 Apr 2026 22:47:04 +0000 Subject: [PATCH 5/6] fix(review-4): address round-4 PR #155 CodeRabbit feedback - evals/runner.py: use asyncio.get_running_loop() instead of the deprecated get_event_loop() inside the _await_audit_assertion coroutine - evals/runner.py: prefix unused _entries return value with underscore - evals/runner.py: use datetime.UTC alias (consistent with store.py), drop now-unused timezone import - knowledge/store.py: broaden _get_db exception catch from OperationalError to DatabaseError so corrupt-DB errors are swallowed per the module's no-crash contract - knowledge/store.py: replace log.error with log.exception in all three DatabaseError handlers (schema init, _get_db, add_chunk) so tracebacks appear in error logs Co-Authored-By: Claude https://claude.ai/code/session_01YW5U6mtpLy4rzKmqd4trkH --- evals/runner.py | 10 +++++----- knowledge/store.py | 12 ++++++------ 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/evals/runner.py b/evals/runner.py index 2fa148f..7b66d4d 100644 --- a/evals/runner.py +++ b/evals/runner.py @@ -36,7 +36,7 @@ import sys import time from dataclasses import asdict, dataclass, field -from datetime import datetime, timezone +from datetime import UTC, datetime from pathlib import Path # Allow ``python -m evals.runner`` and ``python evals/runner.py``. @@ -167,7 +167,7 @@ async def _await_audit_assertion( passes; the deadline only kicks in when the tool genuinely never fired. """ - deadline = asyncio.get_event_loop().time() + _AUDIT_POLL_DEADLINE_S + deadline = asyncio.get_running_loop().time() + _AUDIT_POLL_DEADLINE_S entries: list[dict] = [] passed = False detail = "" @@ -176,7 +176,7 @@ async def _await_audit_assertion( passed, detail = verify.assert_tools_fired( entries, expected_tools, require_success=require_success, ) - if passed or asyncio.get_event_loop().time() >= deadline: + if passed or asyncio.get_running_loop().time() >= deadline: return entries, passed, detail await asyncio.sleep(_AUDIT_POLL_INTERVAL_S) @@ -234,7 +234,7 @@ async def _run_prompt_case( expected_tools = case.get("expected_tools") if expected_tools is not None: require_success = case.get("tool_outcome", "success") == "success" - entries, passed, detail = await _await_audit_assertion( + _entries, passed, detail = await _await_audit_assertion( since, expected_tools, require_success=require_success, ) if not passed: @@ -337,7 +337,7 @@ def _print_board(results: list[CaseResult]) -> None: def _save_report(results: list[CaseResult], path: Path) -> None: path.parent.mkdir(parents=True, exist_ok=True) payload = { - "ts": datetime.now(timezone.utc).isoformat(), + "ts": datetime.now(UTC).isoformat(), "total": len(results), "passed": sum(1 for r in results if r.passed), "results": [asdict(r) for r in results], diff --git a/knowledge/store.py b/knowledge/store.py index 473bedd..d26d8a7 100644 --- a/knowledge/store.py +++ b/knowledge/store.py @@ -226,16 +226,16 @@ def _init_db(self) -> None: ) db.commit() db.close() - except sqlite3.DatabaseError as exc: - log.error("[knowledge] schema init failed at %s: %s", self.path, exc) + except sqlite3.DatabaseError: + log.exception("[knowledge] schema init failed at %s", self.path) # Convenience for middleware that wants the raw connection. Kept # private so the public API stays small. def _get_db(self) -> sqlite3.Connection | None: try: return self._connect() - except sqlite3.OperationalError as exc: - log.error("[knowledge] connect failed: %s", exc) + except sqlite3.DatabaseError: + log.exception("[knowledge] connect failed") return None # ── writes ────────────────────────────────────────────────────────────── @@ -266,8 +266,8 @@ def add_chunk( ) db.commit() return int(cur.lastrowid) - except sqlite3.DatabaseError as exc: - log.error("[knowledge] add_chunk failed: %s", exc) + except sqlite3.DatabaseError: + log.exception("[knowledge] add_chunk failed") return None finally: db.close() From ae752a5e523c85714a48846eb9d53ee7e6b77b1c Mon Sep 17 00:00:00 2001 From: Claude Date: Mon, 27 Apr 2026 22:48:12 +0000 Subject: [PATCH 6/6] chore: add uv.lock generated during round-4 review session https://claude.ai/code/session_01YW5U6mtpLy4rzKmqd4trkH --- uv.lock | 8 ++++++++ 1 file changed, 8 insertions(+) create mode 100644 uv.lock diff --git a/uv.lock b/uv.lock new file mode 100644 index 0000000..7eae9e0 --- /dev/null +++ b/uv.lock @@ -0,0 +1,8 @@ +version = 1 +revision = 3 +requires-python = ">=3.11" + +[[package]] +name = "protoagent" +version = "0.2.1" +source = { virtual = "." }