From 433b44e02b9ea9037304853b503bca1e18032801 Mon Sep 17 00:00:00 2001
From: Josh Mabry <mabry1985@gmail.com>
Date: Mon, 27 Apr 2026 14:30:32 -0700
Subject: [PATCH 1/6] feat: ship default knowledge store + side-effect-verified
 eval harness
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The template now ships a working memory loop and end-to-end eval suite
on day one so forks have a green baseline before they touch a single
line of code. Closes #154.

What lands:

- knowledge/store.py — sqlite + FTS5 (LIKE fallback). One ``chunks``
  table backs operator notes (memory_ingest), daily-log entries
  (daily_log), and conversation findings extracted by
  MemoryMiddleware (domain='finding'). Path resolves env >
  config > default with an automatic ~/.protoagent/ fallback when
  /sandbox isn't writable.

- tools/lg_tools.py — five new memory tools (memory_ingest,
  memory_recall, memory_list, memory_stats, daily_log) bound to the
  store via a closure factory so tests get a fresh store per run.
  ``echo`` removed; ``get_all_tools(knowledge_store)`` actually uses
  its parameter now.

- server.py — _build_knowledge_store() constructs the store and
  threads it through both initial init and the drawer reload path.
  Defaults flipped: knowledge_middleware + memory_middleware now
  ON by default (config/langgraph-config.yaml + graph/config.py).

- evals/ — A2A client + runner + verify helpers + 15 starter cases
  (tasks.json) covering agent card discovery, bearer auth gating,
  abstention, every shipped tool, KB recall, a chained two-tool
  case, and KnowledgeMiddleware injection. Side-effect-verified:
  audit log + reply text + KB chunks all checked independently so
  hallucinated tool results get caught.

- docs/guides/evals.md — full how-to. README/TEMPLATE/configuration/
  starter-tools/first-agent updated to reflect the new defaults and
  the additional five memory tools.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 README.md                           |   4 +-
 TEMPLATE.md                         |  21 +-
 config/langgraph-config.yaml        |  20 +-
 docs/guides/customize-and-deploy.md |   2 +-
 docs/guides/evals.md                | 151 +++++++++
 docs/guides/fork-the-template.md    |   2 +-
 docs/guides/index.md                |   1 +
 docs/guides/subagents.md            |  15 +-
 docs/reference/configuration.md     |  25 +-
 docs/reference/starter-tools.md     |  71 ++++-
 docs/tutorials/first-agent.md       |   2 +-
 docs/tutorials/first-tool.md        |   3 +-
 evals/README.md                     | 100 ++++++
 evals/__init__.py                   |   0
 evals/client.py                     | 262 ++++++++++++++++
 evals/results/.gitkeep              |   0
 evals/runner.py                     | 307 +++++++++++++++++++
 evals/tasks.json                    | 186 ++++++++++++
 evals/verify.py                     | 176 +++++++++++
 graph/config.py                     |  16 +-
 graph/subagents/config.py           |   6 +-
 knowledge/__init__.py               |  12 +
 knowledge/store.py                  | 456 ++++++++++++++++++++++++++++
 server.py                           |  37 ++-
 tests/test_config_io.py             |   8 +-
 tests/test_skill_curator.py         |   2 +-
 tests/test_skill_emission.py        |  20 +-
 tests/test_starter_tools.py         |  10 -
 tools/lg_tools.py                   | 150 +++++++--
 29 files changed, 1976 insertions(+), 89 deletions(-)
 create mode 100644 docs/guides/evals.md
 create mode 100644 evals/README.md
 create mode 100644 evals/__init__.py
 create mode 100644 evals/client.py
 create mode 100644 evals/results/.gitkeep
 create mode 100644 evals/runner.py
 create mode 100644 evals/tasks.json
 create mode 100644 evals/verify.py
 create mode 100644 knowledge/__init__.py
 create mode 100644 knowledge/store.py

diff --git a/README.md b/README.md
index 1a1b34c..ef54b84 100644
--- a/README.md
+++ b/README.md
@@ -30,7 +30,9 @@ rename / release-pipeline wiring.
 | Agent runtime | `graph/agent.py`, `server.py` | LangGraph `create_agent()` wired to the A2A handler, with streaming token capture for cost-v1 |
 | LLM gateway | `graph/llm.py` | OpenAI-compatible client pointed at LiteLLM — swap models by editing the gateway config, not the fork |
 | Subagents | `graph/subagents/config.py` | DeerFlow-pattern delegation via a `task()` tool; one placeholder `worker` ships |
-| Starter tools | `tools/lg_tools.py` | Free, keyless tools so a fresh fork can demo real behaviour: `echo`, `current_time`, `calculator` (safe AST eval), `web_search` (DuckDuckGo), `fetch_url` |
+| Starter tools | `tools/lg_tools.py` | Keyless general tools (`current_time`, `calculator` safe AST eval, `web_search` via DuckDuckGo, `fetch_url`) plus memory tools (`memory_ingest`, `memory_recall`, `memory_list`, `memory_stats`, `daily_log`) bound to the bundled store |
+| Knowledge store | `knowledge/store.py` | sqlite + FTS5 (LIKE fallback). One `chunks` table for operator notes, daily-log entries, and conversation findings. Default-on; turn off with `middleware.knowledge: false` |
+| Eval harness | `evals/` | Side-effect-verified A2A test harness — audit log + reply text + KB state. `python -m evals.runner` against a running agent. See [Eval your fork](./docs/guides/evals.md) |
 | Tracing | `tracing.py` | Langfuse trace_session with distributed `a2a.trace` propagation and the OTel cross-context-detach filter |
 | Observability | `metrics.py`, `audit.py` | Prometheus metrics with per-agent prefix, JSONL audit log with trace IDs |
 | Output protocol | `graph/output_format.py` | `<scratch_pad>` / `<output>` parsing so the model can think without it leaking to users |
diff --git a/TEMPLATE.md b/TEMPLATE.md
index 31b1eb2..08ff5b3 100644
--- a/TEMPLATE.md
+++ b/TEMPLATE.md
@@ -72,10 +72,10 @@ handler's output extraction depends on it.
 ## 4. Add your real tools
 
 `tools/lg_tools.py` ships with a small keyless starter set so a
-fresh clone can demonstrate a real research loop: `echo`,
-`current_time`, `calculator` (safe AST eval — no `eval()`),
-`web_search` (DuckDuckGo via `ddgs`), and `fetch_url`. Keep the
-ones you want, drop the rest, and add your own:
+fresh clone can demonstrate a real research loop: `current_time`,
+`calculator` (safe AST eval — no `eval()`), `web_search` (DuckDuckGo
+via `ddgs`), and `fetch_url`. Keep the ones you want, drop the rest,
+and add your own:
 
 ```python
 from langchain_core.tools import tool
@@ -167,6 +167,19 @@ your fork. A useful pattern:
 - Extend `tests/test_a2a_integration.py` with assertions for
   your declared skills + extensions on the agent card
 
+For end-to-end behaviour testing — "when the operator asks X, does
+the right tool actually fire and the right row land in the KB?" —
+the template ships an eval harness under `evals/`:
+
+```bash
+python -m evals.runner             # against a running agent
+python -m evals.runner --category tool
+```
+
+See [Eval your fork](./docs/guides/evals.md) for what each case
+asserts, how the three assertion channels work, and how to add
+cases for your fork's new tools.
+
 ## 9a. Understand the skill loop
 
 protoAgent's skill loop lets your agent learn from experience automatically.
diff --git a/config/langgraph-config.yaml b/config/langgraph-config.yaml
index c3f53e8..05bada2 100644
--- a/config/langgraph-config.yaml
+++ b/config/langgraph-config.yaml
@@ -22,14 +22,24 @@ model:
 subagents:
   worker:
     enabled: true
-    tools: [echo, current_time, calculator, web_search, fetch_url]
+    tools:
+      - current_time
+      - calculator
+      - web_search
+      - fetch_url
+      - memory_ingest
+      - memory_recall
+      - memory_list
+      - memory_stats
+      - daily_log
     max_turns: 20
 
 middleware:
-  # The knowledge middleware requires a knowledge store. Leave false
-  # until you add one. Memory persistence is enabled by default and
-  # writes session summaries to /sandbox/memory/ without a store.
-  knowledge: false
+  # All three middlewares default ON. The knowledge middleware needs a
+  # store; the template constructs one automatically (see
+  # ``server.py::_build_knowledge_store``). Set ``knowledge: false`` if
+  # your fork is purely stateless.
+  knowledge: true
   audit: true
   memory: true
 
diff --git a/docs/guides/customize-and-deploy.md b/docs/guides/customize-and-deploy.md
index 17c24d9..81fdeec 100644
--- a/docs/guides/customize-and-deploy.md
+++ b/docs/guides/customize-and-deploy.md
@@ -66,7 +66,7 @@ Replace with the skills your agent actually advertises over A2A. The `name` and
 
 ## 5. (Optional) Add domain tools
 
-`tools/lg_tools.py` ships with `echo`, `current_time`, `calculator`, `web_search`, `fetch_url`. Keep the ones you want, drop the rest, add your own. Update `get_all_tools()` at the bottom. Any tool returned from there becomes a checkbox in the wizard and drawer automatically.
+`tools/lg_tools.py` ships with `current_time`, `calculator`, `web_search`, `fetch_url`. Keep the ones you want, drop the rest, add your own. Update `get_all_tools()` at the bottom. Any tool returned from there becomes a checkbox in the wizard and drawer automatically.
 
 ## 6. (Optional) Configure subagents
 
diff --git a/docs/guides/evals.md b/docs/guides/evals.md
new file mode 100644
index 0000000..38e1e33
--- /dev/null
+++ b/docs/guides/evals.md
@@ -0,0 +1,151 @@
+# Eval your fork
+
+The template ships an eval harness under `evals/` so a fresh fork has
+a working test suite for its tools, memory, and A2A protocol surface
+on day one. Cases assert across three independent channels — audit
+log, reply text, and knowledge-store side effects — so a model that
+hallucinates a tool result still gets caught.
+
+## When to read this
+
+- You forked the template and want a baseline pass-rate before you
+  ship.
+- You added a new tool and want to lock in its intent — "when the
+  operator says X, fire tool Y".
+- You changed a prompt or model and want to measure regression.
+
+## Run the suite
+
+```bash
+# Agent running at $EVAL_BASE_URL (default http://localhost:7870)
+# with the relevant auth env (A2A_AUTH_TOKEN and/or <AGENT>_API_KEY).
+
+python -m evals.runner
+python -m evals.runner --category tool
+python -m evals.runner --tasks current_time_intent,daily_log_intent
+```
+
+Reports land in `evals/results/run-<ts>.json`. The CLI prints a
+pass/fail board; the JSON report carries reply previews and timing
+for post-hoc inspection.
+
+## The three assertion channels
+
+```
+prompt → A2A → audit log         (1) tools fired with expected outcome
+            → reply text         (2) substrings present in reply
+            → KB chunks table    (3) side effects landed correctly
+```
+
+A case passes only when every configured assertion holds. Most cases
+should opt in to channels 1 and 3 — text patterns alone are brittle
+to model paraphrasing and miss hallucinated tool results entirely.
+
+### Why side-effect verification beats text-only
+
+A model can produce "Logged: ..." in its reply without actually
+calling `daily_log`. Substring matching passes, the DB stays empty,
+and the bug ships. Reading `audit.jsonl` and the `chunks` table
+afterward catches it.
+
+## The shape of a case
+
+```json
+{
+  "id": "unique-id",
+  "category": "tool",
+  "kind": "ask",
+  "name": "Asks for arithmetic → calculator",
+  "prompt": "How much is 17 times 23, plus 1?",
+  "expected_tools": ["calculator"],
+  "expected_patterns": ["392"],
+  "verify_kb": {
+    "find_chunk_containing": "EVAL-MARK-XYZ",
+    "domain": "context"
+  },
+  "setup":    [{"kb_ingest": {"content": "...", "domain": "...", "heading": "..."}}],
+  "teardown": [{"kb_delete_by_content": {"contains": "..."}}]
+}
+```
+
+Three case `kind`s ship:
+
+- `agent_card` — fetch `/.well-known/agent-card.json` and assert on
+  the card's name, skill count, and declared extensions.
+- `auth_check` — send a request with a deliberately bad bearer and
+  assert the server returns the expected status (401 by default).
+- `ask` — the main shape. Sends `prompt`, then asserts on tool firing,
+  reply patterns, and KB state.
+
+## Prompt rule
+
+**The tool name never appears in the prompt.** Every prompt must be
+plausibly typed by a real user. "Use `daily_log` to record..." tests
+instruction-following, not tool selection. If the agent needs to
+infer the tool from intent, that *is* the test.
+
+## Setup and teardown — start clean every time
+
+Each `ask` case can pre-seed state via `setup` blocks (BFCL's
+`initial_config` pattern: direct DB writes the model never sees) and
+clean up after itself with `teardown`. The fixture is invisible to
+the agent — it discovers the seeded state via tools, exactly as a
+real user would.
+
+`teardown` runs even when assertions fail, so case order doesn't
+matter and a noisy failure doesn't poison the next run.
+
+Supported setup/teardown step kinds (extend `evals/verify.py` to add
+more):
+
+| Step kind | Args | What it does |
+|---|---|---|
+| `kb_ingest` | `content`, `domain`, `heading?` | Insert a chunk |
+| `kb_delete_by_content` | `contains` | Delete chunks where content LIKE `%contains%` |
+| `kb_delete_by_heading` | `domain`, `heading` | Delete chunks matching (domain, heading) |
+
+## What forks should test by default
+
+The starter `tasks.json` covers:
+
+- Agent card discovery (name, skill count, `cost-v1` extension)
+- Bearer auth gating
+- Each shipped tool fires from a plausible operator prompt
+- Memory ingest → recall round-trip
+- KB-driven middleware injection (no tool call needed)
+- A chained two-tool case (`daily_log` then `memory_recall`)
+
+When you add a tool, add at least one case for it. When you add a
+skill to the agent card, extend the `card_discovery` case to assert
+the new skill is advertised.
+
+## Running in CI
+
+The runner exits non-zero when any case fails, so it drops in cleanly:
+
+```yaml
+- name: Boot agent
+  run: docker compose up -d agent
+
+- name: Wait for /health
+  run: ./scripts/wait-for-it.sh http://localhost:7870/.well-known/agent-card.json
+
+- name: Run evals
+  run: python -m evals.runner
+  env:
+    EVAL_BASE_URL: http://localhost:7870
+    A2A_AUTH_TOKEN: ${{ secrets.AGENT_BEARER }}
+```
+
+For non-deterministic categories (any `tool` or `chained` case), aim
+for an N-of-M majority threshold rather than 100% — the reference
+implementation runs 3 attempts and gates at 2 passes for those
+categories. Deterministic ones (`a2a-protocol`, `subsystem` with
+seeded state) gate at 100%.
+
+## References
+
+- [`evals/README.md`](https://github.com/protoLabsAI/protoAgent/blob/main/evals/README.md) — quick reference for case authors
+- Anthropic — [Demystifying evals for AI agents](https://www.anthropic.com/engineering/demystifying-evals-for-ai-agents)
+- BFCL V3 — [Multi-Turn](https://gorilla.cs.berkeley.edu/blogs/13_bfcl_v3_multi_turn.html)
+- [ToolSandbox](https://arxiv.org/html/2408.04682v1) — user simulator + milestones / minefields
diff --git a/docs/guides/fork-the-template.md b/docs/guides/fork-the-template.md
index eacbc92..d5472e4 100644
--- a/docs/guides/fork-the-template.md
+++ b/docs/guides/fork-the-template.md
@@ -43,7 +43,7 @@ Keep the `<scratch_pad>` / `<output>` protocol block in `prompts.py` — the A2A
 
 ## 4. Replace the starter tools
 
-`tools/lg_tools.py` ships with `echo`, `current_time`, `calculator`, `web_search`, `fetch_url`. Keep what you want, drop the rest, add your own. Update `get_all_tools()` at the bottom of the file.
+`tools/lg_tools.py` ships with `current_time`, `calculator`, `web_search`, `fetch_url`. Keep what you want, drop the rest, add your own. Update `get_all_tools()` at the bottom of the file.
 
 See the [starter tools reference](/reference/starter-tools) for the shapes of the shipped ones.
 
diff --git a/docs/guides/index.md b/docs/guides/index.md
index 3e49012..65dc41e 100644
--- a/docs/guides/index.md
+++ b/docs/guides/index.md
@@ -9,4 +9,5 @@ Task-oriented procedures. Assumes you already have a running agent (see [Tutoria
 | [Add a custom skill](/guides/add-a-skill) | Your agent does new things and callers need to dispatch to them |
 | [Configure subagents](/guides/subagents) | You want specialized delegates beyond the placeholder `worker` |
 | [Wire Langfuse + Prometheus](/guides/observability) | You need traces and metrics in production |
+| [Eval your fork](/guides/evals) | You want a baseline pass-rate for the tools / memory / A2A surface in your fork |
 | [Deploy via GHCR](/guides/deploy) | You're ready to ship and want auto-deploy wired up |
diff --git a/docs/guides/subagents.md b/docs/guides/subagents.md
index f903995..9fc1a9a 100644
--- a/docs/guides/subagents.md
+++ b/docs/guides/subagents.md
@@ -86,7 +86,16 @@ for name in ("worker", "researcher"):  # ← add new names
 subagents:
   worker:
     enabled: true
-    tools: [echo, current_time, calculator, web_search, fetch_url]
+    tools:
+      - current_time
+      - calculator
+      - web_search
+      - fetch_url
+      - memory_ingest
+      - memory_recall
+      - memory_list
+      - memory_stats
+      - daily_log
     max_turns: 20
   researcher:
     enabled: true
@@ -117,8 +126,8 @@ If your agent is simple enough that subagents are pure overhead, flip `include_s
 ```python
 _graph = create_agent_graph(
     _graph_config,
-    knowledge_store=None,
-    include_subagents=False,   # ← skip the task() tool and subagent machinery
+    knowledge_store=knowledge_store,  # keep the bundled store wired up
+    include_subagents=False,           # ← skip the task() tool and subagent machinery
 )
 ```
 
diff --git a/docs/reference/configuration.md b/docs/reference/configuration.md
index 3463dba..bd3f5be 100644
--- a/docs/reference/configuration.md
+++ b/docs/reference/configuration.md
@@ -17,13 +17,22 @@ model:
 subagents:
   worker:
     enabled: true
-    tools: [echo, current_time, calculator, web_search, fetch_url]
+    tools:
+      - current_time
+      - calculator
+      - web_search
+      - fetch_url
+      - memory_ingest
+      - memory_recall
+      - memory_list
+      - memory_stats
+      - daily_log
     max_turns: 20
 
 middleware:
-  knowledge: false
+  knowledge: true
   audit: true
-  memory: false
+  memory: true
 
 knowledge:
   db_path: /sandbox/knowledge/agent.db
@@ -59,9 +68,9 @@ Adding a new subagent name to the YAML requires matching entries in `graph/subag
 
 | Key | Default | What |
 |---|---|---|
-| `knowledge` | `false` | Inject retrieved knowledge into state before LLM calls. Requires a knowledge store — leave off until you add one. |
+| `knowledge` | `true` | Inject retrieved knowledge into state before LLM calls. Backed by the bundled `KnowledgeStore` (sqlite + FTS5). Set `false` for a stateless agent. |
 | `audit` | `true` | Append every tool call to `/sandbox/audit/audit.jsonl`. |
-| `memory` | `false` | Memory middleware (experimental). Requires a knowledge store. |
+| `memory` | `true` | Persist a session summary on terminal turn and asynchronously index conversation findings under `domain='finding'`. |
 
 ## `knowledge`
 
@@ -69,8 +78,8 @@ Only read when `middleware.knowledge` is `true`.
 
 | Key | Default | What |
 |---|---|---|
-| `db_path` | `/sandbox/knowledge/agent.db` | SQLite file path. |
-| `embed_model` | `nomic-embed-text` | Embedding model. |
+| `db_path` | `/sandbox/knowledge/agent.db` | SQLite file path. Falls back to `~/.protoagent/knowledge/agent.db` automatically when the configured path isn't writable (e.g. running locally without `/sandbox`). Override at runtime with `KNOWLEDGE_DB_PATH`. |
+| `embed_model` | `nomic-embed-text` | Reserved for forks that bolt embeddings on top of the FTS5 baseline. The bundled store ignores it. |
 | `top_k` | `5` | Results per query fed into state. |
 
-The template does not ship a knowledge store — the config keys are kept so a fork can flip the switch without rewiring every call site.
+The bundled store is sqlite + FTS5 (with an automatic LIKE fallback when FTS5 isn't available). One `chunks` table; the `domain` column distinguishes operator-set notes (`memory_ingest`), daily-log entries (`daily_log`), and conversation findings extracted by `MemoryMiddleware` (`domain='finding'`).
diff --git a/docs/reference/starter-tools.md b/docs/reference/starter-tools.md
index e47f25e..c0918b5 100644
--- a/docs/reference/starter-tools.md
+++ b/docs/reference/starter-tools.md
@@ -1,15 +1,11 @@
 # Starter tools
 
-Five free, keyless tools ship in `tools/lg_tools.py`. They exist so a fresh template clone can demonstrate real behaviour immediately. Keep them, drop them, or swap them — `get_all_tools()` is the registry.
+Nine tools ship in `tools/lg_tools.py`:
 
-## `echo`
+- Four keyless general-purpose tools — `current_time`, `calculator`, `web_search`, `fetch_url` — that work without any state.
+- Five **memory tools** — `memory_ingest`, `memory_recall`, `memory_list`, `memory_stats`, `daily_log` — bound to the bundled `KnowledgeStore` (sqlite + FTS5, see [Configuration](/reference/configuration#knowledge)).
 
-```python
-@tool
-async def echo(message: str) -> str
-```
-
-Returns `"echo: <message>"`. The template-only sanity tool. Safe to delete once your real tools are wired.
+`get_all_tools(knowledge_store)` is the registry. When `knowledge_store` is `None` (the store is disabled in config) the memory tools are omitted automatically.
 
 ## `current_time`
 
@@ -106,6 +102,62 @@ Example Domain
 This domain is for use in documentation examples...
 ```
 
+## `memory_ingest`
+
+```python
+@tool
+async def memory_ingest(content: str, domain: str = "general", heading: str | None = None) -> str
+```
+
+Stores a chunk in the bundled `KnowledgeStore`. Use for things the operator wants you to remember across sessions — preferences, environment facts, decisions worth recalling later.
+
+`domain` is a logical bucket (`"preferences"`, `"context"`, `"general"`, …). `heading` is an optional short label that doubles as a stable de-dupe key.
+
+Returns `"Stored chunk 17 in 'preferences'."` on success, an error string when the store is unavailable.
+
+## `memory_recall`
+
+```python
+@tool
+async def memory_recall(query: str, k: int = 5) -> str
+```
+
+Top-k keyword search over the store via FTS5 (LIKE fallback). Returns one match per line:
+
+```
+[preferences] coffee: Operator's preferred coffee is a Gibraltar with oat milk.
+[context] lab: Primary lab is Snickerdoodle in Spokane.
+```
+
+Returns `"No matches."` when nothing scores above the keyword threshold.
+
+## `memory_list`
+
+```python
+@tool
+async def memory_list(domain: str | None = None, limit: int = 10) -> str
+```
+
+Most-recent-first listing of stored chunks. Filter by domain when given. Useful for "what did I log today?" style queries.
+
+## `memory_stats`
+
+```python
+@tool
+async def memory_stats() -> str
+```
+
+Per-domain chunk counts plus a total. Useful for sanity-checking that ingest landed.
+
+## `daily_log`
+
+```python
+@tool
+async def daily_log(content: str) -> str
+```
+
+Convenience wrapper around `memory_ingest` that writes to `domain='daily-log'` with today's UTC date as the heading. Same-day entries cluster under the same heading for `memory_list(domain='daily-log')`.
+
 ## Adding your own
 
 Follow the same pattern:
@@ -132,7 +184,7 @@ Then append it to the list in `get_all_tools()`:
 
 ```python
 def get_all_tools(knowledge_store=None):
-    return [echo, current_time, calculator, web_search, fetch_url, my_tool]
+    return [current_time, calculator, web_search, fetch_url, my_tool]
 ```
 
 See [Write your first tool](/tutorials/first-tool) for the full walkthrough.
@@ -141,3 +193,4 @@ See [Write your first tool](/tutorials/first-tool) for the full walkthrough.
 
 - [Configure subagents](/guides/subagents) — tools are allowlisted per subagent
 - [Environment variables](/reference/environment-variables) — SSRF allowlist vars affect `fetch_url`
+- [Eval your fork](/guides/evals) — the eval harness exercises every tool listed here end-to-end
diff --git a/docs/tutorials/first-agent.md b/docs/tutorials/first-agent.md
index 6082f66..ce12744 100644
--- a/docs/tutorials/first-agent.md
+++ b/docs/tutorials/first-agent.md
@@ -40,7 +40,7 @@ Walk through the four steps:
 
 1. **Connect to your model.** Paste your API base URL (`https://api.openai.com/v1` for OpenAI direct, `http://localhost:4000/v1` for a local LiteLLM gateway) and API key. Click **Test connection & fetch models** — the dropdown fills with whatever the endpoint actually exposes. Pick one.
 2. **Name your agent.** Short lowercase slug (e.g. `product-director`). Pick a persona preset — **Generic Assistant** is the safe default; **Research** / **Coding** / **Blank** are the alternatives — and click **Load preset into SOUL.md**. Edit the loaded text if you want to make it specific to your agent.
-3. **Tools & middleware.** All five starter tools (`echo`, `current_time`, `calculator`, `web_search`, `fetch_url`) are enabled by default. Leave **Audit** and **Memory** middleware on. Leave **Knowledge** off — that needs an index the template doesn't ship with.
+3. **Tools & middleware.** All nine starter tools (`current_time`, `calculator`, `web_search`, `fetch_url`, plus the memory tools `memory_ingest` / `memory_recall` / `memory_list` / `memory_stats` / `daily_log`) are enabled by default. Leave **Audit**, **Memory**, and **Knowledge** middleware on — the template ships a working sqlite + FTS5 store under `/sandbox/knowledge/agent.db` (falls back to `~/.protoagent/knowledge/agent.db` outside Docker).
 4. **Optional — you, security, autostart.** Your name makes the agent address you directly. A2A auth token blank for local dev, set it before you expose the port. "Launch this agent automatically on login" installs a macOS LaunchAgent so the server is up after every reboot without remembering to `python server.py`.
 
 Hit **Launch agent**. The wizard closes, the chat UI appears, and the Configuration drawer on the right is now populated with your choices.
diff --git a/docs/tutorials/first-tool.md b/docs/tutorials/first-tool.md
index 9f10251..87502d0 100644
--- a/docs/tutorials/first-tool.md
+++ b/docs/tutorials/first-tool.md
@@ -37,7 +37,6 @@ Then register it in `get_all_tools()` at the bottom of the same file:
 ```python
 def get_all_tools(knowledge_store=None):
     return [
-        echo,
         current_time,
         calculator,
         web_search,
@@ -53,7 +52,7 @@ If you want the worker subagent to be able to call `git_sha`, add it to the allo
 ```python
 WORKER_CONFIG = SubagentConfig(
     # ...
-    tools=["echo", "current_time", "calculator", "web_search", "fetch_url", "git_sha"],
+    tools=["current_time", "calculator", "web_search", "fetch_url", "git_sha"],
     # ...
 )
 ```
diff --git a/evals/README.md b/evals/README.md
new file mode 100644
index 0000000..c8aff77
--- /dev/null
+++ b/evals/README.md
@@ -0,0 +1,100 @@
+# Evals
+
+Side-effect-verified eval harness. Each case sends a prompt over A2A
+to a running agent and asserts on three independent channels:
+
+1. **Audit log** — every expected tool name fires with the expected
+   outcome (`AuditMiddleware` writes JSONL to `/sandbox/audit/audit.jsonl`).
+2. **Reply text** — case-insensitive substring patterns appear in the
+   model's final reply.
+3. **Knowledge store side effects** — the right rows actually land in
+   the `chunks` table after a memory-writing turn.
+
+A case passes only when every configured assertion holds.
+
+## Quickstart
+
+```bash
+# Agent must be running at $EVAL_BASE_URL (default http://localhost:7870).
+# Auth: set $A2A_AUTH_TOKEN if bearer is configured, $<AGENT>_API_KEY
+# (or $EVAL_API_KEY) if X-API-Key auth is configured. Both are sent
+# when both env vars exist.
+
+python -m evals.runner                                 # all cases
+python -m evals.runner --category tool                 # one category
+python -m evals.runner --tasks current_time,daily_log
+python -m evals.runner --base-url http://host:7870
+```
+
+Reports land in `evals/results/run-<ts>.json` per run.
+
+## Categories
+
+| Category | What it covers |
+|---|---|
+| `a2a-protocol` | Agent card discovery, auth gating |
+| `simple` | Direct LLM answers, no tool use |
+| `abstention` | Don't reach for a tool when training data is enough |
+| `tool` | Single-tool invocations across the starter set |
+| `chained` | Multi-step reasoning that calls 2+ tools |
+| `subsystem` | KnowledgeMiddleware retrieval, hot-memory injection |
+
+## File layout
+
+```
+evals/
+  client.py     A2A client (message/send + poll, message/stream, agent card, cancel)
+  runner.py     CLI runner — print board, write JSON report
+  verify.py     Audit-log + KB side-effect assertions, setup/teardown
+  tasks.json    Cases — 15 covering the starter tools end-to-end
+  results/      Per-run reports
+```
+
+## Adding a case
+
+Append to `tasks.json`:
+
+```json
+{
+  "id": "unique-id",
+  "category": "tool",
+  "kind": "ask",
+  "name": "Human-readable description",
+  "prompt": "What you ask the agent (in real-user voice — never name the tool)",
+  "expected_tools": ["tool_name"],
+  "expected_patterns": ["substring-that-must-appear"],
+  "verify_kb": {
+    "find_chunk_containing": "EVAL-MARK-A1B2",
+    "domain": "context"
+  },
+  "setup": [
+    {"kb_ingest": {"content": "...", "domain": "context", "heading": "..."}}
+  ],
+  "teardown": [
+    {"kb_delete_by_content": {"contains": "EVAL-MARK-A1B2"}}
+  ]
+}
+```
+
+Use **unique markers** (`EVAL-MARK-XYZ`, `eval-chain-flag-q9`) in
+prompts whenever you need a verifier to disambiguate from real
+operator data.
+
+## Why side-effect verification
+
+When the model hallucinates a tool result (e.g. "Logged: ..." without
+actually calling `daily_log`), text-only checks pass while the DB
+stays empty. The audit-log + KB queries here catch it.
+
+## Prompt rule
+
+Every prompt must be plausibly typed by a real user. **The tool name
+never appears.** If the agent has to infer the tool from intent, that
+*is* the test — leaking the tool name into the prompt is testing
+instruction-following, not tool selection.
+
+## References
+
+- Anthropic — [Demystifying evals for AI agents](https://www.anthropic.com/engineering/demystifying-evals-for-ai-agents)
+- BFCL V3 — [Multi-Turn](https://gorilla.cs.berkeley.edu/blogs/13_bfcl_v3_multi_turn.html)
+- [ToolSandbox](https://arxiv.org/html/2408.04682v1)
diff --git a/evals/__init__.py b/evals/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/evals/client.py b/evals/client.py
new file mode 100644
index 0000000..51eadfe
--- /dev/null
+++ b/evals/client.py
@@ -0,0 +1,262 @@
+"""A2A client for the eval runner.
+
+Drives the running agent over the same JSON-RPC + SSE surface that
+real A2A callers use:
+
+- ``agent_card()`` — GET ``/.well-known/agent-card.json``
+- ``ask()``        — ``message/send`` + ``tasks/get`` poll
+- ``stream()``     — ``message/stream`` SSE
+- ``cancel()``     — ``tasks/cancel``
+
+Returns structured ``TaskResult`` objects the runner asserts against.
+
+Auth picks up both surfaces the template exposes (see ``server.py``):
+
+- ``Authorization: Bearer <token>`` — wizard-set / ``A2A_AUTH_TOKEN`` env
+- ``X-API-Key: <key>``              — legacy, ``<AGENT>_API_KEY`` env
+
+Both headers are sent when the corresponding env var is set; the
+running agent enforces whichever it is configured for.
+"""
+
+from __future__ import annotations
+
+import asyncio
+import json
+import os
+import time
+import uuid
+from dataclasses import dataclass, field
+from typing import Any
+
+import httpx
+
+
+@dataclass
+class TaskResult:
+    task_id: str
+    state: str                              # completed / failed / canceled / timeout
+    text: str = ""                          # extracted user-facing reply
+    artifacts: list[dict] = field(default_factory=list)
+    usage: dict = field(default_factory=dict)
+    duration_ms: int = 0
+    error: str | None = None
+
+
+def _resolve_auth_env() -> tuple[str, str]:
+    """Return (bearer_token, api_key) from env.
+
+    Bearer comes from ``A2A_AUTH_TOKEN`` (the env name the A2A handler
+    reads at boot). The API key is named after the agent —
+    ``<AGENT_NAME>_API_KEY`` — so a fork named ``quinn`` reads
+    ``QUINN_API_KEY``. ``EVAL_API_KEY`` is honored as an explicit
+    override so CI doesn't have to know the agent's slug.
+    """
+    bearer = os.environ.get("A2A_AUTH_TOKEN", "")
+
+    api_key = os.environ.get("EVAL_API_KEY", "")
+    if not api_key:
+        agent = os.environ.get("AGENT_NAME", "protoagent").upper()
+        api_key = os.environ.get(f"{agent}_API_KEY", "")
+    return bearer, api_key
+
+
+class AgentClient:
+    """Thin A2A client tied to one agent instance."""
+
+    def __init__(
+        self,
+        base_url: str | None = None,
+        bearer: str | None = None,
+        api_key: str | None = None,
+    ):
+        self.base_url = (
+            base_url
+            or os.environ.get("EVAL_BASE_URL")
+            or os.environ.get("AGENT_BASE_URL")
+            or "http://localhost:7870"
+        ).rstrip("/")
+
+        env_bearer, env_api_key = _resolve_auth_env()
+        token = bearer if bearer is not None else env_bearer
+        x_api = api_key if api_key is not None else env_api_key
+        self.headers = {"Content-Type": "application/json"}
+        if token:
+            self.headers["Authorization"] = f"Bearer {token}"
+        if x_api:
+            self.headers["X-API-Key"] = x_api
+
+    # ── Agent card ──────────────────────────────────────────────────────────
+
+    async def agent_card(self) -> dict:
+        """Fetch the agent card.
+
+        The template serves both ``/.well-known/agent-card.json`` (modern)
+        and ``/.well-known/agent.json`` (legacy). We try the modern path
+        first; fall back to the legacy path so this works against forks
+        that disabled one or the other.
+        """
+        async with httpx.AsyncClient(timeout=10) as client:
+            for path in ("/.well-known/agent-card.json", "/.well-known/agent.json"):
+                r = await client.get(f"{self.base_url}{path}")
+                if r.status_code == 200:
+                    return r.json()
+            r.raise_for_status()  # surface the last error
+            return {}
+
+    # ── message/send + poll ─────────────────────────────────────────────────
+
+    async def ask(self, prompt: str, *, timeout_s: int = 90) -> TaskResult:
+        """Send + poll until terminal. Returns TaskResult with extracted text."""
+        mid = str(uuid.uuid4())
+        payload = {
+            "jsonrpc": "2.0",
+            "id": mid,
+            "method": "message/send",
+            "params": {
+                "message": {
+                    "role": "user",
+                    "parts": [{"kind": "text", "text": prompt}],
+                    "messageId": mid,
+                }
+            },
+        }
+        start = time.time()
+        async with httpx.AsyncClient(timeout=30) as client:
+            r = await client.post(f"{self.base_url}/a2a", headers=self.headers, json=payload)
+            r.raise_for_status()
+            resp = r.json()
+            if "error" in resp:
+                return TaskResult(task_id="", state="failed", error=str(resp["error"]))
+            task_id = resp.get("result", {}).get("id", "")
+
+            deadline = start + timeout_s
+            while time.time() < deadline:
+                await asyncio.sleep(1.5)
+                poll = await client.post(
+                    f"{self.base_url}/a2a",
+                    headers=self.headers,
+                    json={
+                        "jsonrpc": "2.0",
+                        "id": "p",
+                        "method": "tasks/get",
+                        "params": {"id": task_id},
+                    },
+                )
+                poll.raise_for_status()
+                res = poll.json().get("result", {})
+                state = (res.get("status") or {}).get("state", "")
+                if state in ("completed", "failed", "canceled"):
+                    text, usage = _extract(res)
+                    return TaskResult(
+                        task_id=task_id,
+                        state=state,
+                        text=text,
+                        artifacts=res.get("artifacts", []),
+                        usage=usage,
+                        duration_ms=int((time.time() - start) * 1000),
+                    )
+            return TaskResult(
+                task_id=task_id, state="timeout",
+                duration_ms=int((time.time() - start) * 1000),
+            )
+
+    # ── message/stream (SSE) ────────────────────────────────────────────────
+
+    async def stream(self, prompt: str, *, timeout_s: int = 90) -> tuple[list[dict], TaskResult | None]:
+        """Stream a turn over SSE. Returns (event_log, final TaskResult).
+
+        Each event is a dict shaped ``{kind, result}``. Use this to assert
+        on the streaming protocol itself (status-update sequence, final
+        flag, artifact chunks). Most cases should use ``ask()`` instead.
+        """
+        mid = str(uuid.uuid4())
+        payload = {
+            "jsonrpc": "2.0",
+            "id": mid,
+            "method": "message/stream",
+            "params": {
+                "message": {
+                    "role": "user",
+                    "parts": [{"kind": "text", "text": prompt}],
+                    "messageId": mid,
+                }
+            },
+        }
+        events: list[dict] = []
+        final: TaskResult | None = None
+        start = time.time()
+        async with httpx.AsyncClient(timeout=timeout_s) as client:
+            async with client.stream(
+                "POST", f"{self.base_url}/a2a", headers=self.headers, json=payload
+            ) as r:
+                if r.status_code >= 400:
+                    body = await r.aread()
+                    return events, TaskResult(
+                        task_id="", state="failed",
+                        error=f"HTTP {r.status_code}: {body.decode()[:300]}",
+                    )
+                async for line in r.aiter_lines():
+                    if not line or line.startswith(":"):
+                        continue
+                    if line.startswith("data:"):
+                        raw = line[5:].strip()
+                        if not raw:
+                            continue
+                        try:
+                            data = json.loads(raw)
+                        except json.JSONDecodeError:
+                            events.append({"kind": "raw", "raw": raw})
+                            continue
+                        result = (data.get("result") or {})
+                        kind = result.get("kind", "?")
+                        events.append({"kind": kind, "result": result})
+                        if kind in ("status-update", "task") and result.get("final"):
+                            text, usage = _extract(result)
+                            final = TaskResult(
+                                task_id=result.get("taskId") or result.get("id", ""),
+                                state=(result.get("status") or {}).get("state", "unknown"),
+                                text=text,
+                                usage=usage,
+                                duration_ms=int((time.time() - start) * 1000),
+                            )
+                            break
+        return events, final
+
+    # ── tasks/cancel ────────────────────────────────────────────────────────
+
+    async def cancel(self, task_id: str) -> dict:
+        async with httpx.AsyncClient(timeout=10) as client:
+            r = await client.post(
+                f"{self.base_url}/a2a",
+                headers=self.headers,
+                json={
+                    "jsonrpc": "2.0",
+                    "id": "c",
+                    "method": "tasks/cancel",
+                    "params": {"id": task_id},
+                },
+            )
+            return r.json()
+
+
+def _extract(result: dict) -> tuple[str, dict]:
+    """Pull text + cost data out of an A2A result envelope."""
+    text_parts: list[str] = []
+    usage: dict = {}
+    artifacts = result.get("artifacts") or []
+    for art in artifacts:
+        for p in art.get("parts", []):
+            if p.get("kind") == "text" and p.get("text"):
+                text_parts.append(p["text"])
+            elif p.get("kind") == "data" and isinstance(p.get("data"), dict):
+                if "usage" in p["data"]:
+                    usage = dict(p["data"]["usage"])
+                    if "durationMs" in p["data"]:
+                        usage["durationMs"] = p["data"]["durationMs"]
+    status = result.get("status") or {}
+    msg = status.get("message") or {}
+    for p in msg.get("parts") or []:
+        if p.get("kind") == "text" and p.get("text"):
+            text_parts.append(p["text"])
+    return "\n".join(text_parts).strip(), usage
diff --git a/evals/results/.gitkeep b/evals/results/.gitkeep
new file mode 100644
index 0000000..e69de29
diff --git a/evals/runner.py b/evals/runner.py
new file mode 100644
index 0000000..ad5154b
--- /dev/null
+++ b/evals/runner.py
@@ -0,0 +1,307 @@
+"""Eval runner — executes ``tasks.json``, prints a pass/fail board,
+writes a JSON report to ``evals/results/run-<ts>.json``.
+
+Usage:
+
+.. code:: bash
+
+    # agent must be running at $EVAL_BASE_URL (default http://localhost:7870)
+    # auth: $A2A_AUTH_TOKEN and/or $<AGENT_NAME>_API_KEY (or $EVAL_API_KEY)
+
+    python -m evals.runner                                # all cases
+    python -m evals.runner --category tool                # one category
+    python -m evals.runner --tasks current_time,daily_log
+    python -m evals.runner --base-url http://host:7870
+
+Cases are described in ``tasks.json``. Each case picks one of three
+``kind`` runners:
+
+- ``agent_card`` — fetch ``/.well-known/agent-card.json`` and assert
+  on the returned card shape.
+- ``auth_check`` — send a request with a known-bad bearer token and
+  assert the expected HTTP status.
+- ``ask`` — send a prompt over A2A, optionally pre-seed the KB, then
+  assert against three independent channels: audit-log tool firing,
+  reply-text patterns, and KB side effects.
+
+A case passes only when all assertions hold. The ``detail`` column in
+the pass/fail board names the missing assertion when one fails.
+"""
+
+from __future__ import annotations
+
+import argparse
+import asyncio
+import json
+import sys
+import time
+from dataclasses import asdict, dataclass, field
+from datetime import datetime, timezone
+from pathlib import Path
+
+# Allow ``python -m evals.runner`` and ``python evals/runner.py``.
+_PROJECT_ROOT = Path(__file__).resolve().parent.parent
+if str(_PROJECT_ROOT) not in sys.path:
+    sys.path.insert(0, str(_PROJECT_ROOT))
+
+from evals.client import AgentClient, TaskResult
+from evals import verify
+
+
+@dataclass
+class CaseResult:
+    id: str
+    category: str
+    name: str
+    passed: bool
+    detail: str
+    duration_ms: int = 0
+    tokens: int = 0
+    raw: dict = field(default_factory=dict)
+
+
+# ── case runners ────────────────────────────────────────────────────────────
+
+
+async def _run_agent_card(client: AgentClient, case: dict) -> CaseResult:
+    expect = case.get("expect", {})
+    try:
+        card = await client.agent_card()
+    except Exception as e:
+        return CaseResult(case["id"], case["category"], case["name"], False, f"fetch failed: {e}")
+
+    problems: list[str] = []
+    if "name" in expect and card.get("name") != expect["name"]:
+        problems.append(f"name={card.get('name')!r} expected {expect['name']!r}")
+    if "skills_min" in expect:
+        skills = card.get("skills") or []
+        if len(skills) < expect["skills_min"]:
+            problems.append(f"only {len(skills)} skills, expected >= {expect['skills_min']}")
+    if "extensions_contain" in expect:
+        ext_uris = [
+            e.get("uri", "")
+            for e in (card.get("capabilities") or {}).get("extensions") or []
+        ]
+        for needle in expect["extensions_contain"]:
+            if not any(needle in u for u in ext_uris):
+                problems.append(f"missing extension matching {needle!r}; saw {ext_uris}")
+    if problems:
+        return CaseResult(case["id"], case["category"], case["name"], False, "; ".join(problems))
+    return CaseResult(case["id"], case["category"], case["name"], True, "card OK")
+
+
+async def _run_auth_check(client: AgentClient, case: dict) -> CaseResult:
+    """Verify the A2A endpoint rejects a bad bearer with the expected status."""
+    import httpx
+
+    expected_status = case.get("expect", {}).get("status", 401)
+    bad = case.get("bad_token", "")
+    headers = {
+        "Content-Type": "application/json",
+        "Authorization": f"Bearer {bad}",
+        # No X-API-Key — testing bearer alone.
+    }
+    payload = {
+        "jsonrpc": "2.0",
+        "id": "auth-check",
+        "method": "message/send",
+        "params": {
+            "message": {
+                "role": "user",
+                "parts": [{"kind": "text", "text": "ping"}],
+                "messageId": "auth-check",
+            }
+        },
+    }
+    try:
+        async with httpx.AsyncClient(timeout=10) as c:
+            r = await c.post(f"{client.base_url}/a2a", headers=headers, json=payload)
+    except Exception as e:
+        return CaseResult(case["id"], case["category"], case["name"], False, f"request failed: {e}")
+    if r.status_code != expected_status:
+        return CaseResult(
+            case["id"], case["category"], case["name"], False,
+            f"got {r.status_code}, expected {expected_status}",
+        )
+    return CaseResult(
+        case["id"], case["category"], case["name"], True, f"status={r.status_code}",
+    )
+
+
+async def _run_ask(client: AgentClient, case: dict) -> CaseResult:
+    # Pre-seed state via direct DB writes (model never sees this).
+    if "setup" in case:
+        err = verify.apply_setup(case["setup"])
+        if err:
+            return CaseResult(
+                case["id"], case["category"], case["name"], False,
+                f"setup failed: {err}",
+            )
+
+    since = verify.audit_now()
+    result: TaskResult = await client.ask(
+        case["prompt"], timeout_s=case.get("timeout_s", 90),
+    )
+
+    if result.state != "completed":
+        if "teardown" in case:
+            verify.apply_teardown(case["teardown"])
+        return CaseResult(
+            case["id"], case["category"], case["name"], False,
+            f"task state={result.state}; error={result.error or '(none)'}",
+            duration_ms=result.duration_ms,
+            raw={"text": result.text[:200]},
+        )
+
+    problems: list[str] = []
+
+    # Tool firing assertions.
+    expected_tools = case.get("expected_tools") or []
+    if expected_tools:
+        await asyncio.sleep(0.3)  # let the audit log catch up
+        entries = verify.audit_entries_since(since)
+        require_success = case.get("tool_outcome", "success") == "success"
+        passed, detail = verify.assert_tools_fired(
+            entries, expected_tools, require_success=require_success,
+        )
+        if not passed:
+            problems.append(detail)
+
+    # Text pattern assertions (case-insensitive substrings).
+    text_lower = result.text.lower()
+    for pattern in case.get("expected_patterns") or []:
+        if pattern.lower() not in text_lower:
+            problems.append(f"missing pattern {pattern!r}")
+
+    # KB side-effect assertions.
+    vk = case.get("verify_kb") or {}
+    if "find_chunk_containing" in vk:
+        chunk = verify.find_chunk_containing(
+            vk["find_chunk_containing"], domain=vk.get("domain"),
+        )
+        if not chunk:
+            problems.append(f"no chunk containing {vk['find_chunk_containing']!r}")
+
+    if "teardown" in case:
+        verify.apply_teardown(case["teardown"])
+
+    detail = (
+        "; ".join(problems) if problems
+        else f"OK ({result.duration_ms}ms, {result.usage.get('total_tokens', '?')}t)"
+    )
+    return CaseResult(
+        case["id"], case["category"], case["name"],
+        passed=not problems,
+        detail=detail,
+        duration_ms=result.duration_ms,
+        tokens=result.usage.get("total_tokens", 0) or 0,
+        raw={"reply": result.text[:300]},
+    )
+
+
+# ── dispatch ────────────────────────────────────────────────────────────────
+
+
+_RUNNERS = {
+    "agent_card": _run_agent_card,
+    "auth_check": _run_auth_check,
+    "ask": _run_ask,
+}
+
+
+async def run_one(client: AgentClient, case: dict) -> CaseResult:
+    runner = _RUNNERS.get(case.get("kind", "ask"))
+    if runner is None:
+        return CaseResult(
+            case["id"], case.get("category", "?"), case.get("name", "?"),
+            False, f"unknown kind: {case.get('kind')}",
+        )
+    try:
+        return await runner(client, case)
+    except Exception as e:
+        return CaseResult(
+            case["id"], case.get("category", "?"), case.get("name", "?"),
+            False, f"exception: {e!r}",
+        )
+
+
+# ── main ────────────────────────────────────────────────────────────────────
+
+
+def _print_board(results: list[CaseResult]) -> None:
+    width_id = max(len(r.id) for r in results)
+    width_cat = max(len(r.category) for r in results)
+    print()
+    print(f"{'ID'.ljust(width_id)}  {'CAT'.ljust(width_cat)}  RESULT  TIME    TOKENS  DETAIL")
+    print("-" * 90)
+    pass_count = 0
+    for r in results:
+        mark = "PASS" if r.passed else "FAIL"
+        if r.passed:
+            pass_count += 1
+        time_s = f"{r.duration_ms}ms".rjust(6)
+        tokens = str(r.tokens).rjust(6) if r.tokens else "  -   "
+        print(
+            f"{r.id.ljust(width_id)}  {r.category.ljust(width_cat)}  "
+            f"{mark}    {time_s}  {tokens}  {r.detail[:80]}"
+        )
+    print("-" * 90)
+    print(f"\n{pass_count}/{len(results)} passed")
+
+
+def _save_report(results: list[CaseResult], path: Path) -> None:
+    path.parent.mkdir(parents=True, exist_ok=True)
+    payload = {
+        "ts": datetime.now(timezone.utc).isoformat(),
+        "total": len(results),
+        "passed": sum(1 for r in results if r.passed),
+        "results": [asdict(r) for r in results],
+    }
+    path.write_text(json.dumps(payload, indent=2))
+    print(f"\nReport: {path}")
+
+
+async def main():
+    p = argparse.ArgumentParser()
+    p.add_argument("--base-url", default=None)
+    p.add_argument("--tasks", default=None, help="comma-separated case IDs")
+    p.add_argument("--category", default=None)
+    p.add_argument("--out", default=None)
+    args = p.parse_args()
+
+    tasks_path = Path(__file__).parent / "tasks.json"
+    cases = json.loads(tasks_path.read_text())
+
+    if args.tasks:
+        wanted = set(args.tasks.split(","))
+        cases = [c for c in cases if c["id"] in wanted]
+    if args.category:
+        cases = [c for c in cases if c.get("category") == args.category]
+
+    if not cases:
+        print("no cases match filters", file=sys.stderr)
+        return 2
+
+    client = AgentClient(base_url=args.base_url)
+
+    print(f"Running {len(cases)} case(s) against {client.base_url}")
+    results: list[CaseResult] = []
+    for case in cases:
+        sys.stdout.write(f"  {case['id']}... ")
+        sys.stdout.flush()
+        result = await run_one(client, case)
+        sys.stdout.write(f"{'PASS' if result.passed else 'FAIL'}  {result.detail[:60]}\n")
+        results.append(result)
+
+    _print_board(results)
+
+    out_path = Path(args.out) if args.out else (
+        Path(__file__).parent / "results" / f"run-{int(time.time())}.json"
+    )
+    _save_report(results, out_path)
+
+    return 0 if all(r.passed for r in results) else 1
+
+
+if __name__ == "__main__":
+    raise SystemExit(asyncio.run(main()))
diff --git a/evals/tasks.json b/evals/tasks.json
new file mode 100644
index 0000000..14cdd16
--- /dev/null
+++ b/evals/tasks.json
@@ -0,0 +1,186 @@
+[
+  {
+    "id": "card_discovery",
+    "category": "a2a-protocol",
+    "kind": "agent_card",
+    "name": "Agent card discovery",
+    "expect": {
+      "skills_min": 1,
+      "extensions_contain": ["cost-v1"]
+    }
+  },
+  {
+    "id": "auth_negative",
+    "category": "a2a-protocol",
+    "kind": "auth_check",
+    "name": "Reject bad bearer when bearer auth is configured",
+    "bad_token": "definitely-not-the-real-token",
+    "expect": {"status": 401}
+  },
+
+  {
+    "id": "abstain_no_tool",
+    "category": "abstention",
+    "kind": "ask",
+    "name": "Don't reach for a tool when training data is fine",
+    "prompt": "What's the capital of France? One word.",
+    "expected_tools": [],
+    "expected_patterns": ["paris"]
+  },
+  {
+    "id": "greeting",
+    "category": "simple",
+    "kind": "ask",
+    "name": "Direct greeting, no tool",
+    "prompt": "Hi.",
+    "expected_tools": [],
+    "expected_patterns": []
+  },
+
+  {
+    "id": "current_time_intent",
+    "category": "tool",
+    "kind": "ask",
+    "name": "Asks about live time → current_time",
+    "prompt": "What time is it in UTC right now?",
+    "expected_tools": ["current_time"],
+    "expected_patterns": ["UTC"]
+  },
+  {
+    "id": "calculator_intent",
+    "category": "tool",
+    "kind": "ask",
+    "name": "Asks for arithmetic → calculator",
+    "prompt": "How much is 17 times 23, plus 1?",
+    "expected_tools": ["calculator"],
+    "expected_patterns": ["392"]
+  },
+  {
+    "id": "web_search_intent",
+    "category": "tool",
+    "kind": "ask",
+    "name": "Asks about recent news → web_search",
+    "prompt": "Anything notable in the news about Anthropic this week?",
+    "expected_tools": ["web_search"],
+    "expected_patterns": []
+  },
+  {
+    "id": "fetch_url_intent",
+    "category": "tool",
+    "kind": "ask",
+    "name": "Asks about a URL's content → fetch_url",
+    "prompt": "What's on https://example.com? Just the page title is fine.",
+    "expected_tools": ["fetch_url"],
+    "expected_patterns": ["example"]
+  },
+
+  {
+    "id": "memory_ingest_intent",
+    "category": "tool",
+    "kind": "ask",
+    "name": "Stores a stable preference → memory_ingest writes a chunk",
+    "prompt": "Remember that I prefer protoLabs Studio standups at 9am Eastern.",
+    "expected_tools": ["memory_ingest"],
+    "expected_patterns": [],
+    "verify_kb": {
+      "find_chunk_containing": "9am"
+    },
+    "teardown": [
+      {"kb_delete_by_content": {"contains": "9am"}}
+    ]
+  },
+  {
+    "id": "daily_log_intent",
+    "category": "tool",
+    "kind": "ask",
+    "name": "Asks to log an event → daily_log writes today's chunk",
+    "prompt": "Log this for today: my standup just ended, team is unblocked on the auth migration.",
+    "expected_tools": ["daily_log"],
+    "expected_patterns": [],
+    "verify_kb": {
+      "find_chunk_containing": "auth migration",
+      "domain": "daily-log"
+    },
+    "teardown": [
+      {"kb_delete_by_content": {"contains": "auth migration"}}
+    ]
+  },
+  {
+    "id": "memory_recall_intent",
+    "category": "tool",
+    "kind": "ask",
+    "name": "Asks about a stored fact → recall surfaces it",
+    "setup": [
+      {"kb_ingest": {
+        "content": "Operator's primary lab is Snickerdoodle, located in Spokane.",
+        "domain": "context",
+        "heading": "lab"
+      }}
+    ],
+    "prompt": "Where's my primary lab and what's it called?",
+    "expected_tools": ["memory_recall"],
+    "expected_patterns": ["snickerdoodle", "spokane"],
+    "teardown": [
+      {"kb_delete_by_heading": {"domain": "context", "heading": "lab"}}
+    ]
+  },
+  {
+    "id": "memory_list_intent",
+    "category": "tool",
+    "kind": "ask",
+    "name": "Asks for recent log entries → memory_list",
+    "setup": [
+      {"kb_ingest": {"content": "called the dentist", "domain": "daily-log", "heading": "today"}},
+      {"kb_ingest": {"content": "merged the auth PR", "domain": "daily-log", "heading": "today"}}
+    ],
+    "prompt": "What did I do today? Summarize from the log.",
+    "expected_tools": ["memory_list"],
+    "expected_patterns": ["dentist"],
+    "teardown": [
+      {"kb_delete_by_content": {"contains": "called the dentist"}},
+      {"kb_delete_by_content": {"contains": "merged the auth PR"}}
+    ]
+  },
+  {
+    "id": "memory_stats_intent",
+    "category": "tool",
+    "kind": "ask",
+    "name": "Asks how much is in memory → memory_stats",
+    "prompt": "How much have I got stored across each memory domain?",
+    "expected_tools": ["memory_stats"],
+    "expected_patterns": []
+  },
+
+  {
+    "id": "log_then_recall_chain",
+    "category": "chained",
+    "kind": "ask",
+    "name": "Log an event, then recall it later in the same turn",
+    "prompt": "Log this for today: 'eval-chain-flag-q9: chained log+recall test'. After logging, search memory for that flag and quote it back.",
+    "expected_tools": ["daily_log", "memory_recall"],
+    "expected_patterns": ["eval-chain-flag-q9"],
+    "teardown": [
+      {"kb_delete_by_content": {"contains": "eval-chain-flag-q9"}}
+    ]
+  },
+
+  {
+    "id": "knowledge_middleware_recall",
+    "category": "subsystem",
+    "kind": "ask",
+    "name": "KnowledgeMiddleware surfaces a stored fact without an explicit search",
+    "setup": [
+      {"kb_ingest": {
+        "content": "Operator's preferred coffee is a Gibraltar with oat milk from Atticus.",
+        "domain": "preferences",
+        "heading": "coffee"
+      }}
+    ],
+    "prompt": "What's my usual coffee order?",
+    "expected_tools": [],
+    "expected_patterns": ["gibraltar", "oat"],
+    "teardown": [
+      {"kb_delete_by_heading": {"domain": "preferences", "heading": "coffee"}}
+    ]
+  }
+]
diff --git a/evals/verify.py b/evals/verify.py
new file mode 100644
index 0000000..5b1f8cc
--- /dev/null
+++ b/evals/verify.py
@@ -0,0 +1,176 @@
+"""Side-effect verifiers for eval cases.
+
+Two channels:
+
+- **Audit log** — JSONL written by ``AuditMiddleware`` at
+  ``/sandbox/audit/audit.jsonl`` (override with ``AUDIT_PATH`` env).
+  ``audit_entries_since`` returns entries newer than a marker, and
+  ``assert_tools_fired`` confirms a tool name appears with the
+  expected outcome.
+- **Knowledge store** — sqlite DB at ``KNOWLEDGE_DB_PATH`` (or the
+  template default). ``find_chunk_containing`` confirms a memory
+  write actually landed; ``setup_chunk`` / ``teardown`` mutate the
+  store directly so cases start from a known state.
+
+The store is opened read/write so setup steps can pre-seed (BFCL's
+``initial_config`` pattern). The model never sees these direct writes
+— it discovers them via ``memory_recall`` / ``memory_list`` tools as
+real users would.
+"""
+
+from __future__ import annotations
+
+import json
+import logging
+import os
+from datetime import datetime, timezone
+from pathlib import Path
+from typing import Any
+
+log = logging.getLogger(__name__)
+
+# ── path resolution ─────────────────────────────────────────────────────────
+
+
+def _audit_path() -> Path:
+    """Audit JSONL location. Falls back to the template's docker default."""
+    raw = os.environ.get("AUDIT_PATH") or "/sandbox/audit/audit.jsonl"
+    p = Path(raw).expanduser()
+    if p.is_file():
+        return p
+    # Local-dev fallback: same shape, but under the home dir.
+    fallback = Path.home() / ".protoagent" / "audit" / "audit.jsonl"
+    return fallback
+
+
+def _kb_store():
+    """Construct a ``KnowledgeStore`` against the configured path.
+
+    Imported lazily so ``evals/verify.py`` can be loaded in a context
+    where ``knowledge/`` isn't on sys.path yet (the runner adjusts
+    sys.path before calling in).
+    """
+    from knowledge import KnowledgeStore
+    return KnowledgeStore()  # honors KNOWLEDGE_DB_PATH env
+
+
+# ── audit log ───────────────────────────────────────────────────────────────
+
+
+def audit_now() -> str:
+    """ISO-8601 marker suitable as a 'since' input to ``audit_entries_since``."""
+    return datetime.now(timezone.utc).isoformat()
+
+
+def audit_entries_since(ts_iso: str) -> list[dict]:
+    """Return audit-log entries with ``ts`` strictly greater than ``ts_iso``."""
+    p = _audit_path()
+    if not p.is_file():
+        return []
+    out: list[dict] = []
+    with p.open() as fh:
+        for line in fh:
+            try:
+                entry = json.loads(line)
+            except json.JSONDecodeError:
+                continue
+            if entry.get("ts", "") > ts_iso:
+                out.append(entry)
+    return out
+
+
+def assert_tools_fired(
+    audit_entries: list[dict],
+    expected: list[str],
+    *,
+    require_success: bool = True,
+) -> tuple[bool, str]:
+    """Confirm each expected tool name appears in audit entries.
+
+    Order doesn't matter — a tool that fires twice still satisfies one
+    expected entry, and extra entries (subset matching, BFCL-style) are
+    allowed.
+
+    ``require_success=True`` (default) only counts ``success=True``
+    entries — use this for happy-path cases. Pass ``require_success=False``
+    when the case represents an error path that the agent should still
+    *attempt* (e.g. fetching a private URL the agent has no creds for).
+    """
+    fired: dict[str, dict[str, int]] = {}
+    for e in audit_entries:
+        bucket = fired.setdefault(e.get("tool", "?"), {"ok": 0, "err": 0})
+        bucket["ok" if e.get("success") else "err"] += 1
+
+    missing: list[str] = []
+    for t in expected:
+        if t not in fired:
+            missing.append(t)
+            continue
+        if require_success and fired[t]["ok"] == 0:
+            missing.append(f"{t} (only errors)")
+
+    if missing:
+        return False, f"missing tools: {missing}; saw: {dict(fired)}"
+    return True, f"saw: {dict(fired)}"
+
+
+# ── knowledge store ─────────────────────────────────────────────────────────
+
+
+def find_chunk_containing(text: str, *, domain: str | None = None) -> dict | None:
+    store = _kb_store()
+    chunk = store.find_chunk_containing(text, domain=domain)
+    return chunk.as_dict() if chunk else None
+
+
+def chunks_in_domain(domain: str, *, limit: int = 50) -> list[dict]:
+    store = _kb_store()
+    return [c.as_dict() for c in store.list_chunks(domain=domain, limit=limit)]
+
+
+# ── setup / teardown helpers ─────────────────────────────────────────────────
+
+
+def apply_setup(steps: list[dict]) -> str | None:
+    """Apply a list of setup steps. Each step is a dict with one key.
+
+    Supported step kinds:
+
+    - ``kb_ingest``: ``{content, domain, heading?}``
+
+    Returns ``None`` on success, an error string on first failure.
+    """
+    store = _kb_store()
+    for step in steps:
+        for kind, args in step.items():
+            if kind == "kb_ingest":
+                if store.add_chunk(
+                    args["content"],
+                    domain=args.get("domain", "general"),
+                    heading=args.get("heading"),
+                ) is None:
+                    return f"kb_ingest failed for {args!r}"
+            else:
+                return f"unknown setup step: {kind}"
+    return None
+
+
+def apply_teardown(steps: list[dict]) -> None:
+    """Best-effort teardown. Never raises so a setup failure or assertion
+    failure doesn't poison subsequent cases.
+
+    Supported step kinds:
+
+    - ``kb_delete_by_content``: ``{contains}``
+    - ``kb_delete_by_heading``: ``{domain, heading}``
+    """
+    store = _kb_store()
+    for step in steps:
+        for kind, args in step.items():
+            try:
+                if kind == "kb_delete_by_content":
+                    store.delete_by_content(args["contains"])
+                elif kind == "kb_delete_by_heading":
+                    store.delete_by_heading(args["domain"], args["heading"])
+            except Exception as exc:  # pragma: no cover
+                log.debug("[verify] teardown step %s failed: %s", kind, exc)
diff --git a/graph/config.py b/graph/config.py
index 00ae1a8..a3df02b 100644
--- a/graph/config.py
+++ b/graph/config.py
@@ -37,16 +37,24 @@ class LangGraphConfig:
     # Subagents — template ships with one example (see graph/subagents/config.py).
     # Add fields here as you add entries to SUBAGENT_REGISTRY.
     worker: SubagentDef = field(default_factory=lambda: SubagentDef(
-        tools=["echo", "current_time", "calculator", "web_search", "fetch_url"],
+        tools=[
+            "current_time", "calculator", "web_search", "fetch_url",
+            "memory_ingest", "memory_recall", "memory_list", "memory_stats",
+            "daily_log",
+        ],
         max_turns=20,
     ))
 
     # Middleware toggles
-    knowledge_middleware: bool = False  # template ships no knowledge store
+    knowledge_middleware: bool = True
     audit_middleware: bool = True
-    memory_middleware: bool = False
+    memory_middleware: bool = True
 
-    # Knowledge store (opt-in — leave disabled until the fork ships one)
+    # Knowledge store — sqlite + FTS5, see ``knowledge/store.py``.
+    # The default path lives under ``/sandbox/`` to play well with the
+    # bundled Docker volume; the store falls back to
+    # ``~/.protoagent/knowledge/agent.db`` automatically when /sandbox
+    # is read-only or absent (e.g. local ``python server.py``).
     knowledge_db_path: str = "/sandbox/knowledge/agent.db"
     embed_model: str = "qwen3-embedding"
     knowledge_top_k: int = 5
diff --git a/graph/subagents/config.py b/graph/subagents/config.py
index 554a321..560edc7 100644
--- a/graph/subagents/config.py
+++ b/graph/subagents/config.py
@@ -63,7 +63,11 @@ class SubagentConfig:
 
 Replace this prompt with domain-specific guidance once your agent has
 real specialized roles.""",
-    tools=["echo", "current_time", "calculator", "web_search", "fetch_url"],
+    tools=[
+        "current_time", "calculator", "web_search", "fetch_url",
+        "memory_ingest", "memory_recall", "memory_list", "memory_stats",
+        "daily_log",
+    ],
     max_turns=20,
 )
 
diff --git a/knowledge/__init__.py b/knowledge/__init__.py
new file mode 100644
index 0000000..1de93de
--- /dev/null
+++ b/knowledge/__init__.py
@@ -0,0 +1,12 @@
+"""Knowledge store — sqlite-backed chunk storage for memory tools and middleware.
+
+The template ships this enabled by default so a fresh fork has a working
+memory loop on day one (memory_ingest, memory_recall, daily_log) and the
+eval harness can assert side effects against real DB state.
+
+See ``knowledge.store.KnowledgeStore`` for the public API.
+"""
+
+from knowledge.store import KnowledgeStore, Chunk
+
+__all__ = ["KnowledgeStore", "Chunk"]
diff --git a/knowledge/store.py b/knowledge/store.py
new file mode 100644
index 0000000..a6aee7a
--- /dev/null
+++ b/knowledge/store.py
@@ -0,0 +1,456 @@
+"""KnowledgeStore — sqlite-backed chunk storage with FTS5 search.
+
+The template's default knowledge surface. One ``chunks`` table holds
+every piece of stored content (operator notes via ``memory_ingest``,
+daily-log entries, conversation findings extracted by
+``MemoryMiddleware``); the ``domain`` column distinguishes them.
+
+Search uses sqlite FTS5 when available (true on virtually all modern
+sqlite builds). When FTS5 is missing — sandboxed sqlite, custom builds
+— the store transparently falls back to ``LIKE`` keyword matching so
+the API contract still holds.
+
+The store is path-aware and degradation-aware:
+
+- Honors ``KNOWLEDGE_DB_PATH`` env var → constructor argument →
+  config default ``/sandbox/knowledge/agent.db``.
+- If the configured path is unwritable (running locally outside the
+  container, no /sandbox), falls back to ``~/.protoagent/knowledge/agent.db``
+  so a fresh ``python server.py`` works without sudo.
+- All write operations swallow ``sqlite3.OperationalError`` and log;
+  the store never crashes the agent loop on a corrupt or read-only DB.
+
+Forks that want embeddings on top of FTS5 can subclass and override
+``search()`` — the middleware reads through that one method.
+"""
+
+from __future__ import annotations
+
+import logging
+import os
+import re
+import sqlite3
+from dataclasses import dataclass
+from datetime import datetime, timezone
+from pathlib import Path
+from typing import Any
+
+log = logging.getLogger(__name__)
+
+DEFAULT_DB_PATH = "/sandbox/knowledge/agent.db"
+
+
+@dataclass
+class Chunk:
+    """One row from the chunks table — what callers see."""
+    id: int
+    content: str
+    domain: str
+    heading: str | None
+    source: str | None
+    source_type: str | None
+    finding_type: str | None
+    created_at: str
+    updated_at: str
+
+    def as_dict(self) -> dict[str, Any]:
+        return {
+            "id": self.id,
+            "content": self.content,
+            "domain": self.domain,
+            "heading": self.heading,
+            "source": self.source,
+            "source_type": self.source_type,
+            "finding_type": self.finding_type,
+            "created_at": self.created_at,
+            "updated_at": self.updated_at,
+        }
+
+
+def _resolve_path(db_path: str | Path | None) -> Path:
+    """Pick a writable DB path. Env > arg > default; fall back to ~/.protoagent."""
+    raw = os.environ.get("KNOWLEDGE_DB_PATH") or db_path or DEFAULT_DB_PATH
+    p = Path(str(raw)).expanduser()
+    try:
+        p.parent.mkdir(parents=True, exist_ok=True)
+        # Probe writability
+        probe = p.parent / ".write-probe"
+        probe.touch()
+        probe.unlink()
+        return p
+    except OSError:
+        fallback = Path.home() / ".protoagent" / "knowledge" / "agent.db"
+        fallback.parent.mkdir(parents=True, exist_ok=True)
+        log.info(
+            "[knowledge] %s not writable; using %s instead",
+            p, fallback,
+        )
+        return fallback
+
+
+def _now_iso() -> str:
+    return datetime.now(timezone.utc).isoformat()
+
+
+def _has_fts5(db: sqlite3.Connection) -> bool:
+    try:
+        db.execute(
+            "CREATE VIRTUAL TABLE IF NOT EXISTS _fts5_probe USING fts5(x)"
+        )
+        db.execute("DROP TABLE _fts5_probe")
+        return True
+    except sqlite3.OperationalError:
+        return False
+
+
+_SCHEMA = """
+CREATE TABLE IF NOT EXISTS chunks (
+    id            INTEGER PRIMARY KEY AUTOINCREMENT,
+    content       TEXT NOT NULL,
+    domain        TEXT NOT NULL DEFAULT 'general',
+    heading       TEXT,
+    source        TEXT,
+    source_type   TEXT,
+    finding_type  TEXT,
+    created_at    TEXT NOT NULL,
+    updated_at    TEXT NOT NULL
+);
+
+CREATE INDEX IF NOT EXISTS idx_chunks_domain     ON chunks(domain);
+CREATE INDEX IF NOT EXISTS idx_chunks_created_at ON chunks(created_at);
+"""
+
+_FTS_SCHEMA = """
+CREATE VIRTUAL TABLE IF NOT EXISTS chunks_fts USING fts5(
+    content, heading, content='chunks', content_rowid='id'
+);
+
+CREATE TRIGGER IF NOT EXISTS chunks_ai AFTER INSERT ON chunks BEGIN
+    INSERT INTO chunks_fts(rowid, content, heading)
+        VALUES (new.id, new.content, new.heading);
+END;
+
+CREATE TRIGGER IF NOT EXISTS chunks_ad AFTER DELETE ON chunks BEGIN
+    INSERT INTO chunks_fts(chunks_fts, rowid, content, heading)
+        VALUES('delete', old.id, old.content, old.heading);
+END;
+
+CREATE TRIGGER IF NOT EXISTS chunks_au AFTER UPDATE ON chunks BEGIN
+    INSERT INTO chunks_fts(chunks_fts, rowid, content, heading)
+        VALUES('delete', old.id, old.content, old.heading);
+    INSERT INTO chunks_fts(rowid, content, heading)
+        VALUES (new.id, new.content, new.heading);
+END;
+"""
+
+
+class KnowledgeStore:
+    """Default knowledge store. Sqlite + FTS5 (with LIKE fallback).
+
+    Forks usually don't subclass this — extend ``add_chunk`` /
+    ``search`` directly when you need new fields, or wrap it with
+    your own embedding layer.
+    """
+
+    def __init__(self, db_path: str | Path | None = None):
+        self.path = _resolve_path(db_path)
+        self._fts_available: bool | None = None
+        self._init_db()
+
+    # ── connection / schema ─────────────────────────────────────────────────
+
+    def _connect(self) -> sqlite3.Connection:
+        db = sqlite3.connect(str(self.path))
+        db.row_factory = sqlite3.Row
+        db.execute("PRAGMA journal_mode=WAL")
+        return db
+
+    def _init_db(self) -> None:
+        try:
+            db = self._connect()
+            db.executescript(_SCHEMA)
+            self._fts_available = _has_fts5(db)
+            if self._fts_available:
+                db.executescript(_FTS_SCHEMA)
+            else:
+                log.info(
+                    "[knowledge] FTS5 unavailable — search will use LIKE fallback"
+                )
+            db.commit()
+            db.close()
+        except sqlite3.OperationalError as exc:
+            log.error("[knowledge] schema init failed at %s: %s", self.path, exc)
+
+    # Convenience for middleware that wants the raw connection. Kept
+    # private so the public API stays small.
+    def _get_db(self) -> sqlite3.Connection | None:
+        try:
+            return self._connect()
+        except sqlite3.OperationalError as exc:
+            log.error("[knowledge] connect failed: %s", exc)
+            return None
+
+    # ── writes ──────────────────────────────────────────────────────────────
+
+    def add_chunk(
+        self,
+        content: str,
+        domain: str = "general",
+        heading: str | None = None,
+        *,
+        source: str | None = None,
+        source_type: str | None = None,
+        finding_type: str | None = None,
+    ) -> int | None:
+        """Insert a chunk. Returns the new row id, or None on failure."""
+        if not content or not content.strip():
+            return None
+        db = self._get_db()
+        if db is None:
+            return None
+        try:
+            now = _now_iso()
+            cur = db.execute(
+                "INSERT INTO chunks "
+                "(content, domain, heading, source, source_type, finding_type, "
+                "created_at, updated_at) VALUES (?, ?, ?, ?, ?, ?, ?, ?)",
+                (content, domain, heading, source, source_type, finding_type, now, now),
+            )
+            db.commit()
+            return int(cur.lastrowid)
+        except sqlite3.OperationalError as exc:
+            log.error("[knowledge] add_chunk failed: %s", exc)
+            return None
+        finally:
+            db.close()
+
+    def add_finding(
+        self,
+        content: str,
+        source: str = "conversation",
+        source_type: str = "chat",
+        finding_type: str = "insight",
+    ) -> int | None:
+        """Compatibility shim for ``MemoryMiddleware.after_agent``.
+
+        Stored under ``domain='finding'`` so memory_list / memory_recall
+        can surface them alongside operator-set chunks.
+        """
+        return self.add_chunk(
+            content,
+            domain="finding",
+            source=source,
+            source_type=source_type,
+            finding_type=finding_type,
+        )
+
+    # ── reads ───────────────────────────────────────────────────────────────
+
+    def search(
+        self,
+        query: str,
+        k: int = 5,
+        *,
+        domain: str | None = None,
+    ) -> list[dict[str, Any]]:
+        """Top-k chunks matching ``query``. Shape matches what the
+        ``KnowledgeMiddleware`` consumes: each result has ``table``,
+        ``preview``, plus the underlying chunk fields.
+
+        Uses FTS5 when available, else a tokenized LIKE fallback. Returns
+        an empty list on no matches or DB failure (never raises).
+        """
+        if not query or not query.strip():
+            return []
+        db = self._get_db()
+        if db is None:
+            return []
+        try:
+            rows = self._search_fts(db, query, k, domain) if self._fts_available \
+                else self._search_like(db, query, k, domain)
+        except sqlite3.OperationalError as exc:
+            log.warning("[knowledge] search failed: %s", exc)
+            rows = []
+        finally:
+            db.close()
+
+        results: list[dict[str, Any]] = []
+        for r in rows:
+            preview = (r["heading"] + ": " if r["heading"] else "") + r["content"]
+            results.append({
+                "table": "chunks",
+                "preview": preview[:240],
+                **dict(r),
+            })
+        return results
+
+    def _search_fts(
+        self,
+        db: sqlite3.Connection,
+        query: str,
+        k: int,
+        domain: str | None,
+    ) -> list[sqlite3.Row]:
+        # Sanitize to FTS5-safe tokens; OR them so a multi-word query
+        # matches any of the keywords (closer to LIKE behaviour).
+        tokens = [t for t in re.findall(r"[\w']+", query) if t]
+        if not tokens:
+            return []
+        match = " OR ".join(tokens)
+        if domain:
+            return db.execute(
+                "SELECT c.* FROM chunks_fts f "
+                "JOIN chunks c ON c.id = f.rowid "
+                "WHERE chunks_fts MATCH ? AND c.domain = ? "
+                "ORDER BY rank LIMIT ?",
+                (match, domain, k),
+            ).fetchall()
+        return db.execute(
+            "SELECT c.* FROM chunks_fts f "
+            "JOIN chunks c ON c.id = f.rowid "
+            "WHERE chunks_fts MATCH ? "
+            "ORDER BY rank LIMIT ?",
+            (match, k),
+        ).fetchall()
+
+    def _search_like(
+        self,
+        db: sqlite3.Connection,
+        query: str,
+        k: int,
+        domain: str | None,
+    ) -> list[sqlite3.Row]:
+        tokens = [t for t in re.findall(r"[\w']+", query) if t]
+        if not tokens:
+            return []
+        # Score = number of tokens matched (rough recall-style ranking).
+        like_clauses = " + ".join(
+            "CASE WHEN content LIKE ? OR heading LIKE ? THEN 1 ELSE 0 END"
+            for _ in tokens
+        )
+        params: list[Any] = []
+        for t in tokens:
+            needle = f"%{t}%"
+            params.extend([needle, needle])
+        sql = (
+            f"SELECT *, ({like_clauses}) AS score FROM chunks "
+            "WHERE score > 0"
+        )
+        if domain:
+            sql += " AND domain = ?"
+            params.append(domain)
+        sql += " ORDER BY score DESC, id DESC LIMIT ?"
+        params.append(k)
+        return db.execute(sql, params).fetchall()
+
+    def list_chunks(
+        self,
+        domain: str | None = None,
+        limit: int = 50,
+    ) -> list[Chunk]:
+        """Most-recent-first chunk listing. Used by ``memory_list``."""
+        db = self._get_db()
+        if db is None:
+            return []
+        try:
+            if domain:
+                rows = db.execute(
+                    "SELECT * FROM chunks WHERE domain = ? ORDER BY id DESC LIMIT ?",
+                    (domain, limit),
+                ).fetchall()
+            else:
+                rows = db.execute(
+                    "SELECT * FROM chunks ORDER BY id DESC LIMIT ?",
+                    (limit,),
+                ).fetchall()
+        except sqlite3.OperationalError as exc:
+            log.warning("[knowledge] list_chunks failed: %s", exc)
+            rows = []
+        finally:
+            db.close()
+        return [Chunk(**dict(r)) for r in rows]
+
+    def stats(self) -> dict[str, int]:
+        """Return per-domain chunk counts plus a ``total`` key."""
+        db = self._get_db()
+        if db is None:
+            return {"total": 0}
+        try:
+            rows = db.execute(
+                "SELECT domain, COUNT(*) AS n FROM chunks GROUP BY domain ORDER BY n DESC"
+            ).fetchall()
+            total = db.execute("SELECT COUNT(*) FROM chunks").fetchone()[0]
+        except sqlite3.OperationalError as exc:
+            log.warning("[knowledge] stats failed: %s", exc)
+            return {"total": 0}
+        finally:
+            db.close()
+        out = {r["domain"]: r["n"] for r in rows}
+        out["total"] = int(total)
+        return out
+
+    # ── verification helpers (used by evals/verify.py) ──────────────────────
+
+    def find_chunk_containing(
+        self,
+        text: str,
+        domain: str | None = None,
+    ) -> Chunk | None:
+        """Return the most-recent chunk whose content or heading contains ``text``.
+
+        Used by the eval runner to assert side-effect outcomes after a
+        memory-writing turn.
+        """
+        db = self._get_db()
+        if db is None:
+            return None
+        try:
+            sql = (
+                "SELECT * FROM chunks "
+                "WHERE (content LIKE ? OR heading LIKE ?)"
+            )
+            params: list[Any] = [f"%{text}%", f"%{text}%"]
+            if domain:
+                sql += " AND domain = ?"
+                params.append(domain)
+            sql += " ORDER BY id DESC LIMIT 1"
+            row = db.execute(sql, params).fetchone()
+        except sqlite3.OperationalError as exc:
+            log.warning("[knowledge] find_chunk_containing failed: %s", exc)
+            row = None
+        finally:
+            db.close()
+        return Chunk(**dict(row)) if row else None
+
+    def delete_by_content(self, contains: str) -> int:
+        """Delete chunks whose content matches ``%contains%``. Returns count."""
+        db = self._get_db()
+        if db is None:
+            return 0
+        try:
+            cur = db.execute("DELETE FROM chunks WHERE content LIKE ?", (f"%{contains}%",))
+            db.commit()
+            return int(cur.rowcount)
+        except sqlite3.OperationalError as exc:
+            log.warning("[knowledge] delete_by_content failed: %s", exc)
+            return 0
+        finally:
+            db.close()
+
+    def delete_by_heading(self, domain: str, heading: str) -> int:
+        """Delete chunks matching (domain, heading). Returns count."""
+        db = self._get_db()
+        if db is None:
+            return 0
+        try:
+            cur = db.execute(
+                "DELETE FROM chunks WHERE domain = ? AND heading = ?",
+                (domain, heading),
+            )
+            db.commit()
+            return int(cur.rowcount)
+        except sqlite3.OperationalError as exc:
+            log.warning("[knowledge] delete_by_heading failed: %s", exc)
+            return 0
+        finally:
+            db.close()
diff --git a/server.py b/server.py
index 75f9692..f1ecd46 100644
--- a/server.py
+++ b/server.py
@@ -90,8 +90,38 @@ def _init_langgraph_agent():
 
     from graph.agent import create_agent_graph
 
-    _graph = create_agent_graph(_graph_config)
-    log.info("LangGraph agent initialized (model: %s)", _graph_config.model_name)
+    # Construct the default KnowledgeStore so memory tools (memory_ingest,
+    # memory_recall, daily_log) and KnowledgeMiddleware have something to
+    # bind to. Forks that don't want a store can set
+    # ``middleware.knowledge: false`` and remove the memory tools from
+    # the worker subagent — the store is still cheap to construct.
+    knowledge_store = _build_knowledge_store(_graph_config)
+
+    _graph = create_agent_graph(_graph_config, knowledge_store=knowledge_store)
+    log.info(
+        "LangGraph agent initialized (model: %s, knowledge_db: %s)",
+        _graph_config.model_name,
+        getattr(knowledge_store, "path", "(disabled)"),
+    )
+
+
+def _build_knowledge_store(config):
+    """Return a ``KnowledgeStore`` bound to the configured DB path.
+
+    Best-effort: any sqlite-level failure is logged and the store
+    falls back to ``~/.protoagent/knowledge/agent.db`` automatically
+    (see ``knowledge.store._resolve_path``). Returns ``None`` only when
+    knowledge is disabled in config — kept as a separate code path so
+    forks can audit when the agent is running KB-less.
+    """
+    if not getattr(config, "knowledge_middleware", True):
+        return None
+    try:
+        from knowledge import KnowledgeStore
+        return KnowledgeStore(db_path=config.knowledge_db_path)
+    except Exception as exc:
+        log.warning("[server] knowledge store init failed: %s; running KB-less", exc)
+        return None
 
 
 def _reload_langgraph_agent() -> tuple[bool, str]:
@@ -130,7 +160,8 @@ def _reload_langgraph_agent() -> tuple[bool, str]:
     # metrics / card / auth all de-sync from what's actually running.
     if is_setup_complete():
         try:
-            new_graph = create_agent_graph(new_config)
+            new_store = _build_knowledge_store(new_config)
+            new_graph = create_agent_graph(new_config, knowledge_store=new_store)
         except Exception as e:
             log.exception("[reload] graph rebuild failed")
             return False, f"graph rebuild failed: {e}"
diff --git a/tests/test_config_io.py b/tests/test_config_io.py
index 25a7472..39fc017 100644
--- a/tests/test_config_io.py
+++ b/tests/test_config_io.py
@@ -105,11 +105,11 @@ def test_apply_updates_nested_worker(tmp_path: Path) -> None:
 
     config_io.apply_updates_to_yaml(
         doc,
-        {"subagents": {"worker": {"enabled": True, "tools": ["echo", "calculator"]}}},
+        {"subagents": {"worker": {"enabled": True, "tools": ["current_time", "calculator"]}}},
     )
 
     assert doc["subagents"]["worker"]["enabled"] is True
-    assert list(doc["subagents"]["worker"]["tools"]) == ["echo", "calculator"]
+    assert list(doc["subagents"]["worker"]["tools"]) == ["current_time", "calculator"]
 
 
 # ── config_to_dict ───────────────────────────────────────────────────────────
@@ -325,9 +325,9 @@ def test_list_available_tools_returns_starter_set():
     # Lock in the template's starter set — forks replace these but
     # the drawer's CheckboxGroup populates from this call, so the
     # contract is "return tool names in a stable list".
-    assert "echo" in names
-    assert "calculator" in names
     assert "current_time" in names
+    assert "calculator" in names
+    assert "fetch_url" in names
     assert all(isinstance(n, str) for n in names)
 
 
diff --git a/tests/test_skill_curator.py b/tests/test_skill_curator.py
index cb7bf43..3d8e211 100644
--- a/tests/test_skill_curator.py
+++ b/tests/test_skill_curator.py
@@ -52,7 +52,7 @@ def _make_skill(
         "name": name,
         "description": description,
         "prompt_template": f"Run the {name} workflow.",
-        "tools_used": ["echo"],
+        "tools_used": ["current_time"],
         "confidence": confidence,
         "created_at": _utc_iso(days_ago),
     }
diff --git a/tests/test_skill_emission.py b/tests/test_skill_emission.py
index 34b8f1c..6d9555b 100644
--- a/tests/test_skill_emission.py
+++ b/tests/test_skill_emission.py
@@ -88,14 +88,14 @@ def test_skill_datapart_serialization() -> None:
         name="dp-test",
         description="DataPart test",
         prompt_template="prompt",
-        tools_used=["echo"],
+        tools_used=["current_time"],
         source_session_id="s1",
     )
     part = artifact.to_datapart()
     assert part["kind"] == "data"
     assert part["metadata"]["mimeType"] == SKILL_V1_MIME
     assert part["data"]["name"] == "dp-test"
-    assert part["data"]["tools_used"] == ["echo"]
+    assert part["data"]["tools_used"] == ["current_time"]
     # created_at must be present and parseable
     datetime.fromisoformat(part["data"]["created_at"])
 
@@ -125,7 +125,7 @@ def test_skill_artifact_validation_tools_not_list() -> None:
     with pytest.raises(TypeError, match="tools_used"):
         SkillV1Artifact(
             name="x", description="d", prompt_template="p",
-            tools_used="echo",  # type: ignore[arg-type]
+            tools_used="current_time",  # type: ignore[arg-type]
         )
 
 
@@ -250,7 +250,7 @@ def _run_emit_logic(
 def test_skill_emitted_when_emit_skill_true() -> None:
     """Skill artifact is emitted when emit_skill=True and subagent succeeds."""
     msgs = [
-        _make_ai_message_with_tool_calls(["echo"]),
+        _make_ai_message_with_tool_calls(["current_time"]),
         _make_ai_message_with_content("done"),
     ]
     _run_emit_logic(
@@ -264,7 +264,7 @@ def test_skill_emitted_when_emit_skill_true() -> None:
     assert len(skills) == 1
     skill = skills[0]
     assert skill.name == "my-task"
-    assert skill.tools_used == ["echo"]
+    assert skill.tools_used == ["current_time"]
     assert skill.prompt_template == "do the thing"
     assert "Captured workflow" in skill.description
 
@@ -272,7 +272,7 @@ def test_skill_emitted_when_emit_skill_true() -> None:
 def test_no_emission_on_opt_out() -> None:
     """No skill artifact is emitted when emit_skill=False."""
     msgs = [
-        _make_ai_message_with_tool_calls(["echo"]),
+        _make_ai_message_with_tool_calls(["current_time"]),
         _make_ai_message_with_content("done"),
     ]
     _run_emit_logic(
@@ -307,7 +307,7 @@ def test_no_emission_on_failure() -> None:
 def test_no_emission_when_config_disallows() -> None:
     """No skill artifact is emitted when allow_skill_emission=False."""
     msgs = [
-        _make_ai_message_with_tool_calls(["echo"]),
+        _make_ai_message_with_tool_calls(["current_time"]),
         _make_ai_message_with_content("done"),
     ]
     _run_emit_logic(
@@ -323,8 +323,8 @@ def test_no_emission_when_config_disallows() -> None:
 def test_tool_tracking_metadata_captured() -> None:
     """tools_used in the artifact lists all tools invoked, deduplicated."""
     msgs = [
-        _make_ai_message_with_tool_calls(["echo", "calculator"]),
-        _make_ai_message_with_tool_calls(["echo"]),  # duplicate — should appear once
+        _make_ai_message_with_tool_calls(["current_time", "calculator"]),
+        _make_ai_message_with_tool_calls(["current_time"]),  # duplicate — should appear once
         _make_ai_message_with_content("result"),
     ]
     _run_emit_logic(
@@ -336,7 +336,7 @@ def test_tool_tracking_metadata_captured() -> None:
     )
     skills = get_pending_skills()
     assert len(skills) == 1
-    assert skills[0].tools_used.count("echo") == 1
+    assert skills[0].tools_used.count("current_time") == 1
     assert "calculator" in skills[0].tools_used
 
 
diff --git a/tests/test_starter_tools.py b/tests/test_starter_tools.py
index fe4495b..f469365 100644
--- a/tests/test_starter_tools.py
+++ b/tests/test_starter_tools.py
@@ -114,13 +114,3 @@ async def test_fetch_url_rejects_non_http_scheme():
     ):
         result = await fetch_url.ainvoke({"url": bad})
         assert result.startswith("Error:"), f"accepted unsafe url: {bad!r}"
-
-
-# ── echo — sanity ────────────────────────────────────────────────────────────
-
-
-@pytest.mark.asyncio
-async def test_echo_sanity():
-    from tools.lg_tools import echo
-    result = await echo.ainvoke({"message": "hello"})
-    assert result == "echo: hello"
diff --git a/tools/lg_tools.py b/tools/lg_tools.py
index d8ce0f5..b59ccba 100644
--- a/tools/lg_tools.py
+++ b/tools/lg_tools.py
@@ -7,12 +7,20 @@
 The template ships with a small starter set of free, keyless tools so
 a fresh clone can demonstrate real agent behaviour out of the box:
 
-- ``echo`` — sanity check
 - ``current_time`` — wall-clock time in any IANA timezone
 - ``calculator`` — safe numeric expression evaluation
 - ``web_search`` — DuckDuckGo text search (via ``ddgs``, no API key)
 - ``fetch_url`` — fetch a URL and return cleaned text
 
+Plus memory tools that bind to a ``KnowledgeStore`` (constructed in
+``server.py`` and threaded through ``get_all_tools(knowledge_store)``):
+
+- ``memory_ingest`` — store a fact / preference / note
+- ``memory_recall`` — search the store for relevant chunks
+- ``memory_list``   — list recent chunks (optionally per domain)
+- ``memory_stats``  — per-domain counts
+- ``daily_log``     — convenience: write a daily-log chunk
+
 Replace or extend this file with your agent's real tools and update
 ``get_all_tools()`` to return the full list.
 
@@ -39,20 +47,6 @@
 from langchain_core.tools import tool
 
 
-# ── echo ─────────────────────────────────────────────────────────────────────
-
-
-@tool
-async def echo(message: str) -> str:
-    """Echo the input back with a prefix. Template-only sanity tool.
-
-    Useful to verify the tool loop is wired end-to-end before real
-    tools are in place. Safe to delete once your fork has its own
-    tools.
-    """
-    return f"echo: {message}"
-
-
 # ── current_time ─────────────────────────────────────────────────────────────
 
 
@@ -273,16 +267,130 @@ def _extract_text_from_html(content: bytes) -> str:
     return "\n".join(lines)
 
 
+# ── memory tools ─────────────────────────────────────────────────────────────
+#
+# Each memory tool is built by a factory that closes over the
+# ``KnowledgeStore`` instance. Doing it this way (rather than module-
+# level globals) keeps tests isolated — they pass a temp store and get
+# a fresh tool list bound to it. Production constructs one store in
+# ``server.py`` and reuses the bound tools for the lifetime of the
+# process.
+
+
+def _build_memory_tools(knowledge_store):
+    """Bind memory tools to a ``KnowledgeStore``. Returns a list."""
+    from datetime import datetime, timezone
+
+    @tool
+    async def memory_ingest(
+        content: str,
+        domain: str = "general",
+        heading: str | None = None,
+    ) -> str:
+        """Store a fact, preference, or note in long-term memory.
+
+        Use this for things the operator wants you to remember across
+        sessions — preferences ("I take my coffee black"), facts about
+        the operator's environment, decisions worth recalling later.
+
+        Args:
+            content: The text to remember. Be specific and self-contained;
+                the chunk is retrieved by keyword search.
+            domain: Logical bucket — ``"preferences"``, ``"context"``,
+                ``"general"``. Defaults to ``"general"``.
+            heading: Optional short label (e.g. ``"coffee"``) used as a
+                stable de-dupe key by the eval suite and curator.
+
+        Returns ``"Stored chunk N in 'domain'."`` on success.
+        """
+        chunk_id = knowledge_store.add_chunk(content, domain=domain, heading=heading)
+        if chunk_id is None:
+            return "Error: failed to store chunk (knowledge store unavailable)."
+        return f"Stored chunk {chunk_id} in {domain!r}."
+
+    @tool
+    async def memory_recall(query: str, k: int = 5) -> str:
+        """Search long-term memory for chunks relevant to ``query``.
+
+        Returns the top-k matches, one per line. Pull this when the
+        operator asks something where stored context is more reliable
+        than the model's own training data ("what's my coffee order?",
+        "remind me what we decided about the auth migration").
+
+        Returns ``"No matches."`` when the store is empty or nothing
+        scores above the keyword threshold.
+        """
+        results = knowledge_store.search(query, k=k)
+        if not results:
+            return "No matches."
+        lines = []
+        for r in results:
+            lines.append(f"[{r.get('domain', '?')}] {r['preview']}")
+        return "\n".join(lines)
+
+    @tool
+    async def memory_list(domain: str | None = None, limit: int = 10) -> str:
+        """List the most recent chunks. Filter by domain when given.
+
+        Useful when the operator asks for recent activity ("what did I
+        log today?") or wants to inspect what the agent has stored.
+        """
+        chunks = knowledge_store.list_chunks(domain=domain, limit=limit)
+        if not chunks:
+            return f"No chunks in {domain or 'any domain'}."
+        lines = []
+        for c in chunks:
+            head = f"[{c.domain}]"
+            if c.heading:
+                head += f" {c.heading}:"
+            preview = (c.content or "")[:200]
+            lines.append(f"{c.created_at} {head} {preview}")
+        return "\n".join(lines)
+
+    @tool
+    async def memory_stats() -> str:
+        """Return chunk counts per domain. Useful for sanity checks."""
+        s = knowledge_store.stats()
+        if s.get("total", 0) == 0:
+            return "Knowledge store is empty."
+        lines = [f"Total: {s['total']}"]
+        for k, v in s.items():
+            if k == "total":
+                continue
+            lines.append(f"  {k}: {v}")
+        return "\n".join(lines)
+
+    @tool
+    async def daily_log(content: str) -> str:
+        """Append a daily-log entry for today.
+
+        Stored under ``domain='daily-log'`` with today's UTC date as
+        the heading, so the same day's entries cluster together for
+        ``memory_list(domain='daily-log')`` queries.
+        """
+        today = datetime.now(timezone.utc).date().isoformat()
+        chunk_id = knowledge_store.add_chunk(
+            content, domain="daily-log", heading=today,
+        )
+        if chunk_id is None:
+            return "Error: failed to write daily log entry."
+        return f"Logged ({today}): {content[:120]}"
+
+    return [memory_ingest, memory_recall, memory_list, memory_stats, daily_log]
+
+
 # ── registry ─────────────────────────────────────────────────────────────────
 
 
 def get_all_tools(knowledge_store=None):
     """Return every LangChain tool the lead agent + subagents can use.
 
-    ``knowledge_store`` is threaded through for agents that ship a
-    knowledge / memory subsystem (see ``graph/middleware/knowledge.py``
-    for the hook-in pattern). The template doesn't ship a store — the
-    parameter is kept so adding one later doesn't require touching
-    every call site.
+    When ``knowledge_store`` is provided, the memory tools are bound
+    to it and included. Forks that disable the store can pass
+    ``knowledge_store=None`` and the lead agent runs with the four
+    keyless tools only.
     """
-    return [echo, current_time, calculator, web_search, fetch_url]
+    tools = [current_time, calculator, web_search, fetch_url]
+    if knowledge_store is not None:
+        tools.extend(_build_memory_tools(knowledge_store))
+    return tools

From 4d0d4288e483fa464def76382f8bb4b469c97bc0 Mon Sep 17 00:00:00 2001
From: Josh Mabry <mabry1985@gmail.com>
Date: Mon, 27 Apr 2026 14:46:58 -0700
Subject: [PATCH 2/6] fix(review): address PR #155 CodeRabbit feedback
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Real bugs:
- evals/runner.py: teardown now runs in a finally block so seeded KB
  rows get cleaned up even when the verifier or client.ask() raises.
  expected_tools=[] now means "assert no tools fired" (was conflated
  with "no key" via the `or []` short-circuit, making the abstention
  case a no-op).
- evals/runner.py + tasks.json: added a `stream` runner kind so
  AgentClient.stream() is reachable from tasks.json — new
  streaming_status_updates case asserts the SSE event sequence.
- knowledge/store.py: PRAGMA journal_mode=WAL is now best-effort
  (read-only DBs no longer break _connect). FTS5 rebuild after
  schema install so an existing chunks table populated before FTS
  was added gets indexed. find_chunk_containing/delete_by_content
  reject empty/whitespace-only inputs to prevent LIKE '%%' wildcards
  from matching every row.

Hardening:
- tools/lg_tools.py: clamp memory_recall(k) to [1, 20] and
  memory_list(limit) to [1, 200] so the agent can't request
  arbitrarily large slices of the KB.

Doc cleanup:
- docs/guides/subagents.md: LangGraphConfig snippet had a stale
  "echo" reference; replaced with the new memory-tool list.
- docs/tutorials/first-tool.md: WORKER_CONFIG example now appends
  git_sha alongside the bundled defaults instead of replacing
  them and dropping the memory tools.
- docs/reference/starter-tools.md: "adding your own" snippet now
  preserves the conditional _build_memory_tools(knowledge_store)
  extension.
- tests/test_config_io.py: starter-tool contract assertion now
  also covers web_search.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 docs/guides/subagents.md        |   6 +-
 docs/reference/starter-tools.md |   7 +-
 docs/tutorials/first-tool.md    |   9 +-
 evals/runner.py                 | 155 +++++++++++++++++++++-----------
 evals/tasks.json                |  10 +++
 knowledge/store.py              |  33 ++++++-
 tests/test_config_io.py         |   1 +
 tools/lg_tools.py               |  10 ++-
 8 files changed, 167 insertions(+), 64 deletions(-)

diff --git a/docs/guides/subagents.md b/docs/guides/subagents.md
index 9fc1a9a..031cff6 100644
--- a/docs/guides/subagents.md
+++ b/docs/guides/subagents.md
@@ -56,7 +56,11 @@ The template's `LangGraphConfig` (in `graph/config.py`) has a `worker` field. Ad
 class LangGraphConfig:
     # ... existing fields ...
     worker: SubagentDef = field(default_factory=lambda: SubagentDef(
-        tools=["echo", "current_time", "calculator", "web_search", "fetch_url"],
+        tools=[
+            "current_time", "calculator", "web_search", "fetch_url",
+            "memory_ingest", "memory_recall", "memory_list", "memory_stats",
+            "daily_log",
+        ],
         max_turns=20,
     ))
     researcher: SubagentDef = field(default_factory=lambda: SubagentDef(
diff --git a/docs/reference/starter-tools.md b/docs/reference/starter-tools.md
index c0918b5..60d74d4 100644
--- a/docs/reference/starter-tools.md
+++ b/docs/reference/starter-tools.md
@@ -180,11 +180,14 @@ async def my_tool(required_arg: str, optional_arg: int = 5) -> str:
     return f"Success: {result}"
 ```
 
-Then append it to the list in `get_all_tools()`:
+Then append it to the keyless tool list in `get_all_tools()` — keep the conditional `_build_memory_tools(knowledge_store)` extension below it so the bundled memory tools still ship when a store is configured:
 
 ```python
 def get_all_tools(knowledge_store=None):
-    return [current_time, calculator, web_search, fetch_url, my_tool]
+    tools = [current_time, calculator, web_search, fetch_url, my_tool]
+    if knowledge_store is not None:
+        tools.extend(_build_memory_tools(knowledge_store))
+    return tools
 ```
 
 See [Write your first tool](/tutorials/first-tool) for the full walkthrough.
diff --git a/docs/tutorials/first-tool.md b/docs/tutorials/first-tool.md
index 87502d0..056a8e4 100644
--- a/docs/tutorials/first-tool.md
+++ b/docs/tutorials/first-tool.md
@@ -47,12 +47,17 @@ def get_all_tools(knowledge_store=None):
 
 ## 2. Allow the subagent to use it (optional)
 
-If you want the worker subagent to be able to call `git_sha`, add it to the allowlist in `graph/subagents/config.py`:
+If you want the worker subagent to be able to call `git_sha`, add it to the allowlist in `graph/subagents/config.py`. Append rather than replace — dropping the bundled defaults removes the worker's memory tools:
 
 ```python
 WORKER_CONFIG = SubagentConfig(
     # ...
-    tools=["current_time", "calculator", "web_search", "fetch_url", "git_sha"],
+    tools=[
+        "current_time", "calculator", "web_search", "fetch_url",
+        "memory_ingest", "memory_recall", "memory_list", "memory_stats",
+        "daily_log",
+        "git_sha",   # ← new
+    ],
     # ...
 )
 ```
diff --git a/evals/runner.py b/evals/runner.py
index ad5154b..522f830 100644
--- a/evals/runner.py
+++ b/evals/runner.py
@@ -129,7 +129,26 @@ async def _run_auth_check(client: AgentClient, case: dict) -> CaseResult:
 
 
 async def _run_ask(client: AgentClient, case: dict) -> CaseResult:
+    """Send via ``message/send`` + poll. Teardown always runs."""
+    return await _run_prompt_case(client, case, streaming=False)
+
+
+async def _run_stream(client: AgentClient, case: dict) -> CaseResult:
+    """Send via ``message/stream`` + SSE. Same assertion shape as ``ask``,
+    plus an optional ``expected_event_kinds`` list that asserts the SSE
+    stream surfaced the named event kinds (``status-update``, ``task``,
+    etc.) at least once."""
+    return await _run_prompt_case(client, case, streaming=True)
+
+
+async def _run_prompt_case(
+    client: AgentClient,
+    case: dict,
+    *,
+    streaming: bool,
+) -> CaseResult:
     # Pre-seed state via direct DB writes (model never sees this).
+    setup_applied = False
     if "setup" in case:
         err = verify.apply_setup(case["setup"])
         if err:
@@ -137,66 +156,93 @@ async def _run_ask(client: AgentClient, case: dict) -> CaseResult:
                 case["id"], case["category"], case["name"], False,
                 f"setup failed: {err}",
             )
+        setup_applied = True
 
-    since = verify.audit_now()
-    result: TaskResult = await client.ask(
-        case["prompt"], timeout_s=case.get("timeout_s", 90),
-    )
+    events: list[dict] = []
+    result: TaskResult | None = None
 
-    if result.state != "completed":
-        if "teardown" in case:
-            verify.apply_teardown(case["teardown"])
-        return CaseResult(
-            case["id"], case["category"], case["name"], False,
-            f"task state={result.state}; error={result.error or '(none)'}",
-            duration_ms=result.duration_ms,
-            raw={"text": result.text[:200]},
-        )
+    try:
+        since = verify.audit_now()
 
-    problems: list[str] = []
+        if streaming:
+            events, result = await client.stream(
+                case["prompt"], timeout_s=case.get("timeout_s", 90),
+            )
+        else:
+            result = await client.ask(
+                case["prompt"], timeout_s=case.get("timeout_s", 90),
+            )
 
-    # Tool firing assertions.
-    expected_tools = case.get("expected_tools") or []
-    if expected_tools:
-        await asyncio.sleep(0.3)  # let the audit log catch up
-        entries = verify.audit_entries_since(since)
-        require_success = case.get("tool_outcome", "success") == "success"
-        passed, detail = verify.assert_tools_fired(
-            entries, expected_tools, require_success=require_success,
+        if result is None or result.state != "completed":
+            state = result.state if result else "no-final-event"
+            error = (result.error if result else None) or "(none)"
+            duration = result.duration_ms if result else 0
+            text_preview = (result.text if result else "")[:200]
+            return CaseResult(
+                case["id"], case["category"], case["name"], False,
+                f"task state={state}; error={error}",
+                duration_ms=duration,
+                raw={"text": text_preview},
+            )
+
+        problems: list[str] = []
+
+        # Tool firing assertions. ``expected_tools is not None`` so an
+        # explicit empty list asserts that *no* tools fired (abstention
+        # cases). Missing key skips the audit check entirely.
+        expected_tools = case.get("expected_tools")
+        if expected_tools is not None:
+            await asyncio.sleep(0.3)  # let the audit log catch up
+            entries = verify.audit_entries_since(since)
+            require_success = case.get("tool_outcome", "success") == "success"
+            passed, detail = verify.assert_tools_fired(
+                entries, expected_tools, require_success=require_success,
+            )
+            if not passed:
+                problems.append(detail)
+
+        # Text pattern assertions (case-insensitive substrings).
+        text_lower = result.text.lower()
+        for pattern in case.get("expected_patterns") or []:
+            if pattern.lower() not in text_lower:
+                problems.append(f"missing pattern {pattern!r}")
+
+        # KB side-effect assertions.
+        vk = case.get("verify_kb") or {}
+        if "find_chunk_containing" in vk:
+            chunk = verify.find_chunk_containing(
+                vk["find_chunk_containing"], domain=vk.get("domain"),
+            )
+            if not chunk:
+                problems.append(f"no chunk containing {vk['find_chunk_containing']!r}")
+
+        # Streaming-only: assert the SSE event sequence surfaced the
+        # expected kinds at least once.
+        if streaming:
+            seen_kinds = {e.get("kind") for e in events}
+            for kind in case.get("expected_event_kinds") or []:
+                if kind not in seen_kinds:
+                    problems.append(f"missing SSE event kind {kind!r}; saw {sorted(seen_kinds)}")
+
+        detail = (
+            "; ".join(problems) if problems
+            else f"OK ({result.duration_ms}ms, {result.usage.get('total_tokens', '?')}t)"
         )
-        if not passed:
-            problems.append(detail)
-
-    # Text pattern assertions (case-insensitive substrings).
-    text_lower = result.text.lower()
-    for pattern in case.get("expected_patterns") or []:
-        if pattern.lower() not in text_lower:
-            problems.append(f"missing pattern {pattern!r}")
-
-    # KB side-effect assertions.
-    vk = case.get("verify_kb") or {}
-    if "find_chunk_containing" in vk:
-        chunk = verify.find_chunk_containing(
-            vk["find_chunk_containing"], domain=vk.get("domain"),
+        return CaseResult(
+            case["id"], case["category"], case["name"],
+            passed=not problems,
+            detail=detail,
+            duration_ms=result.duration_ms,
+            tokens=result.usage.get("total_tokens", 0) or 0,
+            raw={"reply": result.text[:300]},
         )
-        if not chunk:
-            problems.append(f"no chunk containing {vk['find_chunk_containing']!r}")
-
-    if "teardown" in case:
-        verify.apply_teardown(case["teardown"])
-
-    detail = (
-        "; ".join(problems) if problems
-        else f"OK ({result.duration_ms}ms, {result.usage.get('total_tokens', '?')}t)"
-    )
-    return CaseResult(
-        case["id"], case["category"], case["name"],
-        passed=not problems,
-        detail=detail,
-        duration_ms=result.duration_ms,
-        tokens=result.usage.get("total_tokens", 0) or 0,
-        raw={"reply": result.text[:300]},
-    )
+    finally:
+        # Teardown unconditionally — even when the task crashed or
+        # an assertion raised — so seeded KB rows never leak into the
+        # next case.
+        if setup_applied or "teardown" in case:
+            if "teardown" in case:
+                verify.apply_teardown(case["teardown"])
 
 
 # ── dispatch ────────────────────────────────────────────────────────────────
@@ -206,6 +252,7 @@ async def _run_ask(client: AgentClient, case: dict) -> CaseResult:
     "agent_card": _run_agent_card,
     "auth_check": _run_auth_check,
     "ask": _run_ask,
+    "stream": _run_stream,
 }
 
 
diff --git a/evals/tasks.json b/evals/tasks.json
index 14cdd16..d4b5389 100644
--- a/evals/tasks.json
+++ b/evals/tasks.json
@@ -17,6 +17,16 @@
     "bad_token": "definitely-not-the-real-token",
     "expect": {"status": 401}
   },
+  {
+    "id": "streaming_status_updates",
+    "category": "a2a-protocol",
+    "kind": "stream",
+    "name": "message/stream surfaces status-update events ending in final=true",
+    "prompt": "Hi.",
+    "expected_tools": [],
+    "expected_patterns": [],
+    "expected_event_kinds": ["status-update"]
+  },
 
   {
     "id": "abstain_no_tool",
diff --git a/knowledge/store.py b/knowledge/store.py
index a6aee7a..72d9353 100644
--- a/knowledge/store.py
+++ b/knowledge/store.py
@@ -162,7 +162,14 @@ def __init__(self, db_path: str | Path | None = None):
     def _connect(self) -> sqlite3.Connection:
         db = sqlite3.connect(str(self.path))
         db.row_factory = sqlite3.Row
-        db.execute("PRAGMA journal_mode=WAL")
+        # WAL is best-effort — read-only sqlite files (e.g. immutable
+        # mounts) reject the PRAGMA. The connection stays usable for
+        # reads; only writes will fail later, and those go through
+        # the per-method OperationalError guards.
+        try:
+            db.execute("PRAGMA journal_mode=WAL")
+        except sqlite3.OperationalError as exc:
+            log.debug("[knowledge] PRAGMA journal_mode=WAL skipped: %s", exc)
         return db
 
     def _init_db(self) -> None:
@@ -172,6 +179,16 @@ def _init_db(self) -> None:
             self._fts_available = _has_fts5(db)
             if self._fts_available:
                 db.executescript(_FTS_SCHEMA)
+                # Re-index any pre-existing rows. The CREATE TRIGGER
+                # statements only fire on subsequent inserts, so a DB
+                # populated before FTS was added would have an empty
+                # virtual table without this rebuild.
+                try:
+                    db.execute(
+                        "INSERT INTO chunks_fts(chunks_fts) VALUES('rebuild')"
+                    )
+                except sqlite3.OperationalError as exc:
+                    log.debug("[knowledge] FTS rebuild skipped: %s", exc)
             else:
                 log.info(
                     "[knowledge] FTS5 unavailable — search will use LIKE fallback"
@@ -399,8 +416,12 @@ def find_chunk_containing(
         """Return the most-recent chunk whose content or heading contains ``text``.
 
         Used by the eval runner to assert side-effect outcomes after a
-        memory-writing turn.
+        memory-writing turn. Empty / whitespace-only ``text`` returns
+        ``None`` rather than building a ``LIKE '%%'`` predicate that
+        would match every row.
         """
+        if not text or not text.strip():
+            return None
         db = self._get_db()
         if db is None:
             return None
@@ -423,7 +444,13 @@ def find_chunk_containing(
         return Chunk(**dict(row)) if row else None
 
     def delete_by_content(self, contains: str) -> int:
-        """Delete chunks whose content matches ``%contains%``. Returns count."""
+        """Delete chunks whose content matches ``%contains%``. Returns count.
+
+        Empty / whitespace-only ``contains`` is a no-op — the alternative
+        is ``DELETE WHERE content LIKE '%%'`` which wipes every row.
+        """
+        if not contains or not contains.strip():
+            return 0
         db = self._get_db()
         if db is None:
             return 0
diff --git a/tests/test_config_io.py b/tests/test_config_io.py
index 39fc017..caf0bb2 100644
--- a/tests/test_config_io.py
+++ b/tests/test_config_io.py
@@ -327,6 +327,7 @@ def test_list_available_tools_returns_starter_set():
     # contract is "return tool names in a stable list".
     assert "current_time" in names
     assert "calculator" in names
+    assert "web_search" in names
     assert "fetch_url" in names
     assert all(isinstance(n, str) for n in names)
 
diff --git a/tools/lg_tools.py b/tools/lg_tools.py
index b59ccba..fd2af1a 100644
--- a/tools/lg_tools.py
+++ b/tools/lg_tools.py
@@ -277,6 +277,10 @@ def _extract_text_from_html(content: bytes) -> str:
 # process.
 
 
+_MEMORY_RECALL_MAX_K = 20
+_MEMORY_LIST_MAX_LIMIT = 200
+
+
 def _build_memory_tools(knowledge_store):
     """Bind memory tools to a ``KnowledgeStore``. Returns a list."""
     from datetime import datetime, timezone
@@ -320,7 +324,8 @@ async def memory_recall(query: str, k: int = 5) -> str:
         Returns ``"No matches."`` when the store is empty or nothing
         scores above the keyword threshold.
         """
-        results = knowledge_store.search(query, k=k)
+        clamped_k = max(1, min(int(k), _MEMORY_RECALL_MAX_K))
+        results = knowledge_store.search(query, k=clamped_k)
         if not results:
             return "No matches."
         lines = []
@@ -335,7 +340,8 @@ async def memory_list(domain: str | None = None, limit: int = 10) -> str:
         Useful when the operator asks for recent activity ("what did I
         log today?") or wants to inspect what the agent has stored.
         """
-        chunks = knowledge_store.list_chunks(domain=domain, limit=limit)
+        clamped_limit = max(1, min(int(limit), _MEMORY_LIST_MAX_LIMIT))
+        chunks = knowledge_store.list_chunks(domain=domain, limit=clamped_limit)
         if not chunks:
             return f"No chunks in {domain or 'any domain'}."
         lines = []

From cab3bd8d0f57d8df3ad98c25345cc36df2a11e68 Mon Sep 17 00:00:00 2001
From: Claude <noreply@anthropic.com>
Date: Mon, 27 Apr 2026 22:02:53 +0000
Subject: [PATCH 3/6] fix(review-2): address round-2 PR #155 CodeRabbit
 feedback

- evals/runner.py: collapse redundant nested teardown guard into a
  single `if "teardown" in case:` (SIM102); remove now-unused
  `setup_applied` flag
- knowledge/store.py: use `datetime.UTC` alias (Python 3.11+, UP017)
- tools/lg_tools.py: add `-> list` return annotation to
  `_build_memory_tools` (ANN202); replace explicit loop with list
  comprehension in `memory_recall` (PERF401)

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>

https://claude.ai/code/session_01148o8ppbuQwuZBsVGTQWwQ
---
 evals/runner.py    | 7 ++-----
 knowledge/store.py | 4 ++--
 tools/lg_tools.py  | 6 ++----
 3 files changed, 6 insertions(+), 11 deletions(-)

diff --git a/evals/runner.py b/evals/runner.py
index 522f830..d767f90 100644
--- a/evals/runner.py
+++ b/evals/runner.py
@@ -148,7 +148,6 @@ async def _run_prompt_case(
     streaming: bool,
 ) -> CaseResult:
     # Pre-seed state via direct DB writes (model never sees this).
-    setup_applied = False
     if "setup" in case:
         err = verify.apply_setup(case["setup"])
         if err:
@@ -156,7 +155,6 @@ async def _run_prompt_case(
                 case["id"], case["category"], case["name"], False,
                 f"setup failed: {err}",
             )
-        setup_applied = True
 
     events: list[dict] = []
     result: TaskResult | None = None
@@ -240,9 +238,8 @@ async def _run_prompt_case(
         # Teardown unconditionally — even when the task crashed or
         # an assertion raised — so seeded KB rows never leak into the
         # next case.
-        if setup_applied or "teardown" in case:
-            if "teardown" in case:
-                verify.apply_teardown(case["teardown"])
+        if "teardown" in case:
+            verify.apply_teardown(case["teardown"])
 
 
 # ── dispatch ────────────────────────────────────────────────────────────────
diff --git a/knowledge/store.py b/knowledge/store.py
index 72d9353..62c5f54 100644
--- a/knowledge/store.py
+++ b/knowledge/store.py
@@ -31,7 +31,7 @@
 import re
 import sqlite3
 from dataclasses import dataclass
-from datetime import datetime, timezone
+from datetime import UTC, datetime
 from pathlib import Path
 from typing import Any
 
@@ -89,7 +89,7 @@ def _resolve_path(db_path: str | Path | None) -> Path:
 
 
 def _now_iso() -> str:
-    return datetime.now(timezone.utc).isoformat()
+    return datetime.now(UTC).isoformat()
 
 
 def _has_fts5(db: sqlite3.Connection) -> bool:
diff --git a/tools/lg_tools.py b/tools/lg_tools.py
index fd2af1a..161ddcb 100644
--- a/tools/lg_tools.py
+++ b/tools/lg_tools.py
@@ -281,7 +281,7 @@ def _extract_text_from_html(content: bytes) -> str:
 _MEMORY_LIST_MAX_LIMIT = 200
 
 
-def _build_memory_tools(knowledge_store):
+def _build_memory_tools(knowledge_store) -> list:
     """Bind memory tools to a ``KnowledgeStore``. Returns a list."""
     from datetime import datetime, timezone
 
@@ -328,9 +328,7 @@ async def memory_recall(query: str, k: int = 5) -> str:
         results = knowledge_store.search(query, k=clamped_k)
         if not results:
             return "No matches."
-        lines = []
-        for r in results:
-            lines.append(f"[{r.get('domain', '?')}] {r['preview']}")
+        lines = [f"[{r.get('domain', '?')}] {r['preview']}" for r in results]
         return "\n".join(lines)
 
     @tool

From b3a9f1dae02730a13734614dfe8481717cc1c6c1 Mon Sep 17 00:00:00 2001
From: Josh Mabry <mabry1985@gmail.com>
Date: Mon, 27 Apr 2026 15:25:15 -0700
Subject: [PATCH 4/6] fix(review-3): address round-3 PR #155 CodeRabbit
 feedback
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Real bugs:
- evals/runner.py: setup is now inside the try block so a partial
  setup failure (e.g. step 2 of 3 errors) still triggers the
  finally teardown — rows from steps that did succeed no longer
  leak into the next case. Was flagged as a duplicate from round 2.
- knowledge/store.py: LIKE patterns now escape % and _ via
  ESCAPE '\' on every clause that takes user input
  (find_chunk_containing, delete_by_content, _search_like). A
  query for "100%" or "hello_world" no longer silently matches
  every row containing "100" or any single character between
  "hello" and "world".
- knowledge/store.py: FTS5 MATCH tokens are now double-quoted via
  _fts_quote() so user-supplied query terms can't smuggle FTS5
  operators (column filters, prefix wildcards, NEAR, AND/OR/NOT).
  Defence in depth — the [\w']+ tokenizer already filters most
  special chars.

Hardening:
- evals/runner.py: the fixed 0.3s asyncio.sleep waiting for the
  audit log to flush is gone. _await_audit_assertion now polls
  every 50ms up to a 2s deadline and returns as soon as the
  assertion passes — exits early on success, only burns the full
  deadline when the tool genuinely never fired.
- evals/runner.py: _run_auth_check accepts case["headers"] so cases
  can override the default bearer-only header set and exercise
  X-API-Key auth scenarios (or both auths together).
- knowledge/store.py: per-method exception handlers broadened from
  sqlite3.OperationalError to sqlite3.DatabaseError. Catches
  IntegrityError, ProgrammingError, and corruption variants too
  without crashing the agent loop. _has_fts5 (probe) and _connect
  (connection-time errors only) keep the narrower OperationalError.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 evals/runner.py    | 71 +++++++++++++++++++++++++++++++++---------
 knowledge/store.py | 77 ++++++++++++++++++++++++++++++++++++----------
 2 files changed, 116 insertions(+), 32 deletions(-)

diff --git a/evals/runner.py b/evals/runner.py
index d767f90..2fa148f 100644
--- a/evals/runner.py
+++ b/evals/runner.py
@@ -91,7 +91,14 @@ async def _run_agent_card(client: AgentClient, case: dict) -> CaseResult:
 
 
 async def _run_auth_check(client: AgentClient, case: dict) -> CaseResult:
-    """Verify the A2A endpoint rejects a bad bearer with the expected status."""
+    """Verify the A2A endpoint rejects a request with the expected status.
+
+    Default behaviour exercises bearer auth alone using ``case["bad_token"]``.
+    Cases can override headers via ``case["headers"]`` to test other
+    auth surfaces — e.g. ``{"X-API-Key": "wrong"}`` for the legacy
+    X-API-Key path. ``Content-Type: application/json`` is always set
+    for the eval client; case headers override anything else.
+    """
     import httpx
 
     expected_status = case.get("expect", {}).get("status", 401)
@@ -99,8 +106,8 @@ async def _run_auth_check(client: AgentClient, case: dict) -> CaseResult:
     headers = {
         "Content-Type": "application/json",
         "Authorization": f"Bearer {bad}",
-        # No X-API-Key — testing bearer alone.
     }
+    headers.update(case.get("headers") or {})
     payload = {
         "jsonrpc": "2.0",
         "id": "auth-check",
@@ -141,25 +148,61 @@ async def _run_stream(client: AgentClient, case: dict) -> CaseResult:
     return await _run_prompt_case(client, case, streaming=True)
 
 
+_AUDIT_POLL_DEADLINE_S = 2.0
+_AUDIT_POLL_INTERVAL_S = 0.05
+
+
+async def _await_audit_assertion(
+    since: str,
+    expected_tools: list[str],
+    *,
+    require_success: bool,
+) -> tuple[list[dict], bool, str]:
+    """Poll the audit log until ``expected_tools`` have all fired (or the
+    deadline is hit). Returns ``(entries, passed, detail)``.
+
+    Replaces a fixed ``asyncio.sleep`` — under audit-log contention the
+    fixed wait was sometimes shorter than the flush, causing flaky
+    tool-firing assertions. Polling exits as soon as the assertion
+    passes; the deadline only kicks in when the tool genuinely never
+    fired.
+    """
+    deadline = asyncio.get_event_loop().time() + _AUDIT_POLL_DEADLINE_S
+    entries: list[dict] = []
+    passed = False
+    detail = ""
+    while True:
+        entries = verify.audit_entries_since(since)
+        passed, detail = verify.assert_tools_fired(
+            entries, expected_tools, require_success=require_success,
+        )
+        if passed or asyncio.get_event_loop().time() >= deadline:
+            return entries, passed, detail
+        await asyncio.sleep(_AUDIT_POLL_INTERVAL_S)
+
+
 async def _run_prompt_case(
     client: AgentClient,
     case: dict,
     *,
     streaming: bool,
 ) -> CaseResult:
-    # Pre-seed state via direct DB writes (model never sees this).
-    if "setup" in case:
-        err = verify.apply_setup(case["setup"])
-        if err:
-            return CaseResult(
-                case["id"], case["category"], case["name"], False,
-                f"setup failed: {err}",
-            )
-
     events: list[dict] = []
     result: TaskResult | None = None
 
     try:
+        # Pre-seed state via direct DB writes (model never sees this).
+        # Inside the ``try`` so a partial setup failure still triggers
+        # the ``finally`` teardown — otherwise rows from the steps that
+        # *did* succeed would leak into the next case.
+        if "setup" in case:
+            err = verify.apply_setup(case["setup"])
+            if err:
+                return CaseResult(
+                    case["id"], case["category"], case["name"], False,
+                    f"setup failed: {err}",
+                )
+
         since = verify.audit_now()
 
         if streaming:
@@ -190,11 +233,9 @@ async def _run_prompt_case(
         # cases). Missing key skips the audit check entirely.
         expected_tools = case.get("expected_tools")
         if expected_tools is not None:
-            await asyncio.sleep(0.3)  # let the audit log catch up
-            entries = verify.audit_entries_since(since)
             require_success = case.get("tool_outcome", "success") == "success"
-            passed, detail = verify.assert_tools_fired(
-                entries, expected_tools, require_success=require_success,
+            entries, passed, detail = await _await_audit_assertion(
+                since, expected_tools, require_success=require_success,
             )
             if not passed:
                 problems.append(detail)
diff --git a/knowledge/store.py b/knowledge/store.py
index 62c5f54..473bedd 100644
--- a/knowledge/store.py
+++ b/knowledge/store.py
@@ -17,7 +17,8 @@
 - If the configured path is unwritable (running locally outside the
   container, no /sandbox), falls back to ``~/.protoagent/knowledge/agent.db``
   so a fresh ``python server.py`` works without sudo.
-- All write operations swallow ``sqlite3.OperationalError`` and log;
+- All write operations swallow ``sqlite3.DatabaseError`` (covers
+  OperationalError, IntegrityError, and corruption variants) and log;
   the store never crashes the agent loop on a corrupt or read-only DB.
 
 Forks that want embeddings on top of FTS5 can subclass and override
@@ -92,6 +93,36 @@ def _now_iso() -> str:
     return datetime.now(UTC).isoformat()
 
 
+# LIKE escaping — sqlite treats ``%`` and ``_`` as wildcards in LIKE
+# patterns. Without escaping, a search for ``"100%"`` matches every row
+# starting with ``"100"`` instead of literal "100%". We escape them
+# alongside the escape char itself, then bind ``ESCAPE '\'`` on every
+# LIKE clause that takes user input.
+_LIKE_ESCAPE = "\\"
+
+
+def _escape_like(text: str) -> str:
+    """Escape ``%``, ``_``, and the escape char for safe LIKE matching."""
+    return (
+        text
+        .replace(_LIKE_ESCAPE, _LIKE_ESCAPE + _LIKE_ESCAPE)
+        .replace("%", _LIKE_ESCAPE + "%")
+        .replace("_", _LIKE_ESCAPE + "_")
+    )
+
+
+def _fts_quote(token: str) -> str:
+    """Quote a token for FTS5 MATCH so it's treated as a literal phrase.
+
+    FTS5 has its own query syntax (column filters, prefix wildcards,
+    NEAR, AND/OR/NOT operators). Wrapping each token in double quotes
+    forces FTS5 to interpret it as a phrase token, neutralising any
+    operator characters the user happened to type. Internal double
+    quotes are doubled per FTS5 phrase rules.
+    """
+    return '"' + token.replace('"', '""') + '"'
+
+
 def _has_fts5(db: sqlite3.Connection) -> bool:
     try:
         db.execute(
@@ -187,7 +218,7 @@ def _init_db(self) -> None:
                     db.execute(
                         "INSERT INTO chunks_fts(chunks_fts) VALUES('rebuild')"
                     )
-                except sqlite3.OperationalError as exc:
+                except sqlite3.DatabaseError as exc:
                     log.debug("[knowledge] FTS rebuild skipped: %s", exc)
             else:
                 log.info(
@@ -195,7 +226,7 @@ def _init_db(self) -> None:
                 )
             db.commit()
             db.close()
-        except sqlite3.OperationalError as exc:
+        except sqlite3.DatabaseError as exc:
             log.error("[knowledge] schema init failed at %s: %s", self.path, exc)
 
     # Convenience for middleware that wants the raw connection. Kept
@@ -235,7 +266,7 @@ def add_chunk(
             )
             db.commit()
             return int(cur.lastrowid)
-        except sqlite3.OperationalError as exc:
+        except sqlite3.DatabaseError as exc:
             log.error("[knowledge] add_chunk failed: %s", exc)
             return None
         finally:
@@ -285,7 +316,7 @@ def search(
         try:
             rows = self._search_fts(db, query, k, domain) if self._fts_available \
                 else self._search_like(db, query, k, domain)
-        except sqlite3.OperationalError as exc:
+        except sqlite3.DatabaseError as exc:
             log.warning("[knowledge] search failed: %s", exc)
             rows = []
         finally:
@@ -310,10 +341,14 @@ def _search_fts(
     ) -> list[sqlite3.Row]:
         # Sanitize to FTS5-safe tokens; OR them so a multi-word query
         # matches any of the keywords (closer to LIKE behaviour).
+        # Each token is double-quoted so FTS5 treats it as a literal
+        # phrase rather than parsing operators (column filters, prefix
+        # wildcards, NEAR, etc.) — even though ``[\w']+`` already
+        # filters most special chars, defence in depth is cheap.
         tokens = [t for t in re.findall(r"[\w']+", query) if t]
         if not tokens:
             return []
-        match = " OR ".join(tokens)
+        match = " OR ".join(_fts_quote(t) for t in tokens)
         if domain:
             return db.execute(
                 "SELECT c.* FROM chunks_fts f "
@@ -341,14 +376,18 @@ def _search_like(
         if not tokens:
             return []
         # Score = number of tokens matched (rough recall-style ranking).
+        # User-supplied tokens are LIKE-escaped so a query containing
+        # ``%`` or ``_`` doesn't silently match every row; ESCAPE is
+        # bound on each clause.
         like_clauses = " + ".join(
-            "CASE WHEN content LIKE ? OR heading LIKE ? THEN 1 ELSE 0 END"
+            "CASE WHEN content LIKE ? ESCAPE ? OR heading LIKE ? ESCAPE ? "
+            "THEN 1 ELSE 0 END"
             for _ in tokens
         )
         params: list[Any] = []
         for t in tokens:
-            needle = f"%{t}%"
-            params.extend([needle, needle])
+            needle = f"%{_escape_like(t)}%"
+            params.extend([needle, _LIKE_ESCAPE, needle, _LIKE_ESCAPE])
         sql = (
             f"SELECT *, ({like_clauses}) AS score FROM chunks "
             "WHERE score > 0"
@@ -380,7 +419,7 @@ def list_chunks(
                     "SELECT * FROM chunks ORDER BY id DESC LIMIT ?",
                     (limit,),
                 ).fetchall()
-        except sqlite3.OperationalError as exc:
+        except sqlite3.DatabaseError as exc:
             log.warning("[knowledge] list_chunks failed: %s", exc)
             rows = []
         finally:
@@ -397,7 +436,7 @@ def stats(self) -> dict[str, int]:
                 "SELECT domain, COUNT(*) AS n FROM chunks GROUP BY domain ORDER BY n DESC"
             ).fetchall()
             total = db.execute("SELECT COUNT(*) FROM chunks").fetchone()[0]
-        except sqlite3.OperationalError as exc:
+        except sqlite3.DatabaseError as exc:
             log.warning("[knowledge] stats failed: %s", exc)
             return {"total": 0}
         finally:
@@ -426,17 +465,18 @@ def find_chunk_containing(
         if db is None:
             return None
         try:
+            needle = f"%{_escape_like(text)}%"
             sql = (
                 "SELECT * FROM chunks "
-                "WHERE (content LIKE ? OR heading LIKE ?)"
+                "WHERE (content LIKE ? ESCAPE ? OR heading LIKE ? ESCAPE ?)"
             )
-            params: list[Any] = [f"%{text}%", f"%{text}%"]
+            params: list[Any] = [needle, _LIKE_ESCAPE, needle, _LIKE_ESCAPE]
             if domain:
                 sql += " AND domain = ?"
                 params.append(domain)
             sql += " ORDER BY id DESC LIMIT 1"
             row = db.execute(sql, params).fetchone()
-        except sqlite3.OperationalError as exc:
+        except sqlite3.DatabaseError as exc:
             log.warning("[knowledge] find_chunk_containing failed: %s", exc)
             row = None
         finally:
@@ -455,10 +495,13 @@ def delete_by_content(self, contains: str) -> int:
         if db is None:
             return 0
         try:
-            cur = db.execute("DELETE FROM chunks WHERE content LIKE ?", (f"%{contains}%",))
+            cur = db.execute(
+                "DELETE FROM chunks WHERE content LIKE ? ESCAPE ?",
+                (f"%{_escape_like(contains)}%", _LIKE_ESCAPE),
+            )
             db.commit()
             return int(cur.rowcount)
-        except sqlite3.OperationalError as exc:
+        except sqlite3.DatabaseError as exc:
             log.warning("[knowledge] delete_by_content failed: %s", exc)
             return 0
         finally:
@@ -476,7 +519,7 @@ def delete_by_heading(self, domain: str, heading: str) -> int:
             )
             db.commit()
             return int(cur.rowcount)
-        except sqlite3.OperationalError as exc:
+        except sqlite3.DatabaseError as exc:
             log.warning("[knowledge] delete_by_heading failed: %s", exc)
             return 0
         finally:

From b713d8d0997320520d7a14e503fdb12d26fd282e Mon Sep 17 00:00:00 2001
From: Claude <noreply@anthropic.com>
Date: Mon, 27 Apr 2026 22:47:04 +0000
Subject: [PATCH 5/6] fix(review-4): address round-4 PR #155 CodeRabbit
 feedback

- evals/runner.py: use asyncio.get_running_loop() instead of the
  deprecated get_event_loop() inside the _await_audit_assertion coroutine
- evals/runner.py: prefix unused _entries return value with underscore
- evals/runner.py: use datetime.UTC alias (consistent with store.py),
  drop now-unused timezone import
- knowledge/store.py: broaden _get_db exception catch from
  OperationalError to DatabaseError so corrupt-DB errors are swallowed
  per the module's no-crash contract
- knowledge/store.py: replace log.error with log.exception in all three
  DatabaseError handlers (schema init, _get_db, add_chunk) so
  tracebacks appear in error logs

Co-Authored-By: Claude <claude@anthropic.com>

https://claude.ai/code/session_01YW5U6mtpLy4rzKmqd4trkH
---
 evals/runner.py    | 10 +++++-----
 knowledge/store.py | 12 ++++++------
 2 files changed, 11 insertions(+), 11 deletions(-)

diff --git a/evals/runner.py b/evals/runner.py
index 2fa148f..7b66d4d 100644
--- a/evals/runner.py
+++ b/evals/runner.py
@@ -36,7 +36,7 @@
 import sys
 import time
 from dataclasses import asdict, dataclass, field
-from datetime import datetime, timezone
+from datetime import UTC, datetime
 from pathlib import Path
 
 # Allow ``python -m evals.runner`` and ``python evals/runner.py``.
@@ -167,7 +167,7 @@ async def _await_audit_assertion(
     passes; the deadline only kicks in when the tool genuinely never
     fired.
     """
-    deadline = asyncio.get_event_loop().time() + _AUDIT_POLL_DEADLINE_S
+    deadline = asyncio.get_running_loop().time() + _AUDIT_POLL_DEADLINE_S
     entries: list[dict] = []
     passed = False
     detail = ""
@@ -176,7 +176,7 @@ async def _await_audit_assertion(
         passed, detail = verify.assert_tools_fired(
             entries, expected_tools, require_success=require_success,
         )
-        if passed or asyncio.get_event_loop().time() >= deadline:
+        if passed or asyncio.get_running_loop().time() >= deadline:
             return entries, passed, detail
         await asyncio.sleep(_AUDIT_POLL_INTERVAL_S)
 
@@ -234,7 +234,7 @@ async def _run_prompt_case(
         expected_tools = case.get("expected_tools")
         if expected_tools is not None:
             require_success = case.get("tool_outcome", "success") == "success"
-            entries, passed, detail = await _await_audit_assertion(
+            _entries, passed, detail = await _await_audit_assertion(
                 since, expected_tools, require_success=require_success,
             )
             if not passed:
@@ -337,7 +337,7 @@ def _print_board(results: list[CaseResult]) -> None:
 def _save_report(results: list[CaseResult], path: Path) -> None:
     path.parent.mkdir(parents=True, exist_ok=True)
     payload = {
-        "ts": datetime.now(timezone.utc).isoformat(),
+        "ts": datetime.now(UTC).isoformat(),
         "total": len(results),
         "passed": sum(1 for r in results if r.passed),
         "results": [asdict(r) for r in results],
diff --git a/knowledge/store.py b/knowledge/store.py
index 473bedd..d26d8a7 100644
--- a/knowledge/store.py
+++ b/knowledge/store.py
@@ -226,16 +226,16 @@ def _init_db(self) -> None:
                 )
             db.commit()
             db.close()
-        except sqlite3.DatabaseError as exc:
-            log.error("[knowledge] schema init failed at %s: %s", self.path, exc)
+        except sqlite3.DatabaseError:
+            log.exception("[knowledge] schema init failed at %s", self.path)
 
     # Convenience for middleware that wants the raw connection. Kept
     # private so the public API stays small.
     def _get_db(self) -> sqlite3.Connection | None:
         try:
             return self._connect()
-        except sqlite3.OperationalError as exc:
-            log.error("[knowledge] connect failed: %s", exc)
+        except sqlite3.DatabaseError:
+            log.exception("[knowledge] connect failed")
             return None
 
     # ── writes ──────────────────────────────────────────────────────────────
@@ -266,8 +266,8 @@ def add_chunk(
             )
             db.commit()
             return int(cur.lastrowid)
-        except sqlite3.DatabaseError as exc:
-            log.error("[knowledge] add_chunk failed: %s", exc)
+        except sqlite3.DatabaseError:
+            log.exception("[knowledge] add_chunk failed")
             return None
         finally:
             db.close()

From ae752a5e523c85714a48846eb9d53ee7e6b77b1c Mon Sep 17 00:00:00 2001
From: Claude <noreply@anthropic.com>
Date: Mon, 27 Apr 2026 22:48:12 +0000
Subject: [PATCH 6/6] chore: add uv.lock generated during round-4 review
 session

https://claude.ai/code/session_01YW5U6mtpLy4rzKmqd4trkH
---
 uv.lock | 8 ++++++++
 1 file changed, 8 insertions(+)
 create mode 100644 uv.lock

diff --git a/uv.lock b/uv.lock
new file mode 100644
index 0000000..7eae9e0
--- /dev/null
+++ b/uv.lock
@@ -0,0 +1,8 @@
+version = 1
+revision = 3
+requires-python = ">=3.11"
+
+[[package]]
+name = "protoagent"
+version = "0.2.1"
+source = { virtual = "." }