From f14b2fd41fbb4ff5f63e5768ae051d0fd25fa084 Mon Sep 17 00:00:00 2001 From: Automaker Date: Sun, 19 Apr 2026 14:36:08 -0700 Subject: [PATCH 01/24] chore: release v0.2.0 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit First tagged release. Contents of community-improvements project: M1 — Security Hardening (A2A bearer auth, audit redaction, origin verification) M2 — Memory On By Default (session persistence + load-on-start) M3 — Skill Loop (skill-v1 emission + SQLite FTS5 index + curator) Plus: .gitignore cleanup for .automaker-lock + .worktrees, docs coverage of security layer, skill-loop architecture, and new env vars. Manual bump because prepare-release.yml requires GH_PAT secret (not configured). Co-Authored-By: Claude Opus 4.7 (1M context) --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index e63ed8b..fc92a48 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "protoagent" -version = "0.1.0" +version = "0.2.0" description = "protoAgent — LangGraph + A2A template for spawning protoLabs agents" requires-python = ">=3.11" From e7a56208dd68eccf124d5ed2d06d6def5f5d1878 Mon Sep 17 00:00:00 2001 From: Ava Date: Sun, 19 Apr 2026 19:02:33 -0700 Subject: [PATCH 02/24] chore: release v0.2.1 Bug fixes from v0.2.0 smoke testing: - Agent card now advertises bearer scheme when A2A_AUTH_TOKEN is set - Session memory persistence actually fires (moved from unreachable on_session_end to after_agent) - Test suite collects cleanly in fresh Docker env - MemoryMiddleware activates standalone (without knowledge_store) Co-Authored-By: Claude Opus 4.7 (1M context) --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index c649139..39bdee2 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "protoagent" -version = "0.2.0" +version = "0.2.1" description = 
"protoAgent — LangGraph + A2A template for spawning protoLabs agents" requires-python = ">=3.11" From f1dcd3f368bd226a326dd18e9f9b82a084af77bd Mon Sep 17 00:00:00 2001 From: Josh Mabry <31560031+mabry1985@users.noreply.github.com> Date: Tue, 21 Apr 2026 20:55:50 -0700 Subject: [PATCH 03/24] chore(ci): update repo homepage after docs deploy (#149) Writes the deployed GitHub Pages URL back to the repo's `homepage` field so it renders in the About sidebar on the repo page. Co-authored-by: Automaker Co-authored-by: Claude Opus 4.7 (1M context) --- .github/workflows/docs.yml | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml index a47454c..c619d94 100644 --- a/.github/workflows/docs.yml +++ b/.github/workflows/docs.yml @@ -10,6 +10,7 @@ permissions: contents: read pages: write id-token: write + administration: write concurrency: group: pages @@ -39,3 +40,7 @@ jobs: steps: - id: deployment uses: actions/deploy-pages@v4 + - name: Update repo homepage + run: gh api -X PATCH repos/${{ github.repository }} -f homepage="${{ steps.deployment.outputs.page_url }}" + env: + GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} From f49c0126a09e91da3ba0a9ac05f2726cb272284c Mon Sep 17 00:00:00 2001 From: Josh Mabry Date: Wed, 22 Apr 2026 17:05:48 -0700 Subject: [PATCH 04/24] feat(ui): live-edit config drawer with model discovery + SOUL.md editor Elevates langgraph-config.yaml + SOUL.md into a typed form inside the Gradio sidebar so forks can iterate on model / temperature / tools / middleware / persona without a code edit + restart. Save rebuilds the compiled graph in place; in-flight turns finish on the prior graph. The model dropdown is populated from the connected gateway's `/v1/models` endpoint so forks always see what's actually available through the configured api_base + api_key, no hardcoded list to drift. 
graph/config_io.py is the new I/O layer: YAML round-trip preserves comments and unknown top-level sections (the shipped YAML's memory/skills blocks that the dataclass doesn't model), dual-location SOUL.md handling writes to both /sandbox/SOUL.md (runtime) and config/SOUL.md (source), and gateway model discovery returns a readable error string instead of raising when the endpoint is down. Also exposes GET/POST /api/config + GET /api/config/models for external control, and falls SOUL back to config/SOUL.md in graph/prompts.py so local dev without a /sandbox mount still picks up drawer edits. Co-Authored-By: Claude Opus 4.7 (1M context) --- .gitignore | 1 + chat_ui.py | 342 ++++++++++++++++++++++++++++++++++------ graph/config_io.py | 321 +++++++++++++++++++++++++++++++++++++ graph/prompts.py | 7 +- requirements.txt | 1 + server.py | 145 +++++++++++++++++ tests/test_config_io.py | 323 +++++++++++++++++++++++++++++++++++++ 7 files changed, 1094 insertions(+), 46 deletions(-) create mode 100644 graph/config_io.py create mode 100644 tests/test_config_io.py diff --git a/.gitignore b/.gitignore index cd98bb9..86ef2cd 100644 --- a/.gitignore +++ b/.gitignore @@ -3,6 +3,7 @@ __pycache__/ *.pyo .env .venv/ +.venv-*/ venv/ *.egg-info/ dist/ diff --git a/chat_ui.py b/chat_ui.py index 62fb697..e9fe105 100644 --- a/chat_ui.py +++ b/chat_ui.py @@ -214,60 +214,312 @@ def _build() -> gr.Blocks: gr.HTML(footer_html) # --- Settings sidebar --- + # Each section below is gated on the presence of its callback, + # so forks can opt in per panel. The Configuration panel (the + # live-editable drawer) renders when "get_config" + "save_all" + # are provided by the server. 
if settings: with gr.Sidebar(label="Settings", open=False, position="right"): - with gr.Accordion("Tools", open=False): - tools_display = gr.Markdown("Loading...") - refresh_tools_btn = gr.Button("Refresh", size="sm") - - with gr.Accordion("Model", open=False): - model_display = gr.Markdown("Loading...") - provider_dropdown = gr.Dropdown( - label="Provider", choices=[], interactive=True, + + # === Live configuration drawer ============================ + if "get_config" in settings and "save_all" in settings: + gr.Markdown( + "### Configuration\n" + "Edits are written to `config/langgraph-config.yaml` " + "and applied with a live graph rebuild — in-flight " + "turns finish on the previous config.", + ) + config_status = gr.Markdown("") + + with gr.Accordion("Model", open=True): + api_base_in = gr.Textbox( + label="API Base URL", + placeholder="http://gateway:4000/v1", + interactive=True, + ) + api_key_in = gr.Textbox( + label="API Key", + type="password", + placeholder="blank → use $OPENAI_API_KEY env", + interactive=True, + ) + with gr.Row(): + model_in = gr.Dropdown( + label="Model", + choices=[], + interactive=True, + allow_custom_value=True, + scale=4, + ) + fetch_models_btn = gr.Button( + "Fetch", size="sm", scale=1, min_width=60, + ) + model_fetch_status = gr.Markdown("") + temperature_in = gr.Slider( + label="Temperature", + minimum=0.0, maximum=2.0, step=0.05, + interactive=True, + ) + max_tokens_in = gr.Number( + label="Max Tokens", precision=0, + minimum=1, interactive=True, + ) + max_iter_in = gr.Slider( + label="Max Iterations", + minimum=1, maximum=200, step=1, + interactive=True, + ) + + with gr.Accordion("Worker Subagent", open=False): + worker_enabled_in = gr.Checkbox( + label="Enabled", interactive=True, + ) + worker_tools_in = gr.CheckboxGroup( + label="Tools", choices=[], interactive=True, + ) + worker_max_turns_in = gr.Number( + label="Max Turns", precision=0, + minimum=1, interactive=True, + ) + + with gr.Accordion("Middleware", open=False): + 
mw_knowledge_in = gr.Checkbox( + label="Knowledge", interactive=True, + ) + mw_audit_in = gr.Checkbox( + label="Audit", interactive=True, + ) + mw_memory_in = gr.Checkbox( + label="Memory", interactive=True, + ) + + with gr.Accordion("Knowledge Store", open=False): + kb_db_in = gr.Textbox( + label="DB Path", interactive=True, + ) + kb_embed_in = gr.Textbox( + label="Embed Model", interactive=True, + ) + kb_top_k_in = gr.Number( + label="Top K", precision=0, + minimum=1, interactive=True, + ) + + with gr.Accordion("Persona (SOUL.md)", open=False): + soul_in = gr.Textbox( + label="SOUL.md", lines=16, show_label=False, + interactive=True, + placeholder="Agent persona — loaded into every system prompt.", + ) + + with gr.Row(): + save_btn = gr.Button( + "Save & Reload", variant="primary", scale=2, + ) + reload_btn = gr.Button( + "Reload from Disk", variant="secondary", scale=1, + ) + + # Ordered list used for both load_all outputs and + # save_all inputs — keeps the wiring obvious and the + # two lists from drifting out of sync. + _config_components = [ + api_base_in, api_key_in, model_in, + temperature_in, max_tokens_in, max_iter_in, + worker_enabled_in, worker_tools_in, worker_max_turns_in, + mw_knowledge_in, mw_audit_in, mw_memory_in, + kb_db_in, kb_embed_in, kb_top_k_in, + soul_in, + ] + + def _load_all(): + cfg = settings["get_config"]() + soul = settings["get_soul"]() if "get_soul" in settings else "" + tools = settings["list_tools"]() if "list_tools" in settings else [] + + # Best-effort gateway probe. If it fails (offline, + # wrong key) we surface the error but keep the form + # populated with the saved model name — the user + # can still edit everything else. 
+ models, err = ([], "") + if "list_models" in settings: + try: + models, err = settings["list_models"]("", "") + except Exception as e: + err = str(e) + current_name = cfg["model"]["name"] + dropdown_choices = models if models else [current_name] + if current_name and current_name not in dropdown_choices: + dropdown_choices = [current_name, *dropdown_choices] + + fetch_msg = ( + f"✓ {len(models)} model(s) from gateway" + if models and not err + else f"⚠ {err}" if err else "" + ) + + worker = cfg["subagents"]["worker"] + return ( + cfg["model"]["api_base"], + cfg["model"]["api_key"], + gr.update(choices=dropdown_choices, value=current_name), + cfg["model"]["temperature"], + cfg["model"]["max_tokens"], + cfg["model"]["max_iterations"], + worker["enabled"], + gr.update(choices=tools, value=list(worker["tools"])), + worker["max_turns"], + cfg["middleware"]["knowledge"], + cfg["middleware"]["audit"], + cfg["middleware"]["memory"], + cfg["knowledge"]["db_path"], + cfg["knowledge"]["embed_model"], + cfg["knowledge"]["top_k"], + soul, + fetch_msg, + ) + + def _fetch_models(api_base, api_key): + if "list_models" not in settings: + return gr.update(), "⚠ list_models not wired" + try: + models, err = settings["list_models"](api_base, api_key) + except Exception as e: + return gr.update(), f"⚠ {e}" + if err: + return gr.update(), f"⚠ {err}" + return gr.update(choices=models), f"✓ {len(models)} model(s) from gateway" + + def _save( + api_base, api_key, model_name, + temperature, max_tokens, max_iter, + worker_enabled, worker_tools, worker_max_turns, + mw_knowledge, mw_audit, mw_memory, + kb_db, kb_embed, kb_top_k, + soul, + ): + new_config = { + "model": { + "api_base": api_base or "", + "api_key": api_key or "", + "name": model_name or "", + "temperature": float(temperature), + "max_tokens": int(max_tokens or 0), + "max_iterations": int(max_iter or 0), + }, + "subagents": { + "worker": { + "enabled": bool(worker_enabled), + "tools": list(worker_tools or []), + "max_turns": 
int(worker_max_turns or 0), + }, + }, + "middleware": { + "knowledge": bool(mw_knowledge), + "audit": bool(mw_audit), + "memory": bool(mw_memory), + }, + "knowledge": { + "db_path": kb_db or "", + "embed_model": kb_embed or "", + "top_k": int(kb_top_k or 1), + }, + } + try: + ok, msg = settings["save_all"](new_config, soul or "") + except Exception as e: + return f"⚠ save failed: {e}" + return f"{'✓' if ok else '⚠'} {msg}" + + def _reload_only(): + try: + ok, msg = settings["save_all"](None, None) + except Exception as e: + return f"⚠ reload failed: {e}" + return f"{'✓' if ok else '⚠'} {msg}" + + app.load( + fn=_load_all, + outputs=[*_config_components, model_fetch_status], + ) + fetch_models_btn.click( + fn=_fetch_models, + inputs=[api_base_in, api_key_in], + outputs=[model_in, model_fetch_status], + ) + save_btn.click( + fn=_save, + inputs=_config_components, + outputs=[config_status], + ).then( + fn=_fetch_models, + inputs=[api_base_in, api_key_in], + outputs=[model_in, model_fetch_status], ) - switch_status = gr.Markdown("") - refresh_model_btn = gr.Button("Refresh", size="sm") + reload_btn.click( + fn=_reload_only, outputs=[config_status], + ).then( + fn=_load_all, + outputs=[*_config_components, model_fetch_status], + ) + + # === Legacy read-only panels (opt-in via their own keys) == + if "get_tools_list" in settings: + with gr.Accordion("Tools", open=False): + tools_display = gr.Markdown("Loading...") + refresh_tools_btn = gr.Button("Refresh", size="sm") + + def load_tools(): + return settings["get_tools_list"]() + + app.load(fn=load_tools, outputs=[tools_display]) + refresh_tools_btn.click(fn=load_tools, outputs=[tools_display]) + + if "get_model_info" in settings: + with gr.Accordion("Model Status", open=False): + model_display = gr.Markdown("Loading...") + refresh_model_btn = gr.Button("Refresh", size="sm") + + provider_dropdown = None + switch_status = None + if "get_provider_choices" in settings: + provider_dropdown = gr.Dropdown( + label="Provider", 
choices=[], interactive=True, + ) + switch_status = gr.Markdown("") + + def load_model(): + return settings["get_model_info"]() + + app.load(fn=load_model, outputs=[model_display]) + refresh_model_btn.click(fn=load_model, outputs=[model_display]) + + if provider_dropdown is not None: + def load_provider_choices(): + choices = settings["get_provider_choices"]() + current = settings["get_current_provider"]() + return gr.update(choices=choices, value=current) + + def switch_provider(choice): + return settings["switch_provider"](choice) + + def load_subtitle(): + return settings["get_subtitle"]() + + app.load(fn=load_provider_choices, outputs=[provider_dropdown]) + provider_dropdown.change( + fn=switch_provider, + inputs=[provider_dropdown], + outputs=[switch_status], + ).then(fn=load_model, outputs=[model_display]).then( + fn=load_subtitle, outputs=[header_md], + ) if "get_knowledge_stats" in settings: with gr.Accordion("Knowledge Base", open=False): kb_display = gr.Markdown("Loading...") refresh_kb_btn = gr.Button("Refresh", size="sm") - # --- Callbacks --- - - def load_tools(): - return settings["get_tools_list"]() - - def load_model(): - return settings["get_model_info"]() - - def load_provider_choices(): - choices = settings["get_provider_choices"]() - current = settings["get_current_provider"]() - return gr.update(choices=choices, value=current) - - def switch_provider(choice): - return settings["switch_provider"](choice) - - def load_subtitle(): - return settings["get_subtitle"]() - - app.load(fn=load_tools, outputs=[tools_display]) - app.load(fn=load_model, outputs=[model_display]) - app.load(fn=load_provider_choices, outputs=[provider_dropdown]) - - refresh_tools_btn.click(fn=load_tools, outputs=[tools_display]) - refresh_model_btn.click( - fn=load_model, outputs=[model_display] - ).then(fn=load_provider_choices, outputs=[provider_dropdown]) - - provider_dropdown.change( - fn=switch_provider, inputs=[provider_dropdown], outputs=[switch_status], - 
).then(fn=load_model, outputs=[model_display]).then( - fn=load_subtitle, outputs=[header_md], - ) - - if "get_knowledge_stats" in settings: def load_kb_stats(): return settings["get_knowledge_stats"]() diff --git a/graph/config_io.py b/graph/config_io.py new file mode 100644 index 0000000..e77a45c --- /dev/null +++ b/graph/config_io.py @@ -0,0 +1,321 @@ +"""Config I/O for the live-edit drawer in chat_ui.py. + +Three jobs: + +1. **YAML round-trip** that preserves comments and unknown keys in + ``config/langgraph-config.yaml``. ``LangGraphConfig.from_yaml`` + silently drops anything it doesn't know about, so writing back via + a freshly-constructed dataclass would wipe fork-added sections + (e.g. the ``memory`` / ``skills`` blocks the template already + ships). We use ruamel.yaml when available for comment preservation; + PyYAML is the fallback. + +2. **Two-location SOUL.md handling.** The runtime reads + ``/sandbox/SOUL.md`` (populated by ``entrypoint.sh`` at container + start). The source-of-truth lives at ``config/SOUL.md`` in the + repo. Drawer edits write to both so container restarts preserve + the change and local-dev runs without a ``/sandbox`` directory + still pick up the edit. + +3. **Gateway introspection.** ``list_gateway_models`` hits + ``{api_base}/models`` so the drawer's model dropdown reflects + whatever the connected LiteLLM gateway (or OpenAI-compat endpoint) + actually exposes — no hardcoded list to drift out of sync. 
+""" + +from __future__ import annotations + +import logging +import os +from io import StringIO +from pathlib import Path +from typing import Any + +from graph.config import LangGraphConfig + +log = logging.getLogger("protoagent.config_io") + +REPO_ROOT = Path(__file__).parent.parent +CONFIG_YAML_PATH = REPO_ROOT / "config" / "langgraph-config.yaml" +SOUL_SOURCE_PATH = REPO_ROOT / "config" / "SOUL.md" +SOUL_RUNTIME_PATH = Path("/sandbox/SOUL.md") + + +# --------------------------------------------------------------------------- +# YAML round-trip +# --------------------------------------------------------------------------- + +try: + from ruamel.yaml import YAML # type: ignore + + _ruamel = YAML(typ="rt") + _ruamel.preserve_quotes = True + _ruamel.indent(mapping=2, sequence=4, offset=2) + _HAS_RUAMEL = True +except ImportError: + _HAS_RUAMEL = False + + +def load_yaml_doc(path: Path = CONFIG_YAML_PATH) -> Any: + """Load the config YAML as a mutable document. + + With ruamel: returns a CommentedMap that preserves comments + + key order on subsequent dump. Without: returns a plain dict and + comments are lost on next save (a warning is logged once per + save so the operator knows). + """ + if not path.exists(): + return {} if not _HAS_RUAMEL else _ruamel.load("{}\n") + + with open(path) as f: + if _HAS_RUAMEL: + return _ruamel.load(f) or _ruamel.load("{}\n") + import yaml + return yaml.safe_load(f) or {} + + +def save_yaml_doc(doc: Any, path: Path = CONFIG_YAML_PATH) -> None: + """Persist the document. Creates parent dirs if needed.""" + path.parent.mkdir(parents=True, exist_ok=True) + if _HAS_RUAMEL: + with open(path, "w") as f: + _ruamel.dump(doc, f) + return + + log.warning( + "ruamel.yaml not installed — YAML comments in %s will not be " + "preserved on save. 
Add `ruamel.yaml>=0.18` to requirements.txt " + "to fix.", path, + ) + import yaml + with open(path, "w") as f: + yaml.safe_dump(doc, f, sort_keys=False, default_flow_style=False) + + +# --------------------------------------------------------------------------- +# Config dict <-> dataclass +# --------------------------------------------------------------------------- + +# Nested dotted path → LangGraphConfig attribute. +_FIELD_MAP: dict[str, str] = { + "model.provider": "model_provider", + "model.name": "model_name", + "model.api_base": "api_base", + "model.api_key": "api_key", + "model.temperature": "temperature", + "model.max_tokens": "max_tokens", + "model.max_iterations": "max_iterations", + "middleware.knowledge": "knowledge_middleware", + "middleware.audit": "audit_middleware", + "middleware.memory": "memory_middleware", + "knowledge.db_path": "knowledge_db_path", + "knowledge.embed_model": "embed_model", + "knowledge.top_k": "knowledge_top_k", +} + + +def config_to_dict(config: LangGraphConfig) -> dict[str, Any]: + """Serialize a LangGraphConfig into the nested dict shape the UI + works with. Mirrors the YAML schema so round-tripping is trivial. 
+ """ + return { + "model": { + "provider": config.model_provider, + "name": config.model_name, + "api_base": config.api_base, + "api_key": config.api_key, + "temperature": config.temperature, + "max_tokens": config.max_tokens, + "max_iterations": config.max_iterations, + }, + "subagents": { + "worker": { + "enabled": config.worker.enabled, + "tools": list(config.worker.tools), + "max_turns": config.worker.max_turns, + }, + }, + "middleware": { + "knowledge": config.knowledge_middleware, + "audit": config.audit_middleware, + "memory": config.memory_middleware, + }, + "knowledge": { + "db_path": config.knowledge_db_path, + "embed_model": config.embed_model, + "top_k": config.knowledge_top_k, + }, + } + + +def apply_updates_to_yaml(doc: Any, updates: dict[str, Any]) -> Any: + """Merge a nested updates dict into the loaded YAML document. + + Uses __setitem__ on whatever container ruamel loaded (CommentedMap + acts like dict), so comments / key order / unknown sections are + preserved. Keys that don't exist yet get added at the end of the + containing section. + """ + for section, values in updates.items(): + if not isinstance(values, dict): + doc[section] = values + continue + if section not in doc or not isinstance(doc.get(section), dict): + doc[section] = {} + for key, val in values.items(): + if isinstance(val, dict): + if key not in doc[section] or not isinstance(doc[section].get(key), dict): + doc[section][key] = {} + for inner_key, inner_val in val.items(): + doc[section][key][inner_key] = inner_val + else: + doc[section][key] = val + return doc + + +def validate_config_dict(updates: dict[str, Any]) -> tuple[bool, str]: + """Validate without persisting. Returns (ok, error-message). + + Catches type mismatches and obvious range errors before we touch + disk or rebuild the graph. 
+ """ + try: + model = updates.get("model", {}) + temp = float(model.get("temperature", 0.2)) + if not 0.0 <= temp <= 2.0: + return False, f"temperature must be 0.0–2.0, got {temp}" + max_tokens = int(model.get("max_tokens", 4096)) + if max_tokens < 1: + return False, f"max_tokens must be >= 1, got {max_tokens}" + max_iter = int(model.get("max_iterations", 50)) + if max_iter < 1: + return False, f"max_iterations must be >= 1, got {max_iter}" + + worker = updates.get("subagents", {}).get("worker", {}) + if worker: + max_turns = int(worker.get("max_turns", 20)) + if max_turns < 1: + return False, f"worker.max_turns must be >= 1, got {max_turns}" + tools = worker.get("tools", []) + if not isinstance(tools, list): + return False, "worker.tools must be a list" + + knowledge = updates.get("knowledge", {}) + if knowledge: + top_k = int(knowledge.get("top_k", 5)) + if top_k < 1: + return False, f"knowledge.top_k must be >= 1, got {top_k}" + except (TypeError, ValueError) as e: + return False, f"config validation: {e}" + return True, "" + + +# --------------------------------------------------------------------------- +# SOUL.md +# --------------------------------------------------------------------------- + + +def read_soul() -> str: + """Return the current persona text. + + Prefers the runtime path (``/sandbox/SOUL.md``) since that's what + ``graph/prompts.build_system_prompt`` actually reads; falls back + to the repo source so local-dev picks it up even when no sandbox + volume is mounted. + """ + for path in (SOUL_RUNTIME_PATH, SOUL_SOURCE_PATH): + if path.exists(): + return path.read_text(encoding="utf-8") + return "" + + +def write_soul(text: str) -> list[Path]: + """Write persona text to every reachable SOUL.md path. + + Always writes the repo source (``config/SOUL.md``). 
Additionally + writes the runtime path if its parent directory exists — in the + container ``/sandbox`` is created by Dockerfile; in local dev it + usually isn't, so we skip quietly instead of erroring. + + Returns the paths that were written for UI feedback. + """ + written: list[Path] = [] + SOUL_SOURCE_PATH.parent.mkdir(parents=True, exist_ok=True) + SOUL_SOURCE_PATH.write_text(text, encoding="utf-8") + written.append(SOUL_SOURCE_PATH) + + if SOUL_RUNTIME_PATH.parent.exists(): + SOUL_RUNTIME_PATH.write_text(text, encoding="utf-8") + written.append(SOUL_RUNTIME_PATH) + + return written + + +# --------------------------------------------------------------------------- +# Gateway model discovery +# --------------------------------------------------------------------------- + + +def list_gateway_models( + api_base: str, + api_key: str = "", + timeout: float = 10.0, +) -> tuple[list[str], str]: + """Fetch the model list from ``{api_base}/models``. + + Works against any OpenAI-compatible endpoint — LiteLLM gateway, + OpenAI proper, vLLM, Ollama with the OpenAI adapter. Returns + ``(model_ids, error_message)``. On success ``error_message`` is + empty; on failure model_ids is empty and the message is human- + readable. 
+ """ + import httpx + + if not api_base: + return [], "api_base is empty" + + key = api_key or os.environ.get("OPENAI_API_KEY", "") + url = api_base.rstrip("/") + "/models" + headers = {} + if key: + headers["Authorization"] = f"Bearer {key}" + + try: + with httpx.Client(timeout=timeout) as client: + resp = client.get(url, headers=headers) + except httpx.HTTPError as e: + return [], f"connection failed: {e}" + + if resp.status_code >= 400: + detail = resp.text[:200] if resp.text else "" + return [], f"HTTP {resp.status_code} from {url}: {detail}" + + try: + data = resp.json() + except ValueError: + return [], f"non-JSON response from {url}" + + items = data.get("data") if isinstance(data, dict) else None + if not isinstance(items, list): + return [], f"unexpected shape from {url} — no 'data' array" + + ids: list[str] = [] + for item in items: + if isinstance(item, dict): + model_id = item.get("id") or item.get("name") + if isinstance(model_id, str): + ids.append(model_id) + ids.sort() + return ids, "" + + +# --------------------------------------------------------------------------- +# Tool registry introspection +# --------------------------------------------------------------------------- + + +def list_available_tools(knowledge_store: Any = None) -> list[str]: + """Return every tool name the runtime would wire into the graph.""" + from tools.lg_tools import get_all_tools + + return [t.name for t in get_all_tools(knowledge_store)] diff --git a/graph/prompts.py b/graph/prompts.py index b948909..b26e296 100644 --- a/graph/prompts.py +++ b/graph/prompts.py @@ -49,8 +49,13 @@ def build_system_prompt( """ parts = [] - # 1. Identity + # 1. Identity — prefer the runtime workspace (entrypoint.sh copies + # config/SOUL.md to /sandbox/SOUL.md at container start). Fall back + # to the repo source so local `python server.py` runs without a + # /sandbox mount still pick up persona edits made via the drawer. 
soul = _read_file(f"{workspace}/SOUL.md") + if not soul: + soul = _read_file(Path(__file__).parent.parent / "config" / "SOUL.md") if soul: parts.append(soul) else: diff --git a/requirements.txt b/requirements.txt index 9cb6ff6..30ef46d 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,6 +4,7 @@ uvicorn>=0.30 langfuse>=3.0 prometheus-client>=0.20 pyyaml>=6.0 +ruamel.yaml>=0.18 # round-trip YAML that preserves comments in config/langgraph-config.yaml when the drawer writes back edits websockets>=12.0 # LangGraph agent backend diff --git a/server.py b/server.py index 2221b11..af6acb1 100644 --- a/server.py +++ b/server.py @@ -72,6 +72,120 @@ def _init_langgraph_agent(): log.info("LangGraph agent initialized (model: %s)", _graph_config.model_name) +def _reload_langgraph_agent() -> tuple[bool, str]: + """Rebuild the compiled graph from the latest config YAML. + + Called by the drawer's Save & Reload action and the + ``POST /api/config`` endpoint. Preserves the existing + ``_checkpointer`` so active session threads stay addressable + — a fresh MemorySaver would orphan every in-flight thread. + + Rebinding ``_graph`` is atomic in CPython; in-flight + ``astream_events`` iterators hold their own reference to the + prior graph and finish cleanly on the old instance. 
+ """ + global _graph, _graph_config + + from graph.agent import create_agent_graph + from graph.config import LangGraphConfig + + config_path = Path(__file__).parent / "config" / "langgraph-config.yaml" + try: + new_config = LangGraphConfig.from_yaml(config_path) + new_graph = create_agent_graph(new_config) + except Exception as e: + log.exception("[reload] rebuild failed: %s", e) + return False, f"reload failed: {e}" + + _graph_config = new_config + _graph = new_graph + log.info("LangGraph agent reloaded (model: %s)", _graph_config.model_name) + return True, f"reloaded • model={_graph_config.model_name}" + + +def _apply_settings_changes( + config: dict | None = None, + soul: str | None = None, +) -> tuple[bool, list[str]]: + """Persist config YAML + SOUL.md then reload the graph once. + + Passing ``None`` for either argument skips that write — a bare + call with both None acts as a pure reload (useful for picking up + external file edits). + """ + from graph.config_io import ( + apply_updates_to_yaml, + load_yaml_doc, + save_yaml_doc, + validate_config_dict, + write_soul, + ) + + messages: list[str] = [] + + if config is not None: + ok, err = validate_config_dict(config) + if not ok: + return False, [f"validation: {err}"] + try: + doc = load_yaml_doc() + apply_updates_to_yaml(doc, config) + save_yaml_doc(doc) + messages.append("config saved") + except Exception as e: + log.exception("[config] YAML write failed: %s", e) + return False, [f"config write: {e}"] + + if soul is not None: + try: + paths = write_soul(soul) + messages.append(f"SOUL saved ({len(paths)} path{'s' if len(paths) != 1 else ''})") + except Exception as e: + log.exception("[config] SOUL write failed: %s", e) + return False, [f"soul write: {e}"] + + ok, reload_msg = _reload_langgraph_agent() + messages.append(reload_msg) + return ok, messages + + +def _build_settings_callbacks() -> dict[str, Any]: + """Callbacks consumed by the Gradio Configuration drawer.""" + from graph.config_io import ( + 
config_to_dict, + list_available_tools, + list_gateway_models, + read_soul, + ) + + def get_config() -> dict[str, Any]: + return config_to_dict(_graph_config) + + def list_models(api_base: str = "", api_key: str = "") -> tuple[list[str], str]: + """UI-friendly model lookup. + + Uses the form-local api_base/api_key when the user is trying a + different endpoint before saving; falls back to the currently + loaded graph config so the initial render works without + arguments. + """ + base = api_base or (_graph_config.api_base if _graph_config else "") + key = api_key or (_graph_config.api_key if _graph_config else "") + return list_gateway_models(base, key) + + def save_all(config: dict | None, soul: str | None) -> tuple[bool, str]: + ok, messages = _apply_settings_changes(config=config, soul=soul) + return ok, " • ".join(messages) + + return { + "get_config": get_config, + "get_soul": read_soul, + "list_models": list_models, + "list_tools": list_available_tools, + "save_all": save_all, + } + + # --------------------------------------------------------------------------- # Chat backend — called by the A2A handler + OpenAI-compat endpoint # --------------------------------------------------------------------------- @@ -347,6 +461,7 @@ def _main(): subtitle="protoAgent", placeholder="Send a message...", pwa=True, + settings=_build_settings_callbacks(), ) import gradio as gr @@ -369,6 +484,36 @@ async def _api_chat(req: ChatRequest): parts = [m["content"] for m in result if m.get("role") == "assistant" and m.get("content")] return {"response": "\n\n".join(parts), "messages": result} + # --- Live config / SOUL editing ---------------------------------------- + # GET returns the current config + persona so external clients (the + # Gradio drawer is one; curl is another) can mirror what's running. + # POST accepts partial edits — pass only the sections you want to + # change. Reload is automatic. 
+ class ConfigReloadRequest(PydanticBaseModel): + config: dict | None = None + soul: str | None = None + + @fastapi_app.get("/api/config") + async def _api_get_config(): + from graph.config_io import config_to_dict, read_soul + return { + "config": config_to_dict(_graph_config), + "soul": read_soul(), + } + + @fastapi_app.post("/api/config") + async def _api_post_config(req: ConfigReloadRequest): + ok, messages = _apply_settings_changes(config=req.config, soul=req.soul) + return {"ok": ok, "messages": messages} + + @fastapi_app.get("/api/config/models") + async def _api_list_models(api_base: str = "", api_key: str = ""): + from graph.config_io import list_gateway_models + base = api_base or (_graph_config.api_base if _graph_config else "") + key = api_key or (_graph_config.api_key if _graph_config else "") + models, error = list_gateway_models(base, key) + return {"models": models, "error": error} + # --- OpenAI-compatible chat completions -------------------------------- # Lets this agent be registered as a model in the LiteLLM gateway / # OpenWebUI without any protocol adapter. diff --git a/tests/test_config_io.py b/tests/test_config_io.py new file mode 100644 index 0000000..ce31bf6 --- /dev/null +++ b/tests/test_config_io.py @@ -0,0 +1,323 @@ +"""Tests for graph/config_io.py — the plumbing behind the live-edit drawer. + +Critical invariants: + +- YAML round-trip preserves unknown top-level sections (forks add + these; silently dropping them on save would be a footgun). +- ``apply_updates_to_yaml`` mutates only the keys you pass and leaves + siblings alone. +- ``validate_config_dict`` catches range / type errors before disk + writes. +- ``read_soul`` / ``write_soul`` handles the dual-location contract + (/sandbox/SOUL.md as runtime, config/SOUL.md as source). +- ``list_gateway_models`` returns a readable error message rather + than raising — the UI shows this string directly. 
+""" + +from __future__ import annotations + +from pathlib import Path +from unittest.mock import MagicMock, patch + +import httpx +import pytest + + +# ── YAML round-trip ────────────────────────────────────────────────────────── + + +def test_yaml_round_trip_preserves_unknown_keys(tmp_path: Path) -> None: + """Forks add custom top-level sections (the shipped YAML already + has ``memory`` and ``skills`` that the dataclass doesn't model). + Round-tripping through load_yaml_doc + save_yaml_doc must leave + them intact.""" + from graph import config_io + + yaml_path = tmp_path / "langgraph-config.yaml" + yaml_path.write_text( + "model:\n" + " name: test-model\n" + " temperature: 0.5\n" + "memory:\n" + " path: /custom/memory\n" + " max_sessions: 42\n" + "custom_section:\n" + " arbitrary_key: arbitrary_value\n" + ) + + doc = config_io.load_yaml_doc(yaml_path) + config_io.save_yaml_doc(doc, yaml_path) + + reloaded = config_io.load_yaml_doc(yaml_path) + assert reloaded["memory"]["path"] == "/custom/memory" + assert reloaded["memory"]["max_sessions"] == 42 + assert reloaded["custom_section"]["arbitrary_key"] == "arbitrary_value" + + +def test_apply_updates_merges_shallowly(tmp_path: Path) -> None: + """Updating model.temperature must NOT clobber model.name or + other model.* fields.""" + from graph import config_io + + yaml_path = tmp_path / "c.yaml" + yaml_path.write_text( + "model:\n" + " name: original-model\n" + " temperature: 0.1\n" + " api_base: http://original\n" + ) + + doc = config_io.load_yaml_doc(yaml_path) + config_io.apply_updates_to_yaml(doc, {"model": {"temperature": 0.9}}) + config_io.save_yaml_doc(doc, yaml_path) + + reloaded = config_io.load_yaml_doc(yaml_path) + assert reloaded["model"]["name"] == "original-model" + assert reloaded["model"]["api_base"] == "http://original" + assert reloaded["model"]["temperature"] == 0.9 + + +def test_apply_updates_adds_missing_sections(tmp_path: Path) -> None: + from graph import config_io + + yaml_path = tmp_path / 
"c.yaml" + yaml_path.write_text("model:\n name: x\n") + doc = config_io.load_yaml_doc(yaml_path) + + config_io.apply_updates_to_yaml( + doc, + {"middleware": {"audit": True, "memory": False}}, + ) + + assert doc["middleware"]["audit"] is True + assert doc["middleware"]["memory"] is False + assert doc["model"]["name"] == "x" + + +def test_apply_updates_nested_worker(tmp_path: Path) -> None: + """subagents.worker.tools is a list, subagents.worker.enabled + is a bool — both must land in the right nested slot.""" + from graph import config_io + + yaml_path = tmp_path / "c.yaml" + yaml_path.write_text("subagents:\n worker:\n enabled: false\n") + doc = config_io.load_yaml_doc(yaml_path) + + config_io.apply_updates_to_yaml( + doc, + {"subagents": {"worker": {"enabled": True, "tools": ["echo", "calculator"]}}}, + ) + + assert doc["subagents"]["worker"]["enabled"] is True + assert list(doc["subagents"]["worker"]["tools"]) == ["echo", "calculator"] + + +# ── config_to_dict ─────────────────────────────────────────────────────────── + + +def test_config_to_dict_mirrors_yaml_shape() -> None: + """The UI works with the dict shape; the YAML schema uses the + same paths. 
Keep them in lockstep so round-tripping through + apply_updates_to_yaml works without path rewrites.""" + from graph.config import LangGraphConfig + from graph.config_io import config_to_dict + + cfg = LangGraphConfig() + d = config_to_dict(cfg) + + # Top-level schema surface + assert set(d.keys()) == {"model", "subagents", "middleware", "knowledge"} + assert d["model"]["name"] == cfg.model_name + assert d["model"]["temperature"] == cfg.temperature + assert d["subagents"]["worker"]["tools"] == list(cfg.worker.tools) + assert d["middleware"]["audit"] == cfg.audit_middleware + assert d["knowledge"]["top_k"] == cfg.knowledge_top_k + + +# ── validate_config_dict ───────────────────────────────────────────────────── + + +@pytest.mark.parametrize("bad_value,expected_error_fragment", [ + ({"model": {"temperature": 3.0}}, "temperature"), + ({"model": {"temperature": -0.1}}, "temperature"), + ({"model": {"max_tokens": 0}}, "max_tokens"), + ({"model": {"max_iterations": 0}}, "max_iterations"), + ({"subagents": {"worker": {"max_turns": 0}}}, "max_turns"), + ({"subagents": {"worker": {"tools": "not-a-list"}}}, "list"), + ({"knowledge": {"top_k": 0}}, "top_k"), +]) +def test_validate_rejects_bad_values(bad_value, expected_error_fragment): + from graph.config_io import validate_config_dict + ok, err = validate_config_dict(bad_value) + assert not ok + assert expected_error_fragment in err + + +def test_validate_accepts_happy_path(): + from graph.config_io import config_to_dict, validate_config_dict + from graph.config import LangGraphConfig + + ok, err = validate_config_dict(config_to_dict(LangGraphConfig())) + assert ok, err + + +# ── SOUL.md dual-path ──────────────────────────────────────────────────────── + + +def test_read_soul_falls_back_to_source(monkeypatch, tmp_path: Path) -> None: + """When /sandbox/SOUL.md doesn't exist (local dev), fall through + to the repo config dir so drawer edits are still visible.""" + from graph import config_io + + # Point the runtime path at 
an unreachable location so the source + # fallback is exercised. + fake_runtime = tmp_path / "nonexistent" / "SOUL.md" + fake_source = tmp_path / "SOUL-source.md" + fake_source.write_text("from source", encoding="utf-8") + + monkeypatch.setattr(config_io, "SOUL_RUNTIME_PATH", fake_runtime) + monkeypatch.setattr(config_io, "SOUL_SOURCE_PATH", fake_source) + + assert config_io.read_soul() == "from source" + + +def test_read_soul_prefers_runtime(monkeypatch, tmp_path: Path) -> None: + from graph import config_io + + runtime = tmp_path / "runtime" / "SOUL.md" + runtime.parent.mkdir() + runtime.write_text("runtime wins", encoding="utf-8") + source = tmp_path / "SOUL-source.md" + source.write_text("source loses", encoding="utf-8") + + monkeypatch.setattr(config_io, "SOUL_RUNTIME_PATH", runtime) + monkeypatch.setattr(config_io, "SOUL_SOURCE_PATH", source) + + assert config_io.read_soul() == "runtime wins" + + +def test_write_soul_writes_source_always(monkeypatch, tmp_path: Path) -> None: + """The source-of-truth write (config/SOUL.md) must always succeed; + the runtime write is best-effort (skipped when /sandbox missing).""" + from graph import config_io + + # Runtime points at a path whose parent doesn't exist — should skip + # gracefully. 
+ runtime = tmp_path / "no-sandbox-here" / "SOUL.md" + source = tmp_path / "src" / "SOUL.md" + + monkeypatch.setattr(config_io, "SOUL_RUNTIME_PATH", runtime) + monkeypatch.setattr(config_io, "SOUL_SOURCE_PATH", source) + + written = config_io.write_soul("hello world") + assert source in written + assert runtime not in written + assert source.read_text() == "hello world" + + +def test_write_soul_writes_both_when_runtime_parent_exists( + monkeypatch, tmp_path: Path, +) -> None: + from graph import config_io + + runtime_dir = tmp_path / "sandbox" + runtime_dir.mkdir() + runtime = runtime_dir / "SOUL.md" + source = tmp_path / "src" / "SOUL.md" + + monkeypatch.setattr(config_io, "SOUL_RUNTIME_PATH", runtime) + monkeypatch.setattr(config_io, "SOUL_SOURCE_PATH", source) + + written = config_io.write_soul("dual write") + assert runtime in written + assert source in written + assert runtime.read_text() == "dual write" + assert source.read_text() == "dual write" + + +# ── Gateway model listing ──────────────────────────────────────────────────── + + +def test_list_gateway_models_success(monkeypatch): + from graph import config_io + + fake_response = MagicMock() + fake_response.status_code = 200 + fake_response.json.return_value = { + "data": [ + {"id": "model-b"}, + {"id": "model-a"}, + {"id": "model-c"}, + ], + } + + fake_client = MagicMock() + fake_client.__enter__ = lambda self: fake_client + fake_client.__exit__ = lambda *args: None + fake_client.get.return_value = fake_response + + monkeypatch.setattr("httpx.Client", lambda **kw: fake_client) + + models, err = config_io.list_gateway_models("http://gateway:4000/v1", "test-key") + assert err == "" + assert models == ["model-a", "model-b", "model-c"] # sorted + called_url = fake_client.get.call_args[0][0] + assert called_url == "http://gateway:4000/v1/models" + + +def test_list_gateway_models_empty_base_returns_error(): + from graph.config_io import list_gateway_models + + models, err = list_gateway_models("", "key") + 
assert models == [] + assert "api_base" in err + + +def test_list_gateway_models_http_error(monkeypatch): + from graph import config_io + + fake_client = MagicMock() + fake_client.__enter__ = lambda self: fake_client + fake_client.__exit__ = lambda *args: None + fake_client.get.side_effect = httpx.ConnectError("no route to host") + + monkeypatch.setattr("httpx.Client", lambda **kw: fake_client) + + models, err = config_io.list_gateway_models("http://bad-host/v1") + assert models == [] + assert "connection failed" in err + + +def test_list_gateway_models_bad_status(monkeypatch): + from graph import config_io + + fake_response = MagicMock() + fake_response.status_code = 401 + fake_response.text = "unauthorized" + + fake_client = MagicMock() + fake_client.__enter__ = lambda self: fake_client + fake_client.__exit__ = lambda *args: None + fake_client.get.return_value = fake_response + + monkeypatch.setattr("httpx.Client", lambda **kw: fake_client) + + models, err = config_io.list_gateway_models("http://x/v1", "bad-key") + assert models == [] + assert "401" in err + + +# ── list_available_tools ───────────────────────────────────────────────────── + + +def test_list_available_tools_returns_starter_set(): + from graph.config_io import list_available_tools + + names = list_available_tools() + # Lock in the template's starter set — forks replace these but + # the drawer's CheckboxGroup populates from this call, so the + # contract is "return tool names in a stable list". 
+ assert "echo" in names + assert "calculator" in names + assert "current_time" in names + assert all(isinstance(n, str) for n in names) From a406cd171600012062299d6647e1afce53cd4011 Mon Sep 17 00:00:00 2001 From: Josh Mabry Date: Wed, 22 Apr 2026 17:34:40 -0700 Subject: [PATCH 05/24] feat: first-run setup wizard + autostart + SOUL presets MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Turns the "fork a template and edit code" onboarding into a download-and-run flow. A fresh clone boots without any env vars, lands in a 4-step wizard (Connect / Identity / Tools / Profile), and writes out config + SOUL.md + a .setup-complete marker on Launch — the chat UI then appears on the same page, drawer pre-populated with the wizard's values. Key pieces: - Wizard UI in chat_ui.py: visibility-toggled wizard pane vs chat pane, populated from the live config so re-runs pre-fill. 4 ship-with presets in config/soul-presets/ (generic-assistant, research, coding, blank) power the persona dropdown. - Lazy graph init in server.py: no model required at boot. The chat endpoints return a friendly "setup required" message until the wizard completes. After wizard save, the marker is flipped BEFORE the graph reload so the rebuild actually compiles (this order matters — earlier iteration reloaded before marking complete and left _graph=None). - Identity/auth/runtime sections added to LangGraphConfig so the wizard-captured name, operator, A2A token, and autostart flag round-trip through the existing YAML infrastructure. agent_name() resolver prefers YAML identity.name → env → "protoagent" so the agent card + OpenAI-compat model id reflect the wizard value without a process restart. - autostart.py: macOS LaunchAgent install/uninstall with Linux/Windows stubs. Captures sys.executable at install time so venv-based runs survive a reboot. Opt-in via wizard checkbox; toggle from drawer anytime. 
- Dockerfile: config volume declared so setup persistence survives docker run without a -v flag. - Docs: first-agent.md rewritten for clone→pip→run→wizard flow; old fork/sed/docker content moved to new customize-and-deploy.md guide. Tests: 29 passing (7 new — setup marker lifecycle, preset discovery, preset content shape, shipped starter presence). Co-Authored-By: Claude Opus 4.7 (1M context) --- .gitignore | 5 + Dockerfile | 12 +- README.md | 43 ++- TEMPLATE.md | 11 + autostart.py | 245 +++++++++++++++ chat_ui.py | 381 ++++++++++++++++++++++- config/soul-presets/blank.md | 24 ++ config/soul-presets/coding.md | 37 +++ config/soul-presets/generic-assistant.md | 33 ++ config/soul-presets/research.md | 35 +++ docs/.vitepress/config.mts | 3 +- docs/guides/customize-and-deploy.md | 97 ++++++ docs/guides/index.md | 5 +- docs/index.md | 6 +- docs/tutorials/first-agent.md | 101 +++--- graph/config.py | 27 ++ graph/config_io.py | 87 ++++++ server.py | 278 +++++++++++++++-- tests/test_config_io.py | 102 +++++- 19 files changed, 1411 insertions(+), 121 deletions(-) create mode 100644 autostart.py create mode 100644 config/soul-presets/blank.md create mode 100644 config/soul-presets/coding.md create mode 100644 config/soul-presets/generic-assistant.md create mode 100644 config/soul-presets/research.md create mode 100644 docs/guides/customize-and-deploy.md diff --git a/.gitignore b/.gitignore index 86ef2cd..d28c925 100644 --- a/.gitignore +++ b/.gitignore @@ -5,6 +5,11 @@ __pycache__/ .venv/ .venv-*/ venv/ + +# Local-run artifacts — autostart stdout/stderr logs + memory middleware +# fallback directory when /sandbox is not available. +logs/ +.proto/ *.egg-info/ dist/ build/ diff --git a/Dockerfile b/Dockerfile index a52723c..58b8164 100644 --- a/Dockerfile +++ b/Dockerfile @@ -26,7 +26,7 @@ RUN useradd -m -s /bin/bash -u ${SANDBOX_UID} sandbox # auth, add them here. The ddgs + beautifulsoup4 pair powers the # starter web_search / fetch_url tools; drop them if you strip those. 
RUN pip install --no-cache-dir \ - gradio httpx uvicorn langfuse prometheus-client pyyaml \ + gradio httpx uvicorn langfuse prometheus-client pyyaml ruamel.yaml \ langchain langchain-openai langgraph websockets \ ddgs beautifulsoup4 @@ -40,6 +40,16 @@ RUN chmod +x /opt/protoagent/entrypoint.sh RUN mkdir -p /sandbox /tmp/sandbox /sandbox/audit /sandbox/knowledge \ && chown -R sandbox:sandbox /sandbox /tmp/sandbox +# Make /opt/protoagent/config writable by the sandbox user so the +# drawer and setup wizard can persist edits from inside the container. +RUN chown -R sandbox:sandbox /opt/protoagent/config + +# Declare config as a volume so setup completion (``.setup-complete`` +# marker + any YAML / SOUL.md edits) survives ``docker run`` without +# a -v flag. Operators who want cross-host persistence still mount a +# named volume or host directory at /opt/protoagent/config. +VOLUME ["/opt/protoagent/config"] + ENV PYTHONPATH=/opt/protoagent USER sandbox diff --git a/README.md b/README.md index 6bf76ee..1a1b34c 100644 --- a/README.md +++ b/README.md @@ -12,9 +12,15 @@ close to a rewrite of `SOUL.md`, `graph/prompts.py`, and Quinn was the first agent built on this template — it's a good example of what a filled-in fork looks like end-to-end. -Start a new agent by clicking **"Use this template"** at the top -of the GitHub repo. See [TEMPLATE.md](./TEMPLATE.md) for the -step-by-step fork checklist. +**Try it in 5 minutes:** clone, `pip install -r requirements.txt`, +`python server.py`, open <http://localhost:7870>, and walk the +setup wizard — no forking, no `sed`, no Docker required to get +your first agent talking. See the [first-agent tutorial](./docs/tutorials/first-agent.md). + +**When you're ready to ship your own:** click **"Use this template"** +at the top of the GitHub repo, then follow [Customize & +deploy](./docs/guides/customize-and-deploy.md) for the fork / +rename / release-pipeline wiring. ## What you get out of the box @@ -31,28 +37,31 @@ step-by-step fork checklist.
| UI | `chat_ui.py`, `static/` | Gradio chat with PWA shell, dark theme, offline fallback | | Release pipeline | `.github/workflows/*.yml` | Autonomous semver bumps, GHCR image push, GitHub release with filtered notes, optional Discord post | -## Quickstart +## Quickstart — from zero to chatting in 5 minutes ```bash -# 1. Click "Use this template" on GitHub, or: -gh repo create protoLabsAI/my-agent \ - --template protoLabsAI/protoAgent \ - --public --clone - +# 1. Get the code (no fork needed for a first run) +git clone https://github.com/protoLabsAI/protoAgent.git my-agent cd my-agent -# 2. Rename the agent (one env var, read by server.py, metrics, tracing) -export AGENT_NAME=my-agent +# 2. Install deps into a venv +python -m venv .venv && source .venv/bin/activate +pip install -r requirements.txt -# 3. Boot the container -docker build -t my-agent:local . -docker run --rm -p 7870:7870 -e AGENT_NAME=my-agent my-agent:local +# 3. Run the server — no env vars required +python server.py -# 4. Hit the agent card -curl http://localhost:7870/.well-known/agent-card.json +# 4. Open the wizard — pick your endpoint, pick a model, name the +# agent, pick a persona preset, hit Launch. The chat UI appears +# on the same page. +open http://localhost:7870 ``` -See [TEMPLATE.md](./TEMPLATE.md) for the full fork checklist. +[First-agent tutorial](./docs/tutorials/first-agent.md) walks +through every wizard step with screenshots. + +Once you're happy and want to ship it as your own image in your +own GHCR: [Customize & deploy](./docs/guides/customize-and-deploy.md). ## Architecture diff --git a/TEMPLATE.md b/TEMPLATE.md index 2997277..31b1eb2 100644 --- a/TEMPLATE.md +++ b/TEMPLATE.md @@ -1,5 +1,16 @@ # Fork checklist +> **Most of what used to be in this file is now a runtime wizard** +> that runs on first page load. Model, tools, persona, name, auth, +> autostart — all captured without editing code. See +> [first-agent tutorial](./docs/tutorials/first-agent.md). 
+> +> This checklist is only for forks that want to ship their own +> container image under their own GitHub org — the structural +> changes the wizard can't do. For most of that, the new +> [Customize & deploy](./docs/guides/customize-and-deploy.md) +> guide is the canonical source. This file stays for back-compat. + You clicked "Use this template" (or ran `gh repo create --template`). Now what? diff --git a/autostart.py b/autostart.py new file mode 100644 index 0000000..7f45b79 --- /dev/null +++ b/autostart.py @@ -0,0 +1,245 @@ +"""OS-level autostart for the protoAgent server. + +Hooks the server into the OS so it launches on user login. Today +macOS is the only supported path (LaunchAgent plist); Linux and +Windows stubs return a clear "not yet supported" error so the +wizard surfaces that instead of silently failing. + +Design notes: + +- The source of truth for "should autostart be on?" is + ``runtime.autostart_on_boot`` in ``langgraph-config.yaml``. This + module only installs / removes the OS artifact — it doesn't + decide policy. The wizard and drawer toggle the YAML value and + call these functions to bring the OS state in sync. + +- ``sys.executable`` is captured at install time so reinstalling + after a venv rebuild picks up the new interpreter path. If a user + recreates their venv without reinstalling, the LaunchAgent keeps + pointing at the stale path and will fail at next login — noisy + log but not catastrophic. Documented in the docs. + +- Install is idempotent: ``install_autostart`` overwrites any + prior plist so the same file always reflects current state, no + stale LaunchAgents piling up. 
+""" + +from __future__ import annotations + +import platform +import shlex +import subprocess +import sys +from pathlib import Path + +REPO_ROOT = Path(__file__).parent.resolve() + + +# --------------------------------------------------------------------------- +# Public API +# --------------------------------------------------------------------------- + + +def autostart_supported() -> tuple[bool, str]: + """Is this platform a supported autostart target? + + Returns ``(True, "")`` on supported platforms, ``(False, reason)`` + otherwise. Wizard / drawer check this before offering the toggle. + """ + system = platform.system() + if system == "Darwin": + return True, "" + if system == "Linux": + return False, "Linux autostart (systemd user unit) not yet implemented" + if system == "Windows": + return False, "Windows autostart (Task Scheduler) not yet implemented" + return False, f"autostart not implemented for platform {system!r}" + + +def install_autostart(agent_name: str = "protoagent", port: int = 7870) -> tuple[bool, str]: + """Install the OS artifact that runs the server on user login. + + Returns ``(ok, message)``. On success, ``message`` is a short + human-readable note the UI can display; on failure it's the + actual error (permission denied, launchctl exit code, etc). + """ + ok, reason = autostart_supported() + if not ok: + return False, reason + + if platform.system() == "Darwin": + return _install_macos_launchagent(agent_name, port) + return False, "unreachable" # autostart_supported already rejected + + +def uninstall_autostart(agent_name: str = "protoagent") -> tuple[bool, str]: + """Remove the OS autostart artifact. Safe to call when nothing + is installed — returns success in that case. 
+ """ + ok, reason = autostart_supported() + if not ok: + return False, reason + + if platform.system() == "Darwin": + return _uninstall_macos_launchagent(agent_name) + return False, "unreachable" + + +def autostart_status(agent_name: str = "protoagent") -> dict: + """Report current on-disk state for diagnostics. + + The UI uses this to render accurate "autostart is currently + on/off" without having to remember what it last wrote. + """ + ok, reason = autostart_supported() + if not ok: + return {"supported": False, "installed": False, "reason": reason} + + if platform.system() == "Darwin": + plist = _macos_plist_path(agent_name) + return { + "supported": True, + "installed": plist.exists(), + "plist_path": str(plist), + "python": sys.executable, + "server_path": str(REPO_ROOT / "server.py"), + } + return {"supported": False, "installed": False, "reason": "unreachable"} + + +# --------------------------------------------------------------------------- +# macOS — LaunchAgent plist +# --------------------------------------------------------------------------- + + +def _macos_label(agent_name: str) -> str: + """Plist label — namespaced so it doesn't collide with system labels.""" + safe = agent_name.lower().replace(" ", "-") + return f"ai.protolabs.{safe}" + + +def _macos_plist_path(agent_name: str) -> Path: + home = Path.home() + return home / "Library" / "LaunchAgents" / f"{_macos_label(agent_name)}.plist" + + +def _install_macos_launchagent(agent_name: str, port: int) -> tuple[bool, str]: + """Write the plist and ``launchctl load`` it. + + Unload-then-load (rather than a bootstrap-replace dance) is the + simplest idempotent recipe that works across macOS versions. A + missing label on unload is a no-op. 
+ """ + python = sys.executable + server_py = REPO_ROOT / "server.py" + if not server_py.exists(): + return False, f"server.py not found at {server_py}" + + label = _macos_label(agent_name) + plist_path = _macos_plist_path(agent_name) + log_dir = REPO_ROOT / "logs" + log_dir.mkdir(parents=True, exist_ok=True) + + plist = _render_launchagent_plist( + label=label, + python=python, + server_py=str(server_py), + port=port, + working_dir=str(REPO_ROOT), + agent_name=agent_name, + stdout_log=str(log_dir / "autostart.out.log"), + stderr_log=str(log_dir / "autostart.err.log"), + ) + + plist_path.parent.mkdir(parents=True, exist_ok=True) + + # Unload any prior incarnation first — silently ok if absent. + subprocess.run( + ["launchctl", "unload", str(plist_path)], + capture_output=True, check=False, + ) + + plist_path.write_text(plist, encoding="utf-8") + + result = subprocess.run( + ["launchctl", "load", str(plist_path)], + capture_output=True, check=False, + ) + if result.returncode != 0: + err = (result.stderr.decode("utf-8", errors="replace") + or result.stdout.decode("utf-8", errors="replace") + or f"launchctl load exit={result.returncode}") + return False, f"plist written but launchctl load failed: {err.strip()}" + + return True, f"installed • {plist_path.name} • runs `{shlex.quote(python)} server.py` on login" + + +def _uninstall_macos_launchagent(agent_name: str) -> tuple[bool, str]: + plist_path = _macos_plist_path(agent_name) + if not plist_path.exists(): + return True, "autostart was not installed" + + subprocess.run( + ["launchctl", "unload", str(plist_path)], + capture_output=True, check=False, + ) + + try: + plist_path.unlink() + except OSError as e: + return False, f"failed to remove plist: {e}" + + return True, f"uninstalled • removed {plist_path.name}" + + +def _render_launchagent_plist( + *, + label: str, + python: str, + server_py: str, + port: int, + working_dir: str, + agent_name: str, + stdout_log: str, + stderr_log: str, +) -> str: + """Render the 
plist XML. Small enough to inline; escaping is + limited to the known-safe fields we control, so no XML-injection + surface to audit here. + """ + return f"""<?xml version="1.0" encoding="UTF-8"?> + <!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd"> + <plist version="1.0"> + <dict> + <key>Label</key> + <string>{label}</string> + <key>ProgramArguments</key> + <array> + <string>{python}</string> + <string>{server_py}</string> + <string>--port</string> + <string>{port}</string> + </array> + <key>WorkingDirectory</key> + <string>{working_dir}</string> + <key>EnvironmentVariables</key> + <dict> + <key>AGENT_NAME</key> + <string>{agent_name}</string> + <key>PATH</key> + <string>/usr/local/bin:/usr/bin:/bin:/opt/homebrew/bin</string> + </dict> + <key>RunAtLoad</key> + <true/> + <key>KeepAlive</key> + <dict> + <key>SuccessfulExit</key> + <false/> + </dict> + <key>StandardOutPath</key> + <string>{stdout_log}</string> + <key>StandardErrorPath</key> + <string>{stderr_log}</string> + </dict> + </plist> +""" diff --git a/chat_ui.py b/chat_ui.py index e9fe105..d332af6 100644 --- a/chat_ui.py +++ b/chat_ui.py @@ -181,6 +181,21 @@ def create_chat_app( _css = CLEAN_CSS + AGENT_DARK_CSS + extra_css _head = AGENT_PWA_HEAD if pwa else "" + # Determine first-run state. Fresh clones land in the wizard; + # subsequent boots go straight to chat unless the user explicitly + # triggers "Re-run setup" from the drawer. Settings dicts without + # the ``is_setup_complete`` key (older template forks) skip the + # wizard entirely — chat is always visible. + setup_done = True + if settings and "is_setup_complete" in settings: + try: + setup_done = bool(settings["is_setup_complete"]()) + except Exception: + setup_done = True # fail-open: don't trap forks in a broken wizard + wizard_enabled = bool( + settings and "finish_setup" in settings and "get_config" in settings + ) + def _build() -> gr.Blocks: with gr.Blocks( title=title.replace("*", "").strip(), @@ -197,29 +212,156 @@ def _build() -> gr.Blocks: header_md = gr.Markdown(header_text) - chatbot = gr.Chatbot(height=chat_height, show_label=False) + # === SETUP WIZARD PANE ===================================== + # Visible on first run (no .setup-complete marker), hidden + # after the user clicks Launch. All fields default from the + # current config so re-running the wizard doesn't start + # from zero.
+ wizard_pane = None + w_launch_btn = None + w_launch_status = None + w_inputs: list = [] + if wizard_enabled: + with gr.Column(visible=not setup_done) as wizard_pane: + gr.Markdown( + "# Welcome — let's set up your agent\n\n" + "Walk through the steps below and hit **Launch agent**. " + "You can revisit every one of these choices later from " + "the Configuration drawer. Nothing is persisted until " + "you click Launch." + ) + w_launch_status = gr.Markdown("") + + with gr.Accordion("1. Connect to your model", open=True): + w_api_base = gr.Textbox( + label="API base URL", + placeholder="e.g. https://api.openai.com/v1 or http://localhost:4000/v1", + interactive=True, + ) + w_api_key = gr.Textbox( + label="API key", + type="password", + placeholder="your OpenAI or gateway master key", + interactive=True, + ) + with gr.Row(): + w_test_btn = gr.Button( + "Test connection & fetch models", + variant="secondary", scale=3, + ) + w_test_status = gr.Markdown("") + w_model = gr.Dropdown( + label="Model", + choices=[], allow_custom_value=True, + interactive=True, + ) - with gr.Row(): - txt = gr.Textbox( - placeholder=placeholder, show_label=False, - scale=9, container=False, - ) - send_btn = gr.Button("Send", variant="primary", scale=1, min_width=80) + with gr.Accordion("2. Name your agent", open=False): + w_agent_name = gr.Textbox( + label="Agent name", + placeholder="short lowercase slug, e.g. product-director", + interactive=True, + ) + gr.Markdown( + "_This becomes the agent card name, OpenAI-compat " + "model id, and chat header. Metric prefix still " + "needs a process restart to pick up._" + ) + with gr.Row(): + w_preset = gr.Dropdown( + label="Persona preset (optional)", + choices=[], interactive=True, scale=3, + ) + w_load_preset_btn = gr.Button( + "Load preset into SOUL.md", + size="sm", scale=2, + ) + w_soul = gr.Textbox( + label="SOUL.md — the agent's persona", + lines=14, interactive=True, + placeholder=( + "Identity, personality, values, communication " + "style. 
Loaded into every system prompt." + ), + ) - with gr.Row(): - clear_btn = gr.Button("Clear", size="sm", variant="secondary") - new_btn = gr.Button("New Session", size="sm", variant="secondary") + with gr.Accordion("3. Tools & middleware", open=False): + w_tools = gr.CheckboxGroup( + label="Tools available to the agent", + choices=[], interactive=True, + ) + w_mw_audit = gr.Checkbox( + label="Audit middleware — logs every tool call", + value=True, interactive=True, + ) + w_mw_memory = gr.Checkbox( + label="Memory middleware — persists session summaries", + value=True, interactive=True, + ) + w_mw_knowledge = gr.Checkbox( + label="Knowledge middleware — requires a knowledge store (leave off for starter setups)", + value=False, interactive=True, + ) - if footer_html: - gr.HTML(footer_html) + with gr.Accordion("4. Optional — you, security, autostart", open=False): + w_operator = gr.Textbox( + label="Your name", + placeholder="so the agent can address you directly — blank = anonymous", + interactive=True, + ) + w_auth = gr.Textbox( + label="A2A bearer token", + type="password", + placeholder="set before exposing to a network; blank = open mode for local dev", + interactive=True, + ) + w_autostart = gr.Checkbox( + label="Launch this agent automatically on login", + value=False, interactive=True, + ) + w_autostart_note = gr.Markdown("") + + w_launch_btn = gr.Button( + "Launch agent", variant="primary", size="lg", + ) + + w_inputs = [ + w_api_base, w_api_key, w_model, + w_agent_name, w_soul, w_preset, + w_tools, w_mw_audit, w_mw_memory, w_mw_knowledge, + w_operator, w_auth, w_autostart, + ] + + # === CHAT PANE ============================================= + # Wrapped in a Column so visibility toggles in lockstep with + # the wizard. On fresh setup it starts hidden and the Launch + # button flips it on. 
+ with gr.Column(visible=setup_done) as chat_pane: + chatbot = gr.Chatbot(height=chat_height, show_label=False) + + with gr.Row(): + txt = gr.Textbox( + placeholder=placeholder, show_label=False, + scale=9, container=False, + ) + send_btn = gr.Button("Send", variant="primary", scale=1, min_width=80) + + with gr.Row(): + clear_btn = gr.Button("Clear", size="sm", variant="secondary") + new_btn = gr.Button("New Session", size="sm", variant="secondary") + + if footer_html: + gr.HTML(footer_html) # --- Settings sidebar --- # Each section below is gated on the presence of its callback, # so forks can opt in per panel. The Configuration panel (the # live-editable drawer) renders when "get_config" + "save_all" - # are provided by the server. + # are provided by the server. The drawer is hidden during the + # wizard so the user has one surface to look at at a time. + sidebar_block = None if settings: - with gr.Sidebar(label="Settings", open=False, position="right"): + with gr.Sidebar(label="Settings", open=False, position="right", visible=setup_done) as sidebar_block: # === Live configuration drawer ============================ if "get_config" in settings and "save_all" in settings: @@ -320,6 +462,22 @@ def _build() -> gr.Blocks: "Reload from Disk", variant="secondary", scale=1, ) + # "Re-run setup" re-opens the wizard with current + # values pre-populated — for re-picking a preset, + # swapping models, or resetting the autostart plist. + if "restart_setup" in settings and wizard_enabled: + with gr.Accordion("Re-run setup wizard", open=False): + gr.Markdown( + "_Reopens the wizard with all current " + "values pre-filled. Your config isn't " + "wiped — you're just re-visiting the " + "choices._" + ) + reset_setup_btn = gr.Button( + "Run wizard now", variant="secondary", + ) + reset_setup_status = gr.Markdown("") + # Ordered tuple used for both load_all outputs and # save_all inputs — keeps the wiring obvious and the # two lists from drifting out of sync. 
@@ -563,6 +721,201 @@ def get_response(history: list[dict], original_msg: str, sid: str): clear_btn.click(fn=lambda: ([], "default"), outputs=[chatbot, session_id]) new_btn.click(fn=lambda: ([], secrets.token_hex(4)), outputs=[chatbot, session_id]) + # --- Wizard callbacks ----------------------------------------- + if wizard_enabled: + def _load_wizard_defaults(): + """Seed every wizard field from the current on-disk + config. Returns updates in the exact order of + ``w_inputs`` plus the connection-test status + the + autostart note.""" + cfg = settings["get_config"]() if "get_config" in settings else {} + soul = settings["get_soul"]() if "get_soul" in settings else "" + tools = settings["list_tools"]() if "list_tools" in settings else [] + presets = settings["list_soul_presets"]() if "list_soul_presets" in settings else [] + + model = cfg.get("model", {}) + identity = cfg.get("identity", {}) + worker = cfg.get("subagents", {}).get("worker", {}) + mw = cfg.get("middleware", {}) + runtime = cfg.get("runtime", {}) + auth = cfg.get("auth", {}) + + current_model = model.get("name", "") + model_choices = [current_model] if current_model else [] + + autostart_msg = "" + if "autostart_info" in settings: + try: + info = settings["autostart_info"]() + except Exception as e: + info = {"supported": False, "reason": str(e)} + if info.get("supported"): + state = "installed" if info.get("installed") else "not installed" + autostart_msg = f"_Platform supported. 
Current state: **{state}**._" + else: + autostart_msg = f"⚠ {info.get('reason', 'not supported on this platform')}" + + return ( + model.get("api_base", ""), + model.get("api_key", ""), + gr.update(choices=model_choices, value=current_model), + identity.get("name", ""), + soul, + gr.update(choices=presets, value=None), + gr.update(choices=tools, value=list(worker.get("tools", []))), + bool(mw.get("audit", True)), + bool(mw.get("memory", True)), + bool(mw.get("knowledge", False)), + identity.get("operator", ""), + auth.get("token", ""), + bool(runtime.get("autostart_on_boot", False)), + "", # w_test_status + autostart_msg, + ) + + app.load( + fn=_load_wizard_defaults, + outputs=[*w_inputs, w_test_status, w_autostart_note], + ) + + # Connection test — fills the model dropdown + def _test_connection(api_base, api_key): + if "list_models" not in settings: + return gr.update(), "⚠ list_models callback not wired" + if not api_base: + return gr.update(), "⚠ enter an API base URL first" + try: + models, err = settings["list_models"](api_base, api_key) + except Exception as e: + return gr.update(), f"⚠ {e}" + if err: + return gr.update(), f"⚠ {err}" + pick = models[0] if models else None + return ( + gr.update(choices=models, value=pick), + f"✓ {len(models)} model(s) — picked **{pick}**, change if needed", + ) + + w_test_btn.click( + fn=_test_connection, + inputs=[w_api_base, w_api_key], + outputs=[w_model, w_test_status], + ) + + # Preset loader — pastes template text into SOUL textarea + def _load_preset(name): + if not name or "read_soul_preset" not in settings: + return gr.update() + try: + return settings["read_soul_preset"](name) + except Exception: + return gr.update() + + w_load_preset_btn.click( + fn=_load_preset, inputs=[w_preset], outputs=[w_soul], + ) + + # Launch button — write everything, mark complete, swap panes + def _finish_wizard( + api_base, api_key, model_name, + agent_name_val, soul, _preset_unused, + tools, mw_audit, mw_memory, mw_knowledge, + 
operator, auth_token, autostart, + ): + if not (api_base or "").strip(): + return ( + "⚠ API base URL is required — go back to step 1", + gr.update(), gr.update(), gr.update(), + ) + if not (model_name or "").strip(): + return ( + "⚠ pick a model — use the Test connection button in step 1", + gr.update(), gr.update(), gr.update(), + ) + if not (agent_name_val or "").strip(): + return ( + "⚠ agent name is required — step 2", + gr.update(), gr.update(), gr.update(), + ) + + new_config = { + "model": { + "api_base": api_base, + "api_key": api_key or "", + "name": model_name, + }, + "subagents": { + "worker": { + "enabled": True, + "tools": list(tools or []), + }, + }, + "middleware": { + "audit": bool(mw_audit), + "memory": bool(mw_memory), + "knowledge": bool(mw_knowledge), + }, + "identity": { + "name": agent_name_val.strip(), + "operator": (operator or "").strip(), + }, + "auth": {"token": auth_token or ""}, + "runtime": {"autostart_on_boot": bool(autostart)}, + } + try: + ok, msg = settings["finish_setup"](new_config, soul or "") + except Exception as e: + return ( + f"⚠ setup failed: {e}", + gr.update(), gr.update(), gr.update(), + ) + if ok: + return ( + f"✓ {msg}", + gr.update(visible=False), # wizard_pane + gr.update(visible=True), # chat_pane + gr.update(visible=True), # sidebar_block + ) + return ( + f"⚠ {msg}", + gr.update(), gr.update(), gr.update(), + ) + + w_launch_btn.click( + fn=_finish_wizard, + inputs=w_inputs, + outputs=[w_launch_status, wizard_pane, chat_pane, + sidebar_block if sidebar_block is not None else w_launch_status], + ) + + # "Re-run setup" in the drawer flips panes back to wizard + if "restart_setup" in settings: + def _trigger_rerun(): + try: + msg = settings["restart_setup"]() + except Exception as e: + return ( + f"⚠ {e}", + gr.update(), gr.update(), gr.update(), + ) + return ( + f"✓ {msg}", + gr.update(visible=True), # wizard_pane + gr.update(visible=False), # chat_pane + gr.update(visible=False), # sidebar_block + ) + + 
reset_setup_btn.click( + fn=_trigger_rerun, + outputs=[ + reset_setup_status, wizard_pane, chat_pane, + sidebar_block if sidebar_block is not None else reset_setup_status, + ], + ).then( + fn=_load_wizard_defaults, + outputs=[*w_inputs, w_test_status, w_autostart_note], + ) + return app app = _build() diff --git a/config/soul-presets/blank.md b/config/soul-presets/blank.md new file mode 100644 index 0000000..e53908c --- /dev/null +++ b/config/soul-presets/blank.md @@ -0,0 +1,24 @@ +# Identity + +_Describe your agent in one paragraph — who it is, who it +reports to, what domain it owns._ + +# Personality + +_3–6 traits. Affects the tone of every response._ + +# Values + +_Rules that shape judgement calls. Example: "never modify +production data while investigating."_ + +# Communication style + +_How output is formatted — markdown, plain text, JSON, Discord +embeds. How long responses should be by default._ + +# Capabilities + +_What tools are available and when to reach for each. The tool +docstrings are already in context; this is where you explain +the higher-level decision procedure._ diff --git a/config/soul-presets/coding.md b/config/soul-presets/coding.md new file mode 100644 index 0000000..1ad23d1 --- /dev/null +++ b/config/soul-presets/coding.md @@ -0,0 +1,37 @@ +# Identity + +I am a coding agent. I read code, explain it, suggest changes, and +write code when asked — grounded in what the codebase actually +does, not in what a general-purpose model might guess. + +# Personality + +- Precise — file paths, line numbers, exact identifiers. Never + "somewhere in the auth module." +- Conservative on edits — the smallest change that solves the + problem. I don't refactor surrounding code as a bonus. +- Root-cause oriented — when something breaks, I find the cause + before patching the symptom. + +# Communication style + +- Short prose, code in code fences, one clear recommendation. +- For any file reference, include the path and the relevant + lines. 
The operator shouldn't have to hunt. +- When I suggest a change, explain the *why* in one sentence. + Reserve multi-paragraph explanations for genuinely subtle cases. + +# When to reach for tools + +- `fetch_url` for official docs when the question is + library-specific and the model's training data may be stale. +- `web_search` for error messages with distinctive strings to + find similar reports. +- `calculator` for bit math, offsets, sizing. + +# Values + +- No speculation. If I haven't read the file, I say so before + making claims about it. +- A clean diff beats a clever one. Readability is a feature. +- Tests are evidence. A bug without a failing test is unverified. diff --git a/config/soul-presets/generic-assistant.md b/config/soul-presets/generic-assistant.md new file mode 100644 index 0000000..58e6459 --- /dev/null +++ b/config/soul-presets/generic-assistant.md @@ -0,0 +1,33 @@ +# Identity + +I am an AI assistant. I help the operator think through problems, +answer questions, and take action via the tools available to me. + +# Personality + +- Direct — I answer the question asked, not a version of it I wish + had been asked. +- Grounded — when I use a tool, I surface what it returned rather + than paraphrasing away the evidence. +- Calibrated — I say "I don't know" when I don't, rather than + fabricating a confident answer. + +# Communication style + +- Short by default. Expand when the operator asks or when the + answer genuinely requires it. +- Markdown when the surface renders it; plain text otherwise. +- Reference concrete artifacts (URLs, file paths, tool outputs) + so the operator can verify. + +# When to reach for tools + +- `web_search` + `fetch_url` when the question depends on current + information that the model's training data wouldn't know. +- `current_time` any time "now" matters — never guess the time. +- `calculator` for any numeric work beyond trivial mental math. + +# Values + +- Verify before asserting. 
+- Surface failures plainly; the operator decides what to do next. diff --git a/config/soul-presets/research.md b/config/soul-presets/research.md new file mode 100644 index 0000000..6d51cac --- /dev/null +++ b/config/soul-presets/research.md @@ -0,0 +1,35 @@ +# Identity + +I am a research agent. My job is to find information, evaluate +source quality, and deliver a synthesis the operator can act on. + +# Personality + +- Curious — I follow threads until I've seen enough to answer, + not until I find the first plausible-looking result. +- Skeptical — I assume claims are wrong until the evidence holds + up. I note when sources disagree. +- Thorough — when the operator asks for "three sources" I return + three distinct sources, not three links to the same article. + +# Communication style + +- Lead with the answer, then the evidence. Never bury the + conclusion under a recap of my search process. +- Cite with URLs. Prefer primary sources (docs, filings, papers) + over summaries. +- Flag confidence explicitly — "confirmed by X and Y" vs "one + source, unverified" — so the operator can calibrate. + +# Search loop + +1. Search with `web_search`. Read the top N titles + snippets. +2. Pick the most credible-looking 2–5. `fetch_url` each. +3. Cross-check: do independent sources agree? Which disagree? +4. Synthesize. Return claim → evidence → confidence, not a + chronological log of what I read. + +# Values + +- A hole in the evidence is more useful than a confident guess. +- Never present a synthesis as settled when the sources are thin. 
diff --git a/docs/.vitepress/config.mts b/docs/.vitepress/config.mts index d705a63..2deb10c 100644 --- a/docs/.vitepress/config.mts +++ b/docs/.vitepress/config.mts @@ -35,7 +35,8 @@ export default defineConfig({ text: "How-To Guides", items: [ { text: "Overview", link: "/guides/" }, - { text: "Fork the template", link: "/guides/fork-the-template" }, + { text: "Customize & deploy", link: "/guides/customize-and-deploy" }, + { text: "Fork checklist (fast path)", link: "/guides/fork-the-template" }, { text: "Add a custom skill", link: "/guides/add-a-skill" }, { text: "Configure subagents", link: "/guides/subagents" }, { text: "Wire Langfuse + Prometheus", link: "/guides/observability" }, diff --git a/docs/guides/customize-and-deploy.md b/docs/guides/customize-and-deploy.md new file mode 100644 index 0000000..17c24d9 --- /dev/null +++ b/docs/guides/customize-and-deploy.md @@ -0,0 +1,97 @@ +# Customize & deploy + +Use this guide when you've run through the wizard, decided the template fits your use case, and now want to fork it into your own GitHub repo + ship a deployable image. If you're still evaluating, stay on the [first-agent tutorial](/tutorials/first-agent) — you don't need any of this to run the agent locally. + +## Why this is a separate step + +The [setup wizard](/tutorials/first-agent) handles runtime customization — model, tools, persona, auth — without editing code. Everything below is structural: renaming the template throughout the codebase, bending the release pipeline to your repo, baking your fork's identity into the Docker image. Do it once per fork, not every time you tweak a setting. + +## 1. Fork the template on GitHub + +```bash +gh repo create protoLabsAI/my-agent \ + --template protoLabsAI/protoAgent \ + --public --clone + +cd my-agent +``` + +Or: `Use this template → Create a new repository` from the browser. 
Pick a short slug (`jon`, `echo-agent`, `product-director`) — it ends up as the image name, metric prefix, Langfuse tag, and release-workflow repo guard. + +## 2. Rename `protoagent` throughout + +The template uses `protoagent` as the placeholder everywhere. Do one pass: + +```bash +# macOS / BSD sed +git grep -li protoagent | xargs sed -i '' 's/protoagent/my-agent/g' +git grep -li protoAgent | xargs sed -i '' 's/protoAgent/MyAgent/g' + +# Linux / GNU sed — drop the empty-string backup suffix +git grep -li protoagent | xargs sed -i 's/protoagent/my-agent/g' +git grep -li protoAgent | xargs sed -i 's/protoAgent/MyAgent/g' +``` + +Review the diff. Key hits: + +- `Dockerfile` — the `/opt/protoagent/` paths become `/opt/my-agent/`. +- `entrypoint.sh` — same. +- `server.py` — `AGENT_NAME_ENV` fallback becomes `my-agent`. +- `chat_ui.py` — branding strings (service worker label, apple-mobile-web-app-title). +- Workflow files — the repo guards check `protoLabsAI/my-agent` instead. + +The runtime name (`identity.name` in `config/langgraph-config.yaml`, set by the wizard) is separate — keep both in sync unless you have a reason not to. + +## 3. Un-freeze the release pipeline + +The release workflows gate on the template's repo path so third-party clones don't accidentally cut releases: + +- `.github/workflows/prepare-release.yml` +- `.github/workflows/release.yml` +- `.github/workflows/docker-publish.yml` + +Each has an `if: github.repository == 'protoLabsAI/protoAgent'` (or similar) check. Swap `protoLabsAI/protoAgent` for `<your-org>/<your-repo>` in all three, or the pipeline won't fire on merges. + +## 4. Rewrite the agent card + +`server.py::_build_agent_card` ships with placeholder skills: + +```python +"skills": [ + {"id": "chat", "name": "Chat", "description": "General-purpose...", ...}, +], +``` + +Replace with the skills your agent actually advertises over A2A.
The `name` and `url` fields already pick up `identity.name` from YAML, so the wizard-set name lands on the card without code changes. + +## 5. (Optional) Add domain tools + +`tools/lg_tools.py` ships with `echo`, `current_time`, `calculator`, `web_search`, `fetch_url`. Keep the ones you want, drop the rest, add your own. Update `get_all_tools()` at the bottom. Any tool returned from there becomes a checkbox in the wizard and drawer automatically. + +## 6. (Optional) Configure subagents + +`graph/subagents/config.py` ships with one `worker`. Register more `SubagentConfig` instances in `SUBAGENT_REGISTRY` and add matching fields in `graph/config.py::LangGraphConfig`. The lead agent delegates via the `task` tool; the subagent delegation rules are built from the registry. + +## 7. Build and ship the image + +```bash +docker build -t ghcr.io/my-org/my-agent:local . + +# local test — mount the config volume so wizard completions persist +docker run --rm -p 7870:7870 \ + -e OPENAI_API_KEY="$OPENAI_API_KEY" \ + -v my-agent-config:/opt/my-agent/config \ + ghcr.io/my-org/my-agent:local +``` + +The Dockerfile declares `VOLUME /opt/<agent>/config`, so even without `-v` the wizard writes land in an anonymous volume and survive restarts of the same container. A fresh `docker run` gets a new anonymous volume (and `--rm` deletes it on exit), so use a named volume or host mount — as in the command above — to keep setup across container runs and to back it up. + +Once the local build is happy, merge a PR to trigger the release pipeline ([Deploy via GHCR](/guides/deploy)). + +## 8. Delete `TEMPLATE.md` + +Once the checklist is done, `rm TEMPLATE.md` and rewrite `README.md` to describe your specific agent — its purpose, its skills, its operators. + +## Canonical reference implementation + +[protoLabsAI/quinn](https://github.com/protoLabsAI/quinn) is the first agent built on this template, now running in production. When this guide doesn't cover a specific decision, Quinn is the filled-in example — worth a skim before you invent something new.
diff --git a/docs/guides/index.md b/docs/guides/index.md index 843adee..3e49012 100644 --- a/docs/guides/index.md +++ b/docs/guides/index.md @@ -1,10 +1,11 @@ # How-To Guides -Task-oriented procedures. Assumes you already have a forked, running agent (see [Tutorials](/tutorials/) if not). +Task-oriented procedures. Assumes you already have a running agent (see [Tutorials](/tutorials/) if not — the wizard runs with zero setup). | Guide | When to read | |---|---| -| [Fork the template](/guides/fork-the-template) | Fast-path checklist for experienced forkers | +| [Customize & deploy](/guides/customize-and-deploy) | You've evaluated via the wizard and now want to fork, rename, and ship your own image | +| [Fork checklist (fast path)](/guides/fork-the-template) | Terser version of the above for experienced forkers | | [Add a custom skill](/guides/add-a-skill) | Your agent does new things and callers need to dispatch to them | | [Configure subagents](/guides/subagents) | You want specialized delegates beyond the placeholder `worker` | | [Wire Langfuse + Prometheus](/guides/observability) | You need traces and metrics in production | diff --git a/docs/index.md b/docs/index.md index d66540d..aaf8c0e 100644 --- a/docs/index.md +++ b/docs/index.md @@ -3,14 +3,14 @@ layout: home hero: name: protoAgent text: LangGraph + A2A template for protoLabs agents - tagline: Fork this repo. Rewrite SOUL.md, prompts, and tools. Ship. + tagline: Clone. Run. Walk the wizard. Chat. Fork when you're ready to ship. 
actions: - theme: brand text: Spin up your first agent link: /tutorials/first-agent - theme: alt - text: Reference - link: /reference/ + text: Customize & deploy + link: /guides/customize-and-deploy features: - icon: 🔌 diff --git a/docs/tutorials/first-agent.md b/docs/tutorials/first-agent.md index 4fc9a16..6082f66 100644 --- a/docs/tutorials/first-agent.md +++ b/docs/tutorials/first-agent.md @@ -1,99 +1,80 @@ # Spin up your first agent -This walks you from "I clicked Use this template" to "I have a running agent answering a web-search query". About 15 minutes, assuming Docker and a LiteLLM gateway are already set up. +About 5 minutes. You need Python 3.11+ and an OpenAI-compatible API key (OpenAI direct, LiteLLM gateway, Anthropic-via-gateway, Ollama, anything that speaks the OpenAI REST shape). -## What you'll need +No forking, no `sed`, no Docker for your first run. That's all in [Customize & deploy](/guides/customize-and-deploy) once you've decided this template works for you. -- A GitHub account with access to `protoLabsAI` (or your own org — the workflows gate on the repo owner; see step 7) -- Docker -- A LiteLLM gateway running somewhere reachable (the template points at `http://gateway:4000/v1`) -- A model alias in that gateway. The template's default is `protolabs/agent` — either add that alias or retarget `model.name` in step 4 - -## 1. Use the template - -From GitHub, click **Use this template → Create a new repository** on [protoLabsAI/protoAgent](https://github.com/protoLabsAI/protoAgent). Pick a short slug like `jon` or `echo-agent` — it will end up as the image name, metric prefix, Langfuse tag, and more. - -Or from the CLI: +## 1. Get the code ```bash -gh repo create protoLabsAI/my-agent \ - --template protoLabsAI/protoAgent \ - --public --clone - +git clone https://github.com/protoLabsAI/protoAgent.git my-agent cd my-agent ``` -## 2. Rename the agent - -The template uses `protoagent` as the placeholder throughout. Do a pass: +## 2. 
Install dependencies ```bash -git grep -li protoagent | xargs sed -i 's/protoagent/my-agent/g' -git grep -li protoAgent | xargs sed -i 's/protoAgent/MyAgent/g' +python -m venv .venv +source .venv/bin/activate +pip install -r requirements.txt ``` -Review the diff before committing — the replacement hits Dockerfile paths (`/opt/protoagent` → `/opt/my-agent`), the GHCR image path, workflow repo guards, and the Gradio UI branding. All of those want the new name. +## 3. Run the server -## 3. Rewrite identity - -Three files carry the agent's identity. Edit each one: - -- `config/SOUL.md` — the persona doc loaded at session start. See the placeholder file itself for guidance. -- `graph/prompts.py` — the system prompt for the lead agent + subagents. -- `server.py::_build_agent_card` — the agent card served at `/.well-known/agent-card.json`. At minimum, fix `name` and `description`; revisit `skills` once you have real tools. - -## 4. Point at a model +```bash +python server.py +``` -Edit `config/langgraph-config.yaml`: +You should see: -```yaml -model: - name: protolabs/my-agent # or openai/gpt-4o, anthropic/claude-opus-4-6, etc. - api_base: http://gateway:4000/v1 +``` +Setup wizard has not been completed — graph not compiled. Open the UI to finish setup. +Starting protoagent on http://0.0.0.0:7870 ``` -If you're using a gateway alias (recommended), make sure the alias is registered there before booting — swapping models later becomes a gateway edit instead of a code change. +## 4. Open the setup wizard -## 5. Build and run +Visit <http://localhost:7870> in a browser. Because `config/.setup-complete` doesn't exist yet, you'll land in the wizard instead of the chat UI. -```bash -docker build -t my-agent:local . -docker run --rm -p 7870:7870 \ - -e AGENT_NAME=my-agent \ - -e OPENAI_API_KEY="$LITELLM_MASTER_KEY" \ - my-agent:local -``` +Walk through the four steps: -## 6. Verify the agent is up +1.
**Connect to your model.** Paste your API base URL (`https://api.openai.com/v1` for OpenAI direct, `http://localhost:4000/v1` for a local LiteLLM gateway) and API key. Click **Test connection & fetch models** — the dropdown fills with whatever the endpoint actually exposes. Pick one. +2. **Name your agent.** Short lowercase slug (e.g. `product-director`). Pick a persona preset — **Generic Assistant** is the safe default; **Research** / **Coding** / **Blank** are the alternatives — and click **Load preset into SOUL.md**. Edit the loaded text if you want to make it specific to your agent. +3. **Tools & middleware.** All five starter tools (`echo`, `current_time`, `calculator`, `web_search`, `fetch_url`) are enabled by default. Leave **Audit** and **Memory** middleware on. Leave **Knowledge** off — that needs an index the template doesn't ship with. +4. **Optional — you, security, autostart.** Your name makes the agent address you directly. A2A auth token blank for local dev, set it before you expose the port. "Launch this agent automatically on login" installs a macOS LaunchAgent so the server is up after every reboot without remembering to `python server.py`. -In another terminal: +Hit **Launch agent**. The wizard closes, the chat UI appears, and the Configuration drawer on the right is now populated with your choices. -```bash -curl http://localhost:7870/.well-known/agent-card.json | jq .name -# → "my-agent" +## 5. Try it -curl http://localhost:7870/metrics | grep my_agent_active_sessions -# → my_agent_active_sessions 0 -``` - -Hit `http://localhost:7870` in a browser to get the Gradio chat UI. Ask it: +In the chat box: > What time is it in Tokyo? -If the starter tools are wired correctly, it should call `current_time`, return an ISO-8601 timestamp with the timezone offset, and explain what it found. +The agent calls `current_time`, returns an ISO-8601 timestamp, and explains what it found. 
Then: > Find three recent articles about the A2A protocol and summarize them. -The agent will call `web_search`, then `fetch_url` for each of the top results, and return a summary. That round-trip exercises the full tool loop + LLM call + streaming response path. +The agent calls `web_search`, then `fetch_url` on the top results, and hands back a synthesis. That round-trip exercises the full tool loop + LLM call + streaming response path. + +## What just happened + +- Your answers were written to `config/langgraph-config.yaml` (human-readable — peek at it). +- The persona preset was written to `config/SOUL.md`. +- A `config/.setup-complete` marker was created so the next boot goes straight to chat. +- The agent card at <http://localhost:7870/.well-known/agent-card.json> now reflects your agent name. +- If you checked autostart, `~/Library/LaunchAgents/ai.protolabs.<agent-name>.plist` was installed and `launchctl load`-ed. -## 7. Un-freeze the release pipeline +## Changing your mind -The three release workflows (`docker-publish.yml`, `prepare-release.yml`, `release.yml`) all gate on `github.repository == 'protoLabsAI/protoAgent'`. Change that check in each file to match your repo's owner/repo before merging anything to `main`, or the release automation won't fire. +- **Any field** — open the Configuration drawer on the right side of the chat UI. Every wizard field is there, plus a few advanced ones (temperature, max_tokens, max_iterations, knowledge store settings). +- **The whole wizard** — expand the drawer's "Re-run setup wizard" accordion and click **Run wizard now**. Your current values pre-fill every step. +- **Autostart** — toggle it off in the wizard or the drawer; the LaunchAgent is removed and the plist file deleted.
## Where to go next - [Write your first tool](/tutorials/first-tool) — wire a custom LangChain tool into the loop +- [Customize & deploy](/guides/customize-and-deploy) — fork the template, rename throughout, ship a GHCR image - [Add a custom skill](/guides/add-a-skill) — expose the new behaviour on the A2A agent card -- [Deploy via GHCR](/guides/deploy) — get Watchtower auto-deploying your merges diff --git a/graph/config.py b/graph/config.py index c8c2601..00ae1a8 100644 --- a/graph/config.py +++ b/graph/config.py @@ -51,6 +51,26 @@ class LangGraphConfig: embed_model: str = "qwen3-embedding" knowledge_top_k: int = 5 + # Identity — captured by the setup wizard, editable via the drawer. + # ``identity_name`` falls back to the AGENT_NAME env var at runtime; + # the YAML value wins when both are set so per-fork customization + # survives image rebuilds. ``operator`` is the human the agent thinks + # it's talking to — injected into the system prompt when non-empty. + identity_name: str = "protoagent" + identity_operator: str = "" + + # A2A bearer token — blank = open mode (local dev). Writing a token + # here makes the A2A handler require ``Authorization: Bearer <token>`` + # on every request and advertises the bearer scheme on the agent card. + # Kept in YAML rather than env so the drawer can manage it. + auth_token: str = "" + + # OS-level autostart — ``True`` means the server launches on user + # login (macOS LaunchAgent today; Linux/Windows TBD). Managed by + # ``autostart.py``; the field here is the source of truth for + # whether the plist should exist. + autostart_on_boot: bool = False + @classmethod def from_yaml(cls, path: str | Path) -> "LangGraphConfig": """Load config from YAML file.
Falls back to defaults if absent.""" @@ -65,6 +85,9 @@ def from_yaml(cls, path: str | Path) -> "LangGraphConfig": subagents = data.get("subagents", {}) middleware = data.get("middleware", {}) knowledge = data.get("knowledge", {}) + identity = data.get("identity", {}) + auth = data.get("auth", {}) + runtime = data.get("runtime", {}) config = cls( model_provider=model.get("provider", cls.model_provider), @@ -80,6 +103,10 @@ def from_yaml(cls, path: str | Path) -> "LangGraphConfig": knowledge_db_path=knowledge.get("db_path", cls.knowledge_db_path), embed_model=knowledge.get("embed_model", cls.embed_model), knowledge_top_k=knowledge.get("top_k", cls.knowledge_top_k), + identity_name=identity.get("name", cls.identity_name), + identity_operator=identity.get("operator", cls.identity_operator), + auth_token=auth.get("token", cls.auth_token), + autostart_on_boot=runtime.get("autostart_on_boot", cls.autostart_on_boot), ) for name in ("worker",): diff --git a/graph/config_io.py b/graph/config_io.py index e77a45c..8c39491 100644 --- a/graph/config_io.py +++ b/graph/config_io.py @@ -40,6 +40,18 @@ SOUL_SOURCE_PATH = REPO_ROOT / "config" / "SOUL.md" SOUL_RUNTIME_PATH = Path("/sandbox/SOUL.md") +# Setup wizard state. +# Presence of this (empty) marker file = wizard has been run and the +# server should boot straight into the chat UI. Absence = show the +# wizard on first page load. Lives in ``config/`` so a Docker volume +# mount at /opt/<agent>/config persists setup across container runs. +SETUP_MARKER_PATH = REPO_ROOT / "config" / ".setup-complete" + +# SOUL.md starter templates. The wizard offers these as presets the +# user can pick then edit before saving. Adding a new file here +# automatically makes it a choice — no registry to update.
+PRESETS_DIR = REPO_ROOT / "config" / "soul-presets" + # --------------------------------------------------------------------------- # YAML round-trip @@ -145,6 +157,16 @@ def config_to_dict(config: LangGraphConfig) -> dict[str, Any]: "embed_model": config.embed_model, "top_k": config.knowledge_top_k, }, + "identity": { + "name": config.identity_name, + "operator": config.identity_operator, + }, + "auth": { + "token": config.auth_token, + }, + "runtime": { + "autostart_on_boot": config.autostart_on_boot, + }, } @@ -319,3 +341,68 @@ def list_available_tools(knowledge_store: Any = None) -> list[str]: from tools.lg_tools import get_all_tools return [t.name for t in get_all_tools(knowledge_store)] + + +# --------------------------------------------------------------------------- +# Setup wizard state +# --------------------------------------------------------------------------- + + +def is_setup_complete() -> bool: + """True once the wizard has been completed at least once. + + Checked at server boot to decide wizard-first vs chat-first + rendering. Don't read the YAML to infer this — a fork that ships + with a baked-in config still needs to walk a user through the + wizard on first run. + """ + return SETUP_MARKER_PATH.exists() + + +def mark_setup_complete() -> None: + """Write the marker so subsequent boots skip the wizard. + + Idempotent — safe to call repeatedly. The file is empty; only + its presence matters. + """ + SETUP_MARKER_PATH.parent.mkdir(parents=True, exist_ok=True) + SETUP_MARKER_PATH.touch() + + +def reset_setup() -> None: + """Remove the marker, forcing the wizard to run on next page load. + + Exposed to the drawer as a "Re-run setup" action. Leaves the YAML + + SOUL.md in place so the wizard pre-populates with the current + values — reset is for revisiting choices, not for wiping config. 
+ """ + SETUP_MARKER_PATH.unlink(missing_ok=True) + + +# --------------------------------------------------------------------------- +# SOUL.md presets +# --------------------------------------------------------------------------- + + +def list_soul_presets() -> list[str]: + """Return preset names (file stems, no extension) sorted alphabetically. + + The wizard's preset dropdown reads from this — dropping a new + markdown file into ``config/soul-presets/`` makes it a choice + without code changes. + """ + if not PRESETS_DIR.exists(): + return [] + return sorted(p.stem for p in PRESETS_DIR.glob("*.md")) + + +def read_soul_preset(name: str) -> str: + """Return the preset's content. + + Returns empty string for an unknown name rather than raising — + the wizard treats that as "no preset selected, blank canvas". + """ + path = PRESETS_DIR / f"{name}.md" + if not path.exists(): + return "" + return path.read_text(encoding="utf-8") diff --git a/server.py b/server.py index af6acb1..ab52430 100644 --- a/server.py +++ b/server.py @@ -58,16 +58,35 @@ def _init_langgraph_agent(): - """Initialize the LangGraph agent backend.""" + """Initialize the LangGraph backend — setup-aware. + + Always loads the config + checkpointer so the wizard and drawer + can introspect what's on disk. The compiled graph is only built + when the setup wizard has been completed (``.setup-complete`` + marker present). This lets the server boot cleanly on a fresh + clone with no model credentials — the wizard drives the user to + provide them, then triggers a reload. 
+ """ global _graph, _graph_config, _checkpointer - from graph.agent import create_agent_graph from graph.config import LangGraphConfig + from graph.config_io import is_setup_complete from langgraph.checkpoint.memory import MemorySaver config_path = Path(__file__).parent / "config" / "langgraph-config.yaml" _graph_config = LangGraphConfig.from_yaml(config_path) _checkpointer = MemorySaver() + + if not is_setup_complete(): + _graph = None + log.info( + "Setup wizard has not been completed — graph not compiled. " + "Open the UI to finish setup.", + ) + return + + from graph.agent import create_agent_graph + _graph = create_agent_graph(_graph_config) log.info("LangGraph agent initialized (model: %s)", _graph_config.model_name) @@ -83,21 +102,37 @@ def _reload_langgraph_agent() -> tuple[bool, str]: Rebinding ``_graph`` is atomic in CPython; in-flight ``astream_events`` iterators hold their own reference to the prior graph and finish cleanly on the old instance. + + If the setup marker is absent this returns early without + compiling — the wizard is still in front of the user, so there + is nothing to hot-swap yet. 
""" global _graph, _graph_config from graph.agent import create_agent_graph from graph.config import LangGraphConfig + from graph.config_io import is_setup_complete config_path = Path(__file__).parent / "config" / "langgraph-config.yaml" try: new_config = LangGraphConfig.from_yaml(config_path) - new_graph = create_agent_graph(new_config) except Exception as e: - log.exception("[reload] rebuild failed: %s", e) - return False, f"reload failed: {e}" + log.exception("[reload] config load failed: %s", e) + return False, f"config load failed: {e}" _graph_config = new_config + + if not is_setup_complete(): + _graph = None + log.info("[reload] setup not complete — config reloaded, graph not compiled") + return True, "config reloaded • setup not complete" + + try: + new_graph = create_agent_graph(new_config) + except Exception as e: + log.exception("[reload] graph rebuild failed: %s", e) + return False, f"graph rebuild failed: {e}" + _graph = new_graph log.info("LangGraph agent reloaded (model: %s)", _graph_config.model_name) return True, f"reloaded • model={_graph_config.model_name}" @@ -150,12 +185,17 @@ def _apply_settings_changes( def _build_settings_callbacks() -> dict[str, Any]: - """Callbacks consumed by the Gradio Configuration drawer.""" + """Callbacks consumed by the Gradio Configuration drawer + wizard.""" from graph.config_io import ( config_to_dict, + is_setup_complete, list_available_tools, list_gateway_models, + list_soul_presets, + mark_setup_complete, read_soul, + read_soul_preset, + reset_setup, ) def get_config() -> dict[str, Any]: @@ -177,15 +217,153 @@ def save_all(config: dict | None, soul: str | None) -> tuple[bool, str]: ok, messages = _apply_settings_changes(config=config, soul=soul) return ok, " • ".join(messages) + def finish_setup(config: dict | None, soul: str | None) -> tuple[bool, str]: + """Wizard terminal action — write everything, mark complete, reload. + + Ordering matters: + + 1. Write config YAML + SOUL.md (no reload yet). + 2. 
``mark_setup_complete()`` — flip the marker BEFORE the + reload so ``_reload_langgraph_agent`` actually compiles + the graph. Doing it after means the reload sees + setup-incomplete and stays ``_graph = None``. + 3. Sync autostart (LaunchAgent plist is independent of the + graph, so it can happen any time after the config is + written). + 4. Reload — marker present, graph compiles, chat works. + + Returns a single status string joining per-step messages. + """ + from graph.config_io import ( + apply_updates_to_yaml, + load_yaml_doc, + save_yaml_doc, + validate_config_dict, + write_soul, + ) + + messages: list[str] = [] + + # 1. Persist + if config is not None: + ok, err = validate_config_dict(config) + if not ok: + return False, f"validation: {err}" + try: + doc = load_yaml_doc() + apply_updates_to_yaml(doc, config) + save_yaml_doc(doc) + messages.append("config saved") + except Exception as e: + log.exception("[setup] YAML write failed: %s", e) + return False, f"config write: {e}" + + if soul is not None: + try: + paths = write_soul(soul) + messages.append(f"SOUL saved ({len(paths)} path{'s' if len(paths) != 1 else ''})") + except Exception as e: + log.exception("[setup] SOUL write failed: %s", e) + return False, f"soul write: {e}" + + # 2. Flip the marker — MUST be before reload so the graph builds + mark_setup_complete() + messages.append("setup marked complete") + + # 3. 
Autostart sync + if config and "runtime" in config: + want_autostart = bool(config.get("runtime", {}).get("autostart_on_boot", False)) + try: + from autostart import install_autostart, uninstall_autostart + + as_name = ( + config.get("identity", {}).get("name") + or _graph_config.identity_name + or "protoagent" + ) + if want_autostart: + ok_as, msg_as = install_autostart(agent_name=as_name) + else: + ok_as, msg_as = uninstall_autostart(agent_name=as_name) + messages.append(f"autostart: {msg_as}") + if not ok_as: + log.warning("[setup] autostart sync failed: %s", msg_as) + except Exception as e: + log.exception("[setup] autostart sync raised: %s", e) + messages.append(f"autostart failed: {e}") + + # 4. Reload — now picks up setup_complete=True and compiles + ok, reload_msg = _reload_langgraph_agent() + messages.append(reload_msg) + + return ok, " • ".join(messages) + + def restart_setup() -> str: + """Drawer action — delete the marker so the wizard runs again.""" + reset_setup() + log.info("[setup] marker removed — wizard will run on next page load") + return "setup marker removed • reload the page to run the wizard" + + def autostart_info() -> dict[str, Any]: + """Report platform support + current on-disk state. The drawer + uses this to render the toggle correctly and to print the + plist path for debugging.""" + try: + from autostart import autostart_status + + name = (_graph_config.identity_name if _graph_config else "") or "protoagent" + return autostart_status(name) + except Exception as e: + return {"supported": False, "installed": False, "reason": str(e)} + + def toggle_autostart(enabled: bool) -> tuple[bool, str]: + """Install or uninstall the OS autostart artifact, mirroring + the YAML field. 
Called from the drawer's checkbox handler so + toggling takes effect immediately without waiting for Save.""" + try: + from autostart import install_autostart, uninstall_autostart + + name = (_graph_config.identity_name if _graph_config else "") or "protoagent" + if enabled: + return install_autostart(agent_name=name) + return uninstall_autostart(agent_name=name) + except Exception as e: + return False, str(e) + return { "get_config": get_config, "get_soul": read_soul, "list_models": list_models, "list_tools": list_available_tools, + "list_soul_presets": list_soul_presets, + "read_soul_preset": read_soul_preset, "save_all": save_all, + "finish_setup": finish_setup, + "restart_setup": restart_setup, + "is_setup_complete": is_setup_complete, + "autostart_info": autostart_info, + "toggle_autostart": toggle_autostart, } +def _setup_required_message() -> list[dict[str, Any]]: + """Returned by chat endpoints when the wizard hasn't been run. + + The Gradio UI hides the chat pane until setup completes, but the + HTTP /api/chat, OpenAI-compat, and A2A endpoints don't know the + UI state — so they emit a plain-text "finish setup first" + message instead of 500ing on ``_graph is None``. + """ + return [{ + "role": "assistant", + "content": ( + "**Setup required.** The setup wizard has not been completed. " + "Open the UI and finish the wizard, or POST the completed config " + "to `/api/config/setup` before calling chat endpoints." + ), + }] + + # --------------------------------------------------------------------------- # Chat backend — called by the A2A handler + OpenAI-compat endpoint # --------------------------------------------------------------------------- @@ -199,6 +377,8 @@ async def chat(message: str, session_id: str) -> list[dict[str, Any]]: capture tool events and emit the cost-v1 DataPart on the terminal artifact. 
""" + if _graph is None: + return _setup_required_message() return await _chat_langgraph(message, session_id) @@ -234,6 +414,10 @@ async def _chat_langgraph_stream( if caller_trace.get("spanId"): trace_meta["caller_span_id"] = caller_trace["spanId"] + if _graph is None: + yield ("error", "setup required — finish the setup wizard before calling A2A endpoints") + return + async with tracing.trace_session( session_id=session_id, name="a2a-stream", @@ -362,13 +546,28 @@ async def _chat_langgraph(message: str, session_id: str) -> list[dict[str, Any]] # Agent card — EDIT THIS when forking # --------------------------------------------------------------------------- -AGENT_NAME = os.environ.get("AGENT_NAME", "protoagent") +AGENT_NAME_ENV = os.environ.get("AGENT_NAME", "protoagent") + + +def agent_name() -> str: + """Resolve the active agent name. + + Preference order: wizard-set ``identity.name`` in YAML (when loaded + and non-placeholder) → ``AGENT_NAME`` env var → ``"protoagent"``. + The agent card, OpenAI-compat model id, and chat header all call + this so a wizard rename propagates without a restart. The + Prometheus metric prefix and ``_API_KEY`` env name are + set at boot and still require a restart (see docs). + """ + if _graph_config and _graph_config.identity_name and _graph_config.identity_name != "protoagent": + return _graph_config.identity_name + return AGENT_NAME_ENV def _build_security_schemes() -> dict: """Return securitySchemes dict, adding bearer only when A2A_AUTH_TOKEN is set.""" schemes: dict = {"apiKey": {"type": "apiKey", "in": "header", "name": "X-API-Key"}} - if os.environ.get("A2A_AUTH_TOKEN", ""): + if os.environ.get("A2A_AUTH_TOKEN", "") or (_graph_config and _graph_config.auth_token): schemes["bearer"] = {"type": "http", "scheme": "bearer"} return schemes @@ -395,7 +594,7 @@ def _build_agent_card(host: str) -> dict: it only if you strip the usage-capture. 
""" return { - "name": AGENT_NAME, + "name": agent_name(), "description": ( "protoAgent template — A2A-compliant LangGraph agent. " "Replace this description with your agent's actual purpose." @@ -440,7 +639,7 @@ def _build_agent_card(host: str) -> dict: # --------------------------------------------------------------------------- def _main(): - parser = argparse.ArgumentParser(description=f"{AGENT_NAME} — protoAgent server") + parser = argparse.ArgumentParser(description=f"{AGENT_NAME_ENV} — protoAgent server") parser.add_argument("--port", type=int, default=7870) parser.add_argument("--config", type=str, default=None) args = parser.parse_args() @@ -457,7 +656,7 @@ def _main(): from chat_ui import create_chat_app blocks = create_chat_app( chat_fn=chat, - title=AGENT_NAME, + title=agent_name(), subtitle="protoAgent", placeholder="Send a message...", pwa=True, @@ -471,7 +670,7 @@ def _main(): from fastapi.staticfiles import StaticFiles from pydantic import BaseModel as PydanticBaseModel - fastapi_app = FastAPI(title=f"{AGENT_NAME} — protoAgent") + fastapi_app = FastAPI(title=f"{agent_name()} — protoAgent") # --- Chat API ----------------------------------------------------------- class ChatRequest(PydanticBaseModel): @@ -514,6 +713,36 @@ async def _api_list_models(api_base: str = "", api_key: str = ""): models, error = list_gateway_models(base, key) return {"models": models, "error": error} + # --- Setup wizard state ------------------------------------------------- + @fastapi_app.get("/api/config/setup-status") + async def _api_setup_status(): + from graph.config_io import is_setup_complete, list_soul_presets + return { + "setup_complete": is_setup_complete(), + "presets": list_soul_presets(), + } + + @fastapi_app.post("/api/config/setup") + async def _api_finish_setup(req: ConfigReloadRequest): + """Terminal wizard action over HTTP. 
Same semantics as the + drawer's ``finish_setup`` callback — writes everything, marks + setup complete, optionally installs autostart, then reloads. + """ + callbacks = _build_settings_callbacks() + ok, msg = callbacks["finish_setup"](req.config, req.soul) + return {"ok": ok, "message": msg} + + @fastapi_app.post("/api/config/reset-setup") + async def _api_reset_setup(): + from graph.config_io import reset_setup + reset_setup() + return {"ok": True, "message": "setup marker removed"} + + @fastapi_app.get("/api/config/presets/{name}") + async def _api_read_preset(name: str): + from graph.config_io import read_soul_preset + return {"name": name, "content": read_soul_preset(name)} + # --- OpenAI-compatible chat completions -------------------------------- # Lets this agent be registered as a model in the LiteLLM gateway / # OpenWebUI without any protocol adapter. @@ -531,19 +760,19 @@ async def _openai_chat_completions(req: dict): parts = [m["content"] for m in result if m.get("role") == "assistant" and m.get("content")] content = "\n\n".join(parts) created = int(time.time()) - completion_id = f"{AGENT_NAME}-{session_id}" + completion_id = f"{agent_name()}-{session_id}" if stream: async def _stream(): chunk = { "id": completion_id, "object": "chat.completion.chunk", - "created": created, "model": AGENT_NAME, + "created": created, "model": agent_name(), "choices": [{"index": 0, "delta": {"role": "assistant", "content": content}, "finish_reason": None}], } yield f"data: {json.dumps(chunk)}\n\n" done_chunk = { "id": completion_id, "object": "chat.completion.chunk", - "created": created, "model": AGENT_NAME, + "created": created, "model": agent_name(), "choices": [{"index": 0, "delta": {}, "finish_reason": "stop"}], } yield f"data: {json.dumps(done_chunk)}\n\n" @@ -552,7 +781,7 @@ async def _stream(): return { "id": completion_id, "object": "chat.completion", - "created": created, "model": AGENT_NAME, + "created": created, "model": agent_name(), "choices": [{ "index": 0, 
"message": {"role": "assistant", "content": content}, @@ -565,14 +794,14 @@ async def _stream(): async def _openai_models(): return { "object": "list", - "data": [{"id": AGENT_NAME, "object": "model", "created": 1774600000, "owned_by": "protolabs"}], + "data": [{"id": agent_name(), "object": "model", "created": 1774600000, "owned_by": "protolabs"}], } # --- A2A agent card ----------------------------------------------------- @fastapi_app.get("/.well-known/agent.json", include_in_schema=False) @fastapi_app.get("/.well-known/agent-card.json", include_in_schema=False) async def _a2a_agent_card(request: Request): - host = request.headers.get("host", f"{AGENT_NAME}:7870") + host = request.headers.get("host", f"{agent_name()}:7870") return JSONResponse( content=_build_agent_card(host), headers={"Cache-Control": "public, max-age=60"}, @@ -582,12 +811,19 @@ async def _a2a_agent_card(request: Request): # JSON-RPC + REST, streaming, polling, cancel, push webhooks. from a2a_handler import register_a2a_routes - auth_env = f"{AGENT_NAME.upper()}_API_KEY" + # A2A bearer token: YAML ``auth.token`` wins if set, else falls back + # to the legacy ``_API_KEY`` env var — so the wizard can set + # auth without an env restart. 
+ auth_env = f"{AGENT_NAME_ENV.upper()}_API_KEY" + auth_key = ( + (_graph_config.auth_token if _graph_config else "") + or os.environ.get(auth_env, "") + ) register_a2a_routes( app=fastapi_app, chat_stream_fn_factory=_chat_langgraph_stream, chat_fn=chat, - api_key=os.environ.get(auth_env, ""), + api_key=auth_key, agent_card={}, register_card_route=False, # card is already served above ) @@ -631,7 +867,7 @@ async def _serve_sw() -> FileResponse: favicon_path=str(static_dir / "favicon.svg") if (static_dir / "favicon.svg").exists() else None, ) - log.info("Starting %s on http://0.0.0.0:%d", AGENT_NAME, args.port) + log.info("Starting %s on http://0.0.0.0:%d", agent_name(), args.port) uvicorn.run(app, host="0.0.0.0", port=args.port) diff --git a/tests/test_config_io.py b/tests/test_config_io.py index ce31bf6..bb72f8a 100644 --- a/tests/test_config_io.py +++ b/tests/test_config_io.py @@ -125,13 +125,21 @@ def test_config_to_dict_mirrors_yaml_shape() -> None: cfg = LangGraphConfig() d = config_to_dict(cfg) - # Top-level schema surface - assert set(d.keys()) == {"model", "subagents", "middleware", "knowledge"} + # Top-level schema surface — all the sections the YAML exposes. + # Adding a new section here without updating config_to_dict would + # strand fork-added fields outside the drawer's round-trip. 
+ assert set(d.keys()) == { + "model", "subagents", "middleware", "knowledge", + "identity", "auth", "runtime", + } assert d["model"]["name"] == cfg.model_name assert d["model"]["temperature"] == cfg.temperature assert d["subagents"]["worker"]["tools"] == list(cfg.worker.tools) assert d["middleware"]["audit"] == cfg.audit_middleware assert d["knowledge"]["top_k"] == cfg.knowledge_top_k + assert d["identity"]["name"] == cfg.identity_name + assert d["auth"]["token"] == cfg.auth_token + assert d["runtime"]["autostart_on_boot"] == cfg.autostart_on_boot # ── validate_config_dict ───────────────────────────────────────────────────── @@ -321,3 +329,93 @@ def test_list_available_tools_returns_starter_set(): assert "calculator" in names assert "current_time" in names assert all(isinstance(n, str) for n in names) + + +# ── Setup wizard marker ───────────────────────────────────────────────────── + + +def test_setup_marker_lifecycle(monkeypatch, tmp_path): + """Marker presence = wizard skipped. Mark → present. Reset → gone. 
+ Reset on a missing marker is a no-op, not an error.""" + from graph import config_io + + marker = tmp_path / ".setup-complete" + monkeypatch.setattr(config_io, "SETUP_MARKER_PATH", marker) + + assert config_io.is_setup_complete() is False + + config_io.mark_setup_complete() + assert config_io.is_setup_complete() is True + assert marker.exists() + + config_io.mark_setup_complete() # idempotent + assert config_io.is_setup_complete() is True + + config_io.reset_setup() + assert config_io.is_setup_complete() is False + + config_io.reset_setup() # no-op on missing marker — doesn't raise + + +def test_mark_setup_complete_creates_parent_dir(monkeypatch, tmp_path): + """If config/ doesn't exist yet, mark_setup_complete must create + it — otherwise a fresh clone with a pristine filesystem fails + on first wizard run.""" + from graph import config_io + + marker = tmp_path / "fresh" / "config" / ".setup-complete" + monkeypatch.setattr(config_io, "SETUP_MARKER_PATH", marker) + + config_io.mark_setup_complete() + assert marker.exists() + + +# ── SOUL.md presets ───────────────────────────────────────────────────────── + + +def test_list_soul_presets_returns_shipped_starters(): + """The template must ship four starter presets so the wizard's + dropdown is useful on day one. 
Add a file to config/soul-presets/ + and it should appear here automatically — no registry.""" + from graph.config_io import list_soul_presets + + presets = list_soul_presets() + assert "generic-assistant" in presets + assert "research" in presets + assert "coding" in presets + assert "blank" in presets + + +def test_list_soul_presets_sorted(): + from graph.config_io import list_soul_presets + + presets = list_soul_presets() + assert presets == sorted(presets) + + +def test_read_soul_preset_returns_content(): + from graph.config_io import read_soul_preset + + content = read_soul_preset("research") + assert "research" in content.lower() + assert content.strip().startswith("#") # markdown h1 + + +def test_read_soul_preset_unknown_returns_empty(): + """Unknown preset names must return '' not raise — the wizard + treats empty as 'user didn't pick a preset, keep textarea as-is'.""" + from graph.config_io import read_soul_preset + + assert read_soul_preset("not-a-real-preset") == "" + assert read_soul_preset("") == "" + + +def test_list_soul_presets_missing_dir_returns_empty(monkeypatch, tmp_path): + """If a fork accidentally deletes the presets dir, the wizard + should render an empty dropdown, not crash.""" + from graph import config_io + + fake = tmp_path / "does-not-exist" + monkeypatch.setattr(config_io, "PRESETS_DIR", fake) + + assert config_io.list_soul_presets() == [] From 4f74c7fe12f9c66be22873542ff6ac9472e5a331 Mon Sep 17 00:00:00 2001 From: Josh Mabry Date: Wed, 22 Apr 2026 17:37:41 -0700 Subject: [PATCH 06/24] fix(ui): sync wizard/chat visibility on every page load MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Without this, a browser refresh after the marker was written externally (POST /api/config/setup, or /api/config/reset-setup from another tab) kept Gradio serving its initial visibility snapshot — wizard visible even though setup is done, or vice versa. 
app.load runs per page visit so visibility tracks is_setup_complete() live. Co-Authored-By: Claude Opus 4.7 (1M context) --- chat_ui.py | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/chat_ui.py b/chat_ui.py index d332af6..fe4e863 100644 --- a/chat_ui.py +++ b/chat_ui.py @@ -778,6 +778,35 @@ def _load_wizard_defaults(): outputs=[*w_inputs, w_test_status, w_autostart_note], ) + # Re-check setup state on every page load so external + # completions (POST /api/config/setup from curl, or a + # reset triggered in another tab) are reflected after + # a browser refresh. Without this, Gradio keeps serving + # the initial visibility state from when the Blocks + # were first rendered. + def _sync_visibility(): + if "is_setup_complete" not in settings: + return gr.update(), gr.update(), gr.update() + done = bool(settings["is_setup_complete"]()) + sidebar_upd = ( + gr.update(visible=done) + if sidebar_block is not None + else gr.update() + ) + return ( + gr.update(visible=not done), # wizard_pane + gr.update(visible=done), # chat_pane + sidebar_upd, + ) + + app.load( + fn=_sync_visibility, + outputs=[ + wizard_pane, chat_pane, + sidebar_block if sidebar_block is not None else wizard_pane, + ], + ) + # Connection test — fills the model dropdown def _test_connection(api_base, api_key): if "list_models" not in settings: From a2c5b5b282f2582ae9b93f08c833f913894ad258 Mon Sep 17 00:00:00 2001 From: Josh Mabry Date: Wed, 22 Apr 2026 19:40:30 -0700 Subject: [PATCH 07/24] fix(llm): override OpenAI SDK User-Agent to bypass gateway WAF MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Cloudflare's managed WAF in front of api.proto-labs.ai (and likely other gateways behind a default WAF config) blocks the OpenAI Python SDK's `OpenAI/Python ` User-Agent with a 403 "Your request was blocked". 
/v1/models went through fine because the gateway's model-list handler doesn't gate on UA the same way — only /v1/chat/completions 403'd, which made this look like a key or model-alias problem rather than what it actually was. tools/lg_tools.py already sets a custom UA on its outbound httpx fetches for exactly this reason; graph/llm.py had no equivalent, so ChatOpenAI fell back to the SDK default. Threading the same identifier through default_headers makes every protoAgent egress present a consistent allowlisted UA. Verified: product-director wizard → chat turn → 200 OK from api.proto-labs.ai with the groq-llama-70b alias. Co-Authored-By: Claude Opus 4.7 (1M context) --- graph/llm.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/graph/llm.py b/graph/llm.py index 70f3fe0..f364cd8 100644 --- a/graph/llm.py +++ b/graph/llm.py @@ -32,4 +32,14 @@ def create_llm(config: LangGraphConfig) -> ChatOpenAI: # AIMessageChunks with usage_metadata=None and we can't emit # the cost-v1 DataPart on the terminal artifact. stream_usage=True, + # Cloudflare's managed WAF blocks the OpenAI SDK's default + # `OpenAI/Python ` User-Agent (observed 403 "Your request + # was blocked" against api.proto-labs.ai). Override with the + # same identifier `tools/lg_tools.py` uses for outbound fetches + # so every protoAgent egress presents a consistent, allowlisted + # UA. If you self-host behind a different edge, this is safe to + # keep. 
+ default_headers={ + "User-Agent": "protoAgent/0.1 (+https://github.com/protoLabsAI/protoAgent)", + }, ) From acf2bd6865cb3bd0126aecad902021a5e7e0508a Mon Sep 17 00:00:00 2001 From: Josh Mabry Date: Wed, 22 Apr 2026 19:51:00 -0700 Subject: [PATCH 08/24] fix(review): address PR #150 review feedback MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Critical — path traversal in preset loader (graph/config_io.py: read_soul_preset): Inputs like "../secret" escaped config/soul-presets/ and read arbitrary .md files anywhere on disk. Resolve both the preset root and the candidate path and require the latter live inside the former before reading. 7 parametrized tests cover the malicious inputs I could think of (../, ../../, absolute paths, bare "..", mid-path ../../). Major — YAML auth.token was non-functional for A2A bearer: register_a2a_routes captured _a2a_token at register time, so wizard-set tokens were ignored until process restart. Promoted _a2a_token to a module-level mutable holder (_A2A_TOKEN: list) that the closure reads on every request, added set_a2a_token() as the public mutator, and a new auth_token= arg to register_a2a_routes as the seed source (env still the fallback). server.py's reload path now calls set_a2a_token on every YAML change so the wizard → live bearer enforcement flow works with no restart — verified: fresh boot open → wizard token set → 401 on wrong token / 200 on right → drawer clears token → open again. Major — plist XML injection in autostart.py: Agent names containing <, >, & produced malformed plists (and could theoretically inject nodes). xml.sax.saxutils.escape() every interpolated string field before embedding. Major — install_autostart defaulted to port 7870 regardless of --port flag (autostart.py / server.py): Captured the active port in a module-level _active_port at _main() time and threaded it through both finish_setup's autostart sync and the drawer's toggle_autostart callback. 
The generated LaunchAgent now reboots on whatever port the operator launched with. Minor — chat_ui polish: * Numeric fields (max_tokens, max_iterations, worker_max_turns) fall back to sensible defaults (4096/50/20) instead of 0 when cleared — validate_config_dict rejects zero, so "or 0" blocked legitimate saves with a confusing validation error. * _sync_visibility no longer aliases the sidebar output slot to wizard_pane when the sidebar is absent; split into two closures with matching output arities so Gradio doesn't receive duplicate updates to the same component. * Legacy load_provider_choices handler guards get_current_provider existence — KeyError risk when a fork provides get_provider_choices alone. Nitpicks: * Remove unused _FIELD_MAP from config_io.py. * ASCII hyphen (U+002D) instead of en-dash (U+2013) in the temperature validation error. * Pin ruamel.yaml>=0.18 in Dockerfile to match requirements.txt. * Document the VOLUME anonymous-volume lifecycle and named-volume recommendation in the Dockerfile comment. Not addressed (deliberate): * CodeRabbit flagged test_list_gateway_models_http_error as expecting httpx.ConnectError to be caught by except httpx.HTTPError — false positive, ConnectError → NetworkError → TransportError → RequestError → HTTPError, test already passes. * "Reuse config_io.read_soul() in graph/prompts.py" — kept the inline check to avoid introducing an import dependency from prompts.py (loaded early, widely used) into config_io.py. * "Use tuple form for @pytest.parametrize" — stylistic; comma- separated string works identically. Test surface: 36 passing (7 new — the path-traversal parametrize set). 
Co-Authored-By: Claude Opus 4.7 (1M context) --- Dockerfile | 17 ++++- a2a_handler.py | 48 +++++++++--- autostart.py | 29 ++++--- chat_ui.py | 164 +++++++++++++++++++++++----------------- graph/config_io.py | 32 +++----- server.py | 48 +++++++++--- tests/test_config_io.py | 19 +++++ 7 files changed, 232 insertions(+), 125 deletions(-) diff --git a/Dockerfile b/Dockerfile index 58b8164..0b1fe77 100644 --- a/Dockerfile +++ b/Dockerfile @@ -26,7 +26,7 @@ RUN useradd -m -s /bin/bash -u ${SANDBOX_UID} sandbox # auth, add them here. The ddgs + beautifulsoup4 pair powers the # starter web_search / fetch_url tools; drop them if you strip those. RUN pip install --no-cache-dir \ - gradio httpx uvicorn langfuse prometheus-client pyyaml ruamel.yaml \ + gradio httpx uvicorn langfuse prometheus-client pyyaml 'ruamel.yaml>=0.18' \ langchain langchain-openai langgraph websockets \ ddgs beautifulsoup4 @@ -46,8 +46,19 @@ RUN chown -R sandbox:sandbox /opt/protoagent/config # Declare config as a volume so setup completion (``.setup-complete`` # marker + any YAML / SOUL.md edits) survives ``docker run`` without -# a -v flag. Operators who want cross-host persistence still mount a -# named volume or host directory at /opt/protoagent/config. +# a -v flag. +# +# Lifecycle note: without an explicit mount, Docker creates an +# ANONYMOUS volume on every ``docker run``. Those accumulate and the +# volume is NOT removed when the container is removed unless you pass +# ``--rm -v``. 
For long-lived deployments, use a named volume or a +# host mount so upgrades don't silently carry stale config forward: +# +# docker run -v my-agent-config:/opt/protoagent/config my-agent:latest +# +# or a bind mount: +# +# docker run -v /srv/my-agent/config:/opt/protoagent/config my-agent:latest VOLUME ["/opt/protoagent/config"] ENV PYTHONPATH=/opt/protoagent diff --git a/a2a_handler.py b/a2a_handler.py index 69b9520..7efecc7 100644 --- a/a2a_handler.py +++ b/a2a_handler.py @@ -919,6 +919,26 @@ def _check_auth(request: Request, api_key: str) -> None: # ── Route factory ───────────────────────────────────────────────────────────── +# Module-level mutable holder for the bearer token so hosts can +# update it at runtime without re-registering routes (e.g. when the +# setup wizard captures a token post-boot). ``register_a2a_routes`` +# seeds this from its ``auth_token`` argument (or ``A2A_AUTH_TOKEN`` +# env as fallback); ``set_a2a_token`` updates it live. Closures inside +# ``register_a2a_routes`` read ``_A2A_TOKEN[0]`` on every request, so +# a mutation is picked up by the next incoming call. +_A2A_TOKEN: list[str | None] = [None] + + +def set_a2a_token(token: str | None) -> None: + """Update the active A2A bearer token at runtime. + + Called by the host (e.g. ``server.py``) after the wizard / drawer + changes ``auth.token`` in the YAML — without this, bearer auth + captured at register time would stay stale until process restart. + """ + _A2A_TOKEN[0] = (token or "").strip() or None + + def register_a2a_routes( app: FastAPI, chat_stream_fn_factory: Callable[..., AsyncGenerator], @@ -926,29 +946,39 @@ def register_a2a_routes( api_key: str, agent_card: dict, register_card_route: bool = True, + auth_token: str = "", ) -> None: """Register all A2A routes on *app* and update *agent_card* capabilities. Host apps that already serve the agent card themselves (e.g. 
at multiple well-known paths for sdk compat) should pass ``register_card_route=False`` so FastAPI does not raise on a duplicate route registration. + + ``auth_token`` seeds the bearer-token check. When empty, falls + back to the ``A2A_AUTH_TOKEN`` env var. Hosts can update the + active token post-registration via ``set_a2a_token(...)`` (e.g. + after a wizard-driven config reload) without needing a restart. """ # ── Bearer token authentication ─────────────────────────────────────────── - _raw_a2a_token = os.environ.get("A2A_AUTH_TOKEN", "") - _a2a_token: str | None = _raw_a2a_token.strip() or None - if not _a2a_token: + # Seed order: explicit arg > env. Stored in the module-level holder + # so mutations propagate to the closure below. + seed = (auth_token or os.environ.get("A2A_AUTH_TOKEN", "") or "").strip() + _A2A_TOKEN[0] = seed or None + if _A2A_TOKEN[0] is None: logger.warning( "[a2a] A2A auth token not configured — endpoint is open" ) def _check_bearer_auth(request: Request) -> None: - """Validate Authorization: Bearer against A2A_AUTH_TOKEN. + """Validate Authorization: Bearer against the active + token. No-ops when unset. Raises HTTP 401 on missing/invalid. - No-ops when A2A_AUTH_TOKEN is unset (open mode). - Raises HTTP 401 on missing or invalid token. + Reads ``_A2A_TOKEN[0]`` on every call so runtime updates via + ``set_a2a_token`` are honored without route re-registration. 
""" - if not _a2a_token: + active = _A2A_TOKEN[0] + if not active: return auth_header = request.headers.get("Authorization", "") if not auth_header.startswith("Bearer "): @@ -957,7 +987,7 @@ def _check_bearer_auth(request: Request) -> None: detail="Unauthorized: expected 'Authorization: Bearer '", ) provided = auth_header[len("Bearer "):] - if not hmac.compare_digest(provided, _a2a_token): + if not hmac.compare_digest(provided, active): raise HTTPException(status_code=401, detail="Unauthorized: invalid bearer token") # ── Origin verification for SSE/streaming endpoints ─────────────────────── @@ -989,7 +1019,7 @@ def _check_origin(request: Request) -> None: agent_card.setdefault("capabilities", {}) agent_card["capabilities"]["streaming"] = True agent_card["capabilities"]["pushNotifications"] = True - if _a2a_token: + if _A2A_TOKEN[0]: agent_card.setdefault("securitySchemes", {}) agent_card["securitySchemes"]["bearer"] = { "type": "http", diff --git a/autostart.py b/autostart.py index 7f45b79..42eabe7 100644 --- a/autostart.py +++ b/autostart.py @@ -31,6 +31,7 @@ import subprocess import sys from pathlib import Path +from xml.sax.saxutils import escape as xml_escape REPO_ROOT = Path(__file__).parent.resolve() @@ -203,29 +204,35 @@ def _render_launchagent_plist( stdout_log: str, stderr_log: str, ) -> str: - """Render the plist XML. Small enough to inline; escaping is - limited to the known-safe fields we control, so no XML-injection - surface to audit here. + """Render the plist XML. + + Every interpolated string is XML-escaped because several fields + (``agent_name`` most notably) come from user input — a wizard + user who names their agent ``bad`` or ``me & co`` would + otherwise produce a malformed or injection-vulnerable plist. + ``port`` is an int so it's safe as-is, but we coerce+escape it + anyway for consistency. 
""" + e = xml_escape return f""" Label - {label} + {e(label)} ProgramArguments - {python} - {server_py} + {e(python)} + {e(server_py)} --port - {port} + {e(str(port))} WorkingDirectory - {working_dir} + {e(working_dir)} EnvironmentVariables AGENT_NAME - {agent_name} + {e(agent_name)} PATH /usr/local/bin:/usr/bin:/bin:/opt/homebrew/bin @@ -237,9 +244,9 @@ def _render_launchagent_plist( StandardOutPath - {stdout_log} + {e(stdout_log)} StandardErrorPath - {stderr_log} + {e(stderr_log)} """ diff --git a/chat_ui.py b/chat_ui.py index fe4e863..3974203 100644 --- a/chat_ui.py +++ b/chat_ui.py @@ -556,20 +556,25 @@ def _save( kb_db, kb_embed, kb_top_k, soul, ): + # Numeric fields fall back to sensible minimums + # rather than 0 when the user clears them — + # ``validate_config_dict`` rejects zero values so + # a blank field would otherwise block the save + # with a confusing validation error. new_config = { "model": { "api_base": api_base or "", "api_key": api_key or "", "name": model_name or "", "temperature": float(temperature), - "max_tokens": int(max_tokens or 0), - "max_iterations": int(max_iter or 0), + "max_tokens": int(max_tokens or 4096), + "max_iterations": int(max_iter or 50), }, "subagents": { "worker": { "enabled": bool(worker_enabled), "tools": list(worker_tools or []), - "max_turns": int(worker_max_turns or 0), + "max_turns": int(worker_max_turns or 20), }, }, "middleware": { @@ -655,7 +660,12 @@ def load_model(): if provider_dropdown is not None: def load_provider_choices(): choices = settings["get_provider_choices"]() - current = settings["get_current_provider"]() + # get_current_provider is optional — older + # forks provided only the choices list. + # Missing key must not raise KeyError; the + # dropdown simply renders with no preselect. 
+ current_fn = settings.get("get_current_provider") + current = current_fn() if current_fn else None return gr.update(choices=choices, value=current) def switch_provider(choice): @@ -784,28 +794,41 @@ def _load_wizard_defaults(): # a browser refresh. Without this, Gradio keeps serving # the initial visibility state from when the Blocks # were first rendered. - def _sync_visibility(): - if "is_setup_complete" not in settings: - return gr.update(), gr.update(), gr.update() - done = bool(settings["is_setup_complete"]()) - sidebar_upd = ( - gr.update(visible=done) - if sidebar_block is not None - else gr.update() - ) - return ( - gr.update(visible=not done), # wizard_pane - gr.update(visible=done), # chat_pane - sidebar_upd, + # Sync visibility on every page load. The output list is + # either two or three elements depending on whether the + # sidebar exists — we used to alias the sidebar slot to + # wizard_pane when missing, but that sent a duplicate + # gr.update to the same component, which Gradio treats + # as two competing writes to wizard_pane.visible. 
+ if sidebar_block is not None: + def _sync_visibility_with_sidebar(): + if "is_setup_complete" not in settings: + return gr.update(), gr.update(), gr.update() + done = bool(settings["is_setup_complete"]()) + return ( + gr.update(visible=not done), # wizard_pane + gr.update(visible=done), # chat_pane + gr.update(visible=done), # sidebar_block + ) + + app.load( + fn=_sync_visibility_with_sidebar, + outputs=[wizard_pane, chat_pane, sidebar_block], ) + else: + def _sync_visibility_no_sidebar(): + if "is_setup_complete" not in settings: + return gr.update(), gr.update() + done = bool(settings["is_setup_complete"]()) + return ( + gr.update(visible=not done), # wizard_pane + gr.update(visible=done), # chat_pane + ) - app.load( - fn=_sync_visibility, - outputs=[ - wizard_pane, chat_pane, - sidebar_block if sidebar_block is not None else wizard_pane, - ], - ) + app.load( + fn=_sync_visibility_no_sidebar, + outputs=[wizard_pane, chat_pane], + ) # Connection test — fills the model dropdown def _test_connection(api_base, api_key): @@ -844,7 +867,14 @@ def _load_preset(name): fn=_load_preset, inputs=[w_preset], outputs=[w_soul], ) - # Launch button — write everything, mark complete, swap panes + # Launch button — write everything, mark complete, then + # hard-reload the page. Toggling ``visible=`` on nested + # gr.Column + gr.Sidebar via gr.update is unreliable + # (children don't always re-mount); a full reload is the + # only bulletproof way to guarantee the chat pane appears. + # The reload re-enters _build() which reads + # is_setup_complete()==True and renders chat + drawer + # visible from scratch. 
def _finish_wizard( api_base, api_key, model_name, agent_name_val, soul, _preset_unused, @@ -852,20 +882,11 @@ def _finish_wizard( operator, auth_token, autostart, ): if not (api_base or "").strip(): - return ( - "⚠ API base URL is required — go back to step 1", - gr.update(), gr.update(), gr.update(), - ) + return "⚠ API base URL is required — go back to step 1" if not (model_name or "").strip(): - return ( - "⚠ pick a model — use the Test connection button in step 1", - gr.update(), gr.update(), gr.update(), - ) + return "⚠ pick a model — use the Test connection button in step 1" if not (agent_name_val or "").strip(): - return ( - "⚠ agent name is required — step 2", - gr.update(), gr.update(), gr.update(), - ) + return "⚠ agent name is required — step 2" new_config = { "model": { @@ -894,55 +915,58 @@ def _finish_wizard( try: ok, msg = settings["finish_setup"](new_config, soul or "") except Exception as e: - return ( - f"⚠ setup failed: {e}", - gr.update(), gr.update(), gr.update(), - ) + return f"⚠ setup failed: {e}" if ok: - return ( - f"✓ {msg}", - gr.update(visible=False), # wizard_pane - gr.update(visible=True), # chat_pane - gr.update(visible=True), # sidebar_block - ) - return ( - f"⚠ {msg}", - gr.update(), gr.update(), gr.update(), - ) + return f"✓ {msg} — reloading page…" + return f"⚠ {msg}" + # 1. Run the save. 2. On the client, if the status message + # starts with "✓", reload after a short beat so the user + # sees the success line. Any warning (⚠) keeps the wizard + # visible so they can correct and retry. 
w_launch_btn.click( fn=_finish_wizard, inputs=w_inputs, - outputs=[w_launch_status, wizard_pane, chat_pane, - sidebar_block if sidebar_block is not None else w_launch_status], + outputs=[w_launch_status], + ).then( + fn=None, + inputs=[w_launch_status], + outputs=None, + js=( + "(status) => {" + " if (typeof status === 'string' && status.startsWith('✓')) {" + " setTimeout(() => window.location.reload(), 1000);" + " }" + " return [];" + "}" + ), ) - # "Re-run setup" in the drawer flips panes back to wizard + # "Re-run setup" in the drawer — same reload-after-flip + # pattern for the reverse direction. if "restart_setup" in settings: def _trigger_rerun(): try: msg = settings["restart_setup"]() except Exception as e: - return ( - f"⚠ {e}", - gr.update(), gr.update(), gr.update(), - ) - return ( - f"✓ {msg}", - gr.update(visible=True), # wizard_pane - gr.update(visible=False), # chat_pane - gr.update(visible=False), # sidebar_block - ) + return f"⚠ {e}" + return f"✓ {msg} — reloading page…" reset_setup_btn.click( fn=_trigger_rerun, - outputs=[ - reset_setup_status, wizard_pane, chat_pane, - sidebar_block if sidebar_block is not None else reset_setup_status, - ], + outputs=[reset_setup_status], ).then( - fn=_load_wizard_defaults, - outputs=[*w_inputs, w_test_status, w_autostart_note], + fn=None, + inputs=[reset_setup_status], + outputs=None, + js=( + "(status) => {" + " if (typeof status === 'string' && status.startsWith('✓')) {" + " setTimeout(() => window.location.reload(), 800);" + " }" + " return [];" + "}" + ), ) return app diff --git a/graph/config_io.py b/graph/config_io.py index 8c39491..24a44ca 100644 --- a/graph/config_io.py +++ b/graph/config_io.py @@ -108,24 +108,6 @@ def save_yaml_doc(doc: Any, path: Path = CONFIG_YAML_PATH) -> None: # Config dict <-> dataclass # --------------------------------------------------------------------------- -# Nested dotted path → LangGraphConfig attribute. 
-_FIELD_MAP: dict[str, str] = { - "model.provider": "model_provider", - "model.name": "model_name", - "model.api_base": "api_base", - "model.api_key": "api_key", - "model.temperature": "temperature", - "model.max_tokens": "max_tokens", - "model.max_iterations": "max_iterations", - "middleware.knowledge": "knowledge_middleware", - "middleware.audit": "audit_middleware", - "middleware.memory": "memory_middleware", - "knowledge.db_path": "knowledge_db_path", - "knowledge.embed_model": "embed_model", - "knowledge.top_k": "knowledge_top_k", -} - - def config_to_dict(config: LangGraphConfig) -> dict[str, Any]: """Serialize a LangGraphConfig into the nested dict shape the UI works with. Mirrors the YAML schema so round-tripping is trivial. @@ -205,7 +187,7 @@ def validate_config_dict(updates: dict[str, Any]) -> tuple[bool, str]: model = updates.get("model", {}) temp = float(model.get("temperature", 0.2)) if not 0.0 <= temp <= 2.0: - return False, f"temperature must be 0.0–2.0, got {temp}" + return False, f"temperature must be 0.0-2.0, got {temp}" max_tokens = int(model.get("max_tokens", 4096)) if max_tokens < 1: return False, f"max_tokens must be >= 1, got {max_tokens}" @@ -401,8 +383,14 @@ def read_soul_preset(name: str) -> str: Returns empty string for an unknown name rather than raising — the wizard treats that as "no preset selected, blank canvas". + + Path-traversal guarded: the resolved target must live inside + ``PRESETS_DIR``. A name like ``"../secret"`` would otherwise + escape the presets directory and read arbitrary ``.md`` files + anywhere the process can reach. 
""" - path = PRESETS_DIR / f"{name}.md" - if not path.exists(): + presets_root = PRESETS_DIR.resolve() + candidate = (PRESETS_DIR / f"{name}.md").resolve() + if presets_root not in candidate.parents or not candidate.is_file(): return "" - return path.read_text(encoding="utf-8") + return candidate.read_text(encoding="utf-8") diff --git a/server.py b/server.py index ab52430..52ee6b4 100644 --- a/server.py +++ b/server.py @@ -55,6 +55,9 @@ _graph = None # LangGraph compiled graph _graph_config = None # LangGraphConfig _checkpointer = None # MemorySaver for session persistence +_active_port = 7870 # populated by _main() — the port this process is actually bound to. + # Read by the autostart installer so the LaunchAgent reboots + # on the same port the operator launched with, not the default. def _init_langgraph_agent(): @@ -122,6 +125,19 @@ def _reload_langgraph_agent() -> tuple[bool, str]: _graph_config = new_config + # Keep A2A bearer-auth state aligned with YAML on every reload. + # ``a2a_handler.set_a2a_token`` mutates the module-level holder the + # bearer-check closure reads, so wizard/drawer updates take effect + # on the next incoming request without a route re-register. + try: + from a2a_handler import set_a2a_token + + set_a2a_token(new_config.auth_token or None) + except ImportError: + # a2a_handler not yet imported (e.g. during early-boot reload + # before _main wires routes) — harmless. + pass + if not is_setup_complete(): _graph = None log.info("[reload] setup not complete — config reloaded, graph not compiled") @@ -282,7 +298,11 @@ def finish_setup(config: dict | None, soul: str | None) -> tuple[bool, str]: or "protoagent" ) if want_autostart: - ok_as, msg_as = install_autostart(agent_name=as_name) + # Pass the port this process is actually bound to so the + # LaunchAgent reboots on the right port, not the 7870 + # default. Operators frequently pick a custom port when + # another agent is already on 7870. 
+ ok_as, msg_as = install_autostart(agent_name=as_name, port=_active_port) else: ok_as, msg_as = uninstall_autostart(agent_name=as_name) messages.append(f"autostart: {msg_as}") @@ -325,7 +345,7 @@ def toggle_autostart(enabled: bool) -> tuple[bool, str]: name = (_graph_config.identity_name if _graph_config else "") or "protoagent" if enabled: - return install_autostart(agent_name=name) + return install_autostart(agent_name=name, port=_active_port) return uninstall_autostart(agent_name=name) except Exception as e: return False, str(e) @@ -639,10 +659,13 @@ def _build_agent_card(host: str) -> dict: # --------------------------------------------------------------------------- def _main(): + global _active_port + parser = argparse.ArgumentParser(description=f"{AGENT_NAME_ENV} — protoAgent server") parser.add_argument("--port", type=int, default=7870) parser.add_argument("--config", type=str, default=None) args = parser.parse_args() + _active_port = args.port # Initialize observability import tracing @@ -811,19 +834,24 @@ async def _a2a_agent_card(request: Request): # JSON-RPC + REST, streaming, polling, cancel, push webhooks. from a2a_handler import register_a2a_routes - # A2A bearer token: YAML ``auth.token`` wins if set, else falls back - # to the legacy ``_API_KEY`` env var — so the wizard can set - # auth without an env restart. + # Two independent A2A auth surfaces: + # + # 1. **Bearer** (modern) — ``auth.token`` in YAML, captured by the + # wizard as "A2A bearer token". Passed via the ``auth_token`` + # argument, with ``A2A_AUTH_TOKEN`` env as fallback. Updates + # from a wizard/drawer-driven reload propagate live through + # ``a2a_handler.set_a2a_token`` — no restart needed. + # 2. **X-API-Key** (legacy) — ``_API_KEY`` env var, threaded + # through the ``api_key`` argument. Kept env-driven; forks that + # want it YAML-configurable can add a field later. 
+ yaml_bearer = _graph_config.auth_token if _graph_config else "" auth_env = f"{AGENT_NAME_ENV.upper()}_API_KEY" - auth_key = ( - (_graph_config.auth_token if _graph_config else "") - or os.environ.get(auth_env, "") - ) register_a2a_routes( app=fastapi_app, chat_stream_fn_factory=_chat_langgraph_stream, chat_fn=chat, - api_key=auth_key, + api_key=os.environ.get(auth_env, ""), + auth_token=yaml_bearer, agent_card={}, register_card_route=False, # card is already served above ) diff --git a/tests/test_config_io.py b/tests/test_config_io.py index bb72f8a..25a7472 100644 --- a/tests/test_config_io.py +++ b/tests/test_config_io.py @@ -410,6 +410,25 @@ def test_read_soul_preset_unknown_returns_empty(): assert read_soul_preset("") == "" +@pytest.mark.parametrize("malicious", [ + "../secret", + "../../etc/passwd", + "../../../etc/passwd", + "subdir/../../../outside", + "/etc/hosts", + "..", + "../../graph/config", # try to read a real repo file via ../../ +]) +def test_read_soul_preset_rejects_path_traversal(malicious): + """CRITICAL: the preset name must not let a caller escape + ``config/soul-presets/``. 
Every ``..`` or absolute path + should return empty string, not read an arbitrary .md file + elsewhere on disk.""" + from graph.config_io import read_soul_preset + + assert read_soul_preset(malicious) == "" + + def test_list_soul_presets_missing_dir_returns_empty(monkeypatch, tmp_path): """If a fork accidentally deletes the presets dir, the wizard should render an empty dropdown, not crash.""" From 6f708f854fc9e279d28461651c5783f71ec328a6 Mon Sep 17 00:00:00 2001 From: Josh Mabry Date: Wed, 22 Apr 2026 20:05:26 -0700 Subject: [PATCH 09/24] fix(review-2): address round-2 PR #150 feedback MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Major — atomic graph reload (server.py::_reload_langgraph_agent): Previously swapped _graph_config + set_a2a_token BEFORE calling create_agent_graph, so a failed build would leave the running _graph pinned to the OLD agent but reporting the NEW config and rotated bearer token. Now build first; commit config/auth/graph state only on success. Major — rollback marker on failed first-run reload (finish_setup): mark_setup_complete fires before the reload so the graph compiles. If the reload fails, the marker stays and the next page load drops the user into chat with _graph=None and no obvious recovery path. finish_setup now reset_setup()s on reload failure, so the wizard returns for a retry. Major — sanitize agent_name for plist path (autostart.py::_macos_label): Prior sanitization only lowercased + replaced spaces. `/` and `..` survived, so an agent name with path-traversal chars could target arbitrary paths relative to ~/Library/LaunchAgents/ on install / status / uninstall. Strip input to [a-z0-9_.-] and fall back to "protoagent" when the result is empty/dots-only. Verified resolved plist path stays inside LaunchAgents/ for every path-traversal payload I could think of. Major — gateway api_key off query string (POST /api/config/models): GET with ?api_key=... 
leaks credentials into browser history, reverse-proxy access logs, and uvicorn's own request log. Switched to POST taking a small ModelsProbeRequest body. Empty body still falls back to stored config so the drawer's initial render works. Major — round-trip identity/security/autostart through the drawer (chat_ui.py + server.py): Drawer previously only edited model/worker/middleware/knowledge/ SOUL, leaving the wizard's agent name, operator, bearer token, and autostart flag with no post-setup edit path. Added three new accordion sections (Identity / Security / Autostart), wired them into _config_components, _load_all, and _save. Extracted _sync_autostart_with_config so the wizard's finish_setup and the drawer's save_all both drive the LaunchAgent install/ uninstall from the same code path — flipping the drawer's Autostart checkbox + Save & Reload now installs/removes the plist the same way the wizard does. Verified end-to-end on product-director: * Fresh clone → wizard → "pd-renamed" via drawer → agent card says pd-renamed, old bearer token → 401, new bearer token → 200 * Invalid temperature (99) → rejected at validation; YAML + marker untouched * Path traversal via /api/config/presets/../secret → 404 Tests: 36 passing (no new cases — existing coverage was already sufficient for these fixes). 
Co-Authored-By: Claude Opus 4.7 (1M context) --- autostart.py | 20 ++++++-- chat_ui.py | 52 +++++++++++++++++++ server.py | 137 ++++++++++++++++++++++++++++++++++----------------- 3 files changed, 160 insertions(+), 49 deletions(-) diff --git a/autostart.py b/autostart.py index 42eabe7..65ce5f8 100644 --- a/autostart.py +++ b/autostart.py @@ -27,6 +27,7 @@ from __future__ import annotations import platform +import re import shlex import subprocess import sys @@ -113,10 +114,23 @@ def autostart_status(agent_name: str = "protoagent") -> dict: # --------------------------------------------------------------------------- +_SAFE_LABEL_RE = re.compile(r"[^a-z0-9_.-]+") + + def _macos_label(agent_name: str) -> str: - """Plist label — namespaced so it doesn't collide with system labels.""" - safe = agent_name.lower().replace(" ", "-") - return f"ai.protolabs.{safe}" + """Plist label — namespaced so it doesn't collide with system labels. + + Sanitizes the input: only ``[a-z0-9_.-]`` survive. Leading / trailing + dots and hyphens are stripped so the resulting filename can't be + a hidden file or look like a path-segment. Path-traversal + characters like ``/`` and ``..`` are filtered here rather than at + the filesystem layer so ``install_autostart(agent_name="../../x")`` + can't escape ``~/Library/LaunchAgents/``. 
+ """ + sanitized = _SAFE_LABEL_RE.sub("-", agent_name.lower()).strip("-.") + if not sanitized: + sanitized = "protoagent" + return f"ai.protolabs.{sanitized}" def _macos_plist_path(agent_name: str) -> Path: diff --git a/chat_ui.py b/chat_ui.py index 3974203..ba832e3 100644 --- a/chat_ui.py +++ b/chat_ui.py @@ -447,6 +447,37 @@ def _build() -> gr.Blocks: minimum=1, interactive=True, ) + with gr.Accordion("Identity", open=False): + identity_name_in = gr.Textbox( + label="Agent name", + placeholder="short lowercase slug", + interactive=True, + ) + identity_operator_in = gr.Textbox( + label="Your name (operator)", + placeholder="injected into system prompt when set", + interactive=True, + ) + + with gr.Accordion("Security — A2A bearer token", open=False): + auth_token_in = gr.Textbox( + label="Bearer token", + type="password", + placeholder="blank → open mode; set to require Authorization: Bearer ", + interactive=True, + ) + gr.Markdown( + "_Live-reloadable. Save & Reload flips A2A " + "enforcement on or off immediately; no restart._" + ) + + with gr.Accordion("Autostart on login", open=False): + autostart_in = gr.Checkbox( + label="Launch this agent automatically on login", + interactive=True, + ) + autostart_drawer_status = gr.Markdown("") + with gr.Accordion("Persona (SOUL.md)", open=False): soul_in = gr.Textbox( label="SOUL.md", lines=16, show_label=False, @@ -487,6 +518,8 @@ def _build() -> gr.Blocks: worker_enabled_in, worker_tools_in, worker_max_turns_in, mw_knowledge_in, mw_audit_in, mw_memory_in, kb_db_in, kb_embed_in, kb_top_k_in, + identity_name_in, identity_operator_in, + auth_token_in, autostart_in, soul_in, ] @@ -517,6 +550,9 @@ def _load_all(): ) worker = cfg["subagents"]["worker"] + identity = cfg.get("identity", {}) + auth = cfg.get("auth", {}) + runtime = cfg.get("runtime", {}) return ( cfg["model"]["api_base"], cfg["model"]["api_key"], @@ -533,6 +569,10 @@ def _load_all(): cfg["knowledge"]["db_path"], cfg["knowledge"]["embed_model"], 
cfg["knowledge"]["top_k"], + identity.get("name", ""), + identity.get("operator", ""), + auth.get("token", ""), + bool(runtime.get("autostart_on_boot", False)), soul, fetch_msg, ) @@ -554,6 +594,8 @@ def _save( worker_enabled, worker_tools, worker_max_turns, mw_knowledge, mw_audit, mw_memory, kb_db, kb_embed, kb_top_k, + identity_name, identity_operator, + auth_token, autostart_on, soul, ): # Numeric fields fall back to sensible minimums @@ -587,6 +629,16 @@ def _save( "embed_model": kb_embed or "", "top_k": int(kb_top_k or 1), }, + "identity": { + "name": (identity_name or "").strip() or "protoagent", + "operator": (identity_operator or "").strip(), + }, + "auth": { + "token": auth_token or "", + }, + "runtime": { + "autostart_on_boot": bool(autostart_on), + }, } try: ok, msg = settings["save_all"](new_config, soul or "") diff --git a/server.py b/server.py index 52ee6b4..75f9692 100644 --- a/server.py +++ b/server.py @@ -120,15 +120,26 @@ def _reload_langgraph_agent() -> tuple[bool, str]: try: new_config = LangGraphConfig.from_yaml(config_path) except Exception as e: - log.exception("[reload] config load failed: %s", e) + log.exception("[reload] config load failed") return False, f"config load failed: {e}" - _graph_config = new_config + # Build the graph FIRST (when setup is complete) — only commit + # runtime state after the rebuild succeeds. Doing the swap first + # would leave the process serving the prior compiled _graph under + # fresh _graph_config + rotated bearer auth on failure — the + # metrics / card / auth all de-sync from what's actually running. + if is_setup_complete(): + try: + new_graph = create_agent_graph(new_config) + except Exception as e: + log.exception("[reload] graph rebuild failed") + return False, f"graph rebuild failed: {e}" + else: + new_graph = None - # Keep A2A bearer-auth state aligned with YAML on every reload. 
- # ``a2a_handler.set_a2a_token`` mutates the module-level holder the - # bearer-check closure reads, so wizard/drawer updates take effect - # on the next incoming request without a route re-register. + # Commit: config → A2A bearer → graph. All three reference the + # same ``new_config`` so they stay consistent. + _graph_config = new_config try: from a2a_handler import set_a2a_token @@ -137,21 +148,48 @@ def _reload_langgraph_agent() -> tuple[bool, str]: # a2a_handler not yet imported (e.g. during early-boot reload # before _main wires routes) — harmless. pass + _graph = new_graph - if not is_setup_complete(): - _graph = None + if new_graph is None: log.info("[reload] setup not complete — config reloaded, graph not compiled") return True, "config reloaded • setup not complete" + log.info("LangGraph agent reloaded (model: %s)", _graph_config.model_name) + return True, f"reloaded • model={_graph_config.model_name}" + + +def _sync_autostart_with_config(config: dict | None) -> str | None: + """Align the OS autostart artifact with the YAML runtime flag. + + Returns a short status string to append to the caller's message + log, or ``None`` when the config doesn't touch the runtime + section. Shared by ``finish_setup`` (wizard path) and + ``_apply_settings_changes`` (drawer path) so both surfaces + produce the same side effect when the checkbox flips. 
+ """ + if not (config and "runtime" in config): + return None + want = bool(config.get("runtime", {}).get("autostart_on_boot", False)) + try: - new_graph = create_agent_graph(new_config) + from autostart import install_autostart, uninstall_autostart + + as_name = ( + config.get("identity", {}).get("name") + or (_graph_config.identity_name if _graph_config else "") + or "protoagent" + ) + if want: + ok, msg = install_autostart(agent_name=as_name, port=_active_port) + else: + ok, msg = uninstall_autostart(agent_name=as_name) except Exception as e: - log.exception("[reload] graph rebuild failed: %s", e) - return False, f"graph rebuild failed: {e}" + log.exception("[autostart] sync raised") + return f"autostart failed: {e}" - _graph = new_graph - log.info("LangGraph agent reloaded (model: %s)", _graph_config.model_name) - return True, f"reloaded • model={_graph_config.model_name}" + if not ok: + log.warning("[autostart] sync failed: %s", msg) + return f"autostart: {msg}" def _apply_settings_changes( @@ -184,7 +222,7 @@ def _apply_settings_changes( save_yaml_doc(doc) messages.append("config saved") except Exception as e: - log.exception("[config] YAML write failed: %s", e) + log.exception("[config] YAML write failed") return False, [f"config write: {e}"] if soul is not None: @@ -192,9 +230,16 @@ def _apply_settings_changes( paths = write_soul(soul) messages.append(f"SOUL saved ({len(paths)} path{'s' if len(paths) != 1 else ''})") except Exception as e: - log.exception("[config] SOUL write failed: %s", e) + log.exception("[config] SOUL write failed") return False, [f"soul write: {e}"] + # Drawer toggles of runtime.autostart_on_boot ride this path, + # not the wizard's finish_setup, so the LaunchAgent plist has + # to be installed/removed here too. 
+ as_msg = _sync_autostart_with_config(config) + if as_msg: + messages.append(as_msg) + ok, reload_msg = _reload_langgraph_agent() messages.append(reload_msg) return ok, messages @@ -286,35 +331,21 @@ def finish_setup(config: dict | None, soul: str | None) -> tuple[bool, str]: mark_setup_complete() messages.append("setup marked complete") - # 3. Autostart sync - if config and "runtime" in config: - want_autostart = bool(config.get("runtime", {}).get("autostart_on_boot", False)) - try: - from autostart import install_autostart, uninstall_autostart - - as_name = ( - config.get("identity", {}).get("name") - or _graph_config.identity_name - or "protoagent" - ) - if want_autostart: - # Pass the port this process is actually bound to so the - # LaunchAgent reboots on the right port, not the 7870 - # default. Operators frequently pick a custom port when - # another agent is already on 7870. - ok_as, msg_as = install_autostart(agent_name=as_name, port=_active_port) - else: - ok_as, msg_as = uninstall_autostart(agent_name=as_name) - messages.append(f"autostart: {msg_as}") - if not ok_as: - log.warning("[setup] autostart sync failed: %s", msg_as) - except Exception as e: - log.exception("[setup] autostart sync raised: %s", e) - messages.append(f"autostart failed: {e}") + # 3. Autostart sync (shared helper — drawer path runs the same) + as_msg = _sync_autostart_with_config(config) + if as_msg: + messages.append(as_msg) - # 4. Reload — now picks up setup_complete=True and compiles + # 4. Reload — now picks up setup_complete=True and compiles. + # On failure, roll back the marker so the next page load + # drops the user back into the wizard instead of landing + # them in the chat UI with the "setup required" fallback + # and no obvious way to retry. 
ok, reload_msg = _reload_langgraph_agent() messages.append(reload_msg) + if not ok: + reset_setup() + messages.append("setup marker rolled back — re-run the wizard after fixing the error above") return ok, " • ".join(messages) @@ -728,11 +759,25 @@ async def _api_post_config(req: ConfigReloadRequest): ok, messages = _apply_settings_changes(config=req.config, soul=req.soul) return {"ok": ok, "messages": messages} - @fastapi_app.get("/api/config/models") - async def _api_list_models(api_base: str = "", api_key: str = ""): + class ModelsProbeRequest(PydanticBaseModel): + api_base: str = "" + api_key: str = "" + + @fastapi_app.post("/api/config/models") + async def _api_list_models(req: ModelsProbeRequest | None = None): + """Fetch the gateway's model list. + + POST (body) not GET (query) so the caller's API key doesn't + end up in browser history, reverse-proxy access logs, or the + uvicorn request log. A blank body falls back to whatever key + and base are stored in the current config — useful for the + drawer's initial render where there's nothing to POST yet. 
+ """ from graph.config_io import list_gateway_models - base = api_base or (_graph_config.api_base if _graph_config else "") - key = api_key or (_graph_config.api_key if _graph_config else "") + + body = req or ModelsProbeRequest() + base = body.api_base or (_graph_config.api_base if _graph_config else "") + key = body.api_key or (_graph_config.api_key if _graph_config else "") models, error = list_gateway_models(base, key) return {"models": models, "error": error} From 433b44e02b9ea9037304853b503bca1e18032801 Mon Sep 17 00:00:00 2001 From: Josh Mabry Date: Mon, 27 Apr 2026 14:30:32 -0700 Subject: [PATCH 10/24] feat: ship default knowledge store + side-effect-verified eval harness MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The template now ships a working memory loop and end-to-end eval suite on day one so forks have a green baseline before they touch a single line of code. Closes #154. What lands: - knowledge/store.py — sqlite + FTS5 (LIKE fallback). One ``chunks`` table backs operator notes (memory_ingest), daily-log entries (daily_log), and conversation findings extracted by MemoryMiddleware (domain='finding'). Path resolves env > config > default with an automatic ~/.protoagent/ fallback when /sandbox isn't writable. - tools/lg_tools.py — five new memory tools (memory_ingest, memory_recall, memory_list, memory_stats, daily_log) bound to the store via a closure factory so tests get a fresh store per run. ``echo`` removed; ``get_all_tools(knowledge_store)`` actually uses its parameter now. - server.py — _build_knowledge_store() constructs the store and threads it through both initial init and the drawer reload path. Defaults flipped: knowledge_middleware + memory_middleware now ON by default (config/langgraph-config.yaml + graph/config.py). 
- evals/ — A2A client + runner + verify helpers + 15 starter cases (tasks.json) covering agent card discovery, bearer auth gating, abstention, every shipped tool, KB recall, a chained two-tool case, and KnowledgeMiddleware injection. Side-effect-verified: audit log + reply text + KB chunks all checked independently so hallucinated tool results get caught. - docs/guides/evals.md — full how-to. README/TEMPLATE/configuration/ starter-tools/first-agent updated to reflect the new defaults and the additional five memory tools. Co-Authored-By: Claude Opus 4.7 (1M context) --- README.md | 4 +- TEMPLATE.md | 21 +- config/langgraph-config.yaml | 20 +- docs/guides/customize-and-deploy.md | 2 +- docs/guides/evals.md | 151 +++++++++ docs/guides/fork-the-template.md | 2 +- docs/guides/index.md | 1 + docs/guides/subagents.md | 15 +- docs/reference/configuration.md | 25 +- docs/reference/starter-tools.md | 71 ++++- docs/tutorials/first-agent.md | 2 +- docs/tutorials/first-tool.md | 3 +- evals/README.md | 100 ++++++ evals/__init__.py | 0 evals/client.py | 262 ++++++++++++++++ evals/results/.gitkeep | 0 evals/runner.py | 307 +++++++++++++++++++ evals/tasks.json | 186 ++++++++++++ evals/verify.py | 176 +++++++++++ graph/config.py | 16 +- graph/subagents/config.py | 6 +- knowledge/__init__.py | 12 + knowledge/store.py | 456 ++++++++++++++++++++++++++++ server.py | 37 ++- tests/test_config_io.py | 8 +- tests/test_skill_curator.py | 2 +- tests/test_skill_emission.py | 20 +- tests/test_starter_tools.py | 10 - tools/lg_tools.py | 150 +++++++-- 29 files changed, 1976 insertions(+), 89 deletions(-) create mode 100644 docs/guides/evals.md create mode 100644 evals/README.md create mode 100644 evals/__init__.py create mode 100644 evals/client.py create mode 100644 evals/results/.gitkeep create mode 100644 evals/runner.py create mode 100644 evals/tasks.json create mode 100644 evals/verify.py create mode 100644 knowledge/__init__.py create mode 100644 knowledge/store.py diff --git a/README.md 
b/README.md index 1a1b34c..ef54b84 100644 --- a/README.md +++ b/README.md @@ -30,7 +30,9 @@ rename / release-pipeline wiring. | Agent runtime | `graph/agent.py`, `server.py` | LangGraph `create_agent()` wired to the A2A handler, with streaming token capture for cost-v1 | | LLM gateway | `graph/llm.py` | OpenAI-compatible client pointed at LiteLLM — swap models by editing the gateway config, not the fork | | Subagents | `graph/subagents/config.py` | DeerFlow-pattern delegation via a `task()` tool; one placeholder `worker` ships | -| Starter tools | `tools/lg_tools.py` | Free, keyless tools so a fresh fork can demo real behaviour: `echo`, `current_time`, `calculator` (safe AST eval), `web_search` (DuckDuckGo), `fetch_url` | +| Starter tools | `tools/lg_tools.py` | Keyless general tools (`current_time`, `calculator` safe AST eval, `web_search` via DuckDuckGo, `fetch_url`) plus memory tools (`memory_ingest`, `memory_recall`, `memory_list`, `memory_stats`, `daily_log`) bound to the bundled store | +| Knowledge store | `knowledge/store.py` | sqlite + FTS5 (LIKE fallback). One `chunks` table for operator notes, daily-log entries, and conversation findings. Default-on; turn off with `middleware.knowledge: false` | +| Eval harness | `evals/` | Side-effect-verified A2A test harness — audit log + reply text + KB state. `python -m evals.runner` against a running agent. See [Eval your fork](./docs/guides/evals.md) | | Tracing | `tracing.py` | Langfuse trace_session with distributed `a2a.trace` propagation and the OTel cross-context-detach filter | | Observability | `metrics.py`, `audit.py` | Prometheus metrics with per-agent prefix, JSONL audit log with trace IDs | | Output protocol | `graph/output_format.py` | `` / `` parsing so the model can think without it leaking to users | diff --git a/TEMPLATE.md b/TEMPLATE.md index 31b1eb2..08ff5b3 100644 --- a/TEMPLATE.md +++ b/TEMPLATE.md @@ -72,10 +72,10 @@ handler's output extraction depends on it. ## 4. 
Add your real tools `tools/lg_tools.py` ships with a small keyless starter set so a -fresh clone can demonstrate a real research loop: `echo`, -`current_time`, `calculator` (safe AST eval — no `eval()`), -`web_search` (DuckDuckGo via `ddgs`), and `fetch_url`. Keep the -ones you want, drop the rest, and add your own: +fresh clone can demonstrate a real research loop: `current_time`, +`calculator` (safe AST eval — no `eval()`), `web_search` (DuckDuckGo +via `ddgs`), and `fetch_url`. Keep the ones you want, drop the rest, +and add your own: ```python from langchain_core.tools import tool @@ -167,6 +167,19 @@ your fork. A useful pattern: - Extend `tests/test_a2a_integration.py` with assertions for your declared skills + extensions on the agent card +For end-to-end behaviour testing — "when the operator asks X, does +the right tool actually fire and the right row land in the KB?" — +the template ships an eval harness under `evals/`: + +```bash +python -m evals.runner # against a running agent +python -m evals.runner --category tool +``` + +See [Eval your fork](./docs/guides/evals.md) for what each case +asserts, how the three assertion channels work, and how to add +cases for your fork's new tools. + ## 9a. Understand the skill loop protoAgent's skill loop lets your agent learn from experience automatically. diff --git a/config/langgraph-config.yaml b/config/langgraph-config.yaml index c3f53e8..05bada2 100644 --- a/config/langgraph-config.yaml +++ b/config/langgraph-config.yaml @@ -22,14 +22,24 @@ model: subagents: worker: enabled: true - tools: [echo, current_time, calculator, web_search, fetch_url] + tools: + - current_time + - calculator + - web_search + - fetch_url + - memory_ingest + - memory_recall + - memory_list + - memory_stats + - daily_log max_turns: 20 middleware: - # The knowledge middleware requires a knowledge store. Leave false - # until you add one. 
Memory persistence is enabled by default and - # writes session summaries to /sandbox/memory/ without a store. - knowledge: false + # All three middlewares default ON. The knowledge middleware needs a + # store; the template constructs one automatically (see + # ``server.py::_build_knowledge_store``). Set ``knowledge: false`` if + # your fork is purely stateless. + knowledge: true audit: true memory: true diff --git a/docs/guides/customize-and-deploy.md b/docs/guides/customize-and-deploy.md index 17c24d9..81fdeec 100644 --- a/docs/guides/customize-and-deploy.md +++ b/docs/guides/customize-and-deploy.md @@ -66,7 +66,7 @@ Replace with the skills your agent actually advertises over A2A. The `name` and ## 5. (Optional) Add domain tools -`tools/lg_tools.py` ships with `echo`, `current_time`, `calculator`, `web_search`, `fetch_url`. Keep the ones you want, drop the rest, add your own. Update `get_all_tools()` at the bottom. Any tool returned from there becomes a checkbox in the wizard and drawer automatically. +`tools/lg_tools.py` ships with `current_time`, `calculator`, `web_search`, `fetch_url`. Keep the ones you want, drop the rest, add your own. Update `get_all_tools()` at the bottom. Any tool returned from there becomes a checkbox in the wizard and drawer automatically. ## 6. (Optional) Configure subagents diff --git a/docs/guides/evals.md b/docs/guides/evals.md new file mode 100644 index 0000000..38e1e33 --- /dev/null +++ b/docs/guides/evals.md @@ -0,0 +1,151 @@ +# Eval your fork + +The template ships an eval harness under `evals/` so a fresh fork has +a working test suite for its tools, memory, and A2A protocol surface +on day one. Cases assert across three independent channels — audit +log, reply text, and knowledge-store side effects — so a model that +hallucinates a tool result still gets caught. + +## When to read this + +- You forked the template and want a baseline pass-rate before you + ship. 
+- You added a new tool and want to lock in its intent — "when the + operator says X, fire tool Y". +- You changed a prompt or model and want to measure regression. + +## Run the suite + +```bash +# Agent running at $EVAL_BASE_URL (default http://localhost:7870) +# with the relevant auth env (A2A_AUTH_TOKEN and/or <AGENT_NAME>_API_KEY). + +python -m evals.runner +python -m evals.runner --category tool +python -m evals.runner --tasks current_time_intent,daily_log_intent +``` + +Reports land in `evals/results/run-<timestamp>.json`. The CLI prints a +pass/fail board; the JSON report carries reply previews and timing +for post-hoc inspection. + +## The three assertion channels + +``` +prompt → A2A → audit log (1) tools fired with expected outcome + → reply text (2) substrings present in reply + → KB chunks table (3) side effects landed correctly +``` + +A case passes only when every configured assertion holds. Most cases +should opt in to channels 1 and 3 — text patterns alone are brittle +to model paraphrasing and miss hallucinated tool results entirely. + +### Why side-effect verification beats text-only + +A model can produce "Logged: ..." in its reply without actually +calling `daily_log`. Substring matching passes, the DB stays empty, +and the bug ships. Reading `audit.jsonl` and the `chunks` table +afterward catches it. + +## The shape of a case + +```json +{ + "id": "unique-id", + "category": "tool", + "kind": "ask", + "name": "Asks for arithmetic → calculator", + "prompt": "How much is 17 times 23, plus 1?", + "expected_tools": ["calculator"], + "expected_patterns": ["392"], + "verify_kb": { + "find_chunk_containing": "EVAL-MARK-XYZ", + "domain": "context" + }, + "setup": [{"kb_ingest": {"content": "...", "domain": "...", "heading": "..."}}], + "teardown": [{"kb_delete_by_content": {"contains": "..."}}] +} +``` + +Three case `kind`s ship: + +- `agent_card` — fetch `/.well-known/agent-card.json` and assert on + the card's name, skill count, and declared extensions.
+- `auth_check` — send a request with a deliberately bad bearer and + assert the server returns the expected status (401 by default). +- `ask` — the main shape. Sends `prompt`, then asserts on tool firing, + reply patterns, and KB state. + +## Prompt rule + +**The tool name never appears in the prompt.** Every prompt must be +plausibly typed by a real user. "Use `daily_log` to record..." tests +instruction-following, not tool selection. If the agent needs to +infer the tool from intent, that *is* the test. + +## Setup and teardown — start clean every time + +Each `ask` case can pre-seed state via `setup` blocks (BFCL's +`initial_config` pattern: direct DB writes the model never sees) and +clean up after itself with `teardown`. The fixture is invisible to +the agent — it discovers the seeded state via tools, exactly as a +real user would. + +`teardown` runs even when assertions fail, so case order doesn't +matter and a noisy failure doesn't poison the next run. + +Supported setup/teardown step kinds (extend `evals/verify.py` to add +more): + +| Step kind | Args | What it does | +|---|---|---| +| `kb_ingest` | `content`, `domain`, `heading?` | Insert a chunk | +| `kb_delete_by_content` | `contains` | Delete chunks where content LIKE `%contains%` | +| `kb_delete_by_heading` | `domain`, `heading` | Delete chunks matching (domain, heading) | + +## What forks should test by default + +The starter `tasks.json` covers: + +- Agent card discovery (name, skill count, `cost-v1` extension) +- Bearer auth gating +- Each shipped tool fires from a plausible operator prompt +- Memory ingest → recall round-trip +- KB-driven middleware injection (no tool call needed) +- A chained two-tool case (`daily_log` then `memory_recall`) + +When you add a tool, add at least one case for it. When you add a +skill to the agent card, extend the `card_discovery` case to assert +the new skill is advertised. 
+ +## Running in CI + +The runner exits non-zero when any case fails, so it drops in cleanly: + +```yaml +- name: Boot agent + run: docker compose up -d agent + +- name: Wait for /health + run: ./scripts/wait-for-it.sh http://localhost:7870/.well-known/agent-card.json + +- name: Run evals + run: python -m evals.runner + env: + EVAL_BASE_URL: http://localhost:7870 + A2A_AUTH_TOKEN: ${{ secrets.AGENT_BEARER }} +``` + +For non-deterministic categories (any `tool` or `chained` case), aim +for an N-of-M majority threshold rather than 100% — the reference +implementation runs 3 attempts and gates at 2 passes for those +categories. Deterministic ones (`a2a-protocol`, `subsystem` with +seeded state) gate at 100%. + +## References + +- [`evals/README.md`](https://github.com/protoLabsAI/protoAgent/blob/main/evals/README.md) — quick reference for case authors +- Anthropic — [Demystifying evals for AI agents](https://www.anthropic.com/engineering/demystifying-evals-for-ai-agents) +- BFCL V3 — [Multi-Turn](https://gorilla.cs.berkeley.edu/blogs/13_bfcl_v3_multi_turn.html) +- [ToolSandbox](https://arxiv.org/html/2408.04682v1) — user simulator + milestones / minefields diff --git a/docs/guides/fork-the-template.md b/docs/guides/fork-the-template.md index eacbc92..d5472e4 100644 --- a/docs/guides/fork-the-template.md +++ b/docs/guides/fork-the-template.md @@ -43,7 +43,7 @@ Keep the `` / `` protocol block in `prompts.py` — the A2A ## 4. Replace the starter tools -`tools/lg_tools.py` ships with `echo`, `current_time`, `calculator`, `web_search`, `fetch_url`. Keep what you want, drop the rest, add your own. Update `get_all_tools()` at the bottom of the file. +`tools/lg_tools.py` ships with `current_time`, `calculator`, `web_search`, `fetch_url`. Keep what you want, drop the rest, add your own. Update `get_all_tools()` at the bottom of the file. See the [starter tools reference](/reference/starter-tools) for the shapes of the shipped ones. 
diff --git a/docs/guides/index.md b/docs/guides/index.md index 3e49012..65dc41e 100644 --- a/docs/guides/index.md +++ b/docs/guides/index.md @@ -9,4 +9,5 @@ Task-oriented procedures. Assumes you already have a running agent (see [Tutoria | [Add a custom skill](/guides/add-a-skill) | Your agent does new things and callers need to dispatch to them | | [Configure subagents](/guides/subagents) | You want specialized delegates beyond the placeholder `worker` | | [Wire Langfuse + Prometheus](/guides/observability) | You need traces and metrics in production | +| [Eval your fork](/guides/evals) | You want a baseline pass-rate for the tools / memory / A2A surface in your fork | | [Deploy via GHCR](/guides/deploy) | You're ready to ship and want auto-deploy wired up | diff --git a/docs/guides/subagents.md b/docs/guides/subagents.md index f903995..9fc1a9a 100644 --- a/docs/guides/subagents.md +++ b/docs/guides/subagents.md @@ -86,7 +86,16 @@ for name in ("worker", "researcher"): # ← add new names subagents: worker: enabled: true - tools: [echo, current_time, calculator, web_search, fetch_url] + tools: + - current_time + - calculator + - web_search + - fetch_url + - memory_ingest + - memory_recall + - memory_list + - memory_stats + - daily_log max_turns: 20 researcher: enabled: true @@ -117,8 +126,8 @@ If your agent is simple enough that subagents are pure overhead, flip `include_s ```python _graph = create_agent_graph( _graph_config, - knowledge_store=None, - include_subagents=False, # ← skip the task() tool and subagent machinery + knowledge_store=knowledge_store, # keep the bundled store wired up + include_subagents=False, # ← skip the task() tool and subagent machinery ) ``` diff --git a/docs/reference/configuration.md b/docs/reference/configuration.md index 3463dba..bd3f5be 100644 --- a/docs/reference/configuration.md +++ b/docs/reference/configuration.md @@ -17,13 +17,22 @@ model: subagents: worker: enabled: true - tools: [echo, current_time, calculator, web_search, 
fetch_url] + tools: + - current_time + - calculator + - web_search + - fetch_url + - memory_ingest + - memory_recall + - memory_list + - memory_stats + - daily_log max_turns: 20 middleware: - knowledge: false + knowledge: true audit: true - memory: false + memory: true knowledge: db_path: /sandbox/knowledge/agent.db @@ -59,9 +68,9 @@ Adding a new subagent name to the YAML requires matching entries in `graph/subag | Key | Default | What | |---|---|---| -| `knowledge` | `false` | Inject retrieved knowledge into state before LLM calls. Requires a knowledge store — leave off until you add one. | +| `knowledge` | `true` | Inject retrieved knowledge into state before LLM calls. Backed by the bundled `KnowledgeStore` (sqlite + FTS5). Set `false` for a stateless agent. | | `audit` | `true` | Append every tool call to `/sandbox/audit/audit.jsonl`. | -| `memory` | `false` | Memory middleware (experimental). Requires a knowledge store. | +| `memory` | `true` | Persist a session summary on terminal turn and asynchronously index conversation findings under `domain='finding'`. | ## `knowledge` @@ -69,8 +78,8 @@ Only read when `middleware.knowledge` is `true`. | Key | Default | What | |---|---|---| -| `db_path` | `/sandbox/knowledge/agent.db` | SQLite file path. | -| `embed_model` | `nomic-embed-text` | Embedding model. | +| `db_path` | `/sandbox/knowledge/agent.db` | SQLite file path. Falls back to `~/.protoagent/knowledge/agent.db` automatically when the configured path isn't writable (e.g. running locally without `/sandbox`). Override at runtime with `KNOWLEDGE_DB_PATH`. | +| `embed_model` | `nomic-embed-text` | Reserved for forks that bolt embeddings on top of the FTS5 baseline. The bundled store ignores it. | | `top_k` | `5` | Results per query fed into state. | -The template does not ship a knowledge store — the config keys are kept so a fork can flip the switch without rewiring every call site. 
+The bundled store is sqlite + FTS5 (with an automatic LIKE fallback when FTS5 isn't available). One `chunks` table; the `domain` column distinguishes operator-set notes (`memory_ingest`), daily-log entries (`daily_log`), and conversation findings extracted by `MemoryMiddleware` (`domain='finding'`). diff --git a/docs/reference/starter-tools.md b/docs/reference/starter-tools.md index e47f25e..c0918b5 100644 --- a/docs/reference/starter-tools.md +++ b/docs/reference/starter-tools.md @@ -1,15 +1,11 @@ # Starter tools -Five free, keyless tools ship in `tools/lg_tools.py`. They exist so a fresh template clone can demonstrate real behaviour immediately. Keep them, drop them, or swap them — `get_all_tools()` is the registry. +Nine tools ship in `tools/lg_tools.py`: -## `echo` +- Four keyless general-purpose tools — `current_time`, `calculator`, `web_search`, `fetch_url` — that work without any state. +- Five **memory tools** — `memory_ingest`, `memory_recall`, `memory_list`, `memory_stats`, `daily_log` — bound to the bundled `KnowledgeStore` (sqlite + FTS5, see [Configuration](/reference/configuration#knowledge)). -```python -@tool -async def echo(message: str) -> str -``` - -Returns `"echo: "`. The template-only sanity tool. Safe to delete once your real tools are wired. +`get_all_tools(knowledge_store)` is the registry. When `knowledge_store` is `None` (the store is disabled in config) the memory tools are omitted automatically. ## `current_time` @@ -106,6 +102,62 @@ Example Domain This domain is for use in documentation examples... ``` +## `memory_ingest` + +```python +@tool +async def memory_ingest(content: str, domain: str = "general", heading: str | None = None) -> str +``` + +Stores a chunk in the bundled `KnowledgeStore`. Use for things the operator wants you to remember across sessions — preferences, environment facts, decisions worth recalling later. + +`domain` is a logical bucket (`"preferences"`, `"context"`, `"general"`, …). 
`heading` is an optional short label that doubles as a stable de-dupe key. + +Returns `"Stored chunk 17 in 'preferences'."` on success, an error string when the store is unavailable. + +## `memory_recall` + +```python +@tool +async def memory_recall(query: str, k: int = 5) -> str +``` + +Top-k keyword search over the store via FTS5 (LIKE fallback). Returns one match per line: + +``` +[preferences] coffee: Operator's preferred coffee is a Gibraltar with oat milk. +[context] lab: Primary lab is Snickerdoodle in Spokane. +``` + +Returns `"No matches."` when nothing scores above the keyword threshold. + +## `memory_list` + +```python +@tool +async def memory_list(domain: str | None = None, limit: int = 10) -> str +``` + +Most-recent-first listing of stored chunks. Filter by domain when given. Useful for "what did I log today?" style queries. + +## `memory_stats` + +```python +@tool +async def memory_stats() -> str +``` + +Per-domain chunk counts plus a total. Useful for sanity-checking that ingest landed. + +## `daily_log` + +```python +@tool +async def daily_log(content: str) -> str +``` + +Convenience wrapper around `memory_ingest` that writes to `domain='daily-log'` with today's UTC date as the heading. Same-day entries cluster under the same heading for `memory_list(domain='daily-log')`. + ## Adding your own Follow the same pattern: @@ -132,7 +184,7 @@ Then append it to the list in `get_all_tools()`: ```python def get_all_tools(knowledge_store=None): - return [echo, current_time, calculator, web_search, fetch_url, my_tool] + return [current_time, calculator, web_search, fetch_url, my_tool] ``` See [Write your first tool](/tutorials/first-tool) for the full walkthrough. @@ -141,3 +193,4 @@ See [Write your first tool](/tutorials/first-tool) for the full walkthrough. 
- [Configure subagents](/guides/subagents) — tools are allowlisted per subagent - [Environment variables](/reference/environment-variables) — SSRF allowlist vars affect `fetch_url` +- [Eval your fork](/guides/evals) — the eval harness exercises every tool listed here end-to-end diff --git a/docs/tutorials/first-agent.md b/docs/tutorials/first-agent.md index 6082f66..ce12744 100644 --- a/docs/tutorials/first-agent.md +++ b/docs/tutorials/first-agent.md @@ -40,7 +40,7 @@ Walk through the four steps: 1. **Connect to your model.** Paste your API base URL (`https://api.openai.com/v1` for OpenAI direct, `http://localhost:4000/v1` for a local LiteLLM gateway) and API key. Click **Test connection & fetch models** — the dropdown fills with whatever the endpoint actually exposes. Pick one. 2. **Name your agent.** Short lowercase slug (e.g. `product-director`). Pick a persona preset — **Generic Assistant** is the safe default; **Research** / **Coding** / **Blank** are the alternatives — and click **Load preset into SOUL.md**. Edit the loaded text if you want to make it specific to your agent. -3. **Tools & middleware.** All five starter tools (`echo`, `current_time`, `calculator`, `web_search`, `fetch_url`) are enabled by default. Leave **Audit** and **Memory** middleware on. Leave **Knowledge** off — that needs an index the template doesn't ship with. +3. **Tools & middleware.** All nine starter tools (`current_time`, `calculator`, `web_search`, `fetch_url`, plus the memory tools `memory_ingest` / `memory_recall` / `memory_list` / `memory_stats` / `daily_log`) are enabled by default. Leave **Audit**, **Memory**, and **Knowledge** middleware on — the template ships a working sqlite + FTS5 store under `/sandbox/knowledge/agent.db` (falls back to `~/.protoagent/knowledge/agent.db` outside Docker). 4. **Optional — you, security, autostart.** Your name makes the agent address you directly. A2A auth token blank for local dev, set it before you expose the port. 
"Launch this agent automatically on login" installs a macOS LaunchAgent so the server is up after every reboot without remembering to `python server.py`. Hit **Launch agent**. The wizard closes, the chat UI appears, and the Configuration drawer on the right is now populated with your choices. diff --git a/docs/tutorials/first-tool.md b/docs/tutorials/first-tool.md index 9f10251..87502d0 100644 --- a/docs/tutorials/first-tool.md +++ b/docs/tutorials/first-tool.md @@ -37,7 +37,6 @@ Then register it in `get_all_tools()` at the bottom of the same file: ```python def get_all_tools(knowledge_store=None): return [ - echo, current_time, calculator, web_search, @@ -53,7 +52,7 @@ If you want the worker subagent to be able to call `git_sha`, add it to the allo ```python WORKER_CONFIG = SubagentConfig( # ... - tools=["echo", "current_time", "calculator", "web_search", "fetch_url", "git_sha"], + tools=["current_time", "calculator", "web_search", "fetch_url", "git_sha"], # ... ) ``` diff --git a/evals/README.md b/evals/README.md new file mode 100644 index 0000000..c8aff77 --- /dev/null +++ b/evals/README.md @@ -0,0 +1,100 @@ +# Evals + +Side-effect-verified eval harness. Each case sends a prompt over A2A +to a running agent and asserts on three independent channels: + +1. **Audit log** — every expected tool name fires with the expected + outcome (`AuditMiddleware` writes JSONL to `/sandbox/audit/audit.jsonl`). +2. **Reply text** — case-insensitive substring patterns appear in the + model's final reply. +3. **Knowledge store side effects** — the right rows actually land in + the `chunks` table after a memory-writing turn. + +A case passes only when every configured assertion holds. + +## Quickstart + +```bash +# Agent must be running at $EVAL_BASE_URL (default http://localhost:7870). +# Auth: set $A2A_AUTH_TOKEN if bearer is configured, $_API_KEY +# (or $EVAL_API_KEY) if X-API-Key auth is configured. Both are sent +# when both env vars exist. 
+ +python -m evals.runner # all cases +python -m evals.runner --category tool # one category +python -m evals.runner --tasks current_time,daily_log +python -m evals.runner --base-url http://host:7870 +``` + +Reports land in `evals/results/run-.json` per run. + +## Categories + +| Category | What it covers | +|---|---| +| `a2a-protocol` | Agent card discovery, auth gating | +| `simple` | Direct LLM answers, no tool use | +| `abstention` | Don't reach for a tool when training data is enough | +| `tool` | Single-tool invocations across the starter set | +| `chained` | Multi-step reasoning that calls 2+ tools | +| `subsystem` | KnowledgeMiddleware retrieval, hot-memory injection | + +## File layout + +``` +evals/ + client.py A2A client (message/send + poll, message/stream, agent card, cancel) + runner.py CLI runner — print board, write JSON report + verify.py Audit-log + KB side-effect assertions, setup/teardown + tasks.json Cases — 15 covering the starter tools end-to-end + results/ Per-run reports +``` + +## Adding a case + +Append to `tasks.json`: + +```json +{ + "id": "unique-id", + "category": "tool", + "kind": "ask", + "name": "Human-readable description", + "prompt": "What you ask the agent (in real-user voice — never name the tool)", + "expected_tools": ["tool_name"], + "expected_patterns": ["substring-that-must-appear"], + "verify_kb": { + "find_chunk_containing": "EVAL-MARK-A1B2", + "domain": "context" + }, + "setup": [ + {"kb_ingest": {"content": "...", "domain": "context", "heading": "..."}} + ], + "teardown": [ + {"kb_delete_by_content": {"contains": "EVAL-MARK-A1B2"}} + ] +} +``` + +Use **unique markers** (`EVAL-MARK-XYZ`, `eval-chain-flag-q9`) in +prompts whenever you need a verifier to disambiguate from real +operator data. + +## Why side-effect verification + +When the model hallucinates a tool result (e.g. "Logged: ..." without +actually calling `daily_log`), text-only checks pass while the DB +stays empty. The audit-log + KB queries here catch it. 
+ +## Prompt rule + +Every prompt must be plausibly typed by a real user. **The tool name +never appears.** If the agent has to infer the tool from intent, that +*is* the test — leaking the tool name into the prompt is testing +instruction-following, not tool selection. + +## References + +- Anthropic — [Demystifying evals for AI agents](https://www.anthropic.com/engineering/demystifying-evals-for-ai-agents) +- BFCL V3 — [Multi-Turn](https://gorilla.cs.berkeley.edu/blogs/13_bfcl_v3_multi_turn.html) +- [ToolSandbox](https://arxiv.org/html/2408.04682v1) diff --git a/evals/__init__.py b/evals/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/evals/client.py b/evals/client.py new file mode 100644 index 0000000..51eadfe --- /dev/null +++ b/evals/client.py @@ -0,0 +1,262 @@ +"""A2A client for the eval runner. + +Drives the running agent over the same JSON-RPC + SSE surface that +real A2A callers use: + +- ``agent_card()`` — GET ``/.well-known/agent-card.json`` +- ``ask()`` — ``message/send`` + ``tasks/get`` poll +- ``stream()`` — ``message/stream`` SSE +- ``cancel()`` — ``tasks/cancel`` + +Returns structured ``TaskResult`` objects the runner asserts against. + +Auth picks up both surfaces the template exposes (see ``server.py``): + +- ``Authorization: Bearer <token>`` — wizard-set / ``A2A_AUTH_TOKEN`` env +- ``X-API-Key: <key>`` — legacy, ``<AGENT_NAME>_API_KEY`` env + +Both headers are sent when the corresponding env var is set; the +running agent enforces whichever it is configured for.
+""" + +from __future__ import annotations + +import asyncio +import json +import os +import time +import uuid +from dataclasses import dataclass, field +from typing import Any + +import httpx + + +@dataclass +class TaskResult: + task_id: str + state: str # completed / failed / canceled / timeout + text: str = "" # extracted user-facing reply + artifacts: list[dict] = field(default_factory=list) + usage: dict = field(default_factory=dict) + duration_ms: int = 0 + error: str | None = None + + +def _resolve_auth_env() -> tuple[str, str]: + """Return (bearer_token, api_key) from env. + + Bearer comes from ``A2A_AUTH_TOKEN`` (the env name the A2A handler + reads at boot). The API key is named after the agent — + ``<AGENT_NAME>_API_KEY`` — so a fork named ``quinn`` reads + ``QUINN_API_KEY``. ``EVAL_API_KEY`` is honored as an explicit + override so CI doesn't have to know the agent's slug. + """ + bearer = os.environ.get("A2A_AUTH_TOKEN", "") + + api_key = os.environ.get("EVAL_API_KEY", "") + if not api_key: + agent = os.environ.get("AGENT_NAME", "protoagent").upper() + api_key = os.environ.get(f"{agent}_API_KEY", "") + return bearer, api_key + + +class AgentClient: + """Thin A2A client tied to one agent instance.""" + + def __init__( + self, + base_url: str | None = None, + bearer: str | None = None, + api_key: str | None = None, + ): + self.base_url = ( + base_url + or os.environ.get("EVAL_BASE_URL") + or os.environ.get("AGENT_BASE_URL") + or "http://localhost:7870" + ).rstrip("/") + + env_bearer, env_api_key = _resolve_auth_env() + token = bearer if bearer is not None else env_bearer + x_api = api_key if api_key is not None else env_api_key + self.headers = {"Content-Type": "application/json"} + if token: + self.headers["Authorization"] = f"Bearer {token}" + if x_api: + self.headers["X-API-Key"] = x_api + + # ── Agent card ────────────────────────────────────────────────────────── + + async def agent_card(self) -> dict: + """Fetch the agent card.
+ + The template serves both ``/.well-known/agent-card.json`` (modern) + and ``/.well-known/agent.json`` (legacy). We try the modern path + first; fall back to the legacy path so this works against forks + that disabled one or the other. + """ + async with httpx.AsyncClient(timeout=10) as client: + for path in ("/.well-known/agent-card.json", "/.well-known/agent.json"): + r = await client.get(f"{self.base_url}{path}") + if r.status_code == 200: + return r.json() + r.raise_for_status() # surface the last error + return {} + + # ── message/send + poll ───────────────────────────────────────────────── + + async def ask(self, prompt: str, *, timeout_s: int = 90) -> TaskResult: + """Send + poll until terminal. Returns TaskResult with extracted text.""" + mid = str(uuid.uuid4()) + payload = { + "jsonrpc": "2.0", + "id": mid, + "method": "message/send", + "params": { + "message": { + "role": "user", + "parts": [{"kind": "text", "text": prompt}], + "messageId": mid, + } + }, + } + start = time.time() + async with httpx.AsyncClient(timeout=30) as client: + r = await client.post(f"{self.base_url}/a2a", headers=self.headers, json=payload) + r.raise_for_status() + resp = r.json() + if "error" in resp: + return TaskResult(task_id="", state="failed", error=str(resp["error"])) + task_id = resp.get("result", {}).get("id", "") + + deadline = start + timeout_s + while time.time() < deadline: + await asyncio.sleep(1.5) + poll = await client.post( + f"{self.base_url}/a2a", + headers=self.headers, + json={ + "jsonrpc": "2.0", + "id": "p", + "method": "tasks/get", + "params": {"id": task_id}, + }, + ) + poll.raise_for_status() + res = poll.json().get("result", {}) + state = (res.get("status") or {}).get("state", "") + if state in ("completed", "failed", "canceled"): + text, usage = _extract(res) + return TaskResult( + task_id=task_id, + state=state, + text=text, + artifacts=res.get("artifacts", []), + usage=usage, + duration_ms=int((time.time() - start) * 1000), + ) + return TaskResult( 
+ task_id=task_id, state="timeout", + duration_ms=int((time.time() - start) * 1000), + ) + + # ── message/stream (SSE) ──────────────────────────────────────────────── + + async def stream(self, prompt: str, *, timeout_s: int = 90) -> tuple[list[dict], TaskResult | None]: + """Stream a turn over SSE. Returns (event_log, final TaskResult). + + Each event is a dict shaped ``{kind, result}``. Use this to assert + on the streaming protocol itself (status-update sequence, final + flag, artifact chunks). Most cases should use ``ask()`` instead. + """ + mid = str(uuid.uuid4()) + payload = { + "jsonrpc": "2.0", + "id": mid, + "method": "message/stream", + "params": { + "message": { + "role": "user", + "parts": [{"kind": "text", "text": prompt}], + "messageId": mid, + } + }, + } + events: list[dict] = [] + final: TaskResult | None = None + start = time.time() + async with httpx.AsyncClient(timeout=timeout_s) as client: + async with client.stream( + "POST", f"{self.base_url}/a2a", headers=self.headers, json=payload + ) as r: + if r.status_code >= 400: + body = await r.aread() + return events, TaskResult( + task_id="", state="failed", + error=f"HTTP {r.status_code}: {body.decode()[:300]}", + ) + async for line in r.aiter_lines(): + if not line or line.startswith(":"): + continue + if line.startswith("data:"): + raw = line[5:].strip() + if not raw: + continue + try: + data = json.loads(raw) + except json.JSONDecodeError: + events.append({"kind": "raw", "raw": raw}) + continue + result = (data.get("result") or {}) + kind = result.get("kind", "?") + events.append({"kind": kind, "result": result}) + if kind in ("status-update", "task") and result.get("final"): + text, usage = _extract(result) + final = TaskResult( + task_id=result.get("taskId") or result.get("id", ""), + state=(result.get("status") or {}).get("state", "unknown"), + text=text, + usage=usage, + duration_ms=int((time.time() - start) * 1000), + ) + break + return events, final + + # ── tasks/cancel 
──────────────────────────────────────────────────────── + + async def cancel(self, task_id: str) -> dict: + async with httpx.AsyncClient(timeout=10) as client: + r = await client.post( + f"{self.base_url}/a2a", + headers=self.headers, + json={ + "jsonrpc": "2.0", + "id": "c", + "method": "tasks/cancel", + "params": {"id": task_id}, + }, + ) + return r.json() + + +def _extract(result: dict) -> tuple[str, dict]: + """Pull text + cost data out of an A2A result envelope.""" + text_parts: list[str] = [] + usage: dict = {} + artifacts = result.get("artifacts") or [] + for art in artifacts: + for p in art.get("parts", []): + if p.get("kind") == "text" and p.get("text"): + text_parts.append(p["text"]) + elif p.get("kind") == "data" and isinstance(p.get("data"), dict): + if "usage" in p["data"]: + usage = dict(p["data"]["usage"]) + if "durationMs" in p["data"]: + usage["durationMs"] = p["data"]["durationMs"] + status = result.get("status") or {} + msg = status.get("message") or {} + for p in msg.get("parts") or []: + if p.get("kind") == "text" and p.get("text"): + text_parts.append(p["text"]) + return "\n".join(text_parts).strip(), usage diff --git a/evals/results/.gitkeep b/evals/results/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/evals/runner.py b/evals/runner.py new file mode 100644 index 0000000..ad5154b --- /dev/null +++ b/evals/runner.py @@ -0,0 +1,307 @@ +"""Eval runner — executes ``tasks.json``, prints a pass/fail board, +writes a JSON report to ``evals/results/run-.json``. + +Usage: + +.. code:: bash + + # agent must be running at $EVAL_BASE_URL (default http://localhost:7870) + # auth: $A2A_AUTH_TOKEN and/or $_API_KEY (or $EVAL_API_KEY) + + python -m evals.runner # all cases + python -m evals.runner --category tool # one category + python -m evals.runner --tasks current_time,daily_log + python -m evals.runner --base-url http://host:7870 + +Cases are described in ``tasks.json``. 
Each case picks one of three +``kind`` runners: + +- ``agent_card`` — fetch ``/.well-known/agent-card.json`` and assert + on the returned card shape. +- ``auth_check`` — send a request with a known-bad bearer token and + assert the expected HTTP status. +- ``ask`` — send a prompt over A2A, optionally pre-seed the KB, then + assert against three independent channels: audit-log tool firing, + reply-text patterns, and KB side effects. + +A case passes only when all assertions hold. The ``detail`` column in +the pass/fail board names the missing assertion when one fails. +""" + +from __future__ import annotations + +import argparse +import asyncio +import json +import sys +import time +from dataclasses import asdict, dataclass, field +from datetime import datetime, timezone +from pathlib import Path + +# Allow ``python -m evals.runner`` and ``python evals/runner.py``. +_PROJECT_ROOT = Path(__file__).resolve().parent.parent +if str(_PROJECT_ROOT) not in sys.path: + sys.path.insert(0, str(_PROJECT_ROOT)) + +from evals.client import AgentClient, TaskResult +from evals import verify + + +@dataclass +class CaseResult: + id: str + category: str + name: str + passed: bool + detail: str + duration_ms: int = 0 + tokens: int = 0 + raw: dict = field(default_factory=dict) + + +# ── case runners ──────────────────────────────────────────────────────────── + + +async def _run_agent_card(client: AgentClient, case: dict) -> CaseResult: + expect = case.get("expect", {}) + try: + card = await client.agent_card() + except Exception as e: + return CaseResult(case["id"], case["category"], case["name"], False, f"fetch failed: {e}") + + problems: list[str] = [] + if "name" in expect and card.get("name") != expect["name"]: + problems.append(f"name={card.get('name')!r} expected {expect['name']!r}") + if "skills_min" in expect: + skills = card.get("skills") or [] + if len(skills) < expect["skills_min"]: + problems.append(f"only {len(skills)} skills, expected >= {expect['skills_min']}") + if 
"extensions_contain" in expect: + ext_uris = [ + e.get("uri", "") + for e in (card.get("capabilities") or {}).get("extensions") or [] + ] + for needle in expect["extensions_contain"]: + if not any(needle in u for u in ext_uris): + problems.append(f"missing extension matching {needle!r}; saw {ext_uris}") + if problems: + return CaseResult(case["id"], case["category"], case["name"], False, "; ".join(problems)) + return CaseResult(case["id"], case["category"], case["name"], True, "card OK") + + +async def _run_auth_check(client: AgentClient, case: dict) -> CaseResult: + """Verify the A2A endpoint rejects a bad bearer with the expected status.""" + import httpx + + expected_status = case.get("expect", {}).get("status", 401) + bad = case.get("bad_token", "") + headers = { + "Content-Type": "application/json", + "Authorization": f"Bearer {bad}", + # No X-API-Key — testing bearer alone. + } + payload = { + "jsonrpc": "2.0", + "id": "auth-check", + "method": "message/send", + "params": { + "message": { + "role": "user", + "parts": [{"kind": "text", "text": "ping"}], + "messageId": "auth-check", + } + }, + } + try: + async with httpx.AsyncClient(timeout=10) as c: + r = await c.post(f"{client.base_url}/a2a", headers=headers, json=payload) + except Exception as e: + return CaseResult(case["id"], case["category"], case["name"], False, f"request failed: {e}") + if r.status_code != expected_status: + return CaseResult( + case["id"], case["category"], case["name"], False, + f"got {r.status_code}, expected {expected_status}", + ) + return CaseResult( + case["id"], case["category"], case["name"], True, f"status={r.status_code}", + ) + + +async def _run_ask(client: AgentClient, case: dict) -> CaseResult: + # Pre-seed state via direct DB writes (model never sees this). 
+ if "setup" in case: + err = verify.apply_setup(case["setup"]) + if err: + return CaseResult( + case["id"], case["category"], case["name"], False, + f"setup failed: {err}", + ) + + since = verify.audit_now() + result: TaskResult = await client.ask( + case["prompt"], timeout_s=case.get("timeout_s", 90), + ) + + if result.state != "completed": + if "teardown" in case: + verify.apply_teardown(case["teardown"]) + return CaseResult( + case["id"], case["category"], case["name"], False, + f"task state={result.state}; error={result.error or '(none)'}", + duration_ms=result.duration_ms, + raw={"text": result.text[:200]}, + ) + + problems: list[str] = [] + + # Tool firing assertions. + expected_tools = case.get("expected_tools") or [] + if expected_tools: + await asyncio.sleep(0.3) # let the audit log catch up + entries = verify.audit_entries_since(since) + require_success = case.get("tool_outcome", "success") == "success" + passed, detail = verify.assert_tools_fired( + entries, expected_tools, require_success=require_success, + ) + if not passed: + problems.append(detail) + + # Text pattern assertions (case-insensitive substrings). + text_lower = result.text.lower() + for pattern in case.get("expected_patterns") or []: + if pattern.lower() not in text_lower: + problems.append(f"missing pattern {pattern!r}") + + # KB side-effect assertions. 
+ vk = case.get("verify_kb") or {} + if "find_chunk_containing" in vk: + chunk = verify.find_chunk_containing( + vk["find_chunk_containing"], domain=vk.get("domain"), + ) + if not chunk: + problems.append(f"no chunk containing {vk['find_chunk_containing']!r}") + + if "teardown" in case: + verify.apply_teardown(case["teardown"]) + + detail = ( + "; ".join(problems) if problems + else f"OK ({result.duration_ms}ms, {result.usage.get('total_tokens', '?')}t)" + ) + return CaseResult( + case["id"], case["category"], case["name"], + passed=not problems, + detail=detail, + duration_ms=result.duration_ms, + tokens=result.usage.get("total_tokens", 0) or 0, + raw={"reply": result.text[:300]}, + ) + + +# ── dispatch ──────────────────────────────────────────────────────────────── + + +_RUNNERS = { + "agent_card": _run_agent_card, + "auth_check": _run_auth_check, + "ask": _run_ask, +} + + +async def run_one(client: AgentClient, case: dict) -> CaseResult: + runner = _RUNNERS.get(case.get("kind", "ask")) + if runner is None: + return CaseResult( + case["id"], case.get("category", "?"), case.get("name", "?"), + False, f"unknown kind: {case.get('kind')}", + ) + try: + return await runner(client, case) + except Exception as e: + return CaseResult( + case["id"], case.get("category", "?"), case.get("name", "?"), + False, f"exception: {e!r}", + ) + + +# ── main ──────────────────────────────────────────────────────────────────── + + +def _print_board(results: list[CaseResult]) -> None: + width_id = max(len(r.id) for r in results) + width_cat = max(len(r.category) for r in results) + print() + print(f"{'ID'.ljust(width_id)} {'CAT'.ljust(width_cat)} RESULT TIME TOKENS DETAIL") + print("-" * 90) + pass_count = 0 + for r in results: + mark = "PASS" if r.passed else "FAIL" + if r.passed: + pass_count += 1 + time_s = f"{r.duration_ms}ms".rjust(6) + tokens = str(r.tokens).rjust(6) if r.tokens else " - " + print( + f"{r.id.ljust(width_id)} {r.category.ljust(width_cat)} " + f"{mark} {time_s} 
{tokens} {r.detail[:80]}" + ) + print("-" * 90) + print(f"\n{pass_count}/{len(results)} passed") + + +def _save_report(results: list[CaseResult], path: Path) -> None: + path.parent.mkdir(parents=True, exist_ok=True) + payload = { + "ts": datetime.now(timezone.utc).isoformat(), + "total": len(results), + "passed": sum(1 for r in results if r.passed), + "results": [asdict(r) for r in results], + } + path.write_text(json.dumps(payload, indent=2)) + print(f"\nReport: {path}") + + +async def main(): + p = argparse.ArgumentParser() + p.add_argument("--base-url", default=None) + p.add_argument("--tasks", default=None, help="comma-separated case IDs") + p.add_argument("--category", default=None) + p.add_argument("--out", default=None) + args = p.parse_args() + + tasks_path = Path(__file__).parent / "tasks.json" + cases = json.loads(tasks_path.read_text()) + + if args.tasks: + wanted = set(args.tasks.split(",")) + cases = [c for c in cases if c["id"] in wanted] + if args.category: + cases = [c for c in cases if c.get("category") == args.category] + + if not cases: + print("no cases match filters", file=sys.stderr) + return 2 + + client = AgentClient(base_url=args.base_url) + + print(f"Running {len(cases)} case(s) against {client.base_url}") + results: list[CaseResult] = [] + for case in cases: + sys.stdout.write(f" {case['id']}... 
") + sys.stdout.flush() + result = await run_one(client, case) + sys.stdout.write(f"{'PASS' if result.passed else 'FAIL'} {result.detail[:60]}\n") + results.append(result) + + _print_board(results) + + out_path = Path(args.out) if args.out else ( + Path(__file__).parent / "results" / f"run-{int(time.time())}.json" + ) + _save_report(results, out_path) + + return 0 if all(r.passed for r in results) else 1 + + +if __name__ == "__main__": + raise SystemExit(asyncio.run(main())) diff --git a/evals/tasks.json b/evals/tasks.json new file mode 100644 index 0000000..14cdd16 --- /dev/null +++ b/evals/tasks.json @@ -0,0 +1,186 @@ +[ + { + "id": "card_discovery", + "category": "a2a-protocol", + "kind": "agent_card", + "name": "Agent card discovery", + "expect": { + "skills_min": 1, + "extensions_contain": ["cost-v1"] + } + }, + { + "id": "auth_negative", + "category": "a2a-protocol", + "kind": "auth_check", + "name": "Reject bad bearer when bearer auth is configured", + "bad_token": "definitely-not-the-real-token", + "expect": {"status": 401} + }, + + { + "id": "abstain_no_tool", + "category": "abstention", + "kind": "ask", + "name": "Don't reach for a tool when training data is fine", + "prompt": "What's the capital of France? 
One word.", + "expected_tools": [], + "expected_patterns": ["paris"] + }, + { + "id": "greeting", + "category": "simple", + "kind": "ask", + "name": "Direct greeting, no tool", + "prompt": "Hi.", + "expected_tools": [], + "expected_patterns": [] + }, + + { + "id": "current_time_intent", + "category": "tool", + "kind": "ask", + "name": "Asks about live time → current_time", + "prompt": "What time is it in UTC right now?", + "expected_tools": ["current_time"], + "expected_patterns": ["UTC"] + }, + { + "id": "calculator_intent", + "category": "tool", + "kind": "ask", + "name": "Asks for arithmetic → calculator", + "prompt": "How much is 17 times 23, plus 1?", + "expected_tools": ["calculator"], + "expected_patterns": ["392"] + }, + { + "id": "web_search_intent", + "category": "tool", + "kind": "ask", + "name": "Asks about recent news → web_search", + "prompt": "Anything notable in the news about Anthropic this week?", + "expected_tools": ["web_search"], + "expected_patterns": [] + }, + { + "id": "fetch_url_intent", + "category": "tool", + "kind": "ask", + "name": "Asks about a URL's content → fetch_url", + "prompt": "What's on https://example.com? 
Just the page title is fine.", + "expected_tools": ["fetch_url"], + "expected_patterns": ["example"] + }, + + { + "id": "memory_ingest_intent", + "category": "tool", + "kind": "ask", + "name": "Stores a stable preference → memory_ingest writes a chunk", + "prompt": "Remember that I prefer protoLabs Studio standups at 9am Eastern.", + "expected_tools": ["memory_ingest"], + "expected_patterns": [], + "verify_kb": { + "find_chunk_containing": "9am" + }, + "teardown": [ + {"kb_delete_by_content": {"contains": "9am"}} + ] + }, + { + "id": "daily_log_intent", + "category": "tool", + "kind": "ask", + "name": "Asks to log an event → daily_log writes today's chunk", + "prompt": "Log this for today: my standup just ended, team is unblocked on the auth migration.", + "expected_tools": ["daily_log"], + "expected_patterns": [], + "verify_kb": { + "find_chunk_containing": "auth migration", + "domain": "daily-log" + }, + "teardown": [ + {"kb_delete_by_content": {"contains": "auth migration"}} + ] + }, + { + "id": "memory_recall_intent", + "category": "tool", + "kind": "ask", + "name": "Asks about a stored fact → recall surfaces it", + "setup": [ + {"kb_ingest": { + "content": "Operator's primary lab is Snickerdoodle, located in Spokane.", + "domain": "context", + "heading": "lab" + }} + ], + "prompt": "Where's my primary lab and what's it called?", + "expected_tools": ["memory_recall"], + "expected_patterns": ["snickerdoodle", "spokane"], + "teardown": [ + {"kb_delete_by_heading": {"domain": "context", "heading": "lab"}} + ] + }, + { + "id": "memory_list_intent", + "category": "tool", + "kind": "ask", + "name": "Asks for recent log entries → memory_list", + "setup": [ + {"kb_ingest": {"content": "called the dentist", "domain": "daily-log", "heading": "today"}}, + {"kb_ingest": {"content": "merged the auth PR", "domain": "daily-log", "heading": "today"}} + ], + "prompt": "What did I do today? 
Summarize from the log.", + "expected_tools": ["memory_list"], + "expected_patterns": ["dentist"], + "teardown": [ + {"kb_delete_by_content": {"contains": "called the dentist"}}, + {"kb_delete_by_content": {"contains": "merged the auth PR"}} + ] + }, + { + "id": "memory_stats_intent", + "category": "tool", + "kind": "ask", + "name": "Asks how much is in memory → memory_stats", + "prompt": "How much have I got stored across each memory domain?", + "expected_tools": ["memory_stats"], + "expected_patterns": [] + }, + + { + "id": "log_then_recall_chain", + "category": "chained", + "kind": "ask", + "name": "Log an event, then recall it later in the same turn", + "prompt": "Log this for today: 'eval-chain-flag-q9: chained log+recall test'. After logging, search memory for that flag and quote it back.", + "expected_tools": ["daily_log", "memory_recall"], + "expected_patterns": ["eval-chain-flag-q9"], + "teardown": [ + {"kb_delete_by_content": {"contains": "eval-chain-flag-q9"}} + ] + }, + + { + "id": "knowledge_middleware_recall", + "category": "subsystem", + "kind": "ask", + "name": "KnowledgeMiddleware surfaces a stored fact without an explicit search", + "setup": [ + {"kb_ingest": { + "content": "Operator's preferred coffee is a Gibraltar with oat milk from Atticus.", + "domain": "preferences", + "heading": "coffee" + }} + ], + "prompt": "What's my usual coffee order?", + "expected_tools": [], + "expected_patterns": ["gibraltar", "oat"], + "teardown": [ + {"kb_delete_by_heading": {"domain": "preferences", "heading": "coffee"}} + ] + } +] diff --git a/evals/verify.py b/evals/verify.py new file mode 100644 index 0000000..5b1f8cc --- /dev/null +++ b/evals/verify.py @@ -0,0 +1,176 @@ +"""Side-effect verifiers for eval cases. + +Two channels: + +- **Audit log** — JSONL written by ``AuditMiddleware`` at + ``/sandbox/audit/audit.jsonl`` (override with ``AUDIT_PATH`` env). 
+ ``audit_entries_since`` returns entries newer than a marker, and + ``assert_tools_fired`` confirms a tool name appears with the + expected outcome. +- **Knowledge store** — sqlite DB at ``KNOWLEDGE_DB_PATH`` (or the + template default). ``find_chunk_containing`` confirms a memory + write actually landed; ``setup_chunk`` / ``teardown`` mutate the + store directly so cases start from a known state. + +The store is opened read/write so setup steps can pre-seed (BFCL's +``initial_config`` pattern). The model never sees these direct writes +— it discovers them via ``memory_recall`` / ``memory_list`` tools as +real users would. +""" + +from __future__ import annotations + +import json +import logging +import os +from datetime import datetime, timezone +from pathlib import Path +from typing import Any + +log = logging.getLogger(__name__) + +# ── path resolution ───────────────────────────────────────────────────────── + + +def _audit_path() -> Path: + """Audit JSONL location. Falls back to the template's docker default.""" + raw = os.environ.get("AUDIT_PATH") or "/sandbox/audit/audit.jsonl" + p = Path(raw).expanduser() + if p.is_file(): + return p + # Local-dev fallback: same shape, but under the home dir. + fallback = Path.home() / ".protoagent" / "audit" / "audit.jsonl" + return fallback + + +def _kb_store(): + """Construct a ``KnowledgeStore`` against the configured path. + + Imported lazily so ``evals/verify.py`` can be loaded in a context + where ``knowledge/`` isn't on sys.path yet (the runner adjusts + sys.path before calling in). 
+ """ + from knowledge import KnowledgeStore + return KnowledgeStore() # honors KNOWLEDGE_DB_PATH env + + +# ── audit log ─────────────────────────────────────────────────────────────── + + +def audit_now() -> str: + """ISO-8601 marker suitable as a 'since' input to ``audit_entries_since``.""" + return datetime.now(timezone.utc).isoformat() + + +def audit_entries_since(ts_iso: str) -> list[dict]: + """Return audit-log entries with ``ts`` strictly greater than ``ts_iso``.""" + p = _audit_path() + if not p.is_file(): + return [] + out: list[dict] = [] + with p.open() as fh: + for line in fh: + try: + entry = json.loads(line) + except json.JSONDecodeError: + continue + if entry.get("ts", "") > ts_iso: + out.append(entry) + return out + + +def assert_tools_fired( + audit_entries: list[dict], + expected: list[str], + *, + require_success: bool = True, +) -> tuple[bool, str]: + """Confirm each expected tool name appears in audit entries. + + Order doesn't matter — a tool that fires twice still satisfies one + expected entry, and extra entries (subset matching, BFCL-style) are + allowed. + + ``require_success=True`` (default) only counts ``success=True`` + entries — use this for happy-path cases. Pass ``require_success=False`` + when the case represents an error path that the agent should still + *attempt* (e.g. fetching a private URL the agent has no creds for). 
+ """ + fired: dict[str, dict[str, int]] = {} + for e in audit_entries: + bucket = fired.setdefault(e.get("tool", "?"), {"ok": 0, "err": 0}) + bucket["ok" if e.get("success") else "err"] += 1 + + missing: list[str] = [] + for t in expected: + if t not in fired: + missing.append(t) + continue + if require_success and fired[t]["ok"] == 0: + missing.append(f"{t} (only errors)") + + if missing: + return False, f"missing tools: {missing}; saw: {dict(fired)}" + return True, f"saw: {dict(fired)}" + + +# ── knowledge store ───────────────────────────────────────────────────────── + + +def find_chunk_containing(text: str, *, domain: str | None = None) -> dict | None: + store = _kb_store() + chunk = store.find_chunk_containing(text, domain=domain) + return chunk.as_dict() if chunk else None + + +def chunks_in_domain(domain: str, *, limit: int = 50) -> list[dict]: + store = _kb_store() + return [c.as_dict() for c in store.list_chunks(domain=domain, limit=limit)] + + +# ── setup / teardown helpers ───────────────────────────────────────────────── + + +def apply_setup(steps: list[dict]) -> str | None: + """Apply a list of setup steps. Each step is a dict with one key. + + Supported step kinds: + + - ``kb_ingest``: ``{content, domain, heading?}`` + + Returns ``None`` on success, an error string on first failure. + """ + store = _kb_store() + for step in steps: + for kind, args in step.items(): + if kind == "kb_ingest": + if store.add_chunk( + args["content"], + domain=args.get("domain", "general"), + heading=args.get("heading"), + ) is None: + return f"kb_ingest failed for {args!r}" + else: + return f"unknown setup step: {kind}" + return None + + +def apply_teardown(steps: list[dict]) -> None: + """Best-effort teardown. Never raises so a setup failure or assertion + failure doesn't poison subsequent cases. 
+ + Supported step kinds: + + - ``kb_delete_by_content``: ``{contains}`` + - ``kb_delete_by_heading``: ``{domain, heading}`` + """ + store = _kb_store() + for step in steps: + for kind, args in step.items(): + try: + if kind == "kb_delete_by_content": + store.delete_by_content(args["contains"]) + elif kind == "kb_delete_by_heading": + store.delete_by_heading(args["domain"], args["heading"]) + except Exception as exc: # pragma: no cover + log.debug("[verify] teardown step %s failed: %s", kind, exc) diff --git a/graph/config.py b/graph/config.py index 00ae1a8..a3df02b 100644 --- a/graph/config.py +++ b/graph/config.py @@ -37,16 +37,24 @@ class LangGraphConfig: # Subagents — template ships with one example (see graph/subagents/config.py). # Add fields here as you add entries to SUBAGENT_REGISTRY. worker: SubagentDef = field(default_factory=lambda: SubagentDef( - tools=["echo", "current_time", "calculator", "web_search", "fetch_url"], + tools=[ + "current_time", "calculator", "web_search", "fetch_url", + "memory_ingest", "memory_recall", "memory_list", "memory_stats", + "daily_log", + ], max_turns=20, )) # Middleware toggles - knowledge_middleware: bool = False # template ships no knowledge store + knowledge_middleware: bool = True audit_middleware: bool = True - memory_middleware: bool = False + memory_middleware: bool = True - # Knowledge store (opt-in — leave disabled until the fork ships one) + # Knowledge store — sqlite + FTS5, see ``knowledge/store.py``. + # The default path lives under ``/sandbox/`` to play well with the + # bundled Docker volume; the store falls back to + # ``~/.protoagent/knowledge/agent.db`` automatically when /sandbox + # is read-only or absent (e.g. local ``python server.py``). 
knowledge_db_path: str = "/sandbox/knowledge/agent.db" embed_model: str = "qwen3-embedding" knowledge_top_k: int = 5 diff --git a/graph/subagents/config.py b/graph/subagents/config.py index 554a321..560edc7 100644 --- a/graph/subagents/config.py +++ b/graph/subagents/config.py @@ -63,7 +63,11 @@ class SubagentConfig: Replace this prompt with domain-specific guidance once your agent has real specialized roles.""", - tools=["echo", "current_time", "calculator", "web_search", "fetch_url"], + tools=[ + "current_time", "calculator", "web_search", "fetch_url", + "memory_ingest", "memory_recall", "memory_list", "memory_stats", + "daily_log", + ], max_turns=20, ) diff --git a/knowledge/__init__.py b/knowledge/__init__.py new file mode 100644 index 0000000..1de93de --- /dev/null +++ b/knowledge/__init__.py @@ -0,0 +1,12 @@ +"""Knowledge store — sqlite-backed chunk storage for memory tools and middleware. + +The template ships this enabled by default so a fresh fork has a working +memory loop on day one (memory_ingest, memory_recall, daily_log) and the +eval harness can assert side effects against real DB state. + +See ``knowledge.store.KnowledgeStore`` for the public API. +""" + +from knowledge.store import KnowledgeStore, Chunk + +__all__ = ["KnowledgeStore", "Chunk"] diff --git a/knowledge/store.py b/knowledge/store.py new file mode 100644 index 0000000..a6aee7a --- /dev/null +++ b/knowledge/store.py @@ -0,0 +1,456 @@ +"""KnowledgeStore — sqlite-backed chunk storage with FTS5 search. + +The template's default knowledge surface. One ``chunks`` table holds +every piece of stored content (operator notes via ``memory_ingest``, +daily-log entries, conversation findings extracted by +``MemoryMiddleware``); the ``domain`` column distinguishes them. + +Search uses sqlite FTS5 when available (true on virtually all modern +sqlite builds). 
When FTS5 is missing — sandboxed sqlite, custom builds +— the store transparently falls back to ``LIKE`` keyword matching so +the API contract still holds. + +The store is path-aware and degradation-aware: + +- Honors ``KNOWLEDGE_DB_PATH`` env var → constructor argument → + config default ``/sandbox/knowledge/agent.db``. +- If the configured path is unwritable (running locally outside the + container, no /sandbox), falls back to ``~/.protoagent/knowledge/agent.db`` + so a fresh ``python server.py`` works without sudo. +- All write operations swallow ``sqlite3.OperationalError`` and log; + the store never crashes the agent loop on a corrupt or read-only DB. + +Forks that want embeddings on top of FTS5 can subclass and override +``search()`` — the middleware reads through that one method. +""" + +from __future__ import annotations + +import logging +import os +import re +import sqlite3 +from dataclasses import dataclass +from datetime import datetime, timezone +from pathlib import Path +from typing import Any + +log = logging.getLogger(__name__) + +DEFAULT_DB_PATH = "/sandbox/knowledge/agent.db" + + +@dataclass +class Chunk: + """One row from the chunks table — what callers see.""" + id: int + content: str + domain: str + heading: str | None + source: str | None + source_type: str | None + finding_type: str | None + created_at: str + updated_at: str + + def as_dict(self) -> dict[str, Any]: + return { + "id": self.id, + "content": self.content, + "domain": self.domain, + "heading": self.heading, + "source": self.source, + "source_type": self.source_type, + "finding_type": self.finding_type, + "created_at": self.created_at, + "updated_at": self.updated_at, + } + + +def _resolve_path(db_path: str | Path | None) -> Path: + """Pick a writable DB path. 
Env > arg > default; fall back to ~/.protoagent.""" + raw = os.environ.get("KNOWLEDGE_DB_PATH") or db_path or DEFAULT_DB_PATH + p = Path(str(raw)).expanduser() + try: + p.parent.mkdir(parents=True, exist_ok=True) + # Probe writability + probe = p.parent / ".write-probe" + probe.touch() + probe.unlink() + return p + except OSError: + fallback = Path.home() / ".protoagent" / "knowledge" / "agent.db" + fallback.parent.mkdir(parents=True, exist_ok=True) + log.info( + "[knowledge] %s not writable; using %s instead", + p, fallback, + ) + return fallback + + +def _now_iso() -> str: + return datetime.now(timezone.utc).isoformat() + + +def _has_fts5(db: sqlite3.Connection) -> bool: + try: + db.execute( + "CREATE VIRTUAL TABLE IF NOT EXISTS _fts5_probe USING fts5(x)" + ) + db.execute("DROP TABLE _fts5_probe") + return True + except sqlite3.OperationalError: + return False + + +_SCHEMA = """ +CREATE TABLE IF NOT EXISTS chunks ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + content TEXT NOT NULL, + domain TEXT NOT NULL DEFAULT 'general', + heading TEXT, + source TEXT, + source_type TEXT, + finding_type TEXT, + created_at TEXT NOT NULL, + updated_at TEXT NOT NULL +); + +CREATE INDEX IF NOT EXISTS idx_chunks_domain ON chunks(domain); +CREATE INDEX IF NOT EXISTS idx_chunks_created_at ON chunks(created_at); +""" + +_FTS_SCHEMA = """ +CREATE VIRTUAL TABLE IF NOT EXISTS chunks_fts USING fts5( + content, heading, content='chunks', content_rowid='id' +); + +CREATE TRIGGER IF NOT EXISTS chunks_ai AFTER INSERT ON chunks BEGIN + INSERT INTO chunks_fts(rowid, content, heading) + VALUES (new.id, new.content, new.heading); +END; + +CREATE TRIGGER IF NOT EXISTS chunks_ad AFTER DELETE ON chunks BEGIN + INSERT INTO chunks_fts(chunks_fts, rowid, content, heading) + VALUES('delete', old.id, old.content, old.heading); +END; + +CREATE TRIGGER IF NOT EXISTS chunks_au AFTER UPDATE ON chunks BEGIN + INSERT INTO chunks_fts(chunks_fts, rowid, content, heading) + VALUES('delete', old.id, old.content, 
old.heading); + INSERT INTO chunks_fts(rowid, content, heading) + VALUES (new.id, new.content, new.heading); +END; +""" + + +class KnowledgeStore: + """Default knowledge store. Sqlite + FTS5 (with LIKE fallback). + + Forks usually don't subclass this — extend ``add_chunk`` / + ``search`` directly when you need new fields, or wrap it with + your own embedding layer. + """ + + def __init__(self, db_path: str | Path | None = None): + self.path = _resolve_path(db_path) + self._fts_available: bool | None = None + self._init_db() + + # ── connection / schema ───────────────────────────────────────────────── + + def _connect(self) -> sqlite3.Connection: + db = sqlite3.connect(str(self.path)) + db.row_factory = sqlite3.Row + db.execute("PRAGMA journal_mode=WAL") + return db + + def _init_db(self) -> None: + try: + db = self._connect() + db.executescript(_SCHEMA) + self._fts_available = _has_fts5(db) + if self._fts_available: + db.executescript(_FTS_SCHEMA) + else: + log.info( + "[knowledge] FTS5 unavailable — search will use LIKE fallback" + ) + db.commit() + db.close() + except sqlite3.OperationalError as exc: + log.error("[knowledge] schema init failed at %s: %s", self.path, exc) + + # Convenience for middleware that wants the raw connection. Kept + # private so the public API stays small. + def _get_db(self) -> sqlite3.Connection | None: + try: + return self._connect() + except sqlite3.OperationalError as exc: + log.error("[knowledge] connect failed: %s", exc) + return None + + # ── writes ────────────────────────────────────────────────────────────── + + def add_chunk( + self, + content: str, + domain: str = "general", + heading: str | None = None, + *, + source: str | None = None, + source_type: str | None = None, + finding_type: str | None = None, + ) -> int | None: + """Insert a chunk. 
Returns the new row id, or None on failure.""" + if not content or not content.strip(): + return None + db = self._get_db() + if db is None: + return None + try: + now = _now_iso() + cur = db.execute( + "INSERT INTO chunks " + "(content, domain, heading, source, source_type, finding_type, " + "created_at, updated_at) VALUES (?, ?, ?, ?, ?, ?, ?, ?)", + (content, domain, heading, source, source_type, finding_type, now, now), + ) + db.commit() + return int(cur.lastrowid) + except sqlite3.OperationalError as exc: + log.error("[knowledge] add_chunk failed: %s", exc) + return None + finally: + db.close() + + def add_finding( + self, + content: str, + source: str = "conversation", + source_type: str = "chat", + finding_type: str = "insight", + ) -> int | None: + """Compatibility shim for ``MemoryMiddleware.after_agent``. + + Stored under ``domain='finding'`` so memory_list / memory_recall + can surface them alongside operator-set chunks. + """ + return self.add_chunk( + content, + domain="finding", + source=source, + source_type=source_type, + finding_type=finding_type, + ) + + # ── reads ─────────────────────────────────────────────────────────────── + + def search( + self, + query: str, + k: int = 5, + *, + domain: str | None = None, + ) -> list[dict[str, Any]]: + """Top-k chunks matching ``query``. Shape matches what the + ``KnowledgeMiddleware`` consumes: each result has ``table``, + ``preview``, plus the underlying chunk fields. + + Uses FTS5 when available, else a tokenized LIKE fallback. Returns + an empty list on no matches or DB failure (never raises). 
+ """ + if not query or not query.strip(): + return [] + db = self._get_db() + if db is None: + return [] + try: + rows = self._search_fts(db, query, k, domain) if self._fts_available \ + else self._search_like(db, query, k, domain) + except sqlite3.OperationalError as exc: + log.warning("[knowledge] search failed: %s", exc) + rows = [] + finally: + db.close() + + results: list[dict[str, Any]] = [] + for r in rows: + preview = (r["heading"] + ": " if r["heading"] else "") + r["content"] + results.append({ + "table": "chunks", + "preview": preview[:240], + **dict(r), + }) + return results + + def _search_fts( + self, + db: sqlite3.Connection, + query: str, + k: int, + domain: str | None, + ) -> list[sqlite3.Row]: + # Sanitize to FTS5-safe tokens; OR them so a multi-word query + # matches any of the keywords (closer to LIKE behaviour). + tokens = [t for t in re.findall(r"[\w']+", query) if t] + if not tokens: + return [] + match = " OR ".join(tokens) + if domain: + return db.execute( + "SELECT c.* FROM chunks_fts f " + "JOIN chunks c ON c.id = f.rowid " + "WHERE chunks_fts MATCH ? AND c.domain = ? " + "ORDER BY rank LIMIT ?", + (match, domain, k), + ).fetchall() + return db.execute( + "SELECT c.* FROM chunks_fts f " + "JOIN chunks c ON c.id = f.rowid " + "WHERE chunks_fts MATCH ? " + "ORDER BY rank LIMIT ?", + (match, k), + ).fetchall() + + def _search_like( + self, + db: sqlite3.Connection, + query: str, + k: int, + domain: str | None, + ) -> list[sqlite3.Row]: + tokens = [t for t in re.findall(r"[\w']+", query) if t] + if not tokens: + return [] + # Score = number of tokens matched (rough recall-style ranking). + like_clauses = " + ".join( + "CASE WHEN content LIKE ? OR heading LIKE ? THEN 1 ELSE 0 END" + for _ in tokens + ) + params: list[Any] = [] + for t in tokens: + needle = f"%{t}%" + params.extend([needle, needle]) + sql = ( + f"SELECT *, ({like_clauses}) AS score FROM chunks " + "WHERE score > 0" + ) + if domain: + sql += " AND domain = ?" 
+ params.append(domain) + sql += " ORDER BY score DESC, id DESC LIMIT ?" + params.append(k) + return db.execute(sql, params).fetchall() + + def list_chunks( + self, + domain: str | None = None, + limit: int = 50, + ) -> list[Chunk]: + """Most-recent-first chunk listing. Used by ``memory_list``.""" + db = self._get_db() + if db is None: + return [] + try: + if domain: + rows = db.execute( + "SELECT * FROM chunks WHERE domain = ? ORDER BY id DESC LIMIT ?", + (domain, limit), + ).fetchall() + else: + rows = db.execute( + "SELECT * FROM chunks ORDER BY id DESC LIMIT ?", + (limit,), + ).fetchall() + except sqlite3.OperationalError as exc: + log.warning("[knowledge] list_chunks failed: %s", exc) + rows = [] + finally: + db.close() + return [Chunk(**dict(r)) for r in rows] + + def stats(self) -> dict[str, int]: + """Return per-domain chunk counts plus a ``total`` key.""" + db = self._get_db() + if db is None: + return {"total": 0} + try: + rows = db.execute( + "SELECT domain, COUNT(*) AS n FROM chunks GROUP BY domain ORDER BY n DESC" + ).fetchall() + total = db.execute("SELECT COUNT(*) FROM chunks").fetchone()[0] + except sqlite3.OperationalError as exc: + log.warning("[knowledge] stats failed: %s", exc) + return {"total": 0} + finally: + db.close() + out = {r["domain"]: r["n"] for r in rows} + out["total"] = int(total) + return out + + # ── verification helpers (used by evals/verify.py) ────────────────────── + + def find_chunk_containing( + self, + text: str, + domain: str | None = None, + ) -> Chunk | None: + """Return the most-recent chunk whose content or heading contains ``text``. + + Used by the eval runner to assert side-effect outcomes after a + memory-writing turn. + """ + db = self._get_db() + if db is None: + return None + try: + sql = ( + "SELECT * FROM chunks " + "WHERE (content LIKE ? OR heading LIKE ?)" + ) + params: list[Any] = [f"%{text}%", f"%{text}%"] + if domain: + sql += " AND domain = ?" 
+ params.append(domain) + sql += " ORDER BY id DESC LIMIT 1" + row = db.execute(sql, params).fetchone() + except sqlite3.OperationalError as exc: + log.warning("[knowledge] find_chunk_containing failed: %s", exc) + row = None + finally: + db.close() + return Chunk(**dict(row)) if row else None + + def delete_by_content(self, contains: str) -> int: + """Delete chunks whose content matches ``%contains%``. Returns count.""" + db = self._get_db() + if db is None: + return 0 + try: + cur = db.execute("DELETE FROM chunks WHERE content LIKE ?", (f"%{contains}%",)) + db.commit() + return int(cur.rowcount) + except sqlite3.OperationalError as exc: + log.warning("[knowledge] delete_by_content failed: %s", exc) + return 0 + finally: + db.close() + + def delete_by_heading(self, domain: str, heading: str) -> int: + """Delete chunks matching (domain, heading). Returns count.""" + db = self._get_db() + if db is None: + return 0 + try: + cur = db.execute( + "DELETE FROM chunks WHERE domain = ? AND heading = ?", + (domain, heading), + ) + db.commit() + return int(cur.rowcount) + except sqlite3.OperationalError as exc: + log.warning("[knowledge] delete_by_heading failed: %s", exc) + return 0 + finally: + db.close() diff --git a/server.py b/server.py index 75f9692..f1ecd46 100644 --- a/server.py +++ b/server.py @@ -90,8 +90,38 @@ def _init_langgraph_agent(): from graph.agent import create_agent_graph - _graph = create_agent_graph(_graph_config) - log.info("LangGraph agent initialized (model: %s)", _graph_config.model_name) + # Construct the default KnowledgeStore so memory tools (memory_ingest, + # memory_recall, daily_log) and KnowledgeMiddleware have something to + # bind to. Forks that don't want a store can set + # ``middleware.knowledge: false`` and remove the memory tools from + # the worker subagent — the store is still cheap to construct. 
+ knowledge_store = _build_knowledge_store(_graph_config) + + _graph = create_agent_graph(_graph_config, knowledge_store=knowledge_store) + log.info( + "LangGraph agent initialized (model: %s, knowledge_db: %s)", + _graph_config.model_name, + getattr(knowledge_store, "path", "(disabled)"), + ) + + +def _build_knowledge_store(config): + """Return a ``KnowledgeStore`` bound to the configured DB path. + + Best-effort: any sqlite-level failure is logged and the store + falls back to ``~/.protoagent/knowledge/agent.db`` automatically + (see ``knowledge.store._resolve_path``). Returns ``None`` only when + knowledge is disabled in config — kept as a separate code path so + forks can audit when the agent is running KB-less. + """ + if not getattr(config, "knowledge_middleware", True): + return None + try: + from knowledge import KnowledgeStore + return KnowledgeStore(db_path=config.knowledge_db_path) + except Exception as exc: + log.warning("[server] knowledge store init failed: %s; running KB-less", exc) + return None def _reload_langgraph_agent() -> tuple[bool, str]: @@ -130,7 +160,8 @@ def _reload_langgraph_agent() -> tuple[bool, str]: # metrics / card / auth all de-sync from what's actually running. 
if is_setup_complete(): try: - new_graph = create_agent_graph(new_config) + new_store = _build_knowledge_store(new_config) + new_graph = create_agent_graph(new_config, knowledge_store=new_store) except Exception as e: log.exception("[reload] graph rebuild failed") return False, f"graph rebuild failed: {e}" diff --git a/tests/test_config_io.py b/tests/test_config_io.py index 25a7472..39fc017 100644 --- a/tests/test_config_io.py +++ b/tests/test_config_io.py @@ -105,11 +105,11 @@ def test_apply_updates_nested_worker(tmp_path: Path) -> None: config_io.apply_updates_to_yaml( doc, - {"subagents": {"worker": {"enabled": True, "tools": ["echo", "calculator"]}}}, + {"subagents": {"worker": {"enabled": True, "tools": ["current_time", "calculator"]}}}, ) assert doc["subagents"]["worker"]["enabled"] is True - assert list(doc["subagents"]["worker"]["tools"]) == ["echo", "calculator"] + assert list(doc["subagents"]["worker"]["tools"]) == ["current_time", "calculator"] # ── config_to_dict ─────────────────────────────────────────────────────────── @@ -325,9 +325,9 @@ def test_list_available_tools_returns_starter_set(): # Lock in the template's starter set — forks replace these but # the drawer's CheckboxGroup populates from this call, so the # contract is "return tool names in a stable list". 
- assert "echo" in names - assert "calculator" in names assert "current_time" in names + assert "calculator" in names + assert "fetch_url" in names assert all(isinstance(n, str) for n in names) diff --git a/tests/test_skill_curator.py b/tests/test_skill_curator.py index cb7bf43..3d8e211 100644 --- a/tests/test_skill_curator.py +++ b/tests/test_skill_curator.py @@ -52,7 +52,7 @@ def _make_skill( "name": name, "description": description, "prompt_template": f"Run the {name} workflow.", - "tools_used": ["echo"], + "tools_used": ["current_time"], "confidence": confidence, "created_at": _utc_iso(days_ago), } diff --git a/tests/test_skill_emission.py b/tests/test_skill_emission.py index 34b8f1c..6d9555b 100644 --- a/tests/test_skill_emission.py +++ b/tests/test_skill_emission.py @@ -88,14 +88,14 @@ def test_skill_datapart_serialization() -> None: name="dp-test", description="DataPart test", prompt_template="prompt", - tools_used=["echo"], + tools_used=["current_time"], source_session_id="s1", ) part = artifact.to_datapart() assert part["kind"] == "data" assert part["metadata"]["mimeType"] == SKILL_V1_MIME assert part["data"]["name"] == "dp-test" - assert part["data"]["tools_used"] == ["echo"] + assert part["data"]["tools_used"] == ["current_time"] # created_at must be present and parseable datetime.fromisoformat(part["data"]["created_at"]) @@ -125,7 +125,7 @@ def test_skill_artifact_validation_tools_not_list() -> None: with pytest.raises(TypeError, match="tools_used"): SkillV1Artifact( name="x", description="d", prompt_template="p", - tools_used="echo", # type: ignore[arg-type] + tools_used="current_time", # type: ignore[arg-type] ) @@ -250,7 +250,7 @@ def _run_emit_logic( def test_skill_emitted_when_emit_skill_true() -> None: """Skill artifact is emitted when emit_skill=True and subagent succeeds.""" msgs = [ - _make_ai_message_with_tool_calls(["echo"]), + _make_ai_message_with_tool_calls(["current_time"]), _make_ai_message_with_content("done"), ] _run_emit_logic( @@ 
-264,7 +264,7 @@ def test_skill_emitted_when_emit_skill_true() -> None: assert len(skills) == 1 skill = skills[0] assert skill.name == "my-task" - assert skill.tools_used == ["echo"] + assert skill.tools_used == ["current_time"] assert skill.prompt_template == "do the thing" assert "Captured workflow" in skill.description @@ -272,7 +272,7 @@ def test_skill_emitted_when_emit_skill_true() -> None: def test_no_emission_on_opt_out() -> None: """No skill artifact is emitted when emit_skill=False.""" msgs = [ - _make_ai_message_with_tool_calls(["echo"]), + _make_ai_message_with_tool_calls(["current_time"]), _make_ai_message_with_content("done"), ] _run_emit_logic( @@ -307,7 +307,7 @@ def test_no_emission_on_failure() -> None: def test_no_emission_when_config_disallows() -> None: """No skill artifact is emitted when allow_skill_emission=False.""" msgs = [ - _make_ai_message_with_tool_calls(["echo"]), + _make_ai_message_with_tool_calls(["current_time"]), _make_ai_message_with_content("done"), ] _run_emit_logic( @@ -323,8 +323,8 @@ def test_no_emission_when_config_disallows() -> None: def test_tool_tracking_metadata_captured() -> None: """tools_used in the artifact lists all tools invoked, deduplicated.""" msgs = [ - _make_ai_message_with_tool_calls(["echo", "calculator"]), - _make_ai_message_with_tool_calls(["echo"]), # duplicate — should appear once + _make_ai_message_with_tool_calls(["current_time", "calculator"]), + _make_ai_message_with_tool_calls(["current_time"]), # duplicate — should appear once _make_ai_message_with_content("result"), ] _run_emit_logic( @@ -336,7 +336,7 @@ def test_tool_tracking_metadata_captured() -> None: ) skills = get_pending_skills() assert len(skills) == 1 - assert skills[0].tools_used.count("echo") == 1 + assert skills[0].tools_used.count("current_time") == 1 assert "calculator" in skills[0].tools_used diff --git a/tests/test_starter_tools.py b/tests/test_starter_tools.py index fe4495b..f469365 100644 --- a/tests/test_starter_tools.py +++ 
b/tests/test_starter_tools.py @@ -114,13 +114,3 @@ async def test_fetch_url_rejects_non_http_scheme(): ): result = await fetch_url.ainvoke({"url": bad}) assert result.startswith("Error:"), f"accepted unsafe url: {bad!r}" - - -# ── echo — sanity ──────────────────────────────────────────────────────────── - - -@pytest.mark.asyncio -async def test_echo_sanity(): - from tools.lg_tools import echo - result = await echo.ainvoke({"message": "hello"}) - assert result == "echo: hello" diff --git a/tools/lg_tools.py b/tools/lg_tools.py index d8ce0f5..b59ccba 100644 --- a/tools/lg_tools.py +++ b/tools/lg_tools.py @@ -7,12 +7,20 @@ The template ships with a small starter set of free, keyless tools so a fresh clone can demonstrate real agent behaviour out of the box: -- ``echo`` — sanity check - ``current_time`` — wall-clock time in any IANA timezone - ``calculator`` — safe numeric expression evaluation - ``web_search`` — DuckDuckGo text search (via ``ddgs``, no API key) - ``fetch_url`` — fetch a URL and return cleaned text +Plus memory tools that bind to a ``KnowledgeStore`` (constructed in +``server.py`` and threaded through ``get_all_tools(knowledge_store)``): + +- ``memory_ingest`` — store a fact / preference / note +- ``memory_recall`` — search the store for relevant chunks +- ``memory_list`` — list recent chunks (optionally per domain) +- ``memory_stats`` — per-domain counts +- ``daily_log`` — convenience: write a daily-log chunk + Replace or extend this file with your agent's real tools and update ``get_all_tools()`` to return the full list. @@ -39,20 +47,6 @@ from langchain_core.tools import tool -# ── echo ───────────────────────────────────────────────────────────────────── - - -@tool -async def echo(message: str) -> str: - """Echo the input back with a prefix. Template-only sanity tool. - - Useful to verify the tool loop is wired end-to-end before real - tools are in place. Safe to delete once your fork has its own - tools. 
- """ - return f"echo: {message}" - - # ── current_time ───────────────────────────────────────────────────────────── @@ -273,16 +267,130 @@ def _extract_text_from_html(content: bytes) -> str: return "\n".join(lines) +# ── memory tools ───────────────────────────────────────────────────────────── +# +# Each memory tool is built by a factory that closes over the +# ``KnowledgeStore`` instance. Doing it this way (rather than module- +# level globals) keeps tests isolated — they pass a temp store and get +# a fresh tool list bound to it. Production constructs one store in +# ``server.py`` and reuses the bound tools for the lifetime of the +# process. + + +def _build_memory_tools(knowledge_store): + """Bind memory tools to a ``KnowledgeStore``. Returns a list.""" + from datetime import datetime, timezone + + @tool + async def memory_ingest( + content: str, + domain: str = "general", + heading: str | None = None, + ) -> str: + """Store a fact, preference, or note in long-term memory. + + Use this for things the operator wants you to remember across + sessions — preferences ("I take my coffee black"), facts about + the operator's environment, decisions worth recalling later. + + Args: + content: The text to remember. Be specific and self-contained; + the chunk is retrieved by keyword search. + domain: Logical bucket — ``"preferences"``, ``"context"``, + ``"general"``. Defaults to ``"general"``. + heading: Optional short label (e.g. ``"coffee"``) used as a + stable de-dupe key by the eval suite and curator. + + Returns ``"Stored chunk N in 'domain'."`` on success. + """ + chunk_id = knowledge_store.add_chunk(content, domain=domain, heading=heading) + if chunk_id is None: + return "Error: failed to store chunk (knowledge store unavailable)." + return f"Stored chunk {chunk_id} in {domain!r}." + + @tool + async def memory_recall(query: str, k: int = 5) -> str: + """Search long-term memory for chunks relevant to ``query``. + + Returns the top-k matches, one per line. 
Pull this when the + operator asks something where stored context is more reliable + than the model's own training data ("what's my coffee order?", + "remind me what we decided about the auth migration"). + + Returns ``"No matches."`` when the store is empty or nothing + scores above the keyword threshold. + """ + results = knowledge_store.search(query, k=k) + if not results: + return "No matches." + lines = [] + for r in results: + lines.append(f"[{r.get('domain', '?')}] {r['preview']}") + return "\n".join(lines) + + @tool + async def memory_list(domain: str | None = None, limit: int = 10) -> str: + """List the most recent chunks. Filter by domain when given. + + Useful when the operator asks for recent activity ("what did I + log today?") or wants to inspect what the agent has stored. + """ + chunks = knowledge_store.list_chunks(domain=domain, limit=limit) + if not chunks: + return f"No chunks in {domain or 'any domain'}." + lines = [] + for c in chunks: + head = f"[{c.domain}]" + if c.heading: + head += f" {c.heading}:" + preview = (c.content or "")[:200] + lines.append(f"{c.created_at} {head} {preview}") + return "\n".join(lines) + + @tool + async def memory_stats() -> str: + """Return chunk counts per domain. Useful for sanity checks.""" + s = knowledge_store.stats() + if s.get("total", 0) == 0: + return "Knowledge store is empty." + lines = [f"Total: {s['total']}"] + for k, v in s.items(): + if k == "total": + continue + lines.append(f" {k}: {v}") + return "\n".join(lines) + + @tool + async def daily_log(content: str) -> str: + """Append a daily-log entry for today. + + Stored under ``domain='daily-log'`` with today's UTC date as + the heading, so the same day's entries cluster together for + ``memory_list(domain='daily-log')`` queries. + """ + today = datetime.now(timezone.utc).date().isoformat() + chunk_id = knowledge_store.add_chunk( + content, domain="daily-log", heading=today, + ) + if chunk_id is None: + return "Error: failed to write daily log entry." 
+ return f"Logged ({today}): {content[:120]}" + + return [memory_ingest, memory_recall, memory_list, memory_stats, daily_log] + + # ── registry ───────────────────────────────────────────────────────────────── def get_all_tools(knowledge_store=None): """Return every LangChain tool the lead agent + subagents can use. - ``knowledge_store`` is threaded through for agents that ship a - knowledge / memory subsystem (see ``graph/middleware/knowledge.py`` - for the hook-in pattern). The template doesn't ship a store — the - parameter is kept so adding one later doesn't require touching - every call site. + When ``knowledge_store`` is provided, the memory tools are bound + to it and included. Forks that disable the store can pass + ``knowledge_store=None`` and the lead agent runs with the four + keyless tools only. """ - return [echo, current_time, calculator, web_search, fetch_url] + tools = [current_time, calculator, web_search, fetch_url] + if knowledge_store is not None: + tools.extend(_build_memory_tools(knowledge_store)) + return tools From 4d0d4288e483fa464def76382f8bb4b469c97bc0 Mon Sep 17 00:00:00 2001 From: Josh Mabry Date: Mon, 27 Apr 2026 14:46:58 -0700 Subject: [PATCH 11/24] fix(review): address PR #155 CodeRabbit feedback MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Real bugs: - evals/runner.py: teardown now runs in a finally block so seeded KB rows get cleaned up even when the verifier or client.ask() raises. expected_tools=[] now means "assert no tools fired" (was conflated with "no key" via the `or []` short-circuit, making the abstention case a no-op). - evals/runner.py + tasks.json: added a `stream` runner kind so AgentClient.stream() is reachable from tasks.json — new streaming_status_updates case asserts the SSE event sequence. - knowledge/store.py: PRAGMA journal_mode=WAL is now best-effort (read-only DBs no longer break _connect). 
FTS5 rebuild after schema install so an existing chunks table populated before FTS was added gets indexed. find_chunk_containing/delete_by_content reject empty/whitespace-only inputs to prevent LIKE '%%' wildcards from matching every row. Hardening: - tools/lg_tools.py: clamp memory_recall(k) to [1, 20] and memory_list(limit) to [1, 200] so the agent can't request arbitrarily large slices of the KB. Doc cleanup: - docs/guides/subagents.md: LangGraphConfig snippet had a stale "echo" reference; replaced with the new memory-tool list. - docs/tutorials/first-tool.md: WORKER_CONFIG example now appends git_sha alongside the bundled defaults instead of replacing them and dropping the memory tools. - docs/reference/starter-tools.md: "adding your own" snippet now preserves the conditional _build_memory_tools(knowledge_store) extension. - tests/test_config_io.py: starter-tool contract assertion now also covers web_search. Co-Authored-By: Claude Opus 4.7 (1M context) --- docs/guides/subagents.md | 6 +- docs/reference/starter-tools.md | 7 +- docs/tutorials/first-tool.md | 9 +- evals/runner.py | 155 +++++++++++++++++++++----------- evals/tasks.json | 10 +++ knowledge/store.py | 33 ++++++- tests/test_config_io.py | 1 + tools/lg_tools.py | 10 ++- 8 files changed, 167 insertions(+), 64 deletions(-) diff --git a/docs/guides/subagents.md b/docs/guides/subagents.md index 9fc1a9a..031cff6 100644 --- a/docs/guides/subagents.md +++ b/docs/guides/subagents.md @@ -56,7 +56,11 @@ The template's `LangGraphConfig` (in `graph/config.py`) has a `worker` field. Ad class LangGraphConfig: # ... existing fields ... 
worker: SubagentDef = field(default_factory=lambda: SubagentDef( - tools=["echo", "current_time", "calculator", "web_search", "fetch_url"], + tools=[ + "current_time", "calculator", "web_search", "fetch_url", + "memory_ingest", "memory_recall", "memory_list", "memory_stats", + "daily_log", + ], max_turns=20, )) researcher: SubagentDef = field(default_factory=lambda: SubagentDef( diff --git a/docs/reference/starter-tools.md b/docs/reference/starter-tools.md index c0918b5..60d74d4 100644 --- a/docs/reference/starter-tools.md +++ b/docs/reference/starter-tools.md @@ -180,11 +180,14 @@ async def my_tool(required_arg: str, optional_arg: int = 5) -> str: return f"Success: {result}" ``` -Then append it to the list in `get_all_tools()`: +Then append it to the keyless tool list in `get_all_tools()` — keep the conditional `_build_memory_tools(knowledge_store)` extension below it so the bundled memory tools still ship when a store is configured: ```python def get_all_tools(knowledge_store=None): - return [current_time, calculator, web_search, fetch_url, my_tool] + tools = [current_time, calculator, web_search, fetch_url, my_tool] + if knowledge_store is not None: + tools.extend(_build_memory_tools(knowledge_store)) + return tools ``` See [Write your first tool](/tutorials/first-tool) for the full walkthrough. diff --git a/docs/tutorials/first-tool.md b/docs/tutorials/first-tool.md index 87502d0..056a8e4 100644 --- a/docs/tutorials/first-tool.md +++ b/docs/tutorials/first-tool.md @@ -47,12 +47,17 @@ def get_all_tools(knowledge_store=None): ## 2. Allow the subagent to use it (optional) -If you want the worker subagent to be able to call `git_sha`, add it to the allowlist in `graph/subagents/config.py`: +If you want the worker subagent to be able to call `git_sha`, add it to the allowlist in `graph/subagents/config.py`. Append rather than replace — dropping the bundled defaults removes the worker's memory tools: ```python WORKER_CONFIG = SubagentConfig( # ... 
- tools=["current_time", "calculator", "web_search", "fetch_url", "git_sha"], + tools=[ + "current_time", "calculator", "web_search", "fetch_url", + "memory_ingest", "memory_recall", "memory_list", "memory_stats", + "daily_log", + "git_sha", # ← new + ], # ... ) ``` diff --git a/evals/runner.py b/evals/runner.py index ad5154b..522f830 100644 --- a/evals/runner.py +++ b/evals/runner.py @@ -129,7 +129,26 @@ async def _run_auth_check(client: AgentClient, case: dict) -> CaseResult: async def _run_ask(client: AgentClient, case: dict) -> CaseResult: + """Send via ``message/send`` + poll. Teardown always runs.""" + return await _run_prompt_case(client, case, streaming=False) + + +async def _run_stream(client: AgentClient, case: dict) -> CaseResult: + """Send via ``message/stream`` + SSE. Same assertion shape as ``ask``, + plus an optional ``expected_event_kinds`` list that asserts the SSE + stream surfaced the named event kinds (``status-update``, ``task``, + etc.) at least once.""" + return await _run_prompt_case(client, case, streaming=True) + + +async def _run_prompt_case( + client: AgentClient, + case: dict, + *, + streaming: bool, +) -> CaseResult: # Pre-seed state via direct DB writes (model never sees this). 
+ setup_applied = False if "setup" in case: err = verify.apply_setup(case["setup"]) if err: @@ -137,66 +156,93 @@ async def _run_ask(client: AgentClient, case: dict) -> CaseResult: case["id"], case["category"], case["name"], False, f"setup failed: {err}", ) + setup_applied = True - since = verify.audit_now() - result: TaskResult = await client.ask( - case["prompt"], timeout_s=case.get("timeout_s", 90), - ) + events: list[dict] = [] + result: TaskResult | None = None - if result.state != "completed": - if "teardown" in case: - verify.apply_teardown(case["teardown"]) - return CaseResult( - case["id"], case["category"], case["name"], False, - f"task state={result.state}; error={result.error or '(none)'}", - duration_ms=result.duration_ms, - raw={"text": result.text[:200]}, - ) + try: + since = verify.audit_now() - problems: list[str] = [] + if streaming: + events, result = await client.stream( + case["prompt"], timeout_s=case.get("timeout_s", 90), + ) + else: + result = await client.ask( + case["prompt"], timeout_s=case.get("timeout_s", 90), + ) - # Tool firing assertions. - expected_tools = case.get("expected_tools") or [] - if expected_tools: - await asyncio.sleep(0.3) # let the audit log catch up - entries = verify.audit_entries_since(since) - require_success = case.get("tool_outcome", "success") == "success" - passed, detail = verify.assert_tools_fired( - entries, expected_tools, require_success=require_success, + if result is None or result.state != "completed": + state = result.state if result else "no-final-event" + error = (result.error if result else None) or "(none)" + duration = result.duration_ms if result else 0 + text_preview = (result.text if result else "")[:200] + return CaseResult( + case["id"], case["category"], case["name"], False, + f"task state={state}; error={error}", + duration_ms=duration, + raw={"text": text_preview}, + ) + + problems: list[str] = [] + + # Tool firing assertions. 
``expected_tools is not None`` so an + # explicit empty list asserts that *no* tools fired (abstention + # cases). Missing key skips the audit check entirely. + expected_tools = case.get("expected_tools") + if expected_tools is not None: + await asyncio.sleep(0.3) # let the audit log catch up + entries = verify.audit_entries_since(since) + require_success = case.get("tool_outcome", "success") == "success" + passed, detail = verify.assert_tools_fired( + entries, expected_tools, require_success=require_success, + ) + if not passed: + problems.append(detail) + + # Text pattern assertions (case-insensitive substrings). + text_lower = result.text.lower() + for pattern in case.get("expected_patterns") or []: + if pattern.lower() not in text_lower: + problems.append(f"missing pattern {pattern!r}") + + # KB side-effect assertions. + vk = case.get("verify_kb") or {} + if "find_chunk_containing" in vk: + chunk = verify.find_chunk_containing( + vk["find_chunk_containing"], domain=vk.get("domain"), + ) + if not chunk: + problems.append(f"no chunk containing {vk['find_chunk_containing']!r}") + + # Streaming-only: assert the SSE event sequence surfaced the + # expected kinds at least once. + if streaming: + seen_kinds = {e.get("kind") for e in events} + for kind in case.get("expected_event_kinds") or []: + if kind not in seen_kinds: + problems.append(f"missing SSE event kind {kind!r}; saw {sorted(seen_kinds)}") + + detail = ( + "; ".join(problems) if problems + else f"OK ({result.duration_ms}ms, {result.usage.get('total_tokens', '?')}t)" ) - if not passed: - problems.append(detail) - - # Text pattern assertions (case-insensitive substrings). - text_lower = result.text.lower() - for pattern in case.get("expected_patterns") or []: - if pattern.lower() not in text_lower: - problems.append(f"missing pattern {pattern!r}") - - # KB side-effect assertions. 
- vk = case.get("verify_kb") or {} - if "find_chunk_containing" in vk: - chunk = verify.find_chunk_containing( - vk["find_chunk_containing"], domain=vk.get("domain"), + return CaseResult( + case["id"], case["category"], case["name"], + passed=not problems, + detail=detail, + duration_ms=result.duration_ms, + tokens=result.usage.get("total_tokens", 0) or 0, + raw={"reply": result.text[:300]}, ) - if not chunk: - problems.append(f"no chunk containing {vk['find_chunk_containing']!r}") - - if "teardown" in case: - verify.apply_teardown(case["teardown"]) - - detail = ( - "; ".join(problems) if problems - else f"OK ({result.duration_ms}ms, {result.usage.get('total_tokens', '?')}t)" - ) - return CaseResult( - case["id"], case["category"], case["name"], - passed=not problems, - detail=detail, - duration_ms=result.duration_ms, - tokens=result.usage.get("total_tokens", 0) or 0, - raw={"reply": result.text[:300]}, - ) + finally: + # Teardown unconditionally — even when the task crashed or + # an assertion raised — so seeded KB rows never leak into the + # next case. 
+ if setup_applied or "teardown" in case: + if "teardown" in case: + verify.apply_teardown(case["teardown"]) # ── dispatch ──────────────────────────────────────────────────────────────── @@ -206,6 +252,7 @@ async def _run_ask(client: AgentClient, case: dict) -> CaseResult: "agent_card": _run_agent_card, "auth_check": _run_auth_check, "ask": _run_ask, + "stream": _run_stream, } diff --git a/evals/tasks.json b/evals/tasks.json index 14cdd16..d4b5389 100644 --- a/evals/tasks.json +++ b/evals/tasks.json @@ -17,6 +17,16 @@ "bad_token": "definitely-not-the-real-token", "expect": {"status": 401} }, + { + "id": "streaming_status_updates", + "category": "a2a-protocol", + "kind": "stream", + "name": "message/stream surfaces status-update events ending in final=true", + "prompt": "Hi.", + "expected_tools": [], + "expected_patterns": [], + "expected_event_kinds": ["status-update"] + }, { "id": "abstain_no_tool", diff --git a/knowledge/store.py b/knowledge/store.py index a6aee7a..72d9353 100644 --- a/knowledge/store.py +++ b/knowledge/store.py @@ -162,7 +162,14 @@ def __init__(self, db_path: str | Path | None = None): def _connect(self) -> sqlite3.Connection: db = sqlite3.connect(str(self.path)) db.row_factory = sqlite3.Row - db.execute("PRAGMA journal_mode=WAL") + # WAL is best-effort — read-only sqlite files (e.g. immutable + # mounts) reject the PRAGMA. The connection stays usable for + # reads; only writes will fail later, and those go through + # the per-method OperationalError guards. + try: + db.execute("PRAGMA journal_mode=WAL") + except sqlite3.OperationalError as exc: + log.debug("[knowledge] PRAGMA journal_mode=WAL skipped: %s", exc) return db def _init_db(self) -> None: @@ -172,6 +179,16 @@ def _init_db(self) -> None: self._fts_available = _has_fts5(db) if self._fts_available: db.executescript(_FTS_SCHEMA) + # Re-index any pre-existing rows. 
The CREATE TRIGGER + # statements only fire on subsequent inserts, so a DB + # populated before FTS was added would have an empty + # virtual table without this rebuild. + try: + db.execute( + "INSERT INTO chunks_fts(chunks_fts) VALUES('rebuild')" + ) + except sqlite3.OperationalError as exc: + log.debug("[knowledge] FTS rebuild skipped: %s", exc) else: log.info( "[knowledge] FTS5 unavailable — search will use LIKE fallback" @@ -399,8 +416,12 @@ def find_chunk_containing( """Return the most-recent chunk whose content or heading contains ``text``. Used by the eval runner to assert side-effect outcomes after a - memory-writing turn. + memory-writing turn. Empty / whitespace-only ``text`` returns + ``None`` rather than building a ``LIKE '%%'`` predicate that + would match every row. """ + if not text or not text.strip(): + return None db = self._get_db() if db is None: return None @@ -423,7 +444,13 @@ def find_chunk_containing( return Chunk(**dict(row)) if row else None def delete_by_content(self, contains: str) -> int: - """Delete chunks whose content matches ``%contains%``. Returns count.""" + """Delete chunks whose content matches ``%contains%``. Returns count. + + Empty / whitespace-only ``contains`` is a no-op — the alternative + is ``DELETE WHERE content LIKE '%%'`` which wipes every row. + """ + if not contains or not contains.strip(): + return 0 db = self._get_db() if db is None: return 0 diff --git a/tests/test_config_io.py b/tests/test_config_io.py index 39fc017..caf0bb2 100644 --- a/tests/test_config_io.py +++ b/tests/test_config_io.py @@ -327,6 +327,7 @@ def test_list_available_tools_returns_starter_set(): # contract is "return tool names in a stable list". 
assert "current_time" in names assert "calculator" in names + assert "web_search" in names assert "fetch_url" in names assert all(isinstance(n, str) for n in names) diff --git a/tools/lg_tools.py b/tools/lg_tools.py index b59ccba..fd2af1a 100644 --- a/tools/lg_tools.py +++ b/tools/lg_tools.py @@ -277,6 +277,10 @@ def _extract_text_from_html(content: bytes) -> str: # process. +_MEMORY_RECALL_MAX_K = 20 +_MEMORY_LIST_MAX_LIMIT = 200 + + def _build_memory_tools(knowledge_store): """Bind memory tools to a ``KnowledgeStore``. Returns a list.""" from datetime import datetime, timezone @@ -320,7 +324,8 @@ async def memory_recall(query: str, k: int = 5) -> str: Returns ``"No matches."`` when the store is empty or nothing scores above the keyword threshold. """ - results = knowledge_store.search(query, k=k) + clamped_k = max(1, min(int(k), _MEMORY_RECALL_MAX_K)) + results = knowledge_store.search(query, k=clamped_k) if not results: return "No matches." lines = [] @@ -335,7 +340,8 @@ async def memory_list(domain: str | None = None, limit: int = 10) -> str: Useful when the operator asks for recent activity ("what did I log today?") or wants to inspect what the agent has stored. """ - chunks = knowledge_store.list_chunks(domain=domain, limit=limit) + clamped_limit = max(1, min(int(limit), _MEMORY_LIST_MAX_LIMIT)) + chunks = knowledge_store.list_chunks(domain=domain, limit=clamped_limit) if not chunks: return f"No chunks in {domain or 'any domain'}." 
lines = [] From cab3bd8d0f57d8df3ad98c25345cc36df2a11e68 Mon Sep 17 00:00:00 2001 From: Claude Date: Mon, 27 Apr 2026 22:02:53 +0000 Subject: [PATCH 12/24] fix(review-2): address round-2 PR #155 CodeRabbit feedback - evals/runner.py: collapse redundant nested teardown guard into a single `if "teardown" in case:` (SIM102); remove now-unused `setup_applied` flag - knowledge/store.py: use `datetime.UTC` alias (Python 3.11+, UP017) - tools/lg_tools.py: add `-> list` return annotation to `_build_memory_tools` (ANN202); replace explicit loop with list comprehension in `memory_recall` (PERF401) Co-Authored-By: Claude Opus 4.7 (1M context) https://claude.ai/code/session_01148o8ppbuQwuZBsVGTQWwQ --- evals/runner.py | 7 ++----- knowledge/store.py | 4 ++-- tools/lg_tools.py | 6 ++---- 3 files changed, 6 insertions(+), 11 deletions(-) diff --git a/evals/runner.py b/evals/runner.py index 522f830..d767f90 100644 --- a/evals/runner.py +++ b/evals/runner.py @@ -148,7 +148,6 @@ async def _run_prompt_case( streaming: bool, ) -> CaseResult: # Pre-seed state via direct DB writes (model never sees this). - setup_applied = False if "setup" in case: err = verify.apply_setup(case["setup"]) if err: @@ -156,7 +155,6 @@ async def _run_prompt_case( case["id"], case["category"], case["name"], False, f"setup failed: {err}", ) - setup_applied = True events: list[dict] = [] result: TaskResult | None = None @@ -240,9 +238,8 @@ async def _run_prompt_case( # Teardown unconditionally — even when the task crashed or # an assertion raised — so seeded KB rows never leak into the # next case. 
- if setup_applied or "teardown" in case: - if "teardown" in case: - verify.apply_teardown(case["teardown"]) + if "teardown" in case: + verify.apply_teardown(case["teardown"]) # ── dispatch ──────────────────────────────────────────────────────────────── diff --git a/knowledge/store.py b/knowledge/store.py index 72d9353..62c5f54 100644 --- a/knowledge/store.py +++ b/knowledge/store.py @@ -31,7 +31,7 @@ import re import sqlite3 from dataclasses import dataclass -from datetime import datetime, timezone +from datetime import UTC, datetime from pathlib import Path from typing import Any @@ -89,7 +89,7 @@ def _resolve_path(db_path: str | Path | None) -> Path: def _now_iso() -> str: - return datetime.now(timezone.utc).isoformat() + return datetime.now(UTC).isoformat() def _has_fts5(db: sqlite3.Connection) -> bool: diff --git a/tools/lg_tools.py b/tools/lg_tools.py index fd2af1a..161ddcb 100644 --- a/tools/lg_tools.py +++ b/tools/lg_tools.py @@ -281,7 +281,7 @@ def _extract_text_from_html(content: bytes) -> str: _MEMORY_LIST_MAX_LIMIT = 200 -def _build_memory_tools(knowledge_store): +def _build_memory_tools(knowledge_store) -> list: """Bind memory tools to a ``KnowledgeStore``. Returns a list.""" from datetime import datetime, timezone @@ -328,9 +328,7 @@ async def memory_recall(query: str, k: int = 5) -> str: results = knowledge_store.search(query, k=clamped_k) if not results: return "No matches." 
- lines = [] - for r in results: - lines.append(f"[{r.get('domain', '?')}] {r['preview']}") + lines = [f"[{r.get('domain', '?')}] {r['preview']}" for r in results] return "\n".join(lines) @tool From b3a9f1dae02730a13734614dfe8481717cc1c6c1 Mon Sep 17 00:00:00 2001 From: Josh Mabry Date: Mon, 27 Apr 2026 15:25:15 -0700 Subject: [PATCH 13/24] fix(review-3): address round-3 PR #155 CodeRabbit feedback MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Real bugs: - evals/runner.py: setup is now inside the try block so a partial setup failure (e.g. step 2 of 3 errors) still triggers the finally teardown — rows from steps that did succeed no longer leak into the next case. Was flagged as a duplicate from round 2. - knowledge/store.py: LIKE patterns now escape % and _ via ESCAPE '\' on every clause that takes user input (find_chunk_containing, delete_by_content, _search_like). A query for "100%" or "hello_world" no longer silently matches every row containing "100" or any single character between "hello" and "world". - knowledge/store.py: FTS5 MATCH tokens are now double-quoted via _fts_quote() so user-supplied query terms can't smuggle FTS5 operators (column filters, prefix wildcards, NEAR, AND/OR/NOT). Defence in depth — the [\w']+ tokenizer already filters most special chars. Hardening: - evals/runner.py: the fixed 0.3s asyncio.sleep waiting for the audit log to flush is gone. _await_audit_assertion now polls every 50ms up to a 2s deadline and returns as soon as the assertion passes — exits early on success, only burns the full deadline when the tool genuinely never fired. - evals/runner.py: _run_auth_check accepts case["headers"] so cases can override the default bearer-only header set and exercise X-API-Key auth scenarios (or both auths together). - knowledge/store.py: per-method exception handlers broadened from sqlite3.OperationalError to sqlite3.DatabaseError. 
Catches IntegrityError, ProgrammingError, and corruption variants too without crashing the agent loop. _has_fts5 (probe) and _connect (connection-time errors only) keep the narrower OperationalError. Co-Authored-By: Claude Opus 4.7 (1M context) --- evals/runner.py | 71 +++++++++++++++++++++++++++++++++--------- knowledge/store.py | 77 ++++++++++++++++++++++++++++++++++++---------- 2 files changed, 116 insertions(+), 32 deletions(-) diff --git a/evals/runner.py b/evals/runner.py index d767f90..2fa148f 100644 --- a/evals/runner.py +++ b/evals/runner.py @@ -91,7 +91,14 @@ async def _run_agent_card(client: AgentClient, case: dict) -> CaseResult: async def _run_auth_check(client: AgentClient, case: dict) -> CaseResult: - """Verify the A2A endpoint rejects a bad bearer with the expected status.""" + """Verify the A2A endpoint rejects a request with the expected status. + + Default behaviour exercises bearer auth alone using ``case["bad_token"]``. + Cases can override headers via ``case["headers"]`` to test other + auth surfaces — e.g. ``{"X-API-Key": "wrong"}`` for the legacy + X-API-Key path. ``Content-Type: application/json`` is always set + for the eval client; case headers override anything else. + """ import httpx expected_status = case.get("expect", {}).get("status", 401) @@ -99,8 +106,8 @@ async def _run_auth_check(client: AgentClient, case: dict) -> CaseResult: headers = { "Content-Type": "application/json", "Authorization": f"Bearer {bad}", - # No X-API-Key — testing bearer alone. 
} + headers.update(case.get("headers") or {}) payload = { "jsonrpc": "2.0", "id": "auth-check", @@ -141,25 +148,61 @@ async def _run_stream(client: AgentClient, case: dict) -> CaseResult: return await _run_prompt_case(client, case, streaming=True) +_AUDIT_POLL_DEADLINE_S = 2.0 +_AUDIT_POLL_INTERVAL_S = 0.05 + + +async def _await_audit_assertion( + since: str, + expected_tools: list[str], + *, + require_success: bool, +) -> tuple[list[dict], bool, str]: + """Poll the audit log until ``expected_tools`` have all fired (or the + deadline is hit). Returns ``(entries, passed, detail)``. + + Replaces a fixed ``asyncio.sleep`` — under audit-log contention the + fixed wait was sometimes shorter than the flush, causing flaky + tool-firing assertions. Polling exits as soon as the assertion + passes; the deadline only kicks in when the tool genuinely never + fired. + """ + deadline = asyncio.get_event_loop().time() + _AUDIT_POLL_DEADLINE_S + entries: list[dict] = [] + passed = False + detail = "" + while True: + entries = verify.audit_entries_since(since) + passed, detail = verify.assert_tools_fired( + entries, expected_tools, require_success=require_success, + ) + if passed or asyncio.get_event_loop().time() >= deadline: + return entries, passed, detail + await asyncio.sleep(_AUDIT_POLL_INTERVAL_S) + + async def _run_prompt_case( client: AgentClient, case: dict, *, streaming: bool, ) -> CaseResult: - # Pre-seed state via direct DB writes (model never sees this). - if "setup" in case: - err = verify.apply_setup(case["setup"]) - if err: - return CaseResult( - case["id"], case["category"], case["name"], False, - f"setup failed: {err}", - ) - events: list[dict] = [] result: TaskResult | None = None try: + # Pre-seed state via direct DB writes (model never sees this). + # Inside the ``try`` so a partial setup failure still triggers + # the ``finally`` teardown — otherwise rows from the steps that + # *did* succeed would leak into the next case. 
+ if "setup" in case: + err = verify.apply_setup(case["setup"]) + if err: + return CaseResult( + case["id"], case["category"], case["name"], False, + f"setup failed: {err}", + ) + since = verify.audit_now() if streaming: @@ -190,11 +233,9 @@ async def _run_prompt_case( # cases). Missing key skips the audit check entirely. expected_tools = case.get("expected_tools") if expected_tools is not None: - await asyncio.sleep(0.3) # let the audit log catch up - entries = verify.audit_entries_since(since) require_success = case.get("tool_outcome", "success") == "success" - passed, detail = verify.assert_tools_fired( - entries, expected_tools, require_success=require_success, + entries, passed, detail = await _await_audit_assertion( + since, expected_tools, require_success=require_success, ) if not passed: problems.append(detail) diff --git a/knowledge/store.py b/knowledge/store.py index 62c5f54..473bedd 100644 --- a/knowledge/store.py +++ b/knowledge/store.py @@ -17,7 +17,8 @@ - If the configured path is unwritable (running locally outside the container, no /sandbox), falls back to ``~/.protoagent/knowledge/agent.db`` so a fresh ``python server.py`` works without sudo. -- All write operations swallow ``sqlite3.OperationalError`` and log; +- All write operations swallow ``sqlite3.DatabaseError`` (covers + OperationalError, IntegrityError, and corruption variants) and log; the store never crashes the agent loop on a corrupt or read-only DB. Forks that want embeddings on top of FTS5 can subclass and override @@ -92,6 +93,36 @@ def _now_iso() -> str: return datetime.now(UTC).isoformat() +# LIKE escaping — sqlite treats ``%`` and ``_`` as wildcards in LIKE +# patterns. Without escaping, a search for ``"100%"`` matches every row +# starting with ``"100"`` instead of literal "100%". We escape them +# alongside the escape char itself, then bind ``ESCAPE '\'`` on every +# LIKE clause that takes user input. 
+_LIKE_ESCAPE = "\\" + + +def _escape_like(text: str) -> str: + """Escape ``%``, ``_``, and the escape char for safe LIKE matching.""" + return ( + text + .replace(_LIKE_ESCAPE, _LIKE_ESCAPE + _LIKE_ESCAPE) + .replace("%", _LIKE_ESCAPE + "%") + .replace("_", _LIKE_ESCAPE + "_") + ) + + +def _fts_quote(token: str) -> str: + """Quote a token for FTS5 MATCH so it's treated as a literal phrase. + + FTS5 has its own query syntax (column filters, prefix wildcards, + NEAR, AND/OR/NOT operators). Wrapping each token in double quotes + forces FTS5 to interpret it as a phrase token, neutralising any + operator characters the user happened to type. Internal double + quotes are doubled per FTS5 phrase rules. + """ + return '"' + token.replace('"', '""') + '"' + + def _has_fts5(db: sqlite3.Connection) -> bool: try: db.execute( @@ -187,7 +218,7 @@ def _init_db(self) -> None: db.execute( "INSERT INTO chunks_fts(chunks_fts) VALUES('rebuild')" ) - except sqlite3.OperationalError as exc: + except sqlite3.DatabaseError as exc: log.debug("[knowledge] FTS rebuild skipped: %s", exc) else: log.info( @@ -195,7 +226,7 @@ def _init_db(self) -> None: ) db.commit() db.close() - except sqlite3.OperationalError as exc: + except sqlite3.DatabaseError as exc: log.error("[knowledge] schema init failed at %s: %s", self.path, exc) # Convenience for middleware that wants the raw connection. 
Kept @@ -235,7 +266,7 @@ def add_chunk( ) db.commit() return int(cur.lastrowid) - except sqlite3.OperationalError as exc: + except sqlite3.DatabaseError as exc: log.error("[knowledge] add_chunk failed: %s", exc) return None finally: @@ -285,7 +316,7 @@ def search( try: rows = self._search_fts(db, query, k, domain) if self._fts_available \ else self._search_like(db, query, k, domain) - except sqlite3.OperationalError as exc: + except sqlite3.DatabaseError as exc: log.warning("[knowledge] search failed: %s", exc) rows = [] finally: @@ -310,10 +341,14 @@ def _search_fts( ) -> list[sqlite3.Row]: # Sanitize to FTS5-safe tokens; OR them so a multi-word query # matches any of the keywords (closer to LIKE behaviour). + # Each token is double-quoted so FTS5 treats it as a literal + # phrase rather than parsing operators (column filters, prefix + # wildcards, NEAR, etc.) — even though ``[\w']+`` already + # filters most special chars, defence in depth is cheap. tokens = [t for t in re.findall(r"[\w']+", query) if t] if not tokens: return [] - match = " OR ".join(tokens) + match = " OR ".join(_fts_quote(t) for t in tokens) if domain: return db.execute( "SELECT c.* FROM chunks_fts f " @@ -341,14 +376,18 @@ def _search_like( if not tokens: return [] # Score = number of tokens matched (rough recall-style ranking). + # User-supplied tokens are LIKE-escaped so a query containing + # ``%`` or ``_`` doesn't silently match every row; ESCAPE is + # bound on each clause. like_clauses = " + ".join( - "CASE WHEN content LIKE ? OR heading LIKE ? THEN 1 ELSE 0 END" + "CASE WHEN content LIKE ? ESCAPE ? OR heading LIKE ? ESCAPE ? 
" + "THEN 1 ELSE 0 END" for _ in tokens ) params: list[Any] = [] for t in tokens: - needle = f"%{t}%" - params.extend([needle, needle]) + needle = f"%{_escape_like(t)}%" + params.extend([needle, _LIKE_ESCAPE, needle, _LIKE_ESCAPE]) sql = ( f"SELECT *, ({like_clauses}) AS score FROM chunks " "WHERE score > 0" @@ -380,7 +419,7 @@ def list_chunks( "SELECT * FROM chunks ORDER BY id DESC LIMIT ?", (limit,), ).fetchall() - except sqlite3.OperationalError as exc: + except sqlite3.DatabaseError as exc: log.warning("[knowledge] list_chunks failed: %s", exc) rows = [] finally: @@ -397,7 +436,7 @@ def stats(self) -> dict[str, int]: "SELECT domain, COUNT(*) AS n FROM chunks GROUP BY domain ORDER BY n DESC" ).fetchall() total = db.execute("SELECT COUNT(*) FROM chunks").fetchone()[0] - except sqlite3.OperationalError as exc: + except sqlite3.DatabaseError as exc: log.warning("[knowledge] stats failed: %s", exc) return {"total": 0} finally: @@ -426,17 +465,18 @@ def find_chunk_containing( if db is None: return None try: + needle = f"%{_escape_like(text)}%" sql = ( "SELECT * FROM chunks " - "WHERE (content LIKE ? OR heading LIKE ?)" + "WHERE (content LIKE ? ESCAPE ? OR heading LIKE ? ESCAPE ?)" ) - params: list[Any] = [f"%{text}%", f"%{text}%"] + params: list[Any] = [needle, _LIKE_ESCAPE, needle, _LIKE_ESCAPE] if domain: sql += " AND domain = ?" params.append(domain) sql += " ORDER BY id DESC LIMIT 1" row = db.execute(sql, params).fetchone() - except sqlite3.OperationalError as exc: + except sqlite3.DatabaseError as exc: log.warning("[knowledge] find_chunk_containing failed: %s", exc) row = None finally: @@ -455,10 +495,13 @@ def delete_by_content(self, contains: str) -> int: if db is None: return 0 try: - cur = db.execute("DELETE FROM chunks WHERE content LIKE ?", (f"%{contains}%",)) + cur = db.execute( + "DELETE FROM chunks WHERE content LIKE ? 
ESCAPE ?", + (f"%{_escape_like(contains)}%", _LIKE_ESCAPE), + ) db.commit() return int(cur.rowcount) - except sqlite3.OperationalError as exc: + except sqlite3.DatabaseError as exc: log.warning("[knowledge] delete_by_content failed: %s", exc) return 0 finally: @@ -476,7 +519,7 @@ def delete_by_heading(self, domain: str, heading: str) -> int: ) db.commit() return int(cur.rowcount) - except sqlite3.OperationalError as exc: + except sqlite3.DatabaseError as exc: log.warning("[knowledge] delete_by_heading failed: %s", exc) return 0 finally: From b713d8d0997320520d7a14e503fdb12d26fd282e Mon Sep 17 00:00:00 2001 From: Claude Date: Mon, 27 Apr 2026 22:47:04 +0000 Subject: [PATCH 14/24] fix(review-4): address round-4 PR #155 CodeRabbit feedback - evals/runner.py: use asyncio.get_running_loop() instead of the deprecated get_event_loop() inside the _await_audit_assertion coroutine - evals/runner.py: prefix unused _entries return value with underscore - evals/runner.py: use datetime.UTC alias (consistent with store.py), drop now-unused timezone import - knowledge/store.py: broaden _get_db exception catch from OperationalError to DatabaseError so corrupt-DB errors are swallowed per the module's no-crash contract - knowledge/store.py: replace log.error with log.exception in all three DatabaseError handlers (schema init, _get_db, add_chunk) so tracebacks appear in error logs Co-Authored-By: Claude https://claude.ai/code/session_01YW5U6mtpLy4rzKmqd4trkH --- evals/runner.py | 10 +++++----- knowledge/store.py | 12 ++++++------ 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/evals/runner.py b/evals/runner.py index 2fa148f..7b66d4d 100644 --- a/evals/runner.py +++ b/evals/runner.py @@ -36,7 +36,7 @@ import sys import time from dataclasses import asdict, dataclass, field -from datetime import datetime, timezone +from datetime import UTC, datetime from pathlib import Path # Allow ``python -m evals.runner`` and ``python evals/runner.py``. 
@@ -167,7 +167,7 @@ async def _await_audit_assertion( passes; the deadline only kicks in when the tool genuinely never fired. """ - deadline = asyncio.get_event_loop().time() + _AUDIT_POLL_DEADLINE_S + deadline = asyncio.get_running_loop().time() + _AUDIT_POLL_DEADLINE_S entries: list[dict] = [] passed = False detail = "" @@ -176,7 +176,7 @@ async def _await_audit_assertion( passed, detail = verify.assert_tools_fired( entries, expected_tools, require_success=require_success, ) - if passed or asyncio.get_event_loop().time() >= deadline: + if passed or asyncio.get_running_loop().time() >= deadline: return entries, passed, detail await asyncio.sleep(_AUDIT_POLL_INTERVAL_S) @@ -234,7 +234,7 @@ async def _run_prompt_case( expected_tools = case.get("expected_tools") if expected_tools is not None: require_success = case.get("tool_outcome", "success") == "success" - entries, passed, detail = await _await_audit_assertion( + _entries, passed, detail = await _await_audit_assertion( since, expected_tools, require_success=require_success, ) if not passed: @@ -337,7 +337,7 @@ def _print_board(results: list[CaseResult]) -> None: def _save_report(results: list[CaseResult], path: Path) -> None: path.parent.mkdir(parents=True, exist_ok=True) payload = { - "ts": datetime.now(timezone.utc).isoformat(), + "ts": datetime.now(UTC).isoformat(), "total": len(results), "passed": sum(1 for r in results if r.passed), "results": [asdict(r) for r in results], diff --git a/knowledge/store.py b/knowledge/store.py index 473bedd..d26d8a7 100644 --- a/knowledge/store.py +++ b/knowledge/store.py @@ -226,16 +226,16 @@ def _init_db(self) -> None: ) db.commit() db.close() - except sqlite3.DatabaseError as exc: - log.error("[knowledge] schema init failed at %s: %s", self.path, exc) + except sqlite3.DatabaseError: + log.exception("[knowledge] schema init failed at %s", self.path) # Convenience for middleware that wants the raw connection. Kept # private so the public API stays small. 
def _get_db(self) -> sqlite3.Connection | None: try: return self._connect() - except sqlite3.OperationalError as exc: - log.error("[knowledge] connect failed: %s", exc) + except sqlite3.DatabaseError: + log.exception("[knowledge] connect failed") return None # ── writes ────────────────────────────────────────────────────────────── @@ -266,8 +266,8 @@ def add_chunk( ) db.commit() return int(cur.lastrowid) - except sqlite3.DatabaseError as exc: - log.error("[knowledge] add_chunk failed: %s", exc) + except sqlite3.DatabaseError: + log.exception("[knowledge] add_chunk failed") return None finally: db.close() From ae752a5e523c85714a48846eb9d53ee7e6b77b1c Mon Sep 17 00:00:00 2001 From: Claude Date: Mon, 27 Apr 2026 22:48:12 +0000 Subject: [PATCH 15/24] chore: add uv.lock generated during round-4 review session https://claude.ai/code/session_01YW5U6mtpLy4rzKmqd4trkH --- uv.lock | 8 ++++++++ 1 file changed, 8 insertions(+) create mode 100644 uv.lock diff --git a/uv.lock b/uv.lock new file mode 100644 index 0000000..7eae9e0 --- /dev/null +++ b/uv.lock @@ -0,0 +1,8 @@ +version = 1 +revision = 3 +requires-python = ">=3.11" + +[[package]] +name = "protoagent" +version = "0.2.1" +source = { virtual = "." } From 0609c67b4fd8b15707d161ffba67f759c3507ca5 Mon Sep 17 00:00:00 2001 From: Josh Mabry Date: Mon, 27 Apr 2026 18:30:56 -0700 Subject: [PATCH 16/24] feat: ship pluggable scheduler (local sqlite + Workstacean adapter) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds a default scheduler so agents can defer work to themselves — "remind me tomorrow", recurring sweeps, deadline check-ins. Three new tools land in get_all_tools() when a backend is wired up: schedule_task, list_schedules, cancel_schedule. Two backends ship behind a single SchedulerBackend protocol: - LocalScheduler (default): sqlite + asyncio polling. Per-agent jobs.db at /sandbox/scheduler// with a ~/.protoagent/scheduler// fallback. 
Fires by POSTing message/send to the running agent's own /a2a endpoint, going through bearer + X-API-Key auth like a real caller (audit log + cost-v1 capture work the same). Cron expressions reschedule via croniter; ISO datetimes are one-shot. Missed-fire recovery: within 24h fires immediately, older fires roll forward without firing. - WorkstaceanScheduler: HTTP adapter to a Workstacean install's POST /publish. Activated automatically when WORKSTACEAN_API_BASE and WORKSTACEAN_API_KEY env vars are set. Topic and job IDs are namespaced cron.. so a single Workstacean can serve N protoAgent forks safely. Multi-agent isolation is the headline architectural property — spinning up gina-personal alongside gina-work on the same box (or sharing one Workstacean) won't cross-fire scheduled prompts. Verified with explicit tests in test_scheduler_local.py. Wiring: - scheduler/{__init__,interface,local,workstacean}.py — module - tools/lg_tools.py — _build_scheduler_tools factory; get_all_tools takes a new optional scheduler= kwarg - graph/agent.py — create_agent_graph and create_simple_agent accept scheduler= - server.py — _build_scheduler() picks backend at boot, on_event("startup"/"shutdown") drives the polling task lifecycle, reload path reuses the running scheduler instance - config/langgraph-config.yaml + graph/{config,subagents/config}.py — worker subagent gets the three new tools in its allowlist - requirements.txt — croniter>=2.0 Tests: 48 new (test_scheduler_local.py covers add/list/cancel, multi-agent isolation, reschedule-vs-delete, missed-fire recovery, and an end-to-end fire path with httpx mocked; test_scheduler_workstacean.py covers all the publish payload assertions, namespacing, custom topic prefix, and HTTP error handling). Docs: docs/guides/scheduler.md (Diataxis how-to with the firing model, multi-agent story, env reference, and notes on the Workstacean A2A-bridge gap), plus index/configuration/README/ TEMPLATE updates. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- README.md | 1 + TEMPLATE.md | 22 ++ config/langgraph-config.yaml | 3 + docs/guides/index.md | 1 + docs/guides/scheduler.md | 171 +++++++++++++ docs/reference/configuration.md | 13 + graph/agent.py | 7 +- graph/config.py | 1 + graph/subagents/config.py | 1 + requirements.txt | 4 + scheduler/__init__.py | 27 ++ scheduler/interface.py | 114 +++++++++ scheduler/local.py | 381 ++++++++++++++++++++++++++++ scheduler/workstacean.py | 183 +++++++++++++ server.py | 115 ++++++++- tests/test_scheduler_local.py | 290 +++++++++++++++++++++ tests/test_scheduler_workstacean.py | 168 ++++++++++++ tools/lg_tools.py | 115 ++++++++- 18 files changed, 1606 insertions(+), 11 deletions(-) create mode 100644 docs/guides/scheduler.md create mode 100644 scheduler/__init__.py create mode 100644 scheduler/interface.py create mode 100644 scheduler/local.py create mode 100644 scheduler/workstacean.py create mode 100644 tests/test_scheduler_local.py create mode 100644 tests/test_scheduler_workstacean.py diff --git a/README.md b/README.md index ef54b84..4a7036f 100644 --- a/README.md +++ b/README.md @@ -32,6 +32,7 @@ rename / release-pipeline wiring. | Subagents | `graph/subagents/config.py` | DeerFlow-pattern delegation via a `task()` tool; one placeholder `worker` ships | | Starter tools | `tools/lg_tools.py` | Keyless general tools (`current_time`, `calculator` safe AST eval, `web_search` via DuckDuckGo, `fetch_url`) plus memory tools (`memory_ingest`, `memory_recall`, `memory_list`, `memory_stats`, `daily_log`) bound to the bundled store | | Knowledge store | `knowledge/store.py` | sqlite + FTS5 (LIKE fallback). One `chunks` table for operator notes, daily-log entries, and conversation findings. Default-on; turn off with `middleware.knowledge: false` | +| Scheduler | `scheduler/` | `schedule_task` / `list_schedules` / `cancel_schedule` tools backed by either a bundled sqlite scheduler or a Workstacean adapter (env-selected). 
Multi-agent-safe — every job is namespaced by `AGENT_NAME`. See [Schedule future work](./docs/guides/scheduler.md) | | Eval harness | `evals/` | Side-effect-verified A2A test harness — audit log + reply text + KB state. `python -m evals.runner` against a running agent. See [Eval your fork](./docs/guides/evals.md) | | Tracing | `tracing.py` | Langfuse trace_session with distributed `a2a.trace` propagation and the OTel cross-context-detach filter | | Observability | `metrics.py`, `audit.py` | Prometheus metrics with per-agent prefix, JSONL audit log with trace IDs | diff --git a/TEMPLATE.md b/TEMPLATE.md index 08ff5b3..4408c90 100644 --- a/TEMPLATE.md +++ b/TEMPLATE.md @@ -180,6 +180,28 @@ See [Eval your fork](./docs/guides/evals.md) for what each case asserts, how the three assertion channels work, and how to add cases for your fork's new tools. +## 9b. Scheduler — local sqlite or Workstacean + +The bundled scheduler ships three agent tools — `schedule_task`, +`list_schedules`, `cancel_schedule` — backed by either a local +sqlite poller or a Workstacean adapter, selected at startup via env: + +```bash +# Default: local sqlite, persists at /sandbox/scheduler//jobs.db +python server.py + +# Workstacean: set both and restart +export WORKSTACEAN_API_BASE=http://your-workstacean:3000 +export WORKSTACEAN_API_KEY=... +python server.py +``` + +Multi-fork safety: every job is namespaced by `AGENT_NAME`, so +spinning up `gina-personal` next to `gina-work` (or any number of +ginas under one Workstacean) doesn't cross-fire prompts. See +[Schedule future work](./docs/guides/scheduler.md) for the full +firing model and integration notes. + ## 9a. Understand the skill loop protoAgent's skill loop lets your agent learn from experience automatically. 
diff --git a/config/langgraph-config.yaml b/config/langgraph-config.yaml index 05bada2..c75ff71 100644 --- a/config/langgraph-config.yaml +++ b/config/langgraph-config.yaml @@ -32,6 +32,9 @@ subagents: - memory_list - memory_stats - daily_log + - schedule_task + - list_schedules + - cancel_schedule max_turns: 20 middleware: diff --git a/docs/guides/index.md b/docs/guides/index.md index 65dc41e..ce26b48 100644 --- a/docs/guides/index.md +++ b/docs/guides/index.md @@ -10,4 +10,5 @@ Task-oriented procedures. Assumes you already have a running agent (see [Tutoria | [Configure subagents](/guides/subagents) | You want specialized delegates beyond the placeholder `worker` | | [Wire Langfuse + Prometheus](/guides/observability) | You need traces and metrics in production | | [Eval your fork](/guides/evals) | You want a baseline pass-rate for the tools / memory / A2A surface in your fork | +| [Schedule future work](/guides/scheduler) | You want the agent to defer tasks to itself ("remind me tomorrow", recurring sweeps) — local sqlite or Workstacean-backed | | [Deploy via GHCR](/guides/deploy) | You're ready to ship and want auto-deploy wired up | diff --git a/docs/guides/scheduler.md b/docs/guides/scheduler.md new file mode 100644 index 0000000..dceb994 --- /dev/null +++ b/docs/guides/scheduler.md @@ -0,0 +1,171 @@ +# Schedule future work + +protoAgent ships a scheduler so the agent can defer tasks to itself — +"remind me about X tomorrow", "every Monday morning summarize last +week's logs", "at 3pm check the deploy". Two backends ship by default; +the agent-facing tool surface is identical regardless of which one is +active. + +## When to read this + +- You want forks (or your own multiple ginas) to support reminders, + recurring sweeps, or any "do this later" intent. +- You're running protoWorkstacean and want scheduled fires to flow + through the existing bus. 
+- You're spinning up multiple protoAgent instances on one box and + need scheduling state to stay isolated per agent. + +## The three tools + +When the scheduler is active, three tools land in `get_all_tools()`: + +| Tool | What it does | +|---|---| +| `schedule_task(prompt, when, job_id?)` | Persist a future invocation. `when` is cron (`"0 9 * * *"`) or ISO-8601 (`"2026-05-01T15:00:00"`). | +| `list_schedules()` | Show all jobs visible to *this* agent. | +| `cancel_schedule(job_id)` | Remove a job by id. | + +Prompts must be self-contained — the agent has no memory of the +scheduling moment when the task fires, so write the prompt as a fresh +turn ("review last week's pipeline incidents and post a summary", +not "do that thing we discussed"). + +## Backend selection + +`server.py::_build_scheduler` picks at startup: + +1. `WORKSTACEAN_API_BASE` + `WORKSTACEAN_API_KEY` set → **`WorkstaceanScheduler`**. +2. Otherwise → **`LocalScheduler`** (sqlite, asyncio polling). +3. `SCHEDULER_DISABLED=1` → no scheduler. The three tools don't ship. + +Both backends honor the same `SchedulerBackend` protocol; the agent +loop never knows which one is wired up. + +```bash +# Solo / local dev — falls through to LocalScheduler automatically. +python server.py + +# Workstacean install — set both env vars and restart. +export WORKSTACEAN_API_BASE=http://your-workstacean-host:3000 +export WORKSTACEAN_API_KEY=<your-api-key> +python server.py +``` + +> **protoLabs operators**: the fleet's Workstacean lives on the +> `ava` node; `WORKSTACEAN_API_KEY` is in the org's secrets manager +> under `secret-management → workstacean`. Coordinate with the team +> for the exact URL. + +## Multi-agent isolation + +Every job is namespaced by `AGENT_NAME` so spinning up +`gina-personal` alongside `gina-work` on the same box doesn't +cross-fire prompts. + +| Backend | How it isolates | +|---|---| +| Local | DB path per agent: `/sandbox/scheduler/<agent_name>/jobs.db` (falls back to `~/.protoagent/scheduler/<agent_name>/jobs.db`).
Every row also carries `agent_name`; reads filter on it. | +| Workstacean | Job IDs are prefixed `<agent_name>-...`; topics are namespaced `cron.<agent_name>.<job_id>`. One Workstacean install can serve N forks safely. | + +If you supply your own `job_id` in `schedule_task`: + +- Local: the id is stored as-is. Two agents sharing one DB path with + the same user-supplied id will trip a primary-key collision (the + second add raises a clear error). To avoid it, let the scheduler + auto-generate (the auto-id is `<agent_name>-<12 hex chars>`). +- Workstacean: the adapter prepends `<agent_name>-` if your id doesn't + already start with it, so cross-agent collisions are impossible. + +## Local backend — how firing works + +The local scheduler runs an asyncio polling task on FastAPI's +`startup` event. Once a second: + +1. Read jobs where `next_fire <= now()` and `enabled = 1`. +2. For each due job: POST to `http://127.0.0.1:<port>/a2a` as + a `message/send` with the job's prompt as the message text. Bearer + + X-API-Key are forwarded automatically. +3. One-shot ISO jobs are deleted after firing. Cron jobs reschedule + forward via `croniter`. + +Going through HTTP rather than calling into the graph directly buys +parity with real callers — the audit log, cost-v1 capture, and +push-notification path all behave identically. + +### Missed-fire recovery + +On startup, jobs whose `next_fire` is in the past are inspected: + +- **Within the last 24h** — fire on the next tick (so a 5-minute + outage doesn't lose an upcoming reminder). +- **Older than 24h** — cron jobs roll forward to the next slot + without firing; one-shot jobs are dropped. This matches + Workstacean's recovery behaviour and avoids flooding the agent + with stale prompts after a long downtime.
+ +### Persistence path + +```bash +# Default (Docker) +/sandbox/scheduler/<agent_name>/jobs.db + +# Local fallback (when /sandbox isn't writable) +~/.protoagent/scheduler/<agent_name>/jobs.db + +# Override +export SCHEDULER_DB_DIR=/var/data/agents +# → /var/data/agents/<agent_name>/jobs.db +``` + +Mount a volume at the configured path to survive container +restarts (analogous to `audit/` and `knowledge/`). + +## Workstacean backend — how firing works + +When `WORKSTACEAN_API_BASE` and `WORKSTACEAN_API_KEY` are set, the +adapter publishes to `POST {base}/publish` with topic +`command.schedule` and the action wrapper Workstacean expects. See +the [Workstacean scheduler reference](https://protolabsai.github.io/protoWorkstacean/reference/scheduler/) +for the payload shape. + +When the schedule fires, Workstacean publishes the inner payload to +`cron.<agent_name>.<job_id>`. **Workstacean does not natively dispatch +to A2A endpoints today** — your fork needs to wire a bridge that +subscribes to `cron.<agent_name>.*` and POSTs to the protoAgent's +`/a2a` endpoint. + +### Topic prefix override + +If your existing Workstacean bus uses a different convention: + +```bash +export WORKSTACEAN_TOPIC_PREFIX="myorg.cron.gina" +# → topics fire on myorg.cron.gina.<job_id> +``` + +### `list_schedules()` returns empty under Workstacean + +Workstacean's `list` action publishes its response on the +`schedule.list` topic — there's no synchronous reply on `/publish`. +The adapter intentionally doesn't subscribe. If you need live +introspection, query Workstacean directly or run the local backend. + +## Adding a case to your eval suite + +The default `evals/tasks.json` doesn't include scheduler cases (the +fire path is async — a single eval run can't easily test that the +scheduled prompt arrives). For forks that want it, the pattern is: + +1. `schedule_task(prompt, "<ISO timestamp ~1s in the future>")` in setup. +2. Wait > 1 second. +3. Assert on the audit log and/or KB state for the *fired* prompt's + side effects.
+ +Document the case as `category: "scheduler"` and gate at >= 2/3 +attempts to absorb timing jitter. + +## References + +- [Workstacean scheduler reference](https://protolabsai.github.io/protoWorkstacean/reference/scheduler/) +- [Configuration](/reference/configuration#scheduler) — env vars +- [Eval your fork](/guides/evals) — for the testing pattern above diff --git a/docs/reference/configuration.md b/docs/reference/configuration.md index bd3f5be..3ca9a9c 100644 --- a/docs/reference/configuration.md +++ b/docs/reference/configuration.md @@ -83,3 +83,16 @@ Only read when `middleware.knowledge` is `true`. | `top_k` | `5` | Results per query fed into state. | The bundled store is sqlite + FTS5 (with an automatic LIKE fallback when FTS5 isn't available). One `chunks` table; the `domain` column distinguishes operator-set notes (`memory_ingest`), daily-log entries (`daily_log`), and conversation findings extracted by `MemoryMiddleware` (`domain='finding'`). + +## Scheduler + +The bundled scheduler is configured entirely via environment, not YAML, so the same image can be deployed under either backend without rebuilding. See [Schedule future work](/guides/scheduler) for the full guide. + +| Env var | Default | What | +|---|---|---| +| `WORKSTACEAN_API_BASE` | unset | When set together with `WORKSTACEAN_API_KEY`, swaps the bundled local scheduler for the `WorkstaceanScheduler` HTTP adapter. | +| `WORKSTACEAN_API_KEY` | unset | Auth token sent as `X-API-Key` to Workstacean's `/publish`. | +| `WORKSTACEAN_TOPIC_PREFIX` | `cron.<agent_name>` | Override the bus topic the adapter fires on, when your Workstacean install uses a different convention. | +| `SCHEDULER_DB_DIR` | `/sandbox/scheduler` | Local backend: parent directory for `<agent_name>/jobs.db`. Falls back to `~/.protoagent/scheduler/<agent_name>/jobs.db` when unwritable. | +| `SCHEDULER_INVOKE_URL` | `http://127.0.0.1:<port>` | Local backend: where to POST `message/send` when a job fires. Override only if the agent's A2A endpoint isn't on localhost.
| +| `SCHEDULER_DISABLED` | unset | Set to `1` / `true` to drop the scheduler tools entirely. | diff --git a/graph/agent.py b/graph/agent.py index 355c3fc..08ad32a 100644 --- a/graph/agent.py +++ b/graph/agent.py @@ -158,6 +158,7 @@ async def task( def create_agent_graph( config: LangGraphConfig, knowledge_store=None, + scheduler=None, include_subagents: bool = True, ): """Create the protoAgent LangGraph agent. @@ -167,7 +168,7 @@ def create_agent_graph( """ llm = create_llm(config) - all_tools = get_all_tools(knowledge_store) + all_tools = get_all_tools(knowledge_store, scheduler=scheduler) if include_subagents: task_tool = _build_task_tool(config, all_tools) @@ -189,12 +190,12 @@ def create_agent_graph( return agent -def create_simple_agent(config: LangGraphConfig, knowledge_store=None): +def create_simple_agent(config: LangGraphConfig, knowledge_store=None, scheduler=None): """Create a simple agent without subagents (for debugging/testing).""" from langgraph.prebuilt import create_react_agent llm = create_llm(config) - all_tools = get_all_tools(knowledge_store) + all_tools = get_all_tools(knowledge_store, scheduler=scheduler) system_prompt = build_system_prompt(include_subagents=False) diff --git a/graph/config.py b/graph/config.py index a3df02b..c2cf995 100644 --- a/graph/config.py +++ b/graph/config.py @@ -41,6 +41,7 @@ class LangGraphConfig: "current_time", "calculator", "web_search", "fetch_url", "memory_ingest", "memory_recall", "memory_list", "memory_stats", "daily_log", + "schedule_task", "list_schedules", "cancel_schedule", ], max_turns=20, )) diff --git a/graph/subagents/config.py b/graph/subagents/config.py index 560edc7..a488703 100644 --- a/graph/subagents/config.py +++ b/graph/subagents/config.py @@ -67,6 +67,7 @@ class SubagentConfig: "current_time", "calculator", "web_search", "fetch_url", "memory_ingest", "memory_recall", "memory_list", "memory_stats", "daily_log", + "schedule_task", "list_schedules", "cancel_schedule", ], max_turns=20, ) diff 
--git a/requirements.txt b/requirements.txt index 30ef46d..aa05284 100644 --- a/requirements.txt +++ b/requirements.txt @@ -16,3 +16,7 @@ langchain-openai>=0.3.0 # Starter tools (tools/lg_tools.py) ddgs>=9.0 beautifulsoup4>=4.12 + +# Scheduler (scheduler/local.py — cron expression parsing for the +# bundled local backend; the Workstacean adapter doesn't need this) +croniter>=2.0 diff --git a/scheduler/__init__.py b/scheduler/__init__.py new file mode 100644 index 0000000..960a226 --- /dev/null +++ b/scheduler/__init__.py @@ -0,0 +1,27 @@ +"""Pluggable scheduler for future-task delivery. + +Two backends ship by default: + +- ``LocalScheduler`` — sqlite + asyncio. Bundled, zero external + dependencies, per-agent persistence path. Use this for solo forks + or any deployment that doesn't already run protoWorkstacean. +- ``WorkstaceanScheduler`` — HTTP adapter to a protoWorkstacean + install. Topic-namespaced per agent so multiple ginas can share one + Workstacean and not collide. + +``server.py`` selects the backend at startup based on env vars; the +agent loop sees the same three tools (``schedule_task``, +``list_schedules``, ``cancel_schedule``) regardless of which backend +is wired up. + +Multi-agent safety: every job carries an ``agent_name`` (defaulted +from ``AGENT_NAME`` env / config) so that two protoAgent instances +sharing one storage path or one Workstacean install can't accidentally +fire each other's scheduled prompts. +""" + +from scheduler.interface import Job, SchedulerBackend +from scheduler.local import LocalScheduler +from scheduler.workstacean import WorkstaceanScheduler + +__all__ = ["Job", "SchedulerBackend", "LocalScheduler", "WorkstaceanScheduler"] diff --git a/scheduler/interface.py b/scheduler/interface.py new file mode 100644 index 0000000..6de9b3a --- /dev/null +++ b/scheduler/interface.py @@ -0,0 +1,114 @@ +"""Scheduler protocol — the contract every backend honors. 
+ +Both ``LocalScheduler`` and ``WorkstaceanScheduler`` implement this +shape. The agent-facing tools in ``tools/lg_tools.py`` only see the +protocol; swapping backends is a server.py-level decision. +""" + +from __future__ import annotations + +import re +from dataclasses import asdict, dataclass, field +from datetime import UTC, datetime +from typing import Any, Protocol + + +@dataclass +class Job: + """A scheduled future invocation. + + ``schedule`` is either a 5-field cron expression (e.g. + ``"0 9 * * 1-5"``) or an ISO-8601 datetime for one-shot fires + (e.g. ``"2026-05-01T15:00:00+00:00"``). Backends auto-detect. + + ``agent_name`` namespaces the job — one Workstacean install or + shared sqlite path can serve N protoAgent instances without + cross-firing. + """ + + id: str + prompt: str + schedule: str + agent_name: str + created_at: str = field(default_factory=lambda: datetime.now(UTC).isoformat()) + next_fire: str | None = None # ISO; None means "compute on save" + last_fire: str | None = None + enabled: bool = True + + def as_dict(self) -> dict[str, Any]: + return asdict(self) + + +class SchedulerBackend(Protocol): + """The minimum surface every backend implements. + + Methods are sync because the agent tools wrap them in their own + async functions; backends that need to do async I/O (httpx in + Workstacean's case) handle it internally. + """ + + name: str # short label for logs / agent-facing strings: "local", "workstacean" + + def add_job(self, prompt: str, schedule: str, *, job_id: str | None = None) -> Job: + """Persist a new job. Returns the stored ``Job`` (with + backend-assigned id and next_fire if the caller didn't set them). + + Raises ``ValueError`` for malformed schedule strings.""" + ... + + def cancel_job(self, job_id: str) -> bool: + """Remove a job. Returns ``True`` if a row was deleted.""" + ... + + def list_jobs(self) -> list[Job]: + """All jobs visible to the calling agent. 
Implementations are + responsible for filtering by ``agent_name`` so multi-agent + deployments stay isolated.""" + ... + + async def start(self) -> None: + """Start any background polling. No-op for backends that don't + need it (Workstacean dispatches and forgets).""" + ... + + async def stop(self) -> None: + """Cleanly shut down background work.""" + ... + + +# ── shared helpers ────────────────────────────────────────────────────────── + + +_CRON_PATTERN = re.compile(r"^\s*\S+\s+\S+\s+\S+\s+\S+\s+\S+\s*$") + + +def is_cron(schedule: str) -> bool: + """Heuristic: does ``schedule`` look like a 5-field cron expression? + + Used by both backends to decide between cron-iter and + ``datetime.fromisoformat``. Doesn't validate semantics — that + happens when the schedule is parsed. + """ + return bool(_CRON_PATTERN.match(schedule)) and not _looks_like_iso(schedule) + + +def _looks_like_iso(schedule: str) -> bool: + # ISO datetimes contain ``-`` and either ``T`` or a space between + # date and time. Cron has neither in the first field. + return "T" in schedule or _has_iso_date_prefix(schedule) + + +def _has_iso_date_prefix(schedule: str) -> bool: + head = schedule.strip().split(" ", 1)[0] + return bool(re.match(r"^\d{4}-\d{2}-\d{2}", head)) + + +def parse_iso_to_utc(schedule: str) -> datetime: + """Parse an ISO-8601 datetime, treating naive inputs as UTC. + + Raises ``ValueError`` for malformed strings. + """ + dt = datetime.fromisoformat(schedule) + if dt.tzinfo is None: + dt = dt.replace(tzinfo=UTC) + return dt.astimezone(UTC) diff --git a/scheduler/local.py b/scheduler/local.py new file mode 100644 index 0000000..17fe9b0 --- /dev/null +++ b/scheduler/local.py @@ -0,0 +1,381 @@ +"""LocalScheduler — bundled sqlite + asyncio backend. + +The default scheduler when no protoWorkstacean install is configured. +Every protoAgent instance gets a private ``jobs.db`` namespaced by +``AGENT_NAME`` so spinning up gina-personal alongside gina-work +doesn't cross-fire prompts. 
+ +Architecture: + +- One ``jobs`` table — ``id``, ``prompt``, ``schedule``, ``next_fire``, + ``agent_name``, ``last_fire``, ``enabled``, ``created_at``. +- Polling coroutine runs on FastAPI's startup hook (``server.py``) + and ticks once per ``_POLL_INTERVAL_S`` (1s default). Cheap because + sqlite reads with an indexed ``next_fire`` filter cost microseconds. +- Firing = HTTP POST to the running agent's own ``/a2a`` endpoint as + a ``message/send``. Going through HTTP rather than calling into the + graph directly gets us free parity with real callers — same audit + log, same cost-v1 capture, same auth path. +- One-shot ISO schedules are deleted after firing. Cron schedules + reschedule via croniter. +- On startup: any job whose ``next_fire`` is in the past but within a + 24h window fires immediately (BFCL-style "missed fires" recovery, + matching Workstacean's behaviour). Older missed fires are + rescheduled forward without firing — better than waking the agent + to a flood of stale prompts after a long downtime. 
+""" + +from __future__ import annotations + +import asyncio +import logging +import os +import sqlite3 +import uuid +from datetime import UTC, datetime, timedelta +from pathlib import Path +from typing import Any + +from croniter import croniter + +from scheduler.interface import Job, is_cron, parse_iso_to_utc + +log = logging.getLogger(__name__) + +DEFAULT_DB_DIR = "/sandbox/scheduler" +_POLL_INTERVAL_S = 1.0 +_MISSED_FIRE_WINDOW_S = 24 * 60 * 60 # 24h — matches Workstacean + + +def _resolve_db_path(db_dir: str | Path | None, agent_name: str) -> Path: + """Pick a writable jobs.db path namespaced by agent name.""" + raw = os.environ.get("SCHEDULER_DB_DIR") or db_dir or DEFAULT_DB_DIR + base = Path(str(raw)).expanduser() / agent_name + try: + base.mkdir(parents=True, exist_ok=True) + probe = base / ".write-probe" + probe.touch() + probe.unlink() + return base / "jobs.db" + except OSError: + fallback = Path.home() / ".protoagent" / "scheduler" / agent_name + fallback.mkdir(parents=True, exist_ok=True) + log.info("[scheduler] %s not writable; using %s instead", base, fallback) + return fallback / "jobs.db" + + +def _now_iso() -> str: + return datetime.now(UTC).isoformat() + + +def _compute_next_fire(schedule: str, *, after: datetime | None = None) -> str: + """Resolve a schedule string to the next ISO timestamp it fires. + + ``after`` controls when "next" starts — current time by default; + pass an explicit reference when rescheduling a cron job after a + fire so successive fires don't drift. 
+ """ + after = after or datetime.now(UTC) + if is_cron(schedule): + return croniter(schedule, after).get_next(datetime).astimezone(UTC).isoformat() + return parse_iso_to_utc(schedule).isoformat() + + +_SCHEMA = """ +CREATE TABLE IF NOT EXISTS jobs ( + id TEXT PRIMARY KEY, + prompt TEXT NOT NULL, + schedule TEXT NOT NULL, + agent_name TEXT NOT NULL, + next_fire TEXT NOT NULL, + last_fire TEXT, + enabled INTEGER NOT NULL DEFAULT 1, + created_at TEXT NOT NULL +); + +CREATE INDEX IF NOT EXISTS idx_jobs_next_fire ON jobs(next_fire); +CREATE INDEX IF NOT EXISTS idx_jobs_agent_name ON jobs(agent_name); +""" + + +class LocalScheduler: + """Sqlite-backed scheduler with an asyncio polling loop. + + Construct once at server startup, ``await scheduler.start()`` to + spawn the polling task, ``await scheduler.stop()`` on shutdown. + The agent-facing tools call ``add_job`` / ``cancel_job`` / + ``list_jobs`` synchronously. + """ + + name = "local" + + def __init__( + self, + agent_name: str, + *, + invoke_url: str, + api_key: str | None = None, + bearer_token: str | None = None, + db_dir: str | Path | None = None, + ): + self.agent_name = agent_name + self._invoke_url = invoke_url.rstrip("/") + self._api_key = api_key or "" + self._bearer = bearer_token or "" + self.path = _resolve_db_path(db_dir, agent_name) + self._task: asyncio.Task | None = None + self._stopping = False + self._init_db() + + # ── DB plumbing ───────────────────────────────────────────────────────── + + def _connect(self) -> sqlite3.Connection: + db = sqlite3.connect(str(self.path)) + db.row_factory = sqlite3.Row + try: + db.execute("PRAGMA journal_mode=WAL") + except sqlite3.OperationalError as exc: + log.debug("[scheduler] WAL skipped: %s", exc) + return db + + def _init_db(self) -> None: + try: + db = self._connect() + db.executescript(_SCHEMA) + db.commit() + db.close() + except sqlite3.DatabaseError as exc: + log.error("[scheduler] schema init failed at %s: %s", self.path, exc) + + # ── public API 
(matches SchedulerBackend) ─────────────────────────────── + + def add_job(self, prompt: str, schedule: str, *, job_id: str | None = None) -> Job: + if not prompt or not prompt.strip(): + raise ValueError("scheduler: prompt is required") + next_fire = _compute_next_fire(schedule) # raises ValueError for malformed input + + job = Job( + id=job_id or self._generate_id(), + prompt=prompt, + schedule=schedule, + agent_name=self.agent_name, + next_fire=next_fire, + ) + db = self._connect() + try: + db.execute( + "INSERT INTO jobs (id, prompt, schedule, agent_name, next_fire, " + "last_fire, enabled, created_at) VALUES (?, ?, ?, ?, ?, ?, ?, ?)", + (job.id, job.prompt, job.schedule, job.agent_name, + job.next_fire, job.last_fire, int(job.enabled), job.created_at), + ) + db.commit() + except sqlite3.IntegrityError as exc: + raise ValueError(f"job id {job.id!r} already exists") from exc + finally: + db.close() + return job + + def cancel_job(self, job_id: str) -> bool: + db = self._connect() + try: + cur = db.execute( + "DELETE FROM jobs WHERE id = ? AND agent_name = ?", + (job_id, self.agent_name), + ) + db.commit() + return cur.rowcount > 0 + except sqlite3.DatabaseError as exc: + log.warning("[scheduler] cancel_job failed: %s", exc) + return False + finally: + db.close() + + def list_jobs(self) -> list[Job]: + db = self._connect() + try: + rows = db.execute( + "SELECT * FROM jobs WHERE agent_name = ? 
ORDER BY next_fire ASC", + (self.agent_name,), + ).fetchall() + except sqlite3.DatabaseError as exc: + log.warning("[scheduler] list_jobs failed: %s", exc) + return [] + finally: + db.close() + return [_row_to_job(r) for r in rows] + + async def start(self) -> None: + if self._task is not None: + return + self._stopping = False + self._recover_missed_fires() + self._task = asyncio.create_task(self._poll_loop(), name="scheduler.local.poll") + log.info( + "[scheduler] local backend started: agent=%s db=%s", + self.agent_name, self.path, + ) + + async def stop(self) -> None: + self._stopping = True + if self._task is None: + return + self._task.cancel() + try: + await self._task + except (asyncio.CancelledError, Exception): # noqa: BLE001 + pass + self._task = None + log.info("[scheduler] local backend stopped") + + # ── polling + firing ──────────────────────────────────────────────────── + + async def _poll_loop(self) -> None: + while not self._stopping: + try: + await self._tick() + except Exception: # noqa: BLE001 + log.exception("[scheduler] poll tick failed") + try: + await asyncio.sleep(_POLL_INTERVAL_S) + except asyncio.CancelledError: + return + + async def _tick(self) -> None: + now = datetime.now(UTC) + due = self._claim_due_jobs(now) + for job in due: + try: + await self._fire(job) + finally: + self._reschedule_or_delete(job, fired_at=now) + + def _claim_due_jobs(self, now: datetime) -> list[Job]: + db = self._connect() + try: + rows = db.execute( + "SELECT * FROM jobs WHERE agent_name = ? AND enabled = 1 " + "AND next_fire <= ? 
ORDER BY next_fire ASC", + (self.agent_name, now.isoformat()), + ).fetchall() + except sqlite3.DatabaseError as exc: + log.warning("[scheduler] _claim_due_jobs failed: %s", exc) + return [] + finally: + db.close() + return [_row_to_job(r) for r in rows] + + def _reschedule_or_delete(self, job: Job, *, fired_at: datetime) -> None: + """Cron jobs roll forward; one-shot jobs are deleted.""" + db = self._connect() + try: + if is_cron(job.schedule): + next_iso = _compute_next_fire(job.schedule, after=fired_at) + db.execute( + "UPDATE jobs SET next_fire = ?, last_fire = ? WHERE id = ?", + (next_iso, fired_at.isoformat(), job.id), + ) + else: + db.execute("DELETE FROM jobs WHERE id = ?", (job.id,)) + db.commit() + except sqlite3.DatabaseError: + log.exception("[scheduler] reschedule failed for job %s", job.id) + finally: + db.close() + + def _recover_missed_fires(self) -> None: + """Roll past-due jobs forward on startup. + + - Missed fires within the last 24h fire immediately on the next + tick (we leave their ``next_fire`` in the past so the polling + loop picks them up naturally). + - Older missed fires are rescheduled forward without firing — + firing a flood of stale prompts after a long downtime is worse + than dropping them. + """ + cutoff_recent = datetime.now(UTC) - timedelta(seconds=_MISSED_FIRE_WINDOW_S) + db = self._connect() + try: + rows = db.execute( + "SELECT * FROM jobs WHERE agent_name = ? AND enabled = 1 " + "AND next_fire <= ?", + (self.agent_name, cutoff_recent.isoformat()), + ).fetchall() + for row in rows: + job = _row_to_job(row) + if is_cron(job.schedule): + next_iso = _compute_next_fire(job.schedule) + db.execute( + "UPDATE jobs SET next_fire = ? 
WHERE id = ?", + (next_iso, job.id), + ) + log.info( + "[scheduler] dropped stale fire for job %s; next at %s", + job.id, next_iso, + ) + else: + db.execute("DELETE FROM jobs WHERE id = ?", (job.id,)) + log.info("[scheduler] dropped stale one-shot job %s", job.id) + db.commit() + except sqlite3.DatabaseError: + log.exception("[scheduler] missed-fire recovery failed") + finally: + db.close() + + async def _fire(self, job: Job) -> None: + """Deliver a job by POSTing to the agent's own A2A endpoint.""" + import httpx + + headers = {"Content-Type": "application/json"} + if self._bearer: + headers["Authorization"] = f"Bearer {self._bearer}" + if self._api_key: + headers["X-API-Key"] = self._api_key + + message_id = str(uuid.uuid4()) + body = { + "jsonrpc": "2.0", + "id": message_id, + "method": "message/send", + "params": { + "message": { + "role": "user", + "parts": [{"kind": "text", "text": job.prompt}], + "messageId": message_id, + # Carry the originating job id so observers can tell + # this turn was scheduler-driven, not user-driven. + "metadata": {"scheduler_job_id": job.id, "scheduler_kind": "local"}, + } + }, + } + try: + async with httpx.AsyncClient(timeout=30) as client: + r = await client.post(f"{self._invoke_url}/a2a", headers=headers, json=body) + if r.status_code >= 400: + log.error( + "[scheduler] fire failed for job %s: HTTP %d %s", + job.id, r.status_code, r.text[:200], + ) + else: + log.info("[scheduler] fired job %s", job.id) + except Exception: # noqa: BLE001 + log.exception("[scheduler] fire exception for job %s", job.id) + + def _generate_id(self) -> str: + # Agent-name prefix keeps cross-agent IDs distinct in shared + # observability surfaces (audit log, dashboards) even though + # the DB row is already namespaced by agent_name. 
+ return f"{self.agent_name}-{uuid.uuid4().hex[:12]}" + + +def _row_to_job(row: Any) -> Job: + return Job( + id=row["id"], + prompt=row["prompt"], + schedule=row["schedule"], + agent_name=row["agent_name"], + next_fire=row["next_fire"], + last_fire=row["last_fire"], + enabled=bool(row["enabled"]), + created_at=row["created_at"], + ) diff --git a/scheduler/workstacean.py b/scheduler/workstacean.py new file mode 100644 index 0000000..97f690b --- /dev/null +++ b/scheduler/workstacean.py @@ -0,0 +1,183 @@ +"""WorkstaceanScheduler — HTTP adapter to a protoWorkstacean install. + +Activated automatically when ``WORKSTACEAN_API_BASE`` and +``WORKSTACEAN_API_KEY`` are set (see ``server.py``). + +Speaks Workstacean's ``POST /publish`` API as documented at +https://protolabsai.github.io/protoWorkstacean/reference/scheduler/. +Every job is namespaced with the agent's name so multiple protoAgent +forks (e.g. ``gina-personal`` + ``gina-work``) can share one +Workstacean install without cross-firing: + +- Job IDs are prefixed: ``{agent_name}-{user_id_or_uuid}`` +- Topics are namespaced: ``cron.{agent_name}`` + +The adapter is fire-and-forget — Workstacean owns scheduling state. +``list_jobs()`` issues a ``list`` command and waits for the response +on the ``schedule.list`` topic. If the user wants strict local +introspection, they should run the local backend. + +Note: Workstacean today does not natively dispatch to A2A endpoints; +forks need to wire their Workstacean install to route ``cron.*`` +topics to the agent's A2A endpoint. See the linked guide for the +recommended bridge config. 
log = logging.getLogger(__name__)

DEFAULT_TIMEOUT_S = 10


class WorkstaceanScheduler:
    """HTTP adapter to a Workstacean ``/publish`` endpoint.

    The adapter keeps no scheduling state of its own: ``add_job`` and
    ``cancel_job`` publish ``command.schedule`` messages, and
    ``list_jobs`` always returns ``[]``.
    """

    name = "workstacean"

    def __init__(
        self,
        agent_name: str,
        *,
        base_url: str,
        api_key: str,
        topic_prefix: str | None = None,
        timeout_s: float = DEFAULT_TIMEOUT_S,
    ):
        """Store connection settings.

        Raises ``ValueError`` when ``base_url`` or ``api_key`` is empty.
        """
        if not base_url:
            raise ValueError("WorkstaceanScheduler: base_url is required")
        if not api_key:
            raise ValueError("WorkstaceanScheduler: api_key is required")
        self.agent_name = agent_name
        self._base_url = base_url.rstrip("/")
        self._api_key = api_key
        # Namespacing: topic_prefix governs which Workstacean topic the
        # job fires on. Default = ``cron.<agent_name>``. Forks can override
        # via ``WORKSTACEAN_TOPIC_PREFIX`` to integrate with existing
        # bus conventions.
        self._topic_prefix = topic_prefix or f"cron.{agent_name}"
        self._timeout_s = timeout_s

    # ── public API ──────────────────────────────────────────────────────────

    def add_job(self, prompt: str, schedule: str, *, job_id: str | None = None) -> Job:
        """Publish an ``add`` command and return the corresponding ``Job``.

        The returned job has ``next_fire=None`` because Workstacean owns
        the schedule state.

        Raises ``ValueError`` for an empty prompt or malformed schedule,
        and ``RuntimeError`` when the publish call fails.
        """
        if not prompt or not prompt.strip():
            raise ValueError("scheduler: prompt is required")
        # Validate the schedule eagerly so a malformed expr fails at
        # tool-call time, not silently inside Workstacean.
        _validate_schedule(schedule)

        normalized_id = self._namespaced_id(job_id)
        topic = f"{self._topic_prefix}.{normalized_id}"
        # Workstacean expects an outer ``command.schedule`` topic and
        # the inner ``payload`` carries both the trigger schedule and
        # the actual message that will be fired. The inner ``topic``
        # is what Workstacean publishes to when the schedule fires —
        # so it has to be something a downstream A2A bridge subscribes
        # to. Default convention: ``cron.<agent_name>.<job_id>``.
        body = {
            "topic": "command.schedule",
            "payload": {
                "action": "add",
                "id": normalized_id,
                "schedule": schedule,
                "topic": topic,
                "payload": {
                    "content": prompt,
                    "sender": "scheduler",
                    "channel": "a2a",
                    # Cross-system breadcrumb so the bridge knows which
                    # protoAgent fork the message belongs to.
                    "agent_name": self.agent_name,
                    "scheduler_job_id": normalized_id,
                },
            },
        }
        self._publish(body)

        return Job(
            id=normalized_id,
            prompt=prompt,
            schedule=schedule,
            agent_name=self.agent_name,
            next_fire=None,  # Workstacean owns the schedule state
        )

    def cancel_job(self, job_id: str) -> bool:
        """Publish a ``remove`` command for ``job_id``.

        Returns ``True`` when the publish call succeeded — the adapter
        gets no confirmation that a job actually existed. Returns
        ``False`` for an empty id or a failed publish.
        """
        if not job_id or not job_id.strip():
            # ``_namespaced_id`` would substitute a random uuid for an
            # empty id, i.e. "remove" a job that never existed and
            # report success. Refuse instead.
            return False
        body = {
            "topic": "command.schedule",
            "payload": {"action": "remove", "id": self._namespaced_id(job_id)},
        }
        try:
            self._publish(body)
            return True
        except RuntimeError as exc:
            log.warning("[scheduler] workstacean cancel failed: %s", exc)
            return False

    def list_jobs(self) -> list[Job]:
        """Returns ``[]`` from the adapter.

        Workstacean's ``list`` action publishes its response on the
        ``schedule.list`` topic — there is no synchronous reply on
        ``/publish``. Subscribing to that topic from inside a
        protoAgent process (without a full bus client) is more
        machinery than this adapter is the right layer for. Forks
        that need live introspection should run the local backend or
        query Workstacean directly.
        """
        return []

    async def start(self) -> None:
        """Log readiness; there is no background work to start."""
        # Workstacean owns scheduling state — nothing to start here.
        log.info(
            "[scheduler] workstacean backend ready: agent=%s base=%s topic=%s.*",
            self.agent_name, self._base_url, self._topic_prefix,
        )

    async def stop(self) -> None:
        """No-op: nothing was started."""
        return None

    # ── helpers ─────────────────────────────────────────────────────────────

    def _publish(self, body: dict[str, Any]) -> None:
        """POST ``body`` as JSON to ``{base_url}/publish``.

        This is a blocking call. Raises ``RuntimeError`` on a transport
        error or an HTTP status >= 400.
        """
        headers = {"Content-Type": "application/json", "X-API-Key": self._api_key}
        try:
            r = httpx.post(
                f"{self._base_url}/publish",
                headers=headers,
                json=body,
                timeout=self._timeout_s,
            )
        except httpx.HTTPError as exc:
            raise RuntimeError(f"workstacean publish failed: {exc}") from exc
        if r.status_code >= 400:
            raise RuntimeError(
                f"workstacean publish HTTP {r.status_code}: {r.text[:200]}"
            )

    def _namespaced_id(self, job_id: str | None) -> str:
        """Return ``job_id`` prefixed with ``<agent_name>-``.

        An id that already carries the prefix is returned unchanged; a
        missing id is replaced by 12 hex chars of a fresh uuid4.
        """
        suffix = job_id or uuid.uuid4().hex[:12]
        prefix = f"{self.agent_name}-"
        return suffix if suffix.startswith(prefix) else prefix + suffix


def _validate_schedule(schedule: str) -> None:
    """Validate cron expression OR ISO datetime. Raises ValueError."""
    if is_cron(schedule):
        from croniter import croniter
        try:
            croniter(schedule)
        except (TypeError, ValueError) as exc:
            raise ValueError(f"invalid cron expression {schedule!r}: {exc}") from exc
        return
    parse_iso_to_utc(schedule)  # raises ValueError on malformed ISO
def _init_langgraph_agent(): @@ -97,11 +101,21 @@ def _init_langgraph_agent(): # the worker subagent — the store is still cheap to construct. knowledge_store = _build_knowledge_store(_graph_config) - _graph = create_agent_graph(_graph_config, knowledge_store=knowledge_store) + # Scheduler — local sqlite by default, swaps to a WorkstaceanScheduler + # automatically when WORKSTACEAN_API_BASE + WORKSTACEAN_API_KEY env + # vars are set. Both backends share the same agent-tool surface + # (schedule_task / list_schedules / cancel_schedule). + global _scheduler + _scheduler = _build_scheduler(_graph_config) + + _graph = create_agent_graph( + _graph_config, knowledge_store=knowledge_store, scheduler=_scheduler, + ) log.info( - "LangGraph agent initialized (model: %s, knowledge_db: %s)", + "LangGraph agent initialized (model: %s, knowledge_db: %s, scheduler: %s)", _graph_config.model_name, getattr(knowledge_store, "path", "(disabled)"), + getattr(_scheduler, "name", "disabled"), ) @@ -124,6 +138,69 @@ def _build_knowledge_store(config): return None +def _build_scheduler(config): + """Return the active scheduler backend, or ``None`` when disabled. + + Selection order: + + 1. ``WORKSTACEAN_API_BASE`` + ``WORKSTACEAN_API_KEY`` set → + ``WorkstaceanScheduler``. Forks running on the protoLabs fleet + infrastructure get this for free. + 2. Otherwise → ``LocalScheduler`` with sqlite at + ``/sandbox/scheduler//jobs.db``. + + Returns ``None`` when explicitly disabled via ``SCHEDULER_DISABLED=1`` + so a fork can ship without a scheduler at all. + + The agent's auth token + api-key are passed into the local backend + so its self-invocation HTTP call can pass through bearer / X-API-Key + auth — the scheduler hits the same A2A endpoint as a real caller. 
+ """ + if os.environ.get("SCHEDULER_DISABLED", "").lower() in ("1", "true", "yes"): + log.info("[server] scheduler disabled via SCHEDULER_DISABLED env") + return None + + name = agent_name() + workstacean_base = os.environ.get("WORKSTACEAN_API_BASE", "").strip() + workstacean_key = os.environ.get("WORKSTACEAN_API_KEY", "").strip() + if workstacean_base and workstacean_key: + try: + from scheduler import WorkstaceanScheduler + return WorkstaceanScheduler( + agent_name=name, + base_url=workstacean_base, + api_key=workstacean_key, + topic_prefix=os.environ.get("WORKSTACEAN_TOPIC_PREFIX") or None, + ) + except Exception as exc: + log.warning( + "[server] WorkstaceanScheduler init failed: %s; falling back to local", + exc, + ) + + try: + from scheduler import LocalScheduler + invoke_url = os.environ.get( + "SCHEDULER_INVOKE_URL", + f"http://127.0.0.1:{_active_port}", + ) + bearer = (config.auth_token or os.environ.get("A2A_AUTH_TOKEN", "")).strip() + api_key_env = f"{name.upper()}_API_KEY" + api_key = os.environ.get(api_key_env, "").strip() + return LocalScheduler( + agent_name=name, + invoke_url=invoke_url, + api_key=api_key, + bearer_token=bearer, + ) + except Exception as exc: + log.warning( + "[server] LocalScheduler init failed: %s; running scheduler-less", + exc, + ) + return None + + def _reload_langgraph_agent() -> tuple[bool, str]: """Rebuild the compiled graph from the latest config YAML. @@ -161,7 +238,14 @@ def _reload_langgraph_agent() -> tuple[bool, str]: if is_setup_complete(): try: new_store = _build_knowledge_store(new_config) - new_graph = create_agent_graph(new_config, knowledge_store=new_store) + # Re-use the running scheduler instance — tearing down the + # polling loop on every drawer save would orphan in-flight + # fires. Env-driven scheduler config (WORKSTACEAN_API_BASE, + # SCHEDULER_DISABLED) only takes effect on full restart; + # the YAML doesn't carry scheduler settings yet. 
+ new_graph = create_agent_graph( + new_config, knowledge_store=new_store, scheduler=_scheduler, + ) except Exception as e: log.exception("[reload] graph rebuild failed") return False, f"graph rebuild failed: {e}" @@ -757,6 +841,31 @@ def _main(): fastapi_app = FastAPI(title=f"{agent_name()} — protoAgent") + # --- Scheduler lifecycle ------------------------------------------------ + # The local scheduler needs an asyncio polling task; the Workstacean + # adapter is a no-op start/stop. Both implement the same contract so + # we just call through. on_event is preferred over a lifespan + # context manager here — the rest of the boot is sync (uvicorn.run + # is the only blocking call) and FastAPI fires startup/shutdown + # around it. + @fastapi_app.on_event("startup") + async def _scheduler_startup(): + if _scheduler is None: + return + try: + await _scheduler.start() + except Exception: + log.exception("[scheduler] startup failed") + + @fastapi_app.on_event("shutdown") + async def _scheduler_shutdown(): + if _scheduler is None: + return + try: + await _scheduler.stop() + except Exception: + log.exception("[scheduler] shutdown failed") + # --- Chat API ----------------------------------------------------------- class ChatRequest(PydanticBaseModel): message: str diff --git a/tests/test_scheduler_local.py b/tests/test_scheduler_local.py new file mode 100644 index 0000000..524116e --- /dev/null +++ b/tests/test_scheduler_local.py @@ -0,0 +1,290 @@ +"""Tests for ``scheduler.local.LocalScheduler``. + +The polling-loop firing path is covered by stubbing ``httpx.AsyncClient`` +so a unit test doesn't need a running A2A endpoint. Multi-agent +isolation, missed-fire recovery, and reschedule-vs-delete behaviour +all get explicit cases — they're the parts most likely to regress. 
+""" + +from __future__ import annotations + +import asyncio +import sqlite3 +from datetime import UTC, datetime, timedelta +from pathlib import Path + +import pytest + +from scheduler.interface import is_cron, parse_iso_to_utc +from scheduler.local import LocalScheduler, _compute_next_fire + + +# ── helpers ───────────────────────────────────────────────────────────────── + + +def _make_scheduler(tmp_path: Path, agent: str = "gina-test") -> LocalScheduler: + return LocalScheduler( + agent_name=agent, + invoke_url="http://127.0.0.1:7870", + api_key="k", + bearer_token="b", + db_dir=tmp_path, + ) + + +# ── interface helpers ────────────────────────────────────────────────────── + + +class TestIsCron: + def test_cron_5_field(self): + assert is_cron("0 9 * * *") is True + + def test_cron_with_ranges(self): + assert is_cron("0 9 * * 1-5") is True + + def test_iso_with_t(self): + assert is_cron("2026-04-28T15:00:00") is False + + def test_iso_with_space(self): + assert is_cron("2026-04-28 15:00:00") is False + + def test_iso_with_offset(self): + assert is_cron("2026-04-28T15:00:00+00:00") is False + + def test_garbage(self): + assert is_cron("not a schedule") is False + assert is_cron("0 9 *") is False # 3 fields, not 5 + + def test_seven_fields_rejected(self): + # 7-field cron (with seconds + year) is not standard 5-field; + # the current detector accepts only exactly 5. 
+ assert is_cron("0 0 12 * * MON 2026") is False + + +class TestParseIso: + def test_naive_treated_as_utc(self): + dt = parse_iso_to_utc("2026-04-28T15:00:00") + assert dt.tzinfo == UTC + assert dt.hour == 15 + + def test_offset_normalized(self): + dt = parse_iso_to_utc("2026-04-28T15:00:00-05:00") + assert dt.tzinfo == UTC + assert dt.hour == 20 # 15 EST → 20 UTC + + def test_malformed_raises(self): + with pytest.raises(ValueError): + parse_iso_to_utc("not an iso string") + + +# ── add / list / cancel ───────────────────────────────────────────────────── + + +class TestAddJob: + def test_cron_job(self, tmp_path): + s = _make_scheduler(tmp_path) + job = s.add_job("hi", "0 9 * * *") + assert job.agent_name == "gina-test" + assert job.prompt == "hi" + assert job.next_fire is not None + assert "T" in job.next_fire # ISO + + def test_iso_one_shot(self, tmp_path): + s = _make_scheduler(tmp_path) + future = "2099-01-01T00:00:00" + job = s.add_job("hi", future) + # Naive ISO should be normalized to UTC + assert job.next_fire.startswith("2099-01-01T00:00:00") + + def test_empty_prompt_rejected(self, tmp_path): + s = _make_scheduler(tmp_path) + with pytest.raises(ValueError): + s.add_job(" ", "0 9 * * *") + + def test_malformed_schedule_rejected(self, tmp_path): + s = _make_scheduler(tmp_path) + with pytest.raises(ValueError): + s.add_job("hi", "not-a-real-schedule") + + def test_user_id_preserved(self, tmp_path): + s = _make_scheduler(tmp_path) + job = s.add_job("hi", "0 9 * * *", job_id="my-custom-id") + assert job.id == "my-custom-id" + + def test_duplicate_id_rejected(self, tmp_path): + s = _make_scheduler(tmp_path) + s.add_job("hi", "0 9 * * *", job_id="dup") + with pytest.raises(ValueError, match="already exists"): + s.add_job("again", "0 9 * * *", job_id="dup") + + def test_auto_id_has_agent_prefix(self, tmp_path): + s = _make_scheduler(tmp_path, agent="ginavision") + job = s.add_job("hi", "0 9 * * *") + assert job.id.startswith("ginavision-") + + +class 
TestListAndCancel: + def test_list_filters_by_agent(self, tmp_path): + gp = _make_scheduler(tmp_path, agent="gina-personal") + gw = _make_scheduler(tmp_path, agent="gina-work") + gp.add_job("p1", "0 9 * * *") + gp.add_job("p2", "0 10 * * *") + gw.add_job("w1", "0 9 * * *") + assert len(gp.list_jobs()) == 2 + assert len(gw.list_jobs()) == 1 + assert gp.list_jobs()[0].agent_name == "gina-personal" + + def test_cancel_returns_true_on_hit(self, tmp_path): + s = _make_scheduler(tmp_path) + job = s.add_job("hi", "0 9 * * *") + assert s.cancel_job(job.id) is True + assert s.list_jobs() == [] + + def test_cancel_returns_false_on_miss(self, tmp_path): + s = _make_scheduler(tmp_path) + assert s.cancel_job("does-not-exist") is False + + def test_cross_agent_cancel_blocked(self, tmp_path): + gp = _make_scheduler(tmp_path, agent="gina-personal") + gw = _make_scheduler(tmp_path, agent="gina-work") + gw_job = gw.add_job("w1", "0 9 * * *") + # gp tries to cancel gw's job — must fail silently (no row deleted) + assert gp.cancel_job(gw_job.id) is False + assert len(gw.list_jobs()) == 1 + + +# ── reschedule / delete behaviour ─────────────────────────────────────────── + + +class TestRescheduleOrDelete: + def test_one_shot_deleted_after_fire(self, tmp_path): + s = _make_scheduler(tmp_path) + # ISO in the past so _claim_due_jobs picks it up + past = (datetime.now(UTC) - timedelta(seconds=5)).isoformat() + s.add_job("hi", past, job_id="oneshot") + job = s.list_jobs()[0] + s._reschedule_or_delete(job, fired_at=datetime.now(UTC)) + assert s.list_jobs() == [] + + def test_cron_rescheduled_after_fire(self, tmp_path): + s = _make_scheduler(tmp_path) + s.add_job("hi", "0 9 * * *", job_id="cron") + job = s.list_jobs()[0] + original_next = job.next_fire + # Fire at "now" — next_fire should advance to the next 09:00 UTC + s._reschedule_or_delete(job, fired_at=datetime.now(UTC)) + new_next = s.list_jobs()[0].next_fire + assert new_next != original_next or original_next > 
datetime.now(UTC).isoformat() + # last_fire should be populated + assert s.list_jobs()[0].last_fire is not None + + +class TestMissedFireRecovery: + def test_stale_oneshot_dropped(self, tmp_path): + s = _make_scheduler(tmp_path) + # ISO from 2 days ago — outside the 24h window + stale = (datetime.now(UTC) - timedelta(days=2)).isoformat() + s.add_job("hi", stale, job_id="stale") + s._recover_missed_fires() + assert s.list_jobs() == [] + + def test_stale_cron_rolled_forward(self, tmp_path): + s = _make_scheduler(tmp_path) + s.add_job("hi", "0 9 * * *", job_id="cron-stale") + # Manually rewrite next_fire to 2 days ago (outside window) + db = sqlite3.connect(str(s.path)) + old = (datetime.now(UTC) - timedelta(days=2)).isoformat() + db.execute("UPDATE jobs SET next_fire = ? WHERE id = ?", (old, "cron-stale")) + db.commit() + db.close() + s._recover_missed_fires() + rolled = s.list_jobs()[0] + assert rolled.next_fire > datetime.now(UTC).isoformat() + + def test_recent_missed_fire_kept(self, tmp_path): + s = _make_scheduler(tmp_path) + # 5 minutes ago — inside the 24h window, should still fire + recent = (datetime.now(UTC) - timedelta(minutes=5)).isoformat() + s.add_job("hi", recent, job_id="recent") + s._recover_missed_fires() + # Job still exists with next_fire in the past — polling will fire it + jobs = s.list_jobs() + assert len(jobs) == 1 + assert jobs[0].next_fire < datetime.now(UTC).isoformat() + + +# ── compute_next_fire ─────────────────────────────────────────────────────── + + +class TestComputeNextFire: + def test_cron_returns_iso_utc(self): + result = _compute_next_fire("0 9 * * *") + # Parses cleanly as ISO + dt = datetime.fromisoformat(result) + assert dt.tzinfo is not None + + def test_cron_after_anchor(self): + anchor = datetime(2026, 4, 27, 8, 0, 0, tzinfo=UTC) + result = _compute_next_fire("0 9 * * *", after=anchor) + # 9am UTC on 2026-04-27 + dt = datetime.fromisoformat(result) + assert dt.year == 2026 and dt.month == 4 and dt.day == 27 and dt.hour == 
9 + + def test_iso_passthrough(self): + result = _compute_next_fire("2026-12-25T00:00:00") + assert result.startswith("2026-12-25T00:00:00") + + +# ── start / stop loop ─────────────────────────────────────────────────────── + + +@pytest.mark.asyncio +async def test_start_stop_idempotent(tmp_path): + s = _make_scheduler(tmp_path) + await s.start() + await s.start() # second call is a no-op, not an error + assert s._task is not None + await s.stop() + await s.stop() # second call is a no-op, not an error + assert s._task is None + + +@pytest.mark.asyncio +async def test_due_job_fires(tmp_path, monkeypatch): + """End-to-end: an ISO job in the past gets picked up and POSTs to /a2a.""" + s = _make_scheduler(tmp_path) + # Schedule for 1 second ago so the first tick claims it + past = (datetime.now(UTC) - timedelta(seconds=1)).isoformat() + s.add_job("FIRED-ME", past, job_id="firetest") + + fired: list[dict] = [] + + class _FakeResponse: + status_code = 200 + text = "ok" + + class _FakeClient: + def __init__(self, *_a, **_kw): + pass + + async def __aenter__(self): + return self + + async def __aexit__(self, *_a): + return False + + async def post(self, url, headers=None, json=None): + fired.append({"url": url, "json": json}) + return _FakeResponse() + + import httpx + monkeypatch.setattr(httpx, "AsyncClient", _FakeClient) + + await s.start() + # Give the polling loop one tick (poll interval is 1s) + await asyncio.sleep(1.5) + await s.stop() + + assert any("FIRED-ME" in str(c["json"]) for c in fired) + # One-shot was deleted after firing + assert s.list_jobs() == [] diff --git a/tests/test_scheduler_workstacean.py b/tests/test_scheduler_workstacean.py new file mode 100644 index 0000000..ddd6da4 --- /dev/null +++ b/tests/test_scheduler_workstacean.py @@ -0,0 +1,168 @@ +"""Tests for ``scheduler.workstacean.WorkstaceanScheduler``. 
+ +We don't run a Workstacean instance — instead we monkeypatch +``httpx.post`` and assert that the adapter sends the right +``POST /publish`` body shape (action, namespaced id, namespaced topic, +auth header). +""" + +from __future__ import annotations + +from typing import Any + +import pytest + +from scheduler.workstacean import WorkstaceanScheduler + + +class _FakeResponse: + def __init__(self, status: int = 200, body: str = "ok"): + self.status_code = status + self.text = body + + +class _Recorder: + def __init__(self): + self.calls: list[dict[str, Any]] = [] + self.response = _FakeResponse() + + def __call__(self, url, headers=None, json=None, timeout=None): + self.calls.append({"url": url, "headers": headers, "json": json}) + return self.response + + +@pytest.fixture +def recorder(monkeypatch): + rec = _Recorder() + import httpx + monkeypatch.setattr(httpx, "post", rec) + return rec + + +@pytest.fixture +def adapter(): + return WorkstaceanScheduler( + agent_name="gina-personal", + base_url="http://workstacean:3000", + api_key="test-key", + ) + + +# ── construction guards ──────────────────────────────────────────────────── + + +def test_missing_base_url_rejected(): + with pytest.raises(ValueError, match="base_url"): + WorkstaceanScheduler(agent_name="x", base_url="", api_key="k") + + +def test_missing_api_key_rejected(): + with pytest.raises(ValueError, match="api_key"): + WorkstaceanScheduler(agent_name="x", base_url="http://w:3000", api_key="") + + +# ── add_job ──────────────────────────────────────────────────────────────── + + +class TestAddJob: + def test_publishes_command_schedule(self, adapter, recorder): + adapter.add_job("hi", "0 9 * * *", job_id="daily") + assert len(recorder.calls) == 1 + body = recorder.calls[0]["json"] + assert body["topic"] == "command.schedule" + assert body["payload"]["action"] == "add" + + def test_id_namespaced_with_agent(self, adapter, recorder): + adapter.add_job("hi", "0 9 * * *", job_id="daily") + body = 
recorder.calls[0]["json"] + assert body["payload"]["id"] == "gina-personal-daily" + + def test_id_idempotent_when_already_prefixed(self, adapter, recorder): + # If the caller passes an already-prefixed id, the adapter + # shouldn't double-prefix it. + adapter.add_job("hi", "0 9 * * *", job_id="gina-personal-already-set") + body = recorder.calls[0]["json"] + assert body["payload"]["id"] == "gina-personal-already-set" + + def test_topic_namespaced_with_agent(self, adapter, recorder): + adapter.add_job("hi", "0 9 * * *", job_id="daily") + body = recorder.calls[0]["json"] + assert body["payload"]["topic"].startswith("cron.gina-personal.") + + def test_inner_payload_carries_prompt(self, adapter, recorder): + adapter.add_job("the actual prompt", "0 9 * * *", job_id="x") + inner = recorder.calls[0]["json"]["payload"]["payload"] + assert inner["content"] == "the actual prompt" + assert inner["channel"] == "a2a" + assert inner["agent_name"] == "gina-personal" + + def test_iso_oneshot_accepted(self, adapter, recorder): + adapter.add_job("hi", "2099-01-01T00:00:00", job_id="x") + assert len(recorder.calls) == 1 + + def test_malformed_schedule_rejected(self, adapter): + with pytest.raises(ValueError): + adapter.add_job("hi", "not-a-schedule", job_id="x") + + def test_empty_prompt_rejected(self, adapter): + with pytest.raises(ValueError, match="prompt"): + adapter.add_job(" ", "0 9 * * *", job_id="x") + + def test_auth_header_sent(self, adapter, recorder): + adapter.add_job("hi", "0 9 * * *", job_id="x") + assert recorder.calls[0]["headers"]["X-API-Key"] == "test-key" + + +# ── cancel_job ───────────────────────────────────────────────────────────── + + +class TestCancelJob: + def test_publishes_remove(self, adapter, recorder): + adapter.cancel_job("daily") + body = recorder.calls[0]["json"] + assert body["payload"]["action"] == "remove" + assert body["payload"]["id"] == "gina-personal-daily" + + def test_returns_true_on_success(self, adapter, recorder): + assert 
adapter.cancel_job("daily") is True + + def test_returns_false_on_http_error(self, adapter, recorder): + recorder.response = _FakeResponse(status=500, body="boom") + assert adapter.cancel_job("daily") is False + + +# ── topic prefix override ────────────────────────────────────────────────── + + +def test_custom_topic_prefix(monkeypatch): + rec = _Recorder() + import httpx + monkeypatch.setattr(httpx, "post", rec) + adapter = WorkstaceanScheduler( + agent_name="gina-personal", + base_url="http://w:3000", + api_key="k", + topic_prefix="myorg.bus.gina", + ) + adapter.add_job("hi", "0 9 * * *", job_id="x") + body = rec.calls[0]["json"] + assert body["payload"]["topic"].startswith("myorg.bus.gina.") + + +# ── list_jobs is intentionally empty ─────────────────────────────────────── + + +def test_list_jobs_returns_empty(adapter): + """Workstacean's ``list`` action publishes async to a topic; + the adapter doesn't subscribe, so list_jobs returns [].""" + assert adapter.list_jobs() == [] + + +# ── start/stop are no-ops ────────────────────────────────────────────────── + + +@pytest.mark.asyncio +async def test_start_stop_no_op(adapter): + # Should not raise + await adapter.start() + await adapter.stop() diff --git a/tools/lg_tools.py b/tools/lg_tools.py index 161ddcb..dfa6727 100644 --- a/tools/lg_tools.py +++ b/tools/lg_tools.py @@ -383,18 +383,123 @@ async def daily_log(content: str) -> str: return [memory_ingest, memory_recall, memory_list, memory_stats, daily_log] +# ── scheduler tools ────────────────────────────────────────────────────────── +# +# Three tools that bind to either the local sqlite-backed scheduler or +# the Workstacean adapter — the agent loop sees one stable surface and +# never has to know which backend is wired up. +# +# Multi-agent safety: the underlying backend is constructed in +# ``server.py`` with the active ``AGENT_NAME`` baked in. 
add_job / +# list_jobs / cancel_job all filter by that name so two protoAgent +# instances on the same machine (or sharing one Workstacean install) +# never see each other's jobs. + + +def _build_scheduler_tools(scheduler) -> list: + """Bind scheduler tools to a ``SchedulerBackend``. Returns a list.""" + + @tool + async def schedule_task( + prompt: str, + when: str, + job_id: str | None = None, + ) -> str: + """Schedule a future task. The agent receives ``prompt`` as a + new turn when the schedule fires. + + Use this for anything the operator wants done later: reminders + ("remind me to follow up on the auth migration tomorrow at + 9am"), recurring sweeps ("every Monday morning, summarize last + week's logs"), one-off check-ins ("at 3pm today, ask whether + the deploy is healthy"). + + Args: + prompt: The text the agent should receive when the schedule + fires. Be self-contained — the agent has no memory of + this scheduling moment when the task fires. + when: Either a 5-field cron expression (``"0 9 * * 1-5"`` + = every weekday at 9am) or an ISO-8601 datetime + (``"2026-05-01T15:00:00"`` = once at 3pm UTC on May 1). + Compute exact times using ``current_time`` — the agent + cannot infer "now" from training data. + job_id: Optional human-readable id for the job. Auto- + generated if omitted; you'll need it later to cancel. + + Returns ``"Scheduled job next at ."`` on success, + an error string on malformed ``when`` or backend failure. + """ + try: + job = scheduler.add_job(prompt, when, job_id=job_id) + except ValueError as exc: + return f"Error: {exc}" + except Exception as exc: # noqa: BLE001 + return f"Error: scheduler add_job failed: {exc}" + next_fire = job.next_fire or "(managed by remote scheduler)" + return f"Scheduled job {job.id} next at {next_fire}." + + @tool + async def list_schedules() -> str: + """List the current scheduled jobs for this agent. + + Returns one job per line with id, next-fire timestamp, and a + prompt preview. 
Returns ``"No scheduled jobs."`` when empty. + + Backends that delegate state to a remote scheduler (e.g. the + Workstacean adapter) may return an empty list even when jobs + exist — query the remote scheduler directly to see those. + """ + jobs = scheduler.list_jobs() + if not jobs: + return "No scheduled jobs." + lines = [] + for j in jobs: + preview = (j.prompt or "")[:80] + next_fire = j.next_fire or "(managed remotely)" + lines.append(f"{j.id} next={next_fire} schedule={j.schedule!r} {preview}") + return "\n".join(lines) + + @tool + async def cancel_schedule(job_id: str) -> str: + """Cancel a scheduled job by id. + + Args: + job_id: The id returned by ``schedule_task`` (or shown by + ``list_schedules``). + + Returns ``"Canceled ."`` or ``"Error: no such job ."``. + """ + if not job_id or not job_id.strip(): + return "Error: job_id is required." + try: + ok = scheduler.cancel_job(job_id) + except Exception as exc: # noqa: BLE001 + return f"Error: scheduler cancel_job failed: {exc}" + return f"Canceled {job_id}." if ok else f"Error: no such job {job_id}." + + return [schedule_task, list_schedules, cancel_schedule] + + # ── registry ───────────────────────────────────────────────────────────────── -def get_all_tools(knowledge_store=None): +def get_all_tools(knowledge_store=None, scheduler=None): """Return every LangChain tool the lead agent + subagents can use. - When ``knowledge_store`` is provided, the memory tools are bound - to it and included. Forks that disable the store can pass - ``knowledge_store=None`` and the lead agent runs with the four - keyless tools only. + Optional dependencies: + + - ``knowledge_store`` enables the memory tools (memory_ingest, + memory_recall, memory_list, memory_stats, daily_log). + - ``scheduler`` enables the scheduler tools (schedule_task, + list_schedules, cancel_schedule). Accepts any backend that + implements ``scheduler.interface.SchedulerBackend``. 
+ + Pass ``None`` to disable either subsystem — the lead agent runs + fine with just the four keyless general tools. """ tools = [current_time, calculator, web_search, fetch_url] if knowledge_store is not None: tools.extend(_build_memory_tools(knowledge_store)) + if scheduler is not None: + tools.extend(_build_scheduler_tools(scheduler)) return tools From 7d1da5e9aa521283f926d9e60fc786ec5f2a3c36 Mon Sep 17 00:00:00 2001 From: Claude Date: Tue, 28 Apr 2026 01:48:13 +0000 Subject: [PATCH 17/24] fix(review): address round-1 CodeRabbit findings on scheduler PR MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - docs/guides/scheduler.md: replace jargon "multiple ginas" with "multiple agents" - scheduler/__init__.py: sort __all__ lexicographically (RUF022) - scheduler/local.py: log.error → log.exception in _init_db to preserve traceback (TRY400) - scheduler/workstacean.py: correct stale module docstring that claimed list_jobs() issues a list command — it returns [] unconditionally - server.py: add -> None return annotations to _scheduler_startup/_scheduler_shutdown (ANN202) - tests: add match= to two bare pytest.raises(ValueError) calls (PT011) - tools/lg_tools.py: wrap blocking scheduler calls in asyncio.to_thread() to avoid blocking the event loop under concurrent load; fix cancel_schedule error message to not conflate transport/DB failures with "no such job" Co-Authored-By: claude-code https://claude.ai/code/session_01JmFYJSYRMRndZ43g3AYW2q --- docs/guides/scheduler.md | 2 +- scheduler/__init__.py | 2 +- scheduler/local.py | 4 ++-- scheduler/workstacean.py | 6 +++--- server.py | 4 ++-- tests/test_scheduler_local.py | 2 +- tests/test_scheduler_workstacean.py | 2 +- tools/lg_tools.py | 9 +++++---- 8 files changed, 16 insertions(+), 15 deletions(-) diff --git a/docs/guides/scheduler.md b/docs/guides/scheduler.md index dceb994..d98a793 100644 --- a/docs/guides/scheduler.md +++ b/docs/guides/scheduler.md @@ -8,7 +8,7 @@ active. 
## When to read this -- You want forks (or your own multiple ginas) to support reminders, +- You want forks (or your own multiple agents) to support reminders, recurring sweeps, or any "do this later" intent. - You're running protoWorkstacean and want scheduled fires to flow through the existing bus. diff --git a/scheduler/__init__.py b/scheduler/__init__.py index 960a226..6828056 100644 --- a/scheduler/__init__.py +++ b/scheduler/__init__.py @@ -24,4 +24,4 @@ from scheduler.local import LocalScheduler from scheduler.workstacean import WorkstaceanScheduler -__all__ = ["Job", "SchedulerBackend", "LocalScheduler", "WorkstaceanScheduler"] +__all__ = ["Job", "LocalScheduler", "SchedulerBackend", "WorkstaceanScheduler"] diff --git a/scheduler/local.py b/scheduler/local.py index 17fe9b0..f853084 100644 --- a/scheduler/local.py +++ b/scheduler/local.py @@ -144,8 +144,8 @@ def _init_db(self) -> None: db.executescript(_SCHEMA) db.commit() db.close() - except sqlite3.DatabaseError as exc: - log.error("[scheduler] schema init failed at %s: %s", self.path, exc) + except sqlite3.DatabaseError: + log.exception("[scheduler] schema init failed at %s", self.path) # ── public API (matches SchedulerBackend) ─────────────────────────────── diff --git a/scheduler/workstacean.py b/scheduler/workstacean.py index 97f690b..56df684 100644 --- a/scheduler/workstacean.py +++ b/scheduler/workstacean.py @@ -13,9 +13,9 @@ - Topics are namespaced: ``cron.{agent_name}`` The adapter is fire-and-forget — Workstacean owns scheduling state. -``list_jobs()`` issues a ``list`` command and waits for the response -on the ``schedule.list`` topic. If the user wants strict local -introspection, they should run the local backend. +``list_jobs()`` returns an empty list because Workstacean's list +action publishes asynchronously — strict local introspection requires +the local backend. 
Note: Workstacean today does not natively dispatch to A2A endpoints; forks need to wire their Workstacean install to route ``cron.*`` diff --git a/server.py b/server.py index bf05ae0..d394744 100644 --- a/server.py +++ b/server.py @@ -849,7 +849,7 @@ def _main(): # is the only blocking call) and FastAPI fires startup/shutdown # around it. @fastapi_app.on_event("startup") - async def _scheduler_startup(): + async def _scheduler_startup() -> None: if _scheduler is None: return try: @@ -858,7 +858,7 @@ async def _scheduler_startup(): log.exception("[scheduler] startup failed") @fastapi_app.on_event("shutdown") - async def _scheduler_shutdown(): + async def _scheduler_shutdown() -> None: if _scheduler is None: return try: diff --git a/tests/test_scheduler_local.py b/tests/test_scheduler_local.py index 524116e..867bf6c 100644 --- a/tests/test_scheduler_local.py +++ b/tests/test_scheduler_local.py @@ -73,7 +73,7 @@ def test_offset_normalized(self): assert dt.hour == 20 # 15 EST → 20 UTC def test_malformed_raises(self): - with pytest.raises(ValueError): + with pytest.raises(ValueError, match="Invalid isoformat|could not convert"): parse_iso_to_utc("not an iso string") diff --git a/tests/test_scheduler_workstacean.py b/tests/test_scheduler_workstacean.py index ddd6da4..74fb485 100644 --- a/tests/test_scheduler_workstacean.py +++ b/tests/test_scheduler_workstacean.py @@ -101,7 +101,7 @@ def test_iso_oneshot_accepted(self, adapter, recorder): assert len(recorder.calls) == 1 def test_malformed_schedule_rejected(self, adapter): - with pytest.raises(ValueError): + with pytest.raises(ValueError, match="Invalid isoformat|could not convert"): adapter.add_job("hi", "not-a-schedule", job_id="x") def test_empty_prompt_rejected(self, adapter): diff --git a/tools/lg_tools.py b/tools/lg_tools.py index dfa6727..42c60ce 100644 --- a/tools/lg_tools.py +++ b/tools/lg_tools.py @@ -40,6 +40,7 @@ from __future__ import annotations import ast +import asyncio import operator as _op from datetime 
import datetime from zoneinfo import ZoneInfo, ZoneInfoNotFoundError @@ -430,7 +431,7 @@ async def schedule_task( an error string on malformed ``when`` or backend failure. """ try: - job = scheduler.add_job(prompt, when, job_id=job_id) + job = await asyncio.to_thread(scheduler.add_job, prompt, when, job_id=job_id) except ValueError as exc: return f"Error: {exc}" except Exception as exc: # noqa: BLE001 @@ -449,7 +450,7 @@ async def list_schedules() -> str: Workstacean adapter) may return an empty list even when jobs exist — query the remote scheduler directly to see those. """ - jobs = scheduler.list_jobs() + jobs = await asyncio.to_thread(scheduler.list_jobs) if not jobs: return "No scheduled jobs." lines = [] @@ -472,10 +473,10 @@ async def cancel_schedule(job_id: str) -> str: if not job_id or not job_id.strip(): return "Error: job_id is required." try: - ok = scheduler.cancel_job(job_id) + ok = await asyncio.to_thread(scheduler.cancel_job, job_id) except Exception as exc: # noqa: BLE001 return f"Error: scheduler cancel_job failed: {exc}" - return f"Canceled {job_id}." if ok else f"Error: no such job {job_id}." + return f"Canceled {job_id}." if ok else f"Error: cancel failed or no such job {job_id}." return [schedule_task, list_schedules, cancel_schedule] From 1a052785629ae7aca44475b7b8b268c3166c366d Mon Sep 17 00:00:00 2001 From: Josh Mabry Date: Mon, 27 Apr 2026 19:06:38 -0700 Subject: [PATCH 18/24] fix(review-2): address round-2 PR #156 CodeRabbit feedback MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Real bugs: - scheduler/local.py: _fire() now returns bool (True on 2xx, False on HTTP error or network exception). _tick() only reschedules / deletes when _fire() succeeds, so a transient failure leaves the job in place for the next tick to retry. Previously a one-shot job hit by a 5xx silently vanished. - server.py: the API key env var name now uses AGENT_NAME_ENV.upper() to match the auth handler at L893. 
The previous code read agent_name() (which returns the wizard-set identity.name when set), so a wizard rename pointed the scheduler at <IDENTITY_NAME>_API_KEY while the auth handler still expected <AGENT_NAME_ENV>_API_KEY → self-invocation 401'd silently after every wizard rename. - server.py: reload path now constructs a scheduler when _scheduler is None (first-run case: server boots pre-setup, wizard finishes, drawer triggers reload — this is when we *first* construct the scheduler). Existing instances are still reused — drawer saves don't tear down the polling loop. Surface: - tools/lg_tools.py: exported SCHEDULER_TOOL_NAMES and MEMORY_TOOL_NAMES as module constants. - graph/config_io.py::list_available_tools: now exposes scheduler + memory tool names to the wizard's checkbox group even when the runtime hasn't yet constructed the underlying backends. Otherwise the wizard would hide tools that the runtime registers as soon as the user finishes setup. Declined: - scheduler/local.py L141-149: CodeRabbit asked to re-raise sqlite3.DatabaseError from _init_db. The store is intentionally fail-soft (matches knowledge/store.py + audit.py): _resolve_db_path already falls back to ~/.protoagent/scheduler/ when /sandbox is unwritable, and re-raising would crash boot in exactly the scenario the fallback is designed to handle. The graceful degradation contract is "scheduler tools return errors when storage is broken, agent keeps serving everything else". Tests: - tests/test_scheduler_local.py: new test_fire_failure_leaves_job_in_place regression guard + test_fire_returns_bool contract test. - tests/test_config_io.py: list_available_tools assertions now check for memory + scheduler tools and no duplicates. 86 scheduler-scope tests pass. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- graph/config_io.py | 26 ++++++++++-- scheduler/local.py | 30 +++++++++---- server.py | 40 +++++++++++++++--- tests/test_config_io.py | 9 ++++ tests/test_scheduler_local.py | 79 +++++++++++++++++++++++++++++++++++ tools/lg_tools.py | 12 ++++++ 6 files changed, 180 insertions(+), 16 deletions(-) diff --git a/graph/config_io.py b/graph/config_io.py index 24a44ca..d2e5f94 100644 --- a/graph/config_io.py +++ b/graph/config_io.py @@ -319,10 +319,30 @@ def list_gateway_models( def list_available_tools(knowledge_store: Any = None) -> list[str]: - """Return every tool name the runtime would wire into the graph.""" - from tools.lg_tools import get_all_tools + """Return every tool name the runtime *could* wire into the graph. + + The wizard's tool checkbox group reads this. We deliberately + expose the scheduler tool names even when no scheduler has been + constructed yet (fresh boot, pre-setup) — otherwise the wizard + would hide tools that the runtime will register the moment the + user finishes setup. Same logic for memory tools when the + knowledge store is absent. + """ + from tools.lg_tools import ( + MEMORY_TOOL_NAMES, + SCHEDULER_TOOL_NAMES, + get_all_tools, + ) - return [t.name for t in get_all_tools(knowledge_store)] + names = [t.name for t in get_all_tools(knowledge_store)] + # Deduplicate while preserving order: tools already present + # (because their backend was passed in) shouldn't appear twice. 
+ seen = set(names) + for extra in (*MEMORY_TOOL_NAMES, *SCHEDULER_TOOL_NAMES): + if extra not in seen: + names.append(extra) + seen.add(extra) + return names # --------------------------------------------------------------------------- diff --git a/scheduler/local.py b/scheduler/local.py index f853084..d495056 100644 --- a/scheduler/local.py +++ b/scheduler/local.py @@ -245,10 +245,18 @@ async def _tick(self) -> None: now = datetime.now(UTC) due = self._claim_due_jobs(now) for job in due: - try: - await self._fire(job) - finally: + # Reschedule (or delete) only when delivery actually + # succeeded. A transient HTTP failure leaves the row in + # place so the next tick retries; a one-shot stays alive + # until it lands rather than vanishing on the first + # network blip. + if await self._fire(job): self._reschedule_or_delete(job, fired_at=now) + else: + log.warning( + "[scheduler] fire failed for job %s; leaving in place for retry", + job.id, + ) def _claim_due_jobs(self, now: datetime) -> list[Job]: db = self._connect() @@ -322,8 +330,14 @@ def _recover_missed_fires(self) -> None: finally: db.close() - async def _fire(self, job: Job) -> None: - """Deliver a job by POSTing to the agent's own A2A endpoint.""" + async def _fire(self, job: Job) -> bool: + """Deliver a job by POSTing to the agent's own A2A endpoint. + + Returns ``True`` on a 2xx response, ``False`` on any HTTP + error or network exception. Callers use the return value to + decide whether to advance the schedule (success) or leave + the row in place for the next tick to retry (failure). 
+ """ import httpx headers = {"Content-Type": "application/json"} @@ -356,10 +370,12 @@ async def _fire(self, job: Job) -> None: "[scheduler] fire failed for job %s: HTTP %d %s", job.id, r.status_code, r.text[:200], ) - else: - log.info("[scheduler] fired job %s", job.id) + return False + log.info("[scheduler] fired job %s", job.id) + return True except Exception: # noqa: BLE001 log.exception("[scheduler] fire exception for job %s", job.id) + return False def _generate_id(self) -> str: # Agent-name prefix keeps cross-agent IDs distinct in shared diff --git a/server.py b/server.py index d394744..3235120 100644 --- a/server.py +++ b/server.py @@ -185,7 +185,11 @@ def _build_scheduler(config): f"http://127.0.0.1:{_active_port}", ) bearer = (config.auth_token or os.environ.get("A2A_AUTH_TOKEN", "")).strip() - api_key_env = f"{name.upper()}_API_KEY" + # The A2A handler reads X-API-Key from ``_API_KEY`` + # (server.py L893 — note: the env-derived name, NOT the wizard-set + # ``identity.name``). Match that here so a wizard rename doesn't + # break self-invocation auth. + api_key_env = f"{AGENT_NAME_ENV.upper()}_API_KEY" api_key = os.environ.get(api_key_env, "").strip() return LocalScheduler( agent_name=name, @@ -238,11 +242,35 @@ def _reload_langgraph_agent() -> tuple[bool, str]: if is_setup_complete(): try: new_store = _build_knowledge_store(new_config) - # Re-use the running scheduler instance — tearing down the - # polling loop on every drawer save would orphan in-flight - # fires. Env-driven scheduler config (WORKSTACEAN_API_BASE, - # SCHEDULER_DISABLED) only takes effect on full restart; - # the YAML doesn't carry scheduler settings yet. + # Reuse the running scheduler so a drawer save doesn't tear + # down the polling loop and orphan in-flight fires. 
Build + # one only when none was constructed yet — the typical + # path is: server boots before setup is complete (no + # scheduler), wizard finishes, drawer triggers reload — + # this is when we *first* construct the scheduler. + # + # Note: the freshly-built scheduler isn't started here. + # FastAPI's startup hook fires once at process start; on + # post-setup reloads we kick the polling loop manually. + global _scheduler + if _scheduler is None: + _scheduler = _build_scheduler(new_config) + if _scheduler is not None: + # _reload_langgraph_agent is sync but called from + # inside the FastAPI event loop, so the running + # loop is available. Fire-and-forget the start — + # awaiting it would require making this whole + # function async (and every caller along with it). + try: + import asyncio + asyncio.get_running_loop().create_task(_scheduler.start()) + except RuntimeError: + log.warning( + "[reload] no running event loop; scheduler will " + "start on next process boot", + ) + except Exception: + log.exception("[reload] scheduler start failed") new_graph = create_agent_graph( new_config, knowledge_store=new_store, scheduler=_scheduler, ) diff --git a/tests/test_config_io.py b/tests/test_config_io.py index caf0bb2..7be5075 100644 --- a/tests/test_config_io.py +++ b/tests/test_config_io.py @@ -329,7 +329,16 @@ def test_list_available_tools_returns_starter_set(): assert "calculator" in names assert "web_search" in names assert "fetch_url" in names + # Memory + scheduler tools appear in the wizard checklist even + # when no store / scheduler has been constructed yet — otherwise + # the user couldn't enable them on a fresh boot. + assert "memory_ingest" in names + assert "schedule_task" in names + assert "cancel_schedule" in names assert all(isinstance(n, str) for n in names) + # No duplicates — list_available_tools dedupes between the + # backend-bound tools and the static name lists. 
+ assert len(names) == len(set(names)) # ── Setup wizard marker ───────────────────────────────────────────────────── diff --git a/tests/test_scheduler_local.py b/tests/test_scheduler_local.py index 867bf6c..a7fea63 100644 --- a/tests/test_scheduler_local.py +++ b/tests/test_scheduler_local.py @@ -288,3 +288,82 @@ async def post(self, url, headers=None, json=None): assert any("FIRED-ME" in str(c["json"]) for c in fired) # One-shot was deleted after firing assert s.list_jobs() == [] + + +@pytest.mark.asyncio +async def test_fire_failure_leaves_job_in_place(tmp_path, monkeypatch): + """A 5xx HTTP response from /a2a must NOT delete the job. + + Regression guard for the round-2 review finding: previously, + _tick() called _reschedule_or_delete in finally, which silently + consumed one-shot jobs on transient failures. Now the job stays + until delivery actually succeeds. + """ + s = _make_scheduler(tmp_path) + past = (datetime.now(UTC) - timedelta(seconds=1)).isoformat() + s.add_job("DURABLE", past, job_id="firetest") + + class _FakeResponse: + status_code = 503 + text = "service unavailable" + + class _FakeClient: + def __init__(self, *_a, **_kw): + pass + + async def __aenter__(self): + return self + + async def __aexit__(self, *_a): + return False + + async def post(self, url, headers=None, json=None): + return _FakeResponse() + + import httpx + monkeypatch.setattr(httpx, "AsyncClient", _FakeClient) + + await s.start() + await asyncio.sleep(1.5) # one polling tick + await s.stop() + + # Job survives the failed fire, will be retried on the next tick. + assert len(s.list_jobs()) == 1 + assert s.list_jobs()[0].id == "firetest" + + +@pytest.mark.asyncio +async def test_fire_returns_bool(tmp_path, monkeypatch): + """``_fire`` is the success/failure signal feeding the + reschedule decision in ``_tick``. 
Lock the contract.""" + s = _make_scheduler(tmp_path) + job = s.add_job("hi", "0 9 * * *", job_id="x") + + class _OkResponse: + status_code = 200 + text = "ok" + + class _ErrResponse: + status_code = 500 + text = "boom" + + class _FakeClient: + def __init__(self, response): + self._response = response + + async def __aenter__(self): + return self + + async def __aexit__(self, *_a): + return False + + async def post(self, *_a, **_kw): + return self._response + + import httpx + + monkeypatch.setattr(httpx, "AsyncClient", lambda **kw: _FakeClient(_OkResponse())) + assert await s._fire(job) is True + + monkeypatch.setattr(httpx, "AsyncClient", lambda **kw: _FakeClient(_ErrResponse())) + assert await s._fire(job) is False diff --git a/tools/lg_tools.py b/tools/lg_tools.py index 42c60ce..d42effb 100644 --- a/tools/lg_tools.py +++ b/tools/lg_tools.py @@ -281,6 +281,18 @@ def _extract_text_from_html(content: bytes) -> str: _MEMORY_RECALL_MAX_K = 20 _MEMORY_LIST_MAX_LIMIT = 200 +# Stable list of scheduler tool names. Exposed as a module-level +# constant so ``graph/config_io.py::list_available_tools`` can show +# the wizard the right surface even when the runtime hasn't yet +# constructed a scheduler instance (e.g. fresh boot before setup is +# complete). Keep in sync with ``_build_scheduler_tools``. +SCHEDULER_TOOL_NAMES: tuple[str, ...] = ( + "schedule_task", "list_schedules", "cancel_schedule", +) +MEMORY_TOOL_NAMES: tuple[str, ...] = ( + "memory_ingest", "memory_recall", "memory_list", "memory_stats", "daily_log", +) + def _build_memory_tools(knowledge_store) -> list: """Bind memory tools to a ``KnowledgeStore``. 
Returns a list.""" From 8b79fa8d5a2264cae69b0521a425c32190758a5f Mon Sep 17 00:00:00 2001 From: Josh Mabry Date: Mon, 27 Apr 2026 19:32:47 -0700 Subject: [PATCH 19/24] feat(scheduler): YAML opt-out via middleware.scheduler (symmetric with knowledge/memory) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Scheduler had asymmetric opt-out — only env-based (SCHEDULER_DISABLED=1). The knowledge and memory subsystems already exposed YAML toggles (middleware.knowledge, middleware.memory) so forks could flip them through the drawer or wizard. Scheduler now matches: - LangGraphConfig.scheduler_enabled: bool = True (default-on) - from_yaml() reads middleware.scheduler - config_to_dict() emits it for the drawer round-trip - config/langgraph-config.yaml has middleware.scheduler: true - server.py::_build_scheduler honors the YAML toggle first, env second Both subsystems are now genuinely opt-out: middleware: knowledge: true # was already so memory: true # was already so scheduler: true # NEW — was env-only audit: true Drawer/wizard can flip any of them without restart (the existing reload path already rebuilds on config change). The env opt-out (SCHEDULER_DISABLED=1) stays as a runtime escape hatch for fleet operators who can't edit YAML in the moment. Co-Authored-By: Claude Opus 4.7 (1M context) --- config/langgraph-config.yaml | 11 +++++++---- docs/guides/scheduler.md | 15 +++++++++++---- docs/reference/configuration.md | 2 ++ graph/config.py | 7 ++++++- graph/config_io.py | 1 + server.py | 10 ++++++++++ 6 files changed, 37 insertions(+), 9 deletions(-) diff --git a/config/langgraph-config.yaml b/config/langgraph-config.yaml index c75ff71..c7df665 100644 --- a/config/langgraph-config.yaml +++ b/config/langgraph-config.yaml @@ -38,13 +38,16 @@ subagents: max_turns: 20 middleware: - # All three middlewares default ON. 
The knowledge middleware needs a - # store; the template constructs one automatically (see - # ``server.py::_build_knowledge_store``). Set ``knowledge: false`` if - # your fork is purely stateless. + # All four subsystems default ON. The template constructs the + # knowledge store + scheduler backends automatically (see + # ``server.py::_build_knowledge_store`` and ``_build_scheduler``). + # Flip any of these to ``false`` to opt out — the corresponding + # tools (memory_*, schedule_*) are dropped from the agent loop + # without touching the worker subagent's tool allowlist. knowledge: true audit: true memory: true + scheduler: true knowledge: db_path: /sandbox/knowledge/agent.db diff --git a/docs/guides/scheduler.md b/docs/guides/scheduler.md index d98a793..faaf45f 100644 --- a/docs/guides/scheduler.md +++ b/docs/guides/scheduler.md @@ -34,12 +34,19 @@ not "do that thing we discussed"). `server.py::_build_scheduler` picks at startup: -1. `WORKSTACEAN_API_BASE` + `WORKSTACEAN_API_KEY` set → **`WorkstaceanScheduler`**. -2. Otherwise → **`LocalScheduler`** (sqlite, asyncio polling). -3. `SCHEDULER_DISABLED=1` → no scheduler. The three tools don't ship. +1. `middleware.scheduler: false` in YAML → no scheduler. The three + tools don't ship. (Symmetric with `middleware.knowledge` / + `middleware.memory` — drawer/wizard editable.) +2. `SCHEDULER_DISABLED=1` env → no scheduler. Runtime escape hatch + for fleet operators who can't edit config. +3. `WORKSTACEAN_API_BASE` + `WORKSTACEAN_API_KEY` set → + **`WorkstaceanScheduler`**. +4. Otherwise → **`LocalScheduler`** (sqlite, asyncio polling). Both backends honor the same `SchedulerBackend` protocol; the agent -loop never knows which one is wired up. +loop never knows which one is wired up. The scheduler is **default +on** — explicitly opt out via either config path above when a fork +wants a stateless agent with no scheduling surface. ```bash # Solo / local dev — falls through to LocalScheduler automatically. 
diff --git a/docs/reference/configuration.md b/docs/reference/configuration.md index 3ca9a9c..6f5e1c7 100644 --- a/docs/reference/configuration.md +++ b/docs/reference/configuration.md @@ -33,6 +33,7 @@ middleware: knowledge: true audit: true memory: true + scheduler: true knowledge: db_path: /sandbox/knowledge/agent.db @@ -71,6 +72,7 @@ Adding a new subagent name to the YAML requires matching entries in `graph/subag | `knowledge` | `true` | Inject retrieved knowledge into state before LLM calls. Backed by the bundled `KnowledgeStore` (sqlite + FTS5). Set `false` for a stateless agent. | | `audit` | `true` | Append every tool call to `/sandbox/audit/audit.jsonl`. | | `memory` | `true` | Persist a session summary on terminal turn and asynchronously index conversation findings under `domain='finding'`. | +| `scheduler` | `true` | Wire the bundled scheduler backend (local sqlite, or `WorkstaceanScheduler` when env vars are set). Drops the `schedule_task` / `list_schedules` / `cancel_schedule` tools from the agent loop when `false`. Equivalent to setting `SCHEDULER_DISABLED=1`; the YAML toggle is the canonical opt-out path. | ## `knowledge` diff --git a/graph/config.py b/graph/config.py index c2cf995..aff6707 100644 --- a/graph/config.py +++ b/graph/config.py @@ -46,10 +46,14 @@ class LangGraphConfig: max_turns=20, )) - # Middleware toggles + # Middleware / subsystem toggles. All default-on so a fresh fork has + # a working memory loop + scheduler on day one. Forks that want a + # purely stateless agent (no KB, no scheduled tasks) can flip these + # via the drawer or by editing the YAML directly. knowledge_middleware: bool = True audit_middleware: bool = True memory_middleware: bool = True + scheduler_enabled: bool = True # Knowledge store — sqlite + FTS5, see ``knowledge/store.py``. 
# The default path lives under ``/sandbox/`` to play well with the @@ -109,6 +113,7 @@ def from_yaml(cls, path: str | Path) -> "LangGraphConfig": knowledge_middleware=middleware.get("knowledge", cls.knowledge_middleware), audit_middleware=middleware.get("audit", cls.audit_middleware), memory_middleware=middleware.get("memory", cls.memory_middleware), + scheduler_enabled=middleware.get("scheduler", cls.scheduler_enabled), knowledge_db_path=knowledge.get("db_path", cls.knowledge_db_path), embed_model=knowledge.get("embed_model", cls.embed_model), knowledge_top_k=knowledge.get("top_k", cls.knowledge_top_k), diff --git a/graph/config_io.py b/graph/config_io.py index d2e5f94..2bd4857 100644 --- a/graph/config_io.py +++ b/graph/config_io.py @@ -133,6 +133,7 @@ def config_to_dict(config: LangGraphConfig) -> dict[str, Any]: "knowledge": config.knowledge_middleware, "audit": config.audit_middleware, "memory": config.memory_middleware, + "scheduler": config.scheduler_enabled, }, "knowledge": { "db_path": config.knowledge_db_path, diff --git a/server.py b/server.py index 3235120..97804d5 100644 --- a/server.py +++ b/server.py @@ -156,6 +156,16 @@ def _build_scheduler(config): so its self-invocation HTTP call can pass through bearer / X-API-Key auth — the scheduler hits the same A2A endpoint as a real caller. """ + # Two opt-out paths, in priority order: + # 1. ``middleware.scheduler: false`` in YAML (drawer / wizard). + # This is the canonical opt-out — symmetric with + # ``middleware.knowledge`` / ``middleware.memory``. + # 2. ``SCHEDULER_DISABLED=1`` env var. Runtime escape hatch for + # fleet operators who need to kill the scheduler without + # editing config (e.g. emergency rollback). 
+ if not getattr(config, "scheduler_enabled", True): + log.info("[server] scheduler disabled via middleware.scheduler config") + return None if os.environ.get("SCHEDULER_DISABLED", "").lower() in ("1", "true", "yes"): log.info("[server] scheduler disabled via SCHEDULER_DISABLED env") return None From 7ce3f1065c4d521bd602a539917df7e4510d673e Mon Sep 17 00:00:00 2001 From: Josh Mabry Date: Mon, 27 Apr 2026 19:43:45 -0700 Subject: [PATCH 20/24] fix(review-3+4): address rounds 3 + 4 PR #156 CodeRabbit feedback MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Real bugs: - scheduler/local.py: stop() now suppresses only asyncio.CancelledError (the expected outcome of cancelling our own task) and logs any other exception via log.exception. Previously every exception was silently swallowed, so a polling-loop crash during shutdown would vanish without a trace. - server.py: reload path now honors the new middleware.scheduler toggle. Three states: - flipped OFF (was on) → stop + drop the running scheduler; new graph builds with scheduler=None. - flipped ON (was off / first run) → construct + start. - unchanged → reuse the running instance. Helpers _start_scheduler_async / _stop_scheduler_async fire start()/stop() onto the active loop without forcing the entire reload chain to become async. Type / nits: - server.py: added `-> "SchedulerBackend | None"` return type to _build_scheduler, with a TYPE_CHECKING import to avoid runtime cycles. - tests/test_scheduler_local.py: raw-string regex for `|` alternation (test_malformed_raises); added match= to the two bare ValueError tests (test_empty_prompt_rejected, test_malformed_schedule_rejected) so they only pass for the intended error message. - tests/test_config_io.py: assert list_schedules in names alongside schedule_task / cancel_schedule. 
- docs/reference/configuration.md: clarified the scheduler opt-out description — middleware.scheduler is canonical, SCHEDULER_DISABLED is a runtime escape hatch. Co-Authored-By: Claude Opus 4.7 (1M context) --- docs/reference/configuration.md | 2 +- scheduler/local.py | 9 +++- server.py | 86 ++++++++++++++++++++++----------- tests/test_config_io.py | 1 + tests/test_scheduler_local.py | 6 +-- 5 files changed, 72 insertions(+), 32 deletions(-) diff --git a/docs/reference/configuration.md b/docs/reference/configuration.md index 6f5e1c7..872bb37 100644 --- a/docs/reference/configuration.md +++ b/docs/reference/configuration.md @@ -72,7 +72,7 @@ Adding a new subagent name to the YAML requires matching entries in `graph/subag | `knowledge` | `true` | Inject retrieved knowledge into state before LLM calls. Backed by the bundled `KnowledgeStore` (sqlite + FTS5). Set `false` for a stateless agent. | | `audit` | `true` | Append every tool call to `/sandbox/audit/audit.jsonl`. | | `memory` | `true` | Persist a session summary on terminal turn and asynchronously index conversation findings under `domain='finding'`. | -| `scheduler` | `true` | Wire the bundled scheduler backend (local sqlite, or `WorkstaceanScheduler` when env vars are set). Drops the `schedule_task` / `list_schedules` / `cancel_schedule` tools from the agent loop when `false`. Equivalent to setting `SCHEDULER_DISABLED=1`; the YAML toggle is the canonical opt-out path. | +| `scheduler` | `true` | Wire the bundled scheduler backend (local sqlite, or `WorkstaceanScheduler` when env vars are set). Drops the `schedule_task` / `list_schedules` / `cancel_schedule` tools from the agent loop when `false`. Has the same effect as `SCHEDULER_DISABLED=1` — but `middleware.scheduler: false` is the canonical opt-out (drawer/wizard editable, survives restarts), while the env var is a runtime escape hatch for fleet operators who can't edit YAML in the moment. 
| ## `knowledge` diff --git a/scheduler/local.py b/scheduler/local.py index d495056..635d2f4 100644 --- a/scheduler/local.py +++ b/scheduler/local.py @@ -223,8 +223,15 @@ async def stop(self) -> None: self._task.cancel() try: await self._task - except (asyncio.CancelledError, Exception): # noqa: BLE001 + except asyncio.CancelledError: + # Expected — we just cancelled it. pass + except Exception: # noqa: BLE001 + # Anything else means the polling loop crashed during + # shutdown. Log with traceback so we can debug; don't + # re-raise (caller is in shutdown path, raising would + # mask the original shutdown trigger). + log.exception("[scheduler] polling task raised during stop") self._task = None log.info("[scheduler] local backend stopped") diff --git a/server.py b/server.py index 97804d5..195f308 100644 --- a/server.py +++ b/server.py @@ -30,10 +30,13 @@ import os import time from pathlib import Path -from typing import Any +from typing import TYPE_CHECKING, Any from graph.output_format import extract_output +if TYPE_CHECKING: + from scheduler.interface import SchedulerBackend + # --------------------------------------------------------------------------- # Logging # --------------------------------------------------------------------------- @@ -138,7 +141,43 @@ def _build_knowledge_store(config): return None -def _build_scheduler(config): +def _start_scheduler_async(backend: "SchedulerBackend") -> None: + """Fire-and-forget scheduler.start() onto the running loop. + + Reload paths are sync but invoked from FastAPI request handlers, + so the running loop is available. Awaiting would force the entire + reload chain to become async — not worth it for one no-await + coroutine. 
+ """ + import asyncio + try: + asyncio.get_running_loop().create_task(backend.start()) + except RuntimeError: + log.warning( + "[reload] no running event loop; scheduler will start " + "on next process boot", + ) + except Exception: + log.exception("[reload] scheduler start failed") + + +def _stop_scheduler_async(backend: "SchedulerBackend") -> None: + """Fire-and-forget scheduler.stop() onto the running loop. + + Used when the YAML toggle flips off mid-reload. The polling task + cancels cleanly; the next graph rebuild registers no scheduler + tools. + """ + import asyncio + try: + asyncio.get_running_loop().create_task(backend.stop()) + except RuntimeError: + log.warning("[reload] no running event loop; scheduler not stopped") + except Exception: + log.exception("[reload] scheduler stop failed") + + +def _build_scheduler(config) -> "SchedulerBackend | None": """Return the active scheduler backend, or ``None`` when disabled. Selection order: @@ -252,35 +291,28 @@ def _reload_langgraph_agent() -> tuple[bool, str]: if is_setup_complete(): try: new_store = _build_knowledge_store(new_config) - # Reuse the running scheduler so a drawer save doesn't tear - # down the polling loop and orphan in-flight fires. Build - # one only when none was constructed yet — the typical - # path is: server boots before setup is complete (no - # scheduler), wizard finishes, drawer triggers reload — - # this is when we *first* construct the scheduler. + # Three states for the scheduler on reload: + # + # 1. Toggle flipped OFF (was on) → stop + drop the running + # scheduler so the agent stops registering scheduler + # tools. The new graph is built with scheduler=None. + # 2. Toggle is ON and we have a running scheduler → reuse + # it. Drawer saves don't tear down the polling loop. + # 3. Toggle is ON but _scheduler is None (first-run after + # setup completes) → construct + start. # - # Note: the freshly-built scheduler isn't started here. 
- # FastAPI's startup hook fires once at process start; on - # post-setup reloads we kick the polling loop manually. + # Env-driven config (WORKSTACEAN_API_BASE) only takes + # effect on full process restart; the YAML toggle is the + # canonical reload-time switch. global _scheduler - if _scheduler is None: + scheduler_wanted = getattr(new_config, "scheduler_enabled", True) + if not scheduler_wanted and _scheduler is not None: + _stop_scheduler_async(_scheduler) + _scheduler = None + elif scheduler_wanted and _scheduler is None: _scheduler = _build_scheduler(new_config) if _scheduler is not None: - # _reload_langgraph_agent is sync but called from - # inside the FastAPI event loop, so the running - # loop is available. Fire-and-forget the start — - # awaiting it would require making this whole - # function async (and every caller along with it). - try: - import asyncio - asyncio.get_running_loop().create_task(_scheduler.start()) - except RuntimeError: - log.warning( - "[reload] no running event loop; scheduler will " - "start on next process boot", - ) - except Exception: - log.exception("[reload] scheduler start failed") + _start_scheduler_async(_scheduler) new_graph = create_agent_graph( new_config, knowledge_store=new_store, scheduler=_scheduler, ) diff --git a/tests/test_config_io.py b/tests/test_config_io.py index 7be5075..946abfb 100644 --- a/tests/test_config_io.py +++ b/tests/test_config_io.py @@ -334,6 +334,7 @@ def test_list_available_tools_returns_starter_set(): # the user couldn't enable them on a fresh boot. 
assert "memory_ingest" in names assert "schedule_task" in names + assert "list_schedules" in names assert "cancel_schedule" in names assert all(isinstance(n, str) for n in names) # No duplicates — list_available_tools dedupes between the diff --git a/tests/test_scheduler_local.py b/tests/test_scheduler_local.py index a7fea63..06f4ef9 100644 --- a/tests/test_scheduler_local.py +++ b/tests/test_scheduler_local.py @@ -73,7 +73,7 @@ def test_offset_normalized(self): assert dt.hour == 20 # 15 EST → 20 UTC def test_malformed_raises(self): - with pytest.raises(ValueError, match="Invalid isoformat|could not convert"): + with pytest.raises(ValueError, match=r"Invalid isoformat|could not convert"): parse_iso_to_utc("not an iso string") @@ -98,12 +98,12 @@ def test_iso_one_shot(self, tmp_path): def test_empty_prompt_rejected(self, tmp_path): s = _make_scheduler(tmp_path) - with pytest.raises(ValueError): + with pytest.raises(ValueError, match=r"prompt is required"): s.add_job(" ", "0 9 * * *") def test_malformed_schedule_rejected(self, tmp_path): s = _make_scheduler(tmp_path) - with pytest.raises(ValueError): + with pytest.raises(ValueError, match=r"Invalid isoformat|could not convert"): s.add_job("hi", "not-a-real-schedule") def test_user_id_preserved(self, tmp_path): From 309890864179494c2ad9e1d0ea6590c2b83374fd Mon Sep 17 00:00:00 2001 From: Josh Mabry Date: Mon, 27 Apr 2026 20:27:29 -0700 Subject: [PATCH 21/24] fix(review-5): address round-5 PR #156 CodeRabbit feedback MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Real bugs: - scheduler/local.py::_fire(): metadata moved from params.message.metadata to params.metadata. The A2A handler reads custom metadata from params.metadata (a2a_handler.py L1244 — `msg_metadata = params.get("metadata")`), so the previous nesting silently dropped the scheduler_job_id / scheduler_kind breadcrumb. Observers now get it as intended. 
- server.py reload path: scheduler swap is now planned before the graph rebuild and only committed after rebuild succeeds. A failed graph rebuild used to leave the scheduler torn down or a fresh one already started, desynchronizing runtime state. The new ordering: build candidate, rebuild graph (rollback-safe on failure), commit graph + scheduler atomically. - scheduler/local.py: _resolve_db_path now sanitizes agent_name via a new _safe_segment() helper. Strips path separators, ``..``, and absolute-path prefixes; falls back to "default" when nothing usable remains. Defence in depth — the value comes from operator-controlled env / YAML, but a typo or copy-paste shouldn't be able to put a sqlite file outside the configured scheduler dir. Tests: - tests/test_scheduler_local.py::test_cron_rescheduled_after_fire: pinned to a fixed fired_at timestamp so the assertion is exact (next_fire == "2026-04-29T09:00:00+00:00") instead of a "different from original" near-tautology that depends on datetime.now(). Docs: - docs/reference/configuration.md: clarified that the scheduler's enable/disable lives in YAML (middleware.scheduler), while backend selection and runtime knobs are env-driven. Repositioned SCHEDULER_DISABLED as the runtime escape hatch. Co-Authored-By: Claude Opus 4.7 (1M context) --- docs/reference/configuration.md | 4 +-- scheduler/local.py | 42 ++++++++++++++++++---- server.py | 63 +++++++++++++++++++++------------ tests/test_scheduler_local.py | 15 ++++---- 4 files changed, 85 insertions(+), 39 deletions(-) diff --git a/docs/reference/configuration.md b/docs/reference/configuration.md index 872bb37..2913700 100644 --- a/docs/reference/configuration.md +++ b/docs/reference/configuration.md @@ -88,7 +88,7 @@ The bundled store is sqlite + FTS5 (with an automatic LIKE fallback when FTS5 is ## Scheduler -The bundled scheduler is configured entirely via environment, not YAML, so the same image can be deployed under either backend without rebuilding.
See [Schedule future work](/guides/scheduler) for the full guide. +Scheduler **enable/disable** is YAML-controlled (`middleware.scheduler` above) so the drawer can flip it without a restart. Backend **selection and runtime knobs** (which backend, where to write the sqlite, where to publish, etc.) are env-driven so the same container image can run under either backend without a rebuild. See [Schedule future work](/guides/scheduler) for the full guide. | Env var | Default | What | |---|---|---| @@ -97,4 +97,4 @@ The bundled scheduler is configured entirely via environment, not YAML, so the s | `WORKSTACEAN_TOPIC_PREFIX` | `cron.` | Override the bus topic the adapter fires on, when your Workstacean install uses a different convention. | | `SCHEDULER_DB_DIR` | `/sandbox/scheduler` | Local backend: parent directory for `/jobs.db`. Falls back to `~/.protoagent/scheduler//jobs.db` when unwritable. | | `SCHEDULER_INVOKE_URL` | `http://127.0.0.1:` | Local backend: where to POST `message/send` when a job fires. Override only if the agent's A2A endpoint isn't on localhost. | -| `SCHEDULER_DISABLED` | unset | Set to `1` / `true` to drop the scheduler tools entirely. | +| `SCHEDULER_DISABLED` | unset | Runtime escape hatch — set to `1` / `true` to drop the scheduler tools entirely without editing YAML. `middleware.scheduler: false` is the canonical opt-out. | diff --git a/scheduler/local.py b/scheduler/local.py index 635d2f4..37187cc 100644 --- a/scheduler/local.py +++ b/scheduler/local.py @@ -48,9 +48,17 @@ def _resolve_db_path(db_dir: str | Path | None, agent_name: str) -> Path: - """Pick a writable jobs.db path namespaced by agent name.""" + """Pick a writable jobs.db path namespaced by agent name. 
+ + ``agent_name`` is sanitized to a single path segment before being + appended — operators set it via env or YAML, but defence in depth + against a value like ``../etc/passwd`` or ``/tmp/elsewhere`` is + cheap and prevents an exotic typo from putting a sqlite file + outside the configured scheduler dir. + """ + safe_name = _safe_segment(agent_name) raw = os.environ.get("SCHEDULER_DB_DIR") or db_dir or DEFAULT_DB_DIR - base = Path(str(raw)).expanduser() / agent_name + base = Path(str(raw)).expanduser() / safe_name try: base.mkdir(parents=True, exist_ok=True) probe = base / ".write-probe" @@ -58,12 +66,27 @@ def _resolve_db_path(db_dir: str | Path | None, agent_name: str) -> Path: probe.unlink() return base / "jobs.db" except OSError: - fallback = Path.home() / ".protoagent" / "scheduler" / agent_name + fallback = Path.home() / ".protoagent" / "scheduler" / safe_name fallback.mkdir(parents=True, exist_ok=True) log.info("[scheduler] %s not writable; using %s instead", base, fallback) return fallback / "jobs.db" +def _safe_segment(name: str) -> str: + """Reduce ``name`` to a single safe path segment. + + Replaces path separators, ``..``, and absolute-path prefixes with + underscores; falls back to ``"default"`` when nothing usable + remains. Preserves the common slug shape (``gina-personal``, + ``ginavision``) without surprises. + """ + if not name: + return "default" + cleaned = name.replace("/", "_").replace("\\", "_").replace("..", "_") + cleaned = cleaned.lstrip(".").strip() + return cleaned or "default" + + def _now_iso() -> str: return datetime.now(UTC).isoformat() @@ -363,10 +386,15 @@ async def _fire(self, job: Job) -> bool: "role": "user", "parts": [{"kind": "text", "text": job.prompt}], "messageId": message_id, - # Carry the originating job id so observers can tell - # this turn was scheduler-driven, not user-driven. 
- "metadata": {"scheduler_job_id": job.id, "scheduler_kind": "local"}, - } + }, + # Custom metadata goes at params.metadata — that's + # where a2a_handler._a2a_rpc reads it (see + # ``msg_metadata = params.get("metadata")``). Putting + # it inside params.message.metadata silently drops it. + "metadata": { + "scheduler_job_id": job.id, + "scheduler_kind": "local", + }, }, } try: diff --git a/server.py b/server.py index 195f308..8b10e4f 100644 --- a/server.py +++ b/server.py @@ -288,36 +288,44 @@ def _reload_langgraph_agent() -> tuple[bool, str]: # would leave the process serving the prior compiled _graph under # fresh _graph_config + rotated bearer auth on failure — the # metrics / card / auth all de-sync from what's actually running. + # Plan the scheduler swap *before* attempting the graph rebuild so + # the polling loop isn't torn down (or a fresh one started) until + # we know the rebuild will succeed. Three states: + # + # 1. Toggle flipped OFF, scheduler currently running → next graph + # uses None; we stop the running scheduler only after commit. + # 2. Toggle ON, none running (first-run after setup completes) → + # construct now (cheap), start only after commit. + # 3. Toggle ON, already running → reuse. Drawer saves don't tear + # down the polling loop. + # + # Env-driven config (WORKSTACEAN_API_BASE) only takes effect on + # full process restart; the YAML toggle is the canonical + # reload-time switch. 
+ global _scheduler + scheduler_wanted = getattr(new_config, "scheduler_enabled", True) + next_scheduler: "SchedulerBackend | None" + pending_start: "SchedulerBackend | None" = None + pending_stop: "SchedulerBackend | None" = None + if not scheduler_wanted: + next_scheduler = None + pending_stop = _scheduler # may be None — stopper is no-op then + elif _scheduler is None: + next_scheduler = _build_scheduler(new_config) + pending_start = next_scheduler + else: + next_scheduler = _scheduler + if is_setup_complete(): try: new_store = _build_knowledge_store(new_config) - # Three states for the scheduler on reload: - # - # 1. Toggle flipped OFF (was on) → stop + drop the running - # scheduler so the agent stops registering scheduler - # tools. The new graph is built with scheduler=None. - # 2. Toggle is ON and we have a running scheduler → reuse - # it. Drawer saves don't tear down the polling loop. - # 3. Toggle is ON but _scheduler is None (first-run after - # setup completes) → construct + start. - # - # Env-driven config (WORKSTACEAN_API_BASE) only takes - # effect on full process restart; the YAML toggle is the - # canonical reload-time switch. - global _scheduler - scheduler_wanted = getattr(new_config, "scheduler_enabled", True) - if not scheduler_wanted and _scheduler is not None: - _stop_scheduler_async(_scheduler) - _scheduler = None - elif scheduler_wanted and _scheduler is None: - _scheduler = _build_scheduler(new_config) - if _scheduler is not None: - _start_scheduler_async(_scheduler) new_graph = create_agent_graph( - new_config, knowledge_store=new_store, scheduler=_scheduler, + new_config, knowledge_store=new_store, scheduler=next_scheduler, ) except Exception as e: log.exception("[reload] graph rebuild failed") + # Scheduler state hasn't been committed yet — caller's + # running scheduler keeps polling, no orphaned tasks. 
return False, f"graph rebuild failed: {e}" else: new_graph = None @@ -334,6 +342,15 @@ def _reload_langgraph_agent() -> tuple[bool, str]: # before _main wires routes) — harmless. pass _graph = new_graph + # Commit the scheduler swap. start/stop are async — fire-and-forget + # onto the active loop so reload stays sync. We've already verified + # the graph rebuild succeeded; if start/stop fails we log but + # don't roll back (the agent is already serving the new graph). + _scheduler = next_scheduler + if pending_stop is not None: + _stop_scheduler_async(pending_stop) + if pending_start is not None: + _start_scheduler_async(pending_start) if new_graph is None: log.info("[reload] setup not complete — config reloaded, graph not compiled") diff --git a/tests/test_scheduler_local.py b/tests/test_scheduler_local.py index 06f4ef9..0e65ca1 100644 --- a/tests/test_scheduler_local.py +++ b/tests/test_scheduler_local.py @@ -170,13 +170,14 @@ def test_cron_rescheduled_after_fire(self, tmp_path): s = _make_scheduler(tmp_path) s.add_job("hi", "0 9 * * *", job_id="cron") job = s.list_jobs()[0] - original_next = job.next_fire - # Fire at "now" — next_fire should advance to the next 09:00 UTC - s._reschedule_or_delete(job, fired_at=datetime.now(UTC)) - new_next = s.list_jobs()[0].next_fire - assert new_next != original_next or original_next > datetime.now(UTC).isoformat() - # last_fire should be populated - assert s.list_jobs()[0].last_fire is not None + # Fire at a fixed timestamp — 2026-04-28T10:00:00Z is one hour + # past the 09:00 cron slot, so the next fire must be exactly + # 2026-04-29T09:00:00Z. 
+ fired_at = datetime(2026, 4, 28, 10, 0, 0, tzinfo=UTC) + s._reschedule_or_delete(job, fired_at=fired_at) + rescheduled = s.list_jobs()[0] + assert rescheduled.next_fire == "2026-04-29T09:00:00+00:00" + assert rescheduled.last_fire == fired_at.isoformat() class TestMissedFireRecovery: From 16337ca96337f457d778160adf77029b590f74fa Mon Sep 17 00:00:00 2001 From: Josh Mabry Date: Mon, 27 Apr 2026 20:58:30 -0700 Subject: [PATCH 22/24] docs: sync surface counts and add scheduler/memory tool coverage MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit After PRs #155 (default KB store + memory tools) and #156 (default scheduler), the docs claimed nine tools, missed scheduler tools entirely in the reference, and skipped scheduler env vars. This pass syncs every stale claim flagged by the audit. Updates: - docs/reference/starter-tools.md - Corrected count: nine → twelve - New tool sections: schedule_task, list_schedules, cancel_schedule (signatures, output formats, multi-agent isolation notes) - "adding your own" snippet now threads scheduler= through get_all_tools alongside knowledge_store= - Related links include the scheduler guide - docs/reference/environment-variables.md - New "Knowledge store" section: KNOWLEDGE_DB_PATH override + the ~/.protoagent fallback - New "Audit log" section: AUDIT_PATH (used by evals/verify.py) - New "Scheduler" section: WORKSTACEAN_API_BASE/KEY/TOPIC_PREFIX, SCHEDULER_DB_DIR/INVOKE_URL/DISABLED, plus the protoLabs operators callout pointing at the ava node + secrets manager for the actual key - docs/tutorials/first-agent.md - Wizard description now mentions all twelve tools and the four middleware toggles (added Scheduler alongside Audit/Memory/ Knowledge) - docs/tutorials/first-tool.md - "Where to go next" link copy: five → twelve - docs/guides/fork-the-template.md - Tool list paragraph corrected to all twelve, with the binding-by-backend split called out - docs/guides/customize-and-deploy.md - "Add 
domain tools" section now mentions memory + scheduler tool binding and the middleware.* toggles for opt-out - README.md - Starter tools row now lists all twelve, grouped 4+5+3 with backend bindings shown Co-Authored-By: Claude Opus 4.7 (1M context) --- README.md | 2 +- docs/guides/customize-and-deploy.md | 4 +- docs/guides/fork-the-template.md | 2 +- docs/reference/environment-variables.md | 31 ++++++++++++++ docs/reference/starter-tools.md | 55 ++++++++++++++++++++++--- docs/tutorials/first-agent.md | 2 +- docs/tutorials/first-tool.md | 2 +- 7 files changed, 88 insertions(+), 10 deletions(-) diff --git a/README.md b/README.md index 4a7036f..65ac158 100644 --- a/README.md +++ b/README.md @@ -30,7 +30,7 @@ rename / release-pipeline wiring. | Agent runtime | `graph/agent.py`, `server.py` | LangGraph `create_agent()` wired to the A2A handler, with streaming token capture for cost-v1 | | LLM gateway | `graph/llm.py` | OpenAI-compatible client pointed at LiteLLM — swap models by editing the gateway config, not the fork | | Subagents | `graph/subagents/config.py` | DeerFlow-pattern delegation via a `task()` tool; one placeholder `worker` ships | -| Starter tools | `tools/lg_tools.py` | Keyless general tools (`current_time`, `calculator` safe AST eval, `web_search` via DuckDuckGo, `fetch_url`) plus memory tools (`memory_ingest`, `memory_recall`, `memory_list`, `memory_stats`, `daily_log`) bound to the bundled store | +| Starter tools | `tools/lg_tools.py` | Twelve tools default-on: 4 keyless general (`current_time`, `calculator` safe AST eval, `web_search` via DuckDuckGo, `fetch_url`) + 5 memory (`memory_ingest`, `memory_recall`, `memory_list`, `memory_stats`, `daily_log`) bound to the KB store + 3 scheduler (`schedule_task`, `list_schedules`, `cancel_schedule`) bound to the scheduler backend | | Knowledge store | `knowledge/store.py` | sqlite + FTS5 (LIKE fallback). One `chunks` table for operator notes, daily-log entries, and conversation findings. 
Default-on; turn off with `middleware.knowledge: false` | | Scheduler | `scheduler/` | `schedule_task` / `list_schedules` / `cancel_schedule` tools backed by either a bundled sqlite scheduler or a Workstacean adapter (env-selected). Multi-agent-safe — every job is namespaced by `AGENT_NAME`. See [Schedule future work](./docs/guides/scheduler.md) | | Eval harness | `evals/` | Side-effect-verified A2A test harness — audit log + reply text + KB state. `python -m evals.runner` against a running agent. See [Eval your fork](./docs/guides/evals.md) | diff --git a/docs/guides/customize-and-deploy.md b/docs/guides/customize-and-deploy.md index 81fdeec..7ba875a 100644 --- a/docs/guides/customize-and-deploy.md +++ b/docs/guides/customize-and-deploy.md @@ -66,7 +66,9 @@ Replace with the skills your agent actually advertises over A2A. The `name` and ## 5. (Optional) Add domain tools -`tools/lg_tools.py` ships with `current_time`, `calculator`, `web_search`, `fetch_url`. Keep the ones you want, drop the rest, add your own. Update `get_all_tools()` at the bottom. Any tool returned from there becomes a checkbox in the wizard and drawer automatically. +`tools/lg_tools.py` ships with `current_time`, `calculator`, `web_search`, `fetch_url` plus 5 memory tools (`memory_ingest`, `memory_recall`, `memory_list`, `memory_stats`, `daily_log`) bound to the bundled `KnowledgeStore`. The 3 scheduler tools (`schedule_task`, `list_schedules`, `cancel_schedule`) are wired in separately by `server.py::_build_scheduler` when the scheduler backend is enabled. Keep the ones you want, drop the rest, add your own. Update `get_all_tools()` at the bottom of `tools/lg_tools.py`. Any tool returned from there (or from `_build_scheduler_tools`) becomes a checkbox in the wizard and drawer automatically. + +The memory tools are dropped automatically when `middleware.knowledge: false`; the scheduler tools when `middleware.scheduler: false`. 
See [Schedule future work](/guides/scheduler) and [Configuration](/reference/configuration#middleware) for the toggles. ## 6. (Optional) Configure subagents diff --git a/docs/guides/fork-the-template.md b/docs/guides/fork-the-template.md index d5472e4..3de87b0 100644 --- a/docs/guides/fork-the-template.md +++ b/docs/guides/fork-the-template.md @@ -43,7 +43,7 @@ Keep the `` / `` protocol block in `prompts.py` — the A2A ## 4. Replace the starter tools -`tools/lg_tools.py` ships with `current_time`, `calculator`, `web_search`, `fetch_url`. Keep what you want, drop the rest, add your own. Update `get_all_tools()` at the bottom of the file. +Twelve tools ship by default: `current_time`, `calculator`, `web_search`, `fetch_url` (keyless general) plus `memory_ingest`, `memory_recall`, `memory_list`, `memory_stats`, `daily_log` (bound to the bundled `KnowledgeStore`) plus `schedule_task`, `list_schedules`, `cancel_schedule` (bound to the scheduler backend). Keep what you want, drop the rest, add your own. Update `get_all_tools()` at the bottom of `tools/lg_tools.py`. See the [starter tools reference](/reference/starter-tools) for the shapes of the shipped ones. diff --git a/docs/reference/environment-variables.md b/docs/reference/environment-variables.md index e27d074..d74ea6b 100644 --- a/docs/reference/environment-variables.md +++ b/docs/reference/environment-variables.md @@ -36,6 +36,37 @@ Session memory is enabled by default. See [architecture § Session memory](/expl To persist memory across container restarts, mount a volume at whatever `MEMORY_PATH` resolves to. Without a volume the directory is ephemeral. +## Knowledge store + +The bundled `KnowledgeStore` (sqlite + FTS5) is enabled by default. See [Configuration § knowledge](/reference/configuration#knowledge) for the full guide. + +| Variable | Default | What | +|---|---|---| +| `KNOWLEDGE_DB_PATH` | (unset — uses YAML `knowledge.db_path`) | Runtime override for the sqlite path. 
Falls back to `~/.protoagent/knowledge/agent.db` when the resolved path is unwritable (e.g. running locally without `/sandbox`). | + +To opt out entirely, set `middleware.knowledge: false` in YAML. The memory tools (`memory_ingest`, `memory_recall`, etc.) are dropped from the agent loop when the store is disabled. + +## Audit log + +| Variable | Default | What | +|---|---|---| +| `AUDIT_PATH` | `/sandbox/audit/audit.jsonl` | Directory + filename of the JSONL audit log written by `AuditMiddleware`. Read by `evals/verify.py` for side-effect assertions. | + +## Scheduler + +The bundled scheduler is enabled by default. See [Schedule future work](/guides/scheduler) and [Configuration § scheduler](/reference/configuration#scheduler) for the full guide. **Backend selection** is env-driven; **enable/disable** lives in YAML (`middleware.scheduler`) so the drawer can toggle without a restart. + +| Variable | Default | What | +|---|---|---| +| `WORKSTACEAN_API_BASE` | (unset) | When set together with `WORKSTACEAN_API_KEY`, swaps the bundled `LocalScheduler` for the `WorkstaceanScheduler` HTTP adapter. | +| `WORKSTACEAN_API_KEY` | (unset) | Auth token sent as `X-API-Key` to Workstacean's `/publish`. | +| `WORKSTACEAN_TOPIC_PREFIX` | `cron.` | Override the bus topic the adapter fires on, when your Workstacean install uses a different convention. | +| `SCHEDULER_DB_DIR` | `/sandbox/scheduler` | Local backend: parent directory for `/jobs.db`. Falls back to `~/.protoagent/scheduler//jobs.db` when unwritable. | +| `SCHEDULER_INVOKE_URL` | `http://127.0.0.1:` | Local backend: where to POST `message/send` when a job fires. Override only if the agent's A2A endpoint isn't on localhost. | +| `SCHEDULER_DISABLED` | (unset) | Runtime escape hatch — set to `1` / `true` to drop the scheduler tools entirely without editing YAML. `middleware.scheduler: false` is the canonical opt-out. | + +> **protoLabs operators**: the fleet's Workstacean lives on the `ava` node. 
`WORKSTACEAN_API_KEY` is in the org's secrets manager under `secret-management → workstacean`. + ## Tracing (optional) | Variable | What | diff --git a/docs/reference/starter-tools.md b/docs/reference/starter-tools.md index 60d74d4..9ef37aa 100644 --- a/docs/reference/starter-tools.md +++ b/docs/reference/starter-tools.md @@ -1,11 +1,12 @@ # Starter tools -Nine tools ship in `tools/lg_tools.py`: +Twelve tools ship by default: - Four keyless general-purpose tools — `current_time`, `calculator`, `web_search`, `fetch_url` — that work without any state. - Five **memory tools** — `memory_ingest`, `memory_recall`, `memory_list`, `memory_stats`, `daily_log` — bound to the bundled `KnowledgeStore` (sqlite + FTS5, see [Configuration](/reference/configuration#knowledge)). +- Three **scheduler tools** — `schedule_task`, `list_schedules`, `cancel_schedule` — bound to the bundled scheduler backend (local sqlite or the Workstacean adapter, see [Schedule future work](/guides/scheduler)). -`get_all_tools(knowledge_store)` is the registry. When `knowledge_store` is `None` (the store is disabled in config) the memory tools are omitted automatically. +`get_all_tools(knowledge_store, scheduler)` is the registry. When `knowledge_store` is `None` the memory tools are omitted; when `scheduler` is `None` the scheduler tools are omitted. Both backends are constructed by default in `server.py`; opt out via `middleware.knowledge: false` / `middleware.scheduler: false` in `config/langgraph-config.yaml`. ## `current_time` @@ -158,6 +159,47 @@ async def daily_log(content: str) -> str Convenience wrapper around `memory_ingest` that writes to `domain='daily-log'` with today's UTC date as the heading. Same-day entries cluster under the same heading for `memory_list(domain='daily-log')`. +## `schedule_task` + +```python +@tool +async def schedule_task(prompt: str, when: str, job_id: str | None = None) -> str +``` + +Persist a future invocation. 
The agent receives `prompt` as a fresh turn when the schedule fires. + +`when` is either a 5-field cron expression (`"0 9 * * 1-5"` = every weekday at 9am) or an ISO-8601 datetime (`"2026-05-01T15:00:00"` = once at 3pm UTC on May 1). Backends auto-detect. + +`job_id` is optional — auto-generated as `-` when omitted. You'll need it later for `cancel_schedule`. + +Output: `"Scheduled job next at ."` on success. Returns `"Error: ..."` on malformed `when` or backend failure. + +Prompts are self-contained — the agent has no memory of the scheduling moment when the task fires, so write the prompt as a fresh turn ("review last week's pipeline incidents and post a summary"), not a reference ("do that thing we discussed"). + +## `list_schedules` + +```python +@tool +async def list_schedules() -> str +``` + +List the current scheduled jobs for *this* agent. Multi-agent isolation: each agent only sees jobs it created. + +Output: one job per line with id, next-fire timestamp, schedule, and prompt preview. Returns `"No scheduled jobs."` when empty. + +The Workstacean adapter intentionally returns `[]` (Workstacean owns scheduling state and its `list` action publishes asynchronously to a topic). Run the local backend or query Workstacean directly for live introspection there. + +## `cancel_schedule` + +```python +@tool +async def cancel_schedule(job_id: str) -> str +``` + +Cancel a scheduled job by id. Returns `"Canceled ."` or `"Error: no such job ."`. + +Cross-agent cancellation is blocked — `gina-personal` cannot cancel `gina-work`'s jobs even when sharing a sqlite path or a Workstacean install. 
+ ## Adding your own Follow the same pattern: @@ -180,13 +222,15 @@ async def my_tool(required_arg: str, optional_arg: int = 5) -> str: return f"Success: {result}" ``` -Then append it to the keyless tool list in `get_all_tools()` — keep the conditional `_build_memory_tools(knowledge_store)` extension below it so the bundled memory tools still ship when a store is configured: +Then append it to the keyless tool list in `get_all_tools()` — keep the two conditional extensions below it so the bundled memory + scheduler tools still ship when their backends are configured: ```python -def get_all_tools(knowledge_store=None): +def get_all_tools(knowledge_store=None, scheduler=None): tools = [current_time, calculator, web_search, fetch_url, my_tool] if knowledge_store is not None: tools.extend(_build_memory_tools(knowledge_store)) + if scheduler is not None: + tools.extend(_build_scheduler_tools(scheduler)) return tools ``` @@ -195,5 +239,6 @@ See [Write your first tool](/tutorials/first-tool) for the full walkthrough. ## Related - [Configure subagents](/guides/subagents) — tools are allowlisted per subagent -- [Environment variables](/reference/environment-variables) — SSRF allowlist vars affect `fetch_url` +- [Environment variables](/reference/environment-variables) — SSRF allowlist vars affect `fetch_url`; scheduler backend selection lives there too - [Eval your fork](/guides/evals) — the eval harness exercises every tool listed here end-to-end +- [Schedule future work](/guides/scheduler) — the firing model + multi-agent isolation story behind the scheduler tools diff --git a/docs/tutorials/first-agent.md b/docs/tutorials/first-agent.md index ce12744..e58ad76 100644 --- a/docs/tutorials/first-agent.md +++ b/docs/tutorials/first-agent.md @@ -40,7 +40,7 @@ Walk through the four steps: 1. **Connect to your model.** Paste your API base URL (`https://api.openai.com/v1` for OpenAI direct, `http://localhost:4000/v1` for a local LiteLLM gateway) and API key. 
Click **Test connection & fetch models** — the dropdown fills with whatever the endpoint actually exposes. Pick one. 2. **Name your agent.** Short lowercase slug (e.g. `product-director`). Pick a persona preset — **Generic Assistant** is the safe default; **Research** / **Coding** / **Blank** are the alternatives — and click **Load preset into SOUL.md**. Edit the loaded text if you want to make it specific to your agent. -3. **Tools & middleware.** All nine starter tools (`current_time`, `calculator`, `web_search`, `fetch_url`, plus the memory tools `memory_ingest` / `memory_recall` / `memory_list` / `memory_stats` / `daily_log`) are enabled by default. Leave **Audit**, **Memory**, and **Knowledge** middleware on — the template ships a working sqlite + FTS5 store under `/sandbox/knowledge/agent.db` (falls back to `~/.protoagent/knowledge/agent.db` outside Docker). +3. **Tools & middleware.** All twelve starter tools are enabled by default — four keyless general (`current_time`, `calculator`, `web_search`, `fetch_url`), five memory (`memory_ingest`, `memory_recall`, `memory_list`, `memory_stats`, `daily_log`), and three scheduler (`schedule_task`, `list_schedules`, `cancel_schedule`). Leave **Audit**, **Memory**, **Knowledge**, and **Scheduler** middleware on — the template ships a working sqlite + FTS5 store under `/sandbox/knowledge/agent.db` and a sqlite-backed scheduler under `/sandbox/scheduler//jobs.db`, both with `~/.protoagent/...` fallbacks outside Docker. 4. **Optional — you, security, autostart.** Your name makes the agent address you directly. A2A auth token blank for local dev, set it before you expose the port. "Launch this agent automatically on login" installs a macOS LaunchAgent so the server is up after every reboot without remembering to `python server.py`. Hit **Launch agent**. The wizard closes, the chat UI appears, and the Configuration drawer on the right is now populated with your choices. 
diff --git a/docs/tutorials/first-tool.md b/docs/tutorials/first-tool.md index 056a8e4..af3b767 100644 --- a/docs/tutorials/first-tool.md +++ b/docs/tutorials/first-tool.md @@ -108,5 +108,5 @@ The template runs tests via `pytest` with `pytest-asyncio` in auto mode — no e ## Where to go next - [Add a custom skill](/guides/add-a-skill) — advertise new capabilities on the agent card so A2A callers can find them -- [Starter tools reference](/reference/starter-tools) — the shapes of the five tools that ship +- [Starter tools reference](/reference/starter-tools) — the shapes of all twelve tools that ship by default - [Configure subagents](/guides/subagents) — add specialized delegates beyond the placeholder `worker` From a4b7c65f481affee9a0ed9840cefb18609a7d814 Mon Sep 17 00:00:00 2001 From: Josh Mabry Date: Mon, 25 May 2026 00:08:51 -0700 Subject: [PATCH 23/24] chore: conform to workspace-config standard (.beads + .automaker baseline) Scaffolds .beads/issues.jsonl + .automaker/settings.json + standard .gitignore (narrowing any blanket ignore) via release-tools init-workspace-config. Any remaining runner-rule errors need per-job migration to namespace-profile-protolabs-linux. 
--- .automaker/settings.json | 3 +++ .gitignore | 7 ++++++- 2 files changed, 9 insertions(+), 1 deletion(-) create mode 100644 .automaker/settings.json diff --git a/.automaker/settings.json b/.automaker/settings.json new file mode 100644 index 0000000..61a2092 --- /dev/null +++ b/.automaker/settings.json @@ -0,0 +1,3 @@ +{ + "version": 1 +} diff --git a/.gitignore b/.gitignore index d28c925..fe99094 100644 --- a/.gitignore +++ b/.gitignore @@ -19,7 +19,6 @@ build/ # protoLabs Studio agent workspace (never commit) .automaker-lock -.automaker/ .claude/ .worktrees/ worktrees/ @@ -28,3 +27,9 @@ worktrees/ node_modules/ docs/.vitepress/dist/ docs/.vitepress/cache/ + +# protoLabs workspace-config standard +.beads/beads.db +.automaker/features/ +.automaker/checkpoints/ +.automaker/trajectory/ From ccd665626730f3951d0e1950a1da62ca4c41e688 Mon Sep 17 00:00:00 2001 From: Automaker Date: Mon, 25 May 2026 00:19:53 -0700 Subject: [PATCH 24/24] chore(ci): migrate node/git workflows to org-owned runner MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Migrate plain-Linux jobs to namespace-profile-protolabs-linux; annotate jobs that genuinely require GitHub-hosted infra. MIGRATED: - docs.yml build (vitepress docs build + upload-pages-artifact) - prepare-release.yml prepare (python version bump + gh PR ops) ANNOTATED (allow-hosted-runner): - docker-publish.yml build-and-push — docker buildx + registry push - docs.yml deploy — GitHub Pages deploy needs hosted Pages environment - release.yml release — docker buildx + registry push (+ OIDC attestation) Also finishes the .beads scaffold that the conform commit referenced but never landed: narrow the blanket *.jsonl ignore so .beads/issues.jsonl is committed (workspace-config requires the git-friendly export tracked). 
Co-Authored-By: Claude Opus 4.7 --- .beads/issues.jsonl | 0 .github/workflows/docker-publish.yml | 1 + .github/workflows/docs.yml | 5 +++-- .github/workflows/prepare-release.yml | 10 +++++----- .github/workflows/release.yml | 9 +++++---- .gitignore | 2 ++ 6 files changed, 16 insertions(+), 11 deletions(-) create mode 100644 .beads/issues.jsonl diff --git a/.beads/issues.jsonl b/.beads/issues.jsonl new file mode 100644 index 0000000..e69de29 diff --git a/.github/workflows/docker-publish.yml b/.github/workflows/docker-publish.yml index a931d29..0218b0a 100644 --- a/.github/workflows/docker-publish.yml +++ b/.github/workflows/docker-publish.yml @@ -29,6 +29,7 @@ env: jobs: build-and-push: + # workspace-config: allow-hosted-runner docker buildx build + registry push runs-on: ubuntu-latest timeout-minutes: 30 diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml index c619d94..085688e 100644 --- a/.github/workflows/docs.yml +++ b/.github/workflows/docs.yml @@ -3,7 +3,7 @@ name: Deploy docs to GitHub Pages on: push: branches: [main] - paths: ['docs/**', 'package.json', 'package-lock.json'] + paths: ["docs/**", "package.json", "package-lock.json"] workflow_dispatch: permissions: @@ -18,7 +18,7 @@ concurrency: jobs: build: - runs-on: ubuntu-latest + runs-on: namespace-profile-protolabs-linux steps: - uses: actions/checkout@v4 - uses: actions/setup-node@v4 @@ -33,6 +33,7 @@ jobs: deploy: needs: build + # workspace-config: allow-hosted-runner GitHub Pages deploy requires the hosted Pages environment runs-on: ubuntu-latest environment: name: github-pages diff --git a/.github/workflows/prepare-release.yml b/.github/workflows/prepare-release.yml index f2f444a..b9ab6cf 100644 --- a/.github/workflows/prepare-release.yml +++ b/.github/workflows/prepare-release.yml @@ -1,4 +1,4 @@ -name: 'Prepare Release' +name: "Prepare Release" # Runs after any non-release PR merges to main, OR manually. 
# Bumps the version in pyproject.toml, opens a prepare-release/vX.Y.Z @@ -14,13 +14,13 @@ on: workflow_dispatch: inputs: bump: - description: 'Version bump type' + description: "Version bump type" required: true type: choice options: [patch, minor, major] default: patch dry_run: - description: 'Preview only — no branch or PR created' + description: "Preview only — no branch or PR created" type: boolean default: false @@ -31,7 +31,7 @@ concurrency: jobs: prepare: name: Prepare Release - runs-on: ubuntu-latest + runs-on: namespace-profile-protolabs-linux # Guards: # - repo-scope: don't run in forks # - workflow_dispatch always runs @@ -63,7 +63,7 @@ jobs: - uses: actions/setup-python@v5 with: - python-version: '3.12' + python-version: "3.12" - name: Bump version run: python scripts/version.py ${{ inputs.bump || 'patch' }} diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 61a4c65..b405965 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -1,4 +1,4 @@ -name: 'Release' +name: "Release" # Triggered by a push of a vX.Y.Z tag (cut by prepare-release.yml). # Builds and pushes the stable semver Docker tags, creates a GitHub @@ -9,11 +9,11 @@ name: 'Release' on: push: tags: - - 'v*.*.*' + - "v*.*.*" workflow_dispatch: inputs: tag: - description: 'Tag to release (e.g. v0.1.1)' + description: "Tag to release (e.g. 
v0.1.1)" required: true env: @@ -23,6 +23,7 @@ env: jobs: release: name: Release + # workspace-config: allow-hosted-runner docker buildx build + registry push runs-on: ubuntu-latest if: github.repository == 'protoLabsAI/protoAgent' permissions: @@ -130,7 +131,7 @@ jobs: - uses: actions/setup-node@v4 with: - node-version: '20' + node-version: "20" - name: Post release notes to Discord continue-on-error: true diff --git a/.gitignore b/.gitignore index fe99094..b159843 100644 --- a/.gitignore +++ b/.gitignore @@ -30,6 +30,8 @@ docs/.vitepress/cache/ # protoLabs workspace-config standard .beads/beads.db +# Git-friendly issue export must be committed despite the blanket *.jsonl ignore. +!.beads/issues.jsonl .automaker/features/ .automaker/checkpoints/ .automaker/trajectory/