diff --git a/.automaker/settings.json b/.automaker/settings.json new file mode 100644 index 0000000..61a2092 --- /dev/null +++ b/.automaker/settings.json @@ -0,0 +1,3 @@ +{ + "version": 1 +} diff --git a/.beads/issues.jsonl b/.beads/issues.jsonl new file mode 100644 index 0000000..e69de29 diff --git a/.github/workflows/docker-publish.yml b/.github/workflows/docker-publish.yml index a931d29..0218b0a 100644 --- a/.github/workflows/docker-publish.yml +++ b/.github/workflows/docker-publish.yml @@ -29,6 +29,7 @@ env: jobs: build-and-push: + # workspace-config: allow-hosted-runner docker buildx build + registry push runs-on: ubuntu-latest timeout-minutes: 30 diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml index c619d94..085688e 100644 --- a/.github/workflows/docs.yml +++ b/.github/workflows/docs.yml @@ -3,7 +3,7 @@ name: Deploy docs to GitHub Pages on: push: branches: [main] - paths: ['docs/**', 'package.json', 'package-lock.json'] + paths: ["docs/**", "package.json", "package-lock.json"] workflow_dispatch: permissions: @@ -18,7 +18,7 @@ concurrency: jobs: build: - runs-on: ubuntu-latest + runs-on: namespace-profile-protolabs-linux steps: - uses: actions/checkout@v4 - uses: actions/setup-node@v4 @@ -33,6 +33,7 @@ jobs: deploy: needs: build + # workspace-config: allow-hosted-runner GitHub Pages deploy requires the hosted Pages environment runs-on: ubuntu-latest environment: name: github-pages diff --git a/.github/workflows/prepare-release.yml b/.github/workflows/prepare-release.yml index f2f444a..b9ab6cf 100644 --- a/.github/workflows/prepare-release.yml +++ b/.github/workflows/prepare-release.yml @@ -1,4 +1,4 @@ -name: 'Prepare Release' +name: "Prepare Release" # Runs after any non-release PR merges to main, OR manually. 
# Bumps the version in pyproject.toml, opens a prepare-release/vX.Y.Z @@ -14,13 +14,13 @@ on: workflow_dispatch: inputs: bump: - description: 'Version bump type' + description: "Version bump type" required: true type: choice options: [patch, minor, major] default: patch dry_run: - description: 'Preview only — no branch or PR created' + description: "Preview only — no branch or PR created" type: boolean default: false @@ -31,7 +31,7 @@ concurrency: jobs: prepare: name: Prepare Release - runs-on: ubuntu-latest + runs-on: namespace-profile-protolabs-linux # Guards: # - repo-scope: don't run in forks # - workflow_dispatch always runs @@ -63,7 +63,7 @@ jobs: - uses: actions/setup-python@v5 with: - python-version: '3.12' + python-version: "3.12" - name: Bump version run: python scripts/version.py ${{ inputs.bump || 'patch' }} diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 61a4c65..b405965 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -1,4 +1,4 @@ -name: 'Release' +name: "Release" # Triggered by a push of a vX.Y.Z tag (cut by prepare-release.yml). # Builds and pushes the stable semver Docker tags, creates a GitHub @@ -9,11 +9,11 @@ name: 'Release' on: push: tags: - - 'v*.*.*' + - "v*.*.*" workflow_dispatch: inputs: tag: - description: 'Tag to release (e.g. v0.1.1)' + description: "Tag to release (e.g. 
v0.1.1)" required: true env: @@ -23,6 +23,7 @@ env: jobs: release: name: Release + # workspace-config: allow-hosted-runner docker buildx build + registry push runs-on: ubuntu-latest if: github.repository == 'protoLabsAI/protoAgent' permissions: @@ -130,7 +131,7 @@ jobs: - uses: actions/setup-node@v4 with: - node-version: '20' + node-version: "20" - name: Post release notes to Discord continue-on-error: true diff --git a/.gitignore b/.gitignore index cd98bb9..b159843 100644 --- a/.gitignore +++ b/.gitignore @@ -3,7 +3,13 @@ __pycache__/ *.pyo .env .venv/ +.venv-*/ venv/ + +# Local-run artifacts — autostart stdout/stderr logs + memory middleware +# fallback directory when /sandbox is not available. +logs/ +.proto/ *.egg-info/ dist/ build/ @@ -13,7 +19,6 @@ build/ # protoLabs Studio agent workspace (never commit) .automaker-lock -.automaker/ .claude/ .worktrees/ worktrees/ @@ -22,3 +27,11 @@ worktrees/ node_modules/ docs/.vitepress/dist/ docs/.vitepress/cache/ + +# protoLabs workspace-config standard +.beads/beads.db +# Git-friendly issue export must be committed despite the blanket *.jsonl ignore. +!.beads/issues.jsonl +.automaker/features/ +.automaker/checkpoints/ +.automaker/trajectory/ diff --git a/Dockerfile b/Dockerfile index a52723c..0b1fe77 100644 --- a/Dockerfile +++ b/Dockerfile @@ -26,7 +26,7 @@ RUN useradd -m -s /bin/bash -u ${SANDBOX_UID} sandbox # auth, add them here. The ddgs + beautifulsoup4 pair powers the # starter web_search / fetch_url tools; drop them if you strip those. 
RUN pip install --no-cache-dir \ - gradio httpx uvicorn langfuse prometheus-client pyyaml \ + gradio httpx uvicorn langfuse prometheus-client pyyaml 'ruamel.yaml>=0.18' \ langchain langchain-openai langgraph websockets \ ddgs beautifulsoup4 @@ -40,6 +40,27 @@ RUN chmod +x /opt/protoagent/entrypoint.sh RUN mkdir -p /sandbox /tmp/sandbox /sandbox/audit /sandbox/knowledge \ && chown -R sandbox:sandbox /sandbox /tmp/sandbox +# Make /opt/protoagent/config writable by the sandbox user so the +# drawer and setup wizard can persist edits from inside the container. +RUN chown -R sandbox:sandbox /opt/protoagent/config + +# Declare config as a volume so setup completion (``.setup-complete`` +# marker + any YAML / SOUL.md edits) survives ``docker run`` without +# a -v flag. +# +# Lifecycle note: without an explicit mount, Docker creates an +# ANONYMOUS volume on every ``docker run``. Those accumulate and the +# volume is NOT removed along with the container unless you ran it +# with ``--rm`` or delete it with ``docker rm -v``. For long-lived deployments, use a named volume or a +# host mount so upgrades don't silently carry stale config forward: +# +# docker run -v my-agent-config:/opt/protoagent/config my-agent:latest +# +# or a bind mount: +# +# docker run -v /srv/my-agent/config:/opt/protoagent/config my-agent:latest +VOLUME ["/opt/protoagent/config"] + ENV PYTHONPATH=/opt/protoagent USER sandbox diff --git a/README.md b/README.md index 6bf76ee..65ac158 100644 --- a/README.md +++ b/README.md @@ -12,9 +12,15 @@ close to a rewrite of `SOUL.md`, `graph/prompts.py`, and Quinn was the first agent built on this template — it's a good example of what a filled-in fork looks like end-to-end. -Start a new agent by clicking **"Use this template"** at the top -of the GitHub repo. See [TEMPLATE.md](./TEMPLATE.md) for the -step-by-step fork checklist. 
+**Try it in 5 minutes:** clone, `pip install -r requirements.txt`, +`python server.py`, open <http://localhost:7870>, and walk the +setup wizard — no forking, no `sed`, no Docker required to get +your first agent talking. See the [first-agent tutorial](./docs/tutorials/first-agent.md). + +**When you're ready to ship your own:** click **"Use this template"** +at the top of the GitHub repo, then follow [Customize & +deploy](./docs/guides/customize-and-deploy.md) for the fork / +rename / release-pipeline wiring. ## What you get out of the box @@ -24,35 +30,41 @@ step-by-step fork checklist. | Agent runtime | `graph/agent.py`, `server.py` | LangGraph `create_agent()` wired to the A2A handler, with streaming token capture for cost-v1 | | LLM gateway | `graph/llm.py` | OpenAI-compatible client pointed at LiteLLM — swap models by editing the gateway config, not the fork | | Subagents | `graph/subagents/config.py` | DeerFlow-pattern delegation via a `task()` tool; one placeholder `worker` ships | -| Starter tools | `tools/lg_tools.py` | Free, keyless tools so a fresh fork can demo real behaviour: `echo`, `current_time`, `calculator` (safe AST eval), `web_search` (DuckDuckGo), `fetch_url` | +| Starter tools | `tools/lg_tools.py` | Twelve tools default-on: 4 keyless general (`current_time`, `calculator` safe AST eval, `web_search` via DuckDuckGo, `fetch_url`) + 5 memory (`memory_ingest`, `memory_recall`, `memory_list`, `memory_stats`, `daily_log`) bound to the KB store + 3 scheduler (`schedule_task`, `list_schedules`, `cancel_schedule`) bound to the scheduler backend | +| Knowledge store | `knowledge/store.py` | sqlite + FTS5 (LIKE fallback). One `chunks` table for operator notes, daily-log entries, and conversation findings. Default-on; turn off with `middleware.knowledge: false` | +| Scheduler | `scheduler/` | `schedule_task` / `list_schedules` / `cancel_schedule` tools backed by either a bundled sqlite scheduler or a Workstacean adapter (env-selected). 
Multi-agent-safe — every job is namespaced by `AGENT_NAME`. See [Schedule future work](./docs/guides/scheduler.md) | +| Eval harness | `evals/` | Side-effect-verified A2A test harness — audit log + reply text + KB state. `python -m evals.runner` against a running agent. See [Eval your fork](./docs/guides/evals.md) | | Tracing | `tracing.py` | Langfuse trace_session with distributed `a2a.trace` propagation and the OTel cross-context-detach filter | | Observability | `metrics.py`, `audit.py` | Prometheus metrics with per-agent prefix, JSONL audit log with trace IDs | | Output protocol | `graph/output_format.py` | `` / `` parsing so the model can think without it leaking to users | | UI | `chat_ui.py`, `static/` | Gradio chat with PWA shell, dark theme, offline fallback | | Release pipeline | `.github/workflows/*.yml` | Autonomous semver bumps, GHCR image push, GitHub release with filtered notes, optional Discord post | -## Quickstart +## Quickstart — from zero to chatting in 5 minutes ```bash -# 1. Click "Use this template" on GitHub, or: -gh repo create protoLabsAI/my-agent \ - --template protoLabsAI/protoAgent \ - --public --clone - +# 1. Get the code (no fork needed for a first run) +git clone https://github.com/protoLabsAI/protoAgent.git my-agent cd my-agent -# 2. Rename the agent (one env var, read by server.py, metrics, tracing) -export AGENT_NAME=my-agent +# 2. Install deps into a venv +python -m venv .venv && source .venv/bin/activate +pip install -r requirements.txt -# 3. Boot the container -docker build -t my-agent:local . -docker run --rm -p 7870:7870 -e AGENT_NAME=my-agent my-agent:local +# 3. Run the server — no env vars required +python server.py -# 4. Hit the agent card -curl http://localhost:7870/.well-known/agent-card.json +# 4. Open the wizard — pick your endpoint, pick a model, name the +# agent, pick a persona preset, hit Launch. The chat UI appears +# on the same page. 
+open http://localhost:7870 ``` -See [TEMPLATE.md](./TEMPLATE.md) for the full fork checklist. +[First-agent tutorial](./docs/tutorials/first-agent.md) walks +through every wizard step with screenshots. + +Once you're happy and want to ship it as your own image in your +own GHCR: [Customize & deploy](./docs/guides/customize-and-deploy.md). ## Architecture diff --git a/TEMPLATE.md b/TEMPLATE.md index 2997277..4408c90 100644 --- a/TEMPLATE.md +++ b/TEMPLATE.md @@ -1,5 +1,16 @@ # Fork checklist +> **Most of what used to be in this file is now a runtime wizard** +> that runs on first page load. Model, tools, persona, name, auth, +> autostart — all captured without editing code. See +> [first-agent tutorial](./docs/tutorials/first-agent.md). +> +> This checklist is only for forks that want to ship their own +> container image under their own GitHub org — the structural +> changes the wizard can't do. For most of that, the new +> [Customize & deploy](./docs/guides/customize-and-deploy.md) +> guide is the canonical source. This file stays for back-compat. + You clicked "Use this template" (or ran `gh repo create --template`). Now what? @@ -61,10 +72,10 @@ handler's output extraction depends on it. ## 4. Add your real tools `tools/lg_tools.py` ships with a small keyless starter set so a -fresh clone can demonstrate a real research loop: `echo`, -`current_time`, `calculator` (safe AST eval — no `eval()`), -`web_search` (DuckDuckGo via `ddgs`), and `fetch_url`. Keep the -ones you want, drop the rest, and add your own: +fresh clone can demonstrate a real research loop: `current_time`, +`calculator` (safe AST eval — no `eval()`), `web_search` (DuckDuckGo +via `ddgs`), and `fetch_url`. Keep the ones you want, drop the rest, +and add your own: ```python from langchain_core.tools import tool @@ -156,6 +167,41 @@ your fork. 
A useful pattern: - Extend `tests/test_a2a_integration.py` with assertions for your declared skills + extensions on the agent card +For end-to-end behaviour testing — "when the operator asks X, does +the right tool actually fire and the right row land in the KB?" — +the template ships an eval harness under `evals/`: + +```bash +python -m evals.runner # against a running agent +python -m evals.runner --category tool +``` + +See [Eval your fork](./docs/guides/evals.md) for what each case +asserts, how the three assertion channels work, and how to add +cases for your fork's new tools. + +## 9b. Scheduler — local sqlite or Workstacean + +The bundled scheduler ships three agent tools — `schedule_task`, +`list_schedules`, `cancel_schedule` — backed by either a local +sqlite poller or a Workstacean adapter, selected at startup via env: + +```bash +# Default: local sqlite, persists at /sandbox/scheduler/<AGENT_NAME>/jobs.db +python server.py + +# Workstacean: set both and restart +export WORKSTACEAN_API_BASE=http://your-workstacean:3000 +export WORKSTACEAN_API_KEY=... +python server.py +``` + +Multi-fork safety: every job is namespaced by `AGENT_NAME`, so +spinning up `gina-personal` next to `gina-work` (or any number of +ginas under one Workstacean) doesn't cross-fire prompts. See +[Schedule future work](./docs/guides/scheduler.md) for the full +firing model and integration notes. + ## 9a. Understand the skill loop protoAgent's skill loop lets your agent learn from experience automatically. diff --git a/a2a_handler.py b/a2a_handler.py index 69b9520..7efecc7 100644 --- a/a2a_handler.py +++ b/a2a_handler.py @@ -919,6 +919,26 @@ def _check_auth(request: Request, api_key: str) -> None: # ── Route factory ───────────────────────────────────────────────────────────── +# Module-level mutable holder for the bearer token so hosts can +# update it at runtime without re-registering routes (e.g. when the +# setup wizard captures a token post-boot). 
``register_a2a_routes`` +# seeds this from its ``auth_token`` argument (or ``A2A_AUTH_TOKEN`` +# env as fallback); ``set_a2a_token`` updates it live. Closures inside +# ``register_a2a_routes`` read ``_A2A_TOKEN[0]`` on every request, so +# a mutation is picked up by the next incoming call. +_A2A_TOKEN: list[str | None] = [None] + + +def set_a2a_token(token: str | None) -> None: + """Update the active A2A bearer token at runtime. + + Called by the host (e.g. ``server.py``) after the wizard / drawer + changes ``auth.token`` in the YAML — without this, bearer auth + captured at register time would stay stale until process restart. + """ + _A2A_TOKEN[0] = (token or "").strip() or None + + def register_a2a_routes( app: FastAPI, chat_stream_fn_factory: Callable[..., AsyncGenerator], @@ -926,29 +946,39 @@ def register_a2a_routes( api_key: str, agent_card: dict, register_card_route: bool = True, + auth_token: str = "", ) -> None: """Register all A2A routes on *app* and update *agent_card* capabilities. Host apps that already serve the agent card themselves (e.g. at multiple well-known paths for sdk compat) should pass ``register_card_route=False`` so FastAPI does not raise on a duplicate route registration. + + ``auth_token`` seeds the bearer-token check. When empty, falls + back to the ``A2A_AUTH_TOKEN`` env var. Hosts can update the + active token post-registration via ``set_a2a_token(...)`` (e.g. + after a wizard-driven config reload) without needing a restart. """ # ── Bearer token authentication ─────────────────────────────────────────── - _raw_a2a_token = os.environ.get("A2A_AUTH_TOKEN", "") - _a2a_token: str | None = _raw_a2a_token.strip() or None - if not _a2a_token: + # Seed order: explicit arg > env. Stored in the module-level holder + # so mutations propagate to the closure below. 
+ seed = (auth_token or os.environ.get("A2A_AUTH_TOKEN", "") or "").strip() + _A2A_TOKEN[0] = seed or None + if _A2A_TOKEN[0] is None: logger.warning( "[a2a] A2A auth token not configured — endpoint is open" ) def _check_bearer_auth(request: Request) -> None: - """Validate Authorization: Bearer against A2A_AUTH_TOKEN. + """Validate Authorization: Bearer against the active + token. No-ops when unset. Raises HTTP 401 on missing/invalid. - No-ops when A2A_AUTH_TOKEN is unset (open mode). - Raises HTTP 401 on missing or invalid token. + Reads ``_A2A_TOKEN[0]`` on every call so runtime updates via + ``set_a2a_token`` are honored without route re-registration. """ - if not _a2a_token: + active = _A2A_TOKEN[0] + if not active: return auth_header = request.headers.get("Authorization", "") if not auth_header.startswith("Bearer "): @@ -957,7 +987,7 @@ def _check_bearer_auth(request: Request) -> None: detail="Unauthorized: expected 'Authorization: Bearer '", ) provided = auth_header[len("Bearer "):] - if not hmac.compare_digest(provided, _a2a_token): + if not hmac.compare_digest(provided, active): raise HTTPException(status_code=401, detail="Unauthorized: invalid bearer token") # ── Origin verification for SSE/streaming endpoints ─────────────────────── @@ -989,7 +1019,7 @@ def _check_origin(request: Request) -> None: agent_card.setdefault("capabilities", {}) agent_card["capabilities"]["streaming"] = True agent_card["capabilities"]["pushNotifications"] = True - if _a2a_token: + if _A2A_TOKEN[0]: agent_card.setdefault("securitySchemes", {}) agent_card["securitySchemes"]["bearer"] = { "type": "http", diff --git a/autostart.py b/autostart.py new file mode 100644 index 0000000..65ce5f8 --- /dev/null +++ b/autostart.py @@ -0,0 +1,266 @@ +"""OS-level autostart for the protoAgent server. + +Hooks the server into the OS so it launches on user login. 
Today +macOS is the only supported path (LaunchAgent plist); Linux and +Windows stubs return a clear "not yet supported" error so the +wizard surfaces that instead of silently failing. + +Design notes: + +- The source of truth for "should autostart be on?" is + ``runtime.autostart_on_boot`` in ``langgraph-config.yaml``. This + module only installs / removes the OS artifact — it doesn't + decide policy. The wizard and drawer toggle the YAML value and + call these functions to bring the OS state in sync. + +- ``sys.executable`` is captured at install time so reinstalling + after a venv rebuild picks up the new interpreter path. If a user + recreates their venv without reinstalling, the LaunchAgent keeps + pointing at the stale path and will fail at next login — noisy + log but not catastrophic. Documented in the docs. + +- Install is idempotent: ``install_autostart`` overwrites any + prior plist so the same file always reflects current state, no + stale LaunchAgents piling up. +""" + +from __future__ import annotations + +import platform +import re +import shlex +import subprocess +import sys +from pathlib import Path +from xml.sax.saxutils import escape as xml_escape + +REPO_ROOT = Path(__file__).parent.resolve() + + +# --------------------------------------------------------------------------- +# Public API +# --------------------------------------------------------------------------- + + +def autostart_supported() -> tuple[bool, str]: + """Is this platform a supported autostart target? + + Returns ``(True, "")`` on supported platforms, ``(False, reason)`` + otherwise. Wizard / drawer check this before offering the toggle. 
+ """ + system = platform.system() + if system == "Darwin": + return True, "" + if system == "Linux": + return False, "Linux autostart (systemd user unit) not yet implemented" + if system == "Windows": + return False, "Windows autostart (Task Scheduler) not yet implemented" + return False, f"autostart not implemented for platform {system!r}" + + +def install_autostart(agent_name: str = "protoagent", port: int = 7870) -> tuple[bool, str]: + """Install the OS artifact that runs the server on user login. + + Returns ``(ok, message)``. On success, ``message`` is a short + human-readable note the UI can display; on failure it's the + actual error (permission denied, launchctl exit code, etc). + """ + ok, reason = autostart_supported() + if not ok: + return False, reason + + if platform.system() == "Darwin": + return _install_macos_launchagent(agent_name, port) + return False, "unreachable" # autostart_supported already rejected + + +def uninstall_autostart(agent_name: str = "protoagent") -> tuple[bool, str]: + """Remove the OS autostart artifact. Safe to call when nothing + is installed — returns success in that case. + """ + ok, reason = autostart_supported() + if not ok: + return False, reason + + if platform.system() == "Darwin": + return _uninstall_macos_launchagent(agent_name) + return False, "unreachable" + + +def autostart_status(agent_name: str = "protoagent") -> dict: + """Report current on-disk state for diagnostics. + + The UI uses this to render accurate "autostart is currently + on/off" without having to remember what it last wrote. 
+ """ + ok, reason = autostart_supported() + if not ok: + return {"supported": False, "installed": False, "reason": reason} + + if platform.system() == "Darwin": + plist = _macos_plist_path(agent_name) + return { + "supported": True, + "installed": plist.exists(), + "plist_path": str(plist), + "python": sys.executable, + "server_path": str(REPO_ROOT / "server.py"), + } + return {"supported": False, "installed": False, "reason": "unreachable"} + + +# --------------------------------------------------------------------------- +# macOS — LaunchAgent plist +# --------------------------------------------------------------------------- + + +_SAFE_LABEL_RE = re.compile(r"[^a-z0-9_.-]+") + + +def _macos_label(agent_name: str) -> str: + """Plist label — namespaced so it doesn't collide with system labels. + + Sanitizes the input: only ``[a-z0-9_.-]`` survive. Leading / trailing + dots and hyphens are stripped so the resulting filename can't be + a hidden file or look like a path-segment. Path-traversal + characters like ``/`` and ``..`` are filtered here rather than at + the filesystem layer so ``install_autostart(agent_name="../../x")`` + can't escape ``~/Library/LaunchAgents/``. + """ + sanitized = _SAFE_LABEL_RE.sub("-", agent_name.lower()).strip("-.") + if not sanitized: + sanitized = "protoagent" + return f"ai.protolabs.{sanitized}" + + +def _macos_plist_path(agent_name: str) -> Path: + home = Path.home() + return home / "Library" / "LaunchAgents" / f"{_macos_label(agent_name)}.plist" + + +def _install_macos_launchagent(agent_name: str, port: int) -> tuple[bool, str]: + """Write the plist and ``launchctl load`` it. + + Unload-then-load (rather than a bootstrap-replace dance) is the + simplest idempotent recipe that works across macOS versions. A + missing label on unload is a no-op. 
+ """ + python = sys.executable + server_py = REPO_ROOT / "server.py" + if not server_py.exists(): + return False, f"server.py not found at {server_py}" + + label = _macos_label(agent_name) + plist_path = _macos_plist_path(agent_name) + log_dir = REPO_ROOT / "logs" + log_dir.mkdir(parents=True, exist_ok=True) + + plist = _render_launchagent_plist( + label=label, + python=python, + server_py=str(server_py), + port=port, + working_dir=str(REPO_ROOT), + agent_name=agent_name, + stdout_log=str(log_dir / "autostart.out.log"), + stderr_log=str(log_dir / "autostart.err.log"), + ) + + plist_path.parent.mkdir(parents=True, exist_ok=True) + + # Unload any prior incarnation first — silently ok if absent. + subprocess.run( + ["launchctl", "unload", str(plist_path)], + capture_output=True, check=False, + ) + + plist_path.write_text(plist, encoding="utf-8") + + result = subprocess.run( + ["launchctl", "load", str(plist_path)], + capture_output=True, check=False, + ) + if result.returncode != 0: + err = (result.stderr.decode("utf-8", errors="replace") + or result.stdout.decode("utf-8", errors="replace") + or f"launchctl load exit={result.returncode}") + return False, f"plist written but launchctl load failed: {err.strip()}" + + return True, f"installed • {plist_path.name} • runs `{shlex.quote(python)} server.py` on login" + + +def _uninstall_macos_launchagent(agent_name: str) -> tuple[bool, str]: + plist_path = _macos_plist_path(agent_name) + if not plist_path.exists(): + return True, "autostart was not installed" + + subprocess.run( + ["launchctl", "unload", str(plist_path)], + capture_output=True, check=False, + ) + + try: + plist_path.unlink() + except OSError as e: + return False, f"failed to remove plist: {e}" + + return True, f"uninstalled • removed {plist_path.name}" + + +def _render_launchagent_plist( + *, + label: str, + python: str, + server_py: str, + port: int, + working_dir: str, + agent_name: str, + stdout_log: str, + stderr_log: str, +) -> str: + """Render the 
plist XML. + + Every interpolated string is XML-escaped because several fields + (``agent_name`` most notably) come from user input — a wizard + user who names their agent ``bad`` or ``me & co`` would + otherwise produce a malformed or injection-vulnerable plist. + ``port`` is an int so it's safe as-is, but we coerce+escape it + anyway for consistency. + """ + e = xml_escape + return f""" + + + + Label + {e(label)} + ProgramArguments + + {e(python)} + {e(server_py)} + --port + {e(str(port))} + + WorkingDirectory + {e(working_dir)} + EnvironmentVariables + + AGENT_NAME + {e(agent_name)} + PATH + /usr/local/bin:/usr/bin:/bin:/opt/homebrew/bin + + RunAtLoad + + KeepAlive + + SuccessfulExit + + + StandardOutPath + {e(stdout_log)} + StandardErrorPath + {e(stderr_log)} + + +""" diff --git a/chat_ui.py b/chat_ui.py index 62fb697..ba832e3 100644 --- a/chat_ui.py +++ b/chat_ui.py @@ -181,6 +181,21 @@ def create_chat_app( _css = CLEAN_CSS + AGENT_DARK_CSS + extra_css _head = AGENT_PWA_HEAD if pwa else "" + # Determine first-run state. Fresh clones land in the wizard; + # subsequent boots go straight to chat unless the user explicitly + # triggers "Re-run setup" from the drawer. Settings dicts without + # the ``is_setup_complete`` key (older template forks) skip the + # wizard entirely — chat is always visible. 
+ setup_done = True + if settings and "is_setup_complete" in settings: + try: + setup_done = bool(settings["is_setup_complete"]()) + except Exception: + setup_done = True # fail-open: don't trap forks in a broken wizard + wizard_enabled = bool( + settings and "finish_setup" in settings and "get_config" in settings + ) + def _build() -> gr.Blocks: with gr.Blocks( title=title.replace("*", "").strip(), @@ -197,77 +212,534 @@ def _build() -> gr.Blocks: header_md = gr.Markdown(header_text) - chatbot = gr.Chatbot(height=chat_height, show_label=False) + # === SETUP WIZARD PANE ===================================== + # Visible on first run (no .setup-complete marker), hidden + # after the user clicks Launch. All fields default from the + # current config so re-running the wizard doesn't start + # from zero. + wizard_pane = None + w_launch_btn = None + w_launch_status = None + w_inputs: list = [] + if wizard_enabled: + with gr.Column(visible=not setup_done) as wizard_pane: + gr.Markdown( + "# Welcome — let's set up your agent\n\n" + "Walk through the steps below and hit **Launch agent**. " + "You can revisit every one of these choices later from " + "the Configuration drawer. Nothing is persisted until " + "you click Launch." + ) + w_launch_status = gr.Markdown("") - with gr.Row(): - txt = gr.Textbox( - placeholder=placeholder, show_label=False, - scale=9, container=False, - ) - send_btn = gr.Button("Send", variant="primary", scale=1, min_width=80) + with gr.Accordion("1. Connect to your model", open=True): + w_api_base = gr.Textbox( + label="API base URL", + placeholder="e.g. 
https://api.openai.com/v1 or http://localhost:4000/v1", + interactive=True, + ) + w_api_key = gr.Textbox( + label="API key", + type="password", + placeholder="your OpenAI or gateway master key", + interactive=True, + ) + with gr.Row(): + w_test_btn = gr.Button( + "Test connection & fetch models", + variant="secondary", scale=3, + ) + w_test_status = gr.Markdown("") + w_model = gr.Dropdown( + label="Model", + choices=[], allow_custom_value=True, + interactive=True, + ) - with gr.Row(): - clear_btn = gr.Button("Clear", size="sm", variant="secondary") - new_btn = gr.Button("New Session", size="sm", variant="secondary") + with gr.Accordion("2. Name your agent", open=False): + w_agent_name = gr.Textbox( + label="Agent name", + placeholder="short lowercase slug, e.g. product-director", + interactive=True, + ) + gr.Markdown( + "_This becomes the agent card name, OpenAI-compat " + "model id, and chat header. Metric prefix still " + "needs a process restart to pick up._" + ) + with gr.Row(): + w_preset = gr.Dropdown( + label="Persona preset (optional)", + choices=[], interactive=True, scale=3, + ) + w_load_preset_btn = gr.Button( + "Load preset into SOUL.md", + size="sm", scale=2, + ) + w_soul = gr.Textbox( + label="SOUL.md — the agent's persona", + lines=14, interactive=True, + placeholder=( + "Identity, personality, values, communication " + "style. Loaded into every system prompt." + ), + ) - if footer_html: - gr.HTML(footer_html) + with gr.Accordion("3. 
Tools & middleware", open=False): + w_tools = gr.CheckboxGroup( + label="Tools available to the agent", + choices=[], interactive=True, + ) + w_mw_audit = gr.Checkbox( + label="Audit middleware — logs every tool call", + value=True, interactive=True, + ) + w_mw_memory = gr.Checkbox( + label="Memory middleware — persists session summaries", + value=True, interactive=True, + ) + w_mw_knowledge = gr.Checkbox( + label="Knowledge middleware — requires a knowledge store (leave off for starter setups)", + value=False, interactive=True, + ) + + with gr.Accordion("4. Optional — you, security, autostart", open=False): + w_operator = gr.Textbox( + label="Your name", + placeholder="so the agent can address you directly — blank = anonymous", + interactive=True, + ) + w_auth = gr.Textbox( + label="A2A bearer token", + type="password", + placeholder="set before exposing to a network; blank = open mode for local dev", + interactive=True, + ) + w_autostart = gr.Checkbox( + label="Launch this agent automatically on login", + value=False, interactive=True, + ) + w_autostart_note = gr.Markdown("") + + w_launch_btn = gr.Button( + "Launch agent", variant="primary", size="lg", + ) + + w_inputs = [ + w_api_base, w_api_key, w_model, + w_agent_name, w_soul, w_preset, + w_tools, w_mw_audit, w_mw_memory, w_mw_knowledge, + w_operator, w_auth, w_autostart, + ] + + # === CHAT PANE ============================================= + # Wrapped in a Column so visibility toggles in lockstep with + # the wizard. On fresh setup it starts hidden and the Launch + # button flips it on. 
+ with gr.Column(visible=setup_done) as chat_pane: + chatbot = gr.Chatbot(height=chat_height, show_label=False) + + with gr.Row(): + txt = gr.Textbox( + placeholder=placeholder, show_label=False, + scale=9, container=False, + ) + send_btn = gr.Button("Send", variant="primary", scale=1, min_width=80) + + with gr.Row(): + clear_btn = gr.Button("Clear", size="sm", variant="secondary") + new_btn = gr.Button("New Session", size="sm", variant="secondary") + + if footer_html: + gr.HTML(footer_html) # --- Settings sidebar --- + # Each section below is gated on the presence of its callback, + # so forks can opt in per panel. The Configuration panel (the + # live-editable drawer) renders when "get_config" + "save_all" + # are provided by the server. The drawer is hidden during the + # wizard so the user has one surface to look at at a time. + sidebar_block = None if settings: - with gr.Sidebar(label="Settings", open=False, position="right"): - with gr.Accordion("Tools", open=False): - tools_display = gr.Markdown("Loading...") - refresh_tools_btn = gr.Button("Refresh", size="sm") - - with gr.Accordion("Model", open=False): - model_display = gr.Markdown("Loading...") - provider_dropdown = gr.Dropdown( - label="Provider", choices=[], interactive=True, + with gr.Sidebar(label="Settings", open=False, position="right", visible=setup_done) as sidebar_block: + + # === Live configuration drawer ============================ + if "get_config" in settings and "save_all" in settings: + gr.Markdown( + "### Configuration\n" + "Edits are written to `config/langgraph-config.yaml` " + "and applied with a live graph rebuild — in-flight " + "turns finish on the previous config.", ) - switch_status = gr.Markdown("") - refresh_model_btn = gr.Button("Refresh", size="sm") + config_status = gr.Markdown("") - if "get_knowledge_stats" in settings: - with gr.Accordion("Knowledge Base", open=False): - kb_display = gr.Markdown("Loading...") - refresh_kb_btn = gr.Button("Refresh", size="sm") + with 
gr.Accordion("Model", open=True): + api_base_in = gr.Textbox( + label="API Base URL", + placeholder="http://gateway:4000/v1", + interactive=True, + ) + api_key_in = gr.Textbox( + label="API Key", + type="password", + placeholder="blank → use $OPENAI_API_KEY env", + interactive=True, + ) + with gr.Row(): + model_in = gr.Dropdown( + label="Model", + choices=[], + interactive=True, + allow_custom_value=True, + scale=4, + ) + fetch_models_btn = gr.Button( + "Fetch", size="sm", scale=1, min_width=60, + ) + model_fetch_status = gr.Markdown("") + temperature_in = gr.Slider( + label="Temperature", + minimum=0.0, maximum=2.0, step=0.05, + interactive=True, + ) + max_tokens_in = gr.Number( + label="Max Tokens", precision=0, + minimum=1, interactive=True, + ) + max_iter_in = gr.Slider( + label="Max Iterations", + minimum=1, maximum=200, step=1, + interactive=True, + ) - # --- Callbacks --- + with gr.Accordion("Worker Subagent", open=False): + worker_enabled_in = gr.Checkbox( + label="Enabled", interactive=True, + ) + worker_tools_in = gr.CheckboxGroup( + label="Tools", choices=[], interactive=True, + ) + worker_max_turns_in = gr.Number( + label="Max Turns", precision=0, + minimum=1, interactive=True, + ) - def load_tools(): - return settings["get_tools_list"]() + with gr.Accordion("Middleware", open=False): + mw_knowledge_in = gr.Checkbox( + label="Knowledge", interactive=True, + ) + mw_audit_in = gr.Checkbox( + label="Audit", interactive=True, + ) + mw_memory_in = gr.Checkbox( + label="Memory", interactive=True, + ) - def load_model(): - return settings["get_model_info"]() + with gr.Accordion("Knowledge Store", open=False): + kb_db_in = gr.Textbox( + label="DB Path", interactive=True, + ) + kb_embed_in = gr.Textbox( + label="Embed Model", interactive=True, + ) + kb_top_k_in = gr.Number( + label="Top K", precision=0, + minimum=1, interactive=True, + ) - def load_provider_choices(): - choices = settings["get_provider_choices"]() - current = settings["get_current_provider"]() - 
return gr.update(choices=choices, value=current) + with gr.Accordion("Identity", open=False): + identity_name_in = gr.Textbox( + label="Agent name", + placeholder="short lowercase slug", + interactive=True, + ) + identity_operator_in = gr.Textbox( + label="Your name (operator)", + placeholder="injected into system prompt when set", + interactive=True, + ) - def switch_provider(choice): - return settings["switch_provider"](choice) + with gr.Accordion("Security — A2A bearer token", open=False): + auth_token_in = gr.Textbox( + label="Bearer token", + type="password", + placeholder="blank → open mode; set to require Authorization: Bearer ", + interactive=True, + ) + gr.Markdown( + "_Live-reloadable. Save & Reload flips A2A " + "enforcement on or off immediately; no restart._" + ) - def load_subtitle(): - return settings["get_subtitle"]() + with gr.Accordion("Autostart on login", open=False): + autostart_in = gr.Checkbox( + label="Launch this agent automatically on login", + interactive=True, + ) + autostart_drawer_status = gr.Markdown("") - app.load(fn=load_tools, outputs=[tools_display]) - app.load(fn=load_model, outputs=[model_display]) - app.load(fn=load_provider_choices, outputs=[provider_dropdown]) + with gr.Accordion("Persona (SOUL.md)", open=False): + soul_in = gr.Textbox( + label="SOUL.md", lines=16, show_label=False, + interactive=True, + placeholder="Agent persona — loaded into every system prompt.", + ) - refresh_tools_btn.click(fn=load_tools, outputs=[tools_display]) - refresh_model_btn.click( - fn=load_model, outputs=[model_display] - ).then(fn=load_provider_choices, outputs=[provider_dropdown]) + with gr.Row(): + save_btn = gr.Button( + "Save & Reload", variant="primary", scale=2, + ) + reload_btn = gr.Button( + "Reload from Disk", variant="secondary", scale=1, + ) - provider_dropdown.change( - fn=switch_provider, inputs=[provider_dropdown], outputs=[switch_status], - ).then(fn=load_model, outputs=[model_display]).then( - fn=load_subtitle, 
outputs=[header_md], - ) + # "Re-run setup" re-opens the wizard with current + # values pre-populated — for re-picking a preset, + # swapping models, or resetting the autostart plist. + if "restart_setup" in settings and wizard_enabled: + with gr.Accordion("Re-run setup wizard", open=False): + gr.Markdown( + "_Reopens the wizard with all current " + "values pre-filled. Your config isn't " + "wiped — you're just re-visiting the " + "choices._" + ) + reset_setup_btn = gr.Button( + "Run wizard now", variant="secondary", + ) + reset_setup_status = gr.Markdown("") + + # Ordered tuple used for both load_all outputs and + # save_all inputs — keeps the wiring obvious and the + # two lists from drifting out of sync. + _config_components = [ + api_base_in, api_key_in, model_in, + temperature_in, max_tokens_in, max_iter_in, + worker_enabled_in, worker_tools_in, worker_max_turns_in, + mw_knowledge_in, mw_audit_in, mw_memory_in, + kb_db_in, kb_embed_in, kb_top_k_in, + identity_name_in, identity_operator_in, + auth_token_in, autostart_in, + soul_in, + ] + + def _load_all(): + cfg = settings["get_config"]() + soul = settings["get_soul"]() if "get_soul" in settings else "" + tools = settings["list_tools"]() if "list_tools" in settings else [] + + # Best-effort gateway probe. If it fails (offline, + # wrong key) we surface the error but keep the form + # populated with the saved model name — the user + # can still edit everything else. 
+ models, err = ([], "") + if "list_models" in settings: + try: + models, err = settings["list_models"]("", "") + except Exception as e: + err = str(e) + current_name = cfg["model"]["name"] + dropdown_choices = models if models else [current_name] + if current_name and current_name not in dropdown_choices: + dropdown_choices = [current_name, *dropdown_choices] + + fetch_msg = ( + f"✓ {len(models)} model(s) from gateway" + if models and not err + else f"⚠ {err}" if err else "" + ) + + worker = cfg["subagents"]["worker"] + identity = cfg.get("identity", {}) + auth = cfg.get("auth", {}) + runtime = cfg.get("runtime", {}) + return ( + cfg["model"]["api_base"], + cfg["model"]["api_key"], + gr.update(choices=dropdown_choices, value=current_name), + cfg["model"]["temperature"], + cfg["model"]["max_tokens"], + cfg["model"]["max_iterations"], + worker["enabled"], + gr.update(choices=tools, value=list(worker["tools"])), + worker["max_turns"], + cfg["middleware"]["knowledge"], + cfg["middleware"]["audit"], + cfg["middleware"]["memory"], + cfg["knowledge"]["db_path"], + cfg["knowledge"]["embed_model"], + cfg["knowledge"]["top_k"], + identity.get("name", ""), + identity.get("operator", ""), + auth.get("token", ""), + bool(runtime.get("autostart_on_boot", False)), + soul, + fetch_msg, + ) + + def _fetch_models(api_base, api_key): + if "list_models" not in settings: + return gr.update(), "⚠ list_models not wired" + try: + models, err = settings["list_models"](api_base, api_key) + except Exception as e: + return gr.update(), f"⚠ {e}" + if err: + return gr.update(), f"⚠ {err}" + return gr.update(choices=models), f"✓ {len(models)} model(s) from gateway" + + def _save( + api_base, api_key, model_name, + temperature, max_tokens, max_iter, + worker_enabled, worker_tools, worker_max_turns, + mw_knowledge, mw_audit, mw_memory, + kb_db, kb_embed, kb_top_k, + identity_name, identity_operator, + auth_token, autostart_on, + soul, + ): + # Numeric fields fall back to sensible minimums + # 
rather than 0 when the user clears them — + # ``validate_config_dict`` rejects zero values so + # a blank field would otherwise block the save + # with a confusing validation error. + new_config = { + "model": { + "api_base": api_base or "", + "api_key": api_key or "", + "name": model_name or "", + "temperature": float(temperature), + "max_tokens": int(max_tokens or 4096), + "max_iterations": int(max_iter or 50), + }, + "subagents": { + "worker": { + "enabled": bool(worker_enabled), + "tools": list(worker_tools or []), + "max_turns": int(worker_max_turns or 20), + }, + }, + "middleware": { + "knowledge": bool(mw_knowledge), + "audit": bool(mw_audit), + "memory": bool(mw_memory), + }, + "knowledge": { + "db_path": kb_db or "", + "embed_model": kb_embed or "", + "top_k": int(kb_top_k or 1), + }, + "identity": { + "name": (identity_name or "").strip() or "protoagent", + "operator": (identity_operator or "").strip(), + }, + "auth": { + "token": auth_token or "", + }, + "runtime": { + "autostart_on_boot": bool(autostart_on), + }, + } + try: + ok, msg = settings["save_all"](new_config, soul or "") + except Exception as e: + return f"⚠ save failed: {e}" + return f"{'✓' if ok else '⚠'} {msg}" + + def _reload_only(): + try: + ok, msg = settings["save_all"](None, None) + except Exception as e: + return f"⚠ reload failed: {e}" + return f"{'✓' if ok else '⚠'} {msg}" + + app.load( + fn=_load_all, + outputs=[*_config_components, model_fetch_status], + ) + fetch_models_btn.click( + fn=_fetch_models, + inputs=[api_base_in, api_key_in], + outputs=[model_in, model_fetch_status], + ) + save_btn.click( + fn=_save, + inputs=_config_components, + outputs=[config_status], + ).then( + fn=_fetch_models, + inputs=[api_base_in, api_key_in], + outputs=[model_in, model_fetch_status], + ) + reload_btn.click( + fn=_reload_only, outputs=[config_status], + ).then( + fn=_load_all, + outputs=[*_config_components, model_fetch_status], + ) + + # === Legacy read-only panels (opt-in via their own keys) 
== + if "get_tools_list" in settings: + with gr.Accordion("Tools", open=False): + tools_display = gr.Markdown("Loading...") + refresh_tools_btn = gr.Button("Refresh", size="sm") + + def load_tools(): + return settings["get_tools_list"]() + + app.load(fn=load_tools, outputs=[tools_display]) + refresh_tools_btn.click(fn=load_tools, outputs=[tools_display]) + + if "get_model_info" in settings: + with gr.Accordion("Model Status", open=False): + model_display = gr.Markdown("Loading...") + refresh_model_btn = gr.Button("Refresh", size="sm") + + provider_dropdown = None + switch_status = None + if "get_provider_choices" in settings: + provider_dropdown = gr.Dropdown( + label="Provider", choices=[], interactive=True, + ) + switch_status = gr.Markdown("") + + def load_model(): + return settings["get_model_info"]() + + app.load(fn=load_model, outputs=[model_display]) + refresh_model_btn.click(fn=load_model, outputs=[model_display]) + + if provider_dropdown is not None: + def load_provider_choices(): + choices = settings["get_provider_choices"]() + # get_current_provider is optional — older + # forks provided only the choices list. + # Missing key must not raise KeyError; the + # dropdown simply renders with no preselect. 
+ current_fn = settings.get("get_current_provider") + current = current_fn() if current_fn else None + return gr.update(choices=choices, value=current) + + def switch_provider(choice): + return settings["switch_provider"](choice) + + def load_subtitle(): + return settings["get_subtitle"]() + + app.load(fn=load_provider_choices, outputs=[provider_dropdown]) + provider_dropdown.change( + fn=switch_provider, + inputs=[provider_dropdown], + outputs=[switch_status], + ).then(fn=load_model, outputs=[model_display]).then( + fn=load_subtitle, outputs=[header_md], + ) if "get_knowledge_stats" in settings: + with gr.Accordion("Knowledge Base", open=False): + kb_display = gr.Markdown("Loading...") + refresh_kb_btn = gr.Button("Refresh", size="sm") + def load_kb_stats(): return settings["get_knowledge_stats"]() @@ -311,6 +783,244 @@ def get_response(history: list[dict], original_msg: str, sid: str): clear_btn.click(fn=lambda: ([], "default"), outputs=[chatbot, session_id]) new_btn.click(fn=lambda: ([], secrets.token_hex(4)), outputs=[chatbot, session_id]) + # --- Wizard callbacks ----------------------------------------- + if wizard_enabled: + def _load_wizard_defaults(): + """Seed every wizard field from the current on-disk + config. 
Returns updates in the exact order of + ``w_inputs`` plus the connection-test status + the + autostart note.""" + cfg = settings["get_config"]() if "get_config" in settings else {} + soul = settings["get_soul"]() if "get_soul" in settings else "" + tools = settings["list_tools"]() if "list_tools" in settings else [] + presets = settings["list_soul_presets"]() if "list_soul_presets" in settings else [] + + model = cfg.get("model", {}) + identity = cfg.get("identity", {}) + worker = cfg.get("subagents", {}).get("worker", {}) + mw = cfg.get("middleware", {}) + runtime = cfg.get("runtime", {}) + auth = cfg.get("auth", {}) + + current_model = model.get("name", "") + model_choices = [current_model] if current_model else [] + + autostart_msg = "" + if "autostart_info" in settings: + try: + info = settings["autostart_info"]() + except Exception as e: + info = {"supported": False, "reason": str(e)} + if info.get("supported"): + state = "installed" if info.get("installed") else "not installed" + autostart_msg = f"_Platform supported. Current state: **{state}**._" + else: + autostart_msg = f"⚠ {info.get('reason', 'not supported on this platform')}" + + return ( + model.get("api_base", ""), + model.get("api_key", ""), + gr.update(choices=model_choices, value=current_model), + identity.get("name", ""), + soul, + gr.update(choices=presets, value=None), + gr.update(choices=tools, value=list(worker.get("tools", []))), + bool(mw.get("audit", True)), + bool(mw.get("memory", True)), + bool(mw.get("knowledge", False)), + identity.get("operator", ""), + auth.get("token", ""), + bool(runtime.get("autostart_on_boot", False)), + "", # w_test_status + autostart_msg, + ) + + app.load( + fn=_load_wizard_defaults, + outputs=[*w_inputs, w_test_status, w_autostart_note], + ) + + # Re-check setup state on every page load so external + # completions (POST /api/config/setup from curl, or a + # reset triggered in another tab) are reflected after + # a browser refresh. 
Without this, Gradio keeps serving + # the initial visibility state from when the Blocks + # were first rendered. + # Sync visibility on every page load. The output list is + # either two or three elements depending on whether the + # sidebar exists — we used to alias the sidebar slot to + # wizard_pane when missing, but that sent a duplicate + # gr.update to the same component, which Gradio treats + # as two competing writes to wizard_pane.visible. + if sidebar_block is not None: + def _sync_visibility_with_sidebar(): + if "is_setup_complete" not in settings: + return gr.update(), gr.update(), gr.update() + done = bool(settings["is_setup_complete"]()) + return ( + gr.update(visible=not done), # wizard_pane + gr.update(visible=done), # chat_pane + gr.update(visible=done), # sidebar_block + ) + + app.load( + fn=_sync_visibility_with_sidebar, + outputs=[wizard_pane, chat_pane, sidebar_block], + ) + else: + def _sync_visibility_no_sidebar(): + if "is_setup_complete" not in settings: + return gr.update(), gr.update() + done = bool(settings["is_setup_complete"]()) + return ( + gr.update(visible=not done), # wizard_pane + gr.update(visible=done), # chat_pane + ) + + app.load( + fn=_sync_visibility_no_sidebar, + outputs=[wizard_pane, chat_pane], + ) + + # Connection test — fills the model dropdown + def _test_connection(api_base, api_key): + if "list_models" not in settings: + return gr.update(), "⚠ list_models callback not wired" + if not api_base: + return gr.update(), "⚠ enter an API base URL first" + try: + models, err = settings["list_models"](api_base, api_key) + except Exception as e: + return gr.update(), f"⚠ {e}" + if err: + return gr.update(), f"⚠ {err}" + pick = models[0] if models else None + return ( + gr.update(choices=models, value=pick), + f"✓ {len(models)} model(s) — picked **{pick}**, change if needed", + ) + + w_test_btn.click( + fn=_test_connection, + inputs=[w_api_base, w_api_key], + outputs=[w_model, w_test_status], + ) + + # Preset loader — pastes 
template text into SOUL textarea + def _load_preset(name): + if not name or "read_soul_preset" not in settings: + return gr.update() + try: + return settings["read_soul_preset"](name) + except Exception: + return gr.update() + + w_load_preset_btn.click( + fn=_load_preset, inputs=[w_preset], outputs=[w_soul], + ) + + # Launch button — write everything, mark complete, then + # hard-reload the page. Toggling ``visible=`` on nested + # gr.Column + gr.Sidebar via gr.update is unreliable + # (children don't always re-mount); a full reload is the + # only bulletproof way to guarantee the chat pane appears. + # The reload re-enters _build() which reads + # is_setup_complete()==True and renders chat + drawer + # visible from scratch. + def _finish_wizard( + api_base, api_key, model_name, + agent_name_val, soul, _preset_unused, + tools, mw_audit, mw_memory, mw_knowledge, + operator, auth_token, autostart, + ): + if not (api_base or "").strip(): + return "⚠ API base URL is required — go back to step 1" + if not (model_name or "").strip(): + return "⚠ pick a model — use the Test connection button in step 1" + if not (agent_name_val or "").strip(): + return "⚠ agent name is required — step 2" + + new_config = { + "model": { + "api_base": api_base, + "api_key": api_key or "", + "name": model_name, + }, + "subagents": { + "worker": { + "enabled": True, + "tools": list(tools or []), + }, + }, + "middleware": { + "audit": bool(mw_audit), + "memory": bool(mw_memory), + "knowledge": bool(mw_knowledge), + }, + "identity": { + "name": agent_name_val.strip(), + "operator": (operator or "").strip(), + }, + "auth": {"token": auth_token or ""}, + "runtime": {"autostart_on_boot": bool(autostart)}, + } + try: + ok, msg = settings["finish_setup"](new_config, soul or "") + except Exception as e: + return f"⚠ setup failed: {e}" + if ok: + return f"✓ {msg} — reloading page…" + return f"⚠ {msg}" + + # 1. Run the save. 2. 
On the client, if the status message + # starts with "✓", reload after a short beat so the user + # sees the success line. Any warning (⚠) keeps the wizard + # visible so they can correct and retry. + w_launch_btn.click( + fn=_finish_wizard, + inputs=w_inputs, + outputs=[w_launch_status], + ).then( + fn=None, + inputs=[w_launch_status], + outputs=None, + js=( + "(status) => {" + " if (typeof status === 'string' && status.startsWith('✓')) {" + " setTimeout(() => window.location.reload(), 1000);" + " }" + " return [];" + "}" + ), + ) + + # "Re-run setup" in the drawer — same reload-after-flip + # pattern for the reverse direction. + if "restart_setup" in settings: + def _trigger_rerun(): + try: + msg = settings["restart_setup"]() + except Exception as e: + return f"⚠ {e}" + return f"✓ {msg} — reloading page…" + + reset_setup_btn.click( + fn=_trigger_rerun, + outputs=[reset_setup_status], + ).then( + fn=None, + inputs=[reset_setup_status], + outputs=None, + js=( + "(status) => {" + " if (typeof status === 'string' && status.startsWith('✓')) {" + " setTimeout(() => window.location.reload(), 800);" + " }" + " return [];" + "}" + ), + ) + return app app = _build() diff --git a/config/langgraph-config.yaml b/config/langgraph-config.yaml index c3f53e8..c7df665 100644 --- a/config/langgraph-config.yaml +++ b/config/langgraph-config.yaml @@ -22,16 +22,32 @@ model: subagents: worker: enabled: true - tools: [echo, current_time, calculator, web_search, fetch_url] + tools: + - current_time + - calculator + - web_search + - fetch_url + - memory_ingest + - memory_recall + - memory_list + - memory_stats + - daily_log + - schedule_task + - list_schedules + - cancel_schedule max_turns: 20 middleware: - # The knowledge middleware requires a knowledge store. Leave false - # until you add one. Memory persistence is enabled by default and - # writes session summaries to /sandbox/memory/ without a store. - knowledge: false + # All four subsystems default ON. 
The template constructs the + # knowledge store + scheduler backends automatically (see + # ``server.py::_build_knowledge_store`` and ``_build_scheduler``). + # Flip any of these to ``false`` to opt out — the corresponding + # tools (memory_*, schedule_*) are dropped from the agent loop + # without touching the worker subagent's tool allowlist. + knowledge: true audit: true memory: true + scheduler: true knowledge: db_path: /sandbox/knowledge/agent.db diff --git a/config/soul-presets/blank.md b/config/soul-presets/blank.md new file mode 100644 index 0000000..e53908c --- /dev/null +++ b/config/soul-presets/blank.md @@ -0,0 +1,24 @@ +# Identity + +_Describe your agent in one paragraph — who it is, who it +reports to, what domain it owns._ + +# Personality + +_3–6 traits. Affects the tone of every response._ + +# Values + +_Rules that shape judgement calls. Example: "never modify +production data while investigating."_ + +# Communication style + +_How output is formatted — markdown, plain text, JSON, Discord +embeds. How long responses should be by default._ + +# Capabilities + +_What tools are available and when to reach for each. The tool +docstrings are already in context; this is where you explain +the higher-level decision procedure._ diff --git a/config/soul-presets/coding.md b/config/soul-presets/coding.md new file mode 100644 index 0000000..1ad23d1 --- /dev/null +++ b/config/soul-presets/coding.md @@ -0,0 +1,37 @@ +# Identity + +I am a coding agent. I read code, explain it, suggest changes, and +write code when asked — grounded in what the codebase actually +does, not in what a general-purpose model might guess. + +# Personality + +- Precise — file paths, line numbers, exact identifiers. Never + "somewhere in the auth module." +- Conservative on edits — the smallest change that solves the + problem. I don't refactor surrounding code as a bonus. +- Root-cause oriented — when something breaks, I find the cause + before patching the symptom. 
+ +# Communication style + +- Short prose, code in code fences, one clear recommendation. +- For any file reference, include the path and the relevant + lines. The operator shouldn't have to hunt. +- When I suggest a change, explain the *why* in one sentence. + Reserve multi-paragraph explanations for genuinely subtle cases. + +# When to reach for tools + +- `fetch_url` for official docs when the question is + library-specific and the model's training data may be stale. +- `web_search` for error messages with distinctive strings to + find similar reports. +- `calculator` for bit math, offsets, sizing. + +# Values + +- No speculation. If I haven't read the file, I say so before + making claims about it. +- A clean diff beats a clever one. Readability is a feature. +- Tests are evidence. A bug without a failing test is unverified. diff --git a/config/soul-presets/generic-assistant.md b/config/soul-presets/generic-assistant.md new file mode 100644 index 0000000..58e6459 --- /dev/null +++ b/config/soul-presets/generic-assistant.md @@ -0,0 +1,33 @@ +# Identity + +I am an AI assistant. I help the operator think through problems, +answer questions, and take action via the tools available to me. + +# Personality + +- Direct — I answer the question asked, not a version of it I wish + had been asked. +- Grounded — when I use a tool, I surface what it returned rather + than paraphrasing away the evidence. +- Calibrated — I say "I don't know" when I don't, rather than + fabricating a confident answer. + +# Communication style + +- Short by default. Expand when the operator asks or when the + answer genuinely requires it. +- Markdown when the surface renders it; plain text otherwise. +- Reference concrete artifacts (URLs, file paths, tool outputs) + so the operator can verify. + +# When to reach for tools + +- `web_search` + `fetch_url` when the question depends on current + information that the model's training data wouldn't know. 
+- `current_time` any time "now" matters — never guess the time. +- `calculator` for any numeric work beyond trivial mental math. + +# Values + +- Verify before asserting. +- Surface failures plainly; the operator decides what to do next. diff --git a/config/soul-presets/research.md b/config/soul-presets/research.md new file mode 100644 index 0000000..6d51cac --- /dev/null +++ b/config/soul-presets/research.md @@ -0,0 +1,35 @@ +# Identity + +I am a research agent. My job is to find information, evaluate +source quality, and deliver a synthesis the operator can act on. + +# Personality + +- Curious — I follow threads until I've seen enough to answer, + not until I find the first plausible-looking result. +- Skeptical — I assume claims are wrong until the evidence holds + up. I note when sources disagree. +- Thorough — when the operator asks for "three sources" I return + three distinct sources, not three links to the same article. + +# Communication style + +- Lead with the answer, then the evidence. Never bury the + conclusion under a recap of my search process. +- Cite with URLs. Prefer primary sources (docs, filings, papers) + over summaries. +- Flag confidence explicitly — "confirmed by X and Y" vs "one + source, unverified" — so the operator can calibrate. + +# Search loop + +1. Search with `web_search`. Read the top N titles + snippets. +2. Pick the most credible-looking 2–5. `fetch_url` each. +3. Cross-check: do independent sources agree? Which disagree? +4. Synthesize. Return claim → evidence → confidence, not a + chronological log of what I read. + +# Values + +- A hole in the evidence is more useful than a confident guess. +- Never present a synthesis as settled when the sources are thin. 
diff --git a/docs/.vitepress/config.mts b/docs/.vitepress/config.mts index d705a63..2deb10c 100644 --- a/docs/.vitepress/config.mts +++ b/docs/.vitepress/config.mts @@ -35,7 +35,8 @@ export default defineConfig({ text: "How-To Guides", items: [ { text: "Overview", link: "/guides/" }, - { text: "Fork the template", link: "/guides/fork-the-template" }, + { text: "Customize & deploy", link: "/guides/customize-and-deploy" }, + { text: "Fork checklist (fast path)", link: "/guides/fork-the-template" }, { text: "Add a custom skill", link: "/guides/add-a-skill" }, { text: "Configure subagents", link: "/guides/subagents" }, { text: "Wire Langfuse + Prometheus", link: "/guides/observability" }, diff --git a/docs/guides/customize-and-deploy.md b/docs/guides/customize-and-deploy.md new file mode 100644 index 0000000..7ba875a --- /dev/null +++ b/docs/guides/customize-and-deploy.md @@ -0,0 +1,99 @@ +# Customize & deploy + +Use this guide when you've run through the wizard, decided the template fits your use case, and now want to fork it into your own GitHub repo + ship a deployable image. If you're still evaluating, stay on the [first-agent tutorial](/tutorials/first-agent) — you don't need any of this to run the agent locally. + +## Why this is a separate step + +The [setup wizard](/tutorials/first-agent) handles runtime customization — model, tools, persona, auth — without editing code. Everything below is structural: renaming the template throughout the codebase, bending the release pipeline to your repo, baking your fork's identity into the Docker image. Do it once per fork, not every time you tweak a setting. + +## 1. Fork the template on GitHub + +```bash +gh repo create protoLabsAI/my-agent \ + --template protoLabsAI/protoAgent \ + --public --clone + +cd my-agent +``` + +Or: `Use this template → Create a new repository` from the browser. 
Pick a short slug (`jon`, `echo-agent`, `product-director`) — it ends up as the image name, metric prefix, Langfuse tag, and release-workflow repo guard. + +## 2. Rename `protoagent` throughout + +The template uses `protoagent` as the placeholder everywhere. Do one pass: + +```bash +# macOS / BSD sed +git grep -li protoagent | xargs sed -i '' 's/protoagent/my-agent/g' +git grep -li protoAgent | xargs sed -i '' 's/protoAgent/MyAgent/g' + +# Linux / GNU sed — drop the empty-string backup suffix +git grep -li protoagent | xargs sed -i 's/protoagent/my-agent/g' +git grep -li protoAgent | xargs sed -i 's/protoAgent/MyAgent/g' +``` + +Review the diff. Key hits: + +- `Dockerfile` — the `/opt/protoagent/` paths become `/opt/my-agent/`. +- `entrypoint.sh` — same. +- `server.py` — `AGENT_NAME_ENV` fallback becomes `my-agent`. +- `chat_ui.py` — branding strings (service worker label, apple-mobile-web-app-title). +- Workflow files — the repo guards check `protoLabsAI/my-agent` instead. + +The runtime name (`identity.name` in `config/langgraph-config.yaml`, set by the wizard) is separate — keep both in sync unless you have a reason not to. + +## 3. Un-freeze the release pipeline + +The release workflows gate on the template's repo path so third-party clones don't accidentally cut releases: + +- `.github/workflows/prepare-release.yml` +- `.github/workflows/release.yml` +- `.github/workflows/docker-publish.yml` + +Each has an `if: github.repository == 'protoLabsAI/protoAgent'` (or similar) check. Swap `protoLabsAI/protoAgent` for `<your-org>/<your-repo>` in all three, or the pipeline won't fire on merges. + +## 4. Rewrite the agent card + +`server.py::_build_agent_card` ships with placeholder skills: + +```python +"skills": [ + {"id": "chat", "name": "Chat", "description": "General-purpose...", ...}, +], +``` + +Replace with the skills your agent actually advertises over A2A. 
The `name` and `url` fields already pick up `identity.name` from YAML, so the wizard-set name lands on the card without code changes. + +## 5. (Optional) Add domain tools + +`tools/lg_tools.py` ships with `current_time`, `calculator`, `web_search`, `fetch_url` plus 5 memory tools (`memory_ingest`, `memory_recall`, `memory_list`, `memory_stats`, `daily_log`) bound to the bundled `KnowledgeStore`. The 3 scheduler tools (`schedule_task`, `list_schedules`, `cancel_schedule`) are wired in separately by `server.py::_build_scheduler` when the scheduler backend is enabled. Keep the ones you want, drop the rest, add your own. Update `get_all_tools()` at the bottom of `tools/lg_tools.py`. Any tool returned from there (or from `_build_scheduler_tools`) becomes a checkbox in the wizard and drawer automatically. + +The memory tools are dropped automatically when `middleware.knowledge: false`; the scheduler tools when `middleware.scheduler: false`. See [Schedule future work](/guides/scheduler) and [Configuration](/reference/configuration#middleware) for the toggles. + +## 6. (Optional) Configure subagents + +`graph/subagents/config.py` ships with one `worker`. Register more `SubagentConfig` instances in `SUBAGENT_REGISTRY` and add matching fields in `graph/config.py::LangGraphConfig`. The lead agent delegates via the `task` tool; the subagent delegation rules are built from the registry. + +## 7. Build and ship the image + +```bash +docker build -t ghcr.io/my-org/my-agent:local . + +# local test — mount the config volume so wizard completions persist +docker run --rm -p 7870:7870 \ + -e OPENAI_API_KEY="$OPENAI_API_KEY" \ + -v my-agent-config:/opt/my-agent/config \ + ghcr.io/my-org/my-agent:local +``` + +The Dockerfile declares `VOLUME /opt/<agent-name>/config` so even without `-v` the wizard writes persist across container runs on the same Docker host — they live in an anonymous volume. For production, use a named volume or host mount so you can back it up. 
+ +Once the local build is happy, merge a PR to trigger the release pipeline ([Deploy via GHCR](/guides/deploy)). + +## 8. Delete `TEMPLATE.md` + +Once the checklist is done, `rm TEMPLATE.md` and rewrite `README.md` to describe your specific agent — its purpose, its skills, its operators. + +## Canonical reference implementation + +[protoLabsAI/quinn](https://github.com/protoLabsAI/quinn) is the first agent built on this template, now running in production. When this guide doesn't cover a specific decision, Quinn is the filled-in example — worth a skim before you invent something new. diff --git a/docs/guides/evals.md b/docs/guides/evals.md new file mode 100644 index 0000000..38e1e33 --- /dev/null +++ b/docs/guides/evals.md @@ -0,0 +1,151 @@ +# Eval your fork + +The template ships an eval harness under `evals/` so a fresh fork has +a working test suite for its tools, memory, and A2A protocol surface +on day one. Cases assert across three independent channels — audit +log, reply text, and knowledge-store side effects — so a model that +hallucinates a tool result still gets caught. + +## When to read this + +- You forked the template and want a baseline pass-rate before you + ship. +- You added a new tool and want to lock in its intent — "when the + operator says X, fire tool Y". +- You changed a prompt or model and want to measure regression. + +## Run the suite + +```bash +# Agent running at $EVAL_BASE_URL (default http://localhost:7870) +# with the relevant auth env (A2A_AUTH_TOKEN and/or <AGENT>_API_KEY). + +python -m evals.runner +python -m evals.runner --category tool +python -m evals.runner --tasks current_time_intent,daily_log_intent +``` + +Reports land in `evals/results/run-<timestamp>.json`. The CLI prints a +pass/fail board; the JSON report carries reply previews and timing +for post-hoc inspection. 
+ +## The three assertion channels + +``` +prompt → A2A → audit log (1) tools fired with expected outcome + → reply text (2) substrings present in reply + → KB chunks table (3) side effects landed correctly +``` + +A case passes only when every configured assertion holds. Most cases +should opt in to channels 1 and 3 — text patterns alone are brittle +to model paraphrasing and miss hallucinated tool results entirely. + +### Why side-effect verification beats text-only + +A model can produce "Logged: ..." in its reply without actually +calling `daily_log`. Substring matching passes, the DB stays empty, +and the bug ships. Reading `audit.jsonl` and the `chunks` table +afterward catches it. + +## The shape of a case + +```json +{ + "id": "unique-id", + "category": "tool", + "kind": "ask", + "name": "Asks for arithmetic → calculator", + "prompt": "How much is 17 times 23, plus 1?", + "expected_tools": ["calculator"], + "expected_patterns": ["392"], + "verify_kb": { + "find_chunk_containing": "EVAL-MARK-XYZ", + "domain": "context" + }, + "setup": [{"kb_ingest": {"content": "...", "domain": "...", "heading": "..."}}], + "teardown": [{"kb_delete_by_content": {"contains": "..."}}] +} +``` + +Three case `kind`s ship: + +- `agent_card` — fetch `/.well-known/agent-card.json` and assert on + the card's name, skill count, and declared extensions. +- `auth_check` — send a request with a deliberately bad bearer and + assert the server returns the expected status (401 by default). +- `ask` — the main shape. Sends `prompt`, then asserts on tool firing, + reply patterns, and KB state. + +## Prompt rule + +**The tool name never appears in the prompt.** Every prompt must be +plausibly typed by a real user. "Use `daily_log` to record..." tests +instruction-following, not tool selection. If the agent needs to +infer the tool from intent, that *is* the test. 
+ +## Setup and teardown — start clean every time + +Each `ask` case can pre-seed state via `setup` blocks (BFCL's +`initial_config` pattern: direct DB writes the model never sees) and +clean up after itself with `teardown`. The fixture is invisible to +the agent — it discovers the seeded state via tools, exactly as a +real user would. + +`teardown` runs even when assertions fail, so case order doesn't +matter and a noisy failure doesn't poison the next run. + +Supported setup/teardown step kinds (extend `evals/verify.py` to add +more): + +| Step kind | Args | What it does | +|---|---|---| +| `kb_ingest` | `content`, `domain`, `heading?` | Insert a chunk | +| `kb_delete_by_content` | `contains` | Delete chunks where content LIKE `%contains%` | +| `kb_delete_by_heading` | `domain`, `heading` | Delete chunks matching (domain, heading) | + +## What forks should test by default + +The starter `tasks.json` covers: + +- Agent card discovery (name, skill count, `cost-v1` extension) +- Bearer auth gating +- Each shipped tool fires from a plausible operator prompt +- Memory ingest → recall round-trip +- KB-driven middleware injection (no tool call needed) +- A chained two-tool case (`daily_log` then `memory_recall`) + +When you add a tool, add at least one case for it. When you add a +skill to the agent card, extend the `card_discovery` case to assert +the new skill is advertised. 
+ +## Running in CI + +The runner exits non-zero when any case fails, so it drops in cleanly: + +```yaml +- name: Boot agent + run: docker compose up -d agent + +- name: Wait for /health + run: ./scripts/wait-for-it.sh http://localhost:7870/.well-known/agent-card.json + +- name: Run evals + run: python -m evals.runner + env: + EVAL_BASE_URL: http://localhost:7870 + A2A_AUTH_TOKEN: ${{ secrets.AGENT_BEARER }} +``` + +For non-deterministic categories (any `tool` or `chained` case), aim +for an N-of-M majority threshold rather than 100% — the reference +implementation runs 3 attempts and gates at 2 passes for those +categories. Deterministic ones (`a2a-protocol`, `subsystem` with +seeded state) gate at 100%. + +## References + +- [`evals/README.md`](https://github.com/protoLabsAI/protoAgent/blob/main/evals/README.md) — quick reference for case authors +- Anthropic — [Demystifying evals for AI agents](https://www.anthropic.com/engineering/demystifying-evals-for-ai-agents) +- BFCL V3 — [Multi-Turn](https://gorilla.cs.berkeley.edu/blogs/13_bfcl_v3_multi_turn.html) +- [ToolSandbox](https://arxiv.org/html/2408.04682v1) — user simulator + milestones / minefields diff --git a/docs/guides/fork-the-template.md b/docs/guides/fork-the-template.md index eacbc92..3de87b0 100644 --- a/docs/guides/fork-the-template.md +++ b/docs/guides/fork-the-template.md @@ -43,7 +43,7 @@ Keep the `` / `` protocol block in `prompts.py` — the A2A ## 4. Replace the starter tools -`tools/lg_tools.py` ships with `echo`, `current_time`, `calculator`, `web_search`, `fetch_url`. Keep what you want, drop the rest, add your own. Update `get_all_tools()` at the bottom of the file. +Twelve tools ship by default: `current_time`, `calculator`, `web_search`, `fetch_url` (keyless general) plus `memory_ingest`, `memory_recall`, `memory_list`, `memory_stats`, `daily_log` (bound to the bundled `KnowledgeStore`) plus `schedule_task`, `list_schedules`, `cancel_schedule` (bound to the scheduler backend). 
Keep what you want, drop the rest, add your own. Update `get_all_tools()` at the bottom of `tools/lg_tools.py`. See the [starter tools reference](/reference/starter-tools) for the shapes of the shipped ones. diff --git a/docs/guides/index.md b/docs/guides/index.md index 843adee..ce26b48 100644 --- a/docs/guides/index.md +++ b/docs/guides/index.md @@ -1,11 +1,14 @@ # How-To Guides -Task-oriented procedures. Assumes you already have a forked, running agent (see [Tutorials](/tutorials/) if not). +Task-oriented procedures. Assumes you already have a running agent (see [Tutorials](/tutorials/) if not — the wizard runs with zero setup). | Guide | When to read | |---|---| -| [Fork the template](/guides/fork-the-template) | Fast-path checklist for experienced forkers | +| [Customize & deploy](/guides/customize-and-deploy) | You've evaluated via the wizard and now want to fork, rename, and ship your own image | +| [Fork checklist (fast path)](/guides/fork-the-template) | Terser version of the above for experienced forkers | | [Add a custom skill](/guides/add-a-skill) | Your agent does new things and callers need to dispatch to them | | [Configure subagents](/guides/subagents) | You want specialized delegates beyond the placeholder `worker` | | [Wire Langfuse + Prometheus](/guides/observability) | You need traces and metrics in production | +| [Eval your fork](/guides/evals) | You want a baseline pass-rate for the tools / memory / A2A surface in your fork | +| [Schedule future work](/guides/scheduler) | You want the agent to defer tasks to itself ("remind me tomorrow", recurring sweeps) — local sqlite or Workstacean-backed | | [Deploy via GHCR](/guides/deploy) | You're ready to ship and want auto-deploy wired up | diff --git a/docs/guides/scheduler.md b/docs/guides/scheduler.md new file mode 100644 index 0000000..faaf45f --- /dev/null +++ b/docs/guides/scheduler.md @@ -0,0 +1,178 @@ +# Schedule future work + +protoAgent ships a scheduler so the agent can defer tasks to 
itself — +"remind me about X tomorrow", "every Monday morning summarize last +week's logs", "at 3pm check the deploy". Two backends ship by default; +the agent-facing tool surface is identical regardless of which one is +active. + +## When to read this + +- You want forks (or your own multiple agents) to support reminders, + recurring sweeps, or any "do this later" intent. +- You're running protoWorkstacean and want scheduled fires to flow + through the existing bus. +- You're spinning up multiple protoAgent instances on one box and + need scheduling state to stay isolated per agent. + +## The three tools + +When the scheduler is active, three tools land in `get_all_tools()`: + +| Tool | What it does | +|---|---| +| `schedule_task(prompt, when, job_id?)` | Persist a future invocation. `when` is cron (`"0 9 * * *"`) or ISO-8601 (`"2026-05-01T15:00:00"`). | +| `list_schedules()` | Show all jobs visible to *this* agent. | +| `cancel_schedule(job_id)` | Remove a job by id. | + +Prompts are self-contained — the agent has no memory of the +scheduling moment when the task fires, so write the prompt as a fresh +turn ("review last week's pipeline incidents and post a summary", +not "do that thing we discussed"). + +## Backend selection + +`server.py::_build_scheduler` picks at startup: + +1. `middleware.scheduler: false` in YAML → no scheduler. The three + tools don't ship. (Symmetric with `middleware.knowledge` / + `middleware.memory` — drawer/wizard editable.) +2. `SCHEDULER_DISABLED=1` env → no scheduler. Runtime escape hatch + for fleet operators who can't edit config. +3. `WORKSTACEAN_API_BASE` + `WORKSTACEAN_API_KEY` set → + **`WorkstaceanScheduler`**. +4. Otherwise → **`LocalScheduler`** (sqlite, asyncio polling). + +Both backends honor the same `SchedulerBackend` protocol; the agent +loop never knows which one is wired up. 
The scheduler is **default +on** — explicitly opt out via either config path above when a fork +wants a stateless agent with no scheduling surface. + +```bash +# Solo / local dev — falls through to LocalScheduler automatically. +python server.py + +# Workstacean install — set both env vars and restart. +export WORKSTACEAN_API_BASE=http://your-workstacean-host:3000 +export WORKSTACEAN_API_KEY=<your-api-key> +python server.py +``` + +> **protoLabs operators**: the fleet's Workstacean lives on the +> `ava` node; `WORKSTACEAN_API_KEY` is in the org's secrets manager +> under `secret-management → workstacean`. Coordinate with the team +> for the exact URL. + +## Multi-agent isolation + +Every job is namespaced by `AGENT_NAME` so spinning up +`gina-personal` alongside `gina-work` on the same box doesn't +cross-fire prompts. + +| Backend | How it isolates | +|---|---| +| Local | DB path per agent: `/sandbox/scheduler/<agent_name>/jobs.db` (falls back to `~/.protoagent/scheduler/<agent_name>/jobs.db`). Every row also carries `agent_name`; reads filter on it. | +| Workstacean | Job IDs are prefixed `<agent_name>-...`; topics are namespaced `cron.<agent_name>.<job_id>`. One Workstacean install can serve N forks safely. | + +If you supply your own `job_id` in `schedule_task`: + +- Local: the id is stored as-is. Two agents sharing one DB path with + the same user-supplied id will trip a primary-key collision (the + second add raises a clear error). To avoid it, let the scheduler + auto-generate (the auto-id is `<agent_name>-<suffix>`). +- Workstacean: the adapter prepends `<agent_name>-` if your id doesn't + already start with it, so cross-agent collisions are impossible. + +## Local backend — how firing works + +The local scheduler runs an asyncio polling task on FastAPI's +`startup` event. Once a second: + +1. Read jobs where `next_fire <= now()` and `enabled = 1`. +2. For each due job: POST to `http://127.0.0.1:<port>/a2a` as + a `message/send` with the job's prompt as the message text. Bearer + + X-API-Key are forwarded automatically. +3. 
One-shot ISO jobs are deleted after firing. Cron jobs reschedule + forward via `croniter`. + +Going through HTTP rather than calling into the graph directly buys +parity with real callers — the audit log, cost-v1 capture, and +push-notification path all behave identically. + +### Missed-fire recovery + +On startup, jobs whose `next_fire` is in the past are inspected: + +- **Within the last 24h** — fire on the next tick (so a 5-minute + outage doesn't lose an upcoming reminder). +- **Older than 24h** — cron jobs roll forward to the next slot + without firing; one-shot jobs are dropped. This matches + Workstacean's recovery behaviour and avoids flooding the agent + with stale prompts after a long downtime. + +### Persistence path + +```bash +# Default (Docker) +/sandbox/scheduler/<agent_name>/jobs.db + +# Local fallback (when /sandbox isn't writable) +~/.protoagent/scheduler/<agent_name>/jobs.db + +# Override +export SCHEDULER_DB_DIR=/var/data/agents +# → /var/data/agents/<agent_name>/jobs.db +``` + +Mount a volume at the configured path to survive container +restarts (analogous to `audit/` and `knowledge/`). + +## Workstacean backend — how firing works + +When `WORKSTACEAN_API_BASE` and `WORKSTACEAN_API_KEY` are set, the +adapter publishes to `POST {base}/publish` with topic +`command.schedule` and the action wrapper Workstacean expects. See +the [Workstacean scheduler reference](https://protolabsai.github.io/protoWorkstacean/reference/scheduler/) +for the payload shape. + +When the schedule fires, Workstacean publishes the inner payload to +`cron.<agent_name>.<job_id>`. **Workstacean does not natively dispatch +to A2A endpoints today** — your fork needs to wire a bridge that +subscribes to `cron.<agent_name>.*` and POSTs to the protoAgent's +`/a2a` endpoint. + +### Topic prefix override + +If your existing Workstacean bus uses a different convention: + +```bash +export WORKSTACEAN_TOPIC_PREFIX="myorg.cron.gina" +# → topics fire on myorg.cron.gina.<job_id> 
+``` + +### `list_schedules()` returns empty under Workstacean + +Workstacean's `list` action publishes its response on the +`schedule.list` topic — there's no synchronous reply on `/publish`. +The adapter intentionally doesn't subscribe. If you need live +introspection, query Workstacean directly or run the local backend. + +## Adding a case to your eval suite + +The default `evals/tasks.json` doesn't include scheduler cases (the +fire path is async — a single eval run can't easily test that the +scheduled prompt arrives). For forks that want it, the pattern is: + +1. `schedule_task(prompt, "")` in setup. +2. Wait > 1 second. +3. Assert on the audit log and/or KB state for the *fired* prompt's + side effects. + +Document the case as `category: "scheduler"` and gate at >= 2/3 +attempts to absorb timing jitter. + +## References + +- [Workstacean scheduler reference](https://protolabsai.github.io/protoWorkstacean/reference/scheduler/) +- [Configuration](/reference/configuration#scheduler) — env vars +- [Eval your fork](/guides/evals) — for the testing pattern above diff --git a/docs/guides/subagents.md b/docs/guides/subagents.md index f903995..031cff6 100644 --- a/docs/guides/subagents.md +++ b/docs/guides/subagents.md @@ -56,7 +56,11 @@ The template's `LangGraphConfig` (in `graph/config.py`) has a `worker` field. Ad class LangGraphConfig: # ... existing fields ... 
worker: SubagentDef = field(default_factory=lambda: SubagentDef( - tools=["echo", "current_time", "calculator", "web_search", "fetch_url"], + tools=[ + "current_time", "calculator", "web_search", "fetch_url", + "memory_ingest", "memory_recall", "memory_list", "memory_stats", + "daily_log", + ], max_turns=20, )) researcher: SubagentDef = field(default_factory=lambda: SubagentDef( @@ -86,7 +90,16 @@ for name in ("worker", "researcher"): # ← add new names subagents: worker: enabled: true - tools: [echo, current_time, calculator, web_search, fetch_url] + tools: + - current_time + - calculator + - web_search + - fetch_url + - memory_ingest + - memory_recall + - memory_list + - memory_stats + - daily_log max_turns: 20 researcher: enabled: true @@ -117,8 +130,8 @@ If your agent is simple enough that subagents are pure overhead, flip `include_s ```python _graph = create_agent_graph( _graph_config, - knowledge_store=None, - include_subagents=False, # ← skip the task() tool and subagent machinery + knowledge_store=knowledge_store, # keep the bundled store wired up + include_subagents=False, # ← skip the task() tool and subagent machinery ) ``` diff --git a/docs/index.md b/docs/index.md index d66540d..aaf8c0e 100644 --- a/docs/index.md +++ b/docs/index.md @@ -3,14 +3,14 @@ layout: home hero: name: protoAgent text: LangGraph + A2A template for protoLabs agents - tagline: Fork this repo. Rewrite SOUL.md, prompts, and tools. Ship. + tagline: Clone. Run. Walk the wizard. Chat. Fork when you're ready to ship. 
actions: - theme: brand text: Spin up your first agent link: /tutorials/first-agent - theme: alt - text: Reference - link: /reference/ + text: Customize & deploy + link: /guides/customize-and-deploy features: - icon: 🔌 diff --git a/docs/reference/configuration.md b/docs/reference/configuration.md index 3463dba..2913700 100644 --- a/docs/reference/configuration.md +++ b/docs/reference/configuration.md @@ -17,13 +17,23 @@ model: subagents: worker: enabled: true - tools: [echo, current_time, calculator, web_search, fetch_url] + tools: + - current_time + - calculator + - web_search + - fetch_url + - memory_ingest + - memory_recall + - memory_list + - memory_stats + - daily_log max_turns: 20 middleware: - knowledge: false + knowledge: true audit: true - memory: false + memory: true + scheduler: true knowledge: db_path: /sandbox/knowledge/agent.db @@ -59,9 +69,10 @@ Adding a new subagent name to the YAML requires matching entries in `graph/subag | Key | Default | What | |---|---|---| -| `knowledge` | `false` | Inject retrieved knowledge into state before LLM calls. Requires a knowledge store — leave off until you add one. | +| `knowledge` | `true` | Inject retrieved knowledge into state before LLM calls. Backed by the bundled `KnowledgeStore` (sqlite + FTS5). Set `false` for a stateless agent. | | `audit` | `true` | Append every tool call to `/sandbox/audit/audit.jsonl`. | -| `memory` | `false` | Memory middleware (experimental). Requires a knowledge store. | +| `memory` | `true` | Persist a session summary on terminal turn and asynchronously index conversation findings under `domain='finding'`. | +| `scheduler` | `true` | Wire the bundled scheduler backend (local sqlite, or `WorkstaceanScheduler` when env vars are set). Drops the `schedule_task` / `list_schedules` / `cancel_schedule` tools from the agent loop when `false`. 
Has the same effect as `SCHEDULER_DISABLED=1` — but `middleware.scheduler: false` is the canonical opt-out (drawer/wizard editable, survives restarts), while the env var is a runtime escape hatch for fleet operators who can't edit YAML in the moment. | ## `knowledge` @@ -69,8 +80,21 @@ Only read when `middleware.knowledge` is `true`. | Key | Default | What | |---|---|---| -| `db_path` | `/sandbox/knowledge/agent.db` | SQLite file path. | -| `embed_model` | `nomic-embed-text` | Embedding model. | +| `db_path` | `/sandbox/knowledge/agent.db` | SQLite file path. Falls back to `~/.protoagent/knowledge/agent.db` automatically when the configured path isn't writable (e.g. running locally without `/sandbox`). Override at runtime with `KNOWLEDGE_DB_PATH`. | +| `embed_model` | `nomic-embed-text` | Reserved for forks that bolt embeddings on top of the FTS5 baseline. The bundled store ignores it. | | `top_k` | `5` | Results per query fed into state. | -The template does not ship a knowledge store — the config keys are kept so a fork can flip the switch without rewiring every call site. +The bundled store is sqlite + FTS5 (with an automatic LIKE fallback when FTS5 isn't available). One `chunks` table; the `domain` column distinguishes operator-set notes (`memory_ingest`), daily-log entries (`daily_log`), and conversation findings extracted by `MemoryMiddleware` (`domain='finding'`). + +## Scheduler + +Scheduler **enable/disable** is YAML-controlled (`middleware.scheduler` above) so the drawer can flip it without a restart. Backend **selection and runtime knobs** (which backend, where to write the sqlite, where to publish, etc.) are env-driven so the same container image can run under either backend without a rebuild. See [Schedule future work](/guides/scheduler) for the full guide. 
+ +| Env var | Default | What | +|---|---|---| +| `WORKSTACEAN_API_BASE` | unset | When set together with `WORKSTACEAN_API_KEY`, swaps the bundled local scheduler for the `WorkstaceanScheduler` HTTP adapter. | +| `WORKSTACEAN_API_KEY` | unset | Auth token sent as `X-API-Key` to Workstacean's `/publish`. | +| `WORKSTACEAN_TOPIC_PREFIX` | `cron.` | Override the bus topic the adapter fires on, when your Workstacean install uses a different convention. | +| `SCHEDULER_DB_DIR` | `/sandbox/scheduler` | Local backend: parent directory for `/jobs.db`. Falls back to `~/.protoagent/scheduler//jobs.db` when unwritable. | +| `SCHEDULER_INVOKE_URL` | `http://127.0.0.1:` | Local backend: where to POST `message/send` when a job fires. Override only if the agent's A2A endpoint isn't on localhost. | +| `SCHEDULER_DISABLED` | unset | Runtime escape hatch — set to `1` / `true` to drop the scheduler tools entirely without editing YAML. `middleware.scheduler: false` is the canonical opt-out. | diff --git a/docs/reference/environment-variables.md b/docs/reference/environment-variables.md index e27d074..d74ea6b 100644 --- a/docs/reference/environment-variables.md +++ b/docs/reference/environment-variables.md @@ -36,6 +36,37 @@ Session memory is enabled by default. See [architecture § Session memory](/expl To persist memory across container restarts, mount a volume at whatever `MEMORY_PATH` resolves to. Without a volume the directory is ephemeral. +## Knowledge store + +The bundled `KnowledgeStore` (sqlite + FTS5) is enabled by default. See [Configuration § knowledge](/reference/configuration#knowledge) for the full guide. + +| Variable | Default | What | +|---|---|---| +| `KNOWLEDGE_DB_PATH` | (unset — uses YAML `knowledge.db_path`) | Runtime override for the sqlite path. Falls back to `~/.protoagent/knowledge/agent.db` when the resolved path is unwritable (e.g. running locally without `/sandbox`). | + +To opt out entirely, set `middleware.knowledge: false` in YAML. 
The memory tools (`memory_ingest`, `memory_recall`, etc.) are dropped from the agent loop when the store is disabled. + +## Audit log + +| Variable | Default | What | +|---|---|---| +| `AUDIT_PATH` | `/sandbox/audit/audit.jsonl` | Directory + filename of the JSONL audit log written by `AuditMiddleware`. Read by `evals/verify.py` for side-effect assertions. | + +## Scheduler + +The bundled scheduler is enabled by default. See [Schedule future work](/guides/scheduler) and [Configuration § scheduler](/reference/configuration#scheduler) for the full guide. **Backend selection** is env-driven; **enable/disable** lives in YAML (`middleware.scheduler`) so the drawer can toggle without a restart. + +| Variable | Default | What | +|---|---|---| +| `WORKSTACEAN_API_BASE` | (unset) | When set together with `WORKSTACEAN_API_KEY`, swaps the bundled `LocalScheduler` for the `WorkstaceanScheduler` HTTP adapter. | +| `WORKSTACEAN_API_KEY` | (unset) | Auth token sent as `X-API-Key` to Workstacean's `/publish`. | +| `WORKSTACEAN_TOPIC_PREFIX` | `cron.` | Override the bus topic the adapter fires on, when your Workstacean install uses a different convention. | +| `SCHEDULER_DB_DIR` | `/sandbox/scheduler` | Local backend: parent directory for `/jobs.db`. Falls back to `~/.protoagent/scheduler//jobs.db` when unwritable. | +| `SCHEDULER_INVOKE_URL` | `http://127.0.0.1:` | Local backend: where to POST `message/send` when a job fires. Override only if the agent's A2A endpoint isn't on localhost. | +| `SCHEDULER_DISABLED` | (unset) | Runtime escape hatch — set to `1` / `true` to drop the scheduler tools entirely without editing YAML. `middleware.scheduler: false` is the canonical opt-out. | + +> **protoLabs operators**: the fleet's Workstacean lives on the `ava` node. `WORKSTACEAN_API_KEY` is in the org's secrets manager under `secret-management → workstacean`. 
+ ## Tracing (optional) | Variable | What | diff --git a/docs/reference/starter-tools.md b/docs/reference/starter-tools.md index e47f25e..9ef37aa 100644 --- a/docs/reference/starter-tools.md +++ b/docs/reference/starter-tools.md @@ -1,15 +1,12 @@ # Starter tools -Five free, keyless tools ship in `tools/lg_tools.py`. They exist so a fresh template clone can demonstrate real behaviour immediately. Keep them, drop them, or swap them — `get_all_tools()` is the registry. +Twelve tools ship by default: -## `echo` +- Four keyless general-purpose tools — `current_time`, `calculator`, `web_search`, `fetch_url` — that work without any state. +- Five **memory tools** — `memory_ingest`, `memory_recall`, `memory_list`, `memory_stats`, `daily_log` — bound to the bundled `KnowledgeStore` (sqlite + FTS5, see [Configuration](/reference/configuration#knowledge)). +- Three **scheduler tools** — `schedule_task`, `list_schedules`, `cancel_schedule` — bound to the bundled scheduler backend (local sqlite or the Workstacean adapter, see [Schedule future work](/guides/scheduler)). -```python -@tool -async def echo(message: str) -> str -``` - -Returns `"echo: "`. The template-only sanity tool. Safe to delete once your real tools are wired. +`get_all_tools(knowledge_store, scheduler)` is the registry. When `knowledge_store` is `None` the memory tools are omitted; when `scheduler` is `None` the scheduler tools are omitted. Both backends are constructed by default in `server.py`; opt out via `middleware.knowledge: false` / `middleware.scheduler: false` in `config/langgraph-config.yaml`. ## `current_time` @@ -106,6 +103,103 @@ Example Domain This domain is for use in documentation examples... ``` +## `memory_ingest` + +```python +@tool +async def memory_ingest(content: str, domain: str = "general", heading: str | None = None) -> str +``` + +Stores a chunk in the bundled `KnowledgeStore`. 
Use for things the operator wants you to remember across sessions — preferences, environment facts, decisions worth recalling later. + +`domain` is a logical bucket (`"preferences"`, `"context"`, `"general"`, …). `heading` is an optional short label that doubles as a stable de-dupe key. + +Returns `"Stored chunk 17 in 'preferences'."` on success, an error string when the store is unavailable. + +## `memory_recall` + +```python +@tool +async def memory_recall(query: str, k: int = 5) -> str +``` + +Top-k keyword search over the store via FTS5 (LIKE fallback). Returns one match per line: + +``` +[preferences] coffee: Operator's preferred coffee is a Gibraltar with oat milk. +[context] lab: Primary lab is Snickerdoodle in Spokane. +``` + +Returns `"No matches."` when nothing scores above the keyword threshold. + +## `memory_list` + +```python +@tool +async def memory_list(domain: str | None = None, limit: int = 10) -> str +``` + +Most-recent-first listing of stored chunks. Filter by domain when given. Useful for "what did I log today?" style queries. + +## `memory_stats` + +```python +@tool +async def memory_stats() -> str +``` + +Per-domain chunk counts plus a total. Useful for sanity-checking that ingest landed. + +## `daily_log` + +```python +@tool +async def daily_log(content: str) -> str +``` + +Convenience wrapper around `memory_ingest` that writes to `domain='daily-log'` with today's UTC date as the heading. Same-day entries cluster under the same heading for `memory_list(domain='daily-log')`. + +## `schedule_task` + +```python +@tool +async def schedule_task(prompt: str, when: str, job_id: str | None = None) -> str +``` + +Persist a future invocation. The agent receives `prompt` as a fresh turn when the schedule fires. + +`when` is either a 5-field cron expression (`"0 9 * * 1-5"` = every weekday at 9am) or an ISO-8601 datetime (`"2026-05-01T15:00:00"` = once at 3pm UTC on May 1). Backends auto-detect. 
+ +`job_id` is optional — auto-generated as `-` when omitted. You'll need it later for `cancel_schedule`. + +Output: `"Scheduled job next at ."` on success. Returns `"Error: ..."` on malformed `when` or backend failure. + +Prompts are self-contained — the agent has no memory of the scheduling moment when the task fires, so write the prompt as a fresh turn ("review last week's pipeline incidents and post a summary"), not a reference ("do that thing we discussed"). + +## `list_schedules` + +```python +@tool +async def list_schedules() -> str +``` + +List the current scheduled jobs for *this* agent. Multi-agent isolation: each agent only sees jobs it created. + +Output: one job per line with id, next-fire timestamp, schedule, and prompt preview. Returns `"No scheduled jobs."` when empty. + +The Workstacean adapter intentionally returns `[]` (Workstacean owns scheduling state and its `list` action publishes asynchronously to a topic). Run the local backend or query Workstacean directly for live introspection there. + +## `cancel_schedule` + +```python +@tool +async def cancel_schedule(job_id: str) -> str +``` + +Cancel a scheduled job by id. Returns `"Canceled ."` or `"Error: no such job ."`. + +Cross-agent cancellation is blocked — `gina-personal` cannot cancel `gina-work`'s jobs even when sharing a sqlite path or a Workstacean install. 
+ ## Adding your own Follow the same pattern: @@ -128,11 +222,16 @@ async def my_tool(required_arg: str, optional_arg: int = 5) -> str: return f"Success: {result}" ``` -Then append it to the list in `get_all_tools()`: +Then append it to the keyless tool list in `get_all_tools()` — keep the two conditional extensions below it so the bundled memory + scheduler tools still ship when their backends are configured: ```python -def get_all_tools(knowledge_store=None): - return [echo, current_time, calculator, web_search, fetch_url, my_tool] +def get_all_tools(knowledge_store=None, scheduler=None): + tools = [current_time, calculator, web_search, fetch_url, my_tool] + if knowledge_store is not None: + tools.extend(_build_memory_tools(knowledge_store)) + if scheduler is not None: + tools.extend(_build_scheduler_tools(scheduler)) + return tools ``` See [Write your first tool](/tutorials/first-tool) for the full walkthrough. @@ -140,4 +239,6 @@ See [Write your first tool](/tutorials/first-tool) for the full walkthrough. ## Related - [Configure subagents](/guides/subagents) — tools are allowlisted per subagent -- [Environment variables](/reference/environment-variables) — SSRF allowlist vars affect `fetch_url` +- [Environment variables](/reference/environment-variables) — SSRF allowlist vars affect `fetch_url`; scheduler backend selection lives there too +- [Eval your fork](/guides/evals) — the eval harness exercises every tool listed here end-to-end +- [Schedule future work](/guides/scheduler) — the firing model + multi-agent isolation story behind the scheduler tools diff --git a/docs/tutorials/first-agent.md b/docs/tutorials/first-agent.md index 4fc9a16..e58ad76 100644 --- a/docs/tutorials/first-agent.md +++ b/docs/tutorials/first-agent.md @@ -1,99 +1,80 @@ # Spin up your first agent -This walks you from "I clicked Use this template" to "I have a running agent answering a web-search query". About 15 minutes, assuming Docker and a LiteLLM gateway are already set up. 
+About 5 minutes. You need Python 3.11+ and an OpenAI-compatible API key (OpenAI direct, LiteLLM gateway, Anthropic-via-gateway, Ollama, anything that speaks the OpenAI REST shape). -## What you'll need +No forking, no `sed`, no Docker for your first run. That's all in [Customize & deploy](/guides/customize-and-deploy) once you've decided this template works for you. -- A GitHub account with access to `protoLabsAI` (or your own org — the workflows gate on the repo owner; see step 7) -- Docker -- A LiteLLM gateway running somewhere reachable (the template points at `http://gateway:4000/v1`) -- A model alias in that gateway. The template's default is `protolabs/agent` — either add that alias or retarget `model.name` in step 4 - -## 1. Use the template - -From GitHub, click **Use this template → Create a new repository** on [protoLabsAI/protoAgent](https://github.com/protoLabsAI/protoAgent). Pick a short slug like `jon` or `echo-agent` — it will end up as the image name, metric prefix, Langfuse tag, and more. - -Or from the CLI: +## 1. Get the code ```bash -gh repo create protoLabsAI/my-agent \ - --template protoLabsAI/protoAgent \ - --public --clone - +git clone https://github.com/protoLabsAI/protoAgent.git my-agent cd my-agent ``` -## 2. Rename the agent - -The template uses `protoagent` as the placeholder throughout. Do a pass: +## 2. Install dependencies ```bash -git grep -li protoagent | xargs sed -i 's/protoagent/my-agent/g' -git grep -li protoAgent | xargs sed -i 's/protoAgent/MyAgent/g' +python -m venv .venv +source .venv/bin/activate +pip install -r requirements.txt ``` -Review the diff before committing — the replacement hits Dockerfile paths (`/opt/protoagent` → `/opt/my-agent`), the GHCR image path, workflow repo guards, and the Gradio UI branding. All of those want the new name. +## 3. Run the server -## 3. Rewrite identity - -Three files carry the agent's identity. Edit each one: - -- `config/SOUL.md` — the persona doc loaded at session start. 
See the placeholder file itself for guidance. -- `graph/prompts.py` — the system prompt for the lead agent + subagents. -- `server.py::_build_agent_card` — the agent card served at `/.well-known/agent-card.json`. At minimum, fix `name` and `description`; revisit `skills` once you have real tools. - -## 4. Point at a model +```bash +python server.py +``` -Edit `config/langgraph-config.yaml`: +You should see: -```yaml -model: - name: protolabs/my-agent # or openai/gpt-4o, anthropic/claude-opus-4-6, etc. - api_base: http://gateway:4000/v1 +``` +LangGraph agent initialized (setup wizard not complete — graph not compiled. Open the UI to finish setup.) +Starting protoagent on http://0.0.0.0:7870 ``` -If you're using a gateway alias (recommended), make sure the alias is registered there before booting — swapping models later becomes a gateway edit instead of a code change. +## 4. Open the setup wizard -## 5. Build and run +Visit http://localhost:7870 in a browser. Because `config/.setup-complete` doesn't exist yet, you'll land in the wizard instead of the chat UI. -```bash -docker build -t my-agent:local . -docker run --rm -p 7870:7870 \ - -e AGENT_NAME=my-agent \ - -e OPENAI_API_KEY="$LITELLM_MASTER_KEY" \ - my-agent:local -``` +Walk through the four steps: -## 6. Verify the agent is up +1. **Connect to your model.** Paste your API base URL (`https://api.openai.com/v1` for OpenAI direct, `http://localhost:4000/v1` for a local LiteLLM gateway) and API key. Click **Test connection & fetch models** — the dropdown fills with whatever the endpoint actually exposes. Pick one. +2. **Name your agent.** Short lowercase slug (e.g. `product-director`). Pick a persona preset — **Generic Assistant** is the safe default; **Research** / **Coding** / **Blank** are the alternatives — and click **Load preset into SOUL.md**. Edit the loaded text if you want to make it specific to your agent. +3. 
**Tools & middleware.** All twelve starter tools are enabled by default — four keyless general (`current_time`, `calculator`, `web_search`, `fetch_url`), five memory (`memory_ingest`, `memory_recall`, `memory_list`, `memory_stats`, `daily_log`), and three scheduler (`schedule_task`, `list_schedules`, `cancel_schedule`). Leave **Audit**, **Memory**, **Knowledge**, and **Scheduler** middleware on — the template ships a working sqlite + FTS5 store under `/sandbox/knowledge/agent.db` and a sqlite-backed scheduler under `/sandbox/scheduler/<agent>/jobs.db`, both with `~/.protoagent/...` fallbacks outside Docker. +4. **Optional — you, security, autostart.** Your name makes the agent address you directly. Leave the A2A auth token blank for local dev; set it before you expose the port. "Launch this agent automatically on login" installs a macOS LaunchAgent so the server is up after every reboot without remembering to `python server.py`. -In another terminal: +Hit **Launch agent**. The wizard closes, the chat UI appears, and the Configuration drawer on the right is now populated with your choices. -```bash -curl http://localhost:7870/.well-known/agent-card.json | jq .name -# → "my-agent" +## 5. Try it -curl http://localhost:7870/metrics | grep my_agent_active_sessions -# → my_agent_active_sessions 0 -``` - -Hit `http://localhost:7870` in a browser to get the Gradio chat UI. Ask it: +In the chat box: > What time is it in Tokyo? -If the starter tools are wired correctly, it should call `current_time`, return an ISO-8601 timestamp with the timezone offset, and explain what it found. +The agent calls `current_time`, returns an ISO-8601 timestamp, and explains what it found. Then: > Find three recent articles about the A2A protocol and summarize them. -The agent will call `web_search`, then `fetch_url` for each of the top results, and return a summary. That round-trip exercises the full tool loop + LLM call + streaming response path. 
+The agent calls `web_search`, then `fetch_url` on the top results, and hands back a synthesis. That round-trip exercises the full tool loop + LLM call + streaming response path. + +## What just happened + +- Your answers were written to `config/langgraph-config.yaml` (human-readable — peek at it). +- The persona preset was written to `config/SOUL.md`. +- A `config/.setup-complete` marker was created so the next boot goes straight to chat. +- The agent card at http://localhost:7870/.well-known/agent-card.json now reflects your agent name. +- If you checked autostart, `~/Library/LaunchAgents/ai.protolabs.<agent>.plist` was installed and `launchctl load`-ed. -## 7. Un-freeze the release pipeline +## Changing your mind -The three release workflows (`docker-publish.yml`, `prepare-release.yml`, `release.yml`) all gate on `github.repository == 'protoLabsAI/protoAgent'`. Change that check in each file to match your repo's owner/repo before merging anything to `main`, or the release automation won't fire. +- **Any field** — open the Configuration drawer on the right side of the chat UI. Every wizard field is there, plus a few advanced ones (temperature, max_tokens, max_iterations, knowledge store settings). +- **The whole wizard** — expand the drawer's "Re-run setup wizard" accordion and click **Run wizard now**. Your current values pre-fill every step. +- **Autostart** — toggle it off in the wizard or the drawer; the LaunchAgent is removed and the plist file deleted. 
## Where to go next - [Write your first tool](/tutorials/first-tool) — wire a custom LangChain tool into the loop +- [Customize & deploy](/guides/customize-and-deploy) — fork the template, rename throughout, ship a GHCR image - [Add a custom skill](/guides/add-a-skill) — expose the new behaviour on the A2A agent card -- [Deploy via GHCR](/guides/deploy) — get Watchtower auto-deploying your merges diff --git a/docs/tutorials/first-tool.md b/docs/tutorials/first-tool.md index 9f10251..af3b767 100644 --- a/docs/tutorials/first-tool.md +++ b/docs/tutorials/first-tool.md @@ -37,7 +37,6 @@ Then register it in `get_all_tools()` at the bottom of the same file: ```python def get_all_tools(knowledge_store=None): return [ - echo, current_time, calculator, web_search, @@ -48,12 +47,17 @@ def get_all_tools(knowledge_store=None): ## 2. Allow the subagent to use it (optional) -If you want the worker subagent to be able to call `git_sha`, add it to the allowlist in `graph/subagents/config.py`: +If you want the worker subagent to be able to call `git_sha`, add it to the allowlist in `graph/subagents/config.py`. Append rather than replace — dropping the bundled defaults removes the worker's memory tools: ```python WORKER_CONFIG = SubagentConfig( # ... - tools=["echo", "current_time", "calculator", "web_search", "fetch_url", "git_sha"], + tools=[ + "current_time", "calculator", "web_search", "fetch_url", + "memory_ingest", "memory_recall", "memory_list", "memory_stats", + "daily_log", + "git_sha", # ← new + ], # ... 
) ``` @@ -104,5 +108,5 @@ The template runs tests via `pytest` with `pytest-asyncio` in auto mode — no e ## Where to go next - [Add a custom skill](/guides/add-a-skill) — advertise new capabilities on the agent card so A2A callers can find them -- [Starter tools reference](/reference/starter-tools) — the shapes of the five tools that ship +- [Starter tools reference](/reference/starter-tools) — the shapes of all twelve tools that ship by default - [Configure subagents](/guides/subagents) — add specialized delegates beyond the placeholder `worker` diff --git a/evals/README.md b/evals/README.md new file mode 100644 index 0000000..c8aff77 --- /dev/null +++ b/evals/README.md @@ -0,0 +1,100 @@ +# Evals + +Side-effect-verified eval harness. Each case sends a prompt over A2A +to a running agent and asserts on three independent channels: + +1. **Audit log** — every expected tool name fires with the expected + outcome (`AuditMiddleware` writes JSONL to `/sandbox/audit/audit.jsonl`). +2. **Reply text** — case-insensitive substring patterns appear in the + model's final reply. +3. **Knowledge store side effects** — the right rows actually land in + the `chunks` table after a memory-writing turn. + +A case passes only when every configured assertion holds. + +## Quickstart + +```bash +# Agent must be running at $EVAL_BASE_URL (default http://localhost:7870). +# Auth: set $A2A_AUTH_TOKEN if bearer is configured, $<AGENT>_API_KEY +# (or $EVAL_API_KEY) if X-API-Key auth is configured. Both are sent +# when both env vars exist. + +python -m evals.runner # all cases +python -m evals.runner --category tool # one category +python -m evals.runner --tasks current_time_intent,daily_log_intent +python -m evals.runner --base-url http://host:7870 +``` + +Reports land in `evals/results/run-<timestamp>.json` per run. 
+ +## Categories + +| Category | What it covers | +|---|---| +| `a2a-protocol` | Agent card discovery, auth gating | +| `simple` | Direct LLM answers, no tool use | +| `abstention` | Don't reach for a tool when training data is enough | +| `tool` | Single-tool invocations across the starter set | +| `chained` | Multi-step reasoning that calls 2+ tools | +| `subsystem` | KnowledgeMiddleware retrieval, hot-memory injection | + +## File layout + +``` +evals/ + client.py A2A client (message/send + poll, message/stream, agent card, cancel) + runner.py CLI runner — print board, write JSON report + verify.py Audit-log + KB side-effect assertions, setup/teardown + tasks.json Cases — 15 covering the starter tools end-to-end + results/ Per-run reports +``` + +## Adding a case + +Append to `tasks.json`: + +```json +{ + "id": "unique-id", + "category": "tool", + "kind": "ask", + "name": "Human-readable description", + "prompt": "What you ask the agent (in real-user voice — never name the tool)", + "expected_tools": ["tool_name"], + "expected_patterns": ["substring-that-must-appear"], + "verify_kb": { + "find_chunk_containing": "EVAL-MARK-A1B2", + "domain": "context" + }, + "setup": [ + {"kb_ingest": {"content": "...", "domain": "context", "heading": "..."}} + ], + "teardown": [ + {"kb_delete_by_content": {"contains": "EVAL-MARK-A1B2"}} + ] +} +``` + +Use **unique markers** (`EVAL-MARK-XYZ`, `eval-chain-flag-q9`) in +prompts whenever you need a verifier to disambiguate from real +operator data. + +## Why side-effect verification + +When the model hallucinates a tool result (e.g. "Logged: ..." without +actually calling `daily_log`), text-only checks pass while the DB +stays empty. The audit-log + KB queries here catch it. + +## Prompt rule + +Every prompt must be plausibly typed by a real user. 
**The tool name +never appears.** If the agent has to infer the tool from intent, that +*is* the test — leaking the tool name into the prompt is testing +instruction-following, not tool selection. + +## References + +- Anthropic — [Demystifying evals for AI agents](https://www.anthropic.com/engineering/demystifying-evals-for-ai-agents) +- BFCL V3 — [Multi-Turn](https://gorilla.cs.berkeley.edu/blogs/13_bfcl_v3_multi_turn.html) +- [ToolSandbox](https://arxiv.org/html/2408.04682v1) diff --git a/evals/__init__.py b/evals/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/evals/client.py b/evals/client.py new file mode 100644 index 0000000..51eadfe --- /dev/null +++ b/evals/client.py @@ -0,0 +1,262 @@ +"""A2A client for the eval runner. + +Drives the running agent over the same JSON-RPC + SSE surface that +real A2A callers use: + +- ``agent_card()`` — GET ``/.well-known/agent-card.json`` +- ``ask()`` — ``message/send`` + ``tasks/get`` poll +- ``stream()`` — ``message/stream`` SSE +- ``cancel()`` — ``tasks/cancel`` + +Returns structured ``TaskResult`` objects the runner asserts against. + +Auth picks up both surfaces the template exposes (see ``server.py``): + +- ``Authorization: Bearer `` — wizard-set / ``A2A_AUTH_TOKEN`` env +- ``X-API-Key: `` — legacy, ``_API_KEY`` env + +Both headers are sent when the corresponding env var is set; the +running agent enforces whichever it is configured for. 
def _resolve_auth_env() -> tuple[str, str]:
    """Return ``(bearer_token, api_key)`` read from the environment.

    The bearer token comes from ``A2A_AUTH_TOKEN``.

    The API key is resolved in this order, first non-empty value wins:

    1. ``EVAL_API_KEY`` — explicit override, so CI does not need to know
       the agent's slug.
    2. ``<AGENT>_API_KEY`` where ``<AGENT>`` is ``AGENT_NAME`` upper-cased
       (``quinn`` → ``QUINN_API_KEY``). ``AGENT_NAME`` defaults to
       ``protoagent`` when unset *or empty*.
    3. The same name with hyphens turned into underscores
       (``my-agent`` → ``MY_AGENT_API_KEY``), because a hyphenated name
       cannot be exported from a POSIX shell.

    Either element is ``""`` when nothing is configured.
    """
    bearer = os.environ.get("A2A_AUTH_TOKEN", "")

    api_key = os.environ.get("EVAL_API_KEY", "")
    if api_key:
        return bearer, api_key

    # ``or`` instead of a ``get`` default: an exported-but-empty AGENT_NAME
    # would otherwise resolve to the meaningless variable ``_API_KEY``.
    agent = (os.environ.get("AGENT_NAME") or "protoagent").upper()

    # Exact spelling first (backward compatible), then the shell-safe one.
    # ``dict.fromkeys`` de-duplicates while keeping that order.
    for prefix in dict.fromkeys((agent, agent.replace("-", "_"))):
        api_key = os.environ.get(f"{prefix}_API_KEY", "")
        if api_key:
            break
    return bearer, api_key
# ``AgentClient`` methods: ``message/send`` + poll, and ``message/stream`` (SSE).


async def ask(self, prompt: str, *, timeout_s: int = 90) -> TaskResult:
    """Send ``prompt`` via ``message/send`` and poll ``tasks/get`` until terminal.

    Args:
        prompt: User text, sent as a single ``text`` part.
        timeout_s: Overall budget in seconds for the task to reach a
            terminal state. Each individual HTTP call keeps its own 30 s
            client timeout.

    Returns:
        A ``TaskResult`` whose ``state`` is ``completed`` / ``failed`` /
        ``canceled`` as reported by the agent; ``failed`` when
        ``message/send`` answers with a JSON-RPC error or without a task
        id; or ``timeout`` when the budget runs out. On ``timeout`` the
        most recent JSON-RPC error returned by ``tasks/get`` (if any) is
        carried in ``error`` so the real cause is not hidden.

    Raises:
        httpx.HTTPStatusError: propagated from ``raise_for_status()`` when
            a request comes back with an HTTP error status.
    """
    mid = str(uuid.uuid4())
    payload = {
        "jsonrpc": "2.0",
        "id": mid,
        "method": "message/send",
        "params": {
            "message": {
                "role": "user",
                "parts": [{"kind": "text", "text": prompt}],
                "messageId": mid,
            }
        },
    }
    # Monotonic clock: the deadline and the reported duration must not
    # shift if the wall clock is adjusted while a case is running.
    start = time.monotonic()

    def elapsed_ms() -> int:
        return int((time.monotonic() - start) * 1000)

    async with httpx.AsyncClient(timeout=30) as client:
        r = await client.post(f"{self.base_url}/a2a", headers=self.headers, json=payload)
        r.raise_for_status()
        resp = r.json()
        if "error" in resp:
            return TaskResult(
                task_id="", state="failed", error=str(resp["error"]),
                duration_ms=elapsed_ms(),
            )
        task_id = (resp.get("result") or {}).get("id", "")
        if not task_id:
            # Without an id every ``tasks/get`` below would be meaningless;
            # fail now instead of burning the whole timeout.
            return TaskResult(
                task_id="", state="failed",
                error="message/send response carried no task id",
                duration_ms=elapsed_ms(),
            )

        last_poll_error: str | None = None
        deadline = start + timeout_s
        while time.monotonic() < deadline:
            await asyncio.sleep(1.5)
            poll = await client.post(
                f"{self.base_url}/a2a",
                headers=self.headers,
                json={
                    "jsonrpc": "2.0",
                    "id": "p",
                    "method": "tasks/get",
                    "params": {"id": task_id},
                },
            )
            poll.raise_for_status()
            body = poll.json()
            if body.get("error") is not None:
                # Keep polling (the error may be transient) but remember
                # it so a timeout can report why.
                last_poll_error = str(body["error"])
                continue
            res = body.get("result") or {}
            state = (res.get("status") or {}).get("state", "")
            if state in ("completed", "failed", "canceled"):
                text, usage = _extract(res)
                return TaskResult(
                    task_id=task_id,
                    state=state,
                    text=text,
                    artifacts=res.get("artifacts") or [],
                    usage=usage,
                    duration_ms=elapsed_ms(),
                )
        return TaskResult(
            task_id=task_id, state="timeout",
            error=last_poll_error,
            duration_ms=elapsed_ms(),
        )


# ── message/stream (SSE) ────────────────────────────────────────────────


async def stream(self, prompt: str, *, timeout_s: int = 90) -> tuple[list[dict], TaskResult | None]:
    """Stream one turn over SSE. Returns ``(event_log, final TaskResult)``.

    Each logged event is a dict shaped ``{kind, result}``; ``data:`` lines
    that are not a JSON object are logged as ``{"kind": "raw", "raw": ...}``
    and a JSON-RPC error envelope as ``{"kind": "error", "result": {"error": ...}}``.
    Use this to assert on the streaming protocol itself (status-update
    sequence, final flag, artifact chunks). Most cases should use
    ``ask()`` instead.

    The second element is ``None`` when the stream ends without a
    ``final`` event; otherwise a ``TaskResult`` whose state is the one the
    final event reports, ``failed`` (HTTP status >= 400 or JSON-RPC
    error), or ``timeout`` (``timeout_s`` elapsed while events were still
    arriving).
    """
    mid = str(uuid.uuid4())
    payload = {
        "jsonrpc": "2.0",
        "id": mid,
        "method": "message/stream",
        "params": {
            "message": {
                "role": "user",
                "parts": [{"kind": "text", "text": prompt}],
                "messageId": mid,
            }
        },
    }
    events: list[dict] = []
    final: TaskResult | None = None
    start = time.monotonic()
    deadline = start + timeout_s

    def elapsed_ms() -> int:
        return int((time.monotonic() - start) * 1000)

    async with httpx.AsyncClient(timeout=timeout_s) as client:
        async with client.stream(
            "POST", f"{self.base_url}/a2a", headers=self.headers, json=payload
        ) as r:
            if r.status_code >= 400:
                body = await r.aread()
                return events, TaskResult(
                    task_id="", state="failed",
                    # ``replace``: an undecodable error page must not turn
                    # into a UnicodeDecodeError that hides the HTTP status.
                    error=f"HTTP {r.status_code}: {body.decode(errors='replace')[:300]}",
                )
            async for line in r.aiter_lines():
                # The httpx timeout applies to each read, not to the whole
                # response, so a stream that keeps emitting keep-alives
                # needs an explicit overall deadline.
                if time.monotonic() >= deadline:
                    final = TaskResult(
                        task_id="", state="timeout", duration_ms=elapsed_ms(),
                    )
                    break
                # Blank separators, ``:`` comments and non-data fields
                # (``event:``, ``id:``) carry nothing to record.
                if not line.startswith("data:"):
                    continue
                raw = line[5:].strip()
                if not raw:
                    continue
                try:
                    data = json.loads(raw)
                except json.JSONDecodeError:
                    events.append({"kind": "raw", "raw": raw})
                    continue
                if not isinstance(data, dict):
                    events.append({"kind": "raw", "raw": raw})
                    continue
                if data.get("error") is not None:
                    # Surface the JSON-RPC error instead of logging an
                    # anonymous ``?`` event and ending with no final result.
                    events.append({"kind": "error", "result": {"error": data["error"]}})
                    final = TaskResult(
                        task_id="", state="failed",
                        error=str(data["error"]),
                        duration_ms=elapsed_ms(),
                    )
                    break
                result = data.get("result") or {}
                kind = result.get("kind", "?")
                events.append({"kind": kind, "result": result})
                if kind in ("status-update", "task") and result.get("final"):
                    text, usage = _extract(result)
                    final = TaskResult(
                        task_id=result.get("taskId") or result.get("id", ""),
                        state=(result.get("status") or {}).get("state", "unknown"),
                        text=text,
                        usage=usage,
                        duration_ms=elapsed_ms(),
                    )
                    break
    return events, final
def _extract(result: dict) -> tuple[str, dict]:
    """Pull the reply text and usage/cost data out of an A2A result envelope.

    Text is collected from every non-empty ``text`` part under
    ``result["artifacts"]`` first, then from
    ``result["status"]["message"]``; the pieces are joined with newlines
    and the whole string is stripped.

    Usage comes from ``data`` parts of the artifacts: the last ``usage``
    mapping wins, and a ``durationMs`` key found next to it is folded in
    as ``usage["durationMs"]``. A ``durationMs`` reported by an earlier
    part is kept when a later ``usage`` mapping does not carry its own.

    Missing or ``null`` containers (``artifacts``, ``parts``, ``status``,
    ``message``, ``usage``) and parts that are not JSON objects are
    skipped rather than raising.

    Returns:
        ``(text, usage)`` — ``("", {})`` for an empty envelope.
    """

    def parts_of(container: object) -> list[dict]:
        # ``"parts": null`` and absent ``parts`` are treated alike.
        raw = container.get("parts") if isinstance(container, dict) else None
        if not isinstance(raw, (list, tuple)):
            return []
        return [part for part in raw if isinstance(part, dict)]

    text_parts: list[str] = []
    usage: dict = {}
    explicit_duration = None  # last ``durationMs`` seen beside a data part

    for artifact in result.get("artifacts") or []:
        for part in parts_of(artifact):
            kind = part.get("kind")
            if kind == "text":
                if part.get("text"):
                    text_parts.append(part["text"])
                continue
            data = part.get("data")
            if kind != "data" or not isinstance(data, dict):
                continue
            reported = data.get("usage")
            if isinstance(reported, dict):
                # Copy so the caller's envelope is never mutated below.
                usage = dict(reported)
                if explicit_duration is not None:
                    usage.setdefault("durationMs", explicit_duration)
            if "durationMs" in data:
                explicit_duration = data["durationMs"]
                usage["durationMs"] = explicit_duration

    status = result.get("status")
    message = status.get("message") if isinstance(status, dict) else None
    for part in parts_of(message):
        if part.get("kind") == "text" and part.get("text"):
            text_parts.append(part["text"])

    return "\n".join(text_parts).strip(), usage
Each case picks one of three +``kind`` runners: + +- ``agent_card`` — fetch ``/.well-known/agent-card.json`` and assert + on the returned card shape. +- ``auth_check`` — send a request with a known-bad bearer token and + assert the expected HTTP status. +- ``ask`` — send a prompt over A2A, optionally pre-seed the KB, then + assert against three independent channels: audit-log tool firing, + reply-text patterns, and KB side effects. + +A case passes only when all assertions hold. The ``detail`` column in +the pass/fail board names the missing assertion when one fails. +""" + +from __future__ import annotations + +import argparse +import asyncio +import json +import sys +import time +from dataclasses import asdict, dataclass, field +from datetime import UTC, datetime +from pathlib import Path + +# Allow ``python -m evals.runner`` and ``python evals/runner.py``. +_PROJECT_ROOT = Path(__file__).resolve().parent.parent +if str(_PROJECT_ROOT) not in sys.path: + sys.path.insert(0, str(_PROJECT_ROOT)) + +from evals.client import AgentClient, TaskResult +from evals import verify + + +@dataclass +class CaseResult: + id: str + category: str + name: str + passed: bool + detail: str + duration_ms: int = 0 + tokens: int = 0 + raw: dict = field(default_factory=dict) + + +# ── case runners ──────────────────────────────────────────────────────────── + + +async def _run_agent_card(client: AgentClient, case: dict) -> CaseResult: + expect = case.get("expect", {}) + try: + card = await client.agent_card() + except Exception as e: + return CaseResult(case["id"], case["category"], case["name"], False, f"fetch failed: {e}") + + problems: list[str] = [] + if "name" in expect and card.get("name") != expect["name"]: + problems.append(f"name={card.get('name')!r} expected {expect['name']!r}") + if "skills_min" in expect: + skills = card.get("skills") or [] + if len(skills) < expect["skills_min"]: + problems.append(f"only {len(skills)} skills, expected >= {expect['skills_min']}") + if 
"extensions_contain" in expect: + ext_uris = [ + e.get("uri", "") + for e in (card.get("capabilities") or {}).get("extensions") or [] + ] + for needle in expect["extensions_contain"]: + if not any(needle in u for u in ext_uris): + problems.append(f"missing extension matching {needle!r}; saw {ext_uris}") + if problems: + return CaseResult(case["id"], case["category"], case["name"], False, "; ".join(problems)) + return CaseResult(case["id"], case["category"], case["name"], True, "card OK") + + +async def _run_auth_check(client: AgentClient, case: dict) -> CaseResult: + """Verify the A2A endpoint rejects a request with the expected status. + + Default behaviour exercises bearer auth alone using ``case["bad_token"]``. + Cases can override headers via ``case["headers"]`` to test other + auth surfaces — e.g. ``{"X-API-Key": "wrong"}`` for the legacy + X-API-Key path. ``Content-Type: application/json`` is always set + for the eval client; case headers override anything else. + """ + import httpx + + expected_status = case.get("expect", {}).get("status", 401) + bad = case.get("bad_token", "") + headers = { + "Content-Type": "application/json", + "Authorization": f"Bearer {bad}", + } + headers.update(case.get("headers") or {}) + payload = { + "jsonrpc": "2.0", + "id": "auth-check", + "method": "message/send", + "params": { + "message": { + "role": "user", + "parts": [{"kind": "text", "text": "ping"}], + "messageId": "auth-check", + } + }, + } + try: + async with httpx.AsyncClient(timeout=10) as c: + r = await c.post(f"{client.base_url}/a2a", headers=headers, json=payload) + except Exception as e: + return CaseResult(case["id"], case["category"], case["name"], False, f"request failed: {e}") + if r.status_code != expected_status: + return CaseResult( + case["id"], case["category"], case["name"], False, + f"got {r.status_code}, expected {expected_status}", + ) + return CaseResult( + case["id"], case["category"], case["name"], True, f"status={r.status_code}", + ) + + +async def 
_AUDIT_POLL_DEADLINE_S = 2.0
_AUDIT_POLL_INTERVAL_S = 0.05


async def _await_audit_assertion(
    since: str,
    expected_tools: list[str],
    *,
    require_success: bool,
) -> tuple[list[dict], bool, str]:
    """Poll the audit log until the tool-firing assertion is settled.

    Replaces a fixed ``asyncio.sleep``: under audit-log contention a
    fixed wait was sometimes shorter than the flush, which made
    tool-firing assertions flaky.

    "Settled" depends on the direction of the assertion:

    * Non-empty ``expected_tools`` — return as soon as the assertion
      passes; the deadline only matters when a tool never fired.
    * Empty ``expected_tools`` (the abstention case, where the caller
      asserts that no tool fired) — an early pass proves nothing, because
      an entry may simply not be flushed yet. Keep polling until the
      deadline and return early only when the assertion *fails*.
      NOTE(review): this relies on ``verify.assert_tools_fired`` failing
      for an empty expectation once an entry shows up, as the caller's
      comment describes — confirm against ``verify``.

    Args:
        since: Audit-log marker from ``verify.audit_now()``; only entries
            after it are considered.
        expected_tools: Tool names the case expects, or ``[]`` for
            abstention.
        require_success: Forwarded to ``verify.assert_tools_fired``.

    Returns:
        ``(entries, passed, detail)`` from the last poll.
    """
    loop = asyncio.get_running_loop()
    deadline = loop.time() + _AUDIT_POLL_DEADLINE_S
    # Positive assertions settle on the first pass, negative ones on the
    # first failure.
    settles_when_passed = bool(expected_tools)
    while True:
        entries = verify.audit_entries_since(since)
        passed, detail = verify.assert_tools_fired(
            entries, expected_tools, require_success=require_success,
        )
        if passed == settles_when_passed or loop.time() >= deadline:
            return entries, passed, detail
        await asyncio.sleep(_AUDIT_POLL_INTERVAL_S)
+ # Inside the ``try`` so a partial setup failure still triggers + # the ``finally`` teardown — otherwise rows from the steps that + # *did* succeed would leak into the next case. + if "setup" in case: + err = verify.apply_setup(case["setup"]) + if err: + return CaseResult( + case["id"], case["category"], case["name"], False, + f"setup failed: {err}", + ) + + since = verify.audit_now() + + if streaming: + events, result = await client.stream( + case["prompt"], timeout_s=case.get("timeout_s", 90), + ) + else: + result = await client.ask( + case["prompt"], timeout_s=case.get("timeout_s", 90), + ) + + if result is None or result.state != "completed": + state = result.state if result else "no-final-event" + error = (result.error if result else None) or "(none)" + duration = result.duration_ms if result else 0 + text_preview = (result.text if result else "")[:200] + return CaseResult( + case["id"], case["category"], case["name"], False, + f"task state={state}; error={error}", + duration_ms=duration, + raw={"text": text_preview}, + ) + + problems: list[str] = [] + + # Tool firing assertions. ``expected_tools is not None`` so an + # explicit empty list asserts that *no* tools fired (abstention + # cases). Missing key skips the audit check entirely. + expected_tools = case.get("expected_tools") + if expected_tools is not None: + require_success = case.get("tool_outcome", "success") == "success" + _entries, passed, detail = await _await_audit_assertion( + since, expected_tools, require_success=require_success, + ) + if not passed: + problems.append(detail) + + # Text pattern assertions (case-insensitive substrings). + text_lower = result.text.lower() + for pattern in case.get("expected_patterns") or []: + if pattern.lower() not in text_lower: + problems.append(f"missing pattern {pattern!r}") + + # KB side-effect assertions. 
+ vk = case.get("verify_kb") or {} + if "find_chunk_containing" in vk: + chunk = verify.find_chunk_containing( + vk["find_chunk_containing"], domain=vk.get("domain"), + ) + if not chunk: + problems.append(f"no chunk containing {vk['find_chunk_containing']!r}") + + # Streaming-only: assert the SSE event sequence surfaced the + # expected kinds at least once. + if streaming: + seen_kinds = {e.get("kind") for e in events} + for kind in case.get("expected_event_kinds") or []: + if kind not in seen_kinds: + problems.append(f"missing SSE event kind {kind!r}; saw {sorted(seen_kinds)}") + + detail = ( + "; ".join(problems) if problems + else f"OK ({result.duration_ms}ms, {result.usage.get('total_tokens', '?')}t)" + ) + return CaseResult( + case["id"], case["category"], case["name"], + passed=not problems, + detail=detail, + duration_ms=result.duration_ms, + tokens=result.usage.get("total_tokens", 0) or 0, + raw={"reply": result.text[:300]}, + ) + finally: + # Teardown unconditionally — even when the task crashed or + # an assertion raised — so seeded KB rows never leak into the + # next case. 
+ if "teardown" in case: + verify.apply_teardown(case["teardown"]) + + +# ── dispatch ──────────────────────────────────────────────────────────────── + + +_RUNNERS = { + "agent_card": _run_agent_card, + "auth_check": _run_auth_check, + "ask": _run_ask, + "stream": _run_stream, +} + + +async def run_one(client: AgentClient, case: dict) -> CaseResult: + runner = _RUNNERS.get(case.get("kind", "ask")) + if runner is None: + return CaseResult( + case["id"], case.get("category", "?"), case.get("name", "?"), + False, f"unknown kind: {case.get('kind')}", + ) + try: + return await runner(client, case) + except Exception as e: + return CaseResult( + case["id"], case.get("category", "?"), case.get("name", "?"), + False, f"exception: {e!r}", + ) + + +# ── main ──────────────────────────────────────────────────────────────────── + + +def _print_board(results: list[CaseResult]) -> None: + width_id = max(len(r.id) for r in results) + width_cat = max(len(r.category) for r in results) + print() + print(f"{'ID'.ljust(width_id)} {'CAT'.ljust(width_cat)} RESULT TIME TOKENS DETAIL") + print("-" * 90) + pass_count = 0 + for r in results: + mark = "PASS" if r.passed else "FAIL" + if r.passed: + pass_count += 1 + time_s = f"{r.duration_ms}ms".rjust(6) + tokens = str(r.tokens).rjust(6) if r.tokens else " - " + print( + f"{r.id.ljust(width_id)} {r.category.ljust(width_cat)} " + f"{mark} {time_s} {tokens} {r.detail[:80]}" + ) + print("-" * 90) + print(f"\n{pass_count}/{len(results)} passed") + + +def _save_report(results: list[CaseResult], path: Path) -> None: + path.parent.mkdir(parents=True, exist_ok=True) + payload = { + "ts": datetime.now(UTC).isoformat(), + "total": len(results), + "passed": sum(1 for r in results if r.passed), + "results": [asdict(r) for r in results], + } + path.write_text(json.dumps(payload, indent=2)) + print(f"\nReport: {path}") + + +async def main(): + p = argparse.ArgumentParser() + p.add_argument("--base-url", default=None) + p.add_argument("--tasks", 
default=None, help="comma-separated case IDs") + p.add_argument("--category", default=None) + p.add_argument("--out", default=None) + args = p.parse_args() + + tasks_path = Path(__file__).parent / "tasks.json" + cases = json.loads(tasks_path.read_text()) + + if args.tasks: + wanted = set(args.tasks.split(",")) + cases = [c for c in cases if c["id"] in wanted] + if args.category: + cases = [c for c in cases if c.get("category") == args.category] + + if not cases: + print("no cases match filters", file=sys.stderr) + return 2 + + client = AgentClient(base_url=args.base_url) + + print(f"Running {len(cases)} case(s) against {client.base_url}") + results: list[CaseResult] = [] + for case in cases: + sys.stdout.write(f" {case['id']}... ") + sys.stdout.flush() + result = await run_one(client, case) + sys.stdout.write(f"{'PASS' if result.passed else 'FAIL'} {result.detail[:60]}\n") + results.append(result) + + _print_board(results) + + out_path = Path(args.out) if args.out else ( + Path(__file__).parent / "results" / f"run-{int(time.time())}.json" + ) + _save_report(results, out_path) + + return 0 if all(r.passed for r in results) else 1 + + +if __name__ == "__main__": + raise SystemExit(asyncio.run(main())) diff --git a/evals/tasks.json b/evals/tasks.json new file mode 100644 index 0000000..d4b5389 --- /dev/null +++ b/evals/tasks.json @@ -0,0 +1,196 @@ +[ + { + "id": "card_discovery", + "category": "a2a-protocol", + "kind": "agent_card", + "name": "Agent card discovery", + "expect": { + "skills_min": 1, + "extensions_contain": ["cost-v1"] + } + }, + { + "id": "auth_negative", + "category": "a2a-protocol", + "kind": "auth_check", + "name": "Reject bad bearer when bearer auth is configured", + "bad_token": "definitely-not-the-real-token", + "expect": {"status": 401} + }, + { + "id": "streaming_status_updates", + "category": "a2a-protocol", + "kind": "stream", + "name": "message/stream surfaces status-update events ending in final=true", + "prompt": "Hi.", + "expected_tools": 
[], + "expected_patterns": [], + "expected_event_kinds": ["status-update"] + }, + + { + "id": "abstain_no_tool", + "category": "abstention", + "kind": "ask", + "name": "Don't reach for a tool when training data is fine", + "prompt": "What's the capital of France? One word.", + "expected_tools": [], + "expected_patterns": ["paris"] + }, + { + "id": "greeting", + "category": "simple", + "kind": "ask", + "name": "Direct greeting, no tool", + "prompt": "Hi.", + "expected_tools": [], + "expected_patterns": [] + }, + + { + "id": "current_time_intent", + "category": "tool", + "kind": "ask", + "name": "Asks about live time → current_time", + "prompt": "What time is it in UTC right now?", + "expected_tools": ["current_time"], + "expected_patterns": ["UTC"] + }, + { + "id": "calculator_intent", + "category": "tool", + "kind": "ask", + "name": "Asks for arithmetic → calculator", + "prompt": "How much is 17 times 23, plus 1?", + "expected_tools": ["calculator"], + "expected_patterns": ["392"] + }, + { + "id": "web_search_intent", + "category": "tool", + "kind": "ask", + "name": "Asks about recent news → web_search", + "prompt": "Anything notable in the news about Anthropic this week?", + "expected_tools": ["web_search"], + "expected_patterns": [] + }, + { + "id": "fetch_url_intent", + "category": "tool", + "kind": "ask", + "name": "Asks about a URL's content → fetch_url", + "prompt": "What's on https://example.com? 
Just the page title is fine.", + "expected_tools": ["fetch_url"], + "expected_patterns": ["example"] + }, + + { + "id": "memory_ingest_intent", + "category": "tool", + "kind": "ask", + "name": "Stores a stable preference → memory_ingest writes a chunk", + "prompt": "Remember that I prefer protoLabs Studio standups at 9am Eastern.", + "expected_tools": ["memory_ingest"], + "expected_patterns": [], + "verify_kb": { + "find_chunk_containing": "9am" + }, + "teardown": [ + {"kb_delete_by_content": {"contains": "9am"}} + ] + }, + { + "id": "daily_log_intent", + "category": "tool", + "kind": "ask", + "name": "Asks to log an event → daily_log writes today's chunk", + "prompt": "Log this for today: my standup just ended, team is unblocked on the auth migration.", + "expected_tools": ["daily_log"], + "expected_patterns": [], + "verify_kb": { + "find_chunk_containing": "auth migration", + "domain": "daily-log" + }, + "teardown": [ + {"kb_delete_by_content": {"contains": "auth migration"}} + ] + }, + { + "id": "memory_recall_intent", + "category": "tool", + "kind": "ask", + "name": "Asks about a stored fact → recall surfaces it", + "setup": [ + {"kb_ingest": { + "content": "Operator's primary lab is Snickerdoodle, located in Spokane.", + "domain": "context", + "heading": "lab" + }} + ], + "prompt": "Where's my primary lab and what's it called?", + "expected_tools": ["memory_recall"], + "expected_patterns": ["snickerdoodle", "spokane"], + "teardown": [ + {"kb_delete_by_heading": {"domain": "context", "heading": "lab"}} + ] + }, + { + "id": "memory_list_intent", + "category": "tool", + "kind": "ask", + "name": "Asks for recent log entries → memory_list", + "setup": [ + {"kb_ingest": {"content": "called the dentist", "domain": "daily-log", "heading": "today"}}, + {"kb_ingest": {"content": "merged the auth PR", "domain": "daily-log", "heading": "today"}} + ], + "prompt": "What did I do today? 
Summarize from the log.", + "expected_tools": ["memory_list"], + "expected_patterns": ["dentist"], + "teardown": [ + {"kb_delete_by_content": {"contains": "called the dentist"}}, + {"kb_delete_by_content": {"contains": "merged the auth PR"}} + ] + }, + { + "id": "memory_stats_intent", + "category": "tool", + "kind": "ask", + "name": "Asks how much is in memory → memory_stats", + "prompt": "How much have I got stored across each memory domain?", + "expected_tools": ["memory_stats"], + "expected_patterns": [] + }, + + { + "id": "log_then_recall_chain", + "category": "chained", + "kind": "ask", + "name": "Log an event, then recall it later in the same turn", + "prompt": "Log this for today: 'eval-chain-flag-q9: chained log+recall test'. After logging, search memory for that flag and quote it back.", + "expected_tools": ["daily_log", "memory_recall"], + "expected_patterns": ["eval-chain-flag-q9"], + "teardown": [ + {"kb_delete_by_content": {"contains": "eval-chain-flag-q9"}} + ] + }, + + { + "id": "knowledge_middleware_recall", + "category": "subsystem", + "kind": "ask", + "name": "KnowledgeMiddleware surfaces a stored fact without an explicit search", + "setup": [ + {"kb_ingest": { + "content": "Operator's preferred coffee is a Gibraltar with oat milk from Atticus.", + "domain": "preferences", + "heading": "coffee" + }} + ], + "prompt": "What's my usual coffee order?", + "expected_tools": [], + "expected_patterns": ["gibraltar", "oat"], + "teardown": [ + {"kb_delete_by_heading": {"domain": "preferences", "heading": "coffee"}} + ] + } +] diff --git a/evals/verify.py b/evals/verify.py new file mode 100644 index 0000000..5b1f8cc --- /dev/null +++ b/evals/verify.py @@ -0,0 +1,176 @@ +"""Side-effect verifiers for eval cases. + +Two channels: + +- **Audit log** — JSONL written by ``AuditMiddleware`` at + ``/sandbox/audit/audit.jsonl`` (override with ``AUDIT_PATH`` env). 
def _audit_path() -> Path:
    """Resolve the audit JSONL location.

    Resolution order:

    1. ``AUDIT_PATH`` env var, when set. It is returned (after ``~``
       expansion) even if the file does not exist yet, so an explicit
       override is never silently swapped for a different log.
    2. The template's docker default, ``/sandbox/audit/audit.jsonl``,
       when that file exists.
    3. Local-dev fallback: same shape, but under the home dir
       (``~/.protoagent/audit/audit.jsonl``).

    Returns:
        The path to read audit entries from. The file is not
        guaranteed to exist; callers must check before opening.
    """
    override = os.environ.get("AUDIT_PATH")
    if override:
        # An operator-supplied path wins unconditionally. Falling back
        # here would make verification read some other agent's log.
        return Path(override).expanduser()

    docker_default = Path("/sandbox/audit/audit.jsonl")
    if docker_default.is_file():
        return docker_default

    # Local-dev fallback: same shape, but under the home dir.
    return Path.home() / ".protoagent" / "audit" / "audit.jsonl"
def audit_entries_since(ts_iso: str) -> list[dict]:
    """Return audit-log entries with ``ts`` strictly greater than ``ts_iso``.

    Reads the JSONL file resolved by ``_audit_path`` line by line.
    Lines that are not valid JSON, are not JSON objects, or have no
    string ``ts`` field are skipped rather than aborting the scan.

    The comparison is lexicographic on the raw strings.
    NOTE(review): this assumes the log's ``ts`` values use the same
    ISO-8601 format as ``audit_now`` — confirm against the writer.

    Args:
        ts_iso: Marker timestamp, typically produced by ``audit_now``.

    Returns:
        Matching entries in file order; ``[]`` when the log is absent.
    """
    p = _audit_path()
    if not p.is_file():
        return []
    out: list[dict] = []
    # JSON is UTF-8 by specification; don't depend on the locale default.
    with p.open(encoding="utf-8") as fh:
        for line in fh:
            try:
                entry = json.loads(line)
            except json.JSONDecodeError:
                continue
            # ``null``, numbers, strings and arrays are valid JSON but
            # have no ``.get``; skip them instead of crashing.
            if not isinstance(entry, dict):
                continue
            ts = entry.get("ts")
            # A missing or non-string ``ts`` cannot be ordered against
            # the marker, so it never counts as "newer".
            if isinstance(ts, str) and ts > ts_iso:
                out.append(entry)
    return out
def apply_teardown(steps: list[dict]) -> None:
    """Best-effort teardown. Never raises so a setup failure or assertion
    failure doesn't poison subsequent cases.

    Supported step kinds:

    - ``kb_delete_by_content``: ``{contains}``
    - ``kb_delete_by_heading``: ``{domain, heading}``

    Unknown step kinds and malformed steps are logged and skipped. If
    the knowledge store itself cannot be constructed, the whole
    teardown is skipped.

    Args:
        steps: List of single-key dicts mapping a step kind to its args.
    """
    try:
        store = _kb_store()
    except Exception as exc:  # pragma: no cover
        # Constructing the store can fail (import error, bad DB path).
        # That must not break the "never raises" contract.
        log.warning("[verify] teardown skipped, store unavailable: %s", exc)
        return

    for step in steps:
        if not isinstance(step, dict):
            log.warning("[verify] malformed teardown step skipped: %r", step)
            continue
        for kind, args in step.items():
            try:
                if kind == "kb_delete_by_content":
                    store.delete_by_content(args["contains"])
                elif kind == "kb_delete_by_heading":
                    store.delete_by_heading(args["domain"], args["heading"])
                else:
                    # Surface typos in the task file; apply_setup
                    # reports the same condition as an error.
                    log.warning("[verify] unknown teardown step: %s", kind)
            except Exception as exc:  # pragma: no cover
                log.debug("[verify] teardown step %s failed: %s", kind, exc)
worker: SubagentDef = field(default_factory=lambda: SubagentDef( - tools=["echo", "current_time", "calculator", "web_search", "fetch_url"], + tools=[ + "current_time", "calculator", "web_search", "fetch_url", + "memory_ingest", "memory_recall", "memory_list", "memory_stats", + "daily_log", + "schedule_task", "list_schedules", "cancel_schedule", + ], max_turns=20, )) - # Middleware toggles - knowledge_middleware: bool = False # template ships no knowledge store + # Middleware / subsystem toggles. All default-on so a fresh fork has + # a working memory loop + scheduler on day one. Forks that want a + # purely stateless agent (no KB, no scheduled tasks) can flip these + # via the drawer or by editing the YAML directly. + knowledge_middleware: bool = True audit_middleware: bool = True - memory_middleware: bool = False - - # Knowledge store (opt-in — leave disabled until the fork ships one) + memory_middleware: bool = True + scheduler_enabled: bool = True + + # Knowledge store — sqlite + FTS5, see ``knowledge/store.py``. + # The default path lives under ``/sandbox/`` to play well with the + # bundled Docker volume; the store falls back to + # ``~/.protoagent/knowledge/agent.db`` automatically when /sandbox + # is read-only or absent (e.g. local ``python server.py``). knowledge_db_path: str = "/sandbox/knowledge/agent.db" embed_model: str = "qwen3-embedding" knowledge_top_k: int = 5 + # Identity — captured by the setup wizard, editable via the drawer. + # ``identity_name`` falls back to the AGENT_NAME env var at runtime; + # the YAML value wins when both are set so per-fork customization + # survives image rebuilds. ``operator`` is the human the agent thinks + # it's talking to — injected into the system prompt when non-empty. + identity_name: str = "protoagent" + identity_operator: str = "" + + # A2A bearer token — blank = open mode (local dev). 
Writing a token + # here makes the A2A handler require ``Authorization: Bearer `` + # on every request and advertises the bearer scheme on the agent card. + # Kept in YAML rather than env so the drawer can manage it. + auth_token: str = "" + + # OS-level autostart — ``True`` means the server launches on user + # login (macOS LaunchAgent today; Linux/Windows TBD). Managed by + # ``autostart.py``; the field here is the source of truth for + # whether the plist should exist. + autostart_on_boot: bool = False + @classmethod def from_yaml(cls, path: str | Path) -> "LangGraphConfig": """Load config from YAML file. Falls back to defaults if absent.""" @@ -65,6 +98,9 @@ def from_yaml(cls, path: str | Path) -> "LangGraphConfig": subagents = data.get("subagents", {}) middleware = data.get("middleware", {}) knowledge = data.get("knowledge", {}) + identity = data.get("identity", {}) + auth = data.get("auth", {}) + runtime = data.get("runtime", {}) config = cls( model_provider=model.get("provider", cls.model_provider), @@ -77,9 +113,14 @@ def from_yaml(cls, path: str | Path) -> "LangGraphConfig": knowledge_middleware=middleware.get("knowledge", cls.knowledge_middleware), audit_middleware=middleware.get("audit", cls.audit_middleware), memory_middleware=middleware.get("memory", cls.memory_middleware), + scheduler_enabled=middleware.get("scheduler", cls.scheduler_enabled), knowledge_db_path=knowledge.get("db_path", cls.knowledge_db_path), embed_model=knowledge.get("embed_model", cls.embed_model), knowledge_top_k=knowledge.get("top_k", cls.knowledge_top_k), + identity_name=identity.get("name", cls.identity_name), + identity_operator=identity.get("operator", cls.identity_operator), + auth_token=auth.get("token", cls.auth_token), + autostart_on_boot=runtime.get("autostart_on_boot", cls.autostart_on_boot), ) for name in ("worker",): diff --git a/graph/config_io.py b/graph/config_io.py new file mode 100644 index 0000000..2bd4857 --- /dev/null +++ b/graph/config_io.py @@ -0,0 +1,417 @@ 
+"""Config I/O for the live-edit drawer in chat_ui.py. + +Three jobs: + +1. **YAML round-trip** that preserves comments and unknown keys in + ``config/langgraph-config.yaml``. ``LangGraphConfig.from_yaml`` + silently drops anything it doesn't know about, so writing back via + a freshly-constructed dataclass would wipe fork-added sections + (e.g. the ``memory`` / ``skills`` blocks the template already + ships). We use ruamel.yaml when available for comment preservation; + PyYAML is the fallback. + +2. **Two-location SOUL.md handling.** The runtime reads + ``/sandbox/SOUL.md`` (populated by ``entrypoint.sh`` at container + start). The source-of-truth lives at ``config/SOUL.md`` in the + repo. Drawer edits write to both so container restarts preserve + the change and local-dev runs without a ``/sandbox`` directory + still pick up the edit. + +3. **Gateway introspection.** ``list_gateway_models`` hits + ``{api_base}/models`` so the drawer's model dropdown reflects + whatever the connected LiteLLM gateway (or OpenAI-compat endpoint) + actually exposes — no hardcoded list to drift out of sync. +""" + +from __future__ import annotations + +import logging +import os +from io import StringIO +from pathlib import Path +from typing import Any + +from graph.config import LangGraphConfig + +log = logging.getLogger("protoagent.config_io") + +REPO_ROOT = Path(__file__).parent.parent +CONFIG_YAML_PATH = REPO_ROOT / "config" / "langgraph-config.yaml" +SOUL_SOURCE_PATH = REPO_ROOT / "config" / "SOUL.md" +SOUL_RUNTIME_PATH = Path("/sandbox/SOUL.md") + +# Setup wizard state. +# Presence of this (empty) marker file = wizard has been run and the +# server should boot straight into the chat UI. Absence = show the +# wizard on first page load. Lives in ``config/`` so a Docker volume +# mount at /opt//config persists setup across container runs. +SETUP_MARKER_PATH = REPO_ROOT / "config" / ".setup-complete" + +# SOUL.md starter templates. 
def load_yaml_doc(path: Path = CONFIG_YAML_PATH) -> Any:
    """Read the config YAML into a mutable document.

    When ruamel is importable the result is a CommentedMap, so comments
    and key order survive a later dump. Otherwise the result is a plain
    dict and comments are lost on the next save (``save_yaml_doc`` logs
    a warning when that happens).

    A missing or empty file yields an empty document of the matching
    kind.
    """
    if not path.exists():
        return _ruamel.load("{}\n") if _HAS_RUAMEL else {}

    with open(path) as f:
        if not _HAS_RUAMEL:
            # PyYAML fallback, imported lazily so it is only required
            # when ruamel is absent.
            import yaml
            return yaml.safe_load(f) or {}
        # An empty file loads as None; swap in an empty round-trip map.
        return _ruamel.load(f) or _ruamel.load("{}\n")
def config_to_dict(config: LangGraphConfig) -> dict[str, Any]:
    """Flatten a LangGraphConfig into the nested dict the UI edits.

    Section and key names follow the YAML schema, so the result can be
    merged back into the YAML document without translation.
    """
    worker = config.worker

    model_section = {
        "provider": config.model_provider,
        "name": config.model_name,
        "api_base": config.api_base,
        "api_key": config.api_key,
        "temperature": config.temperature,
        "max_tokens": config.max_tokens,
        "max_iterations": config.max_iterations,
    }
    worker_section = {
        "enabled": worker.enabled,
        # Copy so UI edits never mutate the live config's list.
        "tools": list(worker.tools),
        "max_turns": worker.max_turns,
    }
    middleware_section = {
        "knowledge": config.knowledge_middleware,
        "audit": config.audit_middleware,
        "memory": config.memory_middleware,
        "scheduler": config.scheduler_enabled,
    }
    knowledge_section = {
        "db_path": config.knowledge_db_path,
        "embed_model": config.embed_model,
        "top_k": config.knowledge_top_k,
    }
    identity_section = {
        "name": config.identity_name,
        "operator": config.identity_operator,
    }

    return {
        "model": model_section,
        "subagents": {"worker": worker_section},
        "middleware": middleware_section,
        "knowledge": knowledge_section,
        "identity": identity_section,
        "auth": {"token": config.auth_token},
        "runtime": {"autostart_on_boot": config.autostart_on_boot},
    }
def validate_config_dict(updates: dict[str, Any]) -> tuple[bool, str]:
    """Validate a nested config-updates dict without persisting it.

    Checks numeric ranges (``model.temperature``, ``model.max_tokens``,
    ``model.max_iterations``, ``subagents.worker.max_turns``,
    ``knowledge.top_k``) and that ``subagents.worker.tools`` is a list.
    Absent keys fall back to in-range defaults, so a partial update
    validates cleanly.

    Every section that is present must itself be a dict. A scalar or
    ``None`` in place of a section is reported as a validation error
    instead of surfacing as an ``AttributeError``.

    Args:
        updates: Nested dict in the same shape as the YAML schema.

    Returns:
        ``(True, "")`` when the updates are acceptable, otherwise
        ``(False, message)`` with a human-readable reason.
    """

    def _section(parent: dict, key: str, label: str) -> dict:
        # Fetch parent[key] (default {}), insisting it is a dict.
        value = parent.get(key, {})
        if not isinstance(value, dict):
            raise TypeError(
                f"{label} must be a mapping, got {type(value).__name__}"
            )
        return value

    if not isinstance(updates, dict):
        return False, (
            "config validation: updates must be a mapping, "
            f"got {type(updates).__name__}"
        )

    try:
        model = _section(updates, "model", "model")
        temp = float(model.get("temperature", 0.2))
        if not 0.0 <= temp <= 2.0:
            return False, f"temperature must be 0.0-2.0, got {temp}"
        max_tokens = int(model.get("max_tokens", 4096))
        if max_tokens < 1:
            return False, f"max_tokens must be >= 1, got {max_tokens}"
        max_iter = int(model.get("max_iterations", 50))
        if max_iter < 1:
            return False, f"max_iterations must be >= 1, got {max_iter}"

        subagents = _section(updates, "subagents", "subagents")
        worker = _section(subagents, "worker", "subagents.worker")
        if worker:
            max_turns = int(worker.get("max_turns", 20))
            if max_turns < 1:
                return False, f"worker.max_turns must be >= 1, got {max_turns}"
            tools = worker.get("tools", [])
            if not isinstance(tools, list):
                return False, "worker.tools must be a list"

        knowledge = _section(updates, "knowledge", "knowledge")
        if knowledge:
            top_k = int(knowledge.get("top_k", 5))
            if top_k < 1:
                return False, f"knowledge.top_k must be >= 1, got {top_k}"
    except (TypeError, ValueError, OverflowError) as e:
        # OverflowError covers int(float("inf")), which is neither a
        # TypeError nor a ValueError.
        return False, f"config validation: {e}"
    return True, ""
def list_gateway_models(
    api_base: str,
    api_key: str = "",
    timeout: float = 10.0,
) -> tuple[list[str], str]:
    """Fetch the model list from ``{api_base}/models``.

    Works against any OpenAI-compatible endpoint — LiteLLM gateway,
    OpenAI proper, vLLM, Ollama with the OpenAI adapter.

    Args:
        api_base: Base URL of the endpoint; a trailing slash is tolerated.
        api_key: Bearer token. Falls back to the ``OPENAI_API_KEY`` env
            var when empty; no ``Authorization`` header is sent if both
            are empty.
        timeout: Request timeout in seconds, passed to ``httpx.Client``.

    Returns:
        ``(model_ids, error_message)``. On success ``model_ids`` is
        sorted and ``error_message`` is empty. On failure ``model_ids``
        is empty and the message is human-readable.
    """
    import httpx

    if not api_base:
        return [], "api_base is empty"

    key = api_key or os.environ.get("OPENAI_API_KEY", "")
    url = api_base.rstrip("/") + "/models"
    headers = {}
    if key:
        headers["Authorization"] = f"Bearer {key}"

    try:
        with httpx.Client(timeout=timeout) as client:
            resp = client.get(url, headers=headers)
    except (httpx.HTTPError, httpx.InvalidURL) as e:
        # InvalidURL is a sibling of HTTPError in httpx's hierarchy, not
        # a subclass. A malformed api_base typed into the UI must
        # produce the error tuple, not a traceback.
        return [], f"connection failed: {e}"

    if resp.status_code >= 400:
        # Cap the echoed body so an HTML error page doesn't flood the UI.
        detail = resp.text[:200] if resp.text else ""
        return [], f"HTTP {resp.status_code} from {url}: {detail}"

    try:
        data = resp.json()
    except ValueError:
        return [], f"non-JSON response from {url}"

    items = data.get("data") if isinstance(data, dict) else None
    if not isinstance(items, list):
        return [], f"unexpected shape from {url} — no 'data' array"

    ids: list[str] = []
    for item in items:
        if isinstance(item, dict):
            # Accept ``id`` first, then ``name``, as the identifier.
            model_id = item.get("id") or item.get("name")
            if isinstance(model_id, str):
                ids.append(model_id)
    ids.sort()
    return ids, ""
def read_soul_preset(name: str) -> str:
    """Return the preset's content.

    Returns empty string for an unknown name rather than raising —
    the wizard treats that as "no preset selected, blank canvas".
    The same applies to names that cannot be turned into a path (for
    example one with an embedded NUL) and to files that cannot be read
    or decoded as UTF-8.

    Path-traversal guarded: the resolved target must live inside
    ``PRESETS_DIR``. A name like ``"../secret"`` would otherwise
    escape the presets directory and read arbitrary ``.md`` files
    anywhere the process can reach.

    Args:
        name: Preset name, i.e. the file stem without ``.md``.
    """
    try:
        presets_root = PRESETS_DIR.resolve()
        candidate = (PRESETS_DIR / f"{name}.md").resolve()
        if presets_root not in candidate.parents or not candidate.is_file():
            return ""
        return candidate.read_text(encoding="utf-8")
    except (OSError, ValueError):
        # ValueError covers an embedded NUL in the name and
        # UnicodeDecodeError from read_text. OSError covers a file that
        # vanishes or is unreadable. All of these map to "no preset".
        return ""
soul = _read_file(f"{workspace}/SOUL.md") + if not soul: + soul = _read_file(Path(__file__).parent.parent / "config" / "SOUL.md") if soul: parts.append(soul) else: diff --git a/graph/subagents/config.py b/graph/subagents/config.py index 554a321..a488703 100644 --- a/graph/subagents/config.py +++ b/graph/subagents/config.py @@ -63,7 +63,12 @@ class SubagentConfig: Replace this prompt with domain-specific guidance once your agent has real specialized roles.""", - tools=["echo", "current_time", "calculator", "web_search", "fetch_url"], + tools=[ + "current_time", "calculator", "web_search", "fetch_url", + "memory_ingest", "memory_recall", "memory_list", "memory_stats", + "daily_log", + "schedule_task", "list_schedules", "cancel_schedule", + ], max_turns=20, ) diff --git a/knowledge/__init__.py b/knowledge/__init__.py new file mode 100644 index 0000000..1de93de --- /dev/null +++ b/knowledge/__init__.py @@ -0,0 +1,12 @@ +"""Knowledge store — sqlite-backed chunk storage for memory tools and middleware. + +The template ships this enabled by default so a fresh fork has a working +memory loop on day one (memory_ingest, memory_recall, daily_log) and the +eval harness can assert side effects against real DB state. + +See ``knowledge.store.KnowledgeStore`` for the public API. +""" + +from knowledge.store import KnowledgeStore, Chunk + +__all__ = ["KnowledgeStore", "Chunk"] diff --git a/knowledge/store.py b/knowledge/store.py new file mode 100644 index 0000000..d26d8a7 --- /dev/null +++ b/knowledge/store.py @@ -0,0 +1,526 @@ +"""KnowledgeStore — sqlite-backed chunk storage with FTS5 search. + +The template's default knowledge surface. One ``chunks`` table holds +every piece of stored content (operator notes via ``memory_ingest``, +daily-log entries, conversation findings extracted by +``MemoryMiddleware``); the ``domain`` column distinguishes them. + +Search uses sqlite FTS5 when available (true on virtually all modern +sqlite builds). 
When FTS5 is missing — sandboxed sqlite, custom builds +— the store transparently falls back to ``LIKE`` keyword matching so +the API contract still holds. + +The store is path-aware and degradation-aware: + +- Honors ``KNOWLEDGE_DB_PATH`` env var → constructor argument → + config default ``/sandbox/knowledge/agent.db``. +- If the configured path is unwritable (running locally outside the + container, no /sandbox), falls back to ``~/.protoagent/knowledge/agent.db`` + so a fresh ``python server.py`` works without sudo. +- All write operations swallow ``sqlite3.DatabaseError`` (covers + OperationalError, IntegrityError, and corruption variants) and log; + the store never crashes the agent loop on a corrupt or read-only DB. + +Forks that want embeddings on top of FTS5 can subclass and override +``search()`` — the middleware reads through that one method. +""" + +from __future__ import annotations + +import logging +import os +import re +import sqlite3 +from dataclasses import dataclass +from datetime import UTC, datetime +from pathlib import Path +from typing import Any + +log = logging.getLogger(__name__) + +DEFAULT_DB_PATH = "/sandbox/knowledge/agent.db" + + +@dataclass +class Chunk: + """One row from the chunks table — what callers see.""" + id: int + content: str + domain: str + heading: str | None + source: str | None + source_type: str | None + finding_type: str | None + created_at: str + updated_at: str + + def as_dict(self) -> dict[str, Any]: + return { + "id": self.id, + "content": self.content, + "domain": self.domain, + "heading": self.heading, + "source": self.source, + "source_type": self.source_type, + "finding_type": self.finding_type, + "created_at": self.created_at, + "updated_at": self.updated_at, + } + + +def _resolve_path(db_path: str | Path | None) -> Path: + """Pick a writable DB path. 
Env > arg > default; fall back to ~/.protoagent.""" + raw = os.environ.get("KNOWLEDGE_DB_PATH") or db_path or DEFAULT_DB_PATH + p = Path(str(raw)).expanduser() + try: + p.parent.mkdir(parents=True, exist_ok=True) + # Probe writability + probe = p.parent / ".write-probe" + probe.touch() + probe.unlink() + return p + except OSError: + fallback = Path.home() / ".protoagent" / "knowledge" / "agent.db" + fallback.parent.mkdir(parents=True, exist_ok=True) + log.info( + "[knowledge] %s not writable; using %s instead", + p, fallback, + ) + return fallback + + +def _now_iso() -> str: + return datetime.now(UTC).isoformat() + + +# LIKE escaping — sqlite treats ``%`` and ``_`` as wildcards in LIKE +# patterns. Without escaping, a search for ``"100%"`` matches every row +# starting with ``"100"`` instead of literal "100%". We escape them +# alongside the escape char itself, then bind ``ESCAPE '\'`` on every +# LIKE clause that takes user input. +_LIKE_ESCAPE = "\\" + + +def _escape_like(text: str) -> str: + """Escape ``%``, ``_``, and the escape char for safe LIKE matching.""" + return ( + text + .replace(_LIKE_ESCAPE, _LIKE_ESCAPE + _LIKE_ESCAPE) + .replace("%", _LIKE_ESCAPE + "%") + .replace("_", _LIKE_ESCAPE + "_") + ) + + +def _fts_quote(token: str) -> str: + """Quote a token for FTS5 MATCH so it's treated as a literal phrase. + + FTS5 has its own query syntax (column filters, prefix wildcards, + NEAR, AND/OR/NOT operators). Wrapping each token in double quotes + forces FTS5 to interpret it as a phrase token, neutralising any + operator characters the user happened to type. Internal double + quotes are doubled per FTS5 phrase rules. 
+ """ + return '"' + token.replace('"', '""') + '"' + + +def _has_fts5(db: sqlite3.Connection) -> bool: + try: + db.execute( + "CREATE VIRTUAL TABLE IF NOT EXISTS _fts5_probe USING fts5(x)" + ) + db.execute("DROP TABLE _fts5_probe") + return True + except sqlite3.OperationalError: + return False + + +_SCHEMA = """ +CREATE TABLE IF NOT EXISTS chunks ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + content TEXT NOT NULL, + domain TEXT NOT NULL DEFAULT 'general', + heading TEXT, + source TEXT, + source_type TEXT, + finding_type TEXT, + created_at TEXT NOT NULL, + updated_at TEXT NOT NULL +); + +CREATE INDEX IF NOT EXISTS idx_chunks_domain ON chunks(domain); +CREATE INDEX IF NOT EXISTS idx_chunks_created_at ON chunks(created_at); +""" + +_FTS_SCHEMA = """ +CREATE VIRTUAL TABLE IF NOT EXISTS chunks_fts USING fts5( + content, heading, content='chunks', content_rowid='id' +); + +CREATE TRIGGER IF NOT EXISTS chunks_ai AFTER INSERT ON chunks BEGIN + INSERT INTO chunks_fts(rowid, content, heading) + VALUES (new.id, new.content, new.heading); +END; + +CREATE TRIGGER IF NOT EXISTS chunks_ad AFTER DELETE ON chunks BEGIN + INSERT INTO chunks_fts(chunks_fts, rowid, content, heading) + VALUES('delete', old.id, old.content, old.heading); +END; + +CREATE TRIGGER IF NOT EXISTS chunks_au AFTER UPDATE ON chunks BEGIN + INSERT INTO chunks_fts(chunks_fts, rowid, content, heading) + VALUES('delete', old.id, old.content, old.heading); + INSERT INTO chunks_fts(rowid, content, heading) + VALUES (new.id, new.content, new.heading); +END; +""" + + +class KnowledgeStore: + """Default knowledge store. Sqlite + FTS5 (with LIKE fallback). + + Forks usually don't subclass this — extend ``add_chunk`` / + ``search`` directly when you need new fields, or wrap it with + your own embedding layer. 
+ """ + + def __init__(self, db_path: str | Path | None = None): + self.path = _resolve_path(db_path) + self._fts_available: bool | None = None + self._init_db() + + # ── connection / schema ───────────────────────────────────────────────── + + def _connect(self) -> sqlite3.Connection: + db = sqlite3.connect(str(self.path)) + db.row_factory = sqlite3.Row + # WAL is best-effort — read-only sqlite files (e.g. immutable + # mounts) reject the PRAGMA. The connection stays usable for + # reads; only writes will fail later, and those go through + # the per-method OperationalError guards. + try: + db.execute("PRAGMA journal_mode=WAL") + except sqlite3.OperationalError as exc: + log.debug("[knowledge] PRAGMA journal_mode=WAL skipped: %s", exc) + return db + + def _init_db(self) -> None: + try: + db = self._connect() + db.executescript(_SCHEMA) + self._fts_available = _has_fts5(db) + if self._fts_available: + db.executescript(_FTS_SCHEMA) + # Re-index any pre-existing rows. The CREATE TRIGGER + # statements only fire on subsequent inserts, so a DB + # populated before FTS was added would have an empty + # virtual table without this rebuild. + try: + db.execute( + "INSERT INTO chunks_fts(chunks_fts) VALUES('rebuild')" + ) + except sqlite3.DatabaseError as exc: + log.debug("[knowledge] FTS rebuild skipped: %s", exc) + else: + log.info( + "[knowledge] FTS5 unavailable — search will use LIKE fallback" + ) + db.commit() + db.close() + except sqlite3.DatabaseError: + log.exception("[knowledge] schema init failed at %s", self.path) + + # Convenience for middleware that wants the raw connection. Kept + # private so the public API stays small. 
+ def _get_db(self) -> sqlite3.Connection | None: + try: + return self._connect() + except sqlite3.DatabaseError: + log.exception("[knowledge] connect failed") + return None + + # ── writes ────────────────────────────────────────────────────────────── + + def add_chunk( + self, + content: str, + domain: str = "general", + heading: str | None = None, + *, + source: str | None = None, + source_type: str | None = None, + finding_type: str | None = None, + ) -> int | None: + """Insert a chunk. Returns the new row id, or None on failure.""" + if not content or not content.strip(): + return None + db = self._get_db() + if db is None: + return None + try: + now = _now_iso() + cur = db.execute( + "INSERT INTO chunks " + "(content, domain, heading, source, source_type, finding_type, " + "created_at, updated_at) VALUES (?, ?, ?, ?, ?, ?, ?, ?)", + (content, domain, heading, source, source_type, finding_type, now, now), + ) + db.commit() + return int(cur.lastrowid) + except sqlite3.DatabaseError: + log.exception("[knowledge] add_chunk failed") + return None + finally: + db.close() + + def add_finding( + self, + content: str, + source: str = "conversation", + source_type: str = "chat", + finding_type: str = "insight", + ) -> int | None: + """Compatibility shim for ``MemoryMiddleware.after_agent``. + + Stored under ``domain='finding'`` so memory_list / memory_recall + can surface them alongside operator-set chunks. + """ + return self.add_chunk( + content, + domain="finding", + source=source, + source_type=source_type, + finding_type=finding_type, + ) + + # ── reads ─────────────────────────────────────────────────────────────── + + def search( + self, + query: str, + k: int = 5, + *, + domain: str | None = None, + ) -> list[dict[str, Any]]: + """Top-k chunks matching ``query``. Shape matches what the + ``KnowledgeMiddleware`` consumes: each result has ``table``, + ``preview``, plus the underlying chunk fields. + + Uses FTS5 when available, else a tokenized LIKE fallback. 
Returns + an empty list on no matches or DB failure (never raises). + """ + if not query or not query.strip(): + return [] + db = self._get_db() + if db is None: + return [] + try: + rows = self._search_fts(db, query, k, domain) if self._fts_available \ + else self._search_like(db, query, k, domain) + except sqlite3.DatabaseError as exc: + log.warning("[knowledge] search failed: %s", exc) + rows = [] + finally: + db.close() + + results: list[dict[str, Any]] = [] + for r in rows: + preview = (r["heading"] + ": " if r["heading"] else "") + r["content"] + results.append({ + "table": "chunks", + "preview": preview[:240], + **dict(r), + }) + return results + + def _search_fts( + self, + db: sqlite3.Connection, + query: str, + k: int, + domain: str | None, + ) -> list[sqlite3.Row]: + # Sanitize to FTS5-safe tokens; OR them so a multi-word query + # matches any of the keywords (closer to LIKE behaviour). + # Each token is double-quoted so FTS5 treats it as a literal + # phrase rather than parsing operators (column filters, prefix + # wildcards, NEAR, etc.) — even though ``[\w']+`` already + # filters most special chars, defence in depth is cheap. + tokens = [t for t in re.findall(r"[\w']+", query) if t] + if not tokens: + return [] + match = " OR ".join(_fts_quote(t) for t in tokens) + if domain: + return db.execute( + "SELECT c.* FROM chunks_fts f " + "JOIN chunks c ON c.id = f.rowid " + "WHERE chunks_fts MATCH ? AND c.domain = ? " + "ORDER BY rank LIMIT ?", + (match, domain, k), + ).fetchall() + return db.execute( + "SELECT c.* FROM chunks_fts f " + "JOIN chunks c ON c.id = f.rowid " + "WHERE chunks_fts MATCH ? " + "ORDER BY rank LIMIT ?", + (match, k), + ).fetchall() + + def _search_like( + self, + db: sqlite3.Connection, + query: str, + k: int, + domain: str | None, + ) -> list[sqlite3.Row]: + tokens = [t for t in re.findall(r"[\w']+", query) if t] + if not tokens: + return [] + # Score = number of tokens matched (rough recall-style ranking). 
+ # User-supplied tokens are LIKE-escaped so a query containing + # ``%`` or ``_`` doesn't silently match every row; ESCAPE is + # bound on each clause. + like_clauses = " + ".join( + "CASE WHEN content LIKE ? ESCAPE ? OR heading LIKE ? ESCAPE ? " + "THEN 1 ELSE 0 END" + for _ in tokens + ) + params: list[Any] = [] + for t in tokens: + needle = f"%{_escape_like(t)}%" + params.extend([needle, _LIKE_ESCAPE, needle, _LIKE_ESCAPE]) + sql = ( + f"SELECT *, ({like_clauses}) AS score FROM chunks " + "WHERE score > 0" + ) + if domain: + sql += " AND domain = ?" + params.append(domain) + sql += " ORDER BY score DESC, id DESC LIMIT ?" + params.append(k) + return db.execute(sql, params).fetchall() + + def list_chunks( + self, + domain: str | None = None, + limit: int = 50, + ) -> list[Chunk]: + """Most-recent-first chunk listing. Used by ``memory_list``.""" + db = self._get_db() + if db is None: + return [] + try: + if domain: + rows = db.execute( + "SELECT * FROM chunks WHERE domain = ? ORDER BY id DESC LIMIT ?", + (domain, limit), + ).fetchall() + else: + rows = db.execute( + "SELECT * FROM chunks ORDER BY id DESC LIMIT ?", + (limit,), + ).fetchall() + except sqlite3.DatabaseError as exc: + log.warning("[knowledge] list_chunks failed: %s", exc) + rows = [] + finally: + db.close() + return [Chunk(**dict(r)) for r in rows] + + def stats(self) -> dict[str, int]: + """Return per-domain chunk counts plus a ``total`` key.""" + db = self._get_db() + if db is None: + return {"total": 0} + try: + rows = db.execute( + "SELECT domain, COUNT(*) AS n FROM chunks GROUP BY domain ORDER BY n DESC" + ).fetchall() + total = db.execute("SELECT COUNT(*) FROM chunks").fetchone()[0] + except sqlite3.DatabaseError as exc: + log.warning("[knowledge] stats failed: %s", exc) + return {"total": 0} + finally: + db.close() + out = {r["domain"]: r["n"] for r in rows} + out["total"] = int(total) + return out + + # ── verification helpers (used by evals/verify.py) ────────────────────── + + def 
find_chunk_containing( + self, + text: str, + domain: str | None = None, + ) -> Chunk | None: + """Return the most-recent chunk whose content or heading contains ``text``. + + Used by the eval runner to assert side-effect outcomes after a + memory-writing turn. Empty / whitespace-only ``text`` returns + ``None`` rather than building a ``LIKE '%%'`` predicate that + would match every row. + """ + if not text or not text.strip(): + return None + db = self._get_db() + if db is None: + return None + try: + needle = f"%{_escape_like(text)}%" + sql = ( + "SELECT * FROM chunks " + "WHERE (content LIKE ? ESCAPE ? OR heading LIKE ? ESCAPE ?)" + ) + params: list[Any] = [needle, _LIKE_ESCAPE, needle, _LIKE_ESCAPE] + if domain: + sql += " AND domain = ?" + params.append(domain) + sql += " ORDER BY id DESC LIMIT 1" + row = db.execute(sql, params).fetchone() + except sqlite3.DatabaseError as exc: + log.warning("[knowledge] find_chunk_containing failed: %s", exc) + row = None + finally: + db.close() + return Chunk(**dict(row)) if row else None + + def delete_by_content(self, contains: str) -> int: + """Delete chunks whose content matches ``%contains%``. Returns count. + + Empty / whitespace-only ``contains`` is a no-op — the alternative + is ``DELETE WHERE content LIKE '%%'`` which wipes every row. + """ + if not contains or not contains.strip(): + return 0 + db = self._get_db() + if db is None: + return 0 + try: + cur = db.execute( + "DELETE FROM chunks WHERE content LIKE ? ESCAPE ?", + (f"%{_escape_like(contains)}%", _LIKE_ESCAPE), + ) + db.commit() + return int(cur.rowcount) + except sqlite3.DatabaseError as exc: + log.warning("[knowledge] delete_by_content failed: %s", exc) + return 0 + finally: + db.close() + + def delete_by_heading(self, domain: str, heading: str) -> int: + """Delete chunks matching (domain, heading). Returns count.""" + db = self._get_db() + if db is None: + return 0 + try: + cur = db.execute( + "DELETE FROM chunks WHERE domain = ? 
AND heading = ?", + (domain, heading), + ) + db.commit() + return int(cur.rowcount) + except sqlite3.DatabaseError as exc: + log.warning("[knowledge] delete_by_heading failed: %s", exc) + return 0 + finally: + db.close() diff --git a/requirements.txt b/requirements.txt index 9cb6ff6..aa05284 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,6 +4,7 @@ uvicorn>=0.30 langfuse>=3.0 prometheus-client>=0.20 pyyaml>=6.0 +ruamel.yaml>=0.18 # round-trip YAML that preserves comments in config/langgraph-config.yaml when the drawer writes back edits websockets>=12.0 # LangGraph agent backend @@ -15,3 +16,7 @@ langchain-openai>=0.3.0 # Starter tools (tools/lg_tools.py) ddgs>=9.0 beautifulsoup4>=4.12 + +# Scheduler (scheduler/local.py — cron expression parsing for the +# bundled local backend; the Workstacean adapter doesn't need this) +croniter>=2.0 diff --git a/scheduler/__init__.py b/scheduler/__init__.py new file mode 100644 index 0000000..6828056 --- /dev/null +++ b/scheduler/__init__.py @@ -0,0 +1,27 @@ +"""Pluggable scheduler for future-task delivery. + +Two backends ship by default: + +- ``LocalScheduler`` — sqlite + asyncio. Bundled, zero external + dependencies, per-agent persistence path. Use this for solo forks + or any deployment that doesn't already run protoWorkstacean. +- ``WorkstaceanScheduler`` — HTTP adapter to a protoWorkstacean + install. Topic-namespaced per agent so multiple ginas can share one + Workstacean and not collide. + +``server.py`` selects the backend at startup based on env vars; the +agent loop sees the same three tools (``schedule_task``, +``list_schedules``, ``cancel_schedule``) regardless of which backend +is wired up. + +Multi-agent safety: every job carries an ``agent_name`` (defaulted +from ``AGENT_NAME`` env / config) so that two protoAgent instances +sharing one storage path or one Workstacean install can't accidentally +fire each other's scheduled prompts. 
+""" + +from scheduler.interface import Job, SchedulerBackend +from scheduler.local import LocalScheduler +from scheduler.workstacean import WorkstaceanScheduler + +__all__ = ["Job", "LocalScheduler", "SchedulerBackend", "WorkstaceanScheduler"] diff --git a/scheduler/interface.py b/scheduler/interface.py new file mode 100644 index 0000000..6de9b3a --- /dev/null +++ b/scheduler/interface.py @@ -0,0 +1,114 @@ +"""Scheduler protocol — the contract every backend honors. + +Both ``LocalScheduler`` and ``WorkstaceanScheduler`` implement this +shape. The agent-facing tools in ``tools/lg_tools.py`` only see the +protocol; swapping backends is a server.py-level decision. +""" + +from __future__ import annotations + +import re +from dataclasses import asdict, dataclass, field +from datetime import UTC, datetime +from typing import Any, Protocol + + +@dataclass +class Job: + """A scheduled future invocation. + + ``schedule`` is either a 5-field cron expression (e.g. + ``"0 9 * * 1-5"``) or an ISO-8601 datetime for one-shot fires + (e.g. ``"2026-05-01T15:00:00+00:00"``). Backends auto-detect. + + ``agent_name`` namespaces the job — one Workstacean install or + shared sqlite path can serve N protoAgent instances without + cross-firing. + """ + + id: str + prompt: str + schedule: str + agent_name: str + created_at: str = field(default_factory=lambda: datetime.now(UTC).isoformat()) + next_fire: str | None = None # ISO; None means "compute on save" + last_fire: str | None = None + enabled: bool = True + + def as_dict(self) -> dict[str, Any]: + return asdict(self) + + +class SchedulerBackend(Protocol): + """The minimum surface every backend implements. + + Methods are sync because the agent tools wrap them in their own + async functions; backends that need to do async I/O (httpx in + Workstacean's case) handle it internally. 
+ """ + + name: str # short label for logs / agent-facing strings: "local", "workstacean" + + def add_job(self, prompt: str, schedule: str, *, job_id: str | None = None) -> Job: + """Persist a new job. Returns the stored ``Job`` (with + backend-assigned id and next_fire if the caller didn't set them). + + Raises ``ValueError`` for malformed schedule strings.""" + ... + + def cancel_job(self, job_id: str) -> bool: + """Remove a job. Returns ``True`` if a row was deleted.""" + ... + + def list_jobs(self) -> list[Job]: + """All jobs visible to the calling agent. Implementations are + responsible for filtering by ``agent_name`` so multi-agent + deployments stay isolated.""" + ... + + async def start(self) -> None: + """Start any background polling. No-op for backends that don't + need it (Workstacean dispatches and forgets).""" + ... + + async def stop(self) -> None: + """Cleanly shut down background work.""" + ... + + +# ── shared helpers ────────────────────────────────────────────────────────── + + +_CRON_PATTERN = re.compile(r"^\s*\S+\s+\S+\s+\S+\s+\S+\s+\S+\s*$") + + +def is_cron(schedule: str) -> bool: + """Heuristic: does ``schedule`` look like a 5-field cron expression? + + Used by both backends to decide between cron-iter and + ``datetime.fromisoformat``. Doesn't validate semantics — that + happens when the schedule is parsed. + """ + return bool(_CRON_PATTERN.match(schedule)) and not _looks_like_iso(schedule) + + +def _looks_like_iso(schedule: str) -> bool: + # ISO datetimes contain ``-`` and either ``T`` or a space between + # date and time. Cron has neither in the first field. + return "T" in schedule or _has_iso_date_prefix(schedule) + + +def _has_iso_date_prefix(schedule: str) -> bool: + head = schedule.strip().split(" ", 1)[0] + return bool(re.match(r"^\d{4}-\d{2}-\d{2}", head)) + + +def parse_iso_to_utc(schedule: str) -> datetime: + """Parse an ISO-8601 datetime, treating naive inputs as UTC. + + Raises ``ValueError`` for malformed strings. 
+ """ + dt = datetime.fromisoformat(schedule) + if dt.tzinfo is None: + dt = dt.replace(tzinfo=UTC) + return dt.astimezone(UTC) diff --git a/scheduler/local.py b/scheduler/local.py new file mode 100644 index 0000000..37187cc --- /dev/null +++ b/scheduler/local.py @@ -0,0 +1,432 @@ +"""LocalScheduler — bundled sqlite + asyncio backend. + +The default scheduler when no protoWorkstacean install is configured. +Every protoAgent instance gets a private ``jobs.db`` namespaced by +``AGENT_NAME`` so spinning up gina-personal alongside gina-work +doesn't cross-fire prompts. + +Architecture: + +- One ``jobs`` table — ``id``, ``prompt``, ``schedule``, ``next_fire``, + ``agent_name``, ``last_fire``, ``enabled``, ``created_at``. +- Polling coroutine runs on FastAPI's startup hook (``server.py``) + and ticks once per ``_POLL_INTERVAL_S`` (1s default). Cheap because + sqlite reads with an indexed ``next_fire`` filter cost microseconds. +- Firing = HTTP POST to the running agent's own ``/a2a`` endpoint as + a ``message/send``. Going through HTTP rather than calling into the + graph directly gets us free parity with real callers — same audit + log, same cost-v1 capture, same auth path. +- One-shot ISO schedules are deleted after firing. Cron schedules + reschedule via croniter. +- On startup: any job whose ``next_fire`` is in the past but within a + 24h window fires immediately (BFCL-style "missed fires" recovery, + matching Workstacean's behaviour). Older missed fires are + rescheduled forward without firing — better than waking the agent + to a flood of stale prompts after a long downtime. 
+""" + +from __future__ import annotations + +import asyncio +import logging +import os +import sqlite3 +import uuid +from datetime import UTC, datetime, timedelta +from pathlib import Path +from typing import Any + +from croniter import croniter + +from scheduler.interface import Job, is_cron, parse_iso_to_utc + +log = logging.getLogger(__name__) + +DEFAULT_DB_DIR = "/sandbox/scheduler" +_POLL_INTERVAL_S = 1.0 +_MISSED_FIRE_WINDOW_S = 24 * 60 * 60 # 24h — matches Workstacean + + +def _resolve_db_path(db_dir: str | Path | None, agent_name: str) -> Path: + """Pick a writable jobs.db path namespaced by agent name. + + ``agent_name`` is sanitized to a single path segment before being + appended — operators set it via env or YAML, but defence in depth + against a value like ``../etc/passwd`` or ``/tmp/elsewhere`` is + cheap and prevents an exotic typo from putting a sqlite file + outside the configured scheduler dir. + """ + safe_name = _safe_segment(agent_name) + raw = os.environ.get("SCHEDULER_DB_DIR") or db_dir or DEFAULT_DB_DIR + base = Path(str(raw)).expanduser() / safe_name + try: + base.mkdir(parents=True, exist_ok=True) + probe = base / ".write-probe" + probe.touch() + probe.unlink() + return base / "jobs.db" + except OSError: + fallback = Path.home() / ".protoagent" / "scheduler" / safe_name + fallback.mkdir(parents=True, exist_ok=True) + log.info("[scheduler] %s not writable; using %s instead", base, fallback) + return fallback / "jobs.db" + + +def _safe_segment(name: str) -> str: + """Reduce ``name`` to a single safe path segment. + + Replaces path separators, ``..``, and absolute-path prefixes with + underscores; falls back to ``"default"`` when nothing usable + remains. Preserves the common slug shape (``gina-personal``, + ``ginavision``) without surprises. 
+ """ + if not name: + return "default" + cleaned = name.replace("/", "_").replace("\\", "_").replace("..", "_") + cleaned = cleaned.lstrip(".").strip() + return cleaned or "default" + + +def _now_iso() -> str: + return datetime.now(UTC).isoformat() + + +def _compute_next_fire(schedule: str, *, after: datetime | None = None) -> str: + """Resolve a schedule string to the next ISO timestamp it fires. + + ``after`` controls when "next" starts — current time by default; + pass an explicit reference when rescheduling a cron job after a + fire so successive fires don't drift. + """ + after = after or datetime.now(UTC) + if is_cron(schedule): + return croniter(schedule, after).get_next(datetime).astimezone(UTC).isoformat() + return parse_iso_to_utc(schedule).isoformat() + + +_SCHEMA = """ +CREATE TABLE IF NOT EXISTS jobs ( + id TEXT PRIMARY KEY, + prompt TEXT NOT NULL, + schedule TEXT NOT NULL, + agent_name TEXT NOT NULL, + next_fire TEXT NOT NULL, + last_fire TEXT, + enabled INTEGER NOT NULL DEFAULT 1, + created_at TEXT NOT NULL +); + +CREATE INDEX IF NOT EXISTS idx_jobs_next_fire ON jobs(next_fire); +CREATE INDEX IF NOT EXISTS idx_jobs_agent_name ON jobs(agent_name); +""" + + +class LocalScheduler: + """Sqlite-backed scheduler with an asyncio polling loop. + + Construct once at server startup, ``await scheduler.start()`` to + spawn the polling task, ``await scheduler.stop()`` on shutdown. + The agent-facing tools call ``add_job`` / ``cancel_job`` / + ``list_jobs`` synchronously. 
+ """ + + name = "local" + + def __init__( + self, + agent_name: str, + *, + invoke_url: str, + api_key: str | None = None, + bearer_token: str | None = None, + db_dir: str | Path | None = None, + ): + self.agent_name = agent_name + self._invoke_url = invoke_url.rstrip("/") + self._api_key = api_key or "" + self._bearer = bearer_token or "" + self.path = _resolve_db_path(db_dir, agent_name) + self._task: asyncio.Task | None = None + self._stopping = False + self._init_db() + + # ── DB plumbing ───────────────────────────────────────────────────────── + + def _connect(self) -> sqlite3.Connection: + db = sqlite3.connect(str(self.path)) + db.row_factory = sqlite3.Row + try: + db.execute("PRAGMA journal_mode=WAL") + except sqlite3.OperationalError as exc: + log.debug("[scheduler] WAL skipped: %s", exc) + return db + + def _init_db(self) -> None: + try: + db = self._connect() + db.executescript(_SCHEMA) + db.commit() + db.close() + except sqlite3.DatabaseError: + log.exception("[scheduler] schema init failed at %s", self.path) + + # ── public API (matches SchedulerBackend) ─────────────────────────────── + + def add_job(self, prompt: str, schedule: str, *, job_id: str | None = None) -> Job: + if not prompt or not prompt.strip(): + raise ValueError("scheduler: prompt is required") + next_fire = _compute_next_fire(schedule) # raises ValueError for malformed input + + job = Job( + id=job_id or self._generate_id(), + prompt=prompt, + schedule=schedule, + agent_name=self.agent_name, + next_fire=next_fire, + ) + db = self._connect() + try: + db.execute( + "INSERT INTO jobs (id, prompt, schedule, agent_name, next_fire, " + "last_fire, enabled, created_at) VALUES (?, ?, ?, ?, ?, ?, ?, ?)", + (job.id, job.prompt, job.schedule, job.agent_name, + job.next_fire, job.last_fire, int(job.enabled), job.created_at), + ) + db.commit() + except sqlite3.IntegrityError as exc: + raise ValueError(f"job id {job.id!r} already exists") from exc + finally: + db.close() + return job + + def 
cancel_job(self, job_id: str) -> bool: + db = self._connect() + try: + cur = db.execute( + "DELETE FROM jobs WHERE id = ? AND agent_name = ?", + (job_id, self.agent_name), + ) + db.commit() + return cur.rowcount > 0 + except sqlite3.DatabaseError as exc: + log.warning("[scheduler] cancel_job failed: %s", exc) + return False + finally: + db.close() + + def list_jobs(self) -> list[Job]: + db = self._connect() + try: + rows = db.execute( + "SELECT * FROM jobs WHERE agent_name = ? ORDER BY next_fire ASC", + (self.agent_name,), + ).fetchall() + except sqlite3.DatabaseError as exc: + log.warning("[scheduler] list_jobs failed: %s", exc) + return [] + finally: + db.close() + return [_row_to_job(r) for r in rows] + + async def start(self) -> None: + if self._task is not None: + return + self._stopping = False + self._recover_missed_fires() + self._task = asyncio.create_task(self._poll_loop(), name="scheduler.local.poll") + log.info( + "[scheduler] local backend started: agent=%s db=%s", + self.agent_name, self.path, + ) + + async def stop(self) -> None: + self._stopping = True + if self._task is None: + return + self._task.cancel() + try: + await self._task + except asyncio.CancelledError: + # Expected — we just cancelled it. + pass + except Exception: # noqa: BLE001 + # Anything else means the polling loop crashed during + # shutdown. Log with traceback so we can debug; don't + # re-raise (caller is in shutdown path, raising would + # mask the original shutdown trigger). 
+ log.exception("[scheduler] polling task raised during stop") + self._task = None + log.info("[scheduler] local backend stopped") + + # ── polling + firing ──────────────────────────────────────────────────── + + async def _poll_loop(self) -> None: + while not self._stopping: + try: + await self._tick() + except Exception: # noqa: BLE001 + log.exception("[scheduler] poll tick failed") + try: + await asyncio.sleep(_POLL_INTERVAL_S) + except asyncio.CancelledError: + return + + async def _tick(self) -> None: + now = datetime.now(UTC) + due = self._claim_due_jobs(now) + for job in due: + # Reschedule (or delete) only when delivery actually + # succeeded. A transient HTTP failure leaves the row in + # place so the next tick retries; a one-shot stays alive + # until it lands rather than vanishing on the first + # network blip. + if await self._fire(job): + self._reschedule_or_delete(job, fired_at=now) + else: + log.warning( + "[scheduler] fire failed for job %s; leaving in place for retry", + job.id, + ) + + def _claim_due_jobs(self, now: datetime) -> list[Job]: + db = self._connect() + try: + rows = db.execute( + "SELECT * FROM jobs WHERE agent_name = ? AND enabled = 1 " + "AND next_fire <= ? ORDER BY next_fire ASC", + (self.agent_name, now.isoformat()), + ).fetchall() + except sqlite3.DatabaseError as exc: + log.warning("[scheduler] _claim_due_jobs failed: %s", exc) + return [] + finally: + db.close() + return [_row_to_job(r) for r in rows] + + def _reschedule_or_delete(self, job: Job, *, fired_at: datetime) -> None: + """Cron jobs roll forward; one-shot jobs are deleted.""" + db = self._connect() + try: + if is_cron(job.schedule): + next_iso = _compute_next_fire(job.schedule, after=fired_at) + db.execute( + "UPDATE jobs SET next_fire = ?, last_fire = ? 
WHERE id = ?", + (next_iso, fired_at.isoformat(), job.id), + ) + else: + db.execute("DELETE FROM jobs WHERE id = ?", (job.id,)) + db.commit() + except sqlite3.DatabaseError: + log.exception("[scheduler] reschedule failed for job %s", job.id) + finally: + db.close() + + def _recover_missed_fires(self) -> None: + """Roll past-due jobs forward on startup. + + - Missed fires within the last 24h fire immediately on the next + tick (we leave their ``next_fire`` in the past so the polling + loop picks them up naturally). + - Older missed fires are rescheduled forward without firing — + firing a flood of stale prompts after a long downtime is worse + than dropping them. + """ + cutoff_recent = datetime.now(UTC) - timedelta(seconds=_MISSED_FIRE_WINDOW_S) + db = self._connect() + try: + rows = db.execute( + "SELECT * FROM jobs WHERE agent_name = ? AND enabled = 1 " + "AND next_fire <= ?", + (self.agent_name, cutoff_recent.isoformat()), + ).fetchall() + for row in rows: + job = _row_to_job(row) + if is_cron(job.schedule): + next_iso = _compute_next_fire(job.schedule) + db.execute( + "UPDATE jobs SET next_fire = ? WHERE id = ?", + (next_iso, job.id), + ) + log.info( + "[scheduler] dropped stale fire for job %s; next at %s", + job.id, next_iso, + ) + else: + db.execute("DELETE FROM jobs WHERE id = ?", (job.id,)) + log.info("[scheduler] dropped stale one-shot job %s", job.id) + db.commit() + except sqlite3.DatabaseError: + log.exception("[scheduler] missed-fire recovery failed") + finally: + db.close() + + async def _fire(self, job: Job) -> bool: + """Deliver a job by POSTing to the agent's own A2A endpoint. + + Returns ``True`` on a 2xx response, ``False`` on any HTTP + error or network exception. Callers use the return value to + decide whether to advance the schedule (success) or leave + the row in place for the next tick to retry (failure). 
+ """ + import httpx + + headers = {"Content-Type": "application/json"} + if self._bearer: + headers["Authorization"] = f"Bearer {self._bearer}" + if self._api_key: + headers["X-API-Key"] = self._api_key + + message_id = str(uuid.uuid4()) + body = { + "jsonrpc": "2.0", + "id": message_id, + "method": "message/send", + "params": { + "message": { + "role": "user", + "parts": [{"kind": "text", "text": job.prompt}], + "messageId": message_id, + }, + # Custom metadata goes at params.metadata — that's + # where a2a_handler._a2a_rpc reads it (see + # ``msg_metadata = params.get("metadata")``). Putting + # it inside params.message.metadata silently drops it. + "metadata": { + "scheduler_job_id": job.id, + "scheduler_kind": "local", + }, + }, + } + try: + async with httpx.AsyncClient(timeout=30) as client: + r = await client.post(f"{self._invoke_url}/a2a", headers=headers, json=body) + if r.status_code >= 400: + log.error( + "[scheduler] fire failed for job %s: HTTP %d %s", + job.id, r.status_code, r.text[:200], + ) + return False + log.info("[scheduler] fired job %s", job.id) + return True + except Exception: # noqa: BLE001 + log.exception("[scheduler] fire exception for job %s", job.id) + return False + + def _generate_id(self) -> str: + # Agent-name prefix keeps cross-agent IDs distinct in shared + # observability surfaces (audit log, dashboards) even though + # the DB row is already namespaced by agent_name. 
+ return f"{self.agent_name}-{uuid.uuid4().hex[:12]}" + + +def _row_to_job(row: Any) -> Job: + return Job( + id=row["id"], + prompt=row["prompt"], + schedule=row["schedule"], + agent_name=row["agent_name"], + next_fire=row["next_fire"], + last_fire=row["last_fire"], + enabled=bool(row["enabled"]), + created_at=row["created_at"], + ) diff --git a/scheduler/workstacean.py b/scheduler/workstacean.py new file mode 100644 index 0000000..56df684 --- /dev/null +++ b/scheduler/workstacean.py @@ -0,0 +1,183 @@ +"""WorkstaceanScheduler — HTTP adapter to a protoWorkstacean install. + +Activated automatically when ``WORKSTACEAN_API_BASE`` and +``WORKSTACEAN_API_KEY`` are set (see ``server.py``). + +Speaks Workstacean's ``POST /publish`` API as documented at +https://protolabsai.github.io/protoWorkstacean/reference/scheduler/. +Every job is namespaced with the agent's name so multiple protoAgent +forks (e.g. ``gina-personal`` + ``gina-work``) can share one +Workstacean install without cross-firing: + +- Job IDs are prefixed: ``{agent_name}-{user_id_or_uuid}`` +- Topics are namespaced: ``cron.{agent_name}`` + +The adapter is fire-and-forget — Workstacean owns scheduling state. +``list_jobs()`` returns an empty list because Workstacean's list +action publishes asynchronously — strict local introspection requires +the local backend. + +Note: Workstacean today does not natively dispatch to A2A endpoints; +forks need to wire their Workstacean install to route ``cron.*`` +topics to the agent's A2A endpoint. See the linked guide for the +recommended bridge config. 
+""" + +from __future__ import annotations + +import logging +import os +import uuid +from typing import Any + +import httpx + +from scheduler.interface import Job, parse_iso_to_utc, is_cron + +log = logging.getLogger(__name__) + +DEFAULT_TIMEOUT_S = 10 + + +class WorkstaceanScheduler: + """HTTP adapter to a Workstacean ``/publish`` endpoint.""" + + name = "workstacean" + + def __init__( + self, + agent_name: str, + *, + base_url: str, + api_key: str, + topic_prefix: str | None = None, + timeout_s: float = DEFAULT_TIMEOUT_S, + ): + if not base_url: + raise ValueError("WorkstaceanScheduler: base_url is required") + if not api_key: + raise ValueError("WorkstaceanScheduler: api_key is required") + self.agent_name = agent_name + self._base_url = base_url.rstrip("/") + self._api_key = api_key + # Namespacing: topic_prefix governs which Workstacean topic the + # job fires on. Default = ``cron.{agent_name}``. Forks can override + # via ``WORKSTACEAN_TOPIC_PREFIX`` to integrate with existing + # bus conventions. + self._topic_prefix = topic_prefix or f"cron.{agent_name}" + self._timeout_s = timeout_s + + # ── public API ────────────────────────────────────────────────────────── + + def add_job(self, prompt: str, schedule: str, *, job_id: str | None = None) -> Job: + if not prompt or not prompt.strip(): + raise ValueError("scheduler: prompt is required") + # Validate the schedule eagerly so a malformed expr fails at + # tool-call time, not silently inside Workstacean. + _validate_schedule(schedule) + + normalized_id = self._namespaced_id(job_id) + topic = f"{self._topic_prefix}.{normalized_id}" + # Workstacean expects an outer ``command.schedule`` topic and + # the inner ``payload`` carries both the trigger schedule and + # the actual message that will be fired. The inner ``topic`` + # is what Workstacean publishes to when the schedule fires — + # so it has to be something a downstream A2A bridge subscribes + # to. Default convention: ``cron.{agent_name}.{job_id}``.
+ body = { + "topic": "command.schedule", + "payload": { + "action": "add", + "id": normalized_id, + "schedule": schedule, + "topic": topic, + "payload": { + "content": prompt, + "sender": "scheduler", + "channel": "a2a", + # Cross-system breadcrumb so the bridge knows which + # protoAgent fork the message belongs to. + "agent_name": self.agent_name, + "scheduler_job_id": normalized_id, + }, + }, + } + self._publish(body) + + return Job( + id=normalized_id, + prompt=prompt, + schedule=schedule, + agent_name=self.agent_name, + next_fire=None, # Workstacean owns the schedule state + ) + + def cancel_job(self, job_id: str) -> bool: + body = { + "topic": "command.schedule", + "payload": {"action": "remove", "id": self._namespaced_id(job_id)}, + } + try: + self._publish(body) + return True + except RuntimeError as exc: + log.warning("[scheduler] workstacean cancel failed: %s", exc) + return False + + def list_jobs(self) -> list[Job]: + """Returns ``[]`` from the adapter. + + Workstacean's ``list`` action publishes its response on the + ``schedule.list`` topic — there is no synchronous reply on + ``/publish``. Subscribing to that topic from inside a + protoAgent process (without a full bus client) is more + machinery than this adapter is the right layer for. Forks + that need live introspection should run the local backend or + query Workstacean directly. + """ + return [] + + async def start(self) -> None: + # Workstacean owns scheduling state — nothing to start here. 
+ log.info( + "[scheduler] workstacean backend ready: agent=%s base=%s topic=%s.*", + self.agent_name, self._base_url, self._topic_prefix, + ) + + async def stop(self) -> None: + return None + + # ── helpers ───────────────────────────────────────────────────────────── + + def _publish(self, body: dict[str, Any]) -> None: + headers = {"Content-Type": "application/json", "X-API-Key": self._api_key} + try: + r = httpx.post( + f"{self._base_url}/publish", + headers=headers, + json=body, + timeout=self._timeout_s, + ) + except httpx.HTTPError as exc: + raise RuntimeError(f"workstacean publish failed: {exc}") from exc + if r.status_code >= 400: + raise RuntimeError( + f"workstacean publish HTTP {r.status_code}: {r.text[:200]}" + ) + + def _namespaced_id(self, job_id: str | None) -> str: + suffix = job_id or uuid.uuid4().hex[:12] + prefix = f"{self.agent_name}-" + return suffix if suffix.startswith(prefix) else prefix + suffix + + +def _validate_schedule(schedule: str) -> None: + """Validate cron expression OR ISO datetime. 
Raises ValueError.""" + if is_cron(schedule): + from croniter import croniter + try: + croniter(schedule) + except (TypeError, ValueError) as exc: + raise ValueError(f"invalid cron expression {schedule!r}: {exc}") from exc + return + parse_iso_to_utc(schedule) # raises ValueError on malformed ISO diff --git a/server.py b/server.py index 2221b11..8b10e4f 100644 --- a/server.py +++ b/server.py @@ -30,10 +30,13 @@ import os import time from pathlib import Path -from typing import Any +from typing import TYPE_CHECKING, Any from graph.output_format import extract_output +if TYPE_CHECKING: + from scheduler.interface import SchedulerBackend + # --------------------------------------------------------------------------- # Logging # --------------------------------------------------------------------------- @@ -55,21 +58,563 @@ _graph = None # LangGraph compiled graph _graph_config = None # LangGraphConfig _checkpointer = None # MemorySaver for session persistence +_active_port = 7870 # populated by _main() — the port this process is actually bound to. + # Read by the autostart installer so the LaunchAgent reboots + # on the same port the operator launched with, not the default. +_scheduler = None # SchedulerBackend (LocalScheduler or WorkstaceanScheduler). + # Constructed at init, started on FastAPI startup, stopped + # on shutdown. Lifecycle is hooked in _main() so the + # polling coroutine doesn't leak on server reload. def _init_langgraph_agent(): - """Initialize the LangGraph agent backend.""" + """Initialize the LangGraph backend — setup-aware. + + Always loads the config + checkpointer so the wizard and drawer + can introspect what's on disk. The compiled graph is only built + when the setup wizard has been completed (``.setup-complete`` + marker present). This lets the server boot cleanly on a fresh + clone with no model credentials — the wizard drives the user to + provide them, then triggers a reload. 
+ """ global _graph, _graph_config, _checkpointer - from graph.agent import create_agent_graph from graph.config import LangGraphConfig + from graph.config_io import is_setup_complete from langgraph.checkpoint.memory import MemorySaver config_path = Path(__file__).parent / "config" / "langgraph-config.yaml" _graph_config = LangGraphConfig.from_yaml(config_path) _checkpointer = MemorySaver() - _graph = create_agent_graph(_graph_config) - log.info("LangGraph agent initialized (model: %s)", _graph_config.model_name) + + if not is_setup_complete(): + _graph = None + log.info( + "Setup wizard has not been completed — graph not compiled. " + "Open the UI to finish setup.", + ) + return + + from graph.agent import create_agent_graph + + # Construct the default KnowledgeStore so memory tools (memory_ingest, + # memory_recall, daily_log) and KnowledgeMiddleware have something to + # bind to. Forks that don't want a store can set + # ``middleware.knowledge: false`` and remove the memory tools from + # the worker subagent — the store is still cheap to construct. + knowledge_store = _build_knowledge_store(_graph_config) + + # Scheduler — local sqlite by default, swaps to a WorkstaceanScheduler + # automatically when WORKSTACEAN_API_BASE + WORKSTACEAN_API_KEY env + # vars are set. Both backends share the same agent-tool surface + # (schedule_task / list_schedules / cancel_schedule). + global _scheduler + _scheduler = _build_scheduler(_graph_config) + + _graph = create_agent_graph( + _graph_config, knowledge_store=knowledge_store, scheduler=_scheduler, + ) + log.info( + "LangGraph agent initialized (model: %s, knowledge_db: %s, scheduler: %s)", + _graph_config.model_name, + getattr(knowledge_store, "path", "(disabled)"), + getattr(_scheduler, "name", "disabled"), + ) + + +def _build_knowledge_store(config): + """Return a ``KnowledgeStore`` bound to the configured DB path. 
+ + Best-effort: any sqlite-level failure is logged and the store + falls back to ``~/.protoagent/knowledge/agent.db`` automatically + (see ``knowledge.store._resolve_path``). Returns ``None`` only when + knowledge is disabled in config — kept as a separate code path so + forks can audit when the agent is running KB-less. + """ + if not getattr(config, "knowledge_middleware", True): + return None + try: + from knowledge import KnowledgeStore + return KnowledgeStore(db_path=config.knowledge_db_path) + except Exception as exc: + log.warning("[server] knowledge store init failed: %s; running KB-less", exc) + return None + + +def _start_scheduler_async(backend: "SchedulerBackend") -> None: + """Fire-and-forget scheduler.start() onto the running loop. + + Reload paths are sync but invoked from FastAPI request handlers, + so the running loop is available. Awaiting would force the entire + reload chain to become async — not worth it for one no-await + coroutine. + """ + import asyncio + try: + asyncio.get_running_loop().create_task(backend.start()) + except RuntimeError: + log.warning( + "[reload] no running event loop; scheduler will start " + "on next process boot", + ) + except Exception: + log.exception("[reload] scheduler start failed") + + +def _stop_scheduler_async(backend: "SchedulerBackend") -> None: + """Fire-and-forget scheduler.stop() onto the running loop. + + Used when the YAML toggle flips off mid-reload. The polling task + cancels cleanly; the next graph rebuild registers no scheduler + tools. + """ + import asyncio + try: + asyncio.get_running_loop().create_task(backend.stop()) + except RuntimeError: + log.warning("[reload] no running event loop; scheduler not stopped") + except Exception: + log.exception("[reload] scheduler stop failed") + + +def _build_scheduler(config) -> "SchedulerBackend | None": + """Return the active scheduler backend, or ``None`` when disabled. + + Selection order: + + 1. 
``WORKSTACEAN_API_BASE`` + ``WORKSTACEAN_API_KEY`` set → + ``WorkstaceanScheduler``. Forks running on the protoLabs fleet + infrastructure get this for free. + 2. Otherwise → ``LocalScheduler`` with sqlite at + ``/sandbox/scheduler//jobs.db``. + + Returns ``None`` when explicitly disabled via ``SCHEDULER_DISABLED=1`` + so a fork can ship without a scheduler at all. + + The agent's auth token + api-key are passed into the local backend + so its self-invocation HTTP call can pass through bearer / X-API-Key + auth — the scheduler hits the same A2A endpoint as a real caller. + """ + # Two opt-out paths, in priority order: + # 1. ``middleware.scheduler: false`` in YAML (drawer / wizard). + # This is the canonical opt-out — symmetric with + # ``middleware.knowledge`` / ``middleware.memory``. + # 2. ``SCHEDULER_DISABLED=1`` env var. Runtime escape hatch for + # fleet operators who need to kill the scheduler without + # editing config (e.g. emergency rollback). + if not getattr(config, "scheduler_enabled", True): + log.info("[server] scheduler disabled via middleware.scheduler config") + return None + if os.environ.get("SCHEDULER_DISABLED", "").lower() in ("1", "true", "yes"): + log.info("[server] scheduler disabled via SCHEDULER_DISABLED env") + return None + + name = agent_name() + workstacean_base = os.environ.get("WORKSTACEAN_API_BASE", "").strip() + workstacean_key = os.environ.get("WORKSTACEAN_API_KEY", "").strip() + if workstacean_base and workstacean_key: + try: + from scheduler import WorkstaceanScheduler + return WorkstaceanScheduler( + agent_name=name, + base_url=workstacean_base, + api_key=workstacean_key, + topic_prefix=os.environ.get("WORKSTACEAN_TOPIC_PREFIX") or None, + ) + except Exception as exc: + log.warning( + "[server] WorkstaceanScheduler init failed: %s; falling back to local", + exc, + ) + + try: + from scheduler import LocalScheduler + invoke_url = os.environ.get( + "SCHEDULER_INVOKE_URL", + f"http://127.0.0.1:{_active_port}", + ) + bearer = 
(config.auth_token or os.environ.get("A2A_AUTH_TOKEN", "")).strip() + # The A2A handler reads X-API-Key from ``{AGENT_NAME}_API_KEY`` + # (server.py L893 — note: the env-derived name, NOT the wizard-set + # ``identity.name``). Match that here so a wizard rename doesn't + # break self-invocation auth. + api_key_env = f"{AGENT_NAME_ENV.upper()}_API_KEY" + api_key = os.environ.get(api_key_env, "").strip() + return LocalScheduler( + agent_name=name, + invoke_url=invoke_url, + api_key=api_key, + bearer_token=bearer, + ) + except Exception as exc: + log.warning( + "[server] LocalScheduler init failed: %s; running scheduler-less", + exc, + ) + return None + + +def _reload_langgraph_agent() -> tuple[bool, str]: + """Rebuild the compiled graph from the latest config YAML. + + Called by the drawer's Save & Reload action and the + ``/api/config/reload`` endpoint. Preserves the existing + ``_checkpointer`` so active session threads stay addressable + — a fresh MemorySaver would orphan every in-flight thread. + + Rebinding ``_graph`` is atomic in CPython; in-flight + ``astream_events`` iterators hold their own reference to the + prior graph and finish cleanly on the old instance. + + If the setup marker is absent this returns early without + compiling — the wizard is still in front of the user, so there + is nothing to hot-swap yet. + """ + global _graph, _graph_config + + from graph.agent import create_agent_graph + from graph.config import LangGraphConfig + from graph.config_io import is_setup_complete + + config_path = Path(__file__).parent / "config" / "langgraph-config.yaml" + try: + new_config = LangGraphConfig.from_yaml(config_path) + except Exception as e: + log.exception("[reload] config load failed") + return False, f"config load failed: {e}" + + # Build the graph FIRST (when setup is complete) — only commit + # runtime state after the rebuild succeeds.
Doing the swap first + # would leave the process serving the prior compiled _graph under + # fresh _graph_config + rotated bearer auth on failure — the + # metrics / card / auth all de-sync from what's actually running. + # Plan the scheduler swap *before* attempting the graph rebuild so + # the polling loop isn't torn down (or a fresh one started) until + # we know the rebuild will succeed. Three states: + # + # 1. Toggle flipped OFF, scheduler currently running → next graph + # uses None; we stop the running scheduler only after commit. + # 2. Toggle ON, none running (first-run after setup completes) → + # construct now (cheap), start only after commit. + # 3. Toggle ON, already running → reuse. Drawer saves don't tear + # down the polling loop. + # + # Env-driven config (WORKSTACEAN_API_BASE) only takes effect on + # full process restart; the YAML toggle is the canonical + # reload-time switch. + global _scheduler + scheduler_wanted = getattr(new_config, "scheduler_enabled", True) + next_scheduler: "SchedulerBackend | None" + pending_start: "SchedulerBackend | None" = None + pending_stop: "SchedulerBackend | None" = None + if not scheduler_wanted: + next_scheduler = None + pending_stop = _scheduler # may be None — stopper is no-op then + elif _scheduler is None: + next_scheduler = _build_scheduler(new_config) + pending_start = next_scheduler + else: + next_scheduler = _scheduler + + if is_setup_complete(): + try: + new_store = _build_knowledge_store(new_config) + new_graph = create_agent_graph( + new_config, knowledge_store=new_store, scheduler=next_scheduler, + ) + except Exception as e: + log.exception("[reload] graph rebuild failed") + # Scheduler state hasn't been committed yet — caller's + # running scheduler keeps polling, no orphaned tasks. + return False, f"graph rebuild failed: {e}" + else: + new_graph = None + + # Commit: config → A2A bearer → graph. All three reference the + # same ``new_config`` so they stay consistent. 
+ _graph_config = new_config + try: + from a2a_handler import set_a2a_token + + set_a2a_token(new_config.auth_token or None) + except ImportError: + # a2a_handler not yet imported (e.g. during early-boot reload + # before _main wires routes) — harmless. + pass + _graph = new_graph + # Commit the scheduler swap. start/stop are async — fire-and-forget + # onto the active loop so reload stays sync. We've already verified + # the graph rebuild succeeded; if start/stop fails we log but + # don't roll back (the agent is already serving the new graph). + _scheduler = next_scheduler + if pending_stop is not None: + _stop_scheduler_async(pending_stop) + if pending_start is not None: + _start_scheduler_async(pending_start) + + if new_graph is None: + log.info("[reload] setup not complete — config reloaded, graph not compiled") + return True, "config reloaded • setup not complete" + + log.info("LangGraph agent reloaded (model: %s)", _graph_config.model_name) + return True, f"reloaded • model={_graph_config.model_name}" + + +def _sync_autostart_with_config(config: dict | None) -> str | None: + """Align the OS autostart artifact with the YAML runtime flag. + + Returns a short status string to append to the caller's message + log, or ``None`` when the config doesn't touch the runtime + section. Shared by ``finish_setup`` (wizard path) and + ``_apply_settings_changes`` (drawer path) so both surfaces + produce the same side effect when the checkbox flips. 
+ """ + if not (config and "runtime" in config): + return None + want = bool(config.get("runtime", {}).get("autostart_on_boot", False)) + + try: + from autostart import install_autostart, uninstall_autostart + + as_name = ( + config.get("identity", {}).get("name") + or (_graph_config.identity_name if _graph_config else "") + or "protoagent" + ) + if want: + ok, msg = install_autostart(agent_name=as_name, port=_active_port) + else: + ok, msg = uninstall_autostart(agent_name=as_name) + except Exception as e: + log.exception("[autostart] sync raised") + return f"autostart failed: {e}" + + if not ok: + log.warning("[autostart] sync failed: %s", msg) + return f"autostart: {msg}" + + +def _apply_settings_changes( + config: dict | None = None, + soul: str | None = None, +) -> tuple[bool, list[str]]: + """Persist config YAML + SOUL.md then reload the graph once. + + Passing ``None`` for either argument skips that write — a bare + call with both None acts as a pure reload (useful for picking up + external file edits). + """ + from graph.config_io import ( + apply_updates_to_yaml, + load_yaml_doc, + save_yaml_doc, + validate_config_dict, + write_soul, + ) + + messages: list[str] = [] + + if config is not None: + ok, err = validate_config_dict(config) + if not ok: + return False, [f"validation: {err}"] + try: + doc = load_yaml_doc() + apply_updates_to_yaml(doc, config) + save_yaml_doc(doc) + messages.append("config saved") + except Exception as e: + log.exception("[config] YAML write failed") + return False, [f"config write: {e}"] + + if soul is not None: + try: + paths = write_soul(soul) + messages.append(f"SOUL saved ({len(paths)} path{'s' if len(paths) != 1 else ''})") + except Exception as e: + log.exception("[config] SOUL write failed") + return False, [f"soul write: {e}"] + + # Drawer toggles of runtime.autostart_on_boot ride this path, + # not the wizard's finish_setup, so the LaunchAgent plist has + # to be installed/removed here too. 
+ as_msg = _sync_autostart_with_config(config) + if as_msg: + messages.append(as_msg) + + ok, reload_msg = _reload_langgraph_agent() + messages.append(reload_msg) + return ok, messages + + +def _build_settings_callbacks() -> dict[str, Any]: + """Callbacks consumed by the Gradio Configuration drawer + wizard.""" + from graph.config_io import ( + config_to_dict, + is_setup_complete, + list_available_tools, + list_gateway_models, + list_soul_presets, + mark_setup_complete, + read_soul, + read_soul_preset, + reset_setup, + ) + + def get_config() -> dict[str, Any]: + return config_to_dict(_graph_config) + + def list_models(api_base: str = "", api_key: str = "") -> tuple[list[str], str]: + """UI-friendly model lookup. + + Uses the form-local api_base/api_key when the user is trying a + different endpoint before saving; falls back to the currently + loaded graph config so the initial render works without + arguments. + """ + base = api_base or (_graph_config.api_base if _graph_config else "") + key = api_key or (_graph_config.api_key if _graph_config else "") + return list_gateway_models(base, key) + + def save_all(config: dict | None, soul: str | None) -> tuple[bool, str]: + ok, messages = _apply_settings_changes(config=config, soul=soul) + return ok, " • ".join(messages) + + def finish_setup(config: dict | None, soul: str | None) -> tuple[bool, str]: + """Wizard terminal action — write everything, mark complete, reload. + + Ordering matters: + + 1. Write config YAML + SOUL.md (no reload yet). + 2. ``mark_setup_complete()`` — flip the marker BEFORE the + reload so ``_reload_langgraph_agent`` actually compiles + the graph. Doing it after means the reload sees + setup-incomplete and stays ``_graph = None``. + 3. Sync autostart (LaunchAgent plist is independent of the + graph, so it can happen any time after the config is + written). + 4. Reload — marker present, graph compiles, chat works. + + Returns a single status string joining per-step messages. 
+ """ + from graph.config_io import ( + apply_updates_to_yaml, + load_yaml_doc, + save_yaml_doc, + validate_config_dict, + write_soul, + ) + + messages: list[str] = [] + + # 1. Persist + if config is not None: + ok, err = validate_config_dict(config) + if not ok: + return False, f"validation: {err}" + try: + doc = load_yaml_doc() + apply_updates_to_yaml(doc, config) + save_yaml_doc(doc) + messages.append("config saved") + except Exception as e: + log.exception("[setup] YAML write failed: %s", e) + return False, f"config write: {e}" + + if soul is not None: + try: + paths = write_soul(soul) + messages.append(f"SOUL saved ({len(paths)} path{'s' if len(paths) != 1 else ''})") + except Exception as e: + log.exception("[setup] SOUL write failed: %s", e) + return False, f"soul write: {e}" + + # 2. Flip the marker — MUST be before reload so the graph builds + mark_setup_complete() + messages.append("setup marked complete") + + # 3. Autostart sync (shared helper — drawer path runs the same) + as_msg = _sync_autostart_with_config(config) + if as_msg: + messages.append(as_msg) + + # 4. Reload — now picks up setup_complete=True and compiles. + # On failure, roll back the marker so the next page load + # drops the user back into the wizard instead of landing + # them in the chat UI with the "setup required" fallback + # and no obvious way to retry. + ok, reload_msg = _reload_langgraph_agent() + messages.append(reload_msg) + if not ok: + reset_setup() + messages.append("setup marker rolled back — re-run the wizard after fixing the error above") + + return ok, " • ".join(messages) + + def restart_setup() -> str: + """Drawer action — delete the marker so the wizard runs again.""" + reset_setup() + log.info("[setup] marker removed — wizard will run on next page load") + return "setup marker removed • reload the page to run the wizard" + + def autostart_info() -> dict[str, Any]: + """Report platform support + current on-disk state. 
The drawer + uses this to render the toggle correctly and to print the + plist path for debugging.""" + try: + from autostart import autostart_status + + name = (_graph_config.identity_name if _graph_config else "") or "protoagent" + return autostart_status(name) + except Exception as e: + return {"supported": False, "installed": False, "reason": str(e)} + + def toggle_autostart(enabled: bool) -> tuple[bool, str]: + """Install or uninstall the OS autostart artifact, mirroring + the YAML field. Called from the drawer's checkbox handler so + toggling takes effect immediately without waiting for Save.""" + try: + from autostart import install_autostart, uninstall_autostart + + name = (_graph_config.identity_name if _graph_config else "") or "protoagent" + if enabled: + return install_autostart(agent_name=name, port=_active_port) + return uninstall_autostart(agent_name=name) + except Exception as e: + return False, str(e) + + return { + "get_config": get_config, + "get_soul": read_soul, + "list_models": list_models, + "list_tools": list_available_tools, + "list_soul_presets": list_soul_presets, + "read_soul_preset": read_soul_preset, + "save_all": save_all, + "finish_setup": finish_setup, + "restart_setup": restart_setup, + "is_setup_complete": is_setup_complete, + "autostart_info": autostart_info, + "toggle_autostart": toggle_autostart, + } + + +def _setup_required_message() -> list[dict[str, Any]]: + """Returned by chat endpoints when the wizard hasn't been run. + + The Gradio UI hides the chat pane until setup completes, but the + HTTP /api/chat, OpenAI-compat, and A2A endpoints don't know the + UI state — so they emit a plain-text "finish setup first" + message instead of 500ing on ``_graph is None``. + """ + return [{ + "role": "assistant", + "content": ( + "**Setup required.** The setup wizard has not been completed. " + "Open the UI and finish the wizard, or POST the completed config " + "to `/api/config/setup` before calling chat endpoints." 
+ ), + }] # --------------------------------------------------------------------------- @@ -85,6 +630,8 @@ async def chat(message: str, session_id: str) -> list[dict[str, Any]]: capture tool events and emit the cost-v1 DataPart on the terminal artifact. """ + if _graph is None: + return _setup_required_message() return await _chat_langgraph(message, session_id) @@ -120,6 +667,10 @@ async def _chat_langgraph_stream( if caller_trace.get("spanId"): trace_meta["caller_span_id"] = caller_trace["spanId"] + if _graph is None: + yield ("error", "setup required — finish the setup wizard before calling A2A endpoints") + return + async with tracing.trace_session( session_id=session_id, name="a2a-stream", @@ -248,13 +799,28 @@ async def _chat_langgraph(message: str, session_id: str) -> list[dict[str, Any]] # Agent card — EDIT THIS when forking # --------------------------------------------------------------------------- -AGENT_NAME = os.environ.get("AGENT_NAME", "protoagent") +AGENT_NAME_ENV = os.environ.get("AGENT_NAME", "protoagent") + + +def agent_name() -> str: + """Resolve the active agent name. + + Preference order: wizard-set ``identity.name`` in YAML (when loaded + and non-placeholder) → ``AGENT_NAME`` env var → ``"protoagent"``. + The agent card, OpenAI-compat model id, and chat header all call + this so a wizard rename propagates without a restart. The + Prometheus metric prefix and ``{AGENT_NAME}_API_KEY`` env name are + set at boot and still require a restart (see docs).
+ """ + if _graph_config and _graph_config.identity_name and _graph_config.identity_name != "protoagent": + return _graph_config.identity_name + return AGENT_NAME_ENV def _build_security_schemes() -> dict: """Return securitySchemes dict, adding bearer only when A2A_AUTH_TOKEN is set.""" schemes: dict = {"apiKey": {"type": "apiKey", "in": "header", "name": "X-API-Key"}} - if os.environ.get("A2A_AUTH_TOKEN", ""): + if os.environ.get("A2A_AUTH_TOKEN", "") or (_graph_config and _graph_config.auth_token): schemes["bearer"] = {"type": "http", "scheme": "bearer"} return schemes @@ -281,7 +847,7 @@ def _build_agent_card(host: str) -> dict: it only if you strip the usage-capture. """ return { - "name": AGENT_NAME, + "name": agent_name(), "description": ( "protoAgent template — A2A-compliant LangGraph agent. " "Replace this description with your agent's actual purpose." @@ -326,10 +892,13 @@ def _build_agent_card(host: str) -> dict: # --------------------------------------------------------------------------- def _main(): - parser = argparse.ArgumentParser(description=f"{AGENT_NAME} — protoAgent server") + global _active_port + + parser = argparse.ArgumentParser(description=f"{AGENT_NAME_ENV} — protoAgent server") parser.add_argument("--port", type=int, default=7870) parser.add_argument("--config", type=str, default=None) args = parser.parse_args() + _active_port = args.port # Initialize observability import tracing @@ -343,10 +912,11 @@ def _main(): from chat_ui import create_chat_app blocks = create_chat_app( chat_fn=chat, - title=AGENT_NAME, + title=agent_name(), subtitle="protoAgent", placeholder="Send a message...", pwa=True, + settings=_build_settings_callbacks(), ) import gradio as gr @@ -356,7 +926,32 @@ def _main(): from fastapi.staticfiles import StaticFiles from pydantic import BaseModel as PydanticBaseModel - fastapi_app = FastAPI(title=f"{AGENT_NAME} — protoAgent") + fastapi_app = FastAPI(title=f"{agent_name()} — protoAgent") + + # --- Scheduler lifecycle 
------------------------------------------------ + # The local scheduler needs an asyncio polling task; the Workstacean + # adapter is a no-op start/stop. Both implement the same contract so + # we just call through. on_event is preferred over a lifespan + # context manager here — the rest of the boot is sync (uvicorn.run + # is the only blocking call) and FastAPI fires startup/shutdown + # around it. + @fastapi_app.on_event("startup") + async def _scheduler_startup() -> None: + if _scheduler is None: + return + try: + await _scheduler.start() + except Exception: + log.exception("[scheduler] startup failed") + + @fastapi_app.on_event("shutdown") + async def _scheduler_shutdown() -> None: + if _scheduler is None: + return + try: + await _scheduler.stop() + except Exception: + log.exception("[scheduler] shutdown failed") # --- Chat API ----------------------------------------------------------- class ChatRequest(PydanticBaseModel): @@ -369,6 +964,80 @@ async def _api_chat(req: ChatRequest): parts = [m["content"] for m in result if m.get("role") == "assistant" and m.get("content")] return {"response": "\n\n".join(parts), "messages": result} + # --- Live config / SOUL editing ---------------------------------------- + # GET returns the current config + persona so external clients (the + # Gradio drawer is one; curl is another) can mirror what's running. + # POST accepts partial edits — pass only the sections you want to + # change. Reload is automatic. 
+ class ConfigReloadRequest(PydanticBaseModel): + config: dict | None = None + soul: str | None = None + + @fastapi_app.get("/api/config") + async def _api_get_config(): + from graph.config_io import config_to_dict, read_soul + return { + "config": config_to_dict(_graph_config), + "soul": read_soul(), + } + + @fastapi_app.post("/api/config") + async def _api_post_config(req: ConfigReloadRequest): + ok, messages = _apply_settings_changes(config=req.config, soul=req.soul) + return {"ok": ok, "messages": messages} + + class ModelsProbeRequest(PydanticBaseModel): + api_base: str = "" + api_key: str = "" + + @fastapi_app.post("/api/config/models") + async def _api_list_models(req: ModelsProbeRequest | None = None): + """Fetch the gateway's model list. + + POST (body) not GET (query) so the caller's API key doesn't + end up in browser history, reverse-proxy access logs, or the + uvicorn request log. A blank body falls back to whatever key + and base are stored in the current config — useful for the + drawer's initial render where there's nothing to POST yet. + """ + from graph.config_io import list_gateway_models + + body = req or ModelsProbeRequest() + base = body.api_base or (_graph_config.api_base if _graph_config else "") + key = body.api_key or (_graph_config.api_key if _graph_config else "") + models, error = list_gateway_models(base, key) + return {"models": models, "error": error} + + # --- Setup wizard state ------------------------------------------------- + @fastapi_app.get("/api/config/setup-status") + async def _api_setup_status(): + from graph.config_io import is_setup_complete, list_soul_presets + return { + "setup_complete": is_setup_complete(), + "presets": list_soul_presets(), + } + + @fastapi_app.post("/api/config/setup") + async def _api_finish_setup(req: ConfigReloadRequest): + """Terminal wizard action over HTTP. 
Same semantics as the + drawer's ``finish_setup`` callback — writes everything, marks + setup complete, optionally installs autostart, then reloads. + """ + callbacks = _build_settings_callbacks() + ok, msg = callbacks["finish_setup"](req.config, req.soul) + return {"ok": ok, "message": msg} + + @fastapi_app.post("/api/config/reset-setup") + async def _api_reset_setup(): + from graph.config_io import reset_setup + reset_setup() + return {"ok": True, "message": "setup marker removed"} + + @fastapi_app.get("/api/config/presets/{name}") + async def _api_read_preset(name: str): + from graph.config_io import read_soul_preset + return {"name": name, "content": read_soul_preset(name)} + # --- OpenAI-compatible chat completions -------------------------------- # Lets this agent be registered as a model in the LiteLLM gateway / # OpenWebUI without any protocol adapter. @@ -386,19 +1055,19 @@ async def _openai_chat_completions(req: dict): parts = [m["content"] for m in result if m.get("role") == "assistant" and m.get("content")] content = "\n\n".join(parts) created = int(time.time()) - completion_id = f"{AGENT_NAME}-{session_id}" + completion_id = f"{agent_name()}-{session_id}" if stream: async def _stream(): chunk = { "id": completion_id, "object": "chat.completion.chunk", - "created": created, "model": AGENT_NAME, + "created": created, "model": agent_name(), "choices": [{"index": 0, "delta": {"role": "assistant", "content": content}, "finish_reason": None}], } yield f"data: {json.dumps(chunk)}\n\n" done_chunk = { "id": completion_id, "object": "chat.completion.chunk", - "created": created, "model": AGENT_NAME, + "created": created, "model": agent_name(), "choices": [{"index": 0, "delta": {}, "finish_reason": "stop"}], } yield f"data: {json.dumps(done_chunk)}\n\n" @@ -407,7 +1076,7 @@ async def _stream(): return { "id": completion_id, "object": "chat.completion", - "created": created, "model": AGENT_NAME, + "created": created, "model": agent_name(), "choices": [{ "index": 
0, "message": {"role": "assistant", "content": content}, @@ -420,14 +1089,14 @@ async def _stream(): async def _openai_models(): return { "object": "list", - "data": [{"id": AGENT_NAME, "object": "model", "created": 1774600000, "owned_by": "protolabs"}], + "data": [{"id": agent_name(), "object": "model", "created": 1774600000, "owned_by": "protolabs"}], } # --- A2A agent card ----------------------------------------------------- @fastapi_app.get("/.well-known/agent.json", include_in_schema=False) @fastapi_app.get("/.well-known/agent-card.json", include_in_schema=False) async def _a2a_agent_card(request: Request): - host = request.headers.get("host", f"{AGENT_NAME}:7870") + host = request.headers.get("host", f"{agent_name()}:7870") return JSONResponse( content=_build_agent_card(host), headers={"Cache-Control": "public, max-age=60"}, @@ -437,12 +1106,24 @@ async def _a2a_agent_card(request: Request): # JSON-RPC + REST, streaming, polling, cancel, push webhooks. from a2a_handler import register_a2a_routes - auth_env = f"{AGENT_NAME.upper()}_API_KEY" + # Two independent A2A auth surfaces: + # + # 1. **Bearer** (modern) — ``auth.token`` in YAML, captured by the + # wizard as "A2A bearer token". Passed via the ``auth_token`` + # argument, with ``A2A_AUTH_TOKEN`` env as fallback. Updates + # from a wizard/drawer-driven reload propagate live through + # ``a2a_handler.set_a2a_token`` — no restart needed. + # 2. **X-API-Key** (legacy) — ``_API_KEY`` env var, threaded + # through the ``api_key`` argument. Kept env-driven; forks that + # want it YAML-configurable can add a field later. 
+ yaml_bearer = _graph_config.auth_token if _graph_config else "" + auth_env = f"{AGENT_NAME_ENV.upper()}_API_KEY" register_a2a_routes( app=fastapi_app, chat_stream_fn_factory=_chat_langgraph_stream, chat_fn=chat, api_key=os.environ.get(auth_env, ""), + auth_token=yaml_bearer, agent_card={}, register_card_route=False, # card is already served above ) @@ -486,7 +1167,7 @@ async def _serve_sw() -> FileResponse: favicon_path=str(static_dir / "favicon.svg") if (static_dir / "favicon.svg").exists() else None, ) - log.info("Starting %s on http://0.0.0.0:%d", AGENT_NAME, args.port) + log.info("Starting %s on http://0.0.0.0:%d", agent_name(), args.port) uvicorn.run(app, host="0.0.0.0", port=args.port) diff --git a/tests/test_config_io.py b/tests/test_config_io.py new file mode 100644 index 0000000..946abfb --- /dev/null +++ b/tests/test_config_io.py @@ -0,0 +1,451 @@ +"""Tests for graph/config_io.py — the plumbing behind the live-edit drawer. + +Critical invariants: + +- YAML round-trip preserves unknown top-level sections (forks add + these; silently dropping them on save would be a footgun). +- ``apply_updates_to_yaml`` mutates only the keys you pass and leaves + siblings alone. +- ``validate_config_dict`` catches range / type errors before disk + writes. +- ``read_soul`` / ``write_soul`` handles the dual-location contract + (/sandbox/SOUL.md as runtime, config/SOUL.md as source). +- ``list_gateway_models`` returns a readable error message rather + than raising — the UI shows this string directly. +""" + +from __future__ import annotations + +from pathlib import Path +from unittest.mock import MagicMock, patch + +import httpx +import pytest + + +# ── YAML round-trip ────────────────────────────────────────────────────────── + + +def test_yaml_round_trip_preserves_unknown_keys(tmp_path: Path) -> None: + """Forks add custom top-level sections (the shipped YAML already + has ``memory`` and ``skills`` that the dataclass doesn't model). 
+ Round-tripping through load_yaml_doc + save_yaml_doc must leave + them intact.""" + from graph import config_io + + yaml_path = tmp_path / "langgraph-config.yaml" + yaml_path.write_text( + "model:\n" + " name: test-model\n" + " temperature: 0.5\n" + "memory:\n" + " path: /custom/memory\n" + " max_sessions: 42\n" + "custom_section:\n" + " arbitrary_key: arbitrary_value\n" + ) + + doc = config_io.load_yaml_doc(yaml_path) + config_io.save_yaml_doc(doc, yaml_path) + + reloaded = config_io.load_yaml_doc(yaml_path) + assert reloaded["memory"]["path"] == "/custom/memory" + assert reloaded["memory"]["max_sessions"] == 42 + assert reloaded["custom_section"]["arbitrary_key"] == "arbitrary_value" + + +def test_apply_updates_merges_shallowly(tmp_path: Path) -> None: + """Updating model.temperature must NOT clobber model.name or + other model.* fields.""" + from graph import config_io + + yaml_path = tmp_path / "c.yaml" + yaml_path.write_text( + "model:\n" + " name: original-model\n" + " temperature: 0.1\n" + " api_base: http://original\n" + ) + + doc = config_io.load_yaml_doc(yaml_path) + config_io.apply_updates_to_yaml(doc, {"model": {"temperature": 0.9}}) + config_io.save_yaml_doc(doc, yaml_path) + + reloaded = config_io.load_yaml_doc(yaml_path) + assert reloaded["model"]["name"] == "original-model" + assert reloaded["model"]["api_base"] == "http://original" + assert reloaded["model"]["temperature"] == 0.9 + + +def test_apply_updates_adds_missing_sections(tmp_path: Path) -> None: + from graph import config_io + + yaml_path = tmp_path / "c.yaml" + yaml_path.write_text("model:\n name: x\n") + doc = config_io.load_yaml_doc(yaml_path) + + config_io.apply_updates_to_yaml( + doc, + {"middleware": {"audit": True, "memory": False}}, + ) + + assert doc["middleware"]["audit"] is True + assert doc["middleware"]["memory"] is False + assert doc["model"]["name"] == "x" + + +def test_apply_updates_nested_worker(tmp_path: Path) -> None: + """subagents.worker.tools is a list, 
subagents.worker.enabled + is a bool — both must land in the right nested slot.""" + from graph import config_io + + yaml_path = tmp_path / "c.yaml" + yaml_path.write_text("subagents:\n worker:\n enabled: false\n") + doc = config_io.load_yaml_doc(yaml_path) + + config_io.apply_updates_to_yaml( + doc, + {"subagents": {"worker": {"enabled": True, "tools": ["current_time", "calculator"]}}}, + ) + + assert doc["subagents"]["worker"]["enabled"] is True + assert list(doc["subagents"]["worker"]["tools"]) == ["current_time", "calculator"] + + +# ── config_to_dict ─────────────────────────────────────────────────────────── + + +def test_config_to_dict_mirrors_yaml_shape() -> None: + """The UI works with the dict shape; the YAML schema uses the + same paths. Keep them in lockstep so round-tripping through + apply_updates_to_yaml works without path rewrites.""" + from graph.config import LangGraphConfig + from graph.config_io import config_to_dict + + cfg = LangGraphConfig() + d = config_to_dict(cfg) + + # Top-level schema surface — all the sections the YAML exposes. + # Adding a new section here without updating config_to_dict would + # strand fork-added fields outside the drawer's round-trip. 
+ assert set(d.keys()) == { + "model", "subagents", "middleware", "knowledge", + "identity", "auth", "runtime", + } + assert d["model"]["name"] == cfg.model_name + assert d["model"]["temperature"] == cfg.temperature + assert d["subagents"]["worker"]["tools"] == list(cfg.worker.tools) + assert d["middleware"]["audit"] == cfg.audit_middleware + assert d["knowledge"]["top_k"] == cfg.knowledge_top_k + assert d["identity"]["name"] == cfg.identity_name + assert d["auth"]["token"] == cfg.auth_token + assert d["runtime"]["autostart_on_boot"] == cfg.autostart_on_boot + + +# ── validate_config_dict ───────────────────────────────────────────────────── + + +@pytest.mark.parametrize("bad_value,expected_error_fragment", [ + ({"model": {"temperature": 3.0}}, "temperature"), + ({"model": {"temperature": -0.1}}, "temperature"), + ({"model": {"max_tokens": 0}}, "max_tokens"), + ({"model": {"max_iterations": 0}}, "max_iterations"), + ({"subagents": {"worker": {"max_turns": 0}}}, "max_turns"), + ({"subagents": {"worker": {"tools": "not-a-list"}}}, "list"), + ({"knowledge": {"top_k": 0}}, "top_k"), +]) +def test_validate_rejects_bad_values(bad_value, expected_error_fragment): + from graph.config_io import validate_config_dict + ok, err = validate_config_dict(bad_value) + assert not ok + assert expected_error_fragment in err + + +def test_validate_accepts_happy_path(): + from graph.config_io import config_to_dict, validate_config_dict + from graph.config import LangGraphConfig + + ok, err = validate_config_dict(config_to_dict(LangGraphConfig())) + assert ok, err + + +# ── SOUL.md dual-path ──────────────────────────────────────────────────────── + + +def test_read_soul_falls_back_to_source(monkeypatch, tmp_path: Path) -> None: + """When /sandbox/SOUL.md doesn't exist (local dev), fall through + to the repo config dir so drawer edits are still visible.""" + from graph import config_io + + # Point the runtime path at an unreachable location so the source + # fallback is exercised. 
+ fake_runtime = tmp_path / "nonexistent" / "SOUL.md" + fake_source = tmp_path / "SOUL-source.md" + fake_source.write_text("from source", encoding="utf-8") + + monkeypatch.setattr(config_io, "SOUL_RUNTIME_PATH", fake_runtime) + monkeypatch.setattr(config_io, "SOUL_SOURCE_PATH", fake_source) + + assert config_io.read_soul() == "from source" + + +def test_read_soul_prefers_runtime(monkeypatch, tmp_path: Path) -> None: + from graph import config_io + + runtime = tmp_path / "runtime" / "SOUL.md" + runtime.parent.mkdir() + runtime.write_text("runtime wins", encoding="utf-8") + source = tmp_path / "SOUL-source.md" + source.write_text("source loses", encoding="utf-8") + + monkeypatch.setattr(config_io, "SOUL_RUNTIME_PATH", runtime) + monkeypatch.setattr(config_io, "SOUL_SOURCE_PATH", source) + + assert config_io.read_soul() == "runtime wins" + + +def test_write_soul_writes_source_always(monkeypatch, tmp_path: Path) -> None: + """The source-of-truth write (config/SOUL.md) must always succeed; + the runtime write is best-effort (skipped when /sandbox missing).""" + from graph import config_io + + # Runtime points at a path whose parent doesn't exist — should skip + # gracefully. 
+ runtime = tmp_path / "no-sandbox-here" / "SOUL.md" + source = tmp_path / "src" / "SOUL.md" + + monkeypatch.setattr(config_io, "SOUL_RUNTIME_PATH", runtime) + monkeypatch.setattr(config_io, "SOUL_SOURCE_PATH", source) + + written = config_io.write_soul("hello world") + assert source in written + assert runtime not in written + assert source.read_text() == "hello world" + + +def test_write_soul_writes_both_when_runtime_parent_exists( + monkeypatch, tmp_path: Path, +) -> None: + from graph import config_io + + runtime_dir = tmp_path / "sandbox" + runtime_dir.mkdir() + runtime = runtime_dir / "SOUL.md" + source = tmp_path / "src" / "SOUL.md" + + monkeypatch.setattr(config_io, "SOUL_RUNTIME_PATH", runtime) + monkeypatch.setattr(config_io, "SOUL_SOURCE_PATH", source) + + written = config_io.write_soul("dual write") + assert runtime in written + assert source in written + assert runtime.read_text() == "dual write" + assert source.read_text() == "dual write" + + +# ── Gateway model listing ──────────────────────────────────────────────────── + + +def test_list_gateway_models_success(monkeypatch): + from graph import config_io + + fake_response = MagicMock() + fake_response.status_code = 200 + fake_response.json.return_value = { + "data": [ + {"id": "model-b"}, + {"id": "model-a"}, + {"id": "model-c"}, + ], + } + + fake_client = MagicMock() + fake_client.__enter__ = lambda self: fake_client + fake_client.__exit__ = lambda *args: None + fake_client.get.return_value = fake_response + + monkeypatch.setattr("httpx.Client", lambda **kw: fake_client) + + models, err = config_io.list_gateway_models("http://gateway:4000/v1", "test-key") + assert err == "" + assert models == ["model-a", "model-b", "model-c"] # sorted + called_url = fake_client.get.call_args[0][0] + assert called_url == "http://gateway:4000/v1/models" + + +def test_list_gateway_models_empty_base_returns_error(): + from graph.config_io import list_gateway_models + + models, err = list_gateway_models("", "key") + 
assert models == [] + assert "api_base" in err + + +def test_list_gateway_models_http_error(monkeypatch): + from graph import config_io + + fake_client = MagicMock() + fake_client.__enter__ = lambda self: fake_client + fake_client.__exit__ = lambda *args: None + fake_client.get.side_effect = httpx.ConnectError("no route to host") + + monkeypatch.setattr("httpx.Client", lambda **kw: fake_client) + + models, err = config_io.list_gateway_models("http://bad-host/v1") + assert models == [] + assert "connection failed" in err + + +def test_list_gateway_models_bad_status(monkeypatch): + from graph import config_io + + fake_response = MagicMock() + fake_response.status_code = 401 + fake_response.text = "unauthorized" + + fake_client = MagicMock() + fake_client.__enter__ = lambda self: fake_client + fake_client.__exit__ = lambda *args: None + fake_client.get.return_value = fake_response + + monkeypatch.setattr("httpx.Client", lambda **kw: fake_client) + + models, err = config_io.list_gateway_models("http://x/v1", "bad-key") + assert models == [] + assert "401" in err + + +# ── list_available_tools ───────────────────────────────────────────────────── + + +def test_list_available_tools_returns_starter_set(): + from graph.config_io import list_available_tools + + names = list_available_tools() + # Lock in the template's starter set — forks replace these but + # the drawer's CheckboxGroup populates from this call, so the + # contract is "return tool names in a stable list". + assert "current_time" in names + assert "calculator" in names + assert "web_search" in names + assert "fetch_url" in names + # Memory + scheduler tools appear in the wizard checklist even + # when no store / scheduler has been constructed yet — otherwise + # the user couldn't enable them on a fresh boot. 
+ assert "memory_ingest" in names + assert "schedule_task" in names + assert "list_schedules" in names + assert "cancel_schedule" in names + assert all(isinstance(n, str) for n in names) + # No duplicates — list_available_tools dedupes between the + # backend-bound tools and the static name lists. + assert len(names) == len(set(names)) + + +# ── Setup wizard marker ───────────────────────────────────────────────────── + + +def test_setup_marker_lifecycle(monkeypatch, tmp_path): + """Marker presence = wizard skipped. Mark → present. Reset → gone. + Reset on a missing marker is a no-op, not an error.""" + from graph import config_io + + marker = tmp_path / ".setup-complete" + monkeypatch.setattr(config_io, "SETUP_MARKER_PATH", marker) + + assert config_io.is_setup_complete() is False + + config_io.mark_setup_complete() + assert config_io.is_setup_complete() is True + assert marker.exists() + + config_io.mark_setup_complete() # idempotent + assert config_io.is_setup_complete() is True + + config_io.reset_setup() + assert config_io.is_setup_complete() is False + + config_io.reset_setup() # no-op on missing marker — doesn't raise + + +def test_mark_setup_complete_creates_parent_dir(monkeypatch, tmp_path): + """If config/ doesn't exist yet, mark_setup_complete must create + it — otherwise a fresh clone with a pristine filesystem fails + on first wizard run.""" + from graph import config_io + + marker = tmp_path / "fresh" / "config" / ".setup-complete" + monkeypatch.setattr(config_io, "SETUP_MARKER_PATH", marker) + + config_io.mark_setup_complete() + assert marker.exists() + + +# ── SOUL.md presets ───────────────────────────────────────────────────────── + + +def test_list_soul_presets_returns_shipped_starters(): + """The template must ship four starter presets so the wizard's + dropdown is useful on day one. 
Add a file to config/soul-presets/ + and it should appear here automatically — no registry.""" + from graph.config_io import list_soul_presets + + presets = list_soul_presets() + assert "generic-assistant" in presets + assert "research" in presets + assert "coding" in presets + assert "blank" in presets + + +def test_list_soul_presets_sorted(): + from graph.config_io import list_soul_presets + + presets = list_soul_presets() + assert presets == sorted(presets) + + +def test_read_soul_preset_returns_content(): + from graph.config_io import read_soul_preset + + content = read_soul_preset("research") + assert "research" in content.lower() + assert content.strip().startswith("#") # markdown h1 + + +def test_read_soul_preset_unknown_returns_empty(): + """Unknown preset names must return '' not raise — the wizard + treats empty as 'user didn't pick a preset, keep textarea as-is'.""" + from graph.config_io import read_soul_preset + + assert read_soul_preset("not-a-real-preset") == "" + assert read_soul_preset("") == "" + + +@pytest.mark.parametrize("malicious", [ + "../secret", + "../../etc/passwd", + "../../../etc/passwd", + "subdir/../../../outside", + "/etc/hosts", + "..", + "../../graph/config", # try to read a real repo file via ../../ +]) +def test_read_soul_preset_rejects_path_traversal(malicious): + """CRITICAL: the preset name must not let a caller escape + ``config/soul-presets/``. 
Every ``..`` or absolute path + should return empty string, not read an arbitrary .md file + elsewhere on disk.""" + from graph.config_io import read_soul_preset + + assert read_soul_preset(malicious) == "" + + +def test_list_soul_presets_missing_dir_returns_empty(monkeypatch, tmp_path): + """If a fork accidentally deletes the presets dir, the wizard + should render an empty dropdown, not crash.""" + from graph import config_io + + fake = tmp_path / "does-not-exist" + monkeypatch.setattr(config_io, "PRESETS_DIR", fake) + + assert config_io.list_soul_presets() == [] diff --git a/tests/test_scheduler_local.py b/tests/test_scheduler_local.py new file mode 100644 index 0000000..0e65ca1 --- /dev/null +++ b/tests/test_scheduler_local.py @@ -0,0 +1,370 @@ +"""Tests for ``scheduler.local.LocalScheduler``. + +The polling-loop firing path is covered by stubbing ``httpx.AsyncClient`` +so a unit test doesn't need a running A2A endpoint. Multi-agent +isolation, missed-fire recovery, and reschedule-vs-delete behaviour +all get explicit cases — they're the parts most likely to regress. 
+""" + +from __future__ import annotations + +import asyncio +import sqlite3 +from datetime import UTC, datetime, timedelta +from pathlib import Path + +import pytest + +from scheduler.interface import is_cron, parse_iso_to_utc +from scheduler.local import LocalScheduler, _compute_next_fire + + +# ── helpers ───────────────────────────────────────────────────────────────── + + +def _make_scheduler(tmp_path: Path, agent: str = "gina-test") -> LocalScheduler: + return LocalScheduler( + agent_name=agent, + invoke_url="http://127.0.0.1:7870", + api_key="k", + bearer_token="b", + db_dir=tmp_path, + ) + + +# ── interface helpers ────────────────────────────────────────────────────── + + +class TestIsCron: + def test_cron_5_field(self): + assert is_cron("0 9 * * *") is True + + def test_cron_with_ranges(self): + assert is_cron("0 9 * * 1-5") is True + + def test_iso_with_t(self): + assert is_cron("2026-04-28T15:00:00") is False + + def test_iso_with_space(self): + assert is_cron("2026-04-28 15:00:00") is False + + def test_iso_with_offset(self): + assert is_cron("2026-04-28T15:00:00+00:00") is False + + def test_garbage(self): + assert is_cron("not a schedule") is False + assert is_cron("0 9 *") is False # 3 fields, not 5 + + def test_seven_fields_rejected(self): + # 7-field cron (with seconds + year) is not standard 5-field; + # the current detector accepts only exactly 5. 
+ assert is_cron("0 0 12 * * MON 2026") is False + + +class TestParseIso: + def test_naive_treated_as_utc(self): + dt = parse_iso_to_utc("2026-04-28T15:00:00") + assert dt.tzinfo == UTC + assert dt.hour == 15 + + def test_offset_normalized(self): + dt = parse_iso_to_utc("2026-04-28T15:00:00-05:00") + assert dt.tzinfo == UTC + assert dt.hour == 20 # 15 EST → 20 UTC + + def test_malformed_raises(self): + with pytest.raises(ValueError, match=r"Invalid isoformat|could not convert"): + parse_iso_to_utc("not an iso string") + + +# ── add / list / cancel ───────────────────────────────────────────────────── + + +class TestAddJob: + def test_cron_job(self, tmp_path): + s = _make_scheduler(tmp_path) + job = s.add_job("hi", "0 9 * * *") + assert job.agent_name == "gina-test" + assert job.prompt == "hi" + assert job.next_fire is not None + assert "T" in job.next_fire # ISO + + def test_iso_one_shot(self, tmp_path): + s = _make_scheduler(tmp_path) + future = "2099-01-01T00:00:00" + job = s.add_job("hi", future) + # Naive ISO should be normalized to UTC + assert job.next_fire.startswith("2099-01-01T00:00:00") + + def test_empty_prompt_rejected(self, tmp_path): + s = _make_scheduler(tmp_path) + with pytest.raises(ValueError, match=r"prompt is required"): + s.add_job(" ", "0 9 * * *") + + def test_malformed_schedule_rejected(self, tmp_path): + s = _make_scheduler(tmp_path) + with pytest.raises(ValueError, match=r"Invalid isoformat|could not convert"): + s.add_job("hi", "not-a-real-schedule") + + def test_user_id_preserved(self, tmp_path): + s = _make_scheduler(tmp_path) + job = s.add_job("hi", "0 9 * * *", job_id="my-custom-id") + assert job.id == "my-custom-id" + + def test_duplicate_id_rejected(self, tmp_path): + s = _make_scheduler(tmp_path) + s.add_job("hi", "0 9 * * *", job_id="dup") + with pytest.raises(ValueError, match="already exists"): + s.add_job("again", "0 9 * * *", job_id="dup") + + def test_auto_id_has_agent_prefix(self, tmp_path): + s = _make_scheduler(tmp_path, 
agent="ginavision") + job = s.add_job("hi", "0 9 * * *") + assert job.id.startswith("ginavision-") + + +class TestListAndCancel: + def test_list_filters_by_agent(self, tmp_path): + gp = _make_scheduler(tmp_path, agent="gina-personal") + gw = _make_scheduler(tmp_path, agent="gina-work") + gp.add_job("p1", "0 9 * * *") + gp.add_job("p2", "0 10 * * *") + gw.add_job("w1", "0 9 * * *") + assert len(gp.list_jobs()) == 2 + assert len(gw.list_jobs()) == 1 + assert gp.list_jobs()[0].agent_name == "gina-personal" + + def test_cancel_returns_true_on_hit(self, tmp_path): + s = _make_scheduler(tmp_path) + job = s.add_job("hi", "0 9 * * *") + assert s.cancel_job(job.id) is True + assert s.list_jobs() == [] + + def test_cancel_returns_false_on_miss(self, tmp_path): + s = _make_scheduler(tmp_path) + assert s.cancel_job("does-not-exist") is False + + def test_cross_agent_cancel_blocked(self, tmp_path): + gp = _make_scheduler(tmp_path, agent="gina-personal") + gw = _make_scheduler(tmp_path, agent="gina-work") + gw_job = gw.add_job("w1", "0 9 * * *") + # gp tries to cancel gw's job — must fail silently (no row deleted) + assert gp.cancel_job(gw_job.id) is False + assert len(gw.list_jobs()) == 1 + + +# ── reschedule / delete behaviour ─────────────────────────────────────────── + + +class TestRescheduleOrDelete: + def test_one_shot_deleted_after_fire(self, tmp_path): + s = _make_scheduler(tmp_path) + # ISO in the past so _claim_due_jobs picks it up + past = (datetime.now(UTC) - timedelta(seconds=5)).isoformat() + s.add_job("hi", past, job_id="oneshot") + job = s.list_jobs()[0] + s._reschedule_or_delete(job, fired_at=datetime.now(UTC)) + assert s.list_jobs() == [] + + def test_cron_rescheduled_after_fire(self, tmp_path): + s = _make_scheduler(tmp_path) + s.add_job("hi", "0 9 * * *", job_id="cron") + job = s.list_jobs()[0] + # Fire at a fixed timestamp — 2026-04-28T10:00:00Z is one hour + # past the 09:00 cron slot, so the next fire must be exactly + # 2026-04-29T09:00:00Z. 
+ fired_at = datetime(2026, 4, 28, 10, 0, 0, tzinfo=UTC) + s._reschedule_or_delete(job, fired_at=fired_at) + rescheduled = s.list_jobs()[0] + assert rescheduled.next_fire == "2026-04-29T09:00:00+00:00" + assert rescheduled.last_fire == fired_at.isoformat() + + +class TestMissedFireRecovery: + def test_stale_oneshot_dropped(self, tmp_path): + s = _make_scheduler(tmp_path) + # ISO from 2 days ago — outside the 24h window + stale = (datetime.now(UTC) - timedelta(days=2)).isoformat() + s.add_job("hi", stale, job_id="stale") + s._recover_missed_fires() + assert s.list_jobs() == [] + + def test_stale_cron_rolled_forward(self, tmp_path): + s = _make_scheduler(tmp_path) + s.add_job("hi", "0 9 * * *", job_id="cron-stale") + # Manually rewrite next_fire to 2 days ago (outside window) + db = sqlite3.connect(str(s.path)) + old = (datetime.now(UTC) - timedelta(days=2)).isoformat() + db.execute("UPDATE jobs SET next_fire = ? WHERE id = ?", (old, "cron-stale")) + db.commit() + db.close() + s._recover_missed_fires() + rolled = s.list_jobs()[0] + assert rolled.next_fire > datetime.now(UTC).isoformat() + + def test_recent_missed_fire_kept(self, tmp_path): + s = _make_scheduler(tmp_path) + # 5 minutes ago — inside the 24h window, should still fire + recent = (datetime.now(UTC) - timedelta(minutes=5)).isoformat() + s.add_job("hi", recent, job_id="recent") + s._recover_missed_fires() + # Job still exists with next_fire in the past — polling will fire it + jobs = s.list_jobs() + assert len(jobs) == 1 + assert jobs[0].next_fire < datetime.now(UTC).isoformat() + + +# ── compute_next_fire ─────────────────────────────────────────────────────── + + +class TestComputeNextFire: + def test_cron_returns_iso_utc(self): + result = _compute_next_fire("0 9 * * *") + # Parses cleanly as ISO + dt = datetime.fromisoformat(result) + assert dt.tzinfo is not None + + def test_cron_after_anchor(self): + anchor = datetime(2026, 4, 27, 8, 0, 0, tzinfo=UTC) + result = _compute_next_fire("0 9 * * *", 
after=anchor) + # 9am UTC on 2026-04-27 + dt = datetime.fromisoformat(result) + assert dt.year == 2026 and dt.month == 4 and dt.day == 27 and dt.hour == 9 + + def test_iso_passthrough(self): + result = _compute_next_fire("2026-12-25T00:00:00") + assert result.startswith("2026-12-25T00:00:00") + + +# ── start / stop loop ─────────────────────────────────────────────────────── + + +@pytest.mark.asyncio +async def test_start_stop_idempotent(tmp_path): + s = _make_scheduler(tmp_path) + await s.start() + await s.start() # second call is a no-op, not an error + assert s._task is not None + await s.stop() + await s.stop() # second call is a no-op, not an error + assert s._task is None + + +@pytest.mark.asyncio +async def test_due_job_fires(tmp_path, monkeypatch): + """End-to-end: an ISO job in the past gets picked up and POSTs to /a2a.""" + s = _make_scheduler(tmp_path) + # Schedule for 1 second ago so the first tick claims it + past = (datetime.now(UTC) - timedelta(seconds=1)).isoformat() + s.add_job("FIRED-ME", past, job_id="firetest") + + fired: list[dict] = [] + + class _FakeResponse: + status_code = 200 + text = "ok" + + class _FakeClient: + def __init__(self, *_a, **_kw): + pass + + async def __aenter__(self): + return self + + async def __aexit__(self, *_a): + return False + + async def post(self, url, headers=None, json=None): + fired.append({"url": url, "json": json}) + return _FakeResponse() + + import httpx + monkeypatch.setattr(httpx, "AsyncClient", _FakeClient) + + await s.start() + # Give the polling loop one tick (poll interval is 1s) + await asyncio.sleep(1.5) + await s.stop() + + assert any("FIRED-ME" in str(c["json"]) for c in fired) + # One-shot was deleted after firing + assert s.list_jobs() == [] + + +@pytest.mark.asyncio +async def test_fire_failure_leaves_job_in_place(tmp_path, monkeypatch): + """A 5xx HTTP response from /a2a must NOT delete the job. 
+ + Regression guard for the round-2 review finding: previously, + _tick() called _reschedule_or_delete in finally, which silently + consumed one-shot jobs on transient failures. Now the job stays + until delivery actually succeeds. + """ + s = _make_scheduler(tmp_path) + past = (datetime.now(UTC) - timedelta(seconds=1)).isoformat() + s.add_job("DURABLE", past, job_id="firetest") + + class _FakeResponse: + status_code = 503 + text = "service unavailable" + + class _FakeClient: + def __init__(self, *_a, **_kw): + pass + + async def __aenter__(self): + return self + + async def __aexit__(self, *_a): + return False + + async def post(self, url, headers=None, json=None): + return _FakeResponse() + + import httpx + monkeypatch.setattr(httpx, "AsyncClient", _FakeClient) + + await s.start() + await asyncio.sleep(1.5) # one polling tick + await s.stop() + + # Job survives the failed fire, will be retried on the next tick. + assert len(s.list_jobs()) == 1 + assert s.list_jobs()[0].id == "firetest" + + +@pytest.mark.asyncio +async def test_fire_returns_bool(tmp_path, monkeypatch): + """``_fire`` is the success/failure signal feeding the + reschedule decision in ``_tick``. 
Lock the contract.""" + s = _make_scheduler(tmp_path) + job = s.add_job("hi", "0 9 * * *", job_id="x") + + class _OkResponse: + status_code = 200 + text = "ok" + + class _ErrResponse: + status_code = 500 + text = "boom" + + class _FakeClient: + def __init__(self, response): + self._response = response + + async def __aenter__(self): + return self + + async def __aexit__(self, *_a): + return False + + async def post(self, *_a, **_kw): + return self._response + + import httpx + + monkeypatch.setattr(httpx, "AsyncClient", lambda **kw: _FakeClient(_OkResponse())) + assert await s._fire(job) is True + + monkeypatch.setattr(httpx, "AsyncClient", lambda **kw: _FakeClient(_ErrResponse())) + assert await s._fire(job) is False diff --git a/tests/test_scheduler_workstacean.py b/tests/test_scheduler_workstacean.py new file mode 100644 index 0000000..74fb485 --- /dev/null +++ b/tests/test_scheduler_workstacean.py @@ -0,0 +1,168 @@ +"""Tests for ``scheduler.workstacean.WorkstaceanScheduler``. + +We don't run a Workstacean instance — instead we monkeypatch +``httpx.post`` and assert that the adapter sends the right +``POST /publish`` body shape (action, namespaced id, namespaced topic, +auth header). 
+""" + +from __future__ import annotations + +from typing import Any + +import pytest + +from scheduler.workstacean import WorkstaceanScheduler + + +class _FakeResponse: + def __init__(self, status: int = 200, body: str = "ok"): + self.status_code = status + self.text = body + + +class _Recorder: + def __init__(self): + self.calls: list[dict[str, Any]] = [] + self.response = _FakeResponse() + + def __call__(self, url, headers=None, json=None, timeout=None): + self.calls.append({"url": url, "headers": headers, "json": json}) + return self.response + + +@pytest.fixture +def recorder(monkeypatch): + rec = _Recorder() + import httpx + monkeypatch.setattr(httpx, "post", rec) + return rec + + +@pytest.fixture +def adapter(): + return WorkstaceanScheduler( + agent_name="gina-personal", + base_url="http://workstacean:3000", + api_key="test-key", + ) + + +# ── construction guards ──────────────────────────────────────────────────── + + +def test_missing_base_url_rejected(): + with pytest.raises(ValueError, match="base_url"): + WorkstaceanScheduler(agent_name="x", base_url="", api_key="k") + + +def test_missing_api_key_rejected(): + with pytest.raises(ValueError, match="api_key"): + WorkstaceanScheduler(agent_name="x", base_url="http://w:3000", api_key="") + + +# ── add_job ──────────────────────────────────────────────────────────────── + + +class TestAddJob: + def test_publishes_command_schedule(self, adapter, recorder): + adapter.add_job("hi", "0 9 * * *", job_id="daily") + assert len(recorder.calls) == 1 + body = recorder.calls[0]["json"] + assert body["topic"] == "command.schedule" + assert body["payload"]["action"] == "add" + + def test_id_namespaced_with_agent(self, adapter, recorder): + adapter.add_job("hi", "0 9 * * *", job_id="daily") + body = recorder.calls[0]["json"] + assert body["payload"]["id"] == "gina-personal-daily" + + def test_id_idempotent_when_already_prefixed(self, adapter, recorder): + # If the caller passes an already-prefixed id, the adapter + # 
shouldn't double-prefix it. + adapter.add_job("hi", "0 9 * * *", job_id="gina-personal-already-set") + body = recorder.calls[0]["json"] + assert body["payload"]["id"] == "gina-personal-already-set" + + def test_topic_namespaced_with_agent(self, adapter, recorder): + adapter.add_job("hi", "0 9 * * *", job_id="daily") + body = recorder.calls[0]["json"] + assert body["payload"]["topic"].startswith("cron.gina-personal.") + + def test_inner_payload_carries_prompt(self, adapter, recorder): + adapter.add_job("the actual prompt", "0 9 * * *", job_id="x") + inner = recorder.calls[0]["json"]["payload"]["payload"] + assert inner["content"] == "the actual prompt" + assert inner["channel"] == "a2a" + assert inner["agent_name"] == "gina-personal" + + def test_iso_oneshot_accepted(self, adapter, recorder): + adapter.add_job("hi", "2099-01-01T00:00:00", job_id="x") + assert len(recorder.calls) == 1 + + def test_malformed_schedule_rejected(self, adapter): + with pytest.raises(ValueError, match="Invalid isoformat|could not convert"): + adapter.add_job("hi", "not-a-schedule", job_id="x") + + def test_empty_prompt_rejected(self, adapter): + with pytest.raises(ValueError, match="prompt"): + adapter.add_job(" ", "0 9 * * *", job_id="x") + + def test_auth_header_sent(self, adapter, recorder): + adapter.add_job("hi", "0 9 * * *", job_id="x") + assert recorder.calls[0]["headers"]["X-API-Key"] == "test-key" + + +# ── cancel_job ───────────────────────────────────────────────────────────── + + +class TestCancelJob: + def test_publishes_remove(self, adapter, recorder): + adapter.cancel_job("daily") + body = recorder.calls[0]["json"] + assert body["payload"]["action"] == "remove" + assert body["payload"]["id"] == "gina-personal-daily" + + def test_returns_true_on_success(self, adapter, recorder): + assert adapter.cancel_job("daily") is True + + def test_returns_false_on_http_error(self, adapter, recorder): + recorder.response = _FakeResponse(status=500, body="boom") + assert 
adapter.cancel_job("daily") is False + + +# ── topic prefix override ────────────────────────────────────────────────── + + +def test_custom_topic_prefix(monkeypatch): + rec = _Recorder() + import httpx + monkeypatch.setattr(httpx, "post", rec) + adapter = WorkstaceanScheduler( + agent_name="gina-personal", + base_url="http://w:3000", + api_key="k", + topic_prefix="myorg.bus.gina", + ) + adapter.add_job("hi", "0 9 * * *", job_id="x") + body = rec.calls[0]["json"] + assert body["payload"]["topic"].startswith("myorg.bus.gina.") + + +# ── list_jobs is intentionally empty ─────────────────────────────────────── + + +def test_list_jobs_returns_empty(adapter): + """Workstacean's ``list`` action publishes async to a topic; + the adapter doesn't subscribe, so list_jobs returns [].""" + assert adapter.list_jobs() == [] + + +# ── start/stop are no-ops ────────────────────────────────────────────────── + + +@pytest.mark.asyncio +async def test_start_stop_no_op(adapter): + # Should not raise + await adapter.start() + await adapter.stop() diff --git a/tests/test_skill_curator.py b/tests/test_skill_curator.py index cb7bf43..3d8e211 100644 --- a/tests/test_skill_curator.py +++ b/tests/test_skill_curator.py @@ -52,7 +52,7 @@ def _make_skill( "name": name, "description": description, "prompt_template": f"Run the {name} workflow.", - "tools_used": ["echo"], + "tools_used": ["current_time"], "confidence": confidence, "created_at": _utc_iso(days_ago), } diff --git a/tests/test_skill_emission.py b/tests/test_skill_emission.py index 34b8f1c..6d9555b 100644 --- a/tests/test_skill_emission.py +++ b/tests/test_skill_emission.py @@ -88,14 +88,14 @@ def test_skill_datapart_serialization() -> None: name="dp-test", description="DataPart test", prompt_template="prompt", - tools_used=["echo"], + tools_used=["current_time"], source_session_id="s1", ) part = artifact.to_datapart() assert part["kind"] == "data" assert part["metadata"]["mimeType"] == SKILL_V1_MIME assert part["data"]["name"] == 
"dp-test" - assert part["data"]["tools_used"] == ["echo"] + assert part["data"]["tools_used"] == ["current_time"] # created_at must be present and parseable datetime.fromisoformat(part["data"]["created_at"]) @@ -125,7 +125,7 @@ def test_skill_artifact_validation_tools_not_list() -> None: with pytest.raises(TypeError, match="tools_used"): SkillV1Artifact( name="x", description="d", prompt_template="p", - tools_used="echo", # type: ignore[arg-type] + tools_used="current_time", # type: ignore[arg-type] ) @@ -250,7 +250,7 @@ def _run_emit_logic( def test_skill_emitted_when_emit_skill_true() -> None: """Skill artifact is emitted when emit_skill=True and subagent succeeds.""" msgs = [ - _make_ai_message_with_tool_calls(["echo"]), + _make_ai_message_with_tool_calls(["current_time"]), _make_ai_message_with_content("done"), ] _run_emit_logic( @@ -264,7 +264,7 @@ def test_skill_emitted_when_emit_skill_true() -> None: assert len(skills) == 1 skill = skills[0] assert skill.name == "my-task" - assert skill.tools_used == ["echo"] + assert skill.tools_used == ["current_time"] assert skill.prompt_template == "do the thing" assert "Captured workflow" in skill.description @@ -272,7 +272,7 @@ def test_skill_emitted_when_emit_skill_true() -> None: def test_no_emission_on_opt_out() -> None: """No skill artifact is emitted when emit_skill=False.""" msgs = [ - _make_ai_message_with_tool_calls(["echo"]), + _make_ai_message_with_tool_calls(["current_time"]), _make_ai_message_with_content("done"), ] _run_emit_logic( @@ -307,7 +307,7 @@ def test_no_emission_on_failure() -> None: def test_no_emission_when_config_disallows() -> None: """No skill artifact is emitted when allow_skill_emission=False.""" msgs = [ - _make_ai_message_with_tool_calls(["echo"]), + _make_ai_message_with_tool_calls(["current_time"]), _make_ai_message_with_content("done"), ] _run_emit_logic( @@ -323,8 +323,8 @@ def test_no_emission_when_config_disallows() -> None: def test_tool_tracking_metadata_captured() -> None: 
"""tools_used in the artifact lists all tools invoked, deduplicated.""" msgs = [ - _make_ai_message_with_tool_calls(["echo", "calculator"]), - _make_ai_message_with_tool_calls(["echo"]), # duplicate — should appear once + _make_ai_message_with_tool_calls(["current_time", "calculator"]), + _make_ai_message_with_tool_calls(["current_time"]), # duplicate — should appear once _make_ai_message_with_content("result"), ] _run_emit_logic( @@ -336,7 +336,7 @@ def test_tool_tracking_metadata_captured() -> None: ) skills = get_pending_skills() assert len(skills) == 1 - assert skills[0].tools_used.count("echo") == 1 + assert skills[0].tools_used.count("current_time") == 1 assert "calculator" in skills[0].tools_used diff --git a/tests/test_starter_tools.py b/tests/test_starter_tools.py index fe4495b..f469365 100644 --- a/tests/test_starter_tools.py +++ b/tests/test_starter_tools.py @@ -114,13 +114,3 @@ async def test_fetch_url_rejects_non_http_scheme(): ): result = await fetch_url.ainvoke({"url": bad}) assert result.startswith("Error:"), f"accepted unsafe url: {bad!r}" - - -# ── echo — sanity ──────────────────────────────────────────────────────────── - - -@pytest.mark.asyncio -async def test_echo_sanity(): - from tools.lg_tools import echo - result = await echo.ainvoke({"message": "hello"}) - assert result == "echo: hello" diff --git a/tools/lg_tools.py b/tools/lg_tools.py index d8ce0f5..d42effb 100644 --- a/tools/lg_tools.py +++ b/tools/lg_tools.py @@ -7,12 +7,20 @@ The template ships with a small starter set of free, keyless tools so a fresh clone can demonstrate real agent behaviour out of the box: -- ``echo`` — sanity check - ``current_time`` — wall-clock time in any IANA timezone - ``calculator`` — safe numeric expression evaluation - ``web_search`` — DuckDuckGo text search (via ``ddgs``, no API key) - ``fetch_url`` — fetch a URL and return cleaned text +Plus memory tools that bind to a ``KnowledgeStore`` (constructed in +``server.py`` and threaded through 
``get_all_tools(knowledge_store)``): + +- ``memory_ingest`` — store a fact / preference / note +- ``memory_recall`` — search the store for relevant chunks +- ``memory_list`` — list recent chunks (optionally per domain) +- ``memory_stats`` — per-domain counts +- ``daily_log`` — convenience: write a daily-log chunk + Replace or extend this file with your agent's real tools and update ``get_all_tools()`` to return the full list. @@ -32,6 +40,7 @@ from __future__ import annotations import ast +import asyncio import operator as _op from datetime import datetime from zoneinfo import ZoneInfo, ZoneInfoNotFoundError @@ -39,20 +48,6 @@ from langchain_core.tools import tool -# ── echo ───────────────────────────────────────────────────────────────────── - - -@tool -async def echo(message: str) -> str: - """Echo the input back with a prefix. Template-only sanity tool. - - Useful to verify the tool loop is wired end-to-end before real - tools are in place. Safe to delete once your fork has its own - tools. - """ - return f"echo: {message}" - - # ── current_time ───────────────────────────────────────────────────────────── @@ -273,16 +268,251 @@ def _extract_text_from_html(content: bytes) -> str: return "\n".join(lines) +# ── memory tools ───────────────────────────────────────────────────────────── +# +# Each memory tool is built by a factory that closes over the +# ``KnowledgeStore`` instance. Doing it this way (rather than module- +# level globals) keeps tests isolated — they pass a temp store and get +# a fresh tool list bound to it. Production constructs one store in +# ``server.py`` and reuses the bound tools for the lifetime of the +# process. + + +_MEMORY_RECALL_MAX_K = 20 +_MEMORY_LIST_MAX_LIMIT = 200 + +# Stable list of scheduler tool names. Exposed as a module-level +# constant so ``graph/config_io.py::list_available_tools`` can show +# the wizard the right surface even when the runtime hasn't yet +# constructed a scheduler instance (e.g. 
fresh boot before setup is +# complete). Keep in sync with ``_build_scheduler_tools``. +SCHEDULER_TOOL_NAMES: tuple[str, ...] = ( + "schedule_task", "list_schedules", "cancel_schedule", +) +MEMORY_TOOL_NAMES: tuple[str, ...] = ( + "memory_ingest", "memory_recall", "memory_list", "memory_stats", "daily_log", +) + + +def _build_memory_tools(knowledge_store) -> list: + """Bind memory tools to a ``KnowledgeStore``. Returns a list.""" + from datetime import datetime, timezone + + @tool + async def memory_ingest( + content: str, + domain: str = "general", + heading: str | None = None, + ) -> str: + """Store a fact, preference, or note in long-term memory. + + Use this for things the operator wants you to remember across + sessions — preferences ("I take my coffee black"), facts about + the operator's environment, decisions worth recalling later. + + Args: + content: The text to remember. Be specific and self-contained; + the chunk is retrieved by keyword search. + domain: Logical bucket — ``"preferences"``, ``"context"``, + ``"general"``. Defaults to ``"general"``. + heading: Optional short label (e.g. ``"coffee"``) used as a + stable de-dupe key by the eval suite and curator. + + Returns ``"Stored chunk N in 'domain'."`` on success. + """ + chunk_id = knowledge_store.add_chunk(content, domain=domain, heading=heading) + if chunk_id is None: + return "Error: failed to store chunk (knowledge store unavailable)." + return f"Stored chunk {chunk_id} in {domain!r}." + + @tool + async def memory_recall(query: str, k: int = 5) -> str: + """Search long-term memory for chunks relevant to ``query``. + + Returns the top-k matches, one per line. Pull this when the + operator asks something where stored context is more reliable + than the model's own training data ("what's my coffee order?", + "remind me what we decided about the auth migration"). + + Returns ``"No matches."`` when the store is empty or nothing + scores above the keyword threshold. 
+ """ + clamped_k = max(1, min(int(k), _MEMORY_RECALL_MAX_K)) + results = knowledge_store.search(query, k=clamped_k) + if not results: + return "No matches." + lines = [f"[{r.get('domain', '?')}] {r['preview']}" for r in results] + return "\n".join(lines) + + @tool + async def memory_list(domain: str | None = None, limit: int = 10) -> str: + """List the most recent chunks. Filter by domain when given. + + Useful when the operator asks for recent activity ("what did I + log today?") or wants to inspect what the agent has stored. + """ + clamped_limit = max(1, min(int(limit), _MEMORY_LIST_MAX_LIMIT)) + chunks = knowledge_store.list_chunks(domain=domain, limit=clamped_limit) + if not chunks: + return f"No chunks in {domain or 'any domain'}." + lines = [] + for c in chunks: + head = f"[{c.domain}]" + if c.heading: + head += f" {c.heading}:" + preview = (c.content or "")[:200] + lines.append(f"{c.created_at} {head} {preview}") + return "\n".join(lines) + + @tool + async def memory_stats() -> str: + """Return chunk counts per domain. Useful for sanity checks.""" + s = knowledge_store.stats() + if s.get("total", 0) == 0: + return "Knowledge store is empty." + lines = [f"Total: {s['total']}"] + for k, v in s.items(): + if k == "total": + continue + lines.append(f" {k}: {v}") + return "\n".join(lines) + + @tool + async def daily_log(content: str) -> str: + """Append a daily-log entry for today. + + Stored under ``domain='daily-log'`` with today's UTC date as + the heading, so the same day's entries cluster together for + ``memory_list(domain='daily-log')`` queries. + """ + today = datetime.now(timezone.utc).date().isoformat() + chunk_id = knowledge_store.add_chunk( + content, domain="daily-log", heading=today, + ) + if chunk_id is None: + return "Error: failed to write daily log entry." 
+ return f"Logged ({today}): {content[:120]}" + + return [memory_ingest, memory_recall, memory_list, memory_stats, daily_log] + + +# ── scheduler tools ────────────────────────────────────────────────────────── +# +# Three tools that bind to either the local sqlite-backed scheduler or +# the Workstacean adapter — the agent loop sees one stable surface and +# never has to know which backend is wired up. +# +# Multi-agent safety: the underlying backend is constructed in +# ``server.py`` with the active ``AGENT_NAME`` baked in. add_job / +# list_jobs / cancel_job all filter by that name so two protoAgent +# instances on the same machine (or sharing one Workstacean install) +# never see each other's jobs. + + +def _build_scheduler_tools(scheduler) -> list: + """Bind scheduler tools to a ``SchedulerBackend``. Returns a list.""" + + @tool + async def schedule_task( + prompt: str, + when: str, + job_id: str | None = None, + ) -> str: + """Schedule a future task. The agent receives ``prompt`` as a + new turn when the schedule fires. + + Use this for anything the operator wants done later: reminders + ("remind me to follow up on the auth migration tomorrow at + 9am"), recurring sweeps ("every Monday morning, summarize last + week's logs"), one-off check-ins ("at 3pm today, ask whether + the deploy is healthy"). + + Args: + prompt: The text the agent should receive when the schedule + fires. Be self-contained — the agent has no memory of + this scheduling moment when the task fires. + when: Either a 5-field cron expression (``"0 9 * * 1-5"`` + = every weekday at 9am) or an ISO-8601 datetime + (``"2026-05-01T15:00:00"`` = once at 3pm UTC on May 1). + Compute exact times using ``current_time`` — the agent + cannot infer "now" from training data. + job_id: Optional human-readable id for the job. Auto- + generated if omitted; you'll need it later to cancel. + + Returns ``"Scheduled job next at ."`` on success, + an error string on malformed ``when`` or backend failure. 
+ """ + try: + job = await asyncio.to_thread(scheduler.add_job, prompt, when, job_id=job_id) + except ValueError as exc: + return f"Error: {exc}" + except Exception as exc: # noqa: BLE001 + return f"Error: scheduler add_job failed: {exc}" + next_fire = job.next_fire or "(managed by remote scheduler)" + return f"Scheduled job {job.id} next at {next_fire}." + + @tool + async def list_schedules() -> str: + """List the current scheduled jobs for this agent. + + Returns one job per line with id, next-fire timestamp, and a + prompt preview. Returns ``"No scheduled jobs."`` when empty. + + Backends that delegate state to a remote scheduler (e.g. the + Workstacean adapter) may return an empty list even when jobs + exist — query the remote scheduler directly to see those. + """ + jobs = await asyncio.to_thread(scheduler.list_jobs) + if not jobs: + return "No scheduled jobs." + lines = [] + for j in jobs: + preview = (j.prompt or "")[:80] + next_fire = j.next_fire or "(managed remotely)" + lines.append(f"{j.id} next={next_fire} schedule={j.schedule!r} {preview}") + return "\n".join(lines) + + @tool + async def cancel_schedule(job_id: str) -> str: + """Cancel a scheduled job by id. + + Args: + job_id: The id returned by ``schedule_task`` (or shown by + ``list_schedules``). + + Returns ``"Canceled ."`` or ``"Error: no such job ."``. + """ + if not job_id or not job_id.strip(): + return "Error: job_id is required." + try: + ok = await asyncio.to_thread(scheduler.cancel_job, job_id) + except Exception as exc: # noqa: BLE001 + return f"Error: scheduler cancel_job failed: {exc}" + return f"Canceled {job_id}." if ok else f"Error: cancel failed or no such job {job_id}." + + return [schedule_task, list_schedules, cancel_schedule] + + # ── registry ───────────────────────────────────────────────────────────────── -def get_all_tools(knowledge_store=None): +def get_all_tools(knowledge_store=None, scheduler=None): """Return every LangChain tool the lead agent + subagents can use. 
- ``knowledge_store`` is threaded through for agents that ship a - knowledge / memory subsystem (see ``graph/middleware/knowledge.py`` - for the hook-in pattern). The template doesn't ship a store — the - parameter is kept so adding one later doesn't require touching - every call site. + Optional dependencies: + + - ``knowledge_store`` enables the memory tools (memory_ingest, + memory_recall, memory_list, memory_stats, daily_log). + - ``scheduler`` enables the scheduler tools (schedule_task, + list_schedules, cancel_schedule). Accepts any backend that + implements ``scheduler.interface.SchedulerBackend``. + + Pass ``None`` to disable either subsystem — the lead agent runs + fine with just the four keyless general tools. """ - return [echo, current_time, calculator, web_search, fetch_url] + tools = [current_time, calculator, web_search, fetch_url] + if knowledge_store is not None: + tools.extend(_build_memory_tools(knowledge_store)) + if scheduler is not None: + tools.extend(_build_scheduler_tools(scheduler)) + return tools diff --git a/uv.lock b/uv.lock new file mode 100644 index 0000000..7eae9e0 --- /dev/null +++ b/uv.lock @@ -0,0 +1,8 @@ +version = 1 +revision = 3 +requires-python = ">=3.11" + +[[package]] +name = "protoagent" +version = "0.2.1" +source = { virtual = "." }