From c0cc61639bac63a69704d414b3e3395e88ac9180 Mon Sep 17 00:00:00 2001 From: rdwj Date: Mon, 4 May 2026 17:01:46 -0500 Subject: [PATCH] feat: Add `fips-agents add vision` to enable multimodal example client MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Closes #20. Drops `examples/vision_client.py` showing the three `image_url` URL forms the agent runtime accepts: inline `data:` URIs, remote `https://` URLs, and the internal `file_id:` scheme that the agent rewrites to a `data:` URI server-side. Image input flows through the agent's existing `model.endpoint` — no separate vision endpoint split. Set `MODEL_ENDPOINT` and `MODEL_NAME` to a vision-capable model (Granite Vision 3.2-2B and others) before running the agent. Precondition: `server.files.enabled` must be `true` in agent.yaml. The `file_id:` URL scheme resolves bytes via the BytesStore, which only exists when files is enabled. The command refuses to apply (with an actionable hint) until `fips-agents add files` has been run. Requires fipsagents>=0.20.0 in the project's dependencies. Notes on issue #20's wording: - No `vision:` section is added to agent.yaml. The agent-template audit for issue #101 explicitly chose a single multimodal endpoint via existing `model.endpoint` — adding a `vision:` block now would bake in a split that hasn't been needed. - Example code lives at `examples/vision_client.py` (client-side), not `src/agent.py`. Content blocks are constructed by callers and flow through the agent runtime automatically; the agent code itself doesn't need to change. 
Assisted-by: Claude Code (Opus 4.7) --- CLAUDE.md | 2 +- README.md | 31 ++++ src/fips_agents_cli/commands/add.py | 238 ++++++++++++++++++++++++++++ tests/test_modality.py | 97 +++++++++++- 4 files changed, 366 insertions(+), 2 deletions(-) diff --git a/CLAUDE.md b/CLAUDE.md index a4e916c..8194dc4 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -6,7 +6,7 @@ This file provides guidance to Claude Code (claude.ai/code) when working with co **fips-agents-cli** is a Python-based CLI tool for scaffolding FIPS-compliant AI agent projects. It scaffolds MCP (Model Context Protocol) servers and AI agent projects from production-ready templates, customizes them for new projects, and prepares them for immediate development use. -**Current Status:** Scaffolding commands implemented: `create mcp-server`, `create agent`, `create gateway`, `create ui`, `create sandbox`, `create model-car`. Post-scaffolding commands: `generate` (tool, resource, prompt, middleware), `patch` (check, all + type-specific category subcommands — see below), `add` (code-executor, files), `vendor`. Note: `create workflow` exists in code but is not yet working. +**Current Status:** Scaffolding commands implemented: `create mcp-server`, `create agent`, `create gateway`, `create ui`, `create sandbox`, `create model-car`. Post-scaffolding commands: `generate` (tool, resource, prompt, middleware), `patch` (check, all + type-specific category subcommands — see below), `add` (code-executor, files, vision), `vendor`. Note: `create workflow` exists in code but is not yet working. The `patch` command is type-aware via `template.type` in `.template-info`: - **MCP server** projects expose `patch generators | core | docs | build`. 
diff --git a/README.md b/README.md index cf6a26e..f9409c7 100644 --- a/README.md +++ b/README.md @@ -625,6 +625,37 @@ cd my-research-agent fips-agents add code-executor ``` +#### `add vision` + +```bash +fips-agents add vision +``` + +Wires multimodal (image input) example client code into your project. Drops `examples/vision_client.py` showing the three `image_url` URL forms the agent runtime accepts: inline `data:` URIs, remote `https://` URLs, and the internal `file_id:` scheme that the agent rewrites to a `data:` URI server-side. + +Image input flows through the agent's existing `model.endpoint` — there is no separate vision endpoint split. Set `MODEL_ENDPOINT` and `MODEL_NAME` to a vision-capable model (e.g. Granite Vision 3.2-2B, LLaVA, Phi-4-Multimodal) before running the agent. + +**Prerequisite:** `fips-agents add files` must have been run first. The `file_id:` URL scheme resolves bytes via the agent's `BytesStore`, which only exists when files is enabled. The command refuses to apply if `server.files.enabled` is not `true` in `agent.yaml`. + +**Requires:** `fipsagents>=0.20.0` in the project's dependencies (content-block support landed in 0.20.0). + +**What it does:** + +1. Verifies `server.files.enabled: true` in `agent.yaml` (precondition) +2. 
Writes `examples/vision_client.py` with runnable snippets for all three URL forms + +**Example:** + +```bash +cd my-research-agent +fips-agents add files # prerequisite +fips-agents add vision +export MODEL_ENDPOINT=https://granite-vision-3-2-2b-...:443/v1 +export MODEL_NAME=ibm-granite/granite-vision-3.2-2b +make run-local +python examples/vision_client.py path/to/image.png +``` + --- ### Vendor Commands diff --git a/src/fips_agents_cli/commands/add.py b/src/fips_agents_cli/commands/add.py index 0d905fc..ecdf5a2 100644 --- a/src/fips_agents_cli/commands/add.py +++ b/src/fips_agents_cli/commands/add.py @@ -6,6 +6,7 @@ import click from rich.console import Console from rich.panel import Panel +from ruamel.yaml import YAML from fips_agents_cli.tools.modality import ( ModalityError, @@ -164,6 +165,216 @@ async def code_executor(code: str, timeout: float = 10.0) -> str: ) +# --------------------------------------------------------------------------- +# vision — multimodal (image input) example client +# --------------------------------------------------------------------------- + +# A self-contained client snippet showing the three image_url variants the +# server accepts. Lives at examples/vision_client.py — out of the agent +# import path on purpose, since content blocks are constructed by callers, +# not by the agent itself. +VISION_CLIENT_SOURCE = '''\ +"""Vision input examples — three ways to send an image to a multimodal agent. + +The agent runtime accepts any OpenAI-shaped ``image_url`` content block on +``POST /v1/chat/completions``. The block carries a URL in one of three +forms; the agent (via ``OpenAIChatServer._resolve_image_file_ids``) +rewrites ``file_id:`` references to inline ``data:`` URIs before +forwarding to the model. + +Prerequisites: +- Files capability is enabled (``fips-agents add files`` already run). +- The configured ``model.endpoint`` is a vision-capable model (e.g. + Granite Vision 3.2-2B, LLaVA, Phi-4-Multimodal). 
Set + ``MODEL_ENDPOINT`` and ``MODEL_NAME`` accordingly. + +Run against a locally-running agent: + + python examples/vision_client.py +""" + +from __future__ import annotations + +import base64 +import os +import sys +from pathlib import Path + +import httpx + +AGENT_URL = os.environ.get("AGENT_URL", "http://localhost:8080") + + +def variant_data_uri(image_path: Path) -> dict: + """Inline the image as a base64 ``data:`` URI. + + Suited for one-shot requests where the image lives on the client + and you do not need to reference it again. + """ + mime = "image/png" if image_path.suffix.lower() == ".png" else "image/jpeg" + encoded = base64.b64encode(image_path.read_bytes()).decode("ascii") + return { + "type": "image_url", + "image_url": {"url": f"data:{mime};base64,{encoded}"}, + } + + +def variant_remote_url(url: str) -> dict: + """Reference a publicly-fetchable HTTPS URL. + + The model serves the URL itself; no upload is required. Best when + the image is already on a CDN or public bucket. + """ + return {"type": "image_url", "image_url": {"url": url}} + + +def variant_file_id(image_path: Path) -> dict: + """Upload the image once via POST /v1/files, then reference by id. + + The agent fetches bytes from the configured BytesStore, sniffs the + MIME type, and rewrites the URL to a ``data:`` URI server-side + before forwarding to the model. Best when the same image is used + across multiple turns or sessions. 
+ """ + with httpx.Client() as client: + upload = client.post( + f"{AGENT_URL}/v1/files", + files={"file": (image_path.name, image_path.read_bytes(), "image/png")}, + timeout=30.0, + ) + upload.raise_for_status() + file_id = upload.json()["file_id"] + return { + "type": "image_url", + "image_url": {"url": f"file_id:{file_id}"}, + } + + +def chat_with_image(prompt: str, image_block: dict) -> str: + with httpx.Client() as client: + resp = client.post( + f"{AGENT_URL}/v1/chat/completions", + json={ + "messages": [ + { + "role": "user", + "content": [ + {"type": "text", "text": prompt}, + image_block, + ], + } + ], + "max_tokens": 128, + "temperature": 0, + }, + timeout=60.0, + ) + resp.raise_for_status() + return resp.json()["choices"][0]["message"]["content"] + + +if __name__ == "__main__": + image = Path(sys.argv[1]) if len(sys.argv) > 1 else Path("./test.png") + if not image.exists(): + print(f"Image not found: {image}", file=sys.stderr) + print("Usage: python examples/vision_client.py [path/to/image.png]", file=sys.stderr) + sys.exit(1) + + print("=== Variant 1: inline data: URI ===") + print(chat_with_image("Describe this image briefly.", variant_data_uri(image))) + + print("\\n=== Variant 2: remote URL ===") + # Replace with any public image URL. + print( + chat_with_image( + "Describe this image briefly.", + variant_remote_url("https://upload.wikimedia.org/wikipedia/commons/4/47/PNG_transparency_demonstration_1.png"), + ) + ) + + print("\\n=== Variant 3: file_id (server-resolved) ===") + print(chat_with_image("Describe this image briefly.", variant_file_id(image))) +''' + + +VISION_NEXT_STEPS = ( + "1. Set the agent's model endpoint to a vision-capable model.", + " Granite Vision 3.2-2B is the canonical example:", + " [dim]export MODEL_ENDPOINT=https://granite-vision-3-2-2b-...:443/v1[/dim]", + " [dim]export MODEL_NAME=ibm-granite/granite-vision-3.2-2b[/dim]", + "", + "2. Run the agent locally:", + " [dim]make run-local[/dim]", + "", + "3. 
Send an image-bearing chat completion. Three URL forms work:", + "", + "    [bold]data:[/bold]     inline base64 (\"data:image/png;base64,...\")", + "    [bold]https://[/bold]  remote URL the model fetches", + "    [bold]file_id:[/bold]  internal scheme — upload via POST /v1/files first,", + "      then reference the returned id; the agent rewrites the URL to", + "      a data: URI server-side before forwarding to the model.", + "", + "4. See examples/vision_client.py for runnable snippets of all three.", + "", + "5. Notes:", + "   - Image input requires fipsagents>=0.20.0 (content-block support).", + "   - Tool calling is not enabled on most vision endpoints — your", + "     agent's step() may need to call_model(include_tools=False).", + "   - Trace spans fingerprint image bytes (SHA-256 + size); raw", + "     payloads are never logged.", +) + + +def _vision_precondition(project_root: Path) -> tuple[bool, str]: + """Vision input only makes sense when files is enabled. + + The ``file_id:`` URL scheme resolves bytes via the configured + ``BytesStore``, which is only wired up when ``server.files.enabled`` + is true. The other two variants (``data:`` URIs and remote + ``https://`` URLs) work without files, but the agent's value-add is + the file_id resolution path — fail fast and tell the user to run + ``fips-agents add files`` first. 
+ """ + yaml_path = project_root / "agent.yaml" + if not yaml_path.exists(): + return False, "agent.yaml not found in this project" + + yaml = YAML() + try: + with open(yaml_path) as f: + data = yaml.load(f) + except Exception as e: + return False, f"Failed to parse agent.yaml: {e}" + + server = data.get("server") if hasattr(data, "get") else None + files = server.get("files") if server is not None and hasattr(server, "get") else None + enabled = files.get("enabled") if files is not None and hasattr(files, "get") else None + + if enabled is True: + return True, "" + + return False, ( + "Vision input requires file uploads to be enabled — " + "the file_id: URL scheme resolves bytes via the agent's " + "BytesStore. Run `fips-agents add files` first, then re-run " + "`fips-agents add vision`." + ) + + +VISION_SPEC = ModalitySpec( + name="vision", + description="Multimodal image input via OpenAI content blocks", + source_files=( + SourceFile( + relative_path="examples/vision_client.py", + content=VISION_CLIENT_SOURCE, + ), + ), + precondition=_vision_precondition, + next_steps=VISION_NEXT_STEPS, +) + + def _print_modality_result(spec: ModalitySpec, result: ModalityResult) -> None: """Render the per-action lines + the success panel for an applied spec.""" for action in result.actions: @@ -263,3 +474,30 @@ def files_cmd(): fips-agents add files """ _run_modality(FILES_SPEC) + + +@add.command("vision") +def vision_cmd(): + """Wire multimodal (image input) example client into the project. + + Drops examples/vision_client.py showing the three image_url URL + forms the agent runtime accepts (inline data:, remote https://, + and the internal file_id: scheme). Files capability must be + enabled first — run `fips-agents add files` and re-run. + + Image input runs through the agent's existing model.endpoint — + no separate vision endpoint split. Set MODEL_ENDPOINT and + MODEL_NAME to a vision-capable model (e.g. 
Granite Vision 3.2-2B, + LLaVA, Phi-4-Multimodal) before running the agent. + + Requires fipsagents>=0.20.0 in the project's dependencies. + + Example: + + cd my-research-agent + + fips-agents add files # prerequisite + + fips-agents add vision + """ + _run_modality(VISION_SPEC) diff --git a/tests/test_modality.py b/tests/test_modality.py index d55302f..c4265d2 100644 --- a/tests/test_modality.py +++ b/tests/test_modality.py @@ -10,7 +10,7 @@ from ruamel.yaml import YAML from fips_agents_cli.cli import cli -from fips_agents_cli.commands.add import CODE_EXECUTOR_SPEC, FILES_SPEC +from fips_agents_cli.commands.add import CODE_EXECUTOR_SPEC, FILES_SPEC, VISION_SPEC from fips_agents_cli.tools.modality import ( ModalityError, ModalitySpec, @@ -446,3 +446,98 @@ def test_spec_has_expected_shape(self) -> None: assert FILES_SPEC.source_files == () assert FILES_SPEC.pyproject_extra is None assert FILES_SPEC.precondition is None + + +class TestVisionSpec: + def test_spec_has_expected_shape(self) -> None: + # add vision drops a client example, gates on files being enabled, + # and otherwise touches no toggles — image input flows through the + # existing model.endpoint, no separate vision endpoint split. + assert VISION_SPEC.name == "vision" + assert VISION_SPEC.agent_yaml_enable is None + assert VISION_SPEC.chart_values_enable is None + assert VISION_SPEC.pyproject_extra is None + assert VISION_SPEC.precondition is not None + assert len(VISION_SPEC.source_files) == 1 + assert VISION_SPEC.source_files[0].relative_path == "examples/vision_client.py" + + +# --------------------------------------------------------------------------- +# Integration — `fips-agents add vision` end-to-end +# --------------------------------------------------------------------------- + + +def _enable_files(agent_project: Path) -> None: + """Flip server.files.enabled to a literal True in the fixture's agent.yaml. 
+ + The fixture ships the field as ``${FILES_ENABLED:-false}`` (a string + with env-var substitution). The vision precondition checks for the + literal boolean True — same shape add files would leave behind. + """ + yaml = YAML() + yaml.preserve_quotes = True + path = agent_project / "agent.yaml" + with open(path) as f: + data = yaml.load(f) + data["server"]["files"]["enabled"] = True + with open(path, "w") as f: + yaml.dump(data, f) + + +class TestAddVisionE2E: + def test_first_run_drops_example_client( + self, agent_project: Path, monkeypatch: pytest.MonkeyPatch + ) -> None: + _enable_files(agent_project) + monkeypatch.chdir(agent_project) + runner = CliRunner() + result = runner.invoke(cli, ["add", "vision"]) + assert result.exit_code == 0, result.output + + example = agent_project / "examples" / "vision_client.py" + assert example.exists() + text = example.read_text() + # All three URL forms must appear in the example so users can + # copy any of them as a starting point. The data: form is built + # from an f-string so we match the constituent pieces. + assert "image_url" in text + assert "data:" in text and "base64" in text + assert "file_id:" in text + # POST /v1/files upload + chat completions both demonstrated. + assert "/v1/files" in text + assert "/v1/chat/completions" in text + + assert "Multimodal image input" in result.output + + def test_second_run_is_idempotent( + self, agent_project: Path, monkeypatch: pytest.MonkeyPatch + ) -> None: + _enable_files(agent_project) + monkeypatch.chdir(agent_project) + runner = CliRunner() + runner.invoke(cli, ["add", "vision"]) + result = runner.invoke(cli, ["add", "vision"]) + assert result.exit_code == 0, result.output + assert "already exists" in result.output + + def test_fails_when_files_not_enabled( + self, agent_project: Path, monkeypatch: pytest.MonkeyPatch + ) -> None: + # Fixture ships server.files.enabled as a string, not True — the + # precondition must refuse to apply. 
+ monkeypatch.chdir(agent_project) + runner = CliRunner() + result = runner.invoke(cli, ["add", "vision"]) + assert result.exit_code == 1 + assert "fips-agents add files" in result.output + # Nothing was dropped. + assert not (agent_project / "examples" / "vision_client.py").exists() + + def test_errors_when_not_in_an_agent_project( + self, tmp_path: Path, monkeypatch: pytest.MonkeyPatch + ) -> None: + monkeypatch.chdir(tmp_path) + runner = CliRunner() + result = runner.invoke(cli, ["add", "vision"]) + assert result.exit_code == 1 + assert "Not in an agent project" in result.output