From c0cc61639bac63a69704d414b3e3395e88ac9180 Mon Sep 17 00:00:00 2001 From: rdwj Date: Mon, 4 May 2026 17:01:46 -0500 Subject: [PATCH] feat: Add `fips-agents add vision` to enable multimodal example client MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Closes #20. Drops `examples/vision_client.py` showing the three `image_url` URL forms the agent runtime accepts: inline `data:` URIs, remote `https://` URLs, and the internal `file_id:` scheme that the agent rewrites to a `data:` URI server-side. Image input flows through the agent's existing `model.endpoint` — no separate vision endpoint split. Set `MODEL_ENDPOINT` and `MODEL_NAME` to a vision-capable model (Granite Vision 3.2-2B and others) before running the agent. Precondition: `server.files.enabled` must be `true` in agent.yaml. The `file_id:` URL scheme resolves bytes via the BytesStore, which only exists when files is enabled. The command refuses to apply (with an actionable hint) until `fips-agents add files` has been run. Requires fipsagents>=0.20.0 in the project's dependencies. Notes on issue #20's wording: - No `vision:` section is added to agent.yaml. The agent-template audit for issue #101 explicitly chose a single multimodal endpoint via existing `model.endpoint` — adding a `vision:` block now would bake in a split that hasn't been needed. - Example code lives at `examples/vision_client.py` (client-side), not `src/agent.py`. Content blocks are constructed by callers and flow through the agent runtime automatically; the agent code itself doesn't need to change. 
Assisted-by: Claude Code (Opus 4.7) --- CLAUDE.md | 2 +- README.md | 31 ++++ src/fips_agents_cli/commands/add.py | 238 ++++++++++++++++++++++++++++ tests/test_modality.py | 97 +++++++++++- 4 files changed, 366 insertions(+), 2 deletions(-) diff --git a/CLAUDE.md b/CLAUDE.md index a4e916c..8194dc4 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -6,7 +6,7 @@ This file provides guidance to Claude Code (claude.ai/code) when working with co **fips-agents-cli** is a Python-based CLI tool for scaffolding FIPS-compliant AI agent projects. It scaffolds MCP (Model Context Protocol) servers and AI agent projects from production-ready templates, customizes them for new projects, and prepares them for immediate development use. -**Current Status:** Scaffolding commands implemented: `create mcp-server`, `create agent`, `create gateway`, `create ui`, `create sandbox`, `create model-car`. Post-scaffolding commands: `generate` (tool, resource, prompt, middleware), `patch` (check, all + type-specific category subcommands — see below), `add` (code-executor, files), `vendor`. Note: `create workflow` exists in code but is not yet working. +**Current Status:** Scaffolding commands implemented: `create mcp-server`, `create agent`, `create gateway`, `create ui`, `create sandbox`, `create model-car`. Post-scaffolding commands: `generate` (tool, resource, prompt, middleware), `patch` (check, all + type-specific category subcommands — see below), `add` (code-executor, files, vision), `vendor`. Note: `create workflow` exists in code but is not yet working. The `patch` command is type-aware via `template.type` in `.template-info`: - **MCP server** projects expose `patch generators | core | docs | build`. 
diff --git a/README.md b/README.md index cf6a26e..f9409c7 100644 --- a/README.md +++ b/README.md @@ -625,6 +625,37 @@ cd my-research-agent fips-agents add code-executor ``` +#### `add vision` + +```bash +fips-agents add vision +``` + +Wires multimodal (image input) example client code into your project. Drops `examples/vision_client.py` showing the three `image_url` URL forms the agent runtime accepts: inline `data:` URIs, remote `https://` URLs, and the internal `file_id:` scheme that the agent rewrites to a `data:` URI server-side. + +Image input flows through the agent's existing `model.endpoint` — there is no separate vision endpoint split. Set `MODEL_ENDPOINT` and `MODEL_NAME` to a vision-capable model (e.g. Granite Vision 3.2-2B, LLaVA, Phi-4-Multimodal) before running the agent. + +**Prerequisite:** `fips-agents add files` must have been run first. The `file_id:` URL scheme resolves bytes via the agent's `BytesStore`, which only exists when files is enabled. The command refuses to apply if `server.files.enabled` is not `true` in `agent.yaml`. + +**Requires:** `fipsagents>=0.20.0` in the project's dependencies (content-block support landed in 0.20.0). + +**What it does:** + +1. Verifies `server.files.enabled: true` in `agent.yaml` (precondition) +2. 
Writes `examples/vision_client.py` with runnable snippets for all three URL forms + +**Example:** + +```bash +cd my-research-agent +fips-agents add files # prerequisite +fips-agents add vision +export MODEL_ENDPOINT=https://granite-vision-3-2-2b-...:443/v1 +export MODEL_NAME=ibm-granite/granite-vision-3.2-2b +make run-local +python examples/vision_client.py path/to/image.png +``` + --- ### Vendor Commands diff --git a/src/fips_agents_cli/commands/add.py b/src/fips_agents_cli/commands/add.py index 0d905fc..ecdf5a2 100644 --- a/src/fips_agents_cli/commands/add.py +++ b/src/fips_agents_cli/commands/add.py @@ -6,6 +6,7 @@ import click from rich.console import Console from rich.panel import Panel +from ruamel.yaml import YAML from fips_agents_cli.tools.modality import ( ModalityError, @@ -164,6 +165,216 @@ async def code_executor(code: str, timeout: float = 10.0) -> str: ) +# --------------------------------------------------------------------------- +# vision — multimodal (image input) example client +# --------------------------------------------------------------------------- + +# A self-contained client snippet showing the three image_url variants the +# server accepts. Lives at examples/vision_client.py — out of the agent +# import path on purpose, since content blocks are constructed by callers, +# not by the agent itself. +VISION_CLIENT_SOURCE = '''\ +"""Vision input examples — three ways to send an image to a multimodal agent. + +The agent runtime accepts any OpenAI-shaped ``image_url`` content block on +``POST /v1/chat/completions``. The block carries a URL in one of three +forms; the agent (via ``OpenAIChatServer._resolve_image_file_ids``) +rewrites ``file_id:`` references to inline ``data:`` URIs before +forwarding to the model. + +Prerequisites: +- Files capability is enabled (``fips-agents add files`` already run). +- The configured ``model.endpoint`` is a vision-capable model (e.g. + Granite Vision 3.2-2B, LLaVA, Phi-4-Multimodal). 
Set + ``MODEL_ENDPOINT`` and ``MODEL_NAME`` accordingly. + +Run against a locally-running agent: + + python examples/vision_client.py +""" + +from __future__ import annotations + +import base64 +import os +import sys +from pathlib import Path + +import httpx + +AGENT_URL = os.environ.get("AGENT_URL", "http://localhost:8080") + + +def variant_data_uri(image_path: Path) -> dict: + """Inline the image as a base64 ``data:`` URI. + + Suited for one-shot requests where the image lives on the client + and you do not need to reference it again. + """ + mime = "image/png" if image_path.suffix.lower() == ".png" else "image/jpeg" + encoded = base64.b64encode(image_path.read_bytes()).decode("ascii") + return { + "type": "image_url", + "image_url": {"url": f"data:{mime};base64,{encoded}"}, + } + + +def variant_remote_url(url: str) -> dict: + """Reference a publicly-fetchable HTTPS URL. + + The model serves the URL itself; no upload is required. Best when + the image is already on a CDN or public bucket. + """ + return {"type": "image_url", "image_url": {"url": url}} + + +def variant_file_id(image_path: Path) -> dict: + """Upload the image once via POST /v1/files, then reference by id. + + The agent fetches bytes from the configured BytesStore, sniffs the + MIME type, and rewrites the URL to a ``data:`` URI server-side + before forwarding to the model. Best when the same image is used + across multiple turns or sessions. 
+ """ + with httpx.Client() as client: + upload = client.post( + f"{AGENT_URL}/v1/files", + files={"file": (image_path.name, image_path.read_bytes(), "image/png")}, + timeout=30.0, + ) + upload.raise_for_status() + file_id = upload.json()["file_id"] + return { + "type": "image_url", + "image_url": {"url": f"file_id:{file_id}"}, + } + + +def chat_with_image(prompt: str, image_block: dict) -> str: + with httpx.Client() as client: + resp = client.post( + f"{AGENT_URL}/v1/chat/completions", + json={ + "messages": [ + { + "role": "user", + "content": [ + {"type": "text", "text": prompt}, + image_block, + ], + } + ], + "max_tokens": 128, + "temperature": 0, + }, + timeout=60.0, + ) + resp.raise_for_status() + return resp.json()["choices"][0]["message"]["content"] + + +if __name__ == "__main__": + image = Path(sys.argv[1]) if len(sys.argv) > 1 else Path("./test.png") + if not image.exists(): + print(f"Image not found: {image}", file=sys.stderr) + print("Usage: python examples/vision_client.py [path/to/image.png]", file=sys.stderr) + sys.exit(1) + + print("=== Variant 1: inline data: URI ===") + print(chat_with_image("Describe this image briefly.", variant_data_uri(image))) + + print("\\n=== Variant 2: remote URL ===") + # Replace with any public image URL. + print( + chat_with_image( + "Describe this image briefly.", + variant_remote_url("https://upload.wikimedia.org/wikipedia/commons/4/47/PNG_transparency_demonstration_1.png"), + ) + ) + + print("\\n=== Variant 3: file_id (server-resolved) ===") + print(chat_with_image("Describe this image briefly.", variant_file_id(image))) +''' + + +VISION_NEXT_STEPS = ( + "1. Set the agent's model endpoint to a vision-capable model.", + " Granite Vision 3.2-2B is the canonical example:", + " [dim]export MODEL_ENDPOINT=https://granite-vision-3-2-2b-...:443/v1[/dim]", + " [dim]export MODEL_NAME=ibm-granite/granite-vision-3.2-2b[/dim]", + "", + "2. Run the agent locally:", + " [dim]make run-local[/dim]", + "", + "3. 
Send an image-bearing chat completion. Three URL forms work:", + "", + "    [bold]data:[/bold]     inline base64 (\"data:image/png;base64,...\")", + "    [bold]https://[/bold]  remote URL the model fetches", + "    [bold]file_id:[/bold]  internal scheme — upload via POST /v1/files first,", + "      then reference the returned id; the agent rewrites the URL to", + "      a data: URI server-side before forwarding to the model.", + "", + "4. See examples/vision_client.py for runnable snippets of all three.", + "", + "5. Notes:", + "   - Image input requires fipsagents>=0.20.0 (content-block support).", + "   - Tool calling is not enabled on most vision endpoints — your", + "     agent's step() may need to call_model(include_tools=False).", + "   - Trace spans fingerprint image bytes (SHA-256 + size); raw", + "     payloads are never logged.", +) + + +def _vision_precondition(project_root: Path) -> tuple[bool, str]: + """Vision input only makes sense when files is enabled. + + The ``file_id:`` URL scheme resolves bytes via the configured + ``BytesStore``, which is only wired up when ``server.files.enabled`` + is true. The other two variants (``data:`` URIs and remote + ``https://`` URLs) work without files, but the agent's value-add is + the file_id resolution path — fail fast and tell the user to run + ``fips-agents add files`` first. 
+ """ + yaml_path = project_root / "agent.yaml" + if not yaml_path.exists(): + return False, "agent.yaml not found in this project" + + yaml = YAML() + try: + with open(yaml_path) as f: + data = yaml.load(f) + except Exception as e: + return False, f"Failed to parse agent.yaml: {e}" + + server = data.get("server") if hasattr(data, "get") else None + files = server.get("files") if server is not None and hasattr(server, "get") else None + enabled = files.get("enabled") if files is not None and hasattr(files, "get") else None + + if enabled is True: + return True, "" + + return False, ( + "Vision input requires file uploads to be enabled — " + "the file_id: URL scheme resolves bytes via the agent's " + "BytesStore. Run `fips-agents add files` first, then re-run " + "`fips-agents add vision`." + ) + + +VISION_SPEC = ModalitySpec( + name="vision", + description="Multimodal image input via OpenAI content blocks", + source_files=( + SourceFile( + relative_path="examples/vision_client.py", + content=VISION_CLIENT_SOURCE, + ), + ), + precondition=_vision_precondition, + next_steps=VISION_NEXT_STEPS, +) + + def _print_modality_result(spec: ModalitySpec, result: ModalityResult) -> None: """Render the per-action lines + the success panel for an applied spec.""" for action in result.actions: @@ -263,3 +474,30 @@ def files_cmd(): fips-agents add files """ _run_modality(FILES_SPEC) + + +@add.command("vision") +def vision_cmd(): + """Wire multimodal (image input) example client into the project. + + Drops examples/vision_client.py showing the three image_url URL + forms the agent runtime accepts (inline data:, remote https://, + and the internal file_id: scheme). Files capability must be + enabled first — run `fips-agents add files` and re-run. + + Image input runs through the agent's existing model.endpoint — + no separate vision endpoint split. Set MODEL_ENDPOINT and + MODEL_NAME to a vision-capable model (e.g. 
Granite Vision 3.2-2B, + LLaVA, Phi-4-Multimodal) before running the agent. + + Requires fipsagents>=0.20.0 in the project's dependencies. + + Example: + + cd my-research-agent + + fips-agents add files # prerequisite + + fips-agents add vision + """ + _run_modality(VISION_SPEC) diff --git a/tests/test_modality.py b/tests/test_modality.py index d55302f..c4265d2 100644 --- a/tests/test_modality.py +++ b/tests/test_modality.py @@ -10,7 +10,7 @@ from ruamel.yaml import YAML from fips_agents_cli.cli import cli -from fips_agents_cli.commands.add import CODE_EXECUTOR_SPEC, FILES_SPEC +from fips_agents_cli.commands.add import CODE_EXECUTOR_SPEC, FILES_SPEC, VISION_SPEC from fips_agents_cli.tools.modality import ( ModalityError, ModalitySpec, @@ -446,3 +446,98 @@ def test_spec_has_expected_shape(self) -> None: assert FILES_SPEC.source_files == () assert FILES_SPEC.pyproject_extra is None assert FILES_SPEC.precondition is None + + +class TestVisionSpec: + def test_spec_has_expected_shape(self) -> None: + # add vision drops a client example, gates on files being enabled, + # and otherwise touches no toggles — image input flows through the + # existing model.endpoint, no separate vision endpoint split. + assert VISION_SPEC.name == "vision" + assert VISION_SPEC.agent_yaml_enable is None + assert VISION_SPEC.chart_values_enable is None + assert VISION_SPEC.pyproject_extra is None + assert VISION_SPEC.precondition is not None + assert len(VISION_SPEC.source_files) == 1 + assert VISION_SPEC.source_files[0].relative_path == "examples/vision_client.py" + + +# --------------------------------------------------------------------------- +# Integration — `fips-agents add vision` end-to-end +# --------------------------------------------------------------------------- + + +def _enable_files(agent_project: Path) -> None: + """Flip server.files.enabled to a literal True in the fixture's agent.yaml. 
+ + The fixture ships the field as ``${FILES_ENABLED:-false}`` (a string + with env-var substitution). The vision precondition checks for the + literal boolean True — same shape add files would leave behind. + """ + yaml = YAML() + yaml.preserve_quotes = True + path = agent_project / "agent.yaml" + with open(path) as f: + data = yaml.load(f) + data["server"]["files"]["enabled"] = True + with open(path, "w") as f: + yaml.dump(data, f) + + +class TestAddVisionE2E: + def test_first_run_drops_example_client( + self, agent_project: Path, monkeypatch: pytest.MonkeyPatch + ) -> None: + _enable_files(agent_project) + monkeypatch.chdir(agent_project) + runner = CliRunner() + result = runner.invoke(cli, ["add", "vision"]) + assert result.exit_code == 0, result.output + + example = agent_project / "examples" / "vision_client.py" + assert example.exists() + text = example.read_text() + # All three URL forms must appear in the example so users can + # copy any of them as a starting point. The data: form is built + # from an f-string so we match the constituent pieces. + assert "image_url" in text + assert "data:" in text and "base64" in text + assert "file_id:" in text + # POST /v1/files upload + chat completions both demonstrated. + assert "/v1/files" in text + assert "/v1/chat/completions" in text + + assert "Multimodal image input" in result.output + + def test_second_run_is_idempotent( + self, agent_project: Path, monkeypatch: pytest.MonkeyPatch + ) -> None: + _enable_files(agent_project) + monkeypatch.chdir(agent_project) + runner = CliRunner() + runner.invoke(cli, ["add", "vision"]) + result = runner.invoke(cli, ["add", "vision"]) + assert result.exit_code == 0, result.output + assert "already exists" in result.output + + def test_fails_when_files_not_enabled( + self, agent_project: Path, monkeypatch: pytest.MonkeyPatch + ) -> None: + # Fixture ships server.files.enabled as a string, not True — the + # precondition must refuse to apply. 
+ monkeypatch.chdir(agent_project) + runner = CliRunner() + result = runner.invoke(cli, ["add", "vision"]) + assert result.exit_code == 1 + assert "fips-agents add files" in result.output + # Nothing was dropped. + assert not (agent_project / "examples" / "vision_client.py").exists() + + def test_errors_when_not_in_an_agent_project( + self, tmp_path: Path, monkeypatch: pytest.MonkeyPatch + ) -> None: + monkeypatch.chdir(tmp_path) + runner = CliRunner() + result = runner.invoke(cli, ["add", "vision"]) + assert result.exit_code == 1 + assert "Not in an agent project" in result.output