diff --git a/astro.config.mjs b/astro.config.mjs index 3fffb88..7083338 100644 --- a/astro.config.mjs +++ b/astro.config.mjs @@ -78,6 +78,19 @@ export default defineConfig({ { label: 'OpenShift Setup', slug: 'deployment/openshift-setup' }, ], }, + { + label: 'MCP', + items: [ + { label: 'Overview', slug: 'mcp' }, + { label: 'Installation', slug: 'mcp/installation' }, + { label: 'Quick Start', slug: 'mcp/quickstart' }, + { label: 'Configuration', slug: 'mcp/configuration' }, + { label: 'Tool Reference', slug: 'mcp/tools' }, + { label: 'Resource Reference', slug: 'mcp/resources' }, + { label: 'Prompt Reference', slug: 'mcp/prompts' }, + { label: 'Troubleshooting', slug: 'mcp/troubleshooting' }, + ], + }, { label: 'Adapters', items: [ diff --git a/src/content/docs/mcp/configuration.md b/src/content/docs/mcp/configuration.md new file mode 100644 index 0000000..cb5d4ed --- /dev/null +++ b/src/content/docs/mcp/configuration.md @@ -0,0 +1,169 @@ +--- +title: "Configuration" +--- + +The EvalHub MCP server can be configured through CLI flags, a YAML configuration file, or environment variables. When multiple sources set the same value, **CLI flags take highest precedence**, followed by the config file, then environment variables. + +## CLI Flags + +``` +evalhub-mcp [flags] +``` + +| Flag | Default | Description | +|------|---------|-------------| +| `--transport` | `stdio` | Transport mode: `stdio`, `http`, or `http-sse` | +| `--host` | `localhost` | Bind address for HTTP transports | +| `--port` | `3001` | Port for HTTP transports | +| `--config` | — | Path to YAML configuration file | +| `--insecure` | `false` | Skip TLS certificate verification for the EvalHub backend | +| `--tls-cert` | — | Path to TLS certificate file (for HTTPS on the MCP server) | +| `--tls-key` | — | Path to TLS private key file (for HTTPS on the MCP server) | +| `--version` | — | Print version and exit | + +Both `--tls-cert` and `--tls-key` must be provided together. 
When set, the HTTP server listens over HTTPS. + +## Configuration File + +Pass `--config <path>` to load settings from a YAML file: + +```yaml +# evalhub-mcp.yaml +base_url: https://evalhub.apps.my-cluster.example.com +token: <your-token> +tenant: my-team +transport: http +host: 0.0.0.0 +port: 3001 +insecure: false +``` + +## Environment Variables + +| Variable | Description | +|----------|-------------| +| `EVALHUB_BASE_URL` | EvalHub backend API URL | +| `EVALHUB_TOKEN` | Authentication token | +| `EVALHUB_TENANT` | Tenant identifier | +| `EVALHUB_TRANSPORT` | Transport mode (`stdio`, `http`, `http-sse`) | +| `EVALHUB_HOST` | HTTP bind address | +| `EVALHUB_PORT` | HTTP port | +| `EVALHUB_INSECURE` | Skip TLS verification for EvalHub backend (`true`/`false`) | +| `EVALHUB_TLS_CERT_FILE` | Path to TLS certificate | +| `EVALHUB_TLS_KEY_FILE` | Path to TLS private key | +| `EVALHUB_LIST_PAGE_LIMIT` | Default page size for list resources | + +## Precedence + +When the same setting is specified in multiple places: + +1. **CLI flags** (highest priority) +2. **YAML config file** (if `--config` is used) +3. **Environment variables** (lowest priority) + +For example, if `EVALHUB_TRANSPORT=http` is set as an environment variable but you run `evalhub-mcp --transport stdio`, the server uses stdio. 
+ +## Kubernetes Operator + +When EvalHub is deployed via the TrustyAI operator, the MCP server is configured through the `spec.mcp` section of the EvalHub custom resource: + +```yaml +apiVersion: trustyai.opendatahub.io/v1alpha1 +kind: EvalHub +metadata: + name: evalhub + namespace: my-namespace +spec: + replicas: 1 + mcp: + enabled: true + replicas: 1 + transport: http + image: quay.io/evalhub/evalhub-mcp:latest + authSecret: mcp-auth-token + resources: + requests: + cpu: 100m + memory: 128Mi + limits: + cpu: 500m + memory: 256Mi + env: + - name: LOG_LEVEL + value: "debug" +``` + +### Operator MCP Fields + +| Field | Type | Default | Description | +|-------|------|---------|-------------| +| `enabled` | bool | `false` | Enable MCP server deployment | +| `replicas` | int | `1` | Number of MCP server replicas | +| `transport` | string | `http` | Client-facing transport (`http` or `http-sse`) | +| `evalHubTransport` | string | `http` | Transport for internal EvalHub API calls | +| `image` | string | `quay.io/evalhub/evalhub-mcp:latest` | Container image override | +| `authSecret` | string | — | Kubernetes Secret containing a `token` key for EvalHub API auth | +| `resources` | ResourceRequirements | 100m/128Mi request, 500m/256Mi limit | Container resource requests and limits | +| `env` | []EnvVar | — | Additional environment variables | + +### What the Operator Creates + +When `spec.mcp.enabled` is `true`, the operator automatically creates: + +- **Deployment** (`<name>-mcp`): Runs the MCP server container with health checks +- **Service** (`<name>-mcp`): ClusterIP service on port 8443 +- **ConfigMap** (`<name>-mcp-config`): Server configuration YAML +- **Route** (OpenShift only, `<name>-mcp`): Edge-terminated TLS route for external access + +TLS certificates are automatically provisioned via OpenShift service signing. 
+ +### Checking MCP Status + +```bash +kubectl get evalhub <name> -o jsonpath='{.status.mcp}' +``` + +The status includes: +- `phase`: `Pending`, `Ready`, `Error`, or `Disabled` +- `ready`: Whether the MCP deployment is available +- `url`: Internal service URL + +## Example Configurations + +### Local Development + +```bash +export EVALHUB_BASE_URL="http://localhost:8080" +export EVALHUB_TOKEN="dev-token" +export EVALHUB_TENANT="default" + +evalhub-mcp --transport stdio +``` + +### Shared Team Server + +```yaml +# team-mcp.yaml +base_url: https://evalhub.apps.cluster.example.com +token: <your-token> +tenant: team-a +transport: http +host: 0.0.0.0 +port: 3001 +``` + +```bash +evalhub-mcp --config team-mcp.yaml +``` + +### Secure Production Server + +```bash +evalhub-mcp \ + --transport http \ + --host 0.0.0.0 \ + --port 8443 \ + --tls-cert /etc/tls/server.crt \ + --tls-key /etc/tls/server.key \ + --config /etc/evalhub-mcp/config.yaml +``` diff --git a/src/content/docs/mcp/index.md b/src/content/docs/mcp/index.md new file mode 100644 index 0000000..dc94ff9 --- /dev/null +++ b/src/content/docs/mcp/index.md @@ -0,0 +1,62 @@ +--- +title: "MCP Overview" +--- + +The EvalHub MCP server implements the [Model Context Protocol](https://modelcontextprotocol.io/) (MCP), enabling AI coding assistants such as Claude Code, VS Code with GitHub Copilot, and other MCP-compatible clients to interact with EvalHub directly from a conversation. + +## What is MCP? + +MCP is an open standard that lets AI assistants connect to external tools and data sources through a unified protocol. Instead of manually copying commands or switching between terminal windows, your AI assistant can submit evaluations, check job status, browse benchmarks, and follow structured evaluation workflows — all through natural language. 
+ +## What the EvalHub MCP Server Provides + +### Tools + +Actions the AI assistant can execute on your behalf: + +| Tool | Description | +|------|-------------| +| `submit_evaluation` | Submit a new model evaluation job with benchmarks or a collection | +| `get_job_status` | Check job progress, state, and per-benchmark status | +| `cancel_job` | Cancel a running or pending evaluation job | + +### Resources + +Read-only data the assistant can query using `evalhub://` URIs: + +| Resource | URI | Description | +|----------|-----|-------------| +| Providers | `evalhub://providers` | List evaluation providers and their benchmarks | +| Benchmarks | `evalhub://benchmarks` | Browse benchmarks, filter by label | +| Collections | `evalhub://collections` | List pre-defined benchmark collections | +| Jobs | `evalhub://jobs` | List evaluation jobs, filter by status | +| Server Version | `evalhub://server/version` | Server build and version metadata | + +All list resources support pagination (`?limit=N&offset=N`). Benchmarks support label filtering (`?label=rag&label=safety`). Jobs support status filtering (`?status=running`). + +### Prompts + +Structured conversation templates that guide the assistant through complex workflows: + +| Prompt | Description | +|--------|-------------| +| `edd_workflow` | Evaluation-Driven Development cycle: Define, Measure, Iterate | +| `evaluate_model` | Step-by-step model evaluation from discovery to results | +| `compare_runs` | Compare metrics across two or more evaluation jobs | + +## Transport Modes + +The MCP server supports multiple transport modes for different deployment scenarios: + +| Mode | Flag | Use Case | +|------|------|----------| +| **stdio** | `--transport stdio` | Local development. The AI client launches the server as a subprocess. | +| **Streamable HTTP** | `--transport http` | Remote or shared deployments. The server runs as a standalone HTTP service. 
| +| **Legacy HTTP+SSE** | `--transport http-sse` | Older MCP clients that don't support Streamable HTTP. | + +## Next Steps + +- [Install the MCP server](/mcp/installation/) on your platform +- Follow the [Quick Start](/mcp/quickstart/) to connect your AI assistant in under 5 steps +- Browse the [Tool](/mcp/tools/), [Resource](/mcp/resources/), and [Prompt](/mcp/prompts/) references +- See [Configuration](/mcp/configuration/) for all available options diff --git a/src/content/docs/mcp/installation.md b/src/content/docs/mcp/installation.md new file mode 100644 index 0000000..a6c7633 --- /dev/null +++ b/src/content/docs/mcp/installation.md @@ -0,0 +1,108 @@ +--- +title: "Installation" +--- + +import { Tabs, TabItem } from '@astrojs/starlight/components'; + +The `evalhub-mcp` binary is a standalone server that connects AI assistants to EvalHub. It is available for macOS (Intel and Apple Silicon), Linux (amd64 and arm64), and as a container image. + +## Prerequisites + +- An EvalHub instance (running locally or on a cluster) with a reachable API endpoint +- An authentication token for your EvalHub tenant +- An MCP-compatible AI client ([Claude Code](https://docs.anthropic.com/en/docs/claude-code), [VS Code with GitHub Copilot](https://code.visualstudio.com/), or another MCP client) + +## Install the Binary + + + + +```bash +brew install evalhub-mcp +``` + +Verify: + +```bash +evalhub-mcp --version +``` + + + + +Download the binary for your platform from [GitHub Releases](https://github.com/eval-hub/eval-hub/releases): + +```bash +# macOS (Apple Silicon) +curl -Lo evalhub-mcp https://github.com/eval-hub/eval-hub/releases/latest/download/evalhub-mcp-darwin-arm64 + +# macOS (Intel) +curl -Lo evalhub-mcp https://github.com/eval-hub/eval-hub/releases/latest/download/evalhub-mcp-darwin-amd64 + +# Linux (amd64) +curl -Lo evalhub-mcp https://github.com/eval-hub/eval-hub/releases/latest/download/evalhub-mcp-linux-amd64 + +# Linux (arm64) +curl -Lo evalhub-mcp 
https://github.com/eval-hub/eval-hub/releases/latest/download/evalhub-mcp-linux-arm64 +``` + +Make it executable and move it to your PATH: + +```bash +chmod +x evalhub-mcp +sudo mv evalhub-mcp /usr/local/bin/ +``` + +Verify: + +```bash +evalhub-mcp --version +``` + + + + +Requires Go 1.25 or later. + +```bash +git clone https://github.com/eval-hub/eval-hub.git +cd eval-hub +make build-mcp +``` + +The binary is placed in `./bin/evalhub-mcp`. Move it to your PATH: + +```bash +sudo mv ./bin/evalhub-mcp /usr/local/bin/ +``` + + + + +## Kubernetes / OpenShift Deployment + +If EvalHub is managed by the TrustyAI operator, the MCP server can be deployed alongside it (as a separate Deployment) by enabling it in the EvalHub custom resource: + +```yaml +apiVersion: trustyai.opendatahub.io/v1alpha1 +kind: EvalHub +metadata: + name: evalhub +spec: + replicas: 1 + mcp: + enabled: true + replicas: 1 +``` + +The operator creates a Deployment, Service, ConfigMap, and (on OpenShift) a Route for the MCP server automatically. See [Configuration](/mcp/configuration/#kubernetes-operator) for all available fields. + +## Using the EvalHub CLI as an MCP Server + +If you already have the [EvalHub CLI](/guides/cli/) installed and configured, you can use it as an MCP server directly without installing `evalhub-mcp` separately: + +```bash +claude mcp add evalhub -- evalhub --profile <profile-name> mcp +``` + +This uses the CLI's built-in `mcp` subcommand with an existing CLI profile for authentication. See the [Quick Start](/mcp/quickstart/) for the full setup flow using either approach. diff --git a/src/content/docs/mcp/prompts.md b/src/content/docs/mcp/prompts.md new file mode 100644 index 0000000..fa61854 --- /dev/null +++ b/src/content/docs/mcp/prompts.md @@ -0,0 +1,93 @@ +--- +title: "Prompt Reference" +--- + +MCP prompts are structured conversation templates that guide the AI assistant through complex evaluation workflows. Each prompt returns a sequence of messages that the assistant uses to drive the interaction. 
+ +## edd_workflow + +Guides the AI assistant through the Evaluation-Driven Development (EDD) cycle — a structured methodology for building AI applications with continuous evaluation feedback. + +The workflow follows three phases: +1. **Define** — Establish evaluation criteria for the application type +2. **Measure** — Run benchmarks and collect metrics +3. **Iterate** — Analyze results and improve + +### Arguments + +| Argument | Required | Description | +|----------|----------|-------------| +| `application_type` | Yes | Type of application being evaluated. One of: `rag`, `agent`, `safety`, `classifier` | + +Each application type loads domain-specific guidance. For example, `rag` focuses on retrieval quality and context relevance, while `safety` emphasizes bias detection and harmful content evaluation. + +### Example + +**Prompt:** +``` +Use the edd_workflow prompt for a RAG application. +``` + +The assistant receives structured guidance for defining RAG-specific evaluation criteria, selecting appropriate benchmarks, running evaluations, and iterating on results. + +### Valid Application Types + +| Type | Focus | +|------|-------| +| `rag` | Retrieval quality, context relevance, answer accuracy | +| `agent` | Task completion, tool use, multi-step reasoning | +| `safety` | Bias detection, harmful content, fairness | +| `classifier` | Classification accuracy, precision, recall | + +## evaluate_model + +Step-by-step guidance for evaluating a model end-to-end: discover available benchmarks, select appropriate ones, submit an evaluation, and review results. + +### Arguments + +| Argument | Required | Description | +|----------|----------|-------------| +| `model_url` | No | URL of the model inference endpoint. If omitted, the assistant asks for it. | +| `benchmark_preferences` | No | Preferences for benchmark selection (e.g. "reasoning", "safety", "general"). If omitted, the assistant helps you choose. 
| + +### Example + +**With model URL:** +``` +Use the evaluate_model prompt with model URL https://llama3.example.com/v1. +``` + +The assistant guides you through benchmark selection and evaluation submission for the specified model. + +**Without model URL:** +``` +Use the evaluate_model prompt. +``` + +The assistant first helps you identify your model endpoint, then proceeds with benchmark discovery and evaluation. + +## compare_runs + +Guidance for comparing two or more evaluation runs side-by-side: select jobs, fetch results, compare metrics, and summarize findings. + +### Arguments + +| Argument | Required | Description | +|----------|----------|-------------| +| `job_ids` | No | Comma-separated list of evaluation job IDs to compare. If omitted, the assistant helps you select jobs. | + +### Example + +**With job IDs:** +``` +Use the compare_runs prompt to compare jobs a1b2c3d4 and e5f6g7h8. +``` + +**Without job IDs:** +``` +Use the compare_runs prompt. +``` + +The assistant fetches the list of completed jobs and helps you select which ones to compare. + +When `job_ids` is provided, it must list at least two job IDs. diff --git a/src/content/docs/mcp/quickstart.mdx b/src/content/docs/mcp/quickstart.mdx new file mode 100644 index 0000000..b6d52f9 --- /dev/null +++ b/src/content/docs/mcp/quickstart.mdx @@ -0,0 +1,259 @@ +--- +title: "Quick Start" +--- + +import { Steps, Tabs, TabItem } from '@astrojs/starlight/components'; + +Connect your AI assistant to EvalHub in under 5 steps. + +## Claude Code + + + + +The stdio transport is simplest for individual use — Claude Code manages the server process automatically. + + + +1. **Install the MCP server** + + ```bash + brew install evalhub-mcp + ``` + + Or [download the binary](/mcp/installation/) for your platform. + +2. **Set your EvalHub credentials** + + ```bash + export EVALHUB_BASE_URL="https://<evalhub-host>" + export EVALHUB_TOKEN="<your-token>" + export EVALHUB_TENANT="<your-tenant>" + ``` + +3. 
**Register the server with Claude Code** + + ```bash + claude mcp add evalhub --transport stdio -- evalhub-mcp + ``` + +4. **Verify the connection** + + ```bash + claude mcp list + ``` + + `evalhub` should appear with transport `stdio` and status available. + +5. **Start using it** + + Open a Claude Code conversation and try: + + ``` + List the available evaluation providers from EvalHub. + ``` + + + + + + +Use HTTP transport when the MCP server should run as a shared service (e.g. on a remote machine or for team use). + + + +1. **Install the MCP server** + + ```bash + brew install evalhub-mcp + ``` + +2. **Start the server** + + ```bash + export EVALHUB_BASE_URL="https://<evalhub-host>" + export EVALHUB_TOKEN="<your-token>" + export EVALHUB_TENANT="<your-tenant>" + + evalhub-mcp --transport http --host localhost --port 3001 + ``` + + For dev environments with self-signed certificates: + + ```bash + evalhub-mcp --transport http --host localhost --port 3001 --insecure + ``` + +3. **Register with Claude Code** (in a separate terminal) + + ```bash + claude mcp add evalhub --transport http http://localhost:3001 + ``` + +4. **Verify the connection** + + ```bash + claude mcp list + ``` + +5. **Start using it** + + Open a Claude Code conversation and try: + + ``` + List the available evaluation providers from EvalHub. + ``` + + + + + + +### Using the EvalHub CLI instead + +If you have the [EvalHub CLI](/guides/cli/) installed, you can use it as the MCP server directly: + +```bash +# Configure a CLI profile for the agent +evalhub --profile agent config set base_url https://<evalhub-host> +evalhub --profile agent config set token <your-token> +evalhub --profile agent config set tenant <your-tenant> + +# Register with Claude Code +claude mcp add evalhub -- evalhub --profile agent mcp +``` + +## VS Code / GitHub Copilot + + + + + + +1. **Install the MCP server** + + ```bash + brew install evalhub-mcp + ``` + +2. 
**Set your EvalHub credentials** + + ```bash + export EVALHUB_BASE_URL="https://<evalhub-host>" + export EVALHUB_TOKEN="<your-token>" + export EVALHUB_TENANT="<your-tenant>" + ``` + +3. **Add to VS Code settings** + + Open your VS Code `settings.json` (Cmd/Ctrl+Shift+P → "Preferences: Open User Settings (JSON)") and add: + + ```json + { + "mcp": { + "servers": { + "evalhub": { + "command": "evalhub-mcp", + "args": [], + "env": { + "EVALHUB_BASE_URL": "https://<evalhub-host>", + "EVALHUB_TOKEN": "<your-token>", + "EVALHUB_TENANT": "<your-tenant>" + } + } + } + } + } + ``` + +4. **Reload VS Code** + + Restart the window or run "Developer: Reload Window" from the command palette. + +5. **Start using it** + + In GitHub Copilot Chat, ask: + + ``` + @evalhub List the available evaluation providers. + ``` + + + + + + + + +1. **Install and start the MCP server** + + ```bash + export EVALHUB_BASE_URL="https://<evalhub-host>" + export EVALHUB_TOKEN="<your-token>" + export EVALHUB_TENANT="<your-tenant>" + + evalhub-mcp --transport http --host localhost --port 3001 + ``` + +2. **Add to VS Code settings** + + ```json + { + "mcp": { + "servers": { + "evalhub": { + "url": "http://localhost:3001" + } + } + } + } + ``` + +3. **Reload VS Code** + + Restart the window or run "Developer: Reload Window". + +4. **Verify** + + Open GitHub Copilot Chat and ask about available tools. + +5. **Start using it** + + ``` + @evalhub List the available benchmarks. + ``` + + + + + + +## What to Try Next + +Once connected, try these example prompts with your AI assistant: + +**Browse what's available:** +``` +Show me all evaluation providers and their benchmarks. +``` + +**Submit an evaluation:** +``` +Submit an evaluation named "my-first-eval" using the leaderboard-v2 collection +against my model at http://localhost:11434/v1 named qwen2.5:1.5b. +``` + +**Check status:** +``` +What's the status of my evaluation job? +``` + +**Follow a structured workflow:** +``` +Use the edd_workflow prompt for a RAG application. 
+``` + +For the full list of available tools, resources, and prompts, see the reference pages: +- [Tools](/mcp/tools/) +- [Resources](/mcp/resources/) +- [Prompts](/mcp/prompts/) diff --git a/src/content/docs/mcp/resources.md b/src/content/docs/mcp/resources.md new file mode 100644 index 0000000..378ff97 --- /dev/null +++ b/src/content/docs/mcp/resources.md @@ -0,0 +1,204 @@ +--- +title: "Resource Reference" +--- + +MCP resources are read-only data endpoints that the AI assistant can query. All EvalHub resources use the `evalhub://` URI scheme. + +## Providers + +### List all providers + +| | | +|---|---| +| **URI** | `evalhub://providers` | +| **Description** | List all registered evaluation providers | +| **Pagination** | `?limit=N&offset=N` | + +**Example response:** +```json +[ + { + "resource": { "id": "lm_evaluation_harness" }, + "name": "LM Evaluation Harness", + "description": "EleutherAI language model evaluation framework" + }, + { + "resource": { "id": "guidellm" }, + "name": "GuideLLM", + "description": "Performance benchmarking" + } +] +``` + +### Get a provider by ID + +| | | +|---|---| +| **URI** | `evalhub://providers/{id}` | +| **Description** | Get a specific evaluation provider and its details | + +**Example:** `evalhub://providers/lm_evaluation_harness` + +## Benchmarks + +### List all benchmarks + +| | | +|---|---| +| **URI** | `evalhub://benchmarks` | +| **Description** | List all benchmarks across all providers | + +### Filter by label + +| | | +|---|---| +| **URI** | `evalhub://benchmarks?label=