Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
89 changes: 89 additions & 0 deletions .github/workflows/integration-tests.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
# Manually dispatched workflow for the integration_tests crate.
# It has no push/pull_request trigger: it only runs via "Run workflow".
name: Integration Tests

on:
workflow_dispatch:
inputs:
# Boolean toggle shown in the "Run workflow" dialog; gates the e2e job below.
run_e2e:
description: 'Also run Tier 3 E2E tests (requires OPENAI_API_KEY secret)'
type: boolean
default: false

env:
# Force coloured cargo output in the job logs.
CARGO_TERM_COLOR: always

jobs:
# Tier 1+2 job: builds the runtime in debug mode, then runs the default
# (non-ignored) tests of the integration_tests crate against that binary.
integration:
name: Tier 1+2 Integration Tests
runs-on: ubuntu-latest

steps:
- name: Checkout repository
uses: actions/checkout@v4

- name: Install Rust toolchain
uses: dtolnay/rust-toolchain@stable

# Caches the cargo registry/git checkouts plus both target directories
# (root crate and integration_tests crate).
# NOTE(review): .gitignore appears to list Cargo.lock. If the lockfile is not
# committed, hashFiles('**/Cargo.lock') evaluates to an empty string and this
# key never changes, so the cache is never refreshed — confirm.
- name: Cache cargo registry and build
uses: actions/cache@v4
with:
path: |
~/.cargo/registry
~/.cargo/git
target
integration_tests/target
key: ${{ runner.os }}-integration-${{ hashFiles('**/Cargo.lock') }}
restore-keys: ${{ runner.os }}-integration-

# protoc is installed from the distribution package before building.
- name: Install protoc
run: |
sudo apt-get update
sudo apt-get install -y protobuf-compiler

# Debug build; the test step below points MACP_TEST_BINARY at target/debug.
- name: Build runtime binary
run: cargo build

# Runs from integration_tests/, so the binary path is relative to that directory.
# --test-threads=1 makes the tests run one at a time.
- name: Run Tier 1+2 integration tests
working-directory: integration_tests
env:
MACP_TEST_BINARY: ../target/debug/macp-runtime
run: cargo test -- --test-threads=1
timeout-minutes: 10

# Tier 3 job: same setup as the integration job, but only runs when the
# run_e2e dispatch input was ticked, and passes the OpenAI key to the tests.
e2e:
name: Tier 3 E2E Tests
runs-on: ubuntu-latest
# github.event.inputs values are strings, hence the comparison against 'true'.
if: github.event.inputs.run_e2e == 'true'

steps:
- name: Checkout repository
uses: actions/checkout@v4

- name: Install Rust toolchain
uses: dtolnay/rust-toolchain@stable

# Same cache paths and key as the integration job, so both jobs share entries.
# NOTE(review): see the Cargo.lock caveat — if the lockfile is not committed,
# hashFiles('**/Cargo.lock') is empty and the key is constant — confirm.
- name: Cache cargo registry and build
uses: actions/cache@v4
with:
path: |
~/.cargo/registry
~/.cargo/git
target
integration_tests/target
key: ${{ runner.os }}-integration-${{ hashFiles('**/Cargo.lock') }}
restore-keys: ${{ runner.os }}-integration-

- name: Install protoc
run: |
sudo apt-get update
sudo apt-get install -y protobuf-compiler

- name: Build runtime binary
run: cargo build

# --ignored selects only tests marked #[ignore]; it is not restricted to a
# single test target, so every test binary in the crate is built.
- name: Run Tier 3 E2E tests
working-directory: integration_tests
env:
MACP_TEST_BINARY: ../target/debug/macp-runtime
OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
run: cargo test -- --ignored --test-threads=1
timeout-minutes: 10
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,9 @@ Cargo.lock
/temp/
CLAUDE.md

# Integration tests build artifacts
integration_tests/target/

# OS
.DS_Store
Thumbs.db
Expand Down
15 changes: 14 additions & 1 deletion Makefile
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
.PHONY: setup build test test-integration test-conformance test-all fmt clippy check audit coverage sync-protos sync-protos-local check-protos
.PHONY: setup build test test-integration test-conformance test-all fmt clippy check audit coverage sync-protos sync-protos-local check-protos test-integration-grpc test-integration-agents test-integration-e2e test-integration-hosted

SPEC_PROTO_DIR := ../multiagentcoordinationprotocol/schemas/proto
PROTO_FILES := macp/v1/envelope.proto macp/v1/core.proto macp/modes/decision/v1/decision.proto macp/modes/proposal/v1/proposal.proto macp/modes/task/v1/task.proto macp/modes/handoff/v1/handoff.proto macp/modes/quorum/v1/quorum.proto
Expand Down Expand Up @@ -55,6 +55,19 @@ sync-protos-local:
done
@echo "Done. Run 'git diff proto/' to review changes."

## Integration tests (gRPC, Rig agents)
# Tier 1: runs only the `tier1` test target, one test at a time.
test-integration-grpc:
cd integration_tests && cargo test --test tier1 -- --test-threads=1

# Tier 2: runs only the `tier2` test target, one test at a time.
test-integration-agents:
cd integration_tests && cargo test --test tier2 -- --test-threads=1

# Runs only tests marked #[ignore], across all test targets of the crate.
# OPENAI_API_KEY is expected in the caller's environment; it is not set here.
test-integration-e2e:
cd integration_tests && cargo test -- --ignored --test-threads=1

# Runs the default (non-ignored) tests of every target. The target itself does
# not set or check MACP_TEST_ENDPOINT; the caller must export it.
test-integration-hosted:
cd integration_tests && cargo test -- --test-threads=1

## Check if local protos match BSR
check-protos:
@TMPDIR=$$(mktemp -d); \
Expand Down
24 changes: 24 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -279,12 +279,36 @@ The runtime requires write access to `MACP_DATA_DIR`. Check directory permission
**Proto drift / `make check-protos` failure**
Run `make sync-protos` to update local proto files from BSR.

## Testing

```bash
cargo test --all-targets # Unit tests + Rust integration tests
make test-conformance # JSON fixture-driven conformance suite
```

A separate integration test crate (`integration_tests/`) tests the runtime through the real gRPC boundary:

```bash
cargo build
cd integration_tests
MACP_TEST_BINARY=../target/debug/macp-runtime cargo test -- --test-threads=1
```

The integration suite has three tiers:

- **Tier 1 (Protocol)** — 47 scripted gRPC tests covering all modes, error paths, signals, version binding, dedup, and RFC cross-cutting features
- **Tier 2 (Rig Tools)** — 5 tests using [Rig](https://rig.rs) agent framework `Tool` implementations for all MACP operations
- **Tier 3 (E2E)** — 3 tests with real OpenAI GPT-4o-mini agents coordinating through the runtime (requires `OPENAI_API_KEY`)

See `docs/testing.md` for full details on running locally, in CI, or against a hosted runtime.

## Development notes

- The RFC/spec repository remains the normative source for protocol semantics.
- Five standards-track modes use the canonical `macp.mode.*` identifiers.
- `multi_round` is a built-in extension (`ext.multi_round.v1`) — not standards-track, but ships with the runtime and enforces strict `SessionStart`.
- Extension modes can be dynamically registered, unregistered, and promoted via `RegisterExtMode`, `UnregisterExtMode`, and `PromoteMode` RPCs.
- `StreamSession` is enabled and binds one gRPC stream to one session, emitting accepted envelopes in order.
- `WatchSignals` broadcasts ambient Signal envelopes to all subscribers in real time.

See `docs/README.md` and `docs/examples.md` for the updated local development and usage guidance.
2 changes: 2 additions & 0 deletions docs/protocol.md
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ Clients should call `Initialize` before using the runtime.
- `ListRoots`
- `WatchModeRegistry`
- `WatchRoots`
- `WatchSignals`
- `ListExtModes`
- `RegisterExtMode`
- `UnregisterExtMode`
Expand All @@ -31,6 +32,7 @@ Clients should call `Initialize` before using the runtime.

- `WatchModeRegistry` — sends the current registry state, then fires `RegistryChanged` on register/unregister/promote
- `WatchRoots` — sends the current roots state, then holds the stream open
- `WatchSignals` — broadcasts ambient Signal envelopes to all subscribers in real time; Signals correlate with sessions via `SignalPayload.correlation_session_id` but do not enter session history

## Extension mode lifecycle RPCs

Expand Down
131 changes: 131 additions & 0 deletions docs/testing.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,131 @@
# Testing

The runtime has three levels of tests, plus a separate integration test crate that exercises the gRPC boundary with real agents.

## Unit tests and conformance

```bash
cargo test --all-targets # unit tests + Rust integration tests
make test-conformance # JSON fixture-driven conformance suite
make test-all # fmt → clippy → test → integration → conformance
```

Unit tests live inside `src/` modules (`#[cfg(test)]`). Conformance fixtures are in `tests/conformance/` and exercise each mode's happy path and reject paths from JSON definitions.

## Integration test suite

A separate Rust crate at `integration_tests/` tests the runtime through the real gRPC transport boundary. It is **not** part of the main Cargo build — `cargo build --release` ignores it entirely.

### Architecture

```
integration_tests/
Cargo.toml # Depends on macp-runtime (lib) + rig-core + tonic
src/
config.rs # Test target configuration (local / CI / hosted)
server_manager.rs # Start/stop runtime as a subprocess on a free port
helpers.rs # Envelope builders, payload helpers, gRPC wrappers
macp_tools/ # Rig Tool implementations for all MACP operations
tests/
tier1.rs → tier1_protocol/ # Scripted gRPC protocol tests
tier2.rs → tier2_agents/ # Rig agent tool tests (no LLM)
tier3.rs → tier3_e2e/ # Real OpenAI LLM agent tests
```

### Three tiers

| Tier | What | LLM | Tests | Speed |
|------|------|-----|-------|-------|
| **Tier 1: Protocol** | Scripted gRPC calls testing all modes, error paths, RFC cross-cutting features (signals, dedup, version binding, cancel auth) | None | 47 | <1s |
| **Tier 2: Rig Tools** | MACP operations as Rig `Tool` trait implementations, invoked via `ToolSet::call()` | None | 5 | <1s |
| **Tier 3: E2E** | Real GPT-4o-mini agents coordinating through the runtime. Orchestrator as plain code, specialists as LLM. Parallel execution. Signals on ambient plane. | OpenAI | 3 | ~15s |

### Running integration tests

```bash
# Build the runtime first
cargo build

# Run Tier 1 + 2 (no API keys needed)
cd integration_tests
MACP_TEST_BINARY=../target/debug/macp-runtime cargo test -- --test-threads=1

# Run individual tiers
MACP_TEST_BINARY=../target/debug/macp-runtime cargo test --test tier1 -- --test-threads=1
MACP_TEST_BINARY=../target/debug/macp-runtime cargo test --test tier2 -- --test-threads=1

# Run Tier 3 E2E (requires OPENAI_API_KEY)
OPENAI_API_KEY=sk-... MACP_TEST_BINARY=../target/debug/macp-runtime cargo test --test tier3 -- --ignored --test-threads=1

# Run against a hosted runtime (no local server started)
MACP_TEST_ENDPOINT=host:50051 cargo test -- --test-threads=1
```

Or use Makefile targets from the project root:

```bash
make test-integration-grpc # Tier 1
make test-integration-agents # Tier 2
make test-integration-e2e # Tier 3 (needs OPENAI_API_KEY)
make test-integration-hosted # Tier 1 + 2 against MACP_TEST_ENDPOINT (Tier 3 additionally needs --ignored)
```

### Configuration

| Variable | Purpose | Default |
|----------|---------|---------|
| `MACP_TEST_BINARY` | Path to runtime binary (skip cargo build) | Builds from parent crate |
| `MACP_TEST_ENDPOINT` | Connect to hosted runtime (skip server start) | Start local server |
| `MACP_TEST_TLS` | Use TLS for hosted connection | `0` |
| `MACP_TEST_AUTH_TOKEN` | Bearer token for hosted runtime | Dev headers |
| `OPENAI_API_KEY` | Required for Tier 3 E2E tests | Tier 3 tests skip if unset |

### Tier 1 coverage

Protocol tests exercise every mode through gRPC:

- **Initialize**: protocol negotiation, version rejection, runtime info
- **Decision mode**: happy path, duplicate dedup, non-initiator commit rejection
- **Proposal mode**: happy path, premature commitment rejection
- **Task mode**: happy path, non-initiator request rejection, duplicate task rejection
- **Handoff mode**: happy path, accept-without-offer rejection
- **Quorum mode**: happy path, approve-before-request, premature commitment
- **Multi-round mode**: happy path, pre-convergence commit rejection
- **Signals**: valid signal accepted, session_id/mode violations rejected, WatchSignals broadcast
- **Version binding**: commitment with wrong mode_version/config_version rejected
- **Deduplication**: rejected messages don't consume dedup slots, duplicate SessionStart rejected
- **CancelSession**: non-initiator rejection
- **Session lifecycle**: TTL expiry, concurrent sessions, parallel session independence
- **Mode registry**: list/register/unregister extension modes
- **Discovery**: GetManifest returns all modes, Initialize rejects unsupported version

### Tier 2: Rig agent tools

Each MACP operation (start session, propose, vote, commit, etc.) is implemented as a Rig `Tool` trait. Tier 2 tests validate these tools work correctly by calling them through `ToolSet::call()` — the same interface an LLM agent would use. Tests cover all 5 standard modes.

### Tier 3: E2E with real LLM

Three tests use real OpenAI GPT-4o-mini agents:

1. **Decision with signals**: Orchestrator (code) proposes → 3 specialist LLMs evaluate in parallel → each sends progress/completed Signals on the ambient plane → orchestrator commits. Demonstrates both coordination plane and ambient plane simultaneously.

2. **Decision**: Same as above without signals — simpler version.

3. **Task delegation**: Planner (code) creates task → Worker (LLM) accepts and completes → planner commits.

Architecture follows the RFC:
- Orchestrator/planner operations are **plain code** (deterministic, no LLM needed)
- Specialist/worker reasoning uses **real LLM** (where domain expertise matters)
- Agents run **in parallel** (runtime serializes by acceptance order)
- LLM reasoning happens **outside the session** (ambient plane)
- Only the resulting Envelope enters the session

### CI/CD

Integration tests run via manual GitHub Actions dispatch (not on every PR):

```
Actions → "Integration Tests" → Run workflow → optionally check "Run Tier 3 E2E"
```

Tier 3 E2E requires the `OPENAI_API_KEY` repository secret.
25 changes: 25 additions & 0 deletions integration_tests/Cargo.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
# Standalone test crate; `publish = false` keeps it from being published.
[package]
name = "macp-integration-tests"
version = "0.0.0"
edition = "2021"
publish = false

[dependencies]
# The runtime under test, taken from the parent directory as a path dependency.
macp-runtime = { path = ".." }

# gRPC client stack.
tonic = { version = "0.14", features = ["transport"] }
prost = "0.14"

# NOTE(review): tokio's `full` feature already enables `process`; the explicit
# entry looks redundant but harmless — confirm before removing.
tokio = { version = "1", features = ["full", "process"] }

# Rig agent framework.
rig-core = "0.34"

# General-purpose helpers.
uuid = { version = "1", features = ["v4"] }
chrono = "0.4"
serde = { version = "1", features = ["derive"] }
serde_json = "1"
anyhow = "1"
thiserror = "1"
tracing = "0.1"
tracing-subscriber = { version = "0.3", features = ["env-filter"] }
async-trait = "0.1"
39 changes: 39 additions & 0 deletions integration_tests/src/config.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
use std::env;

/// Configuration for the integration test target.
///
/// Supports three modes:
/// - **Local dev**: no env vars — builds parent crate, starts server on free port
/// - **CI**: `MACP_TEST_BINARY` set — uses pre-built binary, starts server
/// - **Hosted**: `MACP_TEST_ENDPOINT` set — connects directly, no server management
///
/// String-valued variables that are set but empty (or whitespace-only) are
/// treated as unset, so a blank value injected by CI does not silently switch
/// the mode.
#[derive(Clone, Default, PartialEq, Eq)]
pub struct TestConfig {
    /// gRPC endpoint to connect to (e.g. "http://127.0.0.1:50051")
    pub endpoint: Option<String>,
    /// Use TLS for the connection
    pub use_tls: bool,
    /// Bearer token for hosted runtime authentication
    pub auth_token: Option<String>,
    /// Path to a pre-built runtime binary
    pub binary_path: Option<String>,
}

/// Hand-written so the bearer token never ends up in logs or assertion output.
impl std::fmt::Debug for TestConfig {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.debug_struct("TestConfig")
            .field("endpoint", &self.endpoint)
            .field("use_tls", &self.use_tls)
            .field("auth_token", &self.auth_token.as_ref().map(|_| "<redacted>"))
            .field("binary_path", &self.binary_path)
            .finish()
    }
}

impl TestConfig {
    /// Builds the configuration from the process environment.
    ///
    /// Reads `MACP_TEST_ENDPOINT`, `MACP_TEST_TLS`, `MACP_TEST_AUTH_TOKEN` and
    /// `MACP_TEST_BINARY`. Variables that are missing or not valid Unicode are
    /// treated as unset. TLS is enabled only when `MACP_TEST_TLS` is exactly `1`.
    pub fn from_env() -> Self {
        Self::from_lookup(|key| env::var(key).ok())
    }

    /// Builds the configuration from an arbitrary key → value lookup.
    ///
    /// Kept separate from [`TestConfig::from_env`] so the parsing rules can be
    /// exercised without mutating the process environment.
    fn from_lookup(lookup: impl Fn(&str) -> Option<String>) -> Self {
        // A blank value carries no information; treat it the same as "not set".
        let non_empty = |key: &str| lookup(key).filter(|value| !value.trim().is_empty());

        Self {
            endpoint: non_empty("MACP_TEST_ENDPOINT"),
            use_tls: lookup("MACP_TEST_TLS").as_deref() == Some("1"),
            auth_token: non_empty("MACP_TEST_AUTH_TOKEN"),
            binary_path: non_empty("MACP_TEST_BINARY"),
        }
    }

    /// Whether we need to start a local server (no external endpoint provided).
    pub fn needs_local_server(&self) -> bool {
        self.endpoint.is_none()
    }

    /// Whether to use dev-mode headers (x-macp-agent-id) instead of bearer tokens.
    pub fn use_dev_headers(&self) -> bool {
        self.auth_token.is_none()
    }
}
Loading
Loading