Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 21 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -121,6 +121,7 @@ kubeflow-mcp serve \
--instruction-tier full \ # full | compact | minimal
--transport stdio \ # stdio | http | sse
--auth-token SECRET \ # bearer token for HTTP auth (dev/staging)
--otel-endpoint URL \ # OTLP HTTP endpoint (optional tracing)
--log-level INFO \ # DEBUG | INFO | WARNING | ERROR
--log-format console \ # console | json (auto-detected if omitted)
--no-banner # suppress startup banner
Expand Down Expand Up @@ -165,6 +166,26 @@ kubeflow-mcp agent \

</details>

## Observability

OpenTelemetry tracing is optional and can be enabled without changing tool code.

- Install optional dependencies: `pip install ".[otel]"`
- Enable tracing with CLI flag or env var:

```bash
kubeflow-mcp serve --otel-endpoint http://localhost:4318
# or
export OTEL_EXPORTER_OTLP_ENDPOINT=http://localhost:4318
kubeflow-mcp serve
Comment on lines +173 to +180
```

Each tool invocation emits a span with attributes:
`tool.name`, `tool.args_preview`, `tool.success`, `tool.duration_ms`, `kubeflow.persona`, and `correlation_id`.

> **Note:** `kubeflow-mcp agent --otel-endpoint ...` emits spans under a separate
> `kubeflow-mcp-agent` service in Jaeger, distinct from the `kubeflow-mcp` server spans.

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Worth a one-liner noting that kubeflow-mcp agent --otel-endpoint ... emits a separate kubeflow-mcp-agent service in Jaeger. Without this, users running the agent will wonder why they only see server-side spans and think tracing is broken.

## Development

```bash
Expand Down
12 changes: 12 additions & 0 deletions kubeflow_mcp/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,12 @@ def cli() -> None:
"Falls back to KUBEFLOW_MCP_AUTH_TOKEN env var, config file. "
"Ignored for stdio transport.",
)
@click.option(
"--otel-endpoint",
default=None,
help="OpenTelemetry OTLP HTTP endpoint for tracing. "
"Falls back to OTEL_EXPORTER_OTLP_ENDPOINT env var, config file.",
)
def serve(
clients: str | None,
persona: str | None,
Expand All @@ -104,6 +110,7 @@ def serve(
instruction_tier: str | None,
no_banner: bool,
auth_token: str | None,
otel_endpoint: str | None,
) -> None:
"""Start the MCP server.

Expand All @@ -116,6 +123,7 @@ def serve(
from kubeflow_mcp.core.logging import setup_logging
from kubeflow_mcp.core.resilience import configure_circuit_breaker
from kubeflow_mcp.core.server import configure_resilience, create_server
from kubeflow_mcp.core.telemetry import setup_tracing

cfg = load_config()

Expand All @@ -128,8 +136,11 @@ def serve(

if auth_token:
cfg.auth.auth_token = auth_token
if otel_endpoint:
cfg.observability.otel_endpoint = otel_endpoint

logger = setup_logging(level=log_level, format=log_format)
tracing_enabled = setup_tracing(endpoint=cfg.observability.otel_endpoint)
logger.info(
"Starting kubeflow-mcp",
extra={
Expand All @@ -138,6 +149,7 @@ def serve(
"transport": transport,
"mode": mode,
"instruction_tier": instruction_tier,
"tracing_enabled": tracing_enabled,
},
)

Expand Down
41 changes: 41 additions & 0 deletions kubeflow_mcp/cli_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,7 @@ def _make_serve_mocks(config=None):
mock_load_config = MagicMock(return_value=config)
mock_build_auth_provider = MagicMock(return_value=None)
mock_configure_circuit_breaker = MagicMock()
mock_setup_tracing = MagicMock(return_value=False)

fake_server_mod = MagicMock()
fake_server_mod.create_server = mock_create_server
Expand All @@ -109,13 +110,16 @@ def _make_serve_mocks(config=None):
fake_auth_mod.build_auth_provider = mock_build_auth_provider
fake_resilience_mod = MagicMock()
fake_resilience_mod.configure_circuit_breaker = mock_configure_circuit_breaker
fake_telemetry_mod = MagicMock()
fake_telemetry_mod.setup_tracing = mock_setup_tracing

modules_patch = {
"kubeflow_mcp.core.server": fake_server_mod,
"kubeflow_mcp.core.logging": fake_logging_mod,
"kubeflow_mcp.core.config": fake_config_mod,
"kubeflow_mcp.core.auth": fake_auth_mod,
"kubeflow_mcp.core.resilience": fake_resilience_mod,
"kubeflow_mcp.core.telemetry": fake_telemetry_mod,
}
return mock_server, mock_create_server, modules_patch

Expand Down Expand Up @@ -401,3 +405,40 @@ def test_serve_default_shows_banner():

_, kwargs = mock_server.run.call_args
assert kwargs.get("show_banner") is True


def test_serve_calls_setup_tracing_with_config_endpoint():
from kubeflow_mcp.core.config import ObservabilityConfig

config = _make_default_config()
config.observability = ObservabilityConfig(otel_endpoint="http://otel-collector:4318/v1/traces")
mock_server, _, modules_patch = _make_serve_mocks(config=config)
fake_telemetry_mod = modules_patch["kubeflow_mcp.core.telemetry"]

with patch.dict(sys.modules, modules_patch):
runner = CliRunner()
runner.invoke(cli, ["serve"])

fake_telemetry_mod.setup_tracing.assert_called_once_with(
endpoint="http://otel-collector:4318/v1/traces"
)


def test_serve_otel_endpoint_cli_overrides_config():
from kubeflow_mcp.core.config import ObservabilityConfig

config = _make_default_config()
config.observability = ObservabilityConfig(otel_endpoint="http://old-endpoint:4318/v1/traces")
mock_server, _, modules_patch = _make_serve_mocks(config=config)
fake_telemetry_mod = modules_patch["kubeflow_mcp.core.telemetry"]

with patch.dict(sys.modules, modules_patch):
runner = CliRunner()
runner.invoke(
cli,
["serve", "--otel-endpoint", "http://new-endpoint:4318/v1/traces"],
)

fake_telemetry_mod.setup_tracing.assert_called_once_with(
endpoint="http://new-endpoint:4318/v1/traces"
)
19 changes: 19 additions & 0 deletions kubeflow_mcp/core/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,9 @@
level: INFO
format: json

observability:
otel_endpoint: http://localhost:4318

Namespace restrictions are enforced via ``~/.kf-mcp-policy.yaml``
(``policy.namespaces``), not through server config.
"""
Expand Down Expand Up @@ -135,6 +138,12 @@ class LoggingConfig(BaseModel):
format: str | None = Field(default=None)


class ObservabilityConfig(BaseModel):
"""Observability configuration."""

otel_endpoint: str | None = Field(default=None)


class Config(BaseModel):
"""Root configuration."""

Expand All @@ -144,6 +153,7 @@ class Config(BaseModel):
trainer: TrainerConfig = Field(default_factory=TrainerConfig)
optimizer: OptimizerConfig = Field(default_factory=OptimizerConfig)
logging: LoggingConfig = Field(default_factory=LoggingConfig)
observability: ObservabilityConfig = Field(default_factory=ObservabilityConfig)


def _find_config_file() -> Path | None:
Expand Down Expand Up @@ -229,6 +239,14 @@ def load_config(config_path: Path | None = None) -> Config:
format=os.getenv("LOG_FORMAT", logging_file.get("format")),
)

observability_file = file_config.get("observability", {})
observability = ObservabilityConfig(
otel_endpoint=os.getenv(
"OTEL_EXPORTER_OTLP_ENDPOINT",
observability_file.get("otel_endpoint"),
)
)
Comment on lines +242 to +248

# Build client-specific configs
trainer_file = file_config.get("trainer", {})
trainer = TrainerConfig(
Expand Down Expand Up @@ -283,6 +301,7 @@ def load_config(config_path: Path | None = None) -> Config:
trainer=trainer,
optimizer=optimizer,
logging=logging_config,
observability=observability,
)


Expand Down
2 changes: 1 addition & 1 deletion kubeflow_mcp/core/logging.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ def format(self, record: logging.LogRecord) -> str:
if ctx is not None:
log_dict["context"] = ctx

extra_keys = {"audit", "tool", "parameters", "success", "duration_ms"}
extra_keys = {"audit", "tool", "parameters", "success", "duration_ms", "tracing_enabled"}
for key in extra_keys:
if hasattr(record, key):
log_dict[key] = getattr(record, key)
Expand Down
Loading