From 1f6014b385eaca6586e867f49f7092a503ea85ad Mon Sep 17 00:00:00 2001 From: JuliaEdom Date: Mon, 29 Jun 2026 22:59:56 +0300 Subject: [PATCH] fix(egress): allowlist the e2e/staging webhook callback host (A1) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The SSRF egress guard (audit_28_06_26.md #2) resolves every webhook/alert target and rejects any host resolving to a private/loopback/link-local address. The E2E and Staging deployments deliver the test webhook to the host gateway (host.docker.internal) / the in-pod loopback relay (127.0.0.1), both of which the guard correctly rejects — so POST /v1/webhooks returned 400 and tests/e2e/test_smoke.py::test_webhook_test_endpoint_delivers_callback failed in *both* the E2E Tests and Staging Deploy workflows (not required checks, so main went red unnoticed). Add an opt-in allowlist, AGENTFLOW_EGRESS_ALLOWED_HOSTS (comma-separated, exact hostnames, case-insensitive, default empty). When the URL host is on the list the public-address check is waived; the http(s) scheme check still applies and production keeps the full guard because nothing sets the env. Wire the controlled callback host into each deployment that needs it: - docker-compose.e2e.yml -> the configured callback host (host.docker.internal) - scripts/k8s_staging_up.sh -> 127.0.0.1, alongside the loopback relay setup - tests/e2e/conftest.py -> local uvicorn (127.0.0.1) and compose-override paths Also add workflow_dispatch to staging-deploy.yml so the fix can be verified on a branch before merging (mirrors e2e.yml). Regression tests in tests/unit/test_egress_guard.py prove the allowlist permits the listed host, stays case-insensitive, still rejects bad schemes and non-listed private hosts, and that an empty allowlist preserves the loopback rejection. Co-Authored-By: Claude Opus 4.8 (1M context) --- .github/workflows/staging-deploy.yml | 1 + docker-compose.e2e.yml | 5 +++ scripts/k8s_staging_up.sh | 8 +++- src/serving/api/egress_guard.py | 23 +++++++++++ tests/e2e/conftest.py | 8 ++++ tests/e2e/test_ci_compose_config.py | 14 +++++++ tests/unit/test_egress_guard.py | 58 ++++++++++++++++++++++++++++ 7 files changed, 116 insertions(+), 1 deletion(-) diff --git a/.github/workflows/staging-deploy.yml b/.github/workflows/staging-deploy.yml index 7d6cdd2..17dd771 100644 --- a/.github/workflows/staging-deploy.yml +++ b/.github/workflows/staging-deploy.yml @@ -3,6 +3,7 @@ name: Staging Deploy on: push: branches: [main] + workflow_dispatch: permissions: contents: read diff --git a/docker-compose.e2e.yml b/docker-compose.e2e.yml index 2691718..421d4eb 100644 --- a/docker-compose.e2e.yml +++ b/docker-compose.e2e.yml @@ -86,6 +86,11 @@ services: AGENTFLOW_RATE_LIMIT_RPM: ${AGENTFLOW_RATE_LIMIT_RPM:-120} AGENTFLOW_USAGE_DB_PATH: /app/data/agentflow_api_usage.duckdb AGENTFLOW_WEBHOOKS_FILE: /app/data/e2e-webhooks.yaml + # The e2e webhook callback is delivered to the host gateway, which the + # SSRF egress guard rejects as a private address. Trust exactly that + # configured callback host (default host.docker.internal) — the guard + # stays fully active for every other target. + AGENTFLOW_EGRESS_ALLOWED_HOSTS: ${AGENTFLOW_E2E_CALLBACK_HOST:-host.docker.internal} DUCKDB_PATH: /app/data/agentflow.duckdb REDIS_URL: redis://redis:6379/0 KAFKA_BOOTSTRAP_SERVERS: kafka:9092 diff --git a/scripts/k8s_staging_up.sh b/scripts/k8s_staging_up.sh index 9f12019..bd3b620 100644 --- a/scripts/k8s_staging_up.sh +++ b/scripts/k8s_staging_up.sh @@ -208,11 +208,17 @@ helm upgrade --install "$RELEASE_NAME" "$ROOT_DIR/helm/agentflow" \ --debug echo "==> Enabling host loopback relay for webhook callbacks..." +# The relay listens on 127.0.0.1 inside the pod and forwards to the host +# gateway, so the E2E webhook callback URL is http://127.0.0.1:/callback. +# The SSRF egress guard rejects 127.0.0.1 as loopback by default; allowlist +# exactly the relay loopback here (ephemeral staging only) so the webhook +# delivery test passes without weakening the guard for real targets. kubectl set env "deployment/$RELEASE_NAME" \ --namespace "$NAMESPACE" \ HOST_LOOPBACK_PROXY_TARGET="$HOST_LOOPBACK_PROXY_TARGET" \ HOST_LOOPBACK_PROXY_RANGE_START="$HOST_LOOPBACK_PROXY_RANGE_START" \ - HOST_LOOPBACK_PROXY_RANGE_END="$HOST_LOOPBACK_PROXY_RANGE_END" + HOST_LOOPBACK_PROXY_RANGE_END="$HOST_LOOPBACK_PROXY_RANGE_END" \ + AGENTFLOW_EGRESS_ALLOWED_HOSTS="127.0.0.1" kubectl patch deployment "$RELEASE_NAME" \ --namespace "$NAMESPACE" \ diff --git a/src/serving/api/egress_guard.py b/src/serving/api/egress_guard.py index adc4964..680c635 100644 --- a/src/serving/api/egress_guard.py +++ b/src/serving/api/egress_guard.py @@ -10,15 +10,33 @@ registration time (reject early, 4xx) and immediately before each delivery (narrowing the DNS-rebinding window — a name that resolved public at creation could later point at an internal IP). + +``AGENTFLOW_EGRESS_ALLOWED_HOSTS`` is an opt-in escape hatch (default empty, so +production keeps the full guard): a comma-separated allowlist of exact +hostnames a controlled deployment trusts even though they resolve to a +private/loopback address — e.g. the ``host.docker.internal`` gateway the e2e +compose stack delivers to, or the ``127.0.0.1`` relay the staging kind cluster +stands up. It never relaxes the guard for tenant traffic, only for hosts the +operator explicitly listed. """ from __future__ import annotations import ipaddress +import os import socket from urllib.parse import urlsplit _ALLOWED_SCHEMES = {"http", "https"} +_ALLOWLIST_ENV = "AGENTFLOW_EGRESS_ALLOWED_HOSTS" + + +def _allowed_hosts() -> frozenset[str]: + """Return the operator-configured opt-in allowlist (exact hostnames, + lower-cased). Read per call so deployments can set it via the environment + without an import-time freeze; empty by default.""" + raw = os.getenv(_ALLOWLIST_ENV, "") + return frozenset(host.strip().lower() for host in raw.split(",") if host.strip()) class UnsafeEgressURLError(ValueError): @@ -53,6 +71,11 @@ def validate_public_url(url: str) -> None: host = parts.hostname if not host: raise UnsafeEgressURLError("missing host") + if host.lower() in _allowed_hosts(): + # Operator explicitly trusts this host (controlled test/relay target). + # The scheme check above still applies; only the public-address check + # is waived. + return port = parts.port or (443 if scheme == "https" else 80) try: infos = socket.getaddrinfo(host, port, proto=socket.IPPROTO_TCP) diff --git a/tests/e2e/conftest.py b/tests/e2e/conftest.py index 5326b3b..240b97b 100644 --- a/tests/e2e/conftest.py +++ b/tests/e2e/conftest.py @@ -115,6 +115,11 @@ def _write_compose_override(path: Path, host_port: int) -> None: ' AGENTFLOW_RATE_LIMIT_RPM: "120"', " AGENTFLOW_USAGE_DB_PATH: /app/data/agentflow_api_usage.duckdb", " AGENTFLOW_WEBHOOKS_FILE: /app/data/e2e-webhooks.yaml", + # Trust the configured callback host so the SSRF egress guard permits + # the webhook delivery to the host gateway (it resolves to a private + # address). Tracks _compose_callback_host(): host.docker.internal on + # Linux, host.lima.internal on macOS, or an explicit override. + f' AGENTFLOW_EGRESS_ALLOWED_HOSTS: "{_compose_callback_host()}"', ' OTEL_SDK_DISABLED: "true"', " ports:", f' - "127.0.0.1:{host_port}:8000"', @@ -220,6 +225,9 @@ def _start_local_api(tmp_path: Path) -> dict[str, object]: "AGENTFLOW_USAGE_DB_PATH": str(tmp_path / "agentflow_usage.duckdb"), "AGENTFLOW_WEBHOOKS_FILE": str(tmp_path / "webhooks.yaml"), "DUCKDB_PATH": str(tmp_path / "agentflow.duckdb"), + # Local mode delivers the webhook callback to 127.0.0.1 (receiver shares + # this host); allowlist it so the SSRF egress guard permits the loopback. + "AGENTFLOW_EGRESS_ALLOWED_HOSTS": "127.0.0.1", "PYTHONUNBUFFERED": "1", } diff --git a/tests/e2e/test_ci_compose_config.py b/tests/e2e/test_ci_compose_config.py index e82c886..e96077c 100644 --- a/tests/e2e/test_ci_compose_config.py +++ b/tests/e2e/test_ci_compose_config.py @@ -101,6 +101,20 @@ def test_compose_override_uses_native_host_dns_on_darwin(tmp_path, monkeypatch): assert "extra_hosts" not in override["services"]["agentflow-api"] +def test_compose_override_allowlists_callback_host_for_egress_guard(tmp_path, monkeypatch): + conftest_module = _load_module(PROJECT_ROOT / "tests" / "e2e" / "conftest.py", "e2e_conftest") + monkeypatch.delenv("AGENTFLOW_E2E_CALLBACK_HOST", raising=False) + monkeypatch.setattr(conftest_module.platform, "system", lambda: "Linux") + + override_path = tmp_path / "docker-compose.e2e.override.yml" + conftest_module._write_compose_override(override_path, 18080) + + override = yaml.safe_load(override_path.read_text(encoding="utf-8")) + environment = override["services"]["agentflow-api"]["environment"] + + assert environment["AGENTFLOW_EGRESS_ALLOWED_HOSTS"] == "host.docker.internal" + + def test_compose_callback_host_uses_linux_docker_alias(monkeypatch): conftest_module = _load_module(PROJECT_ROOT / "tests" / "e2e" / "conftest.py", "e2e_conftest") monkeypatch.delenv("AGENTFLOW_E2E_CALLBACK_HOST", raising=False) diff --git a/tests/unit/test_egress_guard.py b/tests/unit/test_egress_guard.py index 3b8cbbd..9aa1b2c 100644 --- a/tests/unit/test_egress_guard.py +++ b/tests/unit/test_egress_guard.py @@ -84,3 +84,61 @@ def test_validate_rejects_when_any_resolved_ip_is_private( ) with pytest.raises(UnsafeEgressURLError): validate_public_url("http://dual.example.com/") + + +# --- Opt-in egress allowlist (AGENTFLOW_EGRESS_ALLOWED_HOSTS) ----------------- +# A controlled deployment (e2e compose, staging kind relay) must be able to +# deliver to the specific callback host it stands up — which deliberately +# resolves to a private/loopback address — without weakening the guard for +# tenant traffic. The allowlist is opt-in (default empty) and matches exact +# hostnames, case-insensitively, while still enforcing the http(s) scheme. + + +def test_allowlisted_loopback_host_is_permitted(monkeypatch: pytest.MonkeyPatch) -> None: + # Staging shape: the in-pod relay listens on 127.0.0.1 (normally rejected as + # loopback); the operator allowlists it so the webhook callback is delivered. + monkeypatch.setenv("AGENTFLOW_EGRESS_ALLOWED_HOSTS", "127.0.0.1") + validate_public_url("http://127.0.0.1:18080/callback") # must not raise + + +def test_allowlisted_gateway_host_skips_resolution(monkeypatch: pytest.MonkeyPatch) -> None: + # E2E shape: host.docker.internal resolves to a private gateway IP. An + # allowlisted host is trusted without any DNS resolution at all. + monkeypatch.setenv("AGENTFLOW_EGRESS_ALLOWED_HOSTS", "host.docker.internal") + + def _no_resolution(*_a: object, **_k: object) -> list[object]: + raise AssertionError("allowlisted host must not be resolved") + + monkeypatch.setattr(socket, "getaddrinfo", _no_resolution) + validate_public_url("http://host.docker.internal:9000/callback") # must not raise + + +def test_allowlist_matches_host_case_insensitively(monkeypatch: pytest.MonkeyPatch) -> None: + monkeypatch.setenv("AGENTFLOW_EGRESS_ALLOWED_HOSTS", "Host.Docker.Internal") + validate_public_url("http://host.docker.internal/cb") # must not raise + + +def test_allowlist_parses_comma_list_and_ignores_blanks(monkeypatch: pytest.MonkeyPatch) -> None: + monkeypatch.setenv("AGENTFLOW_EGRESS_ALLOWED_HOSTS", " , 127.0.0.1 ,host.docker.internal, ") + validate_public_url("http://127.0.0.1:18080/callback") # must not raise + validate_public_url("http://host.docker.internal/cb") # must not raise + + +def test_allowlist_still_enforces_scheme(monkeypatch: pytest.MonkeyPatch) -> None: + monkeypatch.setenv("AGENTFLOW_EGRESS_ALLOWED_HOSTS", "127.0.0.1") + with pytest.raises(UnsafeEgressURLError): + validate_public_url("file://127.0.0.1/etc/passwd") + + +def test_allowlist_only_exempts_listed_hosts(monkeypatch: pytest.MonkeyPatch) -> None: + # A different private host, not on the list, is still rejected. + monkeypatch.setenv("AGENTFLOW_EGRESS_ALLOWED_HOSTS", "127.0.0.1") + with pytest.raises(UnsafeEgressURLError): + validate_public_url("http://10.0.0.5:6379/") + + +def test_empty_allowlist_preserves_loopback_rejection(monkeypatch: pytest.MonkeyPatch) -> None: + # Default/empty env must not weaken the guard. + monkeypatch.setenv("AGENTFLOW_EGRESS_ALLOWED_HOSTS", "") + with pytest.raises(UnsafeEgressURLError): + validate_public_url("http://127.0.0.1/x")