Skip to content

Commit d2f3ca7

Browse files
committed
feat(sandbox): add callable python exec API and refresh e2e coverage (!19)
Closes NVIDIA#13

## Summary
- add Python sandbox execution APIs for command and callable workflows
- consolidate sandbox policy fixtures and expand e2e test coverage for policy and Python exec paths
- update CI/build config and images for sandbox e2e execution dependencies

## Test Plan
- mise run pre-commit
1 parent 6cf0264 commit d2f3ca7

33 files changed

+1716
-343
lines changed

.agent/skills/debug-navigator-cluster/SKILL.md

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,13 @@ Diagnose why a navigator cluster failed to start after `nav cluster admin deploy
2525
- If TLS enabled: `navigator-cli-client` secret exists with cert data
2626
9. Extract mTLS credentials if TLS is enabled (up to 3 min)
2727

28+
For local deploys, the metadata endpoint is chosen based on how the Docker daemon is reached:
29+
30+
- default local Docker socket (`unix:///var/run/docker.sock`): `https://127.0.0.1`
31+
- TCP Docker daemon (`DOCKER_HOST=tcp://<host>:<port>`): `https://<host>` for non-loopback hosts
32+
33+
The TCP host is also added as an extra gateway TLS SAN so mTLS hostname validation succeeds.
34+
2835
The default cluster name is `navigator`. The container is `navigator-cluster-{name}`.
2936

3037
## Prerequisites
@@ -173,6 +180,8 @@ If ports are missing or conflicting, another process may be using them. Check wi
173180
ss -tlnp | grep -E ':(6443|80|443|30051)\s'
174181
```
175182

183+
If using Docker-in-Docker (`DOCKER_HOST=tcp://docker:2375`), verify metadata points at `https://docker` (not `https://127.0.0.1`).
184+
176185
### Step 6: Check Image Availability
177186

178187
Component images (server, sandbox, pki-job) can reach k3s containerd via two paths:
@@ -279,9 +288,11 @@ If DNS is broken, all image pulls from the distribution registry will fail, as w
279288
| Architecture mismatch (remote) | Built on arm64, deploying to amd64 | Cross-build the image for the target architecture |
280289
| SSH connection failed (remote) | SSH key/host/Docker issues | Test `ssh <host> docker ps` manually |
281290
| Port conflict | Another service on 6443/80/443/30051 | Stop conflicting service or change port mapping |
291+
| gRPC connect refused to `127.0.0.1:443` in CI | Docker daemon is remote (`DOCKER_HOST=tcp://...`) but metadata still points to loopback | Verify that the metadata endpoint uses the non-loopback host from `DOCKER_HOST` (for example `https://docker`) rather than `https://127.0.0.1` |
282292
| DNS failures inside container | Entrypoint DNS detection failed | Check `/etc/rancher/k3s/resolv.conf` and container startup logs |
283293
| `metrics-server` errors in logs | Normal k3s noise, not the root cause | These errors are benign — look for the actual failing health check component |
284294
| Stale NotReady nodes from previous deploys | Volume reused across container recreations | The deploy flow now auto-cleans stale nodes; if it still fails, manually delete NotReady nodes (see Step 3) or choose "Recreate" when prompted |
295+
| gRPC `UNIMPLEMENTED` for newer RPCs in push mode | Helm values still point at older pulled images instead of the pushed refs | Verify rendered `navigator-helmchart.yaml` uses the expected push refs (`server`, `sandbox`, `pki-job`) and not `:latest` |
285296

286297
## Remote Cluster Debugging
287298

.gitignore

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -94,6 +94,10 @@ dmypy.json
9494
# Cython debug symbols
9595
cython_debug/
9696

97+
# Generated Python protobuf stubs (keep package marker)
98+
python/navigator/_proto/*
99+
!python/navigator/_proto/__init__.py
100+
97101
# =============================================================================
98102
# IDE / Editor
99103
# =============================================================================

.gitlab-ci.yml

Lines changed: 27 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -18,8 +18,7 @@ variables:
1818
default:
1919
image: $CI_IMAGE
2020
tags:
21-
- os/linux
22-
- type/docker
21+
- agent-dev-kit-build
2322

2423
cache:
2524
key: "$CI_COMMIT_REF_SLUG"
@@ -40,8 +39,7 @@ build_ci_image:
4039
services:
4140
- docker:24-dind
4241
tags:
43-
- os/linux
44-
- type/docker
42+
- agent-dev-kit-build
4543
variables:
4644
DOCKER_TLS_CERTDIR: "/certs"
4745
DOCKER_BUILDKIT: "1"
@@ -90,6 +88,30 @@ python_test:
9088
script:
9189
- mise run test:python
9290

91+
python_e2e_sandbox_test:
92+
stage: test
93+
services:
94+
- docker:24-dind
95+
tags:
96+
- agent-dev-kit-build
97+
variables:
98+
DOCKER_HOST: tcp://docker:2375
99+
DOCKER_TLS_CERTDIR: ""
100+
DOCKER_TLS_VERIFY: ""
101+
DOCKER_CERT_PATH: ""
102+
before_script:
103+
- uv sync --frozen
104+
- apt-get update -qq && apt-get install -y -qq socat >/dev/null
105+
- curl -fsSL https://download.docker.com/linux/static/stable/x86_64/docker-27.5.1.tgz | tar xz --strip-components=1 -C /usr/local/bin docker/docker
106+
- mkdir -p /usr/local/lib/docker/cli-plugins
107+
- curl -fsSL https://github.com/docker/buildx/releases/download/v0.21.1/buildx-v0.21.1.linux-amd64 -o /usr/local/lib/docker/cli-plugins/docker-buildx
108+
- chmod +x /usr/local/lib/docker/cli-plugins/docker-buildx
109+
script:
110+
- socat UNIX-LISTEN:/var/run/docker.sock,fork,reuseaddr TCP:docker:2375 &
111+
- sleep 1
112+
- mise run cluster
113+
- mise run test:e2e:sandbox
114+
93115
# =============================================================================
94116
# Publish Jobs (main branch only)
95117
# =============================================================================
@@ -98,11 +120,7 @@ publish_ecr_images:
98120
services:
99121
- docker:24-dind
100122
tags:
101-
- os/linux
102-
- perflab
103-
- ran-as/container
104-
- size/large
105-
- type/docker
123+
- agent-dev-kit-build
106124
variables:
107125
DOCKER_HOST: tcp://docker:2375
108126
DOCKER_TLS_CERTDIR: ""

CONTRIBUTING.md

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -100,8 +100,17 @@ mise run check # Quick compile check
100100
mise run test # All tests (Rust + Python)
101101
mise run test:rust # Rust tests only
102102
mise run test:python # Python tests only
103+
mise run test:e2e:sandbox # Sandbox Python e2e tests
103104
```
104105

106+
### Python E2E Test Patterns
107+
108+
- Put sandbox SDK e2e tests in `e2e/python/`.
109+
- Prefer `Sandbox.exec_python(...)` with Python callables over inline `python -c` strings.
110+
- Define callable helpers inside the test function when possible so they serialize cleanly in the sandbox.
111+
- Keep scenarios focused: one test for happy path and separate tests for negative/policy enforcement behavior.
112+
- Use `mise run test:e2e:sandbox` to run this suite locally.
113+
105114
### Linting & Formatting
106115

107116
```bash
@@ -177,6 +186,10 @@ mise run python:dev # Install Python package in development mode (builds CL
177186
mise run python:build # Build Python wheel with CLI binary
178187
```
179188

189+
Python protobuf stubs in `python/navigator/_proto/` are generated artifacts and are gitignored
190+
(except `__init__.py`). `mise` Python build/test/lint/typecheck tasks run `python:proto`
191+
automatically, so you generally do not need to generate stubs manually.
192+
180193
### Publishing
181194

182195
Versions are derived from git tags using `setuptools_scm`. No version bumps need to be committed.

Cargo.lock

Lines changed: 1 addition & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

architecture/cluster-bootstrap.md

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -155,12 +155,13 @@ For the target daemon (local or remote):
155155

156156
1. Ensure bridge network `navigator-cluster` (attachable, bridge driver).
157157
2. Ensure volume `navigator-cluster-{name}`.
158-
3. Compute extra TLS SANs for remote deploys:
158+
3. Compute extra TLS SANs:
159+
- For local deploys, when `DOCKER_HOST` is a non-loopback `tcp://` endpoint (for example `tcp://docker:2375` in CI), add that host as an extra SAN.
159160
- For remote deploys, resolve the SSH host via `ssh -G` to get the canonical hostname/IP.
160161
- Add resolved host (and original SSH host if different) as extra `--tls-san` arguments.
161162
- Set container env vars `EXTRA_SANS`, `SSH_GATEWAY_HOST`, `SSH_GATEWAY_PORT=8080`.
162163
4. Ensure container `navigator-cluster-{name}` with:
163-
- k3s server command: `server --disable=traefik --tls-san=127.0.0.1 --tls-san=localhost --tls-san=host.docker.internal` (plus extra SANs for remote).
164+
- k3s server command: `server --disable=traefik --tls-san=127.0.0.1 --tls-san=localhost --tls-san=host.docker.internal` (plus computed extra SANs).
164165
- privileged mode,
165166
- bind mount of volume to `/var/lib/rancher/k3s`,
166167
- network mode `navigator-cluster`,
@@ -212,7 +213,7 @@ Write is performed atomically through temp + backup directory swap.
212213

213214
Bootstrap writes metadata JSON for the cluster:
214215

215-
- local: endpoint `https://127.0.0.1`, `is_remote=false`
216+
- local: endpoint `https://127.0.0.1` by default, or `https://{docker_host}` when `DOCKER_HOST` is a non-loopback `tcp://` endpoint; `is_remote=false`
216217
- remote: endpoint `https://{resolved_host}`, `is_remote=true`, plus SSH destination and resolved host
217218

218219
Metadata fields:
@@ -304,6 +305,9 @@ nav cluster admin tunnel --name <name> --remote user@host
304305
| `XDG_CONFIG_HOME` | Base config directory (default: `$HOME/.config`) |
305306
| `KUBECONFIG` | Target kubeconfig path for merge (first colon-separated path; default: `$HOME/.kube/config`) |
306307

308+
When `NAVIGATOR_PUSH_IMAGES` is enabled, the entrypoint rewrites HelmChart image tags from `latest` to `IMAGE_TAG` and now handles both quoted and unquoted `tag: latest` formats.
309+
In push mode, bootstrap also passes exact imported image refs (`server`, `sandbox`, `pki-job`) to the entrypoint, which rewrites Helm values to those refs directly before the tag/pull-policy overrides. Image import uses the `k8s.io` containerd namespace so kubelet resolves the pushed refs without falling back to pulled registry tags. After import, bootstrap restarts `deployment/navigator` and waits for rollout completion so running pods pick up the imported image references.
310+
307311
Container-level env vars set for remote deploys:
308312

309313
| Variable | Value |

architecture/containers.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,7 @@ The server container runs the Navigator orchestration service.
5757
- Exposes gRPC/HTTP on port 8080
5858
- Health checks at `/healthz`
5959
- SQLx migrations copied from source
60+
- Uses an embedded Rust SSH client (`russh`) for sandbox exec
6061

6162
### navigator-cluster
6263

@@ -79,6 +80,7 @@ An airgapped k3s image with all components pre-loaded for single-container deplo
7980
When running k3s in Docker, the container's `/etc/resolv.conf` contains Docker's internal DNS (127.0.0.11), which is not reachable from k3s pods. While k3s auto-detects this and falls back to 8.8.8.8, external UDP traffic doesn't work reliably on Docker Desktop.
8081

8182
The `cluster-entrypoint.sh` script solves this by:
83+
8284
1. Detecting the Docker host gateway IP from `/etc/hosts` (requires `--add-host=host.docker.internal:host-gateway`)
8385
2. Writing a custom resolv.conf with the host gateway as the nameserver
8486
3. Passing `--resolv-conf` to k3s to use this configuration

build/python.toml

Lines changed: 45 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -2,21 +2,25 @@
22

33
["python:dev"]
44
description = "Install Python package in development mode (builds CLI binary)"
5+
depends = ["python:proto"]
56
run = "uv sync --group dev && uv pip install ."
67

78
["python:build"]
89
description = "Build Python wheel with CLI binary (native)"
10+
depends = ["python:proto"]
911
run = "uv run maturin build --release"
1012

1113
["python:build:linux-x86"]
1214
description = "Build Python wheel for Linux x86_64"
15+
depends = ["python:proto"]
1316
run = """
1417
docker run --rm --platform linux/amd64 -v $(pwd):/io -w /io \
1518
ghcr.io/pyo3/maturin build --release
1619
"""
1720

1821
["python:build:linux-arm"]
1922
description = "Build Python wheel for Linux aarch64"
23+
depends = ["python:proto"]
2024
run = """
2125
docker run --rm --platform linux/arm64 -v $(pwd):/io -w /io \
2226
ghcr.io/pyo3/maturin build --release
@@ -28,6 +32,7 @@ depends = ["python:build", "python:build:linux-x86", "python:build:linux-arm"]
2832

2933
["python:lint"]
3034
description = "Lint Python code with ruff"
35+
depends = ["python:proto"]
3136
env = { UV_NO_SYNC = "1" }
3237
run = "uv run ruff check {{vars.python_paths}}"
3338

@@ -37,6 +42,7 @@ run = "uv run ruff format {{vars.python_paths}}"
3742

3843
["python:typecheck"]
3944
description = "Type check Python code with ty"
45+
depends = ["python:proto"]
4046
run = "uv run ty check {{vars.python_paths}}"
4147

4248
["python:proto"]
@@ -50,8 +56,42 @@ uv run python -m grpc_tools.protoc \
5056
--python_out=python/navigator/_proto \
5157
--pyi_out=python/navigator/_proto \
5258
--grpc_python_out=python/navigator/_proto \
53-
proto/inference.proto
54-
# Fix absolute imports in generated gRPC stubs to use relative imports
55-
sed -i '' 's/^import inference_pb2/from . import inference_pb2/' \
56-
python/navigator/_proto/inference_pb2_grpc.py
57-
"""
59+
proto/inference.proto \
60+
proto/navigator.proto \
61+
proto/datamodel.proto \
62+
proto/sandbox.proto
63+
# Fix absolute imports in generated stubs to use package-relative imports
64+
uv run python - <<'PY'
65+
from pathlib import Path
66+
67+
replacements = {
68+
"python/navigator/_proto/inference_pb2_grpc.py": [
69+
("import inference_pb2", "from . import inference_pb2"),
70+
],
71+
"python/navigator/_proto/navigator_pb2_grpc.py": [
72+
("import navigator_pb2", "from . import navigator_pb2"),
73+
("import sandbox_pb2", "from . import sandbox_pb2"),
74+
],
75+
"python/navigator/_proto/navigator_pb2.py": [
76+
("import datamodel_pb2", "from . import datamodel_pb2"),
77+
("import sandbox_pb2", "from . import sandbox_pb2"),
78+
],
79+
"python/navigator/_proto/datamodel_pb2.py": [
80+
("import sandbox_pb2", "from . import sandbox_pb2"),
81+
],
82+
"python/navigator/_proto/datamodel_pb2_grpc.py": [
83+
("import datamodel_pb2", "from . import datamodel_pb2"),
84+
],
85+
"python/navigator/_proto/sandbox_pb2_grpc.py": [
86+
("import sandbox_pb2", "from . import sandbox_pb2"),
87+
],
88+
}
89+
90+
for path, rules in replacements.items():
91+
file_path = Path(path)
92+
text = file_path.read_text()
93+
for before, after in rules:
94+
text = text.replace(before, after)
95+
file_path.write_text(text)
96+
PY
97+
"""

build/test.toml

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,5 +10,12 @@ run = "cargo test --workspace"
1010

1111
["test:python"]
1212
description = "Run Python tests"
13+
depends = ["python:proto"]
1314
env = { UV_NO_SYNC = "1" }
1415
run = "uv run pytest python/"
16+
17+
["test:e2e:sandbox"]
18+
description = "Run sandbox end-to-end tests"
19+
depends = ["python:proto"]
20+
env = { UV_NO_SYNC = "1", PYTHONPATH = "python" }
21+
run = "uv run pytest -o python_files='test_*.py' e2e/python"

crates/navigator-bootstrap/src/docker.rs

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -306,6 +306,11 @@ pub async fn ensure_container(
306306
.ok()
307307
.filter(|v| !v.trim().is_empty())
308308
.unwrap_or_else(|| "dev".to_string());
309+
if let Ok(images) = std::env::var("NAVIGATOR_PUSH_IMAGES")
310+
&& !images.trim().is_empty()
311+
{
312+
env_vars.push(format!("PUSH_IMAGE_REFS={images}"));
313+
}
309314
env_vars.push(format!("IMAGE_TAG={tag}"));
310315
env_vars.push("IMAGE_PULL_POLICY=IfNotPresent".to_string());
311316
} else if let Ok(tag) = std::env::var("IMAGE_TAG")

0 commit comments

Comments
 (0)