diff --git a/.env.example b/.env.example index 8d91968..3ed36de 100644 --- a/.env.example +++ b/.env.example @@ -32,6 +32,8 @@ API_KEY_CACHE_TTL=300 RATE_LIMIT_ENABLED=true # Redis Configuration +# Deployment mode: standalone (default), cluster, or sentinel +REDIS_MODE=standalone REDIS_HOST=localhost REDIS_PORT=6379 REDIS_PASSWORD= @@ -42,6 +44,33 @@ REDIS_MAX_CONNECTIONS=20 REDIS_SOCKET_TIMEOUT=5 REDIS_SOCKET_CONNECT_TIMEOUT=5 +# Optional key prefix — useful when sharing a Redis instance across environments +# All keys will be stored as "<prefix>:<key>" (e.g. "prod:sessions:abc") +REDIS_KEY_PREFIX= + +# Redis Cluster Mode (REDIS_MODE=cluster) +# Comma-separated list of host:port pairs for cluster startup nodes +# REDIS_CLUSTER_NODES=node1:6379,node2:6379,node3:6379 + +# Redis Sentinel Mode (REDIS_MODE=sentinel) +# Comma-separated list of host:port pairs for Sentinel instances +# REDIS_SENTINEL_NODES=sentinel1:26379,sentinel2:26379,sentinel3:26379 +# REDIS_SENTINEL_MASTER=mymaster +# REDIS_SENTINEL_PASSWORD= + +# Redis TLS/SSL Configuration +# Required for most managed Redis services (GCP Memorystore, AWS ElastiCache, Azure Cache) +REDIS_TLS_ENABLED=false +# REDIS_TLS_CA_CERT_FILE=/path/to/ca.crt +# REDIS_TLS_CERT_FILE=/path/to/client.crt +# REDIS_TLS_KEY_FILE=/path/to/client.key +# REDIS_TLS_INSECURE=false +# Hostname verification is off by default because managed Redis services +# and Redis Cluster mode expose node IPs that don't match cert CN/SAN. +# The CA certificate chain is still fully verified. Enable hostname +# checking when your Redis server hostnames match certificate CN/SAN. 
+# REDIS_TLS_CHECK_HOSTNAME=false + # MinIO/S3 Configuration MINIO_ENDPOINT=localhost:9000 MINIO_ACCESS_KEY=minioadmin @@ -144,6 +173,37 @@ METRICS_ARCHIVE_RETENTION_DAYS=90 ENABLE_NETWORK_ISOLATION=true ENABLE_FILESYSTEM_ISOLATION=true +# Kubernetes Execution Configuration +# Execution mode: 'agent' (default, recommended) or 'nsenter' (legacy) +# agent: Executor-agent binary runs inside the main container. +# No nsenter, no capabilities, no privilege escalation. +# Compatible with GKE Sandbox (gVisor) and restricted Pod Security Standards. +# nsenter: Sidecar uses nsenter to enter the main container's mount namespace. +# Requires shareProcessNamespace, SYS_PTRACE/SYS_ADMIN/SYS_CHROOT caps, +# and allowPrivilegeEscalation: true. NOT compatible with GKE Sandbox. +K8S_EXECUTION_MODE=agent +# K8S_EXECUTOR_PORT=9090 # Port for the executor-agent HTTP server (agent mode only) + +# Sidecar image — must match the execution mode: +# agent mode: aronmuon/kubecoderun-sidecar-agent:latest (default) +# nsenter mode: aronmuon/kubecoderun-sidecar-nsenter:latest +# K8S_SIDECAR_IMAGE=aronmuon/kubecoderun-sidecar-agent:latest + +# Image pull policy for execution pods (Always, IfNotPresent, Never) +# K8S_IMAGE_PULL_POLICY=Always + +# Image pull secrets for private container registries (comma-separated secret names) +# These Kubernetes secrets must already exist in the execution namespace. +# Leave empty or unset if not using private registries. 
+# K8S_IMAGE_PULL_SECRETS=my-registry-secret,another-secret + +# GKE Sandbox (gVisor) Configuration +# Requires K8S_EXECUTION_MODE=agent (nsenter is incompatible with gVisor) +# GKE_SANDBOX_ENABLED=false +# GKE_SANDBOX_RUNTIME_CLASS=gvisor +# GKE_SANDBOX_NODE_SELECTOR={} +# GKE_SANDBOX_CUSTOM_TOLERATIONS=[] + # WAN Network Access Configuration # When enabled, execution containers can access the public internet # but are blocked from accessing host, other containers, and private networks diff --git a/.github/workflows/docker-publish.yml b/.github/workflows/docker-publish.yml index 238e30a..0560fb2 100644 --- a/.github/workflows/docker-publish.yml +++ b/.github/workflows/docker-publish.yml @@ -145,7 +145,7 @@ jobs: uses: ./.github/workflows/docker-build-reusable.yml secrets: inherit with: - image_name: kubecoderun-sidecar + image_name: kubecoderun-sidecar-agent dockerfile: docker/sidecar/Dockerfile context: docker/sidecar image_tag: ${{ needs.changes.outputs.image_tag }} @@ -344,7 +344,7 @@ jobs: uses: ./.github/workflows/docker-retag-reusable.yml secrets: inherit with: - image_name: kubecoderun-sidecar + image_name: kubecoderun-sidecar-agent new_tag: ${{ needs.changes.outputs.image_tag }} previous_tag: ${{ needs.changes.outputs.previous_tag }} diff --git a/.gitignore b/.gitignore index 557bbf9..86d1a16 100644 --- a/.gitignore +++ b/.gitignore @@ -200,3 +200,5 @@ config/local.py # Hatch auto-generated version file _version.py + +.pdm-python diff --git a/docker-compose.redis-cluster-tls.yml b/docker-compose.redis-cluster-tls.yml new file mode 100644 index 0000000..ceff514 --- /dev/null +++ b/docker-compose.redis-cluster-tls.yml @@ -0,0 +1,231 @@ +# Redis Cluster with TLS for integration testing +# +# This mimics a production GCP Memorystore Redis Cluster setup: +# - 6-node cluster (3 masters + 3 replicas) with TLS enabled +# - No authentication (no password) +# - Server-side TLS with CA verification (no mutual TLS / no client certs) +# - Accessible on localhost ports 
6380-6385 (TLS) +# +# Usage: +# docker compose -f docker-compose.redis-cluster-tls.yml up -d +# +# Test with: +# redis-cli -c -p 6380 --tls --cacert tests/tls-certs/ca.crt CLUSTER INFO + +services: + redis-tls-node-0: + image: redis:7-alpine + container_name: redis-tls-cluster-0 + ports: + - "127.0.0.1:6380:6380" + - "127.0.0.1:16380:16380" + volumes: + - redis-tls-cluster-0:/data + - ./tests/tls-certs:/tls:ro + command: > + redis-server + --port 0 + --tls-port 6380 + --tls-cert-file /tls/redis.crt + --tls-key-file /tls/redis.key + --tls-ca-cert-file /tls/ca.crt + --tls-auth-clients no + --tls-replication yes + --cluster-enabled yes + --cluster-config-file nodes.conf + --cluster-node-timeout 5000 + --appendonly yes + --bind 0.0.0.0 + --protected-mode no + healthcheck: + test: ["CMD", "redis-cli", "-p", "6380", "--tls", "--cert", "/tls/redis.crt", "--key", "/tls/redis.key", "--cacert", "/tls/ca.crt", "ping"] + interval: 5s + timeout: 3s + retries: 10 + + redis-tls-node-1: + image: redis:7-alpine + container_name: redis-tls-cluster-1 + ports: + - "127.0.0.1:6381:6381" + - "127.0.0.1:16381:16381" + volumes: + - redis-tls-cluster-1:/data + - ./tests/tls-certs:/tls:ro + command: > + redis-server + --port 0 + --tls-port 6381 + --tls-cert-file /tls/redis.crt + --tls-key-file /tls/redis.key + --tls-ca-cert-file /tls/ca.crt + --tls-auth-clients no + --tls-replication yes + --cluster-enabled yes + --cluster-config-file nodes.conf + --cluster-node-timeout 5000 + --appendonly yes + --bind 0.0.0.0 + --protected-mode no + healthcheck: + test: ["CMD", "redis-cli", "-p", "6381", "--tls", "--cert", "/tls/redis.crt", "--key", "/tls/redis.key", "--cacert", "/tls/ca.crt", "ping"] + interval: 5s + timeout: 3s + retries: 10 + + redis-tls-node-2: + image: redis:7-alpine + container_name: redis-tls-cluster-2 + ports: + - "127.0.0.1:6382:6382" + - "127.0.0.1:16382:16382" + volumes: + - redis-tls-cluster-2:/data + - ./tests/tls-certs:/tls:ro + command: > + redis-server + --port 0 + 
--tls-port 6382 + --tls-cert-file /tls/redis.crt + --tls-key-file /tls/redis.key + --tls-ca-cert-file /tls/ca.crt + --tls-auth-clients no + --tls-replication yes + --cluster-enabled yes + --cluster-config-file nodes.conf + --cluster-node-timeout 5000 + --appendonly yes + --bind 0.0.0.0 + --protected-mode no + healthcheck: + test: ["CMD", "redis-cli", "-p", "6382", "--tls", "--cert", "/tls/redis.crt", "--key", "/tls/redis.key", "--cacert", "/tls/ca.crt", "ping"] + interval: 5s + timeout: 3s + retries: 10 + + redis-tls-node-3: + image: redis:7-alpine + container_name: redis-tls-cluster-3 + ports: + - "127.0.0.1:6383:6383" + - "127.0.0.1:16383:16383" + volumes: + - redis-tls-cluster-3:/data + - ./tests/tls-certs:/tls:ro + command: > + redis-server + --port 0 + --tls-port 6383 + --tls-cert-file /tls/redis.crt + --tls-key-file /tls/redis.key + --tls-ca-cert-file /tls/ca.crt + --tls-auth-clients no + --tls-replication yes + --cluster-enabled yes + --cluster-config-file nodes.conf + --cluster-node-timeout 5000 + --appendonly yes + --bind 0.0.0.0 + --protected-mode no + healthcheck: + test: ["CMD", "redis-cli", "-p", "6383", "--tls", "--cert", "/tls/redis.crt", "--key", "/tls/redis.key", "--cacert", "/tls/ca.crt", "ping"] + interval: 5s + timeout: 3s + retries: 10 + + redis-tls-node-4: + image: redis:7-alpine + container_name: redis-tls-cluster-4 + ports: + - "127.0.0.1:6384:6384" + - "127.0.0.1:16384:16384" + volumes: + - redis-tls-cluster-4:/data + - ./tests/tls-certs:/tls:ro + command: > + redis-server + --port 0 + --tls-port 6384 + --tls-cert-file /tls/redis.crt + --tls-key-file /tls/redis.key + --tls-ca-cert-file /tls/ca.crt + --tls-auth-clients no + --tls-replication yes + --cluster-enabled yes + --cluster-config-file nodes.conf + --cluster-node-timeout 5000 + --appendonly yes + --bind 0.0.0.0 + --protected-mode no + healthcheck: + test: ["CMD", "redis-cli", "-p", "6384", "--tls", "--cert", "/tls/redis.crt", "--key", "/tls/redis.key", "--cacert", "/tls/ca.crt", 
"ping"] + interval: 5s + timeout: 3s + retries: 10 + + redis-tls-node-5: + image: redis:7-alpine + container_name: redis-tls-cluster-5 + ports: + - "127.0.0.1:6385:6385" + - "127.0.0.1:16385:16385" + volumes: + - redis-tls-cluster-5:/data + - ./tests/tls-certs:/tls:ro + command: > + redis-server + --port 0 + --tls-port 6385 + --tls-cert-file /tls/redis.crt + --tls-key-file /tls/redis.key + --tls-ca-cert-file /tls/ca.crt + --tls-auth-clients no + --tls-replication yes + --cluster-enabled yes + --cluster-config-file nodes.conf + --cluster-node-timeout 5000 + --appendonly yes + --bind 0.0.0.0 + --protected-mode no + healthcheck: + test: ["CMD", "redis-cli", "-p", "6385", "--tls", "--cert", "/tls/redis.crt", "--key", "/tls/redis.key", "--cacert", "/tls/ca.crt", "ping"] + interval: 5s + timeout: 3s + retries: 10 + + # Initializer: creates TLS cluster from the 6 nodes + redis-tls-cluster-init: + image: redis:7-alpine + container_name: redis-tls-cluster-init + volumes: + - ./tests/tls-certs:/tls:ro + depends_on: + redis-tls-node-0: + condition: service_healthy + redis-tls-node-1: + condition: service_healthy + redis-tls-node-2: + condition: service_healthy + redis-tls-node-3: + condition: service_healthy + redis-tls-node-4: + condition: service_healthy + redis-tls-node-5: + condition: service_healthy + restart: "no" + entrypoint: + - sh + - -c + - | + echo 'Creating Redis TLS Cluster...' 
&& + redis-cli --cluster create redis-tls-node-0:6380 redis-tls-node-1:6381 redis-tls-node-2:6382 redis-tls-node-3:6383 redis-tls-node-4:6384 redis-tls-node-5:6385 --cluster-replicas 1 --cluster-yes --tls --cert /tls/redis.crt --key /tls/redis.key --cacert /tls/ca.crt && + echo 'Redis TLS Cluster created successfully' && + redis-cli -h redis-tls-node-0 -p 6380 --tls --cert /tls/redis.crt --key /tls/redis.key --cacert /tls/ca.crt CLUSTER INFO + +volumes: + redis-tls-cluster-0: + redis-tls-cluster-1: + redis-tls-cluster-2: + redis-tls-cluster-3: + redis-tls-cluster-4: + redis-tls-cluster-5: diff --git a/docker-compose.redis-cluster.yml b/docker-compose.redis-cluster.yml new file mode 100644 index 0000000..7f62c52 --- /dev/null +++ b/docker-compose.redis-cluster.yml @@ -0,0 +1,182 @@ +# Redis Cluster for integration testing +# +# Usage: +# docker compose -f docker-compose.redis-cluster.yml up -d +# +# This creates a 6-node Redis Cluster (3 masters + 3 replicas) +# accessible on localhost ports 7000-7005. 
+# +# Test with: redis-cli -c -p 7000 CLUSTER INFO + +services: + redis-node-0: + image: redis:7-alpine + container_name: redis-cluster-0 + ports: + - "127.0.0.1:7000:7000" + - "127.0.0.1:17000:17000" + volumes: + - redis-cluster-0:/data + command: > + redis-server + --port 7000 + --cluster-enabled yes + --cluster-config-file nodes.conf + --cluster-node-timeout 5000 + --appendonly yes + --bind 0.0.0.0 + --protected-mode no + healthcheck: + test: ["CMD", "redis-cli", "-p", "7000", "ping"] + interval: 5s + timeout: 3s + retries: 5 + + redis-node-1: + image: redis:7-alpine + container_name: redis-cluster-1 + ports: + - "127.0.0.1:7001:7001" + - "127.0.0.1:17001:17001" + volumes: + - redis-cluster-1:/data + command: > + redis-server + --port 7001 + --cluster-enabled yes + --cluster-config-file nodes.conf + --cluster-node-timeout 5000 + --appendonly yes + --bind 0.0.0.0 + --protected-mode no + healthcheck: + test: ["CMD", "redis-cli", "-p", "7001", "ping"] + interval: 5s + timeout: 3s + retries: 5 + + redis-node-2: + image: redis:7-alpine + container_name: redis-cluster-2 + ports: + - "127.0.0.1:7002:7002" + - "127.0.0.1:17002:17002" + volumes: + - redis-cluster-2:/data + command: > + redis-server + --port 7002 + --cluster-enabled yes + --cluster-config-file nodes.conf + --cluster-node-timeout 5000 + --appendonly yes + --bind 0.0.0.0 + --protected-mode no + healthcheck: + test: ["CMD", "redis-cli", "-p", "7002", "ping"] + interval: 5s + timeout: 3s + retries: 5 + + redis-node-3: + image: redis:7-alpine + container_name: redis-cluster-3 + ports: + - "127.0.0.1:7003:7003" + - "127.0.0.1:17003:17003" + volumes: + - redis-cluster-3:/data + command: > + redis-server + --port 7003 + --cluster-enabled yes + --cluster-config-file nodes.conf + --cluster-node-timeout 5000 + --appendonly yes + --bind 0.0.0.0 + --protected-mode no + healthcheck: + test: ["CMD", "redis-cli", "-p", "7003", "ping"] + interval: 5s + timeout: 3s + retries: 5 + + redis-node-4: + image: redis:7-alpine + 
container_name: redis-cluster-4 + ports: + - "127.0.0.1:7004:7004" + - "127.0.0.1:17004:17004" + volumes: + - redis-cluster-4:/data + command: > + redis-server + --port 7004 + --cluster-enabled yes + --cluster-config-file nodes.conf + --cluster-node-timeout 5000 + --appendonly yes + --bind 0.0.0.0 + --protected-mode no + healthcheck: + test: ["CMD", "redis-cli", "-p", "7004", "ping"] + interval: 5s + timeout: 3s + retries: 5 + + redis-node-5: + image: redis:7-alpine + container_name: redis-cluster-5 + ports: + - "127.0.0.1:7005:7005" + - "127.0.0.1:17005:17005" + volumes: + - redis-cluster-5:/data + command: > + redis-server + --port 7005 + --cluster-enabled yes + --cluster-config-file nodes.conf + --cluster-node-timeout 5000 + --appendonly yes + --bind 0.0.0.0 + --protected-mode no + healthcheck: + test: ["CMD", "redis-cli", "-p", "7005", "ping"] + interval: 5s + timeout: 3s + retries: 5 + + # Initializer: creates cluster from the 6 nodes + redis-cluster-init: + image: redis:7-alpine + container_name: redis-cluster-init + depends_on: + redis-node-0: + condition: service_healthy + redis-node-1: + condition: service_healthy + redis-node-2: + condition: service_healthy + redis-node-3: + condition: service_healthy + redis-node-4: + condition: service_healthy + redis-node-5: + condition: service_healthy + restart: "no" + entrypoint: > + sh -c " + echo 'Creating Redis Cluster...' 
&& + redis-cli --cluster create redis-node-0:7000 redis-node-1:7001 redis-node-2:7002 redis-node-3:7003 redis-node-4:7004 redis-node-5:7005 --cluster-replicas 1 --cluster-yes && + echo 'Redis Cluster created successfully' && + redis-cli -h redis-node-0 -p 7000 CLUSTER INFO + " + +volumes: + redis-cluster-0: + redis-cluster-1: + redis-cluster-2: + redis-cluster-3: + redis-cluster-4: + redis-cluster-5: diff --git a/docker/c-cpp.Dockerfile b/docker/c-cpp.Dockerfile index 38ead4e..5dd26f1 100644 --- a/docker/c-cpp.Dockerfile +++ b/docker/c-cpp.Dockerfile @@ -1,7 +1,8 @@ # syntax=docker/dockerfile:1 # C/C++ execution environment with Docker Hardened Images. +# Uses -dev variant because compilers and dev libraries must be available at runtime. -FROM dhi.io/debian-base:trixie +FROM dhi.io/debian-base:trixie-debian13-dev ARG BUILD_DATE ARG VERSION diff --git a/docker/d.Dockerfile b/docker/d.Dockerfile index da2d92f..b64bf77 100644 --- a/docker/d.Dockerfile +++ b/docker/d.Dockerfile @@ -1,7 +1,8 @@ # syntax=docker/dockerfile:1 # D execution environment with Docker Hardened Images. +# Uses -dev variant because compilers must be available at runtime. -FROM dhi.io/debian-base:trixie +FROM dhi.io/debian-base:trixie-debian13-dev ARG BUILD_DATE ARG VERSION diff --git a/docker/fortran.Dockerfile b/docker/fortran.Dockerfile index 928e1e1..8d20f7a 100644 --- a/docker/fortran.Dockerfile +++ b/docker/fortran.Dockerfile @@ -1,7 +1,8 @@ # syntax=docker/dockerfile:1 # Fortran execution environment with Docker Hardened Images. +# Uses -dev variant because compilers and dev libraries must be available at runtime. 
-FROM dhi.io/debian-base:trixie +FROM dhi.io/debian-base:trixie-debian13-dev ARG BUILD_DATE ARG VERSION diff --git a/docker/go.Dockerfile b/docker/go.Dockerfile index bf37a74..4f86354 100644 --- a/docker/go.Dockerfile +++ b/docker/go.Dockerfile @@ -3,7 +3,7 @@ ################################ # Stage 1: Build and download dependencies -FROM dhi.io/golang:1.25-debian13-dev AS builder +FROM dhi.io/golang:1.26-debian13-dev AS builder SHELL ["/bin/bash", "-o", "pipefail", "-c"] @@ -26,7 +26,7 @@ RUN cd /tmp/gosetup && \ ################################ # Stage 2: Prepare runtime directories -FROM dhi.io/golang:1.25-debian13-dev AS runtime-deps +FROM dhi.io/golang:1.26-debian13-dev AS runtime-deps SHELL ["/bin/bash", "-o", "pipefail", "-c"] @@ -35,7 +35,7 @@ RUN mkdir -p /mnt/data /mnt/data/go-build && chown -R 65532:65532 /mnt/data ################################ # Stage 3: Minimal runtime image -FROM dhi.io/golang:1.25-debian13 AS final +FROM dhi.io/golang:1.26-debian13 AS final ARG BUILD_DATE ARG VERSION diff --git a/docker/php.Dockerfile b/docker/php.Dockerfile index fe28565..1549d80 100644 --- a/docker/php.Dockerfile +++ b/docker/php.Dockerfile @@ -2,8 +2,8 @@ # PHP execution environment with Docker Hardened Images. 
# PHP version configuration - single source of truth -ARG PHP_VERSION=8.4.17 -ARG PHP_MAJOR=8.4 +ARG PHP_VERSION=8.5.3 +ARG PHP_MAJOR=8.5 ARG DEBIAN_VERSION=debian13 ARG BUILD_DATE diff --git a/docker/python.Dockerfile b/docker/python.Dockerfile index 197e6d7..e3b9917 100644 --- a/docker/python.Dockerfile +++ b/docker/python.Dockerfile @@ -83,33 +83,34 @@ SHELL ["/bin/bash", "-o", "pipefail", "-c"] RUN mkdir -p /usr/lib/x86_64-linux-gnu /usr/lib/aarch64-linux-gnu && \ apt-get update && \ DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \ - # Runtime libraries (counterparts to -dev packages in builder) - libxml2 \ - libxslt1.1 \ - libffi8 \ - libssl3t64 \ + # Core System Utilities + libgomp1 \ + liblz4-1 \ + # Image Processing (Pillow, OpenCV) + ffmpeg \ libjpeg62-turbo \ libpng16-16t64 \ libtiff6 \ + libwebp7 \ libopenjp2-7 \ - libfreetype6 \ liblcms2-2 \ - libwebp7 \ - libportaudio2 \ - libpulse0 \ - # External tools needed at runtime + # XML/HTML Processing (lxml, beautifulsoup4) + libxml2 \ + libxslt1.1 \ + # Cryptography (cryptography, PyOpenSSL) + libffi8 \ + libssl3t64 \ + # Font Support (Matplotlib, WordCloud) + libfreetype6 \ + fontconfig \ + # External Tools (Runtime executables) poppler-utils \ - tesseract-ocr \ - pandoc \ - ffmpeg \ - flac \ - antiword \ - unrtf \ && apt-get autoremove -y \ && apt-get clean \ && rm -rf /var/lib/apt/lists/* \ && mkdir -p /mnt/data && chown 65532:65532 /mnt/data + ################################ # Final stage - minimal runtime image ################################ @@ -129,13 +130,7 @@ LABEL org.opencontainers.image.title="KubeCodeRun Python Environment" \ COPY --from=runtime-deps /usr/lib/x86_64-linux-gnu /usr/lib/x86_64-linux-gnu COPY --from=runtime-deps /usr/lib/aarch64-linux-gnu /usr/lib/aarch64-linux-gnu COPY --from=runtime-deps /usr/bin/pdftotext /usr/bin/pdftoppm /usr/bin/pdfinfo /usr/bin/ -COPY --from=runtime-deps /usr/bin/tesseract /usr/bin/ -COPY --from=runtime-deps /usr/bin/pandoc 
/usr/bin/ COPY --from=runtime-deps /usr/bin/ffmpeg /usr/bin/ffprobe /usr/bin/ -COPY --from=runtime-deps /usr/bin/flac /usr/bin/ -COPY --from=runtime-deps /usr/bin/antiword /usr/bin/ -COPY --from=runtime-deps /usr/bin/unrtf /usr/bin/ -COPY --from=runtime-deps /usr/share/tesseract-ocr /usr/share/tesseract-ocr # Copy installed Python packages from builder # DHI Python is installed in /opt/python, not /usr/local diff --git a/docker/r.Dockerfile b/docker/r.Dockerfile index e8343e4..626a44b 100644 --- a/docker/r.Dockerfile +++ b/docker/r.Dockerfile @@ -9,7 +9,7 @@ ARG VCS_REF ################################ # Builder stage - install R and compile packages ################################ -FROM dhi.io/debian-base:trixie AS builder +FROM dhi.io/debian-base:trixie-debian13-dev AS builder SHELL ["/bin/bash", "-o", "pipefail", "-c"] @@ -40,7 +40,7 @@ RUN R -e "options(repos = c(CRAN = 'https://packagemanager.posit.co/cran/__linux ################################ # Final stage - runtime image ################################ -FROM dhi.io/debian-base:trixie AS final +FROM dhi.io/debian-base:trixie-debian13-dev AS final ARG BUILD_DATE ARG VERSION diff --git a/docker/requirements/go.mod b/docker/requirements/go.mod index bd7d5d6..8b9b86a 100644 --- a/docker/requirements/go.mod +++ b/docker/requirements/go.mod @@ -1,18 +1,22 @@ module preload -go 1.25 +go 1.26 require ( - github.com/gorilla/mux v1.8.1 + github.com/davidbyttow/govips/v2 v2.16.0 + github.com/fatih/color v1.18.0 github.com/gin-gonic/gin v1.11.0 - github.com/sirupsen/logrus v1.9.4 + github.com/gocarina/gocsv v0.0.0-20240520201108-78e41c74b4b1 github.com/google/uuid v1.6.0 - github.com/shopspring/decimal v1.4.0 - gonum.org/v1/gonum v0.17.0 + github.com/gorilla/mux v1.8.1 github.com/montanaflynn/stats v0.7.1 - github.com/xuri/excelize/v2 v2.10.0 + github.com/shopspring/decimal v1.4.0 + github.com/sirupsen/logrus v1.9.4 github.com/spf13/cobra v1.10.2 - github.com/fatih/color v1.18.0 - github.com/gocarina/gocsv 
v0.0.0-20240520201108-78e41c74b4b1 github.com/tidwall/gjson v1.18.0 + github.com/unidoc/unioffice/v2 v2.8.0 + github.com/xuri/excelize/v2 v2.10.0 + github.com/yuin/goldmark v1.7.16 + gonum.org/v1/gonum v0.17.0 + gopkg.in/yaml.v3 v3.0.1 ) diff --git a/docker/requirements/nodejs.txt b/docker/requirements/nodejs.txt index 665e959..e8b5683 100644 --- a/docker/requirements/nodejs.txt +++ b/docker/requirements/nodejs.txt @@ -1,34 +1,58 @@ -# Node.js global packages -# One package per line for npm install -g +# Node.js packages +# One package per line: npm install -g -lodash +# Core utilities +archiver axios -moment -uuid chalk commander -express -fs-extra -csv-parser -json2csv -papaparse -archiver -yauzl -pdf-lib +crypto-js date-fns +fs-extra +lodash +luxon +uuid validator -crypto-js +zod + +# Web scraping & parsing +cheerio + +# Data processing +papaparse mathjs ml-matrix simple-statistics -sharp + +# File formats - Office documents +exceljs +xlsx +pptxgenjs +mammoth + +# File formats - Other +js-yaml +marked +pdf-lib + +# Images jimp +sharp + +# Compression +yauzl + +# TypeScript (recommended local + global) typescript ts-node -xlsx -exceljs -luxon -zod +@types/node + +# Web frameworks +express handlebars -cheerio -marked + +# UI libraries +jquery +jquery-ui-dist +three +p5 diff --git a/docker/requirements/python-analysis.txt b/docker/requirements/python-analysis.txt index ec53ec8..7b2c6ae 100644 --- a/docker/requirements/python-analysis.txt +++ b/docker/requirements/python-analysis.txt @@ -1,19 +1,18 @@ -# Math, science, and analysis packages - -scipy>=1.11 -scikit-learn>=1.3 -statsmodels>=0.14 -sympy>=1.12 -mpmath>=1.3 -numba>=0.58 -llvmlite>=0.41 -numexpr>=2.8 -networkx>=3.2 -lifelines>=0.27 -autograd>=1.6 -autograd-gamma>=0.5 -formulaic>=1.0 -patsy>=0.5 -kiwisolver>=1.4 -joblib>=1.3 -threadpoolctl>=3.2 +# Math, science, and analysis +autograd-gamma>=0.5.0 +autograd>=1.7.0 +formulaic>=1.0.2 +joblib>=1.4.2 +kiwisolver>=1.4.7 +lifelines>=0.30.0 +llvmlite>=0.44.0 
+mpmath>=1.3.0 +networkx>=3.4.2 +numba>=0.61.0 +numexpr>=2.10.2 +patsy>=1.0.1 +scikit-learn>=1.7.2 +scipy>=1.16.1 +statsmodels>=0.14.4 +sympy>=1.13.3 +threadpoolctl>=3.5.0 diff --git a/docker/requirements/python-core.txt b/docker/requirements/python-core.txt index 3fbfb33..a6e78cd 100644 --- a/docker/requirements/python-core.txt +++ b/docker/requirements/python-core.txt @@ -1,19 +1,17 @@ -# Core data processing packages -# These are the most stable and form the foundation - -cloudpickle>=3.0 -lz4>=4.3.0 -numpy>=1.24 -pandas>=2.0 -openpyxl>=3.1 -xlrd>=2.0 -XlsxWriter>=3.1 -pyarrow>=14.0 -tabulate>=0.9 -six>=1.16 -packaging>=23.0 -python-dateutil>=2.8 -pytz>=2024.1 -tzdata>=2024.1 -pendulum>=3.0 -pydantic>=2.5 +# Core data processing +cloudpickle>=3.1.0 +lz4>=4.3.3 +numpy>=2.2.1 +pandas>=3.0.0 +openpyxl>=3.1.5 +xlrd>=2.0.1 +XlsxWriter>=3.2.0 +pyarrow>=18.1.0 +tabulate>=0.9.0 +six>=1.17.0 +packaging>=24.2 +python-dateutil>=2.9.0 +pytz>=2025.1 +tzdata>=2025.2 +pendulum>=3.0.0 +pydantic>=2.10.6 diff --git a/docker/requirements/python-documents.txt b/docker/requirements/python-documents.txt index 8317219..6d861d8 100644 --- a/docker/requirements/python-documents.txt +++ b/docker/requirements/python-documents.txt @@ -1,41 +1,34 @@ -# Document processing packages (PDF, Office, etc.) 
+# Document processing - PDF +pdf2image>=1.17.0 +pdfminer.six>=20231228 +pypdf>=5.1.0 +PyPDF2>=3.0.1 +reportlab>=4.2.5 -# PDF -PyPDF2>=3.0 -pdfminer.six>=20221105 -pdfminer>=20191125 -pdf2image>=1.16 -reportlab>=4.0 - -# Office documents -python-docx>=1.1 -python-pptx>=0.6 -mammoth>=1.6 +# Document processing - Office (Excel, PowerPoint, Word) +docx2python>=3.1.1 docx2txt>=0.8 -docx2python>=2.0 -docxcompose>=1.4 -docxtpl>=0.16 -doc2pdf>=0.2 - -# XML/HTML parsing -beautifulsoup4>=4.12 -lxml>=4.9 -soupsieve>=2.5 -defusedxml>=0.7 -cssselect2>=0.7 -webencodings>=0.5 -tinycss2>=1.2 +docxcompose>=1.4.0 +docxtpl>=0.18.0 +mammoth>=1.8.0 +openpyxl>=3.1.5 +python-docx>=1.1.2 +python-pptx>=1.0.2 -# Text extraction -textract-py3>=1.5 -antiword>=0.1 -pytesseract>=0.3 -pypandoc>=1.12 +# Document processing - XML/HTML +beautifulsoup4>=4.12.3 +cssselect2>=0.7.0 +defusedxml>=0.7.1 +lxml>=5.3.0 +soupsieve>=2.6 +tinycss2>=1.4.0 +webencodings>=0.5.1 -# Other formats -vsdx>=0.5 -compressed-rtf>=1.0 -extract-msg>=0.47 -olefile>=0.47 -ebcdic>=1.1 -ExifRead>=3.0 +# Text and format processing (YAML, Markdown, JSON, CSV) +chardet>=5.2.0 +markdown-it-py>=3.0.0 +markdown>=3.7 +pyparsing>=3.2.1 +python-frontmatter>=1.1.0 +pyyaml>=6.0.2 +toml>=0.10.2 diff --git a/docker/requirements/python-utilities.txt b/docker/requirements/python-utilities.txt index b7a8521..171c571 100644 --- a/docker/requirements/python-utilities.txt +++ b/docker/requirements/python-utilities.txt @@ -1,65 +1,43 @@ -# Utility packages (cryptography, encoding, misc) - -# Cryptography -cryptography>=41.0 -bcrypt>=4.1 -PyNaCl>=1.5 -pycryptodome>=3.19 -passlib>=1.7 - -# Encoding/parsing -chardet>=5.2 -cffi>=1.16 -pycparser>=2.21 -pyparsing>=3.1 -base58>=2.1 -cobble>=0.1 -xxhash>=3.4 - -# Templates and markup -Jinja2>=3.1 -MarkupSafe>=2.1 - -# Barcodes/QR -qrcode>=7.4 -python-barcode>=0.15 - -# Other utilities -argcomplete>=3.2 -babel>=2.14 -deprecation>=2.1 -hachoir>=3.3 -interface-meta>=1.3 -paragraphs>=0.1 
-sortedcontainers>=2.4 -tenacity>=8.2 -typing-extensions>=4.9 +# Cryptography and security +bcrypt>=4.2.1 +cryptography>=44.0.0 +passlib>=1.7.4 +pycryptodome>=3.21.0 +PyNaCl>=1.5.0 + +# Utilities +argcomplete>=3.5.3 +babel>=2.16.0 +base58>=2.1.1 +cffi>=1.17.1 +deprecation>=2.1.0 +hachoir>=3.3.0 +Jinja2>=3.1.5 +MarkupSafe>=3.0.2 +pycparser>=2.22 +python-barcode>=0.15.1 +qrcode>=8.0 +regex>=2024.11.6 +sortedcontainers>=2.4.0 +tenacity>=9.0.0 +typing-extensions>=4.12.2 tzlocal>=5.2 -Whoosh>=2.7 -wordcloud>=1.9 -wrapt>=1.16 -regex>=2023.12 - -# HTTP clients -requests>=2.31 -httpx>=0.25 - -# CLI and output formatting -rich>=13.7 -click>=8.1 -typer>=0.9 - -# Data generation and formatting -faker>=22.0 -humanize>=4.9 +wordcloud>=1.9.4 +wrapt>=1.17.2 +xxhash>=3.5.0 + +# HTTP and networking +certifi>=2025.1.31 +httpx>=0.28.1 +requests>=2.32.3 +urllib3>=2.3.0 + +# CLI and formatting +click>=8.1.8 +faker>=33.3.0 +humanize>=4.11.0 +rich>=13.9.4 +typer>=0.15.1 # Configuration -python-dotenv>=1.0 -toml>=0.10 -pyyaml>=6.0 - -# Audio -SpeechRecognition>=3.10 - -# Email -IMAPClient>=3.0 +python-dotenv>=1.0.1 diff --git a/docker/requirements/python-visualization.txt b/docker/requirements/python-visualization.txt index 2ba55a0..dba6e97 100644 --- a/docker/requirements/python-visualization.txt +++ b/docker/requirements/python-visualization.txt @@ -1,14 +1,13 @@ -# Visualization and graphics packages - -matplotlib>=3.8 -seaborn>=0.13 -plotly>=5.18 -pillow>=10.0 -imageio>=2.33 -scikit-image>=0.22 -opencv-python-headless>=4.8 -contourpy>=1.2 -cycler>=0.12 -fonttools>=4.47 -lazy-loader>=0.3 -tifffile>=2024.1 +# Visualization +contourpy>=1.3.1 +cycler>=0.12.1 +fonttools>=4.55.3 +imageio>=2.36.1 +lazy-loader>=0.4 +matplotlib>=3.10.0 +opencv-python-headless>=4.10.0.84 +pillow>=11.1.0 +plotly>=5.24.1 +scikit-image>=0.25.0 +seaborn>=0.13.2 +tifffile>=2025.2.12 diff --git a/docker/requirements/rust-Cargo.toml b/docker/requirements/rust-Cargo.toml index 36071e6..55c343c 100644 --- 
a/docker/requirements/rust-Cargo.toml +++ b/docker/requirements/rust-Cargo.toml @@ -5,29 +5,38 @@ edition = "2021" [dependencies] # Serialization -serde = { version = "1", features = ["derive"] } -serde_json = "1" +serde = { version = "1.0", features = ["derive"] } +serde_json = "1.0.149" +serde_yaml = "0.9.34" # Data formats -csv = "1" -zip = "7" -flate2 = "1" -calamine = "0.32" +csv = "1.4.0" +zip = "8.1.0" +flate2 = "1.1.9" + +# Spreadsheets (Excel, ODS) +calamine = { version = "0.33.0", features = ["dates"] } + +# Office documents (DOCX, PPTX, XLSX) +undoc = "0.1.13" + +# Markdown parsing +pulldown-cmark = "0.13.0" # Images -image = "0.25" +image = "0.25.9" # Utilities -uuid = { version = "1", features = ["v4"] } -chrono = { version = "0.4", features = ["serde"] } -regex = "1" -clap = { version = "4", features = ["derive"] } -anyhow = "1" -thiserror = "2" +uuid = { version = "1.21.0", features = ["v4", "serde"] } +chrono = { version = "0.4.43", features = ["serde"] } +regex = "1.12.3" +clap = { version = "4.5.59", features = ["derive"] } +anyhow = "1.0.101" +thiserror = "2.0.18" # Math/Stats -nalgebra = "0.34" -statrs = "0.18" +nalgebra = "0.34.1" +statrs = "0.18.0" # Plotting -plotters = "0.3" +plotters = "0.3.7" diff --git a/docker/rust.Dockerfile b/docker/rust.Dockerfile index 1a3416b..e4197c0 100644 --- a/docker/rust.Dockerfile +++ b/docker/rust.Dockerfile @@ -8,7 +8,7 @@ ################################ # Builder stage - compile crate dependencies ################################ -FROM dhi.io/rust:1.92-debian13-dev AS builder +FROM dhi.io/rust:1.93-debian13-dev AS builder SHELL ["/bin/bash", "-o", "pipefail", "-c"] @@ -45,7 +45,7 @@ RUN rm -rf /tmp/rust-cache/src /tmp/rust-cache/Cargo.toml /tmp/rust-cache/Cargo. 
################################ # Final stage - runtime only ################################ -FROM dhi.io/rust:1.92-debian13-dev AS final +FROM dhi.io/rust:1.93-debian13-dev AS final ARG BUILD_DATE ARG VERSION diff --git a/docker/sidecar/Dockerfile b/docker/sidecar/Dockerfile index 5e79da2..32248a7 100644 --- a/docker/sidecar/Dockerfile +++ b/docker/sidecar/Dockerfile @@ -1,32 +1,75 @@ # syntax=docker/dockerfile:1 -# KubeCodeRun HTTP sidecar with Docker Hardened Images. +# KubeCodeRun HTTP Sidecar — Multi-target Dockerfile. +# +# Produces two distinct container images via Docker build targets: +# +# docker build --target sidecar-agent → kubecoderun-sidecar-agent (default) +# docker build --target sidecar-nsenter → kubecoderun-sidecar-nsenter +# +# sidecar-agent (default): +# - Contains the executor-agent Go binary (copied to main container via init container) +# - No nsenter, no setcap, no capabilities, no privilege escalation +# - Compatible with GKE Sandbox (gVisor) and restricted Pod Security Standards +# +# sidecar-nsenter (legacy): +# - Contains nsenter with file capabilities (setcap) for namespace entry +# - Requires shareProcessNamespace, SYS_PTRACE/SYS_ADMIN/SYS_CHROOT capabilities, +# and allowPrivilegeEscalation: true in the pod spec +# - For clusters that do not support agent mode or need legacy behavior ARG BUILD_DATE ARG VERSION ARG VCS_REF ################################ -# Builder stage - install Python dependencies and runtime tools +# Executor agent build stage — statically compiled Go binary. +# This binary runs in the main (language) container via init container copy, +# providing HTTP-based code execution without nsenter. ################################ -FROM dhi.io/python:3.13-debian13-dev AS builder +FROM golang:1.26-alpine AS agent-builder + +WORKDIR /build +COPY executor-agent/ . +RUN CGO_ENABLED=0 GOOS=linux go build -ldflags="-s -w" -trimpath -o /opt/executor-agent . 
+ +################################ +# Python builder (common) — install Python dependencies and app code. +# Used by both agent and nsenter targets. +################################ +FROM dhi.io/python:3.13-debian13-dev AS builder-common SHELL ["/bin/bash", "-o", "pipefail", "-c"] ENV PIP_DISABLE_PIP_VERSION_CHECK=1 -# Install runtime dependencies and set up nsenter with file capabilities -# - util-linux: provides nsenter for entering container namespaces -# - libcap2-bin: provides setcap for setting file capabilities -# Create both arch lib dirs to ensure COPY works on either architecture +# Create data directory and arch lib dirs (for COPY compatibility) RUN mkdir -p /lib/x86_64-linux-gnu /lib/aarch64-linux-gnu && \ - apt-get update && \ + mkdir -p /mnt/data && chown 65532:65532 /mnt/data + +WORKDIR /app + +# Install Python dependencies +COPY requirements.txt /tmp/requirements.txt +RUN --mount=type=cache,target=/root/.cache/pip \ + pip install -r /tmp/requirements.txt + +# Copy application code +COPY main.py . + +################################ +# nsenter builder — extends common builder with nsenter + file capabilities. +# Only needed for the nsenter sidecar target. 
+################################ +FROM builder-common AS builder-nsenter + +# Install nsenter (util-linux) and setcap (libcap2-bin) +RUN apt-get update && \ DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \ util-linux \ libcap2-bin \ && apt-get autoremove -y \ && apt-get clean \ - && rm -rf /var/lib/apt/lists/* \ - && mkdir -p /mnt/data && chown 65532:65532 /mnt/data + && rm -rf /var/lib/apt/lists/* # Add file capabilities to nsenter binary so non-root users can use it # - cap_sys_ptrace: access /proc//ns/ of other processes @@ -34,52 +77,27 @@ RUN mkdir -p /lib/x86_64-linux-gnu /lib/aarch64-linux-gnu && \ # - cap_sys_chroot: required for mount namespace operations RUN setcap 'cap_sys_ptrace,cap_sys_admin,cap_sys_chroot+eip' /usr/bin/nsenter -WORKDIR /app - -# Install Python dependencies -COPY requirements.txt /tmp/requirements.txt -RUN --mount=type=cache,target=/root/.cache/pip \ - pip install -r /tmp/requirements.txt - -# Copy application code -COPY main.py . ################################ -# Final stage - minimal runtime image +# Common runtime base — shared by both final targets. +# Contains Python runtime, sidecar app, and common configuration. 
################################ -FROM dhi.io/python:3.13-debian13 AS final +FROM dhi.io/python:3.13-debian13 AS runtime-base ARG BUILD_DATE ARG VERSION ARG VCS_REF -LABEL org.opencontainers.image.title="KubeCodeRun Sidecar" \ - org.opencontainers.image.description="HTTP sidecar for executing code in Kubernetes pods via nsenter" \ - org.opencontainers.image.version="${VERSION}" \ - org.opencontainers.image.created="${BUILD_DATE}" \ - org.opencontainers.image.revision="${VCS_REF}" - -# Copy nsenter with file capabilities from builder -COPY --from=builder /usr/bin/nsenter /usr/bin/ - -# Copy shared libraries needed by nsenter (libselinux, libpcre2) for both architectures -# Note: Only one arch directory will have content depending on build platform -COPY --from=builder /lib/x86_64-linux-gnu /lib/x86_64-linux-gnu -COPY --from=builder /lib/aarch64-linux-gnu /lib/aarch64-linux-gnu - # Copy installed Python packages from builder # DHI Python is installed in /opt/python, not /usr/local -COPY --from=builder /opt/python/lib/python3.13/site-packages /opt/python/lib/python3.13/site-packages -COPY --from=builder /opt/python/bin /opt/python/bin - -# Copy /usr/bin/env for execution patterns, sleep for CMD -COPY --from=builder /usr/bin/env /usr/bin/sleep /usr/bin/ +COPY --from=builder-common /opt/python/lib/python3.13/site-packages /opt/python/lib/python3.13/site-packages +COPY --from=builder-common /opt/python/bin /opt/python/bin # Copy data directory with correct ownership (DHI uses UID 65532) -COPY --from=builder /mnt/data /mnt/data +COPY --from=builder-common /mnt/data /mnt/data # Copy application code -COPY --from=builder /app /app +COPY --from=builder-common /app /app WORKDIR /app @@ -102,15 +120,80 @@ ENV VERSION=${VERSION} \ MAX_EXECUTION_TIME=120 \ PYTHONUNBUFFERED=1 -# Kubernetes pod spec still requires: -# - shareProcessNamespace: true (so sidecar can see main container's processes) -# - securityContext.capabilities.add: ["SYS_PTRACE", "SYS_ADMIN", "SYS_CHROOT"] -# (to 
allow the bounding set to include these caps) -# - securityContext.allowPrivilegeEscalation: true -# (required for file capabilities to be honored) - # DHI images run as non-root (UID 65532) by default -# File capabilities on nsenter allow this user to use nsenter with required privileges - -# Run sidecar CMD ["python", "main.py"] + + +################################ +# TARGET: sidecar-agent (default) +# +# Agent mode sidecar — the recommended execution mode. +# Contains the executor-agent Go binary for init-container distribution. +# No nsenter, no capabilities, no privilege escalation needed. +# +# Kubernetes pod spec (agent mode): +# - No shareProcessNamespace needed +# - No capabilities needed (all dropped) +# - allowPrivilegeEscalation: false for all containers +# - Init container copies /opt/executor-agent to shared volume +# - Main container runs executor-agent instead of sleep infinity +# - Compatible with GKE Sandbox (gVisor) and restricted Pod Security Standards +# +# Build: docker build --target sidecar-agent -t kubecoderun-sidecar-agent . +################################ +FROM runtime-base AS sidecar-agent + +ARG BUILD_DATE +ARG VERSION +ARG VCS_REF + +LABEL org.opencontainers.image.title="KubeCodeRun Sidecar (Agent)" \ + org.opencontainers.image.description="HTTP sidecar for executing code in Kubernetes pods via executor agent" \ + org.opencontainers.image.version="${VERSION}" \ + org.opencontainers.image.created="${BUILD_DATE}" \ + org.opencontainers.image.revision="${VCS_REF}" + +# Copy executor agent binary (distributed to main container via init container) +COPY --from=agent-builder /opt/executor-agent /opt/executor-agent + +ENV EXECUTION_MODE=agent + + +################################ +# TARGET: sidecar-nsenter (legacy) +# +# nsenter mode sidecar — for backward compatibility. +# Contains nsenter with file capabilities for namespace entry. 
+# +# Kubernetes pod spec (nsenter mode): +# - shareProcessNamespace: true (so sidecar can see main container's processes) +# - securityContext.capabilities.add: ["SYS_PTRACE", "SYS_ADMIN", "SYS_CHROOT"] +# - securityContext.allowPrivilegeEscalation: true +# (required for file capabilities to be honored) +# +# Build: docker build --target sidecar-nsenter -t kubecoderun-sidecar-nsenter . +################################ +FROM runtime-base AS sidecar-nsenter + +ARG BUILD_DATE +ARG VERSION +ARG VCS_REF + +LABEL org.opencontainers.image.title="KubeCodeRun Sidecar (nsenter)" \ + org.opencontainers.image.description="HTTP sidecar for executing code in Kubernetes pods via nsenter" \ + org.opencontainers.image.version="${VERSION}" \ + org.opencontainers.image.created="${BUILD_DATE}" \ + org.opencontainers.image.revision="${VCS_REF}" + +# Copy nsenter with file capabilities from nsenter builder +COPY --from=builder-nsenter /usr/bin/nsenter /usr/bin/ + +# Copy /usr/bin/env from builder — nsenter mode uses /usr/bin/env -i for clean execution +COPY --from=builder-common /usr/bin/env /usr/bin/env + +# Copy shared libraries needed by nsenter (libselinux, libpcre2) for both architectures +# Note: Only one arch directory will have content depending on build platform +COPY --from=builder-nsenter /lib/x86_64-linux-gnu /lib/x86_64-linux-gnu +COPY --from=builder-nsenter /lib/aarch64-linux-gnu /lib/aarch64-linux-gnu + +ENV EXECUTION_MODE=nsenter diff --git a/docker/sidecar/executor-agent/go.mod b/docker/sidecar/executor-agent/go.mod new file mode 100644 index 0000000..b7f6ec9 --- /dev/null +++ b/docker/sidecar/executor-agent/go.mod @@ -0,0 +1,3 @@ +module executor-agent + +go 1.26 diff --git a/docker/sidecar/executor-agent/main.go b/docker/sidecar/executor-agent/main.go new file mode 100644 index 0000000..cbd6dc5 --- /dev/null +++ b/docker/sidecar/executor-agent/main.go @@ -0,0 +1,253 @@ +// Executor Agent - Lightweight HTTP server for code execution in Kubernetes pods. 
+// +// This binary runs inside the main (language) container and provides an HTTP API +// that the sidecar uses to execute code. It replaces the nsenter-based approach, +// enabling execution without any Linux capabilities or privilege escalation. +// +// Architecture: +// - Listens on localhost (pod-internal only) on a configurable port (default: 9090) +// - Receives execution requests from the sidecar via HTTP +// - Spawns subprocesses using the container's inherited environment (PATH, HOME, etc.) +// - Returns stdout, stderr, exit code, and execution time +// +// The agent inherits its environment from the container's ENTRYPOINT (env -i PATH=... HOME=...), +// ensuring subprocesses run with the exact same sanitized environment as the language runtime. + +package main + +import ( + "bytes" + "context" + "encoding/json" + "fmt" + "io" + "net/http" + "os" + "os/exec" + "os/signal" + "path/filepath" + "strconv" + "strings" + "syscall" + "time" +) + +const ( + defaultPort = 9090 + maxOutputSize = 1048576 // 1MB - matches sidecar's MAX_OUTPUT_SIZE + maxBodySize = 10485760 // 10MB +) + +// ExecuteRequest is the JSON request body for /execute. +type ExecuteRequest struct { + Command []string `json:"command"` + Timeout int `json:"timeout"` + WorkingDir string `json:"working_dir"` + Env map[string]string `json:"env,omitempty"` +} + +// ExecuteResponse is the JSON response body for /execute. 
+type ExecuteResponse struct { + ExitCode int `json:"exit_code"` + Stdout string `json:"stdout"` + Stderr string `json:"stderr"` + ExecutionTimeMs int64 `json:"execution_time_ms"` +} + +func handleExecute(w http.ResponseWriter, r *http.Request) { + if r.Method != http.MethodPost { + http.Error(w, "Method not allowed", http.StatusMethodNotAllowed) + return + } + + body, err := io.ReadAll(io.LimitReader(r.Body, maxBodySize)) + if err != nil { + writeJSON(w, http.StatusBadRequest, ExecuteResponse{ + ExitCode: 1, Stderr: "Failed to read request body", + }) + return + } + + var req ExecuteRequest + if err := json.Unmarshal(body, &req); err != nil { + writeJSON(w, http.StatusBadRequest, ExecuteResponse{ + ExitCode: 1, Stderr: fmt.Sprintf("Invalid JSON: %v", err), + }) + return + } + + if len(req.Command) == 0 { + writeJSON(w, http.StatusBadRequest, ExecuteResponse{ + ExitCode: 1, Stderr: "No command specified", + }) + return + } + + timeout := req.Timeout + if timeout <= 0 { + timeout = 30 + } + + workingDir := req.WorkingDir + if workingDir == "" { + workingDir = "/mnt/data" + } + + // Validate that working directory is within the safe /mnt/data directory. + // Use filepath.Clean + exact-prefix check to prevent traversal to e.g. /mnt/data2. 
+ absDir, err := filepath.Abs(workingDir) + if err != nil { + writeJSON(w, http.StatusBadRequest, ExecuteResponse{ + ExitCode: 1, Stderr: fmt.Sprintf("Invalid working directory: %v", err), + }) + return + } + absDir = filepath.Clean(absDir) + if absDir != "/mnt/data" && !strings.HasPrefix(absDir, "/mnt/data/") { + writeJSON(w, http.StatusBadRequest, ExecuteResponse{ + ExitCode: 1, Stderr: fmt.Sprintf("Invalid working directory: must be within /mnt/data, got %q", workingDir), + }) + return + } + workingDir = absDir + + fmt.Fprintf(os.Stdout, "[executor-agent] cmd=%v timeout=%ds dir=%s\n", + req.Command, timeout, workingDir) + + start := time.Now() + ctx, cancel := context.WithTimeout(context.Background(), time.Duration(timeout)*time.Second) + defer cancel() + + cmd := exec.CommandContext(ctx, req.Command[0], req.Command[1:]...) + cmd.Dir = workingDir + + // Inherit the current process environment (from container's ENTRYPOINT env -i). + // Merge request-provided env overrides by replacing existing keys (so the + // override actually takes effect regardless of runtime first/last-wins semantics). 
+ if len(req.Env) > 0 { + env := os.Environ() + for k, v := range req.Env { + prefix := k + "=" + found := false + for i, e := range env { + if strings.HasPrefix(e, prefix) { + env[i] = prefix + v + found = true + break + } + } + if !found { + env = append(env, prefix+v) + } + } + cmd.Env = env + } + + var stdout, stderr bytes.Buffer + cmd.Stdout = &stdout + cmd.Stderr = &stderr + + err = cmd.Run() + elapsed := time.Since(start).Milliseconds() + + exitCode := 0 + if err != nil { + if ctx.Err() == context.DeadlineExceeded { + fmt.Fprintf(os.Stdout, "[executor-agent] TIMEOUT after %ds\n", timeout) + writeJSON(w, http.StatusOK, ExecuteResponse{ + ExitCode: 124, + Stdout: "", + Stderr: fmt.Sprintf("Execution timed out after %d seconds", timeout), + ExecutionTimeMs: elapsed, + }) + return + } + if exitErr, ok := err.(*exec.ExitError); ok { + exitCode = exitErr.ExitCode() + } else { + writeJSON(w, http.StatusOK, ExecuteResponse{ + ExitCode: 1, + Stdout: "", + Stderr: fmt.Sprintf("Failed to execute command: %v", err), + ExecutionTimeMs: elapsed, + }) + return + } + } + + stdoutStr := truncate(stdout.String(), maxOutputSize) + stderrStr := truncate(stderr.String(), maxOutputSize) + + fmt.Fprintf(os.Stdout, "[executor-agent] exit=%d stdout=%d stderr=%d time=%dms\n", + exitCode, len(stdoutStr), len(stderrStr), elapsed) + + writeJSON(w, http.StatusOK, ExecuteResponse{ + ExitCode: exitCode, + Stdout: stdoutStr, + Stderr: stderrStr, + ExecutionTimeMs: elapsed, + }) +} + +func handleHealth(w http.ResponseWriter, _ *http.Request) { + writeJSON(w, http.StatusOK, map[string]string{"status": "healthy"}) +} + +func handleReady(w http.ResponseWriter, _ *http.Request) { + writeJSON(w, http.StatusOK, map[string]string{"status": "ready"}) +} + +func writeJSON(w http.ResponseWriter, status int, data interface{}) { + w.Header().Set("Content-Type", "application/json") + w.WriteHeader(status) + json.NewEncoder(w).Encode(data) //nolint:errcheck +} + +func truncate(s string, maxLen int) string 
{ + if len(s) > maxLen { + return s[:maxLen] + } + return s +} + +func main() { + port := defaultPort + + // Parse --port flag from CLI args + for i := 1; i < len(os.Args)-1; i++ { + if os.Args[i] == "--port" { + if p, err := strconv.Atoi(os.Args[i+1]); err == nil { + port = p + } + } + } + + mux := http.NewServeMux() + mux.HandleFunc("/execute", handleExecute) + mux.HandleFunc("/health", handleHealth) + mux.HandleFunc("/ready", handleReady) + + server := &http.Server{ + Addr: fmt.Sprintf("127.0.0.1:%d", port), + Handler: mux, + ReadTimeout: 30 * time.Second, + WriteTimeout: 300 * time.Second, + } + + // Graceful shutdown on SIGTERM/SIGINT + go func() { + sigCh := make(chan os.Signal, 1) + signal.Notify(sigCh, syscall.SIGTERM, syscall.SIGINT) + <-sigCh + fmt.Fprintln(os.Stdout, "[executor-agent] Shutting down...") + ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) + defer cancel() + server.Shutdown(ctx) //nolint:errcheck + }() + + fmt.Fprintf(os.Stdout, "[executor-agent] Listening on 127.0.0.1:%d\n", port) + if err := server.ListenAndServe(); err != http.ErrServerClosed { + fmt.Fprintf(os.Stderr, "[executor-agent] Server error: %v\n", err) + os.Exit(1) + } +} diff --git a/docker/sidecar/main.py b/docker/sidecar/main.py index 3d4663e..684d8e2 100644 --- a/docker/sidecar/main.py +++ b/docker/sidecar/main.py @@ -2,10 +2,17 @@ """HTTP Sidecar for Kubernetes Pod Execution. This sidecar runs alongside the main language container and provides -an HTTP API for code execution. It uses nsenter to execute code in -the main container's mount namespace. +an HTTP API for code execution. It supports two execution modes: -Requires: shareProcessNamespace: true in the pod spec. +1. Agent mode (default): Forwards execution requests to an executor agent + HTTP server running inside the main container. No nsenter, no capabilities, + no privilege escalation needed. Compatible with GKE Sandbox (gVisor). + +2. 
nsenter mode (legacy): Uses nsenter to execute code in the main container's + mount namespace. Requires shareProcessNamespace, SYS_PTRACE, SYS_ADMIN, + SYS_CHROOT capabilities, and allowPrivilegeEscalation: true. + +The mode is controlled by the EXECUTION_MODE environment variable. """ import asyncio @@ -19,6 +26,7 @@ from pathlib import Path from typing import Optional +import httpx from fastapi import FastAPI, File, HTTPException, Response, UploadFile from fastapi.responses import FileResponse from pydantic import BaseModel, Field @@ -35,6 +43,11 @@ # Network isolation mode - when true, disables network-dependent features (e.g., Go module proxy) NETWORK_ISOLATED = os.getenv("NETWORK_ISOLATED", "false").lower() in ("true", "1", "yes") +# Execution mode: "agent" (default, no nsenter) or "nsenter" (legacy) +EXECUTION_MODE = os.getenv("EXECUTION_MODE", "agent") +# Executor port (used in agent mode for the executor agent HTTP server) +EXECUTOR_PORT = int(os.getenv("EXECUTOR_PORT", "9090")) + class ExecuteRequest(BaseModel): """Request to execute code.""" code: str @@ -218,85 +231,208 @@ def apply_network_isolation_overrides(env: dict[str, str], language: str) -> dic return env -def get_language_command( - language: str, code: str, working_dir: str, container_env: dict[str, str] -) -> tuple[list[str], Path | None]: - """Get the command to execute code for a given language. +def _write_code_file(language: str, code: str, working_dir: str) -> tuple[list[str], Path | None]: + """Write code to a temp file and return the bare command to execute it. + This is the core (DRY) logic shared by both execution modes. Returns (command_list, temp_file_path_or_none). - - Environment is always read from the container at runtime via /proc//environ. - This eliminates config drift between Dockerfiles and sidecar code. 
- - Two execution modes: - - Direct mode: Uses '/usr/bin/env -i' for single-command execution - - Shell mode: Uses 'sh -c' for multi-step (compile && run) commands - - Both modes use the runtime-detected environment from the container. """ - # Use container env, fall back to minimal defaults if not available - env = container_env if container_env else {"PATH": "/usr/local/bin:/usr/bin:/bin", "HOME": "/tmp"} - - # Single wrapper using /usr/bin/env -i with runtime-detected environment - def wrap(cmd_args: list[str]) -> list[str]: - env_args = [f"{k}={v}" for k, v in env.items()] - return ["/usr/bin/env", "-i"] + env_args + cmd_args - - # Helper for compiled languages needing shell for compile && run safe_wd = shlex.quote(working_dir) if language in ("python", "py"): code_file = Path(working_dir) / "code.py" code_file.write_text(code) - return wrap(["python", str(code_file)]), code_file + return ["python", str(code_file)], code_file elif language in ("javascript", "js"): code_file = Path(working_dir) / "code.js" code_file.write_text(code) - return wrap(["node", str(code_file)]), code_file + return ["node", str(code_file)], code_file elif language in ("typescript", "ts"): code_file = Path(working_dir) / "code.ts" code_file.write_text(code) - return wrap(["node", "/opt/scripts/ts-runner.js", str(code_file)]), code_file + return ["node", "/opt/scripts/ts-runner.js", str(code_file)], code_file elif language in ("go",): code_file = Path(working_dir) / "main.go" code_file.write_text(code) - return wrap(["go", "run", str(code_file)]), code_file + return ["go", "run", str(code_file)], code_file elif language in ("rust", "rs"): code_file = Path(working_dir) / "main.rs" code_file.write_text(code) - return wrap(["sh", "-c", f"cd {safe_wd} && rustc {code_file} -o /tmp/main && /tmp/main"]), code_file + return ["sh", "-c", f"cd {safe_wd} && rustc {code_file} -o /tmp/main && /tmp/main"], code_file elif language in ("java",): code_file = Path(working_dir) / "Code.java" 
code_file.write_text(code) - return wrap(["sh", "-c", f"cd {safe_wd} && javac {code_file} && java -cp {working_dir} Code"]), code_file + return ["sh", "-c", f"cd {safe_wd} && javac {code_file} && java -cp {working_dir} Code"], code_file elif language in ("c",): code_file = Path(working_dir) / "code.c" code_file.write_text(code) - return wrap(["sh", "-c", f"cd {safe_wd} && gcc {code_file} -o /tmp/code && /tmp/code"]), code_file + return ["sh", "-c", f"cd {safe_wd} && gcc {code_file} -o /tmp/code && /tmp/code"], code_file elif language in ("cpp",): code_file = Path(working_dir) / "code.cpp" code_file.write_text(code) - return wrap(["sh", "-c", f"cd {safe_wd} && g++ {code_file} -o /tmp/code && /tmp/code"]), code_file + return ["sh", "-c", f"cd {safe_wd} && g++ {code_file} -o /tmp/code && /tmp/code"], code_file elif language in ("php",): code_file = Path(working_dir) / "code.php" code_file.write_text(code) - return wrap(["php", str(code_file)]), code_file + return ["php", str(code_file)], code_file elif language in ("r",): code_file = Path(working_dir) / "code.r" code_file.write_text(code) - return wrap(["Rscript", str(code_file)]), code_file + return ["Rscript", str(code_file)], code_file elif language in ("fortran", "f90"): code_file = Path(working_dir) / "code.f90" code_file.write_text(code) - return wrap(["sh", "-c", f"cd {safe_wd} && gfortran {code_file} -o /tmp/code && /tmp/code"]), code_file + return ["sh", "-c", f"cd {safe_wd} && gfortran {code_file} -o /tmp/code && /tmp/code"], code_file elif language in ("d", "dlang"): code_file = Path(working_dir) / "code.d" code_file.write_text(code) - return wrap(["sh", "-c", f"cd {safe_wd} && ldc2 {code_file} -of=/tmp/code && /tmp/code"]), code_file + return ["sh", "-c", f"cd {safe_wd} && ldc2 {code_file} -of=/tmp/code && /tmp/code"], code_file else: return [], None +def get_language_command( + language: str, code: str, working_dir: str, container_env: dict[str, str] +) -> tuple[list[str], Path | None]: + """Get the 
command to execute code for a given language (nsenter mode). + + Wraps the bare command with `/usr/bin/env -i` and the container's environment + variables to ensure a clean, reproducible execution context. + + Returns (command_list, temp_file_path_or_none). + + Environment is always read from the container at runtime via /proc/<pid>/environ. + This eliminates config drift between Dockerfiles and sidecar code. + """ + cmd, temp_file = _write_code_file(language, code, working_dir) + if not cmd: + return [], None + + # Use container env, fall back to minimal defaults if not available + env = container_env if container_env else {"PATH": "/usr/local/bin:/usr/bin:/bin", "HOME": "/tmp"} + + # Wrap with /usr/bin/env -i for a clean environment + env_args = [f"{k}={v}" for k, v in env.items()] + return ["/usr/bin/env", "-i"] + env_args + cmd, temp_file + + +def get_language_command_bare( + language: str, code: str, working_dir: str, +) -> tuple[list[str], Path | None]: + """Get the bare command to execute code for a given language (agent mode). + + Used in agent mode where the executor agent already inherits the correct + environment from the container's ENTRYPOINT. No env -i wrapper needed. + + Returns (command_list, temp_file_path_or_none). + """ + return _write_code_file(language, code, working_dir) + + +def get_network_isolation_overrides(language: str) -> dict[str, str]: + """Get environment variable overrides for network-isolated execution. + + Returns a dict of env vars to override in the executor agent's subprocess. + """ + if not NETWORK_ISOLATED: + return {} + + overrides = {} + if language in ("go",): + overrides["GOPROXY"] = "off" + overrides["GOSUMDB"] = "off" + print("[EXECUTE] Network isolation: overriding GOPROXY=off, GOSUMDB=off", flush=True) + return overrides + + +async def execute_via_agent(request: ExecuteRequest) -> ExecuteResponse: + """Execute code via the executor agent running in the main container.
+ + The executor agent is a lightweight HTTP server that runs inside the main + container, receiving commands over localhost (shared pod network namespace). + No nsenter, capabilities, or privilege escalation needed. + """ + start_time = time.perf_counter() + + try: + # Write code to a temp file and get the bare command (no env -i wrapper) + cmd, temp_file = get_language_command_bare( + LANGUAGE, request.code, request.working_dir + ) + if not cmd: + return ExecuteResponse( + exit_code=1, + stdout="", + stderr=f"Unsupported language: {LANGUAGE}", + execution_time_ms=0, + ) + except Exception as e: + return ExecuteResponse( + exit_code=1, + stdout="", + stderr=f"Failed to prepare execution: {str(e)}\n{traceback.format_exc()}", + execution_time_ms=int((time.perf_counter() - start_time) * 1000), + ) + + # Build env overrides for network isolation + env_overrides = get_network_isolation_overrides(LANGUAGE) + + print(f"[EXECUTE] agent mode, cmd={cmd}, timeout={request.timeout}s", flush=True) + + try: + async with httpx.AsyncClient() as client: + resp = await client.post( + f"http://127.0.0.1:{EXECUTOR_PORT}/execute", + json={ + "command": cmd, + "timeout": request.timeout, + "working_dir": request.working_dir, + "env": env_overrides if env_overrides else None, + }, + timeout=request.timeout + 10, # Extra margin for HTTP overhead + ) + + if resp.status_code != 200: + return ExecuteResponse( + exit_code=1, + stdout="", + stderr=f"Executor agent returned HTTP {resp.status_code}: {resp.text}", + execution_time_ms=int((time.perf_counter() - start_time) * 1000), + ) + + data = resp.json() + return ExecuteResponse( + exit_code=data.get("exit_code", 1), + stdout=data.get("stdout", ""), + stderr=data.get("stderr", ""), + execution_time_ms=data.get("execution_time_ms", 0), + ) + + except httpx.TimeoutException: + return ExecuteResponse( + exit_code=124, + stdout="", + stderr=f"Execution timed out after {request.timeout} seconds", + execution_time_ms=int((time.perf_counter() - 
start_time) * 1000), + ) + except httpx.ConnectError: + return ExecuteResponse( + exit_code=1, + stdout="", + stderr=f"Cannot connect to executor agent at 127.0.0.1:{EXECUTOR_PORT}. " + f"Ensure the main container is running the executor agent.", + execution_time_ms=int((time.perf_counter() - start_time) * 1000), + ) + except Exception as e: + print(f"[EXECUTE] AGENT EXCEPTION: {type(e).__name__}: {e}", flush=True) + return ExecuteResponse( + exit_code=1, + stdout="", + stderr=f"Agent execution error: {str(e)}\n{traceback.format_exc()}", + execution_time_ms=int((time.perf_counter() - start_time) * 1000), + ) + + async def execute_via_nsenter(request: ExecuteRequest) -> ExecuteResponse: """Execute code in the main container using nsenter. @@ -474,8 +610,11 @@ async def execute_via_subprocess_direct(request: ExecuteRequest) -> ExecuteRespo @app.post("/execute", response_model=ExecuteResponse) async def execute_code(request: ExecuteRequest) -> ExecuteResponse: - """Execute code and return results via nsenter.""" - return await execute_via_nsenter(request) + """Execute code using the configured execution mode (agent or nsenter).""" + if EXECUTION_MODE == "agent": + return await execute_via_agent(request) + else: + return await execute_via_nsenter(request) @app.post("/files") @@ -586,10 +725,25 @@ async def readiness_check(): if not os.path.isdir(WORKING_DIR): raise HTTPException(status_code=503, detail="Working directory not ready") - # Check if we can find the main container - main_pid = find_main_container_pid() - if not main_pid: - raise HTTPException(status_code=503, detail="Main container not found") + if EXECUTION_MODE == "agent": + # In agent mode, check if the executor agent is reachable + try: + async with httpx.AsyncClient() as client: + resp = await client.get( + f"http://127.0.0.1:{EXECUTOR_PORT}/health", + timeout=2, + ) + if resp.status_code != 200: + raise HTTPException(status_code=503, detail="Executor agent not healthy") + except httpx.ConnectError: + 
raise HTTPException(status_code=503, detail="Executor agent not reachable") + except Exception: + raise HTTPException(status_code=503, detail="Executor agent health check failed") + else: + # In nsenter mode, check if we can find the main container + main_pid = find_main_container_pid() + if not main_pid: + raise HTTPException(status_code=503, detail="Main container not found") return {"status": "ready"} diff --git a/docs/ARCHITECTURE.md b/docs/ARCHITECTURE.md index fc0dcd3..f0a81b4 100644 --- a/docs/ARCHITECTURE.md +++ b/docs/ARCHITECTURE.md @@ -48,20 +48,105 @@ The warm pool approach achieves ~85% reduction in P99 latency compared to cold-s ## Pod Design: Two-Container Sidecar Pattern -Each execution pod contains two containers that share process namespaces, enabling the sidecar to execute code using the main container's runtime environment. +Each execution pod contains two containers that communicate over the shared pod network (`localhost`). KubeCodeRun supports two execution modes controlled by `K8S_EXECUTION_MODE`. -### 1. Main Container (Language Runtime) +### Execution Modes + +#### Agent Mode (Default) — `K8S_EXECUTION_MODE=agent` + +In agent mode, a statically compiled Go binary (**executor agent**) runs inside the main (language) container. The sidecar forwards execution requests to the agent over `localhost:9090`. 
+ +``` +┌─────────────────────────────────────────────────────────────┐ +│ Execution Pod │ +│ shareProcessNamespace: false (not needed) │ +│ │ +│ ┌────────────────┐ │ +│ │ Init Container │ Copies /opt/executor-agent │ +│ │ (agent-init) │ → /mnt/data/.executor-agent │ +│ └────────┬───────┘ │ +│ │ │ +│ ┌────────▼────────────┐ ┌─────────────────────────────┐ │ +│ │ Main Container │ │ Sidecar Container │ │ +│ │ │ │ │ │ +│ │ • Language runtime │◄───│ • Receives HTTP request │ │ +│ │ • Executor agent │ │ • Forwards to agent via │ │ +│ │ on 127.0.0.1:9090│ │ POST localhost:9090 │ │ +│ │ • Spawns code │ │ • Returns stdout/stderr │ │ +│ │ subprocesses │ │ │ │ +│ └─────────────────────┘ └─────────────────────────────┘ │ +│ │ │ │ +│ └────────────────────────────┘ │ +│ Shared /mnt/data volume │ +└─────────────────────────────────────────────────────────────┘ +``` + +**Security properties:** +- No `shareProcessNamespace` — containers cannot see each other's processes +- No capabilities — all capabilities dropped for all containers +- `allowPrivilegeEscalation: false` — no binary can gain elevated privileges +- Compatible with **GKE Sandbox (gVisor)** and restricted Pod Security Standards +- Communication via `localhost` only (pod-internal, not network-accessible) + +**Container images:** +- **Sidecar:** Built with `--target sidecar-agent` from `docker/sidecar/Dockerfile` +- **Image name:** `kubecoderun-sidecar-agent` (contains executor-agent binary + Python sidecar) + +#### nsenter Mode (Legacy) — `K8S_EXECUTION_MODE=nsenter` + +In nsenter mode, the sidecar uses Linux `nsenter` to execute code in the main container's mount namespace. This requires elevated privileges and is preserved for backward compatibility. 
+ +``` +┌─────────────────────────────────────────────────────────────┐ +│ Execution Pod │ +│ shareProcessNamespace: true │ +│ │ +│ ┌─────────────────────┐ ┌─────────────────────────────┐ │ +│ │ Main Container │ │ Sidecar Container │ │ +│ │ │ │ │ │ +│ │ • Python/Node/Go │◄───│ • Receives HTTP request │ │ +│ │ • sleep infinity │ │ • Writes code to /mnt/data │ │ +│ │ • PID 1 visible │ │ • nsenter -m -t │ │ +│ │ to sidecar │ │ --wdns=/mnt/data sh │ │ +│ │ │ │ • Returns stdout/stderr │ │ +│ └─────────────────────┘ └─────────────────────────────┘ │ +│ │ │ │ +│ └────────────────────────────┘ │ +│ Shared /mnt/data volume │ +└─────────────────────────────────────────────────────────────┘ +``` + +**Required pod settings:** +- `shareProcessNamespace: true` +- Sidecar capabilities: `SYS_PTRACE`, `SYS_ADMIN`, `SYS_CHROOT` +- `allowPrivilegeEscalation: true` (for file capabilities on nsenter binary) +- **Not compatible** with GKE Sandbox (gVisor) + +**Container images:** +- **Sidecar:** Built with `--target sidecar-nsenter` from `docker/sidecar/Dockerfile` +- **Image name:** `kubecoderun-sidecar-nsenter` (contains nsenter with setcap + Python sidecar) + +### Container Details + +#### 1. Main Container (Language Runtime) - Runs the language runtime (Python, Node.js, Go, etc.) - Provides the execution environment (compilers, interpreters, libraries) - Shares `/mnt/data` volume with sidecar -- Runs a sleep loop to keep the container alive +- **Agent mode:** Runs the executor agent binary (copied by init container) +- **nsenter mode:** Runs `sleep infinity` to keep the container alive -### 2. HTTP Sidecar (Executor) +#### 2. 
HTTP Sidecar (Executor) - Lightweight FastAPI server (~50MB) - Exposes REST API for code execution -- Uses `nsenter` to execute code in the main container's namespace +- **Agent mode:** Forwards requests to the executor agent via HTTP on `localhost` +- **nsenter mode:** Uses `nsenter` to execute code in the main container's namespace - Handles file transfers and state management +#### 3. Init Container (Agent Mode Only) +- Uses the sidecar-agent image +- Copies `/opt/executor-agent` binary to `/mnt/data/.executor-agent` +- Runs once at pod startup, then exits + **Sidecar API Endpoints:** ``` POST /execute - Execute code with optional state @@ -71,9 +156,9 @@ GET /files/{name} - Download file content GET /health - Health check ``` -### Namespace Sharing with nsenter +### Namespace Sharing with nsenter (Legacy Mode) -The pod uses `shareProcessNamespace: true`, allowing containers to see each other's processes. The sidecar uses Linux `nsenter` to execute code in the main container's mount namespace: +In nsenter mode, the pod uses `shareProcessNamespace: true`, allowing containers to see each other's processes. The sidecar uses Linux `nsenter` to execute code in the main container's mount namespace: ``` ┌─────────────────────────────────────────────────────────────┐ @@ -100,18 +185,18 @@ The pod uses `shareProcessNamespace: true`, allowing containers to see each othe 3. Sets the working directory to `/mnt/data` so relative paths write to the shared volume 4. Captures stdout/stderr and returns via HTTP -**nsenter Privilege Model:** +**nsenter Privilege Model (nsenter mode only):** The sidecar runs as non-root (UID 65532) but requires Linux capabilities to use `nsenter`. 
Since capabilities for non-root users only populate the *bounding set* (not effective/permitted), we use **file capabilities** via `setcap` on the nsenter binary: ```dockerfile -# In sidecar Dockerfile +# In sidecar Dockerfile (sidecar-nsenter target only) RUN setcap 'cap_sys_ptrace,cap_sys_admin,cap_sys_chroot+eip' /usr/bin/nsenter ``` This allows the non-root user to gain the required capabilities when executing nsenter, without running as root. The pod spec still requires `allowPrivilegeEscalation: true` for file capabilities to be honored. See [SECURITY.md](SECURITY.md) for full details. -**Per-Language Environment Setup:** +**Per-Language Environment Setup (nsenter mode only):** Since `nsenter -m` only enters the mount namespace (not the environment), the sidecar explicitly sets up PATH and environment variables for each language: @@ -186,7 +271,8 @@ Since `nsenter -m` only enters the mount namespace (not the environment), the si ▼ 5. HTTP Sidecar ├── POST /execute - ├── Run code in main container + ├── Agent mode: Forward to executor agent → subprocess in main container + ├── nsenter mode: nsenter into main container's mount namespace → subprocess └── Return stdout/stderr/files │ ▼ @@ -253,7 +339,7 @@ POD_POOL_EXHAUSTION_TRIGGER=true # Trigger immediate replenishment when exhaus ```python K8S_NAMESPACE=kubecoderun -K8S_SIDECAR_IMAGE=aronmuon/kubecoderun-sidecar:latest +K8S_SIDECAR_IMAGE=aronmuon/kubecoderun-sidecar-agent:latest K8S_IMAGE_REGISTRY=aronmuon/kubecoderun K8S_IMAGE_TAG=latest K8S_CPU_LIMIT=1 diff --git a/docs/CONFIGURATION.md b/docs/CONFIGURATION.md index 30049f5..ee57b72 100644 --- a/docs/CONFIGURATION.md +++ b/docs/CONFIGURATION.md @@ -135,18 +135,23 @@ Manages API key authentication and security. ### Redis Configuration -Redis is used for session management and caching. 
- -| Variable | Default | Description | -| ------------------------------ | ----------- | -------------------------------------------------- | -| `REDIS_HOST` | `localhost` | Redis server hostname | -| `REDIS_PORT` | `6379` | Redis server port | -| `REDIS_PASSWORD` | - | Redis password (if required) | -| `REDIS_DB` | `0` | Redis database number | -| `REDIS_URL` | - | Complete Redis URL (overrides individual settings) | -| `REDIS_MAX_CONNECTIONS` | `20` | Maximum connections in pool | -| `REDIS_SOCKET_TIMEOUT` | `5` | Socket timeout (seconds) | -| `REDIS_SOCKET_CONNECT_TIMEOUT` | `5` | Connection timeout (seconds) | +Redis is used for session management and caching. Three deployment modes are supported: +**standalone** (default), **cluster**, and **sentinel** — all with optional TLS/SSL. + +#### Connection Settings + +| Variable | Default | Description | +| ------------------------------ | ------------- | -------------------------------------------------------- | +| `REDIS_MODE` | `standalone` | Deployment mode: `standalone`, `cluster`, or `sentinel` | +| `REDIS_HOST` | `localhost` | Redis server hostname | +| `REDIS_PORT` | `6379` | Redis server port | +| `REDIS_PASSWORD` | - | Redis password (if required) | +| `REDIS_DB` | `0` | Redis database number (standalone/sentinel only) | +| `REDIS_URL` | - | Complete Redis URL (overrides individual settings) | +| `REDIS_MAX_CONNECTIONS` | `20` | Maximum connections in pool | +| `REDIS_SOCKET_TIMEOUT` | `5` | Socket timeout (seconds) | +| `REDIS_SOCKET_CONNECT_TIMEOUT` | `5` | Connection timeout (seconds) | +| `REDIS_KEY_PREFIX` | - | Optional prefix prepended to every Redis key (e.g. `prod:`) | **Example Redis URL:** @@ -154,6 +159,83 @@ Redis is used for session management and caching. REDIS_URL=redis://password@localhost:6379/0 ``` +#### Redis Cluster Mode + +Use `REDIS_MODE=cluster` when running against a Redis Cluster deployment (e.g. GCP Memorystore Cluster, AWS ElastiCache Cluster Mode). 
+ +| Variable | Default | Description | +| ---------------------- | ------- | --------------------------------------------------------------------------- | +| `REDIS_CLUSTER_NODES` | - | Comma-separated `host:port` pairs for cluster startup nodes | + +> **Note:** `REDIS_DB` is ignored in cluster mode (Redis Cluster only supports database 0). + +**Example:** + +```bash +REDIS_MODE=cluster +REDIS_CLUSTER_NODES=node1:6379,node2:6379,node3:6379 +REDIS_PASSWORD=your-cluster-password +``` + +#### Redis Sentinel Mode + +Use `REDIS_MODE=sentinel` for high-availability setups with Redis Sentinel. + +| Variable | Default | Description | +| -------------------------- | ---------- | ------------------------------------------------------------ | +| `REDIS_SENTINEL_NODES` | - | Comma-separated `host:port` pairs for Sentinel instances | +| `REDIS_SENTINEL_MASTER` | `mymaster` | Name of the Sentinel-monitored master | +| `REDIS_SENTINEL_PASSWORD` | - | Password for authenticating to Sentinel instances | + +**Example:** + +```bash +REDIS_MODE=sentinel +REDIS_SENTINEL_NODES=sentinel1:26379,sentinel2:26379,sentinel3:26379 +REDIS_SENTINEL_MASTER=mymaster +REDIS_PASSWORD=your-redis-password +REDIS_SENTINEL_PASSWORD=your-sentinel-password +``` + +#### Redis TLS/SSL + +Enable TLS for encrypted connections. Required by most managed Redis services (GCP Memorystore, AWS ElastiCache, Azure Cache for Redis). 
+ +| Variable | Default | Description | +| ------------------------------ | ------- | ---------------------------------------------------------------- | +| `REDIS_TLS_ENABLED` | `false` | Enable TLS/SSL for Redis connections | +| `REDIS_TLS_CA_CERT_FILE` | - | Path to CA certificate for verifying the server | +| `REDIS_TLS_CERT_FILE` | - | Path to client TLS certificate (mutual TLS) | +| `REDIS_TLS_KEY_FILE` | - | Path to client TLS private key (mutual TLS) | +| `REDIS_TLS_INSECURE` | `false` | Skip TLS certificate verification (NOT recommended) | +| `REDIS_TLS_CHECK_HOSTNAME` | `false` | Verify server hostname against certificate CN/SAN | + +> When `REDIS_TLS_ENABLED=true` the generated URL uses the `rediss://` scheme automatically. +> +> **Security note:** `REDIS_TLS_CHECK_HOSTNAME` is `false` by default because managed Redis services +> (GCP Memorystore, AWS ElastiCache) and Redis Cluster node discovery expose IP addresses +> that do not match certificate CN/SAN entries. The CA certificate chain is still fully +> validated. For environments where Redis hostnames match their certificates, set +> `REDIS_TLS_CHECK_HOSTNAME=true` for stronger TLS authentication. + +**Example — GCP Memorystore with TLS:** + +```bash +REDIS_HOST=10.0.0.3 +REDIS_PORT=6378 +REDIS_TLS_ENABLED=true +REDIS_TLS_CA_CERT_FILE=/etc/ssl/redis/server-ca.pem +``` + +**Example — GCP Memorystore Cluster:** + +```bash +REDIS_MODE=cluster +REDIS_CLUSTER_NODES=10.0.0.3:6379,10.0.0.4:6379,10.0.0.5:6379 +REDIS_TLS_ENABLED=true +REDIS_TLS_CA_CERT_FILE=/etc/ssl/redis/server-ca.pem +``` + ### MinIO/S3 Configuration MinIO provides S3-compatible object storage for files. @@ -175,22 +257,128 @@ Kubernetes is used for secure code execution in isolated pods. 
| Variable | Default | Description |
| ---------------------- | -------------------------------------------- | ---------------------------------------- |
| `K8S_NAMESPACE` | `""` (uses API's namespace) | Namespace for execution pods |
-| `K8S_SIDECAR_IMAGE` | `aronmuon/kubecoderun-sidecar:latest` | HTTP sidecar image for pod communication |
+| `K8S_SIDECAR_IMAGE` | `aronmuon/kubecoderun-sidecar-agent:latest` | HTTP sidecar image for pod communication |
| `K8S_IMAGE_REGISTRY` | `aronmuon/kubecoderun` | Registry prefix for language images |
| `K8S_IMAGE_TAG` | `latest` | Image tag for language images |
| `K8S_CPU_LIMIT` | `1` | CPU limit per execution pod |
| `K8S_MEMORY_LIMIT` | `512Mi` | Memory limit per execution pod |
| `K8S_CPU_REQUEST` | `100m` | CPU request per execution pod |
| `K8S_MEMORY_REQUEST` | `128Mi` | Memory request per execution pod |
+| `K8S_EXECUTION_MODE` | `agent` | Execution mode: `agent` (default) or `nsenter` |
+| `K8S_EXECUTOR_PORT` | `9090` | Port for the executor HTTP server inside the main container |
+| `K8S_IMAGE_PULL_POLICY`| `Always` | Image pull policy for execution pods (`Always`, `IfNotPresent`, `Never`) |
+| `K8S_IMAGE_PULL_SECRETS`| `""` | Comma-separated list of Kubernetes secret names for pulling images from private registries |
+
+**Image Pull Secrets:**
+
+When using private container registries, create Kubernetes secrets in the execution namespace and reference them via `K8S_IMAGE_PULL_SECRETS`:
+
+```bash
+# Create the secret
+kubectl create secret docker-registry my-registry-secret \
+  --docker-server=ghcr.io \
+  --docker-username=<username> \
+  --docker-password=<password> \
+  -n <namespace>
+
+# Configure the API
+K8S_IMAGE_PULL_SECRETS=my-registry-secret
+# Multiple secrets: K8S_IMAGE_PULL_SECRETS=secret1,secret2
+```
+
+The secrets are applied to all dynamically created execution pods (both warm pool pods and on-demand Job pods).
+
+**Execution Modes:**
+
+- **`agent` (default):** A lightweight Go HTTP server runs inside the main container. 
The sidecar forwards execution requests via localhost. No `nsenter`, no capabilities, no privilege escalation. Compatible with GKE Sandbox (gVisor) and restricted Pod Security Standards. +- **`nsenter` (legacy):** The sidecar uses `nsenter` to enter the main container's mount namespace. Requires `shareProcessNamespace`, `SYS_PTRACE`/`SYS_ADMIN`/`SYS_CHROOT` capabilities, and `allowPrivilegeEscalation: true`. Use only on clusters that allow privilege escalation. **Security Notes:** - Both containers run with `runAsNonRoot: true` and `runAsUser: 65532` -- The sidecar uses file capabilities (`setcap`) on the `nsenter` binary to allow non-root users to enter namespaces -- Required pod capabilities (SYS_PTRACE, SYS_ADMIN, SYS_CHROOT) must be in the bounding set with `allowPrivilegeEscalation: true` +- In agent mode: all capabilities are dropped, `allowPrivilegeEscalation: false` for all containers +- In nsenter mode: the sidecar uses file capabilities (`setcap`) on the `nsenter` binary to allow non-root namespace entry - Network policies deny all egress by default - Pods are destroyed immediately after execution -- See [SECURITY.md](SECURITY.md) for detailed explanation of the nsenter privilege model +- See [SECURITY.md](SECURITY.md) for detailed explanation of the security model + +#### Sidecar Container Images + +The sidecar Dockerfile produces two distinct images via Docker build targets. 
Use the image that matches your configured `K8S_EXECUTION_MODE`: + +| Build Target | Image Name | Execution Mode | Description | +|-------------|------------|---------------|-------------| +| `sidecar-agent` (default) | `kubecoderun-sidecar-agent` | `agent` | Contains executor-agent binary; no nsenter, no capabilities | +| `sidecar-nsenter` | `kubecoderun-sidecar-nsenter` | `nsenter` | Contains nsenter with file capabilities (setcap) | + +**Building the images:** + +```bash +# Agent mode sidecar (default, recommended): +docker build --target sidecar-agent \ + -t kubecoderun-sidecar-agent:latest \ + -f docker/sidecar/Dockerfile docker/sidecar/ + +# nsenter mode sidecar (legacy): +docker build --target sidecar-nsenter \ + -t kubecoderun-sidecar-nsenter:latest \ + -f docker/sidecar/Dockerfile docker/sidecar/ + +# Or use the build script (builds both automatically): +./scripts/build-images.sh sidecar-agent # agent mode sidecar +./scripts/build-images.sh sidecar-nsenter # nsenter mode sidecar +./scripts/build-images.sh # all images (both sidecars) +``` + +**Helm chart configuration:** + +Update `values.yaml` to use the correct sidecar image for your execution mode: + +```yaml +execution: + executionMode: "agent" # or "nsenter" + sidecar: + # For agent mode (default): + repository: ghcr.io/your-org/kubecoderun-sidecar-agent + # For nsenter mode: + # repository: ghcr.io/your-org/kubecoderun-sidecar-nsenter +``` + +### GKE Sandbox (gVisor) Configuration + +[GKE Sandbox](https://docs.cloud.google.com/kubernetes-engine/docs/concepts/sandbox-pods) provides kernel-level isolation using gVisor to protect the host kernel from untrusted code. It is **only compatible with agent execution mode**. 
+ +| Variable | Default | Description | +| ----------------------------------- | --------- | -------------------------------------------------- | +| `GKE_SANDBOX_ENABLED` | `false` | Enable GKE Sandbox (gVisor) for execution pods | +| `GKE_SANDBOX_RUNTIME_CLASS` | `gvisor` | RuntimeClass name for sandboxed pods | +| `GKE_SANDBOX_NODE_SELECTOR` | `{}` | JSON node selector for sandbox nodes | +| `GKE_SANDBOX_CUSTOM_TOLERATIONS` | `[]` | JSON array of custom tolerations for sandbox nodes | + +**Requirements:** + +- `K8S_EXECUTION_MODE=agent` (nsenter is **incompatible** with gVisor) +- GKE cluster with a sandbox-enabled node pool (`--sandbox type=gvisor`) +- At least two node pools — one with GKE Sandbox enabled, one without +- Container-Optimized OS with containerd (`cos_containerd`) node image + +**Example configuration:** + +```bash +K8S_EXECUTION_MODE=agent +GKE_SANDBOX_ENABLED=true +GKE_SANDBOX_RUNTIME_CLASS=gvisor +# Schedule on specific sandbox node pool: +GKE_SANDBOX_NODE_SELECTOR={"pool":"sandbox"} +GKE_SANDBOX_CUSTOM_TOLERATIONS=[{"key":"pool","value":"sandbox","operator":"Equal","effect":"NoSchedule"}] +``` + +**Key limitations of GKE Sandbox** (see [GKE docs](https://docs.cloud.google.com/kubernetes-engine/docs/concepts/sandbox-pods#limitations)): + +- Incompatible with `nsenter` execution mode, privileged containers, and `shareProcessNamespace` (all avoided in agent mode) +- Seccomp, AppArmor, and SELinux not applicable inside the sandbox +- HostPath volumes and port-forwarding not supported +- Container-level memory metrics not available (pod-level metrics are) ### Resource Limits @@ -395,6 +583,10 @@ if validate_configuration(): - [ ] Deploy Kubernetes NetworkPolicy to deny egress - [ ] Configure pod security context (non-root user) - [ ] Review and adjust resource limits +- [ ] Choose execution mode (`K8S_EXECUTION_MODE=agent` recommended) +- [ ] Ensure sidecar image matches execution mode (`sidecar-agent` for agent, `sidecar-nsenter` for nsenter) +- [ 
] Configure `K8S_IMAGE_PULL_SECRETS` if using private registries +- [ ] Enable GKE Sandbox for additional kernel isolation if running on GKE (`GKE_SANDBOX_ENABLED=true`) ### Performance diff --git a/docs/SECURITY.md b/docs/SECURITY.md index 0415712..716666f 100644 --- a/docs/SECURITY.md +++ b/docs/SECURITY.md @@ -113,7 +113,75 @@ Code is analyzed for potentially dangerous patterns: - **Security context**: Pods run as non-root (`runAsUser: 65532`) - **Ephemeral execution**: Pods destroyed immediately after execution -#### Namespace Sharing Security (nsenter) +#### Execution Modes + +KubeCodeRun supports two execution modes, controlled by the `K8S_EXECUTION_MODE` environment variable: + +##### Agent Mode (Default) — `K8S_EXECUTION_MODE=agent` + +In agent mode, a lightweight Go HTTP server (the **executor agent**) runs inside the main language container. The sidecar forwards execution requests to it over `localhost` (pod-internal network). This eliminates the need for `nsenter`, Linux capabilities, privilege escalation, and `shareProcessNamespace`. + +**How it works:** + +1. An **init container** (using the `sidecar-agent` image) copies the executor agent binary from `/opt/executor-agent` to the shared volume at `/mnt/data/.executor-agent` +2. The main container's CMD is overridden to run `/mnt/data/.executor-agent` instead of `sleep infinity` +3. The executor agent starts an HTTP server on `127.0.0.1:9090` (configurable via `K8S_EXECUTOR_PORT`) +4. The sidecar sends execution requests to the agent via HTTP POST to `/execute` +5. 
The agent spawns subprocesses (e.g., `python code.py`) inheriting the container's sanitized environment
+
+**Pod Settings (agent mode):**
+```yaml
+spec:
+  # No shareProcessNamespace needed
+  initContainers:
+  - name: agent-init
+    image: <sidecar-agent-image>
+    command: ["python", "-c", "import shutil,os; shutil.copy2('/opt/executor-agent','/mnt/data/.executor-agent'); os.chmod('/mnt/data/.executor-agent',0o755)"]
+    securityContext:
+      runAsUser: 65532
+      runAsNonRoot: true
+      allowPrivilegeEscalation: false
+      capabilities:
+        drop: ["ALL"]
+  containers:
+  - name: main
+    args: ["/mnt/data/.executor-agent"]  # Runs via existing ENTRYPOINT
+    securityContext:
+      runAsUser: 65532
+      runAsNonRoot: true
+      allowPrivilegeEscalation: false
+      capabilities:
+        drop: ["ALL"]
+  - name: sidecar
+    env:
+    - name: EXECUTION_MODE
+      value: "agent"
+    - name: EXECUTOR_PORT
+      value: "9090"
+    securityContext:
+      runAsUser: 65532
+      runAsNonRoot: true
+      allowPrivilegeEscalation: false
+      capabilities:
+        drop: ["ALL"]
+```
+
+**Security advantages of agent mode:**
+
+| Feature | Benefit |
+|---------|---------|
+| No `shareProcessNamespace` | Containers cannot see each other's processes |
+| No capabilities | All capabilities dropped for all containers |
+| No `allowPrivilegeEscalation` | No binary can gain elevated privileges |
+| No `nsenter` | No namespace entry, no mount namespace sharing |
+| GKE Sandbox (gVisor) compatible | Works with the most restrictive Pod Security Standards |
+| Communication via localhost | Pod-internal only, not network-accessible |
+
+##### nsenter Mode (Legacy) — `K8S_EXECUTION_MODE=nsenter`
+
+In nsenter mode, the sidecar uses Linux `nsenter` to execute code in the main container's mount namespace. This requires elevated privileges and is preserved for backward compatibility with clusters that allow privilege escalation.
+
+**Namespace Sharing Security (nsenter)**

The sidecar container uses Linux `nsenter` to execute code in the main container's mount namespace. 
This requires specific pod and image configuration. @@ -267,6 +335,58 @@ execution: 3. **No Inter-Pod Communication**: NetworkPolicy denies all ingress from other pods. +### GKE Sandbox (gVisor) Support + +For clusters requiring additional kernel-level isolation, KubeCodeRun supports [GKE Sandbox](https://docs.cloud.google.com/kubernetes-engine/docs/concepts/sandbox-pods), which uses [gVisor](https://gvisor.dev/) to intercept system calls before they reach the host kernel. + +**GKE Sandbox requires agent mode** (`K8S_EXECUTION_MODE=agent`). nsenter mode is incompatible with gVisor because: +- gVisor does not support `shareProcessNamespace` the same way as a standard Linux kernel +- `nsenter` relies on host kernel namespace operations that gVisor intentionally intercepts +- Agent mode eliminates the need for `SYS_PTRACE`, `SYS_ADMIN`, and `SYS_CHROOT` capabilities, which are restricted in sandboxed pods + +#### Configuration + +```yaml +# In helm values.yaml +execution: + executionMode: "agent" # Required for GKE Sandbox + + gkeSandbox: + enabled: true + runtimeClassName: "gvisor" + nodeSelector: + sandbox.gke.io/runtime: gvisor + customTolerations: + - key: sandbox.gke.io/runtime + operator: Equal + value: gvisor + effect: NoSchedule +``` + +#### Security Benefits + +| Feature | Without GKE Sandbox | With GKE Sandbox | +|---------|-------------------|-----------------| +| System call isolation | Seccomp profile only | gVisor userspace kernel intercepts all syscalls | +| Kernel exposure | Container shares host kernel | gVisor provides an independent kernel API | +| Escape risk | Kernel vulnerability could escape | Two boundaries: gVisor + container | +| Side-channel attacks | Possible via shared kernel | Mitigated by kernel-level isolation | + +#### Requirements + +- GKE cluster with at least two node pools (one standard, one sandbox-enabled) +- Sandbox node pool with `--sandbox type=gvisor` +- Agent execution mode (`executionMode: "agent"`) +- Sidecar image built 
with `--target sidecar-agent` (default) + +#### Limitations (from GKE documentation) + +- No `hostPath` storage +- No privileged containers +- Seccomp, AppArmor, SELinux are not supported (gVisor provides its own isolation) +- Container-level memory metrics are not available (pod-level metrics work) +- See [GKE Sandbox limitations](https://docs.cloud.google.com/kubernetes-engine/docs/concepts/sandbox-pods#limitations) for the full list + ### State Persistence Security Python state persistence introduces additional security considerations: @@ -280,7 +400,7 @@ Python state persistence introduces additional security considerations: #### Storage Security -- **Redis encryption**: Consider enabling Redis TLS in production for encrypted state storage +- **Redis encryption**: Enable Redis TLS in production for encrypted state storage (`REDIS_TLS_ENABLED=true`). Required for managed services like GCP Memorystore, AWS ElastiCache, and Azure Cache for Redis. See the [Configuration Guide](CONFIGURATION.md#redis-tlsssl) for details. 
- **MinIO encryption**: Enable server-side encryption for archived states - **TTL-based cleanup**: States automatically expire (2 hours in Redis, 7 days in MinIO archives) - **Size limits**: `STATE_MAX_SIZE_MB` prevents denial-of-service via large states diff --git a/helm-deployments/kubecoderun/templates/_helpers.tpl b/helm-deployments/kubecoderun/templates/_helpers.tpl index 6e6d32d..91d06d9 100644 --- a/helm-deployments/kubecoderun/templates/_helpers.tpl +++ b/helm-deployments/kubecoderun/templates/_helpers.tpl @@ -78,19 +78,21 @@ Execution namespace {{- end }} {{/* -Redis URL +Redis URL — honours TLS setting to switch between redis:// and rediss:// */}} {{- define "kubecoderun.redisUrl" -}} {{- if .Values.redis.url }} {{- .Values.redis.url }} {{- else if .Values.redis.host }} +{{- $scheme := ternary "rediss" "redis" .Values.redis.tls.enabled }} {{- if .Values.redis.password }} -{{- printf "redis://:%s@%s:%d/%d" .Values.redis.password .Values.redis.host (int .Values.redis.port) (int .Values.redis.db) }} +{{- printf "%s://:%s@%s:%d/%d" $scheme .Values.redis.password .Values.redis.host (int .Values.redis.port) (int .Values.redis.db) }} {{- else }} -{{- printf "redis://%s:%d/%d" .Values.redis.host (int .Values.redis.port) (int .Values.redis.db) }} +{{- printf "%s://%s:%d/%d" $scheme .Values.redis.host (int .Values.redis.port) (int .Values.redis.db) }} {{- end }} {{- else }} -{{- "redis://redis:6379/0" }} +{{- $scheme := ternary "rediss" "redis" .Values.redis.tls.enabled }} +{{- printf "%s://redis:6379/0" $scheme }} {{- end }} {{- end }} diff --git a/helm-deployments/kubecoderun/templates/configmap.yaml b/helm-deployments/kubecoderun/templates/configmap.yaml index a20df80..3c0fd29 100644 --- a/helm-deployments/kubecoderun/templates/configmap.yaml +++ b/helm-deployments/kubecoderun/templates/configmap.yaml @@ -36,15 +36,32 @@ data: K8S_IMAGE_REGISTRY: {{ .Values.execution.imageRegistry | quote }} K8S_IMAGE_TAG: {{ $imageTag | quote }} K8S_IMAGE_PULL_POLICY: {{ 
.Values.execution.imagePullPolicy | quote }} + {{- if .Values.execution.imagePullSecrets }} + K8S_IMAGE_PULL_SECRETS: {{ join "," (pluck "name" .Values.execution.imagePullSecrets) | quote }} + {{- else }} + K8S_IMAGE_PULL_SECRETS: "" + {{- end }} K8S_CPU_LIMIT: {{ .Values.execution.resources.limits.cpu | quote }} K8S_MEMORY_LIMIT: {{ .Values.execution.resources.limits.memory | quote }} K8S_CPU_REQUEST: {{ .Values.execution.resources.requests.cpu | quote }} K8S_MEMORY_REQUEST: {{ .Values.execution.resources.requests.memory | quote }} K8S_RUN_AS_USER: {{ .Values.execution.securityContext.runAsUser | quote }} + K8S_EXECUTION_MODE: {{ .Values.execution.executionMode | quote }} + K8S_EXECUTOR_PORT: {{ .Values.execution.executorPort | quote }} K8S_SECCOMP_PROFILE_TYPE: {{ .Values.execution.securityContext.seccompProfile.type | quote }} K8S_JOB_TTL_SECONDS: {{ .Values.execution.jobs.ttlSecondsAfterFinished | quote }} K8S_JOB_DEADLINE_SECONDS: {{ .Values.execution.jobs.activeDeadlineSeconds | quote }} + # GKE Sandbox Configuration + GKE_SANDBOX_ENABLED: {{ .Values.execution.gkeSandbox.enabled | quote }} + GKE_SANDBOX_RUNTIME_CLASS: {{ .Values.execution.gkeSandbox.runtimeClassName | quote }} + {{- if .Values.execution.gkeSandbox.nodeSelector }} + GKE_SANDBOX_NODE_SELECTOR: {{ .Values.execution.gkeSandbox.nodeSelector | toJson | quote }} + {{- end }} + {{- if .Values.execution.gkeSandbox.customTolerations }} + GKE_SANDBOX_CUSTOM_TOLERATIONS: {{ .Values.execution.gkeSandbox.customTolerations | toJson | quote }} + {{- end }} + # Pod Lifecycle POD_TTL_MINUTES: {{ .Values.execution.podTtlMinutes | quote }} POD_CLEANUP_INTERVAL_MINUTES: {{ .Values.execution.podCleanupIntervalMinutes | quote }} @@ -302,10 +319,41 @@ data: WAN_NETWORK_NAME: {{ .Values.network.wan.networkName | quote }} WAN_DNS_SERVERS: {{ .Values.network.wan.dnsServers | toJson | quote }} - # Redis Advanced Configuration + # Redis Configuration + REDIS_MODE: {{ .Values.redis.mode | quote }} + {{- if 
.Values.redis.host }} + REDIS_HOST: {{ .Values.redis.host | quote }} + {{- end }} + REDIS_PORT: {{ .Values.redis.port | quote }} + REDIS_DB: {{ .Values.redis.db | quote }} REDIS_MAX_CONNECTIONS: {{ .Values.redis.maxConnections | quote }} REDIS_SOCKET_TIMEOUT: {{ .Values.redis.socketTimeout | quote }} REDIS_SOCKET_CONNECT_TIMEOUT: {{ .Values.redis.socketConnectTimeout | quote }} + {{- if .Values.redis.keyPrefix }} + REDIS_KEY_PREFIX: {{ .Values.redis.keyPrefix | quote }} + {{- end }} + {{- if .Values.redis.clusterNodes }} + REDIS_CLUSTER_NODES: {{ .Values.redis.clusterNodes | quote }} + {{- end }} + {{- if .Values.redis.sentinelNodes }} + REDIS_SENTINEL_NODES: {{ .Values.redis.sentinelNodes | quote }} + {{- end }} + REDIS_SENTINEL_MASTER: {{ .Values.redis.sentinelMaster | quote }} + {{- if .Values.redis.sentinelPassword }} + REDIS_SENTINEL_PASSWORD: {{ .Values.redis.sentinelPassword | quote }} + {{- end }} + REDIS_TLS_ENABLED: {{ .Values.redis.tls.enabled | quote }} + {{- if .Values.redis.tls.caCertFile }} + REDIS_TLS_CA_CERT_FILE: {{ .Values.redis.tls.caCertFile | quote }} + {{- end }} + {{- if .Values.redis.tls.certFile }} + REDIS_TLS_CERT_FILE: {{ .Values.redis.tls.certFile | quote }} + {{- end }} + {{- if .Values.redis.tls.keyFile }} + REDIS_TLS_KEY_FILE: {{ .Values.redis.tls.keyFile | quote }} + {{- end }} + REDIS_TLS_INSECURE: {{ .Values.redis.tls.insecure | quote }} + REDIS_TLS_CHECK_HOSTNAME: {{ .Values.redis.tls.checkHostname | quote }} # MinIO/S3 Configuration {{- if not .Values.secretsStore.enabled }} diff --git a/helm-deployments/kubecoderun/templates/secret.yaml b/helm-deployments/kubecoderun/templates/secret.yaml index 2e22a17..8003502 100644 --- a/helm-deployments/kubecoderun/templates/secret.yaml +++ b/helm-deployments/kubecoderun/templates/secret.yaml @@ -20,8 +20,11 @@ stringData: {{- end }} {{- end }} {{- if not .Values.redis.existingSecret }} - # Redis URL + # Redis URL (standalone mode) and password (all modes) REDIS_URL: {{ include 
"kubecoderun.redisUrl" . | quote }} + {{- if .Values.redis.password }} + REDIS_PASSWORD: {{ .Values.redis.password | quote }} + {{- end }} {{- end }} {{- if and (not .Values.minio.existingSecret) (not .Values.minio.useIAM) }} # S3-Compatible Storage Credentials (Garage/MinIO/S3) diff --git a/helm-deployments/kubecoderun/values.yaml b/helm-deployments/kubecoderun/values.yaml index e06bbd6..956aa4a 100644 --- a/helm-deployments/kubecoderun/values.yaml +++ b/helm-deployments/kubecoderun/values.yaml @@ -99,6 +99,10 @@ redis: # When set, the url/host/port/password/db fields below are ignored # Expected secret key: REDIS_URL (full connection string) existingSecret: "" + + # Deployment mode: standalone (default), cluster, or sentinel + mode: "standalone" + # External Redis URL (required unless existingSecret is set) url: "redis://redis:6379/0" # Or specify individual fields @@ -111,6 +115,35 @@ redis: socketTimeout: 5 socketConnectTimeout: 5 + # Optional key prefix prepended to every Redis key. + # Useful when sharing a Redis instance across environments. + keyPrefix: "" + + # Redis Cluster mode (mode: cluster) + # Comma-separated host:port pairs for cluster startup nodes + clusterNodes: "" + + # Redis Sentinel mode (mode: sentinel) + # Comma-separated host:port pairs for sentinel instances + sentinelNodes: "" + sentinelMaster: "mymaster" + sentinelPassword: "" + + # TLS/SSL settings (all modes) + tls: + enabled: false + # Path to CA certificate inside the container + caCertFile: "" + # Client certificate and key for mutual TLS + certFile: "" + keyFile: "" + # Skip server certificate verification (NOT recommended for production) + insecure: false + # Verify server hostname against certificate CN/SAN. + # Off by default because managed Redis services and cluster mode + # expose node IPs that typically don't match certificate names. 
+ checkHostname: false + minio: # Reference an existing Kubernetes Secret containing S3 credentials # When set, the accessKey/secretKey fields below are ignored @@ -172,17 +205,36 @@ execution: # Image pull policy for execution pods (IfNotPresent, Always, Never) imagePullPolicy: "IfNotPresent" + # Image pull secrets for private registries (applies to execution pods) + # Example: + # imagePullSecrets: + # - name: secret-for-registry + # - name: another-secret + imagePullSecrets: [] + # Service account for execution pods (with pod/job create permissions) serviceAccount: create: true name: "kubecoderun-executor" annotations: {} + # Execution mode: "agent" (default) or "nsenter" (legacy) + # - agent: Executor agent runs inside main container. No nsenter, no capabilities, + # no privilege escalation. Compatible with GKE Sandbox (gVisor). Requires the + # sidecar-agent image (default build target). + # - nsenter: Sidecar uses nsenter to enter the main container's namespace. Requires + # the sidecar-nsenter image, shareProcessNamespace, SYS_PTRACE/SYS_ADMIN/SYS_CHROOT + # capabilities, and allowPrivilegeEscalation: true. 
+ executionMode: "agent" + + # Port for the executor HTTP server inside the main container + executorPort: 9090 + # Sidecar container configuration - # CRITICAL: User code runs in sidecar's cgroup via nsenter (Issue #32) - # These resource limits apply to user code execution, not the main container + # In nsenter mode: user code runs in sidecar's cgroup via nsenter + # In agent mode: sidecar only proxies requests, user code runs in main container's cgroup sidecar: - repository: ghcr.io/aron-muon/kubecoderun-sidecar + repository: ghcr.io/aron-muon/kubecoderun-sidecar-agent # tag defaults to Chart.AppVersion if not specified tag: "" port: 8080 @@ -285,6 +337,40 @@ execution: enabled: true denyEgress: true + # GKE Sandbox (gVisor) Configuration + # Provides additional kernel isolation for untrusted workloads using gVisor + # See: https://docs.cloud.google.com/kubernetes-engine/docs/concepts/sandbox-pods + gkeSandbox: + # Enable GKE Sandbox for execution pods. + # WARNING: When enabled, pods require nodes with the gVisor runtime class. + # Pods will stay Pending on clusters without sandbox-enabled node pools. 
+ enabled: false + + # Runtime class name (default: gvisor for GKE) + runtimeClassName: "gvisor" + + # Node selector for sandbox-enabled nodes + # GKE automatically adds sandbox.gke.io/runtime=gvisor to sandbox nodes + # Add additional selectors here if needed (e.g., for specific node pools) + nodeSelector: {} + # Example: + # sandbox.gke.io/runtime: gvisor + # cloud.google.com/gke-nodepool: sandbox-pool + + # Custom tolerations for node pool taints + # GKE automatically adds toleration for sandbox.gke.io/runtime=gvisor + # Use this for additional custom taints (e.g., dedicated sandbox node pools) + customTolerations: [] + # Example: + # - key: pool + # operator: Equal + # value: sandbox + # effect: NoSchedule + # - key: sandbox.gke.io/runtime + # operator: Equal + # value: gvisor + # effect: NoSchedule + # Resource Limits Configuration resourceLimits: # Execution limits diff --git a/scripts/build-images.sh b/scripts/build-images.sh index 13acdb0..0a94ef7 100755 --- a/scripts/build-images.sh +++ b/scripts/build-images.sh @@ -1,29 +1,6 @@ #!/usr/bin/env bash # shellcheck disable=SC2153 # Variables are intentionally sourced from result files # Build all KubeCodeRun Docker images in parallel -# -# Usage: ./scripts/build-images.sh [OPTIONS] [IMAGE] -# -# Arguments: -# IMAGE Build a single image with full output (e.g., go, python, sidecar) -# -# Options: -# -t, --tag TAG Image tag (default: latest) -# -r, --registry REG Registry prefix (e.g., aronmuon/kubecoderun) -# -p, --push Push images after building -# --no-cache Build without cache -# --sequential Build sequentially instead of in parallel -# -h, --help Show this help message -# -# Environment: -# DHI_USERNAME Username for dhi.io registry login -# DHI_PASSWORD Password for dhi.io registry login -# -# Examples: -# ./scripts/build-images.sh # Build all images in parallel -# ./scripts/build-images.sh go # Build only the go image with full output -# ./scripts/build-images.sh --no-cache rust # Build rust image without 
cache - set -euo pipefail SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" @@ -61,13 +38,77 @@ LANGUAGE_IMAGES=( # Infrastructure images with custom contexts # sidecar: context is docker/sidecar/ (contains requirements.txt, main.py) # api: context is repo root (needs uv.lock, pyproject.toml, src/) +# +# Format: dockerfile_path:image_name:context_dir:docker_target (target is optional) +# The sidecar Dockerfile has two targets: +# sidecar-agent → kubecoderun-sidecar-agent (default, no nsenter) +# sidecar-nsenter → kubecoderun-sidecar-nsenter (legacy, with nsenter+setcap) INFRA_IMAGES=( - "sidecar/Dockerfile:sidecar:docker/sidecar" + "sidecar/Dockerfile:sidecar-agent:docker/sidecar:sidecar-agent" + "sidecar/Dockerfile:sidecar-nsenter:docker/sidecar:sidecar-nsenter" "api/Dockerfile:api:." ) usage() { - head -n 25 "$0" | tail -n 23 | sed 's/^# //' + cat <<'EOF' +Usage: ./scripts/build-images.sh [OPTIONS] [IMAGE] + +Build Docker images for the KubeCodeRun platform. + +When called without arguments, builds ALL images (language runtimes, +sidecar variants, and API) in parallel. Specify IMAGE to build a +single image with full terminal output (useful for debugging). 
+
+Arguments:
+  IMAGE               Name of a single image to build (see --list)
+
+Options:
+  -t, --tag TAG       Image tag (default: latest)
+  -r, --registry REG  Registry prefix (e.g., ghcr.io/org/kubecoderun)
+  -p, --push          Push images to the registry after building
+  --no-cache          Build without Docker layer cache
+  --sequential        Build images one at a time instead of in parallel
+  -l, --list          List all available image names and exit
+  -h, -?, --help      Show this help message and exit
+
+Environment Variables:
+  DHI_USERNAME        Username for dhi.io registry authentication
+  DHI_PASSWORD        Password for dhi.io registry authentication
+
+Examples:
+  # Build all images in parallel (default)
+  ./scripts/build-images.sh
+
+  # Build only the Go language image with full output
+  ./scripts/build-images.sh go
+
+  # Build the agent-mode sidecar without cache
+  ./scripts/build-images.sh --no-cache sidecar-agent
+
+  # Build and push all images to a private registry
+  ./scripts/build-images.sh -r ghcr.io/myorg/kubecoderun -t v2.0.0 --push
+
+  # Build the nsenter-mode sidecar
+  ./scripts/build-images.sh sidecar-nsenter
+
+  # List all available image names
+  ./scripts/build-images.sh --list
+EOF
+}
+
+list_images() {
+    local all_images=("${LANGUAGE_IMAGES[@]}" "${INFRA_IMAGES[@]}")
+    echo "Available images:"
+    echo ""
+    printf "  %-20s %-35s %-15s\n" "NAME" "DOCKERFILE" "TARGET"
+    printf "  %-20s %-35s %-15s\n" "────────────────────" "───────────────────────────────────" "───────────────"
+    for entry in "${all_images[@]}"; do
+        IFS=':' read -r dockerfile image_name context_dir docker_target <<< "$entry"
+        printf "  %-20s %-35s %-15s\n" "$image_name" "$dockerfile" "${docker_target:--}"
+    done
+    echo ""
+    echo "Build a single image: ./scripts/build-images.sh IMAGE"
+    echo "Build all images: ./scripts/build-images.sh"
+}

 dhi_login() {
@@ -112,13 +153,18 @@ parse_args() {
                SEQUENTIAL=true
                shift
                ;;
-            -h|--help)
+            -h|-\?|--help)
                usage
                exit 0
                ;;
+            -l|--list)
+                list_images
+                exit 0
+                ;;
            -*)
-                echo "Unknown option: $1"
-                usage
+                echo 
"Error: Unknown option '$1'" + echo "" + echo "Run './scripts/build-images.sh --help' for usage information." exit 1 ;; *) @@ -169,6 +215,7 @@ build_image() { local image_name="$2" local result_file="$3" local context_dir="$4" + local docker_target="$5" local full_name full_name=$(get_full_image_name "$image_name") @@ -186,9 +233,17 @@ build_image() { local build_output local exit_code=0 + # Build with optional --target + local target_flag="" + if [[ -n "$docker_target" ]]; then + target_flag="--target $docker_target" + fi + # shellcheck disable=SC2086 build_output=$(docker build \ $NO_CACHE \ + $target_flag \ + --build-arg VERSION="$TAG" \ -t "$full_name" \ -f "$DOCKER_DIR/$dockerfile" \ "$context_path" 2>&1) || exit_code=$? @@ -228,9 +283,10 @@ build_image_wrapper() { local dockerfile="$1" local image_name="$2" local context_dir="$3" + local docker_target="$4" local result_file="$RESULTS_DIR/${image_name}.result" - if build_image "$dockerfile" "$image_name" "$result_file" "$context_dir"; then + if build_image "$dockerfile" "$image_name" "$result_file" "$context_dir" "$docker_target"; then echo "Completed: $image_name" else echo "Failed: $image_name" @@ -244,7 +300,7 @@ build_single_image() { local found=false for entry in "${all_images[@]}"; do - IFS=':' read -r dockerfile image_name context_dir <<< "$entry" + IFS=':' read -r dockerfile image_name context_dir docker_target <<< "$entry" if [[ "$image_name" == "$target_image" ]]; then found=true @@ -268,12 +324,22 @@ build_single_image() { echo "Building $image_name -> $full_name" echo " Dockerfile: $DOCKER_DIR/$dockerfile" echo " Context: $context_path" + if [[ -n "$docker_target" ]]; then + echo " Target: $docker_target" + fi echo "" - # Build with output directly to terminal + # Build with output directly to terminal (optional --target) + local target_flag="" + if [[ -n "$docker_target" ]]; then + target_flag="--target $docker_target" + fi + # shellcheck disable=SC2086 docker build \ $NO_CACHE \ + $target_flag \ 
+ --build-arg VERSION="$TAG" \ -t "$full_name" \ -f "$DOCKER_DIR/$dockerfile" \ "$context_path" @@ -297,7 +363,7 @@ build_single_image() { echo "" echo "Available images:" for entry in "${all_images[@]}"; do - IFS=':' read -r _ image_name _ <<< "$entry" + IFS=':' read -r _ image_name _ _ <<< "$entry" echo " - $image_name" done exit 1 @@ -342,8 +408,8 @@ main() { echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━" for entry in "${all_images[@]}"; do - # Parse entry: dockerfile:image_name:context_dir - IFS=':' read -r dockerfile image_name context_dir <<< "$entry" + # Parse entry: dockerfile:image_name:context_dir:docker_target (target is optional) + IFS=':' read -r dockerfile image_name context_dir docker_target <<< "$entry" if [[ ! -f "$DOCKER_DIR/$dockerfile" ]]; then echo "Warning: Dockerfile not found: $dockerfile" @@ -352,9 +418,9 @@ main() { echo "Starting: $image_name" if [[ "$SEQUENTIAL" == true ]]; then - build_image_wrapper "$dockerfile" "$image_name" "$context_dir" + build_image_wrapper "$dockerfile" "$image_name" "$context_dir" "$docker_target" else - build_image_wrapper "$dockerfile" "$image_name" "$context_dir" & + build_image_wrapper "$dockerfile" "$image_name" "$context_dir" "$docker_target" & pids+=($!) 
fi done @@ -386,7 +452,7 @@ main() { printf "%-15s %-10s %-12s %-8s\n" "─────────────" "────────" "──────────" "──────" for entry in "${all_images[@]}"; do - IFS=':' read -r _ image_name _ <<< "$entry" + IFS=':' read -r _ image_name _ _ <<< "$entry" result_file="$RESULTS_DIR/${image_name}.result" if [[ -f "$result_file" ]]; then diff --git a/src/config/__init__.py b/src/config/__init__.py index aee6bfa..9926ff4 100644 --- a/src/config/__init__.py +++ b/src/config/__init__.py @@ -88,15 +88,71 @@ class Settings(BaseSettings): rate_limit_enabled: bool = Field(default=True, description="Enable per-key rate limiting for Redis-managed keys") # Redis Configuration + redis_mode: Literal["standalone", "cluster", "sentinel"] = Field( + default="standalone", + description="Redis deployment mode: standalone, cluster, or sentinel", + ) redis_host: str = Field(default="localhost") redis_port: int = Field(default=6379, ge=1, le=65535) - redis_password: str | None = Field(default=None) + redis_password: str | None = Field(default=None, description="Redis password (empty string treated as no password)") redis_db: int = Field(default=0, ge=0, le=15) redis_url: str | None = Field(default=None) redis_max_connections: int = Field(default=20, ge=1) redis_socket_timeout: int = Field(default=5, ge=1) redis_socket_connect_timeout: int = Field(default=5, ge=1) + # Redis Cluster + redis_cluster_nodes: str | None = Field( + default=None, + description="Comma-separated host:port pairs for Redis Cluster startup nodes", + ) + + # Redis Sentinel + redis_sentinel_nodes: str | None = Field( + default=None, + description="Comma-separated host:port pairs for Sentinel instances", + ) + redis_sentinel_master: str = Field( + default="mymaster", + description="Name of the Sentinel-monitored master", + ) + redis_sentinel_password: str | None = Field( + default=None, + description="Password for authenticating to Sentinel instances", + ) + + # Redis Key Prefix + redis_key_prefix: str = Field( + default="", 
+ description="Optional prefix prepended to every Redis key (e.g. 'prod:', 'kubecoderun:')", + ) + + # Redis TLS/SSL + redis_tls_enabled: bool = Field( + default=False, + description="Enable TLS/SSL for Redis connections", + ) + redis_tls_cert_file: str | None = Field( + default=None, + description="Path to client TLS certificate (mutual TLS)", + ) + redis_tls_key_file: str | None = Field( + default=None, + description="Path to client TLS private key (mutual TLS)", + ) + redis_tls_ca_cert_file: str | None = Field( + default=None, + description="Path to CA certificate for verifying the server", + ) + redis_tls_insecure: bool = Field( + default=False, + description="Skip TLS certificate verification (NOT recommended for production)", + ) + redis_tls_check_hostname: bool = Field( + default=False, + description="Enable TLS hostname verification (off by default for managed Redis / cluster)", + ) + # MinIO/S3 Configuration minio_endpoint: str = Field(default="localhost:9000") minio_access_key: str | None = Field(default=None) @@ -119,7 +175,7 @@ class Settings(BaseSettings): description="Service account for execution pods", ) k8s_sidecar_image: str = Field( - default="aronmuon/kubecoderun-sidecar:latest", + default="aronmuon/kubecoderun-sidecar-agent:latest", description="Sidecar container image for pod communication", ) k8s_sidecar_port: int = Field(default=8080, ge=1, le=65535, description="Sidecar HTTP API port") @@ -132,6 +188,16 @@ class Settings(BaseSettings): k8s_cpu_request: str = Field(default="100m", description="CPU request for execution pods") k8s_memory_request: str = Field(default="128Mi", description="Memory request for execution pods") k8s_run_as_user: int = Field(default=65532, ge=1, description="UID to run containers as") + k8s_execution_mode: Literal["agent", "nsenter"] = Field( + default="agent", + description="Execution mode: 'agent' (no nsenter/capabilities, gVisor-safe) or 'nsenter' (legacy)", + ) + k8s_executor_port: int = Field( + default=9090, + 
ge=1, + le=65535, + description="Port for the executor HTTP server inside the main container", + ) k8s_seccomp_profile_type: Literal["RuntimeDefault", "Unconfined"] = Field( default="RuntimeDefault", description="Seccomp profile type for execution pods", @@ -157,6 +223,28 @@ class Settings(BaseSettings): default="Always", description="Image pull policy for execution pods (Always, IfNotPresent, Never)", ) + k8s_image_pull_secrets: str = Field( + default="", + description="Comma-separated list of secret names for pulling images from private registries", + ) + + # GKE Sandbox (gVisor) Configuration + gke_sandbox_enabled: bool = Field( + default=False, + description="Enable GKE Sandbox (gVisor) for additional kernel isolation", + ) + gke_sandbox_runtime_class: str = Field( + default="gvisor", + description="Runtime class name for sandboxed pods", + ) + gke_sandbox_node_selector: str | None = Field( + default=None, + description="JSON string of node selector for sandbox-enabled nodes", + ) + gke_sandbox_custom_tolerations: str | None = Field( + default=None, + description="JSON string of custom tolerations for node pool taints", + ) # Resource Limits - Execution max_execution_time: int = Field(default=30, ge=1, le=600) @@ -407,6 +495,12 @@ def _set_supported_languages(cls, data): } return data + # Service Version Override (set at deploy time to override build-time version) + service_version: str | None = Field( + default=None, + description="Runtime version override (e.g. '2.1.4'). 
Falls back to build-time version from _version.py.", + ) + # Logging Configuration log_level: str = Field(default="INFO") log_format: str = Field(default="json") @@ -435,6 +529,38 @@ def parse_api_keys(cls, v): """Parse comma-separated API keys into a list.""" return [key.strip() for key in v.split(",") if key.strip()] if v else None + @field_validator("redis_host", mode="before") + @classmethod + def sanitize_redis_host(cls, v): + """Strip accidental URL scheme from Redis host.""" + if isinstance(v, str): + for scheme in ("rediss://", "redis://"): + if v.lower().startswith(scheme): + v = v[len(scheme) :].rstrip("/") + break + return v + + @field_validator("redis_password", "redis_sentinel_password", mode="before") + @classmethod + def sanitize_redis_password(cls, v): + """Convert empty password strings to None. + + Kubernetes / Helm often set REDIS_PASSWORD="" which pydantic reads + as empty string. Passing an empty password to redis-py sends + AUTH "" which fails when the server has no auth configured. 
+ """ + if isinstance(v, str) and v.strip() == "": + return None + return v + + @field_validator("redis_cluster_nodes", "redis_sentinel_nodes", mode="before") + @classmethod + def sanitize_redis_nodes(cls, v): + """Convert empty node lists to None so code falls back to host:port.""" + if isinstance(v, str) and v.strip() == "": + return None + return v + @field_validator("minio_endpoint") @classmethod def validate_minio_endpoint(cls, v): @@ -470,6 +596,7 @@ def api(self) -> APIConfig: def redis(self) -> RedisConfig: """Access Redis configuration group.""" return RedisConfig( + redis_mode=self.redis_mode, redis_host=self.redis_host, redis_port=self.redis_port, redis_password=self.redis_password, @@ -478,6 +605,17 @@ def redis(self) -> RedisConfig: redis_max_connections=self.redis_max_connections, redis_socket_timeout=self.redis_socket_timeout, redis_socket_connect_timeout=self.redis_socket_connect_timeout, + redis_cluster_nodes=self.redis_cluster_nodes, + redis_sentinel_nodes=self.redis_sentinel_nodes, + redis_sentinel_master=self.redis_sentinel_master, + redis_sentinel_password=self.redis_sentinel_password, + redis_key_prefix=self.redis_key_prefix, + redis_tls_enabled=self.redis_tls_enabled, + redis_tls_cert_file=self.redis_tls_cert_file, + redis_tls_key_file=self.redis_tls_key_file, + redis_tls_ca_cert_file=self.redis_tls_ca_cert_file, + redis_tls_insecure=self.redis_tls_insecure, + redis_tls_check_hostname=self.redis_tls_check_hostname, ) @property @@ -548,6 +686,33 @@ def logging(self) -> LoggingConfig: @property def kubernetes(self) -> KubernetesConfig: """Access Kubernetes configuration group.""" + import json + + # Parse JSON strings for node selector and tolerations + sandbox_node_selector = None + if self.gke_sandbox_node_selector: + try: + sandbox_node_selector = json.loads(self.gke_sandbox_node_selector) + except json.JSONDecodeError: + import logging + + logging.getLogger(__name__).warning( + "Invalid JSON in GKE_SANDBOX_NODE_SELECTOR, ignoring: %s", + 
self.gke_sandbox_node_selector, + ) + + custom_tolerations = None + if self.gke_sandbox_custom_tolerations: + try: + custom_tolerations = json.loads(self.gke_sandbox_custom_tolerations) + except json.JSONDecodeError: + import logging + + logging.getLogger(__name__).warning( + "Invalid JSON in GKE_SANDBOX_CUSTOM_TOLERATIONS, ignoring: %s", + self.gke_sandbox_custom_tolerations, + ) + return KubernetesConfig( namespace=self.k8s_namespace, service_account=self.k8s_service_account, @@ -562,11 +727,19 @@ def kubernetes(self) -> KubernetesConfig: cpu_request=self.k8s_cpu_request, memory_request=self.k8s_memory_request, run_as_user=self.k8s_run_as_user, + execution_mode=self.k8s_execution_mode, + executor_port=self.k8s_executor_port, seccomp_profile_type=self.k8s_seccomp_profile_type, job_ttl_seconds_after_finished=self.k8s_job_ttl_seconds, job_active_deadline_seconds=self.k8s_job_deadline_seconds, image_registry=self.k8s_image_registry, image_tag=self.k8s_image_tag, + image_pull_policy=self.k8s_image_pull_policy, + image_pull_secrets=self.k8s_image_pull_secrets, + gke_sandbox_enabled=self.gke_sandbox_enabled, + runtime_class_name=self.gke_sandbox_runtime_class, + sandbox_node_selector=sandbox_node_selector, + custom_tolerations=custom_tolerations, ) def get_pool_configs(self): @@ -574,10 +747,26 @@ def get_pool_configs(self): Returns list of PoolConfig for all configured languages. 
""" + import json import os from ..services.kubernetes.models import PoolConfig + # Parse GKE Sandbox configuration once + sandbox_node_selector = None + if self.gke_sandbox_node_selector: + try: + sandbox_node_selector = json.loads(self.gke_sandbox_node_selector) + except json.JSONDecodeError: + pass + + custom_tolerations = None + if self.gke_sandbox_custom_tolerations: + try: + custom_tolerations = json.loads(self.gke_sandbox_custom_tolerations) + except json.JSONDecodeError: + pass + configs = [] languages = ["py", "js", "ts", "go", "java", "c", "cpp", "php", "rs", "r", "f90", "d"] @@ -610,6 +799,11 @@ def get_pool_configs(self): sidecar_cpu_request = os.getenv(f"LANG_CPU_REQUEST_{lang_upper}") or self.k8s_sidecar_cpu_request sidecar_memory_request = os.getenv(f"LANG_MEMORY_REQUEST_{lang_upper}") or self.k8s_sidecar_memory_request + # Parse image pull secrets (comma-separated string -> list) + pull_secrets = None + if self.k8s_image_pull_secrets: + pull_secrets = [s.strip() for s in self.k8s_image_pull_secrets.split(",") if s.strip()] + configs.append( PoolConfig( language=lang, @@ -623,8 +817,15 @@ def get_pool_configs(self): sidecar_cpu_request=sidecar_cpu_request, sidecar_memory_request=sidecar_memory_request, image_pull_policy=self.k8s_image_pull_policy, + image_pull_secrets=pull_secrets, + execution_mode=self.k8s_execution_mode, + executor_port=self.k8s_executor_port, seccomp_profile_type=self.k8s_seccomp_profile_type, network_isolated=self.enable_network_isolation, + gke_sandbox_enabled=self.gke_sandbox_enabled, + runtime_class_name=self.gke_sandbox_runtime_class, + sandbox_node_selector=sandbox_node_selector, + custom_tolerations=custom_tolerations, ) ) @@ -643,11 +844,15 @@ def validate_ssl_files(self) -> bool: return Path(self.ssl_cert_file).exists() and Path(self.ssl_key_file).exists() def get_redis_url(self) -> str: - """Get Redis connection URL.""" + """Get Redis connection URL. + + Automatically uses ``rediss://`` when TLS is enabled. 
+ """ if self.redis_url: return self.redis_url + scheme = "rediss" if self.redis_tls_enabled else "redis" password_part = f":{self.redis_password}@" if self.redis_password else "" - return f"redis://{password_part}{self.redis_host}:{self.redis_port}/{self.redis_db}" + return f"{scheme}://{password_part}{self.redis_host}:{self.redis_port}/{self.redis_db}" def get_valid_api_keys(self) -> list[str]: """Get all valid API keys including the primary key.""" diff --git a/src/config/kubernetes.py b/src/config/kubernetes.py index fe99a64..64d07f1 100644 --- a/src/config/kubernetes.py +++ b/src/config/kubernetes.py @@ -18,7 +18,7 @@ class KubernetesConfig: service_account: str = "kubecoderun-executor" # Sidecar configuration - sidecar_image: str = "aronmuon/kubecoderun-sidecar:latest" + sidecar_image: str = "aronmuon/kubecoderun-sidecar-agent:latest" sidecar_port: int = 8080 # Resource limits for execution pods @@ -27,7 +27,9 @@ class KubernetesConfig: cpu_request: str = "100m" memory_request: str = "128Mi" - # Sidecar resource limits (CRITICAL: user code inherits these via nsenter) + # Sidecar resource limits + # In nsenter mode: user code runs in sidecar's cgroup via nsenter + # In agent mode: user code runs in main container's cgroup sidecar_cpu_limit: str = "500m" sidecar_memory_limit: str = "512Mi" sidecar_cpu_request: str = "100m" @@ -39,6 +41,14 @@ class KubernetesConfig: run_as_non_root: bool = True seccomp_profile_type: str = "RuntimeDefault" + # Execution mode: "agent" (default, no nsenter/capabilities needed) or "nsenter" (legacy) + # agent: Executor agent runs in main container, no privilege escalation or capabilities needed + # nsenter: Sidecar uses nsenter to enter main container namespace (requires capabilities) + execution_mode: str = "agent" + + # Executor port (main container listens on this port for execution requests) + executor_port: int = 9090 + # Job settings (for languages with pool_size=0) job_ttl_seconds_after_finished: int = 60 
job_active_deadline_seconds: int = 300 @@ -52,6 +62,27 @@ class KubernetesConfig: # e.g., aronmuon/kubecoderun-python:latest image_registry: str = "aronmuon/kubecoderun" image_tag: str = "latest" + image_pull_policy: str = "Always" + + # Image pull secrets for private registries + # Format: comma-separated list of secret names, e.g., "secret-for-registry,another-secret" + image_pull_secrets: str = "" + + # GKE Sandbox (gVisor) configuration + # When enabled, pods run with additional kernel isolation via gVisor + gke_sandbox_enabled: bool = False + + # Runtime class name for sandboxed pods (default: gvisor for GKE) + runtime_class_name: str = "gvisor" + + # Node selector for sandbox nodes + # GKE automatically adds: sandbox.gke.io/runtime=gvisor + sandbox_node_selector: dict[str, str] | None = None + + # Custom tolerations for execution pods + # GKE Sandbox automatically adds toleration for sandbox.gke.io/runtime=gvisor + # Use this for additional custom node pool taints (e.g., pool=sandbox) + custom_tolerations: list[dict[str, str]] | None = None def get_image_for_language(self, language: str) -> str: """Get the container image for a language. diff --git a/src/config/redis.py b/src/config/redis.py index 153f11e..f668528 100644 --- a/src/config/redis.py +++ b/src/config/redis.py @@ -1,11 +1,25 @@ -"""Redis configuration.""" +"""Redis configuration. -from pydantic import Field +Supports three deployment modes: +- **standalone** (default): Single Redis instance. +- **cluster**: Redis Cluster with automatic slot routing. +- **sentinel**: Redis Sentinel for high-availability failover. + +TLS/SSL is supported in all modes and is required for most managed Redis +services such as GCP Memorystore, AWS ElastiCache, and Azure Cache for Redis. 
+""" + +from typing import Literal + +from pydantic import Field, field_validator from pydantic_settings import BaseSettings, SettingsConfigDict class RedisConfig(BaseSettings): - """Redis connection settings.""" + """Redis connection settings. + + Supports standalone, cluster, and sentinel modes with optional TLS. + """ model_config = SettingsConfigDict( env_prefix="", @@ -13,6 +27,14 @@ class RedisConfig(BaseSettings): populate_by_name=True, ) + # -- Connection mode ------------------------------------------------------- + mode: Literal["standalone", "cluster", "sentinel"] = Field( + default="standalone", + alias="redis_mode", + description="Redis deployment mode: standalone, cluster, or sentinel", + ) + + # -- Basic connection (standalone / single-entry for cluster & sentinel) --- host: str = Field(default="localhost", alias="redis_host") port: int = Field(default=6379, ge=1, le=65535, alias="redis_port") password: str | None = Field(default=None, alias="redis_password") @@ -22,9 +44,193 @@ class RedisConfig(BaseSettings): socket_timeout: int = Field(default=5, ge=1, alias="redis_socket_timeout") socket_connect_timeout: int = Field(default=5, ge=1, alias="redis_socket_connect_timeout") + # -- Cluster mode ---------------------------------------------------------- + cluster_nodes: str | None = Field( + default=None, + alias="redis_cluster_nodes", + description=( + "Comma-separated list of host:port pairs for Redis Cluster startup nodes. " + "Example: 'node1:6379,node2:6379,node3:6379'" + ), + ) + + # -- Sentinel mode --------------------------------------------------------- + sentinel_nodes: str | None = Field( + default=None, + alias="redis_sentinel_nodes", + description=( + "Comma-separated list of host:port pairs for Sentinel instances. 
" + "Example: 'sentinel1:26379,sentinel2:26379,sentinel3:26379'" + ), + ) + sentinel_master: str = Field( + default="mymaster", + alias="redis_sentinel_master", + description="Name of the Sentinel-monitored master.", + ) + sentinel_password: str | None = Field( + default=None, + alias="redis_sentinel_password", + description="Password for authenticating to Sentinel instances (if different from Redis password).", + ) + + # -- Key prefix ------------------------------------------------------------ + key_prefix: str = Field( + default="", + alias="redis_key_prefix", + description=( + "Optional prefix prepended to every Redis key. " + "Useful for sharing a single Redis instance across multiple environments " + "or applications (e.g. 'prod:', 'staging:', 'kubecoderun:'). " + "Must end with a separator like ':' if you want one." + ), + ) + + # -- TLS / SSL ------------------------------------------------------------- + tls_enabled: bool = Field( + default=False, + alias="redis_tls_enabled", + description="Enable TLS/SSL for Redis connections.", + ) + tls_cert_file: str | None = Field( + default=None, + alias="redis_tls_cert_file", + description="Path to client TLS certificate file (mutual TLS).", + ) + tls_key_file: str | None = Field( + default=None, + alias="redis_tls_key_file", + description="Path to client TLS private key file (mutual TLS).", + ) + tls_ca_cert_file: str | None = Field( + default=None, + alias="redis_tls_ca_cert_file", + description="Path to CA certificate file for verifying the server.", + ) + tls_insecure: bool = Field( + default=False, + alias="redis_tls_insecure", + description="Skip TLS certificate verification (NOT recommended for production).", + ) + tls_check_hostname: bool = Field( + default=False, + alias="redis_tls_check_hostname", + description=( + "Enable TLS hostname verification. 
Disabled by default because " + "managed Redis services (GCP Memorystore, AWS ElastiCache) and " + "Redis Cluster mode expose node IPs that typically do not match " + "the certificate CN/SAN entries. The certificate chain is still " + "verified against the CA when tls_insecure is False." + ), + ) + + # -- Validators ------------------------------------------------------------ + + @field_validator("host", mode="before") + @classmethod + def _sanitize_host(cls, v: str) -> str: + """Strip an accidental URL scheme from the host value. + + Users sometimes set ``REDIS_HOST=rediss://hostname`` instead of just + ``REDIS_HOST=hostname``. This validator normalises the value so that + downstream code always receives a plain hostname or IP. + """ + if isinstance(v, str): + for scheme in ("rediss://", "redis://"): + if v.lower().startswith(scheme): + v = v[len(scheme) :] + # Drop any trailing slash left over + v = v.rstrip("/") + break + return v + + @field_validator("password", "sentinel_password", mode="before") + @classmethod + def _empty_string_to_none(cls, v: str | None) -> str | None: + """Convert empty strings to ``None``. + + Kubernetes ConfigMaps and Helm values often set ``REDIS_PASSWORD: ""`` + which pydantic-settings reads as ``""`` rather than ``None``. Passing + an empty password to redis-py causes it to send ``AUTH ""`` which + fails when the server has no authentication configured. + """ + if isinstance(v, str) and v.strip() == "": + return None + return v + + @field_validator("cluster_nodes", "sentinel_nodes", mode="before") + @classmethod + def _empty_nodes_to_none(cls, v: str | None) -> str | None: + """Convert empty/whitespace-only node lists to ``None``. + + Helm values default to ``clusterNodes: ""`` which renders in the + ConfigMap as an empty string. This validator treats it the same + as "not set" so the code falls back to ``host:port``. 
+ """ + if isinstance(v, str) and v.strip() == "": + return None + return v + + # -- Helpers --------------------------------------------------------------- + def get_url(self) -> str: - """Get Redis connection URL.""" + """Get Redis connection URL (standalone mode only). + + For cluster/sentinel modes the URL is not used; startup nodes are + provided separately. This method honours an explicit ``url`` and + automatically switches between the ``redis://`` and ``rediss://`` + scheme based on the ``tls_enabled`` flag. + """ if self.url: return self.url + scheme = "rediss" if self.tls_enabled else "redis" password_part = f":{self.password}@" if self.password else "" - return f"redis://{password_part}{self.host}:{self.port}/{self.db}" + return f"{scheme}://{password_part}{self.host}:{self.port}/{self.db}" + + def get_tls_kwargs(self) -> dict: + """Build keyword arguments for redis-py SSL/TLS configuration. + + Returns an empty dict when TLS is disabled so callers can safely + unpack the result: ``redis.Redis(**config.get_tls_kwargs())``. + """ + if not self.tls_enabled: + return {} + + import ssl + + kwargs: dict = {"ssl": True} + + if self.tls_insecure: + kwargs["ssl_cert_reqs"] = ssl.CERT_NONE + kwargs["ssl_check_hostname"] = False + else: + kwargs["ssl_cert_reqs"] = ssl.CERT_REQUIRED + # Hostname checking is off by default because managed Redis + # services (GCP Memorystore, AWS ElastiCache) and Redis + # Cluster node discovery return IPs that do not match the + # certificate CN/SAN. The certificate chain is still fully + # validated against the CA. 
+ kwargs["ssl_check_hostname"] = self.tls_check_hostname + + if self.tls_ca_cert_file: + kwargs["ssl_ca_certs"] = self.tls_ca_cert_file + if self.tls_cert_file: + kwargs["ssl_certfile"] = self.tls_cert_file + if self.tls_key_file: + kwargs["ssl_keyfile"] = self.tls_key_file + + return kwargs + + def parse_nodes(self, raw: str) -> list[tuple[str, int]]: + """Parse a comma-separated ``host:port`` string into a list of tuples.""" + nodes: list[tuple[str, int]] = [] + for entry in raw.split(","): + entry = entry.strip() + if not entry: + continue + if ":" in entry: + h, p = entry.rsplit(":", 1) + nodes.append((h.strip(), int(p.strip()))) + else: + nodes.append((entry, self.port)) + return nodes diff --git a/src/core/pool.py b/src/core/pool.py index 21e9baa..26629cc 100644 --- a/src/core/pool.py +++ b/src/core/pool.py @@ -2,15 +2,33 @@ This module provides centralized connection pools for external services, allowing efficient resource sharing across the application. + +Supported Redis deployment modes: +- **standalone** (default): Single Redis server with ``ConnectionPool``. +- **cluster**: Redis Cluster via ``RedisCluster``. +- **sentinel**: Redis Sentinel via ``Sentinel`` for HA failover. + +All modes support optional TLS/SSL for managed services such as +GCP Memorystore, AWS ElastiCache, and Azure Cache for Redis. """ -from typing import Optional +from __future__ import annotations + +from typing import TYPE_CHECKING import redis.asyncio as redis import structlog +from redis.asyncio.cluster import RedisCluster +from redis.asyncio.sentinel import Sentinel +from redis.backoff import ExponentialBackoff +from redis.exceptions import ConnectionError, TimeoutError +from redis.retry import Retry from ..config import settings +if TYPE_CHECKING: + from ..config.redis import RedisConfig + logger = structlog.get_logger(__name__) @@ -18,76 +36,209 @@ class RedisPool: """Centralized async Redis connection pool. 
Provides a shared connection pool for all services that need Redis, - avoiding the overhead of multiple separate pools. + avoiding the overhead of multiple separate pools. Supports standalone, + cluster, and sentinel modes with optional TLS. Usage: client = redis_pool.get_client() await client.set("key", "value") """ - def __init__(self): + def __init__(self) -> None: self._pool: redis.ConnectionPool | None = None - self._client: redis.Redis | None = None - self._initialized = False + self._client: redis.Redis | RedisCluster | None = None + self._sentinel: Sentinel | None = None + self._initialized: bool = False + self._mode: str = "standalone" + self._key_prefix: str = "" def _initialize(self) -> None: - """Initialize the connection pool lazily.""" + """Initialize the connection pool lazily based on the configured mode.""" if self._initialized: return try: - redis_url = settings.get_redis_url() - self._pool = redis.ConnectionPool.from_url( - redis_url, - max_connections=20, # Shared across all services - decode_responses=True, - socket_timeout=5.0, - socket_connect_timeout=5.0, - retry_on_timeout=True, - ) - self._client = redis.Redis(connection_pool=self._pool) + redis_cfg = settings.redis + self._mode = redis_cfg.mode + self._key_prefix = redis_cfg.key_prefix + tls_kwargs = redis_cfg.get_tls_kwargs() + max_conns = redis_cfg.max_connections + socket_timeout = float(redis_cfg.socket_timeout) + socket_connect_timeout = float(redis_cfg.socket_connect_timeout) + + if self._mode == "cluster": + self._init_cluster(redis_cfg, tls_kwargs, max_conns, socket_timeout, socket_connect_timeout) + elif self._mode == "sentinel": + self._init_sentinel(redis_cfg, tls_kwargs, max_conns, socket_timeout, socket_connect_timeout) + else: + self._init_standalone(redis_cfg, tls_kwargs, max_conns, socket_timeout, socket_connect_timeout) + self._initialized = True - logger.info( - "Redis connection pool initialized", - max_connections=20, - url=redis_url.split("@")[-1], # Don't log 
password - ) except Exception as e: - logger.error("Failed to initialize Redis pool", error=str(e)) - # Create a fallback client - self._client = redis.from_url("redis://localhost:6379/0", decode_responses=True) - self._initialized = True - - def get_client(self) -> redis.Redis: + logger.error( + "Failed to initialize Redis pool", + error=str(e), + mode=self._mode, + ) + raise + + # -- Mode-specific initialisers ------------------------------------------- + + def _init_standalone( + self, + cfg: RedisConfig, + tls_kwargs: dict, + max_conns: int, + socket_timeout: float, + socket_connect_timeout: float, + ) -> None: + redis_url = cfg.get_url() + self._pool = redis.ConnectionPool.from_url( + redis_url, + max_connections=max_conns, + decode_responses=True, + socket_timeout=socket_timeout, + socket_connect_timeout=socket_connect_timeout, + retry_on_timeout=True, + **tls_kwargs, + ) + self._client = redis.Redis(connection_pool=self._pool) + logger.info( + "Redis standalone connection pool initialized", + max_connections=max_conns, + tls=cfg.tls_enabled, + url=redis_url.split("@")[-1], + ) + + def _init_cluster( + self, + cfg: RedisConfig, + tls_kwargs: dict, + max_conns: int, + socket_timeout: float, + socket_connect_timeout: float, + ) -> None: + if cfg.cluster_nodes: + startup_nodes = [redis.cluster.ClusterNode(host=h, port=p) for h, p in cfg.parse_nodes(cfg.cluster_nodes)] + else: + startup_nodes = [redis.cluster.ClusterNode(host=cfg.host, port=cfg.port)] + + self._client = RedisCluster( + startup_nodes=startup_nodes, + password=cfg.password, + decode_responses=True, + max_connections=max_conns, + socket_timeout=socket_timeout, + socket_connect_timeout=socket_connect_timeout, + retry=Retry(ExponentialBackoff(), retries=3), + retry_on_error=[ConnectionError, TimeoutError], + **tls_kwargs, + ) + logger.info( + "Redis cluster connection initialized", + startup_nodes=[ + f"{h}:{p}" + for h, p in (cfg.parse_nodes(cfg.cluster_nodes) if cfg.cluster_nodes else [(cfg.host, 
cfg.port)]) + ], + tls=cfg.tls_enabled, + ) + + def _init_sentinel( + self, + cfg: RedisConfig, + tls_kwargs: dict, + max_conns: int, + socket_timeout: float, + socket_connect_timeout: float, + ) -> None: + if cfg.sentinel_nodes: + sentinel_hosts = cfg.parse_nodes(cfg.sentinel_nodes) + else: + sentinel_hosts = [(cfg.host, 26379)] + + self._sentinel = Sentinel( + sentinels=sentinel_hosts, + password=cfg.sentinel_password, + socket_timeout=socket_timeout, + socket_connect_timeout=socket_connect_timeout, + **tls_kwargs, + ) + self._client = self._sentinel.master_for( + service_name=cfg.sentinel_master, + password=cfg.password, + decode_responses=True, + socket_timeout=socket_timeout, + socket_connect_timeout=socket_connect_timeout, + max_connections=max_conns, + retry_on_timeout=True, + **tls_kwargs, + ) + logger.info( + "Redis sentinel connection initialized", + sentinel_nodes=[f"{h}:{p}" for h, p in sentinel_hosts], + master=cfg.sentinel_master, + tls=cfg.tls_enabled, + ) + + # -- Public API ----------------------------------------------------------- + + def get_client(self) -> redis.Redis | RedisCluster: """Get an async Redis client from the shared pool. Returns: - Async Redis client instance connected to the shared pool + Async Redis client instance connected to the shared pool. + For cluster mode this is a ``RedisCluster`` instance which + exposes the same command interface. """ if not self._initialized: self._initialize() assert self._client is not None, "Redis client not initialized" return self._client + @property + def key_prefix(self) -> str: + """Return the configured Redis key prefix (may be empty).""" + if not self._initialized: + self._initialize() + return self._key_prefix + + def make_key(self, key: str) -> str: + """Prepend the configured key prefix to *key*. + + Returns *key* unchanged when no prefix is configured. 
+ """ + prefix = self.key_prefix + if prefix: + return f"{prefix}{key}" + return key + @property def pool_stats(self) -> dict: """Get connection pool statistics.""" - if not self._pool: - return {"initialized": False} + if not self._pool and self._mode == "standalone": + return {"initialized": self._initialized, "mode": self._mode} + + stats: dict = {"initialized": self._initialized, "mode": self._mode} + + if self._key_prefix: + stats["key_prefix"] = self._key_prefix + + if self._pool: + stats["max_connections"] = self._pool.max_connections - return { - "initialized": True, - "max_connections": self._pool.max_connections, - } + return stats async def close(self) -> None: """Close the connection pool and release all connections.""" if self._client: await self._client.close() - logger.info("Redis connection pool closed") + logger.info("Redis connection pool closed", mode=self._mode) self._pool = None self._client = None + self._sentinel = None self._initialized = False + self._mode = "standalone" + self._key_prefix = "" # Global Redis pool instance diff --git a/src/main.py b/src/main.py index d1d46dd..3914c1b 100644 --- a/src/main.py +++ b/src/main.py @@ -33,6 +33,9 @@ from .utils.logging import setup_logging from .utils.shutdown import setup_graceful_shutdown, shutdown_handler +# Resolve effective version: runtime SERVICE_VERSION overrides build-time _version.py +effective_version: str = settings.service_version or __version__ + # Setup logging setup_logging() logger = structlog.get_logger() @@ -42,7 +45,7 @@ async def lifespan(app: FastAPI): """Application lifespan manager.""" # Startup - logger.info("Starting Code Interpreter API", version=__version__) + logger.info("Starting Code Interpreter API", version=effective_version) # Setup graceful shutdown callbacks (uvicorn handles signals) setup_graceful_shutdown() @@ -143,6 +146,39 @@ async def lifespan(app: FastAPI): # Build pool configs from settings pool_configs = settings.get_pool_configs() + # Parse image pull 
secrets (comma-separated string -> list) + pull_secrets = None + if settings.k8s_image_pull_secrets: + pull_secrets = [s.strip() for s in settings.k8s_image_pull_secrets.split(",") if s.strip()] + + # Validate execution mode / sidecar image consistency + sidecar_img = settings.k8s_sidecar_image.lower() + exec_mode = settings.k8s_execution_mode + if exec_mode == "agent" and "nsenter" in sidecar_img: + logger.warning( + "Execution mode is 'agent' but sidecar image appears to be nsenter-based. " + "Consider using a sidecar-agent image for agent mode.", + sidecar_image=settings.k8s_sidecar_image, + execution_mode=exec_mode, + ) + elif exec_mode == "nsenter" and "agent" in sidecar_img and "nsenter" not in sidecar_img: + logger.warning( + "Execution mode is 'nsenter' but sidecar image appears to be agent-based. " + "Consider using a sidecar-nsenter image for nsenter mode.", + sidecar_image=settings.k8s_sidecar_image, + execution_mode=exec_mode, + ) + + # Validate GKE Sandbox / execution mode compatibility + if settings.gke_sandbox_enabled and exec_mode == "nsenter": + logger.warning( + "GKE Sandbox (gVisor) is enabled but execution mode is 'nsenter'. " + "nsenter requires SYS_PTRACE/SYS_ADMIN/SYS_CHROOT capabilities which are " + "incompatible with gVisor. 
Switch to 'agent' execution mode for GKE Sandbox.", + execution_mode=exec_mode, + gke_sandbox_enabled=True, + ) + kubernetes_manager = KubernetesManager( namespace=settings.k8s_namespace or None, pool_configs=pool_configs, @@ -151,8 +187,16 @@ async def lifespan(app: FastAPI): default_memory_limit=settings.k8s_memory_limit, default_cpu_request=settings.k8s_cpu_request, default_memory_request=settings.k8s_memory_request, + execution_mode=settings.k8s_execution_mode, + executor_port=settings.k8s_executor_port, seccomp_profile_type=settings.k8s_seccomp_profile_type, network_isolated=settings.enable_network_isolation, + image_pull_policy=settings.k8s_image_pull_policy, + gke_sandbox_enabled=settings.gke_sandbox_enabled, + runtime_class_name=settings.gke_sandbox_runtime_class, + sandbox_node_selector=settings.kubernetes.sandbox_node_selector, + custom_tolerations=settings.kubernetes.custom_tolerations, + image_pull_secrets=pull_secrets, ) await kubernetes_manager.start() @@ -249,7 +293,7 @@ async def lifespan(app: FastAPI): app = FastAPI( title="Code Interpreter API", description="A secure API for executing code in isolated Kubernetes pods", - version=__version__, + version=effective_version, docs_url="/docs" if settings.enable_docs else None, redoc_url="/redoc" if settings.enable_docs else None, debug=settings.api_debug, @@ -287,7 +331,7 @@ async def health_check(): """Health check endpoint for liveness probe.""" return { "status": "healthy", - "version": __version__, + "version": effective_version, "config": { "debug": settings.api_debug, "docs_enabled": settings.enable_docs, diff --git a/src/services/api_key_manager.py b/src/services/api_key_manager.py index 3fb0249..463c873 100644 --- a/src/services/api_key_manager.py +++ b/src/services/api_key_manager.py @@ -31,12 +31,12 @@ class ApiKeyManagerService: """Manages API keys stored in Redis.""" - # Redis key prefixes - RECORD_PREFIX = "api_keys:records:" - VALID_CACHE_PREFIX = "api_keys:valid:" - USAGE_PREFIX = 
"api_keys:usage:" - INDEX_KEY = "api_keys:index" - ENV_KEYS_INDEX = "api_keys:env_index" # Separate index for env keys + # Base Redis key prefixes (before application-level prefix) + _RECORD_PREFIX = "api_keys:records:" + _VALID_CACHE_PREFIX = "api_keys:valid:" + _USAGE_PREFIX = "api_keys:usage:" + _INDEX_KEY = "api_keys:index" + _ENV_KEYS_INDEX = "api_keys:env_index" # Separate index for env keys # Cache TTL VALIDATION_CACHE_TTL = 300 # 5 minutes @@ -49,6 +49,14 @@ def __init__(self, redis_client: redis.Redis | None = None): """ self._redis = redis_client + # Compute prefixed keys once so every method uses the prefix + mk = redis_pool.make_key + self.RECORD_PREFIX = mk(self._RECORD_PREFIX) + self.VALID_CACHE_PREFIX = mk(self._VALID_CACHE_PREFIX) + self.USAGE_PREFIX = mk(self._USAGE_PREFIX) + self.INDEX_KEY = mk(self._INDEX_KEY) + self.ENV_KEYS_INDEX = mk(self._ENV_KEYS_INDEX) + @property def redis(self) -> redis.Redis: """Get Redis client, initializing if needed.""" @@ -130,8 +138,9 @@ async def _ensure_single_env_key_record(self, api_key: str, name: str) -> ApiKey source="environment", ) - # Store in Redis - pipe = self.redis.pipeline(transaction=True) + # Store in Redis (transaction=False for Redis Cluster compatibility + # — record key and index key hash to different slots) + pipe = self.redis.pipeline(transaction=False) pipe.hset(record_key, mapping=record.to_redis_hash()) pipe.sadd(self.ENV_KEYS_INDEX, key_hash) await pipe.execute() @@ -237,9 +246,10 @@ async def create_key( metadata=metadata or {}, ) - # Store in Redis + # Store in Redis (transaction=False for Redis Cluster compatibility + # — record key and index key hash to different slots) record_key = f"{self.RECORD_PREFIX}{key_hash}" - pipe = self.redis.pipeline(transaction=True) + pipe = self.redis.pipeline(transaction=False) pipe.hset(record_key, mapping=record.to_redis_hash()) pipe.sadd(self.INDEX_KEY, key_hash) await pipe.execute() @@ -358,8 +368,9 @@ async def revoke_key(self, key_hash: str) -> 
bool: if not exists: return False - # Delete from Redis - pipe = self.redis.pipeline(transaction=True) + # Delete from Redis (transaction=False for Redis Cluster compatibility + # — keys hash to different slots) + pipe = self.redis.pipeline(transaction=False) pipe.delete(record_key) pipe.srem(self.INDEX_KEY, key_hash) pipe.delete(f"{self.VALID_CACHE_PREFIX}{self._short_hash(key_hash)}") diff --git a/src/services/detailed_metrics.py b/src/services/detailed_metrics.py index e3cb82b..31b578e 100644 --- a/src/services/detailed_metrics.py +++ b/src/services/detailed_metrics.py @@ -31,12 +31,12 @@ class DetailedMetricsService: """Service for collecting and querying detailed execution metrics.""" - # Redis key prefixes - BUFFER_KEY = "metrics:detailed:buffer" - HOURLY_PREFIX = "metrics:detailed:hourly:" - DAILY_PREFIX = "metrics:detailed:daily:" - POOL_STATS_KEY = "metrics:pool:stats" - API_KEY_HOURLY_PREFIX = "metrics:api_key:" + # Base Redis key prefixes (before application-level prefix) + _BUFFER_KEY = "metrics:detailed:buffer" + _HOURLY_PREFIX = "metrics:detailed:hourly:" + _DAILY_PREFIX = "metrics:detailed:daily:" + _POOL_STATS_KEY = "metrics:pool:stats" + _API_KEY_HOURLY_PREFIX = "metrics:api_key:" # Buffer and retention settings MAX_BUFFER_SIZE = 10000 @@ -52,6 +52,16 @@ def __init__(self, redis_client: redis.Redis | None = None): self._redis = redis_client self._in_memory_buffer: list[DetailedExecutionMetrics] = [] + # Compute prefixed keys once + from ..core.pool import redis_pool + + mk = redis_pool.make_key + self.BUFFER_KEY = mk(self._BUFFER_KEY) + self.HOURLY_PREFIX = mk(self._HOURLY_PREFIX) + self.DAILY_PREFIX = mk(self._DAILY_PREFIX) + self.POOL_STATS_KEY = mk(self._POOL_STATS_KEY) + self.API_KEY_HOURLY_PREFIX = mk(self._API_KEY_HOURLY_PREFIX) + def register_event_handlers(self) -> None: """Register event handlers for pool metrics.""" from ..core.events import ( diff --git a/src/services/file.py b/src/services/file.py index 7c9d7cf..1554398 100644 --- 
a/src/services/file.py +++ b/src/services/file.py @@ -29,8 +29,11 @@ def __init__(self): # which handles IAM vs static credentials automatically self.minio_client = settings.minio.create_client() - # Initialize Redis client - self.redis_client = redis.from_url(settings.get_redis_url(), decode_responses=True) + # Initialize Redis client via the shared connection pool so that + # cluster, sentinel, and TLS modes are handled automatically. + from ..core.pool import redis_pool + + self.redis_client = redis_pool.get_client() self.bucket_name = settings.minio_bucket @@ -55,11 +58,15 @@ def _get_file_key(self, session_id: str, file_id: str, file_type: str = "uploads def _get_file_metadata_key(self, session_id: str, file_id: str) -> str: """Generate Redis key for file metadata.""" - return f"files:{session_id}:{file_id}" + from ..core.pool import redis_pool + + return redis_pool.make_key(f"files:{session_id}:{file_id}") def _get_session_files_key(self, session_id: str) -> str: """Generate Redis key for session file list.""" - return f"session_files:{session_id}" + from ..core.pool import redis_pool + + return redis_pool.make_key(f"session_files:{session_id}") async def _store_file_metadata(self, session_id: str, file_id: str, metadata: dict[str, Any]) -> None: """Store file metadata in Redis.""" diff --git a/src/services/health.py b/src/services/health.py index 083d04d..af90608 100644 --- a/src/services/health.py +++ b/src/services/health.py @@ -142,16 +142,16 @@ async def check_redis(self) -> HealthCheckResult: try: # Use shared connection pool - if not self._redis_client: - from ..core.pool import redis_pool + from ..core.pool import redis_pool + if not self._redis_client: self._redis_client = redis_pool.get_client() # Test basic connectivity await self._redis_client.ping() # Test read/write operations - test_key = "health_check:test" + test_key = redis_pool.make_key("health_check:test") test_value = f"test_{int(time.time())}" await self._redis_client.set(test_key, 
test_value, ex=60) diff --git a/src/services/kubernetes/client.py b/src/services/kubernetes/client.py index a313dc9..3e81ecf 100644 --- a/src/services/kubernetes/client.py +++ b/src/services/kubernetes/client.py @@ -193,9 +193,27 @@ def create_pod_manifest( sidecar_memory_request: str = "256Mi", seccomp_profile_type: str = "RuntimeDefault", network_isolated: bool = False, + execution_mode: str = "agent", + executor_port: int = 9090, + gke_sandbox_enabled: bool = False, + runtime_class_name: str = "gvisor", + sandbox_node_selector: dict[str, str] | None = None, + custom_tolerations: list[dict[str, str]] | None = None, + image_pull_secrets: list[str] | None = None, ) -> client.V1Pod: """Create a Pod manifest for code execution. + Supports two execution modes: + + - agent (default): An executor agent runs in the main container, providing + HTTP-based code execution. No nsenter, no capabilities, no privilege + escalation needed. Compatible with GKE Sandbox (gVisor) and restricted + Pod Security Standards. + + - nsenter (legacy): The sidecar uses nsenter to enter the main container's + mount namespace. Requires SYS_PTRACE, SYS_ADMIN, SYS_CHROOT capabilities, + shareProcessNamespace, and allowPrivilegeEscalation: true. 
+ Args: name: Pod name namespace: Kubernetes namespace @@ -211,10 +229,30 @@ def create_pod_manifest( run_as_user: UID to run containers as sidecar_port: Port for sidecar HTTP API seccomp_profile_type: Seccomp profile type (RuntimeDefault or Unconfined) + network_isolated: Whether network isolation is enabled + execution_mode: Execution mode - "agent" (default) or "nsenter" + executor_port: Port for the executor HTTP server inside the main container + gke_sandbox_enabled: Enable GKE Sandbox (gVisor) for additional kernel isolation + runtime_class_name: Runtime class name for sandboxed pods (default: gvisor) + sandbox_node_selector: Node selector for sandbox-enabled nodes + custom_tolerations: Additional tolerations for custom node pool taints + image_pull_secrets: List of secret names for pulling images from private registries Returns: V1Pod manifest ready for creation. """ + use_agent = execution_mode == "agent" + + # Warn if GKE Sandbox is enabled with nsenter mode (incompatible with gVisor) + if gke_sandbox_enabled and not use_agent: + logger.warning( + "GKE Sandbox (gVisor) is enabled but execution mode is 'nsenter'. " + "nsenter requires capabilities incompatible with gVisor. 
" + "Consider switching to 'agent' execution mode.", + execution_mode=execution_mode, + gke_sandbox_enabled=gke_sandbox_enabled, + ) + # Shared volume for code and data shared_volume = client.V1Volume( name="shared-data", @@ -229,8 +267,8 @@ def create_pod_manifest( mount_path="/mnt/data", ) - # Security context for main container - security_context = client.V1SecurityContext( + # Security context for main container - minimal privileges in both modes + main_security_context = client.V1SecurityContext( run_as_user=run_as_user, run_as_group=run_as_user, run_as_non_root=True, @@ -238,34 +276,35 @@ def create_pod_manifest( capabilities=client.V1Capabilities(drop=["ALL"]), ) - # Security context for sidecar - needs elevated privileges for nsenter - # - # The sidecar uses nsenter to execute code in the main container's mount namespace. - # nsenter requires these capabilities: - # - SYS_PTRACE: access /proc//ns/ of other processes - # - SYS_ADMIN: call setns() to enter namespaces - # - SYS_CHROOT: required for mount namespace operations - # - # For non-root users, Linux capabilities only populate the bounding set, not - # effective/permitted sets. To make capabilities usable, the sidecar Docker image - # uses setcap on the nsenter binary: - # setcap 'cap_sys_ptrace,cap_sys_admin,cap_sys_chroot+eip' /usr/bin/nsenter - # - # The pod spec must still: - # - Add capabilities to the bounding set (capabilities.add) - # - Allow privilege escalation (for file capabilities to be honored) - # - # This approach allows running as non-root while still having nsenter work. 
- sidecar_security_context = client.V1SecurityContext( - run_as_user=run_as_user, - run_as_group=run_as_user, - run_as_non_root=True, - allow_privilege_escalation=True, # Required for file capabilities - capabilities=client.V1Capabilities( - add=["SYS_PTRACE", "SYS_ADMIN", "SYS_CHROOT"], - drop=["ALL"], - ), - ) + if use_agent: + # Agent mode: sidecar also has minimal privileges (no nsenter needed) + sidecar_security_context = client.V1SecurityContext( + run_as_user=run_as_user, + run_as_group=run_as_user, + run_as_non_root=True, + allow_privilege_escalation=False, + capabilities=client.V1Capabilities(drop=["ALL"]), + ) + else: + # nsenter mode: sidecar needs elevated privileges for nsenter + # + # The sidecar uses nsenter to execute code in the main container's mount namespace. + # nsenter requires these capabilities: + # - SYS_PTRACE: access /proc//ns/ of other processes + # - SYS_ADMIN: call setns() to enter namespaces + # - SYS_CHROOT: required for mount namespace operations + # + # File capabilities (setcap on nsenter) require allowPrivilegeEscalation: true. 
+ sidecar_security_context = client.V1SecurityContext( + run_as_user=run_as_user, + run_as_group=run_as_user, + run_as_non_root=True, + allow_privilege_escalation=True, + capabilities=client.V1Capabilities( + add=["SYS_PTRACE", "SYS_ADMIN", "SYS_CHROOT"], + drop=["ALL"], + ), + ) # Resource requirements resources = client.V1ResourceRequirements( @@ -279,7 +318,7 @@ def create_pod_manifest( image=main_image, image_pull_policy=image_pull_policy, volume_mounts=[shared_mount], - security_context=security_context, + security_context=main_security_context, resources=resources, env=[ client.V1EnvVar(name="PYTHONUNBUFFERED", value="1"), @@ -287,6 +326,21 @@ def create_pod_manifest( ], ) + # In agent mode, override CMD to run the executor agent from the shared volume + # (copied there by the init container) + if use_agent: + main_container.args = ["/mnt/data/.executor-agent", "--port", str(executor_port)] + + # Sidecar environment variables + sidecar_env = [ + client.V1EnvVar(name="LANGUAGE", value=language), + client.V1EnvVar(name="WORKING_DIR", value="/mnt/data"), + client.V1EnvVar(name="SIDECAR_PORT", value=str(sidecar_port)), + client.V1EnvVar(name="NETWORK_ISOLATED", value=str(network_isolated).lower()), + client.V1EnvVar(name="EXECUTION_MODE", value=execution_mode), + client.V1EnvVar(name="EXECUTOR_PORT", value=str(executor_port)), + ] + # Sidecar container (HTTP API) sidecar_container = client.V1Container( name="sidecar", @@ -296,17 +350,12 @@ def create_pod_manifest( volume_mounts=[shared_mount], security_context=sidecar_security_context, resources=client.V1ResourceRequirements( - # CRITICAL: User code runs in the sidecar's cgroup via nsenter (Issue #32) - # These limits apply to user code execution, not just the sidecar process + # In nsenter mode: user code runs in the sidecar's cgroup via nsenter + # In agent mode: sidecar only proxies requests, user code runs in main container limits={"cpu": sidecar_cpu_limit, "memory": sidecar_memory_limit}, requests={"cpu": 
sidecar_cpu_request, "memory": sidecar_memory_request}, ), - env=[ - client.V1EnvVar(name="LANGUAGE", value=language), - client.V1EnvVar(name="WORKING_DIR", value="/mnt/data"), - client.V1EnvVar(name="SIDECAR_PORT", value=str(sidecar_port)), - client.V1EnvVar(name="NETWORK_ISOLATED", value=str(network_isolated).lower()), - ], + env=sidecar_env, readiness_probe=client.V1Probe( http_get=client.V1HTTPGetAction(path="/ready", port=sidecar_port), initial_delay_seconds=5, @@ -323,33 +372,111 @@ def create_pod_manifest( ), ) + # Init containers (agent mode only) + # Copy the executor agent binary from the sidecar image to the shared volume + init_containers = None + if use_agent: + init_containers = [ + client.V1Container( + name="agent-init", + image=sidecar_image, + image_pull_policy=image_pull_policy, + command=[ + "python", + "-c", + "import shutil, os; shutil.copy2('/opt/executor-agent', '/mnt/data/.executor-agent'); os.chmod('/mnt/data/.executor-agent', 0o755)", + ], + volume_mounts=[shared_mount], + security_context=client.V1SecurityContext( + run_as_user=run_as_user, + run_as_group=run_as_user, + run_as_non_root=True, + allow_privilege_escalation=False, + capabilities=client.V1Capabilities(drop=["ALL"]), + ), + resources=client.V1ResourceRequirements( + limits={"cpu": "100m", "memory": "64Mi"}, + requests={"cpu": "50m", "memory": "32Mi"}, + ), + ) + ] + + # GKE Sandbox configuration + # When enabled, adds gVisor runtime, node selector, and tolerations + runtime_class = runtime_class_name if gke_sandbox_enabled else None + + # Build node selector + node_selector = {} + if gke_sandbox_enabled: + # GKE automatically adds this label to sandbox-enabled nodes + node_selector["sandbox.gke.io/runtime"] = "gvisor" + if sandbox_node_selector: + node_selector.update(sandbox_node_selector) + + # Build tolerations list + tolerations = [] + if gke_sandbox_enabled: + # GKE Sandbox standard taint + tolerations.append( + client.V1Toleration( + key="sandbox.gke.io/runtime", + 
operator="Equal", + value="gvisor", + effect="NoSchedule", + ) + ) + if custom_tolerations: + # Add custom node pool taints (e.g., pool=sandbox) + for tol in custom_tolerations: + tol_key = tol.get("key") + if not tol_key: + logger.warning("Skipping custom toleration with missing 'key' field", toleration=tol) + continue + tolerations.append( + client.V1Toleration( + key=tol_key, + operator=tol.get("operator", "Equal"), + value=tol.get("value"), + effect=tol.get("effect", "NoSchedule"), + ) + ) + + # Build image pull secrets list + pull_secrets = None + if image_pull_secrets: + pull_secrets = [client.V1LocalObjectReference(name=secret_name) for secret_name in image_pull_secrets] + # Pod spec pod_spec = client.V1PodSpec( + init_containers=init_containers, containers=[main_container, sidecar_container], volumes=[shared_volume], restart_policy="Never", termination_grace_period_seconds=10, - # Share process namespace so sidecar can use nsenter to execute in main container - share_process_namespace=True, + # Share process namespace only needed for nsenter mode + share_process_namespace=not use_agent, + runtime_class_name=runtime_class, + node_selector=node_selector if node_selector else None, + tolerations=tolerations if tolerations else None, + image_pull_secrets=pull_secrets, security_context=client.V1PodSecurityContext( - # Note: We don't set run_as_user at pod level; each container - # sets its own security context. Both run as non-root UID 65532. - # The sidecar uses file capabilities (setcap) on nsenter for privileges. 
fs_group=run_as_user, - # Apply seccomp profile to block dangerous syscalls - # while preserving nsenter functionality for the sidecar seccomp_profile=client.V1SeccompProfile(type=seccomp_profile_type), ), - # Prevent scheduling on same node as other execution pods - # (optional, can be configured via affinity) ) # Pod metadata + # Add GKE Sandbox annotation if enabled + pod_annotations = dict(annotations) if annotations else {} + if gke_sandbox_enabled: + # GKE Sandbox annotation for gVisor runtime + pod_annotations["sandbox.gke.io/runtime"] = "gvisor" + metadata = client.V1ObjectMeta( name=name, namespace=namespace, labels=labels, - annotations=annotations or {}, + annotations=pod_annotations, ) return client.V1Pod( diff --git a/src/services/kubernetes/job_executor.py b/src/services/kubernetes/job_executor.py index d69d700..d301210 100644 --- a/src/services/kubernetes/job_executor.py +++ b/src/services/kubernetes/job_executor.py @@ -5,8 +5,6 @@ """ import asyncio -from datetime import datetime -from typing import Any, Dict, List, Optional from uuid import uuid4 import httpx @@ -41,7 +39,7 @@ def __init__( namespace: str | None = None, ttl_seconds_after_finished: int = 60, active_deadline_seconds: int = 300, - sidecar_image: str = "aronmuon/kubecoderun-sidecar:latest", + sidecar_image: str = "aronmuon/kubecoderun-sidecar-agent:latest", ): """Initialize the Job executor. 
@@ -123,8 +121,16 @@ async def create_job( sidecar_memory_limit=spec.sidecar_memory_limit, sidecar_cpu_request=spec.sidecar_cpu_request, sidecar_memory_request=spec.sidecar_memory_request, + execution_mode=spec.execution_mode, + executor_port=spec.executor_port, seccomp_profile_type=spec.seccomp_profile_type, network_isolated=spec.network_isolated, + gke_sandbox_enabled=spec.gke_sandbox_enabled, + runtime_class_name=spec.runtime_class_name, + sandbox_node_selector=spec.sandbox_node_selector, + custom_tolerations=spec.custom_tolerations, + image_pull_policy=spec.image_pull_policy, + image_pull_secrets=spec.image_pull_secrets, ttl_seconds_after_finished=self.ttl_seconds_after_finished, active_deadline_seconds=self.active_deadline_seconds, ) diff --git a/src/services/kubernetes/manager.py b/src/services/kubernetes/manager.py index 3690b66..6b645a6 100644 --- a/src/services/kubernetes/manager.py +++ b/src/services/kubernetes/manager.py @@ -41,13 +41,21 @@ def __init__( self, namespace: str | None = None, pool_configs: list[PoolConfig] | None = None, - sidecar_image: str = "aronmuon/kubecoderun-sidecar:latest", + sidecar_image: str = "aronmuon/kubecoderun-sidecar-agent:latest", default_cpu_limit: str = "1", default_memory_limit: str = "512Mi", default_cpu_request: str = "100m", default_memory_request: str = "128Mi", + execution_mode: str = "agent", + executor_port: int = 9090, seccomp_profile_type: str = "RuntimeDefault", network_isolated: bool = False, + image_pull_policy: str = "Always", + gke_sandbox_enabled: bool = False, + runtime_class_name: str = "gvisor", + sandbox_node_selector: dict[str, str] | None = None, + custom_tolerations: list[dict[str, str]] | None = None, + image_pull_secrets: list[str] | None = None, ): """Initialize the Kubernetes manager. 
@@ -59,8 +67,16 @@ def __init__( default_memory_limit: Default memory limit for pods default_cpu_request: Default CPU request for pods default_memory_request: Default memory request for pods + execution_mode: Execution mode - "agent" (default) or "nsenter" + executor_port: Port for executor HTTP server in the main container seccomp_profile_type: Seccomp profile type (RuntimeDefault, Unconfined, Localhost) network_isolated: Whether network isolation is enabled (disables network-dependent features) + image_pull_policy: Image pull policy for execution pods (Always, IfNotPresent, Never) + gke_sandbox_enabled: Enable GKE Sandbox (gVisor) for additional kernel isolation + runtime_class_name: Runtime class name for sandboxed pods + sandbox_node_selector: Node selector for sandbox-enabled nodes + custom_tolerations: Custom tolerations for node pool taints + image_pull_secrets: List of secret names for pulling images from private registries """ self.namespace = namespace or get_current_namespace() self.sidecar_image = sidecar_image @@ -68,13 +84,22 @@ def __init__( self.default_memory_limit = default_memory_limit self.default_cpu_request = default_cpu_request self.default_memory_request = default_memory_request + self.execution_mode = execution_mode + self.executor_port = executor_port self.seccomp_profile_type = seccomp_profile_type self.network_isolated = network_isolated + self.image_pull_policy = image_pull_policy + self.gke_sandbox_enabled = gke_sandbox_enabled + self.runtime_class_name = runtime_class_name + self.sandbox_node_selector = sandbox_node_selector + self.custom_tolerations = custom_tolerations + self.image_pull_secrets = image_pull_secrets + self._pool_configs = pool_configs or [] # Pool manager for warm pods self._pool_manager = PodPoolManager( namespace=self.namespace, - configs=pool_configs or [], + configs=self._pool_configs, ) # Job executor for cold languages @@ -268,6 +293,15 @@ async def execute_code( return result, handle, source else: # Use Job 
execution + # Get image_pull_secrets from pool config for this language + pull_secrets = self.image_pull_secrets + pull_policy = self.image_pull_policy + for config in self._pool_configs: + if config.language.lower() == language.lower(): + pull_secrets = config.image_pull_secrets or self.image_pull_secrets + pull_policy = config.image_pull_policy or self.image_pull_policy + break + spec = PodSpec( language=language, image=self.get_image_for_language(language), @@ -278,8 +312,16 @@ async def execute_code( memory_limit=self.default_memory_limit, cpu_request=self.default_cpu_request, memory_request=self.default_memory_request, + execution_mode=self.execution_mode, + executor_port=self.executor_port, seccomp_profile_type=self.seccomp_profile_type, network_isolated=self.network_isolated, + image_pull_policy=pull_policy, + gke_sandbox_enabled=self.gke_sandbox_enabled, + runtime_class_name=self.runtime_class_name, + sandbox_node_selector=self.sandbox_node_selector, + custom_tolerations=self.custom_tolerations, + image_pull_secrets=pull_secrets, ) result = await self._job_executor.execute_with_job( diff --git a/src/services/kubernetes/models.py b/src/services/kubernetes/models.py index f9db8c4..6e06be0 100644 --- a/src/services/kubernetes/models.py +++ b/src/services/kubernetes/models.py @@ -106,7 +106,9 @@ class PodSpec: cpu_request: str = "100m" memory_request: str = "128Mi" - # Sidecar resource limits (CRITICAL: user code runs in sidecar's cgroup via nsenter) + # Sidecar resource limits + # In nsenter mode: user code runs in sidecar's cgroup via nsenter + # In agent mode: user code runs in main container's cgroup sidecar_cpu_limit: str = "500m" sidecar_memory_limit: str = "512Mi" sidecar_cpu_request: str = "100m" @@ -116,15 +118,27 @@ class PodSpec: run_as_user: int = 65532 run_as_group: int = 65532 run_as_non_root: bool = True + execution_mode: str = "agent" # "agent" or "nsenter" + executor_port: int = 9090 seccomp_profile_type: str = "RuntimeDefault" # Sidecar 
configuration - sidecar_image: str = "aronmuon/kubecoderun-sidecar:latest" + sidecar_image: str = "aronmuon/kubecoderun-sidecar-agent:latest" sidecar_port: int = 8080 + # Image pull policy and secrets + image_pull_policy: str = "Always" + image_pull_secrets: list[str] | None = None + # Network isolation mode - disables network-dependent features (e.g., Go module proxy) network_isolated: bool = False + # GKE Sandbox (gVisor) configuration + gke_sandbox_enabled: bool = False + runtime_class_name: str = "gvisor" + sandbox_node_selector: dict[str, str] | None = None + custom_tolerations: list[dict[str, str]] | None = None + @dataclass class PoolConfig: @@ -133,13 +147,15 @@ class PoolConfig: language: str image: str pool_size: int = 0 # 0 = use Jobs instead of pool - sidecar_image: str = "aronmuon/kubecoderun-sidecar:latest" + sidecar_image: str = "aronmuon/kubecoderun-sidecar-agent:latest" # Resource limits (can override defaults) cpu_limit: str | None = None memory_limit: str | None = None - # Sidecar resource limits (CRITICAL: user code runs in sidecar's cgroup via nsenter) + # Sidecar resource limits + # In nsenter mode: user code runs in sidecar's cgroup via nsenter + # In agent mode: user code runs in main container's cgroup sidecar_cpu_limit: str = "500m" sidecar_memory_limit: str = "512Mi" sidecar_cpu_request: str = "100m" @@ -148,12 +164,23 @@ class PoolConfig: # Image pull policy (Always, IfNotPresent, Never) image_pull_policy: str = "Always" - # Seccomp profile type (RuntimeDefault, Unconfined, Localhost) + # Image pull secrets (list of secret names) + image_pull_secrets: list[str] | None = None + + # Execution mode and security settings + execution_mode: str = "agent" # "agent" or "nsenter" + executor_port: int = 9090 seccomp_profile_type: str = "RuntimeDefault" # Network isolation mode - disables network-dependent features (e.g., Go module proxy) network_isolated: bool = False + # GKE Sandbox (gVisor) configuration + gke_sandbox_enabled: bool = False + 
runtime_class_name: str = "gvisor" + sandbox_node_selector: dict[str, str] | None = None + custom_tolerations: list[dict[str, str]] | None = None + @property def uses_pool(self) -> bool: """Whether this language uses a warm pod pool.""" diff --git a/src/services/kubernetes/pool.py b/src/services/kubernetes/pool.py index 98fb572..04eb73a 100644 --- a/src/services/kubernetes/pool.py +++ b/src/services/kubernetes/pool.py @@ -185,8 +185,15 @@ async def _create_warm_pod(self) -> PooledPod | None: sidecar_memory_limit=self.config.sidecar_memory_limit, sidecar_cpu_request=self.config.sidecar_cpu_request, sidecar_memory_request=self.config.sidecar_memory_request, + execution_mode=self.config.execution_mode, + executor_port=self.config.executor_port, seccomp_profile_type=self.config.seccomp_profile_type, network_isolated=self.config.network_isolated, + gke_sandbox_enabled=self.config.gke_sandbox_enabled, + runtime_class_name=self.config.runtime_class_name, + sandbox_node_selector=self.config.sandbox_node_selector, + custom_tolerations=self.config.custom_tolerations, + image_pull_secrets=self.config.image_pull_secrets, ) try: diff --git a/src/services/metrics.py b/src/services/metrics.py index cb5281e..7a1f40a 100644 --- a/src/services/metrics.py +++ b/src/services/metrics.py @@ -391,8 +391,10 @@ async def _persist_metrics_to_redis(self) -> None: } # Store in Redis with TTL + from ..core.pool import redis_pool + await self._redis_client.setex( - "metrics:current", + redis_pool.make_key("metrics:current"), 86400, str(metrics_data), # 24 hours TTL ) @@ -400,7 +402,7 @@ async def _persist_metrics_to_redis(self) -> None: # Store historical data (keep last 24 hours) hour_key = datetime.now(UTC).strftime("%Y-%m-%d-%H") await self._redis_client.setex( - f"metrics:hourly:{hour_key}", + redis_pool.make_key(f"metrics:hourly:{hour_key}"), 86400 * 7, # 7 days TTL for hourly data str(metrics_data), ) @@ -417,7 +419,9 @@ async def _load_metrics_from_redis(self) -> None: try: # Load 
current metrics - current_data = await self._redis_client.get("metrics:current") + from ..core.pool import redis_pool + + current_data = await self._redis_client.get(redis_pool.make_key("metrics:current")) if current_data: # In a full implementation, we would parse and restore the metrics # For now, just log that we found existing data diff --git a/src/services/session.py b/src/services/session.py index 4dbbd9b..0288d20 100644 --- a/src/services/session.py +++ b/src/services/session.py @@ -122,15 +122,15 @@ def _generate_session_id(self) -> str: def _session_key(self, session_id: str) -> str: """Generate Redis key for session data.""" - return f"sessions:{session_id}" + return redis_pool.make_key(f"sessions:{session_id}") def _session_index_key(self) -> str: """Generate Redis key for session index.""" - return "sessions:index" + return redis_pool.make_key("sessions:index") def _entity_sessions_key(self, entity_id: str) -> str: """Generate Redis key for entity-based session grouping.""" - return f"entity_sessions:{entity_id}" + return redis_pool.make_key(f"entity_sessions:{entity_id}") async def create_session(self, request: SessionCreate) -> Session: """Create a new code execution session.""" @@ -169,8 +169,9 @@ async def create_session(self, request: SessionCreate) -> Session: # Extract entity_id from metadata if provided entity_id = request.metadata.get("entity_id") if request.metadata else None - # Use Redis transaction to ensure atomicity - pipe = await self.redis.pipeline(transaction=True) + # Use pipeline for batching (transaction=False for Redis Cluster + # compatibility — keys span different hash slots) + pipe = self.redis.pipeline(transaction=False) try: # Store session data pipe.hset(session_key, mapping=session_data) @@ -307,8 +308,9 @@ async def delete_session(self, session_id: str) -> bool: ) # Continue with session deletion even if file cleanup fails - # Use transaction to ensure atomicity - pipe = await self.redis.pipeline(transaction=True) + # Use 
pipeline for batching (transaction=False for Redis Cluster + # compatibility — keys span different hash slots) + pipe = self.redis.pipeline(transaction=False) try: # Remove session data pipe.delete(session_key) diff --git a/src/services/state.py b/src/services/state.py index 29d83c9..654c34c 100644 --- a/src/services/state.py +++ b/src/services/state.py @@ -55,19 +55,19 @@ def __init__(self, redis_client: redis.Redis | None = None): def _state_key(self, session_id: str) -> str: """Generate Redis key for session state.""" - return f"{self.KEY_PREFIX}{session_id}" + return redis_pool.make_key(f"{self.KEY_PREFIX}{session_id}") def _hash_key(self, session_id: str) -> str: """Generate Redis key for state hash.""" - return f"{self.HASH_KEY_PREFIX}{session_id}" + return redis_pool.make_key(f"{self.HASH_KEY_PREFIX}{session_id}") def _meta_key(self, session_id: str) -> str: """Generate Redis key for state metadata.""" - return f"{self.META_KEY_PREFIX}{session_id}" + return redis_pool.make_key(f"{self.META_KEY_PREFIX}{session_id}") def _upload_marker_key(self, session_id: str) -> str: """Generate Redis key for upload marker.""" - return f"{self.UPLOAD_MARKER_PREFIX}{session_id}" + return redis_pool.make_key(f"{self.UPLOAD_MARKER_PREFIX}{session_id}") @staticmethod def compute_hash(raw_bytes: bytes) -> str: @@ -133,8 +133,9 @@ async def save_state( state_hash = self.compute_hash(raw_bytes) now = datetime.now(UTC) - # Use pipeline for atomic operations - pipe = self.redis.pipeline(transaction=True) + # Use pipeline for batching (transaction=False for Redis Cluster + # compatibility — state/hash/meta keys hash to different slots) + pipe = self.redis.pipeline(transaction=False) # Save state pipe.setex(self._state_key(session_id), ttl_seconds, state_b64) diff --git a/src/utils/config_validator.py b/src/utils/config_validator.py index 53328aa..82153a0 100644 --- a/src/utils/config_validator.py +++ b/src/utils/config_validator.py @@ -5,6 +5,8 @@ import redis from minio.error import 
S3Error +from redis.cluster import ClusterNode, RedisCluster +from redis.sentinel import Sentinel from ..config import settings @@ -94,18 +96,71 @@ def _validate_file_config(self): self.errors.append(f"File extension must start with dot: {ext}") def _validate_redis_connection(self): - """Validate Redis connection.""" + """Validate Redis connection. + + Uses the correct client type depending on REDIS_MODE (standalone, + cluster, or sentinel) and forwards TLS kwargs so that managed + services with custom CA certificates are validated correctly. + """ try: - # Use Redis URL from settings - client = redis.from_url( - settings.get_redis_url(), - socket_timeout=settings.redis_socket_timeout, - socket_connect_timeout=settings.redis_socket_connect_timeout, - max_connections=settings.redis_max_connections, - ) - - # Test connection - client.ping() + redis_cfg = settings.redis + tls_kwargs = redis_cfg.get_tls_kwargs() + # ``ssl`` is implied by the ``rediss://`` scheme for standalone; + # for cluster/sentinel it's passed directly. 
+ tls_standalone = {k: v for k, v in tls_kwargs.items() if k != "ssl"} + + if redis_cfg.mode == "cluster": + # --- Cluster mode --- + if redis_cfg.cluster_nodes: + startup_nodes = [ + ClusterNode(host=h, port=p) for h, p in redis_cfg.parse_nodes(redis_cfg.cluster_nodes) + ] + else: + startup_nodes = [ClusterNode(host=redis_cfg.host, port=redis_cfg.port)] + + client = RedisCluster( + startup_nodes=startup_nodes, + password=redis_cfg.password, + socket_timeout=redis_cfg.socket_timeout, + socket_connect_timeout=redis_cfg.socket_connect_timeout, + **tls_kwargs, + ) + client.ping() + client.close() + + elif redis_cfg.mode == "sentinel": + # --- Sentinel mode --- + if redis_cfg.sentinel_nodes: + sentinel_hosts = redis_cfg.parse_nodes(redis_cfg.sentinel_nodes) + else: + sentinel_hosts = [(redis_cfg.host, 26379)] + + sentinel = Sentinel( + sentinels=sentinel_hosts, + password=redis_cfg.sentinel_password, + socket_timeout=redis_cfg.socket_timeout, + socket_connect_timeout=redis_cfg.socket_connect_timeout, + **tls_kwargs, + ) + master = sentinel.master_for( + service_name=redis_cfg.sentinel_master, + password=redis_cfg.password, + socket_timeout=redis_cfg.socket_timeout, + socket_connect_timeout=redis_cfg.socket_connect_timeout, + **tls_kwargs, + ) + master.ping() + + else: + # --- Standalone mode --- + client = redis.from_url( + settings.get_redis_url(), + socket_timeout=settings.redis_socket_timeout, + socket_connect_timeout=settings.redis_socket_connect_timeout, + max_connections=settings.redis_max_connections, + **tls_standalone, + ) + client.ping() except redis.ConnectionError as e: # Treat as warning in development mode to allow startup without Redis diff --git a/src/utils/logging.py b/src/utils/logging.py index 37c6347..54ceaee 100644 --- a/src/utils/logging.py +++ b/src/utils/logging.py @@ -110,7 +110,7 @@ def configure_third_party_loggers() -> None: def add_service_context(logger, method_name, event_dict): """Add service context information to log entries.""" 
event_dict["service"] = "kubecoderun-api" - event_dict["version"] = __version__ + event_dict["version"] = settings.service_version or __version__ return event_dict diff --git a/tests/integration/test_redis_cluster.py b/tests/integration/test_redis_cluster.py new file mode 100644 index 0000000..679cc4f --- /dev/null +++ b/tests/integration/test_redis_cluster.py @@ -0,0 +1,295 @@ +"""Integration test for Redis Cluster connectivity. + +Requires a running Redis Cluster on localhost:7000-7005. +Start with: docker compose -f docker-compose.redis-cluster.yml up -d + +Usage: + uv run python -m pytest tests/integration/test_redis_cluster.py -v +""" + +import asyncio +import os + +import pytest +import redis as sync_redis +import redis.asyncio as async_redis +from redis.asyncio.cluster import RedisCluster as AsyncRedisCluster +from redis.cluster import ClusterNode, RedisCluster + +# Only run when cluster is available +CLUSTER_HOST = os.environ.get("REDIS_CLUSTER_HOST", "127.0.0.1") +CLUSTER_PORT = int(os.environ.get("REDIS_CLUSTER_PORT", "7000")) + +pytestmark = pytest.mark.integration + + +def _cluster_available() -> bool: + """Check if a Redis Cluster is reachable.""" + try: + rc = RedisCluster( + startup_nodes=[ClusterNode(host=CLUSTER_HOST, port=CLUSTER_PORT)], + decode_responses=True, + socket_timeout=2, + socket_connect_timeout=2, + ) + rc.ping() + rc.close() + return True + except Exception: + return False + + +skip_no_cluster = pytest.mark.skipif( + not _cluster_available(), + reason=f"Redis Cluster not available at {CLUSTER_HOST}:{CLUSTER_PORT}", +) + + +# ── Synchronous (validator path) ────────────────────────────────────────── + + +@skip_no_cluster +class TestSyncRedisCluster: + """Tests using synchronous redis-py RedisCluster (same as config_validator).""" + + def test_connect_with_single_startup_node(self): + """Cluster discovery works from a single startup node.""" + rc = RedisCluster( + startup_nodes=[ClusterNode(host=CLUSTER_HOST, port=CLUSTER_PORT)], + 
decode_responses=True, + socket_timeout=5, + socket_connect_timeout=5, + ) + assert rc.ping() is True + # Verify the cluster is operational via a targeted node + node_info = rc.cluster_info(target_nodes=RedisCluster.RANDOM) + assert node_info.get("cluster_state") == "ok" + rc.close() + + def test_connect_with_multiple_startup_nodes(self): + """Cluster discovery works from multiple startup nodes.""" + nodes = [ + ClusterNode(host=CLUSTER_HOST, port=CLUSTER_PORT), + ClusterNode(host=CLUSTER_HOST, port=CLUSTER_PORT + 1), + ] + rc = RedisCluster( + startup_nodes=nodes, + decode_responses=True, + socket_timeout=5, + socket_connect_timeout=5, + ) + assert rc.ping() is True + rc.close() + + def test_connect_with_no_password(self): + """Cluster connects with password=None (no AUTH).""" + rc = RedisCluster( + startup_nodes=[ClusterNode(host=CLUSTER_HOST, port=CLUSTER_PORT)], + password=None, + decode_responses=True, + socket_timeout=5, + ) + assert rc.ping() is True + rc.close() + + def test_empty_password_converted_to_none(self): + """Our validator converts empty password to None to avoid spurious AUTH. + + Redis servers without requirepass accept AUTH with any string, + so we can't observe the bug via an error. Instead, verify that + our Settings validator normalises empty password to None. 
+ """ + from src.config import Settings + + s = Settings(redis_password="") + assert s.redis_password is None + + s2 = Settings(redis_password=" ") + assert s2.redis_password is None + + s3 = Settings(redis_password="real-password") + assert s3.redis_password == "real-password" + + def test_set_get_operations(self): + """Basic SET/GET across cluster slots.""" + rc = RedisCluster( + startup_nodes=[ClusterNode(host=CLUSTER_HOST, port=CLUSTER_PORT)], + decode_responses=True, + ) + # These keys hash to different slots + for i in range(10): + key = f"test:cluster:{i}" + rc.set(key, f"value-{i}") + assert rc.get(key) == f"value-{i}" + rc.delete(key) + rc.close() + + +# ── Asynchronous (pool path) ───────────────────────────────────────────── + + +@skip_no_cluster +class TestAsyncRedisCluster: + """Tests using async redis-py RedisCluster (same as RedisPool._init_cluster).""" + + @pytest.mark.asyncio + async def test_async_connect_and_ping(self): + """Async cluster client connects and pings.""" + from redis.backoff import ExponentialBackoff + from redis.exceptions import ConnectionError, TimeoutError + from redis.retry import Retry + + rc = AsyncRedisCluster( + startup_nodes=[ + async_redis.cluster.ClusterNode(host=CLUSTER_HOST, port=CLUSTER_PORT), + ], + password=None, + decode_responses=True, + max_connections=20, + socket_timeout=5.0, + socket_connect_timeout=5.0, + retry=Retry(ExponentialBackoff(), retries=3), + retry_on_error=[ConnectionError, TimeoutError], + ) + result = await rc.ping() + assert result is True + await rc.aclose() + + @pytest.mark.asyncio + async def test_async_set_get(self): + """Async SET/GET across cluster slots.""" + rc = AsyncRedisCluster( + startup_nodes=[ + async_redis.cluster.ClusterNode(host=CLUSTER_HOST, port=CLUSTER_PORT), + ], + decode_responses=True, + ) + for i in range(10): + key = f"test:async:cluster:{i}" + await rc.set(key, f"value-{i}") + val = await rc.get(key) + assert val == f"value-{i}" + await rc.delete(key) + await 
rc.aclose() + + +# ── RedisPool integration ──────────────────────────────────────────────── + + +@skip_no_cluster +class TestRedisPoolClusterMode: + """Test RedisPool with actual cluster backend.""" + + @pytest.mark.asyncio + async def test_pool_cluster_mode(self, monkeypatch): + """RedisPool initializes in cluster mode and can SET/GET.""" + monkeypatch.setenv("REDIS_MODE", "cluster") + monkeypatch.setenv("REDIS_HOST", CLUSTER_HOST) + monkeypatch.setenv("REDIS_PORT", str(CLUSTER_PORT)) + monkeypatch.setenv("REDIS_PASSWORD", "") # empty = no auth + monkeypatch.setenv("REDIS_TLS_ENABLED", "false") + monkeypatch.setenv("REDIS_CLUSTER_NODES", "") # empty = fallback to host:port + + # Re-import to pick up new env + from src.config import Settings + + settings_obj = Settings() + cfg = settings_obj.redis + + # Verify our validators worked + assert cfg.password is None, f"Expected None, got {cfg.password!r}" + assert cfg.cluster_nodes is None, f"Expected None, got {cfg.cluster_nodes!r}" + + from src.core.pool import RedisPool + + pool = RedisPool() + # Inject our test settings + monkeypatch.setattr("src.core.pool.settings", settings_obj) + pool._initialize() + + client = pool.get_client() + assert isinstance(client, AsyncRedisCluster) + + # Test operations + await client.set("test:pool:cluster", "works") + val = await client.get("test:pool:cluster") + assert val == "works" + await client.delete("test:pool:cluster") + await client.aclose() + + @pytest.mark.asyncio + async def test_pool_cluster_mode_with_explicit_nodes(self, monkeypatch): + """RedisPool uses REDIS_CLUSTER_NODES when provided.""" + nodes_str = f"{CLUSTER_HOST}:{CLUSTER_PORT},{CLUSTER_HOST}:{CLUSTER_PORT + 1}" + monkeypatch.setenv("REDIS_MODE", "cluster") + monkeypatch.setenv("REDIS_CLUSTER_NODES", nodes_str) + monkeypatch.setenv("REDIS_PASSWORD", "") + monkeypatch.setenv("REDIS_TLS_ENABLED", "false") + + from src.config import Settings + + settings_obj = Settings() + cfg = settings_obj.redis + + assert 
cfg.cluster_nodes == nodes_str + assert cfg.password is None + + from src.core.pool import RedisPool + + pool = RedisPool() + monkeypatch.setattr("src.core.pool.settings", settings_obj) + pool._initialize() + + client = pool.get_client() + result = await client.ping() + assert result is True + await client.aclose() + + +# ── Config Validator integration ───────────────────────────────────────── + + +@skip_no_cluster +class TestConfigValidatorClusterMode: + """Test ConfigValidator._validate_redis_connection with real cluster.""" + + def test_validator_cluster_succeeds(self, monkeypatch): + """Config validator passes with a real cluster.""" + monkeypatch.setenv("REDIS_MODE", "cluster") + monkeypatch.setenv("REDIS_HOST", CLUSTER_HOST) + monkeypatch.setenv("REDIS_PORT", str(CLUSTER_PORT)) + monkeypatch.setenv("REDIS_PASSWORD", "") + monkeypatch.setenv("REDIS_TLS_ENABLED", "false") + monkeypatch.setenv("REDIS_CLUSTER_NODES", "") + + from src.config import Settings + + settings_obj = Settings() + monkeypatch.setattr("src.utils.config_validator.settings", settings_obj) + + from src.utils.config_validator import ConfigValidator + + validator = ConfigValidator() + validator._validate_redis_connection() + + assert not validator.errors, f"Unexpected errors: {validator.errors}" + + def test_validator_cluster_with_explicit_nodes(self, monkeypatch): + """Config validator passes with explicit cluster nodes.""" + nodes_str = f"{CLUSTER_HOST}:{CLUSTER_PORT},{CLUSTER_HOST}:{CLUSTER_PORT + 1},{CLUSTER_HOST}:{CLUSTER_PORT + 2}" + monkeypatch.setenv("REDIS_MODE", "cluster") + monkeypatch.setenv("REDIS_CLUSTER_NODES", nodes_str) + monkeypatch.setenv("REDIS_PASSWORD", "") + monkeypatch.setenv("REDIS_TLS_ENABLED", "false") + + from src.config import Settings + + settings_obj = Settings() + monkeypatch.setattr("src.utils.config_validator.settings", settings_obj) + + from src.utils.config_validator import ConfigValidator + + validator = ConfigValidator() + 
validator._validate_redis_connection() + + assert not validator.errors, f"Unexpected errors: {validator.errors}" diff --git a/tests/integration/test_redis_cluster_tls.py b/tests/integration/test_redis_cluster_tls.py new file mode 100644 index 0000000..e4f9f47 --- /dev/null +++ b/tests/integration/test_redis_cluster_tls.py @@ -0,0 +1,454 @@ +"""Integration tests for Redis Cluster with TLS. + +Mirrors the user's production GCP Memorystore configuration: +- REDIS_MODE=cluster +- REDIS_TLS_ENABLED=true +- REDIS_TLS_CA_CERT_FILE=/path/to/ca.crt (server verification) +- REDIS_TLS_CERT_FILE="" (no client cert / no mTLS) +- REDIS_TLS_KEY_FILE="" (no client key / no mTLS) +- REDIS_TLS_INSECURE=false (certificate chain verified) +- REDIS_TLS_CHECK_HOSTNAME not set (defaults to false) +- REDIS_PASSWORD="" (no authentication) +- REDIS_CLUSTER_NODES not set (falls back to host:port) +- REDIS_KEY_PREFIX=kubecoderun: + +Requires a running TLS Redis Cluster on localhost:6380-6385. +Start with: docker compose -f docker-compose.redis-cluster-tls.yml up -d + +Usage: + uv run python -m pytest tests/integration/test_redis_cluster_tls.py -v +""" + +import os +import ssl as ssl_mod +from pathlib import Path + +import pytest +import redis as sync_redis +import redis.asyncio as async_redis +from redis.asyncio.cluster import RedisCluster as AsyncRedisCluster +from redis.cluster import ClusterNode, RedisCluster + +# ── Configuration matching production ──────────────────────────────────── + +TLS_CLUSTER_HOST = os.environ.get("REDIS_TLS_CLUSTER_HOST", "127.0.0.1") +TLS_CLUSTER_PORT = int(os.environ.get("REDIS_TLS_CLUSTER_PORT", "6380")) + +# CA cert path (relative to project root, same concept as production +# REDIS_TLS_CA_CERT_FILE=/app/api/cache/redis-ca.crt) +CERTS_DIR = Path(__file__).resolve().parent.parent / "tls-certs" +CA_CERT_FILE = str(CERTS_DIR / "ca.crt") + +pytestmark = pytest.mark.integration + + +def _tls_kwargs_production() -> dict: + """Build TLS kwargs matching production 
config. + + This mirrors what RedisConfig.get_tls_kwargs() produces with: + REDIS_TLS_ENABLED=true + REDIS_TLS_INSECURE=false + REDIS_TLS_CHECK_HOSTNAME=false (default) + REDIS_TLS_CA_CERT_FILE=/path/to/ca.crt + REDIS_TLS_CERT_FILE="" -> None + REDIS_TLS_KEY_FILE="" -> None + """ + return { + "ssl": True, + "ssl_cert_reqs": ssl_mod.CERT_REQUIRED, + "ssl_check_hostname": False, + "ssl_ca_certs": CA_CERT_FILE, + } + + +def _tls_cluster_available() -> bool: + """Check if a TLS Redis Cluster is reachable.""" + try: + rc = RedisCluster( + startup_nodes=[ClusterNode(host=TLS_CLUSTER_HOST, port=TLS_CLUSTER_PORT)], + decode_responses=True, + socket_timeout=3, + socket_connect_timeout=3, + **_tls_kwargs_production(), + ) + rc.ping() + rc.close() + return True + except Exception: + return False + + +skip_no_tls_cluster = pytest.mark.skipif( + not _tls_cluster_available(), + reason=f"TLS Redis Cluster not available at {TLS_CLUSTER_HOST}:{TLS_CLUSTER_PORT}", +) + + +# ── Synchronous TLS Cluster tests ──────────────────────────────────────── + + +@skip_no_tls_cluster +class TestSyncTlsCluster: + """Synchronous redis-py with TLS (same path as config_validator).""" + + def test_connect_single_startup_node_tls(self): + """TLS cluster discovery from a single startup node.""" + rc = RedisCluster( + startup_nodes=[ClusterNode(host=TLS_CLUSTER_HOST, port=TLS_CLUSTER_PORT)], + decode_responses=True, + socket_timeout=5, + socket_connect_timeout=5, + **_tls_kwargs_production(), + ) + assert rc.ping() is True + node_info = rc.cluster_info(target_nodes=RedisCluster.RANDOM) + assert node_info.get("cluster_state") == "ok" + rc.close() + + def test_connect_no_password_tls(self): + """TLS cluster with password=None (production has REDIS_PASSWORD='').""" + rc = RedisCluster( + startup_nodes=[ClusterNode(host=TLS_CLUSTER_HOST, port=TLS_CLUSTER_PORT)], + password=None, + decode_responses=True, + socket_timeout=5, + **_tls_kwargs_production(), + ) + assert rc.ping() is True + rc.close() + + def 
test_set_get_across_slots_tls(self): + """SET/GET across cluster slots over TLS.""" + rc = RedisCluster( + startup_nodes=[ClusterNode(host=TLS_CLUSTER_HOST, port=TLS_CLUSTER_PORT)], + decode_responses=True, + **_tls_kwargs_production(), + ) + for i in range(10): + key = f"test:tls:cluster:{i}" + rc.set(key, f"value-{i}") + assert rc.get(key) == f"value-{i}" + rc.delete(key) + rc.close() + + def test_key_prefix_operations_tls(self): + """Operations with kubecoderun: prefix (matching production key_prefix).""" + rc = RedisCluster( + startup_nodes=[ClusterNode(host=TLS_CLUSTER_HOST, port=TLS_CLUSTER_PORT)], + decode_responses=True, + **_tls_kwargs_production(), + ) + prefix = "kubecoderun:" + key = f"{prefix}session:test-abc" + rc.set(key, "session-data") + assert rc.get(key) == "session-data" + rc.delete(key) + rc.close() + + +# ── Asynchronous TLS Cluster tests ─────────────────────────────────────── + + +@skip_no_tls_cluster +class TestAsyncTlsCluster: + """Async redis-py with TLS (same path as RedisPool._init_cluster).""" + + @pytest.mark.asyncio + async def test_async_connect_tls(self): + """Async TLS cluster client connects and pings.""" + from redis.backoff import ExponentialBackoff + from redis.exceptions import ConnectionError, TimeoutError + from redis.retry import Retry + + rc = AsyncRedisCluster( + startup_nodes=[ + async_redis.cluster.ClusterNode(host=TLS_CLUSTER_HOST, port=TLS_CLUSTER_PORT), + ], + password=None, + decode_responses=True, + max_connections=20, + socket_timeout=5.0, + socket_connect_timeout=5.0, + retry=Retry(ExponentialBackoff(), retries=3), + retry_on_error=[ConnectionError, TimeoutError], + **_tls_kwargs_production(), + ) + assert await rc.ping() is True + await rc.aclose() + + @pytest.mark.asyncio + async def test_async_set_get_tls(self): + """Async SET/GET over TLS cluster.""" + rc = AsyncRedisCluster( + startup_nodes=[ + async_redis.cluster.ClusterNode(host=TLS_CLUSTER_HOST, port=TLS_CLUSTER_PORT), + ], + decode_responses=True, + 
**_tls_kwargs_production(), + ) + for i in range(10): + key = f"test:async:tls:{i}" + await rc.set(key, f"tls-value-{i}") + val = await rc.get(key) + assert val == f"tls-value-{i}" + await rc.delete(key) + await rc.aclose() + + @pytest.mark.asyncio + async def test_async_prefixed_operations_tls(self): + """Async operations with production-like key prefix over TLS.""" + rc = AsyncRedisCluster( + startup_nodes=[ + async_redis.cluster.ClusterNode(host=TLS_CLUSTER_HOST, port=TLS_CLUSTER_PORT), + ], + decode_responses=True, + **_tls_kwargs_production(), + ) + prefix = "kubecoderun:" + keys = [f"{prefix}session:{i}" for i in range(5)] + for key in keys: + await rc.set(key, "data") + assert await rc.get(key) == "data" + for key in keys: + await rc.delete(key) + await rc.aclose() + + +# ── RedisPool with TLS Cluster ─────────────────────────────────────────── + + +@skip_no_tls_cluster +class TestRedisPoolTlsCluster: + """Test RedisPool with TLS cluster backend — mirrors production config.""" + + @pytest.mark.asyncio + async def test_pool_tls_cluster_production_config(self, monkeypatch): + """RedisPool initializes with the exact production configuration. 
+ + Env vars set here match the user's Helm values: + REDIS_MODE: "cluster" + REDIS_HOST: + REDIS_PORT: "6380" + REDIS_PASSWORD: "" + REDIS_DB: "0" + REDIS_MAX_CONNECTIONS: "20" + REDIS_SOCKET_TIMEOUT: "5" + REDIS_SOCKET_CONNECT_TIMEOUT: "5" + REDIS_KEY_PREFIX: "kubecoderun:" + REDIS_TLS_ENABLED: "true" + REDIS_TLS_CA_CERT_FILE: + REDIS_TLS_CERT_FILE: "" + REDIS_TLS_KEY_FILE: "" + REDIS_TLS_INSECURE: "false" + """ + # Set env vars exactly as Helm renders them in production + monkeypatch.setenv("REDIS_MODE", "cluster") + monkeypatch.setenv("REDIS_HOST", TLS_CLUSTER_HOST) + monkeypatch.setenv("REDIS_PORT", str(TLS_CLUSTER_PORT)) + monkeypatch.setenv("REDIS_PASSWORD", "") # empty -> None via validator + monkeypatch.setenv("REDIS_DB", "0") + monkeypatch.setenv("REDIS_MAX_CONNECTIONS", "20") + monkeypatch.setenv("REDIS_SOCKET_TIMEOUT", "5") + monkeypatch.setenv("REDIS_SOCKET_CONNECT_TIMEOUT", "5") + monkeypatch.setenv("REDIS_KEY_PREFIX", "kubecoderun:") + monkeypatch.setenv("REDIS_TLS_ENABLED", "true") + monkeypatch.setenv("REDIS_TLS_CA_CERT_FILE", CA_CERT_FILE) + monkeypatch.setenv("REDIS_TLS_CERT_FILE", "") # no client cert + monkeypatch.setenv("REDIS_TLS_KEY_FILE", "") # no client key + monkeypatch.setenv("REDIS_TLS_INSECURE", "false") + monkeypatch.setenv("REDIS_CLUSTER_NODES", "") # empty -> None, fallback to host:port + + from src.config import Settings + + settings_obj = Settings() + cfg = settings_obj.redis + + # Verify validators worked correctly + assert cfg.mode == "cluster" + assert cfg.host == TLS_CLUSTER_HOST + assert cfg.port == TLS_CLUSTER_PORT + assert cfg.password is None, f"Expected None, got {cfg.password!r}" + assert cfg.cluster_nodes is None, f"Expected None, got {cfg.cluster_nodes!r}" + assert cfg.tls_enabled is True + assert cfg.tls_ca_cert_file == CA_CERT_FILE + assert cfg.tls_cert_file is None or cfg.tls_cert_file == "" + assert cfg.tls_key_file is None or cfg.tls_key_file == "" + assert cfg.tls_insecure is False + assert cfg.tls_check_hostname 
is False # default + assert cfg.key_prefix == "kubecoderun:" + + # Verify TLS kwargs + tls_kwargs = cfg.get_tls_kwargs() + assert tls_kwargs["ssl"] is True + assert tls_kwargs["ssl_cert_reqs"] == ssl_mod.CERT_REQUIRED + assert tls_kwargs["ssl_check_hostname"] is False + assert tls_kwargs["ssl_ca_certs"] == CA_CERT_FILE + assert "ssl_certfile" not in tls_kwargs # no client cert + assert "ssl_keyfile" not in tls_kwargs # no client key + + # Initialize pool + from src.core.pool import RedisPool + + pool = RedisPool() + monkeypatch.setattr("src.core.pool.settings", settings_obj) + pool._initialize() + + client = pool.get_client() + assert isinstance(client, AsyncRedisCluster) + assert pool.key_prefix == "kubecoderun:" + + # Test operations with prefix + full_key = pool.make_key("session:test-tls") + assert full_key == "kubecoderun:session:test-tls" + + await client.set(full_key, "tls-session-data") + val = await client.get(full_key) + assert val == "tls-session-data" + await client.delete(full_key) + + await pool.close() + + @pytest.mark.asyncio + async def test_pool_tls_cluster_without_key_prefix(self, monkeypatch): + """RedisPool works in TLS cluster mode without key prefix.""" + monkeypatch.setenv("REDIS_MODE", "cluster") + monkeypatch.setenv("REDIS_HOST", TLS_CLUSTER_HOST) + monkeypatch.setenv("REDIS_PORT", str(TLS_CLUSTER_PORT)) + monkeypatch.setenv("REDIS_PASSWORD", "") + monkeypatch.setenv("REDIS_KEY_PREFIX", "") + monkeypatch.setenv("REDIS_TLS_ENABLED", "true") + monkeypatch.setenv("REDIS_TLS_CA_CERT_FILE", CA_CERT_FILE) + monkeypatch.setenv("REDIS_TLS_INSECURE", "false") + monkeypatch.setenv("REDIS_CLUSTER_NODES", "") + + from src.config import Settings + from src.core.pool import RedisPool + + settings_obj = Settings() + pool = RedisPool() + monkeypatch.setattr("src.core.pool.settings", settings_obj) + pool._initialize() + + client = pool.get_client() + assert pool.key_prefix == "" + assert pool.make_key("mykey") == "mykey" + + await 
client.set("test:no-prefix:tls", "ok") + assert await client.get("test:no-prefix:tls") == "ok" + await client.delete("test:no-prefix:tls") + await pool.close() + + +# ── ConfigValidator with TLS Cluster ───────────────────────────────────── + + +@skip_no_tls_cluster +class TestConfigValidatorTlsCluster: + """Test ConfigValidator._validate_redis_connection with TLS cluster.""" + + def test_validator_tls_cluster_production_config(self, monkeypatch): + """Config validator passes with production-like TLS cluster config.""" + monkeypatch.setenv("REDIS_MODE", "cluster") + monkeypatch.setenv("REDIS_HOST", TLS_CLUSTER_HOST) + monkeypatch.setenv("REDIS_PORT", str(TLS_CLUSTER_PORT)) + monkeypatch.setenv("REDIS_PASSWORD", "") + monkeypatch.setenv("REDIS_TLS_ENABLED", "true") + monkeypatch.setenv("REDIS_TLS_CA_CERT_FILE", CA_CERT_FILE) + monkeypatch.setenv("REDIS_TLS_CERT_FILE", "") + monkeypatch.setenv("REDIS_TLS_KEY_FILE", "") + monkeypatch.setenv("REDIS_TLS_INSECURE", "false") + monkeypatch.setenv("REDIS_CLUSTER_NODES", "") + + from src.config import Settings + + settings_obj = Settings() + monkeypatch.setattr("src.utils.config_validator.settings", settings_obj) + + from src.utils.config_validator import ConfigValidator + + validator = ConfigValidator() + validator._validate_redis_connection() + + assert not validator.errors, f"Unexpected errors: {validator.errors}" + + def test_validator_tls_cluster_bad_ca_cert_fails(self, monkeypatch): + """Config validator fails when CA cert path is wrong.""" + monkeypatch.setenv("REDIS_MODE", "cluster") + monkeypatch.setenv("REDIS_HOST", TLS_CLUSTER_HOST) + monkeypatch.setenv("REDIS_PORT", str(TLS_CLUSTER_PORT)) + monkeypatch.setenv("REDIS_PASSWORD", "") + monkeypatch.setenv("REDIS_TLS_ENABLED", "true") + monkeypatch.setenv("REDIS_TLS_CA_CERT_FILE", "/nonexistent/ca.crt") + monkeypatch.setenv("REDIS_TLS_INSECURE", "false") + monkeypatch.setenv("REDIS_CLUSTER_NODES", "") + + from src.config import Settings + + settings_obj = Settings() + 
monkeypatch.setattr("src.utils.config_validator.settings", settings_obj) + + from src.utils.config_validator import ConfigValidator + + validator = ConfigValidator() + validator._validate_redis_connection() + + assert len(validator.errors) > 0, "Expected validation error for bad CA cert" + + +# ── RedisConfig TLS kwargs verification ────────────────────────────────── + + +@skip_no_tls_cluster +class TestRedisConfigTlsKwargs: + """Verify RedisConfig.get_tls_kwargs() produces correct kwargs for production.""" + + def test_production_tls_kwargs(self, monkeypatch): + """get_tls_kwargs() output matches what RedisCluster needs for TLS.""" + monkeypatch.setenv("REDIS_MODE", "cluster") + monkeypatch.setenv("REDIS_TLS_ENABLED", "true") + monkeypatch.setenv("REDIS_TLS_CA_CERT_FILE", CA_CERT_FILE) + monkeypatch.setenv("REDIS_TLS_CERT_FILE", "") + monkeypatch.setenv("REDIS_TLS_KEY_FILE", "") + monkeypatch.setenv("REDIS_TLS_INSECURE", "false") + + from src.config.redis import RedisConfig + + cfg = RedisConfig( + redis_mode="cluster", + redis_tls_enabled=True, + redis_tls_ca_cert_file=CA_CERT_FILE, + redis_tls_cert_file="", + redis_tls_key_file="", + redis_tls_insecure=False, + ) + kwargs = cfg.get_tls_kwargs() + + assert kwargs["ssl"] is True + assert kwargs["ssl_cert_reqs"] == ssl_mod.CERT_REQUIRED + assert kwargs["ssl_check_hostname"] is False + assert kwargs["ssl_ca_certs"] == CA_CERT_FILE + # Empty string cert/key files should NOT be in kwargs + assert "ssl_certfile" not in kwargs + assert "ssl_keyfile" not in kwargs + + def test_tls_insecure_kwargs(self, monkeypatch): + """get_tls_kwargs() with insecure mode skips cert verification.""" + from src.config.redis import RedisConfig + + cfg = RedisConfig( + redis_mode="cluster", + redis_tls_enabled=True, + redis_tls_insecure=True, + ) + kwargs = cfg.get_tls_kwargs() + + assert kwargs["ssl"] is True + assert kwargs["ssl_cert_reqs"] == ssl_mod.CERT_NONE + assert kwargs["ssl_check_hostname"] is False + + def 
test_tls_disabled_returns_empty(self): + """get_tls_kwargs() returns empty dict when TLS is off.""" + from src.config.redis import RedisConfig + + cfg = RedisConfig(redis_tls_enabled=False) + assert cfg.get_tls_kwargs() == {} diff --git a/tests/tls-certs/.gitignore b/tests/tls-certs/.gitignore new file mode 100644 index 0000000..3ba3676 --- /dev/null +++ b/tests/tls-certs/.gitignore @@ -0,0 +1,6 @@ +# Generated TLS certificates — do not commit +*.key +*.crt +*.csr +*.srl +*.cnf diff --git a/tests/tls-certs/cleanup.sh b/tests/tls-certs/cleanup.sh new file mode 100644 index 0000000..340cc1d --- /dev/null +++ b/tests/tls-certs/cleanup.sh @@ -0,0 +1,12 @@ +#!/usr/bin/env bash +# Remove generated TLS certificates from tests/tls-certs/. +# +# Usage: +# cd tests/tls-certs && ./cleanup.sh +set -euo pipefail +cd "$(dirname "$0")" + +rm -f ca.key ca.crt ca.srl ca-ext.cnf +rm -f redis.key redis.crt redis.csr redis-ext.cnf + +echo "TLS certificates cleaned up." diff --git a/tests/tls-certs/generate.sh b/tests/tls-certs/generate.sh new file mode 100644 index 0000000..2c62473 --- /dev/null +++ b/tests/tls-certs/generate.sh @@ -0,0 +1,94 @@ +#!/usr/bin/env bash +# Generate self-signed TLS certificates for Redis Cluster integration testing. +# +# Creates: +# ca.key / ca.crt — Certificate Authority (with keyUsage extensions for Python 3.14+) +# redis.key / redis.crt — Server cert signed by the CA (SANs for localhost + docker IPs) +# +# Usage: +# cd tests/tls-certs && ./generate.sh +set -euo pipefail +cd "$(dirname "$0")" + +echo "Generating CA key + certificate..." 
+cat > ca-ext.cnf << 'EOF' +[req] +default_bits = 4096 +prompt = no +distinguished_name = dn +x509_extensions = v3_ca + +[dn] +C = PT +ST = Lisboa +L = Lisboa +O = NOS Testing +CN = Redis Test CA + +[v3_ca] +subjectKeyIdentifier = hash +authorityKeyIdentifier = keyid:always,issuer +basicConstraints = critical, CA:TRUE +keyUsage = critical, keyCertSign, cRLSign +EOF + +openssl genrsa -out ca.key 4096 2>/dev/null +openssl req -x509 -new -nodes -key ca.key -sha256 -days 3650 \ + -out ca.crt -config ca-ext.cnf 2>/dev/null + +echo "Generating server key + certificate..." +cat > redis-ext.cnf << 'EOF' +[req] +default_bits = 2048 +prompt = no +distinguished_name = dn +req_extensions = v3_req + +[dn] +C = PT +ST = Lisboa +L = Lisboa +O = NOS Testing +CN = redis-node + +[v3_req] +subjectAltName = @alt_names +basicConstraints = CA:FALSE +keyUsage = digitalSignature, keyEncipherment +extendedKeyUsage = serverAuth, clientAuth + +[alt_names] +DNS.1 = redis-tls-node-0 +DNS.2 = redis-tls-node-1 +DNS.3 = redis-tls-node-2 +DNS.4 = redis-tls-node-3 +DNS.5 = redis-tls-node-4 +DNS.6 = redis-tls-node-5 +DNS.7 = localhost +IP.1 = 127.0.0.1 +IP.2 = 172.17.0.1 +IP.3 = 172.18.0.1 +IP.4 = 172.19.0.1 +IP.5 = 172.20.0.1 +IP.6 = 172.21.0.1 +IP.7 = 172.22.0.1 +IP.8 = 172.23.0.1 +IP.9 = 172.24.0.1 +IP.10 = 172.25.0.1 +EOF + +openssl genrsa -out redis.key 2048 2>/dev/null +openssl req -new -key redis.key -out redis.csr -config redis-ext.cnf 2>/dev/null +openssl x509 -req -in redis.csr -CA ca.crt -CAkey ca.key -CAcreateserial \ + -out redis.crt -days 3650 -sha256 \ + -extfile redis-ext.cnf -extensions v3_req 2>/dev/null + +# Redis needs world-readable key files (containers run as redis user) +chmod 644 redis.key +# CA private key should stay restricted — it is not needed by Redis containers +chmod 600 ca.key + +echo "Verifying certificate chain..." +openssl verify -CAfile ca.crt redis.crt + +echo "Done. 
Certificates generated in $(pwd)/" diff --git a/tests/unit/test_cluster_pipeline_compat.py b/tests/unit/test_cluster_pipeline_compat.py new file mode 100644 index 0000000..b308e8f --- /dev/null +++ b/tests/unit/test_cluster_pipeline_compat.py @@ -0,0 +1,244 @@ +"""Unit tests verifying that all Redis pipelines use transaction=False. + +Redis Cluster does not support MULTI/EXEC transactions across keys in +different hash slots. Every pipeline that touches keys with different +prefixes (e.g. session data + session index) MUST use transaction=False +so redis-py's ClusterPipeline can split commands by node. + +These tests act as a safety net: if someone accidentally changes a +pipeline back to transaction=True, the test will catch it. +""" + +from unittest.mock import AsyncMock, MagicMock, patch + +import pytest + +from src.models.session import SessionCreate +from src.services.api_key_manager import ApiKeyManagerService +from src.services.session import SessionService +from src.services.state import StateService + +# ── Session Service ───────────────────────────────────────────────────── + + +@pytest.fixture +def mock_redis_session(): + """Mock Redis client for session tests.""" + redis_mock = AsyncMock() + + pipeline_mock = AsyncMock() + pipeline_mock.hset = MagicMock() + pipeline_mock.expire = MagicMock() + pipeline_mock.sadd = MagicMock() + pipeline_mock.delete = MagicMock() + pipeline_mock.srem = MagicMock() + pipeline_mock.execute = AsyncMock(return_value=[True, True, True]) + pipeline_mock.reset = AsyncMock() + + redis_mock.pipeline = MagicMock(return_value=pipeline_mock) + redis_mock.hgetall = AsyncMock(return_value={}) + return redis_mock + + +@pytest.fixture +def session_service(mock_redis_session): + return SessionService(redis_client=mock_redis_session) + + +@pytest.mark.asyncio +async def test_session_create_uses_non_transactional_pipeline(session_service, mock_redis_session): + """create_session() must use transaction=False for cluster compat.""" + 
request = SessionCreate(metadata={"test": "value"}) + await session_service.create_session(request) + + mock_redis_session.pipeline.assert_called_once_with(transaction=False) + + +@pytest.mark.asyncio +async def test_session_delete_uses_non_transactional_pipeline(session_service, mock_redis_session): + """delete_session() must use transaction=False for cluster compat.""" + session_id = "session-to-delete" + # Provide minimal session data so delete_session finds the session + mock_redis_session.hgetall.return_value = { + "session_id": session_id, + "status": "active", + "created_at": "2025-01-01T00:00:00", + "last_activity": "2025-01-01T00:00:00", + "expires_at": "2026-01-01T00:00:00", + "files": "{}", + "metadata": "{}", + "working_directory": "/workspace", + } + + pipeline_mock = mock_redis_session.pipeline.return_value + pipeline_mock.execute = AsyncMock(return_value=[1, 1]) + + await session_service.delete_session(session_id) + + mock_redis_session.pipeline.assert_called_with(transaction=False) + + +# ── API Key Manager ───────────────────────────────────────────────────── + + +@pytest.fixture +def mock_redis_apikey(): + """Mock Redis client for API key manager tests.""" + redis_mock = AsyncMock() + redis_mock.hgetall = AsyncMock(return_value={}) + redis_mock.hset = AsyncMock(return_value=1) + redis_mock.exists = AsyncMock(return_value=True) + redis_mock.delete = AsyncMock(return_value=1) + redis_mock.sadd = AsyncMock(return_value=1) + redis_mock.srem = AsyncMock(return_value=1) + redis_mock.smembers = AsyncMock(return_value=set()) + redis_mock.get = AsyncMock(return_value=None) + redis_mock.setex = AsyncMock(return_value=True) + redis_mock.incr = AsyncMock(return_value=1) + redis_mock.expire = AsyncMock(return_value=True) + redis_mock.hincrby = AsyncMock(return_value=1) + + pipeline_mock = AsyncMock() + pipeline_mock.hset = MagicMock() + pipeline_mock.sadd = MagicMock() + pipeline_mock.delete = MagicMock() + pipeline_mock.srem = MagicMock() + pipeline_mock.incr 
= MagicMock() + pipeline_mock.expire = MagicMock() + pipeline_mock.hincrby = MagicMock() + pipeline_mock.execute = AsyncMock(return_value=[True, True, True]) + redis_mock.pipeline = MagicMock(return_value=pipeline_mock) + + return redis_mock + + +@pytest.fixture +def api_key_manager(mock_redis_apikey): + return ApiKeyManagerService(redis_client=mock_redis_apikey) + + +@pytest.mark.asyncio +async def test_create_key_uses_non_transactional_pipeline(api_key_manager, mock_redis_apikey): + """create_key() must use transaction=False for cluster compat.""" + result = await api_key_manager.create_key( + name="test-key", + ) + + # create_key calls pipeline at least once + mock_redis_apikey.pipeline.assert_called() + for call in mock_redis_apikey.pipeline.call_args_list: + assert call == ((), {"transaction": False}), f"Expected pipeline(transaction=False), got {call}" + + +@pytest.mark.asyncio +async def test_ensure_single_env_key_uses_non_transactional_pipeline(api_key_manager, mock_redis_apikey): + """_ensure_single_env_key_record() must use transaction=False.""" + # Call the internal method directly + await api_key_manager._ensure_single_env_key_record("test-hash", "test-env") + + mock_redis_apikey.pipeline.assert_called() + for call in mock_redis_apikey.pipeline.call_args_list: + assert call == ((), {"transaction": False}), f"Expected pipeline(transaction=False), got {call}" + + +@pytest.mark.asyncio +async def test_revoke_key_uses_non_transactional_pipeline(api_key_manager, mock_redis_apikey): + """revoke_key() must use transaction=False for cluster compat.""" + # Setup: make the key "exist" so revoke proceeds + mock_redis_apikey.hgetall.return_value = { + "name": "test-key", + "key_hash": "abc123", + "environment": "test", + "status": "active", + "created_at": "2025-01-01T00:00:00+00:00", + } + mock_redis_apikey.exists.return_value = True + + await api_key_manager.revoke_key("abc123") + + mock_redis_apikey.pipeline.assert_called() + for call in 
mock_redis_apikey.pipeline.call_args_list: + assert call == ((), {"transaction": False}), f"Expected pipeline(transaction=False), got {call}" + + +# ── State Service ─────────────────────────────────────────────────────── + + +@pytest.fixture +def mock_redis_state(): + """Mock Redis client for state service tests.""" + client = AsyncMock() + client.get = AsyncMock(return_value=None) + client.setex = AsyncMock() + client.delete = AsyncMock() + client.strlen = AsyncMock(return_value=0) + client.ttl = AsyncMock(return_value=-1) + client.expire = AsyncMock() + + pipeline_mock = AsyncMock() + pipeline_mock.set = MagicMock() + pipeline_mock.setex = MagicMock() + pipeline_mock.expire = MagicMock() + pipeline_mock.execute = AsyncMock(return_value=[True, True, True, True, True]) + client.pipeline = MagicMock(return_value=pipeline_mock) + + return client + + +@pytest.fixture +def state_service(mock_redis_state): + with patch("src.services.state.redis_pool") as mock_pool: + mock_pool.get_client.return_value = mock_redis_state + service = StateService(redis_client=mock_redis_state) + return service + + +@pytest.mark.asyncio +async def test_save_state_uses_non_transactional_pipeline(state_service, mock_redis_state): + """save_state() must use transaction=False for cluster compat.""" + import base64 + + session_id = "state-test-session" + raw_bytes = b"\x02test state data" + state_b64 = base64.b64encode(raw_bytes).decode("utf-8") + + await state_service.save_state(session_id, state_b64) + + mock_redis_state.pipeline.assert_called() + for call in mock_redis_state.pipeline.call_args_list: + assert call == ((), {"transaction": False}), f"Expected pipeline(transaction=False), got {call}" + + +# ── Version resolution ────────────────────────────────────────────────── + + +class TestVersionResolution: + """Tests for SERVICE_VERSION env var override.""" + + def test_logging_uses_service_version_when_set(self): + """add_service_context should prefer settings.service_version.""" + with ( 
+ patch("src.utils.logging.settings") as mock_settings, + patch("src.utils.logging.__version__", "0.0.0.dev0"), + ): + mock_settings.service_version = "2.1.4" + from src.utils.logging import add_service_context + + event_dict = {} + add_service_context(None, None, event_dict) + + assert event_dict["version"] == "2.1.4" + + def test_logging_falls_back_to_build_version(self): + """add_service_context should fall back to __version__ when SERVICE_VERSION unset.""" + with ( + patch("src.utils.logging.settings") as mock_settings, + patch("src.utils.logging.__version__", "1.2.3"), + ): + mock_settings.service_version = None + from src.utils.logging import add_service_context + + event_dict = {} + add_service_context(None, None, event_dict) + + assert event_dict["version"] == "1.2.3" diff --git a/tests/unit/test_core_pool.py b/tests/unit/test_core_pool.py index 9d486a8..8d46d60 100644 --- a/tests/unit/test_core_pool.py +++ b/tests/unit/test_core_pool.py @@ -51,25 +51,60 @@ def test_initialize_creates_pool(self): assert pool._initialized is True assert pool._client is not None - def test_initialize_fallback_on_error(self): - """Test _initialize creates fallback client on error.""" + def test_initialize_raises_on_error(self): + """Test _initialize propagates errors instead of silently falling back.""" pool = RedisPool() with patch("src.core.pool.settings") as mock_settings: - mock_settings.get_redis_url.side_effect = Exception("Connection failed") - - with patch("src.core.pool.redis.from_url") as mock_from_url: - mock_from_url.return_value = MagicMock() - + mock_settings.redis.mode = "standalone" + mock_settings.redis.get_url.side_effect = Exception("Connection failed") + mock_settings.redis.get_tls_kwargs.return_value = {} + mock_settings.redis.key_prefix = "" + mock_settings.redis.max_connections = 20 + mock_settings.redis.socket_timeout = 5 + mock_settings.redis.socket_connect_timeout = 5 + + with pytest.raises(Exception, match="Connection failed"): pool._initialize() - 
assert pool._initialized is True - assert pool._client is not None + assert pool._initialized is False + assert pool._client is None class TestGetClient: """Tests for get_client method.""" + def test_init_cluster_does_not_pass_retry_on_timeout(self): + """Test _init_cluster uses retry/retry_on_error instead of retry_on_timeout. + + RedisCluster (async) does not accept retry_on_timeout as a kwarg. + """ + pool = RedisPool() + + with patch("src.core.pool.settings") as mock_settings: + cfg = mock_settings.redis + cfg.mode = "cluster" + cfg.host = "redis-host" + cfg.port = 6379 + cfg.password = None + cfg.cluster_nodes = None + cfg.key_prefix = "" + cfg.tls_enabled = False + cfg.max_connections = 20 + cfg.socket_timeout = 5 + cfg.socket_connect_timeout = 5 + cfg.get_tls_kwargs.return_value = {} + + with patch("src.core.pool.RedisCluster") as mock_cluster: + mock_cluster.return_value = MagicMock() + pool._initialize() + + mock_cluster.assert_called_once() + call_kwargs = mock_cluster.call_args[1] + assert "retry_on_timeout" not in call_kwargs, "RedisCluster does not accept retry_on_timeout" + assert "retry" in call_kwargs + assert "retry_on_error" in call_kwargs + def test_get_client_initializes_if_needed(self): """Test get_client initializes the pool if not initialized.""" pool = RedisPool() @@ -106,7 +141,7 @@ def test_pool_stats_not_initialized(self): stats = pool.pool_stats - assert stats == {"initialized": False} + assert stats == {"initialized": False, "mode": "standalone"} def test_pool_stats_initialized(self): """Test pool_stats when pool is initialized.""" @@ -114,10 +149,12 @@ def test_pool_stats_initialized(self): mock_pool = MagicMock() mock_pool.max_connections = 20 pool._pool = mock_pool + pool._initialized = True stats = pool.pool_stats assert stats["initialized"] is True + assert stats["mode"] == "standalone" assert stats["max_connections"] == 20 diff --git a/tests/unit/test_job_executor.py b/tests/unit/test_job_executor.py index 1036c66..d07bc0a 100644 
--- a/tests/unit/test_job_executor.py +++ b/tests/unit/test_job_executor.py @@ -62,7 +62,7 @@ def test_init_with_defaults(self): assert executor.namespace == "default" assert executor.ttl_seconds_after_finished == 60 assert executor.active_deadline_seconds == 300 - assert executor.sidecar_image == "aronmuon/kubecoderun-sidecar:latest" + assert executor.sidecar_image == "aronmuon/kubecoderun-sidecar-agent:latest" def test_init_with_custom_values(self): """Test initialization with custom values.""" @@ -499,3 +499,90 @@ async def test_execute_with_job_cleanup_on_error(self, job_executor, pod_spec, j # Give asyncio.create_task time to schedule await asyncio.sleep(0.1) + + +class TestCreateJobPassesAllPodSpecFields: + """Tests that create_job passes all PodSpec fields to create_job_manifest.""" + + @pytest.mark.asyncio + async def test_create_job_passes_image_pull_secrets(self, job_executor): + """Test that image_pull_secrets are forwarded to create_job_manifest.""" + spec = PodSpec( + image="python:3.11", + language="python", + namespace="test-namespace", + image_pull_secrets=["my-registry-secret", "other-secret"], + ) + mock_batch_api = MagicMock() + mock_job = MagicMock() + mock_job.metadata.uid = "job-uid-123" + mock_batch_api.create_namespaced_job.return_value = mock_job + + with patch("src.services.kubernetes.job_executor.get_batch_api", return_value=mock_batch_api): + with patch("src.services.kubernetes.job_executor.create_job_manifest", return_value={}) as mock_manifest: + await job_executor.create_job(spec, "session-123") + + _, kwargs = mock_manifest.call_args + assert kwargs["image_pull_secrets"] == ["my-registry-secret", "other-secret"] + + @pytest.mark.asyncio + async def test_create_job_passes_image_pull_policy(self, job_executor): + """Test that image_pull_policy is forwarded to create_job_manifest.""" + spec = PodSpec( + image="python:3.11", + language="python", + namespace="test-namespace", + image_pull_policy="IfNotPresent", + ) + mock_batch_api = 
MagicMock() + mock_job = MagicMock() + mock_job.metadata.uid = "job-uid-123" + mock_batch_api.create_namespaced_job.return_value = mock_job + + with patch("src.services.kubernetes.job_executor.get_batch_api", return_value=mock_batch_api): + with patch("src.services.kubernetes.job_executor.create_job_manifest", return_value={}) as mock_manifest: + await job_executor.create_job(spec, "session-123") + + _, kwargs = mock_manifest.call_args + assert kwargs["image_pull_policy"] == "IfNotPresent" + + @pytest.mark.asyncio + async def test_create_job_passes_execution_mode(self, job_executor): + """Test that execution_mode is forwarded to create_job_manifest.""" + spec = PodSpec( + image="python:3.11", + language="python", + namespace="test-namespace", + execution_mode="nsenter", + ) + mock_batch_api = MagicMock() + mock_job = MagicMock() + mock_job.metadata.uid = "job-uid-123" + mock_batch_api.create_namespaced_job.return_value = mock_job + + with patch("src.services.kubernetes.job_executor.get_batch_api", return_value=mock_batch_api): + with patch("src.services.kubernetes.job_executor.create_job_manifest", return_value={}) as mock_manifest: + await job_executor.create_job(spec, "session-123") + + _, kwargs = mock_manifest.call_args + assert kwargs["execution_mode"] == "nsenter" + + @pytest.mark.asyncio + async def test_create_job_no_image_pull_secrets_by_default(self, job_executor): + """Test that image_pull_secrets defaults to None.""" + spec = PodSpec( + image="python:3.11", + language="python", + namespace="test-namespace", + ) + mock_batch_api = MagicMock() + mock_job = MagicMock() + mock_job.metadata.uid = "job-uid-123" + mock_batch_api.create_namespaced_job.return_value = mock_job + + with patch("src.services.kubernetes.job_executor.get_batch_api", return_value=mock_batch_api): + with patch("src.services.kubernetes.job_executor.create_job_manifest", return_value={}) as mock_manifest: + await job_executor.create_job(spec, "session-123") + + _, kwargs = 
mock_manifest.call_args + assert kwargs["image_pull_secrets"] is None diff --git a/tests/unit/test_kubernetes_client.py b/tests/unit/test_kubernetes_client.py index 35ef7b3..d0e145d 100644 --- a/tests/unit/test_kubernetes_client.py +++ b/tests/unit/test_kubernetes_client.py @@ -417,6 +417,102 @@ def test_create_pod_manifest_security_context(self): assert main_container.security_context.run_as_user == 1001 assert main_container.security_context.run_as_non_root is True + def test_create_pod_manifest_agent_mode_default(self): + """Test that agent mode is the default execution mode.""" + pod = client.create_pod_manifest( + name="test-pod", + namespace="test-ns", + main_image="python:3.12", + sidecar_image="sidecar:latest", + language="python", + labels={"app": "test"}, + ) + + # Agent mode: no shareProcessNamespace + assert pod.spec.share_process_namespace is False + + # Agent mode: init container copies executor agent to shared volume + assert pod.spec.init_containers is not None + assert len(pod.spec.init_containers) == 1 + init_container = pod.spec.init_containers[0] + assert init_container.name == "agent-init" + assert init_container.command[0] == "python" + assert "/opt/executor-agent" in init_container.command[2] + assert "/mnt/data/.executor-agent" in init_container.command[2] + + # Agent mode: main container runs executor agent with port + main_container = next(c for c in pod.spec.containers if c.name == "main") + assert main_container.args == ["/mnt/data/.executor-agent", "--port", "9090"] + + # Agent mode: sidecar has EXECUTION_MODE and EXECUTOR_PORT env vars + sidecar = next(c for c in pod.spec.containers if c.name == "sidecar") + env_dict = {e.name: e.value for e in sidecar.env} + assert env_dict["EXECUTION_MODE"] == "agent" + assert env_dict["EXECUTOR_PORT"] == "9090" + + # Agent mode: no capabilities, no privilege escalation for sidecar + assert sidecar.security_context.allow_privilege_escalation is False + assert 
sidecar.security_context.capabilities.drop == ["ALL"] + assert sidecar.security_context.capabilities.add is None + + # Agent mode: no capabilities, no privilege escalation for main + assert main_container.security_context.allow_privilege_escalation is False + assert main_container.security_context.capabilities.drop == ["ALL"] + + # Agent mode: init container also has minimal security + assert init_container.security_context.allow_privilege_escalation is False + assert init_container.security_context.capabilities.drop == ["ALL"] + + def test_create_pod_manifest_nsenter_mode(self): + """Test nsenter mode has the required capabilities and settings.""" + pod = client.create_pod_manifest( + name="test-pod", + namespace="test-ns", + main_image="python:3.12", + sidecar_image="sidecar:latest", + language="python", + labels={"app": "test"}, + execution_mode="nsenter", + ) + + # nsenter mode: shareProcessNamespace required + assert pod.spec.share_process_namespace is True + + # nsenter mode: no init containers + assert pod.spec.init_containers is None + + # nsenter mode: main container uses default CMD (no args override) + main_container = next(c for c in pod.spec.containers if c.name == "main") + assert main_container.args is None + + # nsenter mode: sidecar has elevated privileges + sidecar = next(c for c in pod.spec.containers if c.name == "sidecar") + assert sidecar.security_context.allow_privilege_escalation is True + assert set(sidecar.security_context.capabilities.add) == {"SYS_PTRACE", "SYS_ADMIN", "SYS_CHROOT"} + + # nsenter mode: EXECUTION_MODE is set to nsenter + env_dict = {e.name: e.value for e in sidecar.env} + assert env_dict["EXECUTION_MODE"] == "nsenter" + # nsenter mode: EXECUTOR_PORT is still present (used by both modes) + assert env_dict["EXECUTOR_PORT"] == "9090" + + def test_create_pod_manifest_agent_mode_executor_port(self): + """Test that agent mode uses the configured executor port.""" + pod = client.create_pod_manifest( + name="test-pod", + 
namespace="test-ns", + main_image="python:3.12", + sidecar_image="sidecar:latest", + language="python", + labels={"app": "test"}, + execution_mode="agent", + executor_port=8888, + ) + + sidecar = next(c for c in pod.spec.containers if c.name == "sidecar") + env_dict = {e.name: e.value for e in sidecar.env} + assert env_dict["EXECUTOR_PORT"] == "8888" + def test_create_pod_manifest_seccomp_profile_default(self): """Test pod manifest has RuntimeDefault seccomp profile by default.""" pod = client.create_pod_manifest( @@ -509,3 +605,246 @@ def test_create_pod_manifest_network_isolated_default(self): env_dict = {e.name: e.value for e in sidecar.env} assert "NETWORK_ISOLATED" in env_dict assert env_dict["NETWORK_ISOLATED"] == "false" + + def test_create_pod_manifest_gke_sandbox_enabled(self): + """Test GKE Sandbox adds runtime class, node selector, tolerations, and annotation.""" + pod = client.create_pod_manifest( + name="test-pod", + namespace="test-ns", + main_image="python:3.12", + sidecar_image="sidecar:latest", + language="python", + labels={"app": "test"}, + gke_sandbox_enabled=True, + ) + + # Runtime class + assert pod.spec.runtime_class_name == "gvisor" + + # Node selector + assert pod.spec.node_selector is not None + assert pod.spec.node_selector["sandbox.gke.io/runtime"] == "gvisor" + + # Tolerations + assert pod.spec.tolerations is not None + assert len(pod.spec.tolerations) == 1 + tol = pod.spec.tolerations[0] + assert tol.key == "sandbox.gke.io/runtime" + assert tol.operator == "Equal" + assert tol.value == "gvisor" + assert tol.effect == "NoSchedule" + + # Annotation + assert pod.metadata.annotations["sandbox.gke.io/runtime"] == "gvisor" + + def test_create_pod_manifest_gke_sandbox_disabled(self): + """Test GKE Sandbox disabled has no runtime class, node selector, or tolerations.""" + pod = client.create_pod_manifest( + name="test-pod", + namespace="test-ns", + main_image="python:3.12", + sidecar_image="sidecar:latest", + language="python", + 
labels={"app": "test"}, + gke_sandbox_enabled=False, + ) + + assert pod.spec.runtime_class_name is None + assert pod.spec.node_selector is None + assert pod.spec.tolerations is None + + def test_create_pod_manifest_gke_sandbox_custom_runtime_class(self): + """Test GKE Sandbox with custom runtime class name.""" + pod = client.create_pod_manifest( + name="test-pod", + namespace="test-ns", + main_image="python:3.12", + sidecar_image="sidecar:latest", + language="python", + labels={"app": "test"}, + gke_sandbox_enabled=True, + runtime_class_name="custom-runtime", + ) + + assert pod.spec.runtime_class_name == "custom-runtime" + + def test_create_pod_manifest_gke_sandbox_custom_node_selector(self): + """Test GKE Sandbox with additional custom node selector.""" + pod = client.create_pod_manifest( + name="test-pod", + namespace="test-ns", + main_image="python:3.12", + sidecar_image="sidecar:latest", + language="python", + labels={"app": "test"}, + gke_sandbox_enabled=True, + sandbox_node_selector={"pool": "sandbox"}, + ) + + assert pod.spec.node_selector["sandbox.gke.io/runtime"] == "gvisor" + assert pod.spec.node_selector["pool"] == "sandbox" + + def test_create_pod_manifest_gke_sandbox_custom_tolerations(self): + """Test GKE Sandbox with additional custom tolerations.""" + pod = client.create_pod_manifest( + name="test-pod", + namespace="test-ns", + main_image="python:3.12", + sidecar_image="sidecar:latest", + language="python", + labels={"app": "test"}, + gke_sandbox_enabled=True, + custom_tolerations=[{"key": "pool", "value": "sandbox"}], + ) + + # Should have both the GKE default + custom toleration + assert len(pod.spec.tolerations) == 2 + keys = [t.key for t in pod.spec.tolerations] + assert "sandbox.gke.io/runtime" in keys + assert "pool" in keys + + def test_create_pod_manifest_image_pull_secrets(self): + """Test pod manifest with image pull secrets.""" + pod = client.create_pod_manifest( + name="test-pod", + namespace="test-ns", + main_image="python:3.12", + 
sidecar_image="sidecar:latest", + language="python", + labels={"app": "test"}, + image_pull_secrets=["my-registry-secret", "other-secret"], + ) + + assert pod.spec.image_pull_secrets is not None + assert len(pod.spec.image_pull_secrets) == 2 + secret_names = [s.name for s in pod.spec.image_pull_secrets] + assert "my-registry-secret" in secret_names + assert "other-secret" in secret_names + + def test_create_pod_manifest_no_image_pull_secrets(self): + """Test pod manifest without image pull secrets.""" + pod = client.create_pod_manifest( + name="test-pod", + namespace="test-ns", + main_image="python:3.12", + sidecar_image="sidecar:latest", + language="python", + labels={"app": "test"}, + ) + + assert pod.spec.image_pull_secrets is None + + def test_create_pod_manifest_gke_sandbox_with_annotations(self): + """Test GKE Sandbox merges with existing annotations.""" + pod = client.create_pod_manifest( + name="test-pod", + namespace="test-ns", + main_image="python:3.12", + sidecar_image="sidecar:latest", + language="python", + labels={"app": "test"}, + annotations={"custom": "value"}, + gke_sandbox_enabled=True, + ) + + assert pod.metadata.annotations["custom"] == "value" + assert pod.metadata.annotations["sandbox.gke.io/runtime"] == "gvisor" + + def test_create_pod_manifest_gke_sandbox_requires_agent_mode(self): + """Test that GKE Sandbox works with agent mode (default).""" + pod = client.create_pod_manifest( + name="test-pod", + namespace="test-ns", + main_image="python:3.12", + sidecar_image="sidecar:latest", + language="python", + labels={"app": "test"}, + gke_sandbox_enabled=True, + execution_mode="agent", + ) + + # Should have both GKE Sandbox and agent mode features + assert pod.spec.runtime_class_name == "gvisor" + assert pod.spec.share_process_namespace is False + assert pod.spec.init_containers is not None + + # Sidecar should have minimal privileges (agent mode) + sidecar = next(c for c in pod.spec.containers if c.name == "sidecar") + assert 
sidecar.security_context.allow_privilege_escalation is False + + def test_create_pod_manifest_gke_sandbox_warns_on_nsenter_mode(self): + """Test that GKE Sandbox with nsenter mode logs a warning.""" + with patch("src.services.kubernetes.client.logger") as mock_logger: + pod = client.create_pod_manifest( + name="test-pod", + namespace="test-ns", + main_image="python:3.12", + sidecar_image="sidecar:latest", + language="python", + labels={"app": "test"}, + gke_sandbox_enabled=True, + execution_mode="nsenter", + ) + + # Should still create the pod (warning, not error) + assert pod.spec.runtime_class_name == "gvisor" + + # nsenter mode features should be present + assert pod.spec.share_process_namespace is True + assert pod.spec.init_containers is None + + # Sidecar should have elevated privileges (nsenter mode) + sidecar = next(c for c in pod.spec.containers if c.name == "sidecar") + assert sidecar.security_context.allow_privilege_escalation is True + + # Should have logged a warning about incompatibility + mock_logger.warning.assert_called_once() + warning_msg = mock_logger.warning.call_args[0][0] + assert "gVisor" in warning_msg or "GKE Sandbox" in warning_msg + + def test_create_pod_manifest_annotations_not_mutated(self): + """Test that the caller's annotations dict is not mutated by GKE Sandbox.""" + original_annotations = {"custom": "value"} + annotations_copy = dict(original_annotations) + + client.create_pod_manifest( + name="test-pod", + namespace="test-ns", + main_image="python:3.12", + sidecar_image="sidecar:latest", + language="python", + labels={"app": "test"}, + annotations=original_annotations, + gke_sandbox_enabled=True, + ) + + # Original dict must be unchanged + assert original_annotations == annotations_copy + + def test_create_pod_manifest_custom_tolerations_missing_key_skipped(self): + """Test that custom tolerations without a 'key' field are skipped with a warning.""" + with patch("src.services.kubernetes.client.logger") as mock_logger: + pod = 
client.create_pod_manifest( + name="test-pod", + namespace="test-ns", + main_image="python:3.12", + sidecar_image="sidecar:latest", + language="python", + labels={"app": "test"}, + gke_sandbox_enabled=True, + custom_tolerations=[ + {"key": "pool", "value": "sandbox"}, + {"operator": "Exists", "effect": "NoSchedule"}, # missing key + {"key": "other", "value": "val"}, + ], + ) + + # The GKE default + 2 valid custom (the one missing 'key' is skipped) + assert len(pod.spec.tolerations) == 3 + keys = [t.key for t in pod.spec.tolerations] + assert "pool" in keys + assert "other" in keys + assert "sandbox.gke.io/runtime" in keys + + # Should warn about the skipped toleration + mock_logger.warning.assert_called_once() diff --git a/tests/unit/test_session_service.py b/tests/unit/test_session_service.py index 579e571..f4e205a 100644 --- a/tests/unit/test_session_service.py +++ b/tests/unit/test_session_service.py @@ -26,8 +26,8 @@ def mock_redis(): pipeline_mock.execute = AsyncMock(return_value=[True, True, True]) pipeline_mock.reset = AsyncMock() - # Make pipeline() return the pipeline mock when awaited - redis_mock.pipeline = AsyncMock(return_value=pipeline_mock) + # Make pipeline() return the pipeline mock (synchronous, like redis.asyncio) + redis_mock.pipeline = MagicMock(return_value=pipeline_mock) return redis_mock diff --git a/tests/unit/test_settings_validators.py b/tests/unit/test_settings_validators.py index 01744c4..496eeda 100644 --- a/tests/unit/test_settings_validators.py +++ b/tests/unit/test_settings_validators.py @@ -3,6 +3,8 @@ Tests that our Settings class validates configuration values correctly. 
""" +import logging + import pytest from pydantic import ValidationError @@ -42,3 +44,125 @@ def test_default_is_runtime_default(self): """Test that the default seccomp profile type is RuntimeDefault.""" settings = Settings() assert settings.k8s_seccomp_profile_type == "RuntimeDefault" + + +class TestKubernetesPropertyJsonParsing: + """Tests for kubernetes property JSON parsing of GKE fields.""" + + def test_valid_node_selector_json(self): + """Test valid JSON for GKE_SANDBOX_NODE_SELECTOR is parsed.""" + settings = Settings(gke_sandbox_node_selector='{"pool": "sandbox"}') + k8s = settings.kubernetes + assert k8s.sandbox_node_selector == {"pool": "sandbox"} + + def test_invalid_node_selector_json_logs_warning(self, caplog): + """Test invalid JSON for GKE_SANDBOX_NODE_SELECTOR logs a warning.""" + settings = Settings(gke_sandbox_node_selector="not-valid-json") + with caplog.at_level(logging.WARNING): + k8s = settings.kubernetes + assert k8s.sandbox_node_selector is None + assert "GKE_SANDBOX_NODE_SELECTOR" in caplog.text + + def test_valid_custom_tolerations_json(self): + """Test valid JSON for GKE_SANDBOX_CUSTOM_TOLERATIONS is parsed.""" + settings = Settings(gke_sandbox_custom_tolerations='[{"key": "pool", "value": "sandbox"}]') + k8s = settings.kubernetes + assert k8s.custom_tolerations == [{"key": "pool", "value": "sandbox"}] + + def test_invalid_custom_tolerations_json_logs_warning(self, caplog): + """Test invalid JSON for GKE_SANDBOX_CUSTOM_TOLERATIONS logs a warning.""" + settings = Settings(gke_sandbox_custom_tolerations="[broken") + with caplog.at_level(logging.WARNING): + k8s = settings.kubernetes + assert k8s.custom_tolerations is None + assert "GKE_SANDBOX_CUSTOM_TOLERATIONS" in caplog.text + + def test_image_pull_policy_default_is_always(self): + """Test that the default image_pull_policy is 'Always' (matches Settings).""" + settings = Settings() + k8s = settings.kubernetes + assert k8s.image_pull_policy == "Always" + + +class 
TestRedisPasswordValidator: + """Tests for empty-string-to-None password sanitization.""" + + def test_empty_password_becomes_none(self): + """Empty string REDIS_PASSWORD is converted to None.""" + settings = Settings(redis_password="") + assert settings.redis_password is None + + def test_whitespace_password_becomes_none(self): + """Whitespace-only REDIS_PASSWORD is converted to None.""" + settings = Settings(redis_password=" ") + assert settings.redis_password is None + + def test_real_password_preserved(self): + """Non-empty password is kept as-is.""" + settings = Settings(redis_password="s3cret") + assert settings.redis_password == "s3cret" + + def test_none_password_stays_none(self): + """None password stays None.""" + settings = Settings(redis_password=None) + assert settings.redis_password is None + + def test_empty_sentinel_password_becomes_none(self): + """Empty sentinel password is converted to None.""" + settings = Settings(redis_sentinel_password="") + assert settings.redis_sentinel_password is None + + +class TestRedisClusterNodesValidator: + """Tests for empty-string-to-None cluster/sentinel node sanitization.""" + + def test_empty_cluster_nodes_becomes_none(self): + """Empty REDIS_CLUSTER_NODES is converted to None.""" + settings = Settings(redis_cluster_nodes="") + assert settings.redis_cluster_nodes is None + + def test_whitespace_cluster_nodes_becomes_none(self): + """Whitespace-only REDIS_CLUSTER_NODES is converted to None.""" + settings = Settings(redis_cluster_nodes=" ") + assert settings.redis_cluster_nodes is None + + def test_real_cluster_nodes_preserved(self): + """Valid node list is kept.""" + settings = Settings(redis_cluster_nodes="node1:7000,node2:7001") + assert settings.redis_cluster_nodes == "node1:7000,node2:7001" + + def test_empty_sentinel_nodes_becomes_none(self): + """Empty REDIS_SENTINEL_NODES is converted to None.""" + settings = Settings(redis_sentinel_nodes="") + assert settings.redis_sentinel_nodes is None + + def 
test_real_sentinel_nodes_preserved(self): + """Valid sentinel node list is kept.""" + settings = Settings(redis_sentinel_nodes="sent1:26379,sent2:26379") + assert settings.redis_sentinel_nodes == "sent1:26379,sent2:26379" + + +class TestRedisConfigValidators: + """Tests for RedisConfig-level validators (password + nodes).""" + + def test_redis_config_empty_password_to_none(self): + """RedisConfig also converts empty password to None.""" + from src.config.redis import RedisConfig + + cfg = RedisConfig(redis_password="") + assert cfg.password is None + + def test_redis_config_empty_cluster_nodes_to_none(self): + """RedisConfig also converts empty cluster nodes to None.""" + from src.config.redis import RedisConfig + + cfg = RedisConfig(redis_cluster_nodes="") + assert cfg.cluster_nodes is None + + def test_redis_config_real_values_preserved(self): + """Non-empty values pass through.""" + from src.config.redis import RedisConfig + + cfg = RedisConfig(redis_password="pass", redis_cluster_nodes="h:7000") + assert cfg.password == "pass" + assert cfg.cluster_nodes == "h:7000"