diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..86dea68 --- /dev/null +++ b/Dockerfile @@ -0,0 +1,60 @@ +# syntax=docker/dockerfile:1 +# Zeroboot server image +# Multi-stage build: compile Rust binary, then assemble minimal runtime image. +# +# Usage: +# docker build -t zeroboot:latest . +# +# The image does NOT bundle vmlinux or rootfs — mount them via PersistentVolume. +# See deploy/k8s/ for Kubernetes manifests. + +# ─── Stage 1: Build zeroboot binary ────────────────────────────────────────── +FROM rust:1.86-bookworm AS builder + +WORKDIR /build + +# Cache dependencies separately from source +COPY Cargo.toml Cargo.lock ./ +RUN mkdir src && echo 'fn main(){}' > src/main.rs && \ + cargo build --release && \ + rm -f target/release/zeroboot target/release/deps/zeroboot* + +# Build actual source +COPY src/ src/ +COPY guest/ guest/ +RUN cargo build --release + +# ─── Stage 2: Runtime image ─────────────────────────────────────────────────── +FROM ubuntu:22.04 + +ARG DEBIAN_FRONTEND=noninteractive + +# Runtime dependencies only +RUN apt-get update -qq && \ + apt-get install -y --no-install-recommends \ + ca-certificates \ + curl \ + && rm -rf /var/lib/apt/lists/* + +# Install Firecracker +ARG FC_VERSION=v1.15.0 +RUN curl -fsSL -o /tmp/fc.tgz \ + "https://github.com/firecracker-microvm/firecracker/releases/download/${FC_VERSION}/firecracker-${FC_VERSION}-x86_64.tgz" && \ + tar -xzf /tmp/fc.tgz -C /tmp && \ + mv "/tmp/release-${FC_VERSION}-x86_64/firecracker-${FC_VERSION}-x86_64" /usr/local/bin/firecracker && \ + chmod +x /usr/local/bin/firecracker && \ + rm -rf /tmp/fc.tgz /tmp/release-* + +# Copy zeroboot binary +COPY --from=builder /build/target/release/zeroboot /usr/local/bin/zeroboot + +# Data directory — mount a PersistentVolume here to persist snapshots +VOLUME ["/var/lib/zeroboot"] + +# Copy entrypoint +COPY docker/entrypoint.sh /entrypoint.sh +RUN chmod +x /entrypoint.sh + +EXPOSE 8080 + +ENTRYPOINT ["/entrypoint.sh"] diff --git 
a/deploy/eks/eks-add-kvm-nodegroup.yaml b/deploy/eks/eks-add-kvm-nodegroup.yaml new file mode 100644 index 0000000..6b5a761 --- /dev/null +++ b/deploy/eks/eks-add-kvm-nodegroup.yaml @@ -0,0 +1,37 @@ +# Scenario 2b: Add KVM node group to an EXISTING EKS cluster +# Usage: +# eksctl create nodegroup -f eks-add-kvm-nodegroup.yaml + +apiVersion: eksctl.io/v1alpha5 +kind: ClusterConfig + +metadata: + name: zeroboot-eks # must match existing cluster name + region: ap-southeast-1 # must match existing cluster region + +managedNodeGroups: + - name: zeroboot-kvm + instanceType: c8i.xlarge + minSize: 1 + maxSize: 5 + desiredCapacity: 2 + amiFamily: AmazonLinux2023 + volumeSize: 50 + privateNetworking: true + + cpuOptions: + nestedVirtualization: enabled + + labels: + kvm-capable: "true" + workload: zeroboot + + iam: + attachPolicyARNs: + - arn:aws:iam::aws:policy/AmazonEKSWorkerNodePolicy + - arn:aws:iam::aws:policy/AmazonEC2ContainerRegistryReadOnly + - arn:aws:iam::aws:policy/AmazonEKS_CNI_Policy + + tags: + Project: zeroboot + ManagedBy: eksctl diff --git a/deploy/eks/eks-cluster-only.yaml b/deploy/eks/eks-cluster-only.yaml new file mode 100644 index 0000000..54fb2ba --- /dev/null +++ b/deploy/eks/eks-cluster-only.yaml @@ -0,0 +1,28 @@ +# Scenario 2a: Create EKS cluster WITHOUT any node group +# Usage: +# eksctl create cluster -f eks-cluster-only.yaml + +apiVersion: eksctl.io/v1alpha5 +kind: ClusterConfig + +metadata: + name: zeroboot-eks + region: ap-southeast-1 + version: "1.31" + +vpc: + clusterEndpoints: + privateAccess: true + publicAccess: true + +# Explicitly no node groups at cluster creation time +managedNodeGroups: [] + +iam: + withOIDC: true + +addons: + - name: aws-ebs-csi-driver + version: latest + attachPolicyARNs: + - arn:aws:iam::aws:policy/service-role/AmazonEBSCSIDriverPolicy diff --git a/deploy/eks/eks-self-managed-kvm.sh b/deploy/eks/eks-self-managed-kvm.sh new file mode 100755 index 0000000..aaac251 --- /dev/null +++ 
b/deploy/eks/eks-self-managed-kvm.sh @@ -0,0 +1,243 @@ +#!/usr/bin/env bash +# deploy/eks/eks-self-managed-kvm.sh +# +# Creates a self-managed EKS node group with nested virtualization enabled. +# +# WHY SELF-MANAGED? +# EKS Managed Node Groups silently drop CpuOptions when generating their +# internal Launch Template — even when you supply CpuOptions in your own LT. +# Self-managed ASG + Launch Template bypasses EKS entirely, so CpuOptions +# (including NestedVirtualization=enabled) is applied directly to the instance. +# +# USAGE: +# export AWS_PROFILE=your-profile +# export CLUSTER_NAME=zeroboot-eks +# export REGION=ap-southeast-1 +# bash eks-self-managed-kvm.sh +# +# REQUIREMENTS: +# - aws cli v2 +# - kubectl configured for the target cluster +# - eksctl (for cluster-only creation, see eks-cluster-only.yaml) +# +# WHAT THIS SCRIPT DOES: +# 1. Creates IAM role + instance profile for worker nodes +# 2. Registers node role with EKS (access entry) +# 3. Fetches cluster params (endpoint, cert, subnets, SGs) +# 4. Queries latest EKS-optimized AL2023 AMI +# 5. Creates Launch Template with CpuOptions.NestedVirtualization=enabled +# 6. Creates Auto Scaling Group (2-4 nodes) +# 7. Verifies /dev/kvm is present on nodes + +set -euo pipefail + +: "${CLUSTER_NAME:=zeroboot-eks}" +: "${REGION:=ap-southeast-1}" +: "${INSTANCE_TYPE:=c8i.xlarge}" +: "${K8S_VERSION:=1.31}" +: "${MIN_SIZE:=1}" +: "${MAX_SIZE:=4}" +: "${DESIRED:=2}" +: "${NODE_ROLE_NAME:=zeroboot-eks-node-role}" +: "${INSTANCE_PROFILE_NAME:=zeroboot-eks-node-profile}" +: "${LT_NAME:=zeroboot-kvm-nested-virt}" +: "${ASG_NAME:=zeroboot-kvm-self-managed}" + +echo "==> Fetching cluster info..." 
+ENDPOINT=$(aws eks describe-cluster --name "$CLUSTER_NAME" --region "$REGION" \ + --query "cluster.endpoint" --output text) +CERT_AUTH=$(aws eks describe-cluster --name "$CLUSTER_NAME" --region "$REGION" \ + --query "cluster.certificateAuthority.data" --output text) +CIDR=$(aws eks describe-cluster --name "$CLUSTER_NAME" --region "$REGION" \ + --query "cluster.kubernetesNetworkConfig.serviceIpv4Cidr" --output text) +CLUSTER_SG=$(aws eks describe-cluster --name "$CLUSTER_NAME" --region "$REGION" \ + --query "cluster.resourcesVpcConfig.clusterSecurityGroupId" --output text) +SUBNETS=$(aws eks describe-cluster --name "$CLUSTER_NAME" --region "$REGION" \ + --query "cluster.resourcesVpcConfig.subnetIds" --output text | tr '\t' ',') +ACCOUNT_ID=$(aws sts get-caller-identity --query Account --output text) + +echo " Cluster: $CLUSTER_NAME" +echo " Region: $REGION" +echo " Account: $ACCOUNT_ID" +echo " ClusterSG: $CLUSTER_SG" +echo " Subnets: $SUBNETS" + +# ─── Step 1: IAM role ───────────────────────────────────────────────────────── +echo "" +echo "==> Creating IAM node role: $NODE_ROLE_NAME" + +if aws iam get-role --role-name "$NODE_ROLE_NAME" &>/dev/null; then + echo " Role already exists, skipping." +else + aws iam create-role \ + --role-name "$NODE_ROLE_NAME" \ + --assume-role-policy-document '{ + "Version":"2012-10-17", + "Statement":[{"Effect":"Allow","Principal":{"Service":"ec2.amazonaws.com"},"Action":"sts:AssumeRole"}] + }' > /dev/null + + for POLICY in AmazonEKSWorkerNodePolicy AmazonEKS_CNI_Policy AmazonEC2ContainerRegistryReadOnly; do + aws iam attach-role-policy \ + --role-name "$NODE_ROLE_NAME" \ + --policy-arn "arn:aws:iam::aws:policy/${POLICY}" + done + echo " Role created." +fi + +# Instance profile +if aws iam get-instance-profile --instance-profile-name "$INSTANCE_PROFILE_NAME" &>/dev/null; then + echo " Instance profile already exists, skipping." 
+else + aws iam create-instance-profile --instance-profile-name "$INSTANCE_PROFILE_NAME" > /dev/null + aws iam add-role-to-instance-profile \ + --instance-profile-name "$INSTANCE_PROFILE_NAME" \ + --role-name "$NODE_ROLE_NAME" + echo " Instance profile created. Waiting 15s for IAM propagation..." + sleep 15 +fi + +NODE_ROLE_ARN="arn:aws:iam::${ACCOUNT_ID}:role/${NODE_ROLE_NAME}" +INSTANCE_PROFILE_ARN="arn:aws:iam::${ACCOUNT_ID}:instance-profile/${INSTANCE_PROFILE_NAME}" + +# ─── Step 2: EKS access entry ──────────────────────────────────────────────── +echo "" +echo "==> Registering node role with EKS cluster..." +aws eks create-access-entry \ + --cluster-name "$CLUSTER_NAME" \ + --principal-arn "$NODE_ROLE_ARN" \ + --type EC2_LINUX \ + --region "$REGION" 2>/dev/null || echo " Access entry already exists." + +# ─── Step 3: AMI ───────────────────────────────────────────────────────────── +echo "" +echo "==> Fetching latest EKS-optimized AMI (AL2023, K8s ${K8S_VERSION})..." +AMI_ID=$(aws ssm get-parameter \ + --name "/aws/service/eks/optimized-ami/${K8S_VERSION}/amazon-linux-2023/x86_64/standard/recommended/image_id" \ + --region "$REGION" --query "Parameter.Value" --output text) +echo " AMI: $AMI_ID" + +# ─── Step 4: UserData (AL2023 nodeadm format) ──────────────────────────────── +echo "" +echo "==> Preparing UserData..." 
+USERDATA=$(cat << EOF +MIME-Version: 1.0 +Content-Type: multipart/mixed; boundary="//" + +--// +Content-Type: application/node.eks.aws + +--- +apiVersion: node.eks.aws/v1alpha1 +kind: NodeConfig +spec: + cluster: + apiServerEndpoint: ${ENDPOINT} + certificateAuthority: ${CERT_AUTH} + cidr: ${CIDR} + name: ${CLUSTER_NAME} + kubelet: + config: + maxPods: 110 + flags: + - "--node-labels=kvm-capable=true,workload=zeroboot" + +--//-- +EOF +) +USERDATA_B64=$(echo "$USERDATA" | base64 -w 0) + +# ─── Step 5: Launch Template ────────────────────────────────────────────────── +echo "" +echo "==> Creating Launch Template: $LT_NAME" +echo " (CpuOptions.NestedVirtualization=enabled — this is the key field that" +echo " EKS managed node groups silently drop)" + +LT_DATA=$(cat << EOF +{ + "ImageId": "${AMI_ID}", + "InstanceType": "${INSTANCE_TYPE}", + "CpuOptions": {"NestedVirtualization": "enabled"}, + "SecurityGroupIds": ["${CLUSTER_SG}"], + "MetadataOptions": {"HttpTokens": "required", "HttpPutResponseHopLimit": 2}, + "IamInstanceProfile": {"Arn": "${INSTANCE_PROFILE_ARN}"}, + "UserData": "${USERDATA_B64}", + "TagSpecifications": [{ + "ResourceType": "instance", + "Tags": [ + {"Key": "Name", "Value": "zeroboot-kvm-node"}, + {"Key": "kubernetes.io/cluster/${CLUSTER_NAME}", "Value": "owned"}, + {"Key": "kvm-capable", "Value": "true"} + ] + }] +} +EOF +) + +LT_RESULT=$(aws ec2 create-launch-template \ + --launch-template-name "$LT_NAME" \ + --region "$REGION" \ + --launch-template-data "$LT_DATA" \ + --output json 2>/dev/null || \ + aws ec2 describe-launch-templates \ + --launch-template-names "$LT_NAME" \ + --region "$REGION" \ + --query "LaunchTemplates[0]" --output json) + +LT_ID=$(echo "$LT_RESULT" | python3 -c " +import json,sys +d = json.load(sys.stdin) +# handle both create and describe responses +print(d.get('LaunchTemplate', d).get('LaunchTemplateId')) +") +LT_VERSION=$(aws ec2 describe-launch-template-versions \ + --launch-template-id "$LT_ID" --region "$REGION" \ + 
--query "LaunchTemplateVersions[-1].VersionNumber" --output text) + +echo " LT ID: $LT_ID" +echo " LT Version: $LT_VERSION" + +# ─── Step 6: Auto Scaling Group ─────────────────────────────────────────────── +echo "" +echo "==> Creating Auto Scaling Group: $ASG_NAME" +aws autoscaling create-auto-scaling-group \ + --auto-scaling-group-name "$ASG_NAME" \ + --launch-template "LaunchTemplateId=${LT_ID},Version=${LT_VERSION}" \ + --min-size "$MIN_SIZE" \ + --max-size "$MAX_SIZE" \ + --desired-capacity "$DESIRED" \ + --vpc-zone-identifier "$SUBNETS" \ + --tags \ + "Key=Name,Value=zeroboot-kvm-node,PropagateAtLaunch=true" \ + "Key=kubernetes.io/cluster/${CLUSTER_NAME},Value=owned,PropagateAtLaunch=true" \ + "Key=kvm-capable,Value=true,PropagateAtLaunch=true" \ + --region "$REGION" 2>/dev/null || echo " ASG already exists." + +echo " ASG created. Waiting for nodes to join (up to 3 minutes)..." +sleep 60 + +# ─── Step 7: Verify ─────────────────────────────────────────────────────────── +echo "" +echo "==> Verifying nodes..." +kubectl get nodes -l kvm-capable=true 2>/dev/null || echo " (kubectl not configured or nodes not yet ready)" + +echo "" +echo "==> Testing /dev/kvm access..." +kubectl run kvm-verify --restart=Never \ + --image=amazonlinux:2023 \ + --overrides='{"spec":{"nodeSelector":{"kvm-capable":"true"},"containers":[{"name":"c","image":"amazonlinux:2023","command":["sh","-c","ls -la /dev/kvm && grep -c vmx /proc/cpuinfo && cat /sys/module/kvm_intel/parameters/nested 2>/dev/null || echo N/A"],"securityContext":{"privileged":true}}]}}' \ + 2>/dev/null || true + +echo " Waiting 30s for pod to start..." +sleep 30 +kubectl logs kvm-verify 2>/dev/null || echo " Pod not ready yet, check manually: kubectl logs kvm-verify" +kubectl delete pod kvm-verify --ignore-not-found 2>/dev/null + +echo "" +echo "==> Done! Self-managed node group with nested virtualization created." 
+echo "   - Launch Template: $LT_ID (v${LT_VERSION}) — CpuOptions.NestedVirtualization=enabled" +echo "   - ASG: $ASG_NAME" +echo "   - Node label: kvm-capable=true (already set via --node-labels in userdata)" +echo "" +echo "   Next: Deploy zeroboot using deploy/k8s/" +echo "     kubectl apply -f deploy/k8s/namespace.yaml" +echo "     kubectl apply -f deploy/k8s/" diff --git a/deploy/eks/eks-with-kvm-nodegroup.yaml b/deploy/eks/eks-with-kvm-nodegroup.yaml new file mode 100644 index 0000000..d2efc47 --- /dev/null +++ b/deploy/eks/eks-with-kvm-nodegroup.yaml @@ -0,0 +1,80 @@ +# ⚠️ WARNING: EKS Managed Node Groups silently drop CpuOptions +# +# This file uses managedNodeGroups with cpuOptions.nestedVirtualization. +# However, EKS generates its own internal Launch Template from the user-supplied +# one, and CpuOptions is NOT carried over — resulting in nodes WITHOUT /dev/kvm +# even though the YAML appears correct. +# +# STATUS: AWS has acknowledged this as a documentation gap (CpuOptions is not in +# the official "blocked fields" list but is still ignored in practice). +# +# RECOMMENDED ALTERNATIVE: Use deploy/eks/eks-self-managed-kvm.sh instead. +# That script creates a Self-managed Node Group (ASG + Launch Template) where +# CpuOptions is applied directly without EKS interference. +# +# Use THIS file only if you have verified that your eksctl version correctly +# passes CpuOptions through (test with: kubectl run kvm-check ... ls /dev/kvm). 
+ +# Scenario 1: Create EKS cluster with KVM node group in one shot +# Usage: +# eksctl create cluster -f eks-with-kvm-nodegroup.yaml + +apiVersion: eksctl.io/v1alpha5 +kind: ClusterConfig + +metadata: + name: zeroboot-eks + region: ap-southeast-1 + version: "1.31" + +# Cluster-level VPC (auto-created) +vpc: + clusterEndpoints: + privateAccess: true + publicAccess: true + +# No default node group — only our KVM-capable group +managedNodeGroups: + - name: zeroboot-kvm + instanceType: c8i.xlarge + minSize: 1 + maxSize: 5 + desiredCapacity: 2 + amiFamily: AmazonLinux2023 + volumeSize: 50 # GB — enough for OS + Docker images + zeroboot binary + privateNetworking: true # place nodes in private subnets + + # Enable nested virtualization — requires c8i/m8i/r8i + cpuOptions: + nestedVirtualization: enabled + + # Labels used by Deployment nodeSelector and Karpenter NodePool + labels: + kvm-capable: "true" + workload: zeroboot + + # Optional: taint to reserve these nodes exclusively for zeroboot + # taints: + # - key: zeroboot + # value: "true" + # effect: NoSchedule + + iam: + attachPolicyARNs: + - arn:aws:iam::aws:policy/AmazonEKSWorkerNodePolicy + - arn:aws:iam::aws:policy/AmazonEC2ContainerRegistryReadOnly + - arn:aws:iam::aws:policy/AmazonEKS_CNI_Policy + + tags: + Project: zeroboot + ManagedBy: eksctl + +# Add-ons needed for EBS PVC support +addons: + - name: aws-ebs-csi-driver + version: latest + attachPolicyARNs: + - arn:aws:iam::aws:policy/service-role/AmazonEBSCSIDriverPolicy + +iam: + withOIDC: true # required for add-on IAM role binding (IRSA) diff --git a/deploy/k8s/deployment.yaml b/deploy/k8s/deployment.yaml new file mode 100644 index 0000000..cf3edd4 --- /dev/null +++ b/deploy/k8s/deployment.yaml @@ -0,0 +1,99 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: zeroboot + namespace: zeroboot + labels: + app: zeroboot +spec: + replicas: 2 + selector: + matchLabels: + app: zeroboot + template: + metadata: + labels: + app: zeroboot + annotations: + 
prometheus.io/scrape: "true" + prometheus.io/port: "8080" + prometheus.io/path: "/v1/metrics" + spec: + # ── Scheduling: only run on KVM-capable nodes ─────────────────────────── + nodeSelector: + kvm-capable: "true" # label KVM nodes with: kubectl label node kvm-capable=true + + # Spread pods across nodes — each pod needs its own physical memory for CoW + affinity: + podAntiAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + - labelSelector: + matchLabels: + app: zeroboot + topologyKey: kubernetes.io/hostname + + containers: + - name: zeroboot + image: ghcr.io/zerobootdev/zeroboot:latest + imagePullPolicy: Always + ports: + - name: http + containerPort: 8080 + + # ── KVM device access ────────────────────────────────────────────── + # Use privileged mode + hostPath mount (compatible with EKS without kubevirt). + # If KubeVirt device plugin is installed, you can replace this with: + # resources.limits: { devices.kubevirt.io/kvm: "1" } + # and remove securityContext.privileged. 
+ securityContext: + privileged: true + resources: + requests: + memory: "2Gi" + cpu: "1" + limits: + memory: "8Gi" + cpu: "4" + + env: + - name: ZEROBOOT_WORKDIR + value: /var/lib/zeroboot + - name: ZEROBOOT_PORT + value: "8080" + - name: ZEROBOOT_TEMPLATE_WAIT + value: "15" + # Optional: path to api_keys.json on the PVC + # - name: ZEROBOOT_API_KEYS_FILE + # value: /var/lib/zeroboot/api_keys.json + + volumeMounts: + - name: data + mountPath: /var/lib/zeroboot + - name: kvm + mountPath: /dev/kvm + + # ── Health checks ────────────────────────────────────────────────── + readinessProbe: + httpGet: + path: /v1/health + port: 8080 + initialDelaySeconds: 30 # allow time for template creation on first boot + periodSeconds: 5 + failureThreshold: 6 + + livenessProbe: + httpGet: + path: /v1/health + port: 8080 + initialDelaySeconds: 120 # template creation takes ~19s; allow 2x margin + periodSeconds: 10 + failureThreshold: 6 + + volumes: + - name: data + persistentVolumeClaim: + claimName: zeroboot-data + - name: kvm + hostPath: + path: /dev/kvm + type: CharDevice diff --git a/deploy/k8s/hpa.yaml b/deploy/k8s/hpa.yaml new file mode 100644 index 0000000..5c34940 --- /dev/null +++ b/deploy/k8s/hpa.yaml @@ -0,0 +1,45 @@ +# Horizontal Pod Autoscaler for zeroboot +# +# Scales based on zeroboot_concurrent_forks — the number of active VM sandboxes +# per pod. CPU/memory are poor proxies for zeroboot workloads because: +# - Fork is memory-bound (CoW page faults), not CPU-bound +# - RSS grows proportionally with concurrent sandboxes +# +# Prerequisites: +# 1. prometheus-adapter installed and configured to expose zeroboot_concurrent_forks +# as a Kubernetes custom metric (pods/zeroboot_concurrent_forks) +# 2. 
See docs/KUBERNETES.md for prometheus-adapter config snippet + +apiVersion: autoscaling/v2 +kind: HorizontalPodAutoscaler +metadata: + name: zeroboot + namespace: zeroboot +spec: + scaleTargetRef: + apiVersion: apps/v1 + kind: Deployment + name: zeroboot + minReplicas: 2 + maxReplicas: 10 + metrics: + - type: Pods + pods: + metric: + name: zeroboot_concurrent_forks + target: + type: AverageValue + averageValue: "800" # scale out when avg concurrent sandboxes > 800 per pod + behavior: + scaleUp: + stabilizationWindowSeconds: 30 + policies: + - type: Pods + value: 2 + periodSeconds: 60 + scaleDown: + stabilizationWindowSeconds: 300 # 5 min cooldown before scale-in + policies: + - type: Pods + value: 1 + periodSeconds: 120 diff --git a/deploy/k8s/namespace.yaml b/deploy/k8s/namespace.yaml new file mode 100644 index 0000000..4307e42 --- /dev/null +++ b/deploy/k8s/namespace.yaml @@ -0,0 +1,4 @@ +apiVersion: v1 +kind: Namespace +metadata: + name: zeroboot diff --git a/deploy/k8s/pvc.yaml b/deploy/k8s/pvc.yaml new file mode 100644 index 0000000..92cadba --- /dev/null +++ b/deploy/k8s/pvc.yaml @@ -0,0 +1,14 @@ +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: zeroboot-data + namespace: zeroboot + labels: + app: zeroboot +spec: + accessModes: + - ReadWriteOnce + storageClassName: gp3 # AWS EBS gp3; adjust for your cloud provider + resources: + requests: + storage: 20Gi # vmlinux (~21MB) + rootfs (~500MB) + snapshot (~512MB) per template diff --git a/deploy/k8s/service.yaml b/deploy/k8s/service.yaml new file mode 100644 index 0000000..8854f22 --- /dev/null +++ b/deploy/k8s/service.yaml @@ -0,0 +1,15 @@ +apiVersion: v1 +kind: Service +metadata: + name: zeroboot + namespace: zeroboot + labels: + app: zeroboot +spec: + type: ClusterIP # Use LoadBalancer to expose externally + selector: + app: zeroboot + ports: + - name: http + port: 80 + targetPort: 8080 diff --git a/docker/entrypoint.sh b/docker/entrypoint.sh new file mode 100644 index 0000000..4c339ca --- 
/dev/null +++ b/docker/entrypoint.sh @@ -0,0 +1,69 @@ +#!/bin/bash +set -euo pipefail + +WORKDIR="${ZEROBOOT_WORKDIR:-/var/lib/zeroboot}" +KERNEL="${ZEROBOOT_KERNEL:-${WORKDIR}/vmlinux-fc}" +ROOTFS_PYTHON="${ZEROBOOT_ROOTFS_PYTHON:-${WORKDIR}/rootfs-python.ext4}" +ROOTFS_NODE="${ZEROBOOT_ROOTFS_NODE:-}" +PORT="${ZEROBOOT_PORT:-8080}" +TEMPLATE_WAIT="${ZEROBOOT_TEMPLATE_WAIT:-15}" + +# ── Validate KVM access ─────────────────────────────────────────────────────── +if [ ! -c /dev/kvm ]; then + echo "ERROR: /dev/kvm not found. Node must support KVM and use the KVM device plugin." + exit 1 +fi + +# ── Check required files ────────────────────────────────────────────────────── +if [ ! -f "$KERNEL" ]; then + echo "ERROR: Kernel not found at $KERNEL" + echo "Mount a PersistentVolume to $WORKDIR containing vmlinux-fc and rootfs images." + exit 1 +fi + +if [ ! -f "$ROOTFS_PYTHON" ]; then + echo "ERROR: Python rootfs not found at $ROOTFS_PYTHON" + exit 1 +fi + +# ── Create template if snapshot doesn't exist ──────────────────────────────── +PYTHON_SNAPSHOT="${WORKDIR}/python/snapshot/vmstate" + +if [ ! -f "$PYTHON_SNAPSHOT" ]; then + echo "No snapshot found — creating Python template (this takes ~${TEMPLATE_WAIT}s)..." + mkdir -p "${WORKDIR}/python" + cp "$ROOTFS_PYTHON" "${WORKDIR}/python-rootfs.ext4" + /usr/local/bin/zeroboot template \ + "$KERNEL" \ + "${WORKDIR}/python-rootfs.ext4" \ + "${WORKDIR}/python" \ + "$TEMPLATE_WAIT" \ + /init + echo "Template created." +else + echo "Snapshot found — skipping template creation." +fi + +# ── Build serve target ──────────────────────────────────────────────────────── +SERVE_TARGET="python:${WORKDIR}/python" + +if [ -n "$ROOTFS_NODE" ] && [ -f "$ROOTFS_NODE" ]; then + NODE_SNAPSHOT="${WORKDIR}/node/snapshot/vmstate" + if [ ! -f "$NODE_SNAPSHOT" ]; then + echo "Creating Node.js template..." 
+ mkdir -p "${WORKDIR}/node" + cp "$ROOTFS_NODE" "${WORKDIR}/node-rootfs.ext4" + /usr/local/bin/zeroboot template \ + "$KERNEL" \ + "${WORKDIR}/node-rootfs.ext4" \ + "${WORKDIR}/node" \ + "$TEMPLATE_WAIT" \ + /init-node.sh + echo "Node template created." + fi + SERVE_TARGET="${SERVE_TARGET},node:${WORKDIR}/node" +fi + +# ── Start API server ────────────────────────────────────────────────────────── +echo "Starting zeroboot API server on port ${PORT}..." +exec /usr/local/bin/zeroboot serve "$SERVE_TARGET" "$PORT" diff --git a/docs/KUBERNETES.md b/docs/KUBERNETES.md new file mode 100644 index 0000000..a131b4c --- /dev/null +++ b/docs/KUBERNETES.md @@ -0,0 +1,371 @@ +# Running Zeroboot on Kubernetes + +Zeroboot can be deployed as a stateful service inside a Kubernetes cluster. +This guide covers node requirements, KVM device access, persistent storage for +snapshots, reference manifests, and autoscaling. + +--- + +## Architecture overview + +``` +Internet → K8s Service + │ + ┌──────┼──────┐ + │ │ │ + Pod-1 Pod-2 Pod-3 ← one Pod per KVM-capable Node (podAntiAffinity) + │ │ │ + VM VM VM VM VM VM ← KVM forks happen inside the Pod, sub-millisecond +``` + +**Key point:** Kubernetes manages the lifecycle of the zeroboot *server* process. +It does not schedule individual sandboxes — each `v1/exec` request is handled +entirely within the Pod that receives it via a KVM fork (~0.8 ms). Kubernetes' +role is capacity management: health checks, rolling updates, and horizontal scaling. + +--- + +## Node requirements + +### Instance families with KVM support + +Not all EC2 instance types expose `/dev/kvm`. The following families support KVM +and are suitable for zeroboot: + +| Family | KVM method | Notes | +|---|---|---| +| `c8i`, `m8i`, `r8i` | ✅ **Nested virtualization** | **Recommended** — Intel 8th-gen Nitro platform; supports nested virt without metal. 
Enable at launch via `--cpu-options NestedVirtualization=enabled` (requires AWS CLI ≥ v2.34) | +| `c6i`, `c6a`, `c7i`, `m6i`, `m7i`, `r6i`, `r7i` | ✅ Bare-metal only | KVM available only on `.metal` sizes (e.g. `c6i.metal`) | +| `c5`, `m5`, `r5` | ✅ Bare-metal only | Older Nitro generation; `.metal` sizes only | +| `t3`, `t4g` | ❌ Not available | Burstable — `/dev/kvm` not exposed | +| `t2` | ❌ Not available | No Nitro, no KVM | +| Any ARM (`*g`) | ❌ Architecture mismatch | Firecracker x86_64 binary required | + +**TL;DR for EKS node groups:** Use `c8i`, `m8i`, or `r8i` with nested virtualization +enabled — these are the only non-metal families where regular (non-`.metal`) instance +sizes expose `/dev/kvm`. All other families require `.metal` sizes which are significantly +more expensive and harder to schedule in K8s. + +```bash +# Enable nested virtualization when launching a new instance (c8i/m8i/r8i only) +aws ec2 run-instances \ + --instance-type c8i.xlarge \ + --cpu-options "NestedVirtualization=enabled" \ + ... +``` + +> On GCP: `n2`, `n2d`, `c2`, `c3` families support KVM. +> On Azure: `Dv3`, `Ev3`, `Dsv3` with nested virtualization enabled. + +### Label KVM-capable nodes + +```bash +kubectl label node <node-name> kvm-capable=true +``` + +The Deployment's `nodeSelector` uses this label to ensure Pods are only scheduled +where `/dev/kvm` is available. + +--- + +--- + +## EKS deployment: managed vs self-managed node groups + +> **TL;DR:** Use a self-managed node group. EKS managed node groups silently +> drop `CpuOptions.NestedVirtualization` — your nodes will start without `/dev/kvm`. + +### The problem with managed node groups + +EKS managed node groups take your Launch Template, then generate a new internal +Launch Template that merges only a subset of fields. `CpuOptions` is not in that +subset — even though it is **not** listed in the [official blocked-fields docs](https://docs.aws.amazon.com/eks/latest/userguide/launch-templates.html#launch-template-basics). 
+ +Symptoms: +- `ls /dev/kvm` returns "No such file or directory" +- `/proc/cpuinfo` has no `vmx` flag +- `eksctl create nodegroup` succeeds, but KVM is silently missing + +You can verify by inspecting the EKS-generated internal Launch Template: + +```bash +# Get the internal LT id (not your LT) +aws ec2 describe-launch-template-versions --launch-template-id <internal-lt-id> --versions 1 --query "LaunchTemplateVersions[0].LaunchTemplateData.CpuOptions" +# Expected for managed nodegroup: null (even if you set it in your own LT) +``` + +### The solution: self-managed node group + +Create an Auto Scaling Group with a Launch Template directly — bypassing EKS's +internal LT generation. The provided script handles the full setup: + +```bash +export AWS_PROFILE=your-profile +export CLUSTER_NAME=zeroboot-eks +export REGION=ap-southeast-1 + +# Step 1: Create cluster without node group +eksctl create cluster -f deploy/eks/eks-cluster-only.yaml + +# Step 2: Create self-managed KVM node group +bash deploy/eks/eks-self-managed-kvm.sh +``` + +The script: +1. Creates an IAM node role + instance profile +2. Registers the role with EKS via `create-access-entry` +3. Queries the latest EKS-optimized AL2023 AMI +4. Creates a Launch Template with `CpuOptions.NestedVirtualization=enabled` +5. Creates an ASG referencing the LT directly +6. Verifies `/dev/kvm` is present on the new nodes + +> **Note:** `eksctl`'s `nodeGroups` (non-managed) do not support `launchTemplate`. +> Only `managedNodeGroups` does — but managed NGs drop `CpuOptions`. The script +> uses raw AWS CLI (`ec2 create-launch-template` + `autoscaling create-auto-scaling-group`) +> to sidestep both limitations. 
+ + +--- + +## KVM device access without `privileged: true` + +Pods request `/dev/kvm` via the [KVM device plugin](https://github.com/kubevirt/kubevirt/tree/main/cmd/virt-handler) +from the KubeVirt project: + +```bash +# Install KVM device plugin (DaemonSet) +kubectl apply -f https://github.com/kubevirt/kubevirt/releases/latest/download/kubevirt-operator.yaml +kubectl apply -f https://github.com/kubevirt/kubevirt/releases/latest/download/kubevirt-cr.yaml +``` + +Once installed, Pods can request KVM access via resources (already set in `deployment.yaml`): + +```yaml +resources: + limits: + devices.kubevirt.io/kvm: "1" +``` + +This grants `/dev/kvm` access without `privileged: true` or `hostDevice` mounts. + +--- + +## Persistent storage for snapshots + +Zeroboot's `template` command snapshots ~512 MB of VM memory to disk. Without a +PersistentVolume, every Pod restart triggers a ~15 s re-snapshot. + +Mount a PVC at `/var/lib/zeroboot` (see `deploy/k8s/pvc.yaml`). The directory +layout on the volume: + +``` +/var/lib/zeroboot/ +├── vmlinux-fc ← kernel binary (~21 MB) +├── rootfs-python.ext4 ← base rootfs image (pre-loaded numpy/pandas) +├── python/ ← snapshot created by entrypoint on first boot +│ ├── snapshot/ +│ │ ├── vmstate ← CPU register state (~14 KB) +│ │ └── mem ← 512 MB memory image (CoW source) +│ └── rootfs_path +└── api_keys.json ← optional API key list +``` + +> **Populate the volume before first deploy.** Copy `vmlinux-fc` and +> `rootfs-python.ext4` to the PVC (e.g., via a one-shot init Job or manual +> `kubectl cp`). The entrypoint will create the snapshot automatically on +> first boot if it is missing. 
+ +### Storage class recommendations + +| Cloud | StorageClass | Notes | +|---|---|---| +| AWS | `gp3` | Default for EKS; good random-read IOPS for CoW page faults | +| GCP | `premium-rwo` | SSD-backed, low latency | +| Azure | `managed-premium` | SSD, required for sub-ms fork performance | + +Avoid `gp2` or spinning-disk storage classes — the CoW page fault path is +latency-sensitive and benefits from SSD IOPS. + +--- + +## Deploying + +```bash +# 1. Create namespace +kubectl apply -f deploy/k8s/namespace.yaml + +# 2. Create PVC +kubectl apply -f deploy/k8s/pvc.yaml + +# 3. Deploy (2 replicas by default) +kubectl apply -f deploy/k8s/deployment.yaml +kubectl apply -f deploy/k8s/service.yaml + +# 4. Watch rollout — first boot takes ~30s for template creation +kubectl rollout status deployment/zeroboot -n zeroboot + +# 5. Verify +kubectl exec -n zeroboot deploy/zeroboot -- curl -s localhost:8080/v1/health +``` + +--- + +## Autoscaling + +### Why not CPU-based HPA? + +Zeroboot workloads are **memory-bound**, not CPU-bound. Each concurrent fork +adds ~265 KB of CoW memory pressure. CPU utilization is a poor scaling signal. + +### Custom metric HPA + +The `zeroboot_concurrent_forks` gauge (exposed at `/v1/metrics`) reflects the +number of active VM sandboxes per Pod. Use this for HPA: + +```bash +# Apply HPA (requires prometheus-adapter, see below) +kubectl apply -f deploy/k8s/hpa.yaml +``` + +Scale-out triggers when average concurrent forks per Pod exceeds 800. 
Adjust +this threshold based on your Node's available memory: + +``` +max_concurrent_forks ≈ (node_memory - 2GB_overhead) / 265KB_per_fork +# Example: 8GB node → (8192 - 2048) / 0.265 ≈ 23,000 theoretical max +# Practical limit with snapshot RSS: ~1000–2000 per Pod +``` + +### Exposing the metric via prometheus-adapter + +Add to your `prometheus-adapter` ConfigMap: + +```yaml +rules: + - seriesQuery: 'zeroboot_concurrent_forks{namespace!="",pod!=""}' + resources: + overrides: + namespace: {resource: "namespace"} + pod: {resource: "pod"} + name: + matches: "zeroboot_concurrent_forks" + as: "zeroboot_concurrent_forks" + metricsQuery: 'avg_over_time(zeroboot_concurrent_forks{<<.LabelMatchers>>}[1m])' +``` + +### Karpenter node provisioning + +For cluster autoscaling with Karpenter, create a NodePool that targets KVM-capable instances: + +```yaml +apiVersion: karpenter.sh/v1 +kind: NodePool +metadata: + name: zeroboot-kvm +spec: + template: + metadata: + labels: + kvm-capable: "true" + spec: + requirements: + - key: karpenter.k8s.aws/instance-family + operator: In + values: [c8i, m8i, r8i] + - key: karpenter.k8s.aws/instance-size + operator: In + values: [xlarge, 2xlarge, 4xlarge] + - key: kubernetes.io/arch + operator: In + values: [amd64] + limits: + cpu: 100 +``` + +> **Scaling latency note:** Karpenter takes 60–120 s to provision a new KVM +> node (EC2 start + kubelet join + Pod scheduling + snapshot load). Karpenter +> handles **capacity expansion** for sustained load — it is not designed to +> absorb sudden request spikes. Size your warm pool (`minReplicas`) to handle +> peak burst traffic; use HPA to scale within the existing node pool first. + +--- + +## Monitoring + +Zeroboot exposes Prometheus metrics at `/v1/metrics` (not `/metrics`). 
+ +### ServiceMonitor (Prometheus Operator) + +```yaml +apiVersion: monitoring.coreos.com/v1 +kind: ServiceMonitor +metadata: + name: zeroboot + namespace: zeroboot +spec: + selector: + matchLabels: + app: zeroboot + endpoints: + - port: http + path: /v1/metrics + interval: 15s +``` + +### Key metrics + +| Metric | Type | Description | +|---|---|---| +| `zeroboot_concurrent_forks` | gauge | Active VM sandboxes — **use for HPA** | +| `zeroboot_fork_time_milliseconds` | histogram | Fork latency (P50/P99) | +| `zeroboot_exec_time_milliseconds` | histogram | Code execution latency | +| `zeroboot_total_time_milliseconds` | histogram | End-to-end request latency | +| `zeroboot_total_executions{status}` | counter | Success / error / timeout counts | +| `zeroboot_memory_usage_bytes` | gauge | Process RSS — monitor for memory pressure | + +--- + +## Configuration reference + +All configuration is via environment variables (set in `deployment.yaml`): + +| Variable | Default | Description | +|---|---|---| +| `ZEROBOOT_WORKDIR` | `/var/lib/zeroboot` | Working directory (PVC mount point) | +| `ZEROBOOT_KERNEL` | `$WORKDIR/vmlinux-fc` | Path to kernel binary | +| `ZEROBOOT_ROOTFS_PYTHON` | `$WORKDIR/rootfs-python.ext4` | Python rootfs image | +| `ZEROBOOT_ROOTFS_NODE` | _(unset)_ | Node.js rootfs image (optional) | +| `ZEROBOOT_PORT` | `8080` | API server port | +| `ZEROBOOT_TEMPLATE_WAIT` | `15` | Seconds to wait during template snapshot | +| `ZEROBOOT_API_KEYS_FILE` | _(unset)_ | Path to JSON array of API keys | + +--- + +### Server bind address + +By default, `zeroboot serve` binds to `0.0.0.0` (all interfaces), which is +required for Kubernetes health probes and Service routing. To restrict to +localhost (e.g. for local development), pass `--bind 127.0.0.1`: + +```bash +zeroboot serve python:/workdir/python 8080 --bind 127.0.0.1 +``` + +The `ZEROBOOT_BIND` environment variable (default: `0.0.0.0`) controls the +bind address when running via the Docker entrypoint. 
+ + +--- + +## Limitations + +- **Single-node fork pool:** All sandboxes on a Pod run on the same physical Node. + Scale out by adding Pods (and Nodes), not by resizing individual Pods. +- **ReadWriteOnce PVC:** Each Pod needs its own PVC (`ReadWriteOnce`). If you + use a `StatefulSet` instead of a `Deployment`, each replica gets its own PVC + automatically via `volumeClaimTemplates`. +- **Snapshot on first boot:** The first Pod startup after PVC creation takes + ~15–30 s while the template snapshot is created. Subsequent restarts are fast + (~2 s) because the snapshot is persisted on the PVC. +- **x86_64 only:** Firecracker and the guest kernel are x86_64. ARM nodes are + not supported.