From 25748770255330c39e15ed8a7542eeaea1d1fc10 Mon Sep 17 00:00:00 2001 From: Sam DaSilva Date: Thu, 26 Feb 2026 23:00:23 -0500 Subject: [PATCH 01/13] fix(taskfiles/images): Multi-arch enablement image amd64 only Signed-off-by: Sam DaSilva --- taskfiles/images.yaml | 101 +++++++++++++++++++++++------------------- 1 file changed, 55 insertions(+), 46 deletions(-) diff --git a/taskfiles/images.yaml b/taskfiles/images.yaml index 67f61e33b..355c1e83c 100644 --- a/taskfiles/images.yaml +++ b/taskfiles/images.yaml @@ -1,4 +1,4 @@ -version: '3' +version: "3" tasks: go-cache: @@ -12,10 +12,10 @@ tasks: - task: helper-prep-tmp-dir desc: Build {{.PLATFORM}} image using buildah vars: - NAME: '{{ .NAME }}' - DOCKERFILE: '{{ .DOCKERFILE }}' - PLATFORM: '{{.PLATFORM}}' - INCREMENTAL: '{{ .INCREMENTAL }}' + NAME: "{{ .NAME }}" + DOCKERFILE: "{{ .DOCKERFILE }}" + PLATFORM: "{{.PLATFORM}}" + INCREMENTAL: "{{ .INCREMENTAL }}" BUILDAH_OPTIONS: sh: | if [ "{{.INCREMENTAL}}" = "true" ]; then @@ -54,15 +54,26 @@ tasks: helper-prepare-multi-arch: internal: true status: - - test -f /proc/sys/fs/binfmt_misc/qemu-aarch64 + - | + if [ "$(uname -m)" = "aarch64" ]; then + test -f /proc/sys/fs/binfmt_misc/qemu-x86_64 + else + test -f /proc/sys/fs/binfmt_misc/qemu-aarch64 + fi cmds: - - sudo podman run --rm --privileged quay.io/bnemeth/multiarch-qemu-user-static --reset -p yes - - setenforce 0 + - sudo podman run --rm --privileged docker.io/tonistiigi/binfmt --install all + - | + if command -v getenforce >/dev/null 2>&1; then + mode="$(getenforce || true)" + if [ "$mode" != "Disabled" ]; then + sudo setenforce 0 || true + fi + fi clean-image-layer: internal: true vars: - NAME: '{{.NAME}}' + NAME: "{{.NAME}}" status: - sh -c '! 
buildah manifest inspect localhost/{{.NAME}}:dev-manifest' cmds: @@ -72,46 +83,46 @@ tasks: internal: true desc: Clean up image {{.NAME}} vars: - NAME: '{{.NAME}}' + NAME: "{{.NAME}}" cmds: - buildah manifest rm localhost/{{.NAME}}:dev-base-manifest || true - task: clean-image-layer vars: - NAME: '{{.NAME}}' + NAME: "{{.NAME}}" build-image: internal: true desc: Building {{.PLATFORM}} image {{.NAME}} vars: - NAME: '{{.NAME}}' - DOCKERFILE: '{{.DOCKERFILE}}' - PLATFORM: '{{.PLATFORM }}' + NAME: "{{.NAME}}" + DOCKERFILE: "{{.DOCKERFILE}}" + PLATFORM: "{{.PLATFORM }}" cmds: - task: helper-buildah vars: - NAME: 'localhost/{{.NAME}}:dev-base' - DOCKERFILE: '{{.DOCKERFILE}}' + NAME: "localhost/{{.NAME}}:dev-base" + DOCKERFILE: "{{.DOCKERFILE}}" INCREMENTAL: "false" - PLATFORM: '{{.PLATFORM}}' + PLATFORM: "{{.PLATFORM}}" # - cmd: buildah tag localhost/{{.NAME}}:dev-base-{{.PLATFORM}} localhost/{{.NAME}}:dev-base - task: prep-incremental-docker-file vars: - BASE_NAME: 'localhost/{{.NAME}}:dev-base-{{.PLATFORM}}' - IN_FILE: '{{.DOCKERFILE}}' - OUT_FILE: '{{.DOCKERFILE}}.inc' + BASE_NAME: "localhost/{{.NAME}}:dev-base-{{.PLATFORM}}" + IN_FILE: "{{.DOCKERFILE}}" + OUT_FILE: "{{.DOCKERFILE}}.inc" - task: helper-buildah vars: - NAME: 'localhost/{{.NAME}}:dev' - DOCKERFILE: '{{.DOCKERFILE}}.inc' + NAME: "localhost/{{.NAME}}:dev" + DOCKERFILE: "{{.DOCKERFILE}}.inc" INCREMENTAL: "true" - PLATFORM: '{{.PLATFORM}}' + PLATFORM: "{{.PLATFORM}}" prep-incremental-docker-file: internal: true vars: - BASE_NAME: '{{ .BASE_NAME }}' - IN_FILE: '{{ .IN_FILE }}' - OUT_FILE: '{{ .OUT_FILE }}' + BASE_NAME: "{{ .BASE_NAME }}" + IN_FILE: "{{ .IN_FILE }}" + OUT_FILE: "{{ .OUT_FILE }}" cmds: - > go run tools/incremental/incremental.go @@ -282,7 +293,6 @@ tasks: vars: NAME: intel-vsp - build-image-intel-vsp-p4: deps: - task: clean-image-layer @@ -331,7 +341,7 @@ tasks: NAME: network-resources-injector clean-image-all: - cmds: # can't run in parallel since multiple concurrent pulls are not supported + 
cmds: # can't run in parallel since multiple concurrent pulls are not supported - task: clean-image-manager - task: clean-image-daemon - task: clean-image-intel-vsp @@ -346,7 +356,7 @@ tasks: # they will be picked up by the build-image-* targets deps: - build-bin-all - cmds: # can't run in parallel since multiple concurrent pulls are not supported + cmds: # can't run in parallel since multiple concurrent pulls are not supported - task: build-image-manager - task: build-image-daemon - task: build-image-intel-vsp @@ -361,34 +371,33 @@ tasks: deps: - task: push-image-helper vars: - SOURCE: 'localhost/dpu-operator:dev' - IMAGE: '{{.REGISTRY}}/dpu-operator:dev' + SOURCE: "localhost/dpu-operator:dev" + IMAGE: "{{.REGISTRY}}/dpu-operator:dev" - task: push-image-helper vars: - SOURCE: 'localhost/dpu-daemon:dev' - IMAGE: '{{.REGISTRY}}/dpu-daemon:dev' + SOURCE: "localhost/dpu-daemon:dev" + IMAGE: "{{.REGISTRY}}/dpu-daemon:dev" - task: push-image-helper vars: - SOURCE: 'localhost/mrvl-vsp:dev' - IMAGE: '{{.REGISTRY}}/mrvl-vsp:dev' + SOURCE: "localhost/mrvl-vsp:dev" + IMAGE: "{{.REGISTRY}}/mrvl-vsp:dev" - task: push-image-helper vars: - SOURCE: 'localhost/mrvl-cpagent:dev' - IMAGE: '{{.REGISTRY}}/mrvl-cpagent:dev' + SOURCE: "localhost/mrvl-cpagent:dev" + IMAGE: "{{.REGISTRY}}/mrvl-cpagent:dev" - task: push-image-helper vars: - SOURCE: 'localhost/intel-vsp:dev' - IMAGE: '{{.REGISTRY}}/intel-vsp:dev' + SOURCE: "localhost/intel-vsp:dev" + IMAGE: "{{.REGISTRY}}/intel-vsp:dev" - task: push-image-helper vars: - SOURCE: 'localhost/intel-vsp-p4:dev' - IMAGE: '{{.REGISTRY}}/intel-vsp-p4:dev' + SOURCE: "localhost/intel-vsp-p4:dev" + IMAGE: "{{.REGISTRY}}/intel-vsp-p4:dev" - task: push-image-helper vars: - SOURCE: 'localhost/intel-netsec-vsp:dev' - IMAGE: '{{.REGISTRY}}/intel-netsec-vsp:dev' + SOURCE: "localhost/intel-netsec-vsp:dev" + IMAGE: "{{.REGISTRY}}/intel-netsec-vsp:dev" - task: push-image-helper vars: - SOURCE: 'localhost/network-resources-injector:dev' - IMAGE: 
'{{.REGISTRY}}/network-resources-injector:dev' - + SOURCE: "localhost/network-resources-injector:dev" + IMAGE: "{{.REGISTRY}}/network-resources-injector:dev" From a3393d1a357ee784286dcefe0c29599aa3ece23e Mon Sep 17 00:00:00 2001 From: Sam DaSilva Date: Fri, 27 Feb 2026 16:47:24 -0500 Subject: [PATCH 02/13] feat(upstream-images): add upstream Dockerfiles and suffix-based task switching - add non-.rhel Dockerfiles for manager/daemon/VSP/NRI images - keep .rhel Dockerfiles unchanged and selectable via DOCKERFILE_SUFFIX - add build-image-all-rhel and clean-image-all-rhel wrappers - make REGISTRY overridable in taskfile - fix pip RPM-setuptools conflict in openshift/install-dpu.sh Signed-off-by: Sam DaSilva --- Dockerfile | 20 +++++++ Dockerfile.CNI | 12 +++++ Dockerfile.IntelNetSecVSP | 29 ++++++++++ Dockerfile.IntelP4 | 51 ++++++++++++++++++ Dockerfile.IntelVSP | 52 ++++++++++++++++++ Dockerfile.daemon | 28 ++++++++++ Dockerfile.inc | 6 +++ Dockerfile.mrvlCPAgent | 57 ++++++++++++++++++++ Dockerfile.mrvlVSP | 29 ++++++++++ Dockerfile.networkResourcesInjector | 15 ++++++ openshift/install-dpu.sh | 4 +- taskfile.yaml | 7 ++- taskfiles/images.yaml | 82 ++++++++++++++++++++++++----- 13 files changed, 376 insertions(+), 16 deletions(-) create mode 100644 Dockerfile create mode 100644 Dockerfile.CNI create mode 100644 Dockerfile.IntelNetSecVSP create mode 100644 Dockerfile.IntelP4 create mode 100644 Dockerfile.IntelVSP create mode 100644 Dockerfile.daemon create mode 100644 Dockerfile.inc create mode 100644 Dockerfile.mrvlCPAgent create mode 100644 Dockerfile.mrvlVSP create mode 100644 Dockerfile.networkResourcesInjector diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 000000000..1747eff5d --- /dev/null +++ b/Dockerfile @@ -0,0 +1,20 @@ +# Build the manager binary +FROM --platform=$BUILDPLATFORM docker.io/library/golang:1.24-bookworm@sha256:1a6d4452c65dea36aac2e2d606b01b4a029ec90cc1ae53890540ce6173ea77ac AS builder +ARG TARGETOS +ARG TARGETARCH + +WORKDIR 
/workspace +COPY . . + +# Build directly to avoid GOARCH leaking into go-run helper tooling during cross builds. +RUN mkdir -p /workspace/bin && \ + GOMAXPROCS=2 CGO_ENABLED=0 GOOS=${TARGETOS:-linux} GOARCH=${TARGETARCH} \ + go build -o /workspace/bin/manager.${TARGETARCH} ./cmd/main.go + +# Use a minimal runtime image for the manager binary. +FROM gcr.io/distroless/static-debian12:nonroot@sha256:a9329520abc449e3b14d5bc3a6ffae065bdde0f02667fa10880c49b35c109fd1 +WORKDIR / +ARG TARGETARCH +COPY --from=builder /workspace/bin/manager.${TARGETARCH} /manager +USER 65532:65532 +ENTRYPOINT ["/manager"] diff --git a/Dockerfile.CNI b/Dockerfile.CNI new file mode 100644 index 000000000..93ac47add --- /dev/null +++ b/Dockerfile.CNI @@ -0,0 +1,12 @@ +FROM --platform=$BUILDPLATFORM docker.io/library/golang:1.24-bookworm@sha256:1a6d4452c65dea36aac2e2d606b01b4a029ec90cc1ae53890540ce6173ea77ac AS builder +ARG TARGETOS +ARG TARGETARCH + +COPY . /usr/src/dpu-cni +WORKDIR /usr/src/dpu-cni +RUN GOOS=${TARGETOS:-linux} GOARCH=${TARGETARCH} go build -o dpucni ./dpu-cni/dpu-cni.go + +FROM gcr.io/distroless/static-debian12:nonroot@sha256:a9329520abc449e3b14d5bc3a6ffae065bdde0f02667fa10880c49b35c109fd1 +COPY --from=builder /usr/src/dpu-cni/dpucni /usr/bin/ +WORKDIR / +LABEL io.k8s.display-name="DPU-CNI" diff --git a/Dockerfile.IntelNetSecVSP b/Dockerfile.IntelNetSecVSP new file mode 100644 index 000000000..12da83c40 --- /dev/null +++ b/Dockerfile.IntelNetSecVSP @@ -0,0 +1,29 @@ +FROM --platform=$BUILDPLATFORM docker.io/library/golang:1.24-bookworm@sha256:1a6d4452c65dea36aac2e2d606b01b4a029ec90cc1ae53890540ce6173ea77ac AS builder +ARG TARGETOS +ARG TARGETARCH + +WORKDIR /workspace +COPY . . + +# Build directly to avoid GOARCH leaking into go-run helper tooling during cross builds. 
+RUN mkdir -p /workspace/bin && \ + GOMAXPROCS=2 CGO_ENABLED=0 GOOS=${TARGETOS:-linux} GOARCH=${TARGETARCH} \ + go build -o /workspace/bin/vsp-intel-netsec.${TARGETARCH} ./internal/daemon/vendor-specific-plugins/intel-netsec/main.go + +FROM quay.io/centos/centos:stream9 +ARG TARGETARCH +COPY --from=builder /workspace/bin/vsp-intel-netsec.${TARGETARCH} /vsp-intel-netsec + +RUN dnf update -y \ + && dnf install -y \ + ethtool \ + net-tools \ + kmod \ + pciutils \ + iputils \ + iproute \ + && dnf clean all \ + && rm -rf /var/cache/dnf + +USER 0 +ENTRYPOINT ["/vsp-intel-netsec"] diff --git a/Dockerfile.IntelP4 b/Dockerfile.IntelP4 new file mode 100644 index 000000000..148f447d2 --- /dev/null +++ b/Dockerfile.IntelP4 @@ -0,0 +1,51 @@ +FROM quay.io/centos/centos:stream9 + +ARG P4_NAME=fxp-net_linux-networking +ENV P4_NAME $P4_NAME + +ARG TARGETOS +ARG TARGETARCH +ENV ARCHSUFFIX="aarch64" + +COPY . /src +WORKDIR /src +RUN dnf install -y \ + kmod \ + gettext \ + python3-devel \ + pciutils \ + libnl3 \ + libedit \ + net-tools \ + libatomic \ + libconfig \ + gcc gcc-c++ \ + && dnf clean all \ + && python3 -m ensurepip --upgrade + +RUN mkdir -p /opt/${P4_NAME} +COPY cmd/intelvsp/$P4_NAME/* /opt/${P4_NAME}/ +COPY cmd/intelvsp/p4sdk/entrypoint.sh / +COPY cmd/intelvsp/p4sdk/es2k_skip_p4.conf.template / + +WORKDIR / + +# Add steps for cachito +ENV REMOTE_SOURCES=${REMOTE_SOURCES:-"./openshift/"} +ENV REMOTE_SOURCES_DIR=${REMOTE_SOURCES_DIR:-"/cachito"} +COPY ${REMOTE_SOURCES} ${REMOTE_SOURCES_DIR} +COPY openshift/install-dpu-p4.sh . +RUN chmod +x install-dpu-p4.sh \ + && ./install-dpu-p4.sh + +# Remove packages only needed for cachito. 
+RUN dnf remove -y gcc gcc-c++ \ + && dnf clean all \ + && rm -rf /var/cache/dnf + +COPY ./cmd/intelvsp/p4runtime-2023.11.0/p4 /opt/p4rt_proto +COPY ./cmd/intelvsp/p4runtime-2023.11.0/copy_p4rt_python_deps.sh /opt/p4rt_proto/ +RUN chmod a+x /opt/p4rt_proto/copy_p4rt_python_deps.sh +RUN /opt/p4rt_proto/copy_p4rt_python_deps.sh + +ENTRYPOINT ["/entrypoint.sh"] diff --git a/Dockerfile.IntelVSP b/Dockerfile.IntelVSP new file mode 100644 index 000000000..f186bc4f6 --- /dev/null +++ b/Dockerfile.IntelVSP @@ -0,0 +1,52 @@ +FROM --platform=$BUILDPLATFORM docker.io/library/golang:1.24-bookworm@sha256:1a6d4452c65dea36aac2e2d606b01b4a029ec90cc1ae53890540ce6173ea77ac AS builder +ARG TARGETOS +ARG TARGETARCH + +WORKDIR /workspace +COPY . . + +# Build directly to avoid GOARCH leaking into go-run helper tooling during cross builds. +RUN mkdir -p /workspace/bin && \ + GOMAXPROCS=2 CGO_ENABLED=0 GOOS=${TARGETOS:-linux} GOARCH=${TARGETARCH} \ + go build -o /workspace/bin/ipuplugin.${TARGETARCH} ./cmd/intelvsp/intelvsp.go + +FROM quay.io/centos/centos:stream9 +ARG TARGETARCH +ENV PYTHONUNBUFFERED=1 +WORKDIR / + +# https://github.com/grpc/grpc/issues/24556 +RUN dnf install -y \ + centos-release-nfv-openvswitch \ + && dnf install -y \ + NetworkManager iproute python3 python3-devel openssh-clients gcc gcc-c++ openvswitch3.4 \ + && python3 -m ensurepip --upgrade + +# By setting WORKDIR, directories are created automatically. +WORKDIR /opt/p4/p4-cp-nws/bin/ +RUN mkdir -p /opt/p4/p4-cp-nws/bin/p4 + +COPY ./cmd/intelvsp/fxp-net_linux-networking/fxp-net_linux-networking.pkg / +COPY ./cmd/intelvsp/p4rt-ctl /opt/p4/p4-cp-nws/bin/ + +# Add steps for cachito +ENV REMOTE_SOURCES=${REMOTE_SOURCES:-"./openshift/"} +ENV REMOTE_SOURCES_DIR=${REMOTE_SOURCES_DIR:-"/cachito"} +COPY ${REMOTE_SOURCES} ${REMOTE_SOURCES_DIR} +COPY openshift/install-dpu.sh . +RUN chmod +x install-dpu.sh \ + && ./install-dpu.sh + +# Remove packages only needed for cachito. 
+RUN dnf remove -y gcc gcc-c++ \ + && dnf clean all \ + && rm -rf /var/cache/dnf + +COPY ./cmd/intelvsp/p4runtime-2023.11.0/p4 /opt/p4rt_proto +COPY ./cmd/intelvsp/p4runtime-2023.11.0/copy_p4rt_python_deps.sh /opt/p4rt_proto +RUN chmod a+x /opt/p4rt_proto/copy_p4rt_python_deps.sh +RUN /opt/p4rt_proto/copy_p4rt_python_deps.sh + +COPY --chmod=755 --from=builder /workspace/bin/ipuplugin.${TARGETARCH} /ipuplugin +LABEL io.k8s.display-name="IPU OPI Plugin" +ENTRYPOINT ["/ipuplugin"] diff --git a/Dockerfile.daemon b/Dockerfile.daemon new file mode 100644 index 000000000..1b92777a3 --- /dev/null +++ b/Dockerfile.daemon @@ -0,0 +1,28 @@ +# Build the daemon and CNI binaries +FROM --platform=$BUILDPLATFORM docker.io/library/golang:1.24-bookworm@sha256:1a6d4452c65dea36aac2e2d606b01b4a029ec90cc1ae53890540ce6173ea77ac AS builder +ARG TARGETOS +ARG TARGETARCH + +WORKDIR /workspace +COPY . . + +# Build directly to avoid GOARCH leaking into go-run helper tooling during cross builds. +RUN mkdir -p /workspace/bin && \ + GOMAXPROCS=2 CGO_ENABLED=0 GOOS=${TARGETOS:-linux} GOARCH=${TARGETARCH} \ + go build -o /workspace/bin/daemon.${TARGETARCH} ./cmd/daemon/daemon.go && \ + GOMAXPROCS=2 CGO_ENABLED=0 GOOS=${TARGETOS:-linux} GOARCH=${TARGETARCH} \ + go build -o /workspace/bin/dpu-cni.${TARGETARCH} ./dpu-cni/dpu-cni.go + +FROM quay.io/centos/centos:stream9 +ARG TARGETARCH +WORKDIR / +COPY --from=builder /workspace/bin/daemon.${TARGETARCH} /daemon +COPY --from=builder /workspace/bin/dpu-cni.${TARGETARCH} /dpu-cni + +# Install hwdata to include pci.ids so jaypipes/ghw can run offline. 
+RUN dnf install -y hwdata ethtool \ + && dnf clean all \ + && rm -rf /var/cache/dnf + +USER 65532:65532 +ENTRYPOINT ["/daemon"] diff --git a/Dockerfile.inc b/Dockerfile.inc new file mode 100644 index 000000000..cbdf3ff90 --- /dev/null +++ b/Dockerfile.inc @@ -0,0 +1,6 @@ + +FROM localhost/dpu-operator:dev-base-arm64 + +ARG TARGETARCH + +COPY bin/manager.${TARGETARCH} /manager diff --git a/Dockerfile.mrvlCPAgent b/Dockerfile.mrvlCPAgent new file mode 100644 index 000000000..8aa86af54 --- /dev/null +++ b/Dockerfile.mrvlCPAgent @@ -0,0 +1,57 @@ +ARG TARGETARCH +FROM --platform=linux/${TARGETARCH} docker.io/library/golang:1.24-bookworm@sha256:1a6d4452c65dea36aac2e2d606b01b4a029ec90cc1ae53890540ce6173ea77ac AS stage1 +ARG TARGETOS +ARG TARGETARCH + +RUN apt-get update && apt-get install -y --no-install-recommends \ + gawk gcc g++ libconfig-dev make pkg-config \ + && rm -rf /var/lib/apt/lists/* + +WORKDIR /workspace +COPY . . + +RUN \ + set -x && \ + mkdir -p /cpagent-bin/ && \ + if [ "$TARGETARCH" = "arm64" ] ; then \ + export OCTEP_PATH="/workspace/pcie_ep_octeon_target/target/libs/octep_cp_lib" && \ + ln -nfs internal/daemon/vendor-specific-plugins/marvell/vendor/pcie_ep_octeon_target.25.03.0/ /workspace/pcie_ep_octeon_target && \ + cd "/workspace/pcie_ep_octeon_target/target/libs/octep_cp_lib" && \ + make CFLAGS="-DUSE_PEM_AND_DPI_PF=1" && \ + cd "/workspace/pcie_ep_octeon_target/target/apps/octep_cp_agent" && \ + make CFLAGS="$(pkg-config --cflags libconfig) -I$OCTEP_PATH/include" \ + LDFLAGS="$(pkg-config --libs libconfig) -L$OCTEP_PATH/bin/lib" && \ + cp bin/bin/octep_cp_agent /cpagent-bin/octep_cp_agent.25.03.0 && \ + ln -nfs internal/daemon/vendor-specific-plugins/marvell/vendor/pcie_ep_octeon_target/ /workspace/pcie_ep_octeon_target && \ + cd "/workspace/pcie_ep_octeon_target/target/libs/octep_cp_lib" && \ + make CFLAGS="-DUSE_PEM_AND_DPI_PF=1" && \ + cd "/workspace/pcie_ep_octeon_target/target/apps/octep_cp_agent" && \ + make CFLAGS="$(pkg-config --cflags 
libconfig) -I$OCTEP_PATH/include" \ + LDFLAGS="$(pkg-config --libs libconfig) -L$OCTEP_PATH/bin/lib" && \ + cp bin/bin/octep_cp_agent /cpagent-bin/ && \ + cp cn106xx.cfg /cpagent-bin/ && \ + echo "build completed" ; \ + fi + +# Due to https://github.com/golang/go/issues/70329 cross-compilation hangs at times. +# As a temporary workaround, we can try specifying GOMAXPROCS=2 to relieve this issue. +WORKDIR /workspace +RUN GOMAXPROCS=2 CGO_ENABLED=0 GOOS=${TARGETOS:-linux} GOARCH=${TARGETARCH} \ + go build -o /cpagent-bin/cp-agent-run internal/daemon/vendor-specific-plugins/marvell/cp-agent/cp-agent-run.go + +FROM quay.io/centos/centos:stream9 +COPY --from=stage1 /cpagent-bin/ /usr/bin/ + +RUN dnf update -y \ + && dnf install -y \ + net-tools \ + kmod \ + pciutils \ + iputils \ + iproute \ + libconfig \ + && dnf clean all \ + && rm -rf /var/cache/dnf + +USER 0 +ENTRYPOINT ["/usr/bin/cp-agent-run"] diff --git a/Dockerfile.mrvlVSP b/Dockerfile.mrvlVSP new file mode 100644 index 000000000..8772044c9 --- /dev/null +++ b/Dockerfile.mrvlVSP @@ -0,0 +1,29 @@ +FROM --platform=$BUILDPLATFORM docker.io/library/golang:1.24-bookworm@sha256:1a6d4452c65dea36aac2e2d606b01b4a029ec90cc1ae53890540ce6173ea77ac AS builder +ARG TARGETOS +ARG TARGETARCH + +WORKDIR /workspace +COPY . . + +# Build directly to avoid GOARCH leaking into go-run helper tooling during cross builds. 
+RUN mkdir -p /workspace/bin && \ + GOMAXPROCS=2 CGO_ENABLED=0 GOOS=${TARGETOS:-linux} GOARCH=${TARGETARCH} \ + go build -o /workspace/bin/vsp-mrvl.${TARGETARCH} ./internal/daemon/vendor-specific-plugins/marvell/main.go + +FROM quay.io/centos/centos:stream9 +ARG TARGETARCH +COPY --from=builder /workspace/bin/vsp-mrvl.${TARGETARCH} /vsp-mrvl + +RUN dnf update -y \ + && dnf install -y \ + net-tools \ + kmod \ + pciutils \ + iputils \ + iproute \ + ethtool \ + && dnf clean all \ + && rm -rf /var/cache/dnf + +USER 0 +ENTRYPOINT ["/vsp-mrvl"] diff --git a/Dockerfile.networkResourcesInjector b/Dockerfile.networkResourcesInjector new file mode 100644 index 000000000..c6f6069ba --- /dev/null +++ b/Dockerfile.networkResourcesInjector @@ -0,0 +1,15 @@ +FROM --platform=$BUILDPLATFORM docker.io/library/golang:1.24-bookworm@sha256:1a6d4452c65dea36aac2e2d606b01b4a029ec90cc1ae53890540ce6173ea77ac AS builder +ARG TARGETOS +ARG TARGETARCH + +WORKDIR /workspace +COPY . . +RUN mkdir -p /workspace/bin && \ + GOMAXPROCS=2 CGO_ENABLED=0 GOOS=${TARGETOS:-linux} GOARCH=${TARGETARCH} \ + go build -o /workspace/bin/nri.${TARGETARCH} ./cmd/nri/networkresourcesinjector.go + +FROM gcr.io/distroless/static-debian12:nonroot@sha256:a9329520abc449e3b14d5bc3a6ffae065bdde0f02667fa10880c49b35c109fd1 +ARG TARGETARCH +WORKDIR / +COPY --from=builder /workspace/bin/nri.${TARGETARCH} /webhook +ENTRYPOINT ["/webhook"] diff --git a/openshift/install-dpu.sh b/openshift/install-dpu.sh index 0a4f5161b..06e9e493f 100644 --- a/openshift/install-dpu.sh +++ b/openshift/install-dpu.sh @@ -14,7 +14,9 @@ else fi # Install the packages in order of build dependency to avoid issues during installation. -python3 -m pip install ${PIP_OPTS} -r requirements-build.txt +# CentOS/RHEL-family images often ship setuptools via RPM; ignore-installed avoids pip trying +# (and failing) to uninstall the RPM-provided setuptools when requirements pin a newer version. 
+python3 -m pip install ${PIP_OPTS} --ignore-installed -r requirements-build.txt python3 -m pip install ${PIP_OPTS} -r requirements.txt rm -rf ${REMOTE_SOURCES_DIR} diff --git a/taskfile.yaml b/taskfile.yaml index 595f4eec0..72742b4d8 100644 --- a/taskfile.yaml +++ b/taskfile.yaml @@ -15,7 +15,12 @@ vars: BINDIR_ABS: sh: if [[ "{{.BINDIR}}" = /* ]]; then echo "{{.BINDIR}}"; else echo "$(pwd)/{{.BINDIR}}"; fi REGISTRY: - sh: hostname | sed 's/$/:5000/' + sh: | + if [ -n "${REGISTRY:-}" ]; then + echo "${REGISTRY}" + else + hostname | sed 's/$/:5000/' + fi ENVTEST_K8S_VERSION: 1.31.0 KUSTOMIZE_VERSION: v5.6.0 GINKGO_VERSION: diff --git a/taskfiles/images.yaml b/taskfiles/images.yaml index 355c1e83c..1d25f2a55 100644 --- a/taskfiles/images.yaml +++ b/taskfiles/images.yaml @@ -1,5 +1,8 @@ version: "3" +vars: + DOCKERFILE_SUFFIX: '{{ default "" .DOCKERFILE_SUFFIX }}' + tasks: go-cache: cmds: @@ -16,6 +19,13 @@ tasks: DOCKERFILE: "{{ .DOCKERFILE }}" PLATFORM: "{{.PLATFORM}}" INCREMENTAL: "{{ .INCREMENTAL }}" + BUILDPLATFORM: + sh: | + case "$(uname -m)" in + x86_64) echo "linux/amd64" ;; + aarch64|arm64) echo "linux/arm64" ;; + *) echo "linux/$(uname -m)" ;; + esac BUILDAH_OPTIONS: sh: | if [ "{{.INCREMENTAL}}" = "true" ]; then @@ -40,6 +50,9 @@ tasks: {{.BUILDAH_OPTIONS}} --manifest {{.NAME}}-manifest --platform linux/{{.PLATFORM}} + --build-arg BUILDPLATFORM={{.BUILDPLATFORM}} + --build-arg TARGETOS=linux + --build-arg TARGETARCH={{.PLATFORM}} -v {{.DPU_OPERATOR_TEMP_DIR}}/go-cache:/go:z -f {{.DOCKERFILE}} -t {{.NAME}}-{{.PLATFORM}} @@ -145,12 +158,12 @@ tasks: - task: build-image vars: NAME: dpu-operator - DOCKERFILE: Dockerfile.rhel + DOCKERFILE: Dockerfile{{.DOCKERFILE_SUFFIX}} PLATFORM: amd64 - task: build-image vars: NAME: dpu-operator - DOCKERFILE: Dockerfile.rhel + DOCKERFILE: Dockerfile{{.DOCKERFILE_SUFFIX}} PLATFORM: arm64 clean-image-manager: @@ -174,12 +187,12 @@ tasks: - task: build-image vars: NAME: dpu-daemon - DOCKERFILE: Dockerfile.daemon.rhel + 
DOCKERFILE: Dockerfile.daemon{{.DOCKERFILE_SUFFIX}} PLATFORM: amd64 - task: build-image vars: NAME: dpu-daemon - DOCKERFILE: Dockerfile.daemon.rhel + DOCKERFILE: Dockerfile.daemon{{.DOCKERFILE_SUFFIX}} PLATFORM: arm64 clean-image-daemon: @@ -203,12 +216,12 @@ tasks: - task: build-image vars: NAME: mrvl-vsp - DOCKERFILE: Dockerfile.mrvlVSP.rhel + DOCKERFILE: Dockerfile.mrvlVSP{{.DOCKERFILE_SUFFIX}} PLATFORM: amd64 - task: build-image vars: NAME: mrvl-vsp - DOCKERFILE: Dockerfile.mrvlVSP.rhel + DOCKERFILE: Dockerfile.mrvlVSP{{.DOCKERFILE_SUFFIX}} PLATFORM: arm64 clean-image-marvell-vsp: @@ -226,7 +239,7 @@ tasks: - task: build-image vars: NAME: mrvl-cpagent - DOCKERFILE: Dockerfile.mrvlCPAgent.rhel + DOCKERFILE: Dockerfile.mrvlCPAgent{{.DOCKERFILE_SUFFIX}} PLATFORM: arm64 clean-image-marvell-cpagent: @@ -250,12 +263,12 @@ tasks: - task: build-image vars: NAME: intel-netsec-vsp - DOCKERFILE: Dockerfile.IntelNetSecVSP.rhel + DOCKERFILE: Dockerfile.IntelNetSecVSP{{.DOCKERFILE_SUFFIX}} PLATFORM: amd64 - task: build-image vars: NAME: intel-netsec-vsp - DOCKERFILE: Dockerfile.IntelNetSecVSP.rhel + DOCKERFILE: Dockerfile.IntelNetSecVSP{{.DOCKERFILE_SUFFIX}} PLATFORM: arm64 clean-image-intel-netsec-vsp: @@ -279,12 +292,12 @@ tasks: - task: build-image vars: NAME: intel-vsp - DOCKERFILE: Dockerfile.IntelVSP.rhel + DOCKERFILE: Dockerfile.IntelVSP{{.DOCKERFILE_SUFFIX}} PLATFORM: amd64 - task: build-image vars: NAME: intel-vsp - DOCKERFILE: Dockerfile.IntelVSP.rhel + DOCKERFILE: Dockerfile.IntelVSP{{.DOCKERFILE_SUFFIX}} PLATFORM: arm64 clean-image-intel-vsp: @@ -302,7 +315,7 @@ tasks: - task: build-image vars: NAME: intel-vsp-p4 - DOCKERFILE: Dockerfile.IntelP4.rhel + DOCKERFILE: Dockerfile.IntelP4{{.DOCKERFILE_SUFFIX}} PLATFORM: arm64 clean-image-intel-vsp-p4: @@ -326,12 +339,12 @@ tasks: - task: build-image vars: NAME: network-resources-injector - DOCKERFILE: Dockerfile.networkResourcesInjector.rhel + DOCKERFILE: Dockerfile.networkResourcesInjector{{.DOCKERFILE_SUFFIX}} 
PLATFORM: amd64 - task: build-image vars: NAME: network-resources-injector - DOCKERFILE: Dockerfile.networkResourcesInjector.rhel + DOCKERFILE: Dockerfile.networkResourcesInjector{{.DOCKERFILE_SUFFIX}} PLATFORM: arm64 clean-image-network-resources-injector: @@ -351,6 +364,17 @@ tasks: - task: clean-image-intel-netsec-vsp - task: clean-image-network-resources-injector + clean-image-all-rhel: + cmds: + - task: clean-image-manager + - task: clean-image-daemon + - task: clean-image-intel-vsp + - task: clean-image-intel-vsp-p4 + - task: clean-image-marvell-vsp + - task: clean-image-marvell-cpagent + - task: clean-image-intel-netsec-vsp + - task: clean-image-network-resources-injector + build-image-all: # build all the binaries in parallel for speed # they will be picked up by the build-image-* targets @@ -367,6 +391,36 @@ tasks: - task: build-image-network-resources-injector - task: push-image-all + build-image-all-rhel: + deps: + - build-bin-all + cmds: + - task: build-image-manager + vars: + DOCKERFILE_SUFFIX: .rhel + - task: build-image-daemon + vars: + DOCKERFILE_SUFFIX: .rhel + - task: build-image-intel-vsp + vars: + DOCKERFILE_SUFFIX: .rhel + - task: build-image-intel-vsp-p4 + vars: + DOCKERFILE_SUFFIX: .rhel + - task: build-image-marvell-vsp + vars: + DOCKERFILE_SUFFIX: .rhel + - task: build-image-intel-netsec-vsp + vars: + DOCKERFILE_SUFFIX: .rhel + - task: build-image-marvell-cpagent + vars: + DOCKERFILE_SUFFIX: .rhel + - task: build-image-network-resources-injector + vars: + DOCKERFILE_SUFFIX: .rhel + - task: push-image-all + push-image-all: deps: - task: push-image-helper From 9dd7a6604afb14ee015b11598453b43defe314a5 Mon Sep 17 00:00:00 2001 From: Sam DaSilva Date: Fri, 27 Feb 2026 20:10:05 -0500 Subject: [PATCH 03/13] fix(tasks): make local-registry push and tool bootstrap reliable - add PUSH_TLS_VERIFY (default true) to push-image-helper - allow --tls-verify=false for insecure local registries - unset GOOS/GOARCH for go-install tool tasks 
(kustomize/ginkgo/controller-gen/envtest) Signed-off-by: Sam DaSilva --- taskfile.yaml | 7 ++++--- taskfiles/operator-sdk.yaml | 5 ++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/taskfile.yaml b/taskfile.yaml index 72742b4d8..865ff68ea 100644 --- a/taskfile.yaml +++ b/taskfile.yaml @@ -21,6 +21,7 @@ vars: else hostname | sed 's/$/:5000/' fi + PUSH_TLS_VERIFY: '{{ default "true" .PUSH_TLS_VERIFY }}' ENVTEST_K8S_VERSION: 1.31.0 KUSTOMIZE_VERSION: v5.6.0 GINKGO_VERSION: @@ -64,7 +65,7 @@ tasks: SOURCE: '{{.SOURCE}}' IMAGE: '{{.IMAGE}}' cmds: - - buildah manifest push --all '{{.SOURCE}}-manifest' 'docker://{{.IMAGE}}' + - buildah manifest push --all --tls-verify={{.PUSH_TLS_VERIFY}} '{{.SOURCE}}-manifest' 'docker://{{.IMAGE}}' undeploy-helper: internal: true @@ -115,7 +116,7 @@ tasks: - test -s {{.BINDIR}}/setup-envtest - ./{{.BINDIR}}/setup-envtest --help | head -1 | grep -q {{.SETUP_ENVTEST_VERSION}} cmds: - - GOBIN={{.BINDIR_ABS}} GOFLAGS='' go install sigs.k8s.io/controller-runtime/tools/setup-envtest@{{.SETUP_ENVTEST_VERSION}} + - env -u GOOS -u GOARCH GOBIN={{.BINDIR_ABS}} GOFLAGS='' go install sigs.k8s.io/controller-runtime/tools/setup-envtest@{{.SETUP_ENVTEST_VERSION}} deploy: deps: @@ -261,7 +262,7 @@ tasks: - test -s {{.BINDIR}}/controller-gen - ./{{.BINDIR}}/controller-gen --version | grep -q {{.CONTROLLER_TOOLS_VERSION}} cmds: - - GOBIN={{.BINDIR_ABS}} GOFLAGS='' go install sigs.k8s.io/controller-tools/cmd/controller-gen@{{.CONTROLLER_TOOLS_VERSION}} + - env -u GOOS -u GOARCH GOBIN={{.BINDIR_ABS}} GOFLAGS='' go install sigs.k8s.io/controller-tools/cmd/controller-gen@{{.CONTROLLER_TOOLS_VERSION}} test: deps: diff --git a/taskfiles/operator-sdk.yaml b/taskfiles/operator-sdk.yaml index 717ae6862..89fb93e51 100644 --- a/taskfiles/operator-sdk.yaml +++ b/taskfiles/operator-sdk.yaml @@ -4,7 +4,7 @@ tasks: kustomize: cmds: - mkdir -p {{.BINDIR}} - - GOBIN={{.BINDIR_ABS}} GOFLAGS='' GO111MODULE=on go install 
sigs.k8s.io/kustomize/kustomize/v5@{{.KUSTOMIZE_VERSION}} + - env -u GOOS -u GOARCH GOBIN={{.BINDIR_ABS}} GOFLAGS='' GO111MODULE=on go install sigs.k8s.io/kustomize/kustomize/v5@{{.KUSTOMIZE_VERSION}} - echo "{{.KUSTOMIZE_VERSION}}" > {{.BINDIR}}/kustomize_version status: - test -d {{.BINDIR}} @@ -17,7 +17,7 @@ tasks: - test -x {{.BINDIR}}/ginkgo - ./{{.BINDIR}}/ginkgo version | grep -q "{{.GINKGO_VERSION}}" cmds: - - GOBIN={{.BINDIR_ABS}} GOFLAGS='' go install github.com/onsi/ginkgo/v2/ginkgo@v{{.GINKGO_VERSION}} + - env -u GOOS -u GOARCH GOBIN={{.BINDIR_ABS}} GOFLAGS='' go install github.com/onsi/ginkgo/v2/ginkgo@v{{.GINKGO_VERSION}} ## Download operator-sdk locally if necessary. operator-sdk: @@ -52,4 +52,3 @@ tasks: else ln -sf $(which opm) {{.BINDIR}}/opm fi - From 6feb3af9ae6599d70abece79ce439072a6de096d Mon Sep 17 00:00:00 2001 From: Sam DaSilva Date: Fri, 27 Feb 2026 20:28:50 -0500 Subject: [PATCH 04/13] refactor(cli): switch deploy/undeploy flow from oc to kubectl - replace oc with kubectl in taskfile deploy and undeploy paths - update README example command accordingly Signed-off-by: Sam DaSilva --- README.md | 2 +- taskfile.yaml | 26 +++++++++++++------------- 2 files changed, 14 insertions(+), 14 deletions(-) diff --git a/README.md b/README.md index 6d3ffda58..7054863c6 100644 --- a/README.md +++ b/README.md @@ -114,7 +114,7 @@ kubectl create -f examples/config.yaml After creating the `DpuOperatorConfig` CR, you should see the following pods: ```sh -oc get pods -n openshift-dpu-operator -o wide +kubectl get pods -n openshift-dpu-operator -o wide NAME READY STATUS RESTARTS AGE IP NODE NOMINATED NODE READINESS GATES dpu-daemon-rn6mc 1/1 Running 0 22h 192.168.122.218 worker-229 dpu-daemon-xrrlg 1/1 Running 0 22h 192.168.122.90 worker-229-ptl diff --git a/taskfile.yaml b/taskfile.yaml index 865ff68ea..9f6878940 100644 --- a/taskfile.yaml +++ b/taskfile.yaml @@ -74,13 +74,13 @@ tasks: vars: KUBECONFIG: '{{.KUBECONFIG}}' status: - - 
NS=$(KUBECONFIG={{.KUBECONFIG}} oc get ns openshift-dpu-operator) + - NS=$(KUBECONFIG={{.KUBECONFIG}} kubectl get ns openshift-dpu-operator) if [[ -n "$NS" ]]; then false else true fi cmds: # this will block untill everything is cleaned up and bringing system back into a clean state as if the operator was never installed - - KUBECONFIG={{.KUBECONFIG}} oc delete -f examples/config.yaml || true - - bin/kustomize build config/default | KUBECONFIG={{.KUBECONFIG}} oc delete --ignore-not-found=true -f - - - KUBECONFIG={{.KUBECONFIG}} oc wait --for=delete ns openshift-dpu-operator --timeout=300s + - KUBECONFIG={{.KUBECONFIG}} kubectl delete -f examples/config.yaml || true + - bin/kustomize build config/default | KUBECONFIG={{.KUBECONFIG}} kubectl delete --ignore-not-found=true -f - + - KUBECONFIG={{.KUBECONFIG}} kubectl wait --for=delete ns openshift-dpu-operator --timeout=300s - echo "Namespace 'openshift-dpu-operator' has been removed." undeploy: @@ -126,12 +126,12 @@ tasks: vars: KUBECONFIG_DPU: "{{.KUBECONFIG_DPU}}" KUBECONFIG_HOST: "{{.KUBECONFIG_HOST}}" - - bin/kustomize build bin | KUBECONFIG="{{.KUBECONFIG_DPU}}" oc apply -f - - - bin/kustomize build bin | KUBECONFIG="{{.KUBECONFIG_HOST}}" oc apply -f - - - KUBECONFIG="{{.KUBECONFIG_DPU}}" oc -n openshift-dpu-operator wait --for=condition=Available deployment/dpu-operator-controller-manager --timeout=300s - - KUBECONFIG="{{.KUBECONFIG_HOST}}" oc -n openshift-dpu-operator wait --for=condition=Available deployment/dpu-operator-controller-manager --timeout=300s - - KUBECONFIG="{{.KUBECONFIG_DPU}}" oc -n openshift-dpu-operator wait --for=condition=ready pod --all --timeout=300s - - KUBECONFIG="{{.KUBECONFIG_HOST}}" oc -n openshift-dpu-operator wait --for=condition=ready pod --all --timeout=300s + - bin/kustomize build bin | KUBECONFIG="{{.KUBECONFIG_DPU}}" kubectl apply -f - + - bin/kustomize build bin | KUBECONFIG="{{.KUBECONFIG_HOST}}" kubectl apply -f - + - KUBECONFIG="{{.KUBECONFIG_DPU}}" kubectl -n 
openshift-dpu-operator wait --for=condition=Available deployment/dpu-operator-controller-manager --timeout=300s + - KUBECONFIG="{{.KUBECONFIG_HOST}}" kubectl -n openshift-dpu-operator wait --for=condition=Available deployment/dpu-operator-controller-manager --timeout=300s + - KUBECONFIG="{{.KUBECONFIG_DPU}}" kubectl -n openshift-dpu-operator wait --for=condition=ready pod --all --timeout=300s + - KUBECONFIG="{{.KUBECONFIG_HOST}}" kubectl -n openshift-dpu-operator wait --for=condition=ready pod --all --timeout=300s - echo "DPU operator deployment complete - controller manager and webhook are ready" deploy-1c: @@ -141,9 +141,9 @@ tasks: - task: undeploy-1c vars: KUBECONFIG_HOST: "{{.KUBECONFIG_HOST}}" - - bin/kustomize build bin | KUBECONFIG="{{.KUBECONFIG_HOST}}" oc apply -f - - - KUBECONFIG="{{.KUBECONFIG_HOST}}" oc -n openshift-dpu-operator wait --for=condition=Available deployment/dpu-operator-controller-manager --timeout=300s - - KUBECONFIG="{{.KUBECONFIG_HOST}}" oc -n openshift-dpu-operator wait --for=condition=ready pod --all --timeout=300s + - bin/kustomize build bin | KUBECONFIG="{{.KUBECONFIG_HOST}}" kubectl apply -f - + - KUBECONFIG="{{.KUBECONFIG_HOST}}" kubectl -n openshift-dpu-operator wait --for=condition=Available deployment/dpu-operator-controller-manager --timeout=300s + - KUBECONFIG="{{.KUBECONFIG_HOST}}" kubectl -n openshift-dpu-operator wait --for=condition=ready pod --all --timeout=300s prepare-e2e-test: cmds: From 1da77590a179e39e7060b7cd42c0413be586bb14 Mon Sep 17 00:00:00 2001 From: Sam DaSilva Date: Mon, 2 Mar 2026 21:56:52 -0500 Subject: [PATCH 05/13] fix: Change kubeconfig path to use KUBECONFIG Signed-off-by: Sam DaSilva --- hack/cluster-configs/ocp-tft-config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hack/cluster-configs/ocp-tft-config.yaml b/hack/cluster-configs/ocp-tft-config.yaml index b0bf6f50f..6e2009815 100644 --- a/hack/cluster-configs/ocp-tft-config.yaml +++ b/hack/cluster-configs/ocp-tft-config.yaml 
@@ -18,4 +18,4 @@ tft: - name: "$worker" sriov: "true" secondary_network_nad: "default-sriov-net" -kubeconfig: "/root/kubeconfig.ocpcluster" +kubeconfig: "$KUBECONFIG" From ccb18bb6077a5485ea4c0b5f0bf176d2fc6eb905 Mon Sep 17 00:00:00 2001 From: Sam DaSilva Date: Wed, 18 Mar 2026 17:12:06 -0400 Subject: [PATCH 06/13] feat(cluster): Add Vanilla cluster flavor as the default Signed-off-by: Sam DaSilva --- internal/testutils/testcluster.go | 8 +------- internal/utils/cluster_environment.go | 10 ++++++---- internal/utils/path_manager.go | 4 ++-- 3 files changed, 9 insertions(+), 13 deletions(-) diff --git a/internal/testutils/testcluster.go b/internal/testutils/testcluster.go index 90325dfbf..be2736566 100644 --- a/internal/testutils/testcluster.go +++ b/internal/testutils/testcluster.go @@ -861,7 +861,7 @@ func LabelNodesWithDpu(c client.Client) error { switch flavour { case utils.MicroShiftFlavour, utils.KindFlavour: return LabelAllNodesWithDpu(c) - case utils.OpenShiftFlavour: + case utils.OpenShiftFlavour, utils.VanillaFlavour: return LabelWorkerNodesWithDpu(c) default: return fmt.Errorf("unsupported cluster flavor %s", flavour) @@ -925,7 +925,6 @@ func WaitForDPUReady(c client.Client) error { return allReady, nil }) - if err != nil { return fmt.Errorf("timeout waiting for all DPU CRs to be Ready: %w", err) } @@ -954,7 +953,6 @@ func WaitForDPUReady(c client.Client) error { } func WaitForDPU(c client.Client) error { - var dpuName string err := wait.PollUntilContextTimeout(context.TODO(), time.Second, TestInitialSetupTimeout*3, true, func(ctx context.Context) (bool, error) { dpuList := &configv1.DataProcessingUnitList{} @@ -970,7 +968,6 @@ func WaitForDPU(c client.Client) error { return false, nil }) - if err != nil { return fmt.Errorf("timeout waiting for DPU resource: %w", err) } @@ -990,7 +987,6 @@ func WaitForDPU(c client.Client) error { return false, nil }) - if err != nil { return fmt.Errorf("timeout waiting for DPU %s to be Ready: %w", dpuName, err) } @@ 
-999,7 +995,6 @@ func WaitForDPU(c client.Client) error { } func WaitForAllPodsReady(c client.Client, namespace string) error { - err := wait.PollImmediate(time.Second, TestInitialSetupTimeout*3, func() (bool, error) { podList := &corev1.PodList{} err := c.List(context.TODO(), podList, client.InNamespace(namespace)) @@ -1026,7 +1021,6 @@ func WaitForAllPodsReady(c client.Client, namespace string) error { return true, nil }) - if err != nil { return fmt.Errorf("pods not ready after timeout: %w", err) } diff --git a/internal/utils/cluster_environment.go b/internal/utils/cluster_environment.go index b55533b0f..f36b4e750 100644 --- a/internal/utils/cluster_environment.go +++ b/internal/utils/cluster_environment.go @@ -25,6 +25,7 @@ func NewClusterEnvironment(client client.Client) *ClusterEnvironment { type Flavour string const ( + VanillaFlavour Flavour = "Vanilla" OpenShiftFlavour Flavour = "OpenShift" MicroShiftFlavour Flavour = "MicroShift" KindFlavour Flavour = "Kind" @@ -34,7 +35,7 @@ const ( func (ce *ClusterEnvironment) Flavour(ctx context.Context) (Flavour, error) { microShift, err := ce.isMicroShift(ctx) if err != nil { - return UnknownFlavour, err + return VanillaFlavour, err } if microShift { return MicroShiftFlavour, nil @@ -42,7 +43,7 @@ func (ce *ClusterEnvironment) Flavour(ctx context.Context) (Flavour, error) { openShift, err := ce.isOpenShift(ctx) if err != nil { - return UnknownFlavour, err + return VanillaFlavour, err } if openShift { return OpenShiftFlavour, nil @@ -50,12 +51,13 @@ func (ce *ClusterEnvironment) Flavour(ctx context.Context) (Flavour, error) { kind, err := ce.isKind(ctx) if err != nil { - return UnknownFlavour, err + return VanillaFlavour, err } if kind { return KindFlavour, nil } - return UnknownFlavour, nil + + return VanillaFlavour, nil } func (ce *ClusterEnvironment) isMicroShift(ctx context.Context) (bool, error) { diff --git a/internal/utils/path_manager.go b/internal/utils/path_manager.go index a445bcf87..ad5ca6e97 100644 --- 
a/internal/utils/path_manager.go +++ b/internal/utils/path_manager.go @@ -45,10 +45,10 @@ func (p *PathManager) CniHostDir(clusterFlavour Flavour, filesystemMode Filesyst case clusterFlavour == MicroShiftFlavour && filesystemMode == ImageMode: return p.wrap("/run/cni"), nil // OpenShift typically uses /var/lib/cni regardless of filesystem mode since nodes are always coreos based - case clusterFlavour == OpenShiftFlavour: + case clusterFlavour == OpenShiftFlavour || (clusterFlavour == VanillaFlavour && filesystemMode == ImageMode): return p.wrap("/var/lib/cni"), nil // MicroShift with PackageMode and Kind use /opt/cni - case (clusterFlavour == MicroShiftFlavour && filesystemMode == PackageMode) || clusterFlavour == KindFlavour: + case (clusterFlavour == MicroShiftFlavour && filesystemMode == PackageMode) || (clusterFlavour == VanillaFlavour && filesystemMode == PackageMode) || clusterFlavour == KindFlavour: return p.wrap("/opt/cni"), nil default: return "", fmt.Errorf("unknown combination of cluster flavour (%s) and filesystem mode (%s)", clusterFlavour, filesystemMode) From ca1ae61ba803a84055f733337c8b4360eb9be1fa Mon Sep 17 00:00:00 2001 From: Sam DaSilva Date: Mon, 23 Mar 2026 14:32:01 -0400 Subject: [PATCH 07/13] feat: add flavor-aware NRI TLS provisioning Signed-off-by: Sam DaSilva --- .../00_service.yaml | 11 ++++ .../01_webhook.yaml | 27 ++++++++ .../02_serviceaccount.yaml | 5 ++ .../03_secret.yaml | 8 +++ ...lusterrole_network_resources_injector.yaml | 59 +++++++++++++++++ .../05_clusterrole_secrets.yaml | 11 ++++ .../06_clusterrole_webhook_configs.yaml | 11 ++++ .../07_clusterrole_service.yaml | 17 +++++ .../08_clusterrole_configmaps.yaml | 11 ++++ ...twork_resources_injector_role_binding.yaml | 11 ++++ .../10_clusterrolebinding_secrets.yaml | 12 ++++ ...11_clusterrolebinding_webhook_configs.yaml | 12 ++++ .../12_clusterrolebinding_service.yaml | 12 ++++ .../13_clusterrolebinding_configmaps.yaml | 12 ++++ .../14_server.yaml | 63 +++++++++++++++++++ 
.../15_issuer.yaml | 7 +++ .../16_certificate.yaml | 18 ++++++ .../dpuoperatorconfig_controller.go | 44 ++++++++++++- 18 files changed, 350 insertions(+), 1 deletion(-) create mode 100644 internal/controller/bindata/network-resources-injector-certmanager/00_service.yaml create mode 100644 internal/controller/bindata/network-resources-injector-certmanager/01_webhook.yaml create mode 100644 internal/controller/bindata/network-resources-injector-certmanager/02_serviceaccount.yaml create mode 100644 internal/controller/bindata/network-resources-injector-certmanager/03_secret.yaml create mode 100644 internal/controller/bindata/network-resources-injector-certmanager/04_clusterrole_network_resources_injector.yaml create mode 100644 internal/controller/bindata/network-resources-injector-certmanager/05_clusterrole_secrets.yaml create mode 100644 internal/controller/bindata/network-resources-injector-certmanager/06_clusterrole_webhook_configs.yaml create mode 100644 internal/controller/bindata/network-resources-injector-certmanager/07_clusterrole_service.yaml create mode 100644 internal/controller/bindata/network-resources-injector-certmanager/08_clusterrole_configmaps.yaml create mode 100644 internal/controller/bindata/network-resources-injector-certmanager/09_clusterrolebinding_network_resources_injector_role_binding.yaml create mode 100644 internal/controller/bindata/network-resources-injector-certmanager/10_clusterrolebinding_secrets.yaml create mode 100644 internal/controller/bindata/network-resources-injector-certmanager/11_clusterrolebinding_webhook_configs.yaml create mode 100644 internal/controller/bindata/network-resources-injector-certmanager/12_clusterrolebinding_service.yaml create mode 100644 internal/controller/bindata/network-resources-injector-certmanager/13_clusterrolebinding_configmaps.yaml create mode 100644 internal/controller/bindata/network-resources-injector-certmanager/14_server.yaml create mode 100644 
internal/controller/bindata/network-resources-injector-certmanager/15_issuer.yaml create mode 100644 internal/controller/bindata/network-resources-injector-certmanager/16_certificate.yaml diff --git a/internal/controller/bindata/network-resources-injector-certmanager/00_service.yaml b/internal/controller/bindata/network-resources-injector-certmanager/00_service.yaml new file mode 100644 index 000000000..a29632957 --- /dev/null +++ b/internal/controller/bindata/network-resources-injector-certmanager/00_service.yaml @@ -0,0 +1,11 @@ +apiVersion: v1 +kind: Service +metadata: + name: network-resources-injector-service + namespace: {{.Namespace}} +spec: + ports: + - port: 443 + targetPort: 8443 + selector: + app: network-resources-injector diff --git a/internal/controller/bindata/network-resources-injector-certmanager/01_webhook.yaml b/internal/controller/bindata/network-resources-injector-certmanager/01_webhook.yaml new file mode 100644 index 000000000..47f119682 --- /dev/null +++ b/internal/controller/bindata/network-resources-injector-certmanager/01_webhook.yaml @@ -0,0 +1,27 @@ +--- +apiVersion: admissionregistration.k8s.io/v1 +kind: MutatingWebhookConfiguration +metadata: + name: network-resources-injector-config + annotations: + cert-manager.io/inject-ca-from: "{{.Namespace}}/network-resources-injector-serving-cert" +webhooks: + - name: network-resources-injector-config.k8s.io + sideEffects: None + admissionReviewVersions: ["v1", "v1beta1"] + clientConfig: + service: + name: network-resources-injector-service + namespace: {{.Namespace}} + path: "/mutate" + namespaceSelector: + matchExpressions: + - key: "kubernetes.io/metadata.name" + operator: "NotIn" + values: + - {{.Namespace}} + rules: + - operations: [ "CREATE" ] + apiGroups: ["apps", ""] + apiVersions: ["v1"] + resources: ["pods"] diff --git a/internal/controller/bindata/network-resources-injector-certmanager/02_serviceaccount.yaml 
b/internal/controller/bindata/network-resources-injector-certmanager/02_serviceaccount.yaml new file mode 100644 index 000000000..5a4113857 --- /dev/null +++ b/internal/controller/bindata/network-resources-injector-certmanager/02_serviceaccount.yaml @@ -0,0 +1,5 @@ +apiVersion: v1 +kind: ServiceAccount +metadata: + namespace: {{.Namespace}} + name: network-resources-injector-sa diff --git a/internal/controller/bindata/network-resources-injector-certmanager/03_secret.yaml b/internal/controller/bindata/network-resources-injector-certmanager/03_secret.yaml new file mode 100644 index 000000000..13b62fa1a --- /dev/null +++ b/internal/controller/bindata/network-resources-injector-certmanager/03_secret.yaml @@ -0,0 +1,8 @@ +apiVersion: v1 +kind: Secret +metadata: + name: network-resources-injector-sa-secret + namespace: {{.Namespace}} + annotations: + kubernetes.io/service-account.name: network-resources-injector-sa +type: kubernetes.io/service-account-token diff --git a/internal/controller/bindata/network-resources-injector-certmanager/04_clusterrole_network_resources_injector.yaml b/internal/controller/bindata/network-resources-injector-certmanager/04_clusterrole_network_resources_injector.yaml new file mode 100644 index 000000000..108989813 --- /dev/null +++ b/internal/controller/bindata/network-resources-injector-certmanager/04_clusterrole_network_resources_injector.yaml @@ -0,0 +1,59 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: network-resources-injector +rules: +- apiGroups: + - "" + resources: + - pods + verbs: + - '*' +- apiGroups: + - "" + resources: + - secrets + verbs: + - '*' +- apiGroups: + - "" + resources: + - serviceaccounts + verbs: + - get + - list + - watch + - create + - delete +- apiGroups: + - k8s.cni.cncf.io + resources: + - network-attachment-definitions + verbs: + - 'watch' + - 'list' + - 'get' +- apiGroups: + - "" + resources: + - configmaps + verbs: + - 'get' +- apiGroups: + - apps + resources: + - 
deployments + verbs: + - 'watch' + - 'list' + - 'get' +- apiGroups: + - security.openshift.io + resourceNames: + - anyuid + - hostnetwork + - privileged + resources: + - securitycontextconstraints + verbs: + - 'use' diff --git a/internal/controller/bindata/network-resources-injector-certmanager/05_clusterrole_secrets.yaml b/internal/controller/bindata/network-resources-injector-certmanager/05_clusterrole_secrets.yaml new file mode 100644 index 000000000..661f9b333 --- /dev/null +++ b/internal/controller/bindata/network-resources-injector-certmanager/05_clusterrole_secrets.yaml @@ -0,0 +1,11 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: network-resources-injector-secrets +rules: +- apiGroups: + - "" + resources: + - secrets + verbs: + - '*' diff --git a/internal/controller/bindata/network-resources-injector-certmanager/06_clusterrole_webhook_configs.yaml b/internal/controller/bindata/network-resources-injector-certmanager/06_clusterrole_webhook_configs.yaml new file mode 100644 index 000000000..d8a2e6e70 --- /dev/null +++ b/internal/controller/bindata/network-resources-injector-certmanager/06_clusterrole_webhook_configs.yaml @@ -0,0 +1,11 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: network-resources-injector-webhook-configs +rules: +- apiGroups: + - admissionregistration.k8s.io + resources: + - mutatingwebhookconfigurations + verbs: + - '*' diff --git a/internal/controller/bindata/network-resources-injector-certmanager/07_clusterrole_service.yaml b/internal/controller/bindata/network-resources-injector-certmanager/07_clusterrole_service.yaml new file mode 100644 index 000000000..7bec6c7ea --- /dev/null +++ b/internal/controller/bindata/network-resources-injector-certmanager/07_clusterrole_service.yaml @@ -0,0 +1,17 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: network-resources-injector-service +rules: +- apiGroups: + - "" + resources: + - services + 
verbs: + - '*' +- apiGroups: + - "" + resources: + - pods + verbs: + - '*' diff --git a/internal/controller/bindata/network-resources-injector-certmanager/08_clusterrole_configmaps.yaml b/internal/controller/bindata/network-resources-injector-certmanager/08_clusterrole_configmaps.yaml new file mode 100644 index 000000000..b58c11ad1 --- /dev/null +++ b/internal/controller/bindata/network-resources-injector-certmanager/08_clusterrole_configmaps.yaml @@ -0,0 +1,11 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: network-resources-injector-configmaps +rules: +- apiGroups: + - "" + resources: + - configmaps + verbs: + - 'get' diff --git a/internal/controller/bindata/network-resources-injector-certmanager/09_clusterrolebinding_network_resources_injector_role_binding.yaml b/internal/controller/bindata/network-resources-injector-certmanager/09_clusterrolebinding_network_resources_injector_role_binding.yaml new file mode 100644 index 000000000..65e17756b --- /dev/null +++ b/internal/controller/bindata/network-resources-injector-certmanager/09_clusterrolebinding_network_resources_injector_role_binding.yaml @@ -0,0 +1,11 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: network-resources-injector-role-binding +roleRef: + kind: ClusterRole + name: network-resources-injector +subjects: +- kind: ServiceAccount + name: network-resources-injector-sa + namespace: {{.Namespace}} diff --git a/internal/controller/bindata/network-resources-injector-certmanager/10_clusterrolebinding_secrets.yaml b/internal/controller/bindata/network-resources-injector-certmanager/10_clusterrolebinding_secrets.yaml new file mode 100644 index 000000000..0732a6834 --- /dev/null +++ b/internal/controller/bindata/network-resources-injector-certmanager/10_clusterrolebinding_secrets.yaml @@ -0,0 +1,12 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: network-resources-injector-secrets-role-binding 
+roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: network-resources-injector-secrets +subjects: +- kind: ServiceAccount + name: network-resources-injector-sa + namespace: {{.Namespace}} diff --git a/internal/controller/bindata/network-resources-injector-certmanager/11_clusterrolebinding_webhook_configs.yaml b/internal/controller/bindata/network-resources-injector-certmanager/11_clusterrolebinding_webhook_configs.yaml new file mode 100644 index 000000000..57a8595f0 --- /dev/null +++ b/internal/controller/bindata/network-resources-injector-certmanager/11_clusterrolebinding_webhook_configs.yaml @@ -0,0 +1,12 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: network-resources-injector-webhook-configs-role-binding +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: network-resources-injector-webhook-configs +subjects: +- kind: ServiceAccount + name: network-resources-injector-sa + namespace: {{.Namespace}} diff --git a/internal/controller/bindata/network-resources-injector-certmanager/12_clusterrolebinding_service.yaml b/internal/controller/bindata/network-resources-injector-certmanager/12_clusterrolebinding_service.yaml new file mode 100644 index 000000000..5ea93cf5f --- /dev/null +++ b/internal/controller/bindata/network-resources-injector-certmanager/12_clusterrolebinding_service.yaml @@ -0,0 +1,12 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: network-resources-injector-service-role-binding +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: network-resources-injector-service +subjects: +- kind: ServiceAccount + name: network-resources-injector-sa + namespace: {{.Namespace}} diff --git a/internal/controller/bindata/network-resources-injector-certmanager/13_clusterrolebinding_configmaps.yaml b/internal/controller/bindata/network-resources-injector-certmanager/13_clusterrolebinding_configmaps.yaml new file mode 
100644 index 000000000..9f94516a8 --- /dev/null +++ b/internal/controller/bindata/network-resources-injector-certmanager/13_clusterrolebinding_configmaps.yaml @@ -0,0 +1,12 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: network-resources-injector-configmaps-role-binding +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: network-resources-injector-configmaps +subjects: +- kind: ServiceAccount + name: network-resources-injector-sa + namespace: {{.Namespace}} diff --git a/internal/controller/bindata/network-resources-injector-certmanager/14_server.yaml b/internal/controller/bindata/network-resources-injector-certmanager/14_server.yaml new file mode 100644 index 000000000..883b5fa48 --- /dev/null +++ b/internal/controller/bindata/network-resources-injector-certmanager/14_server.yaml @@ -0,0 +1,63 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + labels: + app: network-resources-injector + name: network-resources-injector + namespace: {{.Namespace}} +spec: + selector: + matchLabels: + app: network-resources-injector + template: + metadata: + labels: + app: network-resources-injector + spec: + serviceAccount: network-resources-injector-sa + containers: + - name: webhook-server + image: {{.NRIWebhookImage}} + imagePullPolicy: {{.ImagePullPolicy}} + command: + - /webhook + args: + - -bind-address=0.0.0.0 + - -port=8443 + - -tls-private-key-file=/etc/tls/tls.key + - -tls-cert-file=/etc/tls/tls.crt + - -insecure=true + - -health-check-port=8444 + - -logtostderr + env: + - name: NAMESPACE + valueFrom: + fieldRef: + fieldPath: metadata.namespace + securityContext: + privileged: true + capabilities: + drop: + - ALL + add: ["NET_BIND_SERVICE"] + readOnlyRootFilesystem: true + volumeMounts: + - mountPath: /etc/tls + name: tls + resources: + requests: + memory: "50Mi" + cpu: "250m" + limits: + memory: "200Mi" + cpu: "500m" + livenessProbe: + httpGet: + path: /healthz + port: 8444 + initialDelaySeconds: 10 + 
periodSeconds: 5 + volumes: + - name: tls + secret: + secretName: network-resources-injector-secret diff --git a/internal/controller/bindata/network-resources-injector-certmanager/15_issuer.yaml b/internal/controller/bindata/network-resources-injector-certmanager/15_issuer.yaml new file mode 100644 index 000000000..5ff9aa6aa --- /dev/null +++ b/internal/controller/bindata/network-resources-injector-certmanager/15_issuer.yaml @@ -0,0 +1,7 @@ +apiVersion: cert-manager.io/v1 +kind: Issuer +metadata: + name: network-resources-injector-selfsigned + namespace: {{.Namespace}} +spec: + selfSigned: {} diff --git a/internal/controller/bindata/network-resources-injector-certmanager/16_certificate.yaml b/internal/controller/bindata/network-resources-injector-certmanager/16_certificate.yaml new file mode 100644 index 000000000..9aa358148 --- /dev/null +++ b/internal/controller/bindata/network-resources-injector-certmanager/16_certificate.yaml @@ -0,0 +1,18 @@ +apiVersion: cert-manager.io/v1 +kind: Certificate +metadata: + name: network-resources-injector-serving-cert + namespace: {{.Namespace}} +spec: + secretName: network-resources-injector-secret + issuerRef: + name: network-resources-injector-selfsigned + kind: Issuer + dnsNames: + - network-resources-injector-service + - network-resources-injector-service.{{.Namespace}} + - network-resources-injector-service.{{.Namespace}}.svc + usages: + - digital signature + - key encipherment + - server auth diff --git a/internal/controller/dpuoperatorconfig_controller.go b/internal/controller/dpuoperatorconfig_controller.go index b3d3ebcc0..8722e9e2b 100644 --- a/internal/controller/dpuoperatorconfig_controller.go +++ b/internal/controller/dpuoperatorconfig_controller.go @@ -27,9 +27,11 @@ import ( "github.com/openshift/dpu-operator/internal/utils" "github.com/openshift/dpu-operator/pkgs/render" "github.com/openshift/dpu-operator/pkgs/vars" + apiextensionsv1 "k8s.io/apiextensions-apiserver/pkg/apis/apiextensions/v1" apierrors 
"k8s.io/apimachinery/pkg/api/errors" "k8s.io/apimachinery/pkg/api/meta" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/types" ctrl "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/controller/controllerutil" @@ -90,6 +92,7 @@ func (r *DpuOperatorConfigReconciler) WithImagePullPolicy(policy string) *DpuOpe //+kubebuilder:rbac:groups="",resources=services,verbs=* //+kubebuilder:rbac:groups=admissionregistration.k8s.io,resources=mutatingwebhookconfigurations,verbs=* //+kubebuilder:rbac:groups=apiextensions.k8s.io,resources=customresourcedefinitions,verbs=get;list;watch +//+kubebuilder:rbac:groups=cert-manager.io,resources=issuers;certificates,verbs=* //+kubebuilder:rbac:groups=apps,resources=daemonsets,verbs=get;list;watch;create;update;patch;delete //+kubebuilder:rbac:groups=apps,resources=deployments,verbs=get;list;watch;create;update;patch;delete //+kubebuilder:rbac:groups=apps,resources=replicasets,verbs=get;list;watch;create;update;patch;delete @@ -299,6 +302,7 @@ func (r *DpuOperatorConfigReconciler) yamlVars() map[string]string { "ImagePullPolicy": r.imagePullPolicy, "ResourceName": "openshift.io/dpu", // FIXME: Hardcode for now "CniDir": p, + "ClusterFlavour": string(flavour), } return data @@ -321,9 +325,47 @@ func (r *DpuOperatorConfigReconciler) ensureDpuDeamonSet(ctx context.Context, cf func (r *DpuOperatorConfigReconciler) ensureNetworkResourcesInjector(ctx context.Context, cfg *configv1.DpuOperatorConfig) error { logger := log.FromContext(ctx) + ce := utils.NewClusterEnvironment(r.Client) + flavour, err := ce.Flavour(ctx) + if err != nil { + return fmt.Errorf("failed to detect cluster flavour for network resources injector: %w", err) + } + + binDataPath := "network-resources-injector" + switch flavour { + case utils.OpenShiftFlavour, utils.MicroShiftFlavour: + binDataPath = "network-resources-injector" + default: + if err := r.ensureCertManagerInstalled(ctx, 
flavour); err != nil { + return err + } + binDataPath = "network-resources-injector-certmanager" + } + logger.Info("Create Network Resources Injector") - return r.createAndApplyAllFromBinData(logger, "network-resources-injector", cfg) + logger.Info("Selected network resources injector manifest set", "flavour", flavour, "path", binDataPath) + return r.createAndApplyAllFromBinData(logger, binDataPath, cfg) +} + +func (r *DpuOperatorConfigReconciler) ensureCertManagerInstalled(ctx context.Context, flavour utils.Flavour) error { + requiredCRDs := []string{ + "certificates.cert-manager.io", + "issuers.cert-manager.io", + } + + for _, crdName := range requiredCRDs { + crd := &apiextensionsv1.CustomResourceDefinition{} + if err := r.Get(ctx, types.NamespacedName{Name: crdName}, crd); err != nil { + if apierrors.IsNotFound(err) { + return fmt.Errorf("cert-manager is required on %s clusters for network-resources-injector TLS provisioning: missing CRD %q", flavour, crdName) + } + return fmt.Errorf("failed to verify cert-manager CRD %q on %s cluster: %w", crdName, flavour, err) + } + } + + return nil } + func (r *DpuOperatorConfigReconciler) ensureNetworkFunctioNAD(ctx context.Context, cfg *configv1.DpuOperatorConfig) error { logger := log.FromContext(ctx) From 7e339406114cc7cfa229078ad264d3e1bb5f9119 Mon Sep 17 00:00:00 2001 From: Sam DaSilva Date: Mon, 23 Mar 2026 14:32:11 -0400 Subject: [PATCH 08/13] fix(deploy): resolve OpenShift webhook TLS clash Signed-off-by: Sam DaSilva --- config/rbac/role.yaml | 13 +++++++++ taskfile.yaml | 65 +++++++++++++++++++++++++++++++++++++------ 2 files changed, 70 insertions(+), 8 deletions(-) diff --git a/config/rbac/role.yaml b/config/rbac/role.yaml index 42ef81004..001b01f40 100644 --- a/config/rbac/role.yaml +++ b/config/rbac/role.yaml @@ -41,6 +41,19 @@ rules: - get - list - watch +- apiGroups: + - cert-manager.io + resources: + - certificates + - issuers + verbs: + - create + - delete + - get + - list + - patch + - update + - watch - 
apiGroups: - apps resources: diff --git a/taskfile.yaml b/taskfile.yaml index 9f6878940..660755fc0 100644 --- a/taskfile.yaml +++ b/taskfile.yaml @@ -109,6 +109,54 @@ tasks: -template-file config/dev/local-images-template.yaml -output-file bin/local-images.yaml - cp config/dev/kustomization.yaml bin + + deploy-webhook-compat: + internal: true + vars: + KUBECONFIG: '{{.KUBECONFIG}}' + cmds: + - | + KUBECONFIG={{.KUBECONFIG}} python3 - <<'PY' + import json + import os + import subprocess + + env = dict(os.environ) + env["KUBECONFIG"] = os.environ["KUBECONFIG"] + + data = json.loads(subprocess.check_output([ + "kubectl", + "get", + "validatingwebhookconfigurations", + "-o", + "json", + ], env=env)) + + for item in data.get("items", []): + name = item.get("metadata", {}).get("name", "") + if name == "dpu-operator-validating-webhook-configuration": + continue + + for webhook in item.get("webhooks", []): + if webhook.get("name") == "vdpuoperatorconfig.kb.io": + subprocess.run([ + "kubectl", + "delete", + "validatingwebhookconfiguration", + name, + "--ignore-not-found=true", + ], env=env, check=False) + break + PY + - | + if KUBECONFIG={{.KUBECONFIG}} kubectl get --raw /apis/route.openshift.io/v1 >/dev/null 2>&1; then + KUBECONFIG={{.KUBECONFIG}} kubectl annotate validatingwebhookconfiguration dpu-operator-validating-webhook-configuration cert-manager.io/inject-ca-from- --overwrite || true + KUBECONFIG={{.KUBECONFIG}} kubectl -n openshift-dpu-operator delete certificate dpu-operator-serving-cert --ignore-not-found=true + KUBECONFIG={{.KUBECONFIG}} kubectl -n openshift-dpu-operator delete issuer dpu-operator-selfsigned-issuer --ignore-not-found=true + KUBECONFIG={{.KUBECONFIG}} kubectl -n openshift-dpu-operator delete secret webhook-server-cert --ignore-not-found=true + KUBECONFIG={{.KUBECONFIG}} kubectl -n openshift-dpu-operator wait --for=create secret/webhook-server-cert --timeout=180s + KUBECONFIG={{.KUBECONFIG}} kubectl wait 
--for=jsonpath='{.webhooks[0].clientConfig.caBundle}' validatingwebhookconfiguration/dpu-operator-validating-webhook-configuration --timeout=180s + fi ## Download envtest-setup locally if necessary envtest: @@ -128,6 +176,12 @@ tasks: KUBECONFIG_HOST: "{{.KUBECONFIG_HOST}}" - bin/kustomize build bin | KUBECONFIG="{{.KUBECONFIG_DPU}}" kubectl apply -f - - bin/kustomize build bin | KUBECONFIG="{{.KUBECONFIG_HOST}}" kubectl apply -f - + - task: deploy-webhook-compat + vars: + KUBECONFIG: "{{.KUBECONFIG_DPU}}" + - task: deploy-webhook-compat + vars: + KUBECONFIG: "{{.KUBECONFIG_HOST}}" - KUBECONFIG="{{.KUBECONFIG_DPU}}" kubectl -n openshift-dpu-operator wait --for=condition=Available deployment/dpu-operator-controller-manager --timeout=300s - KUBECONFIG="{{.KUBECONFIG_HOST}}" kubectl -n openshift-dpu-operator wait --for=condition=Available deployment/dpu-operator-controller-manager --timeout=300s - KUBECONFIG="{{.KUBECONFIG_DPU}}" kubectl -n openshift-dpu-operator wait --for=condition=ready pod --all --timeout=300s @@ -142,6 +196,9 @@ tasks: vars: KUBECONFIG_HOST: "{{.KUBECONFIG_HOST}}" - bin/kustomize build bin | KUBECONFIG="{{.KUBECONFIG_HOST}}" kubectl apply -f - + - task: deploy-webhook-compat + vars: + KUBECONFIG: "{{.KUBECONFIG_HOST}}" - KUBECONFIG="{{.KUBECONFIG_HOST}}" kubectl -n openshift-dpu-operator wait --for=condition=Available deployment/dpu-operator-controller-manager --timeout=300s - KUBECONFIG="{{.KUBECONFIG_HOST}}" kubectl -n openshift-dpu-operator wait --for=condition=ready pod --all --timeout=300s @@ -196,14 +253,6 @@ tasks: {{.BINDIR}}/ginkgo -coverprofile cover.out ./e2e_test/... 
- KUBECONFIG_HOST={{.KUBECONFIG_HOST}} sh hack/traffic_flow_tests.sh - prepare-e2e-test: - cmds: - - > - if [ "{{.SUBMODULES}}" = "true" ]; then - hack/prepare-submodules.sh - fi - hack/prepare-venv.sh - redeploy: cmds: - task: build-image-all From 3e739fdfae7e16d7ee68a7080e6c78aa187d7fcb Mon Sep 17 00:00:00 2001 From: Sam DaSilva Date: Mon, 23 Mar 2026 14:32:11 -0400 Subject: [PATCH 09/13] fix(vsp): make p4 hostPath writable Signed-off-by: Sam DaSilva --- .../bindata/vsp/intel-ipu/99.vsp-pod.yaml | 4 +- .../bindata/vsp/marvell-dpu/99.vsp-pod.yaml | 4 +- .../dataprocessingunit_controller.go | 33 ++++++++ internal/daemon/daemon.go | 79 +++++++++++++++---- internal/utils/labels.go | 13 +++ 5 files changed, 112 insertions(+), 21 deletions(-) create mode 100644 internal/utils/labels.go diff --git a/internal/controller/bindata/vsp/intel-ipu/99.vsp-pod.yaml b/internal/controller/bindata/vsp/intel-ipu/99.vsp-pod.yaml index 114fc4b27..611d99103 100644 --- a/internal/controller/bindata/vsp/intel-ipu/99.vsp-pod.yaml +++ b/internal/controller/bindata/vsp/intel-ipu/99.vsp-pod.yaml @@ -43,8 +43,8 @@ spec: type: "" name: host-proc - hostPath: - path: /opt/p4/p4-cp-nws/var - type: "" + path: {{.P4StateHostPath}} + type: DirectoryOrCreate name: host-opt - hostPath: path: /var/run/ diff --git a/internal/controller/bindata/vsp/marvell-dpu/99.vsp-pod.yaml b/internal/controller/bindata/vsp/marvell-dpu/99.vsp-pod.yaml index cdbab78da..9dd8465d2 100644 --- a/internal/controller/bindata/vsp/marvell-dpu/99.vsp-pod.yaml +++ b/internal/controller/bindata/vsp/marvell-dpu/99.vsp-pod.yaml @@ -46,8 +46,8 @@ spec: type: "" name: host-proc - hostPath: - path: /opt/p4/p4-cp-nws/var - type: "" + path: {{.P4StateHostPath}} + type: DirectoryOrCreate name: host-opt - hostPath: path: /var/run/ diff --git a/internal/controller/dataprocessingunit_controller.go b/internal/controller/dataprocessingunit_controller.go index efabd7425..164498008 100644 --- 
a/internal/controller/dataprocessingunit_controller.go +++ b/internal/controller/dataprocessingunit_controller.go @@ -26,6 +26,7 @@ import ( configv1 "github.com/openshift/dpu-operator/api/v1" "github.com/openshift/dpu-operator/internal/images" "github.com/openshift/dpu-operator/internal/platform" + "github.com/openshift/dpu-operator/internal/utils" "github.com/openshift/dpu-operator/pkgs/render" "github.com/openshift/dpu-operator/pkgs/vars" corev1 "k8s.io/api/core/v1" @@ -67,6 +68,7 @@ func (r *DataProcessingUnitReconciler) WithImagePullPolicy(policy string) *DataP } // +kubebuilder:rbac:groups="",resources=pods,verbs=* +// +kubebuilder:rbac:groups="",resources=nodes,verbs=get;list;watch // +kubebuilder:rbac:groups="",resources=serviceaccounts,verbs=get;list;watch;create;update;patch;delete // +kubebuilder:rbac:groups="",resources=secrets,verbs=* // +kubebuilder:rbac:groups="",resources=services,verbs=* @@ -137,11 +139,17 @@ func (r *DataProcessingUnitReconciler) ensureVSPResources(ctx context.Context, d return fmt.Errorf("failed to get VSP image for DPU type %s: %v", dpu.Spec.DpuProductName, err) } + p4StateHostPath, err := r.resolveP4StateHostPath(ctx, dpu) + if err != nil { + return err + } + additionalVars := map[string]string{ "Namespace": vars.Namespace, "VspName": r.getVSPName(dpu), "DpuName": dpu.Name, "NodeName": dpu.Spec.NodeName, + "P4StateHostPath": p4StateHostPath, "VendorSpecificPluginImage": vspImage, "ImagePullPolicy": r.imagePullPolicy, "Command": "[]", @@ -170,6 +178,31 @@ func (r *DataProcessingUnitReconciler) ensureVSPResources(ctx context.Context, d return nil } +func (r *DataProcessingUnitReconciler) resolveP4StateHostPath(ctx context.Context, dpu *configv1.DataProcessingUnit) (string, error) { + node := &corev1.Node{} + if err := r.Get(ctx, client.ObjectKey{Name: dpu.Spec.NodeName}, node); err != nil { + return "", fmt.Errorf("failed to get node %s for DataProcessingUnit %s: %w", dpu.Spec.NodeName, dpu.Name, err) + } + + if node.Labels == 
nil { + return "", fmt.Errorf("missing %s label on node %s for DataProcessingUnit %s", utils.P4HostPathLabelKey, dpu.Spec.NodeName, dpu.Name) + } + + mode, exists := node.Labels[utils.P4HostPathLabelKey] + if !exists { + return "", fmt.Errorf("missing %s label on node %s for DataProcessingUnit %s", utils.P4HostPathLabelKey, dpu.Spec.NodeName, dpu.Name) + } + + switch mode { + case utils.P4HostPathLabelValueOpt: + return "/opt/p4/p4-cp-nws/var", nil + case utils.P4HostPathLabelValueVarOpt: + return "/var/opt/p4/p4-cp-nws/var", nil + default: + return "", fmt.Errorf("invalid %s label value %q on node %s for DataProcessingUnit %s", utils.P4HostPathLabelKey, mode, dpu.Spec.NodeName, dpu.Name) + } +} + func (r *DataProcessingUnitReconciler) applyVSPResourcesWithTracking(logger logr.Logger, binDataPath string, data map[string]string, owner client.Object, dpuName string) error { // Get or create VSP resource renderer for this DataProcessingUnit renderer, exists := r.vspResourceRenderers[dpuName] diff --git a/internal/daemon/daemon.go b/internal/daemon/daemon.go index 2365bc1e9..030d67ae4 100644 --- a/internal/daemon/daemon.go +++ b/internal/daemon/daemon.go @@ -16,6 +16,7 @@ import ( "github.com/openshift/dpu-operator/internal/scheme" "github.com/openshift/dpu-operator/internal/utils" + "golang.org/x/sys/unix" corev1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/api/meta" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" @@ -29,7 +30,7 @@ import ( var () -const DpuSideLabelKey = "dpu.config.openshift.io/dpuside" +const DpuSideLabelKey = utils.DpuSideLabelKey type SideManager interface { StartVsp(ctx context.Context) error @@ -562,50 +563,94 @@ func (d *Daemon) updateNodeLabels() error { return fmt.Errorf("Failed to get node %s: %v", d.nodeName, err) } + p4HostPathMode, err := detectP4HostPathMode() + if err != nil { + d.log.Error(err, "Failed to auto-detect host P4 path mode, defaulting to /opt") + p4HostPathMode = utils.P4HostPathLabelValueOpt + } + + if node.Labels == nil { 
+ node.Labels = make(map[string]string) + } + + changed := false + if node.Labels[utils.P4HostPathLabelKey] != p4HostPathMode { + node.Labels[utils.P4HostPathLabelKey] = p4HostPathMode + changed = true + } + // Determine the label value based on detected DPUs var labelValue string if len(d.managedDpus) == 0 { // No DPUs detected, remove the label if it exists - if node.Labels != nil { - if _, exists := node.Labels[DpuSideLabelKey]; exists { - delete(node.Labels, DpuSideLabelKey) - err := d.client.Update(context.TODO(), node) - if err != nil { - return fmt.Errorf("Failed to remove DPU side label from node %s: %v", d.nodeName, err) - } - d.log.Info("Removed DPU side label from node", "nodeName", d.nodeName) + if _, exists := node.Labels[DpuSideLabelKey]; exists { + delete(node.Labels, DpuSideLabelKey) + changed = true + } + + if changed { + err := d.client.Update(context.TODO(), node) + if err != nil { + return fmt.Errorf("Failed to update labels on node %s: %v", d.nodeName, err) } + d.log.Info("Updated node labels", "nodeName", d.nodeName, "p4HostPathMode", p4HostPathMode) } + return nil } for _, managedDpu := range d.managedDpus { if managedDpu.DpuCR.Spec.IsDpuSide { - labelValue = "dpu" + labelValue = utils.DpuSideLabelValueDpu } else { - labelValue = "dpu-host" + labelValue = utils.DpuSideLabelValueHost } break // It is a bug if there is node with managedDPU that is both hosting a DPU and is a DPU itself. Hense we only need to look at the first managedDPU DPU CR. 
} - if node.Labels == nil { - node.Labels = make(map[string]string) - } - // Check if the label needs to be updated currentValue, exists := node.Labels[DpuSideLabelKey] if !exists || currentValue != labelValue { node.Labels[DpuSideLabelKey] = labelValue + changed = true + } + + if changed { err := d.client.Update(context.TODO(), node) if err != nil { - return fmt.Errorf("Failed to update DPU side label on node %s: %v", d.nodeName, err) + return fmt.Errorf("Failed to update labels on node %s: %v", d.nodeName, err) } - d.log.Info("Updated DPU side label on node", "nodeName", d.nodeName, "labelValue", labelValue) + d.log.Info("Updated node labels", "nodeName", d.nodeName, "dpuSide", labelValue, "p4HostPathMode", p4HostPathMode) } return nil } +func detectP4HostPathMode() (string, error) { + paths := []string{"/proc/1/root/opt", "/opt"} + var lastErr error + + for _, path := range paths { + var fsStat unix.Statfs_t + if err := unix.Statfs(path, &fsStat); err != nil { + lastErr = err + continue + } + + if fsStat.Flags&unix.ST_RDONLY != 0 { + return utils.P4HostPathLabelValueVarOpt, nil + } + + return utils.P4HostPathLabelValueOpt, nil + } + + if lastErr != nil { + return "", fmt.Errorf("failed to stat host /opt filesystem: %w", lastErr) + } + + return "", fmt.Errorf("failed to stat host /opt filesystem") +} + // setOwnerReference sets the DpuOperatorConfig as the owner of the DataProcessingUnit CR func (d *Daemon) setOwnerReference(dpuCR *configv1.DataProcessingUnit) error { // Find the DpuOperatorConfig that should own this DPU CR diff --git a/internal/utils/labels.go b/internal/utils/labels.go new file mode 100644 index 000000000..ce8d3a0e3 --- /dev/null +++ b/internal/utils/labels.go @@ -0,0 +1,13 @@ +package utils + +const ( + DpuSideLabelKey = "dpu.config.openshift.io/dpuside" + + DpuSideLabelValueDpu = "dpu" + DpuSideLabelValueHost = "dpu-host" + + P4HostPathLabelKey = "dpu.config.openshift.io/p4hostpath" + + P4HostPathLabelValueOpt = "opt" + 
P4HostPathLabelValueVarOpt = "varopt" +) From e2609fb055adfcd2a5de4efbeaeb1516748e7ffa Mon Sep 17 00:00:00 2001 From: Sam DaSilva Date: Wed, 25 Mar 2026 16:21:32 -0400 Subject: [PATCH 10/13] fix(taskfile): Intel vsp prevents ACC undeploy & e2e test vars Signed-off-by: Sam DaSilva --- taskfile.yaml | 39 ++++++++++++++++++++++++++++++++------- 1 file changed, 32 insertions(+), 7 deletions(-) diff --git a/taskfile.yaml b/taskfile.yaml index 660755fc0..712174ed5 100644 --- a/taskfile.yaml +++ b/taskfile.yaml @@ -77,10 +77,35 @@ tasks: - NS=$(KUBECONFIG={{.KUBECONFIG}} kubectl get ns openshift-dpu-operator) if [[ -n "$NS" ]]; then false else true fi cmds: - # this will block untill everything is cleaned up and bringing system back into a clean state as if the operator was never installed - - KUBECONFIG={{.KUBECONFIG}} kubectl delete -f examples/config.yaml || true - - bin/kustomize build config/default | KUBECONFIG={{.KUBECONFIG}} kubectl delete --ignore-not-found=true -f - - - KUBECONFIG={{.KUBECONFIG}} kubectl wait --for=delete ns openshift-dpu-operator --timeout=300s + - KUBECONFIG={{.KUBECONFIG}} kubectl -n openshift-dpu-operator delete pod -l app=vsp --ignore-not-found=true --wait=false || true + - KUBECONFIG={{.KUBECONFIG}} kubectl delete -f examples/config.yaml --ignore-not-found=true --wait=false || true + - | + if KUBECONFIG={{.KUBECONFIG}} kubectl get dpuoperatorconfig.config.openshift.io dpu-operator-config >/dev/null 2>&1; then + if ! 
KUBECONFIG={{.KUBECONFIG}} kubectl wait --for=delete dpuoperatorconfig.config.openshift.io/dpu-operator-config --timeout=120s; then + echo "Timed out waiting for DpuOperatorConfig deletion, removing finalizers" + KUBECONFIG={{.KUBECONFIG}} kubectl patch dpuoperatorconfig.config.openshift.io dpu-operator-config --type=merge -p '{"metadata":{"finalizers":[]}}' || true + fi + fi + - bin/kustomize build config/default | KUBECONFIG={{.KUBECONFIG}} kubectl delete --ignore-not-found=true --wait=false -f - + - | + if KUBECONFIG={{.KUBECONFIG}} kubectl get ns openshift-dpu-operator >/dev/null 2>&1; then + if ! KUBECONFIG={{.KUBECONFIG}} kubectl wait --for=delete ns openshift-dpu-operator --timeout=300s; then + echo "Timed out waiting for namespace openshift-dpu-operator deletion" + echo "Namespace phase/finalizers:" + KUBECONFIG={{.KUBECONFIG}} kubectl get ns openshift-dpu-operator -o jsonpath='{.status.phase}{"\n"}{.spec.finalizers}{"\n"}' || true + echo "Remaining common resources in openshift-dpu-operator:" + KUBECONFIG={{.KUBECONFIG}} kubectl get pod,deploy,ds,rs,svc,sa,cm,secret,role,rolebinding -n openshift-dpu-operator --ignore-not-found=true || true + if [ "${FORCE_UNDEPLOY:-false}" = "true" ]; then + echo "FORCE_UNDEPLOY=true set, clearing namespace finalizers" + KUBECONFIG={{.KUBECONFIG}} kubectl patch ns openshift-dpu-operator --type=merge -p '{"spec":{"finalizers":[]}}' || true + KUBECONFIG={{.KUBECONFIG}} kubectl wait --for=delete ns openshift-dpu-operator --timeout=120s || true + fi + if KUBECONFIG={{.KUBECONFIG}} kubectl get ns openshift-dpu-operator >/dev/null 2>&1; then + echo "Namespace 'openshift-dpu-operator' is still present after cleanup attempts" + exit 1 + fi + fi + fi - echo "Namespace 'openshift-dpu-operator' has been removed." 
undeploy: @@ -246,9 +271,9 @@ tasks: KUBECONFIG_HOST={{.KUBECONFIG_HOST}} KUBECONFIG_DPU={{.KUBECONFIG_DPU}} REGISTRY={{.REGISTRY}} - NF_INGRESS_IP=10.20.30.2 - EXTERNAL_CLIENT_DEV=eno12409 - EXTERNAL_CLIENT_IP=10.20.30.100 + NF_INGRESS_IP=${NF_INGRESS_IP:-10.20.30.2} + EXTERNAL_CLIENT_DEV=${EXTERNAL_CLIENT_DEV:-eno12409} + EXTERNAL_CLIENT_IP=${EXTERNAL_CLIENT_IP:-10.20.30.100} KUBEBUILDER_ASSETS="$({{.BINDIR}}/setup-envtest use {{.ENVTEST_K8S_VERSION}} --bin-dir {{.BINDIR_ABS}} -p path)" {{.BINDIR}}/ginkgo -coverprofile cover.out ./e2e_test/... - KUBECONFIG_HOST={{.KUBECONFIG_HOST}} sh hack/traffic_flow_tests.sh From 26045f16661cdabaa4279ef32da557cac152d922 Mon Sep 17 00:00:00 2001 From: Sam DaSilva Date: Wed, 25 Mar 2026 16:46:05 -0400 Subject: [PATCH 11/13] fix(nad): pin Whereabouts IPAM to explicit kubeconfig for reliable leader lock Adds ipam.kubernetes.kubeconfig=/etc/cni/net.d/whereabouts.d/whereabouts.kubeconfig to the host NAD template so dpu-cni uses the intended Whereabouts identity across vanilla/OpenShift-style environments, preventing time limit exceeded while waiting to become leader during pod network setup. 
Signed-off-by: Sam DaSilva --- .../bindata/networkfn-nad-host/00.networkfun-nad.yaml | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/internal/controller/bindata/networkfn-nad-host/00.networkfun-nad.yaml b/internal/controller/bindata/networkfn-nad-host/00.networkfun-nad.yaml index 16c288471..1a70d69fb 100644 --- a/internal/controller/bindata/networkfn-nad-host/00.networkfun-nad.yaml +++ b/internal/controller/bindata/networkfn-nad-host/00.networkfun-nad.yaml @@ -4,7 +4,7 @@ metadata: name: default-sriov-net namespace: default annotations: - k8s.v1.cni.cncf.io/resourceName: {{.ResourceName}} + k8s.v1.cni.cncf.io/resourceName: '{{.ResourceName}}' spec: #TODO: We need to customize the config based on the user's DpuNetwork CR config: '{ @@ -15,6 +15,9 @@ spec: "type": "whereabouts", "range": "10.56.217.0/24", "range_start": "10.56.217.50", - "range_end": "10.56.217.150" - } + "range_end": "10.56.217.150", + "kubernetes": { + "kubeconfig": "/etc/cni/net.d/whereabouts.d/whereabouts.kubeconfig" + } + } }' From 2ee67a31ee17c607d9688f5a911d760476882ed9 Mon Sep 17 00:00:00 2001 From: Sam DaSilva Date: Wed, 8 Apr 2026 14:56:40 -0400 Subject: [PATCH 12/13] fix: Clean up ux to follow upstream SNO semantics - Create an upstream deploy path while preserving the original openshift path. 
- Add opt-in path for webhook and cert-manager kustomization config - Change VanillaFlavour to KubernetesFlavor - Move label code closer to where its used Signed-off-by: Sam DaSilva --- README.md | 7 + config/default-upstream/kustomization.yaml | 30 +++++ config/dev/kustomization-upstream.yaml | 4 + .../networkfn-nad-host/00.networkfun-nad.yaml | 2 + internal/testutils/testcluster.go | 2 +- internal/utils/cluster_environment.go | 22 ++- internal/utils/labels.go | 13 -- internal/utils/path_manager.go | 4 +- taskfile.yaml | 125 ++++++++++++------ 9 files changed, 150 insertions(+), 59 deletions(-) create mode 100644 config/default-upstream/kustomization.yaml create mode 100644 config/dev/kustomization-upstream.yaml delete mode 100644 internal/utils/labels.go diff --git a/README.md b/README.md index 7054863c6..feba32975 100644 --- a/README.md +++ b/README.md @@ -39,6 +39,13 @@ For 1 cluster deployment, you will only need /root/kubeconfig.ocpcluster. 1 clus task deploy-1c ``` +For upstream-style deployment semantics (without OpenShift-specific webhook +compatibility adjustments), use: + +```sh +task deploy-upstream +``` + 4. **Undeploy** Undoes what deploying did: diff --git a/config/default-upstream/kustomization.yaml b/config/default-upstream/kustomization.yaml new file mode 100644 index 000000000..44b456458 --- /dev/null +++ b/config/default-upstream/kustomization.yaml @@ -0,0 +1,30 @@ +# Adds namespace to all resources. +namespace: openshift-dpu-operator + +# Value of this field is prepended to the +# names of all resources, e.g. a deployment named +# "wordpress" becomes "alices-wordpress". +namePrefix: dpu-operator- + +resources: +- ../crd +- ../rbac +- ../manager + +# [WEBHOOK] Optional for upstream deployments. +#- ../webhook + +# [CERTMANAGER] Optional for upstream deployments and only required when +# webhook certificates are managed by cert-manager. 
+#- ../certmanager + +patches: +# [WEBHOOK] +#- path: ../default/manager_webhook_patch.yaml + +# [CERTMANAGER] +#- path: ../default/webhookcainjection_patch.yaml + +# [CERTMANAGER] Uncomment replacements if cert-manager webhook CA injection is +# enabled in this profile. +#replacements: [] diff --git a/config/dev/kustomization-upstream.yaml b/config/dev/kustomization-upstream.yaml new file mode 100644 index 000000000..9710dc0cc --- /dev/null +++ b/config/dev/kustomization-upstream.yaml @@ -0,0 +1,4 @@ +resources: +- ../config/default-upstream +patches: +- path: local-images.yaml diff --git a/internal/controller/bindata/networkfn-nad-host/00.networkfun-nad.yaml b/internal/controller/bindata/networkfn-nad-host/00.networkfun-nad.yaml index 1a70d69fb..373524ec2 100644 --- a/internal/controller/bindata/networkfn-nad-host/00.networkfun-nad.yaml +++ b/internal/controller/bindata/networkfn-nad-host/00.networkfun-nad.yaml @@ -7,6 +7,8 @@ metadata: k8s.v1.cni.cncf.io/resourceName: '{{.ResourceName}}' spec: #TODO: We need to customize the config based on the user's DpuNetwork CR + # NOTE: whereabouts needs an explicit kubeconfig path in this environment + # because CNI IPAM execution runs on-node (outside in-cluster auth context). 
config: '{ "type": "dpu-cni", "cniVersion": "0.4.0", diff --git a/internal/testutils/testcluster.go b/internal/testutils/testcluster.go index be2736566..f5fc60fa0 100644 --- a/internal/testutils/testcluster.go +++ b/internal/testutils/testcluster.go @@ -861,7 +861,7 @@ func LabelNodesWithDpu(c client.Client) error { switch flavour { case utils.MicroShiftFlavour, utils.KindFlavour: return LabelAllNodesWithDpu(c) - case utils.OpenShiftFlavour, utils.VanillaFlavour: + case utils.OpenShiftFlavour, utils.KubernetesFlavour: return LabelWorkerNodesWithDpu(c) default: return fmt.Errorf("unsupported cluster flavor %s", flavour) diff --git a/internal/utils/cluster_environment.go b/internal/utils/cluster_environment.go index f36b4e750..26e9568e6 100644 --- a/internal/utils/cluster_environment.go +++ b/internal/utils/cluster_environment.go @@ -25,17 +25,29 @@ func NewClusterEnvironment(client client.Client) *ClusterEnvironment { type Flavour string const ( - VanillaFlavour Flavour = "Vanilla" + KubernetesFlavour Flavour = "Kubernetes" OpenShiftFlavour Flavour = "OpenShift" MicroShiftFlavour Flavour = "MicroShift" KindFlavour Flavour = "Kind" UnknownFlavour Flavour = "Unknown" ) +const ( + DpuSideLabelKey = "dpu.config.openshift.io/dpuside" + + DpuSideLabelValueDpu = "dpu" + DpuSideLabelValueHost = "dpu-host" + + P4HostPathLabelKey = "dpu.config.openshift.io/p4hostpath" + + P4HostPathLabelValueOpt = "opt" + P4HostPathLabelValueVarOpt = "varopt" +) + func (ce *ClusterEnvironment) Flavour(ctx context.Context) (Flavour, error) { microShift, err := ce.isMicroShift(ctx) if err != nil { - return VanillaFlavour, err + return KubernetesFlavour, err } if microShift { return MicroShiftFlavour, nil @@ -43,7 +55,7 @@ func (ce *ClusterEnvironment) Flavour(ctx context.Context) (Flavour, error) { openShift, err := ce.isOpenShift(ctx) if err != nil { - return VanillaFlavour, err + return KubernetesFlavour, err } if openShift { return OpenShiftFlavour, nil @@ -51,13 +63,13 @@ func (ce 
*ClusterEnvironment) Flavour(ctx context.Context) (Flavour, error) { kind, err := ce.isKind(ctx) if err != nil { - return VanillaFlavour, err + return KubernetesFlavour, err } if kind { return KindFlavour, nil } - return VanillaFlavour, nil + return KubernetesFlavour, nil } func (ce *ClusterEnvironment) isMicroShift(ctx context.Context) (bool, error) { diff --git a/internal/utils/labels.go b/internal/utils/labels.go deleted file mode 100644 index ce8d3a0e3..000000000 --- a/internal/utils/labels.go +++ /dev/null @@ -1,13 +0,0 @@ -package utils - -const ( - DpuSideLabelKey = "dpu.config.openshift.io/dpuside" - - DpuSideLabelValueDpu = "dpu" - DpuSideLabelValueHost = "dpu-host" - - P4HostPathLabelKey = "dpu.config.openshift.io/p4hostpath" - - P4HostPathLabelValueOpt = "opt" - P4HostPathLabelValueVarOpt = "varopt" -) diff --git a/internal/utils/path_manager.go b/internal/utils/path_manager.go index ad5ca6e97..ffa7c1fef 100644 --- a/internal/utils/path_manager.go +++ b/internal/utils/path_manager.go @@ -45,10 +45,10 @@ func (p *PathManager) CniHostDir(clusterFlavour Flavour, filesystemMode Filesyst case clusterFlavour == MicroShiftFlavour && filesystemMode == ImageMode: return p.wrap("/run/cni"), nil // OpenShift typically uses /var/lib/cni regardless of filesystem mode since nodes are always coreos based - case clusterFlavour == OpenShiftFlavour || (clusterFlavour == VanillaFlavour && filesystemMode == ImageMode): + case clusterFlavour == OpenShiftFlavour || (clusterFlavour == KubernetesFlavour && filesystemMode == ImageMode): return p.wrap("/var/lib/cni"), nil // MicroShift with PackageMode and Kind use /opt/cni - case (clusterFlavour == MicroShiftFlavour && filesystemMode == PackageMode) || (clusterFlavour == VanillaFlavour && filesystemMode == PackageMode) || clusterFlavour == KindFlavour: + case (clusterFlavour == MicroShiftFlavour && filesystemMode == PackageMode) || (clusterFlavour == KubernetesFlavour && filesystemMode == PackageMode) || clusterFlavour == 
KindFlavour: return p.wrap("/opt/cni"), nil default: return "", fmt.Errorf("unknown combination of cluster flavour (%s) and filesystem mode (%s)", clusterFlavour, filesystemMode) diff --git a/taskfile.yaml b/taskfile.yaml index 712174ed5..f33ff7c20 100644 --- a/taskfile.yaml +++ b/taskfile.yaml @@ -30,6 +30,14 @@ vars: SETUP_ENVTEST_VERSION: release-0.20 OPERATOR_SDK_VERSION: v1.41.0 OPM_VERSION: v1.23.0 + DEPLOY_PROFILE: '{{ default "openshift" .DEPLOY_PROFILE }}' + K8S_CLI: + sh: | + if command -v oc >/dev/null 2>&1; then + echo oc + else + echo kubectl + fi KUBECONFIG_HOST: "/root/kubeconfig.ocpcluster" KUBECONFIG_DPU: "/root/kubeconfig.microshift" @@ -74,33 +82,33 @@ tasks: vars: KUBECONFIG: '{{.KUBECONFIG}}' status: - - NS=$(KUBECONFIG={{.KUBECONFIG}} kubectl get ns openshift-dpu-operator) + - NS=$(KUBECONFIG={{.KUBECONFIG}} {{.K8S_CLI}} get ns openshift-dpu-operator) if [[ -n "$NS" ]]; then false else true fi cmds: - - KUBECONFIG={{.KUBECONFIG}} kubectl -n openshift-dpu-operator delete pod -l app=vsp --ignore-not-found=true --wait=false || true - - KUBECONFIG={{.KUBECONFIG}} kubectl delete -f examples/config.yaml --ignore-not-found=true --wait=false || true + - KUBECONFIG={{.KUBECONFIG}} {{.K8S_CLI}} -n openshift-dpu-operator delete pod -l app=vsp --ignore-not-found=true --wait=false || true + - KUBECONFIG={{.KUBECONFIG}} {{.K8S_CLI}} delete -f examples/config.yaml --ignore-not-found=true --wait=false || true - | - if KUBECONFIG={{.KUBECONFIG}} kubectl get dpuoperatorconfig.config.openshift.io dpu-operator-config >/dev/null 2>&1; then - if ! KUBECONFIG={{.KUBECONFIG}} kubectl wait --for=delete dpuoperatorconfig.config.openshift.io/dpu-operator-config --timeout=120s; then + if KUBECONFIG={{.KUBECONFIG}} {{.K8S_CLI}} get dpuoperatorconfig.config.openshift.io dpu-operator-config >/dev/null 2>&1; then + if ! 
KUBECONFIG={{.KUBECONFIG}} {{.K8S_CLI}} wait --for=delete dpuoperatorconfig.config.openshift.io/dpu-operator-config --timeout=120s; then echo "Timed out waiting for DpuOperatorConfig deletion, removing finalizers" - KUBECONFIG={{.KUBECONFIG}} kubectl patch dpuoperatorconfig.config.openshift.io dpu-operator-config --type=merge -p '{"metadata":{"finalizers":[]}}' || true + KUBECONFIG={{.KUBECONFIG}} {{.K8S_CLI}} patch dpuoperatorconfig.config.openshift.io dpu-operator-config --type=merge -p '{"metadata":{"finalizers":[]}}' || true fi fi - - bin/kustomize build config/default | KUBECONFIG={{.KUBECONFIG}} kubectl delete --ignore-not-found=true --wait=false -f - + - bin/kustomize build config/default | KUBECONFIG={{.KUBECONFIG}} {{.K8S_CLI}} delete --ignore-not-found=true --wait=false -f - - | - if KUBECONFIG={{.KUBECONFIG}} kubectl get ns openshift-dpu-operator >/dev/null 2>&1; then - if ! KUBECONFIG={{.KUBECONFIG}} kubectl wait --for=delete ns openshift-dpu-operator --timeout=300s; then + if KUBECONFIG={{.KUBECONFIG}} {{.K8S_CLI}} get ns openshift-dpu-operator >/dev/null 2>&1; then + if ! 
KUBECONFIG={{.KUBECONFIG}} {{.K8S_CLI}} wait --for=delete ns openshift-dpu-operator --timeout=300s; then echo "Timed out waiting for namespace openshift-dpu-operator deletion" echo "Namespace phase/finalizers:" - KUBECONFIG={{.KUBECONFIG}} kubectl get ns openshift-dpu-operator -o jsonpath='{.status.phase}{"\n"}{.spec.finalizers}{"\n"}' || true + KUBECONFIG={{.KUBECONFIG}} {{.K8S_CLI}} get ns openshift-dpu-operator -o jsonpath='{.status.phase}{"\n"}{.spec.finalizers}{"\n"}' || true echo "Remaining common resources in openshift-dpu-operator:" - KUBECONFIG={{.KUBECONFIG}} kubectl get pod,deploy,ds,rs,svc,sa,cm,secret,role,rolebinding -n openshift-dpu-operator --ignore-not-found=true || true + KUBECONFIG={{.KUBECONFIG}} {{.K8S_CLI}} get pod,deploy,ds,rs,svc,sa,cm,secret,role,rolebinding -n openshift-dpu-operator --ignore-not-found=true || true if [ "${FORCE_UNDEPLOY:-false}" = "true" ]; then echo "FORCE_UNDEPLOY=true set, clearing namespace finalizers" - KUBECONFIG={{.KUBECONFIG}} kubectl patch ns openshift-dpu-operator --type=merge -p '{"spec":{"finalizers":[]}}' || true - KUBECONFIG={{.KUBECONFIG}} kubectl wait --for=delete ns openshift-dpu-operator --timeout=120s || true + KUBECONFIG={{.KUBECONFIG}} {{.K8S_CLI}} patch ns openshift-dpu-operator --type=merge -p '{"spec":{"finalizers":[]}}' || true + KUBECONFIG={{.KUBECONFIG}} {{.K8S_CLI}} wait --for=delete ns openshift-dpu-operator --timeout=120s || true fi - if KUBECONFIG={{.KUBECONFIG}} kubectl get ns openshift-dpu-operator >/dev/null 2>&1; then + if KUBECONFIG={{.KUBECONFIG}} {{.K8S_CLI}} get ns openshift-dpu-operator >/dev/null 2>&1; then echo "Namespace 'openshift-dpu-operator' is still present after cleanup attempts" exit 1 fi @@ -133,24 +141,32 @@ tasks: -registry-url {{.REGISTRY}} -template-file config/dev/local-images-template.yaml -output-file bin/local-images.yaml - - cp config/dev/kustomization.yaml bin + - | + if [ "{{.DEPLOY_PROFILE}}" = "upstream" ]; then + cp config/dev/kustomization-upstream.yaml 
bin/kustomization.yaml + else + cp config/dev/kustomization.yaml bin/kustomization.yaml + fi - deploy-webhook-compat: + # Remove stale duplicate validating webhook configurations that conflict with + # dpu-operator's canonical webhook name. + reconcile-validating-webhook-configs: internal: true vars: KUBECONFIG: '{{.KUBECONFIG}}' cmds: - | - KUBECONFIG={{.KUBECONFIG}} python3 - <<'PY' + KUBECONFIG={{.KUBECONFIG}} K8S_CLI={{.K8S_CLI}} python3 - <<'PY' import json import os import subprocess env = dict(os.environ) env["KUBECONFIG"] = os.environ["KUBECONFIG"] + cli = os.environ.get("K8S_CLI", "kubectl") data = json.loads(subprocess.check_output([ - "kubectl", + cli, "get", "validatingwebhookconfigurations", "-o", @@ -165,7 +181,7 @@ tasks: for webhook in item.get("webhooks", []): if webhook.get("name") == "vdpuoperatorconfig.kb.io": subprocess.run([ - "kubectl", + cli, "delete", "validatingwebhookconfiguration", name, @@ -173,14 +189,35 @@ tasks: ], env=env, check=False) break PY + + # On OpenShift clusters, reconcile webhook cert objects so OpenShift serving + # cert and webhook CA bundle propagation can settle on the active owner. 
+ reconcile-openshift-webhook-tls: + internal: true + vars: + KUBECONFIG: '{{.KUBECONFIG}}' + cmds: - | - if KUBECONFIG={{.KUBECONFIG}} kubectl get --raw /apis/route.openshift.io/v1 >/dev/null 2>&1; then - KUBECONFIG={{.KUBECONFIG}} kubectl annotate validatingwebhookconfiguration dpu-operator-validating-webhook-configuration cert-manager.io/inject-ca-from- --overwrite || true - KUBECONFIG={{.KUBECONFIG}} kubectl -n openshift-dpu-operator delete certificate dpu-operator-serving-cert --ignore-not-found=true - KUBECONFIG={{.KUBECONFIG}} kubectl -n openshift-dpu-operator delete issuer dpu-operator-selfsigned-issuer --ignore-not-found=true - KUBECONFIG={{.KUBECONFIG}} kubectl -n openshift-dpu-operator delete secret webhook-server-cert --ignore-not-found=true - KUBECONFIG={{.KUBECONFIG}} kubectl -n openshift-dpu-operator wait --for=create secret/webhook-server-cert --timeout=180s - KUBECONFIG={{.KUBECONFIG}} kubectl wait --for=jsonpath='{.webhooks[0].clientConfig.caBundle}' validatingwebhookconfiguration/dpu-operator-validating-webhook-configuration --timeout=180s + if KUBECONFIG={{.KUBECONFIG}} {{.K8S_CLI}} get --raw /apis/route.openshift.io/v1 >/dev/null 2>&1; then + KUBECONFIG={{.KUBECONFIG}} {{.K8S_CLI}} annotate validatingwebhookconfiguration dpu-operator-validating-webhook-configuration cert-manager.io/inject-ca-from- --overwrite || true + KUBECONFIG={{.KUBECONFIG}} {{.K8S_CLI}} -n openshift-dpu-operator delete certificate dpu-operator-serving-cert --ignore-not-found=true + KUBECONFIG={{.KUBECONFIG}} {{.K8S_CLI}} -n openshift-dpu-operator delete issuer dpu-operator-selfsigned-issuer --ignore-not-found=true + KUBECONFIG={{.KUBECONFIG}} {{.K8S_CLI}} -n openshift-dpu-operator delete secret webhook-server-cert --ignore-not-found=true + KUBECONFIG={{.KUBECONFIG}} {{.K8S_CLI}} -n openshift-dpu-operator wait --for=create secret/webhook-server-cert --timeout=180s + KUBECONFIG={{.KUBECONFIG}} {{.K8S_CLI}} wait --for=jsonpath='{.webhooks[0].clientConfig.caBundle}' 
validatingwebhookconfiguration/dpu-operator-validating-webhook-configuration --timeout=180s + fi + + # Run post-deploy webhook reconciliation when webhook resources are enabled + # in the selected deployment profile. + reconcile-webhook-post-deploy: + internal: true + vars: + KUBECONFIG: '{{.KUBECONFIG}}' + cmds: + - | + if KUBECONFIG={{.KUBECONFIG}} {{.K8S_CLI}} get validatingwebhookconfiguration dpu-operator-validating-webhook-configuration >/dev/null 2>&1; then + task reconcile-validating-webhook-configs KUBECONFIG={{.KUBECONFIG}} + task reconcile-openshift-webhook-tls KUBECONFIG={{.KUBECONFIG}} fi ## Download envtest-setup locally if necessary @@ -199,18 +236,18 @@ tasks: vars: KUBECONFIG_DPU: "{{.KUBECONFIG_DPU}}" KUBECONFIG_HOST: "{{.KUBECONFIG_HOST}}" - - bin/kustomize build bin | KUBECONFIG="{{.KUBECONFIG_DPU}}" kubectl apply -f - - - bin/kustomize build bin | KUBECONFIG="{{.KUBECONFIG_HOST}}" kubectl apply -f - - - task: deploy-webhook-compat + - bin/kustomize build bin | KUBECONFIG="{{.KUBECONFIG_DPU}}" {{.K8S_CLI}} apply -f - + - bin/kustomize build bin | KUBECONFIG="{{.KUBECONFIG_HOST}}" {{.K8S_CLI}} apply -f - + - task: reconcile-webhook-post-deploy vars: KUBECONFIG: "{{.KUBECONFIG_DPU}}" - - task: deploy-webhook-compat + - task: reconcile-webhook-post-deploy vars: KUBECONFIG: "{{.KUBECONFIG_HOST}}" - - KUBECONFIG="{{.KUBECONFIG_DPU}}" kubectl -n openshift-dpu-operator wait --for=condition=Available deployment/dpu-operator-controller-manager --timeout=300s - - KUBECONFIG="{{.KUBECONFIG_HOST}}" kubectl -n openshift-dpu-operator wait --for=condition=Available deployment/dpu-operator-controller-manager --timeout=300s - - KUBECONFIG="{{.KUBECONFIG_DPU}}" kubectl -n openshift-dpu-operator wait --for=condition=ready pod --all --timeout=300s - - KUBECONFIG="{{.KUBECONFIG_HOST}}" kubectl -n openshift-dpu-operator wait --for=condition=ready pod --all --timeout=300s + - KUBECONFIG="{{.KUBECONFIG_DPU}}" {{.K8S_CLI}} -n openshift-dpu-operator wait 
--for=condition=Available deployment/dpu-operator-controller-manager --timeout=300s + - KUBECONFIG="{{.KUBECONFIG_HOST}}" {{.K8S_CLI}} -n openshift-dpu-operator wait --for=condition=Available deployment/dpu-operator-controller-manager --timeout=300s + - KUBECONFIG="{{.KUBECONFIG_DPU}}" {{.K8S_CLI}} -n openshift-dpu-operator wait --for=condition=ready pod --all --timeout=300s + - KUBECONFIG="{{.KUBECONFIG_HOST}}" {{.K8S_CLI}} -n openshift-dpu-operator wait --for=condition=ready pod --all --timeout=300s - echo "DPU operator deployment complete - controller manager and webhook are ready" deploy-1c: @@ -220,12 +257,24 @@ tasks: - task: undeploy-1c vars: KUBECONFIG_HOST: "{{.KUBECONFIG_HOST}}" - - bin/kustomize build bin | KUBECONFIG="{{.KUBECONFIG_HOST}}" kubectl apply -f - - - task: deploy-webhook-compat + - bin/kustomize build bin | KUBECONFIG="{{.KUBECONFIG_HOST}}" {{.K8S_CLI}} apply -f - + - task: reconcile-webhook-post-deploy vars: KUBECONFIG: "{{.KUBECONFIG_HOST}}" - - KUBECONFIG="{{.KUBECONFIG_HOST}}" kubectl -n openshift-dpu-operator wait --for=condition=Available deployment/dpu-operator-controller-manager --timeout=300s - - KUBECONFIG="{{.KUBECONFIG_HOST}}" kubectl -n openshift-dpu-operator wait --for=condition=ready pod --all --timeout=300s + - KUBECONFIG="{{.KUBECONFIG_HOST}}" {{.K8S_CLI}} -n openshift-dpu-operator wait --for=condition=Available deployment/dpu-operator-controller-manager --timeout=300s + - KUBECONFIG="{{.KUBECONFIG_HOST}}" {{.K8S_CLI}} -n openshift-dpu-operator wait --for=condition=ready pod --all --timeout=300s + + deploy-upstream: + cmds: + - task: deploy + vars: + DEPLOY_PROFILE: upstream + + deploy-upstream-1c: + cmds: + - task: deploy-1c + vars: + DEPLOY_PROFILE: upstream prepare-e2e-test: cmds: From 5a7e623f6e38104f684fd55ba6a55aae58aacc28 Mon Sep 17 00:00:00 2001 From: Sam DaSilva Date: Wed, 8 Apr 2026 16:11:36 -0400 Subject: [PATCH 13/13] feat: add upstream cert-manager deploy profile Add an upstream-certmanager deployment path for 
upstream installs that need cert-manager-backed webhook and CRD CA injection semantics. Introduce explicit NRI_TLS_PROVIDER modes (auto, openshift, cert-manager, disabled) so TLS behavior is deterministic across OpenShift, MicroShift, and generic Kubernetes environments. Add new task targets for upstream cert-manager deployments and document the new deployment flow and provider behavior without changing default OpenShift-oriented deploy behavior. Signed-off-by: Sam DaSilva --- README.md | 14 +++ .../cainjection_in_dpuoperatorconfigs.yaml | 8 ++ .../kustomization.yaml | 116 ++++++++++++++++++ .../manager_webhook_patch.yaml | 32 +++++ ...ove_openshift_serving_cert_annotation.yaml | 7 ++ .../webhookcainjection_patch.yaml | 13 ++ .../kustomization-upstream-certmanager.yaml | 5 + config/dev/local-images-template.yaml | 2 + config/dev/nri-tls-provider-certmanager.yaml | 13 ++ config/manager/manager.yaml | 2 + .../dpuoperatorconfig_controller.go | 54 ++++++-- taskfile.yaml | 98 ++++++++++++++- 12 files changed, 351 insertions(+), 13 deletions(-) create mode 100644 config/default-upstream-certmanager/cainjection_in_dpuoperatorconfigs.yaml create mode 100644 config/default-upstream-certmanager/kustomization.yaml create mode 100644 config/default-upstream-certmanager/manager_webhook_patch.yaml create mode 100644 config/default-upstream-certmanager/remove_openshift_serving_cert_annotation.yaml create mode 100644 config/default-upstream-certmanager/webhookcainjection_patch.yaml create mode 100644 config/dev/kustomization-upstream-certmanager.yaml create mode 100644 config/dev/nri-tls-provider-certmanager.yaml diff --git a/README.md b/README.md index feba32975..4e5ce61de 100644 --- a/README.md +++ b/README.md @@ -46,6 +46,20 @@ compatibility adjustments), use: task deploy-upstream ``` +To run upstream semantics with cert-manager-backed NRI TLS provisioning, use: + +```sh +task deploy-upstream-certmanager +``` + +The manager supports `NRI_TLS_PROVIDER` with the following values: 
+ +- `auto` (default): OpenShift/MicroShift uses OpenShift webhook cert path; + Kubernetes disables NRI webhook deployment unless explicitly enabled. +- `openshift`: force OpenShift webhook cert path. +- `cert-manager`: require cert-manager CRDs and use cert-manager TLS objects. +- `disabled`: skip NRI webhook deployment. + 4. **Undeploy** Undoes what deploying did: diff --git a/config/default-upstream-certmanager/cainjection_in_dpuoperatorconfigs.yaml b/config/default-upstream-certmanager/cainjection_in_dpuoperatorconfigs.yaml new file mode 100644 index 000000000..b112185f5 --- /dev/null +++ b/config/default-upstream-certmanager/cainjection_in_dpuoperatorconfigs.yaml @@ -0,0 +1,8 @@ +# The following patch adds a directive for cert-manager to inject CA into the +# DpuOperatorConfig CRD conversion webhook definition. +apiVersion: apiextensions.k8s.io/v1 +kind: CustomResourceDefinition +metadata: + annotations: + cert-manager.io/inject-ca-from: CERTIFICATE_NAMESPACE/CERTIFICATE_NAME + name: dpuoperatorconfigs.config.openshift.io diff --git a/config/default-upstream-certmanager/kustomization.yaml b/config/default-upstream-certmanager/kustomization.yaml new file mode 100644 index 000000000..4a6612679 --- /dev/null +++ b/config/default-upstream-certmanager/kustomization.yaml @@ -0,0 +1,116 @@ +# Adds namespace to all resources. +namespace: openshift-dpu-operator + +# Value of this field is prepended to the names of all resources. 
+namePrefix: dpu-operator- + +resources: +- ../crd +- ../rbac +- ../manager +- ../webhook +- ../certmanager + +patches: +- path: cainjection_in_dpuoperatorconfigs.yaml +- path: manager_webhook_patch.yaml +- path: webhookcainjection_patch.yaml +- path: remove_openshift_serving_cert_annotation.yaml + +replacements: + - source: # Add cert-manager annotation to ValidatingWebhookConfiguration, MutatingWebhookConfiguration and CRDs + kind: Certificate + group: cert-manager.io + version: v1 + name: serving-cert # this name should match the one in certificate.yaml + fieldPath: .metadata.namespace # namespace of the certificate CR + targets: + - select: + kind: ValidatingWebhookConfiguration + fieldPaths: + - .metadata.annotations.[cert-manager.io/inject-ca-from] + options: + delimiter: '/' + index: 0 + create: true + - select: + kind: MutatingWebhookConfiguration + fieldPaths: + - .metadata.annotations.[cert-manager.io/inject-ca-from] + options: + delimiter: '/' + index: 0 + create: true + - select: + kind: CustomResourceDefinition + fieldPaths: + - .metadata.annotations.[cert-manager.io/inject-ca-from] + options: + delimiter: '/' + index: 0 + create: true + - source: + kind: Certificate + group: cert-manager.io + version: v1 + name: serving-cert # this name should match the one in certificate.yaml + fieldPath: .metadata.name + targets: + - select: + kind: ValidatingWebhookConfiguration + fieldPaths: + - .metadata.annotations.[cert-manager.io/inject-ca-from] + options: + delimiter: '/' + index: 1 + create: true + - select: + kind: MutatingWebhookConfiguration + fieldPaths: + - .metadata.annotations.[cert-manager.io/inject-ca-from] + options: + delimiter: '/' + index: 1 + create: true + - select: + kind: CustomResourceDefinition + fieldPaths: + - .metadata.annotations.[cert-manager.io/inject-ca-from] + options: + delimiter: '/' + index: 1 + create: true + - source: # Add cert-manager annotation to the webhook Service + kind: Service + version: v1 + name: webhook-service + 
fieldPath: .metadata.name # name of the service + targets: + - select: + kind: Certificate + group: cert-manager.io + version: v1 + fieldPaths: + - .spec.dnsNames.0 + - .spec.dnsNames.1 + options: + delimiter: '.' + index: 0 + create: true + - source: + kind: Service + version: v1 + name: webhook-service + fieldPath: .metadata.namespace # namespace of the service + targets: + - select: + kind: Certificate + group: cert-manager.io + version: v1 + fieldPaths: + - .spec.dnsNames.0 + - .spec.dnsNames.1 + options: + delimiter: '.' + index: 1 + create: true diff --git a/config/default-upstream-certmanager/manager_webhook_patch.yaml b/config/default-upstream-certmanager/manager_webhook_patch.yaml new file mode 100644 index 000000000..316381be3 --- /dev/null +++ b/config/default-upstream-certmanager/manager_webhook_patch.yaml @@ -0,0 +1,32 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: controller-manager + namespace: system +spec: + template: + spec: + containers: + - name: manager + readinessProbe: + httpGet: + path: /readyz + port: 8081 + scheme: HTTP + initialDelaySeconds: 5 + periodSeconds: 10 + timeoutSeconds: 5 + failureThreshold: 3 + ports: + - containerPort: 9443 + name: webhook-server + protocol: TCP + volumeMounts: + - mountPath: /tmp/k8s-webhook-server/serving-certs + name: cert + readOnly: true + volumes: + - name: cert + secret: + defaultMode: 420 + secretName: webhook-server-cert diff --git a/config/default-upstream-certmanager/remove_openshift_serving_cert_annotation.yaml b/config/default-upstream-certmanager/remove_openshift_serving_cert_annotation.yaml new file mode 100644 index 000000000..4c8ae6723 --- /dev/null +++ b/config/default-upstream-certmanager/remove_openshift_serving_cert_annotation.yaml @@ -0,0 +1,7 @@ +apiVersion: v1 +kind: Service +metadata: + name: webhook-service + namespace: system + annotations: + service.beta.openshift.io/serving-cert-secret-name: null diff --git
a/config/default-upstream-certmanager/webhookcainjection_patch.yaml b/config/default-upstream-certmanager/webhookcainjection_patch.yaml new file mode 100644 index 000000000..25bef521c --- /dev/null +++ b/config/default-upstream-certmanager/webhookcainjection_patch.yaml @@ -0,0 +1,13 @@ +apiVersion: admissionregistration.k8s.io/v1 +kind: ValidatingWebhookConfiguration +metadata: + labels: + app.kubernetes.io/name: validatingwebhookconfiguration + app.kubernetes.io/instance: validating-webhook-configuration + app.kubernetes.io/component: webhook + app.kubernetes.io/created-by: dpu-operator + app.kubernetes.io/part-of: dpu-operator + app.kubernetes.io/managed-by: kustomize + name: validating-webhook-configuration + annotations: + cert-manager.io/inject-ca-from: CERTIFICATE_NAMESPACE/CERTIFICATE_NAME diff --git a/config/dev/kustomization-upstream-certmanager.yaml b/config/dev/kustomization-upstream-certmanager.yaml new file mode 100644 index 000000000..999b1ae8e --- /dev/null +++ b/config/dev/kustomization-upstream-certmanager.yaml @@ -0,0 +1,5 @@ +resources: +- ../config/default-upstream-certmanager +patches: +- path: local-images.yaml +- path: nri-tls-provider-certmanager.yaml diff --git a/config/dev/local-images-template.yaml b/config/dev/local-images-template.yaml index f007b0994..e29a4bcf1 100644 --- a/config/dev/local-images-template.yaml +++ b/config/dev/local-images-template.yaml @@ -34,5 +34,7 @@ spec: value: Always - name: NRIWebhookImage value: {{ .RegistryURL }}/network-resources-injector:dev + - name: NRI_TLS_PROVIDER + value: auto image: {{ .RegistryURL }}/dpu-operator:dev imagePullPolicy: Always diff --git a/config/dev/nri-tls-provider-certmanager.yaml b/config/dev/nri-tls-provider-certmanager.yaml new file mode 100644 index 000000000..8033e0079 --- /dev/null +++ b/config/dev/nri-tls-provider-certmanager.yaml @@ -0,0 +1,13 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: controller-manager + namespace: system +spec: + template: + spec: + 
containers: + - name: manager + env: + - name: NRI_TLS_PROVIDER + value: cert-manager diff --git a/config/manager/manager.yaml b/config/manager/manager.yaml index 6f8a8bb9b..ea39ef5bd 100644 --- a/config/manager/manager.yaml +++ b/config/manager/manager.yaml @@ -80,6 +80,8 @@ spec: value: quay.io/openshift/dpu-daemon:latest - name: NRIWebhookImage value: quay.io/openshift/dpu-network-resources-injector:latest + - name: NRI_TLS_PROVIDER + value: auto - name: intel_ipu value: quay.io/openshift/dpu-intel-ipu-vsp:latest - name: IntelVspP4Image diff --git a/internal/controller/dpuoperatorconfig_controller.go b/internal/controller/dpuoperatorconfig_controller.go index 8722e9e2b..237e0b165 100644 --- a/internal/controller/dpuoperatorconfig_controller.go +++ b/internal/controller/dpuoperatorconfig_controller.go @@ -20,6 +20,8 @@ import ( "context" "embed" "fmt" + "os" + "strings" "github.com/go-logr/logr" configv1 "github.com/openshift/dpu-operator/api/v1" @@ -43,6 +45,13 @@ var binData embed.FS const dpuOperatorConfigFinalizer = "config.openshift.io/dpuoperatorconfig-finalizer" +const ( + nriTLSProviderAuto = "auto" + nriTLSProviderOpenShift = "openshift" + nriTLSProviderCertManager = "cert-manager" + nriTLSProviderDisabled = "disabled" +) + type componentError struct { component string err error @@ -331,11 +340,18 @@ func (r *DpuOperatorConfigReconciler) ensureNetworkResourcesInjector(ctx context return fmt.Errorf("failed to detect cluster flavour for network resources injector: %w", err) } + provider, err := resolveNriTLSProvider(flavour) + if err != nil { + return err + } + + if provider == nriTLSProviderDisabled { + logger.Info("Skipping Network Resources Injector deployment", "flavour", flavour, "provider", provider) + return nil + } + binDataPath := "network-resources-injector" - switch flavour { - case utils.OpenShiftFlavour, utils.MicroShiftFlavour: - binDataPath = "network-resources-injector" - default: + if provider == nriTLSProviderCertManager { if err := 
r.ensureCertManagerInstalled(ctx, flavour); err != nil { return err } @@ -343,10 +359,34 @@ func (r *DpuOperatorConfigReconciler) ensureNetworkResourcesInjector(ctx context } logger.Info("Create Network Resources Injector") - logger.Info("Selected network resources injector manifest set", "flavour", flavour, "path", binDataPath) + logger.Info("Selected network resources injector manifest set", "flavour", flavour, "provider", provider, "path", binDataPath) return r.createAndApplyAllFromBinData(logger, binDataPath, cfg) } +func resolveNriTLSProvider(flavour utils.Flavour) (string, error) { + // NRI_TLS_PROVIDER controls which TLS provisioning path to use: + // - auto: OpenShift/MicroShift -> openshift, Kubernetes -> disabled + // - openshift: OpenShift service-ca/serving-cert annotations + // - cert-manager: cert-manager issuer/certificate flow + // - disabled: skip NRI webhook deployment + provider := strings.ToLower(strings.TrimSpace(os.Getenv("NRI_TLS_PROVIDER"))) + if provider == "" || provider == nriTLSProviderAuto { + switch flavour { + case utils.OpenShiftFlavour, utils.MicroShiftFlavour: + return nriTLSProviderOpenShift, nil + default: + return nriTLSProviderDisabled, nil + } + } + + switch provider { + case nriTLSProviderOpenShift, nriTLSProviderCertManager, nriTLSProviderDisabled: + return provider, nil + default: + return "", fmt.Errorf("unsupported NRI_TLS_PROVIDER value %q (supported: auto, openshift, cert-manager, disabled)", provider) + } +} + func (r *DpuOperatorConfigReconciler) ensureCertManagerInstalled(ctx context.Context, flavour utils.Flavour) error { requiredCRDs := []string{ "certificates.cert-manager.io", @@ -357,9 +397,9 @@ func (r *DpuOperatorConfigReconciler) ensureCertManagerInstalled(ctx context.Con crd := &apiextensionsv1.CustomResourceDefinition{} if err := r.Get(ctx, types.NamespacedName{Name: crdName}, crd); err != nil { if apierrors.IsNotFound(err) { - return fmt.Errorf("cert-manager is required on %s clusters for 
network-resources-injector TLS provisioning: missing CRD %q", flavour, crdName) + return fmt.Errorf("cert-manager is required when NRI_TLS_PROVIDER=cert-manager on %s cluster: missing CRD %q", flavour, crdName) } - return fmt.Errorf("failed to verify cert-manager CRD %q on %s cluster: %w", crdName, flavour, err) + return fmt.Errorf("failed to verify cert-manager CRD %q on %s cluster when NRI_TLS_PROVIDER=cert-manager: %w", crdName, flavour, err) } } diff --git a/taskfile.yaml b/taskfile.yaml index f33ff7c20..057a7ed3f 100644 --- a/taskfile.yaml +++ b/taskfile.yaml @@ -144,6 +144,9 @@ tasks: - | if [ "{{.DEPLOY_PROFILE}}" = "upstream" ]; then cp config/dev/kustomization-upstream.yaml bin/kustomization.yaml + elif [ "{{.DEPLOY_PROFILE}}" = "upstream-certmanager" ]; then + cp config/dev/kustomization-upstream-certmanager.yaml bin/kustomization.yaml + cp config/dev/nri-tls-provider-certmanager.yaml bin/nri-tls-provider-certmanager.yaml else cp config/dev/kustomization.yaml bin/kustomization.yaml fi @@ -151,7 +154,6 @@ tasks: # Remove stale duplicate validating webhook configurations that conflict with # dpu-operator's canonical webhook name. reconcile-validating-webhook-configs: - internal: true vars: KUBECONFIG: '{{.KUBECONFIG}}' cmds: @@ -193,7 +195,6 @@ tasks: # On OpenShift clusters, reconcile webhook cert objects so OpenShift serving # cert and webhook CA bundle propagation can settle on the active owner. 
reconcile-openshift-webhook-tls: - internal: true vars: KUBECONFIG: '{{.KUBECONFIG}}' cmds: @@ -203,10 +204,77 @@ tasks: KUBECONFIG={{.KUBECONFIG}} {{.K8S_CLI}} -n openshift-dpu-operator delete certificate dpu-operator-serving-cert --ignore-not-found=true KUBECONFIG={{.KUBECONFIG}} {{.K8S_CLI}} -n openshift-dpu-operator delete issuer dpu-operator-selfsigned-issuer --ignore-not-found=true KUBECONFIG={{.KUBECONFIG}} {{.K8S_CLI}} -n openshift-dpu-operator delete secret webhook-server-cert --ignore-not-found=true - KUBECONFIG={{.KUBECONFIG}} {{.K8S_CLI}} -n openshift-dpu-operator wait --for=create secret/webhook-server-cert --timeout=180s - KUBECONFIG={{.KUBECONFIG}} {{.K8S_CLI}} wait --for=jsonpath='{.webhooks[0].clientConfig.caBundle}' validatingwebhookconfiguration/dpu-operator-validating-webhook-configuration --timeout=180s + task wait-webhook-ca-sync KUBECONFIG={{.KUBECONFIG}} fi + # Wait until webhook serving certificate and validating webhook CA bundle are + # both present and reference the same certificate material. 
+ wait-webhook-ca-sync: + vars: + KUBECONFIG: '{{.KUBECONFIG}}' + cmds: + - KUBECONFIG={{.KUBECONFIG}} {{.K8S_CLI}} -n openshift-dpu-operator wait --for=create secret/webhook-server-cert --timeout=180s + - KUBECONFIG={{.KUBECONFIG}} {{.K8S_CLI}} wait --for=jsonpath='{.webhooks[0].clientConfig.caBundle}' validatingwebhookconfiguration/dpu-operator-validating-webhook-configuration --timeout=180s + - | + KUBECONFIG={{.KUBECONFIG}} K8S_CLI={{.K8S_CLI}} python3 - <<'PY' + import base64 + import json + import os + import subprocess + import time + + cli = os.environ.get("K8S_CLI", "kubectl") + env = dict(os.environ) + + def oc_json(args): + out = subprocess.check_output([cli, *args], env=env) + return json.loads(out) + + def cert_fp(pem_bytes): + out = subprocess.check_output( + ["openssl", "x509", "-noout", "-fingerprint", "-sha256"], + input=pem_bytes, + ).decode().strip() + return out.split("=", 1)[1] + + timeout_s = 180 + deadline = time.time() + timeout_s + while time.time() < deadline: + secret = oc_json([ + "-n", + "openshift-dpu-operator", + "get", + "secret", + "webhook-server-cert", + "-o", + "json", + ]) + webhook = oc_json([ + "get", + "validatingwebhookconfiguration", + "dpu-operator-validating-webhook-configuration", + "-o", + "json", + ]) + + tls_b64 = secret.get("data", {}).get("tls.crt", "") + ca_b64 = webhook.get("webhooks", [{}])[0].get("clientConfig", {}).get("caBundle", "") + if not tls_b64 or not ca_b64: + time.sleep(2) + continue + + tls_fp = cert_fp(base64.b64decode(tls_b64)) + ca_fp = cert_fp(base64.b64decode(ca_b64)) + if tls_fp == ca_fp: + print(f"Webhook certificate and CA bundle are synchronized: {tls_fp}") + raise SystemExit(0) + + print(f"Waiting for webhook CA sync (secret={tls_fp}, cabundle={ca_fp})") + time.sleep(2) + + raise SystemExit("timed out waiting for webhook CA bundle to match webhook-server-cert") + PY + # Run post-deploy webhook reconciliation when webhook resources are enabled # in the selected deployment profile. 
reconcile-webhook-post-deploy: @@ -217,9 +285,15 @@ tasks: - | if KUBECONFIG={{.KUBECONFIG}} {{.K8S_CLI}} get validatingwebhookconfiguration dpu-operator-validating-webhook-configuration >/dev/null 2>&1; then task reconcile-validating-webhook-configs KUBECONFIG={{.KUBECONFIG}} - task reconcile-openshift-webhook-tls KUBECONFIG={{.KUBECONFIG}} + if [ "{{.DEPLOY_PROFILE}}" = "upstream-certmanager" ]; then + task wait-webhook-ca-sync KUBECONFIG={{.KUBECONFIG}} + KUBECONFIG={{.KUBECONFIG}} {{.K8S_CLI}} -n openshift-dpu-operator rollout restart deployment/dpu-operator-controller-manager + KUBECONFIG={{.KUBECONFIG}} {{.K8S_CLI}} -n openshift-dpu-operator rollout status deployment/dpu-operator-controller-manager --timeout=300s + else + task reconcile-openshift-webhook-tls KUBECONFIG={{.KUBECONFIG}} + fi fi - + ## Download envtest-setup locally if necessary envtest: status: @@ -276,6 +350,18 @@ tasks: vars: DEPLOY_PROFILE: upstream + deploy-upstream-certmanager: + cmds: + - task: deploy + vars: + DEPLOY_PROFILE: upstream-certmanager + + deploy-upstream-certmanager-1c: + cmds: + - task: deploy-1c + vars: + DEPLOY_PROFILE: upstream-certmanager + prepare-e2e-test: cmds: - >