From 1765f7caef92cce2a31430156fa7706ad756ba0b Mon Sep 17 00:00:00 2001 From: Jamo Luhrsen Date: Wed, 28 Jan 2026 19:30:58 -0800 Subject: [PATCH] DNM/TEST: chasing dualstack v6primary test perma-failures Signed-off-by: Jamo Luhrsen --- test/e2e/network/dns.go | 34 ++++++++++++++++++++++----- test/e2e/network/dns_common.go | 43 +++++++++++++++++++++++++++++++--- test/e2e/node/kubelet_authz.go | 39 ++++++++++++++++++++---------- 3 files changed, 95 insertions(+), 21 deletions(-) diff --git a/test/e2e/network/dns.go b/test/e2e/network/dns.go index 9a87ba4c065ae..072be27832b7d 100644 --- a/test/e2e/network/dns.go +++ b/test/e2e/network/dns.go @@ -34,6 +34,7 @@ import ( e2eskipper "k8s.io/kubernetes/test/e2e/framework/skipper" "k8s.io/kubernetes/test/e2e/network/common" imageutils "k8s.io/kubernetes/test/utils/image" + utilnet "k8s.io/utils/net" admissionapi "k8s.io/pod-security-admission/api" "github.com/onsi/ginkgo/v2" @@ -42,6 +43,18 @@ import ( const dnsTestPodHostName = "dns-querier-1" const dnsTestServiceName = "dns-test-service" +// getClusterPrimaryIPFamily reports whether the cluster is using IPv6 as its primary IP family +// (true when the kubernetes.default service ClusterIP is an IPv6 address). This is more reliable than +// framework.TestContext.ClusterIsIPv6(), which returns false for dualstack v6-primary clusters. 
+func getClusterPrimaryIPFamily(ctx context.Context, f *framework.Framework) bool { + kubeService, err := f.ClientSet.CoreV1().Services("default").Get(ctx, "kubernetes", metav1.GetOptions{}) + framework.ExpectNoError(err, "failed to get kubernetes.default service") + isIPv6 := utilnet.IsIPv6String(kubeService.Spec.ClusterIP) + framework.Logf("DEBUG-DNS: Detected cluster primary IP family: IPv6=%v (kubernetes.default ClusterIP=%s)", + isIPv6, kubeService.Spec.ClusterIP) + return isIPv6 +} + var _ = common.SIGDescribe("DNS", func() { f := framework.NewDefaultFramework("dns") f.NamespacePodSecurityLevel = admissionapi.LevelBaseline @@ -58,13 +71,16 @@ var _ = common.SIGDescribe("DNS", func() { namesToResolve := []string{ fmt.Sprintf("kubernetes.default.svc.%s", framework.TestContext.ClusterDNSDomain), } + // Detect actual cluster IP family instead of using ClusterIsIPv6() which returns false + // for dualstack v6-primary clusters. + isIPv6 := getClusterPrimaryIPFamily(ctx, f) // TODO: Validate both IPv4 and IPv6 families for dual-stack // TODO: We should change this whole test mechanism to run the same probes // against a known list of different base images. Agnhost happens to be alpine // (MUSL libc) for the moment, and jessie is (an old version of) libc. 
- agnhostProbeCmd, agnhostFileNames := createProbeCommand(namesToResolve, nil, "", "agnhost", f.Namespace.Name, framework.TestContext.ClusterDNSDomain, framework.TestContext.ClusterIsIPv6()) + agnhostProbeCmd, agnhostFileNames := createProbeCommand(namesToResolve, nil, "", "agnhost", f.Namespace.Name, framework.TestContext.ClusterDNSDomain, isIPv6) agnhostProber := dnsQuerier{name: "agnhost", image: imageutils.Agnhost, cmd: agnhostProbeCmd} - jessieProbeCmd, jessieFileNames := createProbeCommand(namesToResolve, nil, "", "jessie", f.Namespace.Name, framework.TestContext.ClusterDNSDomain, framework.TestContext.ClusterIsIPv6()) + jessieProbeCmd, jessieFileNames := createProbeCommand(namesToResolve, nil, "", "jessie", f.Namespace.Name, framework.TestContext.ClusterDNSDomain, isIPv6) jessieProber := dnsQuerier{name: "jessie", image: imageutils.JessieDnsutils, cmd: jessieProbeCmd} ginkgo.By("Running these commands on agnhost: " + agnhostProbeCmd + "\n") ginkgo.By("Running these commands on jessie: " + jessieProbeCmd + "\n") @@ -238,10 +254,13 @@ var _ = common.SIGDescribe("DNS", func() { fmt.Sprintf("_http._tcp.%s.%s.svc", regularService.Name, f.Namespace.Name), } + // Detect actual cluster IP family instead of using ClusterIsIPv6() which returns false + // for dualstack v6-primary clusters. 
+ isIPv6 := getClusterPrimaryIPFamily(ctx, f) // TODO: Validate both IPv4 and IPv6 families for dual-stack - agnhostProbeCmd, agnhostFileNames := createProbeCommand(namesToResolve, nil, regularService.Spec.ClusterIP, "agnhost", f.Namespace.Name, framework.TestContext.ClusterDNSDomain, framework.TestContext.ClusterIsIPv6()) + agnhostProbeCmd, agnhostFileNames := createProbeCommand(namesToResolve, nil, regularService.Spec.ClusterIP, "agnhost", f.Namespace.Name, framework.TestContext.ClusterDNSDomain, isIPv6) agnhostProber := dnsQuerier{name: "agnhost", image: imageutils.Agnhost, cmd: agnhostProbeCmd} - jessieProbeCmd, jessieFileNames := createProbeCommand(namesToResolve, nil, regularService.Spec.ClusterIP, "jessie", f.Namespace.Name, framework.TestContext.ClusterDNSDomain, framework.TestContext.ClusterIsIPv6()) + jessieProbeCmd, jessieFileNames := createProbeCommand(namesToResolve, nil, regularService.Spec.ClusterIP, "jessie", f.Namespace.Name, framework.TestContext.ClusterDNSDomain, isIPv6) jessieProber := dnsQuerier{name: "jessie", image: imageutils.JessieDnsutils, cmd: jessieProbeCmd} ginkgo.By("Running these commands on agnhost: " + agnhostProbeCmd + "\n") ginkgo.By("Running these commands on jessie: " + jessieProbeCmd + "\n") @@ -635,10 +654,13 @@ var _ = common.SIGDescribe("DNS", func() { } hostFQDN := fmt.Sprintf("%s.%s.%s.svc.%s", dnsTestPodHostName, dnsTestServiceName, f.Namespace.Name, framework.TestContext.ClusterDNSDomain) hostEntries := []string{hostFQDN, dnsTestPodHostName} + // Detect actual cluster IP family instead of using ClusterIsIPv6() which returns false + // for dualstack v6-primary clusters. 
+ isIPv6 := getClusterPrimaryIPFamily(ctx, f) // TODO: Validate both IPv4 and IPv6 families for dual-stack - agnhostProbeCmd, agnhostFileNames := createProbeCommand(namesToResolve, hostEntries, "", "agnhost", f.Namespace.Name, framework.TestContext.ClusterDNSDomain, framework.TestContext.ClusterIsIPv6()) + agnhostProbeCmd, agnhostFileNames := createProbeCommand(namesToResolve, hostEntries, "", "agnhost", f.Namespace.Name, framework.TestContext.ClusterDNSDomain, isIPv6) agnhostProber := dnsQuerier{name: "agnhost", image: imageutils.Agnhost, cmd: agnhostProbeCmd} - jessieProbeCmd, jessieFileNames := createProbeCommand(namesToResolve, hostEntries, "", "jessie", f.Namespace.Name, framework.TestContext.ClusterDNSDomain, framework.TestContext.ClusterIsIPv6()) + jessieProbeCmd, jessieFileNames := createProbeCommand(namesToResolve, hostEntries, "", "jessie", f.Namespace.Name, framework.TestContext.ClusterDNSDomain, isIPv6) jessieProber := dnsQuerier{name: "jessie", image: imageutils.JessieDnsutils, cmd: jessieProbeCmd} ginkgo.By("Running these commands on agnhost: " + agnhostProbeCmd + "\n") ginkgo.By("Running these commands on jessie: " + jessieProbeCmd + "\n") diff --git a/test/e2e/network/dns_common.go b/test/e2e/network/dns_common.go index 0e007ea62f98a..34ec6cbaabf16 100644 --- a/test/e2e/network/dns_common.go +++ b/test/e2e/network/dns_common.go @@ -411,6 +411,9 @@ func createProbeCommand(namesToResolve []string, hostEntries []string, ptrLookup fileNames := make([]string, 0, len(namesToResolve)*2) probeCmd := "for i in `seq 1 600`; do " dnsRecord := "A" + + framework.Logf("DEBUG-DNS: Creating probe for IPv6=%v, names=%v, hostEntries=%v, ptrLookupIP=%s", + isIPv6, namesToResolve, hostEntries, ptrLookupIP) if isIPv6 { dnsRecord = "AAAA" } @@ -473,12 +476,22 @@ func assertFilesExist(ctx context.Context, fileNames []string, fileDir string, p func assertFilesContain(ctx context.Context, fileNames []string, fileDir string, pod *v1.Pod, client clientset.Interface, check 
bool, expected string) { var failed []string + framework.Logf("DEBUG-DNS: Starting to check for %d result files from pod %s/%s", len(fileNames), pod.Namespace, pod.Name) + framework.Logf("DEBUG-DNS: Files to check: %v", fileNames) + framework.Logf("DEBUG-DNS: Pod IP=%s, Host IP=%s, Node=%s", pod.Status.PodIP, pod.Status.HostIP, pod.Spec.NodeName) + + pollStartTime := time.Now() framework.ExpectNoError(wait.PollUntilContextTimeout(ctx, time.Second*5, time.Second*600, true, func(ctx context.Context) (bool, error) { failed = []string{} + succeeded := []string{} + elapsed := time.Since(pollStartTime).Round(time.Second) ctx, cancel := context.WithTimeout(ctx, framework.SingleCallTimeout) defer cancel() + framework.Logf("DEBUG-DNS: Poll attempt at T+%v: checking %d files from pod %s/%s", + elapsed, len(fileNames), pod.Namespace, pod.Name) + for _, fileName := range fileNames { contents, err := client.CoreV1().RESTClient().Get(). Namespace(pod.Namespace). @@ -489,6 +502,8 @@ func assertFilesContain(ctx context.Context, fileNames []string, fileDir string, Do(ctx).Raw() if err != nil { + framework.Logf("DEBUG-DNS: [T+%v] FAILED to read %s from pod %s/%s: %v (timeout=%v)", + elapsed, fileName, pod.Namespace, pod.Name, err, ctx.Err() != nil) if ctx.Err() != nil { return false, fmt.Errorf("Unable to read %s from pod %s/%s: %v", fileName, pod.Namespace, pod.Name, err) } else { @@ -496,26 +511,42 @@ func assertFilesContain(ctx context.Context, fileNames []string, fileDir string, } failed = append(failed, fileName) } else if check && strings.TrimSpace(string(contents)) != expected { - framework.Logf("File %s from pod %s/%s contains '%s' instead of '%s'", fileName, pod.Namespace, pod.Name, string(contents), expected) + framework.Logf("DEBUG-DNS: [T+%v] MISMATCH in %s from pod %s/%s: got '%s', expected '%s'", + elapsed, fileName, pod.Namespace, pod.Name, string(contents), expected) failed = append(failed, fileName) + } else { + framework.Logf("DEBUG-DNS: [T+%v] SUCCESS reading %s 
from pod %s/%s (content: %q)", + elapsed, fileName, pod.Namespace, pod.Name, string(contents)) + succeeded = append(succeeded, fileName) } } + if len(failed) == 0 { + framework.Logf("DEBUG-DNS: [T+%v] All %d files successfully read! Test complete.", elapsed, len(fileNames)) return true, nil } - framework.Logf("Lookups using %s/%s failed for: %v\n", pod.Namespace, pod.Name, failed) + framework.Logf("DEBUG-DNS: [T+%v] Poll result: %d succeeded %v, %d still pending %v", + elapsed, len(succeeded), succeeded, len(failed), failed) // grab logs from all the containers + framework.Logf("DEBUG-DNS: [T+%v] Fetching container logs to diagnose failures", elapsed) for _, container := range pod.Spec.Containers { logs, err := e2epod.GetPodLogs(ctx, client, pod.Namespace, pod.Name, container.Name) if err != nil { return false, fmt.Errorf("unexpected error getting pod client logs for %s: %v", container.Name, err) } - framework.Logf("Pod client logs for %s: %s", container.Name, logs) + framework.Logf("DEBUG-DNS: [T+%v] Container %s logs:\n%s", elapsed, container.Name, logs) } return false, nil })) + + totalElapsed := time.Since(pollStartTime).Round(time.Second) + if len(failed) > 0 { + framework.Logf("DEBUG-DNS: TIMEOUT after %v - %d files never appeared: %v", totalElapsed, len(failed), failed) + } else { + framework.Logf("DEBUG-DNS: All files found after %v", totalElapsed) + } gomega.Expect(failed).To(gomega.BeEmpty()) } @@ -537,6 +568,12 @@ func validateDNSResults(ctx context.Context, f *framework.Framework, pod *v1.Pod if err != nil { framework.Failf("ginkgo.Failed to get pod %s/%s: %v", pod.Namespace, pod.Name, err) } + + framework.Logf("DEBUG-DNS: Starting DNS validation for pod %s/%s", pod.Namespace, pod.Name) + framework.Logf("DEBUG-DNS: Pod details - IP: %s, HostIP: %s, Node: %s, Phase: %s", + pod.Status.PodIP, pod.Status.HostIP, pod.Spec.NodeName, pod.Status.Phase) + framework.Logf("DEBUG-DNS: Expecting %d result files: %v", len(fileNames), fileNames) + // Try to find 
results for each expected name. ginkgo.By("looking for the results for each expected name from probers") assertFilesExist(ctx, fileNames, "results", pod, f.ClientSet) diff --git a/test/e2e/node/kubelet_authz.go b/test/e2e/node/kubelet_authz.go index 13484db03c952..8645dee011cd6 100644 --- a/test/e2e/node/kubelet_authz.go +++ b/test/e2e/node/kubelet_authz.go @@ -34,6 +34,7 @@ import ( e2epod "k8s.io/kubernetes/test/e2e/framework/pod" e2eoutput "k8s.io/kubernetes/test/e2e/framework/pod/output" admissionapi "k8s.io/pod-security-admission/api" + utilnet "k8s.io/utils/net" ) var _ = SIGDescribe(framework.WithFeatureGate(features.KubeletFineGrainedAuthz), func() { @@ -134,26 +135,40 @@ func runKubeletAuthzTest(ctx context.Context, f *framework.Framework, endpoint, ginkgo.By(fmt.Sprintf("Creating Pod %s in namespace %s with serviceaccount %s", pod.Name, pod.Namespace, pod.Spec.ServiceAccountName)) - _ = e2epod.NewPodClient(f).CreateSync(ctx, pod) + createdPod := e2epod.NewPodClient(f).CreateSync(ctx, pod) ginkgo.By("Running command in Pod") var hostWarpStart, hostWarpEnd string - // IPv6 host must be wrapped within [] if you specify a port. - if framework.TestContext.ClusterIsIPv6() { + // IPv6 addresses must be wrapped in brackets when specifying a port in URLs (RFC 3986). + // Check the actual node IP rather than using ClusterIsIPv6(), which returns false for dualstack. 
+ isIPv6 := utilnet.IsIPv6String(createdPod.Status.HostIP) + if isIPv6 { hostWarpStart = "[" hostWarpEnd = "]" } - result := e2eoutput.RunHostCmdOrDie(ns, - pod.Name, - fmt.Sprintf("curl -XGET -sIk -o /dev/null -w '%s' --header \"Authorization: Bearer `%s`\" https://%s$NODE_IP%s:%d/%s", - "%{http_code}", - "cat /var/run/secrets/kubernetes.io/serviceaccount/token", - hostWarpStart, - hostWarpEnd, - ports.KubeletPort, - endpoint)) + // Debug logging for v6 environments + framework.Logf("DEBUG-KUBELET: Pod %s/%s testing endpoint: %s", ns, pod.Name, endpoint) + framework.Logf("DEBUG-KUBELET: createdPod.Status.HostIP = %q", createdPod.Status.HostIP) + framework.Logf("DEBUG-KUBELET: utilnet.IsIPv6String returned %v", isIPv6) + framework.Logf("DEBUG-KUBELET: Using brackets: start=%q end=%q", hostWarpStart, hostWarpEnd) + + // Check what $NODE_IP actually is inside the pod + nodeIPValue := e2eoutput.RunHostCmdOrDie(ns, pod.Name, "echo $NODE_IP") + framework.Logf("DEBUG-KUBELET: $NODE_IP inside pod = %q", nodeIPValue) + + curlCmd := fmt.Sprintf("curl -XGET -sIk -o /dev/null -w '%s' --header \"Authorization: Bearer `%s`\" https://%s$NODE_IP%s:%d/%s", + "%{http_code}", + "cat /var/run/secrets/kubernetes.io/serviceaccount/token", + hostWarpStart, + hostWarpEnd, + ports.KubeletPort, + endpoint) + framework.Logf("DEBUG-KUBELET: Executing curl command: %s", curlCmd) + + result := e2eoutput.RunHostCmdOrDie(ns, pod.Name, curlCmd) + framework.Logf("DEBUG-KUBELET: curl result (HTTP status code): %s", result) return result }