diff --git a/ci-operator/step-registry/servicemesh/sail-operator/e2e-ocp/servicemesh-sail-operator-e2e-ocp-commands.sh b/ci-operator/step-registry/servicemesh/sail-operator/e2e-ocp/servicemesh-sail-operator-e2e-ocp-commands.sh index d224a3b61de63..319c0683aa035 100644 --- a/ci-operator/step-registry/servicemesh/sail-operator/e2e-ocp/servicemesh-sail-operator-e2e-ocp-commands.sh +++ b/ci-operator/step-registry/servicemesh/sail-operator/e2e-ocp/servicemesh-sail-operator-e2e-ocp-commands.sh @@ -34,6 +34,37 @@ readonly RETRY_SLEEP_INTERVAL=30 # --- Functions --- +# collect_oom_debug_info gathers debugging information when a pod exits with 137 (OOM/SIGKILL). +# This helps determine if the pod was killed due to pod limits or node-level OOM. +collect_oom_debug_info() { + echo "=== OOM DEBUG INFO: Collecting diagnostics for exit code 137 ===" + + echo "--- Pod description for ${MAISTRA_SC_POD} ---" + oc describe pod "${MAISTRA_SC_POD}" -n "${MAISTRA_NAMESPACE}" 2>&1 || echo "Failed to describe pod" + + local node_name + node_name=$(oc get pod "${MAISTRA_SC_POD}" -n "${MAISTRA_NAMESPACE}" -o jsonpath='{.spec.nodeName}' 2>/dev/null || echo "") + + echo "--- Events from namespace ${MAISTRA_NAMESPACE} ---" + oc get events -n "${MAISTRA_NAMESPACE}" --sort-by='.lastTimestamp' 2>&1 || echo "Failed to get namespace events" + + if [[ -n "${node_name}" ]]; then + echo "--- Node ${node_name} description ---" + oc describe node "${node_name}" 2>&1 || echo "Failed to describe node" + + echo "--- Events from node ${node_name} ---" + oc get events --field-selector "involvedObject.name=${node_name}" --all-namespaces --sort-by='.lastTimestamp' 2>&1 || echo "Failed to get node events" + + echo "--- All pods on node ${node_name} with resource usage ---" + oc adm top pods --all-namespaces --selector="spec.nodeName=${node_name}" 2>&1 || echo "Failed to get pod metrics (metrics-server may not be available)" + oc get pods --all-namespaces --field-selector "spec.nodeName=${node_name}" -o wide 2>&1 || echo "Failed to list pods on node" + else + echo "WARNING: Could not determine node name for pod ${MAISTRA_SC_POD}" + fi + + echo "=== END OOM DEBUG INFO ===" +} + # check_cluster_operators waits up to 15 minutes for all OpenShift cluster # operators to be in a stable (not Progressing, not Degraded, and Available) state. check_cluster_operators() { @@ -127,6 +158,11 @@ execute_and_collect_artifacts() { test_rc=$? echo "Test run (attempt ${attempt}) completed with exit code ${test_rc}" + # Collect debug info if pod was killed (likely OOM) + if [[ "${test_rc}" -eq 137 ]]; then + collect_oom_debug_info + fi + echo "Copying artifacts from test pod after attempt ${attempt}..." oc cp "${MAISTRA_NAMESPACE}"/"${MAISTRA_SC_POD}":"${ARTIFACT_DIR}"/. "${ARTIFACT_DIR}"