@@ -34,6 +34,37 @@ readonly RETRY_SLEEP_INTERVAL=30

# --- Functions ---

# collect_oom_debug_info gathers debugging information when a pod exits with 137 (OOM/SIGKILL).
# This helps determine if the pod was killed due to pod limits or node-level OOM.
collect_oom_debug_info() {
echo "=== OOM DEBUG INFO: Collecting diagnostics for exit code 137 ==="

echo "--- Pod description for ${MAISTRA_SC_POD} ---"
oc describe pod "${MAISTRA_SC_POD}" -n "${MAISTRA_NAMESPACE}" 2>&1 || echo "Failed to describe pod"

local node_name
node_name=$(oc get pod "${MAISTRA_SC_POD}" -n "${MAISTRA_NAMESPACE}" -o jsonpath='{.spec.nodeName}' 2>/dev/null || echo "")

echo "--- Events from namespace ${MAISTRA_NAMESPACE} ---"
oc get events -n "${MAISTRA_NAMESPACE}" --sort-by='.lastTimestamp' 2>&1 || echo "Failed to get namespace events"

if [[ -n "${node_name}" ]]; then
echo "--- Node ${node_name} description ---"
oc describe node "${node_name}" 2>&1 || echo "Failed to describe node"

echo "--- Events from node ${node_name} ---"
oc get events --field-selector "involvedObject.name=${node_name}" --all-namespaces --sort-by='.lastTimestamp' 2>&1 || echo "Failed to get node events"

echo "--- All pods on node ${node_name} with resource usage ---"
oc adm top pods --all-namespaces --selector="spec.nodeName=${node_name}" 2>&1 || echo "Failed to get pod metrics (metrics-server may not be available)"
oc get pods --all-namespaces --field-selector "spec.nodeName=${node_name}" -o wide 2>&1 || echo "Failed to list pods on node"
Comment on lines +58 to +60

⚠️ Potential issue | 🟠 Major | ⚡ Quick win

🧩 Analysis chain

🌐 Web query:

For oc adm top pods (or kubectl top pods), does --selector support field selectors like spec.nodeName=, or only label selectors? What is the recommended way to get resource usage only for pods scheduled on a specific node?

💡 Result:

Both oc adm top pods and kubectl top pods support both --selector (or -l) for label selectors and --field-selector for field selectors [1][2][3]. Field selectors support spec.nodeName=<node> for Pods in general [4], and the commands document --field-selector with examples like key1=value1 [1][3]. However, when using --field-selector spec.nodeName=<node>, it fails with "spec.nodeName is not a known field selector: only metadata.name, metadata.namespace" because the Metrics API (used by top) only supports filtering on those metadata fields [5]. This is an ongoing limitation as of 2025-2026, with open issues/PRs for broader support [5][6]. The recommended way to get resource usage for pods on a specific node is a two-step script: first list pods with kubectl get pods -A --field-selector spec.nodeName=<node-name> -o custom-columns=NAMESPACE:.metadata.namespace,NAME:.metadata.name --no-headers, then pipe to xargs for kubectl top pod <pod> -n <ns> --no-headers [7][5][8]. For OpenShift, use oc equivalents. Alternatively, label nodes and use label selectors on pods if node labels propagate [8].

oc adm top pods cannot filter by spec.nodeName due to Metrics API limitations

Line 59 attempts to filter pod metrics by node using --selector="spec.nodeName=...", but this will not work. The Metrics API (used by oc adm top pods) only supports filtering on metadata.name and metadata.namespace—not spec.nodeName. Any attempt to filter by node selector or field selector will be ignored or fail.

To get resource usage for pods on a specific node, use a two-step approach: First, list pods on the node with oc get pods -A --field-selector spec.nodeName=<node-name> -o custom-columns=NAMESPACE:.metadata.namespace,NAME:.metadata.name --no-headers, then pipe the results to query metrics for each pod individually with oc top pod <pod> -n <namespace>.

🤖 Prompt for AI Agents
Verify each finding against current code. Fix only still-valid issues, skip the
rest with a brief reason, keep changes minimal, and validate.

In
`@ci-operator/step-registry/servicemesh/sail-operator/e2e-ocp/servicemesh-sail-operator-e2e-ocp-commands.sh`
around lines 58-60, the current use of oc adm top pods with
--selector="spec.nodeName=..." won't work because the Metrics API cannot filter
by spec.nodeName; change the logic to first list pods on the node using the
existing oc get pods --all-namespaces --field-selector
"spec.nodeName=${node_name}" -o
custom-columns=NAMESPACE:.metadata.namespace,NAME:.metadata.name --no-headers
and then iterate those namespace/name pairs to call oc adm top pod <NAME> -n
<NAMESPACE> (or oc top pod) for each pod to retrieve per-pod metrics, while
keeping the existing fallbacks/messages when metrics-server is unavailable.
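The two-step approach the reviewer recommends can be sketched as below. This is an illustrative mock, not code from the PR: `list_node_pods` stands in for the real `oc get pods --all-namespaces --field-selector "spec.nodeName=${node_name}" -o custom-columns=NAMESPACE:.metadata.namespace,NAME:.metadata.name --no-headers` call, and the `echo` stands in for actually invoking `oc adm top pod`, so the iteration pattern can be run without a cluster.

```shell
#!/usr/bin/env bash
# Mock of step 1: the core API *does* support the spec.nodeName field
# selector, so this would list every pod scheduled on the node as
# "<namespace> <name>" pairs. Sample data used here instead of oc.
list_node_pods() {
  printf '%s\n' \
    "istio-system istiod-abc" \
    "maistra-e2e test-pod"
}

# Step 2: the Metrics API only filters on metadata.name and
# metadata.namespace, so per-pod metrics must be fetched one at a time.
top_node_pods() {
  list_node_pods | while read -r ns pod; do
    # In the real script this line would execute the command it prints.
    echo "oc adm top pod ${pod} -n ${ns}"
  done
}

top_node_pods
```

In the real script, the `echo` body would become `oc adm top pod "${pod}" -n "${ns}" --no-headers || echo "Failed to get metrics for ${ns}/${pod}"`, preserving the existing fallback messages when metrics-server is unavailable.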

else
echo "WARNING: Could not determine node name for pod ${MAISTRA_SC_POD}"
fi

echo "=== END OOM DEBUG INFO ==="
}

# check_cluster_operators waits up to 15 minutes for all OpenShift cluster
# operators to be in a stable (not Progressing, not Degraded, and Available) state.
check_cluster_operators() {
@@ -127,6 +158,11 @@ execute_and_collect_artifacts() {
test_rc=$?
echo "Test run (attempt ${attempt}) completed with exit code ${test_rc}"

# Collect debug info if pod was killed (likely OOM)
if [[ "${test_rc}" -eq 137 ]]; then
collect_oom_debug_info
fi
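The check for 137 relies on the shell convention that a process terminated by signal N exits with status 128 + N: SIGKILL is signal 9, and the kernel OOM killer delivers SIGKILL, so an OOM-killed container surfaces as 128 + 9 = 137. A self-contained demonstration of that convention (not part of the PR):

```shell
#!/usr/bin/env bash
# Demonstrate the 128 + signal exit-status convention behind the
# `test_rc -eq 137` check: SIGKILL (signal 9) yields status 137.
sleep 30 &
pid=$!
kill -9 "${pid}"   # simulate the OOM killer's SIGKILL
wait "${pid}"      # wait reports 128 + 9 = 137 for the killed job
rc=$?
echo "exit code: ${rc}"
```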

echo "Copying artifacts from test pod after attempt ${attempt}..."
oc cp "${MAISTRA_NAMESPACE}"/"${MAISTRA_SC_POD}":"${ARTIFACT_DIR}"/. "${ARTIFACT_DIR}"
